comparison libpurple/protocols/gg/lib/encoding.c @ 31860:93b08d43f684

matekm and kkszysiu collaborated on this patch to update our internal libgadu to version 1.10.1.
author John Bailey <rekkanoryo@rekkanoryo.org>
date Thu, 24 Mar 2011 20:53:13 +0000
parents
children 3a90a59ddea2
comparison
equal deleted inserted replaced
31859:5043fc53f957 31860:93b08d43f684
1 /*
2 * (C) Copyright 2008-2009 Jakub Zawadzki <darkjames@darkjames.ath.cx>
3 * Wojtek Kaniewski <wojtekka@irc.pl>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU Lesser General Public License Version
7 * 2.1 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307,
17 * USA.
18 */
19
20 #include <stdlib.h>
21 #include <string.h>
22 #include <errno.h>
23
24 #include "libgadu.h"
25
26 /**
27 * \file encoding.c
28 *
29 * \brief Funkcje konwersji kodowania tekstu
30 */
31
/**
 * \internal CP1250 to Unicode conversion table.
 *
 * Indexed by (byte - 0x80) for bytes in the range 0x80..0xff. Entries that
 * hold '?' are the positions for which this table provides no mapping.
 */
static const uint16_t table_cp1250[] =
{
	/* 0x80 */ 0x20ac, '?',    0x201a, '?',    0x201e, 0x2026, 0x2020, 0x2021,
	/* 0x88 */ '?',    0x2030, 0x0160, 0x2039, 0x015a, 0x0164, 0x017d, 0x0179,
	/* 0x90 */ '?',    0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
	/* 0x98 */ '?',    0x2122, 0x0161, 0x203a, 0x015b, 0x0165, 0x017e, 0x017a,
	/* 0xa0 */ 0x00a0, 0x02c7, 0x02d8, 0x0141, 0x00a4, 0x0104, 0x00a6, 0x00a7,
	/* 0xa8 */ 0x00a8, 0x00a9, 0x015e, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x017b,
	/* 0xb0 */ 0x00b0, 0x00b1, 0x02db, 0x0142, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
	/* 0xb8 */ 0x00b8, 0x0105, 0x015f, 0x00bb, 0x013d, 0x02dd, 0x013e, 0x017c,
	/* 0xc0 */ 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7,
	/* 0xc8 */ 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,
	/* 0xd0 */ 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7,
	/* 0xd8 */ 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,
	/* 0xe0 */ 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7,
	/* 0xe8 */ 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,
	/* 0xf0 */ 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7,
	/* 0xf8 */ 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9,
};
54
55 /**
56 * \internal Zamienia tekst kodowany CP1250 na UTF-8.
57 *
58 * \param src Tekst źródłowy w CP1250.
59 * \param src_length Długość ciągu źródłowego (nigdy ujemna).
60 * \param dst_length Długość ciągu docelowego (jeśli -1, nieograniczona).
61 *
62 * \return Zaalokowany bufor z tekstem w UTF-8.
63 */
64 static char *gg_encoding_convert_cp1250_utf8(const char *src, int src_length, int dst_length)
65 {
66 int i, j, len;
67 char *result = NULL;
68
69 for (i = 0, len = 0; (src[i] != 0) && (i < src_length); i++) {
70 uint16_t uc;
71
72 if ((unsigned char) src[i] < 0x80)
73 uc = (unsigned char) src[i];
74 else
75 uc = table_cp1250[(unsigned char) src[i] - 128];
76
77 if (uc < 0x80)
78 len += 1;
79 else if (uc < 0x800)
80 len += 2;
81 else
82 len += 3;
83 }
84
85 if ((dst_length != -1) && (len > dst_length))
86 len = dst_length;
87
88 result = malloc(len + 1);
89
90 if (result == NULL)
91 return NULL;
92
93 for (i = 0, j = 0; (src[i] != 0) && (i < src_length) && (j < len); i++) {
94 uint16_t uc;
95
96 if ((unsigned char) src[i] < 0x80)
97 uc = (unsigned char) src[i];
98 else
99 uc = table_cp1250[(unsigned char) src[i] - 128];
100
101 if (uc < 0x80)
102 result[j++] = uc;
103 else if (uc < 0x800) {
104 if (j + 1 > len)
105 break;
106 result[j++] = 0xc0 | ((uc >> 6) & 0x1f);
107 result[j++] = 0x80 | (uc & 0x3f);
108 } else {
109 if (j + 2 > len)
110 break;
111 result[j++] = 0xe0 | ((uc >> 12) & 0x1f);
112 result[j++] = 0x80 | ((uc >> 6) & 0x3f);
113 result[j++] = 0x80 | (uc & 0x3f);
114 }
115 }
116
117 result[j] = 0;
118
119 return result;
120 }
121
122 /**
123 * \internal Zamienia tekst kodowany UTF-8 na CP1250.
124 *
125 * \param src Tekst źródłowy w UTF-8.
126 * \param src_length Długość ciągu źródłowego (nigdy ujemna).
127 * \param dst_length Długość ciągu docelowego (jeśli -1, nieograniczona).
128 *
129 * \return Zaalokowany bufor z tekstem w CP1250.
130 */
131 static char *gg_encoding_convert_utf8_cp1250(const char *src, int src_length, int dst_length)
132 {
133 char *result;
134 int i, j, len, uc_left = 0;
135 uint32_t uc = 0, uc_min = 0;
136
137 for (i = 0, len = 0; (src[i] != 0) && (i < src_length); i++) {
138 if ((src[i] & 0xc0) == 0xc0) {
139 len++;
140 } else if ((src[i] & 0x80) == 0x00) {
141 len++;
142 }
143 }
144
145 if ((dst_length != -1) && (len > dst_length))
146 len = dst_length;
147
148 result = malloc(len + 1);
149
150 if (result == NULL)
151 return NULL;
152
153 for (i = 0, j = 0; (src[i] != 0) && (i < src_length) && (j < len); i++) {
154 if ((unsigned char) src[i] >= 0xf5) {
155 if (uc_left != 0)
156 result[j++] = '?';
157 /* Restricted sequences */
158 result[j++] = '?';
159 uc_left = 0;
160 } else if ((src[i] & 0xf8) == 0xf0) {
161 if (uc_left != 0)
162 result[j++] = '?';
163 uc = src[i] & 0x07;
164 uc_left = 3;
165 uc_min = 0x10000;
166 } else if ((src[i] & 0xf0) == 0xe0) {
167 if (uc_left != 0)
168 result[j++] = '?';
169 uc = src[i] & 0x0f;
170 uc_left = 2;
171 uc_min = 0x800;
172 } else if ((src[i] & 0xe0) == 0xc0) {
173 if (uc_left != 0)
174 result[j++] = '?';
175 uc = src[i] & 0x1f;
176 uc_left = 1;
177 uc_min = 0x80;
178 } else if ((src[i] & 0xc0) == 0x80) {
179 if (uc_left > 0) {
180 uc <<= 6;
181 uc |= src[i] & 0x3f;
182 uc_left--;
183
184 if (uc_left == 0) {
185 int valid = 0;
186 int k;
187
188 if (uc >= uc_min) {
189 for (k = 0; k < 128; k++) {
190 if (uc == table_cp1250[k]) {
191 result[j++] = k + 128;
192 valid = 1;
193 break;
194 }
195 }
196 }
197
198 if (!valid && uc != 0xfeff) /* Byte Order Mark */
199 result[j++] = '?';
200 }
201 }
202 } else {
203 if (uc_left != 0) {
204 result[j++] = '?';
205 uc_left = 0;
206 }
207 result[j++] = src[i];
208 }
209 }
210
211 if ((uc_left != 0) && (src[i] == 0))
212 result[j++] = '?';
213
214 result[j] = 0;
215
216 return result;
217 }
218
219 /**
220 * \internal Zamienia kodowanie tekstu.
221 *
222 * \param src Tekst źródłowy.
223 * \param src_encoding Kodowanie tekstu źródłowego.
224 * \param dst_encoding Kodowanie tekstu docelowego.
225 * \param src_length Długość ciągu źródłowego w bajtach (nigdy ujemna).
226 * \param dst_length Długość ciągu docelowego w bajtach (jeśli -1, nieograniczona).
227 *
228 * \return Zaalokowany bufor z tekstem w kodowaniu docelowym.
229 */
230 char *gg_encoding_convert(const char *src, gg_encoding_t src_encoding, gg_encoding_t dst_encoding, int src_length, int dst_length)
231 {
232 char *result;
233
234 if (src == NULL) {
235 errno = EINVAL;
236 return NULL;
237 }
238
239 // specjalny przypadek obsługiwany ekspresowo
240 if ((dst_encoding == src_encoding) && (dst_length == -1) && (src_length == -1))
241 return strdup(src);
242
243 if (src_length == -1)
244 src_length = strlen(src);
245
246 if (dst_encoding == src_encoding) {
247 int len;
248
249 if (dst_length == -1)
250 len = src_length;
251 else
252 len = (src_length < dst_length) ? src_length : dst_length;
253
254 result = malloc(len + 1);
255
256 if (result == NULL)
257 return NULL;
258
259 strncpy(result, src, len);
260 result[len] = 0;
261
262 return result;
263 }
264
265 if (dst_encoding == GG_ENCODING_CP1250 && src_encoding == GG_ENCODING_UTF8)
266 return gg_encoding_convert_utf8_cp1250(src, src_length, dst_length);
267
268 if (dst_encoding == GG_ENCODING_UTF8 && src_encoding == GG_ENCODING_CP1250)
269 return gg_encoding_convert_cp1250_utf8(src, src_length, dst_length);
270
271 errno = EINVAL;
272 return NULL;
273 }
274