comparison src/libguess/cjk_impl.c @ 3213:1b9251ab3655 trunk

Split off CJK implementation from guess.c logic.
author William Pitcock <nenolod@atheme-project.org>
date Thu, 02 Aug 2007 01:38:15 -0500
parents
children 9418f74acdb7
comparison
equal deleted inserted replaced
3210:5939941ba48b 3213:1b9251ab3655
1 /*
2 * This code is derivative of guess.c of Gauche-0.8.3.
3 * The following is the original copyright notice.
4 */
5
6 /*
7 * guess.c - guessing character encoding
8 *
9 * Copyright (c) 2000-2003 Shiro Kawai, All rights reserved.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 *
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 *
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
21 *
22 * 3. Neither the name of the authors nor the names of its contributors
23 * may be used to endorse or promote products derived from this
24 * software without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
28 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
29 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
30 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
31 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
32 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 *
38 */
39
40 #include "libguess.h"
41 #define NULL ((void *)0)
42
/*
 * Tie-break order for each language guesser.
 *
 * Each list names the candidate DFAs from highest to lowest precedence:
 *     ORDER_xx  &highest, &second, ... &lowest
 * When several candidates finish with the same score, the one listed
 * first is chosen.  Reorder the entries to change the preference.
 */
#define ORDER_JP    &utf8, &sjis, &eucj
#define ORDER_TW    &utf8, &big5
#define ORDER_CN    &utf8, &gb2312, &gb18030
#define ORDER_KR    &utf8, &euck, &johab
49
/*
 * Names reported when a UCS-2 byte-order mark is found.
 *
 * Workaround (per the original note): glib's g_convert() cannot properly
 * convert UCS-2BE/LE data that follows a BOM.  While WITH_G_CONVERT is
 * defined, both byte orders are therefore reported as "UTF-16".  Remove
 * the define below to report "UCS-2BE" / "UCS-2LE" instead.
 */
#define WITH_G_CONVERT 1

#if !defined(WITH_G_CONVERT)
const char UCS_2BE[] = "UCS-2BE";
const char UCS_2LE[] = "UCS-2LE";
#else
const char UCS_2BE[] = "UTF-16";
const char UCS_2LE[] = "UTF-16";
#endif
62
/* data types */

/*
 * One DFA transition: the state it leads to and the factor the running
 * score is multiplied by when the transition is taken.
 */
struct guess_arc_rec {
    unsigned int next;      /* state entered by this arc */
    double score;           /* multiplier applied to the DFA score */
};
typedef struct guess_arc_rec guess_arc;

/*
 * A running DFA for one candidate encoding.
 * Field order matters: DFA_INIT fills the members positionally.
 */
struct guess_dfa_rec {
    signed char (*states)[256];     /* states[state][byte] -> arc index, negative = no transition */
    guess_arc *arcs;                /* arc table indexed by the values found in states */
    int state;                      /* current state; negative once the DFA has rejected the input */
    double score;                   /* product of the scores of all arcs taken so far */
};
typedef struct guess_dfa_rec guess_dfa;
77
/* macros */

/*
 * Aggregate initializer for a guess_dfa: binds state table `st` and arc
 * table `ar`, starting in state 0 with a score of 1.0.
 */
#define DFA_INIT(st, ar) \
    { (st), (ar), 0, 1.0 }

/*
 * Feed one byte `ch` (0..255) to `dfa`.
 *
 * Does nothing if the DFA is already dead.  Otherwise looks up the arc for
 * (current state, ch): a negative entry kills the DFA (state = -1); any
 * other entry moves to the arc's next state and multiplies the score by
 * the arc's score.
 *
 * `dfa` must be an lvalue of type guess_dfa.  Every use of a parameter is
 * parenthesized so expressions such as `*ptr` work; note that `dfa` is
 * still evaluated more than once, so it must be free of side effects.
 */
#define DFA_NEXT(dfa, ch)                                               \
    do {                                                                \
        if ((dfa).state >= 0) {                                         \
            int dfa_arc_ = (dfa).states[(dfa).state][(ch)];             \
            if (dfa_arc_ < 0) {                                         \
                (dfa).state = -1;                                       \
            } else {                                                    \
                (dfa).state = (int) (dfa).arcs[dfa_arc_].next;          \
                (dfa).score *= (dfa).arcs[dfa_arc_].score;              \
            }                                                           \
        }                                                               \
    } while (0)

/* Non-zero while `dfa` has not rejected the input. */
#define DFA_ALIVE(dfa) ((dfa).state >= 0)
97
98 /* include DFA table generated by guess.scm */
99 #include "guess_tab.c"
100
101
102 int dfa_validate_utf8(const char *buf, int buflen)
103 {
104 int i;
105 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar);
106
107 for (i = 0; i < buflen; i++) {
108 int c = (unsigned char) buf[i];
109
110 if (DFA_ALIVE(utf8))
111 DFA_NEXT(utf8, c);
112 else
113 break;
114 }
115
116 if(DFA_ALIVE(utf8))
117 return 1;
118 else
119 return 0;
120 }
121
122 const char *guess_jp(const char *buf, int buflen)
123 {
124 int i;
125 guess_dfa eucj = DFA_INIT(guess_eucj_st, guess_eucj_ar);
126 guess_dfa sjis = DFA_INIT(guess_sjis_st, guess_sjis_ar);
127 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar);
128 guess_dfa *top = NULL;
129
130 guess_dfa *order[] = { ORDER_JP, NULL };
131
132 for (i = 0; i < buflen; i++) {
133 int c = (unsigned char) buf[i];
134
135 /* special treatment of iso-2022 escape sequence */
136 if (c == 0x1b) {
137 if (i < buflen - 1) {
138 c = (unsigned char) buf[++i];
139 if (c == '$' || c == '(')
140 return "ISO-2022-JP";
141 }
142 }
143
144 /* special treatment of BOM */
145 if (i == 0 && c == 0xff) {
146 if (i < buflen - 1) {
147 c = (unsigned char) buf[i + 1];
148 if (c == 0xfe)
149 return UCS_2LE;
150 }
151 }
152 if (i == 0 && c == 0xfe) {
153 if (i < buflen - 1) {
154 c = (unsigned char) buf[i + 1];
155 if (c == 0xff)
156 return UCS_2BE;
157 }
158 }
159
160 if (DFA_ALIVE(eucj)) {
161 if (!DFA_ALIVE(sjis) && !DFA_ALIVE(utf8))
162 return "EUC-JP";
163 DFA_NEXT(eucj, c);
164 }
165 if (DFA_ALIVE(sjis)) {
166 if (!DFA_ALIVE(eucj) && !DFA_ALIVE(utf8))
167 return "SJIS";
168 DFA_NEXT(sjis, c);
169 }
170 if (DFA_ALIVE(utf8)) {
171 if (!DFA_ALIVE(sjis) && !DFA_ALIVE(eucj))
172 return "UTF-8";
173 DFA_NEXT(utf8, c);
174 }
175
176 if (!DFA_ALIVE(eucj) && !DFA_ALIVE(sjis) && !DFA_ALIVE(utf8)) {
177 /* we ran out the possibilities */
178 return NULL;
179 }
180 }
181
182 /* Now, we have ambigous code. Pick the highest score. If more than
183 one candidate tie, pick the default encoding. */
184 for (i = 0; order[i] != NULL; i++) {
185 if (order[i]->state >= 0) { //DFA_ALIVE()
186 if (top == NULL || order[i]->score > top->score)
187 top = order[i];
188 }
189 }
190
191 if (top == &eucj)
192 return "EUC-JP";
193 if (top == &utf8)
194 return "UTF-8";
195 if (top == &sjis)
196 return "SJIS";
197 return NULL;
198 }
199
200 const char *guess_tw(const char *buf, int buflen)
201 {
202 int i;
203 guess_dfa big5 = DFA_INIT(guess_big5_st, guess_big5_ar);
204 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar);
205 guess_dfa *top = NULL;
206
207 guess_dfa *order[] = { ORDER_TW, NULL };
208
209 for (i = 0; i < buflen; i++) {
210 int c = (unsigned char) buf[i];
211
212 /* special treatment of iso-2022 escape sequence */
213 if (c == 0x1b) {
214 if (i < buflen - 1) {
215 c = (unsigned char) buf[++i];
216 if (c == '$' || c == '(')
217 return "ISO-2022-TW";
218 }
219 }
220
221 /* special treatment of BOM */
222 if (i == 0 && c == 0xff) {
223 if (i < buflen - 1) {
224 c = (unsigned char) buf[i + 1];
225 if (c == 0xfe)
226 return UCS_2LE;
227 }
228 }
229 if (i == 0 && c == 0xfe) {
230 if (i < buflen - 1) {
231 c = (unsigned char) buf[i + 1];
232 if (c == 0xff)
233 return UCS_2BE;
234 }
235 }
236
237 if (DFA_ALIVE(big5)) {
238 if (!DFA_ALIVE(utf8))
239 return "BIG5";
240 DFA_NEXT(big5, c);
241 }
242 if (DFA_ALIVE(utf8)) {
243 if (!DFA_ALIVE(big5))
244 return "UTF-8";
245 DFA_NEXT(utf8, c);
246 }
247
248 if (!DFA_ALIVE(big5) && !DFA_ALIVE(utf8)) {
249 /* we ran out the possibilities */
250 return NULL;
251 }
252 }
253
254 /* Now, we have ambigous code. Pick the highest score. If more than
255 one candidate tie, pick the default encoding. */
256 for (i = 0; order[i] != NULL; i++) {
257 if (order[i]->state >= 0) { //DFA_ALIVE()
258 if (top == NULL || order[i]->score > top->score)
259 top = order[i];
260 }
261 }
262
263 if (top == &big5)
264 return "BIG5";
265 if (top == &utf8)
266 return "UTF-8";
267 return NULL;
268 }
269
270 const char *guess_cn(const char *buf, int buflen)
271 {
272 int i;
273 guess_dfa gb2312 = DFA_INIT(guess_gb2312_st, guess_gb2312_ar);
274 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar);
275 guess_dfa gb18030 = DFA_INIT(guess_gb18030_st, guess_gb18030_ar);
276 guess_dfa *top = NULL;
277
278 guess_dfa *order[] = { ORDER_CN, NULL };
279
280 for (i = 0; i < buflen; i++) {
281 int c = (unsigned char) buf[i];
282 int c2;
283
284 /* special treatment of iso-2022 escape sequence */
285 if (c == 0x1b) {
286 if (i < buflen - 1) {
287 c = (unsigned char) buf[i + 1];
288 c2 = (unsigned char) buf[i + 2];
289 if (c == '$' && (c2 == ')' || c2 == '+'))
290 return "ISO-2022-CN";
291 }
292 }
293
294 /* special treatment of BOM */
295 if (i == 0 && c == 0xff) {
296 if (i < buflen - 1) {
297 c = (unsigned char) buf[i + 1];
298 if (c == 0xfe)
299 return UCS_2LE;
300 }
301 }
302 if (i == 0 && c == 0xfe) {
303 if (i < buflen - 1) {
304 c = (unsigned char) buf[i + 1];
305 if (c == 0xff)
306 return UCS_2BE;
307 }
308 }
309
310 if (DFA_ALIVE(gb2312)) {
311 if (!DFA_ALIVE(utf8) && !DFA_ALIVE(gb18030))
312 return "GB2312";
313 DFA_NEXT(gb2312, c);
314 }
315 if (DFA_ALIVE(utf8)) {
316 if (!DFA_ALIVE(gb2312) && !DFA_ALIVE(gb18030))
317 return "UTF-8";
318 DFA_NEXT(utf8, c);
319 }
320 if (DFA_ALIVE(gb18030)) {
321 if (!DFA_ALIVE(utf8) && !DFA_ALIVE(gb2312))
322 return "GB18030";
323 DFA_NEXT(gb18030, c);
324 }
325
326 if (!DFA_ALIVE(gb2312) && !DFA_ALIVE(utf8) && !DFA_ALIVE(gb18030)) {
327 /* we ran out the possibilities */
328 return NULL;
329 }
330 }
331
332 /* Now, we have ambigous code. Pick the highest score. If more than
333 one candidate tie, pick the default encoding. */
334 for (i = 0; order[i] != NULL; i++) {
335 if (order[i]->state >= 0) { //DFA_ALIVE()
336 if (top == NULL || order[i]->score > top->score)
337 top = order[i];
338 }
339 }
340
341 if (top == &gb2312)
342 return "GB2312";
343 if (top == &utf8)
344 return "UTF-8";
345 if (top == &gb18030)
346 return "GB18030";
347 return NULL;
348 }
349
350 const char *guess_kr(const char *buf, int buflen)
351 {
352 int i;
353 guess_dfa euck = DFA_INIT(guess_euck_st, guess_euck_ar);
354 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar);
355 guess_dfa johab = DFA_INIT(guess_johab_st, guess_johab_ar);
356 guess_dfa *top = NULL;
357
358 guess_dfa *order[] = { ORDER_KR, NULL };
359
360 for (i = 0; i < buflen; i++) {
361 int c = (unsigned char) buf[i];
362 int c2;
363
364 /* special treatment of iso-2022 escape sequence */
365 if (c == 0x1b) {
366 if (i < buflen - 1) {
367 c = (unsigned char) buf[i + 1];
368 c2 = (unsigned char) buf[i + 2];
369 if (c == '$' && c2 == ')')
370 return "ISO-2022-KR";
371 }
372 }
373
374 /* special treatment of BOM */
375 if (i == 0 && c == 0xff) {
376 if (i < buflen - 1) {
377 c = (unsigned char) buf[i + 1];
378 if (c == 0xfe)
379 return UCS_2LE;
380 }
381 }
382 if (i == 0 && c == 0xfe) {
383 if (i < buflen - 1) {
384 c = (unsigned char) buf[i + 1];
385 if (c == 0xff)
386 return UCS_2BE;
387 }
388 }
389
390 if (DFA_ALIVE(euck)) {
391 if (!DFA_ALIVE(johab) && !DFA_ALIVE(utf8))
392 return "EUC-KR";
393 DFA_NEXT(euck, c);
394 }
395 if (DFA_ALIVE(johab)) {
396 if (!DFA_ALIVE(euck) && !DFA_ALIVE(utf8))
397 return "JOHAB";
398 DFA_NEXT(johab, c);
399 }
400 if (DFA_ALIVE(utf8)) {
401 if (!DFA_ALIVE(euck) && !DFA_ALIVE(johab))
402 return "UTF-8";
403 DFA_NEXT(utf8, c);
404 }
405
406 if (!DFA_ALIVE(euck) && !DFA_ALIVE(johab) && !DFA_ALIVE(utf8)) {
407 /* we ran out the possibilities */
408 return NULL;
409 }
410 }
411
412 /* Now, we have ambigous code. Pick the highest score. If more than
413 one candidate tie, pick the default encoding. */
414 for (i = 0; order[i] != NULL; i++) {
415 if (order[i]->state >= 0) { //DFA_ALIVE()
416 if (top == NULL || order[i]->score > top->score)
417 top = order[i];
418 }
419 }
420
421 if (top == &euck)
422 return "EUC-KR";
423 if (top == &utf8)
424 return "UTF-8";
425 if (top == &johab)
426 return "JOHAB";
427 return NULL;
428 }