Mercurial > libguess
annotate cjk_impl.c @ 2:754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
- new UCS-2LE/BE DFAs
- now arabic_impl.c uses arabic DFAs
- dfa common macros have been moved to dfa.h
- minor cleanups
author | Yoshiki Yazawa <yaz@cc.rim.or.jp> |
---|---|
date | Wed, 11 Jun 2008 00:11:30 +0900 |
parents | d9b6ff839eab |
children | 70e2c306231e |
rev | line source |
---|---|
0 | 1 /* |
2 * This code is derivative of guess.c of Gauche-0.8.3. | |
3 * The following is the original copyright notice. | |
4 */ | |
5 | |
6 /* | |
2
754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
0
diff
changeset
|
7 * guess.c - guessing character encoding |
0 | 8 * |
9 * Copyright (c) 2000-2003 Shiro Kawai, All rights reserved. | |
2
754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
0
diff
changeset
|
10 * |
0 | 11 * Redistribution and use in source and binary forms, with or without |
12 * modification, are permitted provided that the following conditions | |
13 * are met: | |
2
754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
0
diff
changeset
|
14 * |
0 | 15 * 1. Redistributions of source code must retain the above copyright |
16 * notice, this list of conditions and the following disclaimer. | |
17 * | |
18 * 2. Redistributions in binary form must reproduce the above copyright | |
19 * notice, this list of conditions and the following disclaimer in the | |
20 * documentation and/or other materials provided with the distribution. | |
21 * | |
22 * 3. Neither the name of the authors nor the names of its contributors | |
23 * may be used to endorse or promote products derived from this | |
24 * software without specific prior written permission. | |
25 * | |
26 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
27 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
28 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
29 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
30 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
31 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED | |
32 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
33 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | |
34 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | |
35 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
36 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
37 * | |
38 */ | |
39 | |
40 #include "libguess.h" | |
2
754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
0
diff
changeset
|
41 #include "dfa.h" |
0 | 42 |
/* glib's g_convert cannot properly convert UCS-2BE/LE text that follows a
   BOM.  While WITH_G_CONVERT is defined, BOM-prefixed input is therefore
   reported as plain "UTF-16"; remove the define to report "UCS-2BE" and
   "UCS-2LE" instead. */
#define WITH_G_CONVERT 1

#if defined(WITH_G_CONVERT)
const char UCS_2LE[] = "UTF-16";
const char UCS_2BE[] = "UTF-16";
#else
const char UCS_2LE[] = "UCS-2LE";
const char UCS_2BE[] = "UCS-2BE";
#endif
55 | |
2
754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
0
diff
changeset
|
/* Tie-break priority lists.  When several candidate encodings finish with
   the same score, the one listed first is chosen.  Reorder an entry to
   change the preference:
       ORDER_xx   &highest, &second, ... &lowest */
#define ORDER_JP    &utf8, &sjis, &eucj
#define ORDER_TW    &utf8, &big5
#define ORDER_CN    &utf8, &gb2312, &gb18030
#define ORDER_KR    &utf8, &euck, &johab
0 | 62 |
63 /* include DFA table generated by guess.scm */ | |
64 #include "guess_tab.c" | |
65 | |
66 | |
67 int dfa_validate_utf8(const char *buf, int buflen) | |
68 { | |
69 int i; | |
70 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar); | |
71 | |
72 for (i = 0; i < buflen; i++) { | |
73 int c = (unsigned char) buf[i]; | |
74 | |
75 if (DFA_ALIVE(utf8)) | |
76 DFA_NEXT(utf8, c); | |
77 else | |
78 break; | |
79 } | |
80 | |
81 DFA_NEXT(utf8, '\0'); //Bug #53 | |
82 | |
83 if(DFA_ALIVE(utf8)) | |
84 return 1; | |
2
754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
0
diff
changeset
|
85 else |
0 | 86 return 0; |
87 } | |
88 | |
89 const char *guess_jp(const char *buf, int buflen) | |
90 { | |
91 int i; | |
92 guess_dfa eucj = DFA_INIT(guess_eucj_st, guess_eucj_ar); | |
93 guess_dfa sjis = DFA_INIT(guess_sjis_st, guess_sjis_ar); | |
94 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar); | |
95 guess_dfa *top = NULL; | |
96 | |
97 guess_dfa *order[] = { ORDER_JP, NULL }; | |
98 | |
99 for (i = 0; i < buflen; i++) { | |
100 int c = (unsigned char) buf[i]; | |
101 | |
102 /* special treatment of iso-2022 escape sequence */ | |
103 if (c == 0x1b) { | |
104 if (i < buflen - 1) { | |
105 c = (unsigned char) buf[++i]; | |
106 if (c == '$' || c == '(') | |
107 return "ISO-2022-JP"; | |
108 } | |
109 } | |
110 | |
111 /* special treatment of BOM */ | |
112 if (i == 0 && c == 0xff) { | |
113 if (i < buflen - 1) { | |
114 c = (unsigned char) buf[i + 1]; | |
115 if (c == 0xfe) | |
116 return UCS_2LE; | |
117 } | |
118 } | |
119 if (i == 0 && c == 0xfe) { | |
120 if (i < buflen - 1) { | |
121 c = (unsigned char) buf[i + 1]; | |
122 if (c == 0xff) | |
123 return UCS_2BE; | |
124 } | |
125 } | |
126 | |
127 if (DFA_ALIVE(eucj)) { | |
128 if (!DFA_ALIVE(sjis) && !DFA_ALIVE(utf8)) | |
129 return "EUC-JP"; | |
130 DFA_NEXT(eucj, c); | |
131 } | |
132 if (DFA_ALIVE(sjis)) { | |
133 if (!DFA_ALIVE(eucj) && !DFA_ALIVE(utf8)) | |
134 return "SJIS"; | |
135 DFA_NEXT(sjis, c); | |
136 } | |
137 if (DFA_ALIVE(utf8)) { | |
138 if (!DFA_ALIVE(sjis) && !DFA_ALIVE(eucj)) | |
139 return "UTF-8"; | |
140 DFA_NEXT(utf8, c); | |
141 } | |
142 | |
143 if (!DFA_ALIVE(eucj) && !DFA_ALIVE(sjis) && !DFA_ALIVE(utf8)) { | |
144 /* we ran out the possibilities */ | |
145 return NULL; | |
146 } | |
147 } | |
148 | |
149 /* Now, we have ambigous code. Pick the highest score. If more than | |
150 one candidate tie, pick the default encoding. */ | |
151 for (i = 0; order[i] != NULL; i++) { | |
152 if (order[i]->state >= 0) { //DFA_ALIVE() | |
153 if (top == NULL || order[i]->score > top->score) | |
154 top = order[i]; | |
155 } | |
156 } | |
157 | |
158 if (top == &eucj) | |
159 return "EUC-JP"; | |
160 if (top == &utf8) | |
161 return "UTF-8"; | |
162 if (top == &sjis) | |
163 return "SJIS"; | |
164 return NULL; | |
165 } | |
166 | |
167 const char *guess_tw(const char *buf, int buflen) | |
168 { | |
169 int i; | |
170 guess_dfa big5 = DFA_INIT(guess_big5_st, guess_big5_ar); | |
171 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar); | |
172 guess_dfa *top = NULL; | |
173 | |
174 guess_dfa *order[] = { ORDER_TW, NULL }; | |
175 | |
176 for (i = 0; i < buflen; i++) { | |
177 int c = (unsigned char) buf[i]; | |
178 | |
179 /* special treatment of iso-2022 escape sequence */ | |
180 if (c == 0x1b) { | |
181 if (i < buflen - 1) { | |
182 c = (unsigned char) buf[++i]; | |
183 if (c == '$' || c == '(') | |
184 return "ISO-2022-TW"; | |
185 } | |
186 } | |
187 | |
188 /* special treatment of BOM */ | |
189 if (i == 0 && c == 0xff) { | |
190 if (i < buflen - 1) { | |
191 c = (unsigned char) buf[i + 1]; | |
192 if (c == 0xfe) | |
193 return UCS_2LE; | |
194 } | |
195 } | |
196 if (i == 0 && c == 0xfe) { | |
197 if (i < buflen - 1) { | |
198 c = (unsigned char) buf[i + 1]; | |
199 if (c == 0xff) | |
200 return UCS_2BE; | |
201 } | |
202 } | |
203 | |
204 if (DFA_ALIVE(big5)) { | |
205 if (!DFA_ALIVE(utf8)) | |
206 return "BIG5"; | |
207 DFA_NEXT(big5, c); | |
208 } | |
209 if (DFA_ALIVE(utf8)) { | |
210 if (!DFA_ALIVE(big5)) | |
211 return "UTF-8"; | |
212 DFA_NEXT(utf8, c); | |
213 } | |
214 | |
215 if (!DFA_ALIVE(big5) && !DFA_ALIVE(utf8)) { | |
216 /* we ran out the possibilities */ | |
217 return NULL; | |
218 } | |
219 } | |
220 | |
221 /* Now, we have ambigous code. Pick the highest score. If more than | |
222 one candidate tie, pick the default encoding. */ | |
223 for (i = 0; order[i] != NULL; i++) { | |
224 if (order[i]->state >= 0) { //DFA_ALIVE() | |
225 if (top == NULL || order[i]->score > top->score) | |
226 top = order[i]; | |
227 } | |
228 } | |
229 | |
230 if (top == &big5) | |
231 return "BIG5"; | |
232 if (top == &utf8) | |
233 return "UTF-8"; | |
234 return NULL; | |
235 } | |
236 | |
237 const char *guess_cn(const char *buf, int buflen) | |
238 { | |
239 int i; | |
240 guess_dfa gb2312 = DFA_INIT(guess_gb2312_st, guess_gb2312_ar); | |
241 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar); | |
242 guess_dfa gb18030 = DFA_INIT(guess_gb18030_st, guess_gb18030_ar); | |
243 guess_dfa *top = NULL; | |
244 | |
245 guess_dfa *order[] = { ORDER_CN, NULL }; | |
246 | |
247 for (i = 0; i < buflen; i++) { | |
248 int c = (unsigned char) buf[i]; | |
249 int c2; | |
250 | |
251 /* special treatment of iso-2022 escape sequence */ | |
252 if (c == 0x1b) { | |
253 if (i < buflen - 1) { | |
254 c = (unsigned char) buf[i + 1]; | |
255 c2 = (unsigned char) buf[i + 2]; | |
256 if (c == '$' && (c2 == ')' || c2 == '+')) | |
257 return "ISO-2022-CN"; | |
258 } | |
259 } | |
260 | |
261 /* special treatment of BOM */ | |
262 if (i == 0 && c == 0xff) { | |
263 if (i < buflen - 1) { | |
264 c = (unsigned char) buf[i + 1]; | |
265 if (c == 0xfe) | |
266 return UCS_2LE; | |
267 } | |
268 } | |
269 if (i == 0 && c == 0xfe) { | |
270 if (i < buflen - 1) { | |
271 c = (unsigned char) buf[i + 1]; | |
272 if (c == 0xff) | |
273 return UCS_2BE; | |
274 } | |
275 } | |
276 | |
277 if (DFA_ALIVE(gb2312)) { | |
278 if (!DFA_ALIVE(utf8) && !DFA_ALIVE(gb18030)) | |
279 return "GB2312"; | |
280 DFA_NEXT(gb2312, c); | |
281 } | |
282 if (DFA_ALIVE(utf8)) { | |
283 if (!DFA_ALIVE(gb2312) && !DFA_ALIVE(gb18030)) | |
284 return "UTF-8"; | |
285 DFA_NEXT(utf8, c); | |
286 } | |
287 if (DFA_ALIVE(gb18030)) { | |
288 if (!DFA_ALIVE(utf8) && !DFA_ALIVE(gb2312)) | |
289 return "GB18030"; | |
290 DFA_NEXT(gb18030, c); | |
291 } | |
292 | |
293 if (!DFA_ALIVE(gb2312) && !DFA_ALIVE(utf8) && !DFA_ALIVE(gb18030)) { | |
294 /* we ran out the possibilities */ | |
295 return NULL; | |
296 } | |
297 } | |
298 | |
299 /* Now, we have ambigous code. Pick the highest score. If more than | |
300 one candidate tie, pick the default encoding. */ | |
301 for (i = 0; order[i] != NULL; i++) { | |
302 if (order[i]->state >= 0) { //DFA_ALIVE() | |
303 if (top == NULL || order[i]->score > top->score) | |
304 top = order[i]; | |
305 } | |
306 } | |
307 | |
308 if (top == &gb2312) | |
309 return "GB2312"; | |
310 if (top == &utf8) | |
311 return "UTF-8"; | |
312 if (top == &gb18030) | |
313 return "GB18030"; | |
314 return NULL; | |
315 } | |
316 | |
317 const char *guess_kr(const char *buf, int buflen) | |
318 { | |
319 int i; | |
320 guess_dfa euck = DFA_INIT(guess_euck_st, guess_euck_ar); | |
321 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar); | |
322 guess_dfa johab = DFA_INIT(guess_johab_st, guess_johab_ar); | |
323 guess_dfa *top = NULL; | |
324 | |
325 guess_dfa *order[] = { ORDER_KR, NULL }; | |
326 | |
327 for (i = 0; i < buflen; i++) { | |
328 int c = (unsigned char) buf[i]; | |
329 int c2; | |
330 | |
331 /* special treatment of iso-2022 escape sequence */ | |
332 if (c == 0x1b) { | |
333 if (i < buflen - 1) { | |
334 c = (unsigned char) buf[i + 1]; | |
335 c2 = (unsigned char) buf[i + 2]; | |
336 if (c == '$' && c2 == ')') | |
337 return "ISO-2022-KR"; | |
338 } | |
339 } | |
340 | |
341 /* special treatment of BOM */ | |
342 if (i == 0 && c == 0xff) { | |
343 if (i < buflen - 1) { | |
344 c = (unsigned char) buf[i + 1]; | |
345 if (c == 0xfe) | |
346 return UCS_2LE; | |
347 } | |
348 } | |
349 if (i == 0 && c == 0xfe) { | |
350 if (i < buflen - 1) { | |
351 c = (unsigned char) buf[i + 1]; | |
352 if (c == 0xff) | |
353 return UCS_2BE; | |
354 } | |
355 } | |
356 | |
357 if (DFA_ALIVE(euck)) { | |
358 if (!DFA_ALIVE(johab) && !DFA_ALIVE(utf8)) | |
359 return "EUC-KR"; | |
360 DFA_NEXT(euck, c); | |
361 } | |
362 if (DFA_ALIVE(johab)) { | |
363 if (!DFA_ALIVE(euck) && !DFA_ALIVE(utf8)) | |
364 return "JOHAB"; | |
365 DFA_NEXT(johab, c); | |
366 } | |
367 if (DFA_ALIVE(utf8)) { | |
368 if (!DFA_ALIVE(euck) && !DFA_ALIVE(johab)) | |
369 return "UTF-8"; | |
370 DFA_NEXT(utf8, c); | |
371 } | |
372 | |
373 if (!DFA_ALIVE(euck) && !DFA_ALIVE(johab) && !DFA_ALIVE(utf8)) { | |
374 /* we ran out the possibilities */ | |
375 return NULL; | |
376 } | |
377 } | |
378 | |
379 /* Now, we have ambigous code. Pick the highest score. If more than | |
380 one candidate tie, pick the default encoding. */ | |
381 for (i = 0; order[i] != NULL; i++) { | |
382 if (order[i]->state >= 0) { //DFA_ALIVE() | |
383 if (top == NULL || order[i]->score > top->score) | |
384 top = order[i]; | |
385 } | |
386 } | |
387 | |
388 if (top == &euck) | |
389 return "EUC-KR"; | |
390 if (top == &utf8) | |
391 return "UTF-8"; | |
392 if (top == &johab) | |
393 return "JOHAB"; | |
394 return NULL; | |
395 } |