Mercurial > libguess
comparison cjk_impl.c @ 3:70e2c306231e
- implemented dfa utility functions.
- added dfa.c.
- rewrote guess functions for ar, gr, hw and tr scripts with dfa utilities.
- guess functions for cjk scripts too.
author | Yoshiki Yazawa <yaz@cc.rim.or.jp> |
---|---|
date | Thu, 12 Jun 2008 20:20:43 +0900 |
parents | 754a4550c64e |
children |
comparison
equal
deleted
inserted
replaced
2:754a4550c64e | 3:70e2c306231e |
---|---|
38 */ | 38 */ |
39 | 39 |
40 #include "libguess.h" | 40 #include "libguess.h" |
41 #include "dfa.h" | 41 #include "dfa.h" |
42 | 42 |
43 /* workaround for that glib's g_convert can't convert | 43 #include <stdio.h> |
44 properly from UCS-2BE/LE trailing after BOM. */ | |
45 #define WITH_G_CONVERT 1 | |
46 /* #undef WITH_G_CONVERT */ | |
47 | |
48 #ifdef WITH_G_CONVERT | |
49 const char UCS_2BE[] = "UTF-16"; | |
50 const char UCS_2LE[] = "UTF-16"; | |
51 #else | |
52 const char UCS_2BE[] = "UCS-2BE"; | |
53 const char UCS_2LE[] = "UCS-2LE"; | |
54 #endif | |
55 | 44 |
56 /* take precedence if scores are same. you can customize the order as: */ | 45 /* take precedence if scores are same. you can customize the order as: */ |
57 /* ORDER_** &highest, &second, ... &lowest */ | 46 /* ORDER_** &highest, &second, ... &lowest */ |
58 #define ORDER_JP &utf8, &sjis, &eucj | 47 #define ORDER_JP &utf8, &sjis, &eucj |
59 #define ORDER_TW &utf8, &big5 | 48 #define ORDER_TW &utf8, &big5 |
61 #define ORDER_KR &utf8, &euck, &johab | 50 #define ORDER_KR &utf8, &euck, &johab |
62 | 51 |
63 /* include DFA table generated by guess.scm */ | 52 /* include DFA table generated by guess.scm */ |
64 #include "guess_tab.c" | 53 #include "guess_tab.c" |
65 | 54 |
66 | |
67 int dfa_validate_utf8(const char *buf, int buflen) | 55 int dfa_validate_utf8(const char *buf, int buflen) |
68 { | 56 { |
69 int i; | 57 int i; |
70 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar); | 58 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8"); |
71 | 59 |
72 for (i = 0; i < buflen; i++) { | 60 for (i = 0; i < buflen; i++) { |
73 int c = (unsigned char) buf[i]; | 61 int c = (unsigned char) buf[i]; |
74 | 62 |
75 if (DFA_ALIVE(utf8)) | 63 if (DFA_ALIVE(utf8)) |
87 } | 75 } |
88 | 76 |
89 const char *guess_jp(const char *buf, int buflen) | 77 const char *guess_jp(const char *buf, int buflen) |
90 { | 78 { |
91 int i; | 79 int i; |
92 guess_dfa eucj = DFA_INIT(guess_eucj_st, guess_eucj_ar); | 80 const char *rv = NULL; |
93 guess_dfa sjis = DFA_INIT(guess_sjis_st, guess_sjis_ar); | 81 guess_dfa eucj = DFA_INIT(guess_eucj_st, guess_eucj_ar, "EUC-JP"); |
94 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar); | 82 guess_dfa sjis = DFA_INIT(guess_sjis_st, guess_sjis_ar, "SJIS"); |
83 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8"); | |
95 guess_dfa *top = NULL; | 84 guess_dfa *top = NULL; |
96 | 85 |
97 guess_dfa *order[] = { ORDER_JP, NULL }; | 86 guess_dfa *order[] = { ORDER_JP, NULL }; |
98 | 87 |
99 for (i = 0; i < buflen; i++) { | 88 for (i = 0; i < buflen; i++) { |
122 if (c == 0xff) | 111 if (c == 0xff) |
123 return UCS_2BE; | 112 return UCS_2BE; |
124 } | 113 } |
125 } | 114 } |
126 | 115 |
127 if (DFA_ALIVE(eucj)) { | 116 rv = dfa_process(order, c); |
128 if (!DFA_ALIVE(sjis) && !DFA_ALIVE(utf8)) | 117 if(rv) |
129 return "EUC-JP"; | 118 return rv; |
130 DFA_NEXT(eucj, c); | 119 |
131 } | 120 if (dfa_none(order)) { |
132 if (DFA_ALIVE(sjis)) { | |
133 if (!DFA_ALIVE(eucj) && !DFA_ALIVE(utf8)) | |
134 return "SJIS"; | |
135 DFA_NEXT(sjis, c); | |
136 } | |
137 if (DFA_ALIVE(utf8)) { | |
138 if (!DFA_ALIVE(sjis) && !DFA_ALIVE(eucj)) | |
139 return "UTF-8"; | |
140 DFA_NEXT(utf8, c); | |
141 } | |
142 | |
143 if (!DFA_ALIVE(eucj) && !DFA_ALIVE(sjis) && !DFA_ALIVE(utf8)) { | |
144 /* we ran out the possibilities */ | 121 /* we ran out the possibilities */ |
145 return NULL; | 122 return NULL; |
146 } | 123 } |
147 } | 124 } |
148 | 125 |
149 /* Now, we have ambiguous code. Pick the highest score. If more than | 126 top = dfa_top(order); |
150 one candidate tie, pick the default encoding. */ | 127 if(top) |
151 for (i = 0; order[i] != NULL; i++) { | 128 return top->name; |
152 if (order[i]->state >= 0) { //DFA_ALIVE() | 129 else |
153 if (top == NULL || order[i]->score > top->score) | 130 return NULL; |
154 top = order[i]; | |
155 } | |
156 } | |
157 | |
158 if (top == &eucj) | |
159 return "EUC-JP"; | |
160 if (top == &utf8) | |
161 return "UTF-8"; | |
162 if (top == &sjis) | |
163 return "SJIS"; | |
164 return NULL; | |
165 } | 131 } |
166 | 132 |
167 const char *guess_tw(const char *buf, int buflen) | 133 const char *guess_tw(const char *buf, int buflen) |
168 { | 134 { |
169 int i; | 135 int i; |
170 guess_dfa big5 = DFA_INIT(guess_big5_st, guess_big5_ar); | 136 const char *rv = NULL; |
171 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar); | 137 guess_dfa big5 = DFA_INIT(guess_big5_st, guess_big5_ar, "BIG5"); |
138 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8"); | |
172 guess_dfa *top = NULL; | 139 guess_dfa *top = NULL; |
173 | 140 |
174 guess_dfa *order[] = { ORDER_TW, NULL }; | 141 guess_dfa *order[] = { ORDER_TW, NULL }; |
175 | 142 |
176 for (i = 0; i < buflen; i++) { | 143 for (i = 0; i < buflen; i++) { |
199 if (c == 0xff) | 166 if (c == 0xff) |
200 return UCS_2BE; | 167 return UCS_2BE; |
201 } | 168 } |
202 } | 169 } |
203 | 170 |
204 if (DFA_ALIVE(big5)) { | 171 rv = dfa_process(order, c); |
205 if (!DFA_ALIVE(utf8)) | 172 if(rv) |
206 return "BIG5"; | 173 return rv; |
207 DFA_NEXT(big5, c); | 174 |
208 } | 175 if (dfa_none(order)) { |
209 if (DFA_ALIVE(utf8)) { | |
210 if (!DFA_ALIVE(big5)) | |
211 return "UTF-8"; | |
212 DFA_NEXT(utf8, c); | |
213 } | |
214 | |
215 if (!DFA_ALIVE(big5) && !DFA_ALIVE(utf8)) { | |
216 /* we ran out the possibilities */ | 176 /* we ran out the possibilities */ |
217 return NULL; | 177 return NULL; |
218 } | 178 } |
219 } | 179 } |
220 | 180 |
221 /* Now, we have ambiguous code. Pick the highest score. If more than | 181 top = dfa_top(order); |
222 one candidate tie, pick the default encoding. */ | 182 if (top) |
223 for (i = 0; order[i] != NULL; i++) { | 183 return top->name; |
224 if (order[i]->state >= 0) { //DFA_ALIVE() | 184 else |
225 if (top == NULL || order[i]->score > top->score) | 185 return NULL; |
226 top = order[i]; | |
227 } | |
228 } | |
229 | |
230 if (top == &big5) | |
231 return "BIG5"; | |
232 if (top == &utf8) | |
233 return "UTF-8"; | |
234 return NULL; | |
235 } | 186 } |
236 | 187 |
237 const char *guess_cn(const char *buf, int buflen) | 188 const char *guess_cn(const char *buf, int buflen) |
238 { | 189 { |
239 int i; | 190 int i; |
240 guess_dfa gb2312 = DFA_INIT(guess_gb2312_st, guess_gb2312_ar); | 191 const char *rv = NULL; |
241 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar); | 192 guess_dfa gb2312 = DFA_INIT(guess_gb2312_st, guess_gb2312_ar, "GB2312"); |
242 guess_dfa gb18030 = DFA_INIT(guess_gb18030_st, guess_gb18030_ar); | 193 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8"); |
194 guess_dfa gb18030 = DFA_INIT(guess_gb18030_st, guess_gb18030_ar, "GB18030"); | |
243 guess_dfa *top = NULL; | 195 guess_dfa *top = NULL; |
244 | 196 |
245 guess_dfa *order[] = { ORDER_CN, NULL }; | 197 guess_dfa *order[] = { ORDER_CN, NULL }; |
246 | 198 |
247 for (i = 0; i < buflen; i++) { | 199 for (i = 0; i < buflen; i++) { |
272 if (c == 0xff) | 224 if (c == 0xff) |
273 return UCS_2BE; | 225 return UCS_2BE; |
274 } | 226 } |
275 } | 227 } |
276 | 228 |
277 if (DFA_ALIVE(gb2312)) { | 229 rv = dfa_process(order, c); |
278 if (!DFA_ALIVE(utf8) && !DFA_ALIVE(gb18030)) | 230 if(rv) |
279 return "GB2312"; | 231 return rv; |
280 DFA_NEXT(gb2312, c); | 232 |
281 } | 233 if (dfa_none(order)) { |
282 if (DFA_ALIVE(utf8)) { | |
283 if (!DFA_ALIVE(gb2312) && !DFA_ALIVE(gb18030)) | |
284 return "UTF-8"; | |
285 DFA_NEXT(utf8, c); | |
286 } | |
287 if (DFA_ALIVE(gb18030)) { | |
288 if (!DFA_ALIVE(utf8) && !DFA_ALIVE(gb2312)) | |
289 return "GB18030"; | |
290 DFA_NEXT(gb18030, c); | |
291 } | |
292 | |
293 if (!DFA_ALIVE(gb2312) && !DFA_ALIVE(utf8) && !DFA_ALIVE(gb18030)) { | |
294 /* we ran out the possibilities */ | 234 /* we ran out the possibilities */ |
295 return NULL; | 235 return NULL; |
296 } | 236 } |
297 } | 237 } |
298 | 238 |
299 /* Now, we have ambiguous code. Pick the highest score. If more than | 239 top = dfa_top(order); |
300 one candidate tie, pick the default encoding. */ | 240 if(top) |
301 for (i = 0; order[i] != NULL; i++) { | 241 return top->name; |
302 if (order[i]->state >= 0) { //DFA_ALIVE() | 242 else |
303 if (top == NULL || order[i]->score > top->score) | 243 return NULL; |
304 top = order[i]; | |
305 } | |
306 } | |
307 | |
308 if (top == &gb2312) | |
309 return "GB2312"; | |
310 if (top == &utf8) | |
311 return "UTF-8"; | |
312 if (top == &gb18030) | |
313 return "GB18030"; | |
314 return NULL; | |
315 } | 244 } |
316 | 245 |
317 const char *guess_kr(const char *buf, int buflen) | 246 const char *guess_kr(const char *buf, int buflen) |
318 { | 247 { |
319 int i; | 248 int i; |
320 guess_dfa euck = DFA_INIT(guess_euck_st, guess_euck_ar); | 249 const char *rv = NULL; |
321 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar); | 250 guess_dfa euck = DFA_INIT(guess_euck_st, guess_euck_ar, "EUC-KR"); |
322 guess_dfa johab = DFA_INIT(guess_johab_st, guess_johab_ar); | 251 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8"); |
252 guess_dfa johab = DFA_INIT(guess_johab_st, guess_johab_ar, "JOHAB"); | |
323 guess_dfa *top = NULL; | 253 guess_dfa *top = NULL; |
324 | 254 |
325 guess_dfa *order[] = { ORDER_KR, NULL }; | 255 guess_dfa *order[] = { ORDER_KR, NULL }; |
326 | 256 |
327 for (i = 0; i < buflen; i++) { | 257 for (i = 0; i < buflen; i++) { |
352 if (c == 0xff) | 282 if (c == 0xff) |
353 return UCS_2BE; | 283 return UCS_2BE; |
354 } | 284 } |
355 } | 285 } |
356 | 286 |
357 if (DFA_ALIVE(euck)) { | 287 rv = dfa_process(order, c); |
358 if (!DFA_ALIVE(johab) && !DFA_ALIVE(utf8)) | 288 if(rv) |
359 return "EUC-KR"; | 289 return rv; |
360 DFA_NEXT(euck, c); | 290 |
361 } | 291 if (dfa_none(order)) { |
362 if (DFA_ALIVE(johab)) { | |
363 if (!DFA_ALIVE(euck) && !DFA_ALIVE(utf8)) | |
364 return "JOHAB"; | |
365 DFA_NEXT(johab, c); | |
366 } | |
367 if (DFA_ALIVE(utf8)) { | |
368 if (!DFA_ALIVE(euck) && !DFA_ALIVE(johab)) | |
369 return "UTF-8"; | |
370 DFA_NEXT(utf8, c); | |
371 } | |
372 | |
373 if (!DFA_ALIVE(euck) && !DFA_ALIVE(johab) && !DFA_ALIVE(utf8)) { | |
374 /* we ran out the possibilities */ | 292 /* we ran out the possibilities */ |
375 return NULL; | 293 return NULL; |
376 } | 294 } |
377 } | 295 } |
378 | 296 |
379 /* Now, we have ambiguous code. Pick the highest score. If more than | 297 top = dfa_top(order); |
380 one candidate tie, pick the default encoding. */ | 298 if(top) |
381 for (i = 0; order[i] != NULL; i++) { | 299 return top->name; |
382 if (order[i]->state >= 0) { //DFA_ALIVE() | 300 else |
383 if (top == NULL || order[i]->score > top->score) | 301 return NULL; |
384 top = order[i]; | 302 } |
385 } | |
386 } | |
387 | |
388 if (top == &euck) | |
389 return "EUC-KR"; | |
390 if (top == &utf8) | |
391 return "UTF-8"; | |
392 if (top == &johab) | |
393 return "JOHAB"; | |
394 return NULL; | |
395 } |