comparison cjk_impl.c @ 3:70e2c306231e

- Implemented DFA utility functions. - Added dfa.c. - Rewrote the guess functions for the ar, gr, hw and tr scripts using the DFA utilities. - Rewrote the guess functions for the CJK scripts in the same way.
author Yoshiki Yazawa <yaz@cc.rim.or.jp>
date Thu, 12 Jun 2008 20:20:43 +0900
parents 754a4550c64e
children
comparison
equal deleted inserted replaced
2:754a4550c64e 3:70e2c306231e
38 */ 38 */
39 39
40 #include "libguess.h" 40 #include "libguess.h"
41 #include "dfa.h" 41 #include "dfa.h"
42 42
43 /* Workaround: glib's g_convert cannot properly convert 43 #include <stdio.h>
44 UCS-2BE/LE data that follows a BOM. */
45 #define WITH_G_CONVERT 1
46 /* #undef WITH_G_CONVERT */
47
48 #ifdef WITH_G_CONVERT
49 const char UCS_2BE[] = "UTF-16";
50 const char UCS_2LE[] = "UTF-16";
51 #else
52 const char UCS_2BE[] = "UCS-2BE";
53 const char UCS_2LE[] = "UCS-2LE";
54 #endif
55 44
56 /* Precedence used when scores are the same. You can customize the order as: */ 45 /* Precedence used when scores are the same. You can customize the order as: */
57 /* ORDER_** &highest, &second, ... &lowest */ 46 /* ORDER_** &highest, &second, ... &lowest */
58 #define ORDER_JP &utf8, &sjis, &eucj 47 #define ORDER_JP &utf8, &sjis, &eucj
59 #define ORDER_TW &utf8, &big5 48 #define ORDER_TW &utf8, &big5
61 #define ORDER_KR &utf8, &euck, &johab 50 #define ORDER_KR &utf8, &euck, &johab
62 51
63 /* include DFA table generated by guess.scm */ 52 /* include DFA table generated by guess.scm */
64 #include "guess_tab.c" 53 #include "guess_tab.c"
65 54
66
67 int dfa_validate_utf8(const char *buf, int buflen) 55 int dfa_validate_utf8(const char *buf, int buflen)
68 { 56 {
69 int i; 57 int i;
70 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar); 58 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8");
71 59
72 for (i = 0; i < buflen; i++) { 60 for (i = 0; i < buflen; i++) {
73 int c = (unsigned char) buf[i]; 61 int c = (unsigned char) buf[i];
74 62
75 if (DFA_ALIVE(utf8)) 63 if (DFA_ALIVE(utf8))
87 } 75 }
88 76
89 const char *guess_jp(const char *buf, int buflen) 77 const char *guess_jp(const char *buf, int buflen)
90 { 78 {
91 int i; 79 int i;
92 guess_dfa eucj = DFA_INIT(guess_eucj_st, guess_eucj_ar); 80 const char *rv = NULL;
93 guess_dfa sjis = DFA_INIT(guess_sjis_st, guess_sjis_ar); 81 guess_dfa eucj = DFA_INIT(guess_eucj_st, guess_eucj_ar, "EUC-JP");
94 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar); 82 guess_dfa sjis = DFA_INIT(guess_sjis_st, guess_sjis_ar, "SJIS");
83 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8");
95 guess_dfa *top = NULL; 84 guess_dfa *top = NULL;
96 85
97 guess_dfa *order[] = { ORDER_JP, NULL }; 86 guess_dfa *order[] = { ORDER_JP, NULL };
98 87
99 for (i = 0; i < buflen; i++) { 88 for (i = 0; i < buflen; i++) {
122 if (c == 0xff) 111 if (c == 0xff)
123 return UCS_2BE; 112 return UCS_2BE;
124 } 113 }
125 } 114 }
126 115
127 if (DFA_ALIVE(eucj)) { 116 rv = dfa_process(order, c);
128 if (!DFA_ALIVE(sjis) && !DFA_ALIVE(utf8)) 117 if(rv)
129 return "EUC-JP"; 118 return rv;
130 DFA_NEXT(eucj, c); 119
131 } 120 if (dfa_none(order)) {
132 if (DFA_ALIVE(sjis)) {
133 if (!DFA_ALIVE(eucj) && !DFA_ALIVE(utf8))
134 return "SJIS";
135 DFA_NEXT(sjis, c);
136 }
137 if (DFA_ALIVE(utf8)) {
138 if (!DFA_ALIVE(sjis) && !DFA_ALIVE(eucj))
139 return "UTF-8";
140 DFA_NEXT(utf8, c);
141 }
142
143 if (!DFA_ALIVE(eucj) && !DFA_ALIVE(sjis) && !DFA_ALIVE(utf8)) {
144 /* we ran out of possibilities */ 121 /* we ran out of possibilities */
145 return NULL; 122 return NULL;
146 } 123 }
147 } 124 }
148 125
149 /* Now we have ambiguous code. Pick the highest score. If more than 126 top = dfa_top(order);
150 one candidate ties, pick the default encoding. */ 127 if(top)
151 for (i = 0; order[i] != NULL; i++) { 128 return top->name;
152 if (order[i]->state >= 0) { //DFA_ALIVE() 129 else
153 if (top == NULL || order[i]->score > top->score) 130 return NULL;
154 top = order[i];
155 }
156 }
157
158 if (top == &eucj)
159 return "EUC-JP";
160 if (top == &utf8)
161 return "UTF-8";
162 if (top == &sjis)
163 return "SJIS";
164 return NULL;
165 } 131 }
166 132
167 const char *guess_tw(const char *buf, int buflen) 133 const char *guess_tw(const char *buf, int buflen)
168 { 134 {
169 int i; 135 int i;
170 guess_dfa big5 = DFA_INIT(guess_big5_st, guess_big5_ar); 136 const char *rv = NULL;
171 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar); 137 guess_dfa big5 = DFA_INIT(guess_big5_st, guess_big5_ar, "BIG5");
138 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8");
172 guess_dfa *top = NULL; 139 guess_dfa *top = NULL;
173 140
174 guess_dfa *order[] = { ORDER_TW, NULL }; 141 guess_dfa *order[] = { ORDER_TW, NULL };
175 142
176 for (i = 0; i < buflen; i++) { 143 for (i = 0; i < buflen; i++) {
199 if (c == 0xff) 166 if (c == 0xff)
200 return UCS_2BE; 167 return UCS_2BE;
201 } 168 }
202 } 169 }
203 170
204 if (DFA_ALIVE(big5)) { 171 rv = dfa_process(order, c);
205 if (!DFA_ALIVE(utf8)) 172 if(rv)
206 return "BIG5"; 173 return rv;
207 DFA_NEXT(big5, c); 174
208 } 175 if (dfa_none(order)) {
209 if (DFA_ALIVE(utf8)) {
210 if (!DFA_ALIVE(big5))
211 return "UTF-8";
212 DFA_NEXT(utf8, c);
213 }
214
215 if (!DFA_ALIVE(big5) && !DFA_ALIVE(utf8)) {
216 /* we ran out of possibilities */ 176 /* we ran out of possibilities */
217 return NULL; 177 return NULL;
218 } 178 }
219 } 179 }
220 180
221 /* Now we have ambiguous code. Pick the highest score. If more than 181 top = dfa_top(order);
222 one candidate ties, pick the default encoding. */ 182 if (top)
223 for (i = 0; order[i] != NULL; i++) { 183 return top->name;
224 if (order[i]->state >= 0) { //DFA_ALIVE() 184 else
225 if (top == NULL || order[i]->score > top->score) 185 return NULL;
226 top = order[i];
227 }
228 }
229
230 if (top == &big5)
231 return "BIG5";
232 if (top == &utf8)
233 return "UTF-8";
234 return NULL;
235 } 186 }
236 187
237 const char *guess_cn(const char *buf, int buflen) 188 const char *guess_cn(const char *buf, int buflen)
238 { 189 {
239 int i; 190 int i;
240 guess_dfa gb2312 = DFA_INIT(guess_gb2312_st, guess_gb2312_ar); 191 const char *rv = NULL;
241 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar); 192 guess_dfa gb2312 = DFA_INIT(guess_gb2312_st, guess_gb2312_ar, "GB2312");
242 guess_dfa gb18030 = DFA_INIT(guess_gb18030_st, guess_gb18030_ar); 193 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8");
194 guess_dfa gb18030 = DFA_INIT(guess_gb18030_st, guess_gb18030_ar, "GB18030");
243 guess_dfa *top = NULL; 195 guess_dfa *top = NULL;
244 196
245 guess_dfa *order[] = { ORDER_CN, NULL }; 197 guess_dfa *order[] = { ORDER_CN, NULL };
246 198
247 for (i = 0; i < buflen; i++) { 199 for (i = 0; i < buflen; i++) {
272 if (c == 0xff) 224 if (c == 0xff)
273 return UCS_2BE; 225 return UCS_2BE;
274 } 226 }
275 } 227 }
276 228
277 if (DFA_ALIVE(gb2312)) { 229 rv = dfa_process(order, c);
278 if (!DFA_ALIVE(utf8) && !DFA_ALIVE(gb18030)) 230 if(rv)
279 return "GB2312"; 231 return rv;
280 DFA_NEXT(gb2312, c); 232
281 } 233 if (dfa_none(order)) {
282 if (DFA_ALIVE(utf8)) {
283 if (!DFA_ALIVE(gb2312) && !DFA_ALIVE(gb18030))
284 return "UTF-8";
285 DFA_NEXT(utf8, c);
286 }
287 if (DFA_ALIVE(gb18030)) {
288 if (!DFA_ALIVE(utf8) && !DFA_ALIVE(gb2312))
289 return "GB18030";
290 DFA_NEXT(gb18030, c);
291 }
292
293 if (!DFA_ALIVE(gb2312) && !DFA_ALIVE(utf8) && !DFA_ALIVE(gb18030)) {
294 /* we ran out of possibilities */ 234 /* we ran out of possibilities */
295 return NULL; 235 return NULL;
296 } 236 }
297 } 237 }
298 238
299 /* Now we have ambiguous code. Pick the highest score. If more than 239 top = dfa_top(order);
300 one candidate ties, pick the default encoding. */ 240 if(top)
301 for (i = 0; order[i] != NULL; i++) { 241 return top->name;
302 if (order[i]->state >= 0) { //DFA_ALIVE() 242 else
303 if (top == NULL || order[i]->score > top->score) 243 return NULL;
304 top = order[i];
305 }
306 }
307
308 if (top == &gb2312)
309 return "GB2312";
310 if (top == &utf8)
311 return "UTF-8";
312 if (top == &gb18030)
313 return "GB18030";
314 return NULL;
315 } 244 }
316 245
317 const char *guess_kr(const char *buf, int buflen) 246 const char *guess_kr(const char *buf, int buflen)
318 { 247 {
319 int i; 248 int i;
320 guess_dfa euck = DFA_INIT(guess_euck_st, guess_euck_ar); 249 const char *rv = NULL;
321 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar); 250 guess_dfa euck = DFA_INIT(guess_euck_st, guess_euck_ar, "EUC-KR");
322 guess_dfa johab = DFA_INIT(guess_johab_st, guess_johab_ar); 251 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8");
252 guess_dfa johab = DFA_INIT(guess_johab_st, guess_johab_ar, "JOHAB");
323 guess_dfa *top = NULL; 253 guess_dfa *top = NULL;
324 254
325 guess_dfa *order[] = { ORDER_KR, NULL }; 255 guess_dfa *order[] = { ORDER_KR, NULL };
326 256
327 for (i = 0; i < buflen; i++) { 257 for (i = 0; i < buflen; i++) {
352 if (c == 0xff) 282 if (c == 0xff)
353 return UCS_2BE; 283 return UCS_2BE;
354 } 284 }
355 } 285 }
356 286
357 if (DFA_ALIVE(euck)) { 287 rv = dfa_process(order, c);
358 if (!DFA_ALIVE(johab) && !DFA_ALIVE(utf8)) 288 if(rv)
359 return "EUC-KR"; 289 return rv;
360 DFA_NEXT(euck, c); 290
361 } 291 if (dfa_none(order)) {
362 if (DFA_ALIVE(johab)) {
363 if (!DFA_ALIVE(euck) && !DFA_ALIVE(utf8))
364 return "JOHAB";
365 DFA_NEXT(johab, c);
366 }
367 if (DFA_ALIVE(utf8)) {
368 if (!DFA_ALIVE(euck) && !DFA_ALIVE(johab))
369 return "UTF-8";
370 DFA_NEXT(utf8, c);
371 }
372
373 if (!DFA_ALIVE(euck) && !DFA_ALIVE(johab) && !DFA_ALIVE(utf8)) {
374 /* we ran out of possibilities */ 292 /* we ran out of possibilities */
375 return NULL; 293 return NULL;
376 } 294 }
377 } 295 }
378 296
379 /* Now we have ambiguous code. Pick the highest score. If more than 297 top = dfa_top(order);
380 one candidate ties, pick the default encoding. */ 298 if(top)
381 for (i = 0; order[i] != NULL; i++) { 299 return top->name;
382 if (order[i]->state >= 0) { //DFA_ALIVE() 300 else
383 if (top == NULL || order[i]->score > top->score) 301 return NULL;
384 top = order[i]; 302 }
385 }
386 }
387
388 if (top == &euck)
389 return "EUC-KR";
390 if (top == &utf8)
391 return "UTF-8";
392 if (top == &johab)
393 return "JOHAB";
394 return NULL;
395 }