Mercurial > libguess
comparison cjk_impl.c @ 2:754a4550c64e
- added Arabic, Greek, Hebrew and Turkish DFAs
- new UCS-2LE/BE DFAs
- arabic_impl.c now uses the Arabic DFAs
- common DFA macros have been moved to dfa.h
- minor cleanups
author | Yoshiki Yazawa <yaz@cc.rim.or.jp> |
---|---|
date | Wed, 11 Jun 2008 00:11:30 +0900 |
parents | d9b6ff839eab |
children | 70e2c306231e |
comparison
equal
deleted
inserted
replaced
1:04f2be1c8464 | 2:754a4550c64e |
---|---|
2 * This code is derivative of guess.c of Gauche-0.8.3. | 2 * This code is derivative of guess.c of Gauche-0.8.3. |
3 * The following is the original copyright notice. | 3 * The following is the original copyright notice. |
4 */ | 4 */ |
5 | 5 |
6 /* | 6 /* |
7 * guess.c - guessing character encoding | 7 * guess.c - guessing character encoding |
8 * | 8 * |
9 * Copyright (c) 2000-2003 Shiro Kawai, All rights reserved. | 9 * Copyright (c) 2000-2003 Shiro Kawai, All rights reserved. |
10 * | 10 * |
11 * Redistribution and use in source and binary forms, with or without | 11 * Redistribution and use in source and binary forms, with or without |
12 * modification, are permitted provided that the following conditions | 12 * modification, are permitted provided that the following conditions |
13 * are met: | 13 * are met: |
14 * | 14 * |
15 * 1. Redistributions of source code must retain the above copyright | 15 * 1. Redistributions of source code must retain the above copyright |
16 * notice, this list of conditions and the following disclaimer. | 16 * notice, this list of conditions and the following disclaimer. |
17 * | 17 * |
18 * 2. Redistributions in binary form must reproduce the above copyright | 18 * 2. Redistributions in binary form must reproduce the above copyright |
19 * notice, this list of conditions and the following disclaimer in the | 19 * notice, this list of conditions and the following disclaimer in the |
36 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 36 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
37 * | 37 * |
38 */ | 38 */ |
39 | 39 |
40 #include "libguess.h" | 40 #include "libguess.h" |
41 | 41 #include "dfa.h" |
42 /* take precedence if scores are same. you can customize the order as: */ | |
43 /* ORDER_** &highest, &second, ... &lowest */ | |
44 #define ORDER_JP &utf8, &sjis, &eucj | |
45 #define ORDER_TW &utf8, &big5 | |
46 #define ORDER_CN &utf8, &gb2312, &gb18030 | |
47 #define ORDER_KR &utf8, &euck, &johab | |
48 | 42 |
49 /* workaround for that glib's g_convert can't convert | 43 /* workaround for that glib's g_convert can't convert |
50 properly from UCS-2BE/LE trailing after BOM. */ | 44 properly from UCS-2BE/LE trailing after BOM. */ |
51 #define WITH_G_CONVERT 1 | 45 #define WITH_G_CONVERT 1 |
52 /* #undef WITH_G_CONVERT */ | 46 /* #undef WITH_G_CONVERT */ |
57 #else | 51 #else |
58 const char UCS_2BE[] = "UCS-2BE"; | 52 const char UCS_2BE[] = "UCS-2BE"; |
59 const char UCS_2LE[] = "UCS-2LE"; | 53 const char UCS_2LE[] = "UCS-2LE"; |
60 #endif | 54 #endif |
61 | 55 |
62 /* data types */ | 56 /* take precedence if scores are same. you can customize the order as: */ |
63 typedef struct guess_arc_rec | 57 /* ORDER_** &highest, &second, ... &lowest */ |
64 { | 58 #define ORDER_JP &utf8, &sjis, &eucj |
65 unsigned int next; /* next state */ | 59 #define ORDER_TW &utf8, &big5 |
66 double score; /* score */ | 60 #define ORDER_CN &utf8, &gb2312, &gb18030 |
67 } guess_arc; | 61 #define ORDER_KR &utf8, &euck, &johab |
68 | |
69 typedef struct guess_dfa_rec | |
70 { | |
71 signed char (*states)[256]; | |
72 guess_arc *arcs; | |
73 int state; | |
74 double score; | |
75 } guess_dfa; | |
76 | |
77 /* macros */ | |
78 #define DFA_INIT(st, ar) \ | |
79 { st, ar, 0, 1.0 } | |
80 | |
81 #define DFA_NEXT(dfa, ch) \ | |
82 do { \ | |
83 int arc__; \ | |
84 if (dfa.state >= 0) { \ | |
85 arc__ = dfa.states[dfa.state][ch]; \ | |
86 if (arc__ < 0) { \ | |
87 dfa.state = -1; \ | |
88 } else { \ | |
89 dfa.state = dfa.arcs[arc__].next; \ | |
90 dfa.score *= dfa.arcs[arc__].score; \ | |
91 } \ | |
92 } \ | |
93 } while (0) | |
94 | |
95 #define DFA_ALIVE(dfa) (dfa.state >= 0) | |
96 | 62 |
97 /* include DFA table generated by guess.scm */ | 63 /* include DFA table generated by guess.scm */ |
98 #include "guess_tab.c" | 64 #include "guess_tab.c" |
99 | 65 |
100 | 66 |
114 | 80 |
115 DFA_NEXT(utf8, '\0'); //Bug #53 | 81 DFA_NEXT(utf8, '\0'); //Bug #53 |
116 | 82 |
117 if(DFA_ALIVE(utf8)) | 83 if(DFA_ALIVE(utf8)) |
118 return 1; | 84 return 1; |
119 else | 85 else |
120 return 0; | 86 return 0; |
121 } | 87 } |
122 | 88 |
123 const char *guess_jp(const char *buf, int buflen) | 89 const char *guess_jp(const char *buf, int buflen) |
124 { | 90 { |