Mercurial > libguess
annotate cjk_impl.c @ 3:70e2c306231e
- implemented dfa utility functions.
- added dfa.c.
- rewrote guess functions for ar, gr, hw and tr scripts with dfa utilities.
- guess functions for cjk scripts too.
author | Yoshiki Yazawa <yaz@cc.rim.or.jp> |
---|---|
date | Thu, 12 Jun 2008 20:20:43 +0900 |
parents | 754a4550c64e |
children |
rev | line source |
---|---|
0 | 1 /* |
2 * This code is derivative of guess.c of Gauche-0.8.3. | |
3 * The following is the original copyright notice. | |
4 */ | |
5 | |
6 /* | |
2
754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
0
diff
changeset
|
7 * guess.c - guessing character encoding |
0 | 8 * |
9 * Copyright (c) 2000-2003 Shiro Kawai, All rights reserved. | |
2
754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
0
diff
changeset
|
10 * |
0 | 11 * Redistribution and use in source and binary forms, with or without |
12 * modification, are permitted provided that the following conditions | |
13 * are met: | |
2
754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
0
diff
changeset
|
14 * |
0 | 15 * 1. Redistributions of source code must retain the above copyright |
16 * notice, this list of conditions and the following disclaimer. | |
17 * | |
18 * 2. Redistributions in binary form must reproduce the above copyright | |
19 * notice, this list of conditions and the following disclaimer in the | |
20 * documentation and/or other materials provided with the distribution. | |
21 * | |
22 * 3. Neither the name of the authors nor the names of its contributors | |
23 * may be used to endorse or promote products derived from this | |
24 * software without specific prior written permission. | |
25 * | |
26 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
27 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
28 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
29 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
30 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
31 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED | |
32 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
33 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | |
34 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | |
35 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
36 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
37 * | |
38 */ | |
39 | |
40 #include "libguess.h" | |
2
754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
0
diff
changeset
|
41 #include "dfa.h" |
0 | 42 |
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
43 #include <stdio.h> |
0 | 44 |
2
754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
0
diff
changeset
|
45 /* Encodings listed first take precedence when scores are equal. You can customize the order as: */ |
754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
0
diff
changeset
|
46 /* ORDER_** &highest, &second, ... &lowest */ |
754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
0
diff
changeset
|
47 #define ORDER_JP &utf8, &sjis, &eucj |
754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
0
diff
changeset
|
48 #define ORDER_TW &utf8, &big5 |
754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
0
diff
changeset
|
49 #define ORDER_CN &utf8, &gb2312, &gb18030 |
754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
0
diff
changeset
|
50 #define ORDER_KR &utf8, &euck, &johab |
0 | 51 |
52 /* include DFA table generated by guess.scm */ | |
53 #include "guess_tab.c" | |
54 | |
55 int dfa_validate_utf8(const char *buf, int buflen) | |
56 { | |
57 int i; | |
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
58 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8"); |
0 | 59 |
60 for (i = 0; i < buflen; i++) { | |
61 int c = (unsigned char) buf[i]; | |
62 | |
63 if (DFA_ALIVE(utf8)) | |
64 DFA_NEXT(utf8, c); | |
65 else | |
66 break; | |
67 } | |
68 | |
69 DFA_NEXT(utf8, '\0'); //Bug #53 | |
70 | |
71 if(DFA_ALIVE(utf8)) | |
72 return 1; | |
2
754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
0
diff
changeset
|
73 else |
0 | 74 return 0; |
75 } | |
76 | |
77 const char *guess_jp(const char *buf, int buflen) | |
78 { | |
79 int i; | |
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
80 const char *rv = NULL; |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
81 guess_dfa eucj = DFA_INIT(guess_eucj_st, guess_eucj_ar, "EUC-JP"); |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
82 guess_dfa sjis = DFA_INIT(guess_sjis_st, guess_sjis_ar, "SJIS"); |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
83 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8"); |
0 | 84 guess_dfa *top = NULL; |
85 | |
86 guess_dfa *order[] = { ORDER_JP, NULL }; | |
87 | |
88 for (i = 0; i < buflen; i++) { | |
89 int c = (unsigned char) buf[i]; | |
90 | |
91 /* special treatment of iso-2022 escape sequence */ | |
92 if (c == 0x1b) { | |
93 if (i < buflen - 1) { | |
94 c = (unsigned char) buf[++i]; | |
95 if (c == '$' || c == '(') | |
96 return "ISO-2022-JP"; | |
97 } | |
98 } | |
99 | |
100 /* special treatment of BOM */ | |
101 if (i == 0 && c == 0xff) { | |
102 if (i < buflen - 1) { | |
103 c = (unsigned char) buf[i + 1]; | |
104 if (c == 0xfe) | |
105 return UCS_2LE; | |
106 } | |
107 } | |
108 if (i == 0 && c == 0xfe) { | |
109 if (i < buflen - 1) { | |
110 c = (unsigned char) buf[i + 1]; | |
111 if (c == 0xff) | |
112 return UCS_2BE; | |
113 } | |
114 } | |
115 | |
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
116 rv = dfa_process(order, c); |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
117 if(rv) |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
118 return rv; |
0 | 119 |
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
120 if (dfa_none(order)) { |
0 | 121 /* we ran out the possibilities */ |
122 return NULL; | |
123 } | |
124 } | |
125 | |
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
126 top = dfa_top(order); |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
127 if(top) |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
128 return top->name; |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
129 else |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
130 return NULL; |
0 | 131 } |
132 | |
133 const char *guess_tw(const char *buf, int buflen) | |
134 { | |
135 int i; | |
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
136 const char *rv = NULL; |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
137 guess_dfa big5 = DFA_INIT(guess_big5_st, guess_big5_ar, "BIG5"); |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
138 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8"); |
0 | 139 guess_dfa *top = NULL; |
140 | |
141 guess_dfa *order[] = { ORDER_TW, NULL }; | |
142 | |
143 for (i = 0; i < buflen; i++) { | |
144 int c = (unsigned char) buf[i]; | |
145 | |
146 /* special treatment of iso-2022 escape sequence */ | |
147 if (c == 0x1b) { | |
148 if (i < buflen - 1) { | |
149 c = (unsigned char) buf[++i]; | |
150 if (c == '$' || c == '(') | |
151 return "ISO-2022-TW"; | |
152 } | |
153 } | |
154 | |
155 /* special treatment of BOM */ | |
156 if (i == 0 && c == 0xff) { | |
157 if (i < buflen - 1) { | |
158 c = (unsigned char) buf[i + 1]; | |
159 if (c == 0xfe) | |
160 return UCS_2LE; | |
161 } | |
162 } | |
163 if (i == 0 && c == 0xfe) { | |
164 if (i < buflen - 1) { | |
165 c = (unsigned char) buf[i + 1]; | |
166 if (c == 0xff) | |
167 return UCS_2BE; | |
168 } | |
169 } | |
170 | |
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
171 rv = dfa_process(order, c); |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
172 if(rv) |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
173 return rv; |
0 | 174 |
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
175 if (dfa_none(order)) { |
0 | 176 /* we ran out the possibilities */ |
177 return NULL; | |
178 } | |
179 } | |
180 | |
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
181 top = dfa_top(order); |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
182 if (top) |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
183 return top->name; |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
184 else |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
185 return NULL; |
0 | 186 } |
187 | |
188 const char *guess_cn(const char *buf, int buflen) | |
189 { | |
190 int i; | |
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
191 const char *rv = NULL; |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
192 guess_dfa gb2312 = DFA_INIT(guess_gb2312_st, guess_gb2312_ar, "GB2312"); |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
193 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8"); |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
194 guess_dfa gb18030 = DFA_INIT(guess_gb18030_st, guess_gb18030_ar, "GB18030"); |
0 | 195 guess_dfa *top = NULL; |
196 | |
197 guess_dfa *order[] = { ORDER_CN, NULL }; | |
198 | |
199 for (i = 0; i < buflen; i++) { | |
200 int c = (unsigned char) buf[i]; | |
201 int c2; | |
202 | |
203 /* special treatment of iso-2022 escape sequence */ | |
204 if (c == 0x1b) { | |
205 if (i < buflen - 1) { | |
206 c = (unsigned char) buf[i + 1]; | |
207 c2 = (unsigned char) buf[i + 2]; | |
208 if (c == '$' && (c2 == ')' || c2 == '+')) | |
209 return "ISO-2022-CN"; | |
210 } | |
211 } | |
212 | |
213 /* special treatment of BOM */ | |
214 if (i == 0 && c == 0xff) { | |
215 if (i < buflen - 1) { | |
216 c = (unsigned char) buf[i + 1]; | |
217 if (c == 0xfe) | |
218 return UCS_2LE; | |
219 } | |
220 } | |
221 if (i == 0 && c == 0xfe) { | |
222 if (i < buflen - 1) { | |
223 c = (unsigned char) buf[i + 1]; | |
224 if (c == 0xff) | |
225 return UCS_2BE; | |
226 } | |
227 } | |
228 | |
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
229 rv = dfa_process(order, c); |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
230 if(rv) |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
231 return rv; |
0 | 232 |
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
233 if (dfa_none(order)) { |
0 | 234 /* we ran out the possibilities */ |
235 return NULL; | |
236 } | |
237 } | |
238 | |
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
239 top = dfa_top(order); |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
240 if(top) |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
241 return top->name; |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
242 else |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
243 return NULL; |
0 | 244 } |
245 | |
246 const char *guess_kr(const char *buf, int buflen) | |
247 { | |
248 int i; | |
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
249 const char *rv = NULL; |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
250 guess_dfa euck = DFA_INIT(guess_euck_st, guess_euck_ar, "EUC-KR"); |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
251 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8"); |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
252 guess_dfa johab = DFA_INIT(guess_johab_st, guess_johab_ar, "JOHAB"); |
0 | 253 guess_dfa *top = NULL; |
254 | |
255 guess_dfa *order[] = { ORDER_KR, NULL }; | |
256 | |
257 for (i = 0; i < buflen; i++) { | |
258 int c = (unsigned char) buf[i]; | |
259 int c2; | |
260 | |
261 /* special treatment of iso-2022 escape sequence */ | |
262 if (c == 0x1b) { | |
263 if (i < buflen - 1) { | |
264 c = (unsigned char) buf[i + 1]; | |
265 c2 = (unsigned char) buf[i + 2]; | |
266 if (c == '$' && c2 == ')') | |
267 return "ISO-2022-KR"; | |
268 } | |
269 } | |
270 | |
271 /* special treatment of BOM */ | |
272 if (i == 0 && c == 0xff) { | |
273 if (i < buflen - 1) { | |
274 c = (unsigned char) buf[i + 1]; | |
275 if (c == 0xfe) | |
276 return UCS_2LE; | |
277 } | |
278 } | |
279 if (i == 0 && c == 0xfe) { | |
280 if (i < buflen - 1) { | |
281 c = (unsigned char) buf[i + 1]; | |
282 if (c == 0xff) | |
283 return UCS_2BE; | |
284 } | |
285 } | |
286 | |
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
287 rv = dfa_process(order, c); |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
288 if(rv) |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
289 return rv; |
0 | 290 |
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
291 if (dfa_none(order)) { |
0 | 292 /* we ran out the possibilities */ |
293 return NULL; | |
294 } | |
295 } | |
296 | |
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
297 top = dfa_top(order); |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
298 if(top) |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
299 return top->name; |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
300 else |
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
301 return NULL; |
0 | 302 } |