comparison libguess/guess.c @ 1105:4be4d74db123 trunk

[svn] automatic character encoding detector for id3 metadata. --enable-chardet enables this feature.
author yaz
date Sat, 27 May 2006 11:02:08 -0700
parents
children e6cc84e06444
comparison
equal deleted inserted replaced
1104:c2fc86e40fba 1105:4be4d74db123
1 /*
2 * This code is derivative of guess.c of Gauche-0.8.3.
3 * The following is the original copyright notice.
4 */
5
6 /*
7 * guess.c - guessing character encoding
8 *
9 * Copyright (c) 2000-2003 Shiro Kawai, All rights reserved.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 *
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 *
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
21 *
22 * 3. Neither the name of the authors nor the names of its contributors
23 * may be used to endorse or promote products derived from this
24 * software without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
28 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
29 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
30 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
31 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
32 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 *
38 */
39
40 #include "libguess.h"
41 #define NULL ((void *)0)
42
43 /* Tie-breakers: define PREFER_<ENC> to make that encoding win when its score equals the current best; all are left undefined here. */
44 #undef PREFER_UTF8
45 #undef PREFER_SJIS
46 #undef PREFER_BIG5
47 #undef PREFER_GB18030
48 #undef PREFER_JOHAB
49
50 /* data types */
/*
 * One transition of a DFA: the state it leads to and the factor that
 * is multiplied into the running score when the transition is taken.
 */
struct guess_arc_rec {
    unsigned int next;  /* state entered after taking this arc */
    double score;       /* multiplier applied to the DFA's score */
};
typedef struct guess_arc_rec guess_arc;

/*
 * A DFA together with its current position.  The member order matters:
 * DFA_INIT fills the structure positionally.
 */
struct guess_dfa_rec {
    signed char (*states)[256]; /* states[s][byte] is an index into arcs;
                                   a negative entry means "no transition" */
    guess_arc *arcs;            /* arc table referenced by states */
    int state;                  /* current state; negative once rejected */
    double score;               /* product of the scores of all arcs taken */
};
typedef struct guess_dfa_rec guess_dfa;
62
63 /* macros */
/*
 * Initializer for a guess_dfa: transition table `st`, arc table `ar`,
 * start state 0 and initial score 1.0.
 */
#define DFA_INIT(st, ar) \
    { (st), (ar), 0, 1.0 }

/*
 * Feed one byte `ch` (0..255) to `dfa`, which must be a guess_dfa lvalue.
 * A DFA that is already dead (state < 0) is left untouched.  Otherwise the
 * transition table is consulted: a negative entry kills the DFA, any other
 * entry selects the arc whose target becomes the new state and whose score
 * is multiplied into the running score.
 *
 * Note that `dfa` is evaluated several times; pass an expression without
 * side effects.
 */
#define DFA_NEXT(dfa, ch)                                           \
    do {                                                            \
        if ((dfa).state >= 0) {                                     \
            int dfa_arc_ = (dfa).states[(dfa).state][(ch)];         \
            if (dfa_arc_ < 0) {                                     \
                (dfa).state = -1;                                   \
            } else {                                                \
                (dfa).state = (dfa).arcs[dfa_arc_].next;            \
                (dfa).score *= (dfa).arcs[dfa_arc_].score;          \
            }                                                       \
        }                                                           \
    } while (0)

/* Non-zero while `dfa` has not rejected the input. */
#define DFA_ALIVE(dfa) ((dfa).state >= 0)
82
83 /* include DFA table generated by guess.scm */
84 #include "guess_tab.c"
85
86 const char *guess_jp(const char *buf, int buflen)
87 {
88 int i;
89 guess_dfa eucj = DFA_INIT(guess_eucj_st, guess_eucj_ar);
90 guess_dfa sjis = DFA_INIT(guess_sjis_st, guess_sjis_ar);
91 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar);
92 guess_dfa *top = NULL;
93
94 for (i=0; i<buflen; i++) {
95 int c = (unsigned char)buf[i];
96
97 /* special treatment of jis escape sequence */
98 if (c == 0x1b) {
99 if (i < buflen-1) {
100 c = (unsigned char)buf[++i];
101 if (c == '$' || c == '(') return "ISO-2022-JP";
102 }
103 }
104
105 /* special treatment of BOM */
106 if (i==0 && c == 0xff) {
107 if (i < buflen-1) {
108 c = (unsigned char)buf[i+1];
109 if (c == 0xfe) return "UCS2-LE";
110 }
111 }
112 if (i==0 && c == 0xfe) {
113 if (i < buflen-1) {
114 c = (unsigned char)buf[i+1];
115 if (c == 0xff) return "UCS2-BE";
116 }
117 }
118
119 if (DFA_ALIVE(eucj)) {
120 if (!DFA_ALIVE(sjis) && !DFA_ALIVE(utf8)) return "EUC-JP";
121 DFA_NEXT(eucj, c);
122 }
123 if (DFA_ALIVE(sjis)) {
124 if (!DFA_ALIVE(eucj) && !DFA_ALIVE(utf8)) return "SJIS";
125 DFA_NEXT(sjis, c);
126 }
127 if (DFA_ALIVE(utf8)) {
128 if (!DFA_ALIVE(sjis) && !DFA_ALIVE(eucj)) return "UTF-8";
129 DFA_NEXT(utf8, c);
130 }
131
132 if (!DFA_ALIVE(eucj) && !DFA_ALIVE(sjis) && !DFA_ALIVE(utf8)) {
133 /* we ran out the possibilities */
134 return NULL;
135 }
136 }
137
138 /* Now, we have ambigous code. Pick the highest score. If more than
139 one candidate tie, pick the default encoding. */
140 if (DFA_ALIVE(eucj)) top = &eucj;
141 if (DFA_ALIVE(utf8)) {
142 if (top) {
143 #if defined PREFER_UTF8
144 if (top->score <= utf8.score) top = &utf8;
145 #else
146 if (top->score < utf8.score) top = &utf8;
147 #endif
148 } else {
149 top = &utf8;
150 }
151 }
152 if (DFA_ALIVE(sjis)) {
153 if (top) {
154 #if defined PREFER_SJIS
155 if (top->score <= sjis.score) top = &sjis;
156 #else
157 if (top->score < sjis.score) top = &sjis;
158 #endif
159 } else {
160 top = &sjis;
161 }
162 }
163
164 if (top == &eucj) return "EUC-JP";
165 if (top == &utf8) return "UTF-8";
166 if (top == &sjis) return "SJIS";
167 return NULL;
168 }
169
170 const char *guess_tw(const char *buf, int buflen)
171 {
172 int i;
173 guess_dfa big5 = DFA_INIT(guess_big5_st, guess_big5_ar);
174 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar);
175 guess_dfa *top = NULL;
176
177 for (i=0; i<buflen; i++) {
178 int c = (unsigned char)buf[i];
179
180 /* special treatment of jis escape sequence */
181 if (c == 0x1b) {
182 if (i < buflen-1) {
183 c = (unsigned char)buf[++i];
184 if (c == '$' || c == '(') return "ISO-2022-TW";
185 }
186 }
187
188 /* special treatment of BOM */
189 if (i==0 && c == 0xff) {
190 if (i < buflen-1) {
191 c = (unsigned char)buf[i+1];
192 if (c == 0xfe) return "UCS2-LE";
193 }
194 }
195 if (i==0 && c == 0xfe) {
196 if (i < buflen-1) {
197 c = (unsigned char)buf[i+1];
198 if (c == 0xff) return "UCS2-BE";
199 }
200 }
201
202 if (DFA_ALIVE(big5)) {
203 if (!DFA_ALIVE(utf8)) return "BIG5";
204 DFA_NEXT(big5, c);
205 }
206 if (DFA_ALIVE(utf8)) {
207 if (!DFA_ALIVE(big5)) return "UTF-8";
208 DFA_NEXT(utf8, c);
209 }
210
211 if (!DFA_ALIVE(big5) && !DFA_ALIVE(utf8)) {
212 /* we ran out the possibilities */
213 return NULL;
214 }
215 }
216
217 /* Now, we have ambigous code. Pick the highest score. If more than
218 one candidate tie, pick the default encoding. */
219 if (DFA_ALIVE(big5)) top = &big5;
220 if (DFA_ALIVE(utf8)) {
221 if (top) {
222 #if defined PREFER_UTF8
223 if (top->score <= utf8.score) top = &utf8;
224 #else
225 if (top->score < utf8.score) top = &utf8;
226 #endif
227 } else {
228 top = &utf8;
229 }
230 }
231
232 if (top == &big5) return "BIG5";
233 if (top == &utf8) return "UTF-8";
234 return NULL;
235 }
236
237 const char *guess_cn(const char *buf, int buflen)
238 {
239 int i;
240 guess_dfa gb2312 = DFA_INIT(guess_gb2312_st, guess_gb2312_ar);
241 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar);
242 guess_dfa gb18030 = DFA_INIT(guess_gb18030_st, guess_gb18030_ar);
243 guess_dfa *top = NULL;
244
245 for (i=0; i<buflen; i++) {
246 int c = (unsigned char)buf[i];
247 int c2;
248
249 /* special treatment of jis escape sequence */
250 if (c == 0x1b) {
251 if (i < buflen-1) {
252 c = (unsigned char)buf[i+1];
253 c2 = (unsigned char)buf[i+2];
254 if (c == '$' && (c2 == ')' || c2 == '+')) return "ISO-2022-CN";
255 }
256 }
257
258 if (DFA_ALIVE(gb2312)) {
259 if (!DFA_ALIVE(utf8) && !DFA_ALIVE(gb18030)) return "GB2312";
260 DFA_NEXT(gb2312, c);
261 }
262 if (DFA_ALIVE(utf8)) {
263 if (!DFA_ALIVE(gb2312) && !DFA_ALIVE(gb18030)) return "UTF-8";
264 DFA_NEXT(utf8, c);
265 }
266 if (DFA_ALIVE(gb18030)) {
267 if (!DFA_ALIVE(utf8) && !DFA_ALIVE(gb2312)) return "GB18030";
268 DFA_NEXT(gb18030, c);
269 }
270
271 if (!DFA_ALIVE(gb2312) && !DFA_ALIVE(utf8) && !DFA_ALIVE(gb18030)) {
272 /* we ran out the possibilities */
273 return NULL;
274 }
275 }
276
277 /* Now, we have ambigous code. Pick the highest score. If more than
278 one candidate tie, pick the default encoding. */
279 if (DFA_ALIVE(gb2312)) top = &gb2312;
280 if (DFA_ALIVE(utf8)) {
281 if (top) {
282 #if defined PREFER_UTF8
283 if (top->score <= utf8.score) top = &utf8;
284 #else
285 if (top->score < utf8.score) top = &utf8;
286 #endif
287 } else {
288 top = &utf8;
289 }
290 }
291 if (DFA_ALIVE(gb18030)) {
292 if (top) {
293 #if defined PREFER_GB18030
294 if (top->score <= gb18030.score) top = &gb18030;
295 #else
296 if (top->score < gb18030.score) top = &gb18030;
297 #endif
298 } else {
299 top = &gb18030;
300 }
301 }
302
303 if (top == &gb2312) return "GB2312";
304 if (top == &utf8) return "UTF-8";
305 if (top == &gb18030) return "GB18030";
306 return NULL;
307 }
308
309 const char *guess_kr(const char *buf, int buflen)
310 {
311 int i;
312 guess_dfa euck = DFA_INIT(guess_euck_st, guess_euck_ar);
313 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar);
314 guess_dfa johab = DFA_INIT(guess_johab_st, guess_johab_ar);
315 guess_dfa *top = NULL;
316
317 for (i=0; i<buflen; i++) {
318 int c = (unsigned char)buf[i];
319 int c2;
320
321 /* special treatment of jis escape sequence */
322 if (c == 0x1b) {
323 if (i < buflen-1) {
324 c = (unsigned char)buf[i+1];
325 c2 = (unsigned char)buf[i+2];
326 if (c == '$' && c2 == ')') return "ISO-2022-KR";
327 }
328 }
329
330 /* special treatment of BOM */
331 if (i==0 && c == 0xff) {
332 if (i < buflen-1) {
333 c = (unsigned char)buf[i+1];
334 if (c == 0xfe) return "UCS2-LE";
335 }
336 }
337 if (i==0 && c == 0xfe) {
338 if (i < buflen-1) {
339 c = (unsigned char)buf[i+1];
340 if (c == 0xff) return "UCS2-BE";
341 }
342 }
343
344 if (DFA_ALIVE(euck)) {
345 if (!DFA_ALIVE(johab) && !DFA_ALIVE(utf8)) return "EUC-KR";
346 DFA_NEXT(euck, c);
347 }
348 if (DFA_ALIVE(johab)) {
349 if (!DFA_ALIVE(euck) && !DFA_ALIVE(utf8)) return "JOHAB";
350 DFA_NEXT(johab, c);
351 }
352 if (DFA_ALIVE(utf8)) {
353 if (!DFA_ALIVE(euck) && !DFA_ALIVE(johab)) return "UTF-8";
354 DFA_NEXT(utf8, c);
355 }
356
357 if (!DFA_ALIVE(euck) && !DFA_ALIVE(johab) && !DFA_ALIVE(utf8)) {
358 /* we ran out the possibilities */
359 return NULL;
360 }
361 }
362
363 /* Now, we have ambigous code. Pick the highest score. If more than
364 one candidate tie, pick the default encoding. */
365 if (DFA_ALIVE(euck)) top = &euck;
366 if (DFA_ALIVE(utf8)) {
367 if (top) {
368 #if defined PREFER_UTF8
369 if (top->score <= utf8.score) top = &utf8;
370 #else
371 if (top->score < utf8.score) top = &utf8;
372 #endif
373 } else {
374 top = &utf8;
375 }
376 }
377 if (DFA_ALIVE(johab)) {
378 if (top) {
379 #if defined PREFER_JOAHB
380 if (top->score <= johab.score) top = &johab;
381 #else
382 if (top->score < johab.score) top = &johab;
383 #endif
384 } else {
385 top = &johab;
386 }
387 }
388
389 if (top == &euck) return "EUC-KR";
390 if (top == &utf8) return "UTF-8";
391 if (top == &johab) return "JOHAB";
392 return NULL;
393 }
394