Mercurial > audlegacy
comparison libguess/guess.c @ 1105:4be4d74db123 trunk
[svn] automatic character encoding detector for id3 metadata. --enable-chardet enables this feature.
author | yaz |
---|---|
date | Sat, 27 May 2006 11:02:08 -0700 |
parents | |
children | e6cc84e06444 |
comparison
equal
deleted
inserted
replaced
1104:c2fc86e40fba | 1105:4be4d74db123 |
---|---|
1 /* | |
2 * This code is derivative of guess.c of Gauche-0.8.3. | |
3 * The following is the original copyright notice. | |
4 */ | |
5 | |
6 /* | |
7 * guess.c - guessing character encoding | |
8 * | |
9 * Copyright (c) 2000-2003 Shiro Kawai, All rights reserved. | |
10 * | |
11 * Redistribution and use in source and binary forms, with or without | |
12 * modification, are permitted provided that the following conditions | |
13 * are met: | |
14 * | |
15 * 1. Redistributions of source code must retain the above copyright | |
16 * notice, this list of conditions and the following disclaimer. | |
17 * | |
18 * 2. Redistributions in binary form must reproduce the above copyright | |
19 * notice, this list of conditions and the following disclaimer in the | |
20 * documentation and/or other materials provided with the distribution. | |
21 * | |
22 * 3. Neither the name of the authors nor the names of its contributors | |
23 * may be used to endorse or promote products derived from this | |
24 * software without specific prior written permission. | |
25 * | |
26 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
27 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
28 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
29 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
30 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
31 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED | |
32 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
33 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | |
34 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | |
35 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
36 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
37 * | |
38 */ | |
39 | |
40 #include "libguess.h" | |
41 #define NULL ((void *)0) | |
42 | |
43 /* take precedence if scores are same. */ | |
44 #undef PREFER_UTF8 | |
45 #undef PREFER_SJIS | |
46 #undef PREFER_BIG5 | |
47 #undef PREFER_GB18030 | |
48 #undef PREFER_JOHAB | |
49 | |
50 /* data types */ | |
/* One transition (arc) of a guessing DFA. */
typedef struct guess_arc_rec {
    unsigned int next;          /* next state */
    double score;               /* score */
} guess_arc;

/* Running state of one candidate-encoding DFA. */
typedef struct guess_dfa_rec {
    signed char (*states)[256]; /* states[s][byte] is an arc index; negative means "no arc" */
    guess_arc *arcs;            /* arc table indexed by the values stored in states */
    int state;                  /* current state, or -1 once a byte has been rejected */
    double score;               /* product of the scores of every arc taken so far */
} guess_dfa;

/* macros */

/*
 * Brace initializer for a guess_dfa: start in state 0 with score 1.0.
 * The field order must match the declaration of guess_dfa above.
 */
#define DFA_INIT(st, ar) \
    { (st), (ar), 0, 1.0 }

/*
 * Feed one byte `ch` (0..255) to `dfa` (a guess_dfa lvalue).
 * A DFA that is already dead (state < 0) is left untouched and `ch` is not
 * evaluated.  If the current state has no arc for `ch` the DFA dies;
 * otherwise it moves to the arc's target state and its score is multiplied
 * by the arc's score.
 * Note: `dfa` is expanded several times, so it must be free of side effects.
 */
#define DFA_NEXT(dfa, ch)                                              \
    do {                                                               \
        if ((dfa).state >= 0) {                                        \
            int dfa_next_arc_ = (dfa).states[(dfa).state][(ch)];       \
            if (dfa_next_arc_ < 0) {                                   \
                (dfa).state = -1;                                      \
            } else {                                                   \
                (dfa).state = (dfa).arcs[dfa_next_arc_].next;          \
                (dfa).score *= (dfa).arcs[dfa_next_arc_].score;        \
            }                                                          \
        }                                                              \
    } while (0)

/* Non-zero while `dfa` has not rejected any input byte. */
#define DFA_ALIVE(dfa) ((dfa).state >= 0)
82 | |
83 /* include DFA table generated by guess.scm */ | |
84 #include "guess_tab.c" | |
85 | |
86 const char *guess_jp(const char *buf, int buflen) | |
87 { | |
88 int i; | |
89 guess_dfa eucj = DFA_INIT(guess_eucj_st, guess_eucj_ar); | |
90 guess_dfa sjis = DFA_INIT(guess_sjis_st, guess_sjis_ar); | |
91 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar); | |
92 guess_dfa *top = NULL; | |
93 | |
94 for (i=0; i<buflen; i++) { | |
95 int c = (unsigned char)buf[i]; | |
96 | |
97 /* special treatment of jis escape sequence */ | |
98 if (c == 0x1b) { | |
99 if (i < buflen-1) { | |
100 c = (unsigned char)buf[++i]; | |
101 if (c == '$' || c == '(') return "ISO-2022-JP"; | |
102 } | |
103 } | |
104 | |
105 /* special treatment of BOM */ | |
106 if (i==0 && c == 0xff) { | |
107 if (i < buflen-1) { | |
108 c = (unsigned char)buf[i+1]; | |
109 if (c == 0xfe) return "UCS2-LE"; | |
110 } | |
111 } | |
112 if (i==0 && c == 0xfe) { | |
113 if (i < buflen-1) { | |
114 c = (unsigned char)buf[i+1]; | |
115 if (c == 0xff) return "UCS2-BE"; | |
116 } | |
117 } | |
118 | |
119 if (DFA_ALIVE(eucj)) { | |
120 if (!DFA_ALIVE(sjis) && !DFA_ALIVE(utf8)) return "EUC-JP"; | |
121 DFA_NEXT(eucj, c); | |
122 } | |
123 if (DFA_ALIVE(sjis)) { | |
124 if (!DFA_ALIVE(eucj) && !DFA_ALIVE(utf8)) return "SJIS"; | |
125 DFA_NEXT(sjis, c); | |
126 } | |
127 if (DFA_ALIVE(utf8)) { | |
128 if (!DFA_ALIVE(sjis) && !DFA_ALIVE(eucj)) return "UTF-8"; | |
129 DFA_NEXT(utf8, c); | |
130 } | |
131 | |
132 if (!DFA_ALIVE(eucj) && !DFA_ALIVE(sjis) && !DFA_ALIVE(utf8)) { | |
133 /* we ran out the possibilities */ | |
134 return NULL; | |
135 } | |
136 } | |
137 | |
138 /* Now, we have ambigous code. Pick the highest score. If more than | |
139 one candidate tie, pick the default encoding. */ | |
140 if (DFA_ALIVE(eucj)) top = &eucj; | |
141 if (DFA_ALIVE(utf8)) { | |
142 if (top) { | |
143 #if defined PREFER_UTF8 | |
144 if (top->score <= utf8.score) top = &utf8; | |
145 #else | |
146 if (top->score < utf8.score) top = &utf8; | |
147 #endif | |
148 } else { | |
149 top = &utf8; | |
150 } | |
151 } | |
152 if (DFA_ALIVE(sjis)) { | |
153 if (top) { | |
154 #if defined PREFER_SJIS | |
155 if (top->score <= sjis.score) top = &sjis; | |
156 #else | |
157 if (top->score < sjis.score) top = &sjis; | |
158 #endif | |
159 } else { | |
160 top = &sjis; | |
161 } | |
162 } | |
163 | |
164 if (top == &eucj) return "EUC-JP"; | |
165 if (top == &utf8) return "UTF-8"; | |
166 if (top == &sjis) return "SJIS"; | |
167 return NULL; | |
168 } | |
169 | |
170 const char *guess_tw(const char *buf, int buflen) | |
171 { | |
172 int i; | |
173 guess_dfa big5 = DFA_INIT(guess_big5_st, guess_big5_ar); | |
174 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar); | |
175 guess_dfa *top = NULL; | |
176 | |
177 for (i=0; i<buflen; i++) { | |
178 int c = (unsigned char)buf[i]; | |
179 | |
180 /* special treatment of jis escape sequence */ | |
181 if (c == 0x1b) { | |
182 if (i < buflen-1) { | |
183 c = (unsigned char)buf[++i]; | |
184 if (c == '$' || c == '(') return "ISO-2022-TW"; | |
185 } | |
186 } | |
187 | |
188 /* special treatment of BOM */ | |
189 if (i==0 && c == 0xff) { | |
190 if (i < buflen-1) { | |
191 c = (unsigned char)buf[i+1]; | |
192 if (c == 0xfe) return "UCS2-LE"; | |
193 } | |
194 } | |
195 if (i==0 && c == 0xfe) { | |
196 if (i < buflen-1) { | |
197 c = (unsigned char)buf[i+1]; | |
198 if (c == 0xff) return "UCS2-BE"; | |
199 } | |
200 } | |
201 | |
202 if (DFA_ALIVE(big5)) { | |
203 if (!DFA_ALIVE(utf8)) return "BIG5"; | |
204 DFA_NEXT(big5, c); | |
205 } | |
206 if (DFA_ALIVE(utf8)) { | |
207 if (!DFA_ALIVE(big5)) return "UTF-8"; | |
208 DFA_NEXT(utf8, c); | |
209 } | |
210 | |
211 if (!DFA_ALIVE(big5) && !DFA_ALIVE(utf8)) { | |
212 /* we ran out the possibilities */ | |
213 return NULL; | |
214 } | |
215 } | |
216 | |
217 /* Now, we have ambigous code. Pick the highest score. If more than | |
218 one candidate tie, pick the default encoding. */ | |
219 if (DFA_ALIVE(big5)) top = &big5; | |
220 if (DFA_ALIVE(utf8)) { | |
221 if (top) { | |
222 #if defined PREFER_UTF8 | |
223 if (top->score <= utf8.score) top = &utf8; | |
224 #else | |
225 if (top->score < utf8.score) top = &utf8; | |
226 #endif | |
227 } else { | |
228 top = &utf8; | |
229 } | |
230 } | |
231 | |
232 if (top == &big5) return "BIG5"; | |
233 if (top == &utf8) return "UTF-8"; | |
234 return NULL; | |
235 } | |
236 | |
237 const char *guess_cn(const char *buf, int buflen) | |
238 { | |
239 int i; | |
240 guess_dfa gb2312 = DFA_INIT(guess_gb2312_st, guess_gb2312_ar); | |
241 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar); | |
242 guess_dfa gb18030 = DFA_INIT(guess_gb18030_st, guess_gb18030_ar); | |
243 guess_dfa *top = NULL; | |
244 | |
245 for (i=0; i<buflen; i++) { | |
246 int c = (unsigned char)buf[i]; | |
247 int c2; | |
248 | |
249 /* special treatment of jis escape sequence */ | |
250 if (c == 0x1b) { | |
251 if (i < buflen-1) { | |
252 c = (unsigned char)buf[i+1]; | |
253 c2 = (unsigned char)buf[i+2]; | |
254 if (c == '$' && (c2 == ')' || c2 == '+')) return "ISO-2022-CN"; | |
255 } | |
256 } | |
257 | |
258 if (DFA_ALIVE(gb2312)) { | |
259 if (!DFA_ALIVE(utf8) && !DFA_ALIVE(gb18030)) return "GB2312"; | |
260 DFA_NEXT(gb2312, c); | |
261 } | |
262 if (DFA_ALIVE(utf8)) { | |
263 if (!DFA_ALIVE(gb2312) && !DFA_ALIVE(gb18030)) return "UTF-8"; | |
264 DFA_NEXT(utf8, c); | |
265 } | |
266 if (DFA_ALIVE(gb18030)) { | |
267 if (!DFA_ALIVE(utf8) && !DFA_ALIVE(gb2312)) return "GB18030"; | |
268 DFA_NEXT(gb18030, c); | |
269 } | |
270 | |
271 if (!DFA_ALIVE(gb2312) && !DFA_ALIVE(utf8) && !DFA_ALIVE(gb18030)) { | |
272 /* we ran out the possibilities */ | |
273 return NULL; | |
274 } | |
275 } | |
276 | |
277 /* Now, we have ambigous code. Pick the highest score. If more than | |
278 one candidate tie, pick the default encoding. */ | |
279 if (DFA_ALIVE(gb2312)) top = &gb2312; | |
280 if (DFA_ALIVE(utf8)) { | |
281 if (top) { | |
282 #if defined PREFER_UTF8 | |
283 if (top->score <= utf8.score) top = &utf8; | |
284 #else | |
285 if (top->score < utf8.score) top = &utf8; | |
286 #endif | |
287 } else { | |
288 top = &utf8; | |
289 } | |
290 } | |
291 if (DFA_ALIVE(gb18030)) { | |
292 if (top) { | |
293 #if defined PREFER_GB18030 | |
294 if (top->score <= gb18030.score) top = &gb18030; | |
295 #else | |
296 if (top->score < gb18030.score) top = &gb18030; | |
297 #endif | |
298 } else { | |
299 top = &gb18030; | |
300 } | |
301 } | |
302 | |
303 if (top == &gb2312) return "GB2312"; | |
304 if (top == &utf8) return "UTF-8"; | |
305 if (top == &gb18030) return "GB18030"; | |
306 return NULL; | |
307 } | |
308 | |
309 const char *guess_kr(const char *buf, int buflen) | |
310 { | |
311 int i; | |
312 guess_dfa euck = DFA_INIT(guess_euck_st, guess_euck_ar); | |
313 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar); | |
314 guess_dfa johab = DFA_INIT(guess_johab_st, guess_johab_ar); | |
315 guess_dfa *top = NULL; | |
316 | |
317 for (i=0; i<buflen; i++) { | |
318 int c = (unsigned char)buf[i]; | |
319 int c2; | |
320 | |
321 /* special treatment of jis escape sequence */ | |
322 if (c == 0x1b) { | |
323 if (i < buflen-1) { | |
324 c = (unsigned char)buf[i+1]; | |
325 c2 = (unsigned char)buf[i+2]; | |
326 if (c == '$' && c2 == ')') return "ISO-2022-KR"; | |
327 } | |
328 } | |
329 | |
330 /* special treatment of BOM */ | |
331 if (i==0 && c == 0xff) { | |
332 if (i < buflen-1) { | |
333 c = (unsigned char)buf[i+1]; | |
334 if (c == 0xfe) return "UCS2-LE"; | |
335 } | |
336 } | |
337 if (i==0 && c == 0xfe) { | |
338 if (i < buflen-1) { | |
339 c = (unsigned char)buf[i+1]; | |
340 if (c == 0xff) return "UCS2-BE"; | |
341 } | |
342 } | |
343 | |
344 if (DFA_ALIVE(euck)) { | |
345 if (!DFA_ALIVE(johab) && !DFA_ALIVE(utf8)) return "EUC-KR"; | |
346 DFA_NEXT(euck, c); | |
347 } | |
348 if (DFA_ALIVE(johab)) { | |
349 if (!DFA_ALIVE(euck) && !DFA_ALIVE(utf8)) return "JOHAB"; | |
350 DFA_NEXT(johab, c); | |
351 } | |
352 if (DFA_ALIVE(utf8)) { | |
353 if (!DFA_ALIVE(euck) && !DFA_ALIVE(johab)) return "UTF-8"; | |
354 DFA_NEXT(utf8, c); | |
355 } | |
356 | |
357 if (!DFA_ALIVE(euck) && !DFA_ALIVE(johab) && !DFA_ALIVE(utf8)) { | |
358 /* we ran out the possibilities */ | |
359 return NULL; | |
360 } | |
361 } | |
362 | |
363 /* Now, we have ambigous code. Pick the highest score. If more than | |
364 one candidate tie, pick the default encoding. */ | |
365 if (DFA_ALIVE(euck)) top = &euck; | |
366 if (DFA_ALIVE(utf8)) { | |
367 if (top) { | |
368 #if defined PREFER_UTF8 | |
369 if (top->score <= utf8.score) top = &utf8; | |
370 #else | |
371 if (top->score < utf8.score) top = &utf8; | |
372 #endif | |
373 } else { | |
374 top = &utf8; | |
375 } | |
376 } | |
377 if (DFA_ALIVE(johab)) { | |
378 if (top) { | |
379 #if defined PREFER_JOAHB | |
380 if (top->score <= johab.score) top = &johab; | |
381 #else | |
382 if (top->score < johab.score) top = &johab; | |
383 #endif | |
384 } else { | |
385 top = &johab; | |
386 } | |
387 } | |
388 | |
389 if (top == &euck) return "EUC-KR"; | |
390 if (top == &utf8) return "UTF-8"; | |
391 if (top == &johab) return "JOHAB"; | |
392 return NULL; | |
393 } | |
394 |