comparison cjk_impl.c @ 2:754a4550c64e

Added Arabic, Greek, Hebrew and Turkish DFAs; added new UCS-2LE/BE DFAs; arabic_impl.c now uses the Arabic DFAs; common DFA macros have been moved to dfa.h; minor cleanups.
author Yoshiki Yazawa <yaz@cc.rim.or.jp>
date Wed, 11 Jun 2008 00:11:30 +0900
parents d9b6ff839eab
children 70e2c306231e
comparison
equal deleted inserted replaced
1:04f2be1c8464 2:754a4550c64e
2 * This code is derivative of guess.c of Gauche-0.8.3. 2 * This code is derivative of guess.c of Gauche-0.8.3.
3 * The following is the original copyright notice. 3 * The following is the original copyright notice.
4 */ 4 */
5 5
6 /* 6 /*
7 * guess.c - guessing character encoding 7 * guess.c - guessing character encoding
8 * 8 *
9 * Copyright (c) 2000-2003 Shiro Kawai, All rights reserved. 9 * Copyright (c) 2000-2003 Shiro Kawai, All rights reserved.
10 * 10 *
11 * Redistribution and use in source and binary forms, with or without 11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions 12 * modification, are permitted provided that the following conditions
13 * are met: 13 * are met:
14 * 14 *
15 * 1. Redistributions of source code must retain the above copyright 15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer. 16 * notice, this list of conditions and the following disclaimer.
17 * 17 *
18 * 2. Redistributions in binary form must reproduce the above copyright 18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the 19 * notice, this list of conditions and the following disclaimer in the
36 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 36 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 * 37 *
38 */ 38 */
39 39
40 #include "libguess.h" 40 #include "libguess.h"
41 41 #include "dfa.h"
42 /* take precedence if scores are same. you can customize the order as: */
43 /* ORDER_** &highest, &second, ... &lowest */
44 #define ORDER_JP &utf8, &sjis, &eucj
45 #define ORDER_TW &utf8, &big5
46 #define ORDER_CN &utf8, &gb2312, &gb18030
47 #define ORDER_KR &utf8, &euck, &johab
48 42
49 /* workaround: glib's g_convert can't properly convert 43 /* workaround: glib's g_convert can't properly convert
50 UCS-2BE/LE data that follows a BOM. */ 44 UCS-2BE/LE data that follows a BOM. */
51 #define WITH_G_CONVERT 1 45 #define WITH_G_CONVERT 1
52 /* #undef WITH_G_CONVERT */ 46 /* #undef WITH_G_CONVERT */
57 #else 51 #else
58 const char UCS_2BE[] = "UCS-2BE"; 52 const char UCS_2BE[] = "UCS-2BE";
59 const char UCS_2LE[] = "UCS-2LE"; 53 const char UCS_2LE[] = "UCS-2LE";
60 #endif 54 #endif
61 55
62 /* data types */ 56 /* take precedence if scores are same. you can customize the order as: */
63 typedef struct guess_arc_rec 57 /* ORDER_** &highest, &second, ... &lowest */
64 { 58 #define ORDER_JP &utf8, &sjis, &eucj
65 unsigned int next; /* next state */ 59 #define ORDER_TW &utf8, &big5
66 double score; /* score */ 60 #define ORDER_CN &utf8, &gb2312, &gb18030
67 } guess_arc; 61 #define ORDER_KR &utf8, &euck, &johab
68
69 typedef struct guess_dfa_rec
70 {
71 signed char (*states)[256];
72 guess_arc *arcs;
73 int state;
74 double score;
75 } guess_dfa;
76
77 /* macros */
78 #define DFA_INIT(st, ar) \
79 { st, ar, 0, 1.0 }
80
81 #define DFA_NEXT(dfa, ch) \
82 do { \
83 int arc__; \
84 if (dfa.state >= 0) { \
85 arc__ = dfa.states[dfa.state][ch]; \
86 if (arc__ < 0) { \
87 dfa.state = -1; \
88 } else { \
89 dfa.state = dfa.arcs[arc__].next; \
90 dfa.score *= dfa.arcs[arc__].score; \
91 } \
92 } \
93 } while (0)
94
95 #define DFA_ALIVE(dfa) (dfa.state >= 0)
96 62
97 /* include DFA table generated by guess.scm */ 63 /* include DFA table generated by guess.scm */
98 #include "guess_tab.c" 64 #include "guess_tab.c"
99 65
100 66
114 80
115 DFA_NEXT(utf8, '\0'); //Bug #53 81 DFA_NEXT(utf8, '\0'); //Bug #53
116 82
117 if(DFA_ALIVE(utf8)) 83 if(DFA_ALIVE(utf8))
118 return 1; 84 return 1;
119 else 85 else
120 return 0; 86 return 0;
121 } 87 }
122 88
123 const char *guess_jp(const char *buf, int buflen) 89 const char *guess_jp(const char *buf, int buflen)
124 { 90 {