Mercurial > libguess
comparison arabic_impl.c @ 2:754a4550c64e
- added Arabic, Greek, Hebrew and Turkish DFAs
- new UCS-2LE/BE DFAs
- now arabic_impl.c uses Arabic DFAs
- dfa common macros have been moved to dfa.h
- minor cleanups
author | Yoshiki Yazawa <yaz@cc.rim.or.jp> |
---|---|
date | Wed, 11 Jun 2008 00:11:30 +0900 |
parents | d9b6ff839eab |
children | 70e2c306231e |
comparison
equal
deleted
inserted
replaced
1:04f2be1c8464 | 2:754a4550c64e |
---|---|
1 #include "libguess.h" | 1 #include "libguess.h" |
2 #include "dfa.h" | |
3 #include "guess_tab.c" | |
2 | 4 |
3 static const char *_guess_ar(const unsigned char *ptr, int size) | 5 #define ORDER_AR &utf8, &iso8859_6, &cp1256 |
6 | |
7 const char *guess_ar(const char *buf, int buflen) | |
4 { | 8 { |
5 int i; | 9 int i; |
10 guess_dfa cp1256 = DFA_INIT(guess_cp1256_st, guess_cp1256_ar); | |
11 guess_dfa iso8859_6 = DFA_INIT(guess_iso8859_6_st, guess_iso8859_6_ar); | |
12 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar); | |
13 guess_dfa *top = NULL; | |
6 | 14 |
7 for (i = 0; i < size; i++) | 15 guess_dfa *order[] = { ORDER_AR, NULL }; |
8 { | 16 |
9 if ((ptr[i] >= 0x80 && ptr[i] <= 0x9F) || | 17 for (i = 0; i < buflen; i++) { |
10 ptr[i] == 0xA1 || ptr[i] == 0xA2 || ptr[i] == 0xA3 || | 18 int c = (unsigned char) buf[i]; |
11 (ptr[i] >= 0xA5 && ptr[i] <= 0xAB) || | 19 |
12 (ptr[i] >= 0xAE && ptr[i] <= 0xBA) || | 20 if (DFA_ALIVE(cp1256)) { |
13 ptr[i] == 0xBC || ptr[i] == 0xBD || | 21 if (!DFA_ALIVE(iso8859_6) && !DFA_ALIVE(utf8)) |
14 ptr[i] == 0xBE || ptr[i] == 0xC0 || | 22 return "CP1256"; |
15 (ptr[i] >= 0xDB && ptr[i] <= 0xDF) || (ptr[i] >= 0xF3)) | 23 DFA_NEXT(cp1256, c); |
16 return "CP1256"; | 24 } |
25 if (DFA_ALIVE(iso8859_6)) { | |
26 if (!DFA_ALIVE(cp1256) && !DFA_ALIVE(utf8)) | |
27 return "ISO-8859-6"; | |
28 DFA_NEXT(iso8859_6, c); | |
29 } | |
30 if (DFA_ALIVE(utf8)) { | |
31 if (!DFA_ALIVE(cp1256) && !DFA_ALIVE(iso8859_6)) | |
32 return "UTF-8"; | |
33 DFA_NEXT(utf8, c); | |
34 } | |
35 | |
36 if (!DFA_ALIVE(cp1256) && !DFA_ALIVE(iso8859_6) && !DFA_ALIVE(utf8)) { | |
37 /* we ran out of possibilities */ | |
38 return NULL; | |
39 } | |
17 } | 40 } |
18 | 41 |
19 return "ISO-8859-6"; | 42 /* Now, we have ambiguous code. Pick the highest score. If more than
43 one candidate tie, pick the default encoding. */ | |
44 for (i = 0; order[i] != NULL; i++) { | |
45 if (order[i]->state >= 0) { //DFA_ALIVE() | |
46 if (top == NULL || order[i]->score > top->score) | |
47 top = order[i]; | |
48 } | |
49 } | |
50 | |
51 if (top == &cp1256) | |
52 return "CP1256"; | |
53 if (top == &utf8) | |
54 return "UTF-8"; | |
55 if (top == &iso8859_6) | |
56 return "ISO-8859-6"; | |
57 return NULL; | |
20 } | 58 } |
21 | |
22 const char *guess_ar(const char *ptr, int size) | |
23 { | |
24 if (dfa_validate_utf8(ptr, size)) | |
25 return "UTF-8"; | |
26 | |
27 return _guess_ar((const unsigned char *)ptr, size); | |
28 } |