comparison arabic_impl.c @ 2:754a4550c64e

- added Arabic, Greek, Hebrew and Turkish DFAs - new UCS-2LE/BE DFAs - arabic_impl.c now uses the Arabic DFAs - common DFA macros have been moved to dfa.h - minor cleanups
author Yoshiki Yazawa <yaz@cc.rim.or.jp>
date Wed, 11 Jun 2008 00:11:30 +0900
parents d9b6ff839eab
children 70e2c306231e
comparison
equal deleted inserted replaced
1:04f2be1c8464 2:754a4550c64e
1 #include "libguess.h" 1 #include "libguess.h"
2 #include "dfa.h"
3 #include "guess_tab.c"
2 4
3 static const char *_guess_ar(const unsigned char *ptr, int size) 5 #define ORDER_AR &utf8, &iso8859_6, &cp1256
6
7 const char *guess_ar(const char *buf, int buflen)
4 { 8 {
5 int i; 9 int i;
10 guess_dfa cp1256 = DFA_INIT(guess_cp1256_st, guess_cp1256_ar);
11 guess_dfa iso8859_6 = DFA_INIT(guess_iso8859_6_st, guess_iso8859_6_ar);
12 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar);
13 guess_dfa *top = NULL;
6 14
7 for (i = 0; i < size; i++) 15 guess_dfa *order[] = { ORDER_AR, NULL };
8 { 16
9 if ((ptr[i] >= 0x80 && ptr[i] <= 0x9F) || 17 for (i = 0; i < buflen; i++) {
10 ptr[i] == 0xA1 || ptr[i] == 0xA2 || ptr[i] == 0xA3 || 18 int c = (unsigned char) buf[i];
11 (ptr[i] >= 0xA5 && ptr[i] <= 0xAB) || 19
12 (ptr[i] >= 0xAE && ptr[i] <= 0xBA) || 20 if (DFA_ALIVE(cp1256)) {
13 ptr[i] == 0xBC || ptr[i] == 0xBD || 21 if (!DFA_ALIVE(iso8859_6) && !DFA_ALIVE(utf8))
14 ptr[i] == 0xBE || ptr[i] == 0xC0 || 22 return "CP1256";
15 (ptr[i] >= 0xDB && ptr[i] <= 0xDF) || (ptr[i] >= 0xF3)) 23 DFA_NEXT(cp1256, c);
16 return "CP1256"; 24 }
25 if (DFA_ALIVE(iso8859_6)) {
26 if (!DFA_ALIVE(cp1256) && !DFA_ALIVE(utf8))
27 return "ISO-8859-6";
28 DFA_NEXT(iso8859_6, c);
29 }
30 if (DFA_ALIVE(utf8)) {
31 if (!DFA_ALIVE(cp1256) && !DFA_ALIVE(iso8859_6))
32 return "UTF-8";
33 DFA_NEXT(utf8, c);
34 }
35
36 if (!DFA_ALIVE(cp1256) && !DFA_ALIVE(iso8859_6) && !DFA_ALIVE(utf8)) {
37 /* we ran out of possibilities */
38 return NULL;
39 }
17 } 40 }
18 41
19 return "ISO-8859-6"; 42 /* Now, we have ambiguous code. Pick the highest score. If more than
43 one candidate ties, pick the default encoding. */
44 for (i = 0; order[i] != NULL; i++) {
45 if (order[i]->state >= 0) { //DFA_ALIVE()
46 if (top == NULL || order[i]->score > top->score)
47 top = order[i];
48 }
49 }
50
51 if (top == &cp1256)
52 return "CP1256";
53 if (top == &utf8)
54 return "UTF-8";
55 if (top == &iso8859_6)
56 return "ISO-8859-6";
57 return NULL;
20 } 58 }
21
22 const char *guess_ar(const char *ptr, int size)
23 {
24 if (dfa_validate_utf8(ptr, size))
25 return "UTF-8";
26
27 return _guess_ar((const unsigned char *)ptr, size);
28 }