Mercurial > libguess
diff arabic_impl.c @ 2:754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
- new UCS-2LE/BE DFAs
- now arabic_impl.c uses arabic DFAs
- dfa common macros have been moved to dfa.h
- minor cleanups
author | Yoshiki Yazawa <yaz@cc.rim.or.jp> |
---|---|
date | Wed, 11 Jun 2008 00:11:30 +0900 |
parents | d9b6ff839eab |
children | 70e2c306231e |
line wrap: on
line diff
--- a/arabic_impl.c Sat Dec 01 03:27:31 2007 +0900
+++ b/arabic_impl.c Wed Jun 11 00:11:30 2008 +0900
@@ -1,28 +1,58 @@
 #include "libguess.h"
+#include "dfa.h"
+#include "guess_tab.c"
 
-static const char *_guess_ar(const unsigned char *ptr, int size)
+#define ORDER_AR &utf8, &iso8859_6, &cp1256
+
+const char *guess_ar(const char *buf, int buflen)
 {
     int i;
+    guess_dfa cp1256 = DFA_INIT(guess_cp1256_st, guess_cp1256_ar);
+    guess_dfa iso8859_6 = DFA_INIT(guess_iso8859_6_st, guess_iso8859_6_ar);
+    guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar);
+    guess_dfa *top = NULL;
 
-    for (i = 0; i < size; i++)
-    {
-        if ((ptr[i] >= 0x80 && ptr[i] <= 0x9F) ||
-            ptr[i] == 0xA1 || ptr[i] == 0xA2 || ptr[i] == 0xA3 ||
-            (ptr[i] >= 0xA5 && ptr[i] <= 0xAB) ||
-            (ptr[i] >= 0xAE && ptr[i] <= 0xBA) ||
-            ptr[i] == 0xBC || ptr[i] == 0xBD ||
-            ptr[i] == 0xBE || ptr[i] == 0xC0 ||
-            (ptr[i] >= 0xDB && ptr[i] <= 0xDF) || (ptr[i] >= 0xF3))
-            return "CP1256";
+    guess_dfa *order[] = { ORDER_AR, NULL };
+
+    for (i = 0; i < buflen; i++) {
+        int c = (unsigned char) buf[i];
+
+        if (DFA_ALIVE(cp1256)) {
+            if (!DFA_ALIVE(iso8859_6) && !DFA_ALIVE(utf8))
+                return "CP1256";
+            DFA_NEXT(cp1256, c);
+        }
+        if (DFA_ALIVE(iso8859_6)) {
+            if (!DFA_ALIVE(cp1256) && !DFA_ALIVE(utf8))
+                return "ISO-8859-6";
+            DFA_NEXT(iso8859_6, c);
+        }
+        if (DFA_ALIVE(utf8)) {
+            if (!DFA_ALIVE(cp1256) && !DFA_ALIVE(iso8859_6))
+                return "UTF-8";
+            DFA_NEXT(utf8, c);
+        }
+
+        if (!DFA_ALIVE(cp1256) && !DFA_ALIVE(iso8859_6) && !DFA_ALIVE(utf8)) {
+            /* we ran out the possibilities */
+            return NULL;
+        }
     }
 
-    return "ISO-8859-6";
-}
+    /* Now, we have ambigous code. Pick the highest score. If more than
+       one candidate tie, pick the default encoding. */
+    for (i = 0; order[i] != NULL; i++) {
+        if (order[i]->state >= 0) { //DFA_ALIVE()
+            if (top == NULL || order[i]->score > top->score)
+                top = order[i];
+        }
+    }
 
-const char *guess_ar(const char *ptr, int size)
-{
-    if (dfa_validate_utf8(ptr, size))
+    if (top == &cp1256)
+        return "CP1256";
+    if (top == &utf8)
         return "UTF-8";
-
-    return _guess_ar((const unsigned char *)ptr, size);
+    if (top == &iso8859_6)
+        return "ISO-8859-6";
+    return NULL;
 }