Mercurial > libguess
diff arabic_impl.c @ 3:70e2c306231e
- implemented dfa utility functions.
- added dfa.c.
- rewrote guess functions for ar, gr, hw and tr scripts with dfa utilities.
- guess functions for cjk scripts too.
author | Yoshiki Yazawa <yaz@cc.rim.or.jp> |
---|---|
date | Thu, 12 Jun 2008 20:20:43 +0900 |
parents | 754a4550c64e |
children |
line wrap: on
line diff
```diff
--- a/arabic_impl.c Wed Jun 11 00:11:30 2008 +0900
+++ b/arabic_impl.c Thu Jun 12 20:20:43 2008 +0900
@@ -2,57 +2,53 @@
 #include "dfa.h"
 #include "guess_tab.c"
 
-#define ORDER_AR &utf8, &iso8859_6, &cp1256
+/* precedence order */
+#define ORDER &utf8, &iso8859_6, &cp1256
+
+/* encodings */
+static guess_dfa cp1256 = DFA_INIT(guess_cp1256_st, guess_cp1256_ar, "CP1256");
+static guess_dfa iso8859_6 = DFA_INIT(guess_iso8859_6_st, guess_iso8859_6_ar, "ISO-8859-6");
+static guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8");
 
 const char *guess_ar(const char *buf, int buflen)
 {
     int i;
-    guess_dfa cp1256 = DFA_INIT(guess_cp1256_st, guess_cp1256_ar);
-    guess_dfa iso8859_6 = DFA_INIT(guess_iso8859_6_st, guess_iso8859_6_ar);
-    guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar);
+    const char *rv = NULL;
     guess_dfa *top = NULL;
-
-    guess_dfa *order[] = { ORDER_AR, NULL };
+    guess_dfa *order[] = { ORDER, NULL };
 
     for (i = 0; i < buflen; i++) {
         int c = (unsigned char) buf[i];
 
-        if (DFA_ALIVE(cp1256)) {
-            if (!DFA_ALIVE(iso8859_6) && !DFA_ALIVE(utf8))
-                return "CP1256";
-            DFA_NEXT(cp1256, c);
+        /* special treatment of BOM */
+        if (i == 0 && c == 0xff) {
+            if (i < buflen - 1) {
+                c = (unsigned char) buf[i + 1];
+                if (c == 0xfe)
+                    return UCS_2LE;
+            }
         }
-        if (DFA_ALIVE(iso8859_6)) {
-            if (!DFA_ALIVE(cp1256) && !DFA_ALIVE(utf8))
-                return "ISO-8859-6";
-            DFA_NEXT(iso8859_6, c);
-        }
-        if (DFA_ALIVE(utf8)) {
-            if (!DFA_ALIVE(cp1256) && !DFA_ALIVE(iso8859_6))
-                return "UTF-8";
-            DFA_NEXT(utf8, c);
+        if (i == 0 && c == 0xfe) {
+            if (i < buflen - 1) {
+                c = (unsigned char) buf[i + 1];
+                if (c == 0xff)
+                    return UCS_2BE;
+            }
         }
 
-        if (!DFA_ALIVE(cp1256) && !DFA_ALIVE(iso8859_6) && !DFA_ALIVE(utf8)) {
+        rv = dfa_process(order, c);
+        if(rv)
+            return rv;
+
+        if (dfa_none(order)) {
             /* we ran out the possibilities */
             return NULL;
         }
     }
 
-    /* Now, we have ambigous code. Pick the highest score. If more than
-       one candidate tie, pick the default encoding. */
-    for (i = 0; order[i] != NULL; i++) {
-        if (order[i]->state >= 0) { //DFA_ALIVE()
-            if (top == NULL || order[i]->score > top->score)
-                top = order[i];
-        }
-    }
-
-    if (top == &cp1256)
-        return "CP1256";
-    if (top == &utf8)
-        return "UTF-8";
-    if (top == &iso8859_6)
-        return "ISO-8859-6";
-    return NULL;
+    top = dfa_top(order);
+    if (top)
+        return top->name;
+    else
+        return NULL;
 }
```