diff arabic_impl.c @ 3:70e2c306231e

- Implemented the DFA utility functions. - Added dfa.c. - Rewrote the guess functions for the ar, gr, hw and tr scripts using the DFA utilities. - Rewrote the guess functions for the CJK scripts in the same way.
author Yoshiki Yazawa <yaz@cc.rim.or.jp>
date Thu, 12 Jun 2008 20:20:43 +0900
parents 754a4550c64e
children
line wrap: on
line diff
--- a/arabic_impl.c	Wed Jun 11 00:11:30 2008 +0900
+++ b/arabic_impl.c	Thu Jun 12 20:20:43 2008 +0900
@@ -2,57 +2,53 @@
 #include "dfa.h"
 #include "guess_tab.c"
 
-#define ORDER_AR &utf8, &iso8859_6, &cp1256
+/* precedence order */
+#define ORDER &utf8, &iso8859_6, &cp1256
+
+/* encodings */
+static guess_dfa cp1256 = DFA_INIT(guess_cp1256_st, guess_cp1256_ar, "CP1256");
+static guess_dfa iso8859_6 = DFA_INIT(guess_iso8859_6_st, guess_iso8859_6_ar, "ISO-8859-6");
+static guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8");
 
 const char *guess_ar(const char *buf, int buflen)
 {
     int i;
-    guess_dfa cp1256 = DFA_INIT(guess_cp1256_st, guess_cp1256_ar);
-    guess_dfa iso8859_6 = DFA_INIT(guess_iso8859_6_st, guess_iso8859_6_ar);
-    guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar);
+    const char *rv = NULL;
     guess_dfa *top = NULL;
-
-    guess_dfa *order[] = { ORDER_AR, NULL };
+    guess_dfa *order[] = { ORDER, NULL };
 
     for (i = 0; i < buflen; i++) {
         int c = (unsigned char) buf[i];
 
-        if (DFA_ALIVE(cp1256)) {
-            if (!DFA_ALIVE(iso8859_6) && !DFA_ALIVE(utf8))
-                return "CP1256";
-            DFA_NEXT(cp1256, c);
+        /* special treatment of BOM */
+        if (i == 0 && c == 0xff) {
+            if (i < buflen - 1) {
+                c = (unsigned char) buf[i + 1];
+                if (c == 0xfe)
+                    return UCS_2LE;
+            }
         }
-        if (DFA_ALIVE(iso8859_6)) {
-            if (!DFA_ALIVE(cp1256) && !DFA_ALIVE(utf8))
-                return "ISO-8859-6";
-            DFA_NEXT(iso8859_6, c);
-        }
-        if (DFA_ALIVE(utf8)) {
-            if (!DFA_ALIVE(cp1256) && !DFA_ALIVE(iso8859_6))
-                return "UTF-8";
-            DFA_NEXT(utf8, c);
+        if (i == 0 && c == 0xfe) {
+            if (i < buflen - 1) {
+                c = (unsigned char) buf[i + 1];
+                if (c == 0xff)
+                    return UCS_2BE;
+            }
         }
 
-        if (!DFA_ALIVE(cp1256) && !DFA_ALIVE(iso8859_6) && !DFA_ALIVE(utf8)) {
+        rv = dfa_process(order, c);
+        if(rv)
+            return rv;
+
+        if (dfa_none(order)) {
             /* we ran out the possibilities */
             return NULL;
         }
     }
 
-    /* Now, we have ambigous code.  Pick the highest score.  If more than
-       one candidate tie, pick the default encoding. */
-    for (i = 0; order[i] != NULL; i++) {
-        if (order[i]->state >= 0) { //DFA_ALIVE()
-            if (top == NULL || order[i]->score > top->score)
-                top = order[i];
-        }
-    }
-
-    if (top == &cp1256)
-        return "CP1256";
-    if (top == &utf8)
-        return "UTF-8";
-    if (top == &iso8859_6)
-        return "ISO-8859-6";
-    return NULL;
+    top = dfa_top(order);
+    if (top)
+        return top->name;
+    else
+        return NULL;
 }