diff arabic_impl.c @ 2:754a4550c64e

- added arabic, greek, hebrew and turkish DFAs - new UCS-2LE/BE DFAs - now arabic_impl.c uses arabic DFAs - dfa common macros have been moved to dfa.h - minor cleanups
author Yoshiki Yazawa <yaz@cc.rim.or.jp>
date Wed, 11 Jun 2008 00:11:30 +0900
parents d9b6ff839eab
children 70e2c306231e
line wrap: on
line diff
--- a/arabic_impl.c	Sat Dec 01 03:27:31 2007 +0900
+++ b/arabic_impl.c	Wed Jun 11 00:11:30 2008 +0900
@@ -1,28 +1,58 @@
 #include "libguess.h"
+#include "dfa.h"
+#include "guess_tab.c"
 
-static const char *_guess_ar(const unsigned char *ptr, int size)
+#define ORDER_AR &utf8, &iso8859_6, &cp1256
+
+const char *guess_ar(const char *buf, int buflen)
 {
     int i;
+    guess_dfa cp1256 = DFA_INIT(guess_cp1256_st, guess_cp1256_ar);
+    guess_dfa iso8859_6 = DFA_INIT(guess_iso8859_6_st, guess_iso8859_6_ar);
+    guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar);
+    guess_dfa *top = NULL;
 
-    for (i = 0; i < size; i++)
-    {
-        if ((ptr[i] >= 0x80 && ptr[i] <= 0x9F) ||
-            ptr[i] == 0xA1 || ptr[i] == 0xA2 || ptr[i] == 0xA3 ||
-            (ptr[i] >= 0xA5 && ptr[i] <= 0xAB) ||
-            (ptr[i] >= 0xAE && ptr[i] <= 0xBA) ||
-            ptr[i] == 0xBC || ptr[i] == 0xBD ||
-            ptr[i] == 0xBE || ptr[i] == 0xC0 ||
-            (ptr[i] >= 0xDB && ptr[i] <= 0xDF) || (ptr[i] >= 0xF3))
-            return "CP1256";
+    guess_dfa *order[] = { ORDER_AR, NULL };
+
+    for (i = 0; i < buflen; i++) {
+        int c = (unsigned char) buf[i];
+
+        if (DFA_ALIVE(cp1256)) {
+            if (!DFA_ALIVE(iso8859_6) && !DFA_ALIVE(utf8))
+                return "CP1256";
+            DFA_NEXT(cp1256, c);
+        }
+        if (DFA_ALIVE(iso8859_6)) {
+            if (!DFA_ALIVE(cp1256) && !DFA_ALIVE(utf8))
+                return "ISO-8859-6";
+            DFA_NEXT(iso8859_6, c);
+        }
+        if (DFA_ALIVE(utf8)) {
+            if (!DFA_ALIVE(cp1256) && !DFA_ALIVE(iso8859_6))
+                return "UTF-8";
+            DFA_NEXT(utf8, c);
+        }
+
+        if (!DFA_ALIVE(cp1256) && !DFA_ALIVE(iso8859_6) && !DFA_ALIVE(utf8)) {
+            /* we ran out of possibilities */
+            return NULL;
+        }
     }
 
-    return "ISO-8859-6";
-}
+    /* Now, we have ambiguous code.  Pick the highest score.  If more than
+       one candidate ties, pick the one listed first in ORDER_AR. */
+    for (i = 0; order[i] != NULL; i++) {
+        if (order[i]->state >= 0) { //DFA_ALIVE()
+            if (top == NULL || order[i]->score > top->score)
+                top = order[i];
+        }
+    }
 
-const char *guess_ar(const char *ptr, int size)
-{
-    if (dfa_validate_utf8(ptr, size))
+    if (top == &cp1256)
+        return "CP1256";
+    if (top == &utf8)
         return "UTF-8";
-
-    return _guess_ar((const unsigned char *)ptr, size);
+    if (top == &iso8859_6)
+        return "ISO-8859-6";
+    return NULL;
 }