diff hebrew_impl.c @ 3:70e2c306231e

- implemented dfa utility functions. - added dfa.c. - rewrote guess functions for ar, gr, hw and tr scripts with dfa utilities. - guess functions for cjk scripts too.
author Yoshiki Yazawa <yaz@cc.rim.or.jp>
date Thu, 12 Jun 2008 20:20:43 +0900
parents d9b6ff839eab
children
line wrap: on
line diff
--- a/hebrew_impl.c	Wed Jun 11 00:11:30 2008 +0900
+++ b/hebrew_impl.c	Thu Jun 12 20:20:43 2008 +0900
@@ -1,23 +1,56 @@
-const char *_guess_hw(const unsigned char *ptr, int size)
+#include "libguess.h"
+#include "dfa.h"
+#include "guess_tab.c"
+
+/* precedence order */
+#define ORDER &utf8, &iso8859_8, &cp1255
+
+/* encodings */
+static guess_dfa cp1255 = DFA_INIT(guess_cp1255_st, guess_cp1255_ar, "CP1255");
+static guess_dfa iso8859_8 = DFA_INIT(guess_iso8859_8_st, guess_iso8859_8_ar, "ISO-8859-8-I");
+static guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8");
+
+
+/* common */
+const char *guess_hw(const char *buf, int buflen)
 {
     int i;
+    const char *rv = NULL;
+    guess_dfa *top = NULL;
+    guess_dfa *order[] = { ORDER, NULL };
 
-    for (i = 0; i < size; i++)
-    {
-        if (ptr[i] == 0x80 || (ptr[i] >= 0x82 && ptr[i] <= 0x89) || ptr[i] == 0x8B ||
-            (ptr[i] >= 0x91 && ptr[i] <= 0x99) || ptr[i] == 0x9B || ptr[i] == 0xA1 ||
-            (ptr[i] >= 0xBF && ptr[i] <= 0xC9) ||
-            (ptr[i] >= 0xCB && ptr[i] <= 0xD8))
-            return "CP1255";
+    for (i = 0; i < buflen; i++) {
+        int c = (unsigned char) buf[i];
 
-        if (ptr[i] == 0xDF)
-            return "ISO-8859-8-I";
+        /* special treatment of BOM */
+        if (i == 0 && c == 0xff) {
+            if (i < buflen - 1) {
+                c = (unsigned char) buf[i + 1];
+                if (c == 0xfe)
+                    return UCS_2LE;
+            }
+        }
+        if (i == 0 && c == 0xfe) {
+            if (i < buflen - 1) {
+                c = (unsigned char) buf[i + 1];
+                if (c == 0xff)
+                    return UCS_2BE;
+            }
+        }
+
+        rv = dfa_process(order, c);
+        if(rv)
+            return rv;
+
+        if (dfa_none(order)) {
+            /* we ran out of possibilities */
+            return NULL;
+        }
     }
 
-    return "ISO-8859-8-I";
+    top = dfa_top(order);
+    if (top)
+        return top->name;
+    else
+        return NULL;
 }
-
-const char *guess_hw(const char *ptr, int size)
-{
-    return _guess_hw((const unsigned char *) ptr, size);
-}