diff turkish_impl.c @ 3:70e2c306231e

- Implemented DFA utility functions. - Added dfa.c. - Rewrote the guess functions for the ar, gr, hw and tr scripts with the DFA utilities. - Guess functions for the CJK scripts too.
author Yoshiki Yazawa <yaz@cc.rim.or.jp>
date Thu, 12 Jun 2008 20:20:43 +0900
parents 754a4550c64e
children
line wrap: on
line diff
--- a/turkish_impl.c	Wed Jun 11 00:11:30 2008 +0900
+++ b/turkish_impl.c	Thu Jun 12 20:20:43 2008 +0900
@@ -1,25 +1,55 @@
 #include "libguess.h"
+#include "dfa.h"
+#include "guess_tab.c"
+
+/* precedence order: the candidate DFAs, in the order they are placed into the NULL-terminated order[] array inside guess_tr() */
+#define ORDER &utf8, &iso8859_9, &cp1254
 
-static const char *_guess_tr(const unsigned char *ptr, int size)
+/* encodings: one guess_dfa per candidate charset, each built by DFA_INIT from a *_st / *_ar table pair plus the charset name (tables presumably come from guess_tab.c -- confirm) */
+static guess_dfa cp1254 = DFA_INIT(guess_cp1253_st, guess_cp1253_ar, "CP1254"); /* NOTE(review): uses the cp1253 tables for "CP1254" -- looks like a copy-paste slip; confirm whether guess_cp1254_st/_ar exist */
+static guess_dfa iso8859_9 = DFA_INIT(guess_iso8859_9_st, guess_iso8859_9_ar, "ISO-8859-9");
+static guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8");
+
+/* Guess the encoding of buf (buflen bytes): returns UCS_2LE/UCS_2BE when buf starts with a UCS-2 BOM, otherwise a name produced by the DFAs listed in ORDER, or NULL when no candidate is left. */
+const char *guess_tr(const char *buf, int buflen)
 {
     int i;
+    const char *rv = NULL;
+    guess_dfa *top = NULL;
+    guess_dfa *order[] = { ORDER, NULL };
 
-    for (i = 0; i < size; i++)
-    {
-        if (ptr[i] == 0x80 ||
-            (ptr[i] >= 0x82 && ptr[i] <= 0x8C) ||
-            (ptr[i] >= 0x91 && ptr[i] <= 0x9C) ||
-            ptr[ i ] == 0x9F)
-            return "CP1254";
+    for (i = 0; i < buflen; i++) {
+        int c = (unsigned char) buf[i];
+
+        /* special treatment of BOM: peek at the next byte in a separate local so that c keeps buf[i]; a lone 0xff/0xfe must still be fed to the DFAs below */
+        if (i == 0 && c == 0xff) {
+            if (i < buflen - 1) {
+                int next = (unsigned char) buf[i + 1];
+                if (next == 0xfe)
+                    return UCS_2LE;
+            }
+        }
+        if (i == 0 && c == 0xfe) {
+            if (i < buflen - 1) {
+                int next = (unsigned char) buf[i + 1];
+                if (next == 0xff)
+                    return UCS_2BE;
+            }
+        }
+
+        rv = dfa_process(order, c); /* a non-NULL result is returned to the caller immediately */
+        if(rv)
+            return rv;
+
+        if (dfa_none(order)) {
+            /* we ran out of possibilities */
+            return NULL;
+        }
     }
 
-    return "ISO-8859-9";
+    top = dfa_top(order); /* input exhausted without an early answer: report the name of whatever dfa_top() selects, if anything */
+    if (top)
+        return top->name;
+    else
+        return NULL;
 }
-
-const char *guess_tr(const char *ptr, int size)
-{
-    if (dfa_validate_utf8(ptr, size))
-        return "UTF-8";
-
-    return _guess_tr((const unsigned char *)ptr, size);
-}