changeset 3:70e2c306231e

- implemented dfa utility functions. - added dfa.c. - rewrote guess functions for ar, gr, hw and tr scripts with dfa utilities. - guess functions for cjk scripts too.
author Yoshiki Yazawa <yaz@cc.rim.or.jp>
date Thu, 12 Jun 2008 20:20:43 +0900
parents 754a4550c64e
children ff05c6c659cb
files Makefile arabic_impl.c cjk_impl.c dfa.c dfa.h greek_impl.c guess.scm guess_tab.c hebrew_impl.c libguess.h turkish_impl.c
diffstat 11 files changed, 345 insertions(+), 240 deletions(-) [+]
line wrap: on
line diff
--- a/Makefile	Wed Jun 11 00:11:30 2008 +0900
+++ b/Makefile	Thu Jun 12 20:20:43 2008 +0900
@@ -1,7 +1,7 @@
 PREFIX = /usr/local
 
 MAJOR = 0
-MINOR = 2
+MINOR = 3
 REVISION = 0
 VER = ${MAJOR}.${MINOR}.${REVISION}
 
@@ -9,6 +9,7 @@
 #OBJS = guess.o
 
 SRCS = guess.c		\
+       dfa.c		\
        arabic_impl.c	\
        cjk_impl.c	\
        greek_impl.c	\
@@ -22,11 +23,11 @@
 OBJS = ${SRCS:.c=.o} 
 
 LIBS = libguess.so libguess.a
-CFLAGS += -fPIC
+CFLAGS += -fPIC -pg -g
 SONAME = libguess.so.${MAJOR}
 
 
-all: $(LIBS)
+all: $(LIBS) test
 
 libguess.so: ${OBJS}
 	${CC} -o libguess.so -shared -Wl,-soname,${SONAME} ${OBJS}
@@ -41,8 +42,8 @@
 guess_tab.c : guess.scm
 	gosh guess.scm guess_tab.c
 
-test: test.c guess.c
-	gcc -g -o test test.c guess.c
+test: test.c libguess.a
+	gcc -g -o test test.c libguess.a
 
 install:
 	install -m644 libguess.h ${PREFIX}/include
--- a/arabic_impl.c	Wed Jun 11 00:11:30 2008 +0900
+++ b/arabic_impl.c	Thu Jun 12 20:20:43 2008 +0900
@@ -2,57 +2,57 @@
 #include "dfa.h"
 #include "guess_tab.c"
 
-#define ORDER_AR &utf8, &iso8859_6, &cp1256
+/* precedence order: dfa_top() keeps the first listed DFA on a score tie */
+#define ORDER &utf8, &iso8859_6, &cp1256
 
+/*
+ * Guess the character encoding of the buflen bytes at buf, assuming
+ * Arabic text.
+ *
+ * Returns UCS_2LE / UCS_2BE when the buffer opens with the matching BOM,
+ * otherwise the name of the best-scoring surviving candidate ("UTF-8",
+ * "ISO-8859-6" or "CP1256"), or NULL when every candidate rejected the input.
+ */
 const char *guess_ar(const char *buf, int buflen)
 {
     int i;
-    guess_dfa cp1256 = DFA_INIT(guess_cp1256_st, guess_cp1256_ar);
-    guess_dfa iso8859_6 = DFA_INIT(guess_iso8859_6_st, guess_iso8859_6_ar);
-    guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar);
+    const char *rv = NULL;
+    /* one fresh set of DFAs per call: file-scope statics would carry the
+       state and score reached by the previous buffer into this one */
+    guess_dfa cp1256 = DFA_INIT(guess_cp1256_st, guess_cp1256_ar, "CP1256");
+    guess_dfa iso8859_6 = DFA_INIT(guess_iso8859_6_st, guess_iso8859_6_ar, "ISO-8859-6");
+    guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8");
     guess_dfa *top = NULL;
-
-    guess_dfa *order[] = { ORDER_AR, NULL };
+    guess_dfa *order[] = { ORDER, NULL };
 
     for (i = 0; i < buflen; i++) {
         int c = (unsigned char) buf[i];
 
-        if (DFA_ALIVE(cp1256)) {
-            if (!DFA_ALIVE(iso8859_6) && !DFA_ALIVE(utf8))
-                return "CP1256";
-            DFA_NEXT(cp1256, c);
+        /* special treatment of BOM: peek at the second byte through a
+           separate variable so that c still holds the byte fed below */
+        if (i == 0 && buflen > 1) {
+            int next = (unsigned char) buf[1];
+
+            if (c == 0xff && next == 0xfe)
+                return UCS_2LE;
+            if (c == 0xfe && next == 0xff)
+                return UCS_2BE;
         }
-        if (DFA_ALIVE(iso8859_6)) {
-            if (!DFA_ALIVE(cp1256) && !DFA_ALIVE(utf8))
-                return "ISO-8859-6";
-            DFA_NEXT(iso8859_6, c);
-        }
-        if (DFA_ALIVE(utf8)) {
-            if (!DFA_ALIVE(cp1256) && !DFA_ALIVE(iso8859_6))
-                return "UTF-8";
-            DFA_NEXT(utf8, c);
-        }
 
-        if (!DFA_ALIVE(cp1256) && !DFA_ALIVE(iso8859_6) && !DFA_ALIVE(utf8)) {
+        rv = dfa_process(order, c);
+        if (rv)
+            return rv;
+
+        if (dfa_none(order)) {
             /* we ran out the possibilities */
             return NULL;
         }
     }
 
-    /* Now, we have ambigous code.  Pick the highest score.  If more than
-       one candidate tie, pick the default encoding. */
-    for (i = 0; order[i] != NULL; i++) {
-        if (order[i]->state >= 0) { //DFA_ALIVE()
-            if (top == NULL || order[i]->score > top->score)
-                top = order[i];
-        }
-    }
-
-    if (top == &cp1256)
-        return "CP1256";
-    if (top == &utf8)
-        return "UTF-8";
-    if (top == &iso8859_6)
-        return "ISO-8859-6";
-    return NULL;
+    /* still ambiguous at end of input: take the best-scoring survivor */
+    top = dfa_top(order);
+    if (top)
+        return top->name;
+    else
+        return NULL;
 }
--- a/cjk_impl.c	Wed Jun 11 00:11:30 2008 +0900
+++ b/cjk_impl.c	Thu Jun 12 20:20:43 2008 +0900
@@ -40,18 +40,7 @@
 #include "libguess.h"
 #include "dfa.h"
 
-/* workaround for that glib's g_convert can't convert
-   properly from UCS-2BE/LE trailing after BOM. */
-#define WITH_G_CONVERT 1
-/* #undef WITH_G_CONVERT */
-
-#ifdef WITH_G_CONVERT
-const char UCS_2BE[] = "UTF-16";
-const char UCS_2LE[] = "UTF-16";
-#else
-const char UCS_2BE[] = "UCS-2BE";
-const char UCS_2LE[] = "UCS-2LE";
-#endif
+#include <stdio.h>
 
 /* take precedence if scores are same. you can customize the order as: */
 /* ORDER_** &highest, &second, ... &lowest */
@@ -63,11 +52,10 @@
 /* include DFA table generated by guess.scm */
 #include "guess_tab.c"
 
-
 int dfa_validate_utf8(const char *buf, int buflen)
 {
     int i;
-    guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar);
+    guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8");
 
     for (i = 0; i < buflen; i++) {
         int c = (unsigned char) buf[i];
@@ -89,9 +77,10 @@
 const char *guess_jp(const char *buf, int buflen)
 {
     int i;
-    guess_dfa eucj = DFA_INIT(guess_eucj_st, guess_eucj_ar);
-    guess_dfa sjis = DFA_INIT(guess_sjis_st, guess_sjis_ar);
-    guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar);
+    const char *rv = NULL;
+    guess_dfa eucj = DFA_INIT(guess_eucj_st, guess_eucj_ar, "EUC-JP");
+    guess_dfa sjis = DFA_INIT(guess_sjis_st, guess_sjis_ar, "SJIS");
+    guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8");
     guess_dfa *top = NULL;
 
     guess_dfa *order[] = { ORDER_JP, NULL };
@@ -124,51 +113,29 @@
             }
         }
 
-        if (DFA_ALIVE(eucj)) {
-            if (!DFA_ALIVE(sjis) && !DFA_ALIVE(utf8))
-                return "EUC-JP";
-            DFA_NEXT(eucj, c);
-        }
-        if (DFA_ALIVE(sjis)) {
-            if (!DFA_ALIVE(eucj) && !DFA_ALIVE(utf8))
-                return "SJIS";
-            DFA_NEXT(sjis, c);
-        }
-        if (DFA_ALIVE(utf8)) {
-            if (!DFA_ALIVE(sjis) && !DFA_ALIVE(eucj))
-                return "UTF-8";
-            DFA_NEXT(utf8, c);
-        }
+        rv = dfa_process(order, c);
+        if(rv)
+            return rv;
 
-        if (!DFA_ALIVE(eucj) && !DFA_ALIVE(sjis) && !DFA_ALIVE(utf8)) {
+        if (dfa_none(order)) {
             /* we ran out the possibilities */
             return NULL;
         }
     }
 
-    /* Now, we have ambigous code.  Pick the highest score.  If more than
-       one candidate tie, pick the default encoding. */
-    for (i = 0; order[i] != NULL; i++) {
-        if (order[i]->state >= 0) { //DFA_ALIVE()
-            if (top == NULL || order[i]->score > top->score)
-                top = order[i];
-        }
-    }
-
-    if (top == &eucj)
-        return "EUC-JP";
-    if (top == &utf8)
-        return "UTF-8";
-    if (top == &sjis)
-        return "SJIS";
-    return NULL;
+    top = dfa_top(order);
+    if(top)
+        return top->name;
+    else
+        return NULL;
 }
 
 const char *guess_tw(const char *buf, int buflen)
 {
     int i;
-    guess_dfa big5 = DFA_INIT(guess_big5_st, guess_big5_ar);
-    guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar);
+    const char *rv = NULL;
+    guess_dfa big5 = DFA_INIT(guess_big5_st, guess_big5_ar, "BIG5");
+    guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8");
     guess_dfa *top = NULL;
 
     guess_dfa *order[] = { ORDER_TW, NULL };
@@ -201,45 +168,30 @@
             }
         }
 
-        if (DFA_ALIVE(big5)) {
-            if (!DFA_ALIVE(utf8))
-                return "BIG5";
-            DFA_NEXT(big5, c);
-        }
-        if (DFA_ALIVE(utf8)) {
-            if (!DFA_ALIVE(big5))
-                return "UTF-8";
-            DFA_NEXT(utf8, c);
-        }
+        rv = dfa_process(order, c);
+        if(rv)
+            return rv;
 
-        if (!DFA_ALIVE(big5) && !DFA_ALIVE(utf8)) {
+        if (dfa_none(order)) {
             /* we ran out the possibilities */
             return NULL;
         }
     }
 
-    /* Now, we have ambigous code.  Pick the highest score.  If more than
-       one candidate tie, pick the default encoding. */
-    for (i = 0; order[i] != NULL; i++) {
-        if (order[i]->state >= 0) { //DFA_ALIVE()
-            if (top == NULL || order[i]->score > top->score)
-                top = order[i];
-        }
-    }
-
-    if (top == &big5)
-        return "BIG5";
-    if (top == &utf8)
-        return "UTF-8";
-    return NULL;
+    top = dfa_top(order);
+    if (top)
+        return top->name;
+    else
+        return NULL;
 }
 
 const char *guess_cn(const char *buf, int buflen)
 {
     int i;
-    guess_dfa gb2312 = DFA_INIT(guess_gb2312_st, guess_gb2312_ar);
-    guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar);
-    guess_dfa gb18030 = DFA_INIT(guess_gb18030_st, guess_gb18030_ar);
+    const char *rv = NULL;
+    guess_dfa gb2312 = DFA_INIT(guess_gb2312_st, guess_gb2312_ar, "GB2312");
+    guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8");
+    guess_dfa gb18030 = DFA_INIT(guess_gb18030_st, guess_gb18030_ar, "GB18030");
     guess_dfa *top = NULL;
 
     guess_dfa *order[] = { ORDER_CN, NULL };
@@ -274,52 +226,30 @@
             }
         }
 
-        if (DFA_ALIVE(gb2312)) {
-            if (!DFA_ALIVE(utf8) && !DFA_ALIVE(gb18030))
-                return "GB2312";
-            DFA_NEXT(gb2312, c);
-        }
-        if (DFA_ALIVE(utf8)) {
-            if (!DFA_ALIVE(gb2312) && !DFA_ALIVE(gb18030))
-                return "UTF-8";
-            DFA_NEXT(utf8, c);
-        }
-        if (DFA_ALIVE(gb18030)) {
-            if (!DFA_ALIVE(utf8) && !DFA_ALIVE(gb2312))
-                return "GB18030";
-            DFA_NEXT(gb18030, c);
-        }
+        rv = dfa_process(order, c);
+        if(rv)
+            return rv;
 
-        if (!DFA_ALIVE(gb2312) && !DFA_ALIVE(utf8) && !DFA_ALIVE(gb18030)) {
+        if (dfa_none(order)) {
             /* we ran out the possibilities */
             return NULL;
         }
     }
 
-    /* Now, we have ambigous code.  Pick the highest score.  If more than
-       one candidate tie, pick the default encoding. */
-    for (i = 0; order[i] != NULL; i++) {
-        if (order[i]->state >= 0) { //DFA_ALIVE()
-            if (top == NULL || order[i]->score > top->score)
-                top = order[i];
-        }
-    }
-
-    if (top == &gb2312)
-        return "GB2312";
-    if (top == &utf8)
-        return "UTF-8";
-    if (top == &gb18030)
-        return "GB18030";
-    return NULL;
+    top = dfa_top(order);
+    if(top)
+        return top->name;
+    else
+        return NULL;
 }
 
 const char *guess_kr(const char *buf, int buflen)
 {
     int i;
-    guess_dfa euck = DFA_INIT(guess_euck_st, guess_euck_ar);
-    guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar);
-    guess_dfa johab = DFA_INIT(guess_johab_st, guess_johab_ar);
+    const char *rv = NULL;
+    guess_dfa euck = DFA_INIT(guess_euck_st, guess_euck_ar, "EUC-KR");
+    guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8");
+    guess_dfa johab = DFA_INIT(guess_johab_st, guess_johab_ar, "JOHAB");
     guess_dfa *top = NULL;
 
     guess_dfa *order[] = { ORDER_KR, NULL };
@@ -354,42 +284,19 @@
             }
         }
 
-        if (DFA_ALIVE(euck)) {
-            if (!DFA_ALIVE(johab) && !DFA_ALIVE(utf8))
-                return "EUC-KR";
-            DFA_NEXT(euck, c);
-        }
-        if (DFA_ALIVE(johab)) {
-            if (!DFA_ALIVE(euck) && !DFA_ALIVE(utf8))
-                return "JOHAB";
-            DFA_NEXT(johab, c);
-        }
-        if (DFA_ALIVE(utf8)) {
-            if (!DFA_ALIVE(euck) && !DFA_ALIVE(johab))
-                return "UTF-8";
-            DFA_NEXT(utf8, c);
-        }
+        rv = dfa_process(order, c);
+        if(rv)
+            return rv;
 
-        if (!DFA_ALIVE(euck) && !DFA_ALIVE(johab) && !DFA_ALIVE(utf8)) {
+        if (dfa_none(order)) {
             /* we ran out the possibilities */
             return NULL;
         }
     }
 
-    /* Now, we have ambigous code.  Pick the highest score.  If more than
-       one candidate tie, pick the default encoding. */
-    for (i = 0; order[i] != NULL; i++) {
-        if (order[i]->state >= 0) { //DFA_ALIVE()
-            if (top == NULL || order[i]->score > top->score)
-                top = order[i];
-        }
-    }
-
-    if (top == &euck)
-        return "EUC-KR";
-    if (top == &utf8)
-        return "UTF-8";
-    if (top == &johab)
-        return "JOHAB";
-    return NULL;
+    top = dfa_top(order);
+    if(top)
+        return top->name;
+    else
+        return NULL;
 }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dfa.c	Thu Jun 12 20:20:43 2008 +0900
@@ -0,0 +1,86 @@
+#include "libguess.h"
+#include "dfa.h"
+
+/*
+ * Return TRUE when dfa is still alive and every other DFA in the
+ * NULL-terminated array order[] is dead, FALSE otherwise.
+ */
+boolean
+dfa_alone(guess_dfa *dfa, guess_dfa *order[])
+{
+    int i;
+
+    if (!DFA_ALIVE_P(dfa))
+        return FALSE;
+
+    for (i = 0; order[i] != NULL; i++) {
+        if (order[i] != dfa && DFA_ALIVE_P(order[i]))
+            return FALSE;
+    }
+
+    return TRUE;
+}
+
+/*
+ * Return TRUE when no DFA in the NULL-terminated array order[] is
+ * alive (an empty array also yields TRUE), FALSE otherwise.
+ */
+boolean
+dfa_none(guess_dfa *order[])
+{
+    int i;
+
+    for (i = 0; order[i] != NULL; i++) {
+        if (DFA_ALIVE_P(order[i]))
+            return FALSE;
+    }
+
+    return TRUE;
+}
+
+/*
+ * Return the live DFA with the highest score, or NULL when none is alive.
+ * The comparison is strict, so on a tie the DFA listed first in order[] wins.
+ */
+guess_dfa *
+dfa_top(guess_dfa *order[])
+{
+    int i;
+    guess_dfa *top = NULL;
+
+    for (i = 0; order[i] != NULL; i++) {
+        if (!DFA_ALIVE_P(order[i]))
+            continue;
+        if (top == NULL || order[i]->score > top->score)
+            top = order[i];
+    }
+
+    return top;
+}
+
+/*
+ * Feed the input byte c to every live DFA in the NULL-terminated array
+ * order[], in array order.
+ *
+ * Before a DFA is stepped it is checked with dfa_alone(); if it is the only
+ * candidate left, its name is returned at once without stepping it.
+ * Otherwise NULL is returned, meaning the input is still ambiguous or that
+ * every DFA has died (callers tell the two apart with dfa_none()).
+ */
+const char *
+dfa_process(guess_dfa *order[], int c)
+{
+    int i;
+
+    for (i = 0; order[i] != NULL; i++) {
+        if (DFA_ALIVE_P(order[i])) {
+            if (dfa_alone(order[i], order))
+                return order[i]->name;
+            DFA_NEXT_P(order[i], c);
+        }
+    }
+
+    /* no unique survivor: return NULL explicitly, because callers test the
+       result and falling off the end would leave it indeterminate */
+    return NULL;
+}
--- a/dfa.h	Wed Jun 11 00:11:30 2008 +0900
+++ b/dfa.h	Thu Jun 12 20:20:43 2008 +0900
@@ -1,6 +1,23 @@
 #ifndef __DFA_H__
 #define __DFA_H__
 
+typedef int boolean;
+#define TRUE  1
+#define FALSE 0
+
+/* workaround: glib's g_convert cannot convert properly from UCS-2BE/LE
+   text that follows a BOM, so both byte orders are reported as "UTF-16". */
+#define WITH_G_CONVERT 1
+/* #undef WITH_G_CONVERT */
+
+#ifdef WITH_G_CONVERT
+#define UCS_2BE "UTF-16"
+#define UCS_2LE "UTF-16"
+#else
+#define UCS_2BE "UCS-2BE"
+#define UCS_2LE "UCS-2LE"
+#endif
+
 /* data types */
 typedef struct guess_arc_rec
 {
@@ -14,11 +31,12 @@
     guess_arc *arcs;
     int state;
     double score;
+    char *name;
 } guess_dfa;
 
 /* macros */
-#define DFA_INIT(st, ar) \
-    { st, ar, 0, 1.0 }
+#define DFA_INIT(st, ar, name)                       \
+    { st, ar, 0, 1.0 ,name}
 
 #define DFA_NEXT(dfa, ch)                               \
     do {                                                \
@@ -36,4 +54,37 @@
 
 #define DFA_ALIVE(dfa)  (dfa.state >= 0)
 
+/* Pointer flavour of DFA_NEXT: dfa is a guess_dfa *, ch the input byte used
+   as the column index of the state table.  A live DFA either dies (negative
+   arc) or moves to the arc's next state and multiplies the arc's score into
+   its running score; a dead DFA is left untouched.  Arguments are
+   parenthesized so any pointer expression can be passed; both are
+   evaluated more than once, so they must be free of side effects. */
+#define DFA_NEXT_P(dfa, ch)                                     \
+    do {                                                        \
+        int arc__;                                              \
+        if ((dfa)->state >= 0) {                                \
+            arc__ = (dfa)->states[(dfa)->state][(ch)];          \
+            if (arc__ < 0) {                                    \
+                (dfa)->state = -1;                              \
+            } else {                                            \
+                (dfa)->state = (dfa)->arcs[arc__].next;         \
+                (dfa)->score *= (dfa)->arcs[arc__].score;       \
+            }                                                   \
+        }                                                       \
+    } while (0)
+
+/* Non-zero while the DFA pointed to by dfa has not been killed. */
+#define DFA_ALIVE_P(dfa)  ((dfa)->state >= 0)
+
+/* prototypes (implemented in dfa.c; order[] is always NULL-terminated) */
+/* TRUE when dfa is alive and all other DFAs in order[] are dead */
+boolean dfa_alone(guess_dfa *dfa, guess_dfa *order[]);
+/* TRUE when no DFA in order[] is alive */
+boolean dfa_none(guess_dfa *order[]);
+/* live DFA with the highest score (first listed wins ties), or NULL */
+guess_dfa *dfa_top(guess_dfa *order[]);
+/* step every live DFA on byte c; name of the sole survivor, if any */
+const char *dfa_process(guess_dfa *order[], int c);
+
 #endif
--- a/greek_impl.c	Wed Jun 11 00:11:30 2008 +0900
+++ b/greek_impl.c	Thu Jun 12 20:20:43 2008 +0900
@@ -1,22 +1,58 @@
-static const char *_guess_gr(const unsigned char *ptr, int size)
+#include "libguess.h"
+#include "dfa.h"
+#include "guess_tab.c"
+
+/* precedence order: dfa_top() keeps the first listed DFA on a score tie */
+#define ORDER &utf8, &iso8859_7, &cp1253
+
+/*
+ * Guess the character encoding of the buflen bytes at buf, assuming
+ * Greek text.
+ *
+ * Returns UCS_2LE / UCS_2BE when the buffer opens with the matching BOM,
+ * otherwise the name of the best-scoring surviving candidate ("UTF-8",
+ * "ISO-8859-7" or "CP1253"), or NULL when every candidate rejected the input.
+ */
+const char *guess_gr(const char *buf, int buflen)
 {
     int i;
+    const char *rv = NULL;
+    /* one fresh set of DFAs per call: file-scope statics would carry the
+       state and score reached by the previous buffer into this one */
+    guess_dfa cp1253 = DFA_INIT(guess_cp1253_st, guess_cp1253_ar, "CP1253");
+    guess_dfa iso8859_7 = DFA_INIT(guess_iso8859_7_st, guess_iso8859_7_ar, "ISO-8859-7");
+    guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8");
+    guess_dfa *top = NULL;
+    guess_dfa *order[] = { ORDER, NULL };
 
-    for (i = 0; i < size; i++)
-    {
-        if (ptr[i] == 0x80 ||
-            (ptr[i] >= 0x82 && ptr[i] <= 0x87) ||
-            ptr[i] == 0x89 || ptr[i] == 0x8B ||
-            (ptr[i] >= 0x91 && ptr[i] <= 0x97) ||
-            ptr[i] == 0x99 || ptr[i] == 0x9B || ptr[i] == 0xA4 ||
-            ptr[i] == 0xA5 || ptr[i] == 0xAE)
-            return "CP1253";
+    for (i = 0; i < buflen; i++) {
+        int c = (unsigned char) buf[i];
+
+        /* special treatment of BOM: peek at the second byte through a
+           separate variable so that c still holds the byte fed below */
+        if (i == 0 && buflen > 1) {
+            int next = (unsigned char) buf[1];
+
+            if (c == 0xff && next == 0xfe)
+                return UCS_2LE;
+            if (c == 0xfe && next == 0xff)
+                return UCS_2BE;
+        }
+
+        rv = dfa_process(order, c);
+        if (rv)
+            return rv;
+
+        if (dfa_none(order)) {
+            /* we ran out the possibilities */
+            return NULL;
+        }
     }
 
-    return "ISO-8859-7";
+    /* still ambiguous at end of input: take the best-scoring survivor */
+    top = dfa_top(order);
+    if (top)
+        return top->name;
+    else
+        return NULL;
 }
-
-const char *guess_gr(const char *ptr, int size)
-{
-    return _guess_gr((const unsigned char *) ptr, size);
-}
--- a/guess.scm	Wed Jun 11 00:11:30 2008 +0900
+++ b/guess.scm	Thu Jun 12 20:20:43 2008 +0900
@@ -156,6 +156,8 @@
     (lambda ()
       (print "/* State transition table for character code guessing */")
       (print "/* This file is automatically generated by guess.scm */")
+      (print "")
+      (print "#include \"dfa.h\"")
       (newline)
       (for-each emit-dfa-table (all-dfas))))
   0)
--- a/guess_tab.c	Wed Jun 11 00:11:30 2008 +0900
+++ b/guess_tab.c	Thu Jun 12 20:20:43 2008 +0900
@@ -1,6 +1,8 @@
 /* State transition table for character code guessing */
 /* This file is automatically generated by guess.scm */
 
+#include "dfa.h"
+
 static signed char guess_eucj_st[][256] = {
  { /* state init */
   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
--- a/hebrew_impl.c	Wed Jun 11 00:11:30 2008 +0900
+++ b/hebrew_impl.c	Thu Jun 12 20:20:43 2008 +0900
@@ -1,23 +1,58 @@
-const char *_guess_hw(const unsigned char *ptr, int size)
+#include "libguess.h"
+#include "dfa.h"
+#include "guess_tab.c"
+
+/* precedence order: dfa_top() keeps the first listed DFA on a score tie */
+#define ORDER &utf8, &iso8859_8, &cp1255
+
+/*
+ * Guess the character encoding of the buflen bytes at buf, assuming
+ * Hebrew text.
+ *
+ * Returns UCS_2LE / UCS_2BE when the buffer opens with the matching BOM,
+ * otherwise the name of the best-scoring surviving candidate ("UTF-8",
+ * "ISO-8859-8-I" or "CP1255"), or NULL when every candidate rejected the input.
+ */
+const char *guess_hw(const char *buf, int buflen)
 {
     int i;
+    const char *rv = NULL;
+    /* one fresh set of DFAs per call: file-scope statics would carry the
+       state and score reached by the previous buffer into this one */
+    guess_dfa cp1255 = DFA_INIT(guess_cp1255_st, guess_cp1255_ar, "CP1255");
+    guess_dfa iso8859_8 = DFA_INIT(guess_iso8859_8_st, guess_iso8859_8_ar, "ISO-8859-8-I");
+    guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8");
+    guess_dfa *top = NULL;
+    guess_dfa *order[] = { ORDER, NULL };
 
-    for (i = 0; i < size; i++)
-    {
-        if (ptr[i] == 0x80 || (ptr[i] >= 0x82 && ptr[i] <= 0x89) || ptr[i] == 0x8B ||
-            (ptr[i] >= 0x91 && ptr[i] <= 0x99) || ptr[i] == 0x9B || ptr[i] == 0xA1 ||
-            (ptr[i] >= 0xBF && ptr[i] <= 0xC9) ||
-            (ptr[i] >= 0xCB && ptr[i] <= 0xD8))
-            return "CP1255";
+    for (i = 0; i < buflen; i++) {
+        int c = (unsigned char) buf[i];
 
-        if (ptr[i] == 0xDF)
-            return "ISO-8859-8-I";
+        /* special treatment of BOM: peek at the second byte through a
+           separate variable so that c still holds the byte fed below */
+        if (i == 0 && buflen > 1) {
+            int next = (unsigned char) buf[1];
+
+            if (c == 0xff && next == 0xfe)
+                return UCS_2LE;
+            if (c == 0xfe && next == 0xff)
+                return UCS_2BE;
+        }
+
+        rv = dfa_process(order, c);
+        if (rv)
+            return rv;
+
+        if (dfa_none(order)) {
+            /* we ran out the possibilities */
+            return NULL;
+        }
     }
 
-    return "ISO-8859-8-I";
+    /* still ambiguous at end of input: take the best-scoring survivor */
+    top = dfa_top(order);
+    if (top)
+        return top->name;
+    else
+        return NULL;
 }
-
-const char *guess_hw(const char *ptr, int size)
-{
-    return _guess_hw((const unsigned char *) ptr, size);
-}
--- a/libguess.h	Wed Jun 11 00:11:30 2008 +0900
+++ b/libguess.h	Thu Jun 12 20:20:43 2008 +0900
@@ -5,11 +5,11 @@
 
 /*
  *   Copyright (c) 2000-2003 Shiro Kawai, All rights reserved.
- * 
+ *
  *   Redistribution and use in source and binary forms, with or without
  *   modification, are permitted provided that the following conditions
  *   are met:
- * 
+ *
  *   1. Redistributions of source code must retain the above copyright
  *      notice, this list of conditions and the following disclaimer.
  *
--- a/turkish_impl.c	Wed Jun 11 00:11:30 2008 +0900
+++ b/turkish_impl.c	Thu Jun 12 20:20:43 2008 +0900
@@ -1,25 +1,61 @@
 #include "libguess.h"
+#include "dfa.h"
+#include "guess_tab.c"
+
+/* precedence order: dfa_top() keeps the first listed DFA on a score tie */
+#define ORDER &utf8, &iso8859_9, &cp1254
 
-static const char *_guess_tr(const unsigned char *ptr, int size)
+/*
+ * Guess the character encoding of the buflen bytes at buf, assuming
+ * Turkish text.
+ *
+ * Returns UCS_2LE / UCS_2BE when the buffer opens with the matching BOM,
+ * otherwise the name of the best-scoring surviving candidate ("UTF-8",
+ * "ISO-8859-9" or "CP1254"), or NULL when every candidate rejected the input.
+ */
+const char *guess_tr(const char *buf, int buflen)
 {
     int i;
+    const char *rv = NULL;
+    /* one fresh set of DFAs per call: file-scope statics would carry the
+       state and score reached by the previous buffer into this one */
+    /* NOTE(review): the CP1254 candidate is driven by the CP1253 (Greek)
+       tables; every other candidate uses tables matching its own name.
+       Confirm whether guess_cp1254_st/_ar exist and should be used here. */
+    guess_dfa cp1254 = DFA_INIT(guess_cp1253_st, guess_cp1253_ar, "CP1254");
+    guess_dfa iso8859_9 = DFA_INIT(guess_iso8859_9_st, guess_iso8859_9_ar, "ISO-8859-9");
+    guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8");
+    guess_dfa *top = NULL;
+    guess_dfa *order[] = { ORDER, NULL };
 
-    for (i = 0; i < size; i++)
-    {
-        if (ptr[i] == 0x80 ||
-            (ptr[i] >= 0x82 && ptr[i] <= 0x8C) ||
-            (ptr[i] >= 0x91 && ptr[i] <= 0x9C) ||
-            ptr[ i ] == 0x9F)
-            return "CP1254";
+    for (i = 0; i < buflen; i++) {
+        int c = (unsigned char) buf[i];
+
+        /* special treatment of BOM: peek at the second byte through a
+           separate variable so that c still holds the byte fed below */
+        if (i == 0 && buflen > 1) {
+            int next = (unsigned char) buf[1];
+
+            if (c == 0xff && next == 0xfe)
+                return UCS_2LE;
+            if (c == 0xfe && next == 0xff)
+                return UCS_2BE;
+        }
+
+        rv = dfa_process(order, c);
+        if (rv)
+            return rv;
+
+        if (dfa_none(order)) {
+            /* we ran out the possibilities */
+            return NULL;
+        }
     }
 
-    return "ISO-8859-9";
+    /* still ambiguous at end of input: take the best-scoring survivor */
+    top = dfa_top(order);
+    if (top)
+        return top->name;
+    else
+        return NULL;
 }
-
-const char *guess_tr(const char *ptr, int size)
-{
-    if (dfa_validate_utf8(ptr, size))
-        return "UTF-8";
-
-    return _guess_tr((const unsigned char *)ptr, size);
-}