diff cjk_impl.c @ 3:70e2c306231e

- Implemented DFA utility functions. - Added dfa.c. - Rewrote the guess functions for the ar, gr, hw and tr scripts using the DFA utilities. - Rewrote the guess functions for the CJK scripts in the same way.
author Yoshiki Yazawa <yaz@cc.rim.or.jp>
date Thu, 12 Jun 2008 20:20:43 +0900
parents 754a4550c64e
children
line wrap: on
line diff
--- a/cjk_impl.c	Wed Jun 11 00:11:30 2008 +0900
+++ b/cjk_impl.c	Thu Jun 12 20:20:43 2008 +0900
@@ -40,18 +40,7 @@
 #include "libguess.h"
 #include "dfa.h"
 
-/* workaround for that glib's g_convert can't convert
-   properly from UCS-2BE/LE trailing after BOM. */
-#define WITH_G_CONVERT 1
-/* #undef WITH_G_CONVERT */
-
-#ifdef WITH_G_CONVERT
-const char UCS_2BE[] = "UTF-16";
-const char UCS_2LE[] = "UTF-16";
-#else
-const char UCS_2BE[] = "UCS-2BE";
-const char UCS_2LE[] = "UCS-2LE";
-#endif
+#include <stdio.h>
 
 /* take precedence if scores are same. you can customize the order as: */
 /* ORDER_** &highest, &second, ... &lowest */
@@ -63,11 +52,10 @@
 /* include DFA table generated by guess.scm */
 #include "guess_tab.c"
 
-
 int dfa_validate_utf8(const char *buf, int buflen)
 {
     int i;
-    guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar);
+    guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8");
 
     for (i = 0; i < buflen; i++) {
         int c = (unsigned char) buf[i];
@@ -89,9 +77,10 @@
 const char *guess_jp(const char *buf, int buflen)
 {
     int i;
-    guess_dfa eucj = DFA_INIT(guess_eucj_st, guess_eucj_ar);
-    guess_dfa sjis = DFA_INIT(guess_sjis_st, guess_sjis_ar);
-    guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar);
+    const char *rv = NULL;
+    guess_dfa eucj = DFA_INIT(guess_eucj_st, guess_eucj_ar, "EUC-JP");
+    guess_dfa sjis = DFA_INIT(guess_sjis_st, guess_sjis_ar, "SJIS");
+    guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8");
     guess_dfa *top = NULL;
 
     guess_dfa *order[] = { ORDER_JP, NULL };
@@ -124,51 +113,29 @@
             }
         }
 
-        if (DFA_ALIVE(eucj)) {
-            if (!DFA_ALIVE(sjis) && !DFA_ALIVE(utf8))
-                return "EUC-JP";
-            DFA_NEXT(eucj, c);
-        }
-        if (DFA_ALIVE(sjis)) {
-            if (!DFA_ALIVE(eucj) && !DFA_ALIVE(utf8))
-                return "SJIS";
-            DFA_NEXT(sjis, c);
-        }
-        if (DFA_ALIVE(utf8)) {
-            if (!DFA_ALIVE(sjis) && !DFA_ALIVE(eucj))
-                return "UTF-8";
-            DFA_NEXT(utf8, c);
-        }
+        rv = dfa_process(order, c);
+        if(rv)
+            return rv;
 
-        if (!DFA_ALIVE(eucj) && !DFA_ALIVE(sjis) && !DFA_ALIVE(utf8)) {
+        if (dfa_none(order)) {
             /* we ran out the possibilities */
             return NULL;
         }
     }
 
-    /* Now, we have ambigous code.  Pick the highest score.  If more than
-       one candidate tie, pick the default encoding. */
-    for (i = 0; order[i] != NULL; i++) {
-        if (order[i]->state >= 0) { //DFA_ALIVE()
-            if (top == NULL || order[i]->score > top->score)
-                top = order[i];
-        }
-    }
-
-    if (top == &eucj)
-        return "EUC-JP";
-    if (top == &utf8)
-        return "UTF-8";
-    if (top == &sjis)
-        return "SJIS";
-    return NULL;
+    top = dfa_top(order);
+    if(top)
+        return top->name;
+    else
+        return NULL;
 }
 
 const char *guess_tw(const char *buf, int buflen)
 {
     int i;
-    guess_dfa big5 = DFA_INIT(guess_big5_st, guess_big5_ar);
-    guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar);
+    const char *rv = NULL;
+    guess_dfa big5 = DFA_INIT(guess_big5_st, guess_big5_ar, "BIG5");
+    guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8");
     guess_dfa *top = NULL;
 
     guess_dfa *order[] = { ORDER_TW, NULL };
@@ -201,45 +168,30 @@
             }
         }
 
-        if (DFA_ALIVE(big5)) {
-            if (!DFA_ALIVE(utf8))
-                return "BIG5";
-            DFA_NEXT(big5, c);
-        }
-        if (DFA_ALIVE(utf8)) {
-            if (!DFA_ALIVE(big5))
-                return "UTF-8";
-            DFA_NEXT(utf8, c);
-        }
+        rv = dfa_process(order, c);
+        if(rv)
+            return rv;
 
-        if (!DFA_ALIVE(big5) && !DFA_ALIVE(utf8)) {
+        if (dfa_none(order)) {
             /* we ran out the possibilities */
             return NULL;
         }
     }
 
-    /* Now, we have ambigous code.  Pick the highest score.  If more than
-       one candidate tie, pick the default encoding. */
-    for (i = 0; order[i] != NULL; i++) {
-        if (order[i]->state >= 0) { //DFA_ALIVE()
-            if (top == NULL || order[i]->score > top->score)
-                top = order[i];
-        }
-    }
-
-    if (top == &big5)
-        return "BIG5";
-    if (top == &utf8)
-        return "UTF-8";
-    return NULL;
+    top = dfa_top(order);
+    if (top)
+        return top->name;
+    else
+        return NULL;
 }
 
 const char *guess_cn(const char *buf, int buflen)
 {
     int i;
-    guess_dfa gb2312 = DFA_INIT(guess_gb2312_st, guess_gb2312_ar);
-    guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar);
-    guess_dfa gb18030 = DFA_INIT(guess_gb18030_st, guess_gb18030_ar);
+    const char *rv = NULL;
+    guess_dfa gb2312 = DFA_INIT(guess_gb2312_st, guess_gb2312_ar, "GB2312");
+    guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8");
+    guess_dfa gb18030 = DFA_INIT(guess_gb18030_st, guess_gb18030_ar, "GB18030");
     guess_dfa *top = NULL;
 
     guess_dfa *order[] = { ORDER_CN, NULL };
@@ -274,52 +226,30 @@
             }
         }
 
-        if (DFA_ALIVE(gb2312)) {
-            if (!DFA_ALIVE(utf8) && !DFA_ALIVE(gb18030))
-                return "GB2312";
-            DFA_NEXT(gb2312, c);
-        }
-        if (DFA_ALIVE(utf8)) {
-            if (!DFA_ALIVE(gb2312) && !DFA_ALIVE(gb18030))
-                return "UTF-8";
-            DFA_NEXT(utf8, c);
-        }
-        if (DFA_ALIVE(gb18030)) {
-            if (!DFA_ALIVE(utf8) && !DFA_ALIVE(gb2312))
-                return "GB18030";
-            DFA_NEXT(gb18030, c);
-        }
+        rv = dfa_process(order, c);
+        if(rv)
+            return rv;
 
-        if (!DFA_ALIVE(gb2312) && !DFA_ALIVE(utf8) && !DFA_ALIVE(gb18030)) {
+        if (dfa_none(order)) {
             /* we ran out the possibilities */
             return NULL;
         }
     }
 
-    /* Now, we have ambigous code.  Pick the highest score.  If more than
-       one candidate tie, pick the default encoding. */
-    for (i = 0; order[i] != NULL; i++) {
-        if (order[i]->state >= 0) { //DFA_ALIVE()
-            if (top == NULL || order[i]->score > top->score)
-                top = order[i];
-        }
-    }
-
-    if (top == &gb2312)
-        return "GB2312";
-    if (top == &utf8)
-        return "UTF-8";
-    if (top == &gb18030)
-        return "GB18030";
-    return NULL;
+    top = dfa_top(order);
+    if(top)
+        return top->name;
+    else
+        return NULL;
 }
 
 const char *guess_kr(const char *buf, int buflen)
 {
     int i;
-    guess_dfa euck = DFA_INIT(guess_euck_st, guess_euck_ar);
-    guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar);
-    guess_dfa johab = DFA_INIT(guess_johab_st, guess_johab_ar);
+    const char *rv = NULL;
+    guess_dfa euck = DFA_INIT(guess_euck_st, guess_euck_ar, "EUC-KR");
+    guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8");
+    guess_dfa johab = DFA_INIT(guess_johab_st, guess_johab_ar, "JOHAB");
     guess_dfa *top = NULL;
 
     guess_dfa *order[] = { ORDER_KR, NULL };
@@ -354,42 +284,19 @@
             }
         }
 
-        if (DFA_ALIVE(euck)) {
-            if (!DFA_ALIVE(johab) && !DFA_ALIVE(utf8))
-                return "EUC-KR";
-            DFA_NEXT(euck, c);
-        }
-        if (DFA_ALIVE(johab)) {
-            if (!DFA_ALIVE(euck) && !DFA_ALIVE(utf8))
-                return "JOHAB";
-            DFA_NEXT(johab, c);
-        }
-        if (DFA_ALIVE(utf8)) {
-            if (!DFA_ALIVE(euck) && !DFA_ALIVE(johab))
-                return "UTF-8";
-            DFA_NEXT(utf8, c);
-        }
+        rv = dfa_process(order, c);
+        if(rv)
+            return rv;
 
-        if (!DFA_ALIVE(euck) && !DFA_ALIVE(johab) && !DFA_ALIVE(utf8)) {
+        if (dfa_none(order)) {
             /* we ran out the possibilities */
             return NULL;
         }
     }
 
-    /* Now, we have ambigous code.  Pick the highest score.  If more than
-       one candidate tie, pick the default encoding. */
-    for (i = 0; order[i] != NULL; i++) {
-        if (order[i]->state >= 0) { //DFA_ALIVE()
-            if (top == NULL || order[i]->score > top->score)
-                top = order[i];
-        }
-    }
-
-    if (top == &euck)
-        return "EUC-KR";
-    if (top == &utf8)
-        return "UTF-8";
-    if (top == &johab)
-        return "JOHAB";
-    return NULL;
+    top = dfa_top(order);
+    if(top)
+        return top->name;
+    else
+        return NULL;
 }