diff src/libguess/guess.c @ 2559:b474ecb5bde4 trunk

[svn] revise str_to_utf8(): - new utf8 validator using libguess DFA has been implemented. str_to_utf8() tries utf8 validation first. - default conversion from ISO-8859-1 is enabled regardless of chardet. - libguess and librcd are always compiled in. - some libguess cleanups.
author yaz
date Wed, 21 Feb 2007 04:25:12 -0800
parents 3149d4b1a9a9
children 37c7a3dbb212
line wrap: on
line diff
--- a/src/libguess/guess.c	Wed Feb 21 03:52:52 2007 -0800
+++ b/src/libguess/guess.c	Wed Feb 21 04:25:12 2007 -0800
@@ -98,6 +98,49 @@
 /* include DFA table generated by guess.scm */
 #include "guess_tab.c"
 
+
+/*
+ * Validate a buffer as UTF-8 by running it through the DFA initialised
+ * from guess_utf8_st / guess_utf8_ar.
+ *
+ * buf:    bytes to examine; only buflen bytes are read, so no NUL
+ *         terminator is needed.
+ * buflen: number of bytes in buf; zero or negative reads nothing.
+ *
+ * Returns 1 if the DFA is still alive once every byte has been consumed,
+ * 0 otherwise.
+ *
+ * NOTE(review): a multi-byte sequence cut off at the end of the buffer
+ * presumably leaves the DFA alive in a non-initial state and would still
+ * be accepted -- confirm against the generated tables.
+ */
+int dfa_validate_utf8(const char *buf, int buflen)
+{
+    int i;
+    guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar);
+
+    for (i = 0; i < buflen; i++) {
+        int c = (unsigned char) buf[i];
+
+        if (DFA_ALIVE(utf8)) {
+            DFA_NEXT(utf8, c);
+        } else {
+            return 0;
+        }
+    }
+
+    /*
+     * The loop tests liveness only before feeding a byte, so the
+     * transition taken on the last byte is still unchecked here.  Without
+     * this test a buffer whose final byte kills the DFA (for example a
+     * one-byte buffer holding an invalid byte) would be reported as valid.
+     */
+    if (DFA_ALIVE(utf8)) {
+        return 1;
+    }
+    return 0;
+}
+
 const char *guess_jp(const char *buf, int buflen)
 {
     int i;