comparison src/libguess/guess.c @ 2559:b474ecb5bde4 trunk

[svn] revise str_to_utf8(): - new utf8 validator using libguess DFA has been implemented. str_to_utf8() tries utf8 validation first. - default conversion from ISO-8859-1 is enabled regardless of chardet. - libguess and librcd are always compiled in. - some libguess cleanups.
author yaz
date Wed, 21 Feb 2007 04:25:12 -0800
parents 3149d4b1a9a9
children 37c7a3dbb212
comparison
equal deleted inserted replaced
2558:d4ecf0a91222 2559:b474ecb5bde4
96 #define DFA_ALIVE(dfa) (dfa.state >= 0) 96 #define DFA_ALIVE(dfa) (dfa.state >= 0)
97 97
98 /* include DFA table generated by guess.scm */ 98 /* include DFA table generated by guess.scm */
99 #include "guess_tab.c" 99 #include "guess_tab.c"
100 100
101
102 int dfa_validate_utf8(const char *buf, int buflen)
103 {
104 int i;
105 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar);
106
107 for (i = 0; i < buflen; i++) {
108 int c = (unsigned char) buf[i];
109
110 if (DFA_ALIVE(utf8))
111 DFA_NEXT(utf8, c);
112 else
113 return 0;
114 }
115 return 1;
116 }
117
101 const char *guess_jp(const char *buf, int buflen) 118 const char *guess_jp(const char *buf, int buflen)
102 { 119 {
103 int i; 120 int i;
104 guess_dfa eucj = DFA_INIT(guess_eucj_st, guess_eucj_ar); 121 guess_dfa eucj = DFA_INIT(guess_eucj_st, guess_eucj_ar);
105 guess_dfa sjis = DFA_INIT(guess_sjis_st, guess_sjis_ar); 122 guess_dfa sjis = DFA_INIT(guess_sjis_st, guess_sjis_ar);