Mercurial > libguess
changeset 2:754a4550c64e
- added Arabic, Greek, Hebrew and Turkish DFAs
- new UCS-2LE/BE DFAs
- arabic_impl.c now uses the Arabic DFAs
- common DFA macros have been moved to dfa.h
- minor cleanups
author | Yoshiki Yazawa <yaz@cc.rim.or.jp> |
---|---|
date | Wed, 11 Jun 2008 00:11:30 +0900 |
parents | 04f2be1c8464 |
children | 70e2c306231e |
files | Makefile arabic_impl.c cjk_impl.c dfa.h guess.scm guess_tab.c russian_impl.c turkish_impl.c |
diffstat | 8 files changed, 781 insertions(+), 143 deletions(-) [+] |
line wrap: on
line diff
--- a/Makefile Sat Dec 01 03:27:31 2007 +0900 +++ b/Makefile Wed Jun 11 00:11:30 2008 +0900 @@ -55,6 +55,8 @@ clean: rm -f $(LIBS) $(OBJS) test +mostlyclean: clean + rm -f guess_tab.c + distclean: clean rm -f *~ core* -
--- a/arabic_impl.c Sat Dec 01 03:27:31 2007 +0900 +++ b/arabic_impl.c Wed Jun 11 00:11:30 2008 +0900 @@ -1,28 +1,58 @@ #include "libguess.h" +#include "dfa.h" +#include "guess_tab.c" -static const char *_guess_ar(const unsigned char *ptr, int size) +#define ORDER_AR &utf8, &iso8859_6, &cp1256 + +const char *guess_ar(const char *buf, int buflen) { int i; + guess_dfa cp1256 = DFA_INIT(guess_cp1256_st, guess_cp1256_ar); + guess_dfa iso8859_6 = DFA_INIT(guess_iso8859_6_st, guess_iso8859_6_ar); + guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar); + guess_dfa *top = NULL; - for (i = 0; i < size; i++) - { - if ((ptr[i] >= 0x80 && ptr[i] <= 0x9F) || - ptr[i] == 0xA1 || ptr[i] == 0xA2 || ptr[i] == 0xA3 || - (ptr[i] >= 0xA5 && ptr[i] <= 0xAB) || - (ptr[i] >= 0xAE && ptr[i] <= 0xBA) || - ptr[i] == 0xBC || ptr[i] == 0xBD || - ptr[i] == 0xBE || ptr[i] == 0xC0 || - (ptr[i] >= 0xDB && ptr[i] <= 0xDF) || (ptr[i] >= 0xF3)) - return "CP1256"; + guess_dfa *order[] = { ORDER_AR, NULL }; + + for (i = 0; i < buflen; i++) { + int c = (unsigned char) buf[i]; + + if (DFA_ALIVE(cp1256)) { + if (!DFA_ALIVE(iso8859_6) && !DFA_ALIVE(utf8)) + return "CP1256"; + DFA_NEXT(cp1256, c); + } + if (DFA_ALIVE(iso8859_6)) { + if (!DFA_ALIVE(cp1256) && !DFA_ALIVE(utf8)) + return "ISO-8859-6"; + DFA_NEXT(iso8859_6, c); + } + if (DFA_ALIVE(utf8)) { + if (!DFA_ALIVE(cp1256) && !DFA_ALIVE(iso8859_6)) + return "UTF-8"; + DFA_NEXT(utf8, c); + } + + if (!DFA_ALIVE(cp1256) && !DFA_ALIVE(iso8859_6) && !DFA_ALIVE(utf8)) { + /* we ran out of possibilities */ + return NULL; + } } - return "ISO-8859-6"; -} + /* Now, we have ambiguous code. Pick the highest score. If more than + one candidate ties, pick the default encoding. 
*/ + for (i = 0; order[i] != NULL; i++) { + if (order[i]->state >= 0) { //DFA_ALIVE() + if (top == NULL || order[i]->score > top->score) + top = order[i]; + } + } -const char *guess_ar(const char *ptr, int size) -{ - if (dfa_validate_utf8(ptr, size)) + if (top == &cp1256) + return "CP1256"; + if (top == &utf8) return "UTF-8"; - - return _guess_ar((const unsigned char *)ptr, size); + if (top == &iso8859_6) + return "ISO-8859-6"; + return NULL; }
--- a/cjk_impl.c Sat Dec 01 03:27:31 2007 +0900 +++ b/cjk_impl.c Wed Jun 11 00:11:30 2008 +0900 @@ -4,14 +4,14 @@ */ /* - * guess.c - guessing character encoding + * guess.c - guessing character encoding * * Copyright (c) 2000-2003 Shiro Kawai, All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: - * + * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * @@ -38,13 +38,7 @@ */ #include "libguess.h" - -/* take precedence if scores are same. you can customize the order as: */ -/* ORDER_** &highest, &second, ... &lowest */ -#define ORDER_JP &utf8, &sjis, &eucj -#define ORDER_TW &utf8, &big5 -#define ORDER_CN &utf8, &gb2312, &gb18030 -#define ORDER_KR &utf8, &euck, &johab +#include "dfa.h" /* workaround for that glib's g_convert can't convert properly from UCS-2BE/LE trailing after BOM. */ @@ -59,40 +53,12 @@ const char UCS_2LE[] = "UCS-2LE"; #endif -/* data types */ -typedef struct guess_arc_rec -{ - unsigned int next; /* next state */ - double score; /* score */ -} guess_arc; - -typedef struct guess_dfa_rec -{ - signed char (*states)[256]; - guess_arc *arcs; - int state; - double score; -} guess_dfa; - -/* macros */ -#define DFA_INIT(st, ar) \ - { st, ar, 0, 1.0 } - -#define DFA_NEXT(dfa, ch) \ - do { \ - int arc__; \ - if (dfa.state >= 0) { \ - arc__ = dfa.states[dfa.state][ch]; \ - if (arc__ < 0) { \ - dfa.state = -1; \ - } else { \ - dfa.state = dfa.arcs[arc__].next; \ - dfa.score *= dfa.arcs[arc__].score; \ - } \ - } \ - } while (0) - -#define DFA_ALIVE(dfa) (dfa.state >= 0) +/* take precedence if scores are same. you can customize the order as: */ +/* ORDER_** &highest, &second, ... 
&lowest */ +#define ORDER_JP &utf8, &sjis, &eucj +#define ORDER_TW &utf8, &big5 +#define ORDER_CN &utf8, &gb2312, &gb18030 +#define ORDER_KR &utf8, &euck, &johab /* include DFA table generated by guess.scm */ #include "guess_tab.c" @@ -116,7 +82,7 @@ if(DFA_ALIVE(utf8)) return 1; - else + else return 0; }
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dfa.h Wed Jun 11 00:11:30 2008 +0900 @@ -0,0 +1,39 @@ +#ifndef __DFA_H__ +#define __DFA_H__ + +/* data types */ +typedef struct guess_arc_rec +{ + unsigned int next; /* next state */ + double score; /* score */ +} guess_arc; + +typedef struct guess_dfa_rec +{ + signed char (*states)[256]; + guess_arc *arcs; + int state; + double score; +} guess_dfa; + +/* macros */ +#define DFA_INIT(st, ar) \ + { st, ar, 0, 1.0 } + +#define DFA_NEXT(dfa, ch) \ + do { \ + int arc__; \ + if (dfa.state >= 0) { \ + arc__ = dfa.states[dfa.state][ch]; \ + if (arc__ < 0) { \ + dfa.state = -1; \ + } else { \ + dfa.state = dfa.arcs[arc__].next; \ + dfa.score *= dfa.arcs[arc__].score; \ + } \ + } \ + } while (0) + +#define DFA_ALIVE(dfa) (dfa.state >= 0) + +#endif
--- a/guess.scm Sat Dec 01 03:27:31 2007 +0900 +++ b/guess.scm Wed Jun 11 00:11:30 2008 +0900 @@ -5,24 +5,24 @@ ;;; ;;; Auxiliary script to generate japanese code guessing table -;;; +;;; ;;; Copyright (c) 2000-2003 Shiro Kawai, All rights reserved. -;;; +;;; ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions ;;; are met: -;;; +;;; ;;; 1. Redistributions of source code must retain the above copyright ;;; notice, this list of conditions and the following disclaimer. -;;; +;;; ;;; 2. Redistributions in binary form must reproduce the above copyright ;;; notice, this list of conditions and the following disclaimer in the ;;; documentation and/or other materials provided with the distribution. -;;; +;;; ;;; 3. Neither the name of the authors nor the names of its contributors ;;; may be used to endorse or promote products derived from this ;;; software without specific prior written permission. -;;; +;;; ;;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ;;; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ;;; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -34,7 +34,7 @@ ;;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-;;; +;;; ;;; $Id: guess.scm,v 1.3 2003/07/05 03:29:10 shirok Exp $ ;;; @@ -174,19 +174,16 @@ (((#x00 #x7f)) init 1.0) ; ASCII range ((#x8e) jis0201_kana 0.8) ; JISX 0201 kana ((#x8f) jis0213_2 0.95) ; JISX 0213 plane 2 - (((#xa1 #xfe)) jis0213_1 1.0) ; JISX 0213 plane 1 - ) + (((#xa1 #xfe)) jis0213_1 1.0)) ; JISX 0213 plane 1 ;; jis x 0201 kana (jis0201_kana - (((#xa1 #xdf)) init 1.0) - ) + (((#xa1 #xdf)) init 1.0)) ;; jis x 0208 and jis x 0213 plane 1 (jis0213_1 (((#xa1 #xfe)) init 1.0)) ;; jis x 0213 plane 2 (jis0213_2 - (((#xa1 #xfe)) init 1.0)) - ) + (((#xa1 #xfe)) init 1.0))) ;;; ;;; Shift_JIS @@ -201,8 +198,7 @@ (((#xf0 #xfc)) jis0213 0.95) ;jisx0213 plane 2 (((#xfd #xff)) init 0.8)) ;vendor extension (jis0213 - (((#x40 #x7e) (#x80 #xfc)) init 1.0)) - ) + (((#x40 #x7e) (#x80 #xfc)) init 1.0))) ;;; ;;; UTF-8 @@ -225,42 +221,57 @@ (4byte_more (((#x80 #xbf)) 3byte_more 1.0)) (5byte_more - (((#x80 #xbf)) 4byte_more 1.0)) - ) + (((#x80 #xbf)) 4byte_more 1.0))) ;;; ;;; UCS-2LE ;;; -; (define-dfa ucs2le -; (init -; ((#xff) le 1.0) -; (((#x00 #x7f)) ascii 1.0) -; (((#x00 #xff)) multi 1.0)) -; (le -; ((#xfe) init 1.0)) -; (ascii -; ((#x00) init 1.0)) -; (multi -; (((#x00 #xff)) init 1.0))) +(define-dfa ucs2le + (init + ((#xfe) bom-be 1.0) + ((#xff) bom-le 1.0) + (((#x00 #xfd)) byte2 1.0)) + (bom-le + (((#x00 #xff)) init 1.0)) + (bom-be + (((#x00 #xfe)) init 1.0)) ;; if be (0xfeff), die. + (byte2 + (((#x00 #xff)) init 1.0))) ;;; ;;; UCS-2BE ;;; -; (define-dfa ucs2be -; (init -; ((#xfe) be 1.0) -; ((#x00) ascii 1.0) -; (((#x00 #xff)) multi 1.0)) -; (be -; ((#xff) init 1.0)) -; (ascii -; (((#x00 #x7f)) init 1.0)) -; (multi -; (((#x00 #xff)) init 1.0))) - +(define-dfa ucs2be + (init + ((#xfe) bom-be 1.0) + ((#xff) bom-le 1.0) + (((#x00 #xfd)) byte2 1.0)) + (bom-le + (((#x00 #xfd)) init 1.0) + ((#xff) init 1.0)) ;; if le (0xfffe), die. 
+ (bom-be + (((#x00 #xff)) init 1.0)) + (byte2 + (((#x00 #xff)) init 1.0))) ;;; -;;; JIS (ISO2022JP) +;;; UTF-16 +;;; +(define-dfa utf16 + (init + ((#xfe) bom-be 1.0) + ((#xff) bom-le 1.0)) + (init1 + (((#x00 #xff)) byte2 1.0)) + (bom-be + ((#xff) init1 1.0)) + (bom-le + ((#xfe) init1 1.0)) + (byte2 + (((#x00 #xff)) init1 1.0))) + +;;; +;;; ISO2022JP (JIS) ;;; ;; NB: for now, we just check the sequence of <ESC> $ or <ESC> '('. @@ -269,22 +280,18 @@ ((#x1b) esc 1.0) (((#x00 #x1a) (#x1c #x1f)) init 1.0) ;C0 (((#x20 #x7f)) init 1.0) ;ASCII - (((#xa1 #xdf)) init 0.7) ;JIS8bit kana - ) + (((#xa1 #xdf)) init 0.7)) ;JIS8bit kana (esc ((#x0d #x0a) init 0.9) ;cancel ((#\( ) esc-paren 1.0) ((#\$ ) esc-$ 1.0) - ((#\& ) esc-& 1.0) - ) + ((#\& ) esc-& 1.0)) (esc-paren ((#\B #\J #\H) init 1.0) - ((#\I) jis0201kana 0.8) - ) + ((#\I) jis0201kana 0.8)) (esc-$ ((#\@ #\B) kanji 1.0) - ((#\( ) esc-$-paren 1.0) - ) + ((#\( ) esc-$-paren 1.0)) (esc-$-paren ((#\D #\O #\P) kanji 1.0)) (esc-& @@ -296,8 +303,7 @@ ((#x1b) esc 1.0) (((#x21 #x7e)) kanji-2 1.0)) (kanji-2 - (((#x21 #x7e)) kanji 1.0)) - ) + (((#x21 #x7e)) kanji 1.0)) ) ;;; ;;; Big5 @@ -306,12 +312,10 @@ (define-dfa big5 ;; first byte (init - (((#x00 #x7f)) init 1.0) ;ascii - (((#xa1 #xfe)) 2byte 1.0) ;big5-2byte - ) + (((#x00 #x7f)) init 1.0) ;ascii + (((#xa1 #xfe)) 2byte 1.0)) ;big5-2byte (2byte - (((#x40 #x7e) (#xa1 #xfe)) init 1.0)) - ) + (((#x40 #x7e) (#xa1 #xfe)) init 1.0))) ;;; ;;; GB2312 (EUC-CN?) 
@@ -320,12 +324,10 @@ (define-dfa gb2312 ;; first byte (init - (((#x00 #x7f)) init 1.0) ;ascii - (((#xa1 #xfe)) 2byte 1.0) ;gb2312 2byte - ) + (((#x00 #x7f)) init 1.0) ;ascii + (((#xa1 #xfe)) 2byte 1.0)) ;gb2312 2byte (2byte - (((#xa1 #xfe)) init 1.0)) - ) + (((#xa1 #xfe)) init 1.0))) ;;; ;;; GB18030 @@ -336,8 +338,7 @@ (init (((#x00 #x80)) init 1.0) ;ascii (((#x81 #xfe)) 2byte 1.0) ;gb18030 2byte - (((#x81 #xfe)) 4byte2 1.0) ;gb18030 2byte - ) + (((#x81 #xfe)) 4byte2 1.0)) ;gb18030 2byte (2byte (((#x40 #x7e) (#x80 #xfe)) init 1.0)) (4byte2 @@ -345,8 +346,7 @@ (4byte3 (((#x81 #xfe)) 4byte4 1.0)) (4byte4 - (((#x30 #x39)) init 1.0)) - ) + (((#x30 #x39)) init 1.0)) ) ;;; ;;; EUC-KR @@ -356,12 +356,10 @@ ;; first byte (init (((#x00 #x7f)) init 1.0) ; ASCII range - (((#xa1 #xfe)) ks1001 1.0) ; KSX 1001 - ) + (((#xa1 #xfe)) ks1001 1.0)) ; KSX 1001 ;; ks x 1001 (ks1001 - (((#xa1 #xfe)) init 1.0)) - ) + (((#xa1 #xfe)) init 1.0))) ;;; ;;; Johab @@ -372,12 +370,104 @@ (init (((#x00 #x7f)) init 1.0) ; ASCII range (((#x84 #xd3)) jamo51 1.0) ; jamo51 - (((#xd8 #xde) (#xe0 #xf9)) jamo42 0.95) ; jamo42 - ) + (((#xd8 #xde) (#xe0 #xf9)) jamo42 0.95)) ; jamo42 ;; second byte (jamo51 (((#x41 #x7e) (#x81 #xfe)) init 1.0)) (jamo42 - (((#x31 #x7e) (#x91 #xfe)) init 1.0)) - ) + (((#x31 #x7e) (#x91 #xfe)) init 1.0))) + + + + + + +;;; +;;; arabic +;;; + +(define-dfa iso8859_6 + (init + (((#x00 #x7f)) init 1.0) ;ascii + ((#xa0) init 1.0) + ((#xa4) init 1.0) + ((#xac) init 1.0) + ((#xad) init 1.0) ;SHY xxx + ((#xbb) init 1.0) + ((#xbf) init 1.0) + (((#xc1 #xda)) init 1.0) + (((#xe0 #xf2)) init 1.0))) + +(define-dfa cp1256 + (init + (((#x00 #x7f)) init 1.0) ;ascii + (((#x80 #xff)) init 1.0))) ;high bit + + +;;; +;;; greek +;;; + +(define-dfa iso8859_7 + (init + (((#x00 #x7f)) init 1.0) ;ascii + (((#xa0 #xad)) init 1.0) + (((#xaf #xd1)) init 1.0) + (((#xd3 #xfe)) init 1.0))) +(define-dfa cp1253 + (init + (((#x00 #x7f)) init 1.0) ;ascii + ((#x80) init 1.0) + (((#x82 #x87)) init 1.0) + ((#x89) 
init 1.0) + ((#x8b) init 1.0) + (((#x91 #x97)) init 1.0) + ((#x99) init 1.0) + ((#x9b) init 1.0) + (((#xa0 #xa9)) init 1.0) + (((#xab #xd1)) init 1.0) + (((#xd3 #xfe)) init 1.0))) + +;;; +;;; hebrew +;;; + +(define-dfa iso8859_8 + (init + (((#x00 #x7f)) init 1.0) ;ascii + ((#xa0) init 1.0) + (((#xa2 #xbe)) init 1.0) + (((#xdf #xfa)) init 1.0) + (((#xfd #xfe)) init 1.0))) + +(define-dfa cp1255 + (init + (((#x00 #x7f)) init 1.0) ;ascii + ((#x80) init 1.0) + (((#x82 #x89)) init 1.0) + ((#x8b) init 1.0) + (((#x91 #x99)) init 1.0) + ((#x9b) init 1.0) + (((#xa0 #xc9)) init 1.0) + (((#xcb #xd8)) init 1.0) + (((#xe0 #xfa)) init 1.0) + (((#xfd #xfe)) init 1.0))) + +;;; +;;; turkish +;;; + +(define-dfa iso8859_9 + (init + (((#x00 #x7f)) init 1.0) ;ascii + (((#xa0 #xff)) init 1.0))) + +(define-dfa cp1254 + (init + (((#x00 #x7f)) init 1.0) ;ascii + ((#x80) init 1.0) + (((#x82 #x8c)) init 1.0) + (((#x91 #x9c)) init 1.0) + (((#x9f #xff)) init 1.0))) +
--- a/guess_tab.c Sat Dec 01 03:27:31 2007 +0900 +++ b/guess_tab.c Wed Jun 11 00:11:30 2008 +0900 @@ -259,6 +259,277 @@ { 4, 1.0 }, /* 5byte_more -> 4byte_more */ }; +static signed char guess_ucs2le_st[][256] = { + { /* state init */ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1, + }, + { /* state bom-le */ + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + }, + { /* state bom-be */ + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, -1, + }, + { /* state byte2 */ + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + }, +}; + +static guess_arc guess_ucs2le_ar[] = { + { 2, 1.0 }, /* init -> bom-be */ + { 1, 1.0 }, /* init -> bom-le */ + { 3, 1.0 }, /* init -> byte2 */ + { 0, 1.0 }, /* bom-le -> init */ + { 0, 1.0 }, /* bom-be -> init */ + { 0, 1.0 }, /* byte2 -> init */ +}; + +static signed char guess_ucs2be_st[][256] = { + { /* state init */ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1, + }, + { /* state bom-le */ + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, -1, 4, + }, + { /* state bom-be */ + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + }, + { /* state byte2 */ + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + }, +}; + +static guess_arc guess_ucs2be_ar[] = { + { 2, 1.0 }, /* init -> bom-be */ + { 1, 1.0 }, /* init -> bom-le */ + { 3, 1.0 }, /* init -> byte2 */ + { 0, 1.0 }, /* bom-le -> init */ + { 0, 1.0 }, /* bom-le -> init */ + { 0, 1.0 }, /* bom-be -> init */ + { 0, 1.0 }, /* byte2 -> init */ +}; + +static signed char guess_utf16_st[][256] = { + { /* state init */ + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, 
-1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, + }, + { /* state init1 */ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + }, + { /* state bom-be */ + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
-1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, + }, + { /* state bom-le */ + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 4, -1, + }, + { /* state byte2 */ + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + }, +}; + +static guess_arc guess_utf16_ar[] = { + { 2, 1.0 }, /* init -> bom-be */ + { 3, 1.0 }, /* init -> bom-le */ + { 4, 1.0 }, /* init1 -> byte2 */ + { 1, 1.0 }, /* bom-be -> init1 */ + { 1, 1.0 }, /* bom-le -> init1 */ + { 1, 1.0 }, /* byte2 -> init1 */ +}; + static signed char guess_big5_st[][256] = { { /* state init */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -562,3 +833,243 @@ { 0, 1.0 }, /* jamo42 -> init */ }; +static signed char guess_iso8859_6_st[][256] = { + { /* state init */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1, -1, 3, 4, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, -1, -1, -1, 6, + -1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, -1, -1, -1, -1, -1, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + }, +}; + +static guess_arc guess_iso8859_6_ar[] = { + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 
}, /* init -> init */ + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ +}; + +static signed char guess_cp1256_st[][256] = { + { /* state init */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + }, +}; + +static guess_arc guess_cp1256_ar[] = { + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ +}; + +static signed char guess_iso8859_7_st[][256] = { + { /* state init */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, -1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 
3, 3, 3, 3, 3, 3, 3, -1, + }, +}; + +static guess_arc guess_iso8859_7_ar[] = { + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ +}; + +static signed char guess_cp1253_st[][256] = { + { /* state init */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, -1, 2, 2, 2, 2, 2, 2, -1, 3, -1, 4, -1, -1, -1, -1, + -1, 5, 5, 5, 5, 5, 5, 5, -1, 6, -1, 7, -1, -1, -1, -1, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, -1, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, -1, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, -1, + }, +}; + +static guess_arc guess_cp1253_ar[] = { + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ +}; + +static signed char guess_iso8859_8_st[][256] = { + { /* state init */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 1, -1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, -1, -1, 4, 4, -1, + }, +}; + +static guess_arc guess_iso8859_8_ar[] = { + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ +}; + +static signed char guess_cp1255_st[][256] = { + { /* state init */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, -1, 2, 2, 2, 2, 2, 2, 2, 2, -1, 3, -1, -1, -1, -1, + -1, 4, 4, 4, 4, 4, 4, 4, 4, 4, -1, 5, -1, -1, -1, -1, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, -1, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, -1, -1, -1, -1, -1, -1, -1, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, -1, -1, 9, 9, -1, + }, +}; + +static guess_arc guess_cp1255_ar[] = { + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> 
init */ +}; + +static signed char guess_iso8859_9_st[][256] = { + { /* state init */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + }, +}; + +static guess_arc guess_iso8859_9_ar[] = { + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ +}; + +static signed char guess_cp1254_st[][256] = { + { /* state init */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, -1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, -1, -1, -1, + -1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, -1, -1, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + }, +}; + +static guess_arc guess_cp1254_ar[] = { + { 0, 1.0 }, /* init -> init 
*/ + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ + { 0, 1.0 }, /* init -> init */ +}; +
--- a/russian_impl.c Sat Dec 01 03:27:31 2007 +0900 +++ b/russian_impl.c Wed Jun 11 00:11:30 2008 +0900 @@ -32,7 +32,7 @@ static int start_symbol(char ch) { if ((ch=='\t')||ch=='\r'||ch=='\n'||(ch==' ')||(ch=='(')||(ch=='"')||(ch=='\'')) return 1; return 0; -} +} typedef const struct lng_stat2 *lng_stat2_ptr; @@ -48,7 +48,7 @@ altptr=0; do{ d>>=1; - + if(!ws){ if (wi>indexes2) wi-=d; else { @@ -124,7 +124,7 @@ double winstep,koistep,altstep,winestep,koiestep,altestep,winsstep,koisstep,altsstep; double winstat=0,koistat=0,altstat=0,winestat=0,koiestat=0,altestat=0,winsstat=0,koisstat=0,altsstat=0; long j; - + #ifdef _AUTO_DEBUG fprintf(stderr,"Word: %s\n",txt); #endif @@ -185,7 +185,7 @@ fprintf(stderr,", Win %lf, Koi %lf, Alt %lf\n",winstep,koistep,altstep); #endif } - + winstat+=winstep; koistat+=koistep; altstat+=altstep;
--- a/turkish_impl.c Sat Dec 01 03:27:31 2007 +0900 +++ b/turkish_impl.c Wed Jun 11 00:11:30 2008 +0900 @@ -6,9 +6,9 @@ for (i = 0; i < size; i++) { - if (ptr[i] == 0x80 || + if (ptr[i] == 0x80 || (ptr[i] >= 0x82 && ptr[i] <= 0x8C) || - (ptr[i] >= 0x91 && ptr[i] <= 0x9C) || + (ptr[i] >= 0x91 && ptr[i] <= 0x9C) || ptr[ i ] == 0x9F) return "CP1254"; }