Mercurial > libguess
changeset 3:70e2c306231e
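- Implemented DFA utility functions (dfa_alone, dfa_none, dfa_top, dfa_process).
- Added dfa.c.
- Rewrote the guess functions for the Arabic (ar), Greek (gr), Hebrew (hw) and Turkish (tr) scripts using the DFA utilities.
- Rewrote the guess functions for the CJK scripts using the DFA utilities as well.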
author | Yoshiki Yazawa <yaz@cc.rim.or.jp> |
---|---|
date | Thu, 12 Jun 2008 20:20:43 +0900 |
parents | 754a4550c64e |
children | ff05c6c659cb |
files | Makefile arabic_impl.c cjk_impl.c dfa.c dfa.h greek_impl.c guess.scm guess_tab.c hebrew_impl.c libguess.h turkish_impl.c |
diffstat | 11 files changed, 345 insertions(+), 240 deletions(-) [+] |
line wrap: on
line diff
--- a/Makefile Wed Jun 11 00:11:30 2008 +0900 +++ b/Makefile Thu Jun 12 20:20:43 2008 +0900 @@ -1,7 +1,7 @@ PREFIX = /usr/local MAJOR = 0 -MINOR = 2 +MINOR = 3 REVISION = 0 VER = ${MAJOR}.${MINOR}.${REVISION} @@ -9,6 +9,7 @@ #OBJS = guess.o SRCS = guess.c \ + dfa.c \ arabic_impl.c \ cjk_impl.c \ greek_impl.c \ @@ -22,11 +23,11 @@ OBJS = ${SRCS:.c=.o} LIBS = libguess.so libguess.a -CFLAGS += -fPIC +CFLAGS += -fPIC -pg -g SONAME = libguess.so.${MAJOR} -all: $(LIBS) +all: $(LIBS) test libguess.so: ${OBJS} ${CC} -o libguess.so -shared -Wl,-soname,${SONAME} ${OBJS} @@ -41,8 +42,8 @@ guess_tab.c : guess.scm gosh guess.scm guess_tab.c -test: test.c guess.c - gcc -g -o test test.c guess.c +test: test.c libguess.a + gcc -g -o test test.c libguess.a install: install -m644 libguess.h ${PREFIX}/include
--- a/arabic_impl.c Wed Jun 11 00:11:30 2008 +0900 +++ b/arabic_impl.c Thu Jun 12 20:20:43 2008 +0900 @@ -2,57 +2,53 @@ #include "dfa.h" #include "guess_tab.c" -#define ORDER_AR &utf8, &iso8859_6, &cp1256 +/* precedence order */ +#define ORDER &utf8, &iso8859_6, &cp1256 + +/* encodings */ +static guess_dfa cp1256 = DFA_INIT(guess_cp1256_st, guess_cp1256_ar, "CP1256"); +static guess_dfa iso8859_6 = DFA_INIT(guess_iso8859_6_st, guess_iso8859_6_ar, "ISO-8859-6"); +static guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8"); const char *guess_ar(const char *buf, int buflen) { int i; - guess_dfa cp1256 = DFA_INIT(guess_cp1256_st, guess_cp1256_ar); - guess_dfa iso8859_6 = DFA_INIT(guess_iso8859_6_st, guess_iso8859_6_ar); - guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar); + const char *rv = NULL; guess_dfa *top = NULL; - - guess_dfa *order[] = { ORDER_AR, NULL }; + guess_dfa *order[] = { ORDER, NULL }; for (i = 0; i < buflen; i++) { int c = (unsigned char) buf[i]; - if (DFA_ALIVE(cp1256)) { - if (!DFA_ALIVE(iso8859_6) && !DFA_ALIVE(utf8)) - return "CP1256"; - DFA_NEXT(cp1256, c); + /* special treatment of BOM */ + if (i == 0 && c == 0xff) { + if (i < buflen - 1) { + c = (unsigned char) buf[i + 1]; + if (c == 0xfe) + return UCS_2LE; + } } - if (DFA_ALIVE(iso8859_6)) { - if (!DFA_ALIVE(cp1256) && !DFA_ALIVE(utf8)) - return "ISO-8859-6"; - DFA_NEXT(iso8859_6, c); - } - if (DFA_ALIVE(utf8)) { - if (!DFA_ALIVE(cp1256) && !DFA_ALIVE(iso8859_6)) - return "UTF-8"; - DFA_NEXT(utf8, c); + if (i == 0 && c == 0xfe) { + if (i < buflen - 1) { + c = (unsigned char) buf[i + 1]; + if (c == 0xff) + return UCS_2BE; + } } - if (!DFA_ALIVE(cp1256) && !DFA_ALIVE(iso8859_6) && !DFA_ALIVE(utf8)) { + rv = dfa_process(order, c); + if(rv) + return rv; + + if (dfa_none(order)) { /* we ran out the possibilities */ return NULL; } } - /* Now, we have ambigous code. Pick the highest score. If more than - one candidate tie, pick the default encoding. 
*/ - for (i = 0; order[i] != NULL; i++) { - if (order[i]->state >= 0) { //DFA_ALIVE() - if (top == NULL || order[i]->score > top->score) - top = order[i]; - } - } - - if (top == &cp1256) - return "CP1256"; - if (top == &utf8) - return "UTF-8"; - if (top == &iso8859_6) - return "ISO-8859-6"; - return NULL; + top = dfa_top(order); + if (top) + return top->name; + else + return NULL; }
--- a/cjk_impl.c Wed Jun 11 00:11:30 2008 +0900 +++ b/cjk_impl.c Thu Jun 12 20:20:43 2008 +0900 @@ -40,18 +40,7 @@ #include "libguess.h" #include "dfa.h" -/* workaround for that glib's g_convert can't convert - properly from UCS-2BE/LE trailing after BOM. */ -#define WITH_G_CONVERT 1 -/* #undef WITH_G_CONVERT */ - -#ifdef WITH_G_CONVERT -const char UCS_2BE[] = "UTF-16"; -const char UCS_2LE[] = "UTF-16"; -#else -const char UCS_2BE[] = "UCS-2BE"; -const char UCS_2LE[] = "UCS-2LE"; -#endif +#include <stdio.h> /* take precedence if scores are same. you can customize the order as: */ /* ORDER_** &highest, &second, ... &lowest */ @@ -63,11 +52,10 @@ /* include DFA table generated by guess.scm */ #include "guess_tab.c" - int dfa_validate_utf8(const char *buf, int buflen) { int i; - guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar); + guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8"); for (i = 0; i < buflen; i++) { int c = (unsigned char) buf[i]; @@ -89,9 +77,10 @@ const char *guess_jp(const char *buf, int buflen) { int i; - guess_dfa eucj = DFA_INIT(guess_eucj_st, guess_eucj_ar); - guess_dfa sjis = DFA_INIT(guess_sjis_st, guess_sjis_ar); - guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar); + const char *rv = NULL; + guess_dfa eucj = DFA_INIT(guess_eucj_st, guess_eucj_ar, "EUC-JP"); + guess_dfa sjis = DFA_INIT(guess_sjis_st, guess_sjis_ar, "SJIS"); + guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8"); guess_dfa *top = NULL; guess_dfa *order[] = { ORDER_JP, NULL }; @@ -124,51 +113,29 @@ } } - if (DFA_ALIVE(eucj)) { - if (!DFA_ALIVE(sjis) && !DFA_ALIVE(utf8)) - return "EUC-JP"; - DFA_NEXT(eucj, c); - } - if (DFA_ALIVE(sjis)) { - if (!DFA_ALIVE(eucj) && !DFA_ALIVE(utf8)) - return "SJIS"; - DFA_NEXT(sjis, c); - } - if (DFA_ALIVE(utf8)) { - if (!DFA_ALIVE(sjis) && !DFA_ALIVE(eucj)) - return "UTF-8"; - DFA_NEXT(utf8, c); - } + rv = dfa_process(order, c); + if(rv) + return rv; - if (!DFA_ALIVE(eucj) && !DFA_ALIVE(sjis) && !DFA_ALIVE(utf8)) 
{ + if (dfa_none(order)) { /* we ran out the possibilities */ return NULL; } } - /* Now, we have ambigous code. Pick the highest score. If more than - one candidate tie, pick the default encoding. */ - for (i = 0; order[i] != NULL; i++) { - if (order[i]->state >= 0) { //DFA_ALIVE() - if (top == NULL || order[i]->score > top->score) - top = order[i]; - } - } - - if (top == &eucj) - return "EUC-JP"; - if (top == &utf8) - return "UTF-8"; - if (top == &sjis) - return "SJIS"; - return NULL; + top = dfa_top(order); + if(top) + return top->name; + else + return NULL; } const char *guess_tw(const char *buf, int buflen) { int i; - guess_dfa big5 = DFA_INIT(guess_big5_st, guess_big5_ar); - guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar); + const char *rv = NULL; + guess_dfa big5 = DFA_INIT(guess_big5_st, guess_big5_ar, "BIG5"); + guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8"); guess_dfa *top = NULL; guess_dfa *order[] = { ORDER_TW, NULL }; @@ -201,45 +168,30 @@ } } - if (DFA_ALIVE(big5)) { - if (!DFA_ALIVE(utf8)) - return "BIG5"; - DFA_NEXT(big5, c); - } - if (DFA_ALIVE(utf8)) { - if (!DFA_ALIVE(big5)) - return "UTF-8"; - DFA_NEXT(utf8, c); - } + rv = dfa_process(order, c); + if(rv) + return rv; - if (!DFA_ALIVE(big5) && !DFA_ALIVE(utf8)) { + if (dfa_none(order)) { /* we ran out the possibilities */ return NULL; } } - /* Now, we have ambigous code. Pick the highest score. If more than - one candidate tie, pick the default encoding. 
*/ - for (i = 0; order[i] != NULL; i++) { - if (order[i]->state >= 0) { //DFA_ALIVE() - if (top == NULL || order[i]->score > top->score) - top = order[i]; - } - } - - if (top == &big5) - return "BIG5"; - if (top == &utf8) - return "UTF-8"; - return NULL; + top = dfa_top(order); + if (top) + return top->name; + else + return NULL; } const char *guess_cn(const char *buf, int buflen) { int i; - guess_dfa gb2312 = DFA_INIT(guess_gb2312_st, guess_gb2312_ar); - guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar); - guess_dfa gb18030 = DFA_INIT(guess_gb18030_st, guess_gb18030_ar); + const char *rv = NULL; + guess_dfa gb2312 = DFA_INIT(guess_gb2312_st, guess_gb2312_ar, "GB2312"); + guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8"); + guess_dfa gb18030 = DFA_INIT(guess_gb18030_st, guess_gb18030_ar, "GB18030"); guess_dfa *top = NULL; guess_dfa *order[] = { ORDER_CN, NULL }; @@ -274,52 +226,30 @@ } } - if (DFA_ALIVE(gb2312)) { - if (!DFA_ALIVE(utf8) && !DFA_ALIVE(gb18030)) - return "GB2312"; - DFA_NEXT(gb2312, c); - } - if (DFA_ALIVE(utf8)) { - if (!DFA_ALIVE(gb2312) && !DFA_ALIVE(gb18030)) - return "UTF-8"; - DFA_NEXT(utf8, c); - } - if (DFA_ALIVE(gb18030)) { - if (!DFA_ALIVE(utf8) && !DFA_ALIVE(gb2312)) - return "GB18030"; - DFA_NEXT(gb18030, c); - } + rv = dfa_process(order, c); + if(rv) + return rv; - if (!DFA_ALIVE(gb2312) && !DFA_ALIVE(utf8) && !DFA_ALIVE(gb18030)) { + if (dfa_none(order)) { /* we ran out the possibilities */ return NULL; } } - /* Now, we have ambigous code. Pick the highest score. If more than - one candidate tie, pick the default encoding. 
*/ - for (i = 0; order[i] != NULL; i++) { - if (order[i]->state >= 0) { //DFA_ALIVE() - if (top == NULL || order[i]->score > top->score) - top = order[i]; - } - } - - if (top == &gb2312) - return "GB2312"; - if (top == &utf8) - return "UTF-8"; - if (top == &gb18030) - return "GB18030"; - return NULL; + top = dfa_top(order); + if(top) + return top->name; + else + return NULL; } const char *guess_kr(const char *buf, int buflen) { int i; - guess_dfa euck = DFA_INIT(guess_euck_st, guess_euck_ar); - guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar); - guess_dfa johab = DFA_INIT(guess_johab_st, guess_johab_ar); + const char *rv = NULL; + guess_dfa euck = DFA_INIT(guess_euck_st, guess_euck_ar, "EUC-KR"); + guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8"); + guess_dfa johab = DFA_INIT(guess_johab_st, guess_johab_ar, "JOHAB"); guess_dfa *top = NULL; guess_dfa *order[] = { ORDER_KR, NULL }; @@ -354,42 +284,19 @@ } } - if (DFA_ALIVE(euck)) { - if (!DFA_ALIVE(johab) && !DFA_ALIVE(utf8)) - return "EUC-KR"; - DFA_NEXT(euck, c); - } - if (DFA_ALIVE(johab)) { - if (!DFA_ALIVE(euck) && !DFA_ALIVE(utf8)) - return "JOHAB"; - DFA_NEXT(johab, c); - } - if (DFA_ALIVE(utf8)) { - if (!DFA_ALIVE(euck) && !DFA_ALIVE(johab)) - return "UTF-8"; - DFA_NEXT(utf8, c); - } + rv = dfa_process(order, c); + if(rv) + return rv; - if (!DFA_ALIVE(euck) && !DFA_ALIVE(johab) && !DFA_ALIVE(utf8)) { + if (dfa_none(order)) { /* we ran out the possibilities */ return NULL; } } - /* Now, we have ambigous code. Pick the highest score. If more than - one candidate tie, pick the default encoding. */ - for (i = 0; order[i] != NULL; i++) { - if (order[i]->state >= 0) { //DFA_ALIVE() - if (top == NULL || order[i]->score > top->score) - top = order[i]; - } - } - - if (top == &euck) - return "EUC-KR"; - if (top == &utf8) - return "UTF-8"; - if (top == &johab) - return "JOHAB"; - return NULL; + top = dfa_top(order); + if(top) + return top->name; + else + return NULL; }
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dfa.c Thu Jun 12 20:20:43 2008 +0900 @@ -0,0 +1,61 @@ +#include "libguess.h" +#include "dfa.h" + +boolean +dfa_alone(guess_dfa *dfa, guess_dfa *order[]) +{ + int i; + + if (dfa->state < 0) + return FALSE; + + for (i = 0; order[i] != NULL; i++) { + if (order[i] != dfa && order[i]->state >= 0) { //DFA_ALIVE() + return FALSE; + } + } + + return TRUE; +} + +boolean +dfa_none(guess_dfa *order[]) +{ + int i; + + for (i = 0; order[i] != NULL; i++) { + if (order[i]->state >= 0) { //DFA_ALIVE() + return FALSE; + } + } + + return TRUE; +} + +guess_dfa * +dfa_top(guess_dfa *order[]) +{ + int i; + guess_dfa *top = NULL; + for (i = 0; order[i] != NULL; i++) { + if (order[i]->state >= 0) { //DFA_ALIVE() + if (top == NULL || order[i]->score > top->score) + top = order[i]; + } + } + return top; +} + +const char * +dfa_process(guess_dfa *order[], int c) +{ + int i; + for (i = 0; order[i] != NULL; i++) { + if (DFA_ALIVE_P(order[i])) { + if (dfa_alone(order[i], order)) + return order[i]->name; + DFA_NEXT_P(order[i], c); + } + + } +}
--- a/dfa.h Wed Jun 11 00:11:30 2008 +0900 +++ b/dfa.h Thu Jun 12 20:20:43 2008 +0900 @@ -1,6 +1,23 @@ #ifndef __DFA_H__ #define __DFA_H__ +typedef int boolean; +#define TRUE 1 +#define FALSE 0 + +/* workaround for that glib's g_convert can't convert + properly from UCS-2BE/LE trailing after BOM. */ +#define WITH_G_CONVERT 1 +/* #undef WITH_G_CONVERT */ + +#ifdef WITH_G_CONVERT +#define UCS_2BE "UTF-16" +#define UCS_2LE "UTF-16" +#else +#define UCS_2BE "UCS_2BE" +#define UCS_2LE "UCS_2LE" +#endif + /* data types */ typedef struct guess_arc_rec { @@ -14,11 +31,12 @@ guess_arc *arcs; int state; double score; + char *name; } guess_dfa; /* macros */ -#define DFA_INIT(st, ar) \ - { st, ar, 0, 1.0 } +#define DFA_INIT(st, ar, name) \ + { st, ar, 0, 1.0 ,name} #define DFA_NEXT(dfa, ch) \ do { \ @@ -36,4 +54,26 @@ #define DFA_ALIVE(dfa) (dfa.state >= 0) +#define DFA_NEXT_P(dfa, ch) \ + do { \ + int arc__; \ + if (dfa->state >= 0) { \ + arc__ = dfa->states[dfa->state][ch]; \ + if (arc__ < 0) { \ + dfa->state = -1; \ + } else { \ + dfa->state = dfa->arcs[arc__].next; \ + dfa->score *= dfa->arcs[arc__].score; \ + } \ + } \ + } while (0) + +#define DFA_ALIVE_P(dfa) (dfa->state >= 0) + +/* prototypes */ +boolean dfa_alone(guess_dfa *dfa, guess_dfa *order[]); +boolean dfa_none(guess_dfa *order[]); +guess_dfa *dfa_top(guess_dfa *order[]); +const char *dfa_process(guess_dfa *order[], int c); + #endif
--- a/greek_impl.c Wed Jun 11 00:11:30 2008 +0900 +++ b/greek_impl.c Thu Jun 12 20:20:43 2008 +0900 @@ -1,22 +1,55 @@ -static const char *_guess_gr(const unsigned char *ptr, int size) +#include "libguess.h" +#include "dfa.h" +#include "guess_tab.c" + +/* precedence order */ +#define ORDER &utf8, &iso8859_7, &cp1253 + +/* encodings */ +static guess_dfa cp1253 = DFA_INIT(guess_cp1253_st, guess_cp1253_ar, "CP1253"); +static guess_dfa iso8859_7 = DFA_INIT(guess_iso8859_7_st, guess_iso8859_7_ar, "ISO-8859-7"); +static guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8"); + +/* common */ +const char *guess_gr(const char *buf, int buflen) { int i; + const char *rv = NULL; + guess_dfa *top = NULL; + guess_dfa *order[] = { ORDER, NULL }; - for (i = 0; i < size; i++) - { - if (ptr[i] == 0x80 || - (ptr[i] >= 0x82 && ptr[i] <= 0x87) || - ptr[i] == 0x89 || ptr[i] == 0x8B || - (ptr[i] >= 0x91 && ptr[i] <= 0x97) || - ptr[i] == 0x99 || ptr[i] == 0x9B || ptr[i] == 0xA4 || - ptr[i] == 0xA5 || ptr[i] == 0xAE) - return "CP1253"; + for (i = 0; i < buflen; i++) { + int c = (unsigned char) buf[i]; + + /* special treatment of BOM */ + if (i == 0 && c == 0xff) { + if (i < buflen - 1) { + c = (unsigned char) buf[i + 1]; + if (c == 0xfe) + return UCS_2LE; + } + } + if (i == 0 && c == 0xfe) { + if (i < buflen - 1) { + c = (unsigned char) buf[i + 1]; + if (c == 0xff) + return UCS_2BE; + } + } + + rv = dfa_process(order, c); + if(rv) + return rv; + + if (dfa_none(order)) { + /* we ran out the possibilities */ + return NULL; + } } - return "ISO-8859-7"; + top = dfa_top(order); + if (top) + return top->name; + else + return NULL; } - -const char *guess_gr(const char *ptr, int size) -{ - return _guess_gr((const unsigned char *) ptr, size); -}
--- a/guess.scm Wed Jun 11 00:11:30 2008 +0900 +++ b/guess.scm Thu Jun 12 20:20:43 2008 +0900 @@ -156,6 +156,8 @@ (lambda () (print "/* State transition table for character code guessing */") (print "/* This file is automatically generated by guess.scm */") + (print "") + (print "#include \"dfa.h\"") (newline) (for-each emit-dfa-table (all-dfas)))) 0)
--- a/guess_tab.c Wed Jun 11 00:11:30 2008 +0900 +++ b/guess_tab.c Thu Jun 12 20:20:43 2008 +0900 @@ -1,6 +1,8 @@ /* State transition table for character code guessing */ /* This file is automatically generated by guess.scm */ +#include "dfa.h" + static signed char guess_eucj_st[][256] = { { /* state init */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
--- a/hebrew_impl.c Wed Jun 11 00:11:30 2008 +0900 +++ b/hebrew_impl.c Thu Jun 12 20:20:43 2008 +0900 @@ -1,23 +1,56 @@ -const char *_guess_hw(const unsigned char *ptr, int size) +#include "libguess.h" +#include "dfa.h" +#include "guess_tab.c" + +/* precedence order */ +#define ORDER &utf8, &iso8859_8, &cp1255 + +/* encodings */ +static guess_dfa cp1255 = DFA_INIT(guess_cp1255_st, guess_cp1255_ar, "CP1255"); +static guess_dfa iso8859_8 = DFA_INIT(guess_iso8859_8_st, guess_iso8859_8_ar, "ISO-8859-8-I"); +static guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8"); + + +/* common */ +const char *guess_hw(const char *buf, int buflen) { int i; + const char *rv = NULL; + guess_dfa *top = NULL; + guess_dfa *order[] = { ORDER, NULL }; - for (i = 0; i < size; i++) - { - if (ptr[i] == 0x80 || (ptr[i] >= 0x82 && ptr[i] <= 0x89) || ptr[i] == 0x8B || - (ptr[i] >= 0x91 && ptr[i] <= 0x99) || ptr[i] == 0x9B || ptr[i] == 0xA1 || - (ptr[i] >= 0xBF && ptr[i] <= 0xC9) || - (ptr[i] >= 0xCB && ptr[i] <= 0xD8)) - return "CP1255"; + for (i = 0; i < buflen; i++) { + int c = (unsigned char) buf[i]; - if (ptr[i] == 0xDF) - return "ISO-8859-8-I"; + /* special treatment of BOM */ + if (i == 0 && c == 0xff) { + if (i < buflen - 1) { + c = (unsigned char) buf[i + 1]; + if (c == 0xfe) + return UCS_2LE; + } + } + if (i == 0 && c == 0xfe) { + if (i < buflen - 1) { + c = (unsigned char) buf[i + 1]; + if (c == 0xff) + return UCS_2BE; + } + } + + rv = dfa_process(order, c); + if(rv) + return rv; + + if (dfa_none(order)) { + /* we ran out the possibilities */ + return NULL; + } } - return "ISO-8859-8-I"; + top = dfa_top(order); + if (top) + return top->name; + else + return NULL; } - -const char *guess_hw(const char *ptr, int size) -{ - return _guess_hw((const unsigned char *) ptr, size); -}
--- a/libguess.h Wed Jun 11 00:11:30 2008 +0900 +++ b/libguess.h Thu Jun 12 20:20:43 2008 +0900 @@ -5,11 +5,11 @@ /* * Copyright (c) 2000-2003 Shiro Kawai, All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: - * + * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. *
--- a/turkish_impl.c Wed Jun 11 00:11:30 2008 +0900 +++ b/turkish_impl.c Thu Jun 12 20:20:43 2008 +0900 @@ -1,25 +1,55 @@ #include "libguess.h" +#include "dfa.h" +#include "guess_tab.c" + +/* precedence order */ +#define ORDER &utf8, &iso8859_9, &cp1254 -static const char *_guess_tr(const unsigned char *ptr, int size) +/* encodings */ +static guess_dfa cp1254 = DFA_INIT(guess_cp1253_st, guess_cp1253_ar, "CP1254"); +static guess_dfa iso8859_9 = DFA_INIT(guess_iso8859_9_st, guess_iso8859_9_ar, "ISO-8859-9"); +static guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8"); + +/* common */ +const char *guess_tr(const char *buf, int buflen) { int i; + const char *rv = NULL; + guess_dfa *top = NULL; + guess_dfa *order[] = { ORDER, NULL }; - for (i = 0; i < size; i++) - { - if (ptr[i] == 0x80 || - (ptr[i] >= 0x82 && ptr[i] <= 0x8C) || - (ptr[i] >= 0x91 && ptr[i] <= 0x9C) || - ptr[ i ] == 0x9F) - return "CP1254"; + for (i = 0; i < buflen; i++) { + int c = (unsigned char) buf[i]; + + /* special treatment of BOM */ + if (i == 0 && c == 0xff) { + if (i < buflen - 1) { + c = (unsigned char) buf[i + 1]; + if (c == 0xfe) + return UCS_2LE; + } + } + if (i == 0 && c == 0xfe) { + if (i < buflen - 1) { + c = (unsigned char) buf[i + 1]; + if (c == 0xff) + return UCS_2BE; + } + } + + rv = dfa_process(order, c); + if(rv) + return rv; + + if (dfa_none(order)) { + /* we ran out the possibilities */ + return NULL; + } } - return "ISO-8859-9"; + top = dfa_top(order); + if (top) + return top->name; + else + return NULL; } - -const char *guess_tr(const char *ptr, int size) -{ - if (dfa_validate_utf8(ptr, size)) - return "UTF-8"; - - return _guess_tr((const unsigned char *)ptr, size); -}