Mercurial > libguess
diff cjk_impl.c @ 3:70e2c306231e
- Implemented DFA utility functions.
- Added dfa.c.
- Rewrote the guess functions for the ar, gr, hw and tr scripts to use the DFA utilities.
- Rewrote the guess functions for the CJK scripts to use the DFA utilities as well.
author | Yoshiki Yazawa <yaz@cc.rim.or.jp> |
---|---|
date | Thu, 12 Jun 2008 20:20:43 +0900 |
parents | 754a4550c64e |
children |
line wrap: on
line diff
--- a/cjk_impl.c Wed Jun 11 00:11:30 2008 +0900 +++ b/cjk_impl.c Thu Jun 12 20:20:43 2008 +0900 @@ -40,18 +40,7 @@ #include "libguess.h" #include "dfa.h" -/* workaround for that glib's g_convert can't convert - properly from UCS-2BE/LE trailing after BOM. */ -#define WITH_G_CONVERT 1 -/* #undef WITH_G_CONVERT */ - -#ifdef WITH_G_CONVERT -const char UCS_2BE[] = "UTF-16"; -const char UCS_2LE[] = "UTF-16"; -#else -const char UCS_2BE[] = "UCS-2BE"; -const char UCS_2LE[] = "UCS-2LE"; -#endif +#include <stdio.h> /* take precedence if scores are same. you can customize the order as: */ /* ORDER_** &highest, &second, ... &lowest */ @@ -63,11 +52,10 @@ /* include DFA table generated by guess.scm */ #include "guess_tab.c" - int dfa_validate_utf8(const char *buf, int buflen) { int i; - guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar); + guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8"); for (i = 0; i < buflen; i++) { int c = (unsigned char) buf[i]; @@ -89,9 +77,10 @@ const char *guess_jp(const char *buf, int buflen) { int i; - guess_dfa eucj = DFA_INIT(guess_eucj_st, guess_eucj_ar); - guess_dfa sjis = DFA_INIT(guess_sjis_st, guess_sjis_ar); - guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar); + const char *rv = NULL; + guess_dfa eucj = DFA_INIT(guess_eucj_st, guess_eucj_ar, "EUC-JP"); + guess_dfa sjis = DFA_INIT(guess_sjis_st, guess_sjis_ar, "SJIS"); + guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8"); guess_dfa *top = NULL; guess_dfa *order[] = { ORDER_JP, NULL }; @@ -124,51 +113,29 @@ } } - if (DFA_ALIVE(eucj)) { - if (!DFA_ALIVE(sjis) && !DFA_ALIVE(utf8)) - return "EUC-JP"; - DFA_NEXT(eucj, c); - } - if (DFA_ALIVE(sjis)) { - if (!DFA_ALIVE(eucj) && !DFA_ALIVE(utf8)) - return "SJIS"; - DFA_NEXT(sjis, c); - } - if (DFA_ALIVE(utf8)) { - if (!DFA_ALIVE(sjis) && !DFA_ALIVE(eucj)) - return "UTF-8"; - DFA_NEXT(utf8, c); - } + rv = dfa_process(order, c); + if(rv) + return rv; - if (!DFA_ALIVE(eucj) && !DFA_ALIVE(sjis) && !DFA_ALIVE(utf8)) 
{ + if (dfa_none(order)) { /* we ran out the possibilities */ return NULL; } } - /* Now, we have ambigous code. Pick the highest score. If more than - one candidate tie, pick the default encoding. */ - for (i = 0; order[i] != NULL; i++) { - if (order[i]->state >= 0) { //DFA_ALIVE() - if (top == NULL || order[i]->score > top->score) - top = order[i]; - } - } - - if (top == &eucj) - return "EUC-JP"; - if (top == &utf8) - return "UTF-8"; - if (top == &sjis) - return "SJIS"; - return NULL; + top = dfa_top(order); + if(top) + return top->name; + else + return NULL; } const char *guess_tw(const char *buf, int buflen) { int i; - guess_dfa big5 = DFA_INIT(guess_big5_st, guess_big5_ar); - guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar); + const char *rv = NULL; + guess_dfa big5 = DFA_INIT(guess_big5_st, guess_big5_ar, "BIG5"); + guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8"); guess_dfa *top = NULL; guess_dfa *order[] = { ORDER_TW, NULL }; @@ -201,45 +168,30 @@ } } - if (DFA_ALIVE(big5)) { - if (!DFA_ALIVE(utf8)) - return "BIG5"; - DFA_NEXT(big5, c); - } - if (DFA_ALIVE(utf8)) { - if (!DFA_ALIVE(big5)) - return "UTF-8"; - DFA_NEXT(utf8, c); - } + rv = dfa_process(order, c); + if(rv) + return rv; - if (!DFA_ALIVE(big5) && !DFA_ALIVE(utf8)) { + if (dfa_none(order)) { /* we ran out the possibilities */ return NULL; } } - /* Now, we have ambigous code. Pick the highest score. If more than - one candidate tie, pick the default encoding. 
*/ - for (i = 0; order[i] != NULL; i++) { - if (order[i]->state >= 0) { //DFA_ALIVE() - if (top == NULL || order[i]->score > top->score) - top = order[i]; - } - } - - if (top == &big5) - return "BIG5"; - if (top == &utf8) - return "UTF-8"; - return NULL; + top = dfa_top(order); + if (top) + return top->name; + else + return NULL; } const char *guess_cn(const char *buf, int buflen) { int i; - guess_dfa gb2312 = DFA_INIT(guess_gb2312_st, guess_gb2312_ar); - guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar); - guess_dfa gb18030 = DFA_INIT(guess_gb18030_st, guess_gb18030_ar); + const char *rv = NULL; + guess_dfa gb2312 = DFA_INIT(guess_gb2312_st, guess_gb2312_ar, "GB2312"); + guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8"); + guess_dfa gb18030 = DFA_INIT(guess_gb18030_st, guess_gb18030_ar, "GB18030"); guess_dfa *top = NULL; guess_dfa *order[] = { ORDER_CN, NULL }; @@ -274,52 +226,30 @@ } } - if (DFA_ALIVE(gb2312)) { - if (!DFA_ALIVE(utf8) && !DFA_ALIVE(gb18030)) - return "GB2312"; - DFA_NEXT(gb2312, c); - } - if (DFA_ALIVE(utf8)) { - if (!DFA_ALIVE(gb2312) && !DFA_ALIVE(gb18030)) - return "UTF-8"; - DFA_NEXT(utf8, c); - } - if (DFA_ALIVE(gb18030)) { - if (!DFA_ALIVE(utf8) && !DFA_ALIVE(gb2312)) - return "GB18030"; - DFA_NEXT(gb18030, c); - } + rv = dfa_process(order, c); + if(rv) + return rv; - if (!DFA_ALIVE(gb2312) && !DFA_ALIVE(utf8) && !DFA_ALIVE(gb18030)) { + if (dfa_none(order)) { /* we ran out the possibilities */ return NULL; } } - /* Now, we have ambigous code. Pick the highest score. If more than - one candidate tie, pick the default encoding. 
*/ - for (i = 0; order[i] != NULL; i++) { - if (order[i]->state >= 0) { //DFA_ALIVE() - if (top == NULL || order[i]->score > top->score) - top = order[i]; - } - } - - if (top == &gb2312) - return "GB2312"; - if (top == &utf8) - return "UTF-8"; - if (top == &gb18030) - return "GB18030"; - return NULL; + top = dfa_top(order); + if(top) + return top->name; + else + return NULL; } const char *guess_kr(const char *buf, int buflen) { int i; - guess_dfa euck = DFA_INIT(guess_euck_st, guess_euck_ar); - guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar); - guess_dfa johab = DFA_INIT(guess_johab_st, guess_johab_ar); + const char *rv = NULL; + guess_dfa euck = DFA_INIT(guess_euck_st, guess_euck_ar, "EUC-KR"); + guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8"); + guess_dfa johab = DFA_INIT(guess_johab_st, guess_johab_ar, "JOHAB"); guess_dfa *top = NULL; guess_dfa *order[] = { ORDER_KR, NULL }; @@ -354,42 +284,19 @@ } } - if (DFA_ALIVE(euck)) { - if (!DFA_ALIVE(johab) && !DFA_ALIVE(utf8)) - return "EUC-KR"; - DFA_NEXT(euck, c); - } - if (DFA_ALIVE(johab)) { - if (!DFA_ALIVE(euck) && !DFA_ALIVE(utf8)) - return "JOHAB"; - DFA_NEXT(johab, c); - } - if (DFA_ALIVE(utf8)) { - if (!DFA_ALIVE(euck) && !DFA_ALIVE(johab)) - return "UTF-8"; - DFA_NEXT(utf8, c); - } + rv = dfa_process(order, c); + if(rv) + return rv; - if (!DFA_ALIVE(euck) && !DFA_ALIVE(johab) && !DFA_ALIVE(utf8)) { + if (dfa_none(order)) { /* we ran out the possibilities */ return NULL; } } - /* Now, we have ambigous code. Pick the highest score. If more than - one candidate tie, pick the default encoding. */ - for (i = 0; order[i] != NULL; i++) { - if (order[i]->state >= 0) { //DFA_ALIVE() - if (top == NULL || order[i]->score > top->score) - top = order[i]; - } - } - - if (top == &euck) - return "EUC-KR"; - if (top == &utf8) - return "UTF-8"; - if (top == &johab) - return "JOHAB"; - return NULL; + top = dfa_top(order); + if(top) + return top->name; + else + return NULL; }