Mercurial > audlegacy
diff src/audacious/strings.c @ 2559:b474ecb5bde4 trunk
[svn] revise str_to_utf8():
- a new UTF-8 validator using the libguess DFA has been implemented; str_to_utf8() now tries UTF-8 validation first.
- default conversion from ISO-8859-1 is enabled regardless of chardet.
- libguess and librcd are always compiled in.
- some libguess cleanups.
author | yaz |
---|---|
date | Wed, 21 Feb 2007 04:25:12 -0800 |
parents | 063374a51105 |
children | e35538325145 |
line wrap: on
line diff
--- a/src/audacious/strings.c Wed Feb 21 03:52:52 2007 -0800 +++ b/src/audacious/strings.c Wed Feb 21 04:25:12 2007 -0800 @@ -34,13 +34,11 @@ #include "main.h" -#ifdef USE_CHARDET - #include "../libguess/libguess.h" - #include "../librcd/librcd.h" +#include "../libguess/libguess.h" +#include "../librcd/librcd.h" #ifdef HAVE_UDET #include <libudet_c.h> #endif -#endif /* * escape_shell_chars() @@ -203,18 +201,32 @@ * if the string is already converted into utf-8. * chardet_to_utf8() would convert a valid utf-8 string into a * different utf-8 string, if fallback encodings were supplied and - * the given string could be treated as a string in one of fallback - * encodings. To avoid this, the order of evaluation has been - * changed. (It might cause a drawback?) + * the given string could be treated as a string in one of + * fallback encodings. To avoid this, g_utf8_validate() had been + * used at the top of evaluation. + */ + + /* Note 2: g_utf8_validate() has so called encapsulated utf-8 + * problem, thus chardet_to_utf8() took the place of that. */ + + /* Note 3: As introducing madplug, the problem of conversion from + * ISO-8859-1 to UTF-8 arose. This may be coped with g_convert() + * located near the end of chardet_to_utf8(), but it requires utf8 + * validation guard where g_utf8_validate() was. New + * dfa_validate_utf8() employs libguess' DFA engine to validate + * utf-8 and can properly distinguish examples of encapsulated + * utf-8. It is considered to be safe to use as a guard. + */ + + /* already UTF-8? */ + if (dfa_validate_utf8(str, strlen(str))) + return g_strdup(str); + /* chardet encoding detector */ if ((out_str = chardet_to_utf8(str, strlen(str), NULL, NULL, NULL))) return out_str; - /* already UTF-8? 
*/ - if (g_utf8_validate(str, -1, NULL)) - return g_strdup(str); - /* assume encoding associated with locale */ if ((out_str = g_locale_to_utf8(str, -1, NULL, NULL, NULL))) return out_str; @@ -335,15 +347,9 @@ } } -#ifdef USE_CHARDET - /* many tag libraries return 2byte latin1 utf8 character as - converted 8bit iso-8859-1 character, if they are asked to return - latin1 string. - */ if(!ret){ ret = g_convert(str, len, "UTF-8", "ISO-8859-1", bytes_read, bytes_write, error); } -#endif if(ret){ if(g_utf8_validate(ret, -1, NULL))