Mercurial > audlegacy
comparison src/audacious/strings.c @ 2559:b474ecb5bde4 trunk
[svn] revise str_to_utf8():
- a new UTF-8 validator using libguess' DFA engine has been implemented; str_to_utf8() now tries UTF-8 validation first.
- default conversion from ISO-8859-1 is enabled regardless of chardet.
- libguess and librcd are always compiled in.
- some libguess cleanups.
author | yaz |
---|---|
date | Wed, 21 Feb 2007 04:25:12 -0800 |
parents | 063374a51105 |
children | e35538325145 |
comparison
equal
deleted
inserted
replaced
2558:d4ecf0a91222 | 2559:b474ecb5bde4 |
---|---|
32 #include <string.h> | 32 #include <string.h> |
33 #include <ctype.h> | 33 #include <ctype.h> |
34 | 34 |
35 #include "main.h" | 35 #include "main.h" |
36 | 36 |
37 #ifdef USE_CHARDET | 37 #include "../libguess/libguess.h" |
38 #include "../libguess/libguess.h" | 38 #include "../librcd/librcd.h" |
39 #include "../librcd/librcd.h" | |
40 #ifdef HAVE_UDET | 39 #ifdef HAVE_UDET |
41 #include <libudet_c.h> | 40 #include <libudet_c.h> |
42 #endif | |
43 #endif | 41 #endif |
44 | 42 |
45 /* | 43 /* |
46 * escape_shell_chars() | 44 * escape_shell_chars() |
47 * | 45 * |
201 | 199 |
202 /* Note: Currently, playlist calls this function repeatedly, even | 200 /* Note: Currently, playlist calls this function repeatedly, even |
203 * if the string is already converted into utf-8. | 201 * if the string is already converted into utf-8. |
204 * chardet_to_utf8() would convert a valid utf-8 string into a | 202 * chardet_to_utf8() would convert a valid utf-8 string into a |
205 * different utf-8 string, if fallback encodings were supplied and | 203 * different utf-8 string, if fallback encodings were supplied and |
206 * the given string could be treated as a string in one of fallback | 204 * the given string could be treated as a string in one of |
207 * encodings. To avoid this, the order of evaluation has been | 205 * fallback encodings. To avoid this, g_utf8_validate() had been |
208 * changed. (It might cause a drawback?) | 206 * used at the top of evaluation. |
209 */ | 207 */ |
208 | |
209 /* Note 2: g_utf8_validate() has so called encapsulated utf-8 | |
210 * problem, thus chardet_to_utf8() took the place of that. | |
211 */ | |
212 | |
213 /* Note 3: As introducing madplug, the problem of conversion from | |
214 * ISO-8859-1 to UTF-8 arose. This may be coped with g_convert() | |
215 * located near the end of chardet_to_utf8(), but it requires utf8 | |
216 * validation guard where g_utf8_validate() was. New | |
217 * dfa_validate_utf8() employs libguess' DFA engine to validate | |
218 * utf-8 and can properly distinguish examples of encapsulated | |
219 * utf-8. It is considered to be safe to use as a guard. | |
220 */ | |
221 | |
222 /* already UTF-8? */ | |
223 if (dfa_validate_utf8(str, strlen(str))) | |
224 return g_strdup(str); | |
225 | |
210 /* chardet encoding detector */ | 226 /* chardet encoding detector */ |
211 if ((out_str = chardet_to_utf8(str, strlen(str), NULL, NULL, NULL))) | 227 if ((out_str = chardet_to_utf8(str, strlen(str), NULL, NULL, NULL))) |
212 return out_str; | 228 return out_str; |
213 | |
214 /* already UTF-8? */ | |
215 if (g_utf8_validate(str, -1, NULL)) | |
216 return g_strdup(str); | |
217 | 229 |
218 /* assume encoding associated with locale */ | 230 /* assume encoding associated with locale */ |
219 if ((out_str = g_locale_to_utf8(str, -1, NULL, NULL, NULL))) | 231 if ((out_str = g_locale_to_utf8(str, -1, NULL, NULL, NULL))) |
220 return out_str; | 232 return out_str; |
221 | 233 |
333 } | 345 } |
334 g_strfreev(encs); | 346 g_strfreev(encs); |
335 } | 347 } |
336 } | 348 } |
337 | 349 |
338 #ifdef USE_CHARDET | |
339 /* many tag libraries return 2byte latin1 utf8 character as | |
340 converted 8bit iso-8859-1 character, if they are asked to return | |
341 latin1 string. | |
342 */ | |
343 if(!ret){ | 350 if(!ret){ |
344 ret = g_convert(str, len, "UTF-8", "ISO-8859-1", bytes_read, bytes_write, error); | 351 ret = g_convert(str, len, "UTF-8", "ISO-8859-1", bytes_read, bytes_write, error); |
345 } | 352 } |
346 #endif | |
347 | 353 |
348 if(ret){ | 354 if(ret){ |
349 if(g_utf8_validate(ret, -1, NULL)) | 355 if(g_utf8_validate(ret, -1, NULL)) |
350 return ret; | 356 return ret; |
351 else { | 357 else { |