comparison src/audacious/strings.c @ 2559:b474ecb5bde4 trunk

[svn] revise str_to_utf8(): - a new UTF-8 validator using the libguess DFA has been implemented; str_to_utf8() tries UTF-8 validation first. - default conversion from ISO-8859-1 is enabled regardless of chardet. - libguess and librcd are always compiled in. - some libguess cleanups.
author yaz
date Wed, 21 Feb 2007 04:25:12 -0800
parents 063374a51105
children e35538325145
comparison
equal deleted inserted replaced
2558:d4ecf0a91222 2559:b474ecb5bde4
32 #include <string.h> 32 #include <string.h>
33 #include <ctype.h> 33 #include <ctype.h>
34 34
35 #include "main.h" 35 #include "main.h"
36 36
37 #ifdef USE_CHARDET 37 #include "../libguess/libguess.h"
38 #include "../libguess/libguess.h" 38 #include "../librcd/librcd.h"
39 #include "../librcd/librcd.h"
40 #ifdef HAVE_UDET 39 #ifdef HAVE_UDET
41 #include <libudet_c.h> 40 #include <libudet_c.h>
42 #endif
43 #endif 41 #endif
44 42
45 /* 43 /*
46 * escape_shell_chars() 44 * escape_shell_chars()
47 * 45 *
201 199
202 /* Note: Currently, playlist calls this function repeatedly, even 200 /* Note: Currently, playlist calls this function repeatedly, even
203 * if the string is already converted into utf-8. 201 * if the string is already converted into utf-8.
204 * chardet_to_utf8() would convert a valid utf-8 string into a 202 * chardet_to_utf8() would convert a valid utf-8 string into a
205 * different utf-8 string, if fallback encodings were supplied and 203 * different utf-8 string, if fallback encodings were supplied and
206 * the given string could be treated as a string in one of fallback 204 * the given string could be treated as a string in one of
207 * encodings. To avoid this, the order of evaluation has been 205 * fallback encodings. To avoid this, g_utf8_validate() had been
208 * changed. (It might cause a drawback?) 206 * used at the top of evaluation.
209 */ 207 */
208
209 /* Note 2: g_utf8_validate() has so called encapsulated utf-8
210 * problem, thus chardet_to_utf8() took the place of that.
211 */
212
213 /* Note 3: As introducing madplug, the problem of conversion from
214 * ISO-8859-1 to UTF-8 arose. This may be coped with g_convert()
215 * located near the end of chardet_to_utf8(), but it requires utf8
216 * validation guard where g_utf8_validate() was. New
217 * dfa_validate_utf8() employs libguess' DFA engine to validate
218 * utf-8 and can properly distinguish examples of encapsulated
219 * utf-8. It is considered to be safe to use as a guard.
220 */
221
222 /* already UTF-8? */
223 if (dfa_validate_utf8(str, strlen(str)))
224 return g_strdup(str);
225
210 /* chardet encoding detector */ 226 /* chardet encoding detector */
211 if ((out_str = chardet_to_utf8(str, strlen(str), NULL, NULL, NULL))) 227 if ((out_str = chardet_to_utf8(str, strlen(str), NULL, NULL, NULL)))
212 return out_str; 228 return out_str;
213
214 /* already UTF-8? */
215 if (g_utf8_validate(str, -1, NULL))
216 return g_strdup(str);
217 229
218 /* assume encoding associated with locale */ 230 /* assume encoding associated with locale */
219 if ((out_str = g_locale_to_utf8(str, -1, NULL, NULL, NULL))) 231 if ((out_str = g_locale_to_utf8(str, -1, NULL, NULL, NULL)))
220 return out_str; 232 return out_str;
221 233
333 } 345 }
334 g_strfreev(encs); 346 g_strfreev(encs);
335 } 347 }
336 } 348 }
337 349
338 #ifdef USE_CHARDET
339 /* many tag libraries return 2byte latin1 utf8 character as
340 converted 8bit iso-8859-1 character, if they are asked to return
341 latin1 string.
342 */
343 if(!ret){ 350 if(!ret){
344 ret = g_convert(str, len, "UTF-8", "ISO-8859-1", bytes_read, bytes_write, error); 351 ret = g_convert(str, len, "UTF-8", "ISO-8859-1", bytes_read, bytes_write, error);
345 } 352 }
346 #endif
347 353
348 if(ret){ 354 if(ret){
349 if(g_utf8_validate(ret, -1, NULL)) 355 if(g_utf8_validate(ret, -1, NULL))
350 return ret; 356 return ret;
351 else { 357 else {