# HG changeset patch # User Sadrul Habib Chowdhury # Date 1155860125 0 # Node ID d38d8716426c53564d0de1ff862f4b849706bbde # Parent b1ce2e9e494a9706be9bf98320c7359ffbf33f0d [gaim-migrate @ 16821] Patch #1523103 ("Improved markup processing"): "This patch tries to handle all the html-entities (including stuff like &reg; and &#xx;) in gaim_markup_strip_html and gaim_unescape_html. This also allows fixing the issue with libxml2 where & was being converted to &." committer: Tailor Script diff -r b1ce2e9e494a -r d38d8716426c src/util.c --- a/src/util.c Thu Aug 17 23:58:58 2006 +0000 +++ b/src/util.c Fri Aug 18 00:15:25 2006 +0000 @@ -854,6 +854,55 @@ /************************************************************************** * Markup Functions **************************************************************************/ + +/* Returns a NULL-terminated string after unescaping an entity + * (eg. &, < & etc.) starting at s. Returns NULL on failure.*/ +static const char * +detect_entity(const char *text, int *length) +{ + const char *pln; + int len, pound; + + if (!text || *text != '&') + return NULL; + +#define IS_ENTITY(s) (!g_ascii_strncasecmp(text, s, (len = sizeof(s) - 1))) + + if(IS_ENTITY("&amp;")) + pln = "&"; + else if(IS_ENTITY("&lt;")) + pln = "<"; + else if(IS_ENTITY("&gt;")) + pln = ">"; + else if(IS_ENTITY("&nbsp;")) + pln = " "; + else if(IS_ENTITY("&copy;")) + pln = "\251"; + else if(IS_ENTITY("&quot;")) + pln = "\""; + else if(IS_ENTITY("&reg;")) + pln = "\256"; + else if(IS_ENTITY("&apos;")) + pln = "\'"; + else if(*(text+1) == '#' && (sscanf(text, "&#%u;", &pound) == 1) && + pound != 0 && *(text+3+(gint)log10(pound)) == ';') { + static char buf[7]; + int buflen = g_unichar_to_utf8((gunichar)pound, buf); + buf[buflen] = '\0'; + pln = buf; + + len = 2; + while(isdigit((gint) text[len])) len++; + if(text[len] == ';') len++; + } + else + return NULL; + + if (length) + *length = len; + return pln; +} + gboolean gaim_markup_find_tag(const char *needle, const char *haystack, const char **start, const char 
**end, GData **attributes) @@ -1443,44 +1492,10 @@ } } else if(*c == '&') { char buf[7]; - char *pln; - int len = 1; - guint pound; - if(!g_ascii_strncasecmp(c, "&amp;", 5)) { - pln = "&"; - len = 5; - } else if(!g_ascii_strncasecmp(c, "&lt;", 4)) { - pln = "<"; - len = 4; - } else if(!g_ascii_strncasecmp(c, "&gt;", 4)) { - pln = ">"; - len = 4; - } else if(!g_ascii_strncasecmp(c, "&nbsp;", 6)) { - pln = " "; - len = 6; - } else if(!g_ascii_strncasecmp(c, "&copy;", 6)) { - pln = "©"; - len = 6; - } else if(!g_ascii_strncasecmp(c, "&quot;", 6)) { - pln = "\""; - len = 6; - } else if(!g_ascii_strncasecmp(c, "&reg;", 5)) { - pln = "®"; - len = 5; - } else if(!g_ascii_strncasecmp(c, "&apos;", 6)) { - pln = "\'"; - len = 6; - } else if(*(c+1) == '#' && (sscanf(c, "&#%u;", &pound) == 1) && - pound != 0 && *(c+3+(gint)log10(pound)) == ';') { - int buflen = g_unichar_to_utf8((gunichar)pound, buf); - buf[buflen] = '\0'; - pln = buf; - - - len = 2; - while(isdigit((gint) c [len])) len++; - if(c [len] == ';') len++; - } else { + const char *pln; + int len; + + if ((pln = detect_entity(c, &len)) == NULL) { len = 1; g_snprintf(buf, sizeof(buf), "%c", *c); pln = buf; @@ -1522,11 +1537,11 @@ char * gaim_markup_strip_html(const char *str) { - int i, j, k; + int i, j, k, entlen; gboolean visible = TRUE; gboolean closing_td_p = FALSE; gchar *str2; - const gchar *cdata_close_tag = NULL; + const gchar *cdata_close_tag = NULL, *ent; gchar *href = NULL; int href_st = 0; @@ -1685,41 +1700,11 @@ visible = TRUE; } - /* XXX: This sucks. 
We need to be un-escaping all entities, which - * includes these, as well as the &#num; ones */ - - if (str2[i] == '&' && strncasecmp(str2 + i, "&quot;", 6) == 0) - { - str2[j++] = '\"'; - i = i + 5; - continue; - } - - if (str2[i] == '&' && strncasecmp(str2 + i, "&amp;", 5) == 0) + if (str2[i] == '&' && (ent = detect_entity(str2 + i, &entlen)) != NULL) { - str2[j++] = '&'; - i = i + 4; - continue; - } - - if (str2[i] == '&' && strncasecmp(str2 + i, "&lt;", 4) == 0) - { - str2[j++] = '<'; - i = i + 3; - continue; - } - - if (str2[i] == '&' && strncasecmp(str2 + i, "&gt;", 4) == 0) - { - str2[j++] = '>'; - i = i + 3; - continue; - } - - if (str2[i] == '&' && strncasecmp(str2 + i, "&apos;", 6) == 0) - { - str2[j++] = '\''; - i = i + 5; + while (*ent) + str2[j++] = *ent++; + i += entlen - 1; continue; } @@ -2026,41 +2011,28 @@ char * gaim_unescape_html(const char *html) { - const char *c; - GString *ret; - - if (html == NULL) - return NULL; - - c = html; - ret = g_string_new(""); - while (*c) { - if (!strncmp(c, "&amp;", 5)) { - ret = g_string_append_c(ret, '&'); - c += 5; - } else if (!strncmp(c, "&lt;", 4)) { - ret = g_string_append_c(ret, '<'); - c += 4; - } else if (!strncmp(c, "&gt;", 4)) { - ret = g_string_append_c(ret, '>'); - c += 4; - } else if (!strncmp(c, "&quot;", 6)) { - ret = g_string_append_c(ret, '"'); - c += 6; - } else if (!strncmp(c, "&apos;", 6)) { - ret = g_string_append_c(ret, '\''); - c += 6; - } else if (!strncmp(c, "<br>", 4)) { - ret = g_string_append_c(ret, '\n'); - c += 4; - } else { - ret = g_string_append_c(ret, *c); - c++; + if (html != NULL) { + const char *c = html; + GString *ret = g_string_new(""); + while (*c) { + int len; + const char *ent; + + if ((ent = detect_entity(c, &len)) != NULL) { + ret = g_string_append(ret, ent); + c += len; + } else if (!strncmp(c, "<br>", 4)) { + ret = g_string_append_c(ret, '\n'); + c += 4; + } else { + ret = g_string_append_c(ret, *c); + c++; + } } + return g_string_free(ret, FALSE); } - return g_string_free(ret, FALSE); - + return NULL; } char * @@ -3998,4 +3970,3 @@ return buf; } - diff -r b1ce2e9e494a -r d38d8716426c src/xmlnode.c --- a/src/xmlnode.c Thu Aug 17 23:58:58 2006 +0000 +++ b/src/xmlnode.c Fri Aug 18 00:15:25 2006 +0000 @@ -35,6 +35,7 @@ #include #include +#include "util.h" #include "xmlnode.h" #ifdef _WIN32 @@ -406,6 +407,11 @@ char *attrib = g_malloc(attrib_len + 1); memcpy(attrib, attributes[i+3], attrib_len); attrib[attrib_len] = '\0'; +#ifdef HAVE_LIBXML + char *txt = attrib; + attrib = gaim_unescape_html(txt); + g_free(txt); +#endif xmlnode_set_attrib(node, attributes[i], attrib); g_free(attrib); }