Mercurial > pidgin-twitter
changeset 136:e67b0231ba56
Completely rewrote strip_html_markup(). The new function preserves non-tag strings surrounded by <> instead of stripping them.
author | Yoshiki Yazawa <yaz@honeyplanet.jp> |
---|---|
date | Wed, 23 Jul 2008 03:53:59 +0900 |
parents | cae8d5dd24d0 |
children | 97f11d057071 |
files | pidgin-twitter.c |
diffstat | 1 files changed, 153 insertions(+), 167 deletions(-) [+] |
line wrap: on
line diff
--- a/pidgin-twitter.c Tue Jul 22 20:00:03 2008 +0900 +++ b/pidgin-twitter.c Wed Jul 23 03:53:59 2008 +0900 @@ -39,163 +39,6 @@ /* functions */ /*************/ -/* this function is a modified clone of purple_markup_strip_html() */ -static char * -strip_html_markup(const char *str) -{ - int i, j, k, entlen; - gboolean visible = TRUE; - gboolean closing_td_p = FALSE; - gchar *str2; - const gchar *cdata_close_tag = NULL, *ent; - gchar *href = NULL; - int href_st = 0; - - if(!str) - return NULL; - - str2 = g_strdup(str); - - for (i = 0, j = 0; str2[i]; i++) - { - if (str2[i] == '<') - { - if (cdata_close_tag) - { - /* Note: Don't even assume any other tag is a tag in CDATA */ - if (g_ascii_strncasecmp(str2 + i, cdata_close_tag, - strlen(cdata_close_tag)) == 0) - { - i += strlen(cdata_close_tag) - 1; - cdata_close_tag = NULL; - } - continue; - } - else if (g_ascii_strncasecmp(str2 + i, "<td", 3) == 0 && closing_td_p) - { - str2[j++] = '\t'; - visible = TRUE; - } - else if (g_ascii_strncasecmp(str2 + i, "</td>", 5) == 0) - { - closing_td_p = TRUE; - visible = FALSE; - } - else - { - closing_td_p = FALSE; - visible = TRUE; - } - - k = i + 1; - - if(g_ascii_isspace(str2[k])) - visible = TRUE; - else if (str2[k]) - { - /* Scan until we end the tag either implicitly (closed start - * tag) or explicitly, using a sloppy method (i.e., < or > - * inside quoted attributes will screw us up) - */ - while (str2[k] && str2[k] != '<' && str2[k] != '>') - { - k++; - } - - /* If we've got an <a> tag with an href, save the address - * to print later. 
*/ - if (g_ascii_strncasecmp(str2 + i, "<a", 2) == 0 && - g_ascii_isspace(str2[i+2])) - { - int st; /* start of href, inclusive [ */ - int end; /* end of href, exclusive ) */ - char delim = ' '; - /* Find start of href */ - for (st = i + 3; st < k; st++) - { - if (g_ascii_strncasecmp(str2+st, "href=", 5) == 0) - { - st += 5; - if (str2[st] == '"' || str2[st] == '\'') - { - delim = str2[st]; - st++; - } - break; - } - } - /* find end of address */ - for (end = st; end < k && str2[end] != delim; end++) - { - /* All the work is done in the loop construct above. */ - } - - /* If there's an address, save it. If there was - * already one saved, kill it. */ - if (st < k) - { - char *tmp; - g_free(href); - tmp = g_strndup(str2 + st, end - st); - href = purple_unescape_html(tmp); - g_free(tmp); - href_st = j; - } - } - - /* Check for tags which should be mapped to newline */ - else if (g_ascii_strncasecmp(str2 + i, "<p>", 3) == 0 - || g_ascii_strncasecmp(str2 + i, "<tr", 3) == 0 - || g_ascii_strncasecmp(str2 + i, "<br", 3) == 0 - || g_ascii_strncasecmp(str2 + i, "<hr", 3) == 0 - || g_ascii_strncasecmp(str2 + i, "<li", 3) == 0 - || g_ascii_strncasecmp(str2 + i, "<div", 4) == 0 - || g_ascii_strncasecmp(str2 + i, "</table>", 8) == 0) - { - str2[j++] = '\n'; - } - else if (g_ascii_strncasecmp(str2 + i, "<script", 7) == 0) - { - cdata_close_tag = "</script>"; - } - else if (g_ascii_strncasecmp(str2 + i, "<style", 6) == 0) - { - cdata_close_tag = "</style>"; - } - /* Update the index and continue checking after the tag */ - i = (str2[k] == '<' || str2[k] == '\0')? k - 1: k; - continue; - } - } - else if (cdata_close_tag) - { - continue; - } - else if (!g_ascii_isspace(str2[i])) - { - visible = TRUE; - } - - if (str2[i] == '&' && - (ent = purple_markup_unescape_entity(str2 + i, &entlen)) != NULL) - { - while (*ent) - str2[j++] = *ent++; - i += entlen - 1; - continue; - } - - if (visible) - str2[j++] = g_ascii_isspace(str2[i])? 
' ': str2[i]; - } - - g_free(href); - - str2[j] = '\0'; - - return str2; -} - /* this function has been taken from autoaccept plugin */ static gboolean ensure_path_exists(const char *dir) @@ -213,6 +56,142 @@ /* our implementation */ /**********************/ +static gchar *html_tags[] = { + "<a href=", + "</a>", + "<b>", + "</b>", + "<p>", + "</p>", + "<div ", + "</div>", + "<span ", + "</span>", + "<body>", + "</body>", + "<i>", + "</i>", + "<font ", + "</font>", + "<br>", + "<br/>", + "<img ", + NULL +}; + +static gchar * +strip_html_markup(const gchar *src) +{ + gchar *head, *tail; /* head and tail of html */ + gchar *begin, *end; /* begin:< end:> */ + gchar *html, *str; /* copied src and str to be returned */ + gchar *vis1, *vis2; /* begin and end of address part */ + gchar *startp; /* starting point marker */ + gchar **tagp; /* tag iterator */ + gchar *tmp, *tmp2; /* scratches */ + + g_return_val_if_fail(src != NULL, NULL); + + const gchar *ptr, *ent; + gchar *ptr2; + gint entlen; + + /* unescape &x; */ + html = g_malloc0(strlen(src)); + ptr2 = html; + for(ptr = src; *ptr; ) { + if(*ptr == '&') { + ent = purple_markup_unescape_entity(ptr, &entlen); + if(ent != NULL) { + while(*ent) { + *ptr2++ = *ent++; + } + ptr += entlen; + } + } + else { + *ptr2++ = *ptr++; + } + } /* for */ + + str = g_strdup("\0"); + + head = html; + tail = head + strlen(html); + startp = head; + +loop: + begin = NULL; + end = NULL; + + if(startp >= tail) { + g_free(html); + return str; + } + + begin = strchr(startp, '<'); + if(begin) + end = strchr(begin + 1, '>'); + if(!end) { + tmp = g_strconcat(str, startp, NULL); + g_free(str); + str = tmp; + g_free(html); + return str; /* no corresponding >, we have done. 
*/ + } + + /* here, both < and > are found */ + /* concatenate leading part to dest */ + tmp = g_strndup(startp, begin - startp); + tmp2 = g_strconcat(str, tmp, NULL); + g_free(tmp); + g_free(str); + str = tmp2; + + /* find tag */ + for(tagp = html_tags; *tagp; tagp++) { + if(!g_ascii_strncasecmp(begin, *tagp, strlen(*tagp))) { + /* we found a valid tag */ + /* if tag is <a href=, extract address. */ + if(!strcmp(*tagp, "<a href=")) { + vis1 = NULL; vis2 = NULL; + + vis1 = strchr(begin, '\''); + if(vis1) + vis2 = strchr(vis1+1, '\''); + if(!vis1) { + vis1 = strchr(begin, '\"'); + if(vis1) + vis2 = strchr(vis1+1, '\"'); + } + if(vis1 && vis2) { + *vis2 = '\0'; + /* generate "[ http://example.com/ ] anchor " */ + tmp = g_strconcat(str, "[ ", vis1+1, " ]", " ", NULL); + g_free(str); + str = tmp; + } + startp = end + 1; + goto loop; + } /* <a href= */ + else { + /* anything else: discard whole <>. */ + startp = end + 1; + goto loop; + } + } /* valid tag */ + } + + /* no valid tag was found: copy <brabra> */ + tmp = g_strndup(begin, end - begin + 1); + tmp2 = g_strconcat(str, tmp, NULL); + g_free(tmp); + g_free(str); + str = tmp2; + startp = end + 1; + goto loop; +} + /* string utilities */ static void escape(gchar **str) @@ -259,13 +238,20 @@ } static void -strip_markup(gchar **str) +strip_markup(gchar **str, gboolean escape) { - char *plain; + gchar *plain; plain = strip_html_markup(*str); g_free(*str); - *str = plain; + if(escape) { + *str = g_markup_escape_text(plain, -1); + g_free(plain); + } + else { + *str = plain; + } + twitter_debug("result=%s\n", *str); } @@ -481,7 +467,7 @@ if(st->id > lastid && !is_posted_message(st)) { gchar *msg = NULL; - msg = g_strdup_printf("%s: %s\n", st->screen_name, st->text); + msg = g_strdup_printf("%s: %s", st->screen_name, st->text); purple_conv_im_write(conv->u.im, "twitter@twitter.com", msg, @@ -500,7 +486,7 @@ statuseslist = g_list_remove_all(statuseslist, NULL); } -/* status fetching function. it will be called periodically. 
*/ +/* status fetching function. it will be called periodically. */ static gboolean get_status_with_api(gpointer data) { @@ -741,16 +727,16 @@ twitter_ac = is_twitter_account(account, recipient); wassr_ac = is_wassr_account(account, recipient); + /* strip all markups */ + if(twitter_ac || wassr_ac) + strip_markup(buffer, TRUE); + if(wassr_ac) { /* store sending message to address parrot problem */ g_strlcpy(wassr_post, *buffer, WASSR_POST_LEN); twitter_debug("parrot pushed:%s\n", *buffer); } - /* strip all markups */ - if(twitter_ac || wassr_ac) - strip_markup(buffer); - /* return here if the message is not to twitter */ if(!twitter_ac) return FALSE; @@ -961,7 +947,7 @@ } /* strip all markups */ - strip_markup(buffer); //it causes missing of strings surrounded by <> + strip_markup(buffer, TRUE); /* playsound */ if(purple_prefs_get_bool(OPT_PLAYSOUND_SENDER)) {