Mercurial > pidgin-twitter
changeset 45:746ff3b54c10
Trying another way to strip markup. In this revision, all markup is stripped with strip_html_markup() when sending a message.
author | Yoshiki Yazawa <yaz@cc.rim.or.jp> |
---|---|
date | Tue, 13 May 2008 08:26:38 +0900 |
parents | 4f456c7150ed |
children | e4f8e5708afd |
files | pidgin-twitter.c |
diffstat | 1 files changed, 195 insertions(+), 57 deletions(-) [+] |
line wrap: on
line diff
--- a/pidgin-twitter.c Tue May 13 03:42:45 2008 +0900 +++ b/pidgin-twitter.c Tue May 13 08:26:38 2008 +0900 @@ -35,13 +35,11 @@ #define SENDER 1 #define COMMAND 2 #define PSEUDO 3 -#define EXCESS_MARKUP 4 -#define FONT_MARKUP 5 -#define ATMARK_AFTER_A 6 #define PLUGIN_ID "gtk-honeyplanet-pidgin_twitter" #define PLUGIN_NAME "pidgin-twitter" +/* options */ #define OPT_PIDGINTWITTER "/plugins/pidgin_twitter" #define OPT_TRANSLATE_RECIPIENT OPT_PIDGINTWITTER "/translate_recipient" #define OPT_TRANSLATE_SENDER OPT_PIDGINTWITTER "/translate_sender" @@ -55,20 +53,28 @@ #define OPT_COUNTER OPT_PIDGINTWITTER "/counter" #define OPT_SUPPRESS_OOPS OPT_PIDGINTWITTER "/suppress_oops" +/* formats and templates */ #define RECIPIENT_FORMAT "@<a href='http://twitter.com/%s'>%s</a>" #define SENDER_FORMAT "<a href='http://twitter.com/%s'>%s</a>: " #define DEFAULT_LIST "(list of users: separated with ' ,:;')" #define OOPS_MESSAGE "<body>Oops! Your update was over 140 characters. We sent the short version to your friends (they can view the entire update on the web).<BR></body>" +/* patterns */ +#define P_RECIPIENT "@([A-Za-z0-9_]+)" +#define P_SENDER "<body>([A-Za-z0-9_]+): " +#define P_COMMAND "^(?:\\s*)([dDfFgGlLmMnNtTwW]{1}\\s+[A-Za-z0-9_]+)(?:\\s*\\Z)" +#define P_PSEUDO "^\\s*(?:[\"#$%&'()*+,\\-./:;<=>?\\[\\\\\\]_`{|}~]|[^\\s\\x21-\\x7E])*([dDfFgGlLmMnNtTwW]{1})(?:\\Z|\\s+|[^\\x21-\\x7E]+\\Z)" + +/* debug macros */ #define twitter_debug(fmt, ...) purple_debug(PURPLE_DEBUG_INFO, PLUGIN_NAME, "%s():%4d: " fmt, __FUNCTION__, (int)__LINE__, ## __VA_ARGS__); #define twitter_error(fmt, ...) 
purple_debug(PURPLE_DEBUG_ERROR, PLUGIN_NAME, "%s():%4d: " fmt, __FUCTION__, (int)__LINE__, ## __VA_ARGS__); + /* globals */ -static GRegex *regp[7]; +static GRegex *regp[4]; static gboolean suppress_oops = FALSE; /* prototypes */ -static void strip_excess_markup(gchar **str); static void escape(gchar **str); static gboolean sending_im_cb(PurpleAccount *account, char *recipient, char **buffer, void *data); static gboolean eval(const GMatchInfo *match_info, GString *result, gpointer user_data); @@ -92,34 +98,166 @@ static void init_plugin(PurplePlugin *plugin); -/* implementation */ -static void -strip_excess_markup(gchar **str) +/* tentative: this function is a modified clone of purple_markup_strip_html() */ +static char * +strip_html_markup(const char *str) { - gchar *newstr = NULL; + int i, j, k, entlen; + gboolean visible = TRUE; + gboolean closing_td_p = FALSE; + gchar *str2; + const gchar *cdata_close_tag = NULL, *ent; + gchar *href = NULL; + int href_st = 0; + + if(!str) + return NULL; + + str2 = g_strdup(str); - /* strip font tag */ - newstr = g_regex_replace(regp[FONT_MARKUP], *str, -1, 0, "\\1", 0, NULL); - twitter_debug("*str = %s newstr = %s\n", *str, newstr); - g_free(*str); - *str = newstr; + for (i = 0, j = 0; str2[i]; i++) + { + if (str2[i] == '<') + { + if (cdata_close_tag) + { + /* Note: Don't even assume any other tag is a tag in CDATA */ + if (g_ascii_strncasecmp(str2 + i, cdata_close_tag, + strlen(cdata_close_tag)) == 0) + { + i += strlen(cdata_close_tag) - 1; + cdata_close_tag = NULL; + } + continue; + } + else if (g_ascii_strncasecmp(str2 + i, "<td", 3) == 0 && closing_td_p) + { + str2[j++] = '\t'; + visible = TRUE; + } + else if (g_ascii_strncasecmp(str2 + i, "</td>", 5) == 0) + { + closing_td_p = TRUE; + visible = FALSE; + } + else + { + closing_td_p = FALSE; + visible = TRUE; + } + + k = i + 1; + + if(g_ascii_isspace(str2[k])) + visible = TRUE; + else if (str2[k]) + { + /* Scan until we end the tag either implicitly (closed start + * tag) 
or explicitly, using a sloppy method (i.e., < or > + * inside quoted attributes will screw us up) + */ + while (str2[k] && str2[k] != '<' && str2[k] != '>') + { + k++; + } - /* move @ prior to anchor tag */ - newstr = g_regex_replace(regp[ATMARK_AFTER_A], *str, -1, 0, "@\\1", 0, NULL); - twitter_debug("*str = %s newstr = %s\n", *str, newstr); - g_free(*str); - *str = newstr; + /* If we've got an <a> tag with an href, save the address + * to print later. */ + if (g_ascii_strncasecmp(str2 + i, "<a", 2) == 0 && + g_ascii_isspace(str2[i+2])) + { + int st; /* start of href, inclusive [ */ + int end; /* end of href, exclusive ) */ + char delim = ' '; + /* Find start of href */ + for (st = i + 3; st < k; st++) + { + if (g_ascii_strncasecmp(str2+st, "href=", 5) == 0) + { + st += 5; + if (str2[st] == '"' || str2[st] == '\'') + { + delim = str2[st]; + st++; + } + break; + } + } + /* find end of address */ + for (end = st; end < k && str2[end] != delim; end++) + { + /* All the work is done in the loop construct above. */ + } + + /* If there's an address, save it. If there was + * already one saved, kill it. 
*/ + if (st < k) + { + char *tmp; + g_free(href); + tmp = g_strndup(str2 + st, end - st); + href = purple_unescape_html(tmp); + g_free(tmp); + href_st = j; + } + } - /* strip link */ - newstr = - g_regex_replace(regp[EXCESS_MARKUP], *str, -1, 0, "\\1", 0, NULL); + /* Check for tags which should be mapped to newline */ + else if (g_ascii_strncasecmp(str2 + i, "<p>", 3) == 0 + || g_ascii_strncasecmp(str2 + i, "<tr", 3) == 0 + || g_ascii_strncasecmp(str2 + i, "<br", 3) == 0 + || g_ascii_strncasecmp(str2 + i, "<hr", 3) == 0 + || g_ascii_strncasecmp(str2 + i, "<li", 3) == 0 + || g_ascii_strncasecmp(str2 + i, "<div", 4) == 0 + || g_ascii_strncasecmp(str2 + i, "</table>", 8) == 0) + { + str2[j++] = '\n'; + } + else if (g_ascii_strncasecmp(str2 + i, "<script", 7) == 0) + { + cdata_close_tag = "</script>"; + } + else if (g_ascii_strncasecmp(str2 + i, "<style", 6) == 0) + { + cdata_close_tag = "</style>"; + } + /* Update the index and continue checking after the tag */ + i = (str2[k] == '<' || str2[k] == '\0')? k - 1: k; + continue; + } + } + else if (cdata_close_tag) + { + continue; + } + else if (!g_ascii_isspace(str2[i])) + { + visible = TRUE; + } - twitter_debug("*str = %s newstr = %s\n", *str, newstr); + if (str2[i] == '&' && + (ent = purple_markup_unescape_entity(str2 + i, &entlen)) != NULL) + { + while (*ent) + str2[j++] = *ent++; + i += entlen - 1; + continue; + } - g_free(*str); - *str = newstr; + if (visible) + str2[j++] = g_ascii_isspace(str2[i])? 
' ': str2[i]; + } + + g_free(href); + + str2[j] = '\0'; + + return str2; } + +/* our implementation */ + static void escape(gchar **str) { @@ -139,9 +277,8 @@ g_match_info_free(match_info); match_info = NULL; - if(flag) { + if(flag) return; - } /* if not found, check pseudo command */ g_regex_match(regp[PSEUDO], *str, 0, &match_info); @@ -165,16 +302,30 @@ } } +static void +strip_markup(gchar **str) +{ + char *plain; + + plain = strip_html_markup(*str); + g_free(*str); + *str = plain; +} + static gboolean sending_im_cb(PurpleAccount *account, char *recipient, char **buffer, void *data) { + int utflen, bytes; + + twitter_debug("called\n"); + /* check if the message is from twitter */ if(!is_twitter_account(account, recipient)) return FALSE; - /* strip excess markup */ - strip_excess_markup(buffer); + /* strip all markups */ + strip_markup(buffer); /* escape pseudo command */ if(purple_prefs_get_bool(OPT_ESCAPE_PSEUDO)) { @@ -182,8 +333,8 @@ } /* try to suppress oops message */ - gint utflen = g_utf8_strlen(*buffer, -1); - gint bytes = strlen(*buffer); + utflen = g_utf8_strlen(*buffer, -1); + bytes = strlen(*buffer); twitter_debug("utflen = %d bytes = %d\n", utflen, bytes); if(bytes > 140 && utflen <= 140) suppress_oops = TRUE; @@ -222,7 +373,7 @@ &which, // user data NULL); // error handler - twitter_debug("*str = %s newstr = %s\n", *str, newstr); + twitter_debug("which = %d *str = %s newstr = %s\n", which, *str, newstr); g_free(*str); *str = newstr; @@ -272,10 +423,14 @@ writing_im_cb(PurpleAccount *account, char *sender, char **buffer, PurpleConversation *conv, int *flags, void *data) { + twitter_debug("called\n"); + /* check if the message is from twitter */ if(!is_twitter_account(account, sender)) return FALSE; + /* strip all markups */ + strip_markup(buffer); /* playsound */ if(purple_prefs_get_bool(OPT_PLAYSOUND_SENDER)) { @@ -285,9 +440,6 @@ playsound(buffer, RECIPIENT); } - /* strip excess markup */ - strip_excess_markup(buffer); - /* translate */ 
if(purple_prefs_get_bool(OPT_TRANSLATE_SENDER)) { translate(buffer, SENDER); @@ -311,15 +463,13 @@ PidginConversation *gtkconv = (PidginConversation *)user_data; GtkWidget *box, *counter = NULL; gchar *markup = NULL; + guint count; g_return_if_fail(gtkconv != NULL); - guint count = gtk_text_buffer_get_char_count(textbuffer) + + count = gtk_text_buffer_get_char_count(textbuffer) + (unsigned int)g_utf8_strlen(new_text, -1); -// twitter_debug("new_text = %s utf8_strlen = %ld new_text_length = %d\n", -// new_text, g_utf8_strlen(new_text, -1), new_text_length); - markup = g_markup_printf_escaped("<span color=\"%s\">%u</span>", count <= 140 ? "black" : "red", count); @@ -493,6 +643,7 @@ receiving_im_cb(PurpleAccount *account, char **sender, char **buffer, PurpleConversation *conv, PurpleMessageFlags *flags, void *data) { + twitter_debug("called\n"); twitter_debug("buffer = %s suppress_oops = %d\n", *buffer, suppress_oops); if(!suppress_oops || !purple_prefs_get_bool(OPT_SUPPRESS_OOPS)) @@ -524,21 +675,10 @@ plugin, PURPLE_CALLBACK(receiving_im_cb), NULL); /* compile regex */ - regp[RECIPIENT] = g_regex_new("@([A-Za-z0-9_]+)", 0, 0, NULL); - regp[SENDER] = g_regex_new("<body>([A-Za-z0-9_]+): ", 0, 0, NULL); - regp[COMMAND] = - g_regex_new("^(?:\\s*)([dDfFgGlLmMnNtTwW]{1}\\s+[A-Za-z0-9_]+)(?:\\s*\\Z)", - G_REGEX_RAW, 0, NULL); - regp[PSEUDO] = - g_regex_new - ("^\\s*(?:[\"#$%&'()*+,\\-./:;<=>?\\[\\\\\\]_`{|}~]|[^\\s\\x21-\\x7E])*([dDfFgGlLmMnNtTwW]{1})(?:\\Z|\\s+|[^\\x21-\\x7E]+\\Z)", - G_REGEX_RAW, 0, NULL); - regp[EXCESS_MARKUP] = - g_regex_new - ("<a href=\"http://twitter.com/([A-Za-z0-9_]+?)\">\\1</a>", 0, 0, - NULL); - regp[FONT_MARKUP] = g_regex_new("<font .+?>(.+?)</font>", 0, 0, NULL); - regp[ATMARK_AFTER_A] = g_regex_new("(<a href=.+?>)@", 0, 0, NULL); + regp[RECIPIENT] = g_regex_new(P_RECIPIENT, 0, 0, NULL); + regp[SENDER] = g_regex_new(P_SENDER, 0, 0, NULL); + regp[COMMAND] = g_regex_new(P_COMMAND, G_REGEX_RAW, 0, NULL); + regp[PSEUDO] = g_regex_new(P_PSEUDO, 
G_REGEX_RAW, 0, NULL); /* attach counter to the existing twitter window */ gboolean enabled = purple_prefs_get_bool(OPT_COUNTER); @@ -573,9 +713,6 @@ g_regex_unref(regp[SENDER]); g_regex_unref(regp[COMMAND]); g_regex_unref(regp[PSEUDO]); - g_regex_unref(regp[EXCESS_MARKUP]); - g_regex_unref(regp[FONT_MARKUP]); - g_regex_unref(regp[ATMARK_AFTER_A]); /* detach from twitter window */ detach_from_window(); @@ -588,6 +725,7 @@ gconstpointer val, gpointer data) { gboolean enabled = purple_prefs_get_bool(OPT_COUNTER); + if(enabled) { attach_to_window(); }