changeset 45:746ff3b54c10

Trying another way to strip markup. In this revision, all markup is stripped with strip_html_markup() when sending a message.
author Yoshiki Yazawa <yaz@cc.rim.or.jp>
date Tue, 13 May 2008 08:26:38 +0900
parents 4f456c7150ed
children e4f8e5708afd
files pidgin-twitter.c
diffstat 1 files changed, 195 insertions(+), 57 deletions(-) [+]
line wrap: on
line diff
--- a/pidgin-twitter.c	Tue May 13 03:42:45 2008 +0900
+++ b/pidgin-twitter.c	Tue May 13 08:26:38 2008 +0900
@@ -35,13 +35,11 @@
 #define SENDER      1
 #define COMMAND     2
 #define PSEUDO      3
-#define EXCESS_MARKUP   4
-#define FONT_MARKUP     5
-#define ATMARK_AFTER_A  6
 
 #define PLUGIN_ID	            "gtk-honeyplanet-pidgin_twitter"
 #define PLUGIN_NAME	            "pidgin-twitter"
 
+/* options */
 #define OPT_PIDGINTWITTER 		"/plugins/pidgin_twitter"
 #define OPT_TRANSLATE_RECIPIENT OPT_PIDGINTWITTER "/translate_recipient"
 #define OPT_TRANSLATE_SENDER    OPT_PIDGINTWITTER "/translate_sender"
@@ -55,20 +53,28 @@
 #define OPT_COUNTER             OPT_PIDGINTWITTER "/counter"
 #define OPT_SUPPRESS_OOPS       OPT_PIDGINTWITTER "/suppress_oops"
 
+/* formats and templates */
 #define RECIPIENT_FORMAT        "@<a href='http://twitter.com/%s'>%s</a>"
 #define SENDER_FORMAT           "<a href='http://twitter.com/%s'>%s</a>: "
 #define DEFAULT_LIST            "(list of users: separated with ' ,:;')"
 #define OOPS_MESSAGE            "<body>Oops! Your update was over 140 characters. We sent the short version to your friends (they can view the entire update on the web).<BR></body>"
 
+/* patterns */
+#define P_RECIPIENT     "@([A-Za-z0-9_]+)"
+#define P_SENDER        "<body>([A-Za-z0-9_]+): "
+#define P_COMMAND       "^(?:\\s*)([dDfFgGlLmMnNtTwW]{1}\\s+[A-Za-z0-9_]+)(?:\\s*\\Z)"
+#define P_PSEUDO        "^\\s*(?:[\"#$%&'()*+,\\-./:;<=>?\\[\\\\\\]_`{|}~]|[^\\s\\x21-\\x7E])*([dDfFgGlLmMnNtTwW]{1})(?:\\Z|\\s+|[^\\x21-\\x7E]+\\Z)"
+
+/* debug macros */
 #define twitter_debug(fmt, ...)	purple_debug(PURPLE_DEBUG_INFO, PLUGIN_NAME, "%s():%4d:  " fmt, __FUNCTION__, (int)__LINE__, ## __VA_ARGS__);
 #define twitter_error(fmt, ...)	purple_debug(PURPLE_DEBUG_ERROR, PLUGIN_NAME, "%s():%4d:  " fmt, __FUCTION__, (int)__LINE__, ## __VA_ARGS__);
 
+
 /* globals */
-static GRegex *regp[7];
+static GRegex *regp[4];
 static gboolean suppress_oops = FALSE;
 
 /* prototypes */
-static void strip_excess_markup(gchar **str);
 static void escape(gchar **str);
 static gboolean sending_im_cb(PurpleAccount *account, char *recipient, char **buffer, void *data);
 static gboolean eval(const GMatchInfo *match_info, GString *result, gpointer user_data);
@@ -92,34 +98,166 @@
 static void init_plugin(PurplePlugin *plugin);
 
 
-/* implementation */
-static void
-strip_excess_markup(gchar **str)
+/* tentative: this function is a modified clone of purple_markup_strip_html() */
+static char *
+strip_html_markup(const char *str)
 {
-    gchar *newstr = NULL;
+	int i, j, k, entlen;
+	gboolean visible = TRUE;
+	gboolean closing_td_p = FALSE;
+	gchar *str2;
+	const gchar *cdata_close_tag = NULL, *ent;
+	gchar *href = NULL;
+	int href_st = 0;
+
+	if(!str)
+		return NULL;
+
+	str2 = g_strdup(str);
 
-    /* strip font tag */
-    newstr = g_regex_replace(regp[FONT_MARKUP], *str, -1, 0, "\\1", 0, NULL);
-    twitter_debug("*str = %s newstr = %s\n", *str, newstr);
-    g_free(*str);
-    *str = newstr;
+	for (i = 0, j = 0; str2[i]; i++)
+	{
+		if (str2[i] == '<')
+		{
+			if (cdata_close_tag)
+			{
+				/* Note: Don't even assume any other tag is a tag in CDATA */
+				if (g_ascii_strncasecmp(str2 + i, cdata_close_tag,
+						strlen(cdata_close_tag)) == 0)
+				{
+					i += strlen(cdata_close_tag) - 1;
+					cdata_close_tag = NULL;
+				}
+				continue;
+			}
+			else if (g_ascii_strncasecmp(str2 + i, "<td", 3) == 0 && closing_td_p)
+			{
+				str2[j++] = '\t';
+				visible = TRUE;
+			}
+			else if (g_ascii_strncasecmp(str2 + i, "</td>", 5) == 0)
+			{
+				closing_td_p = TRUE;
+				visible = FALSE;
+			}
+			else
+			{
+				closing_td_p = FALSE;
+				visible = TRUE;
+			}
+
+			k = i + 1;
+
+			if(g_ascii_isspace(str2[k]))
+				visible = TRUE;
+			else if (str2[k])
+			{
+				/* Scan until we end the tag either implicitly (closed start
+				 * tag) or explicitly, using a sloppy method (i.e., < or >
+				 * inside quoted attributes will screw us up)
+				 */
+				while (str2[k] && str2[k] != '<' && str2[k] != '>')
+				{
+					k++;
+				}
 
-    /* move @ prior to anchor tag */
-    newstr = g_regex_replace(regp[ATMARK_AFTER_A], *str, -1, 0, "@\\1", 0, NULL);
-    twitter_debug("*str = %s newstr = %s\n", *str, newstr);
-    g_free(*str);
-    *str = newstr;
+				/* If we've got an <a> tag with an href, save the address
+				 * to print later. */
+				if (g_ascii_strncasecmp(str2 + i, "<a", 2) == 0 &&
+				    g_ascii_isspace(str2[i+2]))
+				{
+					int st; /* start of href, inclusive [ */
+					int end; /* end of href, exclusive ) */
+					char delim = ' ';
+					/* Find start of href */
+					for (st = i + 3; st < k; st++)
+					{
+						if (g_ascii_strncasecmp(str2+st, "href=", 5) == 0)
+						{
+							st += 5;
+							if (str2[st] == '"' || str2[st] == '\'')
+							{
+								delim = str2[st];
+								st++;
+							}
+							break;
+						}
+					}
+					/* find end of address */
+					for (end = st; end < k && str2[end] != delim; end++)
+					{
+						/* All the work is done in the loop construct above. */
+					}
+
+					/* If there's an address, save it.  If there was
+					 * already one saved, kill it. */
+					if (st < k)
+					{
+						char *tmp;
+						g_free(href);
+						tmp = g_strndup(str2 + st, end - st);
+						href = purple_unescape_html(tmp);
+						g_free(tmp);
+						href_st = j;
+					}
+				}
 
-    /* strip link */
-    newstr =
-        g_regex_replace(regp[EXCESS_MARKUP], *str, -1, 0, "\\1", 0, NULL);
+				/* Check for tags which should be mapped to newline */
+				else if (g_ascii_strncasecmp(str2 + i, "<p>", 3) == 0
+				 || g_ascii_strncasecmp(str2 + i, "<tr", 3) == 0
+				 || g_ascii_strncasecmp(str2 + i, "<br", 3) == 0
+				 || g_ascii_strncasecmp(str2 + i, "<hr", 3) == 0
+				 || g_ascii_strncasecmp(str2 + i, "<li", 3) == 0
+				 || g_ascii_strncasecmp(str2 + i, "<div", 4) == 0
+				 || g_ascii_strncasecmp(str2 + i, "</table>", 8) == 0)
+				{
+					str2[j++] = '\n';
+				}
+				else if (g_ascii_strncasecmp(str2 + i, "<script", 7) == 0)
+				{
+					cdata_close_tag = "</script>";
+				}
+				else if (g_ascii_strncasecmp(str2 + i, "<style", 6) == 0)
+				{
+					cdata_close_tag = "</style>";
+				}
+				/* Update the index and continue checking after the tag */
+				i = (str2[k] == '<' || str2[k] == '\0')? k - 1: k;
+				continue;
+			}
+		}
+		else if (cdata_close_tag)
+		{
+			continue;
+		}
+		else if (!g_ascii_isspace(str2[i]))
+		{
+			visible = TRUE;
+		}
 
-    twitter_debug("*str = %s newstr = %s\n", *str, newstr);
+		if (str2[i] == '&' &&
+            (ent = purple_markup_unescape_entity(str2 + i, &entlen)) != NULL)
+		{
+			while (*ent)
+				str2[j++] = *ent++;
+			i += entlen - 1;
+			continue;
+		}
 
-    g_free(*str);
-    *str = newstr;
+		if (visible)
+			str2[j++] = g_ascii_isspace(str2[i])? ' ': str2[i];
+	}
+
+	g_free(href);
+
+	str2[j] = '\0';
+
+	return str2;
 }
 
+
+/* our implementation */
+
 static void
 escape(gchar **str)
 {
@@ -139,9 +277,8 @@
     g_match_info_free(match_info);
     match_info = NULL;
 
-    if(flag) {
+    if(flag)
         return;
-    }
 
     /* if not found, check pseudo command */
     g_regex_match(regp[PSEUDO], *str, 0, &match_info);
@@ -165,16 +302,30 @@
     }
 }
 
+static void
+strip_markup(gchar **str)
+{
+    char *plain;
+
+    plain = strip_html_markup(*str);
+    g_free(*str);
+    *str = plain;
+}
+
 static gboolean
 sending_im_cb(PurpleAccount *account, char *recipient, char **buffer,
               void *data)
 {
+    int utflen, bytes;
+
+    twitter_debug("called\n");
+
     /* check if the message is from twitter */
     if(!is_twitter_account(account, recipient))
         return FALSE;
 
-    /* strip excess markup */
-    strip_excess_markup(buffer);
+    /* strip all markups */
+    strip_markup(buffer);
 
     /* escape pseudo command */
     if(purple_prefs_get_bool(OPT_ESCAPE_PSEUDO)) {
@@ -182,8 +333,8 @@
     }
 
     /* try to suppress oops message */
-    gint utflen = g_utf8_strlen(*buffer, -1);
-    gint bytes = strlen(*buffer);
+    utflen = g_utf8_strlen(*buffer, -1);
+    bytes = strlen(*buffer);
     twitter_debug("utflen = %d bytes = %d\n", utflen, bytes);
     if(bytes > 140 && utflen <= 140)
         suppress_oops = TRUE;
@@ -222,7 +373,7 @@
                                   &which,   // user data
                                   NULL);    // error handler
 
-    twitter_debug("*str = %s newstr = %s\n", *str, newstr);
+    twitter_debug("which = %d *str = %s newstr = %s\n", which, *str, newstr);
 
     g_free(*str);
     *str = newstr;
@@ -272,10 +423,14 @@
 writing_im_cb(PurpleAccount *account, char *sender, char **buffer,
               PurpleConversation *conv, int *flags, void *data)
 {
+    twitter_debug("called\n");
+
     /* check if the message is from twitter */
     if(!is_twitter_account(account, sender))
         return FALSE;
 
+    /* strip all markups */
+    strip_markup(buffer);
 
     /* playsound */
     if(purple_prefs_get_bool(OPT_PLAYSOUND_SENDER)) {
@@ -285,9 +440,6 @@
         playsound(buffer, RECIPIENT);
     }
 
-    /* strip excess markup */
-    strip_excess_markup(buffer);
-
     /* translate */
     if(purple_prefs_get_bool(OPT_TRANSLATE_SENDER)) {
         translate(buffer, SENDER);
@@ -311,15 +463,13 @@
     PidginConversation *gtkconv = (PidginConversation *)user_data;
     GtkWidget *box, *counter = NULL;
     gchar *markup = NULL;
+    guint count;
 
     g_return_if_fail(gtkconv != NULL);
 
-    guint count = gtk_text_buffer_get_char_count(textbuffer) +
+    count = gtk_text_buffer_get_char_count(textbuffer) +
         (unsigned int)g_utf8_strlen(new_text, -1);
 
-//    twitter_debug("new_text = %s utf8_strlen = %ld new_text_length = %d\n",
-//                  new_text, g_utf8_strlen(new_text, -1), new_text_length);
-
     markup = g_markup_printf_escaped("<span color=\"%s\">%u</span>",
                                      count <= 140 ? "black" : "red", count);
 
@@ -493,6 +643,7 @@
 receiving_im_cb(PurpleAccount *account, char **sender, char **buffer,
                 PurpleConversation *conv, PurpleMessageFlags *flags, void *data)
 {
+    twitter_debug("called\n");
     twitter_debug("buffer = %s suppress_oops = %d\n", *buffer, suppress_oops);
 
     if(!suppress_oops || !purple_prefs_get_bool(OPT_SUPPRESS_OOPS))
@@ -524,21 +675,10 @@
                           plugin, PURPLE_CALLBACK(receiving_im_cb), NULL);
 
     /* compile regex */
-    regp[RECIPIENT] = g_regex_new("@([A-Za-z0-9_]+)", 0, 0, NULL);
-    regp[SENDER] = g_regex_new("<body>([A-Za-z0-9_]+): ", 0, 0, NULL);
-    regp[COMMAND] =
-        g_regex_new("^(?:\\s*)([dDfFgGlLmMnNtTwW]{1}\\s+[A-Za-z0-9_]+)(?:\\s*\\Z)",
-                    G_REGEX_RAW, 0, NULL);
-    regp[PSEUDO] =
-        g_regex_new
-        ("^\\s*(?:[\"#$%&'()*+,\\-./:;<=>?\\[\\\\\\]_`{|}~]|[^\\s\\x21-\\x7E])*([dDfFgGlLmMnNtTwW]{1})(?:\\Z|\\s+|[^\\x21-\\x7E]+\\Z)",
-         G_REGEX_RAW, 0, NULL);
-    regp[EXCESS_MARKUP] =
-        g_regex_new
-        ("<a href=\"http://twitter.com/([A-Za-z0-9_]+?)\">\\1</a>", 0, 0,
-         NULL);
-    regp[FONT_MARKUP] = g_regex_new("<font .+?>(.+?)</font>", 0, 0, NULL);
-    regp[ATMARK_AFTER_A] = g_regex_new("(<a href=.+?>)@", 0, 0, NULL);
+    regp[RECIPIENT] = g_regex_new(P_RECIPIENT, 0, 0, NULL);
+    regp[SENDER]    = g_regex_new(P_SENDER,    0, 0, NULL);
+    regp[COMMAND]   = g_regex_new(P_COMMAND, G_REGEX_RAW, 0, NULL);
+    regp[PSEUDO]    = g_regex_new(P_PSEUDO,  G_REGEX_RAW, 0, NULL);
 
     /* attach counter to the existing twitter window */
     gboolean enabled = purple_prefs_get_bool(OPT_COUNTER);
@@ -573,9 +713,6 @@
     g_regex_unref(regp[SENDER]);
     g_regex_unref(regp[COMMAND]);
     g_regex_unref(regp[PSEUDO]);
-    g_regex_unref(regp[EXCESS_MARKUP]);
-    g_regex_unref(regp[FONT_MARKUP]);
-    g_regex_unref(regp[ATMARK_AFTER_A]);
 
     /* detach from twitter window */
     detach_from_window();
@@ -588,6 +725,7 @@
                  gconstpointer val, gpointer data)
 {
     gboolean enabled = purple_prefs_get_bool(OPT_COUNTER);
+
     if(enabled) {
         attach_to_window();
     }