changeset 136:e67b0231ba56

completely re-wrote strip_html_markup(). new function permits non-tag strings surrounded by <>.
author Yoshiki Yazawa <yaz@honeyplanet.jp>
date Wed, 23 Jul 2008 03:53:59 +0900
parents cae8d5dd24d0
children 97f11d057071
files pidgin-twitter.c
diffstat 1 files changed, 153 insertions(+), 167 deletions(-) [+]
line wrap: on
line diff
--- a/pidgin-twitter.c	Tue Jul 22 20:00:03 2008 +0900
+++ b/pidgin-twitter.c	Wed Jul 23 03:53:59 2008 +0900
@@ -39,163 +39,6 @@
 /* functions */
 /*************/
 
-/* this function is a modified clone of purple_markup_strip_html() */
-static char *
-strip_html_markup(const char *str)
-{
-	int i, j, k, entlen;
-	gboolean visible = TRUE;
-	gboolean closing_td_p = FALSE;
-	gchar *str2;
-	const gchar *cdata_close_tag = NULL, *ent;
-	gchar *href = NULL;
-	int href_st = 0;
-
-	if(!str)
-		return NULL;
-
-	str2 = g_strdup(str);
-
-	for (i = 0, j = 0; str2[i]; i++)
-	{
-		if (str2[i] == '<')
-		{
-			if (cdata_close_tag)
-			{
-				/* Note: Don't even assume any other tag is a tag in CDATA */
-				if (g_ascii_strncasecmp(str2 + i, cdata_close_tag,
-                                        strlen(cdata_close_tag)) == 0)
-				{
-					i += strlen(cdata_close_tag) - 1;
-					cdata_close_tag = NULL;
-				}
-				continue;
-			}
-			else if (g_ascii_strncasecmp(str2 + i, "<td", 3) == 0 && closing_td_p)
-			{
-				str2[j++] = '\t';
-				visible = TRUE;
-			}
-			else if (g_ascii_strncasecmp(str2 + i, "</td>", 5) == 0)
-			{
-				closing_td_p = TRUE;
-				visible = FALSE;
-			}
-			else
-			{
-				closing_td_p = FALSE;
-				visible = TRUE;
-			}
-
-			k = i + 1;
-
-			if(g_ascii_isspace(str2[k]))
-				visible = TRUE;
-			else if (str2[k])
-			{
-				/* Scan until we end the tag either implicitly (closed start
-				 * tag) or explicitly, using a sloppy method (i.e., < or >
-				 * inside quoted attributes will screw us up)
-				 */
-				while (str2[k] && str2[k] != '<' && str2[k] != '>')
-				{
-					k++;
-				}
-
-				/* If we've got an <a> tag with an href, save the address
-				 * to print later. */
-				if (g_ascii_strncasecmp(str2 + i, "<a", 2) == 0 &&
-				    g_ascii_isspace(str2[i+2]))
-				{
-					int st; /* start of href, inclusive [ */
-					int end; /* end of href, exclusive ) */
-					char delim = ' ';
-					/* Find start of href */
-					for (st = i + 3; st < k; st++)
-					{
-						if (g_ascii_strncasecmp(str2+st, "href=", 5) == 0)
-						{
-							st += 5;
-							if (str2[st] == '"' || str2[st] == '\'')
-							{
-								delim = str2[st];
-								st++;
-							}
-							break;
-						}
-					}
-					/* find end of address */
-					for (end = st; end < k && str2[end] != delim; end++)
-					{
-						/* All the work is done in the loop construct above. */
-					}
-
-					/* If there's an address, save it.  If there was
-					 * already one saved, kill it. */
-					if (st < k)
-					{
-						char *tmp;
-						g_free(href);
-						tmp = g_strndup(str2 + st, end - st);
-						href = purple_unescape_html(tmp);
-						g_free(tmp);
-						href_st = j;
-					}
-				}
-
-				/* Check for tags which should be mapped to newline */
-				else if (g_ascii_strncasecmp(str2 + i, "<p>", 3) == 0
-                         || g_ascii_strncasecmp(str2 + i, "<tr", 3) == 0
-                         || g_ascii_strncasecmp(str2 + i, "<br", 3) == 0
-                         || g_ascii_strncasecmp(str2 + i, "<hr", 3) == 0
-                         || g_ascii_strncasecmp(str2 + i, "<li", 3) == 0
-                         || g_ascii_strncasecmp(str2 + i, "<div", 4) == 0
-                         || g_ascii_strncasecmp(str2 + i, "</table>", 8) == 0)
-				{
-					str2[j++] = '\n';
-				}
-				else if (g_ascii_strncasecmp(str2 + i, "<script", 7) == 0)
-				{
-					cdata_close_tag = "</script>";
-				}
-				else if (g_ascii_strncasecmp(str2 + i, "<style", 6) == 0)
-				{
-					cdata_close_tag = "</style>";
-				}
-				/* Update the index and continue checking after the tag */
-				i = (str2[k] == '<' || str2[k] == '\0')? k - 1: k;
-				continue;
-			}
-		}
-		else if (cdata_close_tag)
-		{
-			continue;
-		}
-		else if (!g_ascii_isspace(str2[i]))
-		{
-			visible = TRUE;
-		}
-
-		if (str2[i] == '&' &&
-            (ent = purple_markup_unescape_entity(str2 + i, &entlen)) != NULL)
-		{
-			while (*ent)
-				str2[j++] = *ent++;
-			i += entlen - 1;
-			continue;
-		}
-
-		if (visible)
-			str2[j++] = g_ascii_isspace(str2[i])? ' ': str2[i];
-	}
-
-	g_free(href);
-
-	str2[j] = '\0';
-
-	return str2;
-}
-
 /* this function has been taken from autoaccept plugin */
 static gboolean
 ensure_path_exists(const char *dir)
@@ -213,6 +56,142 @@
 /* our implementation */
 /**********************/
 
+static gchar *html_tags[] = {   /* tag prefixes recognised by strip_html_markup(); compared case-insensitively */
+    "<a href=",                 /* handled specially: replaced by "[ address ] " */
+    "</a>",
+    "<b>",                      /* entries ending in '>' match only the bare tag */
+    "</b>",
+    "<p>",
+    "</p>",
+    "<div ",                    /* entries ending in a blank match only a tag with something after its name */
+    "</div>",
+    "<span ",
+    "</span>",
+    "<body>",
+    "</body>",
+    "<i>",
+    "</i>",
+    "<font ",
+    "</font>",
+    "<br>",
+    "<br/>",
+    "<img ",
+    NULL                        /* terminator: the tag iterator stops here */
+};
+
+/*
+ * Strip HTML markup from src and return a newly allocated plain string
+ * which the caller must g_free().  Returns NULL if src is NULL.
+ *
+ * Entities (&amp; and friends) are unescaped first.  After that, every
+ * <...> sequence that starts with an entry of html_tags[] is removed; an
+ * "<a href=" tag is replaced by its quoted address as "[ address ] ".
+ * Any other <...> sequence, and a '<' without a matching '>', is copied
+ * unchanged.
+ *
+ * Because unescaping comes before the tag scan, an escaped tag such as
+ * "&lt;b&gt;" is handled exactly like a literal "<b>".
+ *
+ * Fixes over the previous revision of this function:
+ *  - the unescape buffer had no room for the terminating NUL, and was
+ *    NULL for an empty src;
+ *  - a '&' that does not start an entity stalled the unescape loop;
+ *  - the href quote search could run past the closing '>' and truncate
+ *    the text that follows the tag.
+ */
+static gchar *
+strip_html_markup(const gchar *src)
+{
+    GString *html, *str;    /* unescaped copy of src, string to be returned */
+    gchar *startp, *tail;   /* scan position in html, end of html */
+    gchar *begin, *end;     /* begin:<  end:> */
+    gchar *vis1, *vis2;     /* opening and closing quote of the address */
+    gchar **tagp;           /* tag iterator */
+    const gchar *ptr, *ent; /* cursor in src, text an entity expands to */
+    gint entlen;            /* length of the entity in src */
+
+    g_return_val_if_fail(src != NULL, NULL);
+
+    /* unescape &x;  html grows on demand, so nothing has to be assumed
+     * about how long the text an entity expands to may be. */
+    html = g_string_sized_new(strlen(src));
+    for(ptr = src; *ptr; ) {
+        ent = NULL;
+        if(*ptr == '&')
+            ent = purple_markup_unescape_entity(ptr, &entlen);
+        if(ent != NULL) {
+            g_string_append(html, ent);
+            ptr += entlen;
+        }
+        else {
+            /* plain character, or a bare '&': copy it and move on */
+            g_string_append_c(html, *ptr++);
+        }
+    } /* for */
+
+    str = g_string_new(NULL);
+    startp = html->str;
+    tail = startp + html->len;
+
+    while(startp < tail) {
+        begin = strchr(startp, '<');
+        end = begin ? strchr(begin + 1, '>') : NULL;
+        if(!end) {
+            /* no corresponding >: the rest is plain text, we have done. */
+            g_string_append(str, startp);
+            break;
+        }
+
+        /* here, both < and > are found */
+        /* concatenate leading part to dest */
+        g_string_append_len(str, startp, begin - startp);
+
+        /* whatever happens to <...> below, scanning resumes after it */
+        startp = end + 1;
+
+        /* find tag */
+        for(tagp = html_tags; *tagp; tagp++) {
+            if(!g_ascii_strncasecmp(begin, *tagp, strlen(*tagp)))
+                break;
+        }
+
+        if(*tagp == NULL) {
+            /* no valid tag was found: copy <brabra> */
+            g_string_append_len(str, begin, end - begin + 1);
+            continue;
+        }
+
+        /* we found a valid tag */
+        if(strcmp(*tagp, "<a href=") != 0) {
+            /* anything else than <a href=: discard whole <>. */
+            continue;
+        }
+
+        /* tag is <a href=: extract address.  take the first quote of
+         * either kind and its partner, both strictly inside this tag,
+         * so that text behind the tag is never touched. */
+        for(vis1 = begin; vis1 < end; vis1++) {
+            if(*vis1 == '\'' || *vis1 == '"')
+                break;
+        }
+
+        vis2 = NULL;
+        if(vis1 < end)
+            vis2 = memchr(vis1 + 1, *vis1, end - (vis1 + 1));
+
+        if(vis2) {
+            /* generate "[ http://example.com/ ] anchor " */
+            g_string_append(str, "[ ");
+            g_string_append_len(str, vis1 + 1, vis2 - (vis1 + 1));
+            g_string_append(str, " ] ");
+        }
+    }
+
+    /* html was only scratch; the buffer of str goes to the caller */
+    g_string_free(html, TRUE);
+    return g_string_free(str, FALSE);
+}
+
 /* string utilities */
 static void
 escape(gchar **str)
@@ -259,13 +238,20 @@
 }
 
 static void
-strip_markup(gchar **str)
+strip_markup(gchar **str, gboolean escape) /* replaces *str with its markup-stripped text; if escape is set the text is markup-escaped again */
 {
-    char *plain;
+    gchar *plain; /* string returned by strip_html_markup() */
 
     plain = strip_html_markup(*str);
     g_free(*str);
-    *str = plain;
+    if(escape) { /* NOTE(review): inside this body `escape` hides the file's escape() function */
+        *str = g_markup_escape_text(plain, -1); /* -1: plain is NUL-terminated */
+        g_free(plain); /* the escaped text is a separate allocation */
+    }
+    else {
+        *str = plain; /* hand the stripped string over as it is */
+    }
+    twitter_debug("result=%s\n", *str);
 }
 
 
@@ -481,7 +467,7 @@
          if(st->id > lastid && !is_posted_message(st)) {
              gchar *msg = NULL;
 
-             msg = g_strdup_printf("%s: %s\n", st->screen_name, st->text);
+             msg = g_strdup_printf("%s: %s", st->screen_name, st->text);
              purple_conv_im_write(conv->u.im,
                                   "twitter@twitter.com",
                                   msg,
@@ -500,7 +486,7 @@
      statuseslist = g_list_remove_all(statuseslist, NULL);
 }
 
-/* status fetching function. it will be called periodically. */ 
+/* status fetching function. it will be called periodically. */
 static gboolean
 get_status_with_api(gpointer data)
 {
@@ -741,16 +727,16 @@
     twitter_ac = is_twitter_account(account, recipient);
     wassr_ac   = is_wassr_account(account, recipient);
 
+    /* strip all markups */
+    if(twitter_ac || wassr_ac)
+        strip_markup(buffer, TRUE);
+
     if(wassr_ac) {
         /* store sending message to address parrot problem */
         g_strlcpy(wassr_post, *buffer, WASSR_POST_LEN);
         twitter_debug("parrot pushed:%s\n", *buffer);
     }
 
-    /* strip all markups */
-    if(twitter_ac || wassr_ac)
-        strip_markup(buffer);
-
     /* return here if the message is not to twitter */
     if(!twitter_ac)
         return FALSE;
@@ -961,7 +947,7 @@
     }
 
     /* strip all markups */
-    strip_markup(buffer); //it causes missing of strings surrounded by <>
+    strip_markup(buffer, TRUE);
 
     /* playsound */
     if(purple_prefs_get_bool(OPT_PLAYSOUND_SENDER)) {