Mercurial > pidgin.yaz

diff src/util.c @ 9161:c3fa2ad099a2
[gaim-migrate @ 9946] wing added support for yahoo profiles in, well pretty much every language. Looks pretty impressive to me. Someone may want to double check his src/util.c changes. I think we have some crazy patch writers who know those functions better than me. This also introduces a couple of warnings because wing didn't add his new util.c function to util.h. Rather than adding it myself, I'm going to bug him to add it and document it. committer: Tailor Script <tailor@pidgin.im>
author: Tim Ringenbach <marv@pidgin.im>
date: Wed, 02 Jun 2004 00:44:51 +0000
parents: dabfa4184db8
children: 456ef1f4ba19
--- a/src/util.c	Tue Jun 01 19:16:34 2004 +0000
+++ b/src/util.c	Wed Jun 02 00:44:51 2004 +0000
@@ -768,6 +768,20 @@
 
 	q = strstr(p, end_token);
 
+	/* Trim leading blanks */
+	while (*p != '\n' && g_ascii_isspace(*p)) {
+		p += 1;
+	}
+
+	/* Trim trailing blanks */
+	while (q > p && g_ascii_isspace(*(q - 1))) {
+		q -= 1;
+	}
+
+	/* Don't bother with null strings */
+	if (p == q)
+		return FALSE;
+
 	if (q != NULL && (!no_value_token ||
 					  (no_value_token && strncmp(p, no_value_token,
 												 strlen(no_value_token)))))
@@ -1208,11 +1222,20 @@
 	g_string_free(plain, TRUE);
 }
 
+/* The following are probably reasonable changes:
+ * - \n should be converted to a normal space
+ * - <p>, <div> etc. should be converted into \n, in addition to <br>
+ * - We want to turn </td>#whitespace<td> sequences into a single blank
+ * - We want to turn </tr>#whitespace<tr> sequences into a single \n
+ * - We should remove all <script>...</script> etc.; this still needs to be fixed
+ */
+
 char *
 gaim_markup_strip_html(const char *str)
 {
 	int i, j, k;
 	gboolean visible = TRUE;
+	gboolean closing_td_p = FALSE;
 	gchar *str2;
 
 	if(!str)
@@ -1224,11 +1247,20 @@
 	{
 		if (str2[i] == '<')
 		{
-			if (strncasecmp(str2 + i, "<br>", 4) == 0)
+			if (strncasecmp(str2 + i, "<td", 3) == 0 && closing_td_p)
+			{
+				str2[j++] = ' ';
+				visible = TRUE;
+			}
+			else if (strncasecmp(str2 + i, "</td>", 5) == 0)
 			{
-				str2[j++] = '\n';
-				i = i + 3;
-				continue;
+				closing_td_p = TRUE;
+				visible = FALSE;
+			}
+			else
+			{
+				closing_td_p = FALSE;
+				visible = TRUE;
 			}
 
 			k = i + 1;
@@ -1237,28 +1269,32 @@
 				visible = TRUE;
 			else
 			{
-				while (str2[k])
+				/* Scan until we end the tag either implicitly (closed start
+				 * tag) or explicitly, using a sloppy method (i.e., < or >
+				 * inside quoted attributes will screw us up)
+				 */
+				while (str2[k] && str2[k] != '<' && str2[k] != '>')
 				{
-					if (str2[k] == '<')
-					{
-						visible = TRUE;
-						break;
-					}
-
-					if (str2[k] == '>')
-					{
-						visible = FALSE;
-						break;
-					}
-
 					k++;
 				}
+				/* Check for tags which should be mapped to newline */
+				if (strncasecmp(str2 + i, "<p>", 3) == 0
+				 || strncasecmp(str2 + i, "<tr", 3) == 0
+				 || strncasecmp(str2 + i, "<br", 3) == 0
+				 || strncasecmp(str2 + i, "<li", 3) == 0
+				 || strncasecmp(str2 + i, "<div", 4) == 0
+				 || strncasecmp(str2 + i, "</table>", 8) == 0)
+				{
+					str2[j++] = '\n';
+				}
+				/* Update the index and continue checking after the tag */
+				i = (str2[k] == '<')? k - 1: k;
+				continue;
 			}
 		}
-		else if (str2[i] == '>' && !visible)
+		else if (!g_ascii_isspace(str2[i]))
 		{
 			visible = TRUE;
-			continue;
 		}
 
 		if (str2[i] == '&' && strncasecmp(str2 + i, "&quot;", 6) == 0)
@@ -1290,7 +1326,7 @@
 		}
 
 		if (visible)
-			str2[j++] = str2[i];
+			str2[j++] = g_ascii_isspace(str2[i])? ' ': str2[i];
 	}
 
 	str2[j] = '\0';
@@ -2671,6 +2707,37 @@
 	return(NULL);
 }
 
+/* Decode decimal NCRs ("&#NNN;") in valid UTF-8 @in; caller g_free()s the
+ * result. Returns NULL if @in is NULL or not valid UTF-8. */
+char *
+gaim_utf8_ncr_decode(const char *in)
+{
+	GString *out;
+	const char *p;
+
+	g_return_val_if_fail(in != NULL, NULL);
+	g_return_val_if_fail(g_utf8_validate(in, -1, NULL), NULL);
+
+	out = g_string_new(""); /* allocated after the checks so they cannot leak it */
+	for (p = in; *p; p++) {
+		if (p[0] == '&' && p[1] == '#' && g_ascii_isdigit(p[2])) {
+			const char *q;
+			gunichar wc = 0;
+			/* Saturate above the Unicode range so wc can never wrap */
+			for (q = p + 2; g_ascii_isdigit(*q); q++)
+				wc = (wc > 0x10FFFF) ? wc : wc * 10 + (*q - '0');
+			/* Only well-formed references to real code points are decoded */
+			if (*q == ';' && wc != 0 && g_unichar_validate(wc)) {
+				g_string_append_unichar(out, wc);
+				p = q;
+				continue;
+			}
+		}
+		g_string_append_c(out, *p); /* ordinary byte, or malformed NCR kept as-is */
+	}
+	return g_string_free(out, FALSE);
+}
+
 int
 gaim_utf8_strcasecmp(const char *a, const char *b)
 {
author	Tim Ringenbach <marv@pidgin.im>
date	Wed, 02 Jun 2004 00:44:51 +0000
parents	dabfa4184db8
children	456ef1f4ba19