changeset 28012:c06114f3d58d

* Change yahoo_html_to_codes() a little to hopefully be more straightforward and more similar to yahoo_codes_to_html() * Add documentation to yahoo_html_to_codes() that explains the differences between the encoded text that we send and the encoded text sent by Yahoo * Enable the test cases for yahoo_html_to_codes()
author Mark Doliner <mark@kingant.net>
date Wed, 19 Aug 2009 22:01:10 +0000
parents bbb9e0ea67fe
children b289449f3e9f cf533027c3be
files libpurple/protocols/yahoo/libymsg.h libpurple/protocols/yahoo/util.c libpurple/tests/test_yahoo_util.c
diffstat 3 files changed, 134 insertions(+), 150 deletions(-) [+]
line wrap: on
line diff
--- a/libpurple/protocols/yahoo/libymsg.h	Wed Aug 19 18:54:54 2009 +0000
+++ b/libpurple/protocols/yahoo/libymsg.h	Wed Aug 19 22:01:10 2009 +0000
@@ -280,6 +280,27 @@
 void yahoo_init_colorht(void);
 void yahoo_dest_colorht(void);
 char *yahoo_codes_to_html(const char *x);
+
+/**
+ * This function takes a normal HTML message and converts it to the message
+ * format used by Yahoo, which uses a frankensteinish combination of ANSI
+ * escape codes and broken HTML.
+ *
+ * It results in slightly different output than would be sent by official
+ * Yahoo clients.  The two main differences are:
+ *
+ * 1. We always close all tags, whereas official Yahoo clients leave tags
+ *    dangling open at the end of each message (and the client treats them
+ *    as closed).
+ * 2. We always close inner tags first before closing outer tags.
+ *
+ * For example, if you want to send this message:
+ *   <b> bold <i> bolditalic </i></b><i> italic </i>
+ * Official Yahoo clients would send:
+ *   ESC[1m bold ESC[2m bolditalic ESC[x1m italic
+ * But we will send:
+ *   ESC[1m bold ESC[2m bolditalic ESC[x2mESC[x1mESC[2m italic ESC[x2m
+ */
 char *yahoo_html_to_codes(const char *src);
 
 gboolean
--- a/libpurple/protocols/yahoo/util.c	Wed Aug 19 18:54:54 2009 +0000
+++ b/libpurple/protocols/yahoo/util.c	Wed Aug 19 22:01:10 2009 +0000
@@ -669,7 +669,7 @@
 #define POINT_SIZE(x) (_point_sizes [MIN ((x > 0 ? x : 1), MAX_FONT_SIZE) - 1])
 static const gint _point_sizes [] = { 8, 10, 12, 14, 20, 30, 40 };
 
-enum fatype
+enum fontattr_type
 {
 	FATYPE_SIZE,
 	FATYPE_COLOR,
@@ -679,7 +679,7 @@
 
 typedef struct
 {
-	enum fatype type;
+	enum fontattr_type type;
 	union {
 		int size;
 		char *color;
@@ -688,6 +688,17 @@
 	} u;
 } fontattr;
 
+typedef struct
+{
+	gboolean bold;
+	gboolean italic;
+	gboolean underline;
+	gboolean in_link;
+	int font_size;
+	char *font_face;
+	char *font_color;
+} CurrentMsgState;
+
 static void fontattr_free(fontattr *f)
 {
 	if (f->type == FATYPE_COLOR)
@@ -876,167 +887,124 @@
 	GString *dest;
 	char *esc;
 	GQueue *ftattr = NULL;
-	gboolean no_more_specials = FALSE;
+	gboolean no_more_gt_brackets = FALSE;
+	gchar *tag, *tag_name;
+	gboolean is_closing_tag;
+	CurrentMsgState current_state;
+
+	bzero(&current_state, sizeof(current_state));
 
 	src_len = strlen(src);
 	dest = g_string_sized_new(src_len);
 
 	for (i = 0; i < src_len; i++) {
-
-		if (src[i] == '<' && !no_more_specials) {
+		if (src[i] == '<' && !no_more_gt_brackets) {
+			/* The start of an HTML tag  */
 			j = i;
 
-			while (1) {
-				j++;
-
-				if (j >= src_len) { /* no '>' */
-					g_string_append_c(dest, src[i]);
-					no_more_specials = TRUE;
-					break;
-				}
+			while (j++ < src_len) {
+				if (src[j] != '>') {
+					if (src[j] == '"') {
+						/* We're inside a quoted attribute value. Skip to the end */
+						j++;
+						while (j != src_len && src[j] != '"')
+							j++;
+					} else if (src[j] == '\'') {
+						/* We're inside a quoted attribute value. Skip to the end */
+						j++;
+						while (j != src_len && src[j] != '\'')
+							j++;
+					}
+					if (j != src_len)
+						/* Keep looking for the end of this tag */
+						continue;
 
-				if (src[j] == '<') {
-					/* FIXME: This doesn't convert outgoing entities.
-					 *        However, I suspect this case may never
-					 *        happen anymore because of the entities.
-					 */
-					g_string_append_len(dest, &src[i], j - i);
-					i = j - 1;
-					if (ftattr) {
-						fontattr *f;
-
-						while ((f = g_queue_pop_head(ftattr)))
-							fontattr_free(f);
-						g_queue_free(ftattr);
-						ftattr = NULL;
-					}
+					/* This < has no corresponding > */
+					g_string_append_c(dest, src[i]);
+					no_more_gt_brackets = TRUE;
 					break;
 				}
 
-				if (src[j] == ' ') {
-					if (!g_ascii_strncasecmp(&src[i+1], "BODY", j - i - 1)) {
-						char *t = strchr(&src[j], '>');
-						if (!t) {
-							g_string_append(dest, &src[i]);
+				tag = g_strndup(src + i, j - i + 1);
+				tag_name = yahoo_markup_get_tag_name(tag, &is_closing_tag);
+
+				if (g_str_equal(tag_name, "a")) {
+					j += 7;
+					g_string_append(dest, "\033[lm");
+					if (purple_str_has_prefix(src + j, "mailto:"))
+						j += sizeof("mailto:") - 1;
+					while (1) {
+						g_string_append_c(dest, src[j]);
+						if (++j >= src_len) {
 							i = src_len;
 							break;
-						} else {
-							i = t - src;
+						}
+						if (src[j] == '"') {
+							g_string_append(dest, "\033[xlm");
+							while (1) {
+								if (++j >= src_len) {
+									i = src_len;
+									break;
+								}
+								if (!g_ascii_strncasecmp(&src[j], "</A>", 4)) {
+									j += 3;
+									break;
+								}
+							}
+							i = j;
 							break;
 						}
-					} else if (!g_ascii_strncasecmp(&src[i+1], "A HREF=\"", j - i - 1)) {
-						j += 7;
-						g_string_append(dest, "\033[lm");
-						if (purple_str_has_prefix(src + j, "mailto:"))
-							j += sizeof("mailto:") - 1;
-						while (1) {
-							g_string_append_c(dest, src[j]);
-							if (++j >= src_len) {
-								i = src_len;
-								break;
-							}
-							if (src[j] == '"') {
-								g_string_append(dest, "\033[xlm");
-								while (1) {
-									if (++j >= src_len) {
-										i = src_len;
-										break;
-									}
-									if (!g_ascii_strncasecmp(&src[j], "</A>", 4)) {
-										j += 3;
-										break;
-									}
-								}
-								i = j;
-								break;
-							}
-						}
-					} else if (!g_ascii_strncasecmp(&src[i+1], "SPAN", j - i - 1)) { /* drop span tags */
-						while (1) {
-							if (++j >= src_len) {
-								g_string_append(dest, &src[i]);
-								i = src_len;
-								break;
-							}
-							if (src[j] == '>') {
-								i = j;
-								break;
+					}
+
+				} else if (g_str_equal(tag_name, "font")) {
+					_parse_font_tag(src, dest, &i, &j, src_len, &colors, &tags, ftattr);
+				} else if (g_str_equal(tag_name, "b")) {
+					g_string_append(dest, "\033[1m");
+					current_state.bold = TRUE;
+				} else if (g_str_equal(tag_name, "/b")) {
+					if (current_state.bold) {
+						g_string_append(dest, "\033[x1m");
+						current_state.bold = FALSE;
+					}
+				} else if (g_str_equal(tag_name, "i")) {
+					current_state.italic = TRUE;
+					g_string_append(dest, "\033[2m");
+				} else if (g_str_equal(tag_name, "/i")) {
+					if (current_state.italic) {
+						g_string_append(dest, "\033[x2m");
+						current_state.italic = FALSE;
+					}
+				} else if (g_str_equal(tag_name, "u")) {
+					current_state.underline = TRUE;
+					g_string_append(dest, "\033[4m");
+				} else if (g_str_equal(tag_name, "/u")) {
+					if (current_state.underline) {
+						g_string_append(dest, "\033[x4m");
+						current_state.underline = FALSE;
+					}
+				} else if (g_str_equal(tag_name, "/a")) {
+					g_string_append(dest, "\033[xlm");
+				} else if (g_str_equal(tag_name, "br")) {
+					g_string_append_c(dest, '\n');
+				} else if (g_str_equal(tag_name, "/font")) {
+					if (tags != NULL) {
+						char *etag = tags->data;
+						tags = g_slist_delete_link(tags, tags);
+						g_string_append(dest, etag);
+						if (g_str_equal(etag, "</font>")) {
+							if (colors != NULL) {
+								g_free(colors->data);
+								colors = g_slist_delete_link(colors, colors);
 							}
 						}
-					} else if (g_ascii_strncasecmp(&src[i+1], "FONT", j - i - 1)) { /* not interested! */
-						while (1) {
-							if (++j >= src_len) {
-								g_string_append(dest, &src[i]);
-								i = src_len;
-								break;
-							}
-							if (src[j] == '>') {
-								g_string_append_len(dest, &src[i], j - i + 1);
-								i = j;
-								break;
-							}
-						}
-					} else { /* yay we have a font tag */
-						_parse_font_tag(src, dest, &i, &j, src_len, &colors, &tags, ftattr);
+						g_free(etag);
 					}
-
-					break;
 				}
 
-				if (src[j] == '>') {
-					/* This has some problems like the FIXME for the
-					 * '<' case. and like that case, I suspect the case
-					 * that this has problems is won't happen anymore anyway.
-					 */
-					int sublen = j - i - 1;
-
-					if (sublen) {
-						if (!g_ascii_strncasecmp(&src[i+1], "B", sublen)) {
-							g_string_append(dest, "\033[1m");
-						} else if (!g_ascii_strncasecmp(&src[i+1], "/B", sublen)) {
-							g_string_append(dest, "\033[x1m");
-						} else if (!g_ascii_strncasecmp(&src[i+1], "I", sublen)) {
-							g_string_append(dest, "\033[2m");
-						} else if (!g_ascii_strncasecmp(&src[i+1], "/I", sublen)) {
-							g_string_append(dest, "\033[x2m");
-						} else if (!g_ascii_strncasecmp(&src[i+1], "U", sublen)) {
-							g_string_append(dest, "\033[4m");
-						} else if (!g_ascii_strncasecmp(&src[i+1], "/U", sublen)) {
-							g_string_append(dest, "\033[x4m");
-						} else if (!g_ascii_strncasecmp(&src[i+1], "/A", sublen)) {
-							g_string_append(dest, "\033[xlm");
-						} else if (!g_ascii_strncasecmp(&src[i+1], "BR", sublen)) {
-							g_string_append_c(dest, '\n');
-						} else if (!g_ascii_strncasecmp(&src[i+1], "/BODY", sublen)) {
-							/* mmm, </body> tags. *BURP* */
-						} else if (!g_ascii_strncasecmp(&src[i+1], "/SPAN", sublen)) {
-							/* </span> tags. dangerously close to </spam> */
-						} else if (!g_ascii_strncasecmp(&src[i+1], "/FONT", sublen) && tags != NULL) {
-							char *etag;
-
-							etag = tags->data;
-							tags = g_slist_delete_link(tags, tags);
-							if (etag) {
-								g_string_append(dest, etag);
-								if (!strcmp(etag, "</font>")) {
-									if (colors != NULL) {
-										g_free(colors->data);
-										colors = g_slist_delete_link(colors, colors);
-									}
-								}
-								g_free(etag);
-							}
-						} else {
-							g_string_append_len(dest, &src[i], j - i + 1);
-						}
-					} else {
-						g_string_append_len(dest, &src[i], j - i + 1);
-					}
-
-					i = j;
-					break;
-				}
-
+				i = j;
+				g_free(tag);
+				g_free(tag_name);
+				break;
 			}
 
 		} else {
--- a/libpurple/tests/test_yahoo_util.c	Wed Aug 19 18:54:54 2009 +0000
+++ b/libpurple/tests/test_yahoo_util.c	Wed Aug 19 22:01:10 2009 +0000
@@ -104,7 +104,6 @@
 }
 END_TEST
 
-#if 0
 START_TEST(test_html_to_codes)
 {
 	assert_string_equal_free("plain",
@@ -129,7 +128,6 @@
 			yahoo_html_to_codes("plain &amp;"));
 
 	/* bold/italic/underline */
-	// MARK: This isn't correct.  Should not have the closing bold escape code
 	assert_string_equal_free("\x1B[1mbold\x1B[x1m",
 			yahoo_html_to_codes("<b>bold</b>"));
 	assert_string_equal_free("\x1B[2mitalic\x1B[x2m",
@@ -140,13 +138,12 @@
 			yahoo_html_to_codes("no</u> markup"));
 	assert_string_equal_free("\x1B[1mbold\x1B[x1m \x1B[2mitalic\x1B[x2m \x1B[4munderline\x1B[x4m",
 			yahoo_html_to_codes("<b>bold</b> <i>italic</i> <u>underline</u>"));
-	assert_string_equal_free("\x1B[1mbold \x1B[2mbolditalic\x1B[x1m italic\x1B[x1m",
+	assert_string_equal_free("\x1B[1mbold \x1B[2mbolditalic\x1B[x2m\x1B[x1m\x1B[2m italic\x1B[x2m",
 			yahoo_html_to_codes("<b>bold <i>bolditalic</i></b><i> italic</i>"));
-	assert_string_equal_free("\x1B[1mbold \x1B[2mbolditalic\x1B[x1m \x1B[4mitalicunderline",
+	assert_string_equal_free("\x1B[1mbold \x1B[2mbolditalic\x1B[x2m\x1B[x1m\x1B[2m \x1B[4mitalicunderline\x1B[x4m\x1B[x2m",
 			yahoo_html_to_codes("<b>bold <i>bolditalic</i></b><i> <u>italicunderline</u></i>"));
 }
 END_TEST
-#endif
 
 Suite *
 yahoo_util_suite(void)
@@ -161,11 +158,9 @@
 	tcase_add_test(tc, test_codes_to_html);
 	suite_add_tcase(s, tc);
 
-#if 0
 	tc = tcase_create("Convert IM from HTML to network format");
 	tcase_add_test(tc, test_html_to_codes);
 	suite_add_tcase(s, tc);
-#endif
 
 	return s;
 }