changeset 29852:5bac51b394e6

util: Better validation of the allowed character values in XML 1.0 From http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char. Refs #11257 This doesn't actually make a difference, because I think all the invalid ranges aren't valid UTF-8 and so g_utf8_validate catches them.
author Paul Aurich <paul@darkrain42.org>
date Thu, 29 Apr 2010 17:17:00 +0000
parents 2a436e0ce977
children e89bf860e3cf
files libpurple/tests/test_util.c libpurple/util.c
diffstat 2 files changed, 46 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
--- a/libpurple/tests/test_util.c	Thu Apr 29 05:58:02 2010 +0000
+++ b/libpurple/tests/test_util.c	Thu Apr 29 17:17:00 2010 +0000
@@ -121,6 +121,33 @@
 }
 END_TEST
 
+START_TEST(test_utf8_strip_unprintables)
+{
+	fail_unless(NULL == purple_utf8_strip_unprintables(NULL));
+	/* invalid UTF-8 */
+#if 0
+	/* disabled because make check fails on an assertion */
+	fail_unless(NULL == purple_utf8_strip_unprintables("abc\x80\x7f"));
+#endif
+	/* \t, \n, \r, space */
+	assert_string_equal_free("ab \tcd\nef\r   ", purple_utf8_strip_unprintables("ab \tcd\nef\r   "));
+	/* Basic ASCII */
+	assert_string_equal_free("Foobar", purple_utf8_strip_unprintables("Foobar"));
+	/* 0xE000 - 0xFFFD (UTF-8 encoded) */
+	/* U+F1F7 */
+	assert_string_equal_free("aaaa\xef\x87\xb7", purple_utf8_strip_unprintables("aaaa\xef\x87\xb7"));
+#if 0
+	/* disabled because make check fails on an assertion */
+	/* U+DB80 (Private Use High Surrogate, First) -- should be stripped */
+	assert_string_equal_free("aaaa", purple_utf8_strip_unprintables("aaaa\xed\xa0\x80"));
+	/* U+FFFE (should be stripped) */
+	assert_string_equal_free("aaaa", purple_utf8_strip_unprintables("aaaa\xef\xbf\xbe"));
+#endif
+	/* U+FEFF (should not be stripped) */
+	assert_string_equal_free("aaaa\xef\xbb\xbf", purple_utf8_strip_unprintables("aaaa\xef\xbb\xbf"));
+}
+END_TEST
+
 START_TEST(test_mime_decode_field)
 {
 	gchar *result = purple_mime_decode_field("=?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?=");
@@ -168,6 +195,10 @@
 	tcase_add_test(tc, test_markup_html_to_xhtml);
 	suite_add_tcase(s, tc);
 
+	tc = tcase_create("Stripping Unparseables");
+	tcase_add_test(tc, test_utf8_strip_unprintables);
+	suite_add_tcase(s, tc);
+
 	tc = tcase_create("MIME");
 	tcase_add_test(tc, test_mime_decode_field);
 	suite_add_tcase(s, tc);
--- a/libpurple/util.c	Thu Apr 29 05:58:02 2010 +0000
+++ b/libpurple/util.c	Thu Apr 29 17:17:00 2010 +0000
@@ -4593,12 +4593,22 @@
 	}
 
 	workstr = iter = g_new(gchar, strlen(str) + 1);
-	for ( ; *str; ++str) {
-		guchar c = *str;
-		if (c >= 0x20 || c == '\t' || c == '\n' || c == '\r') {
-			*iter = c;
-			++iter;
+	while (*str) {
+		gunichar ch = g_utf8_get_char(str);
+		gchar *next = g_utf8_next_char(str);
+		/*
+		 * Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
+		 *          [#x10000-#x10FFFF]
+		 */
+		if ((ch == '\t' || ch == '\n' || ch == '\r') ||
+				(ch >= 0x20 && ch <= 0xD7FF) ||
+				(ch >= 0xE000 && ch <= 0xFFFD) ||
+				(ch >= 0x10000 && ch <= 0x10FFFF)) {
+			memcpy(iter, str, next - str);
+			iter += (next - str);
 		}
+
+		str = next;
 	}
 
 	/* nul-terminate the new string */