comparison src/util.c @ 9241:5e9018c81bd9

[gaim-migrate @ 10040] Wing says he improved gaim_markup_strip_html. This patch modifies gaim_markup_strip_html to: 1. Remove <style>...</style> and <script>...</script> completely. 2. Turn <td> into a tab instead of a space; this will be needed for the msn profile code. For #1 above, the </style> / </script> matching does not behave well when fed with technically-correct-but-implausible input, such as </style > and </script >. committer: Tailor Script <tailor@pidgin.im>
author Tim Ringenbach <marv@pidgin.im>
date Wed, 09 Jun 2004 01:34:16 +0000
parents f1d87ab17e41
children 15d516d07d57
comparison
equal deleted inserted replaced
9240:f1d87ab17e41 9241:5e9018c81bd9
1224 } 1224 }
1225 1225
1226 /* The following are probably reasonable changes: 1226 /* The following are probably reasonable changes:
1227 * - \n should be converted to a normal space 1227 * - \n should be converted to a normal space
1228 * - in addition to <br>, <p> and <div> etc. should also be converted into \n 1228 * - in addition to <br>, <p> and <div> etc. should also be converted into \n
1229 * - We want to turn </td>#whitespace<td> sequences into a single blank 1229 * - We want to turn </td>#whitespace<td> sequences into a single tab
1230 * - We want to turn <td> into a single tab (for msn profile "parsing")
1230 * - We want to turn </tr>#whitespace<tr> sequences into a single \n 1231 * - We want to turn </tr>#whitespace<tr> sequences into a single \n
1231 * We should remove all <script>...</script> etc. This should be fixed some time 1232 * - <script>...</script> and <style>...</style> should be completely removed
1232 */ 1233 */
1233 1234
1234 char * 1235 char *
1235 gaim_markup_strip_html(const char *str) 1236 gaim_markup_strip_html(const char *str)
1236 { 1237 {
1237 int i, j, k; 1238 int i, j, k;
1238 gboolean visible = TRUE; 1239 gboolean visible = TRUE;
1239 gboolean closing_td_p = FALSE; 1240 gboolean closing_td_p = FALSE;
1240 gchar *str2; 1241 gchar *str2;
1242 const gchar *cdata_close_tag = NULL;
1241 1243
1242 if(!str) 1244 if(!str)
1243 return NULL; 1245 return NULL;
1244 1246
1245 str2 = g_strdup(str); 1247 str2 = g_strdup(str);
1246 1248
1247 for (i = 0, j = 0; str2[i]; i++) 1249 for (i = 0, j = 0; str2[i]; i++)
1248 { 1250 {
1249 if (str2[i] == '<') 1251 if (str2[i] == '<')
1250 { 1252 {
1251 if (strncasecmp(str2 + i, "<td", 3) == 0 && closing_td_p) 1253 if (cdata_close_tag)
1252 { 1254 {
1253 str2[j++] = ' '; 1255 /* Note: Don't even assume any other tag is a tag in CDATA */
1256 if (strncasecmp(str2 + i, cdata_close_tag,
1257 strlen(cdata_close_tag)) == 0)
1258 {
1259 i += strlen(cdata_close_tag) - 1;
1260 cdata_close_tag = NULL;
1261 }
1262 continue;
1263 }
1264 else if (strncasecmp(str2 + i, "<td", 3) == 0 && closing_td_p)
1265 {
1266 str2[j++] = '\t';
1254 visible = TRUE; 1267 visible = TRUE;
1255 } 1268 }
1256 else if (strncasecmp(str2 + i, "</td>", 5) == 0) 1269 else if (strncasecmp(str2 + i, "</td>", 5) == 0)
1257 { 1270 {
1258 closing_td_p = TRUE; 1271 closing_td_p = TRUE;
1276 */ 1289 */
1277 while (str2[k] && str2[k] != '<' && str2[k] != '>') 1290 while (str2[k] && str2[k] != '<' && str2[k] != '>')
1278 { 1291 {
1279 k++; 1292 k++;
1280 } 1293 }
1294
1281 /* Check for tags which should be mapped to newline */ 1295 /* Check for tags which should be mapped to newline */
1282 if (strncasecmp(str2 + i, "<p>", 3) == 0 1296 if (strncasecmp(str2 + i, "<p>", 3) == 0
1283 || strncasecmp(str2 + i, "<tr", 3) == 0 1297 || strncasecmp(str2 + i, "<tr", 3) == 0
1284 || strncasecmp(str2 + i, "<br", 3) == 0 1298 || strncasecmp(str2 + i, "<br", 3) == 0
1285 || strncasecmp(str2 + i, "<li", 3) == 0 1299 || strncasecmp(str2 + i, "<li", 3) == 0
1286 || strncasecmp(str2 + i, "<div", 4) == 0 1300 || strncasecmp(str2 + i, "<div", 4) == 0
1287 || strncasecmp(str2 + i, "</table>", 8) == 0) 1301 || strncasecmp(str2 + i, "</table>", 8) == 0)
1288 { 1302 {
1289 str2[j++] = '\n'; 1303 str2[j++] = '\n';
1290 } 1304 }
1305 /* Check for tags which begin CDATA and need to be closed */
1306 #if 0 /* FIXME.. option is end tag optional, we can't handle this right now */
1307 else if (strncasecmp(str2 + i, "<option", 7) == 0)
1308 {
1309 /* FIXME: We should not do this if the OPTION is SELECT'd */
1310 cdata_close_tag = "</option>";
1311 }
1312 #endif
1313 else if (strncasecmp(str2 + i, "<script", 7) == 0)
1314 {
1315 cdata_close_tag = "</script>";
1316 }
1317 else if (strncasecmp(str2 + i, "<style", 6) == 0)
1318 {
1319 cdata_close_tag = "</style>";
1320 }
1291 /* Update the index and continue checking after the tag */ 1321 /* Update the index and continue checking after the tag */
1292 i = (str2[k] == '<')? k - 1: k; 1322 i = (str2[k] == '<')? k - 1: k;
1293 continue; 1323 continue;
1294 } 1324 }
1325 }
1326 else if (cdata_close_tag)
1327 {
1328 continue;
1295 } 1329 }
1296 else if (!g_ascii_isspace(str2[i])) 1330 else if (!g_ascii_isspace(str2[i]))
1297 { 1331 {
1298 visible = TRUE; 1332 visible = TRUE;
1299 } 1333 }