changeset 111842:265c545c8f48

Decode extra numeric entities. mm-util.el (mm-extra-numeric-entities): New variable. mm-url.el (mm-url-decode-entities): mm-decode.el (mm-shr): Use it to decode extra numeric entities.
author Katsumi Yamaoka <yamaoka@jpl.org>
date Tue, 07 Dec 2010 05:06:56 +0000
parents 25e3c2636c1f
children 94c9743593b9
files lisp/gnus/ChangeLog lisp/gnus/mm-decode.el lisp/gnus/mm-url.el lisp/gnus/mm-util.el
diffstat 4 files changed, 53 insertions(+), 16 deletions(-) [+]
line wrap: on
line diff
--- a/lisp/gnus/ChangeLog	Mon Dec 06 21:01:00 2010 -0500
+++ b/lisp/gnus/ChangeLog	Tue Dec 07 05:06:56 2010 +0000
@@ -1,3 +1,10 @@
+2010-12-07  Katsumi Yamaoka  <yamaoka@jpl.org>
+
+	* mm-util.el (mm-extra-numeric-entities): New variable.
+
+	* mm-url.el (mm-url-decode-entities):
+	* mm-decode.el (mm-shr): Use it to decode extra numeric entities.
+
 2010-12-07  Stefan Monnier  <monnier@iro.umontreal.ca>
 
 	* message.el: Use completion-at-point.
--- a/lisp/gnus/mm-decode.el	Mon Dec 06 21:01:00 2010 -0500
+++ b/lisp/gnus/mm-decode.el	Tue Dec 07 05:06:56 2010 +0000
@@ -1699,7 +1699,7 @@
 				  (when handle
 				    (mm-with-part handle
 				      (buffer-string))))))
-	shr-inhibit-images shr-blocked-images charset)
+	shr-inhibit-images shr-blocked-images charset char)
     (if (and (boundp 'gnus-summary-buffer)
 	     (buffer-name gnus-summary-buffer))
 	(with-current-buffer gnus-summary-buffer
@@ -1714,13 +1714,25 @@
       (narrow-to-region (point) (point))
       (shr-insert-document
        (mm-with-part handle
-	 (when (and charset
-		    (setq charset (mm-charset-to-coding-system charset))
-		    (not (eq charset 'ascii)))
-	   (insert (prog1
-		       (mm-decode-coding-string (buffer-string) charset)
-		     (erase-buffer)
-		     (mm-enable-multibyte))))
+	 (insert (prog1
+		     (if (and charset
+			      (setq charset
+				    (mm-charset-to-coding-system charset))
+			      (not (eq charset 'ascii)))
+			 (mm-decode-coding-string (buffer-string) charset)
+		       (mm-string-as-multibyte (buffer-string)))
+		   (erase-buffer)
+		   (mm-enable-multibyte)))
+	 (goto-char (point-min))
+	 (setq case-fold-search t)
+	 (while (re-search-forward
+		 "&#\\(?:x\\([89][0-9a-f]\\)\\|\\(1[2-5][0-9]\\)\\);" nil t)
+	   (when (setq char
+		       (cdr (assq (if (match-beginning 1)
+				      (string-to-number (match-string 1) 16)
+				    (string-to-number (match-string 2)))
+				  mm-extra-numeric-entities)))
+	     (replace-match (char-to-string char))))
 	 (libxml-parse-html-region (point-min) (point-max))))
       (mm-handle-set-undisplayer
        handle
--- a/lisp/gnus/mm-url.el	Mon Dec 06 21:01:00 2010 -0500
+++ b/lisp/gnus/mm-url.el	Tue Dec 07 05:06:56 2010 +0000
@@ -365,16 +365,19 @@
 (defun mm-url-decode-entities ()
   "Decode all HTML entities."
   (goto-char (point-min))
-  (while (re-search-forward "&\\(#[0-9]+\\|#x[0-9a-f]+\\|[a-z]+[0-9]*\\);" nil t)
+  (while (re-search-forward "&\\(#[0-9]+\\|#x[0-9a-f]+\\|[a-z]+[0-9]*\\);"
+			    nil t)
     (let* ((entity (match-string 1))
 	   (elem (if (eq (aref entity 0) ?\#)
-		     (let ((c (mm-ucs-to-char
-			       ;; Hex number: &#x3212
-			       (if (eq (aref entity 1) ?x)
-				   (string-to-number (substring entity 2)
-						     16)
-				 ;; Decimal number: &#23
-				 (string-to-number (substring entity 1))))))
+		     (let ((c
+			    ;; Hex number: &#x3212
+			    (if (eq (aref entity 1) ?x)
+				(string-to-number (substring entity 2)
+						  16)
+			      ;; Decimal number: &#23
+			      (string-to-number (substring entity 1)))))
+		       (setq c (or (cdr (assq c mm-extra-numeric-entities))
+				   (mm-ucs-to-char c)))
 		       (if (mm-char-or-char-int-p c) c ?#))
 		   (or (cdr (assq (intern entity)
 				  mm-url-html-entities))
--- a/lisp/gnus/mm-util.el	Mon Dec 06 21:01:00 2010 -0500
+++ b/lisp/gnus/mm-util.el	Tue Dec 07 05:06:56 2010 +0000
@@ -866,6 +866,21 @@
 Setting it to nil is useful on Emacsen supporting Unicode if sending
 mail with multiple parts is preferred to sending a Unicode one.")
 
+(defvar mm-extra-numeric-entities
+  (mapcar
+   (lambda (item)			; ITEM is (CODE . UCS) from the list below
+     (cons (car item) (mm-ucs-to-char (cdr item))))	; => (CODE . CHAR)
+   '((#x80 . #x20AC) (#x82 . #x201A) (#x83 . #x0192) (#x84 . #x201E)
+     (#x85 . #x2026) (#x86 . #x2020) (#x87 . #x2021) (#x88 . #x02C6)
+     (#x89 . #x2030) (#x8A . #x0160) (#x8B . #x2039) (#x8C . #x0152)
+     (#x8E . #x017D) (#x91 . #x2018) (#x92 . #x2019) (#x93 . #x201C)
+     (#x94 . #x201D) (#x95 . #x2022) (#x96 . #x2013) (#x97 . #x2014)
+     (#x98 . #x02DC) (#x99 . #x2122) (#x9A . #x0161) (#x9B . #x203A)
+     (#x9C . #x0153) (#x9E . #x017E) (#x9F . #x0178)))
+  "*Alist of extra numeric entities, other than ISO 10646, and characters.
+Each element is (CODE . CHAR).  This table is used for decoding such
+numeric entities, like \"&#128;\" to the euro sign, mainly in html messages.")
+
 ;;; Internal variables:
 
 ;;; Functions: