Mercurial > emacs
changeset 47945:3915f2c7691e
(message-posting-charset): defvar when compiling.
(ietf-drums, gnus-util): Don't require.
(rfc2047-header-encoding-alist): Add `address-mime' part. Doc
fixes.
(rfc2047-charset-encoding-alist): Use B for iso-8859-7,
iso-8859-8.
(rfc2047-q-encoding-alist): Augment header list.
(rfc2047-encoding-type): New.
(rfc2047-dissect-region): Deleted.
(rfc2047-encode-region, rfc2047-encode): Rewritten to take
account of rfc2047 rules with respect to rfc2822 tokens and to do
encoding in place rather than by passing strings.
(rfc2047-encode-message-header): Don't include header name field
in encoding. Add `address-mime' case and bind
rfc2047-encoding-type for `mime' case.
(rfc2047-encode-string): Doc fix.
(rfc2047-encode): Use longer chunks for base64.
(rfc2047-fold-region): Insert single characters, not strings.
(rfc2047-encoded-word-regexp): Wrap in eval-and-compile.
(rfc2047-decode-region): Avoid consing regexp in loop.
author | Dave Love <fx@gnu.org> |
---|---|
date | Fri, 18 Oct 2002 10:52:56 +0000 |
parents | 03cfc305a0fa |
children | 4a168304ff75 |
files | lisp/gnus/rfc2047.el |
diffstat | 1 files changed, 255 insertions(+), 193 deletions(-) [+] |
line wrap: on
line diff
--- a/lisp/gnus/rfc2047.el Fri Oct 18 10:43:12 2002 +0000 +++ b/lisp/gnus/rfc2047.el Fri Oct 18 10:52:56 2002 +0000 @@ -1,5 +1,5 @@ ;;; rfc2047.el --- functions for encoding and decoding rfc2047 messages -;; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. +;; Copyright (C) 1998, 1999, 2000, 2002 Free Software Foundation, Inc. ;; Author: Lars Magne Ingebrigtsen <larsi@gnus.org> ;; MORIOKA Tomohiko <morioka@jaist.ac.jp> @@ -27,20 +27,22 @@ ;;; Code: -(eval-when-compile (require 'cl)) +(eval-when-compile + (require 'cl) + (defvar message-posting-charset)) (require 'qp) (require 'mm-util) -(require 'ietf-drums) +;; Fixme: Avoid this (used for mail-parse-charset) mm dependence on gnus. (require 'mail-prsvr) (require 'base64) -;; Fixme: Avoid this (for gnus-point-at-...) mm dependence on gnus. -(require 'gnus-util) (autoload 'mm-body-7-or-8 "mm-bodies") (defvar rfc2047-header-encoding-alist '(("Newsgroups" . nil) ("Message-ID" . nil) + ("\\(Resent-\\)?\\(From\\|Cc\\|To\\|Bcc\\|Reply-To\\|Sender\\)" . + address-mime) (t . mime)) "*Header/encoding method alist. The list is traversed sequentially. The keys can either be @@ -50,8 +52,10 @@ 1) nil, in which case no encoding is done; 2) `mime', in which case the header will be encoded according to RFC2047; -3) a charset, in which case it will be encoded as that charset; -4) `default', in which case the field will be encoded as the rest +3) `address-mime', like `mime', but takes account of the rules for address + fields (where quoted strings and comments must be treated separately); +4) a charset, in which case it will be encoded as that charset; +5) `default', in which case the field will be encoded as the rest of the article.") (defvar rfc2047-charset-encoding-alist @@ -62,8 +66,8 @@ (iso-8859-4 . Q) (iso-8859-5 . B) (koi8-r . B) - (iso-8859-7 . Q) - (iso-8859-8 . Q) + (iso-8859-7 . B) + (iso-8859-8 . B) (iso-8859-9 . Q) (iso-8859-14 . Q) (iso-8859-15 . Q) @@ -78,7 +82,8 @@ (iso-2022-jp-2 . B) (iso-2022-int-1 . B)) "Alist of MIME charsets to RFC2047 encodings. -Valid encodings are nil, `Q' and `B'.") +Valid encodings are nil, `Q' and `B'. These indicate binary (no) encoding, +quoted-printable and base64 respectively.") (defvar rfc2047-encoding-function-alist '((Q . rfc2047-q-encode-region) @@ -87,7 +92,8 @@ "Alist of RFC2047 encodings to encoding functions.") (defvar rfc2047-q-encoding-alist - '(("\\(From\\|Cc\\|To\\|Bcc\||Reply-To\\):" . "-A-Za-z0-9!*+/") + '(("\\(Resent-\\)?\\(From\\|Cc\\|To\\|Bcc\\|Reply-To\\|Sender\\):" + . "-A-Za-z0-9!*+/" ) ;; = (\075), _ (\137), ? (\077) are used in the encoded word. ;; Avoid using 8bit characters. ;; Equivalent to "^\000-\007\011\013\015-\037\200-\377=_?" @@ -112,6 +118,12 @@ (point-max)))) (goto-char (point-min))) +(defvar rfc2047-encoding-type 'address-mime + "The type of encoding done by `rfc2047-encode-region'. +This should be dynamically bound around calls to +`rfc2047-encode-region' to either `mime' or `address-mime'. See +`rfc2047-header-encoding-alist', for definitions.") + (defun rfc2047-encode-message-header () "Encode the message header according to `rfc2047-header-encoding-alist'. Should be called narrowed to the head of the message." @@ -141,21 +153,26 @@ (eq (car elem) t)) (setq alist nil method (cdr elem)))) + (goto-char (point-min)) + (re-search-forward "^[^:]+: *" nil t) (cond + ((eq method 'address-mime) + (rfc2047-encode-region (point) (point-max))) ((eq method 'mime) - (rfc2047-encode-region (point-min) (point-max))) + (let (rfc2047-encoding-type) + (rfc2047-encode-region (point) (point-max)))) ((eq method 'default) (if (and (featurep 'mule) (if (boundp 'default-enable-multibyte-characters) default-enable-multibyte-characters) mail-parse-charset) - (mm-encode-coding-region (point-min) (point-max) + (mm-encode-coding-region (point) (point-max) mail-parse-charset))) ((mm-coding-system-p method) (if (and (featurep 'mule) (if (boundp 'default-enable-multibyte-characters) default-enable-multibyte-characters)) - (mm-encode-coding-region (point-min) (point-max) method))) + (mm-encode-coding-region (point) (point-max) method))) ;; Hm. (t))) (goto-char (point-max))))))) @@ -169,133 +186,176 @@ The buffer may be narrowed." (require 'message) ; for message-posting-charset (let ((charsets - (mapcar - 'mm-mime-charset - (mm-find-charset-region (point-min) (point-max)))) - (cs (list 'us-ascii (car message-posting-charset))) - found) - (while charsets - (unless (memq (pop charsets) cs) - (setq found t))) - found)) + (mm-find-mime-charset-region (point-min) (point-max)))) + (and charsets (not (equal charsets (list message-posting-charset)))))) -(defun rfc2047-dissect-region (b e) - "Dissect the region between B and E into words." - (let ((word-chars "-A-Za-z0-9!*+/") - ;; Not using ietf-drums-specials-token makes life simple. - mail-parse-mule-charset - words point current - result word) - (save-restriction - (narrow-to-region b e) - (goto-char (point-min)) - (skip-chars-forward "\000-\177") - (while (not (eobp)) - (setq point (point)) - (skip-chars-backward word-chars b) - (unless (eq b (point)) - (push (cons (buffer-substring b (point)) nil) words)) - (setq b (point)) - (goto-char point) - (setq current (mm-charset-after)) - (forward-char 1) - (skip-chars-forward word-chars) - (while (and (not (eobp)) - (eq (mm-charset-after) current)) - (forward-char 1) - (skip-chars-forward word-chars)) - (unless (eq b (point)) - (push (cons (buffer-substring b (point)) current) words)) - (setq b (point)) - (skip-chars-forward "\000-\177")) - (unless (eq b (point)) - (push (cons (buffer-substring b (point)) nil) words))) - ;; merge adjacent words - (setq word (pop words)) - (while word - (if (and (cdr word) - (caar words) - (not (cdar words)) - (not (string-match "[^ \t]" (caar words)))) - (if (eq (cdr (nth 1 words)) (cdr word)) - (progn - (setq word (cons (concat - (car (nth 1 words)) (caar words) - (car word)) - (cdr word))) - (pop words) - (pop words)) - (push (cons (concat (caar words) (car word)) (cdr word)) - result) - (pop words) - (setq word (pop words))) - (push word result) - (setq word (pop words)))) - result)) +;; Use this syntax table when parsing into regions that may need +;; encoding. Double quotes are string delimiters, backslash is +;; character quoting, and all other RFC 2822 special characters are +;; treated as punctuation so we can use forward-sexp/forward-word to +;; skip to the end of regions appropriately. Nb. ietf-drums does +;; things differently. +(defconst rfc2047-syntax-table + (let ((table (make-char-table 'syntax-table '(2)))) + (modify-syntax-entry ?\\ "\\" table) + (modify-syntax-entry ?\" "\"" table) + (modify-syntax-entry ?\( "." table) + (modify-syntax-entry ?\) "." table) + (modify-syntax-entry ?\< "." table) + (modify-syntax-entry ?\> "." table) + (modify-syntax-entry ?\[ "." table) + (modify-syntax-entry ?\] "." table) + (modify-syntax-entry ?: "." table) + (modify-syntax-entry ?\; "." table) + (modify-syntax-entry ?, "." table) + (modify-syntax-entry ?@ "." table) + table)) (defun rfc2047-encode-region (b e) - "Encode all encodable words in region B to E." - (let ((words (rfc2047-dissect-region b e)) word) - (save-restriction - (narrow-to-region b e) - (delete-region (point-min) (point-max)) - (while (setq word (pop words)) - (if (not (cdr word)) - (insert (car word)) - (rfc2047-fold-region (gnus-point-at-bol) (point)) - (goto-char (point-max)) - (if (> (- (point) (save-restriction - (widen) - (gnus-point-at-bol))) 76) - (insert "\n ")) - ;; Insert blank between encoded words - (if (eq (char-before) ?=) (insert " ")) - (rfc2047-encode (point) - (progn (insert (car word)) (point)) - (cdr word)))) - (rfc2047-fold-region (point-min) (point-max))))) + "Encode words in region B to E that need encoding. +By default, the region is treated as containing RFC2822 addresses. +Dynamically bind `rfc2047-encoding-type' to change that." + (save-restriction + (narrow-to-region b e) + (if (eq 'mime rfc2047-encoding-type) + ;; Simple case -- treat as single word. + (progn + (goto-char (point-min)) + ;; Does it need encoding? + (skip-chars-forward "\000-\177" e) + (unless (eobp) + (rfc2047-encode b e))) + ;; `address-mime' case -- take care of quoted words, comments. + (with-syntax-table rfc2047-syntax-table + (let ((start (point)) ; start of current token + end ; end of current token + ;; Whether there's an encoded word before the current + ;; tpken, either immediately or separated by space. + last-encoded) + (goto-char (point-min)) + (condition-case nil ; in case of unbalanced quotes + ;; Look for rfc2822-style: sequences of atoms, quoted + ;; strings, specials, whitespace. (Specials mustn't be + ;; encoded.) + (while (not (eobp)) + (setq start (point)) + ;; Skip whitespace. + (unless (= 0 (skip-chars-forward " \t")) + (setq start (point))) + (cond + ((not (char-after))) ; eob + ;; else token start + ((eq ?\" (char-syntax (char-after))) + ;; Quoted word. + (forward-sexp) + (setq end (point)) + ;; Does it need encoding? + (goto-char start) + (skip-chars-forward "\000-\177" end) + (if (= end (point)) + (setq last-encoded nil) + ;; It needs encoding. Strip the quotes first, + ;; since encoded words can't occur in quotes. + (goto-char end) + (delete-backward-char 1) + (goto-char start) + (delete-char 1) + (when last-encoded + ;; There was a preceding quoted word. We need + ;; to include any separating whitespace in this + ;; word to avoid it getting lost. + (skip-chars-backward " \t") + ;; A space is needed between the encoded words. + (insert ? ) + (setq start (point) + end (1+ end))) + ;; Adjust the end position for the deleted quotes. + (rfc2047-encode start (- end 2)) + (setq last-encoded t))) ; record that it was encoded + ((eq ?. (char-syntax (char-after))) + ;; Skip other delimiters, but record that they've + ;; potentially separated quoted words. + (forward-char) + (setq last-encoded nil)) + (t ; normal token/whitespace sequence + ;; Find the end. + (forward-word 1) + (skip-chars-backward " \t") + (setq end (point)) + ;; Deal with encoding and leading space as for + ;; quoted words. + (goto-char start) + (skip-chars-forward "\000-\177" end) + (if (= end (point)) + (setq last-encoded nil) + (when last-encoded + (goto-char start) + (skip-chars-backward " \t") + (insert ? ) + (setq start (point) + end (1+ end))) + (rfc2047-encode start end) + (setq last-encoded t))))) + (error (error "Invalid data for rfc2047 encoding: %s" + (buffer-substring b e))))))) + (rfc2047-fold-region b (point)))) (defun rfc2047-encode-string (string) - "Encode words in STRING." + "Encode words in STRING. +By default, the string is treated as containing addresses (see +`rfc2047-special-chars')." (with-temp-buffer (insert string) (rfc2047-encode-region (point-min) (point-max)) (buffer-string))) -(defun rfc2047-encode (b e charset) - "Encode the word in the region B to E with CHARSET." - (let* ((mime-charset (mm-mime-charset charset)) - (cs (mm-charset-to-coding-system mime-charset)) - (encoding (or (cdr (assq mime-charset +(defun rfc2047-encode (b e) + "Encode the word(s) in the region B to E. +By default, the region is treated as containing addresses (see +`rfc2047-special-chars')." + (let* ((mime-charset (mm-find-mime-charset-region b e)) + (cs (if (> (length mime-charset) 1) + ;; Fixme: Instead of this, try to break region into + ;; parts that can be encoded separately. + (error "Can't rfc2047-encode `%s'" + (buffer-substring b e)) + (setq mime-charset (car mime-charset)) + (mm-charset-to-coding-system mime-charset))) + ;; Fixme: Better, calculate the number of non-ASCII + ;; characters, at least for 8-bit charsets. + (encoding (if (assq mime-charset + rfc2047-charset-encoding-alist) + (cdr (assq mime-charset rfc2047-charset-encoding-alist)) - 'B)) + 'B)) (start (concat "=?" (downcase (symbol-name mime-charset)) "?" (downcase (symbol-name encoding)) "?")) (first t)) - (save-restriction - (narrow-to-region b e) - (when (eq encoding 'B) - ;; break into lines before encoding - (goto-char (point-min)) - (while (not (eobp)) - (goto-char (min (point-max) (+ 15 (point)))) - (unless (eobp) - (insert "\n")))) - (if (and (mm-multibyte-p) - (mm-coding-system-p cs)) - (mm-encode-coding-region (point-min) (point-max) cs)) - (funcall (cdr (assq encoding rfc2047-encoding-function-alist)) - (point-min) (point-max)) - (goto-char (point-min)) - (while (not (eobp)) - (unless first - (insert " ")) - (setq first nil) - (insert start) - (end-of-line) - (insert "?=") - (forward-line 1))))) + (if mime-charset + (save-restriction + (narrow-to-region b e) + (when (eq encoding 'B) + ;; break into lines before encoding + (goto-char (point-min)) + (while (not (eobp)) + (goto-char (min (point-max) (+ 15 (point)))) + (unless (eobp) + (insert ?\n)))) + (if (and (mm-multibyte-p) + (mm-coding-system-p cs)) + (mm-encode-coding-region (point-min) (point-max) cs)) + (funcall (cdr (assq encoding rfc2047-encoding-function-alist)) + (point-min) (point-max)) + (goto-char (point-min)) + (while (not (eobp)) + (unless first + (insert ? )) + (setq first nil) + (insert start) + (end-of-line) + (insert "?=") + (forward-line 1)))))) (defun rfc2047-fold-region (b e) "Fold long lines in region B to E." @@ -306,14 +366,14 @@ (qword-break nil) (bol (save-restriction (widen) - (gnus-point-at-bol)))) + (mm-point-at-bol)))) (while (not (eobp)) (when (and (or break qword-break) (> (- (point) bol) 76)) (goto-char (or break qword-break)) (setq break nil qword-break nil) (if (looking-at " \t") - (insert "\n") + (insert ?\n) (insert "\n ")) (setq bol (1- (point))) ;; Don't break before the first non-LWSP characters. @@ -347,7 +407,7 @@ (setq break nil qword-break nil) (if (looking-at " \t") - (insert "\n") + (insert ?\n) (insert "\n ")) (setq bol (1- (point))) ;; Don't break before the first non-LWSP characters. @@ -361,21 +421,21 @@ (goto-char (point-min)) (let ((bol (save-restriction (widen) - (gnus-point-at-bol))) - (eol (gnus-point-at-eol)) + (mm-point-at-bol))) + (eol (mm-point-at-eol)) leading) (forward-line 1) (while (not (eobp)) (looking-at "[ \t]*") (setq leading (- (match-end 0) (match-beginning 0))) - (if (< (- (gnus-point-at-eol) bol leading) 76) + (if (< (- (mm-point-at-eol) bol leading) 76) (progn (goto-char eol) (delete-region eol (progn (skip-chars-forward "[ \t\n\r]+") (1- (point))))) - (setq bol (gnus-point-at-bol))) - (setq eol (gnus-point-at-eol)) + (setq bol (mm-point-at-bol))) + (setq eol (mm-point-at-eol)) (forward-line 1))))) (defun rfc2047-b-encode-region (b e) @@ -396,7 +456,7 @@ (let ((alist rfc2047-q-encoding-alist) (bol (save-restriction (widen) - (gnus-point-at-bol)))) + (mm-point-at-bol)))) (while alist (when (looking-at (caar alist)) (quoted-printable-encode-region b e nil (cdar alist)) @@ -413,51 +473,51 @@ (goto-char (min (point-max) (+ 56 bol))) (search-backward "=" (- (point) 2) t) (unless (or (bobp) (eobp)) - (insert "\n") + (insert ?\n) (setq bol (point))))))))) ;;; ;;; Functions for decoding RFC2047 messages ;;; -(defvar rfc2047-encoded-word-regexp - "=\\?\\([^][\000-\040()<>@,\;:\\\"/?.=]+\\)\\?\\(B\\|Q\\)\\?\\([!->@-~ +]+\\)\\?=") +(eval-and-compile + (defvar rfc2047-encoded-word-regexp + "=\\?\\([^][\000-\040()<>@,\;:\\\"/?.=]+\\)\\?\\(B\\|Q\\)\\?\ +\\([!->@-~ +]+\\)\\?=")) (defun rfc2047-decode-region (start end) "Decode MIME-encoded words in region between START and END." (interactive "r") (let ((case-fold-search t) + (undoing (not (eq t buffer-undo-list))) b e) - (save-excursion - (save-restriction - (narrow-to-region start end) - (goto-char (point-min)) - ;; Remove whitespace between encoded words. - (while (re-search-forward - (concat "\\(" rfc2047-encoded-word-regexp "\\)" - "\\(\n?[ \t]\\)+" - "\\(" rfc2047-encoded-word-regexp "\\)") - nil t) - (delete-region (goto-char (match-end 1)) (match-beginning 6))) - ;; Decode the encoded words. - (setq b (goto-char (point-min))) - (while (re-search-forward rfc2047-encoded-word-regexp nil t) - (setq e (match-beginning 0)) - (insert (rfc2047-parse-and-decode - (prog1 - (match-string 0) - (delete-region (match-beginning 0) (match-end 0))))) - (when (and (mm-multibyte-p) - mail-parse-charset - (not (eq mail-parse-charset 'gnus-decoded))) - (mm-decode-coding-region b e mail-parse-charset)) - (setq b (point))) - (when (and (mm-multibyte-p) - mail-parse-charset - (not (eq mail-parse-charset 'us-ascii)) - (not (eq mail-parse-charset 'gnus-decoded))) - (mm-decode-coding-region b (point-max) mail-parse-charset)) - (rfc2047-unfold-region (point-min) (point-max)))))) + (unwind-protect + (save-excursion + (save-restriction + (buffer-enable-undo) + (narrow-to-region start end) + (goto-char (point-min)) + ;; Remove whitespace between encoded words. + (while (re-search-forward + (eval-when-compile + (concat "\\(" rfc2047-encoded-word-regexp "\\)" + "\\(\n?[ \t]\\)+" + "\\(" rfc2047-encoded-word-regexp "\\)")) + nil t) + (delete-region (goto-char (match-end 1)) (match-beginning 6))) + ;; Decode the encoded words. + (setq b (goto-char (point-min))) + (while (re-search-forward rfc2047-encoded-word-regexp nil t) + (setq e (match-beginning 0)) + (rfc2047-parse-and-decode (match-beginning 0) (match-end 0))) + (when (and (mm-multibyte-p) + mail-parse-charset + (not (eq mail-parse-charset 'us-ascii)) + (not (eq mail-parse-charset 'gnus-decoded))) + (mm-decode-coding-region b (point-max) mail-parse-charset)) + (rfc2047-unfold-region (point-min) (point-max)))) + (unless undoing + (buffer-disable-undo))))) (defun rfc2047-decode-string (string) "Decode the quoted-printable-encoded STRING and return the results." @@ -470,22 +530,26 @@ (rfc2047-decode-region (point-min) (point-max))) (buffer-string)))) -(defun rfc2047-parse-and-decode (word) +(defun rfc2047-parse-and-decode (b e) "Decode WORD and return it if it is an encoded word. Return WORD if not." - (if (not (string-match rfc2047-encoded-word-regexp word)) - word - (or - (condition-case nil - (rfc2047-decode - (match-string 1 word) - (upcase (match-string 2 word)) - (match-string 3 word)) - (error word)) - word))) + (save-restriction + (narrow-to-region b e) + (goto-char b) + (when (looking-at (eval-when-compile + (concat "\\`" rfc2047-encoded-word-regexp "\\'"))) + (condition-case nil + (let ((charset (match-string 1)) + (encoding (upcase (match-string 2)))) + (undo-boundary) + (delete-region (match-beginning 0) (1+ (match-end 2))) + (delete-region (- (point-max) 2) (point-max)) + (rfc2047-decode charset encoding (point-min) (point-max))) + ;; If we get an error, undo the change + (error (undo)))))) -(defun rfc2047-decode (charset encoding string) - "Decode STRING from the given MIME CHARSET in the given ENCODING. +(defun rfc2047-decode (charset encoding b e) + "Decode from the given MIME CHARSET in the given ENCODING in region B to E. Valid ENCODINGs are \"B\" and \"Q\". If your Emacs implementation can't decode CHARSET, return nil." (if (stringp charset) @@ -504,18 +568,16 @@ (when (and (eq cs 'ascii) mail-parse-charset) (setq cs mail-parse-charset)) - ;; Ensure unibyte result in Emacs 20. - (let (default-enable-multibyte-characters) - (with-temp-buffer - (mm-decode-coding-string - (cond - ((equal "B" encoding) - (base64-decode-string string)) - ((equal "Q" encoding) - (quoted-printable-decode-string - (mm-replace-chars-in-string string ?_ ? ))) - (t (error "Invalid encoding: %s" encoding))) - cs)))))) + (save-restriction + (narrow-to-region b e) + (cond + ((equal "B" encoding) + (base64-decode-region b e)) + ((equal "Q" encoding) + (subst-char-in-region b e ?_ ? t) + (quoted-printable-decode-region b e)) + (t (error "Invalid encoding: %s" encoding))) + (mm-decode-coding-region (point-min) (point-max) cs))))) (provide 'rfc2047)