comparison lisp/gnus/utf7.el @ 82951:0fde48feb604

Import Gnus 5.10 from the v5_10 branch of the Gnus repository.
author Andreas Schwab <schwab@suse.de>
date Thu, 22 Jul 2004 16:45:51 +0000
parents 695cf19ef79e
children aac0a33f5772 cce1c0ee76ee
comparison
equal deleted inserted replaced
56503:8bbd2323fbf2 82951:0fde48feb604
1 ;;; utf7.el --- UTF-7 encoding/decoding for Emacs 1 ;;; utf7.el --- UTF-7 encoding/decoding for Emacs -*-coding: iso-8859-1;-*-
2 ;; Copyright (C) 1999, 2000 Free Software Foundation, Inc. 2 ;; Copyright (C) 1999, 2000, 2003 Free Software Foundation, Inc.
3 3
4 ;; Author: Jon K Hellan <hellan@acm.org> 4 ;; Author: Jon K Hellan <hellan@acm.org>
5 ;; Maintainer: bugs@gnus.org
5 ;; Keywords: mail 6 ;; Keywords: mail
6 7
7 ;; This file is part of GNU Emacs. 8 ;; This file is part of GNU Emacs.
8 9
9 ;; GNU Emacs is free software; you can redistribute it and/or modify 10 ;; GNU Emacs is free software; you can redistribute it and/or modify
20 ;; along with GNU Emacs; see the file COPYING. If not, write to 21 ;; along with GNU Emacs; see the file COPYING. If not, write to
21 ;; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, 22 ;; the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 ;; Boston, MA 02111-1307, USA. 23 ;; Boston, MA 02111-1307, USA.
23 24
24 ;;; Commentary: 25 ;;; Commentary:
25 ;;; UTF-7 - A Mail-Safe Transformation Format of Unicode - RFC 2152 26
26 ;;; This is a transformation format of Unicode that contains only 7-bit 27 ;; UTF-7 - A Mail-Safe Transformation Format of Unicode - RFC 2152
27 ;;; ASCII octets and is intended to be readable by humans in the limiting 28 ;; This is a transformation format of Unicode that contains only 7-bit
28 ;;; case that the document consists of characters from the US-ASCII 29 ;; ASCII octets and is intended to be readable by humans in the limiting
29 ;;; repertoire. 30 ;; case that the document consists of characters from the US-ASCII
30 ;;; In short, runs of characters outside US-ASCII are encoded as base64 31 ;; repertoire.
31 ;;; inside delimiters. 32 ;; In short, runs of characters outside US-ASCII are encoded as base64
32 ;;; A variation of UTF-7 is specified in IMAP 4rev1 (RFC 2060) as the way 33 ;; inside delimiters.
33 ;;; to represent characters outside US-ASCII in mailbox names in IMAP. 34 ;; A variation of UTF-7 is specified in IMAP 4rev1 (RFC 2060) as the way
34 ;;; This library supports both variants, but the IMAP variation was the 35 ;; to represent characters outside US-ASCII in mailbox names in IMAP.
35 ;;; reason I wrote it. 36 ;; This library supports both variants, but the IMAP variation was the
36 ;;; The routines convert UTF-7 -> UTF-16 (16 bit encoding of Unicode) 37 ;; reason I wrote it.
37 ;;; -> current character set, and vice versa. 38 ;; The routines convert UTF-7 -> UTF-16 (16 bit encoding of Unicode)
38 ;;; However, until Emacs supports Unicode, the only Emacs character set 39 ;; -> current character set, and vice versa.
39 ;;; supported here is ISO-8859.1, which can trivially be converted to/from 40 ;; However, until Emacs supports Unicode, the only Emacs character set
40 ;;; Unicode. 41 ;; supported here is ISO-8859.1, which can trivially be converted to/from
41 ;;; When decoding results in a character outside the Emacs character set, 42 ;; Unicode.
42 ;;; an error is thrown. It is up to the application to recover. 43 ;; When decoding results in a character outside the Emacs character set,
44 ;; an error is thrown. It is up to the application to recover.
45
46 ;; UTF-7 should be done by providing a coding system. Mule-UCS does
47 ;; already, but I don't know if it does the IMAP version and it's not
48 ;; clear whether that should really be a coding system. The UTF-16
49 ;; part of the conversion can be done with coding systems available
50 ;; with Mule-UCS or some versions of Emacs. Unfortunately these were
51 ;; done wrongly (regarding handling of byte-order marks and how the
52 ;; variants were named), so we don't have a consistent name for the
53 ;; necessary coding system. The code below doesn't seem to DTRT
54 ;; generally. E.g.:
55 ;;
56 ;; (utf7-encode "a+£")
57 ;; => "a+ACsAow-"
58 ;;
59 ;; $ echo "a+£"|iconv -f iso-8859-1 -t utf-7
60 ;; a+-+AKM
61 ;;
62 ;; -- fx
63
43 64
44 ;;; Code: 65 ;;; Code:
45 66
46 (require 'base64) 67 (require 'base64)
47 (eval-when-compile (require 'cl)) 68 (eval-when-compile (require 'cl))
48 69 (require 'mm-util)
49 (defvar utf7-direct-encoding-chars " -%'-*,-[]-}" 70
71 (defconst utf7-direct-encoding-chars " -%'-*,-[]-}"
50 "Character ranges which do not need escaping in UTF-7.") 72 "Character ranges which do not need escaping in UTF-7.")
51 73
52 (defvar utf7-imap-direct-encoding-chars 74 (defconst utf7-imap-direct-encoding-chars
53 (concat utf7-direct-encoding-chars "+\\~") 75 (concat utf7-direct-encoding-chars "+\\~")
54 "Character ranges which do not need escaping in the IMAP variant of UTF-7.") 76 "Character ranges which do not need escaping in the IMAP variant of UTF-7.")
77
78 (defconst utf7-utf-16-coding-system
79 (cond ((mm-coding-system-p 'utf-16-be-no-signature) ; Mule-UCS
80 'utf-16-be-no-signature)
81 ((and (mm-coding-system-p 'utf-16-be) ; Emacs 21.4 (?), Emacs 22
82 ;; Avoid versions with BOM.
83 (= 2 (length (encode-coding-string "a" 'utf-16-be))))
84 'utf-16-be)
85 ((mm-coding-system-p 'utf-16-be-nosig) ; ?
86 'utf-16-be-nosig))
87 "Coding system which encodes big endian UTF-16 without a BOM signature.")
55 88
56 (defsubst utf7-imap-get-pad-length (len modulus) 89 (defsubst utf7-imap-get-pad-length (len modulus)
57 "Return required length of padding for IMAP modified base64 fragment." 90 "Return required length of padding for IMAP modified base64 fragment."
58 (mod (- len) modulus)) 91 (mod (- len) modulus))
59 92
62 Use IMAP modification if FOR-IMAP is non-nil." 95 Use IMAP modification if FOR-IMAP is non-nil."
63 (let ((start (point-min)) 96 (let ((start (point-min))
64 (end (point-max))) 97 (end (point-max)))
65 (narrow-to-region start end) 98 (narrow-to-region start end)
66 (goto-char start) 99 (goto-char start)
67 (let ((esc-char (if for-imap ?& ?+)) 100 (let* ((esc-char (if for-imap ?& ?+))
68 (direct-encoding-chars 101 (direct-encoding-chars
69 (if for-imap utf7-imap-direct-encoding-chars 102 (if for-imap utf7-imap-direct-encoding-chars
70 utf7-direct-encoding-chars))) 103 utf7-direct-encoding-chars))
104 (not-direct-encoding-chars (concat "^" direct-encoding-chars)))
71 (while (not (eobp)) 105 (while (not (eobp))
72 (skip-chars-forward direct-encoding-chars) 106 (skip-chars-forward direct-encoding-chars)
73 (unless (eobp) 107 (unless (eobp)
74 (insert esc-char) 108 (insert esc-char)
75 (let ((p (point)) 109 (let ((p (point))
76 (fc (following-char)) 110 (fc (following-char))
77 (run-length 111 (run-length
78 (skip-chars-forward (concat "^" direct-encoding-chars)))) 112 (skip-chars-forward not-direct-encoding-chars)))
79 (if (and (= fc esc-char) 113 (if (and (= fc esc-char)
80 (= run-length 1)) ; Lone esc-char? 114 (= run-length 1)) ; Lone esc-char?
81 (delete-backward-char 1) ; Now there's one too many 115 (delete-backward-char 1) ; Now there's one too many
82 (utf7-fragment-encode p (point) for-imap)) 116 (utf7-fragment-encode p (point) for-imap))
83 (insert "-"))))))) 117 (insert "-")))))))
86 "Encode text from START to END in buffer as UTF-7 escape fragment. 120 "Encode text from START to END in buffer as UTF-7 escape fragment.
87 Use IMAP modification if FOR-IMAP is non-nil." 121 Use IMAP modification if FOR-IMAP is non-nil."
88 (save-restriction 122 (save-restriction
89 (narrow-to-region start end) 123 (narrow-to-region start end)
90 (funcall (utf7-get-u16char-converter 'to-utf-16)) 124 (funcall (utf7-get-u16char-converter 'to-utf-16))
91 (base64-encode-region start (point-max)) 125 (mm-with-unibyte-current-buffer
126 (base64-encode-region start (point-max)))
92 (goto-char start) 127 (goto-char start)
93 (let ((pm (point-max))) 128 (let ((pm (point-max)))
94 (when for-imap 129 (when for-imap
95 (while (search-forward "/" nil t) 130 (while (search-forward "/" nil t)
96 (replace-match ","))) 131 (replace-match ",")))
133 (base64-decode-region start (+ end pl))) 168 (base64-decode-region start (+ end pl)))
134 (funcall (utf7-get-u16char-converter 'from-utf-16)))) 169 (funcall (utf7-get-u16char-converter 'from-utf-16))))
135 170
136 (defun utf7-get-u16char-converter (which-way) 171 (defun utf7-get-u16char-converter (which-way)
137 "Return a function to convert between UTF-16 and current character set." 172 "Return a function to convert between UTF-16 and current character set."
138 ;; Add test to check if we are really Latin-1. 173 (if utf7-utf-16-coding-system
139 ;; Support other character sets once Emacs groks Unicode. 174 (if (eq which-way 'to-utf-16)
140 (if (eq which-way 'to-utf-16) 175 (lambda ()
141 'utf7-latin1-u16-char-converter 176 (encode-coding-region (point-min) (point-max)
142 'utf7-u16-latin1-char-converter)) 177 utf7-utf-16-coding-system))
178 (lambda ()
179 (decode-coding-region (point-min) (point-max)
180 utf7-utf-16-coding-system)))
181 ;; Add test to check if we are really Latin-1.
182 (if (eq which-way 'to-utf-16)
183 'utf7-latin1-u16-char-converter
184 'utf7-u16-latin1-char-converter)))
143 185
144 (defun utf7-latin1-u16-char-converter () 186 (defun utf7-latin1-u16-char-converter ()
145 "Convert latin 1 (ISO-8859.1) characters to 16 bit Unicode. 187 "Convert latin 1 (ISO-8859.1) characters to 16 bit Unicode.
146 Characters are converted to raw byte pairs in narrowed buffer." 188 Characters are converted to raw byte pairs in narrowed buffer."
189 (mm-encode-coding-region (point-min) (point-max) 'iso-8859-1)
190 (mm-disable-multibyte)
147 (goto-char (point-min)) 191 (goto-char (point-min))
148 (while (not (eobp)) 192 (while (not (eobp))
149 (insert 0) 193 (insert 0)
150 (forward-char))) 194 (forward-char)))
151 195
155 (goto-char (point-min)) 199 (goto-char (point-min))
156 (while (not (eobp)) 200 (while (not (eobp))
157 (if (= 0 (following-char)) 201 (if (= 0 (following-char))
158 (delete-char 1) 202 (delete-char 1)
159 (error "Unable to convert from Unicode")) 203 (error "Unable to convert from Unicode"))
160 (forward-char))) 204 (forward-char))
205 (mm-decode-coding-region (point-min) (point-max) 'iso-8859-1)
206 (mm-enable-multibyte))
161 207
162 (defun utf7-encode (string &optional for-imap) 208 (defun utf7-encode (string &optional for-imap)
163 "Encode UTF-7 STRING. Use IMAP modification if FOR-IMAP is non-nil." 209 "Encode UTF-7 STRING. Use IMAP modification if FOR-IMAP is non-nil."
164 (let ((default-enable-multibyte-characters nil)) 210 (let ((default-enable-multibyte-characters t))
165 (with-temp-buffer 211 (with-temp-buffer
166 (insert string) 212 (insert string)
167 (utf7-encode-internal for-imap) 213 (utf7-encode-internal for-imap)
168 (buffer-string)))) 214 (buffer-string))))
169 215
171 "Decode UTF-7 STRING. Use IMAP modification if FOR-IMAP is non-nil." 217 "Decode UTF-7 STRING. Use IMAP modification if FOR-IMAP is non-nil."
172 (let ((default-enable-multibyte-characters nil)) 218 (let ((default-enable-multibyte-characters nil))
173 (with-temp-buffer 219 (with-temp-buffer
174 (insert string) 220 (insert string)
175 (utf7-decode-internal for-imap) 221 (utf7-decode-internal for-imap)
222 (mm-enable-multibyte)
176 (buffer-string)))) 223 (buffer-string))))
177 224
178 (provide 'utf7) 225 (provide 'utf7)
179 226
180 ;;; arch-tag: 96078b55-85c7-4161-aed2-932c24b282c7 227 ;;; arch-tag: 96078b55-85c7-4161-aed2-932c24b282c7