comparison lisp/international/utf-16.el @ 46677:86e8d59e2e49

*** empty log message ***
author Dave Love <fx@gnu.org>
date Wed, 24 Jul 2002 23:01:32 +0000
parents
children e83401f8d61c
comparison
equal deleted inserted replaced
46676:f0b8a25b0b7d 46677:86e8d59e2e49
1 ;;; utf-16.el --- UTF-16 encoding/decoding
2
3 ;; Copyright (C) 2001, 2002 Free Software Foundation, Inc.
4
5 ;; Author: Dave Love <fx@gnu.org>
6 ;; Keywords: Unicode, UTF-16, i18n
7
8 ;; This file is part of GNU Emacs.
9
10 ;; GNU Emacs is free software; you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation; either version 2, or (at your option)
13 ;; any later version.
14
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
19
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs; see the file COPYING. If not, write to the
22 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 ;; Boston, MA 02111-1307, USA.
24
25 ;;; Commentary:
26
27 ;; Support for UTF-16, which is a two-byte encoding (modulo
28 ;; surrogates) of Unicode, written either in little or big endian
29 ;; order: coding-systems `mule-utf-16-le' and `mule-utf-16-be'.
30 ;; (utf-16-le is used by the DozeN'T clipboard, for instance.) The
31 ;; data are preceeded by a two-byte signature which identifies their
32 ;; byte sex. These are used by the coding-category-utf-16-{b,l}e code
33 ;; to identify the coding, but ignored on decoding.
34
35 ;; Note that un-decodable sequences aren't (yet?) preserved as raw
36 ;; bytes, as they are with utf-8, so reading and writing as utf-16 can
37 ;; corrupt data.
38
39 ;;; Code:
40
41 ;; We end up with trivially different -le and -be versions of most
42 ;; things below, sometimes with commonality abstracted into a let
43 ;; binding for maintenance convenience.
44
45 ;; We'd need new charsets distinct from ascii and eight-bit-control to
46 ;; deal with untranslated sequences, since we can't otherwise
47 ;; distinguish the bytes, as we can with utf-8.
48
49 ;; ;; Do a multibyte write for bytes in r3 and r4.
50 ;; ;; Intended for untranslatable utf-16 sequences.
51 ;; (define-ccl-program ccl-mule-utf-16-untrans
52 ;; `(0
53 ;; (if (r3 < 128)
54 ;; (r0 = ,(charset-id 'ascii))
55 ;; (if (r3 < 160)
56 ;; (r0 = ,(charset-id 'eight-bit-control))
57 ;; (r0 = ,(charset-id 'eight-bit-graphic))))
58 ;; (if (r4 < 128)
59 ;; (r0 = ,(charset-id 'ascii))
60 ;; (if (r4 < 160)
61 ;; (r0 = ,(charset-id 'eight-bit-control))
62 ;; (r0 = ,(charset-id 'eight-bit-graphic))))
63 ;; (r1 = r4)))
64 ;; "Do a multibyte write for bytes in r3 and r4.
65 ;; First swap them if we're big endian, indicated by r5==0.
66 ;; Intended for untranslatable utf-16 sequences.")
67
68 ;; Needed in macro expansion, so can't be let-bound. Zapped after use.
69 (eval-and-compile
70 (defconst utf-16-decode-ucs
71 ;; We have the unicode in r1. Output is character codes in r0, r1,
72 ;; and r2 if appropriate.
73 `((lookup-integer utf-8-subst-table r0 r3)
74 (if r7 (r1 = r3)) ; got a translation
75 (if (r1 < 128)
76 (r0 = ,(charset-id 'ascii))
77 (if (r1 < 160)
78 (r0 = ,(charset-id 'eight-bit-control))
79 (if (r1 < 256)
80 ((r0 = ,(charset-id 'latin-iso8859-1))
81 (r1 -= 128))
82 (if (r1 < #x2500)
83 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
84 (r1 -= #x100)
85 (r2 = (((r1 / 96) + 32) << 7))
86 (r1 %= 96)
87 (r1 += (r2 + 32)))
88 (if (r1 < #x3400)
89 ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
90 (r1 -= #x2500)
91 (r2 = (((r1 / 96) + 32) << 7))
92 (r1 %= 96)
93 (r1 += (r2 + 32)))
94 (if (r1 < #xd800) ; 2 untranslated bytes
95 ;; ;; Assume this is rare, so don't worry about the
96 ;; ;; overhead of the call.
97 ;; (call mule-utf-16-untrans)
98 ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
99 (r1 = 15037)) ; U+fffd
100 (if (r1 < #xe000) ; surrogate
101 ;; ((call mule-utf-16-untrans)
102 ;; (write-multibyte-character r0 r1)
103 ;; (read r3 r4)
104 ;; (call mule-utf-16-untrans))
105 ((read r3 r4)
106 (r0 = ,(charset-id 'mule-unicode-e000-ffff))
107 (r1 = 15037))
108 ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
109 (r1 -= #xe000)
110 (r2 = (((r1 / 96) + 32) << 7))
111 (r1 %= 96)
112 (r1 += (r2 + 32)))))))))))))
113
114 (define-ccl-program ccl-decode-mule-utf-16-le
115 `(2 ; 2 bytes -> 1 to 4 bytes
116 ((read r0 r1) ; signature
117 (loop
118 (read r3 r4)
119 (r1 = (r4 <8 r3))
120 ,utf-16-decode-ucs
121 (translate-character utf-8-translation-table-for-decode r0 r1)
122 (write-multibyte-character r0 r1)
123 (repeat))))
124 "Decode little endian UTF-16 (ignoring signature bytes).
125 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
126 mule-unicode-*. Un-representable Unicode characters are
127 decoded as U+fffd. The result is run through translation table
128 `utf-8-translation-table-for-decode' if that is defined.")
129
130 (define-ccl-program ccl-decode-mule-utf-16-be
131 `(2 ; 2 bytes -> 1 to 4 bytes
132 ((read r0 r1) ; signature
133 (loop
134 (read r3 r4)
135 (r1 = (r3 <8 r4))
136 ,utf-16-decode-ucs
137 (translate-character utf-8-translation-table-for-decode r0 r1)
138 (write-multibyte-character r0 r1)
139 (repeat))))
140 "Decode big endian UTF-16 (ignoring signature bytes).
141 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
142 mule-unicode-*. Un-representable Unicode characters are
143 decoded as U+fffd. The result is run through translation table
144 `utf-8-non-latin-8859-table'.")
145
146 (makunbound 'utf-16-decode-ucs) ; done with it
147
148 (eval-and-compile
149 (defconst utf-16-decode-to-ucs
150 ;; CCL which, given the result of a multibyte read in r0 and r1,
151 ;; sets r0 to the character's Unicode if the charset is one of the
152 ;; basic utf-8 coding system ones. Otherwise set to U+fffd.
153 `(if (r0 == ,(charset-id 'ascii))
154 (r0 = r1)
155 (if (r0 == ,(charset-id 'latin-iso8859-1))
156 (r0 = (r1 + 128))
157 (if (r0 == ,(charset-id 'eight-bit-control))
158 (r0 = r1)
159 (if (r0 == ,(charset-id 'eight-bit-graphic))
160 (r0 = r1)
161 ((r2 = (r1 & #x7f))
162 (r1 >>= 7)
163 (r3 = ((r1 - 32) * 96))
164 (r3 += (r2 - 32))
165 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
166 (r0 = (r3 + #x100))
167 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
168 (r0 = (r3 + #x2500))
169 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
170 (r0 = (r3 + #xe000))
171 (r0 = #xfffd)))))))))))
172
173 (define-ccl-program ccl-encode-mule-utf-16-le
174 `(1
175 ((write #xff)
176 (write #xfe)
177 (loop
178 (read-multibyte-character r0 r1)
179 (translate-character ucs-mule-to-mule-unicode r0 r1)
180 ,utf-16-decode-to-ucs
181 (write (r0 & 255))
182 (write (r0 >> 8))
183 (repeat))))
184 "Encode to little endian UTF-16 with signature.
185 Characters from the charsets ascii, eight-bit-control,
186 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded
187 after translation through the table `ucs-mule-to-mule-unicode'.
188 Others are encoded as U+FFFD.")
189
190 (define-ccl-program ccl-encode-mule-utf-16-be
191 `(1
192 ((write #xfe)
193 (write #xff)
194 (loop
195 (read-multibyte-character r0 r1)
196 (translate-character ucs-mule-to-mule-unicode r0 r1)
197 ,utf-16-decode-to-ucs
198 (write (r0 >> 8))
199 (write (r0 & 255))
200 (repeat))))
201 "Encode to big endian UTF-16 with signature.
202 Characters from the charsets ascii, eight-bit-control,
203 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded
204 after translation through the table `ucs-mule-to-mule-unicode'.
205 Others are encoded as U+FFFD.")
206
207 (makunbound 'utf-16-decode-to-ucs)
208
209 (defun utf-16-le-pre-write-conversion (beg end)
210 "Semi-dummy pre-write function effectively to autoload ucs-tables."
211 ;; Ensure translation table is loaded.
212 (require 'ucs-tables)
213 ;; Don't do this again.
214 (coding-system-put 'mule-utf-16-le 'pre-write-conversion nil)
215 nil)
216
217 (defun utf-16-be-pre-write-conversion (beg end)
218 "Semi-dummy pre-write function effectively to autoload ucs-tables."
219 ;; Ensure translation table is loaded.
220 (require 'ucs-tables)
221 ;; Don't do this again.
222 (coding-system-put 'mule-utf-16-be 'pre-write-conversion nil)
223 nil)
224
225 (let ((doc "
226
227 Assumes and ignores the leading two-byte signature.
228
229 The supported Emacs character sets are the following, plus others
230 which may be included in the translation table
231 `ucs-mule-to-mule-unicode':
232 ascii
233 eight-bit-control
234 latin-iso8859-1
235 mule-unicode-0100-24ff
236 mule-unicode-2500-33ff
237 mule-unicode-e000-ffff
238
239 Note that Unicode characters out of the ranges U+0000-U+33FF and
240 U+E200-U+FFFF are decoded as U+FFFD, effectively corrupting the data
241 if they are re-encoded. Emacs characters without Unicode conversions
242 are encoded as U+FFFD."))
243 (make-coding-system
244 'mule-utf-16-le 4
245 ?u ; Mule-UCS uses ?U, but code-pages uses that for koi8-u.
246 (concat
247 "Little endian UTF-16 encoding for Emacs-supported Unicode characters."
248 doc)
249
250 '(ccl-decode-mule-utf-16-le . ccl-encode-mule-utf-16-le)
251 '((safe-charsets
252 ascii
253 eight-bit-control
254 latin-iso8859-1
255 mule-unicode-0100-24ff
256 mule-unicode-2500-33ff
257 mule-unicode-e000-ffff)
258 (mime-charset . utf-16le)
259 (coding-category . coding-category-utf-16-le)
260 (valid-codes (0 . 255))
261 (pre-write-conversion . utf-16-le-pre-write-conversion)))
262
263 (make-coding-system
264 'mule-utf-16-be 4 ?u
265 (concat
266 "Big endian UTF-16 encoding for Emacs-supported Unicode characters."
267 doc)
268
269 '(ccl-decode-mule-utf-16-be . ccl-encode-mule-utf-16-be)
270 '((safe-charsets
271 ascii
272 eight-bit-control
273 latin-iso8859-1
274 mule-unicode-0100-24ff
275 mule-unicode-2500-33ff
276 mule-unicode-e000-ffff)
277 (mime-charset . utf-16be)
278 (coding-category . coding-category-utf-16-be)
279 (valid-codes (0 . 255))
280 (pre-write-conversion . utf-16-be-pre-write-conversion)))
281 )
282
283 (define-coding-system-alias 'utf-16-le 'mule-utf-16-le)
284 (define-coding-system-alias 'utf-16-be 'mule-utf-16-be)
285
286 (provide 'utf-16)
287
288 ;;; utf-16.el ends here