comparison lisp/international/utf-16.el @ 47705:797c350a7f8c

(utf-16-decode-ucs): Look up utf-subst-table-for-decode. Fix for the case that the looking up succeeds. (ccl-decode-mule-utf-16-le): Translate characters by utf-translation-table-for-decode. (ccl-decode-mule-utf-16-be): Likewise. (ccl-encode-mule-utf-16-le): Look up utf-subst-table-for-encode at first. Translate characters by utf-translation-table-for-encode. (ccl-encode-mule-utf-16-be): Likewise. (mule-utf-16-le, mule-utf-16-be): Add `dependency' property.
author Kenichi Handa <handa@m17n.org>
date Mon, 30 Sep 2002 06:37:00 +0000
parents e83401f8d61c
children c024873f225a
comparison
equal deleted inserted replaced
47704:8a5233d2f072 47705:797c350a7f8c
66 ;; Intended for untranslatable utf-16 sequences.") 66 ;; Intended for untranslatable utf-16 sequences.")
67 67
68 ;; Needed in macro expansion, so can't be let-bound. Zapped after use. 68 ;; Needed in macro expansion, so can't be let-bound. Zapped after use.
69 (eval-and-compile 69 (eval-and-compile
70 (defconst utf-16-decode-ucs 70 (defconst utf-16-decode-ucs
71 ;; We have the unicode in r1. Output is character codes in r0, r1, 71 ;; We have the unicode in r1. Output is charset ID in r0, code point
72 ;; and r2 if appropriate. 72 ;; in r1.
73 `((lookup-integer utf-8-subst-table r0 r3) 73 `((lookup-integer utf-subst-table-for-decode r1 r3)
74 (if r7 (r1 = r3)) ; got a translation 74 (if r7 ; got a translation
75 (if (r1 < 128) 75 ((r0 = r1) (r1 = r3))
76 (r0 = ,(charset-id 'ascii)) 76 (if (r1 < 128)
77 (if (r1 < 160) 77 (r0 = ,(charset-id 'ascii))
78 (r0 = ,(charset-id 'eight-bit-control)) 78 (if (r1 < 160)
79 (if (r1 < 256) 79 (r0 = ,(charset-id 'eight-bit-control))
80 ((r0 = ,(charset-id 'latin-iso8859-1)) 80 (if (r1 < 256)
81 (r1 -= 128)) 81 ((r0 = ,(charset-id 'latin-iso8859-1))
82 (if (r1 < #x2500) 82 (r1 -= 128))
83 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) 83 (if (r1 < #x2500)
84 (r1 -= #x100) 84 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
85 (r2 = (((r1 / 96) + 32) << 7)) 85 (r1 -= #x100)
86 (r1 %= 96) 86 (r2 = (((r1 / 96) + 32) << 7))
87 (r1 += (r2 + 32))) 87 (r1 %= 96)
88 (if (r1 < #x3400) 88 (r1 += (r2 + 32)))
89 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) 89 (if (r1 < #x3400)
90 (r1 -= #x2500) 90 ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
91 (r2 = (((r1 / 96) + 32) << 7)) 91 (r1 -= #x2500)
92 (r1 %= 96) 92 (r2 = (((r1 / 96) + 32) << 7))
93 (r1 += (r2 + 32))) 93 (r1 %= 96)
94 (if (r1 < #xd800) ; 2 untranslated bytes 94 (r1 += (r2 + 32)))
95 ;; ;; Assume this is rare, so don't worry about the 95 (if (r1 < #xd800) ; 2 untranslated bytes
96 ;; ;; overhead of the call. 96 ;; ;; Assume this is rare, so don't worry about the
97 ;; (call mule-utf-16-untrans) 97 ;; ;; overhead of the call.
98 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) 98 ;; (call mule-utf-16-untrans)
99 (r1 = 15037)) ; U+fffd 99 ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
100 (if (r1 < #xe000) ; surrogate 100 (r1 = 15037)) ; U+fffd
101 ;; ((call mule-utf-16-untrans) 101 (if (r1 < #xe000) ; surrogate
102 ;; (write-multibyte-character r0 r1) 102 ;; ((call mule-utf-16-untrans)
103 ;; (read r3 r4) 103 ;; (write-multibyte-character r0 r1)
104 ;; (call mule-utf-16-untrans)) 104 ;; (read r3 r4)
105 ((read r3 r4) 105 ;; (call mule-utf-16-untrans))
106 (r0 = ,(charset-id 'mule-unicode-e000-ffff)) 106 ((read r3 r4)
107 (r1 = 15037)) 107 (r0 = ,(charset-id 'mule-unicode-e000-ffff))
108 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) 108 (r1 = 15037))
109 (r1 -= #xe000) 109 ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
110 (r2 = (((r1 / 96) + 32) << 7)) 110 (r1 -= #xe000)
111 (r1 %= 96) 111 (r2 = (((r1 / 96) + 32) << 7))
112 (r1 += (r2 + 32))))))))))))) 112 (r1 %= 96)
113 (r1 += (r2 + 32))))))))))))))
113 114
114 (define-ccl-program ccl-decode-mule-utf-16-le 115 (define-ccl-program ccl-decode-mule-utf-16-le
115 `(2 ; 2 bytes -> 1 to 4 bytes 116 `(2 ; 2 bytes -> 1 to 4 bytes
116 ((read r0 r1) ; signature 117 ((read r0 r1) ; signature
117 (loop 118 (loop
118 (read r3 r4) 119 (read r3 r4)
119 (r1 = (r4 <8 r3)) 120 (r1 = (r4 <8 r3))
120 ,utf-16-decode-ucs 121 ,utf-16-decode-ucs
121 (translate-character utf-8-translation-table-for-decode r0 r1) 122 (translate-character utf-translation-table-for-decode r0 r1)
122 (write-multibyte-character r0 r1) 123 (write-multibyte-character r0 r1)
123 (repeat)))) 124 (repeat))))
124 "Decode little endian UTF-16 (ignoring signature bytes). 125 "Decode little endian UTF-16 (ignoring signature bytes).
125 Basic decoding is done into the charsets ascii, latin-iso8859-1 and 126 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
126 mule-unicode-*. Un-representable Unicode characters are 127 mule-unicode-*. Un-representable Unicode characters are decoded as
127 decoded as U+fffd. The result is run through translation table 128 U+fffd. The result is run through the translation-table named
128 `utf-8-translation-table-for-decode' if that is defined.") 129 `utf-translation-table-for-decode'.")
129 130
130 (define-ccl-program ccl-decode-mule-utf-16-be 131 (define-ccl-program ccl-decode-mule-utf-16-be
131 `(2 ; 2 bytes -> 1 to 4 bytes 132 `(2 ; 2 bytes -> 1 to 4 bytes
132 ((read r0 r1) ; signature 133 ((read r0 r1) ; signature
133 (loop 134 (loop
134 (read r3 r4) 135 (read r3 r4)
135 (r1 = (r3 <8 r4)) 136 (r1 = (r3 <8 r4))
136 ,utf-16-decode-ucs 137 ,utf-16-decode-ucs
137 (translate-character utf-8-translation-table-for-decode r0 r1) 138 (translate-character utf-translation-table-for-decode r0 r1)
138 (write-multibyte-character r0 r1) 139 (write-multibyte-character r0 r1)
139 (repeat)))) 140 (repeat))))
140 "Decode big endian UTF-16 (ignoring signature bytes). 141 "Decode big endian UTF-16 (ignoring signature bytes).
141 Basic decoding is done into the charsets ascii, latin-iso8859-1 and 142 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
142 mule-unicode-*. Un-representable Unicode characters are 143 mule-unicode-*. Un-representable Unicode characters are
143 decoded as U+fffd. The result is run through translation table 144 decoded as U+fffd. The result is run through the translation-table of
144 `utf-8-non-latin-8859-table'.") 145 name `utf-translation-table-for-decode'.")
145 146
146 (makunbound 'utf-16-decode-ucs) ; done with it 147 (makunbound 'utf-16-decode-ucs) ; done with it
147 148
148 (eval-and-compile 149 (eval-and-compile
149 (defconst utf-16-decode-to-ucs 150 (defconst utf-16-decode-to-ucs
174 `(1 175 `(1
175 ((write #xff) 176 ((write #xff)
176 (write #xfe) 177 (write #xfe)
177 (loop 178 (loop
178 (read-multibyte-character r0 r1) 179 (read-multibyte-character r0 r1)
179 (translate-character ucs-mule-to-mule-unicode r0 r1) 180 (lookup-character utf-subst-table-for-encode r0 r1)
180 ,utf-16-decode-to-ucs 181 (if (r7 == 0)
182 ((translate-character utf-translation-table-for-encode r0 r1)
183 ,utf-16-decode-to-ucs))
181 (write (r0 & 255)) 184 (write (r0 & 255))
182 (write (r0 >> 8)) 185 (write (r0 >> 8))
183 (repeat)))) 186 (repeat))))
184 "Encode to little endian UTF-16 with signature. 187 "Encode to little endian UTF-16 with signature.
185 Characters from the charsets ascii, eight-bit-control, 188 Characters from the charsets ascii, eight-bit-control,
186 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded 189 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded
187 after translation through the table `ucs-mule-to-mule-unicode'. 190 after translation through the translation-table of name
191 `utf-translation-table-for-encode'.
188 Others are encoded as U+FFFD.") 192 Others are encoded as U+FFFD.")
189 193
190 (define-ccl-program ccl-encode-mule-utf-16-be 194 (define-ccl-program ccl-encode-mule-utf-16-be
191 `(1 195 `(1
192 ((write #xfe) 196 ((write #xfe)
193 (write #xff) 197 (write #xff)
194 (loop 198 (loop
195 (read-multibyte-character r0 r1) 199 (read-multibyte-character r0 r1)
196 (translate-character ucs-mule-to-mule-unicode r0 r1) 200 (lookup-character utf-subst-table-for-encode r0 r1)
197 ,utf-16-decode-to-ucs 201 (if (r7 == 0)
202 ((translate-character utf-translation-table-for-encode r0 r1)
203 ,utf-16-decode-to-ucs))
198 (write (r0 >> 8)) 204 (write (r0 >> 8))
199 (write (r0 & 255)) 205 (write (r0 & 255))
200 (repeat)))) 206 (repeat))))
201 "Encode to big endian UTF-16 with signature. 207 "Encode to big endian UTF-16 with signature.
202 Characters from the charsets ascii, eight-bit-control, 208 Characters from the charsets ascii, eight-bit-control,
203 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded 209 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded
204 after translation through the table `ucs-mule-to-mule-unicode'. 210 after translation through the translation-table named
211 `utf-translation-table-for-encode'.
205 Others are encoded as U+FFFD.") 212 Others are encoded as U+FFFD.")
206 213
207 (makunbound 'utf-16-decode-to-ucs) 214 (makunbound 'utf-16-decode-to-ucs)
208 215
209 (let ((doc " 216 (let ((doc "
210 217
211 Assumes and ignores the leading two-byte signature. 218 Assumes and ignores the leading two-byte signature.
212 219
213 The supported Emacs character sets are the following, plus others 220 It supports Unicode characters of these ranges:
214 which may be included in the translation table 221 U+0000..U+33FF, U+E000..U+FFFF.
215 `ucs-mule-to-mule-unicode': 222 They correspond to these Emacs character sets:
216 ascii 223 ascii, latin-iso8859-1, mule-unicode-0100-24ff,
217 eight-bit-control 224 mule-unicode-2500-33ff, mule-unicode-e000-ffff
218 latin-iso8859-1 225
219 mule-unicode-0100-24ff 226 On decoding (e.g. reading a file), Unicode characters not in the above
220 mule-unicode-2500-33ff 227 ranges are decoded as U+FFFD, effectively corrupting the data
221 mule-unicode-e000-ffff 228 if they are re-encoded.
222 229
223 Note that Unicode characters out of the ranges U+0000-U+33FF and 230 On encoding (e.g. writing a file), Emacs characters not belonging to
224 U+E200-U+FFFF are decoded as U+FFFD, effectively corrupting the data 231 any of the character sets listed above are encoded into the byte
225 if they are re-encoded. Emacs characters without Unicode conversions 232 sequence representing U+FFFD (REPLACEMENT CHARACTER)."))
226 are encoded as U+FFFD."))
227 (make-coding-system 233 (make-coding-system
228 'mule-utf-16-le 4 234 'mule-utf-16-le 4
229 ?u ; Mule-UCS uses ?U, but code-pages uses that for koi8-u. 235 ?u ; Mule-UCS uses ?U, but code-pages uses that for koi8-u.
230 (concat 236 (concat
231 "Little endian UTF-16 encoding for Emacs-supported Unicode characters." 237 "Little endian UTF-16 encoding for Emacs-supported Unicode characters."
240 mule-unicode-2500-33ff 246 mule-unicode-2500-33ff
241 mule-unicode-e000-ffff) 247 mule-unicode-e000-ffff)
242 (mime-charset . utf-16le) 248 (mime-charset . utf-16le)
243 (coding-category . coding-category-utf-16-le) 249 (coding-category . coding-category-utf-16-le)
244 (valid-codes (0 . 255)) 250 (valid-codes (0 . 255))
245 (pre-write-conversion . utf-16-le-pre-write-conversion))) 251 (pre-write-conversion . utf-16-le-pre-write-conversion)
252 (dependency unify-8859-on-encoding-mode
253 unify-8859-on-decoding-mode
254 utf-fragment-on-decoding
255 utf-translate-cjk)))
246 256
247 (make-coding-system 257 (make-coding-system
248 'mule-utf-16-be 4 ?u 258 'mule-utf-16-be 4 ?u
249 (concat 259 (concat
250 "Big endian UTF-16 encoding for Emacs-supported Unicode characters." 260 "Big endian UTF-16 encoding for Emacs-supported Unicode characters."
259 mule-unicode-2500-33ff 269 mule-unicode-2500-33ff
260 mule-unicode-e000-ffff) 270 mule-unicode-e000-ffff)
261 (mime-charset . utf-16be) 271 (mime-charset . utf-16be)
262 (coding-category . coding-category-utf-16-be) 272 (coding-category . coding-category-utf-16-be)
263 (valid-codes (0 . 255)) 273 (valid-codes (0 . 255))
264 (pre-write-conversion . utf-16-be-pre-write-conversion))) 274 (pre-write-conversion . utf-16-be-pre-write-conversion)
265 275 (dependency unify-8859-on-encoding-mode
266 (register-char-codings 'mule-utf-16-le ucs-mule-to-mule-unicode) 276 unify-8859-on-decoding-mode
267 (register-char-codings 'mule-utf-16-be ucs-mule-to-mule-unicode)) 277 utf-fragment-on-decoding
278 utf-translate-cjk))))
268 279
269 (define-coding-system-alias 'utf-16-le 'mule-utf-16-le) 280 (define-coding-system-alias 'utf-16-le 'mule-utf-16-le)
270 (define-coding-system-alias 'utf-16-be 'mule-utf-16-be) 281 (define-coding-system-alias 'utf-16-be 'mule-utf-16-be)
271 282
272 (provide 'utf-16) 283 (provide 'utf-16)