comparison lisp/international/utf-16.el @ 50483:15382232cf57

(utf-16-le-decode-loop) (utf-16-be-decode-loop): New temporary variables. (ccl-decode-mule-utf-16-le): Use utf-16-le-decode-loop. (ccl-decode-mule-utf-16-be): Use utf-16-be-decode-loop (ccl-decode-mule-utf-16-le-with-signature) (ccl-decode-mule-utf-16-be-with-signature) (ccl-decode-mule-utf-16): New CCL programs. (utf-16-le-encode-loop, utf-16-be-encode-loop): New temporary variables. (ccl-encode-mule-utf-16-le): Use utf-16-le-encode-loop. (ccl-encode-mule-utf-16-be): Use utf-16-be-encode-loop (ccl-encode-mule-utf-16-le-with-signature) (ccl-encode-mule-utf-16-be-with-signature): New CCL programs. (mule-utf-16-post-read-conversion): New function. (mule-utf-16-le-with-signature, mule-utf-16-be-with-signature), (mule-utf-16): New coding systems. (utf-16-le-with-signature, utf-16-be-with-signature), (utf-16): Aliases of the above coding systems.
author Kenichi Handa <handa@m17n.org>
date Tue, 08 Apr 2003 07:23:44 +0000
parents 12444cb90785
children bc72d6855260
comparison
equal deleted inserted replaced
50482:ceac595b0881 50483:15382232cf57
67 67
68 ;; Needed in macro expansion, so can't be let-bound. Zapped after use. 68 ;; Needed in macro expansion, so can't be let-bound. Zapped after use.
69 (eval-and-compile 69 (eval-and-compile
70 (defconst utf-16-decode-ucs 70 (defconst utf-16-decode-ucs
71 ;; We have the unicode in r1. Output is charset ID in r0, code 71 ;; We have the unicode in r1. Output is charset ID in r0, code
72 ;; point in r1. As r6 keeps endian information, the value should 72 ;; point in r1.
73 ;; not be changed.
74 `((lookup-integer utf-subst-table-for-decode r1 r3) 73 `((lookup-integer utf-subst-table-for-decode r1 r3)
75 (if r7 ; got a translation 74 (if r7 ; got a translation
76 ((r0 = r1) (r1 = r3)) 75 ((r0 = r1) (r1 = r3))
77 (if (r1 < 128) 76 (if (r1 < 128)
78 (r0 = ,(charset-id 'ascii)) 77 (r0 = ,(charset-id 'ascii))
109 (r1 = 15037)) 108 (r1 = 15037))
110 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) 109 ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
111 (r1 -= #xe000) 110 (r1 -= #xe000)
112 (r2 = (((r1 / 96) + 32) << 7)) 111 (r2 = (((r1 / 96) + 32) << 7))
113 (r1 %= 96) 112 (r1 %= 96)
114 (r1 += (r2 + 32)))))))))))))) 113 (r1 += (r2 + 32)))))))))))))
114
115 (defconst utf-16-le-decode-loop
116 `(loop
117 (read r3 r4)
118 (r1 = (r4 <8 r3))
119 ,utf-16-decode-ucs
120 (translate-character utf-translation-table-for-decode r0 r1)
121 (write-multibyte-character r0 r1)
122 (repeat)))
123
124 (defconst utf-16-be-decode-loop
125 `(loop
126 (read r3 r4)
127 (r1 = (r3 <8 r4))
128 ,@utf-16-decode-ucs
129 (translate-character utf-translation-table-for-decode r0 r1)
130 (write-multibyte-character r0 r1)
131 (repeat)))
132
133 )
115 134
116 (define-ccl-program ccl-decode-mule-utf-16-le 135 (define-ccl-program ccl-decode-mule-utf-16-le
117 `(2 ; 2 bytes -> 1 to 4 bytes 136 `(2 ; 2 bytes -> 1 to 4 bytes
118 ((loop 137 ,utf-16-le-decode-loop)
119 (read r3 r4)
120 (r1 = (r4 <8 r3))
121 ,utf-16-decode-ucs
122 (translate-character utf-translation-table-for-decode r0 r1)
123 (write-multibyte-character r0 r1)
124 (repeat))))
125 "Decode UTF-16LE (little endian without signature bytes). 138 "Decode UTF-16LE (little endian without signature bytes).
126 Basic decoding is done into the charsets ascii, latin-iso8859-1 and 139 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
127 mule-unicode-*. Un-representable Unicode characters are decoded as 140 mule-unicode-*. Un-representable Unicode characters are decoded as
128 U+fffd. The result is run through the translation-table named 141 U+fffd. The result is run through the translation-table named
129 `utf-translation-table-for-decode'.") 142 `utf-translation-table-for-decode'.")
130 143
131 (define-ccl-program ccl-decode-mule-utf-16-be 144 (define-ccl-program ccl-decode-mule-utf-16-be
132 `(2 ; 2 bytes -> 1 to 4 bytes 145 `(2 ; 2 bytes -> 1 to 4 bytes
133 ((loop 146 ,utf-16-be-decode-loop)
134 (read r3 r4)
135 (r1 = (r3 <8 r4))
136 ,utf-16-decode-ucs
137 (translate-character utf-translation-table-for-decode r0 r1)
138 (write-multibyte-character r0 r1)
139 (repeat))))
140 "Decode UTF-16BE (big endian without signature bytes). 147 "Decode UTF-16BE (big endian without signature bytes).
141 Basic decoding is done into the charsets ascii, latin-iso8859-1 and 148 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
142 mule-unicode-*. Un-representable Unicode characters are 149 mule-unicode-*. Un-representable Unicode characters are
143 decoded as U+fffd. The result is run through the translation-table of 150 decoded as U+fffd. The result is run through the translation-table of
144 name `utf-translation-table-for-decode'.") 151 name `utf-translation-table-for-decode'.")
145 152
153 (define-ccl-program ccl-decode-mule-utf-16-le-with-signature
154 `(2
155 ((read r3 r4)
156 ,utf-16-le-decode-loop))
157 "Like ccl-decode-utf-16-le but skip the first 2-byte BOM.")
158
159 (define-ccl-program ccl-decode-mule-utf-16-be-with-signature
160 `(2
161 ((read r3 r4)
162 ,utf-16-be-decode-loop))
163 "Like ccl-decode-utf-16-be but skip the first 2-byte BOM.")
164
165 (define-ccl-program ccl-decode-mule-utf-16
166 `(2
167 ((read r3 r4)
168 (r1 = (r3 <8 r4))
169 (if (r1 == #xFFFE)
170 ;; R1 is a BOM for little endian. We keep this character as
171 ;; is temporarily. It is removed by post-read-conversion
172 ;; function.
173 (,@utf-16-decode-ucs
174 (write-multibyte-character r0 r1)
175 ,utf-16-le-decode-loop)
176 ((if (r1 == #xFEFF)
177 ;; R1 is a BOM for big endian, but we can't keep that
178 ;; character in the output because it can't be
179 ;; distinguished with the normal U+FEFF. So, we keep
180 ;; #xFFFF instead.
181 ((r1 = #xFFFF)
182 ,@utf-16-decode-ucs)
183 ;; R1 a normal Unicode character.
184 (,@utf-16-decode-ucs
185 (translate-character utf-translation-table-for-decode r0 r1)))
186 (write-multibyte-character r0 r1)
187 ,utf-16-be-decode-loop))))
188 "Like ccl-decode-utf-16-be/le but check the first BOM.")
189
146 (makunbound 'utf-16-decode-ucs) ; done with it 190 (makunbound 'utf-16-decode-ucs) ; done with it
191 (makunbound 'utf-16-le-decode-loop)
192 (makunbound 'utf-16-be-decode-loop)
147 193
148 (eval-and-compile 194 (eval-and-compile
149 (defconst utf-16-decode-to-ucs 195 (defconst utf-16-decode-to-ucs
150 ;; CCL which, given the result of a multibyte read in r0 and r1, 196 ;; CCL which, given the result of a multibyte read in r0 and r1,
151 ;; sets r0 to the character's Unicode if the charset is one of the 197 ;; sets r0 to the character's Unicode if the charset is one of the
166 (r0 = (r3 + #x100)) 212 (r0 = (r3 + #x100))
167 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) 213 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
168 (r0 = (r3 + #x2500)) 214 (r0 = (r3 + #x2500))
169 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) 215 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
170 (r0 = (r3 + #xe000)) 216 (r0 = (r3 + #xe000))
171 (r0 = #xfffd))))))))))) 217 (r0 = #xfffd))))))))))
218
219 (defconst utf-16-le-encode-loop
220 `(loop
221 (read-multibyte-character r0 r1)
222 (lookup-character utf-subst-table-for-encode r0 r1)
223 (if (r7 == 0)
224 ((translate-character utf-translation-table-for-encode r0 r1)
225 ,utf-16-decode-to-ucs))
226 (write (r0 & 255))
227 (write (r0 >> 8))
228 (repeat)))
229
230 (defconst utf-16-be-encode-loop
231 `(loop
232 (read-multibyte-character r0 r1)
233 (lookup-character utf-subst-table-for-encode r0 r1)
234 (if (r7 == 0)
235 ((translate-character utf-translation-table-for-encode r0 r1)
236 ,utf-16-decode-to-ucs))
237 (write (r0 >> 8))
238 (write (r0 & 255))
239 (repeat)))
240 )
172 241
173 (define-ccl-program ccl-encode-mule-utf-16-le 242 (define-ccl-program ccl-encode-mule-utf-16-le
174 `(1 243 `(1
175 ((loop 244 ,utf-16-le-encode-loop)
176 (read-multibyte-character r0 r1)
177 (lookup-character utf-subst-table-for-encode r0 r1)
178 (if (r7 == 0)
179 ((translate-character utf-translation-table-for-encode r0 r1)
180 ,utf-16-decode-to-ucs))
181 (write (r0 & 255))
182 (write (r0 >> 8))
183 (repeat))))
184 "Encode to UTF-16LE (little endian without signature). 245 "Encode to UTF-16LE (little endian without signature).
185 Characters from the charsets ascii, eight-bit-control, 246 Characters from the charsets ascii, eight-bit-control,
186 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded 247 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded
187 after translation through the translation-table of name 248 after translation through the translation-table of name
188 `utf-translation-table-for-encode'. 249 `utf-translation-table-for-encode'.
189 Others are encoded as U+FFFD.") 250 Others are encoded as U+FFFD.")
190 251
191 (define-ccl-program ccl-encode-mule-utf-16-be 252 (define-ccl-program ccl-encode-mule-utf-16-be
192 `(1 253 `(1
193 ((loop 254 ,utf-16-be-encode-loop)
194 (read-multibyte-character r0 r1)
195 (lookup-character utf-subst-table-for-encode r0 r1)
196 (if (r7 == 0)
197 ((translate-character utf-translation-table-for-encode r0 r1)
198 ,utf-16-decode-to-ucs))
199 (write (r0 >> 8))
200 (write (r0 & 255))
201 (repeat))))
202 "Encode to UTF-16BE (big endian without signature). 255 "Encode to UTF-16BE (big endian without signature).
203 Characters from the charsets ascii, eight-bit-control, 256 Characters from the charsets ascii, eight-bit-control,
204 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded 257 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded
205 after translation through the translation-table named 258 after translation through the translation-table named
206 `utf-translation-table-for-encode'. 259 `utf-translation-table-for-encode'.
207 Others are encoded as U+FFFD.") 260 Others are encoded as U+FFFD.")
208 261
262 (define-ccl-program ccl-encode-mule-utf-16-le-with-signature
263 `(1
264 ((write #xFF)
265 (write #xFE)
266 ,utf-16-le-encode-loop))
267 "Encode to UTF-16 (little endian with signature).
268 Characters from the charsets ascii, eight-bit-control,
269 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded
270 after translation through the translation-table of name
271 `utf-translation-table-for-encode'.
272 Others are encoded as U+FFFD.")
273
274 (define-ccl-program ccl-encode-mule-utf-16-be-with-signature
275 `(1
276 ((write #xFE)
277 (write #xFF)
278 ,utf-16-be-encode-loop))
279 "Encode to UTF-16 (big endian with signature).
280 Characters from the charsets ascii, eight-bit-control,
281 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded
282 after translation through the translation-table named
283 `utf-translation-table-for-encode'.
284 Others are encoded as U+FFFD.")
285
209 (makunbound 'utf-16-decode-to-ucs) 286 (makunbound 'utf-16-decode-to-ucs)
287 (makunbound 'utf-16-le-encode-loop)
288 (makunbound 'utf-16-be-encode-loop)
289
290 (defun mule-utf-16-post-read-conversion (length)
291 (when (> length 0)
292 (let ((char (following-char)))
293 (cond ((= char (decode-char 'ucs #xFFFE))
294 (delete-char 1)
295 (setq last-coding-system-used
296 (coding-system-change-text-conversion
297 last-coding-system-used
298 'mule-utf-16-le-with-signature))
299 (setq length (1- length)))
300 ((= char (decode-char 'ucs #xFFFF))
301 (delete-char 1)
302 (setq last-coding-system-used
303 (coding-system-change-text-conversion
304 last-coding-system-used
305 'mule-utf-16-be-with-signature))
306 (setq length (1- length)))
307 (t
308 (setq last-coding-system-used 'mule-utf-16-be)))))
309 length)
210 310
211 (let ((doc " 311 (let ((doc "
212 312
213 It supports Unicode characters of these ranges: 313 It supports Unicode characters of these ranges:
214 U+0000..U+33FF, U+E000..U+FFFF. 314 U+0000..U+33FF, U+E000..U+FFFF.
237 latin-iso8859-1 337 latin-iso8859-1
238 mule-unicode-0100-24ff 338 mule-unicode-0100-24ff
239 mule-unicode-2500-33ff 339 mule-unicode-2500-33ff
240 mule-unicode-e000-ffff) 340 mule-unicode-e000-ffff)
241 (mime-charset . utf-16le) 341 (mime-charset . utf-16le)
242 (coding-category . coding-category-utf-16-le)
243 (valid-codes (0 . 255)) 342 (valid-codes (0 . 255))
244 (dependency unify-8859-on-encoding-mode 343 (dependency unify-8859-on-encoding-mode
245 unify-8859-on-decoding-mode 344 unify-8859-on-decoding-mode
246 utf-fragment-on-decoding 345 utf-fragment-on-decoding
247 utf-translate-cjk))) 346 utf-translate-cjk)))
259 latin-iso8859-1 358 latin-iso8859-1
260 mule-unicode-0100-24ff 359 mule-unicode-0100-24ff
261 mule-unicode-2500-33ff 360 mule-unicode-2500-33ff
262 mule-unicode-e000-ffff) 361 mule-unicode-e000-ffff)
263 (mime-charset . utf-16be) 362 (mime-charset . utf-16be)
264 (coding-category . coding-category-utf-16-be)
265 (valid-codes (0 . 255)) 363 (valid-codes (0 . 255))
266 (dependency unify-8859-on-encoding-mode 364 (dependency unify-8859-on-encoding-mode
267 unify-8859-on-decoding-mode 365 unify-8859-on-decoding-mode
268 utf-fragment-on-decoding 366 utf-fragment-on-decoding
269 utf-translate-cjk)))) 367 utf-translate-cjk)))
368
369 (make-coding-system
370 'mule-utf-16-le-with-signature 4 ?u
371 (concat
372 "Little endian UTF-16 (with BOM) for Emacs-supported Unicode characters."
373 doc)
374
375 '(ccl-decode-mule-utf-16-le-with-signature
376 . ccl-encode-mule-utf-16-le-with-signature)
377 '((safe-charsets
378 ascii
379 eight-bit-control
380 latin-iso8859-1
381 mule-unicode-0100-24ff
382 mule-unicode-2500-33ff
383 mule-unicode-e000-ffff)
384 (coding-category . coding-category-utf-16-le)
385 (mime-charset . utf-16)
386 (valid-codes (0 . 255))
387 (dependency unify-8859-on-encoding-mode
388 unify-8859-on-decoding-mode
389 utf-fragment-on-decoding
390 utf-translate-cjk)))
391
392 (make-coding-system
393 'mule-utf-16-be-with-signature 4 ?u
394 (concat
395 "Big endian UTF-16 (with BOM) for Emacs-supported Unicode characters."
396 doc)
397
398 '(ccl-decode-mule-utf-16-be-with-signature
399 . ccl-encode-mule-utf-16-be-with-signature)
400 '((safe-charsets
401 ascii
402 eight-bit-control
403 latin-iso8859-1
404 mule-unicode-0100-24ff
405 mule-unicode-2500-33ff
406 mule-unicode-e000-ffff)
407 (coding-category . coding-category-utf-16-be)
408 (mime-charset . utf-16)
409 (valid-codes (0 . 255))
410 (dependency unify-8859-on-encoding-mode
411 unify-8859-on-decoding-mode
412 utf-fragment-on-decoding
413 utf-translate-cjk)))
414
415 (make-coding-system
416 'mule-utf-16 4 ?u
417 (concat
418 "UTF-16 (with or without BOM) for Emacs-supported Unicode characters."
419 doc)
420
421 '(ccl-decode-mule-utf-16 . ccl-encode-mule-utf-16-be-with-signature)
422 '((safe-charsets
423 ascii
424 eight-bit-control
425 latin-iso8859-1
426 mule-unicode-0100-24ff
427 mule-unicode-2500-33ff
428 mule-unicode-e000-ffff)
429 (coding-category . coding-category-utf-16-be)
430 (mime-charset . utf-16)
431 (valid-codes (0 . 255))
432 (dependency unify-8859-on-encoding-mode
433 unify-8859-on-decoding-mode
434 utf-fragment-on-decoding
435 utf-translate-cjk)
436 (post-read-conversion . mule-utf-16-post-read-conversion)))
437 )
270 438
271 (define-coding-system-alias 'utf-16-le 'mule-utf-16-le) 439 (define-coding-system-alias 'utf-16-le 'mule-utf-16-le)
272 (define-coding-system-alias 'utf-16-be 'mule-utf-16-be) 440 (define-coding-system-alias 'utf-16-be 'mule-utf-16-be)
441 (define-coding-system-alias 'utf-16-le-with-signature
442 'mule-utf-16-le-with-signature)
443 (define-coding-system-alias 'utf-16-be-with-signature
444 'mule-utf-16-be-with-signature)
445 (define-coding-system-alias 'utf-16 'mule-utf-16)
273 446
274 ;;; utf-16.el ends here 447 ;;; utf-16.el ends here