comparison lisp/international/utf-16.el @ 50349:12444cb90785

(ccl-decode-mule-utf-16-le): Don't assume the signature bytes.
(ccl-decode-mule-utf-16-be): Likewise.
(ccl-encode-mule-utf-16-le): Don't produce the signature bytes.
(ccl-encode-mule-utf-16-be): Likewise.
author Kenichi Handa <handa@m17n.org>
date Mon, 31 Mar 2003 01:48:23 +0000
parents 0d8b17d428b5
children 15382232cf57
comparison
equal deleted inserted replaced
50348:76c478507d78 50349:12444cb90785
66 ;; Intended for untranslatable utf-16 sequences.") 66 ;; Intended for untranslatable utf-16 sequences.")
67 67
68 ;; Needed in macro expansion, so can't be let-bound. Zapped after use. 68 ;; Needed in macro expansion, so can't be let-bound. Zapped after use.
69 (eval-and-compile 69 (eval-and-compile
70 (defconst utf-16-decode-ucs 70 (defconst utf-16-decode-ucs
71 ;; We have the unicode in r1. Output is charset ID in r0, code point 71 ;; We have the unicode in r1. Output is charset ID in r0, code
72 ;; in r1. 72 ;; point in r1. As r6 keeps endian information, the value should
73 ;; not be changed.
73 `((lookup-integer utf-subst-table-for-decode r1 r3) 74 `((lookup-integer utf-subst-table-for-decode r1 r3)
74 (if r7 ; got a translation 75 (if r7 ; got a translation
75 ((r0 = r1) (r1 = r3)) 76 ((r0 = r1) (r1 = r3))
76 (if (r1 < 128) 77 (if (r1 < 128)
77 (r0 = ,(charset-id 'ascii)) 78 (r0 = ,(charset-id 'ascii))
112 (r1 %= 96) 113 (r1 %= 96)
113 (r1 += (r2 + 32)))))))))))))) 114 (r1 += (r2 + 32))))))))))))))
114 115
115 (define-ccl-program ccl-decode-mule-utf-16-le 116 (define-ccl-program ccl-decode-mule-utf-16-le
116 `(2 ; 2 bytes -> 1 to 4 bytes 117 `(2 ; 2 bytes -> 1 to 4 bytes
117 ((read r0 r1) ; signature 118 ((loop
118 (loop
119 (read r3 r4) 119 (read r3 r4)
120 (r1 = (r4 <8 r3)) 120 (r1 = (r4 <8 r3))
121 ,utf-16-decode-ucs 121 ,utf-16-decode-ucs
122 (translate-character utf-translation-table-for-decode r0 r1) 122 (translate-character utf-translation-table-for-decode r0 r1)
123 (write-multibyte-character r0 r1) 123 (write-multibyte-character r0 r1)
124 (repeat)))) 124 (repeat))))
125 "Decode little endian UTF-16 (ignoring signature bytes). 125 "Decode UTF-16LE (little endian without signature bytes).
126 Basic decoding is done into the charsets ascii, latin-iso8859-1 and 126 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
127 mule-unicode-*. Un-representable Unicode characters are decoded as 127 mule-unicode-*. Un-representable Unicode characters are decoded as
128 U+fffd. The result is run through the translation-table named 128 U+fffd. The result is run through the translation-table named
129 `utf-translation-table-for-decode'.") 129 `utf-translation-table-for-decode'.")
130 130
131 (define-ccl-program ccl-decode-mule-utf-16-be 131 (define-ccl-program ccl-decode-mule-utf-16-be
132 `(2 ; 2 bytes -> 1 to 4 bytes 132 `(2 ; 2 bytes -> 1 to 4 bytes
133 ((read r0 r1) ; signature 133 ((loop
134 (loop
135 (read r3 r4) 134 (read r3 r4)
136 (r1 = (r3 <8 r4)) 135 (r1 = (r3 <8 r4))
137 ,utf-16-decode-ucs 136 ,utf-16-decode-ucs
138 (translate-character utf-translation-table-for-decode r0 r1) 137 (translate-character utf-translation-table-for-decode r0 r1)
139 (write-multibyte-character r0 r1) 138 (write-multibyte-character r0 r1)
140 (repeat)))) 139 (repeat))))
141 "Decode big endian UTF-16 (ignoring signature bytes). 140 "Decode UTF-16BE (big endian without signature bytes).
142 Basic decoding is done into the charsets ascii, latin-iso8859-1 and 141 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
143 mule-unicode-*. Un-representable Unicode characters are 142 mule-unicode-*. Un-representable Unicode characters are
144 decoded as U+fffd. The result is run through the translation-table of 143 decoded as U+fffd. The result is run through the translation-table of
145 name `utf-translation-table-for-decode'.") 144 name `utf-translation-table-for-decode'.")
146 145
171 (r0 = (r3 + #xe000)) 170 (r0 = (r3 + #xe000))
172 (r0 = #xfffd))))))))))) 171 (r0 = #xfffd)))))))))))
173 172
174 (define-ccl-program ccl-encode-mule-utf-16-le 173 (define-ccl-program ccl-encode-mule-utf-16-le
175 `(1 174 `(1
176 ((write #xff) 175 ((loop
177 (write #xfe)
178 (loop
179 (read-multibyte-character r0 r1) 176 (read-multibyte-character r0 r1)
180 (lookup-character utf-subst-table-for-encode r0 r1) 177 (lookup-character utf-subst-table-for-encode r0 r1)
181 (if (r7 == 0) 178 (if (r7 == 0)
182 ((translate-character utf-translation-table-for-encode r0 r1) 179 ((translate-character utf-translation-table-for-encode r0 r1)
183 ,utf-16-decode-to-ucs)) 180 ,utf-16-decode-to-ucs))
184 (write (r0 & 255)) 181 (write (r0 & 255))
185 (write (r0 >> 8)) 182 (write (r0 >> 8))
186 (repeat)))) 183 (repeat))))
187 "Encode to little endian UTF-16 with signature. 184 "Encode to UTF-16LE (little endian without signature).
188 Characters from the charsets ascii, eight-bit-control, 185 Characters from the charsets ascii, eight-bit-control,
189 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded 186 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded
190 after translation through the translation-table of name 187 after translation through the translation-table of name
191 `utf-translation-table-for-encode'. 188 `utf-translation-table-for-encode'.
192 Others are encoded as U+FFFD.") 189 Others are encoded as U+FFFD.")
193 190
194 (define-ccl-program ccl-encode-mule-utf-16-be 191 (define-ccl-program ccl-encode-mule-utf-16-be
195 `(1 192 `(1
196 ((write #xfe) 193 ((loop
197 (write #xff)
198 (loop
199 (read-multibyte-character r0 r1) 194 (read-multibyte-character r0 r1)
200 (lookup-character utf-subst-table-for-encode r0 r1) 195 (lookup-character utf-subst-table-for-encode r0 r1)
201 (if (r7 == 0) 196 (if (r7 == 0)
202 ((translate-character utf-translation-table-for-encode r0 r1) 197 ((translate-character utf-translation-table-for-encode r0 r1)
203 ,utf-16-decode-to-ucs)) 198 ,utf-16-decode-to-ucs))
204 (write (r0 >> 8)) 199 (write (r0 >> 8))
205 (write (r0 & 255)) 200 (write (r0 & 255))
206 (repeat)))) 201 (repeat))))
207 "Encode to big endian UTF-16 with signature. 202 "Encode to UTF-16BE (big endian without signature).
208 Characters from the charsets ascii, eight-bit-control, 203 Characters from the charsets ascii, eight-bit-control,
209 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded 204 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded
210 after translation through the translation-table named 205 after translation through the translation-table named
211 `utf-translation-table-for-encode'. 206 `utf-translation-table-for-encode'.
212 Others are encoded as U+FFFD.") 207 Others are encoded as U+FFFD.")
213 208
214 (makunbound 'utf-16-decode-to-ucs) 209 (makunbound 'utf-16-decode-to-ucs)
215 210
216 (let ((doc " 211 (let ((doc "
217
218 Assumes and ignores the leading two-byte signature.
219 212
220 It supports Unicode characters of these ranges: 213 It supports Unicode characters of these ranges:
221 U+0000..U+33FF, U+E000..U+FFFF. 214 U+0000..U+33FF, U+E000..U+FFFF.
222 They correspond to these Emacs character sets: 215 They correspond to these Emacs character sets:
223 ascii, latin-iso8859-1, mule-unicode-0100-24ff, 216 ascii, latin-iso8859-1, mule-unicode-0100-24ff,