Mercurial > emacs
comparison lisp/international/utf-16.el @ 50349:12444cb90785
(ccl-decode-mule-utf-16-le): Don't assume the signature bytes.
(ccl-decode-mule-utf-16-be): Likewise.
(ccl-encode-mule-utf-16-le): Don't produce the signature bytes.
(ccl-encode-mule-utf-16-be): Likewise.
author | Kenichi Handa <handa@m17n.org> |
---|---|
date | Mon, 31 Mar 2003 01:48:23 +0000 |
parents | 0d8b17d428b5 |
children | 15382232cf57 |
comparison
equal
deleted
inserted
replaced
50348:76c478507d78 | 50349:12444cb90785 |
---|---|
66 ;; Intended for untranslatable utf-16 sequences.") | 66 ;; Intended for untranslatable utf-16 sequences.") |
67 | 67 |
68 ;; Needed in macro expansion, so can't be let-bound. Zapped after use. | 68 ;; Needed in macro expansion, so can't be let-bound. Zapped after use. |
69 (eval-and-compile | 69 (eval-and-compile |
70 (defconst utf-16-decode-ucs | 70 (defconst utf-16-decode-ucs |
71 ;; We have the unicode in r1. Output is charset ID in r0, code point | 71 ;; We have the unicode in r1. Output is charset ID in r0, code |
72 ;; in r1. | 72 ;; point in r1. As r6 keeps endian information, the value should |
73 ;; not be changed. | |
73 `((lookup-integer utf-subst-table-for-decode r1 r3) | 74 `((lookup-integer utf-subst-table-for-decode r1 r3) |
74 (if r7 ; got a translation | 75 (if r7 ; got a translation |
75 ((r0 = r1) (r1 = r3)) | 76 ((r0 = r1) (r1 = r3)) |
76 (if (r1 < 128) | 77 (if (r1 < 128) |
77 (r0 = ,(charset-id 'ascii)) | 78 (r0 = ,(charset-id 'ascii)) |
112 (r1 %= 96) | 113 (r1 %= 96) |
113 (r1 += (r2 + 32)))))))))))))) | 114 (r1 += (r2 + 32)))))))))))))) |
114 | 115 |
115 (define-ccl-program ccl-decode-mule-utf-16-le | 116 (define-ccl-program ccl-decode-mule-utf-16-le |
116 `(2 ; 2 bytes -> 1 to 4 bytes | 117 `(2 ; 2 bytes -> 1 to 4 bytes |
117 ((read r0 r1) ; signature | 118 ((loop |
118 (loop | |
119 (read r3 r4) | 119 (read r3 r4) |
120 (r1 = (r4 <8 r3)) | 120 (r1 = (r4 <8 r3)) |
121 ,utf-16-decode-ucs | 121 ,utf-16-decode-ucs |
122 (translate-character utf-translation-table-for-decode r0 r1) | 122 (translate-character utf-translation-table-for-decode r0 r1) |
123 (write-multibyte-character r0 r1) | 123 (write-multibyte-character r0 r1) |
124 (repeat)))) | 124 (repeat)))) |
125 "Decode little endian UTF-16 (ignoring signature bytes). | 125 "Decode UTF-16LE (little endian without signature bytes). |
126 Basic decoding is done into the charsets ascii, latin-iso8859-1 and | 126 Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
127 mule-unicode-*. Un-representable Unicode characters are decoded as | 127 mule-unicode-*. Un-representable Unicode characters are decoded as |
128 U+fffd. The result is run through the translation-table named | 128 U+fffd. The result is run through the translation-table named |
129 `utf-translation-table-for-decode'.") | 129 `utf-translation-table-for-decode'.") |
130 | 130 |
131 (define-ccl-program ccl-decode-mule-utf-16-be | 131 (define-ccl-program ccl-decode-mule-utf-16-be |
132 `(2 ; 2 bytes -> 1 to 4 bytes | 132 `(2 ; 2 bytes -> 1 to 4 bytes |
133 ((read r0 r1) ; signature | 133 ((loop |
134 (loop | |
135 (read r3 r4) | 134 (read r3 r4) |
136 (r1 = (r3 <8 r4)) | 135 (r1 = (r3 <8 r4)) |
137 ,utf-16-decode-ucs | 136 ,utf-16-decode-ucs |
138 (translate-character utf-translation-table-for-decode r0 r1) | 137 (translate-character utf-translation-table-for-decode r0 r1) |
139 (write-multibyte-character r0 r1) | 138 (write-multibyte-character r0 r1) |
140 (repeat)))) | 139 (repeat)))) |
141 "Decode big endian UTF-16 (ignoring signature bytes). | 140 "Decode UTF-16BE (big endian without signature bytes). |
142 Basic decoding is done into the charsets ascii, latin-iso8859-1 and | 141 Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
143 mule-unicode-*. Un-representable Unicode characters are | 142 mule-unicode-*. Un-representable Unicode characters are |
144 decoded as U+fffd. The result is run through the translation-table of | 143 decoded as U+fffd. The result is run through the translation-table of |
145 name `utf-translation-table-for-decode'.") | 144 name `utf-translation-table-for-decode'.") |
146 | 145 |
171 (r0 = (r3 + #xe000)) | 170 (r0 = (r3 + #xe000)) |
172 (r0 = #xfffd))))))))))) | 171 (r0 = #xfffd))))))))))) |
173 | 172 |
174 (define-ccl-program ccl-encode-mule-utf-16-le | 173 (define-ccl-program ccl-encode-mule-utf-16-le |
175 `(1 | 174 `(1 |
176 ((write #xff) | 175 ((loop |
177 (write #xfe) | |
178 (loop | |
179 (read-multibyte-character r0 r1) | 176 (read-multibyte-character r0 r1) |
180 (lookup-character utf-subst-table-for-encode r0 r1) | 177 (lookup-character utf-subst-table-for-encode r0 r1) |
181 (if (r7 == 0) | 178 (if (r7 == 0) |
182 ((translate-character utf-translation-table-for-encode r0 r1) | 179 ((translate-character utf-translation-table-for-encode r0 r1) |
183 ,utf-16-decode-to-ucs)) | 180 ,utf-16-decode-to-ucs)) |
184 (write (r0 & 255)) | 181 (write (r0 & 255)) |
185 (write (r0 >> 8)) | 182 (write (r0 >> 8)) |
186 (repeat)))) | 183 (repeat)))) |
187 "Encode to little endian UTF-16 with signature. | 184 "Encode to UTF-16LE (little endian without signature). |
188 Characters from the charsets ascii, eight-bit-control, | 185 Characters from the charsets ascii, eight-bit-control, |
189 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded | 186 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded |
190 after translation through the translation-table of name | 187 after translation through the translation-table of name |
191 `utf-translation-table-for-encode'. | 188 `utf-translation-table-for-encode'. |
192 Others are encoded as U+FFFD.") | 189 Others are encoded as U+FFFD.") |
193 | 190 |
194 (define-ccl-program ccl-encode-mule-utf-16-be | 191 (define-ccl-program ccl-encode-mule-utf-16-be |
195 `(1 | 192 `(1 |
196 ((write #xfe) | 193 ((loop |
197 (write #xff) | |
198 (loop | |
199 (read-multibyte-character r0 r1) | 194 (read-multibyte-character r0 r1) |
200 (lookup-character utf-subst-table-for-encode r0 r1) | 195 (lookup-character utf-subst-table-for-encode r0 r1) |
201 (if (r7 == 0) | 196 (if (r7 == 0) |
202 ((translate-character utf-translation-table-for-encode r0 r1) | 197 ((translate-character utf-translation-table-for-encode r0 r1) |
203 ,utf-16-decode-to-ucs)) | 198 ,utf-16-decode-to-ucs)) |
204 (write (r0 >> 8)) | 199 (write (r0 >> 8)) |
205 (write (r0 & 255)) | 200 (write (r0 & 255)) |
206 (repeat)))) | 201 (repeat)))) |
207 "Encode to big endian UTF-16 with signature. | 202 "Encode to UTF-16BE (big endian without signature). |
208 Characters from the charsets ascii, eight-bit-control, | 203 Characters from the charsets ascii, eight-bit-control, |
209 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded | 204 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded |
210 after translation through the translation-table named | 205 after translation through the translation-table named |
211 `utf-translation-table-for-encode'. | 206 `utf-translation-table-for-encode'. |
212 Others are encoded as U+FFFD.") | 207 Others are encoded as U+FFFD.") |
213 | 208 |
214 (makunbound 'utf-16-decode-to-ucs) | 209 (makunbound 'utf-16-decode-to-ucs) |
215 | 210 |
216 (let ((doc " | 211 (let ((doc " |
217 | |
218 Assumes and ignores the leading two-byte signature. | |
219 | 212 |
220 It supports Unicode characters of these ranges: | 213 It supports Unicode characters of these ranges: |
221 U+0000..U+33FF, U+E000..U+FFFF. | 214 U+0000..U+33FF, U+E000..U+FFFF. |
222 They correspond to these Emacs character sets: | 215 They correspond to these Emacs character sets: |
223 ascii, latin-iso8859-1, mule-unicode-0100-24ff, | 216 ascii, latin-iso8859-1, mule-unicode-0100-24ff, |