Mercurial > emacs
comparison lisp/international/utf-16.el @ 47705:797c350a7f8c
(utf-16-decode-ucs): Look up
utf-subst-table-for-decode. Fix the case where the lookup
succeeds.
(ccl-decode-mule-utf-16-le): Translate characters by
utf-translation-table-for-decode.
(ccl-decode-mule-utf-16-be): Likewise.
(ccl-encode-mule-utf-16-le): Look up utf-subst-table-for-encode
first. Translate characters by
utf-translation-table-for-encode.
(ccl-encode-mule-utf-16-be): Likewise.
(mule-utf-16-le, mule-utf-16-be): Add `dependency' property.
author | Kenichi Handa <handa@m17n.org> |
---|---|
date | Mon, 30 Sep 2002 06:37:00 +0000 |
parents | e83401f8d61c |
children | c024873f225a |
comparison
equal
deleted
inserted
replaced
47704:8a5233d2f072 | 47705:797c350a7f8c |
---|---|
66 ;; Intended for untranslatable utf-16 sequences.") | 66 ;; Intended for untranslatable utf-16 sequences.") |
67 | 67 |
68 ;; Needed in macro expansion, so can't be let-bound. Zapped after use. | 68 ;; Needed in macro expansion, so can't be let-bound. Zapped after use. |
69 (eval-and-compile | 69 (eval-and-compile |
70 (defconst utf-16-decode-ucs | 70 (defconst utf-16-decode-ucs |
71 ;; We have the unicode in r1. Output is character codes in r0, r1, | 71 ;; We have the unicode in r1. Output is charset ID in r0, code point |
72 ;; and r2 if appropriate. | 72 ;; in r1. |
73 `((lookup-integer utf-8-subst-table r0 r3) | 73 `((lookup-integer utf-subst-table-for-decode r1 r3) |
74 (if r7 (r1 = r3)) ; got a translation | 74 (if r7 ; got a translation |
75 (if (r1 < 128) | 75 ((r0 = r1) (r1 = r3)) |
76 (r0 = ,(charset-id 'ascii)) | 76 (if (r1 < 128) |
77 (if (r1 < 160) | 77 (r0 = ,(charset-id 'ascii)) |
78 (r0 = ,(charset-id 'eight-bit-control)) | 78 (if (r1 < 160) |
79 (if (r1 < 256) | 79 (r0 = ,(charset-id 'eight-bit-control)) |
80 ((r0 = ,(charset-id 'latin-iso8859-1)) | 80 (if (r1 < 256) |
81 (r1 -= 128)) | 81 ((r0 = ,(charset-id 'latin-iso8859-1)) |
82 (if (r1 < #x2500) | 82 (r1 -= 128)) |
83 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | 83 (if (r1 < #x2500) |
84 (r1 -= #x100) | 84 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) |
85 (r2 = (((r1 / 96) + 32) << 7)) | 85 (r1 -= #x100) |
86 (r1 %= 96) | 86 (r2 = (((r1 / 96) + 32) << 7)) |
87 (r1 += (r2 + 32))) | 87 (r1 %= 96) |
88 (if (r1 < #x3400) | 88 (r1 += (r2 + 32))) |
89 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) | 89 (if (r1 < #x3400) |
90 (r1 -= #x2500) | 90 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) |
91 (r2 = (((r1 / 96) + 32) << 7)) | 91 (r1 -= #x2500) |
92 (r1 %= 96) | 92 (r2 = (((r1 / 96) + 32) << 7)) |
93 (r1 += (r2 + 32))) | 93 (r1 %= 96) |
94 (if (r1 < #xd800) ; 2 untranslated bytes | 94 (r1 += (r2 + 32))) |
95 ;; ;; Assume this is rare, so don't worry about the | 95 (if (r1 < #xd800) ; 2 untranslated bytes |
96 ;; ;; overhead of the call. | 96 ;; ;; Assume this is rare, so don't worry about the |
97 ;; (call mule-utf-16-untrans) | 97 ;; ;; overhead of the call. |
98 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) | 98 ;; (call mule-utf-16-untrans) |
99 (r1 = 15037)) ; U+fffd | 99 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) |
100 (if (r1 < #xe000) ; surrogate | 100 (r1 = 15037)) ; U+fffd |
101 ;; ((call mule-utf-16-untrans) | 101 (if (r1 < #xe000) ; surrogate |
102 ;; (write-multibyte-character r0 r1) | 102 ;; ((call mule-utf-16-untrans) |
103 ;; (read r3 r4) | 103 ;; (write-multibyte-character r0 r1) |
104 ;; (call mule-utf-16-untrans)) | 104 ;; (read r3 r4) |
105 ((read r3 r4) | 105 ;; (call mule-utf-16-untrans)) |
106 (r0 = ,(charset-id 'mule-unicode-e000-ffff)) | 106 ((read r3 r4) |
107 (r1 = 15037)) | 107 (r0 = ,(charset-id 'mule-unicode-e000-ffff)) |
108 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) | 108 (r1 = 15037)) |
109 (r1 -= #xe000) | 109 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) |
110 (r2 = (((r1 / 96) + 32) << 7)) | 110 (r1 -= #xe000) |
111 (r1 %= 96) | 111 (r2 = (((r1 / 96) + 32) << 7)) |
112 (r1 += (r2 + 32))))))))))))) | 112 (r1 %= 96) |
113 (r1 += (r2 + 32)))))))))))))) | |
113 | 114 |
114 (define-ccl-program ccl-decode-mule-utf-16-le | 115 (define-ccl-program ccl-decode-mule-utf-16-le |
115 `(2 ; 2 bytes -> 1 to 4 bytes | 116 `(2 ; 2 bytes -> 1 to 4 bytes |
116 ((read r0 r1) ; signature | 117 ((read r0 r1) ; signature |
117 (loop | 118 (loop |
118 (read r3 r4) | 119 (read r3 r4) |
119 (r1 = (r4 <8 r3)) | 120 (r1 = (r4 <8 r3)) |
120 ,utf-16-decode-ucs | 121 ,utf-16-decode-ucs |
121 (translate-character utf-8-translation-table-for-decode r0 r1) | 122 (translate-character utf-translation-table-for-decode r0 r1) |
122 (write-multibyte-character r0 r1) | 123 (write-multibyte-character r0 r1) |
123 (repeat)))) | 124 (repeat)))) |
124 "Decode little endian UTF-16 (ignoring signature bytes). | 125 "Decode little endian UTF-16 (ignoring signature bytes). |
125 Basic decoding is done into the charsets ascii, latin-iso8859-1 and | 126 Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
126 mule-unicode-*. Un-representable Unicode characters are | 127 mule-unicode-*. Un-representable Unicode characters are decoded as |
127 decoded as U+fffd. The result is run through translation table | 128 U+fffd. The result is run through the translation-table named |
128 `utf-8-translation-table-for-decode' if that is defined.") | 129 `utf-translation-table-for-decode'.") |
129 | 130 |
130 (define-ccl-program ccl-decode-mule-utf-16-be | 131 (define-ccl-program ccl-decode-mule-utf-16-be |
131 `(2 ; 2 bytes -> 1 to 4 bytes | 132 `(2 ; 2 bytes -> 1 to 4 bytes |
132 ((read r0 r1) ; signature | 133 ((read r0 r1) ; signature |
133 (loop | 134 (loop |
134 (read r3 r4) | 135 (read r3 r4) |
135 (r1 = (r3 <8 r4)) | 136 (r1 = (r3 <8 r4)) |
136 ,utf-16-decode-ucs | 137 ,utf-16-decode-ucs |
137 (translate-character utf-8-translation-table-for-decode r0 r1) | 138 (translate-character utf-translation-table-for-decode r0 r1) |
138 (write-multibyte-character r0 r1) | 139 (write-multibyte-character r0 r1) |
139 (repeat)))) | 140 (repeat)))) |
140 "Decode big endian UTF-16 (ignoring signature bytes). | 141 "Decode big endian UTF-16 (ignoring signature bytes). |
141 Basic decoding is done into the charsets ascii, latin-iso8859-1 and | 142 Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
142 mule-unicode-*. Un-representable Unicode characters are | 143 mule-unicode-*. Un-representable Unicode characters are |
143 decoded as U+fffd. The result is run through translation table | 144 decoded as U+fffd. The result is run through the translation-table of |
144 `utf-8-non-latin-8859-table'.") | 145 name `utf-translation-table-for-decode'.") |
145 | 146 |
146 (makunbound 'utf-16-decode-ucs) ; done with it | 147 (makunbound 'utf-16-decode-ucs) ; done with it |
147 | 148 |
148 (eval-and-compile | 149 (eval-and-compile |
149 (defconst utf-16-decode-to-ucs | 150 (defconst utf-16-decode-to-ucs |
174 `(1 | 175 `(1 |
175 ((write #xff) | 176 ((write #xff) |
176 (write #xfe) | 177 (write #xfe) |
177 (loop | 178 (loop |
178 (read-multibyte-character r0 r1) | 179 (read-multibyte-character r0 r1) |
179 (translate-character ucs-mule-to-mule-unicode r0 r1) | 180 (lookup-character utf-subst-table-for-encode r0 r1) |
180 ,utf-16-decode-to-ucs | 181 (if (r7 == 0) |
182 ((translate-character utf-translation-table-for-encode r0 r1) | |
183 ,utf-16-decode-to-ucs)) | |
181 (write (r0 & 255)) | 184 (write (r0 & 255)) |
182 (write (r0 >> 8)) | 185 (write (r0 >> 8)) |
183 (repeat)))) | 186 (repeat)))) |
184 "Encode to little endian UTF-16 with signature. | 187 "Encode to little endian UTF-16 with signature. |
185 Characters from the charsets ascii, eight-bit-control, | 188 Characters from the charsets ascii, eight-bit-control, |
186 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded | 189 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded |
187 after translation through the table `ucs-mule-to-mule-unicode'. | 190 after translation through the translation-table of name |
191 `utf-translation-table-for-encode'. | |
188 Others are encoded as U+FFFD.") | 192 Others are encoded as U+FFFD.") |
189 | 193 |
190 (define-ccl-program ccl-encode-mule-utf-16-be | 194 (define-ccl-program ccl-encode-mule-utf-16-be |
191 `(1 | 195 `(1 |
192 ((write #xfe) | 196 ((write #xfe) |
193 (write #xff) | 197 (write #xff) |
194 (loop | 198 (loop |
195 (read-multibyte-character r0 r1) | 199 (read-multibyte-character r0 r1) |
196 (translate-character ucs-mule-to-mule-unicode r0 r1) | 200 (lookup-character utf-subst-table-for-encode r0 r1) |
197 ,utf-16-decode-to-ucs | 201 (if (r7 == 0) |
202 ((translate-character utf-translation-table-for-encode r0 r1) | |
203 ,utf-16-decode-to-ucs)) | |
198 (write (r0 >> 8)) | 204 (write (r0 >> 8)) |
199 (write (r0 & 255)) | 205 (write (r0 & 255)) |
200 (repeat)))) | 206 (repeat)))) |
201 "Encode to big endian UTF-16 with signature. | 207 "Encode to big endian UTF-16 with signature. |
202 Characters from the charsets ascii, eight-bit-control, | 208 Characters from the charsets ascii, eight-bit-control, |
203 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded | 209 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded |
204 after translation through the table `ucs-mule-to-mule-unicode'. | 210 after translation through the translation-table named |
211 `utf-translation-table-for-encode'. | |
205 Others are encoded as U+FFFD.") | 212 Others are encoded as U+FFFD.") |
206 | 213 |
207 (makunbound 'utf-16-decode-to-ucs) | 214 (makunbound 'utf-16-decode-to-ucs) |
208 | 215 |
209 (let ((doc " | 216 (let ((doc " |
210 | 217 |
211 Assumes and ignores the leading two-byte signature. | 218 Assumes and ignores the leading two-byte signature. |
212 | 219 |
213 The supported Emacs character sets are the following, plus others | 220 It supports Unicode characters of these ranges: |
214 which may be included in the translation table | 221 U+0000..U+33FF, U+E000..U+FFFF. |
215 `ucs-mule-to-mule-unicode': | 222 They correspond to these Emacs character sets: |
216 ascii | 223 ascii, latin-iso8859-1, mule-unicode-0100-24ff, |
217 eight-bit-control | 224 mule-unicode-2500-33ff, mule-unicode-e000-ffff |
218 latin-iso8859-1 | 225 |
219 mule-unicode-0100-24ff | 226 On decoding (e.g. reading a file), Unicode characters not in the above |
220 mule-unicode-2500-33ff | 227 ranges are decoded as U+FFFD, effectively corrupting the data |
221 mule-unicode-e000-ffff | 228 if they are re-encoded. |
222 | 229 |
223 Note that Unicode characters out of the ranges U+0000-U+33FF and | 230 On encoding (e.g. writing a file), Emacs characters not belonging to |
224 U+E200-U+FFFF are decoded as U+FFFD, effectively corrupting the data | 231 any of the character sets listed above are encoded into the byte |
225 if they are re-encoded. Emacs characters without Unicode conversions | 232 sequence representing U+FFFD (REPLACEMENT CHARACTER).")) |
226 are encoded as U+FFFD.")) | |
227 (make-coding-system | 233 (make-coding-system |
228 'mule-utf-16-le 4 | 234 'mule-utf-16-le 4 |
229 ?u ; Mule-UCS uses ?U, but code-pages uses that for koi8-u. | 235 ?u ; Mule-UCS uses ?U, but code-pages uses that for koi8-u. |
230 (concat | 236 (concat |
231 "Little endian UTF-16 encoding for Emacs-supported Unicode characters." | 237 "Little endian UTF-16 encoding for Emacs-supported Unicode characters." |
240 mule-unicode-2500-33ff | 246 mule-unicode-2500-33ff |
241 mule-unicode-e000-ffff) | 247 mule-unicode-e000-ffff) |
242 (mime-charset . utf-16le) | 248 (mime-charset . utf-16le) |
243 (coding-category . coding-category-utf-16-le) | 249 (coding-category . coding-category-utf-16-le) |
244 (valid-codes (0 . 255)) | 250 (valid-codes (0 . 255)) |
245 (pre-write-conversion . utf-16-le-pre-write-conversion))) | 251 (pre-write-conversion . utf-16-le-pre-write-conversion) |
252 (dependency unify-8859-on-encoding-mode | |
253 unify-8859-on-decoding-mode | |
254 utf-fragment-on-decoding | |
255 utf-translate-cjk))) | |
246 | 256 |
247 (make-coding-system | 257 (make-coding-system |
248 'mule-utf-16-be 4 ?u | 258 'mule-utf-16-be 4 ?u |
249 (concat | 259 (concat |
250 "Big endian UTF-16 encoding for Emacs-supported Unicode characters." | 260 "Big endian UTF-16 encoding for Emacs-supported Unicode characters." |
259 mule-unicode-2500-33ff | 269 mule-unicode-2500-33ff |
260 mule-unicode-e000-ffff) | 270 mule-unicode-e000-ffff) |
261 (mime-charset . utf-16be) | 271 (mime-charset . utf-16be) |
262 (coding-category . coding-category-utf-16-be) | 272 (coding-category . coding-category-utf-16-be) |
263 (valid-codes (0 . 255)) | 273 (valid-codes (0 . 255)) |
264 (pre-write-conversion . utf-16-be-pre-write-conversion))) | 274 (pre-write-conversion . utf-16-be-pre-write-conversion) |
265 | 275 (dependency unify-8859-on-encoding-mode |
266 (register-char-codings 'mule-utf-16-le ucs-mule-to-mule-unicode) | 276 unify-8859-on-decoding-mode |
267 (register-char-codings 'mule-utf-16-be ucs-mule-to-mule-unicode)) | 277 utf-fragment-on-decoding |
278 utf-translate-cjk)))) | |
268 | 279 |
269 (define-coding-system-alias 'utf-16-le 'mule-utf-16-le) | 280 (define-coding-system-alias 'utf-16-le 'mule-utf-16-le) |
270 (define-coding-system-alias 'utf-16-be 'mule-utf-16-be) | 281 (define-coding-system-alias 'utf-16-be 'mule-utf-16-be) |
271 | 282 |
272 (provide 'utf-16) | 283 (provide 'utf-16) |