comparison lisp/international/utf-16.el @ 56038:b7fe21511efe

(utf-16-decode-ucs): Handle a surrogate pair correctly. Call ccl-mule-utf-untrans for untranslable chars. (utf-16le-decode-loop): Set r5 to -1 before loop. (utf-16be-decode-loop): Likewise. (ccl-decode-mule-utf-16le): Add EOF processing block. (ccl-decode-mule-utf-16be): Likewise. (ccl-decode-mule-utf-16le-with-signature): Likewise. (ccl-decode-mule-utf-16be-with-signature): Likewise. (ccl-decode-mule-utf-16): Likewise. Set r5 to -1 initially. (ccl-mule-utf-16-encode-untrans): New CCL. (utf-16-decode-to-ucs): Handle pre-read character. (utf-16le-encode-loop): Handle surrogate pair. (utf-16be-encode-loop): Likewise. (ccl-encode-mule-utf-16le-with-signature): Adjusted for the change of utf-16le-encode-loop. (ccl-encode-mule-utf-16be-with-signature): Adjusted for the change of utf-16be-encode-loop. (mule-utf-16-post-read-conversion): Call utf-8-post-read-conversion at first. (mule-utf-16[{le|be}], mule-utf-16{le|be}-with-signature): Include CJK charsets in safe-charsets if utf-translate-cjk-mode is on. Add post-read-conversion and pre-write-conversion.
author Kenichi Handa <handa@m17n.org>
date Sat, 12 Jun 2004 02:18:36 +0000
parents 6e677a935fe9
children a878c25f56db
comparison
equal deleted inserted replaced
56037:81dbb510a1db 56038:b7fe21511efe
46 46
47 ;; We end up with trivially different -le and -be versions of most 47 ;; We end up with trivially different -le and -be versions of most
48 ;; things below, sometimes with commonality abstracted into a let 48 ;; things below, sometimes with commonality abstracted into a let
49 ;; binding for maintenance convenience. 49 ;; binding for maintenance convenience.
50 50
51 ;; We'd need new charsets distinct from ascii and eight-bit-control to
52 ;; deal with untranslated sequences, since we can't otherwise
53 ;; distinguish the bytes, as we can with utf-8.
54
55 ;; ;; Do a multibyte write for bytes in r3 and r4.
56 ;; ;; Intended for untranslatable utf-16 sequences.
57 ;; (define-ccl-program ccl-mule-utf-16-untrans
58 ;; `(0
59 ;; (if (r3 < 128)
60 ;; (r0 = ,(charset-id 'ascii))
61 ;; (if (r3 < 160)
62 ;; (r0 = ,(charset-id 'eight-bit-control))
63 ;; (r0 = ,(charset-id 'eight-bit-graphic))))
64 ;; (if (r4 < 128)
65 ;; (r0 = ,(charset-id 'ascii))
66 ;; (if (r4 < 160)
67 ;; (r0 = ,(charset-id 'eight-bit-control))
68 ;; (r0 = ,(charset-id 'eight-bit-graphic))))
69 ;; (r1 = r4)))
70 ;; "Do a multibyte write for bytes in r3 and r4.
71 ;; First swap them if we're big endian, indicated by r5==0.
72 ;; Intended for untranslatable utf-16 sequences.")
73
74 ;; Needed in macro expansion, so can't be let-bound. Zapped after use. 51 ;; Needed in macro expansion, so can't be let-bound. Zapped after use.
75 (eval-and-compile 52 (eval-and-compile
76 (defconst utf-16-decode-ucs 53 (defconst utf-16-decode-ucs
77 ;; We have the unicode in r1. Output is charset ID in r0, code 54 ;; If r5 is negative, r1 is a Unicode chacter code. Otherise, r5 is
78 ;; point in r1. 55 ;; the first of a surrogate pair and r1 is the second of the pair.
79 `((lookup-integer utf-subst-table-for-decode r1 r3) 56 ;; Output is charset ID in r0, code point in r1. R0 may be set to
80 (if r7 ; got a translation 57 ;; -1 in which case a caller should not write out r1.
81 ((r0 = r1) (r1 = r3)) 58 `((if (r5 >= 0)
82 (if (r1 < 128) 59 ((r0 = (r1 < #xDC00))
83 (r0 = ,(charset-id 'ascii)) 60 (if ((r1 >= #xE000) | r0)
84 (if (r1 < 160) 61 ;; Invalid second code of surrogate pair.
85 (r0 = ,(charset-id 'eight-bit-control)) 62 ((r0 = r5)
86 (if (r1 < 256) 63 (call ccl-mule-utf-untrans))
87 ((r0 = ,(charset-id 'latin-iso8859-1)) 64 ((r1 -= #xDC00)
88 (r1 -= 128)) 65 (r1 += (((r5 - #xD800) << 10) + #x10000))))
89 (if (r1 < #x2500) 66 (r5 = -1)))
90 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) 67 (if (r1 < 128)
91 (r1 -= #x100) 68 (r0 = ,(charset-id 'ascii))
92 (r2 = (((r1 / 96) + 32) << 7)) 69 ((lookup-integer utf-subst-table-for-decode r1 r3)
93 (r1 %= 96) 70 (if r7 ; got a translation
94 (r1 += (r2 + 32))) 71 ((r0 = r1) (r1 = r3))
95 (if (r1 < #x3400) 72 (if (r1 < 160)
96 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) 73 (r0 = ,(charset-id 'eight-bit-control))
97 (r1 -= #x2500) 74 (if (r1 < 256)
98 (r2 = (((r1 / 96) + 32) << 7)) 75 ((r0 = ,(charset-id 'latin-iso8859-1))
99 (r1 %= 96) 76 (r1 -= 128))
100 (r1 += (r2 + 32))) 77 (if (r1 < #x2500)
101 (if (r1 < #xd800) ; 2 untranslated bytes 78 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
102 ;; ;; Assume this is rare, so don't worry about the 79 (r1 -= #x100)
103 ;; ;; overhead of the call. 80 (r2 = (((r1 / 96) + 32) << 7))
104 ;; (call mule-utf-16-untrans) 81 (r1 %= 96)
105 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) 82 (r1 += (r2 + 32)))
106 (r1 = 15037)) ; U+fffd 83 (if (r1 < #x3400)
107 (if (r1 < #xe000) ; surrogate 84 ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
108 ;; ((call mule-utf-16-untrans) 85 (r1 -= #x2500)
109 ;; (write-multibyte-character r0 r1) 86 (r2 = (((r1 / 96) + 32) << 7))
110 ;; (read r3 r4) 87 (r1 %= 96)
111 ;; (call mule-utf-16-untrans)) 88 (r1 += (r2 + 32)))
112 ((read r3 r4) 89 (if (r1 < #xD800)
113 (r0 = ,(charset-id 'mule-unicode-e000-ffff)) 90 ;; We can't have this character.
114 (r1 = 15037)) 91 ((r0 = r1)
115 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) 92 (call ccl-mule-utf-untrans)
116 (r1 -= #xe000) 93 (r5 = -1)
117 (r2 = (((r1 / 96) + 32) << 7)) 94 (r0 = -1))
118 (r1 %= 96) 95 (if (r1 < #xDC00)
119 (r1 += (r2 + 32))))))))))))) 96 ;; The first code of a surrogate pair.
97 ((r5 = r1)
98 (r0 = -1))
99 (if (r1 < #xE000)
100 ;; The second code of a surrogate pair, invalid.
101 ((r0 = r1)
102 (call ccl-mule-utf-untrans)
103 (r5 = -1)
104 (r0 = -1))
105 (if (r1 < #x10000)
106 ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
107 (r1 -= #xE000)
108 (r2 = (((r1 / 96) + 32) << 7))
109 (r1 %= 96)
110 (r1 += (r2 + 32)))
111 ;; We can't have this character.
112 ((r0 = r1)
113 (call ccl-mule-utf-untrans)
114 (r5 = -1)
115 (r0 = -1)))))))))))))))
120 116
121 (defconst utf-16le-decode-loop 117 (defconst utf-16le-decode-loop
122 `(loop 118 `((r5 = -1)
123 (read r3 r4) 119 (loop
124 (r1 = (r4 <8 r3)) 120 (r3 = -1)
125 ,utf-16-decode-ucs 121 (read r3 r4)
126 (translate-character utf-translation-table-for-decode r0 r1) 122 (r1 = (r4 <8 r3))
127 (write-multibyte-character r0 r1) 123 ,@utf-16-decode-ucs
128 (repeat))) 124 (if (r0 >= 0)
125 ((translate-character utf-translation-table-for-decode r0 r1)
126 (write-multibyte-character r0 r1)))
127 (repeat))))
129 128
130 (defconst utf-16be-decode-loop 129 (defconst utf-16be-decode-loop
131 `(loop 130 `((r5 = -1)
132 (read r3 r4) 131 (loop
133 (r1 = (r3 <8 r4)) 132 (r3 = -1)
134 ,@utf-16-decode-ucs 133 (read r3 r4)
135 (translate-character utf-translation-table-for-decode r0 r1) 134 (r1 = (r3 <8 r4))
136 (write-multibyte-character r0 r1) 135 ,@utf-16-decode-ucs
137 (repeat))) 136 (if (r0 >= 0)
137 ((translate-character utf-translation-table-for-decode r0 r1)
138 (write-multibyte-character r0 r1)))
139 (repeat))))
138 140
139 ) 141 )
140 142
141 (define-ccl-program ccl-decode-mule-utf-16le 143 (define-ccl-program ccl-decode-mule-utf-16le
142 `(2 ; 2 bytes -> 1 to 4 bytes 144 `(2 ; 2 bytes -> 1 to 4 bytes
143 ,utf-16le-decode-loop) 145 ,utf-16le-decode-loop
146 ((if (r5 >= 0)
147 ((r0 = r5)
148 (call ccl-mule-utf-untrans)))
149 (if (r3 < 0)
150 nil
151 ((if (r3 < #xA0)
152 (r0 = ,(charset-id 'eight-bit-control))
153 (r0 = ,(charset-id 'eight-bit-graphic)))
154 (write-multibyte-character r0 r3)))))
144 "Decode UTF-16LE (little endian without signature bytes). 155 "Decode UTF-16LE (little endian without signature bytes).
145 Basic decoding is done into the charsets ascii, latin-iso8859-1 and 156 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
146 mule-unicode-*. Un-representable Unicode characters are decoded as 157 mule-unicode-*. Un-representable Unicode characters are decoded as
147 U+fffd. The result is run through the translation-table named 158 U+fffd. The result is run through the translation-table named
148 `utf-translation-table-for-decode'.") 159 `utf-translation-table-for-decode'.")
149 160
150 (define-ccl-program ccl-decode-mule-utf-16be 161 (define-ccl-program ccl-decode-mule-utf-16be
151 `(2 ; 2 bytes -> 1 to 4 bytes 162 `(2 ; 2 bytes -> 1 to 4 bytes
152 ,utf-16be-decode-loop) 163 ,utf-16be-decode-loop
164 ((if (r5 >= 0)
165 ((r0 = r5)
166 (call ccl-mule-utf-untrans)))
167 (if (r3 >= 0)
168 ((r0 = r3)
169 (call ccl-mule-utf-untrans)))))
153 "Decode UTF-16BE (big endian without signature bytes). 170 "Decode UTF-16BE (big endian without signature bytes).
154 Basic decoding is done into the charsets ascii, latin-iso8859-1 and 171 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
155 mule-unicode-*. Un-representable Unicode characters are 172 mule-unicode-*. Un-representable Unicode characters are
156 decoded as U+fffd. The result is run through the translation-table of 173 decoded as U+fffd. The result is run through the translation-table of
157 name `utf-translation-table-for-decode'.") 174 name `utf-translation-table-for-decode'.")
158 175
159 (define-ccl-program ccl-decode-mule-utf-16le-with-signature 176 (define-ccl-program ccl-decode-mule-utf-16le-with-signature
160 `(2 177 `(2
161 ((read r3 r4) 178 ((r3 = -1)
162 ,utf-16le-decode-loop)) 179 (read r3 r4)
180 ,@utf-16le-decode-loop)
181 (if (r3 >= 0)
182 ((r0 = r3)
183 (call ccl-mule-utf-untrans))))
163 "Like ccl-decode-utf-16le but skip the first 2-byte BOM.") 184 "Like ccl-decode-utf-16le but skip the first 2-byte BOM.")
164 185
165 (define-ccl-program ccl-decode-mule-utf-16be-with-signature 186 (define-ccl-program ccl-decode-mule-utf-16be-with-signature
166 `(2 187 `(2
167 ((read r3 r4) 188 ((r3 = -1)
168 ,utf-16be-decode-loop)) 189 (read r3 r4)
190 ,@utf-16be-decode-loop)
191 (if (r3 >= 0)
192 ((r0 = r3)
193 (call ccl-mule-utf-untrans))))
169 "Like ccl-decode-utf-16be but skip the first 2-byte BOM.") 194 "Like ccl-decode-utf-16be but skip the first 2-byte BOM.")
170 195
171 (define-ccl-program ccl-decode-mule-utf-16 196 (define-ccl-program ccl-decode-mule-utf-16
172 `(2 197 `(2
173 ((read r3 r4) 198 ((r3 = -1)
199 (read r3 r4)
174 (r1 = (r3 <8 r4)) 200 (r1 = (r3 <8 r4))
201 (r5 = -1)
175 (if (r1 == #xFFFE) 202 (if (r1 == #xFFFE)
176 ;; R1 is a BOM for little endian. We keep this character as 203 ;; R1 is a BOM for little endian. We keep this character as
177 ;; is temporarily. It is removed by post-read-conversion 204 ;; is temporarily. It is removed by post-read-conversion
178 ;; function. 205 ;; function.
179 (,@utf-16-decode-ucs 206 (,@utf-16-decode-ucs
180 (write-multibyte-character r0 r1) 207 (write-multibyte-character r0 r1)
181 ,utf-16le-decode-loop) 208 ,@utf-16le-decode-loop)
182 ((if (r1 == #xFEFF) 209 ((if (r1 == #xFEFF)
183 ;; R1 is a BOM for big endian, but we can't keep that 210 ;; R1 is a BOM for big endian, but we can't keep that
184 ;; character in the output because it can't be 211 ;; character in the output because it can't be
185 ;; distinguished with the normal U+FEFF. So, we keep 212 ;; distinguished with the normal U+FEFF. So, we keep
186 ;; #xFFFF instead. 213 ;; #xFFFF instead.
187 ((r1 = #xFFFF) 214 ((r1 = #xFFFF)
188 ,@utf-16-decode-ucs) 215 ,@utf-16-decode-ucs
189 ;; R1 a normal Unicode character. 216 (write-multibyte-character r0 r1))
217 ;; R1 is a normal Unicode character.
190 (,@utf-16-decode-ucs 218 (,@utf-16-decode-ucs
191 (translate-character utf-translation-table-for-decode r0 r1))) 219 (if (r0 >= 0)
192 (write-multibyte-character r0 r1) 220 ((translate-character utf-translation-table-for-decode r0 r1)
193 ,utf-16be-decode-loop)))) 221 (write-multibyte-character r0 r1)))))
222 ,@utf-16be-decode-loop)))
223 (if (r3 >= 0)
224 ((r0 = r3)
225 (call ccl-mule-utf-untrans))))
194 "Like ccl-decode-utf-16be/le but check the first BOM.") 226 "Like ccl-decode-utf-16be/le but check the first BOM.")
195 227
196 (makunbound 'utf-16-decode-ucs) ; done with it 228 (makunbound 'utf-16-decode-ucs) ; done with it
197 (makunbound 'utf-16le-decode-loop) 229 (makunbound 'utf-16le-decode-loop)
198 (makunbound 'utf-16be-decode-loop) 230 (makunbound 'utf-16be-decode-loop)
199 231
232 ;; UTF-16 decoder generates an UTF-8 sequence represented by a
233 ;; sequence eight-bit-control/graphic chars for an invalid byte (the
234 ;; last byte of an odd length source) and an untranslatable character
235 ;; (including an invalid surrogate-pair code-point).
236 ;;
237 ;; This CCL parses that sequence (the first byte is already in r1),
238 ;; and if the sequence represents an untranslatable character, it sets
239 ;; r1 to the original invalid code or untranslated Unicode character
240 ;; code, sets r2 to -1 (to prevent r2 and r3 are written), set2 r5 to
241 ;; -1 (to tell the caller that there's no pre-read character).
242 ;;
243 ;; If the sequence represents an invalid byte, it sets r1 to -1, r2 to
244 ;; the byte, sets r3 and r5 to -1.
245 ;;
246 ;; Otherwise, don't change r1, set r2 and r3 to already read
247 ;; eight-bit-control/graphic characters (if any), set r5 and r6 to the
248 ;; last character that invalidates the UTF-8 form.
249 ;;
250 ;; Note: For UTF-8 validation, we only check if a character is
251 ;; eight-bit-control/graphic or not. It may result in incorrect
252 ;; handling of random binary data, but such a data can't be encoded by
253 ;; UTF-16 anyway. At least, UTF-16 decoder doesn't generate such a
254 ;; sequence even if a source contains invalid byte-sequence.
255
256 (define-ccl-program ccl-mule-utf-16-encode-untrans
257 `(0
258 ((r2 = -1)
259 ;; Read the 2nd byte.
260 (read-multibyte-character r5 r6)
261 (r0 = (r5 != ,(charset-id 'eight-bit-control)))
262 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
263 ((r2 = r1)
264 (r3 = -1)
265 (r1 = -1)
266 (end))) ; invalid UTF-8
267
268 (r3 = -1)
269 (r2 = r6)
270 (if (r1 <= #xE0)
271 ;; 2-byte UTF-8, i.e. originally an invalid byte.
272 ((r2 &= #x3F)
273 (r2 |= ((r1 & #x1F) << 6))
274 (r1 = -1)
275 (r5 = -1)
276 (end)))
277
278 ;; Read the 3rd byte.
279 (read-multibyte-character r5 r6)
280 (r0 = (r5 != ,(charset-id 'eight-bit-control)))
281 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
282 ((end))) ; invalid UTF-8
283
284 (if (r1 < #xF0) ; valid 3-byte UTF-8
285 ((r1 = ((r1 & #x0F) << 12))
286 (r1 |= ((r2 & #x3F) << 6))
287 (r1 |= (r6 & #x3F))
288 (r2 = -1)
289 (r5 = -1)
290 (end)))
291
292 (r3 = r6)
293 ;; Read the 4th byte.
294 (read-multibyte-character r5 r6)
295 (r0 = (r5 != ,(charset-id 'eight-bit-control)))
296 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
297 (end)) ; livalid UTF-8
298
299 ;; valid 4-byte UTF-8
300 (r1 = ((r1 & #x07) << 18))
301 (r1 |= ((r2 & #x3F) << 12))
302 (r1 |= ((r3 & #x3F) << 6))
303 (r1 |= (r6 & #x3F))
304 (r2 = -1)
305 (r5 = -1)
306 (end))
307
308 (if (r1 >= 0)
309 ((write r1)
310 (if (r2 >= 0)
311 ((write r2)
312 (if (r3 >= 0)
313 (write r3))))))))
314
200 (eval-and-compile 315 (eval-and-compile
201 (defconst utf-16-decode-to-ucs 316 (defconst utf-16-decode-to-ucs
202 ;; CCL which, given the result of a multibyte read in r0 and r1, 317 ;; Read a character and set r1 to the corresponding Unicode code.
203 ;; sets r0 to the character's Unicode if the charset is one of the 318 ;; If r5 is not negative, it means that we have already read a
204 ;; basic utf-8 coding system ones. Otherwise set to U+fffd. 319 ;; character into r5 and r6.
205 `(if (r0 == ,(charset-id 'ascii)) 320 ;; If an invalid eight-bit-control/graphic sequence is found, r2 and
206 (r0 = r1) 321 ;; r3 may contain a byte to written out, r5 and r6 may contain a
207 (if (r0 == ,(charset-id 'latin-iso8859-1)) 322 ;; pre-read character. Usually they are set to -1.
208 (r0 = (r1 + 128)) 323 `((if (r5 < 0)
209 (if (r0 == ,(charset-id 'eight-bit-control)) 324 (read-multibyte-character r0 r1)
210 (r0 = r1) 325 ((r0 = r5)
211 (if (r0 == ,(charset-id 'eight-bit-graphic)) 326 (r1 = r6)
212 (r0 = r1) 327 (r5 = -1)))
213 ((r2 = (r1 & #x7f)) 328 (lookup-character utf-subst-table-for-encode r0 r1)
214 (r1 >>= 7) 329 (r2 = -1)
215 (r3 = ((r1 - 32) * 96)) 330 (if (r7 > 0)
216 (r3 += (r2 - 32)) 331 (r1 = r0)
217 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) 332 ((translate-character utf-translation-table-for-encode r0 r1)
218 (r0 = (r3 + #x100)) 333 (if (r0 == ,(charset-id 'ascii))
219 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) 334 nil
220 (r0 = (r3 + #x2500)) 335 (if (r0 == ,(charset-id 'latin-iso8859-1))
221 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) 336 (r1 += 128)
222 (r0 = (r3 + #xe000)) 337 (if (r0 == ,(charset-id 'eight-bit-control))
223 (r0 = #xfffd)))))))))) 338 nil
339 (if (r0 == ,(charset-id 'eight-bit-graphic))
340 (call ccl-mule-utf-16-encode-untrans)
341 ((r2 = ((r1 & #x7f) - 32))
342 (r3 = ((((r1 >> 7) - 32) * 96) + r2))
343 (r2 = -1)
344 (r5 = -1)
345 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
346 (r1 = (r3 + #x100))
347 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
348 (r1 = (r3 + #x2500))
349 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
350 (r1 = (r3 + #xe000))
351 (r1 = #xfffd)))))))))))))
224 352
225 (defconst utf-16le-encode-loop 353 (defconst utf-16le-encode-loop
226 `(loop 354 `((r5 = -1)
227 (read-multibyte-character r0 r1) 355 (loop
228 (lookup-character utf-subst-table-for-encode r0 r1) 356 ,@utf-16-decode-to-ucs
229 (if (r7 == 0) 357 (if (r1 >= #x10000)
230 ((translate-character utf-translation-table-for-encode r0 r1) 358 ((r1 -= #x10000)
231 ,utf-16-decode-to-ucs)) 359 (r0 = ((r1 >> 10) + #xD800))
232 (write (r0 & 255)) 360 (write (r0 & 255))
233 (write (r0 >> 8)) 361 (write (r0 >> 8))
234 (repeat))) 362 (r1 = ((r1 & #x3FF) + #xDC00))))
363 (if (r1 >= 0)
364 ((write (r1 & 255))
365 (write (r1 >> 8))))
366 (if (r2 >= 0)
367 ((write r2)
368 (if (r3 >= 0)
369 (write r3))))
370 (repeat))))
235 371
236 (defconst utf-16be-encode-loop 372 (defconst utf-16be-encode-loop
237 `(loop 373 `((r5 = -1)
238 (read-multibyte-character r0 r1) 374 (loop
239 (lookup-character utf-subst-table-for-encode r0 r1) 375 ,@utf-16-decode-to-ucs
240 (if (r7 == 0) 376 (if (r1 >= #x10000)
241 ((translate-character utf-translation-table-for-encode r0 r1) 377 ((r1 -= #x10000)
242 ,utf-16-decode-to-ucs)) 378 (r0 = ((r1 >> 10) + #xD800))
243 (write (r0 >> 8)) 379 (write (r0 >> 8))
244 (write (r0 & 255)) 380 (write (r0 & 255))
245 (repeat))) 381 (r1 = ((r1 & #x3FF) + #xDC00))))
382 (if (r1 >= 0)
383 ((write (r1 >> 8))
384 (write (r1 & 255))))
385 (if (r2 >= 0)
386 ((write r2)
387 (if (r3 >= 0)
388 (write r3))))
389 (repeat))))
246 ) 390 )
247 391
248 392
249 (define-ccl-program ccl-encode-mule-utf-16le 393 (define-ccl-program ccl-encode-mule-utf-16le
250 `(1 394 `(1
268 412
269 (define-ccl-program ccl-encode-mule-utf-16le-with-signature 413 (define-ccl-program ccl-encode-mule-utf-16le-with-signature
270 `(1 414 `(1
271 ((write #xFF) 415 ((write #xFF)
272 (write #xFE) 416 (write #xFE)
273 ,utf-16le-encode-loop)) 417 ,@utf-16le-encode-loop))
274 "Encode to UTF-16 (little endian with signature). 418 "Encode to UTF-16 (little endian with signature).
275 Characters from the charsets ascii, eight-bit-control, 419 Characters from the charsets ascii, eight-bit-control,
276 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded 420 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded
277 after translation through the translation-table of name 421 after translation through the translation-table of name
278 `utf-translation-table-for-encode'. 422 `utf-translation-table-for-encode'.
280 424
281 (define-ccl-program ccl-encode-mule-utf-16be-with-signature 425 (define-ccl-program ccl-encode-mule-utf-16be-with-signature
282 `(1 426 `(1
283 ((write #xFE) 427 ((write #xFE)
284 (write #xFF) 428 (write #xFF)
285 ,utf-16be-encode-loop)) 429 ,@utf-16be-encode-loop))
286 "Encode to UTF-16 (big endian with signature). 430 "Encode to UTF-16 (big endian with signature).
287 Characters from the charsets ascii, eight-bit-control, 431 Characters from the charsets ascii, eight-bit-control,
288 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded 432 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded
289 after translation through the translation-table named 433 after translation through the translation-table named
290 `utf-translation-table-for-encode'. 434 `utf-translation-table-for-encode'.
294 (makunbound 'utf-16le-encode-loop) 438 (makunbound 'utf-16le-encode-loop)
295 (makunbound 'utf-16be-encode-loop) 439 (makunbound 'utf-16be-encode-loop)
296 440
297 (defun mule-utf-16-post-read-conversion (length) 441 (defun mule-utf-16-post-read-conversion (length)
298 (when (> length 0) 442 (when (> length 0)
443 (setq length (utf-8-post-read-conversion length))
299 (let ((char (following-char))) 444 (let ((char (following-char)))
300 (cond ((= char (decode-char 'ucs #xFFFE)) 445 (cond ((= char (decode-char 'ucs #xFFFE))
301 (delete-char 1) 446 (delete-char 1)
302 (setq last-coding-system-used 447 (setq last-coding-system-used
303 (coding-system-change-text-conversion 448 (coding-system-change-text-conversion
327 ranges are decoded as U+FFFD, effectively corrupting the data 472 ranges are decoded as U+FFFD, effectively corrupting the data
328 if they are re-encoded. 473 if they are re-encoded.
329 474
330 On encoding (e.g. writing a file), Emacs characters not belonging to 475 On encoding (e.g. writing a file), Emacs characters not belonging to
331 any of the character sets listed above are encoded into the byte 476 any of the character sets listed above are encoded into the byte
332 sequence representing U+FFFD (REPLACEMENT CHARACTER).")) 477 sequence representing U+FFFD (REPLACEMENT CHARACTER).")
478 (props `((safe-charsets
479 ascii
480 eight-bit-control
481 eight-bit-graphic
482 latin-iso8859-1
483 mule-unicode-0100-24ff
484 mule-unicode-2500-33ff
485 mule-unicode-e000-ffff
486 ,@(if utf-translate-cjk-mode
487 utf-translate-cjk-charsets))
488 (valid-codes (0 . 255))
489 (mime-text-unsuitable . t)
490 (pre-write-conversion . utf-8-pre-write-conversion)
491 (dependency unify-8859-on-encoding-mode
492 unify-8859-on-decoding-mode
493 utf-fragment-on-decoding
494 utf-translate-cjk-mode))))
333 (make-coding-system 495 (make-coding-system
334 'mule-utf-16le 4 496 'mule-utf-16le 4
335 ?u ; Mule-UCS uses ?U, but code-pages uses that for koi8-u. 497 ?u ; Mule-UCS uses ?U, but code-pages uses that for koi8-u.
336 (concat 498 (concat
337 "UTF-16LE encoding for Emacs-supported Unicode characters." 499 "UTF-16LE encoding for Emacs-supported Unicode characters."
338 doc) 500 doc)
339
340 '(ccl-decode-mule-utf-16le . ccl-encode-mule-utf-16le) 501 '(ccl-decode-mule-utf-16le . ccl-encode-mule-utf-16le)
341 '((safe-charsets 502 `(,@props
342 ascii 503 (post-read-conversion . utf-8-post-read-conversion)
343 eight-bit-control 504 (mime-charset . utf-16le)))
344 latin-iso8859-1
345 mule-unicode-0100-24ff
346 mule-unicode-2500-33ff
347 mule-unicode-e000-ffff)
348 (mime-charset . utf-16le)
349 (mime-text-unsuitable . t)
350 (valid-codes (0 . 255))
351 (dependency unify-8859-on-encoding-mode
352 unify-8859-on-decoding-mode
353 utf-fragment-on-decoding
354 utf-translate-cjk-mode)))
355 505
356 (make-coding-system 506 (make-coding-system
357 'mule-utf-16be 4 ?u 507 'mule-utf-16be 4 ?u
358 (concat 508 (concat
359 "UTF-16BE encoding for Emacs-supported Unicode characters." 509 "UTF-16BE encoding for Emacs-supported Unicode characters."
360 doc) 510 doc)
361 511
362 '(ccl-decode-mule-utf-16be . ccl-encode-mule-utf-16be) 512 '(ccl-decode-mule-utf-16be . ccl-encode-mule-utf-16be)
363 '((safe-charsets 513 `(,@props
364 ascii 514 (post-read-conversion . utf-8-post-read-conversion)
365 eight-bit-control 515 (mime-charset . utf-16be)))
366 latin-iso8859-1
367 mule-unicode-0100-24ff
368 mule-unicode-2500-33ff
369 mule-unicode-e000-ffff)
370 (mime-charset . utf-16be)
371 (valid-codes (0 . 255))
372 (dependency unify-8859-on-encoding-mode
373 unify-8859-on-decoding-mode
374 utf-fragment-on-decoding
375 utf-translate-cjk-mode)))
376 516
377 (make-coding-system 517 (make-coding-system
378 'mule-utf-16le-with-signature 4 ?u 518 'mule-utf-16le-with-signature 4 ?u
379 (concat 519 (concat
380 "Little endian UTF-16 (with BOM) for Emacs-supported Unicode characters." 520 "Little endian UTF-16 (with BOM) for Emacs-supported Unicode characters."
381 doc) 521 doc)
382 522
383 '(ccl-decode-mule-utf-16le-with-signature 523 '(ccl-decode-mule-utf-16le-with-signature
384 . ccl-encode-mule-utf-16le-with-signature) 524 . ccl-encode-mule-utf-16le-with-signature)
385 '((safe-charsets 525 `(,@props
386 ascii 526 (post-read-conversion . utf-8-post-read-conversion)
387 eight-bit-control
388 latin-iso8859-1
389 mule-unicode-0100-24ff
390 mule-unicode-2500-33ff
391 mule-unicode-e000-ffff)
392 (coding-category . coding-category-utf-16-le) 527 (coding-category . coding-category-utf-16-le)
393 (mime-charset . utf-16) 528 (mime-charset . utf-16)))
394 (mime-text-unsuitable . t)
395 (valid-codes (0 . 255))
396 (dependency unify-8859-on-encoding-mode
397 unify-8859-on-decoding-mode
398 utf-fragment-on-decoding
399 utf-translate-cjk-mode)))
400 529
401 (make-coding-system 530 (make-coding-system
402 'mule-utf-16be-with-signature 4 ?u 531 'mule-utf-16be-with-signature 4 ?u
403 (concat 532 (concat
404 "Big endian UTF-16 (with BOM) for Emacs-supported Unicode characters." 533 "Big endian UTF-16 (with BOM) for Emacs-supported Unicode characters."
405 doc) 534 doc)
406 535
407 '(ccl-decode-mule-utf-16be-with-signature 536 '(ccl-decode-mule-utf-16be-with-signature
408 . ccl-encode-mule-utf-16be-with-signature) 537 . ccl-encode-mule-utf-16be-with-signature)
409 '((safe-charsets 538 `(,@props
410 ascii 539 (post-read-conversion . utf-8-post-read-conversion)
411 eight-bit-control
412 latin-iso8859-1
413 mule-unicode-0100-24ff
414 mule-unicode-2500-33ff
415 mule-unicode-e000-ffff)
416 (coding-category . coding-category-utf-16-be) 540 (coding-category . coding-category-utf-16-be)
417 (mime-charset . utf-16) 541 (mime-charset . utf-16)))
418 (valid-codes (0 . 255))
419 (dependency unify-8859-on-encoding-mode
420 unify-8859-on-decoding-mode
421 utf-fragment-on-decoding
422 utf-translate-cjk-mode)))
423 542
424 (make-coding-system 543 (make-coding-system
425 'mule-utf-16 4 ?u 544 'mule-utf-16 4 ?u
426 (concat 545 (concat
427 "UTF-16 (with or without BOM) for Emacs-supported Unicode characters." 546 "UTF-16 (with or without BOM) for Emacs-supported Unicode characters."
428 doc) 547 doc)
429 548
430 '(ccl-decode-mule-utf-16 . ccl-encode-mule-utf-16be-with-signature) 549 '(ccl-decode-mule-utf-16 . ccl-encode-mule-utf-16be-with-signature)
431 '((safe-charsets 550 `(,@props
432 ascii 551 (post-read-conversion . mule-utf-16-post-read-conversion)
433 eight-bit-control
434 latin-iso8859-1
435 mule-unicode-0100-24ff
436 mule-unicode-2500-33ff
437 mule-unicode-e000-ffff)
438 (coding-category . coding-category-utf-16-be) 552 (coding-category . coding-category-utf-16-be)
439 (mime-charset . utf-16) 553 (mime-charset . utf-16)))
440 (mime-text-unsuitable . t)
441 (valid-codes (0 . 255))
442 (dependency unify-8859-on-encoding-mode
443 unify-8859-on-decoding-mode
444 utf-fragment-on-decoding
445 utf-translate-cjk-mode)
446 (post-read-conversion . mule-utf-16-post-read-conversion)))
447 ) 554 )
448 555
449 (define-coding-system-alias 'utf-16le 'mule-utf-16le) 556 (define-coding-system-alias 'utf-16le 'mule-utf-16le)
450 (define-coding-system-alias 'utf-16be 'mule-utf-16be) 557 (define-coding-system-alias 'utf-16be 'mule-utf-16be)
451 (define-coding-system-alias 'utf-16le-with-signature 558 (define-coding-system-alias 'utf-16le-with-signature