comparison lisp/international/utf-8.el @ 37097:b095952a8678

(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
author Kenichi Handa <handa@m17n.org>
date Fri, 30 Mar 2001 12:18:01 +0000
parents 898d0f4abcad
children 88389fa9b713
comparison
equal deleted inserted replaced
37096:0271543faf85 37097:b095952a8678
1 ;;; utf-8.el --- Limited UTF-8 decoding/encoding support 1 ;;; utf-8.el --- Limited UTF-8 decoding/encoding support
2 2
3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. 3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
4 ;; Licensed to the Free Software Foundation. 4 ;; Licensed to the Free Software Foundation.
5 5
6 ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org>
6 ;; Keywords: multilingual, Unicode, UTF-8, i18n 7 ;; Keywords: multilingual, Unicode, UTF-8, i18n
7 8
8 ;; This file is part of GNU Emacs. 9 ;; This file is part of GNU Emacs.
9 10
10 ;; GNU Emacs is free software; you can redistribute it and/or modify 11 ;; GNU Emacs is free software; you can redistribute it and/or modify
184 decoded asis into eight-bit-control and eight-bit-graphic 185 decoded asis into eight-bit-control and eight-bit-graphic
185 characters.") 186 characters.")
186 187
187 (define-ccl-program ccl-encode-mule-utf-8 188 (define-ccl-program ccl-encode-mule-utf-8
188 `(1 189 `(1
189 (loop 190 ((r5 = -1)
190 (read-multibyte-character r0 r1) 191 (loop
191 192 (if (r5 < 0)
192 (if (r0 == ,(charset-id 'ascii)) 193 ((r1 = -1)
193 (write r1) 194 (read-multibyte-character r0 r1))
194 195 (;; We have already done read-multibyte-character.
195 (if (r0 == ,(charset-id 'latin-iso8859-1)) 196 (r0 = r5)
196 ;; r1 scalar utf-8 197 (r1 = r6)
197 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx 198 (r5 = -1)))
198 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000 199
199 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111 200 (if (r0 == ,(charset-id 'ascii))
200 ((r0 = (((r1 & #x40) >> 6) | #xc2)) 201 (write r1)
201 (r1 &= #x3f) 202
202 (r1 |= #x80) 203 (if (r0 == ,(charset-id 'latin-iso8859-1))
203 (write r0 r1)) 204 ;; r1 scalar utf-8
204 205 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
205 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) 206 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000
206 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) 207 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111
207 ;; #x3f80 == (0011 1111 1000 0000)b 208 ((r0 = (((r1 & #x40) >> 6) | #xc2))
208 (r1 &= #x7f) 209 (r1 &= #x3f)
209 (r1 += (r0 + 224)) ; 240 == -32 + #x0100 210 (r1 |= #x80)
210 ;; now r1 holds scalar value 211 (write r0 r1))
211 (if (r1 < #x0800) 212
212 ;; 2byte encoding 213 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
213 ((r0 = (((r1 & #x07c0) >> 6) | #xc0)) 214 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
214 ;; #x07c0 == (0000 0111 1100 0000)b 215 ;; #x3f80 == (0011 1111 1000 0000)b
215 (r1 &= #x3f) 216 (r1 &= #x7f)
216 (r1 |= #x80) 217 (r1 += (r0 + 224)) ; 240 == -32 + #x0100
217 (write r0 r1)) 218 ;; now r1 holds scalar value
218 ;; 3byte encoding 219 (if (r1 < #x0800)
219 ((r0 = (((r1 & #xf000) >> 12) | #xe0)) 220 ;; 2byte encoding
221 ((r0 = (((r1 & #x07c0) >> 6) | #xc0))
222 ;; #x07c0 == (0000 0111 1100 0000)b
223 (r1 &= #x3f)
224 (r1 |= #x80)
225 (write r0 r1))
226 ;; 3byte encoding
227 ((r0 = (((r1 & #xf000) >> 12) | #xe0))
228 (r2 = ((r1 & #x3f) | #x80))
229 (r1 &= #x0fc0)
230 (r1 >>= 6)
231 (r1 |= #x80)
232 (write r0 r1 r2))))
233
234 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
235 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
236 (r1 &= #x7f)
237 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500
238 (r0 = (((r1 & #xf000) >> 12) | #xe0))
220 (r2 = ((r1 & #x3f) | #x80)) 239 (r2 = ((r1 & #x3f) | #x80))
221 (r1 &= #x0fc0) 240 (r1 &= #x0fc0)
222 (r1 >>= 6) 241 (r1 >>= 6)
223 (r1 |= #x80) 242 (r1 |= #x80)
224 (write r0 r1 r2)))) 243 (write r0 r1 r2))
225 244
226 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) 245 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
227 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) 246 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
228 (r1 &= #x7f) 247 (r1 &= #x7f)
229 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500 248 (r1 += (r0 + 57312)) ; 57312 == -160 + #xe000
230 (r0 = (((r1 & #xf000) >> 12) | #xe0)) 249 (r0 = (((r1 & #xf000) >> 12) | #xe0))
231 (r2 = ((r1 & #x3f) | #x80)) 250 (r2 = ((r1 & #x3f) | #x80))
232 (r1 &= #x0fc0) 251 (r1 &= #x0fc0)
233 (r1 >>= 6) 252 (r1 >>= 6)
234 (r1 |= #x80) 253 (r1 |= #x80)
235 (write r0 r1 r2)) 254 (write r0 r1 r2))
236 255
237 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) 256 (if (r0 == ,(charset-id 'eight-bit-control))
238 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) 257 ;; r1 scalar utf-8
239 (r1 &= #x7f) 258 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
240 (r1 += (r0 + 57312)) ; 57312 == -160 + #xe000 259 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000
241 (r0 = (((r1 & #xf000) >> 12) | #xe0)) 260 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111
242 (r2 = ((r1 & #x3f) | #x80)) 261 ((write #xc2)
243 (r1 &= #x0fc0) 262 (write r1))
244 (r1 >>= 6) 263
245 (r1 |= #x80) 264 (if (r0 == ,(charset-id 'eight-bit-graphic))
246 (write r0 r1 r2)) 265 ;; r1 scalar utf-8
247 266 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
248 (if (r0 == ,(charset-id 'eight-bit-control)) 267 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000
249 ;; r1 scalar utf-8 268 ;; ff 0000 0000 1111 1111 1101 1111 1011 1111
250 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx 269 ((write r1)
251 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000 270 (r1 = -1)
252 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111 271 (read-multibyte-character r0 r1)
253 (write r1) 272 (if (r0 != ,(charset-id 'eight-bit-graphic))
254 273 (if (r0 != ,(charset-id 'eight-bit-control))
255 (if (r0 == ,(charset-id 'eight-bit-graphic)) 274 ((r5 = r0)
256 ;; r1 scalar utf-8 275 (r6 = r1))))
257 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx 276 (if (r5 < 0)
258 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000 277 ((read-multibyte-character r0 r2)
259 ;; ff 0000 0000 1111 1111 1101 1111 1011 1111 278 (if (r0 != ,(charset-id 'eight-bit-graphic))
260 (write r1) 279 (if (r0 != ,(charset-id 'eight-bit-control))
261 280 ((r5 = r0)
262 ;; Unsupported character. 281 (r6 = r2))))
263 ;; Output U+FFFD, which is `ef bf bd' in UTF-8. 282 (if (r5 < 0)
264 ((write #xef) 283 (write r1 r2)
265 (write #xbf) 284 (if (r1 < #xa0)
266 (write #xbd))))))))) 285 (write r1)
267 (repeat))) 286 ((write #xc2)
287 (write r1)))))))
288
289 ;; Unsupported character.
290 ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
291 ((write #xef)
292 (write #xbf)
293 (write #xbd)))))))))
294 (repeat)))
295 (if (r1 >= #xa0)
296 (write r1)
297 (if (r1 >= #x80)
298 ((write #xc2)
299 (write r1)))))
268 300
269 "CCL program to encode into UTF-8. 301 "CCL program to encode into UTF-8.
270 Only characters from the charsets ascii, eight-bit-control, 302 Only characters from the charsets ascii, eight-bit-control,
271 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized. 303 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized.
272 Others are encoded as U+FFFD.") 304 Others are encoded as U+FFFD.")