Mercurial > emacs
comparison lisp/international/utf-8.el @ 37097:b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
author | Kenichi Handa <handa@m17n.org> |
---|---|
date | Fri, 30 Mar 2001 12:18:01 +0000 |
parents | 898d0f4abcad |
children | 88389fa9b713 |
comparison
equal
deleted
inserted
replaced
37096:0271543faf85 | 37097:b095952a8678 |
---|---|
1 ;;; utf-8.el --- Limited UTF-8 decoding/encoding support | 1 ;;; utf-8.el --- Limited UTF-8 decoding/encoding support |
2 | 2 |
3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. | 3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. |
4 ;; Licensed to the Free Software Foundation. | 4 ;; Licensed to the Free Software Foundation. |
5 | 5 |
6 ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org> | |
6 ;; Keywords: multilingual, Unicode, UTF-8, i18n | 7 ;; Keywords: multilingual, Unicode, UTF-8, i18n |
7 | 8 |
8 ;; This file is part of GNU Emacs. | 9 ;; This file is part of GNU Emacs. |
9 | 10 |
10 ;; GNU Emacs is free software; you can redistribute it and/or modify | 11 ;; GNU Emacs is free software; you can redistribute it and/or modify |
184 decoded asis into eight-bit-control and eight-bit-graphic | 185 decoded asis into eight-bit-control and eight-bit-graphic |
185 characters.") | 186 characters.") |
186 | 187 |
187 (define-ccl-program ccl-encode-mule-utf-8 | 188 (define-ccl-program ccl-encode-mule-utf-8 |
188 `(1 | 189 `(1 |
189 (loop | 190 ((r5 = -1) |
190 (read-multibyte-character r0 r1) | 191 (loop |
191 | 192 (if (r5 < 0) |
192 (if (r0 == ,(charset-id 'ascii)) | 193 ((r1 = -1) |
193 (write r1) | 194 (read-multibyte-character r0 r1)) |
194 | 195 (;; We have already done read-multibyte-character. |
195 (if (r0 == ,(charset-id 'latin-iso8859-1)) | 196 (r0 = r5) |
196 ;; r1 scalar utf-8 | 197 (r1 = r6) |
197 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | 198 (r5 = -1))) |
198 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000 | 199 |
199 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111 | 200 (if (r0 == ,(charset-id 'ascii)) |
200 ((r0 = (((r1 & #x40) >> 6) | #xc2)) | 201 (write r1) |
201 (r1 &= #x3f) | 202 |
202 (r1 |= #x80) | 203 (if (r0 == ,(charset-id 'latin-iso8859-1)) |
203 (write r0 r1)) | 204 ;; r1 scalar utf-8 |
204 | 205 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
205 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) | 206 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000 |
206 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | 207 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111 |
207 ;; #x3f80 == (0011 1111 1000 0000)b | 208 ((r0 = (((r1 & #x40) >> 6) | #xc2)) |
208 (r1 &= #x7f) | 209 (r1 &= #x3f) |
209 (r1 += (r0 + 224)) ; 240 == -32 + #x0100 | 210 (r1 |= #x80) |
210 ;; now r1 holds scalar value | 211 (write r0 r1)) |
211 (if (r1 < #x0800) | 212 |
212 ;; 2byte encoding | 213 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) |
213 ((r0 = (((r1 & #x07c0) >> 6) | #xc0)) | 214 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
214 ;; #x07c0 == (0000 0111 1100 0000)b | 215 ;; #x3f80 == (0011 1111 1000 0000)b |
215 (r1 &= #x3f) | 216 (r1 &= #x7f) |
216 (r1 |= #x80) | 217 (r1 += (r0 + 224)) ; 240 == -32 + #x0100 |
217 (write r0 r1)) | 218 ;; now r1 holds scalar value |
218 ;; 3byte encoding | 219 (if (r1 < #x0800) |
219 ((r0 = (((r1 & #xf000) >> 12) | #xe0)) | 220 ;; 2byte encoding |
221 ((r0 = (((r1 & #x07c0) >> 6) | #xc0)) | |
222 ;; #x07c0 == (0000 0111 1100 0000)b | |
223 (r1 &= #x3f) | |
224 (r1 |= #x80) | |
225 (write r0 r1)) | |
226 ;; 3byte encoding | |
227 ((r0 = (((r1 & #xf000) >> 12) | #xe0)) | |
228 (r2 = ((r1 & #x3f) | #x80)) | |
229 (r1 &= #x0fc0) | |
230 (r1 >>= 6) | |
231 (r1 |= #x80) | |
232 (write r0 r1 r2)))) | |
233 | |
234 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) | |
235 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | |
236 (r1 &= #x7f) | |
237 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500 | |
238 (r0 = (((r1 & #xf000) >> 12) | #xe0)) | |
220 (r2 = ((r1 & #x3f) | #x80)) | 239 (r2 = ((r1 & #x3f) | #x80)) |
221 (r1 &= #x0fc0) | 240 (r1 &= #x0fc0) |
222 (r1 >>= 6) | 241 (r1 >>= 6) |
223 (r1 |= #x80) | 242 (r1 |= #x80) |
224 (write r0 r1 r2)))) | 243 (write r0 r1 r2)) |
225 | 244 |
226 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) | 245 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) |
227 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | 246 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
228 (r1 &= #x7f) | 247 (r1 &= #x7f) |
229 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500 | 248 (r1 += (r0 + 57312)) ; 57312 == -160 + #xe000 |
230 (r0 = (((r1 & #xf000) >> 12) | #xe0)) | 249 (r0 = (((r1 & #xf000) >> 12) | #xe0)) |
231 (r2 = ((r1 & #x3f) | #x80)) | 250 (r2 = ((r1 & #x3f) | #x80)) |
232 (r1 &= #x0fc0) | 251 (r1 &= #x0fc0) |
233 (r1 >>= 6) | 252 (r1 >>= 6) |
234 (r1 |= #x80) | 253 (r1 |= #x80) |
235 (write r0 r1 r2)) | 254 (write r0 r1 r2)) |
236 | 255 |
237 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) | 256 (if (r0 == ,(charset-id 'eight-bit-control)) |
238 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | 257 ;; r1 scalar utf-8 |
239 (r1 &= #x7f) | 258 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
240 (r1 += (r0 + 57312)) ; 57312 == -160 + #xe000 | 259 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000 |
241 (r0 = (((r1 & #xf000) >> 12) | #xe0)) | 260 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111 |
242 (r2 = ((r1 & #x3f) | #x80)) | 261 ((write #xc2) |
243 (r1 &= #x0fc0) | 262 (write r1)) |
244 (r1 >>= 6) | 263 |
245 (r1 |= #x80) | 264 (if (r0 == ,(charset-id 'eight-bit-graphic)) |
246 (write r0 r1 r2)) | 265 ;; r1 scalar utf-8 |
247 | 266 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
248 (if (r0 == ,(charset-id 'eight-bit-control)) | 267 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000 |
249 ;; r1 scalar utf-8 | 268 ;; ff 0000 0000 1111 1111 1101 1111 1011 1111 |
250 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | 269 ((write r1) |
251 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000 | 270 (r1 = -1) |
252 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111 | 271 (read-multibyte-character r0 r1) |
253 (write r1) | 272 (if (r0 != ,(charset-id 'eight-bit-graphic)) |
254 | 273 (if (r0 != ,(charset-id 'eight-bit-control)) |
255 (if (r0 == ,(charset-id 'eight-bit-graphic)) | 274 ((r5 = r0) |
256 ;; r1 scalar utf-8 | 275 (r6 = r1)))) |
257 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | 276 (if (r5 < 0) |
258 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000 | 277 ((read-multibyte-character r0 r2) |
259 ;; ff 0000 0000 1111 1111 1101 1111 1011 1111 | 278 (if (r0 != ,(charset-id 'eight-bit-graphic)) |
260 (write r1) | 279 (if (r0 != ,(charset-id 'eight-bit-control)) |
261 | 280 ((r5 = r0) |
262 ;; Unsupported character. | 281 (r6 = r2)))) |
263 ;; Output U+FFFD, which is `ef bf bd' in UTF-8. | 282 (if (r5 < 0) |
264 ((write #xef) | 283 (write r1 r2) |
265 (write #xbf) | 284 (if (r1 < #xa0) |
266 (write #xbd))))))))) | 285 (write r1) |
267 (repeat))) | 286 ((write #xc2) |
287 (write r1))))))) | |
288 | |
289 ;; Unsupported character. | |
290 ;; Output U+FFFD, which is `ef bf bd' in UTF-8. | |
291 ((write #xef) | |
292 (write #xbf) | |
293 (write #xbd))))))))) | |
294 (repeat))) | |
295 (if (r1 >= #xa0) | |
296 (write r1) | |
297 (if (r1 >= #x80) | |
298 ((write #xc2) | |
299 (write r1))))) | |
268 | 300 |
269 "CCL program to encode into UTF-8. | 301 "CCL program to encode into UTF-8. |
270 Only characters from the charsets ascii, eight-bit-control, | 302 Only characters from the charsets ascii, eight-bit-control, |
271 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized. | 303 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized. |
272 Others are encoded as U+FFFD.") | 304 Others are encoded as U+FFFD.") |