Mercurial > emacs
comparison lisp/international/utf-8.el @ 50085:575609f03daa
(ccl-decode-mule-utf-8): Don't lose
bytes on handling an invalid byte sequence.
author | Kenichi Handa <handa@m17n.org> |
---|---|
date | Wed, 12 Mar 2003 00:45:49 +0000 |
parents | 0d8b17d428b5 |
children | 65bb5afb37ef |
comparison
equal
deleted
inserted
replaced
50084:e41f8dbc86aa | 50085:575609f03daa |
---|---|
306 ;; | 306 ;; |
307 `(2 | 307 `(2 |
308 ((r5 = ,(charset-id 'eight-bit-control)) | 308 ((r5 = ,(charset-id 'eight-bit-control)) |
309 (r6 = ,(charset-id 'eight-bit-graphic)) | 309 (r6 = ,(charset-id 'eight-bit-graphic)) |
310 (loop | 310 (loop |
311 (r0 = -1) | |
311 (read r0) | 312 (read r0) |
312 | 313 |
313 ;; 1byte encoding, i.e., ascii | 314 ;; 1byte encoding, i.e., ascii |
314 (if (r0 < #x80) | 315 (if (r0 < #x80) |
315 (write r0) | 316 ((write r0)) |
316 (if (r0 < #xc0) ; continuation byte (invalid here) | 317 (if (r0 < #xc0) ; continuation byte (invalid here) |
317 (if (r0 < #xa0) | 318 ((if (r0 < #xa0) |
318 (write-multibyte-character r5 r0) | 319 (write-multibyte-character r5 r0) |
319 (write-multibyte-character r6 r0)) | 320 (write-multibyte-character r6 r0))) |
320 ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx | 321 ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx |
321 (if (r0 < #xe0) | 322 (if (r0 < #xe0) |
322 ((read r1) | 323 ((r1 = -1) |
324 (read r1) | |
323 | 325 |
324 (if ((r1 & #b11000000) != #b10000000) | 326 (if ((r1 & #b11000000) != #b10000000) |
325 ;; Invalid 2-byte sequence | 327 ;; Invalid 2-byte sequence |
326 ((if (r0 < #xa0) | 328 ((if (r0 < #xa0) |
327 (write-multibyte-character r5 r0) | 329 (write-multibyte-character r5 r0) |
371 (write-multibyte-character r0 r1)))))))) | 373 (write-multibyte-character r0 r1)))))))) |
372 | 374 |
373 ;; 3byte encoding | 375 ;; 3byte encoding |
374 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx | 376 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx |
375 (if (r0 < #xf0) | 377 (if (r0 < #xf0) |
376 ((read r1 r2) | 378 ((r1 = -1) |
379 (r2 = -1) | |
380 (read r1 r2) | |
377 | 381 |
378 ;; This is set to 1 if the encoding is invalid. | 382 ;; This is set to 1 if the encoding is invalid. |
379 (r4 = 0) | 383 (r4 = 0) |
380 | 384 |
381 (r3 = (r1 & #b11000000)) | 385 (r3 = (r1 & #b11000000)) |
476 | 480 |
477 (if (r0 < #xfe) | 481 (if (r0 < #xfe) |
478 ;; 4byte encoding | 482 ;; 4byte encoding |
479 ;; keep those bytes as eight-bit-{control|graphic} | 483 ;; keep those bytes as eight-bit-{control|graphic} |
480 ;; Fixme: allow lookup in utf-subst-table-for-decode. | 484 ;; Fixme: allow lookup in utf-subst-table-for-decode. |
481 ((read r1 r2 r3) | 485 ((r1 = -1) |
486 (r2 = -1) | |
487 (r3 = -1) | |
488 (read r1 r2 r3) | |
482 ;; r0 > #xf0, thus eight-bit-graphic | 489 ;; r0 > #xf0, thus eight-bit-graphic |
483 (write-multibyte-character r6 r0) | 490 (write-multibyte-character r6 r0) |
484 (if (r1 < #xa0) | 491 (if (r1 < #xa0) |
485 (if (r1 < #x80) ; invalid byte | 492 (if (r1 < #x80) ; invalid byte |
486 (write r1) | 493 (write r1) |
510 (write r1) | 517 (write r1) |
511 (write-multibyte-character r5 r1)) | 518 (write-multibyte-character r5 r1)) |
512 (write-multibyte-character r6 r1))))))) | 519 (write-multibyte-character r6 r1))))))) |
513 ;; else invalid byte >= #xfe | 520 ;; else invalid byte >= #xfe |
514 (write-multibyte-character r6 r0)))))) | 521 (write-multibyte-character r6 r0)))))) |
515 (repeat)))) | 522 (repeat))) |
523 | |
524 ;; At EOF... | |
525 (if (r0 >= 0) | |
526 ((if (r0 < #x80) | |
527 (write r0) | |
528 (if (r0 < #xa0) | |
529 (write-multibyte-character r5 r0) | |
530 ((write-multibyte-character r6 r0)))) | |
531 (if (r1 >= 0) | |
532 ((if (r1 < #x80) | |
533 (write r1) | |
534 (if (r1 < #xa0) | |
535 (write-multibyte-character r5 r1) | |
536 ((write-multibyte-character r6 r1)))) | |
537 (if (r2 >= 0) | |
538 ((if (r2 < #x80) | |
539 (write r2) | |
540 (if (r2 < #xa0) | |
541 (write-multibyte-character r5 r2) | |
542 ((write-multibyte-character r6 r2)))) | |
543 (if (r3 >= 0) | |
544 (if (r3 < #x80) | |
545 (write r3) | |
546 (if (r3 < #xa0) | |
547 (write-multibyte-character r5 r3) | |
548 ((write-multibyte-character r6 r3)))))))))))) | |
516 | 549 |
517 "CCL program to decode UTF-8. | 550 "CCL program to decode UTF-8. |
518 Basic decoding is done into the charsets ascii, latin-iso8859-1 and | 551 Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
519 mule-unicode-*, but see also `utf-fragmentation-table' and | 552 mule-unicode-*, but see also `utf-fragmentation-table' and |
520 `ucs-mule-cjk-to-unicode'. | 553 `ucs-mule-cjk-to-unicode'. |