comparison lisp/international/utf-8.el @ 50085:575609f03daa

(ccl-decode-mule-utf-8): Don't lose bytes when handling an invalid byte sequence.
author Kenichi Handa <handa@m17n.org>
date Wed, 12 Mar 2003 00:45:49 +0000
parents 0d8b17d428b5
children 65bb5afb37ef
comparison
equal deleted inserted replaced
50084:e41f8dbc86aa 50085:575609f03daa
306 ;; 306 ;;
307 `(2 307 `(2
308 ((r5 = ,(charset-id 'eight-bit-control)) 308 ((r5 = ,(charset-id 'eight-bit-control))
309 (r6 = ,(charset-id 'eight-bit-graphic)) 309 (r6 = ,(charset-id 'eight-bit-graphic))
310 (loop 310 (loop
311 (r0 = -1)
311 (read r0) 312 (read r0)
312 313
313 ;; 1byte encoding, i.e., ascii 314 ;; 1byte encoding, i.e., ascii
314 (if (r0 < #x80) 315 (if (r0 < #x80)
315 (write r0) 316 ((write r0))
316 (if (r0 < #xc0) ; continuation byte (invalid here) 317 (if (r0 < #xc0) ; continuation byte (invalid here)
317 (if (r0 < #xa0) 318 ((if (r0 < #xa0)
318 (write-multibyte-character r5 r0) 319 (write-multibyte-character r5 r0)
319 (write-multibyte-character r6 r0)) 320 (write-multibyte-character r6 r0)))
320 ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx 321 ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
321 (if (r0 < #xe0) 322 (if (r0 < #xe0)
322 ((read r1) 323 ((r1 = -1)
324 (read r1)
323 325
324 (if ((r1 & #b11000000) != #b10000000) 326 (if ((r1 & #b11000000) != #b10000000)
325 ;; Invalid 2-byte sequence 327 ;; Invalid 2-byte sequence
326 ((if (r0 < #xa0) 328 ((if (r0 < #xa0)
327 (write-multibyte-character r5 r0) 329 (write-multibyte-character r5 r0)
371 (write-multibyte-character r0 r1)))))))) 373 (write-multibyte-character r0 r1))))))))
372 374
373 ;; 3byte encoding 375 ;; 3byte encoding
374 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx 376 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
375 (if (r0 < #xf0) 377 (if (r0 < #xf0)
376 ((read r1 r2) 378 ((r1 = -1)
379 (r2 = -1)
380 (read r1 r2)
377 381
378 ;; This is set to 1 if the encoding is invalid. 382 ;; This is set to 1 if the encoding is invalid.
379 (r4 = 0) 383 (r4 = 0)
380 384
381 (r3 = (r1 & #b11000000)) 385 (r3 = (r1 & #b11000000))
476 480
477 (if (r0 < #xfe) 481 (if (r0 < #xfe)
478 ;; 4byte encoding 482 ;; 4byte encoding
479 ;; keep those bytes as eight-bit-{control|graphic} 483 ;; keep those bytes as eight-bit-{control|graphic}
480 ;; Fixme: allow lookup in utf-subst-table-for-decode. 484 ;; Fixme: allow lookup in utf-subst-table-for-decode.
481 ((read r1 r2 r3) 485 ((r1 = -1)
486 (r2 = -1)
487 (r3 = -1)
488 (read r1 r2 r3)
482 ;; r0 > #xf0, thus eight-bit-graphic 489 ;; r0 > #xf0, thus eight-bit-graphic
483 (write-multibyte-character r6 r0) 490 (write-multibyte-character r6 r0)
484 (if (r1 < #xa0) 491 (if (r1 < #xa0)
485 (if (r1 < #x80) ; invalid byte 492 (if (r1 < #x80) ; invalid byte
486 (write r1) 493 (write r1)
510 (write r1) 517 (write r1)
511 (write-multibyte-character r5 r1)) 518 (write-multibyte-character r5 r1))
512 (write-multibyte-character r6 r1))))))) 519 (write-multibyte-character r6 r1)))))))
513 ;; else invalid byte >= #xfe 520 ;; else invalid byte >= #xfe
514 (write-multibyte-character r6 r0)))))) 521 (write-multibyte-character r6 r0))))))
515 (repeat)))) 522 (repeat)))
523
524 ;; At EOF...
525 (if (r0 >= 0)
526 ((if (r0 < #x80)
527 (write r0)
528 (if (r0 < #xa0)
529 (write-multibyte-character r5 r0)
530 ((write-multibyte-character r6 r0))))
531 (if (r1 >= 0)
532 ((if (r1 < #x80)
533 (write r1)
534 (if (r1 < #xa0)
535 (write-multibyte-character r5 r1)
536 ((write-multibyte-character r6 r1))))
537 (if (r2 >= 0)
538 ((if (r2 < #x80)
539 (write r2)
540 (if (r2 < #xa0)
541 (write-multibyte-character r5 r2)
542 ((write-multibyte-character r6 r2))))
543 (if (r3 >= 0)
544 (if (r3 < #x80)
545 (write r3)
546 (if (r3 < #xa0)
547 (write-multibyte-character r5 r3)
548 ((write-multibyte-character r6 r3))))))))))))
516 549
517 "CCL program to decode UTF-8. 550 "CCL program to decode UTF-8.
518 Basic decoding is done into the charsets ascii, latin-iso8859-1 and 551 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
519 mule-unicode-*, but see also `utf-fragmentation-table' and 552 mule-unicode-*, but see also `utf-fragmentation-table' and
520 `ucs-mule-cjk-to-unicode'. 553 `ucs-mule-cjk-to-unicode'.