comparison lisp/international/utf-8.el @ 56056:4575a565f45d

(ccl-decode-mule-utf-8): Fix previous change. (ccl-untranslated-to-ucs): Fix typo.
author Kenichi Handa <handa@m17n.org>
date Sun, 13 Jun 2004 00:21:39 +0000
parents 81dbb510a1db
children 4ec2da03a87c
comparison
equal deleted inserted replaced
56055:34a75c56c1ce 56056:4575a565f45d
393 ;; invalid byte | 1 | 2 393 ;; invalid byte | 1 | 2
394 ;; 394 ;;
395 ;; Thus magnification factor is two. 395 ;; Thus magnification factor is two.
396 ;; 396 ;;
397 `(2 397 `(2
398 ((r0 = -1) 398 ((r6 = ,(charset-id 'latin-iso8859-1))
399 (read r0)
399 (loop 400 (loop
400 (if (r0 < 0)
401 (read r0))
402 (if (r0 < #x80) 401 (if (r0 < #x80)
403 ;; 1-byte encoding, i.e., ascii 402 ;; 1-byte encoding, i.e., ascii
404 ((write r0) 403 (write-read-repeat r0))
405 (r0 = -1) 404 (if (r0 < #xc2)
405 ;; continuation byte (invalid here) or 1st byte of overlong
406 ;; 2-byte sequence.
407 ((call ccl-mule-utf-untrans)
408 (r6 = ,(charset-id 'latin-iso8859-1))
409 (read r0)
406 (repeat))) 410 (repeat)))
407 (if (r0 < #xc0) ; continuation byte (invalid here)
408 ((call ccl-mule-utf-untrans)
409 (r0 = -1)
410 (repeat)))
411 411
412 ;; Read the 2nd byte. 412 ;; Read the 2nd byte.
413 (r1 = -1)
414 (read r1) 413 (read r1)
415 (if ((r1 & #b11000000) != #b10000000) ; Invalid 2nd byte 414 (if ((r1 & #b11000000) != #b10000000) ; Invalid 2nd byte
416 ((call ccl-mule-utf-untrans) 415 ((call ccl-mule-utf-untrans)
416 (r6 = ,(charset-id 'latin-iso8859-1))
417 ;; Handle it in the next loop. 417 ;; Handle it in the next loop.
418 (r0 = r1) 418 (r0 = r1)
419 (repeat))) 419 (repeat)))
420 420
421 (if (r0 < #xe0) 421 (if (r0 < #xe0)
422 ;; 2-byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx 422 ;; 2-byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
423 ((r2 = ((r0 & #x1F) << 6)) 423 ((r1 &= #x3F)
424 (r2 |= (r1 & #x3F)) 424 (r1 |= ((r0 & #x1F) << 6))
425 ;; Now r2 holds scalar value 425 ;; Now r1 holds scalar value. We don't have to check
426 426 ;; `overlong sequence' because r0 >= 0xC2.
427 (if (r2 < 128) ; `overlong sequence' 427
428 ((call ccl-mule-utf-untrans) 428 (if (r1 >= 256)
429 (r0 = r1)
430 (call ccl-mule-utf-untrans)
431 (r0 = -1)
432 (repeat)))
433
434 (r1 = r2)
435 (if (r1 < 160)
436 ;; eight-bit-control
437 (r0 = ,(charset-id 'eight-bit-control))
438 (if (r1 < 256)
439 ;; latin-iso8859-1
440 ((r0 = ,(charset-id 'latin-iso8859-1))
441 (r1 -= 128))
442 ;; mule-unicode-0100-24ff (< 0800) 429 ;; mule-unicode-0100-24ff (< 0800)
443 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) 430 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
444 (r1 -= #x0100) 431 (r1 -= #x0100)
445 (r2 = (((r1 / 96) + 32) << 7)) 432 (r2 = (((r1 / 96) + 32) << 7))
446 (r1 %= 96) 433 (r1 %= 96)
447 (r1 += (r2 + 32)) 434 (r1 += (r2 + 32))
448 (translate-character 435 (translate-character
449 utf-translation-table-for-decode r0 r1)))) 436 utf-translation-table-for-decode r0 r1)
450 (write-multibyte-character r0 r1) 437 (write-multibyte-character r0 r1)
451 (r0 = -1) 438 (read r0)
452 (repeat))) 439 (repeat))
440 (if (r1 >= 160)
441 ;; latin-iso8859-1
442 ((r1 -= 128)
443 (write-multibyte-character r6 r1)
444 (read r0)
445 (repeat))
446 ;; eight-bit-control
447 ((r0 = ,(charset-id 'eight-bit-control))
448 (write-multibyte-character r0 r1)
449 (read r0)
450 (repeat))))))
453 451
454 ;; Read the 3rd byte. 452 ;; Read the 3rd byte.
455 (r2 = -1)
456 (read r2) 453 (read r2)
457 (if ((r2 & #b11000000) != #b10000000) ; Invalid 3rd byte 454 (if ((r2 & #b11000000) != #b10000000) ; Invalid 3rd byte
458 ((call ccl-mule-utf-untrans) 455 ((call ccl-mule-utf-untrans)
459 (r0 = r1) 456 (r0 = r1)
460 (call ccl-mule-utf-untrans) 457 (call ccl-mule-utf-untrans)
458 (r6 = ,(charset-id 'latin-iso8859-1))
461 ;; Handle it in the next loop. 459 ;; Handle it in the next loop.
462 (r0 = r2) 460 (r0 = r2)
463 (repeat))) 461 (repeat)))
464 462
465 (if (r0 < #xF0) 463 (if (r0 < #xF0)
473 ((call ccl-mule-utf-untrans) 471 ((call ccl-mule-utf-untrans)
474 (r0 = r1) 472 (r0 = r1)
475 (call ccl-mule-utf-untrans) 473 (call ccl-mule-utf-untrans)
476 (r0 = r2) 474 (r0 = r2)
477 (call ccl-mule-utf-untrans) 475 (call ccl-mule-utf-untrans)
478 (r0 = -1) 476 (r6 = ,(charset-id 'latin-iso8859-1))
477 (read r0)
479 (repeat))) 478 (repeat)))
480 479
481 (if (r3 < #x2500) 480 (if (r3 < #x2500)
482 ;; mule-unicode-0100-24ff (>= 0800) 481 ;; mule-unicode-0100-24ff (>= 0800)
483 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) 482 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
486 (r1 = (r7 + 32)) 485 (r1 = (r7 + 32))
487 (r1 += ((r3 + 32) << 7)) 486 (r1 += ((r3 + 32) << 7))
488 (translate-character 487 (translate-character
489 utf-translation-table-for-decode r0 r1) 488 utf-translation-table-for-decode r0 r1)
490 (write-multibyte-character r0 r1) 489 (write-multibyte-character r0 r1)
491 (r0 = -1) 490 (read r0)
492 (repeat))) 491 (repeat)))
493 492
494 (if (r3 < #x3400) 493 (if (r3 < #x3400)
495 ;; mule-unicode-2500-33ff 494 ;; mule-unicode-2500-33ff
496 ((r0 = r3) ; don't zap r3 495 ((r0 = r3) ; don't zap r3
500 (r3 -= #x2500) 499 (r3 -= #x2500)
501 (r3 //= 96) 500 (r3 //= 96)
502 (r1 = (r7 + 32)) 501 (r1 = (r7 + 32))
503 (r1 += ((r3 + 32) << 7)))) 502 (r1 += ((r3 + 32) << 7))))
504 (write-multibyte-character r0 r1) 503 (write-multibyte-character r0 r1)
505 (r0 = -1) 504 (read r0)
506 (repeat))) 505 (repeat)))
507 506
508 (if (r3 < #xE000) 507 (if (r3 < #xE000)
509 ;; Try to convert to CJK chars, else 508 ;; Try to convert to CJK chars, else
510 ;; keep them as eight-bit-{control|graphic}. 509 ;; keep them as eight-bit-{control|graphic}.
511 ((r0 = r3) 510 ((r0 = r3)
512 (lookup-integer utf-subst-table-for-decode r3 r1) 511 (lookup-integer utf-subst-table-for-decode r3 r1)
513 (if r7 512 (if r7
514 ;; got a translation 513 ;; got a translation
515 (write-multibyte-character r3 r1) 514 ((write-multibyte-character r3 r1)
516 (call ccl-mule-utf-untrans)) 515 (read r0)
517 (r0 = -1) 516 (repeat))
518 (repeat))) 517 ((call ccl-mule-utf-untrans)
518 (r6 = ,(charset-id 'latin-iso8859-1))
519 (read r0)
520 (repeat)))))
519 521
520 ;; mule-unicode-e000-ffff 522 ;; mule-unicode-e000-ffff
521 ;; Fixme: fffe and ffff are invalid. 523 ;; Fixme: fffe and ffff are invalid.
522 (r0 = r3) ; don't zap r3 524 (r0 = r3) ; don't zap r3
523 (lookup-integer utf-subst-table-for-decode r0 r1) 525 (lookup-integer utf-subst-table-for-decode r0 r1)
526 (r3 -= #xe000) 528 (r3 -= #xe000)
527 (r3 //= 96) 529 (r3 //= 96)
528 (r1 = (r7 + 32)) 530 (r1 = (r7 + 32))
529 (r1 += ((r3 + 32) << 7)))) 531 (r1 += ((r3 + 32) << 7))))
530 (write-multibyte-character r0 r1) 532 (write-multibyte-character r0 r1)
531 (r0 = -1) 533 (read r0)
532 (repeat))) 534 (repeat)))
533 535
534 ;; Read the 4th byte. 536 ;; Read the 4th byte.
535 (r3 = -1)
536 (read r3) 537 (read r3)
537 (if ((r3 & #b11000000) != #b10000000) ; Invalid 4th byte 538 (if ((r3 & #b11000000) != #b10000000) ; Invalid 4th byte
538 ((call ccl-mule-utf-untrans) 539 ((call ccl-mule-utf-untrans)
539 (r0 = r1) 540 (r0 = r1)
540 (call ccl-mule-utf-untrans) 541 (call ccl-mule-utf-untrans)
542 (r0 = r2)
543 (call ccl-mule-utf-untrans)
544 (r6 = ,(charset-id 'latin-iso8859-1))
541 ;; Handle it in the next loop. 545 ;; Handle it in the next loop.
542 (r0 = r3) 546 (r0 = r3)
543 (repeat))) 547 (repeat)))
544 548
545 (if (r3 < #xF8) 549 (if (r0 < #xF8)
546 ;; 4-byte encoding: 550 ;; 4-byte encoding:
547 ;; wwwzzzzzzyyyyyyxxxxxx = 11110www 10zzzzzz 10yyyyyy 10xxxxxx 551 ;; wwwzzzzzzyyyyyyxxxxxx = 11110www 10zzzzzz 10yyyyyy 10xxxxxx
548 ;; keep those bytes as eight-bit-{control|graphic} 552 ;; keep those bytes as eight-bit-{control|graphic}
549 ;; Fixme: allow lookup in utf-subst-table-for-decode. 553 ;; Fixme: allow lookup in utf-subst-table-for-decode.
550 ((r4 = ((r0 & #x7) << 18)) 554 ((r4 = ((r0 & #x7) << 18))
559 (r0 = r2) 563 (r0 = r2)
560 (call ccl-mule-utf-untrans) 564 (call ccl-mule-utf-untrans)
561 (r0 = r3) 565 (r0 = r3)
562 (call ccl-mule-utf-untrans)) 566 (call ccl-mule-utf-untrans))
563 ((r0 = r4) 567 ((r0 = r4)
564 (call ccl-mule-utf-untrans))) 568 (call ccl-mule-utf-untrans))))
565 (r0 = -1) 569
566 (repeat))) 570 ;; Unsupported sequence.
567 571 ((call ccl-mule-utf-untrans)
568 ;; Unsupported sequence. 572 (r0 = r1)
569 (call ccl-mule-utf-untrans) 573 (call ccl-mule-utf-untrans)
570 (r0 = r1) 574 (r0 = r2)
571 (call ccl-mule-utf-untrans) 575 (call ccl-mule-utf-untrans)
572 (r0 = r2) 576 (r0 = r3)
573 (call ccl-mule-utf-untrans) 577 (call ccl-mule-utf-untrans)))
574 (r0 = r3) 578 (r6 = ,(charset-id 'latin-iso8859-1))
575 (call ccl-mule-utf-untrans) 579 (read r0)
576 (r0 = -1)
577 (repeat))) 580 (repeat)))
581
578 582
579 ;; At EOF... 583 ;; At EOF...
580 (if (r0 >= 0) 584 (if (r0 >= 0)
581 ;; r0 >= #x80 585 ;; r0 >= #x80
582 ((call ccl-mule-utf-untrans) 586 ((call ccl-mule-utf-untrans)
784 (if (r2 == 0) 788 (if (r2 == 0)
785 (r1 = 0) 789 (r1 = 0)
786 (if (r0 < #xF0) ; 3-byte encoding, as above 790 (if (r0 < #xF0) ; 3-byte encoding, as above
787 ((r0 = ((r0 & #xF) << 12)) 791 ((r0 = ((r0 & #xF) << 12))
788 (r0 |= ((r1 & #x3F) << 6)) 792 (r0 |= ((r1 & #x3F) << 6))
789 (r0 |= (r1 & #x3F)) 793 (r0 |= (r2 & #x3F))
790 (r1 = 3)) 794 (r1 = 3))
791 (if (r3 == 0) 795 (if (r3 == 0)
792 (r1 = 0) 796 (r1 = 0)
793 ((r0 = ((r0 & #x7) << 18)) 797 ((r0 = ((r0 & #x7) << 18))
794 (r0 |= ((r1 & #x3F) << 12)) 798 (r0 |= ((r1 & #x3F) << 12))