Mercurial > emacs
changeset 42150:35e8e47e376b
Implementing euc-tw encoding.
Improving doc strings.
author | Werner LEMBERG <wl@gnu.org> |
---|---|
date | Tue, 18 Dec 2001 17:46:16 +0000 |
parents | 111acebcb4e0 |
children | e9a441eabced |
files | lisp/language/chinese.el |
diffstat | 1 files changed, 153 insertions(+), 8 deletions(-) [+] |
line wrap: on
line diff
--- a/lisp/language/chinese.el Tue Dec 18 17:43:09 2001 +0000 +++ b/lisp/language/chinese.el Tue Dec 18 17:46:16 2001 +0000 @@ -35,7 +35,7 @@ (make-coding-system 'iso-2022-cn 2 ?C - "ISO 2022 based 7bit encoding for Chinese GB and CNS (MIME:ISO-2022-CN)" + "ISO 2022 based 7bit encoding for Chinese GB and CNS (MIME:ISO-2022-CN)." '(ascii (nil chinese-gb2312 chinese-cns11643-1) (nil chinese-cns11643-2) @@ -49,7 +49,7 @@ (make-coding-system 'iso-2022-cn-ext 2 ?C - "ISO 2022 based 7bit encoding for Chinese GB and CNS (MIME:ISO-2022-CN-EXT)" + "ISO 2022 based 7bit encoding for Chinese GB and CNS (MIME:ISO-2022-CN-EXT)." '(ascii (nil chinese-gb2312 chinese-cns11643-1) (nil chinese-cns11643-2) @@ -69,7 +69,7 @@ (make-coding-system 'chinese-iso-8bit 2 ?c - "ISO 2022 based EUC encoding for Chinese GB2312 (MIME:GB2312)" + "ISO 2022 based EUC encoding for Chinese GB2312 (MIME:GB2312)." '(ascii chinese-gb2312 nil nil nil ascii-eol ascii-cntl nil nil nil nil) '((safe-charsets ascii chinese-gb2312) @@ -83,7 +83,7 @@ (make-coding-system 'chinese-hz 0 ?z - "Hz/ZW 7-bit encoding for Chinese GB2312 (MIME:HZ-GB-2312)" + "Hz/ZW 7-bit encoding for Chinese GB2312 (MIME:HZ-GB-2312)." nil '((safe-charsets ascii chinese-gb2312) (mime-charset . hz-gb-2312) @@ -126,7 +126,8 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (make-coding-system - 'chinese-big5 3 ?B "BIG5 8-bit encoding for Chinese (MIME:Big5)" + 'chinese-big5 3 ?B + "BIG5 8-bit encoding for Chinese (MIME:Big5)." nil '((safe-charsets ascii chinese-big5-1 chinese-big5-2) (mime-charset . big5) @@ -168,16 +169,160 @@ ;; Chinese CNS11643 (traditional) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(defvar big5-to-cns (make-translation-table) + "Translation table for encoding to `euc-tw'.") +;; Could have been done by china-util loaded before. +(unless (get 'big5-to-cns 'translation-table) + (define-translation-table 'big5-to-cns big5-to-cns)) + +(define-ccl-program ccl-decode-euc-tw + ;; CNS plane 1 needs either two or four bytes in EUC-TW encoding; + ;; CNS planes 2 to 7 always need four bytes. In internal encoding of + ;; Emacs, CNS planes 1 and 2 need three bytes, and planes 3 to 7 need + ;; four bytes. Thus a buffer magnification value of 2 (for both + ;; encoding and decoding) is sufficient. + `(2 + ;; we don't have enough registers to hold all charset-ids + ((r4 = ,(charset-id 'chinese-cns11643-1)) + (r5 = ,(charset-id 'chinese-cns11643-2)) + (r6 = ,(charset-id 'chinese-cns11643-3)) + (loop + (read-if (r0 < #x80) + ;; ASCII + (write-repeat r0) + ;; not ASCII + (if (r0 == #x8E) + ;; single shift + (read-if (r1 < #xA1) + ;; invalid byte + ((write r0) + (write-repeat r1)) + (if (r1 > #xA7) + ;; invalid plane + ((write r0) + (write-repeat r1)) + ;; OK, we have a plane + (read-if (r2 < #xA1) + ;; invalid first byte + ((write r0 r1) + (write-repeat r2)) + (read-if (r3 < #xA1) + ;; invalid second byte + ((write r0 r1 r2) + (write-repeat r3)) + ;; CNS 1-7, finally + ((branch (r1 - #xA1) + (r1 = r4) + (r1 = r5) + (r1 = r6) + (r1 = ,(charset-id 'chinese-cns11643-4)) + (r1 = ,(charset-id 'chinese-cns11643-5)) + (r1 = ,(charset-id 'chinese-cns11643-6)) + (r1 = ,(charset-id 'chinese-cns11643-7))) + (r2 = ((((r2 - #x80) << 7) + r3) - #x80)) + (write-multibyte-character r1 r2) + (repeat)))))) + ;; standard EUC + (if (r0 < #xA1) + ;; invalid first byte + (write-repeat r0) + (read-if (r1 < #xA1) + ;; invalid second byte + ((write r0) + (write-repeat r1)) + ;; CNS 1, finally + ((r1 = ((((r0 - #x80) << 7) + r1) - #x80)) + (write-multibyte-character r4 r1) + (repeat))))))))) + "CCL program to decode EUC-TW encoding." +) + +(define-ccl-program ccl-encode-euc-tw + `(2 + ;; we don't have enough registers to hold all charset-ids + ((r2 = ,(charset-id 'ascii)) + (r3 = ,(charset-id 'chinese-big5-1)) + (r4 = ,(charset-id 'chinese-big5-2)) + (r5 = ,(charset-id 'chinese-cns11643-1)) + (r6 = ,(charset-id 'chinese-cns11643-2)) + (loop + (read-multibyte-character r0 r1) + (if (r0 == r2) + (write-repeat r1) + (;; Big 5 encoded characters are first translated to CNS + (if (r0 == r3) + (translate-character big5-to-cns r0 r1) + (if (r0 == r4) + (translate-character big5-to-cns r0 r1))) + (if (r0 == r5) + (r0 = #xA1) + (if (r0 == r6) + (r0 = #xA2) + (if (r0 == ,(charset-id 'chinese-cns11643-3)) + (r0 = #xA3) + (if (r0 == ,(charset-id 'chinese-cns11643-4)) + (r0 = #xA4) + (if (r0 == ,(charset-id 'chinese-cns11643-5)) + (r0 = #xA5) + (if (r0 == ,(charset-id 'chinese-cns11643-6)) + (r0 = #xA6) + (if (r0 == ,(charset-id 'chinese-cns11643-7)) + (r0 = #xA7) + ;; not CNS. We use a dummy character which + ;; can't occur in EUC-TW encoding to indicate + ;; this. + (write-repeat #xFF)))))))))) + (if (r0 != #xA1) + ;; single shift and CNS plane + ((write #x8E) + (write r0))) + (write ((r1 >> 7) + #x80)) + (write ((r1 % #x80) + #x80)) + (repeat)))) + "CCL program to encode EUC-TW encoding." +) + +(defun euc-tw-pre-write-conversion (beg end) + "Semi-dummy pre-write function effectively to autoload china-util." + ;; Ensure translation table is loaded. + (require 'china-util) + ;; Don't do this again. + (coding-system-put 'euc-tw 'pre-write-conversion nil) + nil) + +(make-coding-system + 'euc-tw 4 ?Z + "ISO 2022 based EUC encoding for Chinese CNS11643. +Big5 encoding is accepted for input also (which is then converted to CNS)." + '(ccl-decode-euc-tw . ccl-encode-euc-tw) + '((safe-charsets ascii + chinese-big5-1 + chinese-big5-2 + chinese-cns11643-1 + chinese-cns11643-2 + chinese-cns11643-3 + chinese-cns11643-4 + chinese-cns11643-5 + chinese-cns11643-6 + chinese-cns11643-7) + (valid-codes (0 . 255)) + (pre-write-conversion . euc-tw-pre-write-conversion))) + +(define-coding-system-alias 'euc-taiwan 'euc-tw) + (set-language-info-alist "Chinese-CNS" '((charset chinese-cns11643-1 chinese-cns11643-2 chinese-cns11643-3 chinese-cns11643-4 chinese-cns11643-5 chinese-cns11643-6 chinese-cns11643-7) - (coding-system iso-2022-cn) - (coding-priority iso-2022-cn chinese-big5 chinese-iso-8bit) + (coding-system iso-2022-cn euc-tw) + (coding-priority iso-2022-cn euc-tw chinese-big5 + chinese-iso-8bit) (features china-util) (input-method . "chinese-cns-quick") - (documentation . "Support for Chinese CNS character sets.")) + (documentation . "\ +Support for Chinese CNS character sets. Note that EUC-TW coding system +accepts Big5 for input also (which is then converted to CNS).")) '("Chinese")) (provide 'chinese)