Mercurial > emacs
changeset 42153:ca6dbe4635da
Implementing euc-tw encoding.
author | Werner LEMBERG <wl@gnu.org> |
---|---|
date | Tue, 18 Dec 2001 17:52:17 +0000 |
parents | e3ae5ef41293 |
children | 55ae020d4d7a |
files | lisp/language/china-util.el |
diffstat | 1 files changed, 269 insertions(+), 2 deletions(-) [+] |
line wrap: on
line diff
--- a/lisp/language/china-util.el Tue Dec 18 17:51:50 2001 +0000 +++ b/lisp/language/china-util.el Tue Dec 18 17:52:17 2001 +0000 @@ -1,7 +1,8 @@ -;;; china-util.el --- utilities for Chinese +;;; china-util.el --- utilities for Chinese -*- coding: iso-2022-7bit -*- ;; Copyright (C) 1995 Electrotechnical Laboratory, JAPAN. ;; Licensed to the Free Software Foundation. +;; Copyright (C) 1995, 2001 Free Software Foundation, Inc. ;; Keywords: mule, multilingual, Chinese @@ -26,7 +27,7 @@ ;;; Code: -;; Hz/ZW encoding stuffs +;; Hz/ZW/EUC-TW encoding stuff ;; HZ is an encoding method for Chinese character set GB2312 used ;; widely in Internet. It is very similar to 7-bit environment of @@ -38,6 +39,13 @@ ;; encodes Chinese characters line by line by starting each line with ;; the sequence "zW". It also uses only 7-bit as HZ. +;; EUC-TW is similar to EUC-KS or EUC-JP. Its main character set is +;; plane 1 of CNS 11643; characters of planes 2 to 7 are accessed with +;; a single shift escape followed by three bytes: the first gives the +;; plane, the second and third the character code. Note that characters +;; of plane 1 are (redundantly) accessible with a single shift escape +;; also. + ;; ISO-2022 escape sequence to designate GB2312. (defvar iso2022-gb-designation "\e$A") ;; HZ escape sequence to designate GB2312. @@ -156,6 +164,265 @@ (interactive) (encode-hz-region (point-min) (point-max))) +;; The following sets up a translation table (big5-to-cns) from Big 5 +;; to CNS encoding, using some auxiliary functions to make the code +;; more readable. + +;; Many kudos to Himi! The used code has been adapted from his +;; mule-ucs package. + +(defun big5-to-flat-code (num) + "Convert NUM in Big 5 encoding to a `flat code'. +0xA140 will be mapped to position 0, 0xA141 to position 1, etc. +There are no gaps in the flat code." + + (let ((hi (/ num 256)) + (lo (% num 256))) + (+ (* 157 (- hi #xa1)) + (- lo (if (>= lo #xa1) 98 64))))) + +(defun flat-code-to-big5 (num) + "Convert NUM from a `flat code' to Big 5 encoding. +This is the inverse function of `big5-to-flat-code'." + + (let ((hi (/ num 157)) + (lo (% num 157))) + (+ (* 256 (+ hi #xa1)) + (+ lo (if (< lo 63) 64 98))))) + +(defun euc-to-flat-code (num) + "Convert NUM in EUC encoding (in GL representation) to a `flat code'. +0x2121 will be mapped to position 0, 0x2122 to position 1, etc. +There are no gaps in the flat code." + + (let ((hi (/ num 256)) + (lo (% num 256))) + (+ (* 94 (- hi #x21)) + (- lo #x21)))) + +(defun flat-code-to-euc (num) + "Convert NUM from a `flat code' to EUC encoding (in GL representation). +The inverse function of `euc-to-flat-code'. The high and low bytes are +returned in a list." + + (let ((hi (/ num 94)) + (lo (% num 94))) + (list (+ hi #x21) (+ lo #x21)))) + +(defun expand-euc-big5-alist (alist) + "Create a translation table and fills it with data given in ALIST. +Elements of ALIST can be either given as + + ((euc-charset . startchar) . (big5-range-begin . big5-range-end)) + +or as + + (euc-character . big5-charcode) + +The former maps a range of glyphs in an EUC charset (where STARTCHAR +is in GL representation) to a certain range of Big 5 encoded +characters, the latter maps a single glyph. Glyphs which can't be +mapped will be represented with the byte 0xFF. + +The return value is the filled translation table." + + (let (chartable + elem + result + char + big5 + i + end + codepoint + charset) + (setq chartable (make-char-table 'translation-table #xFF)) + (while alist + (setq elem (car alist) + char (car elem) + big5 (cdr elem) + alist (cdr alist)) + (cond ((and (consp char) + (consp big5)) + (setq i (big5-to-flat-code (car big5)) + end (big5-to-flat-code (cdr big5)) + codepoint (euc-to-flat-code (cdr char)) + charset (car char)) + (while (>= end i) + (aset chartable + (decode-big5-char (flat-code-to-big5 i)) + (apply (function make-char) + charset + (flat-code-to-euc codepoint))) + (setq i (1+ i) + codepoint (1+ codepoint))) + ) + ((and (char-valid-p char) + (numberp big5)) + (setq i (decode-big5-char big5)) + (aset chartable i char) + ) + (t + (error "Unknown slot type: %S" elem) + ) + ) + ) + ;; the return value + chartable + ) +) + +;; All non-CNS encodings are commented out. + +(define-translation-table 'big5-to-cns + (expand-euc-big5-alist + '( + ;; Symbols + ((chinese-cns11643-1 . #x2121) . (#xA140 . #xA1F5)) + (?$(G"X(B . #xA1F6) + (?$(G"W(B . #xA1F7) + ((chinese-cns11643-1 . #x2259) . (#xA1F8 . #xA2AE)) + ((chinese-cns11643-1 . #x2421) . (#xA2AF . #xA3BF)) + ;; Control codes (vendor dependent) + ((chinese-cns11643-1 . #x4221) . (#xA3C0 . #xA3E0)) + ;; Level 1 Ideographs + ((chinese-cns11643-1 . #x4421) . (#xA440 . #xACFD)) + (?$(GWS(B . #xACFE) + ((chinese-cns11643-1 . #x5323) . (#xAD40 . #xAFCF)) + ((chinese-cns11643-1 . #x5754) . (#xAFD0 . #xBBC7)) + ((chinese-cns11643-1 . #x6B51) . (#xBBC8 . #xBE51)) + (?$(GkP(B . #xBE52) + ((chinese-cns11643-1 . #x6F5C) . (#xBE53 . #xC1AA)) + ((chinese-cns11643-1 . #x7536) . (#xC1AB . #xC2CA)) + (?$(Gu5(B . #xC2CB) + ((chinese-cns11643-1 . #x7737) . (#xC2CC . #xC360)) + ((chinese-cns11643-1 . #x782E) . (#xC361 . #xC3B8)) + (?$(Gxe(B . #xC3B9) + (?$(Gxd(B . #xC3BA) + ((chinese-cns11643-1 . #x7866) . (#xC3BB . #xC455)) + (?$(Gx-(B . #xC456) + ((chinese-cns11643-1 . #x7962) . (#xC457 . #xC67E)) + ;; Symbols + ((chinese-cns11643-1 . #x2621) . (#xC6A1 . #xC6BE)) + ;; Radicals + (?$(G'#(B . #xC6BF) + (?$(G'$(B . #xC6C0) + (?$(G'&(B . #xC6C1) + (?$(G'((B . #xC6C2) + (?$(G'-(B . #xC6C3) + (?$(G'.(B . #xC6C4) + (?$(G'/(B . #xC6C5) + (?$(G'4(B . #xC6C6) + (?$(G'7(B . #xC6C7) + (?$(G':(B . #xC6C8) + (?$(G'<(B . #xC6C9) + (?$(G'B(B . #xC6CA) + (?$(G'G(B . #xC6CB) + (?$(G'N(B . #xC6CC) + (?$(G'S(B . #xC6CD) + (?$(G'T(B . #xC6CE) + (?$(G'U(B . #xC6CF) + (?$(G'Y(B . #xC6D0) + (?$(G'Z(B . #xC6D1) + (?$(G'a(B . #xC6D2) + (?$(G'f(B . #xC6D3) + (?$(G()(B . #xC6D4) + (?$(G(*(B . #xC6D5) + (?$(G(c(B . #xC6D6) + (?$(G(l(B . #xC6D7) + ;; Diacritical Marks + ; ((japanese-jisx0208 . #x212F) . (#xC6D8 . #xC6D9)) + ;; Japanese Kana Supplement + ; ((japanese-jisx0208 . #x2133) . (#xC6DA . #xC6E3)) + ;; Japanese Hiragana + ; ((japanese-jisx0208 . #x2421) . (#xC6E7 . #xC77A)) + ;; Japanese Katakana + ; ((japanese-jisx0208 . #x2521) . (#xC77B . #xC7F2)) + ;; Cyrillic Characters + ; ((japanese-jisx0208 . #x2721) . (#xC7F3 . #xC854)) + ; ((japanese-jisx0208 . #x2751) . (#xC855 . #xC875)) + ;; Special Chinese Characters + (?$(J!#(B . #xC879) + (?$(J!$(B . #xC87B) + (?$(J!*(B . #xC87D) + (?$(J!R(B . #xC8A2) + + ;; JIS X 0208 NOT SIGN (cf. U+00AC) + ; (?$B"L(B . #xC8CD) + ;; JIS X 0212 BROKEN BAR (cf. U+00A6) + ; (?$(D"C(B . #xC8CE) + + ;; GB 2312 characters + ; (?$A!d(B . #xC8CF) + ; (?$A!e(B . #xC8D0) + ;;;;; C8D1 - Japanese `($B3t(B)' + ; (?$A!m(B . #xC8D2) + ;;;;; C8D2 - Tel. + + ;; Level 2 Ideographs + ((chinese-cns11643-2 . #x2121) . (#xC940 . #xC949)) + (?$(GDB(B . #xC94A);; a duplicate of #xA461 + ((chinese-cns11643-2 . #x212B) . (#xC94B . #xC96B)) + ((chinese-cns11643-2 . #x214D) . (#xC96C . #xC9BD)) + (?$(H!L(B . #xC9BE) + ((chinese-cns11643-2 . #x217D) . (#xC9BF . #xC9EC)) + ((chinese-cns11643-2 . #x224E) . (#xC9ED . #xCAF6)) + (?$(H"M(B . #xCAF7) + ((chinese-cns11643-2 . #x2439) . (#xCAF8 . #xD6CB)) + (?$(H>c(B . #xD6CC) + ((chinese-cns11643-2 . #x3770) . (#xD6CD . #xD779)) + (?$(H?j(B . #xD77A) + ((chinese-cns11643-2 . #x387E) . (#xD77B . #xDADE)) + (?$(H7o(B . #xDADF) + ((chinese-cns11643-2 . #x3E64) . (#xDAE0 . #xDBA6)) + ((chinese-cns11643-2 . #x3F6B) . (#xDBA7 . #xDDFB)) + (?$(HAv(B . #xDDFC);; a duplicate of #xDCD1 + ((chinese-cns11643-2 . #x4424) . (#xDDFD . #xE8A2)) + ((chinese-cns11643-2 . #x554C) . (#xE8A3 . #xE975)) + ((chinese-cns11643-2 . #x5723) . (#xE976 . #xEB5A)) + ((chinese-cns11643-2 . #x5A29) . (#xEB5B . #xEBF0)) + (?$(HUK(B . #xEBF1) + ((chinese-cns11643-2 . #x5B3F) . (#xEBF2 . #xECDD)) + (?$(HW"(B . #xECDE) + ((chinese-cns11643-2 . #x5C6A) . (#xECDF . #xEDA9)) + ((chinese-cns11643-2 . #x5D75) . (#xEDAA . #xEEEA)) + (?$(Hd/(B . #xEEEB) + ((chinese-cns11643-2 . #x6039) . (#xEEEC . #xF055)) + (?$(H]t(B . #xF056) + ((chinese-cns11643-2 . #x6243) . (#xF057 . #xF0CA)) + (?$(HZ((B . #xF0CB) + ((chinese-cns11643-2 . #x6337) . (#xF0CC . #xF162)) + ((chinese-cns11643-2 . #x6430) . (#xF163 . #xF16A)) + (?$(Hga(B . #xF16B) + ((chinese-cns11643-2 . #x6438) . (#xF16C . #xF267)) + (?$(Hi4(B . #xF268) + ((chinese-cns11643-2 . #x6573) . (#xF269 . #xF2C2)) + ((chinese-cns11643-2 . #x664E) . (#xF2C3 . #xF374)) + ((chinese-cns11643-2 . #x6762) . (#xF375 . #xF465)) + ((chinese-cns11643-2 . #x6935) . (#xF466 . #xF4B4)) + (?$(HfM(B . #xF4B5) + ((chinese-cns11643-2 . #x6962) . (#xF4B6 . #xF4FC)) + ((chinese-cns11643-2 . #x6A4C) . (#xF4FD . #xF662)) + (?$(HjK(B . #xF663) + ((chinese-cns11643-2 . #x6C52) . (#xF664 . #xF976)) + ((chinese-cns11643-2 . #x7167) . (#xF977 . #xF9C3)) + (?$(Hqf(B . #xF9C4) + (?$(Hr4(B . #xF9C5) + (?$(Hr@(B . #xF9C6) + ((chinese-cns11643-2 . #x7235) . (#xF9C7 . #xF9D1)) + ((chinese-cns11643-2 . #x7241) . (#xF9D2 . #xF9D5)) + + ;; Additional Ideographs + (?$(IC7(B . #xF9D6) + (?$(IOP(B . #xF9D7) + (?$(IDN(B . #xF9D8) + (?$(IPJ(B . #xF9D9) + (?$(I,](B . #xF9DA) + (?$(I=~(B . #xF9DB) + (?$(IK\(B . #xF9DC) + ) + ) +) + ;; (provide 'china-util)