Mercurial > emacs
comparison lisp/language/china-util.el @ 42153:ca6dbe4635da
Implementing euc-tw encoding.
author | Werner LEMBERG <wl@gnu.org> |
---|---|
date | Tue, 18 Dec 2001 17:52:17 +0000 |
parents | 67b464da13ec |
children | bd6dedbdc53f d7ddb3e565de 0816a10eb8ac |
comparison
equal
deleted
inserted
replaced
42152:e3ae5ef41293 | 42153:ca6dbe4635da |
---|---|
1 ;;; china-util.el --- utilities for Chinese | 1 ;;; china-util.el --- utilities for Chinese -*- coding: iso-2022-7bit -*- |
2 | 2 |
3 ;; Copyright (C) 1995 Electrotechnical Laboratory, JAPAN. | 3 ;; Copyright (C) 1995 Electrotechnical Laboratory, JAPAN. |
4 ;; Licensed to the Free Software Foundation. | 4 ;; Licensed to the Free Software Foundation. |
5 ;; Copyright (C) 1995, 2001 Free Software Foundation, Inc. | |
5 | 6 |
6 ;; Keywords: mule, multilingual, Chinese | 7 ;; Keywords: mule, multilingual, Chinese |
7 | 8 |
8 ;; This file is part of GNU Emacs. | 9 ;; This file is part of GNU Emacs. |
9 | 10 |
24 | 25 |
25 ;;; Commentary: | 26 ;;; Commentary: |
26 | 27 |
27 ;;; Code: | 28 ;;; Code: |
28 | 29 |
29 ;; Hz/ZW encoding stuffs | 30 ;; Hz/ZW/EUC-TW encoding stuff |
30 | 31 |
31 ;; HZ is an encoding method for Chinese character set GB2312 used | 32 ;; HZ is an encoding method for Chinese character set GB2312 used |
32 ;; widely on the Internet. It is very similar to the 7-bit environment of | 33 ;; widely on the Internet. It is very similar to the 7-bit environment of |
33 ;; ISO-2022. The difference is that HZ uses the sequence "~{" and | 34 ;; ISO-2022. The difference is that HZ uses the sequence "~{" and |
34 ;; "~}" for designating GB2312 and ASCII respectively, hence, it | 35 ;; "~}" for designating GB2312 and ASCII respectively, hence, it |
35 ;; doesn't use the ESC (0x1B) code. | 36 ;; doesn't use the ESC (0x1B) code. |
36 | 37 |
37 ;; ZW is another encoding method for Chinese character set GB2312. It | 38 ;; ZW is another encoding method for Chinese character set GB2312. It |
38 ;; encodes Chinese characters line by line by starting each line with | 39 ;; encodes Chinese characters line by line by starting each line with |
39 ;; the sequence "zW". Like HZ, it uses only 7-bit codes. | 40 ;; the sequence "zW". Like HZ, it uses only 7-bit codes. |
41 | |
42 ;; EUC-TW is similar to EUC-KR or EUC-JP. Its main character set is | |
43 ;; plane 1 of CNS 11643; characters of planes 2 to 7 are accessed with | |
44 ;; a single shift escape followed by three bytes: the first gives the | |
45 ;; plane, the second and third the character code. Note that characters | |
46 ;; of plane 1 are (redundantly) accessible with a single shift escape | |
47 ;; also. | |
40 | 48 |
41 ;; ISO-2022 escape sequence to designate GB2312. | 49 ;; ISO-2022 escape sequence to designate GB2312. |
42 (defvar iso2022-gb-designation "\e$A") | 50 (defvar iso2022-gb-designation "\e$A") |
43 ;; HZ escape sequence to designate GB2312. | 51 ;; HZ escape sequence to designate GB2312. |
44 (defvar hz-gb-designnation "~{") | 52 (defvar hz-gb-designnation "~{") |
154 (defun encode-hz-buffer () | 162 (defun encode-hz-buffer () |
155 "Encode the text in the current buffer to HZ." | 163 "Encode the text in the current buffer to HZ." |
156 (interactive) | 164 (interactive) |
157 (encode-hz-region (point-min) (point-max))) | 165 (encode-hz-region (point-min) (point-max))) |
158 | 166 |
167 ;; The following sets up a translation table (big5-to-cns) from Big 5 | |
168 ;; to CNS encoding, using some auxiliary functions to make the code | |
169 ;; more readable. | |
170 | |
171 ;; Many kudos to Himi! The code used here has been adapted from his | |
172 ;; mule-ucs package. | |
173 | |
174 (defun big5-to-flat-code (num) | |
175 "Convert NUM in Big 5 encoding to a `flat code'. | |
176 0xA140 will be mapped to position 0, 0xA141 to position 1, etc. | |
177 There are no gaps in the flat code." | |
178 | |
179 (let ((hi (/ num 256)) | |
180 (lo (% num 256))) | |
181 (+ (* 157 (- hi #xa1)) | |
182 (- lo (if (>= lo #xa1) 98 64))))) | |
183 | |
184 (defun flat-code-to-big5 (num) | |
185 "Convert NUM from a `flat code' to Big 5 encoding. | |
186 This is the inverse function of `big5-to-flat-code'." | |
187 | |
188 (let ((hi (/ num 157)) | |
189 (lo (% num 157))) | |
190 (+ (* 256 (+ hi #xa1)) | |
191 (+ lo (if (< lo 63) 64 98))))) | |
192 | |
193 (defun euc-to-flat-code (num) | |
194 "Convert NUM in EUC encoding (in GL representation) to a `flat code'. | |
195 0x2121 will be mapped to position 0, 0x2122 to position 1, etc. | |
196 There are no gaps in the flat code." | |
197 | |
198 (let ((hi (/ num 256)) | |
199 (lo (% num 256))) | |
200 (+ (* 94 (- hi #x21)) | |
201 (- lo #x21)))) | |
202 | |
203 (defun flat-code-to-euc (num) | |
204 "Convert NUM from a `flat code' to EUC encoding (in GL representation). | |
205 The inverse function of `euc-to-flat-code'. The high and low bytes are | |
206 returned in a list." | |
207 | |
208 (let ((hi (/ num 94)) | |
209 (lo (% num 94))) | |
210 (list (+ hi #x21) (+ lo #x21)))) | |
211 | |
212 (defun expand-euc-big5-alist (alist) | |
213 "Create a translation table and fill it with data given in ALIST. | |
214 Elements of ALIST can be either given as | |
215 | |
216 ((euc-charset . startchar) . (big5-range-begin . big5-range-end)) | |
217 | |
218 or as | |
219 | |
220 (euc-character . big5-charcode) | |
221 | |
222 The former maps a range of glyphs in an EUC charset (where STARTCHAR | |
223 is in GL representation) to a certain range of Big 5 encoded | |
224 characters, the latter maps a single glyph. Glyphs which can't be | |
225 mapped will be represented with the byte 0xFF. | |
226 | |
227 The return value is the filled translation table." | |
228 | |
229 (let (chartable | |
230 elem | |
231 result | |
232 char | |
233 big5 | |
234 i | |
235 end | |
236 codepoint | |
237 charset) | |
238 (setq chartable (make-char-table 'translation-table #xFF)) | |
239 (while alist | |
240 (setq elem (car alist) | |
241 char (car elem) | |
242 big5 (cdr elem) | |
243 alist (cdr alist)) | |
244 (cond ((and (consp char) | |
245 (consp big5)) | |
246 (setq i (big5-to-flat-code (car big5)) | |
247 end (big5-to-flat-code (cdr big5)) | |
248 codepoint (euc-to-flat-code (cdr char)) | |
249 charset (car char)) | |
250 (while (>= end i) | |
251 (aset chartable | |
252 (decode-big5-char (flat-code-to-big5 i)) | |
253 (apply (function make-char) | |
254 charset | |
255 (flat-code-to-euc codepoint))) | |
256 (setq i (1+ i) | |
257 codepoint (1+ codepoint))) | |
258 ) | |
259 ((and (char-valid-p char) | |
260 (numberp big5)) | |
261 (setq i (decode-big5-char big5)) | |
262 (aset chartable i char) | |
263 ) | |
264 (t | |
265 (error "Unknown slot type: %S" elem) | |
266 ) | |
267 ) | |
268 ) | |
269 ;; the return value | |
270 chartable | |
271 ) | |
272 ) | |
273 | |
274 ;; All non-CNS encodings are commented out. | |
275 | |
276 (define-translation-table 'big5-to-cns | |
277 (expand-euc-big5-alist | |
278 '( | |
279 ;; Symbols | |
280 ((chinese-cns11643-1 . #x2121) . (#xA140 . #xA1F5)) | |
281 (?$(G"X(B . #xA1F6) | |
282 (?$(G"W(B . #xA1F7) | |
283 ((chinese-cns11643-1 . #x2259) . (#xA1F8 . #xA2AE)) | |
284 ((chinese-cns11643-1 . #x2421) . (#xA2AF . #xA3BF)) | |
285 ;; Control codes (vendor dependent) | |
286 ((chinese-cns11643-1 . #x4221) . (#xA3C0 . #xA3E0)) | |
287 ;; Level 1 Ideographs | |
288 ((chinese-cns11643-1 . #x4421) . (#xA440 . #xACFD)) | |
289 (?$(GWS(B . #xACFE) | |
290 ((chinese-cns11643-1 . #x5323) . (#xAD40 . #xAFCF)) | |
291 ((chinese-cns11643-1 . #x5754) . (#xAFD0 . #xBBC7)) | |
292 ((chinese-cns11643-1 . #x6B51) . (#xBBC8 . #xBE51)) | |
293 (?$(GkP(B . #xBE52) | |
294 ((chinese-cns11643-1 . #x6F5C) . (#xBE53 . #xC1AA)) | |
295 ((chinese-cns11643-1 . #x7536) . (#xC1AB . #xC2CA)) | |
296 (?$(Gu5(B . #xC2CB) | |
297 ((chinese-cns11643-1 . #x7737) . (#xC2CC . #xC360)) | |
298 ((chinese-cns11643-1 . #x782E) . (#xC361 . #xC3B8)) | |
299 (?$(Gxe(B . #xC3B9) | |
300 (?$(Gxd(B . #xC3BA) | |
301 ((chinese-cns11643-1 . #x7866) . (#xC3BB . #xC455)) | |
302 (?$(Gx-(B . #xC456) | |
303 ((chinese-cns11643-1 . #x7962) . (#xC457 . #xC67E)) | |
304 ;; Symbols | |
305 ((chinese-cns11643-1 . #x2621) . (#xC6A1 . #xC6BE)) | |
306 ;; Radicals | |
307 (?$(G'#(B . #xC6BF) | |
308 (?$(G'$(B . #xC6C0) | |
309 (?$(G'&(B . #xC6C1) | |
310 (?$(G'((B . #xC6C2) | |
311 (?$(G'-(B . #xC6C3) | |
312 (?$(G'.(B . #xC6C4) | |
313 (?$(G'/(B . #xC6C5) | |
314 (?$(G'4(B . #xC6C6) | |
315 (?$(G'7(B . #xC6C7) | |
316 (?$(G':(B . #xC6C8) | |
317 (?$(G'<(B . #xC6C9) | |
318 (?$(G'B(B . #xC6CA) | |
319 (?$(G'G(B . #xC6CB) | |
320 (?$(G'N(B . #xC6CC) | |
321 (?$(G'S(B . #xC6CD) | |
322 (?$(G'T(B . #xC6CE) | |
323 (?$(G'U(B . #xC6CF) | |
324 (?$(G'Y(B . #xC6D0) | |
325 (?$(G'Z(B . #xC6D1) | |
326 (?$(G'a(B . #xC6D2) | |
327 (?$(G'f(B . #xC6D3) | |
328 (?$(G()(B . #xC6D4) | |
329 (?$(G(*(B . #xC6D5) | |
330 (?$(G(c(B . #xC6D6) | |
331 (?$(G(l(B . #xC6D7) | |
332 ;; Diacritical Marks | |
333 ; ((japanese-jisx0208 . #x212F) . (#xC6D8 . #xC6D9)) | |
334 ;; Japanese Kana Supplement | |
335 ; ((japanese-jisx0208 . #x2133) . (#xC6DA . #xC6E3)) | |
336 ;; Japanese Hiragana | |
337 ; ((japanese-jisx0208 . #x2421) . (#xC6E7 . #xC77A)) | |
338 ;; Japanese Katakana | |
339 ; ((japanese-jisx0208 . #x2521) . (#xC77B . #xC7F2)) | |
340 ;; Cyrillic Characters | |
341 ; ((japanese-jisx0208 . #x2721) . (#xC7F3 . #xC854)) | |
342 ; ((japanese-jisx0208 . #x2751) . (#xC855 . #xC875)) | |
343 ;; Special Chinese Characters | |
344 (?$(J!#(B . #xC879) | |
345 (?$(J!$(B . #xC87B) | |
346 (?$(J!*(B . #xC87D) | |
347 (?$(J!R(B . #xC8A2) | |
348 | |
349 ;; JIS X 0208 NOT SIGN (cf. U+00AC) | |
350 ; (?$B"L(B . #xC8CD) | |
351 ;; JIS X 0212 BROKEN BAR (cf. U+00A6) | |
352 ; (?$(D"C(B . #xC8CE) | |
353 | |
354 ;; GB 2312 characters | |
355 ; (?$A!d(B . #xC8CF) | |
356 ; (?$A!e(B . #xC8D0) | |
357 ;;;;; C8D1 - Japanese `($B3t(B)' | |
358 ; (?$A!m(B . #xC8D2) | |
359 ;;;;; C8D2 - Tel. | |
360 | |
361 ;; Level 2 Ideographs | |
362 ((chinese-cns11643-2 . #x2121) . (#xC940 . #xC949)) | |
363 (?$(GDB(B . #xC94A);; a duplicate of #xA461 | |
364 ((chinese-cns11643-2 . #x212B) . (#xC94B . #xC96B)) | |
365 ((chinese-cns11643-2 . #x214D) . (#xC96C . #xC9BD)) | |
366 (?$(H!L(B . #xC9BE) | |
367 ((chinese-cns11643-2 . #x217D) . (#xC9BF . #xC9EC)) | |
368 ((chinese-cns11643-2 . #x224E) . (#xC9ED . #xCAF6)) | |
369 (?$(H"M(B . #xCAF7) | |
370 ((chinese-cns11643-2 . #x2439) . (#xCAF8 . #xD6CB)) | |
371 (?$(H>c(B . #xD6CC) | |
372 ((chinese-cns11643-2 . #x3770) . (#xD6CD . #xD779)) | |
373 (?$(H?j(B . #xD77A) | |
374 ((chinese-cns11643-2 . #x387E) . (#xD77B . #xDADE)) | |
375 (?$(H7o(B . #xDADF) | |
376 ((chinese-cns11643-2 . #x3E64) . (#xDAE0 . #xDBA6)) | |
377 ((chinese-cns11643-2 . #x3F6B) . (#xDBA7 . #xDDFB)) | |
378 (?$(HAv(B . #xDDFC);; a duplicate of #xDCD1 | |
379 ((chinese-cns11643-2 . #x4424) . (#xDDFD . #xE8A2)) | |
380 ((chinese-cns11643-2 . #x554C) . (#xE8A3 . #xE975)) | |
381 ((chinese-cns11643-2 . #x5723) . (#xE976 . #xEB5A)) | |
382 ((chinese-cns11643-2 . #x5A29) . (#xEB5B . #xEBF0)) | |
383 (?$(HUK(B . #xEBF1) | |
384 ((chinese-cns11643-2 . #x5B3F) . (#xEBF2 . #xECDD)) | |
385 (?$(HW"(B . #xECDE) | |
386 ((chinese-cns11643-2 . #x5C6A) . (#xECDF . #xEDA9)) | |
387 ((chinese-cns11643-2 . #x5D75) . (#xEDAA . #xEEEA)) | |
388 (?$(Hd/(B . #xEEEB) | |
389 ((chinese-cns11643-2 . #x6039) . (#xEEEC . #xF055)) | |
390 (?$(H]t(B . #xF056) | |
391 ((chinese-cns11643-2 . #x6243) . (#xF057 . #xF0CA)) | |
392 (?$(HZ((B . #xF0CB) | |
393 ((chinese-cns11643-2 . #x6337) . (#xF0CC . #xF162)) | |
394 ((chinese-cns11643-2 . #x6430) . (#xF163 . #xF16A)) | |
395 (?$(Hga(B . #xF16B) | |
396 ((chinese-cns11643-2 . #x6438) . (#xF16C . #xF267)) | |
397 (?$(Hi4(B . #xF268) | |
398 ((chinese-cns11643-2 . #x6573) . (#xF269 . #xF2C2)) | |
399 ((chinese-cns11643-2 . #x664E) . (#xF2C3 . #xF374)) | |
400 ((chinese-cns11643-2 . #x6762) . (#xF375 . #xF465)) | |
401 ((chinese-cns11643-2 . #x6935) . (#xF466 . #xF4B4)) | |
402 (?$(HfM(B . #xF4B5) | |
403 ((chinese-cns11643-2 . #x6962) . (#xF4B6 . #xF4FC)) | |
404 ((chinese-cns11643-2 . #x6A4C) . (#xF4FD . #xF662)) | |
405 (?$(HjK(B . #xF663) | |
406 ((chinese-cns11643-2 . #x6C52) . (#xF664 . #xF976)) | |
407 ((chinese-cns11643-2 . #x7167) . (#xF977 . #xF9C3)) | |
408 (?$(Hqf(B . #xF9C4) | |
409 (?$(Hr4(B . #xF9C5) | |
410 (?$(Hr@(B . #xF9C6) | |
411 ((chinese-cns11643-2 . #x7235) . (#xF9C7 . #xF9D1)) | |
412 ((chinese-cns11643-2 . #x7241) . (#xF9D2 . #xF9D5)) | |
413 | |
414 ;; Additional Ideographs | |
415 (?$(IC7(B . #xF9D6) | |
416 (?$(IOP(B . #xF9D7) | |
417 (?$(IDN(B . #xF9D8) | |
418 (?$(IPJ(B . #xF9D9) | |
419 (?$(I,](B . #xF9DA) | |
420 (?$(I=~(B . #xF9DB) | |
421 (?$(IK\(B . #xF9DC) | |
422 ) | |
423 ) | |
424 ) | |
425 | |
159 ;; | 426 ;; |
160 (provide 'china-util) | 427 (provide 'china-util) |
161 | 428 |
162 ;;; china-util.el ends here | 429 ;;; china-util.el ends here |