Mercurial > emacs
comparison lisp/international/utf-8.el @ 48848:4eb835c1257d
(ucs-mule-cjk-to-unicode)
(utf-subst-table-for-encode, ucs-unicode-to-mule-cjk)
(utf-subst-table-for-decode): Specify :size, :rehash-size.
(utf-translate-cjk): :set rewritten to load subst-... files. Add
:set-after.
(ccl-decode-mule-utf-8): Consider CJK translation for r3<#x3400.
author | Dave Love <fx@gnu.org> |
---|---|
date | Sun, 15 Dec 2002 16:46:00 +0000 |
parents | e1b0e7a4859f |
children | d17c0d3e36ba |
comparison legend:
- equal
- deleted
- inserted
- replaced
48847:49b559ee7ba4 | 48848:4eb835c1257d |
---|---|
45 ;; | 45 ;; |
46 ;; Fixme: note that reading and writing invalid utf-8 may not be | 46 ;; Fixme: note that reading and writing invalid utf-8 may not be |
47 ;; idempotent -- to represent the bytes to fix that needs a new charset. | 47 ;; idempotent -- to represent the bytes to fix that needs a new charset. |
48 ;; | 48 ;; |
49 ;; Characters from other character sets can be encoded with mule-utf-8 | 49 ;; Characters from other character sets can be encoded with mule-utf-8 |
50 ;; by populating the translation-table | 50 ;; by populating the translation table |
51 ;; `utf-translation-table-for-encode' and registering the translation | 51 ;; `utf-translation-table-for-encode' and registering the translation |
52 ;; with `register-char-codings'. Hash tables | 52 ;; with `register-char-codings'. Hash tables |
53 ;; `utf-subst-table-for-decode' and `utf-subst-table-for-encode' are | 53 ;; `utf-subst-table-for-decode' and `utf-subst-table-for-encode' are |
54 ;; used to support encoding and decoding of about a quarter of the CJK | 54 ;; used to support encoding and decoding of about a quarter of the CJK |
55 ;; space between U+3400 and U+DFFF. | 55 ;; space between U+3400 and U+DFFF. |
93 translation-table named `utf-translation-table-for-encode'") | 93 translation-table named `utf-translation-table-for-encode'") |
94 | 94 |
95 (define-translation-table 'utf-translation-table-for-decode) | 95 (define-translation-table 'utf-translation-table-for-decode) |
96 | 96 |
97 | 97 |
98 (defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq) | 98 (defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq :size 43000 |
99 :rehash-size 1000) | |
99 "Hash table mapping Emacs CJK character sets to Unicode code points. | 100 "Hash table mapping Emacs CJK character sets to Unicode code points. |
100 | 101 |
101 If `utf-translate-cjk' is non-nil, this table populates the | 102 If `utf-translate-cjk' is non-nil, this table populates the |
102 translation-hash-table named `utf-subst-table-for-encode'.") | 103 translation-hash-table named `utf-subst-table-for-encode'.") |
103 | 104 |
104 (define-translation-hash-table 'utf-subst-table-for-encode | 105 (define-translation-hash-table 'utf-subst-table-for-encode |
105 (make-hash-table :test 'eq)) | 106 (make-hash-table :test 'eq :size 43000 :rehash-size 1000)) |
106 | 107 |
107 (defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq) | 108 (defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq :size 43000 |
109 :rehash-size 1000) | |
108 "Hash table mapping Unicode code points to Emacs CJK character sets. | 110 "Hash table mapping Unicode code points to Emacs CJK character sets. |
109 | 111 |
110 If `utf-translate-cjk' is non-nil, this table populates the | 112 If `utf-translate-cjk' is non-nil, this table populates the |
111 translation-hash-table named `utf-subst-table-for-decode'.") | 113 translation-hash-table named `utf-subst-table-for-decode'.") |
112 | 114 |
113 (define-translation-hash-table 'utf-subst-table-for-decode | 115 (define-translation-hash-table 'utf-subst-table-for-decode |
114 (make-hash-table :test 'eq)) | 116 (make-hash-table :test 'eq :size 21500 :rehash-size 200)) |
115 | 117 |
116 (mapc | 118 (mapc |
117 (lambda (pair) | 119 (lambda (pair) |
118 (aset utf-fragmentation-table (car pair) (cdr pair)) | 120 (aset utf-fragmentation-table (car pair) (cdr pair)) |
119 (aset utf-defragmentation-table (cdr pair) (car pair))) | 121 (aset utf-defragmentation-table (cdr pair) (car pair))) |
203 :type 'boolean | 205 :type 'boolean |
204 :group 'mule) | 206 :group 'mule) |
205 | 207 |
206 (defcustom utf-translate-cjk nil | 208 (defcustom utf-translate-cjk nil |
207 "Whether the UTF based coding systems should decode/encode CJK characters. | 209 "Whether the UTF based coding systems should decode/encode CJK characters. |
208 | 210 Enabling this loads tables which allow the coding systems mule-utf-8, |
209 Enabling this loads tables which enable the coding systems: | 211 mule-utf-16-le and mule-utf-16-be to encode characters in the charsets |
210 mule-utf-8, mule-utf-16-le, mule-utf-16-be | 212 `korean-ksc5601', `chinese-gb2312', `chinese-big5-1', |
211 to encode characters in the charsets `korean-ksc5601', `chinese-gb2312' and | 213 `chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to |
212 `japanese-jisx0208', and to decode the corresponding unicodes into | 214 decode the corresponding unicodes into such characters. |
213 such characters. This works by loading the library `utf-8-subst'; see | 215 |
214 its commentary. The tables are fairly large (about 33000 entries), so this | 216 Where the charsets overlap, the one preferred for decoding is chosen |
215 option is not the default." | 217 according to the language environment in effect when this option is |
216 :link '(emacs-commentary-link "utf-8-subst") | 218 turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for |
219 Chinese-Big5 and jisx for other environments. | |
220 | |
221 The tables are large (over 40000 entries), so this option is not the | |
222 default. Also, installing them may be rather slow." | |
217 :set (lambda (s v) | 223 :set (lambda (s v) |
218 (if v | 224 (if v |
219 (progn | 225 (progn |
220 (require 'utf-8-subst) | 226 ;; Load the files explicitly, to avoid having to keep |
227 ;; around the large tables they contain (as well as the | |
228 ;; ones which get built). | |
229 (cond | |
230 ((string= "Korean" current-language-environment) | |
231 (load "subst-jis") | |
232 (load "subst-big5") | |
233 (load "subst-gb2312") | |
234 (load "subst-ksc")) | |
235 ((string= "Chinese-BIG5" current-language-environment) | |
236 (load "subst-jis") | |
237 (load "subst-ksc") | |
238 (load "subst-gb2312") | |
239 (load "subst-big5")) | |
240 ((string= "Chinese-GB" current-language-environment) | |
241 (load "subst-jis") | |
242 (load "subst-ksc") | |
243 (load "subst-big5") | |
244 (load "subst-gb2312")) | |
245 (t | |
246 (load "subst-ksc") | |
247 (load "subst-gb2312") | |
248 (load "subst-big5") | |
249 (load "subst-jis"))) ; jis covers as much as big5, gb2312 | |
221 (let ((table (make-char-table 'translation-table))) | 250 (let ((table (make-char-table 'translation-table))) |
222 (maphash (lambda (k v) | 251 (maphash (lambda (k v) |
223 (aset table k t)) | 252 (aset table k t)) |
224 ucs-mule-cjk-to-unicode) | 253 ucs-mule-cjk-to-unicode) |
225 (register-char-codings 'mule-utf-8 table) | 254 (register-char-codings 'mule-utf-8 table) |
242 (define-translation-hash-table 'utf-subst-table-for-encode | 271 (define-translation-hash-table 'utf-subst-table-for-encode |
243 (make-hash-table :test 'eq))) | 272 (make-hash-table :test 'eq))) |
244 (set-default s v)) | 273 (set-default s v)) |
245 :version "21.4" | 274 :version "21.4" |
246 :type 'boolean | 275 :type 'boolean |
276 :set-after '(current-language-environment) | |
247 :group 'mule) | 277 :group 'mule) |
248 | 278 |
249 (define-ccl-program ccl-decode-mule-utf-8 | 279 (define-ccl-program ccl-decode-mule-utf-8 |
250 ;; | 280 ;; |
251 ;; charset | bytes in utf-8 | bytes in emacs | 281 ;; charset | bytes in utf-8 | bytes in emacs |
376 (translate-character | 406 (translate-character |
377 utf-translation-table-for-decode r0 r1) | 407 utf-translation-table-for-decode r0 r1) |
378 (write-multibyte-character r0 r1)) | 408 (write-multibyte-character r0 r1)) |
379 | 409 |
380 ;; mule-unicode-2500-33ff | 410 ;; mule-unicode-2500-33ff |
381 ;; Fixme: Perhaps allow translation via | |
382 ;; utf-subst-table-for-decode for #x2e80 up, so | |
383 ;; that we use consistent charsets for all of | |
384 ;; CJK. Would need corresponding change to | |
385 ;; encoding tables. | |
386 (if (r3 < #x3400) | 411 (if (r3 < #x3400) |
387 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) | 412 ((r4 = r3) ; don't zap r3 |
388 (r3 -= #x2500) | 413 (lookup-integer utf-subst-table-for-decode r4 r5) |
389 (r3 //= 96) | 414 (if r7 |
390 (r1 = (r7 + 32)) | 415 ;; got a translation |
391 (r1 += ((r3 + 32) << 7)) | 416 ((write-multibyte-character r4 r5) |
392 (write-multibyte-character r0 r1)) | 417 ;; Zapped through register starvation. |
418 (r5 = ,(charset-id 'eight-bit-control))) | |
419 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) | |
420 (r3 -= #x2500) | |
421 (r3 //= 96) | |
422 (r1 = (r7 + 32)) | |
423 (r1 += ((r3 + 32) << 7)) | |
424 (write-multibyte-character r0 r1)))) | |
393 | 425 |
394 ;; U+3400 .. U+D7FF | 426 ;; U+3400 .. U+D7FF |
395 ;; Try to convert to CJK chars, else keep | 427 ;; Try to convert to CJK chars, else keep |
396 ;; them as eight-bit-{control|graphic}. | 428 ;; them as eight-bit-{control|graphic}. |
397 (if (r3 < #xd800) | 429 (if (r3 < #xd800) |