comparison lisp/international/utf-8.el @ 48848:4eb835c1257d

(ucs-mule-cjk-to-unicode) (utf-subst-table-for-encode, ucs-unicode-to-mule-cjk) (utf-subst-table-for-decode): Specify :size, :rehash-size. (utf-translate-cjk): :set rewritten to load subst-... files. Add :set-after. (ccl-decode-mule-utf-8): Consider CJK translation for r3<#x3400.
author Dave Love <fx@gnu.org>
date Sun, 15 Dec 2002 16:46:00 +0000
parents e1b0e7a4859f
children d17c0d3e36ba
comparison
equal deleted inserted replaced
48847:49b559ee7ba4 48848:4eb835c1257d
45 ;; 45 ;;
46 ;; Fixme: note that reading and writing invalid utf-8 may not be 46 ;; Fixme: note that reading and writing invalid utf-8 may not be
47 ;; idempotent -- to represent the bytes to fix that needs a new charset. 47 ;; idempotent -- to represent the bytes to fix that needs a new charset.
48 ;; 48 ;;
49 ;; Characters from other character sets can be encoded with mule-utf-8 49 ;; Characters from other character sets can be encoded with mule-utf-8
50 ;; by populating the translation-table 50 ;; by populating the translation table
51 ;; `utf-translation-table-for-encode' and registering the translation 51 ;; `utf-translation-table-for-encode' and registering the translation
52 ;; with `register-char-codings'. Hash tables 52 ;; with `register-char-codings'. Hash tables
53 ;; `utf-subst-table-for-decode' and `utf-subst-table-for-encode' are 53 ;; `utf-subst-table-for-decode' and `utf-subst-table-for-encode' are
54 ;; used to support encoding and decoding of about a quarter of the CJK 54 ;; used to support encoding and decoding of about a quarter of the CJK
55 ;; space between U+3400 and U+DFFF. 55 ;; space between U+3400 and U+DFFF.
93 translation-table named `utf-translation-table-for-encode'") 93 translation-table named `utf-translation-table-for-encode'")
94 94
95 (define-translation-table 'utf-translation-table-for-decode) 95 (define-translation-table 'utf-translation-table-for-decode)
96 96
97 97
98 (defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq) 98 (defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq :size 43000
99 :rehash-size 1000)
99 "Hash table mapping Emacs CJK character sets to Unicode code points. 100 "Hash table mapping Emacs CJK character sets to Unicode code points.
100 101
101 If `utf-translate-cjk' is non-nil, this table populates the 102 If `utf-translate-cjk' is non-nil, this table populates the
102 translation-hash-table named `utf-subst-table-for-encode'.") 103 translation-hash-table named `utf-subst-table-for-encode'.")
103 104
104 (define-translation-hash-table 'utf-subst-table-for-encode 105 (define-translation-hash-table 'utf-subst-table-for-encode
105 (make-hash-table :test 'eq)) 106 (make-hash-table :test 'eq :size 43000 :rehash-size 1000))
106 107
107 (defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq) 108 (defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq :size 43000
109 :rehash-size 1000)
108 "Hash table mapping Unicode code points to Emacs CJK character sets. 110 "Hash table mapping Unicode code points to Emacs CJK character sets.
109 111
110 If `utf-translate-cjk' is non-nil, this table populates the 112 If `utf-translate-cjk' is non-nil, this table populates the
111 translation-hash-table named `utf-subst-table-for-decode'.") 113 translation-hash-table named `utf-subst-table-for-decode'.")
112 114
113 (define-translation-hash-table 'utf-subst-table-for-decode 115 (define-translation-hash-table 'utf-subst-table-for-decode
114 (make-hash-table :test 'eq)) 116 (make-hash-table :test 'eq :size 21500 :rehash-size 200))
115 117
116 (mapc 118 (mapc
117 (lambda (pair) 119 (lambda (pair)
118 (aset utf-fragmentation-table (car pair) (cdr pair)) 120 (aset utf-fragmentation-table (car pair) (cdr pair))
119 (aset utf-defragmentation-table (cdr pair) (car pair))) 121 (aset utf-defragmentation-table (cdr pair) (car pair)))
203 :type 'boolean 205 :type 'boolean
204 :group 'mule) 206 :group 'mule)
205 207
206 (defcustom utf-translate-cjk nil 208 (defcustom utf-translate-cjk nil
207 "Whether the UTF based coding systems should decode/encode CJK characters. 209 "Whether the UTF based coding systems should decode/encode CJK characters.
208 210 Enabling this loads tables which allow the coding systems mule-utf-8,
209 Enabling this loads tables which enable the coding systems: 211 mule-utf-16-le and mule-utf-16-be to encode characters in the charsets
210 mule-utf-8, mule-utf-16-le, mule-utf-16-be 212 `korean-ksc5601', `chinese-gb2312', `chinese-big5-1',
211 to encode characters in the charsets `korean-ksc5601', `chinese-gb2312' and 213 `chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to
212 `japanese-jisx0208', and to decode the corresponding unicodes into 214 decode the corresponding unicodes into such characters.
213 such characters. This works by loading the library `utf-8-subst'; see 215
214 its commentary. The tables are fairly large (about 33000 entries), so this 216 Where the charsets overlap, the one preferred for decoding is chosen
215 option is not the default." 217 according to the language environment in effect when this option is
216 :link '(emacs-commentary-link "utf-8-subst") 218 turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for
219 Chinese-BIG5 and jisx for other environments.
220
221 The tables are large (over 40000 entries), so this option is not the
222 default. Also, installing them may be rather slow."
217 :set (lambda (s v) 223 :set (lambda (s v)
218 (if v 224 (if v
219 (progn 225 (progn
220 (require 'utf-8-subst) 226 ;; Load the files explicitly, to avoid having to keep
227 ;; around the large tables they contain (as well as the
228 ;; ones which get built).
229 (cond
230 ((string= "Korean" current-language-environment)
231 (load "subst-jis")
232 (load "subst-big5")
233 (load "subst-gb2312")
234 (load "subst-ksc"))
235 ((string= "Chinese-BIG5" current-language-environment)
236 (load "subst-jis")
237 (load "subst-ksc")
238 (load "subst-gb2312")
239 (load "subst-big5"))
240 ((string= "Chinese-GB" current-language-environment)
241 (load "subst-jis")
242 (load "subst-ksc")
243 (load "subst-big5")
244 (load "subst-gb2312"))
245 (t
246 (load "subst-ksc")
247 (load "subst-gb2312")
248 (load "subst-big5")
249 (load "subst-jis"))) ; jis covers as much as big5, gb2312
221 (let ((table (make-char-table 'translation-table))) 250 (let ((table (make-char-table 'translation-table)))
222 (maphash (lambda (k v) 251 (maphash (lambda (k v)
223 (aset table k t)) 252 (aset table k t))
224 ucs-mule-cjk-to-unicode) 253 ucs-mule-cjk-to-unicode)
225 (register-char-codings 'mule-utf-8 table) 254 (register-char-codings 'mule-utf-8 table)
242 (define-translation-hash-table 'utf-subst-table-for-encode 271 (define-translation-hash-table 'utf-subst-table-for-encode
243 (make-hash-table :test 'eq))) 272 (make-hash-table :test 'eq)))
244 (set-default s v)) 273 (set-default s v))
245 :version "21.4" 274 :version "21.4"
246 :type 'boolean 275 :type 'boolean
276 :set-after '(current-language-environment)
247 :group 'mule) 277 :group 'mule)
248 278
249 (define-ccl-program ccl-decode-mule-utf-8 279 (define-ccl-program ccl-decode-mule-utf-8
250 ;; 280 ;;
251 ;; charset | bytes in utf-8 | bytes in emacs 281 ;; charset | bytes in utf-8 | bytes in emacs
376 (translate-character 406 (translate-character
377 utf-translation-table-for-decode r0 r1) 407 utf-translation-table-for-decode r0 r1)
378 (write-multibyte-character r0 r1)) 408 (write-multibyte-character r0 r1))
379 409
380 ;; mule-unicode-2500-33ff 410 ;; mule-unicode-2500-33ff
381 ;; Fixme: Perhaps allow translation via
382 ;; utf-subst-table-for-decode for #x2e80 up, so
383 ;; that we use consistent charsets for all of
384 ;; CJK. Would need corresponding change to
385 ;; encoding tables.
386 (if (r3 < #x3400) 411 (if (r3 < #x3400)
387 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) 412 ((r4 = r3) ; don't zap r3
388 (r3 -= #x2500) 413 (lookup-integer utf-subst-table-for-decode r4 r5)
389 (r3 //= 96) 414 (if r7
390 (r1 = (r7 + 32)) 415 ;; got a translation
391 (r1 += ((r3 + 32) << 7)) 416 ((write-multibyte-character r4 r5)
392 (write-multibyte-character r0 r1)) 417 ;; r5 was zapped through register starvation; reset it below.
418 (r5 = ,(charset-id 'eight-bit-control)))
419 ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
420 (r3 -= #x2500)
421 (r3 //= 96)
422 (r1 = (r7 + 32))
423 (r1 += ((r3 + 32) << 7))
424 (write-multibyte-character r0 r1))))
393 425
394 ;; U+3400 .. U+D7FF 426 ;; U+3400 .. U+D7FF
395 ;; Try to convert to CJK chars, else keep 427 ;; Try to convert to CJK chars, else keep
396 ;; them as eight-bit-{control|graphic}. 428 ;; them as eight-bit-{control|graphic}.
397 (if (r3 < #xd800) 429 (if (r3 < #xd800)