comparison lisp/international/utf-8.el @ 47703:6d4430dfeafc

(ucs-mule-to-mule-unicode): Don't define this translation-table name here. (utf-translation-table-for-encode): New translation-table name. (utf-fragmentation-table): Renamed from utf-8-fragmentation-table. (utf-defragmentation-table): New variable. (ucs-mule-cjk-to-unicode): Renamed from utf-8-subst-rev-table. (utf-subst-table-for-encode): New translation-table name. (ucs-unicode-to-mule-cjk): Renamed from utf-8-subst-table. (utf-subst-table-for-decode): New translation-table name. (utf-fragment-on-decoding): Renamed from utf-8-fragment-on-decoding. Correctly handle the case that unify-8859-on-encoding-mode is off. Handle mule-utf-16-le and mule-utf-16-be too. (utf-translate-cjk): Renamed from utf-8-translate-cjk. Handle mule-utf-16-le and mule-utf-16-be too. (ccl-decode-mule-utf-8): Refer to utf-translation-table-for-decode and utf-subst-table-for-decode. (ccl-encode-mule-utf-8): Refer to utf-translation-table-for-encode and utf-subst-table-for-encode. (mule-utf-8): Fix `safe-charsets' property, put `dependency' property.
author Kenichi Handa <handa@m17n.org>
date Mon, 30 Sep 2002 06:35:13 +0000
parents 63f5cc467cea
children e1b0e7a4859f
comparison
equal deleted inserted replaced
47702:e0786a68f34e 47703:6d4430dfeafc
44 ;; unicode. 44 ;; unicode.
45 ;; 45 ;;
46 ;; Fixme: note that reading and writing invalid utf-8 may not be 46 ;; Fixme: note that reading and writing invalid utf-8 may not be
47 ;; idempotent -- fixing that needs a new charset to represent the bytes. 47 ;; idempotent -- fixing that needs a new charset to represent the bytes.
48 ;; 48 ;;
49 ;; Characters from other character sets can be encoded with 49 ;; Characters from other character sets can be encoded with mule-utf-8
50 ;; mule-utf-8 by populating the table `ucs-mule-to-mule-unicode' and 50 ;; by populating the translation-table
51 ;; registering the translation with `register-char-codings'. Hash 51 ;; `utf-translation-table-for-encode' and registering the translation
52 ;; tables `utf-8-subst-table' and `utf-8-subst-rev-table' are used to 52 ;; with `register-char-codings'. Hash tables
53 ;; support encoding and decoding of about a quarter of the CJK space 53 ;; `utf-subst-table-for-decode' and `utf-subst-table-for-encode' are
54 ;; between U+3400 and U+DFFF. 54 ;; used to support encoding and decoding of about a quarter of the CJK
55 ;; space between U+3400 and U+DFFF.
55 56
56 ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is: 57 ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is:
57 58
58 ;; scalar | utf-8 59 ;; scalar | utf-8
59 ;; value | 1st byte | 2nd byte | 3rd byte 60 ;; value | 1st byte | 2nd byte | 3rd byte
62 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx | 63 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx |
63 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx 64 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx
64 65
65 ;;; Code: 66 ;;; Code:
66 67
67 (defvar ucs-mule-to-mule-unicode (make-translation-table) 68 (defvar ucs-mule-to-mule-unicode (make-char-table 'translation-table nil)
68 "Translation table for encoding to `mule-utf-8'.") 69 "Char table mapping characters to latin-iso8859-1 or mule-unicode-*.
69 (define-translation-table 'ucs-mule-to-mule-unicode 70
70 ucs-mule-to-mule-unicode) 71 If `unify-8859-on-encoding-mode' is non-nil, this table populates the
71 72 translation-table named `utf-translation-table-for-encode'.")
72 (defvar utf-8-subst-table (make-hash-table :test 'eq)) 73
73 (defvar utf-8-subst-rev-table (make-hash-table :test 'eq)) 74 (define-translation-table 'utf-translation-table-for-encode)
74 (define-translation-hash-table 'utf-8-subst-table utf-8-subst-table) 75
75 (define-translation-hash-table 'utf-8-subst-rev-table utf-8-subst-rev-table)
76
77 (defvar utf-8-translation-table-for-decode (make-translation-table)
78 "Translation table applied after decoding utf-8 to mule-unicode.
79 This is only actually applied to characters which would normally be
80 decoded into mule-unicode-0100-24ff.")
81 (define-translation-table 'utf-8-translation-table-for-decode
82 utf-8-translation-table-for-decode)
83 76
84 ;; Map Cyrillic and Greek to iso-8859 charsets, which take half the 77 ;; Map Cyrillic and Greek to iso-8859 charsets, which take half the
85 ;; space of mule-unicode. For Latin scripts this isn't very 78 ;; space of mule-unicode. For Latin scripts this isn't very
86 ;; important. Hebrew and Arabic might go here too when there's proper 79 ;; important. Hebrew and Arabic might go here too when there's proper
87 ;; support for them. 80 ;; support for them.
88 (defvar utf-8-fragmentation-table (make-translation-table) 81
89 "Char table normally mapping non-Latin mule-unicode-... characters to iso8859. 82 (defvar utf-fragmentation-table (make-char-table 'translation-table nil)
90 Used as the value of `utf-8-translation-table-for-decode' in 83 "Char-table normally mapping non-Latin mule-unicode-* chars to iso-8859-*.
91 `utf-8-fragment-on-decoding' mode.") 84
85 If `utf-fragment-on-decoding' is non-nil, this table populates the
86 translation-table named `utf-translation-table-for-decode'.")
87
88 (defvar utf-defragmentation-table (make-char-table 'translation-table nil)
89 "Char-table for reverse mapping of `utf-fragmentation-table'.
90
91 If `utf-fragment-on-decoding' is non-nil and
92 `unify-8859-on-encoding-mode' is nil, this table populates the
93 translation-table named `utf-translation-table-for-encode'.")
94
95 (define-translation-table 'utf-translation-table-for-decode)
96
97
98 (defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq)
99 "Hash table mapping Emacs CJK character sets to Unicode code points.
100
101 If `utf-translate-cjk' is non-nil, this table populates the
102 translation-hash-table named `utf-subst-table-for-encode'.")
103
104 (define-translation-hash-table 'utf-subst-table-for-encode
105 (make-hash-table :test 'eq))
106
107 (defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq)
108 "Hash table mapping Unicode code points to Emacs CJK character sets.
109
110 If `utf-translate-cjk' is non-nil, this table populates the
111 translation-hash-table named `utf-subst-table-for-decode'.")
112
113 (define-translation-hash-table 'utf-subst-table-for-decode
114 (make-hash-table :test 'eq))
115
92 (mapc 116 (mapc
93 (lambda (pair) 117 (lambda (pair)
94 (aset utf-8-fragmentation-table (car pair) (cdr pair))) 118 (aset utf-fragmentation-table (car pair) (cdr pair))
119 (aset utf-defragmentation-table (cdr pair) (car pair)))
95 '((?$,1&d(B . ?,F4(B) (?$,1&e(B . ?,F5(B) (?$,1&f(B . ?,F6(B) (?$,1&h(B . ?,F8(B) (?$,1&i(B . ?,F9(B) 120 '((?$,1&d(B . ?,F4(B) (?$,1&e(B . ?,F5(B) (?$,1&f(B . ?,F6(B) (?$,1&h(B . ?,F8(B) (?$,1&i(B . ?,F9(B)
96 (?$,1&j(B . ?,F:(B) (?$,1&l(B . ?,F<(B) (?$,1&n(B . ?,F>(B) (?$,1&o(B . ?,F?(B) (?$,1&p(B . ?,F@(B) 121 (?$,1&j(B . ?,F:(B) (?$,1&l(B . ?,F<(B) (?$,1&n(B . ?,F>(B) (?$,1&o(B . ?,F?(B) (?$,1&p(B . ?,F@(B)
97 (?$,1&q(B . ?,FA(B) (?$,1&r(B . ?,FB(B) (?$,1&s(B . ?,FC(B) (?$,1&t(B . ?,FD(B) (?$,1&u(B . ?,FE(B) 122 (?$,1&q(B . ?,FA(B) (?$,1&r(B . ?,FB(B) (?$,1&s(B . ?,FC(B) (?$,1&t(B . ?,FD(B) (?$,1&u(B . ?,FE(B)
98 (?$,1&v(B . ?,FF(B) (?$,1&w(B . ?,FG(B) (?$,1&x(B . ?,FH(B) (?$,1&y(B . ?,FI(B) (?$,1&z(B . ?,FJ(B) 123 (?$,1&v(B . ?,FF(B) (?$,1&w(B . ?,FG(B) (?$,1&x(B . ?,FH(B) (?$,1&y(B . ?,FI(B) (?$,1&z(B . ?,FJ(B)
99 (?$,1&{(B . ?,FK(B) (?$,1&|(B . ?,FL(B) (?$,1&}(B . ?,FM(B) (?$,1&~(B . ?,FN(B) (?$,1&(B . ?,FO(B) 124 (?$,1&{(B . ?,FK(B) (?$,1&|(B . ?,FL(B) (?$,1&}(B . ?,FM(B) (?$,1&~(B . ?,FN(B) (?$,1&(B . ?,FO(B)
126 (?$,1(l(B . ?,Ll(B) (?$,1(m(B . ?,Lm(B) (?$,1(n(B . ?,Ln(B) (?$,1(o(B . ?,Lo(B) (?$,1(q(B . ?,Lq(B) 151 (?$,1(l(B . ?,Ll(B) (?$,1(m(B . ?,Lm(B) (?$,1(n(B . ?,Ln(B) (?$,1(o(B . ?,Lo(B) (?$,1(q(B . ?,Lq(B)
127 (?$,1(r(B . ?,Lr(B) (?$,1(s(B . ?,Ls(B) (?$,1(t(B . ?,Lt(B) (?$,1(u(B . ?,Lu(B) (?$,1(v(B . ?,Lv(B) 152 (?$,1(r(B . ?,Lr(B) (?$,1(s(B . ?,Ls(B) (?$,1(t(B . ?,Lt(B) (?$,1(u(B . ?,Lu(B) (?$,1(v(B . ?,Lv(B)
128 (?$,1(w(B . ?,Lw(B) (?$,1(x(B . ?,Lx(B) (?$,1(y(B . ?,Ly(B) (?$,1(z(B . ?,Lz(B) (?$,1({(B . ?,L{(B) 153 (?$,1(w(B . ?,Lw(B) (?$,1(x(B . ?,Lx(B) (?$,1(y(B . ?,Ly(B) (?$,1(z(B . ?,Lz(B) (?$,1({(B . ?,L{(B)
129 (?$,1(|(B . ?,L|(B) (?$,1(~(B . ?,L~(B) (?$,1((B . ?,L(B))) 154 (?$,1(|(B . ?,L|(B) (?$,1(~(B . ?,L~(B) (?$,1((B . ?,L(B)))
130 155
131 (defcustom utf-8-fragment-on-decoding nil 156
132 "Whether or not to decode some scripts in UTF-8 text into iso8859 charsets. 157 (defcustom utf-fragment-on-decoding nil
158 "Whether or not to decode some chars in UTF-8/16 text into iso8859 charsets.
133 Setting this means that the relevant Cyrillic and Greek characters are 159 Setting this means that the relevant Cyrillic and Greek characters are
134 decoded into the iso8859 charsets rather than into 160 decoded into the iso8859 charsets rather than into
135 mule-unicode-0100-24ff. The iso8859 charsets take half as much space 161 mule-unicode-0100-24ff. The iso8859 charsets take half as much space
136 in the buffer, but using them may affect how the buffer can be re-encoded 162 in the buffer, but using them may affect how the buffer can be re-encoded
137 and may require a different input method to search for them, for instance. 163 and may require a different input method to search for them, for instance.
138 See `unify-8859-on-decoding-mode' and `unify-8859-on-encoding-mode' 164 See `unify-8859-on-decoding-mode' and `unify-8859-on-encoding-mode'
139 for mechanisms to make this largely transparent. 165 for mechanisms to make this largely transparent.
140 166
141 Setting this variable outside customize has no effect." 167 Setting this variable outside customize has no effect."
142 :set (lambda (s v) 168 :set (lambda (s v)
143 (setq utf-8-translation-table-for-decode 169 (if v
144 (if v 170 (progn
145 utf-8-fragmentation-table 171 (define-translation-table 'utf-translation-table-for-decode
146 (make-translation-table))) 172 utf-fragmentation-table)
147 (define-translation-table 'utf-8-translation-table-for-decode 173 ;; Even if unify-8859-on-encoding-mode is off, make
148 utf-8-translation-table-for-decode) 174 ;; mule-utf-* encode characters in
175 ;; utf-fragmentation-table.
176 (unless (eq (get 'utf-translation-table-for-encode
177 'translation-table)
178 ucs-mule-to-mule-unicode)
179 (define-translation-table 'utf-translation-table-for-encode
180 utf-defragmentation-table)
181 (dolist (coding '(mule-utf-8 mule-utf-16-be mule-utf-16-le))
182 (register-char-codings coding utf-defragmentation-table))))
183 (define-translation-table 'utf-translation-table-for-decode)
184 ;; When unify-8859-on-encoding-mode is off, be sure to make
185 ;; mule-utf-* disabled for characters in
186 ;; utf-fragmentation-table.
187 (unless (eq (get 'utf-translation-table-for-encode
188 'translation-table)
189 ucs-mule-to-mule-unicode)
190 (define-translation-table 'utf-translation-table-for-encode)
191 (map-char-table
192 (lambda (key val)
193 (if (and (>= key 128) val)
194 (aset char-coding-system-table key
195 (delq 'mule-utf-8
196 (delq 'mule-utf-16-le
197 (delq 'mule-utf-16-be
198 (aref char-coding-system-table
199 key)))))))
200 utf-defragmentation-table)))
149 (set-default s v)) 201 (set-default s v))
150 :version "21.4" 202 :version "21.4"
151 :type 'boolean 203 :type 'boolean
152 :group 'mule) 204 :group 'mule)
153 205
154 (defcustom utf-8-translate-cjk nil 206 (defcustom utf-translate-cjk nil
155 "Whether the `mule-utf-8' coding system should encode many CJK characters. 207 "Whether the UTF based coding systems should decode/encode CJK characters.
156 208
157 Enabling this loads tables which enable the coding system to encode 209 Enabling this loads tables which enable the coding systems:
158 characters in the charsets `korean-ksc5601', `chinese-gb2312' and 210 mule-utf-8, mule-utf-16-le, mule-utf-16-be
211 to encode characters in the charsets `korean-ksc5601', `chinese-gb2312' and
159 `japanese-jisx0208', and to decode the corresponding unicodes into 212 `japanese-jisx0208', and to decode the corresponding unicodes into
160 such characters. This works by loading the library `utf-8-subst'; see 213 such characters. This works by loading the library `utf-8-subst'; see
161 its commentary. The tables are fairly large (about 33000 entries), so this 214 its commentary. The tables are fairly large (about 33000 entries), so this
162 option is not the default." 215 option is not the default."
163 :link '(emacs-commentary-link "utf-8-subst") 216 :link '(emacs-commentary-link "utf-8-subst")
164 :set (lambda (s v) 217 :set (lambda (s v)
165 (when v 218 (if v
166 (require 'utf-8-subst) 219 (progn
167 (let ((table (make-char-table 'translation-table))) 220 (require 'utf-8-subst)
168 (coding-system-put 'mule-utf-8 'safe-charsets 221 (let ((table (make-char-table 'translation-table)))
169 (append (coding-system-get 'mule-utf-8 222 (maphash (lambda (k v)
170 'safe-charsets) 223 (aset table k t))
171 '(korean-ksc5601 chinese-gb2312 224 ucs-mule-cjk-to-unicode)
172 japanese-jisx0208))) 225 (register-char-codings 'mule-utf-8 table)
173 (maphash (lambda (k v) 226 (register-char-codings 'mule-utf-16-le table)
174 (aset table k v)) 227 (register-char-codings 'mule-utf-16-be table))
175 utf-8-subst-rev-table) 228 (define-translation-hash-table 'utf-subst-table-for-decode
176 (register-char-codings 'mule-utf-8 table))) 229 ucs-unicode-to-mule-cjk)
230 (define-translation-hash-table 'utf-subst-table-for-encode
231 ucs-mule-cjk-to-unicode))
232 (map-char-table
233 (lambda (k v)
234 (if (gethash k ucs-mule-cjk-to-unicode)
235 (aset char-coding-system-table k
236 (delq 'mule-utf-8
237 (delq 'mule-utf-16-le
238 (delq 'mule-utf-16-be v))))))
239 char-coding-system-table)
240 (define-translation-hash-table 'utf-subst-table-for-decode
241 (make-hash-table :test 'eq))
242 (define-translation-hash-table 'utf-subst-table-for-encode
243 (make-hash-table :test 'eq)))
177 (set-default s v)) 244 (set-default s v))
178 :version "21.4" 245 :version "21.4"
179 :type 'boolean 246 :type 'boolean
180 :group 'mule) 247 :group 'mule)
181 248
261 (r1 -= #x0100) 328 (r1 -= #x0100)
262 (r2 = (((r1 / 96) + 32) << 7)) 329 (r2 = (((r1 / 96) + 32) << 7))
263 (r1 %= 96) 330 (r1 %= 96)
264 (r1 += (r2 + 32)) 331 (r1 += (r2 + 32))
265 (translate-character 332 (translate-character
266 utf-8-translation-table-for-decode r0 r1) 333 utf-translation-table-for-decode r0 r1)
267 (write-multibyte-character r0 r1)))))))) 334 (write-multibyte-character r0 r1))))))))
268 335
269 ;; 3byte encoding 336 ;; 3byte encoding
270 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx 337 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
271 (if (r0 < #xf0) 338 (if (r0 < #xf0)
306 (r3 -= #x0100) 373 (r3 -= #x0100)
307 (r3 //= 96) 374 (r3 //= 96)
308 (r1 = (r7 + 32)) 375 (r1 = (r7 + 32))
309 (r1 += ((r3 + 32) << 7)) 376 (r1 += ((r3 + 32) << 7))
310 (translate-character 377 (translate-character
311 utf-8-translation-table-for-decode r0 r1) 378 utf-translation-table-for-decode r0 r1)
312 (write-multibyte-character r0 r1)) 379 (write-multibyte-character r0 r1))
313 380
314 ;; mule-unicode-2500-33ff 381 ;; mule-unicode-2500-33ff
315 ;; Fixme: Perhaps allow translation via 382 ;; Fixme: Perhaps allow translation via
316 ;; utf-8-subst-table for #x2e80 up, so that we use 383 ;; utf-subst-table-for-decode for #x2e80 up, so
317 ;; consistent charsets for all of CJK. Would need 384 ;; that we use consistent charsets for all of
318 ;; corresponding change to encoding tables. 385 ;; CJK. Would need corresponding change to
386 ;; encoding tables.
319 (if (r3 < #x3400) 387 (if (r3 < #x3400)
320 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) 388 ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
321 (r3 -= #x2500) 389 (r3 -= #x2500)
322 (r3 //= 96) 390 (r3 //= 96)
323 (r1 = (r7 + 32)) 391 (r1 = (r7 + 32))
327 ;; U+3400 .. U+D7FF 395 ;; U+3400 .. U+D7FF
328 ;; Try to convert to CJK chars, else keep 396 ;; Try to convert to CJK chars, else keep
329 ;; them as eight-bit-{control|graphic}. 397 ;; them as eight-bit-{control|graphic}.
330 (if (r3 < #xd800) 398 (if (r3 < #xd800)
331 ((r4 = r3) ; don't zap r3 399 ((r4 = r3) ; don't zap r3
332 (lookup-integer utf-8-subst-table r4 r5) 400 (lookup-integer utf-subst-table-for-decode r4 r5)
333 (if r7 401 (if r7
334 ;; got a translation 402 ;; got a translation
335 ((write-multibyte-character r4 r5) 403 ((write-multibyte-character r4 r5)
336 ;; Zapped through register starvation. 404 ;; Zapped through register starvation.
337 (r5 = ,(charset-id 'eight-bit-control))) 405 (r5 = ,(charset-id 'eight-bit-control)))
368 (write-multibyte-character r0 r1))))))))) 436 (write-multibyte-character r0 r1)))))))))
369 437
370 (if (r0 < #xfe) 438 (if (r0 < #xfe)
371 ;; 4byte encoding 439 ;; 4byte encoding
372 ;; keep those bytes as eight-bit-{control|graphic} 440 ;; keep those bytes as eight-bit-{control|graphic}
373 ;; Fixme: allow lookup in utf-8-subst-table. 441 ;; Fixme: allow lookup in utf-subst-table-for-decode.
374 ((read r1 r2 r3) 442 ((read r1 r2 r3)
375 ;; r0 > #xf0, thus eight-bit-graphic 443 ;; r0 > #xf0, thus eight-bit-graphic
376 (write-multibyte-character r6 r0) 444 (write-multibyte-character r6 r0)
377 (if (r1 < #xa0) 445 (if (r1 < #xa0)
378 (if (r1 < #x80) ; invalid byte 446 (if (r1 < #x80) ; invalid byte
407 (write-multibyte-character r6 r0)))))) 475 (write-multibyte-character r6 r0))))))
408 (repeat)))) 476 (repeat))))
409 477
410 "CCL program to decode UTF-8. 478 "CCL program to decode UTF-8.
411 Basic decoding is done into the charsets ascii, latin-iso8859-1 and 479 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
412 mule-unicode-*, but see also `utf-8-translation-table-for-decode' and 480 mule-unicode-*, but see also `utf-fragmentation-table' and
413 `utf-8-subst-table'. 481 `ucs-unicode-to-mule-cjk'.
414 Encodings of un-representable Unicode characters are decoded as-is into 482 Encodings of un-representable Unicode characters are decoded as-is into
415 eight-bit-control and eight-bit-graphic characters.") 483 eight-bit-control and eight-bit-graphic characters.")
416 484
417 (define-ccl-program ccl-encode-mule-utf-8 485 (define-ccl-program ccl-encode-mule-utf-8
418 `(1 486 `(1
419 ((r5 = -1) 487 ((r5 = -1)
420 (loop 488 (loop
421 (if (r5 < 0) 489 (if (r5 < 0)
422 ((r1 = -1) 490 ((r1 = -1)
423 (read-multibyte-character r0 r1) 491 (read-multibyte-character r0 r1)
424 (translate-character ucs-mule-to-mule-unicode r0 r1)) 492 (translate-character utf-translation-table-for-encode r0 r1))
425 (;; We have already done read-multibyte-character. 493 (;; We have already done read-multibyte-character.
426 (r0 = r5) 494 (r0 = r5)
427 (r1 = r6) 495 (r1 = r6)
428 (r5 = -1))) 496 (r5 = -1)))
429 497
514 (if (r1 < #xa0) 582 (if (r1 < #xa0)
515 (write r1) 583 (write r1)
516 ((write #xc2) 584 ((write #xc2)
517 (write r1))))))) 585 (write r1)))))))
518 586
519 ((lookup-character utf-8-subst-rev-table r0 r1) 587 ((lookup-character utf-subst-table-for-encode r0 r1)
520 (if r7 ; lookup succeeded 588 (if r7 ; lookup succeeded
521 ((r1 = (((r0 & #xf000) >> 12) | #xe0)) 589 ((r1 = (((r0 & #xf000) >> 12) | #xe0))
522 (r2 = ((r0 & #x3f) | #x80)) 590 (r2 = ((r0 & #x3f) | #x80))
523 (r0 &= #x0fc0) 591 (r0 &= #x0fc0)
524 (r0 >>= 6) 592 (r0 >>= 6)
536 ((write #xc2) 604 ((write #xc2)
537 (write r1))))) 605 (write r1)))))
538 606
539 "CCL program to encode into UTF-8.") 607 "CCL program to encode into UTF-8.")
540 608
541 ;; Dummy definition so that the CCL can be checked correctly; the
542 ;; actual data are loaded on demand.
543 (unless (boundp 'ucs-mule-8859-to-mule-unicode) ; don't zap it
544 (define-translation-table 'ucs-mule-8859-to-mule-unicode))
545 609
546 (define-ccl-program ccl-untranslated-to-ucs 610 (define-ccl-program ccl-untranslated-to-ucs
547 `(0 611 `(0
548 (if (r0 < #xf0) ; 3-byte encoding, as above 612 (if (r0 < #xf0) ; 3-byte encoding, as above
549 ((r4 = 0) 613 ((r4 = 0)
646 length) 710 length)
647 711
648 ;; ucs-tables is preloaded 712 ;; ucs-tables is preloaded
649 ;; (defun utf-8-pre-write-conversion (beg end) 713 ;; (defun utf-8-pre-write-conversion (beg end)
650 ;; "Semi-dummy pre-write function effectively to autoload ucs-tables." 714 ;; "Semi-dummy pre-write function effectively to autoload ucs-tables."
651 ;; ;; Ensure translation table is loaded. 715 ;; ;; Ensure translation-table is loaded.
652 ;; (require 'ucs-tables) 716 ;; (require 'ucs-tables)
653 ;; ;; Don't do this again. 717 ;; ;; Don't do this again.
654 ;; (coding-system-put 'mule-utf-8 'pre-write-conversion nil) 718 ;; (coding-system-put 'mule-utf-8 'pre-write-conversion nil)
655 ;; nil) 719 ;; nil)
656 720
657 (make-coding-system 721 (make-coding-system
658 'mule-utf-8 4 ?u 722 'mule-utf-8 4 ?u
659 "UTF-8 encoding for Emacs-supported Unicode characters. 723 "UTF-8 encoding for Emacs-supported Unicode characters.
660 The supported Emacs character sets are the following, plus any other 724 It supports Unicode characters of these ranges:
661 characters included in the tables `ucs-mule-to-mule-unicode' and 725 U+0000..U+33FF, U+E000..U+FFFF.
662 `utf-8-subst-rev-table': 726 They correspond to these Emacs character sets:
663 ascii 727 ascii, latin-iso8859-1, mule-unicode-0100-24ff,
664 eight-bit-control 728 mule-unicode-2500-33ff, mule-unicode-e000-ffff
665 eight-bit-graphic 729
666 latin-iso8859-1 730 On decoding (e.g. reading a file), Unicode characters not in the above
667 latin-iso8859-2 731 ranges are decoded into sequences of eight-bit-control and
668 latin-iso8859-3 732 eight-bit-graphic characters to preserve their byte sequences. The
669 latin-iso8859-4 733 byte sequence is preserved on i/o for valid utf-8, but not necessarily
670 cyrillic-iso8859-5 734 for invalid utf-8.
671 greek-iso8859-7 735
672 hebrew-iso8859-8 736 On encoding (e.g. writing a file), Emacs characters not belonging to
673 latin-iso8859-9 737 any of the character sets listed above are encoded into the UTF-8 byte
674 latin-iso8859-14 738 sequence representing U+FFFD (REPLACEMENT CHARACTER)."
675 latin-iso8859-15
676 mule-unicode-0100-24ff
677 mule-unicode-2500-33ff
678 mule-unicode-e000-ffff
679
680 Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF
681 may be decoded into korean-ksc5601, chinese-gb2312, japanese-jisx0208
682 \(see user option `utf-8-translate-cjk'); otherwise, sequences of
683 eight-bit-control and eight-bit-graphic characters are used to
684 preserve their byte sequences, and these are composed to display as a
685 single character. Emacs characters that otherwise can't be encoded
686 are encoded as U+FFFD."
687 739
688 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) 740 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
689 '((safe-charsets 741 '((safe-charsets
690 ascii 742 ascii
691 eight-bit-control 743 eight-bit-control
692 eight-bit-graphic 744 eight-bit-graphic
693 latin-iso8859-1 745 latin-iso8859-1
694 latin-iso8859-15
695 latin-iso8859-14
696 latin-iso8859-9
697 hebrew-iso8859-8
698 greek-iso8859-7
699 cyrillic-iso8859-5
700 latin-iso8859-4
701 latin-iso8859-3
702 latin-iso8859-2
703 vietnamese-viscii-lower
704 vietnamese-viscii-upper
705 thai-tis620
706 ipa
707 ethiopic
708 indian-is13194
709 katakana-jisx0201
710 chinese-sisheng
711 lao
712 mule-unicode-0100-24ff 746 mule-unicode-0100-24ff
713 mule-unicode-2500-33ff 747 mule-unicode-2500-33ff
714 mule-unicode-e000-ffff) 748 mule-unicode-e000-ffff)
715 (mime-charset . utf-8) 749 (mime-charset . utf-8)
716 (coding-category . coding-category-utf-8) 750 (coding-category . coding-category-utf-8)
717 (valid-codes (0 . 255)) 751 (valid-codes (0 . 255))
718 ;; (pre-write-conversion . utf-8-pre-write-conversion) 752 ;; (pre-write-conversion . utf-8-pre-write-conversion)
719 (post-read-conversion . utf-8-post-read-conversion))) 753 (post-read-conversion . utf-8-post-read-conversion)
754 (dependency unify-8859-on-encoding-mode
755 unify-8859-on-decoding-mode
756 utf-fragment-on-decoding
757 utf-translate-cjk)))
720 758
721 (define-coding-system-alias 'utf-8 'mule-utf-8) 759 (define-coding-system-alias 'utf-8 'mule-utf-8)
722 760
723 ;; I think this needs special private charsets defined for the 761 ;; I think this needs special private charsets defined for the
724 ;; untranslated sequences, if it's going to work well. 762 ;; untranslated sequences, if it's going to work well.