comparison lisp/international/utf-8.el @ 46496:395e5c46761b

(utf-8-subst-table) (utf-8-subst-rev-table, utf-8-translation-table-for-decode) (utf-8-fragment-on-decoding, ccl-untranslated-to-ucs) (utf-8-ccl-regs, utf-8-translate-cjk): New. (ccl-encode-mule-utf-8): Use utf-8-subst-rev-table. (ccl-decode-mule-utf-8, ccl-untranslated-to-ucs) (utf-8-untranslated-to-ucs, utf-8-compose): Rewritten. (mule-utf-8): Remove pre-write-conversion. (utf-8-post-read-conversion): Comment out.
author Dave Love <fx@gnu.org>
date Wed, 17 Jul 2002 15:04:25 +0000
parents c3c4e09c3eab
children f0b8a25b0b7d
comparison
equal deleted inserted replaced
46495:fc51841e5d97 46496:395e5c46761b
1 ;;; utf-8.el --- limited UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*- 1 ;;; utf-8.el --- UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*-
2 2
3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. 3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
4 ;; Licensed to the Free Software Foundation. 4 ;; Licensed to the Free Software Foundation.
5 ;; Copyright (C) 2001 Free Software Foundation, Inc. 5 ;; Copyright (C) 2001, 2002 Free Software Foundation, Inc.
6 6
7 ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org> 7 ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org>
8 ;; Maintainer: FSF
8 ;; Keywords: multilingual, Unicode, UTF-8, i18n 9 ;; Keywords: multilingual, Unicode, UTF-8, i18n
9 10
10 ;; This file is part of GNU Emacs. 11 ;; This file is part of GNU Emacs.
11 12
12 ;; GNU Emacs is free software; you can redistribute it and/or modify 13 ;; GNU Emacs is free software; you can redistribute it and/or modify
37 ;; mule-unicode-e000-ffff 38 ;; mule-unicode-e000-ffff
38 ;; 39 ;;
39 ;; On decoding, Unicode characters that do not fit into the above 40 ;; On decoding, Unicode characters that do not fit into the above
40 ;; character sets are handled as `eight-bit-control' or 41 ;; character sets are handled as `eight-bit-control' or
41 ;; `eight-bit-graphic' characters to retain the information about the 42 ;; `eight-bit-graphic' characters to retain the information about the
42 ;; original byte sequence. 43 ;; original byte sequence and text properties record the corresponding
44 ;; unicode.
45 ;;
46 ;; Fixme: note that reading and writing invalid utf-8 may not be
47 ;; idempotent -- to represent the bytes to fix that needs a new charset.
43 ;; 48 ;;
44 ;; Characters from other character sets can be encoded with 49 ;; Characters from other character sets can be encoded with
45 ;; mule-utf-8 by populating the table `ucs-mule-to-mule-unicode' and 50 ;; mule-utf-8 by populating the table `ucs-mule-to-mule-unicode' and
46 ;; registering the translation with `register-char-codings'. 51 ;; registering the translation with `register-char-codings'. Hash
52 ;; tables `utf-8-subst-table' and `utf-8-subst-rev-table' are used to
53 ;; support encoding and decoding of about a quarter of the CJK space
54 ;; between U+3400 and U+DFFF.
47 55
48 ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is: 56 ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is:
49 57
50 ;; scalar | utf-8 58 ;; scalar | utf-8
51 ;; value | 1st byte | 2nd byte | 3rd byte 59 ;; value | 1st byte | 2nd byte | 3rd byte
58 66
59 (defvar ucs-mule-to-mule-unicode (make-translation-table) 67 (defvar ucs-mule-to-mule-unicode (make-translation-table)
60 "Translation table for encoding to `mule-utf-8'.") 68 "Translation table for encoding to `mule-utf-8'.")
61 ;; Could have been done by ucs-tables loaded before. 69 ;; Could have been done by ucs-tables loaded before.
62 (unless (get 'ucs-mule-to-mule-unicode 'translation-table) 70 (unless (get 'ucs-mule-to-mule-unicode 'translation-table)
63 (define-translation-table 'ucs-mule-to-mule-unicode ucs-mule-to-mule-unicode)) 71 (define-translation-table 'ucs-mule-to-mule-unicode
72 ucs-mule-to-mule-unicode))
73
74 (defvar utf-8-subst-table (make-hash-table :test 'eq))
75 (defvar utf-8-subst-rev-table (make-hash-table :test 'eq))
76 (define-translation-hash-table 'utf-8-subst-table utf-8-subst-table)
77 (define-translation-hash-table 'utf-8-subst-rev-table utf-8-subst-rev-table)
78
79 (defvar utf-8-translation-table-for-decode (make-translation-table)
80 "Translation table applied after decoding utf-8 to mule-unicode.
81 This is only actually applied to characters which would normally be
82 decoded into mule-unicode-0100-24ff.")
83 (define-translation-table 'utf-8-translation-table-for-decode
84 utf-8-translation-table-for-decode)
85
86 ;; Map Cyrillic and Greek to iso-8859 charsets, which take half the
87 ;; space of mule-unicode. For Latin scripts this isn't very
88 ;; important. Hebrew and Arabic might go here too when there's proper
89 ;; support for them.
90 (mapc
91 (lambda (pair)
92 (aset utf-8-translation-table-for-decode (car pair) (cdr pair)))
93 '((?$,1&d(B . ?,F4(B) (?$,1&e(B . ?,F5(B) (?$,1&f(B . ?,F6(B) (?$,1&h(B . ?,F8(B) (?$,1&i(B . ?,F9(B)
94 (?$,1&j(B . ?,F:(B) (?$,1&l(B . ?,F<(B) (?$,1&n(B . ?,F>(B) (?$,1&o(B . ?,F?(B) (?$,1&p(B . ?,F@(B)
95 (?$,1&q(B . ?,FA(B) (?$,1&r(B . ?,FB(B) (?$,1&s(B . ?,FC(B) (?$,1&t(B . ?,FD(B) (?$,1&u(B . ?,FE(B)
96 (?$,1&v(B . ?,FF(B) (?$,1&w(B . ?,FG(B) (?$,1&x(B . ?,FH(B) (?$,1&y(B . ?,FI(B) (?$,1&z(B . ?,FJ(B)
97 (?$,1&{(B . ?,FK(B) (?$,1&|(B . ?,FL(B) (?$,1&}(B . ?,FM(B) (?$,1&~(B . ?,FN(B) (?$,1&(B . ?,FO(B)
98 (?$,1' (B . ?,FP(B) (?$,1'!(B . ?,FQ(B) (?$,1'#(B . ?,FS(B) (?$,1'$(B . ?,FT(B) (?$,1'%(B . ?,FU(B)
99 (?$,1'&(B . ?,FV(B) (?$,1''(B . ?,FW(B) (?$,1'((B . ?,FX(B) (?$,1')(B . ?,FY(B) (?$,1'*(B . ?,FZ(B)
100 (?$,1'+(B . ?,F[(B) (?$,1',(B . ?,F\(B) (?$,1'-(B . ?,F](B) (?$,1'.(B . ?,F^(B) (?$,1'/(B . ?,F_(B)
101 (?$,1'0(B . ?,F`(B) (?$,1'1(B . ?,Fa(B) (?$,1'2(B . ?,Fb(B) (?$,1'3(B . ?,Fc(B) (?$,1'4(B . ?,Fd(B)
102 (?$,1'5(B . ?,Fe(B) (?$,1'6(B . ?,Ff(B) (?$,1'7(B . ?,Fg(B) (?$,1'8(B . ?,Fh(B) (?$,1'9(B . ?,Fi(B)
103 (?$,1':(B . ?,Fj(B) (?$,1';(B . ?,Fk(B) (?$,1'<(B . ?,Fl(B) (?$,1'=(B . ?,Fm(B) (?$,1'>(B . ?,Fn(B)
104 (?$,1'?(B . ?,Fo(B) (?$,1'@(B . ?,Fp(B) (?$,1'A(B . ?,Fq(B) (?$,1'B(B . ?,Fr(B) (?$,1'C(B . ?,Fs(B)
105 (?$,1'D(B . ?,Ft(B) (?$,1'E(B . ?,Fu(B) (?$,1'F(B . ?,Fv(B) (?$,1'G(B . ?,Fw(B) (?$,1'H(B . ?,Fx(B)
106 (?$,1'I(B . ?,Fy(B) (?$,1'J(B . ?,Fz(B) (?$,1'K(B . ?,F{(B) (?$,1'L(B . ?,F|(B) (?$,1'M(B . ?,F}(B)
107 (?$,1'N(B . ?,F~(B)
108
109 (?$,1(!(B . ?,L!(B) (?$,1("(B . ?,L"(B) (?$,1(#(B . ?,L#(B) (?$,1($(B . ?,L$(B)
110 (?$,1(%(B . ?,L%(B) (?$,1(&(B . ?,L&(B) (?$,1('(B . ?,L'(B) (?$,1(((B . ?,L((B) (?$,1()(B . ?,L)(B)
111 (?$,1(*(B . ?,L*(B) (?$,1(+(B . ?,L+(B) (?$,1(,(B . ?,L,(B) (?$,1(.(B . ?,L.(B) (?$,1(/(B . ?,L/(B)
112 (?$,1(0(B . ?,L0(B) (?$,1(1(B . ?,L1(B) (?$,1(2(B . ?,L2(B) (?$,1(3(B . ?,L3(B) (?$,1(4(B . ?,L4(B)
113 (?$,1(5(B . ?,L5(B) (?$,1(6(B . ?,L6(B) (?$,1(7(B . ?,L7(B) (?$,1(8(B . ?,L8(B) (?$,1(9(B . ?,L9(B)
114 (?$,1(:(B . ?,L:(B) (?$,1(;(B . ?,L;(B) (?$,1(<(B . ?,L<(B) (?$,1(=(B . ?,L=(B) (?$,1(>(B . ?,L>(B)
115 (?$,1(?(B . ?,L?(B) (?$,1(@(B . ?,L@(B) (?$,1(A(B . ?,LA(B) (?$,1(B(B . ?,LB(B) (?$,1(C(B . ?,LC(B)
116 (?$,1(D(B . ?,LD(B) (?$,1(E(B . ?,LE(B) (?$,1(F(B . ?,LF(B) (?$,1(G(B . ?,LG(B) (?$,1(H(B . ?,LH(B)
117 (?$,1(I(B . ?,LI(B) (?$,1(J(B . ?,LJ(B) (?$,1(K(B . ?,LK(B) (?$,1(L(B . ?,LL(B) (?$,1(M(B . ?,LM(B)
118 (?$,1(N(B . ?,LN(B) (?$,1(O(B . ?,LO(B) (?$,1(P(B . ?,LP(B) (?$,1(Q(B . ?,LQ(B) (?$,1(R(B . ?,LR(B)
119 (?$,1(S(B . ?,LS(B) (?$,1(T(B . ?,LT(B) (?$,1(U(B . ?,LU(B) (?$,1(V(B . ?,LV(B) (?$,1(W(B . ?,LW(B)
120 (?$,1(X(B . ?,LX(B) (?$,1(Y(B . ?,LY(B) (?$,1(Z(B . ?,LZ(B) (?$,1([(B . ?,L[(B) (?$,1(\(B . ?,L\(B)
121 (?$,1(](B . ?,L](B) (?$,1(^(B . ?,L^(B) (?$,1(_(B . ?,L_(B) (?$,1(`(B . ?,L`(B) (?$,1(a(B . ?,La(B)
122 (?$,1(b(B . ?,Lb(B) (?$,1(c(B . ?,Lc(B) (?$,1(d(B . ?,Ld(B) (?$,1(e(B . ?,Le(B) (?$,1(f(B . ?,Lf(B)
123 (?$,1(g(B . ?,Lg(B) (?$,1(h(B . ?,Lh(B) (?$,1(i(B . ?,Li(B) (?$,1(j(B . ?,Lj(B) (?$,1(k(B . ?,Lk(B)
124 (?$,1(l(B . ?,Ll(B) (?$,1(m(B . ?,Lm(B) (?$,1(n(B . ?,Ln(B) (?$,1(o(B . ?,Lo(B) (?$,1(q(B . ?,Lq(B)
125 (?$,1(r(B . ?,Lr(B) (?$,1(s(B . ?,Ls(B) (?$,1(t(B . ?,Lt(B) (?$,1(u(B . ?,Lu(B) (?$,1(v(B . ?,Lv(B)
126 (?$,1(w(B . ?,Lw(B) (?$,1(x(B . ?,Lx(B) (?$,1(y(B . ?,Ly(B) (?$,1(z(B . ?,Lz(B) (?$,1({(B . ?,L{(B)
127 (?$,1(|(B . ?,L|(B) (?$,1(~(B . ?,L~(B) (?$,1((B . ?,L(B)))
128
129 (defcustom utf-8-fragment-on-decoding nil
130 "Whether or not to decode some scripts in UTF-8 text into 8-bit characters.
131 Setting this means that the relevant Cyrillic and Greek characters are
132 decoded into the iso8859 charsets rather than into
133 mule-unicode-0100-24ff. The 8-bit characters take half as much space
134 in the buffer, but using them may affect how the buffer can be re-encoded
135 and may require a different input method to search for them, for instance.
136 See `unify-8859-on-decoding-mode' and `unify-8859-on-encoding-mode'
137 for mechanisms to make this largely transparent."
138 :set (lambda (s v)
139 (if v
140 (define-translation-table 'utf-8-translation-table-for-decode
141 utf-8-translation-table-for-decode)
142 (define-translation-table 'utf-8-translation-table-for-decode))
143 (set-default s v))
144 :version "21.4"
145 :type 'boolean
146 :group 'mule)
147
148 (defcustom utf-8-translate-cjk nil
149 "Whether the `mule-utf-8' coding system should encode many CJK characters.
150
151 Enabling this loads tables which enable the coding system to encode
152 characters in the charsets `korean-ksc5601', `chinese-gb2312' and
153 `japanese-jisx0208', and to decode the corresponding unicodes into
154 such characters. This works by loading the library `utf-8-subst'; see
155 its commentary. The tables are fairly large (about 33000 entries), so this
156 option is not the default."
157 :link '(emacs-commentary-link "utf-8-subst")
158 :set (lambda (s v)
159 (when v
160 (require 'utf-8-subst)
161 (let ((table (make-char-table 'translation-table)))
162 (coding-system-put 'mule-utf-8 'safe-charsets
163 (append (coding-system-get 'mule-utf-8
164 'safe-charsets)
165 '(korean-ksc5601 chinese-gb2312
166 japanese-jisx0208)))
167 (maphash (lambda (k v)
168 (aset table k v))
169 utf-8-subst-rev-table)
170 (register-char-codings 'mule-utf-8 table)))
171 (set-default s v))
172 :version "21.4"
173 :type 'boolean
174 :group 'mule)
175
64 (define-ccl-program ccl-decode-mule-utf-8 176 (define-ccl-program ccl-decode-mule-utf-8
65 ;; 177 ;;
66 ;; charset | bytes in utf-8 | bytes in emacs 178 ;; charset | bytes in utf-8 | bytes in emacs
67 ;; -----------------------+----------------+--------------- 179 ;; -----------------------+----------------+---------------
68 ;; ascii | 1 | 1 180 ;; ascii | 1 | 1
88 (read r0) 200 (read r0)
89 201
90 ;; 1byte encoding, i.e., ascii 202 ;; 1byte encoding, i.e., ascii
91 (if (r0 < #x80) 203 (if (r0 < #x80)
92 (write r0) 204 (write r0)
93 205 (if (r0 < #xc0) ; continuation byte (invalid here)
94 ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx 206 (if (r0 < #xa0)
95 (if (r0 < #xe0) 207 (write-multibyte-character r5 r0)
96 ((read r1) 208 (write-multibyte-character r6 r0))
97 209 ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
98 (if ((r1 & #b11000000) != #b10000000) 210 (if (r0 < #xe0)
99 ;; Invalid 2-byte sequence 211 ((read r1)
100 ((if (r0 < #xa0) 212
101 (write-multibyte-character r5 r0) 213 (if ((r1 & #b11000000) != #b10000000)
102 (write-multibyte-character r6 r0)) 214 ;; Invalid 2-byte sequence
103 (if (r1 < #x80)
104 (write r1)
105 (if (r1 < #xa0)
106 (write-multibyte-character r5 r1)
107 (write-multibyte-character r6 r1))))
108
109 ((r0 &= #x1f)
110 (r0 <<= 6)
111 (r1 &= #x3f)
112 (r1 += r0)
113 ;; Now r1 holds scalar value
114
115 ;; eight-bit-control
116 (if (r1 < 160)
117 ((write-multibyte-character r5 r1))
118
119 ;; latin-iso8859-1
120 (if (r1 < 256)
121 ((r0 = ,(charset-id 'latin-iso8859-1))
122 (r1 -= 128)
123 (write-multibyte-character r0 r1))
124
125 ;; mule-unicode-0100-24ff (< 0800)
126 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
127 (r1 -= #x0100)
128 (r2 = (((r1 / 96) + 32) << 7))
129 (r1 %= 96)
130 (r1 += (r2 + 32))
131 (write-multibyte-character r0 r1)))))))
132
133 ;; 3byte encoding
134 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
135 (if (r0 < #xf0)
136 ((read r1 r2)
137
138 ;; This is set to 1 if the encoding is invalid.
139 (r4 = 0)
140
141 (r3 = (r1 & #b11000000))
142 (r3 |= ((r2 >> 2) & #b00110000))
143 (if (r3 != #b10100000)
144 (r4 = 1)
145 ((r3 = ((r0 & #x0f) << 12))
146 (r3 += ((r1 & #x3f) << 6))
147 (r3 += (r2 & #x3f))
148 (if (r3 < #x0800)
149 (r4 = 1))))
150
151 (if (r4 != 0)
152 ;; Invalid 3-byte sequence
153 ((if (r0 < #xa0) 215 ((if (r0 < #xa0)
154 (write-multibyte-character r5 r0) 216 (write-multibyte-character r5 r0)
155 (write-multibyte-character r6 r0)) 217 (write-multibyte-character r6 r0))
156 (if (r1 < #x80) 218 (if (r1 < #x80)
157 (write r1) 219 (write r1)
158 (if (r1 < #xa0) 220 (if (r1 < #xa0)
159 (write-multibyte-character r5 r1) 221 (write-multibyte-character r5 r1)
160 (write-multibyte-character r6 r1))) 222 (write-multibyte-character r6 r1))))
161 (if (r2 < #x80) 223
162 (write r2) 224 ((r3 = r0) ; save in case of overlong sequence
163 (if (r2 < #xa0) 225 (r2 = r1)
164 (write-multibyte-character r5 r2) 226 (r0 &= #x1f)
165 (write-multibyte-character r6 r2)))) 227 (r0 <<= 6)
228 (r2 = r1) ; save in case of overlong sequence
229 (r1 &= #x3f)
230 (r1 += r0)
231 ;; Now r1 holds scalar value
232
233 (if (r1 < 128) ; `overlong sequence'
234 ((if (r3 < #xa0)
235 (write-multibyte-character r5 r3)
236 (write-multibyte-character r6 r3))
237 (if (r2 < #x80)
238 (write r2)
239 (if (r2 < #xa0)
240 (write-multibyte-character r5 r2)
241 (write-multibyte-character r6 r2))))
242
243 ;; eight-bit-control
244 (if (r1 < 160)
245 ((write-multibyte-character r5 r1))
246
247 ;; latin-iso8859-1
248 (if (r1 < 256)
249 ((r0 = ,(charset-id 'latin-iso8859-1))
250 (r1 -= 128)
251 (write-multibyte-character r0 r1))
252
253 ;; mule-unicode-0100-24ff (< 0800)
254 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
255 (r1 -= #x0100)
256 (r2 = (((r1 / 96) + 32) << 7))
257 (r1 %= 96)
258 (r1 += (r2 + 32))
259 (translate-character
260 utf-8-translation-table-for-decode r0 r1)
261 (write-multibyte-character r0 r1))))))))
262
263 ;; 3byte encoding
264 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
265 (if (r0 < #xf0)
266 ((read r1 r2)
267
268 ;; This is set to 1 if the encoding is invalid.
269 (r4 = 0)
270
271 (r3 = (r1 & #b11000000))
272 (r3 |= ((r2 >> 2) & #b00110000))
273 (if (r3 != #b10100000)
274 (r4 = 1)
275 ((r3 = ((r0 & #x0f) << 12))
276 (r3 += ((r1 & #x3f) << 6))
277 (r3 += (r2 & #x3f))
278 (if (r3 < #x0800)
279 (r4 = 1))))
280
281 (if (r4 != 0)
282 ;; Invalid 3-byte sequence
283 ((if (r0 < #xa0)
284 (write-multibyte-character r5 r0)
285 (write-multibyte-character r6 r0))
286 (if (r1 < #x80)
287 (write r1)
288 (if (r1 < #xa0)
289 (write-multibyte-character r5 r1)
290 (write-multibyte-character r6 r1)))
291 (if (r2 < #x80)
292 (write r2)
293 (if (r2 < #xa0)
294 (write-multibyte-character r5 r2)
295 (write-multibyte-character r6 r2))))
166 296
167 ;; mule-unicode-0100-24ff (>= 0800) 297 ;; mule-unicode-0100-24ff (>= 0800)
168 ((if (r3 < #x2500) 298 ((if (r3 < #x2500)
169 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) 299 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
170 (r3 -= #x0100) 300 (r3 -= #x0100)
171 (r3 //= 96)
172 (r1 = (r7 + 32))
173 (r1 += ((r3 + 32) << 7))
174 (write-multibyte-character r0 r1))
175
176 ;; mule-unicode-2500-33ff
177 (if (r3 < #x3400)
178 ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
179 (r3 -= #x2500)
180 (r3 //= 96) 301 (r3 //= 96)
181 (r1 = (r7 + 32)) 302 (r1 = (r7 + 32))
182 (r1 += ((r3 + 32) << 7)) 303 (r1 += ((r3 + 32) << 7))
304 (translate-character
305 utf-8-translation-table-for-decode r0 r1)
183 (write-multibyte-character r0 r1)) 306 (write-multibyte-character r0 r1))
184 307
185 ;; U+3400 .. U+DFFF 308 ;; mule-unicode-2500-33ff
186 ;; keep those bytes as eight-bit-{control|graphic} 309 ;; Fixme: Perhaps allow translation via
187 (if (r3 < #xe000) 310 ;; utf-8-subst-table for #x2e80 up, so that we use
188 ( ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic 311 ;; consistent charsets for all of CJK. Would need
189 (r3 = r6) 312 ;; corresponding change to encoding tables.
190 (write-multibyte-character r3 r0) 313 (if (r3 < #x3400)
191 (if (r1 < #xa0) 314 ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
192 (r3 = r5)) 315 (r3 -= #x2500)
193 (write-multibyte-character r3 r1) 316 (r3 //= 96)
194 (if (r2 < #xa0) 317 (r1 = (r7 + 32))
195 (r3 = r5) 318 (r1 += ((r3 + 32) << 7))
196 (r3 = r6)) 319 (write-multibyte-character r0 r1))
197 (write-multibyte-character r3 r2)) 320
321 ;; U+3400 .. U+D7FF
322 ;; Try to convert to CJK chars, else keep
323 ;; them as eight-bit-{control|graphic}.
324 (if (r3 < #xd800)
325 ((r4 = r3) ; don't zap r3
326 (lookup-integer utf-8-subst-table r4 r5)
327 (if r7
328 ;; got a translation
329 ((write-multibyte-character r4 r5)
330 ;; Zapped through register starvation.
331 (r5 = ,(charset-id 'eight-bit-control)))
332 ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
333 ((r3 = r6)
334 (write-multibyte-character r3 r0)
335 (if (r1 < #xa0)
336 (r3 = r5))
337 (write-multibyte-character r3 r1)
338 (if (r2 < #xa0)
339 (r3 = r5)
340 (r3 = r6))
341 (write-multibyte-character r3 r2))))
342
343 ;; Surrogates, U+D800 .. U+DFFF
344 ;; Fixme: process them properly.
345 (if (r3 < #xe000)
346 ((r3 = r6)
347 (write-multibyte-character r3 r0) ; eight-bit-graphic
348 (if (r1 < #xa0)
349 (r3 = r5))
350 (write-multibyte-character r3 r1)
351 (if (r2 < #xa0)
352 (r3 = r5)
353 (r3 = r6))
354 (write-multibyte-character r3 r2))
198 355
199 ;; mule-unicode-e000-ffff 356 ;; mule-unicode-e000-ffff
200 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) 357 ;; Fixme: fffe and ffff are invalid.
201 (r3 -= #xe000) 358 ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
202 (r3 //= 96) 359 (r3 -= #xe000)
203 (r1 = (r7 + 32)) 360 (r3 //= 96)
204 (r1 += ((r3 + 32) << 7)) 361 (r1 = (r7 + 32))
205 (write-multibyte-character r0 r1)))))))) 362 (r1 += ((r3 + 32) << 7))
206 363 (write-multibyte-character r0 r1)))))))))
207 ;; 4byte encoding 364
208 ;; keep those bytes as eight-bit-{control|graphic} 365 (if (r0 < #xfe)
209 ((read r1 r2 r3) 366 ;; 4byte encoding
210 ;; r0 > #xf0, thus eight-bit-graphic 367 ;; keep those bytes as eight-bit-{control|graphic}
211 (write-multibyte-character r6 r0) 368 ;; Fixme: allow lookup in utf-8-subst-table.
212 (if (r1 < #xa0) 369 ((read r1 r2 r3)
213 (write-multibyte-character r5 r1) 370 ;; r0 > #xf0, thus eight-bit-graphic
214 (write-multibyte-character r6 r1)) 371 (write-multibyte-character r6 r0)
215 (if (r2 < #xa0) 372 (if (r1 < #xa0)
216 (write-multibyte-character r5 r2) 373 (if (r1 < #x80) ; invalid byte
217 (write-multibyte-character r6 r2)) 374 (write r1)
218 (if (r3 < #xa0) 375 (write-multibyte-character r5 r1))
219 (write-multibyte-character r5 r3) 376 (write-multibyte-character r6 r1))
220 (write-multibyte-character r6 r3)))))) 377 (if (r2 < #xa0)
221 378 (if (r2 < #x80) ; invalid byte
379 (write r2)
380 (write-multibyte-character r5 r2))
381 (write-multibyte-character r6 r2))
382 (if (r3 < #xa0)
383 (if (r3 < #x80) ; invalid byte
384 (write r3)
385 (write-multibyte-character r5 r3))
386 (write-multibyte-character r6 r3))
387 (if (r0 >= #xf8) ; 5- or 6-byte encoding
388 ((read r1)
389 (if (r1 < #xa0)
390 (if (r1 < #x80) ; invalid byte
391 (write r1)
392 (write-multibyte-character r5 r1))
393 (write-multibyte-character r6 r1))
394 (if (r0 >= #xfc) ; 6-byte
395 ((read r1)
396 (if (r1 < #xa0)
397 (if (r1 < #x80) ; invalid byte
398 (write r1)
399 (write-multibyte-character r5 r1))
400 (write-multibyte-character r6 r1)))))))
401 ;; else invalid byte >= #xfe
402 (write-multibyte-character r6 r0))))))
222 (repeat)))) 403 (repeat))))
223 404
224 "CCL program to decode UTF-8. 405 "CCL program to decode UTF-8.
225 Basic decoding is done into the charsets ascii, latin-iso8859-1 and 406 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
226 mule-unicode-*. Encodings of un-representable Unicode characters are 407 mule-unicode-*, but see also `utf-8-translation-table-for-decode' and
227 decoded asis into eight-bit-control and eight-bit-graphic 408 `utf-8-subst-table'.
228 characters.") 409 Encodings of un-representable Unicode characters are decoded asis into
410 eight-bit-control and eight-bit-graphic characters.")
229 411
230 (define-ccl-program ccl-encode-mule-utf-8 412 (define-ccl-program ccl-encode-mule-utf-8
231 `(1 413 `(1
232 ((r5 = -1) 414 ((r5 = -1)
233 (loop 415 (loop
286 (write r0 r1 r2)) 468 (write r0 r1 r2))
287 469
288 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) 470 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
289 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) 471 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
290 (r1 &= #x7f) 472 (r1 &= #x7f)
291 (r1 += (r0 + 57312)) ; 57312 == -160 + #xe000 473 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000
292 (r0 = (((r1 & #xf000) >> 12) | #xe0)) 474 (r0 = (((r1 & #xf000) >> 12) | #xe0))
293 (r2 = ((r1 & #x3f) | #x80)) 475 (r2 = ((r1 & #x3f) | #x80))
294 (r1 &= #x0fc0) 476 (r1 &= #x0fc0)
295 (r1 >>= 6) 477 (r1 >>= 6)
296 (r1 |= #x80) 478 (r1 |= #x80)
327 (if (r1 < #xa0) 509 (if (r1 < #xa0)
328 (write r1) 510 (write r1)
329 ((write #xc2) 511 ((write #xc2)
330 (write r1))))))) 512 (write r1)))))))
331 513
332 ;; Unsupported character. 514 ((lookup-character utf-8-subst-rev-table r0 r1)
333 ;; Output U+FFFD, which is `ef bf bd' in UTF-8. 515 (if r7 ; lookup succeeded
334 ((write #xef) 516 ((r1 = (((r0 & #xf000) >> 12) | #xe0))
335 (write #xbf) 517 (r2 = ((r0 & #x3f) | #x80))
336 (write #xbd))))))))) 518 (r0 &= #x0fc0)
519 (r0 >>= 6)
520 (r0 |= #x80)
521 (write r1 r0 r2))
522 ;; Unsupported character.
523 ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
524 ((write #xef)
525 (write #xbf)
526 (write #xbd)))))))))))
337 (repeat))) 527 (repeat)))
338 (if (r1 >= #xa0) 528 (if (r1 >= #xa0)
339 (write r1) 529 (write r1)
340 (if (r1 >= #x80) 530 (if (r1 >= #x80)
341 ((write #xc2) 531 ((write #xc2)
342 (write r1))))) 532 (write r1)))))
343 533
344 "CCL program to encode into UTF-8. 534 "CCL program to encode into UTF-8.")
345 Only characters from the charsets ascii, eight-bit-control,
346 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized.
347 Others are encoded as U+FFFD.")
348 535
349 ;; Dummy definition so that the CCL can be checked correctly; the 536 ;; Dummy definition so that the CCL can be checked correctly; the
350 ;; actual data are loaded on demand. 537 ;; actual data are loaded on demand.
351 (unless (boundp 'ucs-mule-8859-to-mule-unicode) ; don't zap it 538 (unless (boundp 'ucs-mule-8859-to-mule-unicode) ; don't zap it
352 (define-translation-table 'ucs-mule-8859-to-mule-unicode)) 539 (define-translation-table 'ucs-mule-8859-to-mule-unicode))
353 540
541 (define-ccl-program ccl-untranslated-to-ucs
542 `(0
543 (if (r0 < #xf0) ; 3-byte encoding, as above
544 ((r4 = 0)
545 (r3 = (r1 & #b11000000))
546 (r3 |= ((r2 >> 2) & #b00110000))
547 (if (r3 != #b10100000)
548 (r4 = 1)
549 ((r3 = ((r0 & #x0f) << 12))
550 (r3 += ((r1 & #x3f) << 6))
551 (r3 += (r2 & #x3f))
552 (if (r3 < #x0800)
553 (r4 = 1))))
554 (if (r4 != 0)
555 (r0 = 0)
556 (r0 = r3)))
557 (if (r0 < #xf8) ; 4-byte (Mule-UCS recipe)
558 ((r4 = (r1 >> 6))
559 (if (r4 != #b10)
560 (r0 = 0)
561 ((r4 = (r2 >> 6))
562 (if (r4 != #b10)
563 (r0 = 0)
564 ((r4 = (r3 >> 6))
565 (if (r4 != #b10)
566 (r0 = 0)
567 ((r1 = ((r1 & #x3F) << 12))
568 (r2 = ((r2 & #x3F) << 6))
569 (r3 &= #x3F)
570 (r0 = (((((r0 & #x07) << 18) | r1) | r2) | r3)))))))))
571 (r0 = 0))))
572 "Decode 3- or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0.
573 r0 == 0 for invalid sequence.")
574
575 (defvar utf-8-ccl-regs (make-vector 8 0))
576
354 (defsubst utf-8-untranslated-to-ucs () 577 (defsubst utf-8-untranslated-to-ucs ()
355 (let ((b1 (char-after)) 578 "Return the UCS code for an untranslated sequence of raw bytes t point.
356 (b2 (char-after (1+ (point)))) 579 Only for 3- or 4-byte sequences."
357 (b3 (char-after (+ 2 (point)))) 580 (aset utf-8-ccl-regs 0 (or (char-after) 0))
358 (b4 (char-after (+ 4 (point))))) 581 (aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0))
359 (if (and b1 b2 b3) 582 (aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0))
360 (cond ((< b1 ?\xf0) 583 (aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0))
361 (setq b2 (lsh (logand b2 ?\x3f) 6)) 584 (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs)
362 (setq b3 (logand b3 ?\x3f)) 585 (aref utf-8-ccl-regs 0))
363 (logior b3 (logior b2 (lsh (logand b1 ?\x0f) 12))))
364 (b4
365 (setq b2 (lsh (logand b2 ?\x3f) 12))
366 (setq b3 (lsh (logand b3 ?\x3f) 6))
367 (setq b4 (logand b4 ?\x3f))
368 (logior b4 (logior b3 (logior b2 (lsh (logand b1 ?\x07)
369 18)))))))))
370 586
371 (defun utf-8-help-echo (window object position) 587 (defun utf-8-help-echo (window object position)
372 (format "Untranslated Unicode U+%04X" 588 (format "Untranslated Unicode U+%04X"
373 (get-char-property position 'untranslated-utf-8 object))) 589 (get-char-property position 'untranslated-utf-8 object)))
374 590
375 (defvar utf-8-subst-table nil
376 "If non-nil, a hash table mapping `untranslatable utf-8' to Emacs characters.")
377
378 ;; We compose the untranslatable sequences into a single character. 591 ;; We compose the untranslatable sequences into a single character.
379 ;; This is infelicitous for editing, because there's currently no 592 ;; This is infelicitous for editing, because there's currently no
380 ;; mechanism for treating compositions as atomic, but is OK for 593 ;; mechanism for treating compositions as atomic, but is OK for
381 ;; display. We try to compose an appropriate character from a hash 594 ;; display. They are composed to U+FFFD with help-echo which
382 ;; table of CJK characters to display correctly. Otherwise we use 595 ;; indicates the unicodes they represent. This function GCs too much.
383 ;; U+FFFD. What we really should have is hash table lookup from CCL
384 ;; so that we could do this properly. This function GCs too much.
385 (defsubst utf-8-compose () 596 (defsubst utf-8-compose ()
386 "Put a suitable composition on an untranslatable sequence. 597 "Put a suitable composition on an untranslatable sequence.
387 Return the sequence's length." 598 Return the sequence's length."
388 (let* ((u (utf-8-untranslated-to-ucs)) 599 (let* ((u (utf-8-untranslated-to-ucs))
389 (l (and u (if (>= u ?\x10000) 600 (l (unless (zerop u)
601 (if (>= u #x10000)
390 4 602 4
391 3))) 603 3))))
392 (subst (and utf-8-subst-table (gethash u utf-8-subst-table)))) 604 (when l
393 (when u
394 (put-text-property (point) (min (point-max) (+ l (point))) 605 (put-text-property (point) (min (point-max) (+ l (point)))
395 'untranslated-utf-8 u) 606 'untranslated-utf-8 u)
396 (unless subst 607 (put-text-property (point) (min (point-max) (+ l (point)))
397 (put-text-property (point) (min (point-max) (+ l (point))) 608 'help-echo 'utf-8-help-echo)
398 'help-echo 'utf-8-help-echo) 609 (compose-region (point) (+ l (point)) ?$,3u=(B)
399 (setq subst ?$,3u=(B))
400 (compose-region (point) (+ l (point)) subst)
401 l))) 610 l)))
402 611
403 (defcustom utf-8-compose-scripts nil 612 (defcustom utf-8-compose-scripts nil
404 "*Non-nil means compose various scipts on decoding utf-8 text." 613 "*Non-nil means compose various scripts on decoding utf-8 text."
405 :group 'mule 614 :group 'mule
406 :type 'boolean) ; omitted in Emacs 21.1 615 :version "21.4"
616 :type 'boolean)
407 617
408 (defun utf-8-post-read-conversion (length) 618 (defun utf-8-post-read-conversion (length)
409 "Compose untranslated utf-8 sequences into single characters. 619 "Compose untranslated utf-8 sequences into single characters.
410 Also compose particular scripts if `utf-8-compose-scripts' is non-nil." 620 Also compose particular scripts if `utf-8-compose-scripts' is non-nil."
411 (save-excursion 621 (save-excursion
412 ;; Can't do eval-when-compile to insert a multibyte constant 622 ;; Can't do eval-when-compile to insert a multibyte constant
413 ;; version of the string in the loop, since it's always loaded as 623 ;; version of the string in the loop, since it's always loaded as
414 ;; unibyte from a byte-compiled file. 624 ;; unibyte from a byte-compiled file.
415 (let ((range (string-as-multibyte "^\341-\377"))) 625 (let ((range (string-as-multibyte "^\xe1-\xf7")))
416 (while (and (skip-chars-forward 626 (while (and (skip-chars-forward range)
417 range)
418 (not (eobp))) 627 (not (eobp)))
419 (forward-char (utf-8-compose))))) 628 (forward-char (utf-8-compose)))))
420 ;; Fixme: Takahashi-san implies it may not work this easily -- needs 629 ;; Fixme: Takahashi-san implies it may not work this easily. I
421 ;; checking with him. 630 ;; asked why but didn't get a reply. -- fx
422 (when (and utf-8-compose-scripts (> length 1)) 631 (when (and utf-8-compose-scripts (> length 1))
423 ;; These currently have definitions which cover the relevant 632 ;; These currently have definitions which cover the relevant
424 ;; Unicodes. We could avoid loading thai-util &c by checking 633 ;; unicodes. We could avoid loading thai-util &c by checking
425 ;; whether the region contains any characters with the appropriate 634 ;; whether the region contains any characters with the appropriate
426 ;; categories. There aren't yet Unicode-based rules for Tibetan. 635 ;; categories. There aren't yet Unicode-based rules for Tibetan.
427 (save-excursion (setq length (diacritic-post-read-conversion length))) 636 (save-excursion (setq length (diacritic-post-read-conversion length)))
428 (save-excursion (setq length (thai-post-read-conversion length))) 637 (save-excursion (setq length (thai-post-read-conversion length)))
429 (save-excursion (setq length (lao-post-read-conversion length))) 638 (save-excursion (setq length (lao-post-read-conversion length)))
430 (save-excursion (setq length (devanagari-post-read-conversion length)))) 639 (save-excursion
640 (setq length (in-is13194-devanagari-post-read-conversion length))))
431 length) 641 length)
432 642
433 (defun utf-8-pre-write-conversion (beg end) 643 ;; ucs-tables is preloaded
434 "Semi-dummy pre-write function effectively to autoload ucs-tables." 644 ;; (defun utf-8-pre-write-conversion (beg end)
435 ;; Ensure translation table is loaded. 645 ;; "Semi-dummy pre-write function effectively to autoload ucs-tables."
436 (require 'ucs-tables) 646 ;; ;; Ensure translation table is loaded.
437 ;; Don't do this again. 647 ;; (require 'ucs-tables)
438 (coding-system-put 'mule-utf-8 'pre-write-conversion nil) 648 ;; ;; Don't do this again.
439 nil) 649 ;; (coding-system-put 'mule-utf-8 'pre-write-conversion nil)
650 ;; nil)
440 651
441 (make-coding-system 652 (make-coding-system
442 'mule-utf-8 4 ?u 653 'mule-utf-8 4 ?u
443 "UTF-8 encoding for Emacs-supported Unicode characters. 654 "UTF-8 encoding for Emacs-supported Unicode characters.
444 The supported Emacs character sets are the following, plus others 655 The supported Emacs character sets are the following, plus any other
445 which may be included in the translation table 656 characters included in the tables `ucs-mule-to-mule-unicode' and
446 `ucs-mule-to-mule-unicode': 657 `utf-8-subst-rev-table':
447 ascii 658 ascii
448 eight-bit-control 659 eight-bit-control
449 eight-bit-graphic 660 eight-bit-graphic
450 latin-iso8859-1 661 latin-iso8859-1
451 latin-iso8859-2 662 latin-iso8859-2
460 mule-unicode-0100-24ff 671 mule-unicode-0100-24ff
461 mule-unicode-2500-33ff 672 mule-unicode-2500-33ff
462 mule-unicode-e000-ffff 673 mule-unicode-e000-ffff
463 674
464 Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF 675 Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF
465 are decoded into sequences of eight-bit-control and eight-bit-graphic 676 may be decoded into korean-ksc5601, chinese-gb2312, japanese-jisx0208
466 characters to preserve their byte sequences and composed to display as 677 \(see user option `utf-8-translate-cjk'); otherwise, sequences of
467 a single character. Emacs characters that can't be encoded to these 678 eight-bit-control and eight-bit-graphic characters are used to
468 ranges are encoded as U+FFFD." 679 preserve their byte sequences, and these are composed to display as a
680 single character. Emacs characters that otherwise can't be encoded
681 are encoded as U+FFFD."
469 682
470 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) 683 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
471 '((safe-charsets 684 '((safe-charsets
472 ascii 685 ascii
473 eight-bit-control 686 eight-bit-control
495 mule-unicode-2500-33ff 708 mule-unicode-2500-33ff
496 mule-unicode-e000-ffff) 709 mule-unicode-e000-ffff)
497 (mime-charset . utf-8) 710 (mime-charset . utf-8)
498 (coding-category . coding-category-utf-8) 711 (coding-category . coding-category-utf-8)
499 (valid-codes (0 . 255)) 712 (valid-codes (0 . 255))
500 (pre-write-conversion . utf-8-pre-write-conversion) 713 ;; (pre-write-conversion . utf-8-pre-write-conversion)
501 (post-read-conversion . utf-8-post-read-conversion))) 714 (post-read-conversion . utf-8-post-read-conversion)))
502 715
503 (define-coding-system-alias 'utf-8 'mule-utf-8) 716 (define-coding-system-alias 'utf-8 'mule-utf-8)
504 717
505 ;; I think this needs special private charsets defined for the 718 ;; I think this needs special private charsets defined for the