Mercurial > emacs
comparison lisp/international/utf-8.el @ 46496:395e5c46761b
(utf-8-subst-table)
(utf-8-subst-rev-table, utf-8-translation-table-for-decode)
(utf-8-fragment-on-decoding, ccl-untranslated-to-ucs)
(utf-8-ccl-regs, utf-8-translate-cjk): New.
(ccl-encode-mule-utf-8): Use utf-8-subst-rev-table.
(ccl-decode-mule-utf-8, ccl-untranslated-to-ucs)
(utf-8-untranslated-to-ucs, utf-8-compose): Rewritten.
(mule-utf-8): Remove pre-write-conversion.
(utf-8-post-read-conversion): Comment out.
author | Dave Love <fx@gnu.org> |
---|---|
date | Wed, 17 Jul 2002 15:04:25 +0000 |
parents | c3c4e09c3eab |
children | f0b8a25b0b7d |
comparison
equal
deleted
inserted
replaced
46495:fc51841e5d97 | 46496:395e5c46761b |
---|---|
1 ;;; utf-8.el --- limited UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*- | 1 ;;; utf-8.el --- UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*- |
2 | 2 |
3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. | 3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. |
4 ;; Licensed to the Free Software Foundation. | 4 ;; Licensed to the Free Software Foundation. |
5 ;; Copyright (C) 2001 Free Software Foundation, Inc. | 5 ;; Copyright (C) 2001, 2002 Free Software Foundation, Inc. |
6 | 6 |
7 ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org> | 7 ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org> |
8 ;; Maintainer: FSF | |
8 ;; Keywords: multilingual, Unicode, UTF-8, i18n | 9 ;; Keywords: multilingual, Unicode, UTF-8, i18n |
9 | 10 |
10 ;; This file is part of GNU Emacs. | 11 ;; This file is part of GNU Emacs. |
11 | 12 |
12 ;; GNU Emacs is free software; you can redistribute it and/or modify | 13 ;; GNU Emacs is free software; you can redistribute it and/or modify |
37 ;; mule-unicode-e000-ffff | 38 ;; mule-unicode-e000-ffff |
38 ;; | 39 ;; |
39 ;; On decoding, Unicode characters that do not fit into the above | 40 ;; On decoding, Unicode characters that do not fit into the above |
40 ;; character sets are handled as `eight-bit-control' or | 41 ;; character sets are handled as `eight-bit-control' or |
41 ;; `eight-bit-graphic' characters to retain the information about the | 42 ;; `eight-bit-graphic' characters to retain the information about the |
42 ;; original byte sequence. | 43 ;; original byte sequence and text properties record the corresponding |
44 ;; unicode. | |
45 ;; | |
46 ;; Fixme: note that reading and writing invalid utf-8 may not be | |
47 ;; idempotent -- to represent the bytes to fix that needs a new charset. | |
43 ;; | 48 ;; |
44 ;; Characters from other character sets can be encoded with | 49 ;; Characters from other character sets can be encoded with |
45 ;; mule-utf-8 by populating the table `ucs-mule-to-mule-unicode' and | 50 ;; mule-utf-8 by populating the table `ucs-mule-to-mule-unicode' and |
46 ;; registering the translation with `register-char-codings'. | 51 ;; registering the translation with `register-char-codings'. Hash |
52 ;; tables `utf-8-subst-table' and `utf-8-subst-rev-table' are used to | |
53 ;; support encoding and decoding of about a quarter of the CJK space | |
54 ;; between U+3400 and U+DFFF. | |
47 | 55 |
48 ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is: | 56 ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is: |
49 | 57 |
50 ;; scalar | utf-8 | 58 ;; scalar | utf-8 |
51 ;; value | 1st byte | 2nd byte | 3rd byte | 59 ;; value | 1st byte | 2nd byte | 3rd byte |
58 | 66 |
59 (defvar ucs-mule-to-mule-unicode (make-translation-table) | 67 (defvar ucs-mule-to-mule-unicode (make-translation-table) |
60 "Translation table for encoding to `mule-utf-8'.") | 68 "Translation table for encoding to `mule-utf-8'.") |
61 ;; Could have been done by ucs-tables loaded before. | 69 ;; Could have been done by ucs-tables loaded before. |
62 (unless (get 'ucs-mule-to-mule-unicode 'translation-table) | 70 (unless (get 'ucs-mule-to-mule-unicode 'translation-table) |
63 (define-translation-table 'ucs-mule-to-mule-unicode ucs-mule-to-mule-unicode)) | 71 (define-translation-table 'ucs-mule-to-mule-unicode |
72 ucs-mule-to-mule-unicode)) | |
73 | |
74 (defvar utf-8-subst-table (make-hash-table :test 'eq)) | |
75 (defvar utf-8-subst-rev-table (make-hash-table :test 'eq)) | |
76 (define-translation-hash-table 'utf-8-subst-table utf-8-subst-table) | |
77 (define-translation-hash-table 'utf-8-subst-rev-table utf-8-subst-rev-table) | |
78 | |
79 (defvar utf-8-translation-table-for-decode (make-translation-table) | |
80 "Translation table applied after decoding utf-8 to mule-unicode. | |
81 This is only actually applied to characters which would normally be | |
82 decoded into mule-unicode-0100-24ff.") | |
83 (define-translation-table 'utf-8-translation-table-for-decode | |
84 utf-8-translation-table-for-decode) | |
85 | |
86 ;; Map Cyrillic and Greek to iso-8859 charsets, which take half the | |
87 ;; space of mule-unicode. For Latin scripts this isn't very | |
88 ;; important. Hebrew and Arabic might go here too when there's proper | |
89 ;; support for them. | |
90 (mapc | |
91 (lambda (pair) | |
92 (aset utf-8-translation-table-for-decode (car pair) (cdr pair))) | |
93 '((?$,1&d(B . ?,F4(B) (?$,1&e(B . ?,F5(B) (?$,1&f(B . ?,F6(B) (?$,1&h(B . ?,F8(B) (?$,1&i(B . ?,F9(B) | |
94 (?$,1&j(B . ?,F:(B) (?$,1&l(B . ?,F<(B) (?$,1&n(B . ?,F>(B) (?$,1&o(B . ?,F?(B) (?$,1&p(B . ?,F@(B) | |
95 (?$,1&q(B . ?,FA(B) (?$,1&r(B . ?,FB(B) (?$,1&s(B . ?,FC(B) (?$,1&t(B . ?,FD(B) (?$,1&u(B . ?,FE(B) | |
96 (?$,1&v(B . ?,FF(B) (?$,1&w(B . ?,FG(B) (?$,1&x(B . ?,FH(B) (?$,1&y(B . ?,FI(B) (?$,1&z(B . ?,FJ(B) | |
97 (?$,1&{(B . ?,FK(B) (?$,1&|(B . ?,FL(B) (?$,1&}(B . ?,FM(B) (?$,1&~(B . ?,FN(B) (?$,1&(B . ?,FO(B) | |
98 (?$,1' (B . ?,FP(B) (?$,1'!(B . ?,FQ(B) (?$,1'#(B . ?,FS(B) (?$,1'$(B . ?,FT(B) (?$,1'%(B . ?,FU(B) | |
99 (?$,1'&(B . ?,FV(B) (?$,1''(B . ?,FW(B) (?$,1'((B . ?,FX(B) (?$,1')(B . ?,FY(B) (?$,1'*(B . ?,FZ(B) | |
100 (?$,1'+(B . ?,F[(B) (?$,1',(B . ?,F\(B) (?$,1'-(B . ?,F](B) (?$,1'.(B . ?,F^(B) (?$,1'/(B . ?,F_(B) | |
101 (?$,1'0(B . ?,F`(B) (?$,1'1(B . ?,Fa(B) (?$,1'2(B . ?,Fb(B) (?$,1'3(B . ?,Fc(B) (?$,1'4(B . ?,Fd(B) | |
102 (?$,1'5(B . ?,Fe(B) (?$,1'6(B . ?,Ff(B) (?$,1'7(B . ?,Fg(B) (?$,1'8(B . ?,Fh(B) (?$,1'9(B . ?,Fi(B) | |
103 (?$,1':(B . ?,Fj(B) (?$,1';(B . ?,Fk(B) (?$,1'<(B . ?,Fl(B) (?$,1'=(B . ?,Fm(B) (?$,1'>(B . ?,Fn(B) | |
104 (?$,1'?(B . ?,Fo(B) (?$,1'@(B . ?,Fp(B) (?$,1'A(B . ?,Fq(B) (?$,1'B(B . ?,Fr(B) (?$,1'C(B . ?,Fs(B) | |
105 (?$,1'D(B . ?,Ft(B) (?$,1'E(B . ?,Fu(B) (?$,1'F(B . ?,Fv(B) (?$,1'G(B . ?,Fw(B) (?$,1'H(B . ?,Fx(B) | |
106 (?$,1'I(B . ?,Fy(B) (?$,1'J(B . ?,Fz(B) (?$,1'K(B . ?,F{(B) (?$,1'L(B . ?,F|(B) (?$,1'M(B . ?,F}(B) | |
107 (?$,1'N(B . ?,F~(B) | |
108 | |
109 (?$,1(!(B . ?,L!(B) (?$,1("(B . ?,L"(B) (?$,1(#(B . ?,L#(B) (?$,1($(B . ?,L$(B) | |
110 (?$,1(%(B . ?,L%(B) (?$,1(&(B . ?,L&(B) (?$,1('(B . ?,L'(B) (?$,1(((B . ?,L((B) (?$,1()(B . ?,L)(B) | |
111 (?$,1(*(B . ?,L*(B) (?$,1(+(B . ?,L+(B) (?$,1(,(B . ?,L,(B) (?$,1(.(B . ?,L.(B) (?$,1(/(B . ?,L/(B) | |
112 (?$,1(0(B . ?,L0(B) (?$,1(1(B . ?,L1(B) (?$,1(2(B . ?,L2(B) (?$,1(3(B . ?,L3(B) (?$,1(4(B . ?,L4(B) | |
113 (?$,1(5(B . ?,L5(B) (?$,1(6(B . ?,L6(B) (?$,1(7(B . ?,L7(B) (?$,1(8(B . ?,L8(B) (?$,1(9(B . ?,L9(B) | |
114 (?$,1(:(B . ?,L:(B) (?$,1(;(B . ?,L;(B) (?$,1(<(B . ?,L<(B) (?$,1(=(B . ?,L=(B) (?$,1(>(B . ?,L>(B) | |
115 (?$,1(?(B . ?,L?(B) (?$,1(@(B . ?,L@(B) (?$,1(A(B . ?,LA(B) (?$,1(B(B . ?,LB(B) (?$,1(C(B . ?,LC(B) | |
116 (?$,1(D(B . ?,LD(B) (?$,1(E(B . ?,LE(B) (?$,1(F(B . ?,LF(B) (?$,1(G(B . ?,LG(B) (?$,1(H(B . ?,LH(B) | |
117 (?$,1(I(B . ?,LI(B) (?$,1(J(B . ?,LJ(B) (?$,1(K(B . ?,LK(B) (?$,1(L(B . ?,LL(B) (?$,1(M(B . ?,LM(B) | |
118 (?$,1(N(B . ?,LN(B) (?$,1(O(B . ?,LO(B) (?$,1(P(B . ?,LP(B) (?$,1(Q(B . ?,LQ(B) (?$,1(R(B . ?,LR(B) | |
119 (?$,1(S(B . ?,LS(B) (?$,1(T(B . ?,LT(B) (?$,1(U(B . ?,LU(B) (?$,1(V(B . ?,LV(B) (?$,1(W(B . ?,LW(B) | |
120 (?$,1(X(B . ?,LX(B) (?$,1(Y(B . ?,LY(B) (?$,1(Z(B . ?,LZ(B) (?$,1([(B . ?,L[(B) (?$,1(\(B . ?,L\(B) | |
121 (?$,1(](B . ?,L](B) (?$,1(^(B . ?,L^(B) (?$,1(_(B . ?,L_(B) (?$,1(`(B . ?,L`(B) (?$,1(a(B . ?,La(B) | |
122 (?$,1(b(B . ?,Lb(B) (?$,1(c(B . ?,Lc(B) (?$,1(d(B . ?,Ld(B) (?$,1(e(B . ?,Le(B) (?$,1(f(B . ?,Lf(B) | |
123 (?$,1(g(B . ?,Lg(B) (?$,1(h(B . ?,Lh(B) (?$,1(i(B . ?,Li(B) (?$,1(j(B . ?,Lj(B) (?$,1(k(B . ?,Lk(B) | |
124 (?$,1(l(B . ?,Ll(B) (?$,1(m(B . ?,Lm(B) (?$,1(n(B . ?,Ln(B) (?$,1(o(B . ?,Lo(B) (?$,1(q(B . ?,Lq(B) | |
125 (?$,1(r(B . ?,Lr(B) (?$,1(s(B . ?,Ls(B) (?$,1(t(B . ?,Lt(B) (?$,1(u(B . ?,Lu(B) (?$,1(v(B . ?,Lv(B) | |
126 (?$,1(w(B . ?,Lw(B) (?$,1(x(B . ?,Lx(B) (?$,1(y(B . ?,Ly(B) (?$,1(z(B . ?,Lz(B) (?$,1({(B . ?,L{(B) | |
127 (?$,1(|(B . ?,L|(B) (?$,1(~(B . ?,L~(B) (?$,1((B . ?,L(B))) | |
128 | |
129 (defcustom utf-8-fragment-on-decoding nil | |
130 "Whether or not to decode some scripts in UTF-8 text into 8-bit characters. | |
131 Setting this means that the relevant Cyrillic and Greek characters are | |
132 decoded into the iso8859 charsets rather than into | |
133 mule-unicode-0100-24ff. The 8-bit characters take half as much space | |
134 in the buffer, but using them may affect how the buffer can be re-encoded | |
135 and may require a different input method to search for them, for instance. | |
136 See `unify-8859-on-decoding-mode' and `unify-8859-on-encoding-mode' | |
137 for mechanisms to make this largely transparent." | |
138 :set (lambda (s v) | |
139 (if v | |
140 (define-translation-table 'utf-8-translation-table-for-decode | |
141 utf-8-translation-table-for-decode) | |
142 (define-translation-table 'utf-8-translation-table-for-decode)) | |
143 (set-default s v)) | |
144 :version "21.4" | |
145 :type 'boolean | |
146 :group 'mule) | |
147 | |
148 (defcustom utf-8-translate-cjk nil | |
149 "Whether the `mule-utf-8' coding system should encode many CJK characters. | |
150 | |
151 Enabling this loads tables which enable the coding system to encode | |
152 characters in the charsets `korean-ksc5601', `chinese-gb2312' and | |
153 `japanese-jisx0208', and to decode the corresponding unicodes into | |
154 such characters. This works by loading the library `utf-8-subst'; see | |
155 its commentary. The tables are fairly large (about 33000 entries), so this | |
156 option is not the default." | |
157 :link '(emacs-commentary-link "utf-8-subst") | |
158 :set (lambda (s v) | |
159 (when v | |
160 (require 'utf-8-subst) | |
161 (let ((table (make-char-table 'translation-table))) | |
162 (coding-system-put 'mule-utf-8 'safe-charsets | |
163 (append (coding-system-get 'mule-utf-8 | |
164 'safe-charsets) | |
165 '(korean-ksc5601 chinese-gb2312 | |
166 japanese-jisx0208))) | |
167 (maphash (lambda (k v) | |
168 (aset table k v)) | |
169 utf-8-subst-rev-table) | |
170 (register-char-codings 'mule-utf-8 table))) | |
171 (set-default s v)) | |
172 :version "21.4" | |
173 :type 'boolean | |
174 :group 'mule) | |
175 | |
64 (define-ccl-program ccl-decode-mule-utf-8 | 176 (define-ccl-program ccl-decode-mule-utf-8 |
65 ;; | 177 ;; |
66 ;; charset | bytes in utf-8 | bytes in emacs | 178 ;; charset | bytes in utf-8 | bytes in emacs |
67 ;; -----------------------+----------------+--------------- | 179 ;; -----------------------+----------------+--------------- |
68 ;; ascii | 1 | 1 | 180 ;; ascii | 1 | 1 |
88 (read r0) | 200 (read r0) |
89 | 201 |
90 ;; 1byte encoding, i.e., ascii | 202 ;; 1byte encoding, i.e., ascii |
91 (if (r0 < #x80) | 203 (if (r0 < #x80) |
92 (write r0) | 204 (write r0) |
93 | 205 (if (r0 < #xc0) ; continuation byte (invalid here) |
94 ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx | 206 (if (r0 < #xa0) |
95 (if (r0 < #xe0) | 207 (write-multibyte-character r5 r0) |
96 ((read r1) | 208 (write-multibyte-character r6 r0)) |
97 | 209 ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx |
98 (if ((r1 & #b11000000) != #b10000000) | 210 (if (r0 < #xe0) |
99 ;; Invalid 2-byte sequence | 211 ((read r1) |
100 ((if (r0 < #xa0) | 212 |
101 (write-multibyte-character r5 r0) | 213 (if ((r1 & #b11000000) != #b10000000) |
102 (write-multibyte-character r6 r0)) | 214 ;; Invalid 2-byte sequence |
103 (if (r1 < #x80) | |
104 (write r1) | |
105 (if (r1 < #xa0) | |
106 (write-multibyte-character r5 r1) | |
107 (write-multibyte-character r6 r1)))) | |
108 | |
109 ((r0 &= #x1f) | |
110 (r0 <<= 6) | |
111 (r1 &= #x3f) | |
112 (r1 += r0) | |
113 ;; Now r1 holds scalar value | |
114 | |
115 ;; eight-bit-control | |
116 (if (r1 < 160) | |
117 ((write-multibyte-character r5 r1)) | |
118 | |
119 ;; latin-iso8859-1 | |
120 (if (r1 < 256) | |
121 ((r0 = ,(charset-id 'latin-iso8859-1)) | |
122 (r1 -= 128) | |
123 (write-multibyte-character r0 r1)) | |
124 | |
125 ;; mule-unicode-0100-24ff (< 0800) | |
126 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
127 (r1 -= #x0100) | |
128 (r2 = (((r1 / 96) + 32) << 7)) | |
129 (r1 %= 96) | |
130 (r1 += (r2 + 32)) | |
131 (write-multibyte-character r0 r1))))))) | |
132 | |
133 ;; 3byte encoding | |
134 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx | |
135 (if (r0 < #xf0) | |
136 ((read r1 r2) | |
137 | |
138 ;; This is set to 1 if the encoding is invalid. | |
139 (r4 = 0) | |
140 | |
141 (r3 = (r1 & #b11000000)) | |
142 (r3 |= ((r2 >> 2) & #b00110000)) | |
143 (if (r3 != #b10100000) | |
144 (r4 = 1) | |
145 ((r3 = ((r0 & #x0f) << 12)) | |
146 (r3 += ((r1 & #x3f) << 6)) | |
147 (r3 += (r2 & #x3f)) | |
148 (if (r3 < #x0800) | |
149 (r4 = 1)))) | |
150 | |
151 (if (r4 != 0) | |
152 ;; Invalid 3-byte sequence | |
153 ((if (r0 < #xa0) | 215 ((if (r0 < #xa0) |
154 (write-multibyte-character r5 r0) | 216 (write-multibyte-character r5 r0) |
155 (write-multibyte-character r6 r0)) | 217 (write-multibyte-character r6 r0)) |
156 (if (r1 < #x80) | 218 (if (r1 < #x80) |
157 (write r1) | 219 (write r1) |
158 (if (r1 < #xa0) | 220 (if (r1 < #xa0) |
159 (write-multibyte-character r5 r1) | 221 (write-multibyte-character r5 r1) |
160 (write-multibyte-character r6 r1))) | 222 (write-multibyte-character r6 r1)))) |
161 (if (r2 < #x80) | 223 |
162 (write r2) | 224 ((r3 = r0) ; save in case of overlong sequence |
163 (if (r2 < #xa0) | 225 (r2 = r1) |
164 (write-multibyte-character r5 r2) | 226 (r0 &= #x1f) |
165 (write-multibyte-character r6 r2)))) | 227 (r0 <<= 6) |
228 (r2 = r1) ; save in case of overlong sequence | |
229 (r1 &= #x3f) | |
230 (r1 += r0) | |
231 ;; Now r1 holds scalar value | |
232 | |
233 (if (r1 < 128) ; `overlong sequence' | |
234 ((if (r3 < #xa0) | |
235 (write-multibyte-character r5 r3) | |
236 (write-multibyte-character r6 r3)) | |
237 (if (r2 < #x80) | |
238 (write r2) | |
239 (if (r2 < #xa0) | |
240 (write-multibyte-character r5 r2) | |
241 (write-multibyte-character r6 r2)))) | |
242 | |
243 ;; eight-bit-control | |
244 (if (r1 < 160) | |
245 ((write-multibyte-character r5 r1)) | |
246 | |
247 ;; latin-iso8859-1 | |
248 (if (r1 < 256) | |
249 ((r0 = ,(charset-id 'latin-iso8859-1)) | |
250 (r1 -= 128) | |
251 (write-multibyte-character r0 r1)) | |
252 | |
253 ;; mule-unicode-0100-24ff (< 0800) | |
254 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
255 (r1 -= #x0100) | |
256 (r2 = (((r1 / 96) + 32) << 7)) | |
257 (r1 %= 96) | |
258 (r1 += (r2 + 32)) | |
259 (translate-character | |
260 utf-8-translation-table-for-decode r0 r1) | |
261 (write-multibyte-character r0 r1)))))))) | |
262 | |
263 ;; 3byte encoding | |
264 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx | |
265 (if (r0 < #xf0) | |
266 ((read r1 r2) | |
267 | |
268 ;; This is set to 1 if the encoding is invalid. | |
269 (r4 = 0) | |
270 | |
271 (r3 = (r1 & #b11000000)) | |
272 (r3 |= ((r2 >> 2) & #b00110000)) | |
273 (if (r3 != #b10100000) | |
274 (r4 = 1) | |
275 ((r3 = ((r0 & #x0f) << 12)) | |
276 (r3 += ((r1 & #x3f) << 6)) | |
277 (r3 += (r2 & #x3f)) | |
278 (if (r3 < #x0800) | |
279 (r4 = 1)))) | |
280 | |
281 (if (r4 != 0) | |
282 ;; Invalid 3-byte sequence | |
283 ((if (r0 < #xa0) | |
284 (write-multibyte-character r5 r0) | |
285 (write-multibyte-character r6 r0)) | |
286 (if (r1 < #x80) | |
287 (write r1) | |
288 (if (r1 < #xa0) | |
289 (write-multibyte-character r5 r1) | |
290 (write-multibyte-character r6 r1))) | |
291 (if (r2 < #x80) | |
292 (write r2) | |
293 (if (r2 < #xa0) | |
294 (write-multibyte-character r5 r2) | |
295 (write-multibyte-character r6 r2)))) | |
166 | 296 |
167 ;; mule-unicode-0100-24ff (>= 0800) | 297 ;; mule-unicode-0100-24ff (>= 0800) |
168 ((if (r3 < #x2500) | 298 ((if (r3 < #x2500) |
169 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | 299 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) |
170 (r3 -= #x0100) | 300 (r3 -= #x0100) |
171 (r3 //= 96) | |
172 (r1 = (r7 + 32)) | |
173 (r1 += ((r3 + 32) << 7)) | |
174 (write-multibyte-character r0 r1)) | |
175 | |
176 ;; mule-unicode-2500-33ff | |
177 (if (r3 < #x3400) | |
178 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) | |
179 (r3 -= #x2500) | |
180 (r3 //= 96) | 301 (r3 //= 96) |
181 (r1 = (r7 + 32)) | 302 (r1 = (r7 + 32)) |
182 (r1 += ((r3 + 32) << 7)) | 303 (r1 += ((r3 + 32) << 7)) |
304 (translate-character | |
305 utf-8-translation-table-for-decode r0 r1) | |
183 (write-multibyte-character r0 r1)) | 306 (write-multibyte-character r0 r1)) |
184 | 307 |
185 ;; U+3400 .. U+DFFF | 308 ;; mule-unicode-2500-33ff |
186 ;; keep those bytes as eight-bit-{control|graphic} | 309 ;; Fixme: Perhaps allow translation via |
187 (if (r3 < #xe000) | 310 ;; utf-8-subst-table for #x2e80 up, so that we use |
188 ( ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic | 311 ;; consistent charsets for all of CJK. Would need |
189 (r3 = r6) | 312 ;; corresponding change to encoding tables. |
190 (write-multibyte-character r3 r0) | 313 (if (r3 < #x3400) |
191 (if (r1 < #xa0) | 314 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) |
192 (r3 = r5)) | 315 (r3 -= #x2500) |
193 (write-multibyte-character r3 r1) | 316 (r3 //= 96) |
194 (if (r2 < #xa0) | 317 (r1 = (r7 + 32)) |
195 (r3 = r5) | 318 (r1 += ((r3 + 32) << 7)) |
196 (r3 = r6)) | 319 (write-multibyte-character r0 r1)) |
197 (write-multibyte-character r3 r2)) | 320 |
321 ;; U+3400 .. U+D7FF | |
322 ;; Try to convert to CJK chars, else keep | |
323 ;; them as eight-bit-{control|graphic}. | |
324 (if (r3 < #xd800) | |
325 ((r4 = r3) ; don't zap r3 | |
326 (lookup-integer utf-8-subst-table r4 r5) | |
327 (if r7 | |
328 ;; got a translation | |
329 ((write-multibyte-character r4 r5) | |
330 ;; Zapped through register starvation. | |
331 (r5 = ,(charset-id 'eight-bit-control))) | |
332 ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic | |
333 ((r3 = r6) | |
334 (write-multibyte-character r3 r0) | |
335 (if (r1 < #xa0) | |
336 (r3 = r5)) | |
337 (write-multibyte-character r3 r1) | |
338 (if (r2 < #xa0) | |
339 (r3 = r5) | |
340 (r3 = r6)) | |
341 (write-multibyte-character r3 r2)))) | |
342 | |
343 ;; Surrogates, U+D800 .. U+DFFF | |
344 ;; Fixme: process them properly. | |
345 (if (r3 < #xe000) | |
346 ((r3 = r6) | |
347 (write-multibyte-character r3 r0) ; eight-bit-graphic | |
348 (if (r1 < #xa0) | |
349 (r3 = r5)) | |
350 (write-multibyte-character r3 r1) | |
351 (if (r2 < #xa0) | |
352 (r3 = r5) | |
353 (r3 = r6)) | |
354 (write-multibyte-character r3 r2)) | |
198 | 355 |
199 ;; mule-unicode-e000-ffff | 356 ;; mule-unicode-e000-ffff |
200 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) | 357 ;; Fixme: fffe and ffff are invalid. |
201 (r3 -= #xe000) | 358 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) |
202 (r3 //= 96) | 359 (r3 -= #xe000) |
203 (r1 = (r7 + 32)) | 360 (r3 //= 96) |
204 (r1 += ((r3 + 32) << 7)) | 361 (r1 = (r7 + 32)) |
205 (write-multibyte-character r0 r1)))))))) | 362 (r1 += ((r3 + 32) << 7)) |
206 | 363 (write-multibyte-character r0 r1))))))))) |
207 ;; 4byte encoding | 364 |
208 ;; keep those bytes as eight-bit-{control|graphic} | 365 (if (r0 < #xfe) |
209 ((read r1 r2 r3) | 366 ;; 4byte encoding |
210 ;; r0 > #xf0, thus eight-bit-graphic | 367 ;; keep those bytes as eight-bit-{control|graphic} |
211 (write-multibyte-character r6 r0) | 368 ;; Fixme: allow lookup in utf-8-subst-table. |
212 (if (r1 < #xa0) | 369 ((read r1 r2 r3) |
213 (write-multibyte-character r5 r1) | 370 ;; r0 > #xf0, thus eight-bit-graphic |
214 (write-multibyte-character r6 r1)) | 371 (write-multibyte-character r6 r0) |
215 (if (r2 < #xa0) | 372 (if (r1 < #xa0) |
216 (write-multibyte-character r5 r2) | 373 (if (r1 < #x80) ; invalid byte |
217 (write-multibyte-character r6 r2)) | 374 (write r1) |
218 (if (r3 < #xa0) | 375 (write-multibyte-character r5 r1)) |
219 (write-multibyte-character r5 r3) | 376 (write-multibyte-character r6 r1)) |
220 (write-multibyte-character r6 r3)))))) | 377 (if (r2 < #xa0) |
221 | 378 (if (r2 < #x80) ; invalid byte |
379 (write r2) | |
380 (write-multibyte-character r5 r2)) | |
381 (write-multibyte-character r6 r2)) | |
382 (if (r3 < #xa0) | |
383 (if (r3 < #x80) ; invalid byte | |
384 (write r3) | |
385 (write-multibyte-character r5 r3)) | |
386 (write-multibyte-character r6 r3)) | |
387 (if (r0 >= #xf8) ; 5- or 6-byte encoding | |
388 ((read r1) | |
389 (if (r1 < #xa0) | |
390 (if (r1 < #x80) ; invalid byte | |
391 (write r1) | |
392 (write-multibyte-character r5 r1)) | |
393 (write-multibyte-character r6 r1)) | |
394 (if (r0 >= #xfc) ; 6-byte | |
395 ((read r1) | |
396 (if (r1 < #xa0) | |
397 (if (r1 < #x80) ; invalid byte | |
398 (write r1) | |
399 (write-multibyte-character r5 r1)) | |
400 (write-multibyte-character r6 r1))))))) | |
401 ;; else invalid byte >= #xfe | |
402 (write-multibyte-character r6 r0)))))) | |
222 (repeat)))) | 403 (repeat)))) |
223 | 404 |
224 "CCL program to decode UTF-8. | 405 "CCL program to decode UTF-8. |
225 Basic decoding is done into the charsets ascii, latin-iso8859-1 and | 406 Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
226 mule-unicode-*. Encodings of un-representable Unicode characters are | 407 mule-unicode-*, but see also `utf-8-translation-table-for-decode' and |
227 decoded asis into eight-bit-control and eight-bit-graphic | 408 `utf-8-subst-table'. |
228 characters.") | 409 Encodings of un-representable Unicode characters are decoded asis into |
410 eight-bit-control and eight-bit-graphic characters.") | |
229 | 411 |
230 (define-ccl-program ccl-encode-mule-utf-8 | 412 (define-ccl-program ccl-encode-mule-utf-8 |
231 `(1 | 413 `(1 |
232 ((r5 = -1) | 414 ((r5 = -1) |
233 (loop | 415 (loop |
286 (write r0 r1 r2)) | 468 (write r0 r1 r2)) |
287 | 469 |
288 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) | 470 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) |
289 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | 471 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
290 (r1 &= #x7f) | 472 (r1 &= #x7f) |
291 (r1 += (r0 + 57312)) ; 57312 == -160 + #xe000 | 473 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000 |
292 (r0 = (((r1 & #xf000) >> 12) | #xe0)) | 474 (r0 = (((r1 & #xf000) >> 12) | #xe0)) |
293 (r2 = ((r1 & #x3f) | #x80)) | 475 (r2 = ((r1 & #x3f) | #x80)) |
294 (r1 &= #x0fc0) | 476 (r1 &= #x0fc0) |
295 (r1 >>= 6) | 477 (r1 >>= 6) |
296 (r1 |= #x80) | 478 (r1 |= #x80) |
327 (if (r1 < #xa0) | 509 (if (r1 < #xa0) |
328 (write r1) | 510 (write r1) |
329 ((write #xc2) | 511 ((write #xc2) |
330 (write r1))))))) | 512 (write r1))))))) |
331 | 513 |
332 ;; Unsupported character. | 514 ((lookup-character utf-8-subst-rev-table r0 r1) |
333 ;; Output U+FFFD, which is `ef bf bd' in UTF-8. | 515 (if r7 ; lookup succeeded |
334 ((write #xef) | 516 ((r1 = (((r0 & #xf000) >> 12) | #xe0)) |
335 (write #xbf) | 517 (r2 = ((r0 & #x3f) | #x80)) |
336 (write #xbd))))))))) | 518 (r0 &= #x0fc0) |
519 (r0 >>= 6) | |
520 (r0 |= #x80) | |
521 (write r1 r0 r2)) | |
522 ;; Unsupported character. | |
523 ;; Output U+FFFD, which is `ef bf bd' in UTF-8. | |
524 ((write #xef) | |
525 (write #xbf) | |
526 (write #xbd))))))))))) | |
337 (repeat))) | 527 (repeat))) |
338 (if (r1 >= #xa0) | 528 (if (r1 >= #xa0) |
339 (write r1) | 529 (write r1) |
340 (if (r1 >= #x80) | 530 (if (r1 >= #x80) |
341 ((write #xc2) | 531 ((write #xc2) |
342 (write r1))))) | 532 (write r1))))) |
343 | 533 |
344 "CCL program to encode into UTF-8. | 534 "CCL program to encode into UTF-8.") |
345 Only characters from the charsets ascii, eight-bit-control, | |
346 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized. | |
347 Others are encoded as U+FFFD.") | |
348 | 535 |
349 ;; Dummy definition so that the CCL can be checked correctly; the | 536 ;; Dummy definition so that the CCL can be checked correctly; the |
350 ;; actual data are loaded on demand. | 537 ;; actual data are loaded on demand. |
351 (unless (boundp 'ucs-mule-8859-to-mule-unicode) ; don't zap it | 538 (unless (boundp 'ucs-mule-8859-to-mule-unicode) ; don't zap it |
352 (define-translation-table 'ucs-mule-8859-to-mule-unicode)) | 539 (define-translation-table 'ucs-mule-8859-to-mule-unicode)) |
353 | 540 |
541 (define-ccl-program ccl-untranslated-to-ucs | |
542 `(0 | |
543 (if (r0 < #xf0) ; 3-byte encoding, as above | |
544 ((r4 = 0) | |
545 (r3 = (r1 & #b11000000)) | |
546 (r3 |= ((r2 >> 2) & #b00110000)) | |
547 (if (r3 != #b10100000) | |
548 (r4 = 1) | |
549 ((r3 = ((r0 & #x0f) << 12)) | |
550 (r3 += ((r1 & #x3f) << 6)) | |
551 (r3 += (r2 & #x3f)) | |
552 (if (r3 < #x0800) | |
553 (r4 = 1)))) | |
554 (if (r4 != 0) | |
555 (r0 = 0) | |
556 (r0 = r3))) | |
557 (if (r0 < #xf8) ; 4-byte (Mule-UCS recipe) | |
558 ((r4 = (r1 >> 6)) | |
559 (if (r4 != #b10) | |
560 (r0 = 0) | |
561 ((r4 = (r2 >> 6)) | |
562 (if (r4 != #b10) | |
563 (r0 = 0) | |
564 ((r4 = (r3 >> 6)) | |
565 (if (r4 != #b10) | |
566 (r0 = 0) | |
567 ((r1 = ((r1 & #x3F) << 12)) | |
568 (r2 = ((r2 & #x3F) << 6)) | |
569 (r3 &= #x3F) | |
570 (r0 = (((((r0 & #x07) << 18) | r1) | r2) | r3))))))))) | |
571 (r0 = 0)))) | |
572 "Decode 3- or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0. | |
573 r0 == 0 for invalid sequence.") | |
574 | |
575 (defvar utf-8-ccl-regs (make-vector 8 0)) | |
576 | |
354 (defsubst utf-8-untranslated-to-ucs () | 577 (defsubst utf-8-untranslated-to-ucs () |
355 (let ((b1 (char-after)) | 578 "Return the UCS code for an untranslated sequence of raw bytes t point. |
356 (b2 (char-after (1+ (point)))) | 579 Only for 3- or 4-byte sequences." |
357 (b3 (char-after (+ 2 (point)))) | 580 (aset utf-8-ccl-regs 0 (or (char-after) 0)) |
358 (b4 (char-after (+ 4 (point))))) | 581 (aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0)) |
359 (if (and b1 b2 b3) | 582 (aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0)) |
360 (cond ((< b1 ?\xf0) | 583 (aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0)) |
361 (setq b2 (lsh (logand b2 ?\x3f) 6)) | 584 (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs) |
362 (setq b3 (logand b3 ?\x3f)) | 585 (aref utf-8-ccl-regs 0)) |
363 (logior b3 (logior b2 (lsh (logand b1 ?\x0f) 12)))) | |
364 (b4 | |
365 (setq b2 (lsh (logand b2 ?\x3f) 12)) | |
366 (setq b3 (lsh (logand b3 ?\x3f) 6)) | |
367 (setq b4 (logand b4 ?\x3f)) | |
368 (logior b4 (logior b3 (logior b2 (lsh (logand b1 ?\x07) | |
369 18))))))))) | |
370 | 586 |
371 (defun utf-8-help-echo (window object position) | 587 (defun utf-8-help-echo (window object position) |
372 (format "Untranslated Unicode U+%04X" | 588 (format "Untranslated Unicode U+%04X" |
373 (get-char-property position 'untranslated-utf-8 object))) | 589 (get-char-property position 'untranslated-utf-8 object))) |
374 | 590 |
375 (defvar utf-8-subst-table nil | |
376 "If non-nil, a hash table mapping `untranslatable utf-8' to Emacs characters.") | |
377 | |
378 ;; We compose the untranslatable sequences into a single character. | 591 ;; We compose the untranslatable sequences into a single character. |
379 ;; This is infelicitous for editing, because there's currently no | 592 ;; This is infelicitous for editing, because there's currently no |
380 ;; mechanism for treating compositions as atomic, but is OK for | 593 ;; mechanism for treating compositions as atomic, but is OK for |
381 ;; display. We try to compose an appropriate character from a hash | 594 ;; display. They are composed to U+FFFD with help-echo which |
382 ;; table of CJK characters to display correctly. Otherwise we use | 595 ;; indicates the unicodes they represent. This function GCs too much. |
383 ;; U+FFFD. What we really should have is hash table lookup from CCL | |
384 ;; so that we could do this properly. This function GCs too much. | |
385 (defsubst utf-8-compose () | 596 (defsubst utf-8-compose () |
386 "Put a suitable composition on an untranslatable sequence. | 597 "Put a suitable composition on an untranslatable sequence. |
387 Return the sequence's length." | 598 Return the sequence's length." |
388 (let* ((u (utf-8-untranslated-to-ucs)) | 599 (let* ((u (utf-8-untranslated-to-ucs)) |
389 (l (and u (if (>= u ?\x10000) | 600 (l (unless (zerop u) |
601 (if (>= u #x10000) | |
390 4 | 602 4 |
391 3))) | 603 3)))) |
392 (subst (and utf-8-subst-table (gethash u utf-8-subst-table)))) | 604 (when l |
393 (when u | |
394 (put-text-property (point) (min (point-max) (+ l (point))) | 605 (put-text-property (point) (min (point-max) (+ l (point))) |
395 'untranslated-utf-8 u) | 606 'untranslated-utf-8 u) |
396 (unless subst | 607 (put-text-property (point) (min (point-max) (+ l (point))) |
397 (put-text-property (point) (min (point-max) (+ l (point))) | 608 'help-echo 'utf-8-help-echo) |
398 'help-echo 'utf-8-help-echo) | 609 (compose-region (point) (+ l (point)) ?$,3u=(B) |
399 (setq subst ?$,3u=(B)) | |
400 (compose-region (point) (+ l (point)) subst) | |
401 l))) | 610 l))) |
402 | 611 |
403 (defcustom utf-8-compose-scripts nil | 612 (defcustom utf-8-compose-scripts nil |
404 "*Non-nil means compose various scipts on decoding utf-8 text." | 613 "*Non-nil means compose various scripts on decoding utf-8 text." |
405 :group 'mule | 614 :group 'mule |
406 :type 'boolean) ; omitted in Emacs 21.1 | 615 :version "21.4" |
616 :type 'boolean) | |
407 | 617 |
408 (defun utf-8-post-read-conversion (length) | 618 (defun utf-8-post-read-conversion (length) |
409 "Compose untranslated utf-8 sequences into single characters. | 619 "Compose untranslated utf-8 sequences into single characters. |
410 Also compose particular scripts if `utf-8-compose-scripts' is non-nil." | 620 Also compose particular scripts if `utf-8-compose-scripts' is non-nil." |
411 (save-excursion | 621 (save-excursion |
412 ;; Can't do eval-when-compile to insert a multibyte constant | 622 ;; Can't do eval-when-compile to insert a multibyte constant |
413 ;; version of the string in the loop, since it's always loaded as | 623 ;; version of the string in the loop, since it's always loaded as |
414 ;; unibyte from a byte-compiled file. | 624 ;; unibyte from a byte-compiled file. |
415 (let ((range (string-as-multibyte "^\341-\377"))) | 625 (let ((range (string-as-multibyte "^\xe1-\xf7"))) |
416 (while (and (skip-chars-forward | 626 (while (and (skip-chars-forward range) |
417 range) | |
418 (not (eobp))) | 627 (not (eobp))) |
419 (forward-char (utf-8-compose))))) | 628 (forward-char (utf-8-compose))))) |
420 ;; Fixme: Takahashi-san implies it may not work this easily -- needs | 629 ;; Fixme: Takahashi-san implies it may not work this easily. I |
421 ;; checking with him. | 630 ;; asked why but didn't get a reply. -- fx |
422 (when (and utf-8-compose-scripts (> length 1)) | 631 (when (and utf-8-compose-scripts (> length 1)) |
423 ;; These currently have definitions which cover the relevant | 632 ;; These currently have definitions which cover the relevant |
424 ;; Unicodes. We could avoid loading thai-util &c by checking | 633 ;; unicodes. We could avoid loading thai-util &c by checking |
425 ;; whether the region contains any characters with the appropriate | 634 ;; whether the region contains any characters with the appropriate |
426 ;; categories. There aren't yet Unicode-based rules for Tibetan. | 635 ;; categories. There aren't yet Unicode-based rules for Tibetan. |
427 (save-excursion (setq length (diacritic-post-read-conversion length))) | 636 (save-excursion (setq length (diacritic-post-read-conversion length))) |
428 (save-excursion (setq length (thai-post-read-conversion length))) | 637 (save-excursion (setq length (thai-post-read-conversion length))) |
429 (save-excursion (setq length (lao-post-read-conversion length))) | 638 (save-excursion (setq length (lao-post-read-conversion length))) |
430 (save-excursion (setq length (devanagari-post-read-conversion length)))) | 639 (save-excursion |
640 (setq length (in-is13194-devanagari-post-read-conversion length)))) | |
431 length) | 641 length) |
432 | 642 |
433 (defun utf-8-pre-write-conversion (beg end) | 643 ;; ucs-tables is preloaded |
434 "Semi-dummy pre-write function effectively to autoload ucs-tables." | 644 ;; (defun utf-8-pre-write-conversion (beg end) |
435 ;; Ensure translation table is loaded. | 645 ;; "Semi-dummy pre-write function effectively to autoload ucs-tables." |
436 (require 'ucs-tables) | 646 ;; ;; Ensure translation table is loaded. |
437 ;; Don't do this again. | 647 ;; (require 'ucs-tables) |
438 (coding-system-put 'mule-utf-8 'pre-write-conversion nil) | 648 ;; ;; Don't do this again. |
439 nil) | 649 ;; (coding-system-put 'mule-utf-8 'pre-write-conversion nil) |
650 ;; nil) | |
440 | 651 |
441 (make-coding-system | 652 (make-coding-system |
442 'mule-utf-8 4 ?u | 653 'mule-utf-8 4 ?u |
443 "UTF-8 encoding for Emacs-supported Unicode characters. | 654 "UTF-8 encoding for Emacs-supported Unicode characters. |
444 The supported Emacs character sets are the following, plus others | 655 The supported Emacs character sets are the following, plus any other |
445 which may be included in the translation table | 656 characters included in the tables `ucs-mule-to-mule-unicode' and |
446 `ucs-mule-to-mule-unicode': | 657 `utf-8-subst-rev-table': |
447 ascii | 658 ascii |
448 eight-bit-control | 659 eight-bit-control |
449 eight-bit-graphic | 660 eight-bit-graphic |
450 latin-iso8859-1 | 661 latin-iso8859-1 |
451 latin-iso8859-2 | 662 latin-iso8859-2 |
460 mule-unicode-0100-24ff | 671 mule-unicode-0100-24ff |
461 mule-unicode-2500-33ff | 672 mule-unicode-2500-33ff |
462 mule-unicode-e000-ffff | 673 mule-unicode-e000-ffff |
463 | 674 |
464 Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF | 675 Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF |
465 are decoded into sequences of eight-bit-control and eight-bit-graphic | 676 may be decoded into korean-ksc5601, chinese-gb2312, japanese-jisx0208 |
466 characters to preserve their byte sequences and composed to display as | 677 \(see user option `utf-8-translate-cjk'); otherwise, sequences of |
467 a single character. Emacs characters that can't be encoded to these | 678 eight-bit-control and eight-bit-graphic characters are used to |
468 ranges are encoded as U+FFFD." | 679 preserve their byte sequences, and these are composed to display as a |
680 single character. Emacs characters that otherwise can't be encoded | |
681 are encoded as U+FFFD." | |
469 | 682 |
470 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) | 683 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) |
471 '((safe-charsets | 684 '((safe-charsets |
472 ascii | 685 ascii |
473 eight-bit-control | 686 eight-bit-control |
495 mule-unicode-2500-33ff | 708 mule-unicode-2500-33ff |
496 mule-unicode-e000-ffff) | 709 mule-unicode-e000-ffff) |
497 (mime-charset . utf-8) | 710 (mime-charset . utf-8) |
498 (coding-category . coding-category-utf-8) | 711 (coding-category . coding-category-utf-8) |
499 (valid-codes (0 . 255)) | 712 (valid-codes (0 . 255)) |
500 (pre-write-conversion . utf-8-pre-write-conversion) | 713 ;; (pre-write-conversion . utf-8-pre-write-conversion) |
501 (post-read-conversion . utf-8-post-read-conversion))) | 714 (post-read-conversion . utf-8-post-read-conversion))) |
502 | 715 |
503 (define-coding-system-alias 'utf-8 'mule-utf-8) | 716 (define-coding-system-alias 'utf-8 'mule-utf-8) |
504 | 717 |
505 ;; I think this needs special private charsets defined for the | 718 ;; I think this needs special private charsets defined for the |