Mercurial > emacs
annotate lisp/international/utf-8.el @ 47475:ee41d5989edf
Fix typo.
author | Richard M. Stallman <rms@gnu.org> |
---|---|
date | Fri, 13 Sep 2002 19:36:55 +0000 |
parents | 63f5cc467cea |
children | 6d4430dfeafc |
rev | line source |
---|---|
46496 | 1 ;;; utf-8.el --- UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*- |
35542 | 2 |
3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. | |
4 ;; Licensed to the Free Software Foundation. | |
46496 | 5 ;; Copyright (C) 2001, 2002 Free Software Foundation, Inc. |
35542 | 6 |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
7 ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org> |
46496 | 8 ;; Maintainer: FSF |
36243 | 9 ;; Keywords: multilingual, Unicode, UTF-8, i18n |
35542 | 10 |
11 ;; This file is part of GNU Emacs. | |
12 | |
13 ;; GNU Emacs is free software; you can redistribute it and/or modify | |
14 ;; it under the terms of the GNU General Public License as published by | |
15 ;; the Free Software Foundation; either version 2, or (at your option) | |
16 ;; any later version. | |
17 | |
18 ;; GNU Emacs is distributed in the hope that it will be useful, | |
19 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
21 ;; GNU General Public License for more details. | |
22 | |
23 ;; You should have received a copy of the GNU General Public License | |
24 ;; along with GNU Emacs; see the file COPYING. If not, write to the | |
25 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
26 ;; Boston, MA 02111-1307, USA. | |
27 | |
28 ;;; Commentary: | |
29 | |
41873 | 30 ;; The coding-system `mule-utf-8' basically supports encoding/decoding |
31 ;; of the following character sets to and from UTF-8: | |
35542 | 32 ;; |
33 ;; ascii | |
34 ;; eight-bit-control | |
35 ;; latin-iso8859-1 | |
36 ;; mule-unicode-0100-24ff | |
37 ;; mule-unicode-2500-33ff | |
38 ;; mule-unicode-e000-ffff | |
39 ;; | |
36243 | 40 ;; On decoding, Unicode characters that do not fit into the above |
41 ;; character sets are handled as `eight-bit-control' or | |
42 ;; `eight-bit-graphic' characters to retain the information about the | |
46496 | 43 ;; original byte sequence and text properties record the corresponding |
44 ;; unicode. | |
45 ;; | |
46 ;; Fixme: note that reading and writing invalid utf-8 may not be | |
47 ;; idempotent -- fixing that needs a new charset to represent the bytes. | |
41873 | 48 ;; |
49 ;; Characters from other character sets can be encoded with | |
50 ;; mule-utf-8 by populating the table `ucs-mule-to-mule-unicode' and | |
46496 | 51 ;; registering the translation with `register-char-codings'. Hash |
52 ;; tables `utf-8-subst-table' and `utf-8-subst-rev-table' are used to | |
53 ;; support encoding and decoding of about a quarter of the CJK space | |
54 ;; between U+3400 and U+DFFF. | |
36243 | 55 |
56 ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is: | |
35542 | 57 |
58 ;; scalar | utf-8 | |
59 ;; value | 1st byte | 2nd byte | 3rd byte | |
60 ;; --------------------+-----------+-----------+---------- | |
61 ;; 0000 0000 0xxx xxxx | 0xxx xxxx | | | |
62 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx | | |
63 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx | |
64 | |
65 ;;; Code: | |
66 | |
41873 | 67 (defvar ucs-mule-to-mule-unicode (make-translation-table) |
68 "Translation table for encoding to `mule-utf-8'.") | |
47385
d46bcfbfdda0
(ucs-mule-to-mule-unicode): Define
Dave Love <fx@gnu.org>
parents:
47231
diff
changeset
|
69 (define-translation-table 'ucs-mule-to-mule-unicode |
d46bcfbfdda0
(ucs-mule-to-mule-unicode): Define
Dave Love <fx@gnu.org>
parents:
47231
diff
changeset
|
70 ucs-mule-to-mule-unicode) |
46496 | 71 |
72 (defvar utf-8-subst-table (make-hash-table :test 'eq)) | |
73 (defvar utf-8-subst-rev-table (make-hash-table :test 'eq)) | |
74 (define-translation-hash-table 'utf-8-subst-table utf-8-subst-table) | |
75 (define-translation-hash-table 'utf-8-subst-rev-table utf-8-subst-rev-table) | |
76 | |
77 (defvar utf-8-translation-table-for-decode (make-translation-table) | |
78 "Translation table applied after decoding utf-8 to mule-unicode. | |
79 This is only actually applied to characters which would normally be | |
80 decoded into mule-unicode-0100-24ff.") | |
81 (define-translation-table 'utf-8-translation-table-for-decode | |
82 utf-8-translation-table-for-decode) | |
83 | |
84 ;; Map Cyrillic and Greek to iso-8859 charsets, which take half the | |
85 ;; space of mule-unicode. For Latin scripts this isn't very | |
86 ;; important. Hebrew and Arabic might go here too when there's proper | |
87 ;; support for them. | |
47385
d46bcfbfdda0
(ucs-mule-to-mule-unicode): Define
Dave Love <fx@gnu.org>
parents:
47231
diff
changeset
|
88 (defvar utf-8-fragmentation-table (make-translation-table) |
d46bcfbfdda0
(ucs-mule-to-mule-unicode): Define
Dave Love <fx@gnu.org>
parents:
47231
diff
changeset
|
89 "Char table normally mapping non-Latin mule-unicode-... characters to iso8859. |
d46bcfbfdda0
(ucs-mule-to-mule-unicode): Define
Dave Love <fx@gnu.org>
parents:
47231
diff
changeset
|
90 Used as the value of `utf-8-translation-table-for-decode' in |
d46bcfbfdda0
(ucs-mule-to-mule-unicode): Define
Dave Love <fx@gnu.org>
parents:
47231
diff
changeset
|
91 `utf-8-fragment-on-decoding' mode.") |
46496 | 92 (mapc |
93 (lambda (pair) | |
47385
d46bcfbfdda0
(ucs-mule-to-mule-unicode): Define
Dave Love <fx@gnu.org>
parents:
47231
diff
changeset
|
94 (aset utf-8-fragmentation-table (car pair) (cdr pair))) |
46496 | 95 '((?$,1&d(B . ?,F4(B) (?$,1&e(B . ?,F5(B) (?$,1&f(B . ?,F6(B) (?$,1&h(B . ?,F8(B) (?$,1&i(B . ?,F9(B) |
96 (?$,1&j(B . ?,F:(B) (?$,1&l(B . ?,F<(B) (?$,1&n(B . ?,F>(B) (?$,1&o(B . ?,F?(B) (?$,1&p(B . ?,F@(B) | |
97 (?$,1&q(B . ?,FA(B) (?$,1&r(B . ?,FB(B) (?$,1&s(B . ?,FC(B) (?$,1&t(B . ?,FD(B) (?$,1&u(B . ?,FE(B) | |
98 (?$,1&v(B . ?,FF(B) (?$,1&w(B . ?,FG(B) (?$,1&x(B . ?,FH(B) (?$,1&y(B . ?,FI(B) (?$,1&z(B . ?,FJ(B) | |
99 (?$,1&{(B . ?,FK(B) (?$,1&|(B . ?,FL(B) (?$,1&}(B . ?,FM(B) (?$,1&~(B . ?,FN(B) (?$,1&(B . ?,FO(B) | |
100 (?$,1' (B . ?,FP(B) (?$,1'!(B . ?,FQ(B) (?$,1'#(B . ?,FS(B) (?$,1'$(B . ?,FT(B) (?$,1'%(B . ?,FU(B) | |
101 (?$,1'&(B . ?,FV(B) (?$,1''(B . ?,FW(B) (?$,1'((B . ?,FX(B) (?$,1')(B . ?,FY(B) (?$,1'*(B . ?,FZ(B) | |
102 (?$,1'+(B . ?,F[(B) (?$,1',(B . ?,F\(B) (?$,1'-(B . ?,F](B) (?$,1'.(B . ?,F^(B) (?$,1'/(B . ?,F_(B) | |
103 (?$,1'0(B . ?,F`(B) (?$,1'1(B . ?,Fa(B) (?$,1'2(B . ?,Fb(B) (?$,1'3(B . ?,Fc(B) (?$,1'4(B . ?,Fd(B) | |
104 (?$,1'5(B . ?,Fe(B) (?$,1'6(B . ?,Ff(B) (?$,1'7(B . ?,Fg(B) (?$,1'8(B . ?,Fh(B) (?$,1'9(B . ?,Fi(B) | |
105 (?$,1':(B . ?,Fj(B) (?$,1';(B . ?,Fk(B) (?$,1'<(B . ?,Fl(B) (?$,1'=(B . ?,Fm(B) (?$,1'>(B . ?,Fn(B) | |
106 (?$,1'?(B . ?,Fo(B) (?$,1'@(B . ?,Fp(B) (?$,1'A(B . ?,Fq(B) (?$,1'B(B . ?,Fr(B) (?$,1'C(B . ?,Fs(B) | |
107 (?$,1'D(B . ?,Ft(B) (?$,1'E(B . ?,Fu(B) (?$,1'F(B . ?,Fv(B) (?$,1'G(B . ?,Fw(B) (?$,1'H(B . ?,Fx(B) | |
108 (?$,1'I(B . ?,Fy(B) (?$,1'J(B . ?,Fz(B) (?$,1'K(B . ?,F{(B) (?$,1'L(B . ?,F|(B) (?$,1'M(B . ?,F}(B) | |
109 (?$,1'N(B . ?,F~(B) | |
110 | |
111 (?$,1(!(B . ?,L!(B) (?$,1("(B . ?,L"(B) (?$,1(#(B . ?,L#(B) (?$,1($(B . ?,L$(B) | |
112 (?$,1(%(B . ?,L%(B) (?$,1(&(B . ?,L&(B) (?$,1('(B . ?,L'(B) (?$,1(((B . ?,L((B) (?$,1()(B . ?,L)(B) | |
113 (?$,1(*(B . ?,L*(B) (?$,1(+(B . ?,L+(B) (?$,1(,(B . ?,L,(B) (?$,1(.(B . ?,L.(B) (?$,1(/(B . ?,L/(B) | |
114 (?$,1(0(B . ?,L0(B) (?$,1(1(B . ?,L1(B) (?$,1(2(B . ?,L2(B) (?$,1(3(B . ?,L3(B) (?$,1(4(B . ?,L4(B) | |
115 (?$,1(5(B . ?,L5(B) (?$,1(6(B . ?,L6(B) (?$,1(7(B . ?,L7(B) (?$,1(8(B . ?,L8(B) (?$,1(9(B . ?,L9(B) | |
116 (?$,1(:(B . ?,L:(B) (?$,1(;(B . ?,L;(B) (?$,1(<(B . ?,L<(B) (?$,1(=(B . ?,L=(B) (?$,1(>(B . ?,L>(B) | |
117 (?$,1(?(B . ?,L?(B) (?$,1(@(B . ?,L@(B) (?$,1(A(B . ?,LA(B) (?$,1(B(B . ?,LB(B) (?$,1(C(B . ?,LC(B) | |
118 (?$,1(D(B . ?,LD(B) (?$,1(E(B . ?,LE(B) (?$,1(F(B . ?,LF(B) (?$,1(G(B . ?,LG(B) (?$,1(H(B . ?,LH(B) | |
119 (?$,1(I(B . ?,LI(B) (?$,1(J(B . ?,LJ(B) (?$,1(K(B . ?,LK(B) (?$,1(L(B . ?,LL(B) (?$,1(M(B . ?,LM(B) | |
120 (?$,1(N(B . ?,LN(B) (?$,1(O(B . ?,LO(B) (?$,1(P(B . ?,LP(B) (?$,1(Q(B . ?,LQ(B) (?$,1(R(B . ?,LR(B) | |
121 (?$,1(S(B . ?,LS(B) (?$,1(T(B . ?,LT(B) (?$,1(U(B . ?,LU(B) (?$,1(V(B . ?,LV(B) (?$,1(W(B . ?,LW(B) | |
122 (?$,1(X(B . ?,LX(B) (?$,1(Y(B . ?,LY(B) (?$,1(Z(B . ?,LZ(B) (?$,1([(B . ?,L[(B) (?$,1(\(B . ?,L\(B) | |
123 (?$,1(](B . ?,L](B) (?$,1(^(B . ?,L^(B) (?$,1(_(B . ?,L_(B) (?$,1(`(B . ?,L`(B) (?$,1(a(B . ?,La(B) | |
124 (?$,1(b(B . ?,Lb(B) (?$,1(c(B . ?,Lc(B) (?$,1(d(B . ?,Ld(B) (?$,1(e(B . ?,Le(B) (?$,1(f(B . ?,Lf(B) | |
125 (?$,1(g(B . ?,Lg(B) (?$,1(h(B . ?,Lh(B) (?$,1(i(B . ?,Li(B) (?$,1(j(B . ?,Lj(B) (?$,1(k(B . ?,Lk(B) | |
126 (?$,1(l(B . ?,Ll(B) (?$,1(m(B . ?,Lm(B) (?$,1(n(B . ?,Ln(B) (?$,1(o(B . ?,Lo(B) (?$,1(q(B . ?,Lq(B) | |
127 (?$,1(r(B . ?,Lr(B) (?$,1(s(B . ?,Ls(B) (?$,1(t(B . ?,Lt(B) (?$,1(u(B . ?,Lu(B) (?$,1(v(B . ?,Lv(B) | |
128 (?$,1(w(B . ?,Lw(B) (?$,1(x(B . ?,Lx(B) (?$,1(y(B . ?,Ly(B) (?$,1(z(B . ?,Lz(B) (?$,1({(B . ?,L{(B) | |
129 (?$,1(|(B . ?,L|(B) (?$,1(~(B . ?,L~(B) (?$,1((B . ?,L(B))) | |
130 | |
131 (defcustom utf-8-fragment-on-decoding nil | |
47231 | 132 "Whether or not to decode some scripts in UTF-8 text into iso8859 charsets. |
46496 | 133 Setting this means that the relevant Cyrillic and Greek characters are |
134 decoded into the iso8859 charsets rather than into | |
47231 | 135 mule-unicode-0100-24ff. The iso8859 charsets take half as much space |
46496 | 136 in the buffer, but using them may affect how the buffer can be re-encoded |
137 and may require a different input method to search for them, for instance. | |
138 See `unify-8859-on-decoding-mode' and `unify-8859-on-encoding-mode' | |
47231 | 139 for mechanisms to make this largely transparent. |
140 | |
141 Setting this variable outside customize has no effect." | |
46496 | 142 :set (lambda (s v) |
47385
d46bcfbfdda0
(ucs-mule-to-mule-unicode): Define
Dave Love <fx@gnu.org>
parents:
47231
diff
changeset
|
143 (setq utf-8-translation-table-for-decode |
d46bcfbfdda0
(ucs-mule-to-mule-unicode): Define
Dave Love <fx@gnu.org>
parents:
47231
diff
changeset
|
144 (if v |
d46bcfbfdda0
(ucs-mule-to-mule-unicode): Define
Dave Love <fx@gnu.org>
parents:
47231
diff
changeset
|
145 utf-8-fragmentation-table |
47409
63f5cc467cea
(utf-8-fragment-on-decoding): Fix last
Dave Love <fx@gnu.org>
parents:
47385
diff
changeset
|
146 (make-translation-table))) |
47385
d46bcfbfdda0
(ucs-mule-to-mule-unicode): Define
Dave Love <fx@gnu.org>
parents:
47231
diff
changeset
|
147 (define-translation-table 'utf-8-translation-table-for-decode |
d46bcfbfdda0
(ucs-mule-to-mule-unicode): Define
Dave Love <fx@gnu.org>
parents:
47231
diff
changeset
|
148 utf-8-translation-table-for-decode) |
46496 | 149 (set-default s v)) |
150 :version "21.4" | |
151 :type 'boolean | |
152 :group 'mule) | |
153 | |
154 (defcustom utf-8-translate-cjk nil | |
155 "Whether the `mule-utf-8' coding system should encode many CJK characters. | |
156 | |
157 Enabling this loads tables which enable the coding system to encode | |
158 characters in the charsets `korean-ksc5601', `chinese-gb2312' and | |
159 `japanese-jisx0208', and to decode the corresponding unicodes into | |
160 such characters. This works by loading the library `utf-8-subst'; see | |
161 its commentary. The tables are fairly large (about 33000 entries), so this | |
162 option is not the default." | |
163 :link '(emacs-commentary-link "utf-8-subst") | |
164 :set (lambda (s v) | |
165 (when v | |
166 (require 'utf-8-subst) | |
167 (let ((table (make-char-table 'translation-table))) | |
168 (coding-system-put 'mule-utf-8 'safe-charsets | |
169 (append (coding-system-get 'mule-utf-8 | |
170 'safe-charsets) | |
171 '(korean-ksc5601 chinese-gb2312 | |
172 japanese-jisx0208))) | |
173 (maphash (lambda (k v) | |
174 (aset table k v)) | |
175 utf-8-subst-rev-table) | |
176 (register-char-codings 'mule-utf-8 table))) | |
177 (set-default s v)) | |
178 :version "21.4" | |
179 :type 'boolean | |
180 :group 'mule) | |
181 | |
35542 | 182 (define-ccl-program ccl-decode-mule-utf-8 |
183 ;; | |
184 ;; charset | bytes in utf-8 | bytes in emacs | |
185 ;; -----------------------+----------------+--------------- | |
186 ;; ascii | 1 | 1 | |
187 ;; -----------------------+----------------+--------------- | |
188 ;; eight-bit-control | 2 | 2 | |
41873 | 189 ;; eight-bit-graphic | 2 | 1 |
35542 | 190 ;; latin-iso8859-1 | 2 | 2 |
191 ;; -----------------------+----------------+--------------- | |
192 ;; mule-unicode-0100-24ff | 2 | 4 | |
193 ;; (< 0800) | | | |
194 ;; -----------------------+----------------+--------------- | |
195 ;; mule-unicode-0100-24ff | 3 | 4 | |
196 ;; (>= 0800) | | | |
197 ;; mule-unicode-2500-33ff | 3 | 4 | |
198 ;; mule-unicode-e000-ffff | 3 | 4 | |
199 ;; | |
200 ;; Thus magnification factor is two. | |
201 ;; | |
202 `(2 | |
37934
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
203 ((r5 = ,(charset-id 'eight-bit-control)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
204 (r6 = ,(charset-id 'eight-bit-graphic)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
205 (loop |
35542 | 206 (read r0) |
207 | |
208 ;; 1byte encoding, i.e., ascii | |
209 (if (r0 < #x80) | |
210 (write r0) | |
46496 | 211 (if (r0 < #xc0) ; continuation byte (invalid here) |
212 (if (r0 < #xa0) | |
213 (write-multibyte-character r5 r0) | |
214 (write-multibyte-character r6 r0)) | |
215 ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx | |
216 (if (r0 < #xe0) | |
217 ((read r1) | |
35542 | 218 |
46496 | 219 (if ((r1 & #b11000000) != #b10000000) |
220 ;; Invalid 2-byte sequence | |
37934
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
221 ((if (r0 < #xa0) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
222 (write-multibyte-character r5 r0) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
223 (write-multibyte-character r6 r0)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
224 (if (r1 < #x80) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
225 (write r1) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
226 (if (r1 < #xa0) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
227 (write-multibyte-character r5 r1) |
46496 | 228 (write-multibyte-character r6 r1)))) |
229 | |
230 ((r3 = r0) ; save in case of overlong sequence | |
231 (r2 = r1) | |
232 (r0 &= #x1f) | |
233 (r0 <<= 6) | |
234 (r2 = r1) ; save in case of overlong sequence | |
235 (r1 &= #x3f) | |
236 (r1 += r0) | |
237 ;; Now r1 holds scalar value | |
238 | |
239 (if (r1 < 128) ; `overlong sequence' | |
240 ((if (r3 < #xa0) | |
241 (write-multibyte-character r5 r3) | |
242 (write-multibyte-character r6 r3)) | |
243 (if (r2 < #x80) | |
244 (write r2) | |
245 (if (r2 < #xa0) | |
246 (write-multibyte-character r5 r2) | |
247 (write-multibyte-character r6 r2)))) | |
248 | |
249 ;; eight-bit-control | |
250 (if (r1 < 160) | |
251 ((write-multibyte-character r5 r1)) | |
252 | |
253 ;; latin-iso8859-1 | |
254 (if (r1 < 256) | |
255 ((r0 = ,(charset-id 'latin-iso8859-1)) | |
256 (r1 -= 128) | |
257 (write-multibyte-character r0 r1)) | |
258 | |
259 ;; mule-unicode-0100-24ff (< 0800) | |
260 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
261 (r1 -= #x0100) | |
262 (r2 = (((r1 / 96) + 32) << 7)) | |
263 (r1 %= 96) | |
264 (r1 += (r2 + 32)) | |
265 (translate-character | |
266 utf-8-translation-table-for-decode r0 r1) | |
267 (write-multibyte-character r0 r1)))))))) | |
268 | |
269 ;; 3byte encoding | |
270 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx | |
271 (if (r0 < #xf0) | |
272 ((read r1 r2) | |
273 | |
274 ;; This is set to 1 if the encoding is invalid. | |
275 (r4 = 0) | |
276 | |
277 (r3 = (r1 & #b11000000)) | |
278 (r3 |= ((r2 >> 2) & #b00110000)) | |
279 (if (r3 != #b10100000) | |
280 (r4 = 1) | |
281 ((r3 = ((r0 & #x0f) << 12)) | |
282 (r3 += ((r1 & #x3f) << 6)) | |
283 (r3 += (r2 & #x3f)) | |
284 (if (r3 < #x0800) | |
285 (r4 = 1)))) | |
286 | |
287 (if (r4 != 0) | |
288 ;; Invalid 3-byte sequence | |
289 ((if (r0 < #xa0) | |
290 (write-multibyte-character r5 r0) | |
291 (write-multibyte-character r6 r0)) | |
292 (if (r1 < #x80) | |
293 (write r1) | |
294 (if (r1 < #xa0) | |
295 (write-multibyte-character r5 r1) | |
296 (write-multibyte-character r6 r1))) | |
297 (if (r2 < #x80) | |
298 (write r2) | |
299 (if (r2 < #xa0) | |
300 (write-multibyte-character r5 r2) | |
301 (write-multibyte-character r6 r2)))) | |
37934
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
302 |
46496 | 303 ;; mule-unicode-0100-24ff (>= 0800) |
304 ((if (r3 < #x2500) | |
305 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
306 (r3 -= #x0100) | |
37934
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
307 (r3 //= 96) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
308 (r1 = (r7 + 32)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
309 (r1 += ((r3 + 32) << 7)) |
46496 | 310 (translate-character |
311 utf-8-translation-table-for-decode r0 r1) | |
37934
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
312 (write-multibyte-character r0 r1)) |
46496 | 313 |
314 ;; mule-unicode-2500-33ff | |
315 ;; Fixme: Perhaps allow translation via | |
316 ;; utf-8-subst-table for #x2e80 up, so that we use | |
317 ;; consistent charsets for all of CJK. Would need | |
318 ;; corresponding change to encoding tables. | |
319 (if (r3 < #x3400) | |
320 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) | |
321 (r3 -= #x2500) | |
322 (r3 //= 96) | |
323 (r1 = (r7 + 32)) | |
324 (r1 += ((r3 + 32) << 7)) | |
325 (write-multibyte-character r0 r1)) | |
326 | |
327 ;; U+3400 .. U+D7FF | |
328 ;; Try to convert to CJK chars, else keep | |
329 ;; them as eight-bit-{control|graphic}. | |
330 (if (r3 < #xd800) | |
331 ((r4 = r3) ; don't zap r3 | |
332 (lookup-integer utf-8-subst-table r4 r5) | |
333 (if r7 | |
334 ;; got a translation | |
335 ((write-multibyte-character r4 r5) | |
336 ;; Zapped through register starvation. | |
337 (r5 = ,(charset-id 'eight-bit-control))) | |
338 ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic | |
339 ((r3 = r6) | |
340 (write-multibyte-character r3 r0) | |
341 (if (r1 < #xa0) | |
342 (r3 = r5)) | |
343 (write-multibyte-character r3 r1) | |
344 (if (r2 < #xa0) | |
345 (r3 = r5) | |
346 (r3 = r6)) | |
347 (write-multibyte-character r3 r2)))) | |
348 | |
349 ;; Surrogates, U+D800 .. U+DFFF | |
350 (if (r3 < #xe000) | |
351 ((r3 = r6) | |
352 (write-multibyte-character r3 r0) ; eight-bit-graphic | |
353 (if (r1 < #xa0) | |
354 (r3 = r5)) | |
355 (write-multibyte-character r3 r1) | |
356 (if (r2 < #xa0) | |
357 (r3 = r5) | |
358 (r3 = r6)) | |
359 (write-multibyte-character r3 r2)) | |
37934
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
360 |
46496 | 361 ;; mule-unicode-e000-ffff |
362 ;; Fixme: fffe and ffff are invalid. | |
363 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) | |
364 (r3 -= #xe000) | |
365 (r3 //= 96) | |
366 (r1 = (r7 + 32)) | |
367 (r1 += ((r3 + 32) << 7)) | |
368 (write-multibyte-character r0 r1))))))))) | |
35542 | 369 |
46496 | 370 (if (r0 < #xfe) |
371 ;; 4byte encoding | |
372 ;; keep those bytes as eight-bit-{control|graphic} | |
373 ;; Fixme: allow lookup in utf-8-subst-table. | |
374 ((read r1 r2 r3) | |
375 ;; r0 >= #xf0, thus eight-bit-graphic | |
376 (write-multibyte-character r6 r0) | |
377 (if (r1 < #xa0) | |
378 (if (r1 < #x80) ; invalid byte | |
379 (write r1) | |
380 (write-multibyte-character r5 r1)) | |
381 (write-multibyte-character r6 r1)) | |
382 (if (r2 < #xa0) | |
383 (if (r2 < #x80) ; invalid byte | |
384 (write r2) | |
385 (write-multibyte-character r5 r2)) | |
386 (write-multibyte-character r6 r2)) | |
387 (if (r3 < #xa0) | |
388 (if (r3 < #x80) ; invalid byte | |
389 (write r3) | |
390 (write-multibyte-character r5 r3)) | |
391 (write-multibyte-character r6 r3)) | |
392 (if (r0 >= #xf8) ; 5- or 6-byte encoding | |
393 ((read r1) | |
394 (if (r1 < #xa0) | |
395 (if (r1 < #x80) ; invalid byte | |
396 (write r1) | |
397 (write-multibyte-character r5 r1)) | |
398 (write-multibyte-character r6 r1)) | |
399 (if (r0 >= #xfc) ; 6-byte | |
400 ((read r1) | |
401 (if (r1 < #xa0) | |
402 (if (r1 < #x80) ; invalid byte | |
403 (write r1) | |
404 (write-multibyte-character r5 r1)) | |
405 (write-multibyte-character r6 r1))))))) | |
406 ;; else invalid byte >= #xfe | |
407 (write-multibyte-character r6 r0)))))) | |
35542 | 408 (repeat)))) |
409 | |
36243 | 410 "CCL program to decode UTF-8. |
36465 | 411 Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
46496 | 412 mule-unicode-*, but see also `utf-8-translation-table-for-decode' and |
413 `utf-8-subst-table'. | |
414 Encodings of un-representable Unicode characters are decoded asis into | |
415 eight-bit-control and eight-bit-graphic characters.") | |
35542 | 416 |
417 (define-ccl-program ccl-encode-mule-utf-8 | |
418 `(1 | |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
419 ((r5 = -1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
420 (loop |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
421 (if (r5 < 0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
422 ((r1 = -1) |
41873 | 423 (read-multibyte-character r0 r1) |
424 (translate-character ucs-mule-to-mule-unicode r0 r1)) | |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
425 (;; We have already done read-multibyte-character. |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
426 (r0 = r5) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
427 (r1 = r6) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
428 (r5 = -1))) |
35542 | 429 |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
430 (if (r0 == ,(charset-id 'ascii)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
431 (write r1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
432 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
433 (if (r0 == ,(charset-id 'latin-iso8859-1)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
434 ;; r1 scalar utf-8 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
435 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
436 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
437 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
438 ((r0 = (((r1 & #x40) >> 6) | #xc2)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
439 (r1 &= #x3f) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
440 (r1 |= #x80) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
441 (write r0 r1)) |
35542 | 442 |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
443 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
444 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
445 ;; #x3f80 == (0011 1111 1000 0000)b |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
446 (r1 &= #x7f) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
447 (r1 += (r0 + 224)) ; 224 == -32 + #x0100 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
448 ;; now r1 holds scalar value |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
449 (if (r1 < #x0800) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
450 ;; 2byte encoding |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
451 ((r0 = (((r1 & #x07c0) >> 6) | #xc0)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
452 ;; #x07c0 == (0000 0111 1100 0000)b |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
453 (r1 &= #x3f) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
454 (r1 |= #x80) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
455 (write r0 r1)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
456 ;; 3byte encoding |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
457 ((r0 = (((r1 & #xf000) >> 12) | #xe0)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
458 (r2 = ((r1 & #x3f) | #x80)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
459 (r1 &= #x0fc0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
460 (r1 >>= 6) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
461 (r1 |= #x80) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
462 (write r0 r1 r2)))) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
463 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
464 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
465 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
466 (r1 &= #x7f) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
467 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
468 (r0 = (((r1 & #xf000) >> 12) | #xe0)) |
35542 | 469 (r2 = ((r1 & #x3f) | #x80)) |
470 (r1 &= #x0fc0) | |
471 (r1 >>= 6) | |
472 (r1 |= #x80) | |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
473 (write r0 r1 r2)) |
35542 | 474 |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
475 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
476 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
477 (r1 &= #x7f) |
46496 | 478 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000 |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
479 (r0 = (((r1 & #xf000) >> 12) | #xe0)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
480 (r2 = ((r1 & #x3f) | #x80)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
481 (r1 &= #x0fc0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
482 (r1 >>= 6) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
483 (r1 |= #x80) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
484 (write r0 r1 r2)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
485 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
486 (if (r0 == ,(charset-id 'eight-bit-control)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
487 ;; r1 scalar utf-8 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
488 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
489 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
490 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
491 ((write #xc2) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
492 (write r1)) |
35542 | 493 |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
494 (if (r0 == ,(charset-id 'eight-bit-graphic)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
495 ;; r1 scalar utf-8 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
496 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
497 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
498 ;; ff 0000 0000 1111 1111 1101 1111 1011 1111 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
499 ((write r1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
500 (r1 = -1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
501 (read-multibyte-character r0 r1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
502 (if (r0 != ,(charset-id 'eight-bit-graphic)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
503 (if (r0 != ,(charset-id 'eight-bit-control)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
504 ((r5 = r0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
505 (r6 = r1)))) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
506 (if (r5 < 0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
507 ((read-multibyte-character r0 r2) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
508 (if (r0 != ,(charset-id 'eight-bit-graphic)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
509 (if (r0 != ,(charset-id 'eight-bit-control)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
510 ((r5 = r0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
511 (r6 = r2)))) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
512 (if (r5 < 0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
513 (write r1 r2) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
514 (if (r1 < #xa0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
515 (write r1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
516 ((write #xc2) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
517 (write r1))))))) |
35542 | 518 |
46496 | 519 ((lookup-character utf-8-subst-rev-table r0 r1) |
520 (if r7 ; lookup succeeded | |
521 ((r1 = (((r0 & #xf000) >> 12) | #xe0)) | |
522 (r2 = ((r0 & #x3f) | #x80)) | |
523 (r0 &= #x0fc0) | |
524 (r0 >>= 6) | |
525 (r0 |= #x80) | |
526 (write r1 r0 r2)) | |
527 ;; Unsupported character. | |
528 ;; Output U+FFFD, which is `ef bf bd' in UTF-8. | |
529 ((write #xef) | |
530 (write #xbf) | |
531 (write #xbd))))))))))) | |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
532 (repeat))) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
533 (if (r1 >= #xa0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
534 (write r1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
535 (if (r1 >= #x80) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
536 ((write #xc2) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
537 (write r1))))) |
35542 | 538 |
46496 | 539 "CCL program to encode into UTF-8.") |
35542 | 540 |
;; Placeholder translation table: it only needs to exist so that the
;; CCL programs can be checked correctly; the real data are loaded on
;; demand.  A table that is already bound is left alone.
(or (boundp 'ucs-mule-8859-to-mule-unicode)
    (define-translation-table 'ucs-mule-8859-to-mule-unicode))
545 | |
;; Register protocol of `ccl-untranslated-to-ucs':
;;   in:      r0 = lead byte, r1 r2 = following bytes, r3 = fourth byte
;;            (r3 is only read as input for 4-byte sequences)
;;   out:     r0 = decoded code point, or 0 for an invalid sequence
;;   scratch: r3 and r4 (3-byte case); r1, r2, r3, r4 (4-byte case)
(define-ccl-program ccl-untranslated-to-ucs
  `(0
    (if (r0 < #xf0)                     ; 3-byte encoding
        ((r4 = 0)                       ; r4 flags an invalid sequence
         ;; Pack the top two bits of r1 (bits 7-6) and of r2 (moved to
         ;; bits 5-4) into r3 so that both trailing bytes can be checked
         ;; for the 10xxxxxx form with a single comparison.
         (r3 = (r1 & #b11000000))
         (r3 |= ((r2 >> 2) & #b00110000))
         (if (r3 != #b10100000)
             (r4 = 1)
           ((r3 = ((r0 & #x0f) << 12))
            (r3 += ((r1 & #x3f) << 6))
            (r3 += (r2 & #x3f))
            (if (r3 < #x0800)           ; overlong 3-byte form
                (r4 = 1))))
         (if (r4 != 0)
             (r0 = 0)
           (r0 = r3)))
      (if (r0 < #xf8)                   ; 4-byte (Mule-UCS recipe)
          ((r4 = (r1 >> 6))
           (if (r4 != #b10)
               (r0 = 0)
             ((r4 = (r2 >> 6))
              (if (r4 != #b10)
                  (r0 = 0)
                ((r4 = (r3 >> 6))
                 (if (r4 != #b10)
                     (r0 = 0)
                   ((r1 = ((r1 & #x3F) << 12))
                    (r2 = ((r2 & #x3F) << 6))
                    (r3 &= #x3F)
                    (r0 = (((((r0 & #x07) << 18) | r1) | r2) | r3))
                    ;; Reject overlong 4-byte forms, mirroring the
                    ;; #x0800 check of the 3-byte branch.  Callers
                    ;; derive the sequence length from the code point,
                    ;; so a 4-byte sequence must never yield a value
                    ;; below #x10000.
                    (if (r0 < #x10000)
                        (r0 = 0)))))))))
        (r0 = 0))))
  "Decode 3- or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0.
r0 == 0 for invalid sequence, including overlong 3- and 4-byte forms.")
579 | |
(defvar utf-8-ccl-regs (make-vector 8 0)
  "Register vector passed to `ccl-execute' by `utf-8-untranslated-to-ucs'.
The same vector is reused on every call: elements 0-3 are filled with
the characters at point and element 0 is read back as the result.")
581 | |
(defsubst utf-8-untranslated-to-ucs ()
  "Return the UCS code for an untranslated sequence of raw bytes at point.
Only for 3- or 4-byte sequences.
The four characters starting at point are handed to the CCL program
`ccl-untranslated-to-ucs' through `utf-8-ccl-regs'; a position that has
no character is passed as 0.  The value is whatever that program
leaves in register 0."
  (let ((pos (point)))
    ;; Registers 0-3 receive the characters at POS .. POS+3.
    (aset utf-8-ccl-regs 0 (or (char-after pos) 0))
    (aset utf-8-ccl-regs 1 (or (char-after (+ 1 pos)) 0))
    (aset utf-8-ccl-regs 2 (or (char-after (+ 2 pos)) 0))
    (aset utf-8-ccl-regs 3 (or (char-after (+ 3 pos)) 0)))
  (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs)
  (aref utf-8-ccl-regs 0))
41873 | 591 |
(defun utf-8-help-echo (window object position)
  "Return help-echo text for untranslated UTF-8 at POSITION in OBJECT.
The text shows, in hexadecimal, the value of the `untranslated-utf-8'
property found at POSITION.  Return nil if that property is not an
integer there.  WINDOW is accepted but not used."
  (let ((code (get-char-property position 'untranslated-utf-8 object)))
    ;; `format' with %X would signal an error on a missing (nil)
    ;; property, so only format genuine integers.
    (if (integerp code)
        (format "Untranslated Unicode U+%04X" code))))
595 | |
;; Untranslatable sequences are composed into a single character.
;; This is infelicitous for editing, because there's currently no
;; mechanism for treating compositions as atomic, but is OK for
;; display.  They are composed to U+FFFD with a help-echo which
;; indicates the unicodes they represent.  This function GCs too much.
(defsubst utf-8-compose ()
  "Put a suitable composition on an untranslatable sequence at point.
Return the sequence's length (3 or 4), or nil if
`utf-8-untranslated-to-ucs' reports the bytes at point as invalid
\(code 0), in which case nothing is changed."
  (let* ((u (utf-8-untranslated-to-ucs))
         ;; The length is inferred from the decoded code point.
         (l (unless (zerop u)
              (if (>= u #x10000)
                  4
                3))))
    (when l
      ;; Compute the clamped end once so that the text properties and
      ;; the composition always cover exactly the same region.
      (let ((end (min (point-max) (+ l (point)))))
        (put-text-property (point) end 'untranslated-utf-8 u)
        (put-text-property (point) end 'help-echo 'utf-8-help-echo)
        ;; U+FFFD is obtained by code point rather than written as a
        ;; character literal, so this does not depend on how the
        ;; source file itself is encoded.
        (compose-region (point) end (decode-char 'ucs #xfffd)))
      l)))
616 | |
;; User option read by `utf-8-post-read-conversion'; off by default.
(defcustom utf-8-compose-scripts nil
  "*Non-nil means compose various scripts on decoding utf-8 text."
  :type 'boolean
  :version "21.4"
  :group 'mule)
41873 | 622 |
(defun utf-8-post-read-conversion (length)
  "Compose untranslated utf-8 sequences into single characters.
Also compose particular scripts if `utf-8-compose-scripts' is non-nil.
Only the LENGTH characters following point are scanned for
untranslated sequences.  Return LENGTH, as updated by the script
composition functions when they are run."
  (save-excursion
    (save-restriction
      ;; The decoded text is the LENGTH characters after point.  Narrow
      ;; to it so that the scan cannot compose text which was already
      ;; in the buffer beyond the newly read region.
      (narrow-to-region (point) (min (point-max) (+ (point) length)))
      ;; Can't do eval-when-compile to insert a multibyte constant
      ;; version of the string in the loop, since it's always loaded as
      ;; unibyte from a byte-compiled file.
      (let ((range (string-as-multibyte "^\xe1-\xf7")))
        (while (and (skip-chars-forward range)
                    (not (eobp)))
          ;; `utf-8-compose' returns nil for an invalid sequence; step
          ;; over a single character in that case.
          (forward-char (or (utf-8-compose) 1))))))
  ;; Fixme: Takahashi-san implies it may not work this easily.  I
  ;; asked why but didn't get a reply. -- fx
  (when (and utf-8-compose-scripts (> length 1))
    ;; These currently have definitions which cover the relevant
    ;; unicodes.  We could avoid loading thai-util &c by checking
    ;; whether the region contains any characters with the appropriate
    ;; categories.  There aren't yet Unicode-based rules for Tibetan.
    (save-excursion (setq length (diacritic-post-read-conversion length)))
    (save-excursion (setq length (thai-post-read-conversion length)))
    (save-excursion (setq length (lao-post-read-conversion length)))
    (save-excursion
      (setq length (in-is13194-devanagari-post-read-conversion length))))
  length)
647 | |
46496 | 648 ;; ucs-tables is preloaded |
649 ;; (defun utf-8-pre-write-conversion (beg end) | |
650 ;; "Semi-dummy pre-write function effectively to autoload ucs-tables." | |
651 ;; ;; Ensure translation table is loaded. | |
652 ;; (require 'ucs-tables) | |
653 ;; ;; Don't do this again. | |
654 ;; (coding-system-put 'mule-utf-8 'pre-write-conversion nil) | |
655 ;; nil) | |
41873 | 656 |
;; Define the `mule-utf-8' coding system from the CCL decoder/encoder
;; pair, attach its properties, and then make `utf-8' an alias for it.
(make-coding-system
 'mule-utf-8 4 ?u
 "UTF-8 encoding for Emacs-supported Unicode characters.
The supported Emacs character sets are the following, plus any other
characters included in the tables `ucs-mule-to-mule-unicode' and
`utf-8-subst-rev-table':
ascii
eight-bit-control
eight-bit-graphic
latin-iso8859-1
latin-iso8859-2
latin-iso8859-3
latin-iso8859-4
cyrillic-iso8859-5
greek-iso8859-7
hebrew-iso8859-8
latin-iso8859-9
latin-iso8859-14
latin-iso8859-15
mule-unicode-0100-24ff
mule-unicode-2500-33ff
mule-unicode-e000-ffff

Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF
may be decoded into korean-ksc5601, chinese-gb2312, japanese-jisx0208
\(see user option `utf-8-translate-cjk'); otherwise, sequences of
eight-bit-control and eight-bit-graphic characters are used to
preserve their byte sequences, and these are composed to display as a
single character. Emacs characters that otherwise can't be encoded
are encoded as U+FFFD."

 ;; Decoder and encoder CCL programs.
 (cons 'ccl-decode-mule-utf-8 'ccl-encode-mule-utf-8)
 ;; Property alist.
 (list
  (cons 'safe-charsets
        '(ascii
          eight-bit-control
          eight-bit-graphic
          latin-iso8859-1
          latin-iso8859-15
          latin-iso8859-14
          latin-iso8859-9
          hebrew-iso8859-8
          greek-iso8859-7
          cyrillic-iso8859-5
          latin-iso8859-4
          latin-iso8859-3
          latin-iso8859-2
          vietnamese-viscii-lower
          vietnamese-viscii-upper
          thai-tis620
          ipa
          ethiopic
          indian-is13194
          katakana-jisx0201
          chinese-sisheng
          lao
          mule-unicode-0100-24ff
          mule-unicode-2500-33ff
          mule-unicode-e000-ffff))
  (cons 'mime-charset 'utf-8)
  (cons 'coding-category 'coding-category-utf-8)
  (list 'valid-codes (cons 0 255))
  ;; (pre-write-conversion . utf-8-pre-write-conversion)
  (cons 'post-read-conversion 'utf-8-post-read-conversion)))

(define-coding-system-alias 'utf-8 'mule-utf-8)
38436
b174db545cfd
Some fixes to follow coding conventions.
Pavel Janík <Pavel@Janik.cz>
parents:
37934
diff
changeset
|
722 |
41873 | 723 ;; I think this needs special private charsets defined for the |
724 ;; untranslated sequences, if it's going to work well. | |
725 | |
726 ;;; (defun utf-8-compose-function (pos to pattern &optional string) | |
727 ;;; (let* ((prop (get-char-property pos 'composition string)) | |
728 ;;; (l (and prop (- (cadr prop) (car prop))))) | |
729 ;;; (cond ((and l (> l (- to pos))) | |
730 ;;; (delete-region pos to)) | |
731 ;;; ((and (> (char-after pos) 224) | |
732 ;;; (< (char-after pos) 256) | |
733 ;;; (save-restriction | |
734 ;;; (narrow-to-region pos to) | |
735 ;;; (utf-8-compose))) | |
736 ;;; t)))) | |
737 | |
738 ;;; (dotimes (i 96) | |
739 ;;; (aset composition-function-table | |
740 ;;; (+ 128 i) | |
741 ;;; `((,(string-as-multibyte "[\200-\237\240-\377]") | |
742 ;;; . utf-8-compose-function)))) | |
743 | |
38436
b174db545cfd
Some fixes to follow coding conventions.
Pavel Janík <Pavel@Janik.cz>
parents:
37934
diff
changeset
|
744 ;;; utf-8.el ends here |