Mercurial > emacs
annotate lisp/international/utf-8.el @ 56811:694cd033cd0d
Make "GNU GENERAL PUBLIC LICENSE" an appendix.
Rearrange order of nodes and sections such that both "GNU GENERAL
PUBLIC LICENSE" and "GNU Free Documentation License" appear at the
end, as appropriate for appendices.
(Acknowledgments): Use `@unnumberedsec'.
author | Luc Teirlinck <teirllm@auburn.edu> |
---|---|
date | Fri, 27 Aug 2004 23:36:38 +0000 |
parents | 752ef76fcc08 |
children | c3945be39e09 |
rev | line source |
---|---|
46496 | 1 ;;; utf-8.el --- UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*- |
35542 | 2 |
55437
6e677a935fe9
Fix references to utf-translate-cjk into utf-translate-cjk-mode.
Andreas Schwab <schwab@suse.de>
parents:
54304
diff
changeset
|
3 ;; Copyright (C) 2001, 2004 Electrotechnical Laboratory, JAPAN. |
35542 | 4 ;; Licensed to the Free Software Foundation. |
46496 | 5 ;; Copyright (C) 2001, 2002 Free Software Foundation, Inc. |
35542 | 6 |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
7 ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org> |
46496 | 8 ;; Maintainer: FSF |
36243 | 9 ;; Keywords: multilingual, Unicode, UTF-8, i18n |
35542 | 10 |
11 ;; This file is part of GNU Emacs. | |
12 | |
13 ;; GNU Emacs is free software; you can redistribute it and/or modify | |
14 ;; it under the terms of the GNU General Public License as published by | |
15 ;; the Free Software Foundation; either version 2, or (at your option) | |
16 ;; any later version. | |
17 | |
18 ;; GNU Emacs is distributed in the hope that it will be useful, | |
19 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
21 ;; GNU General Public License for more details. | |
22 | |
23 ;; You should have received a copy of the GNU General Public License | |
24 ;; along with GNU Emacs; see the file COPYING. If not, write to the | |
25 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
26 ;; Boston, MA 02111-1307, USA. | |
27 | |
28 ;;; Commentary: | |
29 | |
41873 | 30 ;; The coding-system `mule-utf-8' basically supports encoding/decoding |
31 ;; of the following character sets to and from UTF-8: | |
35542 | 32 ;; |
33 ;; ascii | |
34 ;; eight-bit-control | |
35 ;; latin-iso8859-1 | |
36 ;; mule-unicode-0100-24ff | |
37 ;; mule-unicode-2500-33ff | |
38 ;; mule-unicode-e000-ffff | |
39 ;; | |
36243 | 40 ;; On decoding, Unicode characters that do not fit into the above |
41 ;; character sets are handled as `eight-bit-control' or | |
42 ;; `eight-bit-graphic' characters to retain the information about the | |
46496 | 43 ;; original byte sequence, and text properties record the corresponding |
44 ;; Unicode code point. | |
45 ;; | |
46 ;; Fixme: note that reading and writing invalid UTF-8 may not be | |
47 ;; idempotent -- fixing that needs a new charset to represent the bytes. | |
41873 | 48 ;; |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
49 ;; Characters from other character sets can be encoded with mule-utf-8 |
48848 | 50 ;; by populating the translation table |
50179
65bb5afb37ef
(utf-fragment-on-decoding): Don't call
Kenichi Handa <handa@m17n.org>
parents:
50085
diff
changeset
|
51 ;; `utf-translation-table-for-encode'. Hash tables |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
52 ;; `utf-subst-table-for-decode' and `utf-subst-table-for-encode' are |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
53 ;; used to support encoding and decoding of about a quarter of the CJK |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
54 ;; space between U+3400 and U+DFFF. |
36243 | 55 |
54304 | 56 ;; UTF-8 is defined in RFC 3629. A sketch of the encoding is: |
35542 | 57 |
58 ;; scalar | utf-8 | |
59 ;; value | 1st byte | 2nd byte | 3rd byte | |
60 ;; --------------------+-----------+-----------+---------- | |
61 ;; 0000 0000 0xxx xxxx | 0xxx xxxx | | | |
62 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx | | |
63 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx | |
64 | |
65 ;;; Code: | |
66 | |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
67 (defvar ucs-mule-to-mule-unicode (make-char-table 'translation-table nil) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
68 "Char table mapping characters to latin-iso8859-1 or mule-unicode-*. |
46496 | 69 |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
70 If `unify-8859-on-encoding-mode' is non-nil, this table populates the |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
71 translation-table named `utf-translation-table-for-encode'.") |
46496 | 72 |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
73 (define-translation-table 'utf-translation-table-for-encode) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
74 |
46496 | 75 |
76 ;; Map Cyrillic and Greek to iso-8859 charsets, which take half the | |
77 ;; space of mule-unicode. For Latin scripts this isn't very | |
78 ;; important. Hebrew and Arabic might go here too when there's proper | |
79 ;; support for them. | |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
80 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
81 (defvar utf-fragmentation-table (make-char-table 'translation-table nil) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
82 "Char-table normally mapping non-Latin mule-unicode-* chars to iso-8859-*. |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
83 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
84 If `utf-fragment-on-decoding' is non-nil, this table populates the |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
85 translation-table named `utf-translation-table-for-decode'.") |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
86 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
87 (defvar utf-defragmentation-table (make-char-table 'translation-table nil) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
88 "Char-table for reverse mapping of `utf-fragmentation-table'. |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
89 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
90 If `utf-fragment-on-decoding' is non-nil and |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
91 `unify-8859-on-encoding-mode' is nil, this table populates the |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
92 translation-table named `utf-translation-table-for-encode'.") |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
93 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
94 (define-translation-table 'utf-translation-table-for-decode) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
95 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
96 |
48882 | 97 (defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq) |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
98 "Hash table mapping Emacs CJK character sets to Unicode code points. |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
99 |
55437
6e677a935fe9
Fix references to utf-translate-cjk into utf-translate-cjk-mode.
Andreas Schwab <schwab@suse.de>
parents:
54304
diff
changeset
|
100 If `utf-translate-cjk-mode' is non-nil, this table populates the |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
101 translation-hash-table named `utf-subst-table-for-encode'.") |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
102 |
48882 | 103 (define-translation-hash-table 'utf-subst-table-for-encode |
104 ucs-mule-cjk-to-unicode) | |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
105 |
48882 | 106 (defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq) |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
107 "Hash table mapping Unicode code points to Emacs CJK character sets. |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
108 |
55437
6e677a935fe9
Fix references to utf-translate-cjk into utf-translate-cjk-mode.
Andreas Schwab <schwab@suse.de>
parents:
54304
diff
changeset
|
109 If `utf-translate-cjk-mode' is non-nil, this table populates the |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
110 translation-hash-table named `utf-subst-table-for-decode'.") |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
111 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
112 (define-translation-hash-table 'utf-subst-table-for-decode |
48882 | 113 ucs-unicode-to-mule-cjk) |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
114 |
46496 | 115 (mapc |
116 (lambda (pair) | |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
117 (aset utf-fragmentation-table (car pair) (cdr pair)) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
118 (aset utf-defragmentation-table (cdr pair) (car pair))) |
46496 | 119 '((?$,1&d(B . ?,F4(B) (?$,1&e(B . ?,F5(B) (?$,1&f(B . ?,F6(B) (?$,1&h(B . ?,F8(B) (?$,1&i(B . ?,F9(B) |
120 (?$,1&j(B . ?,F:(B) (?$,1&l(B . ?,F<(B) (?$,1&n(B . ?,F>(B) (?$,1&o(B . ?,F?(B) (?$,1&p(B . ?,F@(B) | |
121 (?$,1&q(B . ?,FA(B) (?$,1&r(B . ?,FB(B) (?$,1&s(B . ?,FC(B) (?$,1&t(B . ?,FD(B) (?$,1&u(B . ?,FE(B) | |
122 (?$,1&v(B . ?,FF(B) (?$,1&w(B . ?,FG(B) (?$,1&x(B . ?,FH(B) (?$,1&y(B . ?,FI(B) (?$,1&z(B . ?,FJ(B) | |
123 (?$,1&{(B . ?,FK(B) (?$,1&|(B . ?,FL(B) (?$,1&}(B . ?,FM(B) (?$,1&~(B . ?,FN(B) (?$,1&(B . ?,FO(B) | |
124 (?$,1' (B . ?,FP(B) (?$,1'!(B . ?,FQ(B) (?$,1'#(B . ?,FS(B) (?$,1'$(B . ?,FT(B) (?$,1'%(B . ?,FU(B) | |
125 (?$,1'&(B . ?,FV(B) (?$,1''(B . ?,FW(B) (?$,1'((B . ?,FX(B) (?$,1')(B . ?,FY(B) (?$,1'*(B . ?,FZ(B) | |
126 (?$,1'+(B . ?,F[(B) (?$,1',(B . ?,F\(B) (?$,1'-(B . ?,F](B) (?$,1'.(B . ?,F^(B) (?$,1'/(B . ?,F_(B) | |
127 (?$,1'0(B . ?,F`(B) (?$,1'1(B . ?,Fa(B) (?$,1'2(B . ?,Fb(B) (?$,1'3(B . ?,Fc(B) (?$,1'4(B . ?,Fd(B) | |
128 (?$,1'5(B . ?,Fe(B) (?$,1'6(B . ?,Ff(B) (?$,1'7(B . ?,Fg(B) (?$,1'8(B . ?,Fh(B) (?$,1'9(B . ?,Fi(B) | |
129 (?$,1':(B . ?,Fj(B) (?$,1';(B . ?,Fk(B) (?$,1'<(B . ?,Fl(B) (?$,1'=(B . ?,Fm(B) (?$,1'>(B . ?,Fn(B) | |
130 (?$,1'?(B . ?,Fo(B) (?$,1'@(B . ?,Fp(B) (?$,1'A(B . ?,Fq(B) (?$,1'B(B . ?,Fr(B) (?$,1'C(B . ?,Fs(B) | |
131 (?$,1'D(B . ?,Ft(B) (?$,1'E(B . ?,Fu(B) (?$,1'F(B . ?,Fv(B) (?$,1'G(B . ?,Fw(B) (?$,1'H(B . ?,Fx(B) | |
132 (?$,1'I(B . ?,Fy(B) (?$,1'J(B . ?,Fz(B) (?$,1'K(B . ?,F{(B) (?$,1'L(B . ?,F|(B) (?$,1'M(B . ?,F}(B) | |
133 (?$,1'N(B . ?,F~(B) | |
134 | |
135 (?$,1(!(B . ?,L!(B) (?$,1("(B . ?,L"(B) (?$,1(#(B . ?,L#(B) (?$,1($(B . ?,L$(B) | |
136 (?$,1(%(B . ?,L%(B) (?$,1(&(B . ?,L&(B) (?$,1('(B . ?,L'(B) (?$,1(((B . ?,L((B) (?$,1()(B . ?,L)(B) | |
137 (?$,1(*(B . ?,L*(B) (?$,1(+(B . ?,L+(B) (?$,1(,(B . ?,L,(B) (?$,1(.(B . ?,L.(B) (?$,1(/(B . ?,L/(B) | |
138 (?$,1(0(B . ?,L0(B) (?$,1(1(B . ?,L1(B) (?$,1(2(B . ?,L2(B) (?$,1(3(B . ?,L3(B) (?$,1(4(B . ?,L4(B) | |
139 (?$,1(5(B . ?,L5(B) (?$,1(6(B . ?,L6(B) (?$,1(7(B . ?,L7(B) (?$,1(8(B . ?,L8(B) (?$,1(9(B . ?,L9(B) | |
140 (?$,1(:(B . ?,L:(B) (?$,1(;(B . ?,L;(B) (?$,1(<(B . ?,L<(B) (?$,1(=(B . ?,L=(B) (?$,1(>(B . ?,L>(B) | |
141 (?$,1(?(B . ?,L?(B) (?$,1(@(B . ?,L@(B) (?$,1(A(B . ?,LA(B) (?$,1(B(B . ?,LB(B) (?$,1(C(B . ?,LC(B) | |
142 (?$,1(D(B . ?,LD(B) (?$,1(E(B . ?,LE(B) (?$,1(F(B . ?,LF(B) (?$,1(G(B . ?,LG(B) (?$,1(H(B . ?,LH(B) | |
143 (?$,1(I(B . ?,LI(B) (?$,1(J(B . ?,LJ(B) (?$,1(K(B . ?,LK(B) (?$,1(L(B . ?,LL(B) (?$,1(M(B . ?,LM(B) | |
144 (?$,1(N(B . ?,LN(B) (?$,1(O(B . ?,LO(B) (?$,1(P(B . ?,LP(B) (?$,1(Q(B . ?,LQ(B) (?$,1(R(B . ?,LR(B) | |
145 (?$,1(S(B . ?,LS(B) (?$,1(T(B . ?,LT(B) (?$,1(U(B . ?,LU(B) (?$,1(V(B . ?,LV(B) (?$,1(W(B . ?,LW(B) | |
146 (?$,1(X(B . ?,LX(B) (?$,1(Y(B . ?,LY(B) (?$,1(Z(B . ?,LZ(B) (?$,1([(B . ?,L[(B) (?$,1(\(B . ?,L\(B) | |
147 (?$,1(](B . ?,L](B) (?$,1(^(B . ?,L^(B) (?$,1(_(B . ?,L_(B) (?$,1(`(B . ?,L`(B) (?$,1(a(B . ?,La(B) | |
148 (?$,1(b(B . ?,Lb(B) (?$,1(c(B . ?,Lc(B) (?$,1(d(B . ?,Ld(B) (?$,1(e(B . ?,Le(B) (?$,1(f(B . ?,Lf(B) | |
149 (?$,1(g(B . ?,Lg(B) (?$,1(h(B . ?,Lh(B) (?$,1(i(B . ?,Li(B) (?$,1(j(B . ?,Lj(B) (?$,1(k(B . ?,Lk(B) | |
150 (?$,1(l(B . ?,Ll(B) (?$,1(m(B . ?,Lm(B) (?$,1(n(B . ?,Ln(B) (?$,1(o(B . ?,Lo(B) (?$,1(q(B . ?,Lq(B) | |
151 (?$,1(r(B . ?,Lr(B) (?$,1(s(B . ?,Ls(B) (?$,1(t(B . ?,Lt(B) (?$,1(u(B . ?,Lu(B) (?$,1(v(B . ?,Lv(B) | |
152 (?$,1(w(B . ?,Lw(B) (?$,1(x(B . ?,Lx(B) (?$,1(y(B . ?,Ly(B) (?$,1(z(B . ?,Lz(B) (?$,1({(B . ?,L{(B) | |
153 (?$,1(|(B . ?,L|(B) (?$,1(~(B . ?,L~(B) (?$,1((B . ?,L(B))) | |
154 | |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
155 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
156 (defcustom utf-fragment-on-decoding nil |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
157 "Whether or not to decode some chars in UTF-8/16 text into iso8859 charsets. |
46496 | 158 Setting this means that the relevant Cyrillic and Greek characters are |
159 decoded into the iso8859 charsets rather than into | |
47231 | 160 mule-unicode-0100-24ff. The iso8859 charsets take half as much space |
46496 | 161 in the buffer, but using them may affect how the buffer can be re-encoded |
162 and may require a different input method to search for them, for instance. | |
163 See `unify-8859-on-decoding-mode' and `unify-8859-on-encoding-mode' | |
47231 | 164 for mechanisms to make this largely transparent. |
165 | |
166 Setting this variable outside customize has no effect." | |
46496 | 167 :set (lambda (s v) |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
168 (if v |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
169 (progn |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
170 (define-translation-table 'utf-translation-table-for-decode |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
171 utf-fragmentation-table) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
172 ;; Even if unify-8859-on-encoding-mode is off, make |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
173 ;; mule-utf-* encode characters in |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
174 ;; utf-fragmentation-table. |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
175 (unless (eq (get 'utf-translation-table-for-encode |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
176 'translation-table) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
177 ucs-mule-to-mule-unicode) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
178 (define-translation-table 'utf-translation-table-for-encode |
50179
65bb5afb37ef
(utf-fragment-on-decoding): Don't call
Kenichi Handa <handa@m17n.org>
parents:
50085
diff
changeset
|
179 utf-defragmentation-table))) |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
180 (define-translation-table 'utf-translation-table-for-decode) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
181 ;; When unify-8859-on-encoding-mode is off, be sure to make |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
182 ;; mule-utf-* disabled for characters in |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
183 ;; utf-fragmentation-table. |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
184 (unless (eq (get 'utf-translation-table-for-encode |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
185 'translation-table) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
186 ucs-mule-to-mule-unicode) |
50179
65bb5afb37ef
(utf-fragment-on-decoding): Don't call
Kenichi Handa <handa@m17n.org>
parents:
50085
diff
changeset
|
187 (define-translation-table 'utf-translation-table-for-encode))) |
46496 | 188 (set-default s v)) |
189 :version "21.4" | |
190 :type 'boolean | |
191 :group 'mule) | |
192 | |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
193 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
194 (defconst utf-translate-cjk-charsets '(chinese-gb2312 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
195 chinese-big5-1 chinese-big5-2 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
196 japanese-jisx0208 japanese-jisx0212 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
197 korean-ksc5601) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
198 "List of charsets supported by `utf-translate-cjk-mode'.") |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
199 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
200 (defconst utf-translate-cjk-unicode-range |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
201 '((#x2e80 . #xd7a3) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
202 (#xff00 . #xffef)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
203 "List of Unicode code ranges supported by `utf-translate-cjk-mode'.") |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
204 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
205 ;; Return non-nil if CODE-POINT is in `utf-translate-cjk-unicode-range'. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
206 (defsubst utf-translate-cjk-substitutable-p (code-point) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
207 (let ((tail utf-translate-cjk-unicode-range) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
208 elt) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
209 (while tail |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
210 (setq elt (car tail) tail (cdr tail)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
211 (if (and (>= code-point (car elt)) (<= code-point (cdr elt))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
212 (setq tail nil) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
213 (setq elt nil))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
214 elt)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
215 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
216 (defvar utf-translate-cjk-lang-env nil |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
217 "Language environment in which tables for `utf-translate-cjk-mode' are loaded. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
218 The value nil means that the tables are not yet loaded.") |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
219 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
220 (defun utf-translate-cjk-load-tables () |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
221 "Load tables for `utf-translate-cjk-mode'." |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
222 ;; Fixme: Allow the use of the CJK charsets to be |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
223 ;; customized by reordering and possible omission. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
224 (let ((redefined (< (hash-table-size ucs-mule-cjk-to-unicode) 43000))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
225 (if redefined |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
226 ;; Redefine them with realistic initial sizes and a |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
227 ;; smallish rehash size to avoid wasting significant |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
228 ;; space after they're built. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
229 (setq ucs-mule-cjk-to-unicode |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
230 (make-hash-table :test 'eq :size 43000 :rehash-size 1000) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
231 ucs-unicode-to-mule-cjk |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
232 (make-hash-table :test 'eq :size 21500 :rehash-size 1000))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
233 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
234 ;; Load the files explicitly, to avoid having to keep |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
235 ;; around the large tables they contain (as well as the |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
236 ;; ones which get built). |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
237 (cond ((string= "Korean" current-language-environment) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
238 (load "subst-jis") |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
239 (load "subst-big5") |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
240 (load "subst-gb2312") |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
241 (load "subst-ksc")) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
242 ((string= "Chinese-BIG5" current-language-environment) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
243 (load "subst-jis") |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
244 (load "subst-ksc") |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
245 (load "subst-gb2312") |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
246 (load "subst-big5")) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
247 ((string= "Chinese-GB" current-language-environment) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
248 (load "subst-jis") |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
249 (load "subst-ksc") |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
250 (load "subst-big5") |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
251 (load "subst-gb2312")) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
252 (t |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
253 (load "subst-ksc") |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
254 (load "subst-gb2312") |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
255 (load "subst-big5") |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
256 (load "subst-jis"))) ; jis covers as much as big5, gb2312 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
257 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
258 (when redefined |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
259 (define-translation-hash-table 'utf-subst-table-for-decode |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
260 ucs-unicode-to-mule-cjk) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
261 (define-translation-hash-table 'utf-subst-table-for-encode |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
262 ucs-mule-cjk-to-unicode) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
263 (set-char-table-extra-slot (get 'utf-translation-table-for-encode |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
264 'translation-table) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
265 1 ucs-mule-cjk-to-unicode)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
266 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
267 (setq utf-translate-cjk-lang-env current-language-environment))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
268 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
269 (defun utf-lookup-subst-table-for-decode (code-point) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
270 (if (and utf-translate-cjk-mode |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
271 (not utf-translate-cjk-lang-env) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
272 (utf-translate-cjk-substitutable-p code-point)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
273 (utf-translate-cjk-load-tables)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
274 (gethash code-point |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
275 (get 'utf-subst-table-for-decode 'translation-hash-table))) |
56562
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
276 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
277 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
278 (defun utf-lookup-subst-table-for-encode (char) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
279 (if (and utf-translate-cjk-mode |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
280 (not utf-translate-cjk-lang-env) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
281 (memq (char-charset char) utf-translate-cjk-charsets)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
282 (utf-translate-cjk-load-tables)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
283 (gethash char |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
284 (get 'utf-subst-table-for-encode 'translation-hash-table))) |
56562
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
285 |
50341
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
286 (define-minor-mode utf-translate-cjk-mode |
56562
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
287 "Toggle whether UTF based coding systems de/encode CJK characters. |
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
288 If ARG is an integer, enable if ARG is positive and disable if |
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
289 zero or negative. This is a minor mode. |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
290 Enabling this allows the coding systems mule-utf-8, |
51628
abfc7d48b476
(utf-translate-cjk-mode): Fix docstring.
Kenichi Handa <handa@m17n.org>
parents:
50766
diff
changeset
|
291 mule-utf-16le and mule-utf-16be to encode characters in the charsets |
48848 | 292 `korean-ksc5601', `chinese-gb2312', `chinese-big5-1', |
293 `chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to | |
294 decode the corresponding unicodes into such characters. | |
46496 | 295 |
48848 | 296 Where the charsets overlap, the one preferred for decoding is chosen |
297 according to the language environment in effect when this option is | |
298 turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for | |
299 Chinese-Big5 and jisx for other environments. | |
300 | |
56562
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
301 This mode is on by default. If you are not interested in CJK |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
302 characters and want to avoid some overhead on encoding/decoding |
56562
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
303 by the above coding systems, you can customize the user option |
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
304 `utf-translate-cjk-mode' to nil." |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
305 :init-value t |
46496 | 306 :version "21.4" |
307 :type 'boolean | |
50341
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
308 :group 'mule |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
309 :global t |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
310 (if utf-translate-cjk-mode |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
311 (progn |
50766
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
312 (define-translation-hash-table 'utf-subst-table-for-decode |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
313 ucs-unicode-to-mule-cjk) |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
314 (define-translation-hash-table 'utf-subst-table-for-encode |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
315 ucs-mule-cjk-to-unicode) |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
316 (set-char-table-extra-slot (get 'utf-translation-table-for-encode |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
317 'translation-table) |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
318 1 ucs-mule-cjk-to-unicode)) |
50549
c8525ac04d76
(utf-translate-cjk-mode): Fix incorrect
Kenichi Handa <handa@m17n.org>
parents:
50341
diff
changeset
|
319 (define-translation-hash-table 'utf-subst-table-for-decode |
c8525ac04d76
(utf-translate-cjk-mode): Fix incorrect
Kenichi Handa <handa@m17n.org>
parents:
50341
diff
changeset
|
320 (make-hash-table :test 'eq)) |
c8525ac04d76
(utf-translate-cjk-mode): Fix incorrect
Kenichi Handa <handa@m17n.org>
parents:
50341
diff
changeset
|
321 (define-translation-hash-table 'utf-subst-table-for-encode |
50766
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
322 (make-hash-table :test 'eq)) |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
323 (set-char-table-extra-slot (get 'utf-translation-table-for-encode |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
324 'translation-table) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
325 1 nil)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
326 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
327 ;; Update safe-chars of mule-utf-* coding systems. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
328 (dolist (elt (coding-system-list t)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
329 (if (string-match "^mule-utf" (symbol-name elt)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
330 (let ((safe-charsets (coding-system-get elt 'safe-charsets)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
331 (safe-chars (coding-system-get elt 'safe-chars)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
332 (need-update nil)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
333 (dolist (charset utf-translate-cjk-charsets) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
334 (unless (eq utf-translate-cjk-mode (memq charset safe-charsets)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
335 (setq safe-charsets |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
336 (if utf-translate-cjk-mode |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
337 (cons charset safe-charsets) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
338 (delq charset safe-charsets)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
339 need-update t) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
340 (aset safe-chars (make-char charset) utf-translate-cjk-mode))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
341 (when need-update |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
342 (coding-system-put elt 'safe-charsets safe-charsets) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
343 (define-coding-system-internal elt)))))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
344 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
345 (define-ccl-program ccl-mule-utf-untrans |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
346 ;; R0 is an untranslatable Unicode code-point (U+3400..U+DFFF or |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
347 ;; U+10000..U+10FFFF) or an invalid byte (#x00..#xFF). Write |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
348 ;; eight-bit-control/graphic sequence (2 to 4 chars) representing |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
349 ;; UTF-8 sequence of r0. Registers r4, r5, r6 are modified. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
350 ;; |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
351 ;; This is a subroutine because we assume that this is called very |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
352 ;; rarely (so we don't have to worry about the overhead of the |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
353 ;; call). |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
354 `(0 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
355 ((r5 = ,(charset-id 'eight-bit-control)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
356 (r6 = ,(charset-id 'eight-bit-graphic)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
357 (if (r0 < #x100) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
358 ((r4 = ((r0 >> 6) | #xC0)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
359 (write-multibyte-character r6 r4)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
360 ((if (r0 < #x10000) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
361 ((r4 = ((r0 >> 12) | #xE0)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
362 (write-multibyte-character r6 r4)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
363 ((r4 = ((r0 >> 18) | #xF0)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
364 (write-multibyte-character r6 r4) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
365 (r4 = (((r0 >> 12) & #x3F) | #x80)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
366 (if (r4 < #xA0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
367 (write-multibyte-character r5 r4) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
368 (write-multibyte-character r6 r4)))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
369 (r4 = (((r0 >> 6) & #x3F) | #x80)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
370 (if (r4 < #xA0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
371 (write-multibyte-character r5 r4) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
372 (write-multibyte-character r6 r4)))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
373 (r4 = ((r0 & #x3F) | #x80)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
374 (if (r4 < #xA0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
375 (write-multibyte-character r5 r4) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
376 (write-multibyte-character r6 r4))))) |
46496 | 377 |
35542 | 378 (define-ccl-program ccl-decode-mule-utf-8 |
379 ;; | |
380 ;; charset | bytes in utf-8 | bytes in emacs | |
381 ;; -----------------------+----------------+--------------- | |
382 ;; ascii | 1 | 1 | |
383 ;; -----------------------+----------------+--------------- | |
384 ;; eight-bit-control | 2 | 2 | |
41873 | 385 ;; eight-bit-graphic | 2 | 1 |
35542 | 386 ;; latin-iso8859-1 | 2 | 2 |
387 ;; -----------------------+----------------+--------------- | |
388 ;; mule-unicode-0100-24ff | 2 | 4 | |
389 ;; (< 0800) | | | |
390 ;; -----------------------+----------------+--------------- | |
391 ;; mule-unicode-0100-24ff | 3 | 4 | |
392 ;; (>= 0800) | | | |
393 ;; mule-unicode-2500-33ff | 3 | 4 | |
394 ;; mule-unicode-e000-ffff | 3 | 4 | |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
395 ;; -----------------------+----------------+--------------- |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
396 ;; invalid byte | 1 | 2 |
35542 | 397 ;; |
398 ;; Thus magnification factor is two. | |
399 ;; | |
400 `(2 | |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
401 ((r6 = ,(charset-id 'latin-iso8859-1)) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
402 (read r0) |
37934
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
403 (loop |
35542 | 404 (if (r0 < #x80) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
405 ;; 1-byte encoding, i.e., ascii |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
406 (write-read-repeat r0)) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
407 (if (r0 < #xc2) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
408 ;; continuation byte (invalid here) or 1st byte of overlong |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
409 ;; 2-byte sequence. |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
410 ((call ccl-mule-utf-untrans) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
411 (r6 = ,(charset-id 'latin-iso8859-1)) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
412 (read r0) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
413 (repeat))) |
35542 | 414 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
415 ;; Read the 2nd byte. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
416 (read r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
417 (if ((r1 & #b11000000) != #b10000000) ; Invalid 2nd byte |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
418 ((call ccl-mule-utf-untrans) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
419 (r6 = ,(charset-id 'latin-iso8859-1)) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
420 ;; Handle it in the next loop. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
421 (r0 = r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
422 (repeat))) |
46496 | 423 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
424 (if (r0 < #xe0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
425 ;; 2-byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
426 ((r1 &= #x3F) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
427 (r1 |= ((r0 & #x1F) << 6)) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
428 ;; Now r1 holds scalar value. We don't have to check |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
429 ;; `overlong sequence' because r0 >= 0xC2. |
46496 | 430 |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
431 (if (r1 >= 256) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
432 ;; mule-unicode-0100-24ff (< 0800) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
433 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
434 (r1 -= #x0100) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
435 (r2 = (((r1 / 96) + 32) << 7)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
436 (r1 %= 96) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
437 (r1 += (r2 + 32)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
438 (translate-character |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
439 utf-translation-table-for-decode r0 r1) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
440 (write-multibyte-character r0 r1) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
441 (read r0) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
442 (repeat)) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
443 (if (r1 >= 160) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
444 ;; latin-iso8859-1 |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
445 ((r1 -= 128) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
446 (write-multibyte-character r6 r1) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
447 (read r0) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
448 (repeat)) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
449 ;; eight-bit-control |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
450 ((r0 = ,(charset-id 'eight-bit-control)) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
451 (write-multibyte-character r0 r1) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
452 (read r0) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
453 (repeat)))))) |
46496 | 454 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
455 ;; Read the 3rd byte. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
456 (read r2) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
457 (if ((r2 & #b11000000) != #b10000000) ; Invalid 3rd byte |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
458 ((call ccl-mule-utf-untrans) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
459 (r0 = r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
460 (call ccl-mule-utf-untrans) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
461 (r6 = ,(charset-id 'latin-iso8859-1)) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
462 ;; Handle it in the next loop. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
463 (r0 = r2) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
464 (repeat))) |
46496 | 465 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
466 (if (r0 < #xF0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
467 ;; 3byte encoding |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
468 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
469 ((r3 = ((r0 & #xF) << 12)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
470 (r3 |= ((r1 & #x3F) << 6)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
471 (r3 |= (r2 & #x3F)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
472 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
473 (if (r3 < #x800) ; `overlong sequence' |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
474 ((call ccl-mule-utf-untrans) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
475 (r0 = r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
476 (call ccl-mule-utf-untrans) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
477 (r0 = r2) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
478 (call ccl-mule-utf-untrans) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
479 (r6 = ,(charset-id 'latin-iso8859-1)) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
480 (read r0) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
481 (repeat))) |
49598
0d8b17d428b5
Trailing whitepace deleted.
Juanma Barranquero <lekktu@gmail.com>
parents:
49028
diff
changeset
|
482 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
483 (if (r3 < #x2500) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
484 ;; mule-unicode-0100-24ff (>= 0800) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
485 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
486 (r3 -= #x0100) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
487 (r3 //= 96) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
488 (r1 = (r7 + 32)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
489 (r1 += ((r3 + 32) << 7)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
490 (translate-character |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
491 utf-translation-table-for-decode r0 r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
492 (write-multibyte-character r0 r1) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
493 (read r0) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
494 (repeat))) |
49598
0d8b17d428b5
Trailing whitepace deleted.
Juanma Barranquero <lekktu@gmail.com>
parents:
49028
diff
changeset
|
495 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
496 (if (r3 < #x3400) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
497 ;; mule-unicode-2500-33ff |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
498 ((r0 = r3) ; don't zap r3 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
499 (lookup-integer utf-subst-table-for-decode r0 r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
500 (if (r7 == 0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
501 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
502 (r3 -= #x2500) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
503 (r3 //= 96) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
504 (r1 = (r7 + 32)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
505 (r1 += ((r3 + 32) << 7)))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
506 (write-multibyte-character r0 r1) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
507 (read r0) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
508 (repeat))) |
46496 | 509 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
510 (if (r3 < #xE000) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
511 ;; Try to convert to CJK chars, else |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
512 ;; keep them as eight-bit-{control|graphic}. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
513 ((r0 = r3) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
514 (lookup-integer utf-subst-table-for-decode r3 r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
515 (if r7 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
516 ;; got a translation |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
517 ((write-multibyte-character r3 r1) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
518 (read r0) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
519 (repeat)) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
520 ((call ccl-mule-utf-untrans) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
521 (r6 = ,(charset-id 'latin-iso8859-1)) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
522 (read r0) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
523 (repeat))))) |
46496 | 524 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
525 ;; mule-unicode-e000-ffff |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
526 ;; Fixme: fffe and ffff are invalid. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
527 (r0 = r3) ; don't zap r3 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
528 (lookup-integer utf-subst-table-for-decode r0 r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
529 (if (r7 == 0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
530 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
531 (r3 -= #xe000) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
532 (r3 //= 96) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
533 (r1 = (r7 + 32)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
534 (r1 += ((r3 + 32) << 7)))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
535 (write-multibyte-character r0 r1) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
536 (read r0) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
537 (repeat))) |
49598
0d8b17d428b5
Trailing whitepace deleted.
Juanma Barranquero <lekktu@gmail.com>
parents:
49028
diff
changeset
|
538 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
539 ;; Read the 4th byte. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
540 (read r3) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
541 (if ((r3 & #b11000000) != #b10000000) ; Invalid 4th byte |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
542 ((call ccl-mule-utf-untrans) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
543 (r0 = r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
544 (call ccl-mule-utf-untrans) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
545 (r0 = r2) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
546 (call ccl-mule-utf-untrans) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
547 (r6 = ,(charset-id 'latin-iso8859-1)) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
548 ;; Handle it in the next loop. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
549 (r0 = r3) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
550 (repeat))) |
35542 | 551 |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
552 (if (r0 < #xF8) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
553 ;; 4-byte encoding: |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
554 ;; wwwzzzzzzyyyyyyxxxxxx = 11110www 10zzzzzz 10yyyyyy 10xxxxxx |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
555 ;; keep those bytes as eight-bit-{control|graphic} |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
556 ;; Fixme: allow lookup in utf-subst-table-for-decode. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
557 ((r4 = ((r0 & #x7) << 18)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
558 (r4 |= ((r1 & #x3F) << 12)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
559 (r4 |= ((r2 & #x3F) << 6)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
560 (r4 |= (r3 & #x3F)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
561 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
562 (if (r4 < #x10000) ; `overlong sequence' |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
563 ((call ccl-mule-utf-untrans) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
564 (r0 = r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
565 (call ccl-mule-utf-untrans) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
566 (r0 = r2) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
567 (call ccl-mule-utf-untrans) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
568 (r0 = r3) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
569 (call ccl-mule-utf-untrans)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
570 ((r0 = r4) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
571 (call ccl-mule-utf-untrans)))) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
572 |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
573 ;; Unsupported sequence. |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
574 ((call ccl-mule-utf-untrans) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
575 (r0 = r1) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
576 (call ccl-mule-utf-untrans) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
577 (r0 = r2) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
578 (call ccl-mule-utf-untrans) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
579 (r0 = r3) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
580 (call ccl-mule-utf-untrans))) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
581 (r6 = ,(charset-id 'latin-iso8859-1)) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
582 (read r0) |
50085
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
583 (repeat))) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
584 |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
585 |
50085
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
586 ;; At EOF... |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
587 (if (r0 >= 0) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
588 ;; r0 >= #x80 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
589 ((call ccl-mule-utf-untrans) |
50085
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
590 (if (r1 >= 0) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
591 ((r0 = r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
592 (call ccl-mule-utf-untrans) |
50085
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
593 (if (r2 >= 0) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
594 ((r0 = r2) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
595 (call ccl-mule-utf-untrans) |
50085
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
596 (if (r3 >= 0) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
597 ((r0 = r3) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
598 (call ccl-mule-utf-untrans)))))))))) |
35542 | 599 |
36243 | 600 "CCL program to decode UTF-8. |
36465 | 601 Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
602 mule-unicode-*, but see also `utf-fragmentation-table' and |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
603 `ucs-mule-cjk-to-unicode'. |
46496 | 604 Encodings of un-representable Unicode characters are decoded asis into |
605 eight-bit-control and eight-bit-graphic characters.") | |
35542 | 606 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
607 (define-ccl-program ccl-mule-utf-8-encode-untrans |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
608 ;; UTF-8 decoder generates a UTF-8 sequence represented by a |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
609 ;; sequence of eight-bit-control/graphic chars for an untranslatable |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
610 ;; character and an invalid byte. |
56562
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
611 ;; |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
612 ;; This CCL parses that sequence (the first byte is already in r1), |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
613 ;; writes out the original bytes of that sequence, and sets r5 to |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
614 ;; -1. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
615 ;; |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
616 ;; If the eight-bit-control/graphic sequence is shorter than what r1 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
617 ;; suggests, it sets r5 and r6 to the last character read that |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
618 ;; should be handled by the next loop of a caller. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
619 ;; |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
620 ;; Note: For UTF-8 validation, we only check if a character is |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
621 ;; eight-bit-control/graphic or not. It may result in incorrect |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
622 ;; handling of random binary data, but such data can't be encoded |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
623 ;; by UTF-8 anyway. At least, UTF-8 decoders don't generate such |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
624 ;; a sequence even if a source contains an invalid byte sequence. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
625 `(0 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
626 (;; Read the 2nd byte. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
627 (read-multibyte-character r5 r6) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
628 (r0 = (r5 != ,(charset-id 'eight-bit-control))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
629 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) |
56562
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
630 ((write r1) ; invalid UTF-8 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
631 (r1 = -1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
632 (end))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
633 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
634 (if (r1 <= #xC3) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
635 ;; 2-byte sequence for an originally invalid byte. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
636 ((r6 &= #x3F) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
637 (r6 |= ((r1 & #x1F) << 6)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
638 (write r6) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
639 (r5 = -1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
640 (end))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
641 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
642 (write r1 r6) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
643 (r2 = r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
644 (r1 = -1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
645 ;; Read the 3rd byte. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
646 (read-multibyte-character r5 r6) |
56562
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
647 (r0 = (r5 != ,(charset-id 'eight-bit-control))) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
648 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
649 (end)) ; invalid UTF-8 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
650 (write r6) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
651 (if (r2 < #xF0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
652 ;; 3-byte sequence for an untranslated character. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
653 ((r5 = -1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
654 (end))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
655 ;; Read the 4th byte. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
656 (read-multibyte-character r5 r6) |
56562
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
657 (r0 = (r5 != ,(charset-id 'eight-bit-control))) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
658 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
659 (end)) ; invalid UTF-8 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
660 ;; 4-byte sequence for an untranslated character. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
661 (write r6) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
662 (r5 = -1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
663 (end)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
664 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
665 ;; At EOF... |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
666 ((r5 = -1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
667 (if (r1 >= 0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
668 (write r1))))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
669 |
35542 | 670 (define-ccl-program ccl-encode-mule-utf-8 |
671 `(1 | |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
672 ((r5 = -1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
673 (loop |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
674 (if (r5 < 0) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
675 (read-multibyte-character r0 r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
676 ;; Pre-read character is in r5 (charset-ID) and r6 (code-point). |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
677 ((r0 = r5) |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
678 (r1 = r6) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
679 (r5 = -1))) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
680 (translate-character utf-translation-table-for-encode r0 r1) |
35542 | 681 |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
682 (if (r0 == ,(charset-id 'ascii)) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
683 (write-repeat r1)) |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
684 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
685 (if (r0 == ,(charset-id 'latin-iso8859-1)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
686 ;; r1 scalar utf-8 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
687 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
688 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
689 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111 |
56095
4ec2da03a87c
(ccl-encode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56056
diff
changeset
|
690 ((write ((r1 >> 6) | #xc2)) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
691 (r1 &= #x3f) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
692 (r1 |= #x80) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
693 (write-repeat r1))) |
35542 | 694 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
695 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
696 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
697 ;; #x3f80 == (0011 1111 1000 0000)b |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
698 (r1 &= #x7f) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
699 (r1 += (r0 + 224)) ; 224 == -32 + #x0100 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
700 ;; now r1 holds scalar value |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
701 (if (r1 < #x0800) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
702 ;; 2byte encoding |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
703 ((write ((r1 >> 6) | #xC0)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
704 (r1 &= #x3F) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
705 (r1 |= #x80) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
706 (write-repeat r1)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
707 ;; 3byte encoding |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
708 ((write ((r1 >> 12) | #xE0)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
709 (write (((r1 & #x0FC0) >> 6) | #x80)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
710 (r1 &= #x3F) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
711 (r1 |= #x80) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
712 (write-repeat r1))))) |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
713 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
714 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
715 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
716 (r1 &= #x7f) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
717 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
718 ;; now r1 holds scalar value |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
719 (write ((r1 >> 12) | #xE0)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
720 (write (((r1 & #x0FC0) >> 6) | #x80)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
721 (r1 &= #x3F) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
722 (r1 |= #x80) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
723 (write-repeat r1))) |
35542 | 724 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
725 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
726 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
727 (r1 &= #x7f) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
728 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
729 ;; now r1 holds scalar value |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
730 (write ((r1 >> 12) | #xE0)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
731 (write (((r1 & #x0FC0) >> 6) | #x80)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
732 (r1 &= #x3F) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
733 (r1 |= #x80) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
734 (write-repeat r1))) |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
735 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
736 (if (r0 == ,(charset-id 'eight-bit-control)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
737 ;; r1 scalar utf-8 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
738 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
739 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
740 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
741 ((write #xC2) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
742 (write-repeat r1))) |
35542 | 743 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
744 (if (r0 == ,(charset-id 'eight-bit-graphic)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
745 ;; r1 scalar utf-8 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
746 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
747 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
748 ;; ff 0000 0000 1111 1111 1100 0011 1011 1111 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
749 ((r0 = (r1 >= #xC0)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
750 (r0 &= (r1 <= #xC3)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
751 (r4 = (r1 >= #xE1)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
752 (r4 &= (r1 <= #xF7)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
753 (r0 |= r4) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
754 (if r0 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
755 ((call ccl-mule-utf-8-encode-untrans) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
756 (repeat)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
757 (write-repeat r1)))) |
35542 | 758 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
759 (lookup-character utf-subst-table-for-encode r0 r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
760 (if r7 ; lookup succeeded |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
761 (if (r0 < #x800) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
762 ;; 2byte encoding |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
763 ((write ((r0 >> 6) | #xC0)) |
56095
4ec2da03a87c
(ccl-encode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56056
diff
changeset
|
764 (r0 = ((r0 & #x3F) | #x80)) |
4ec2da03a87c
(ccl-encode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56056
diff
changeset
|
765 (write-repeat r0)) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
766 ;; 3byte encoding |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
767 ((write ((r0 >> 12) | #xE0)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
768 (write (((r0 & #x0FC0) >> 6) | #x80)) |
56095
4ec2da03a87c
(ccl-encode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56056
diff
changeset
|
769 (r0 = ((r0 & #x3F) | #x80)) |
4ec2da03a87c
(ccl-encode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56056
diff
changeset
|
770 (write-repeat r0)))) |
35542 | 771 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
772 ;; Unsupported character. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
773 ;; Output U+FFFD, which is `ef bf bd' in UTF-8. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
774 (write #xef) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
775 (write #xbf) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
776 (write-repeat #xbd)))) |
46496 | 777 "CCL program to encode into UTF-8.") |
35542 | 778 |
41873 | 779 |
46496 | 780 (define-ccl-program ccl-untranslated-to-ucs |
781 `(0 | |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
782 (if (r1 == 0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
783 nil |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
784 (if (r0 <= #xC3) ; 2-byte encoding |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
785 ((r0 = ((r0 & #x3) << 6)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
786 (r0 |= (r1 & #x3F)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
787 (r1 = 2)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
788 (if (r2 == 0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
789 (r1 = 0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
790 (if (r0 < #xF0) ; 3-byte encoding, as above |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
791 ((r0 = ((r0 & #xF) << 12)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
792 (r0 |= ((r1 & #x3F) << 6)) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
793 (r0 |= (r2 & #x3F)) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
794 (r1 = 3)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
795 (if (r3 == 0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
796 (r1 = 0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
797 ((r0 = ((r0 & #x7) << 18)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
798 (r0 |= ((r1 & #x3F) << 12)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
799 (r0 |= ((r2 & #x3F) << 6)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
800 (r0 |= (r3 & #x3F)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
801 (r1 = 4)))))))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
802 "Decode 2-, 3-, or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
803 Set r1 to the byte length. r1 == 0 for invalid sequence.") |
46496 | 804 |
805 (defvar utf-8-ccl-regs (make-vector 8 0)) | |
806 | |
41873 | 807 (defsubst utf-8-untranslated-to-ucs () |
46496 | 808 "Return the UCS code for an untranslated sequence of raw bytes at point. |
809 Only for 2-, 3-, or 4-byte sequences." | |
810 (aset utf-8-ccl-regs 0 (or (char-after) 0)) | |
811 (aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0)) | |
812 (aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0)) | |
813 (aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0)) | |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
814 (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs)) |
41873 | 815 |
816 (defun utf-8-help-echo (window object position) | |
817 (format "Untranslated Unicode U+%04X" | |
818 (get-char-property position 'untranslated-utf-8 object))) | |
819 | |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
820 ;; We compose the untranslatable sequences into a single character, |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
821 ;; and move point to the next character. |
41873 | 822 ;; This is infelicitous for editing, because there's currently no |
823 ;; mechanism for treating compositions as atomic, but is OK for | |
46496 | 824 ;; display. They are composed to U+FFFD with help-echo which |
825 ;; indicates the unicodes they represent. This function GCs too much. | |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
826 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
827 ;; If utf-translate-cjk-mode is non-nil, this function is called with |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
828 ;; HASH-TABLE which translates CJK characters into some of CJK |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
829 ;; charsets. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
830 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
831 (defsubst utf-8-compose (hash-table) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
832 "Put a suitable composition on an untranslatable sequence at point. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
833 If HASH-TABLE is non-nil, first try to translate CJK characters with it. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
834 Move point to the end of the sequence." |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
835 (utf-8-untranslated-to-ucs) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
836 (let ((l (aref utf-8-ccl-regs 1)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
837 ch) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
838 (if (> l 0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
839 (if (and hash-table |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
840 (setq ch (gethash (aref utf-8-ccl-regs 0) hash-table))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
841 (progn |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
842 (insert ch) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
843 (delete-region (point) (min (point-max) (+ l (point))))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
844 (setq ch (aref utf-8-ccl-regs 0)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
845 (put-text-property (point) (min (point-max) (+ l (point))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
846 'untranslated-utf-8 ch) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
847 (put-text-property (point) (min (point-max) (+ l (point))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
848 'help-echo 'utf-8-help-echo) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
849 (if (= l 2) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
850 (put-text-property (point) (min (point-max) (+ l (point))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
851 'display (format "\\%03o" ch)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
852 (compose-region (point) (+ l (point)) ?$,3u=(B)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
853 (forward-char l)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
854 (forward-char 1)))) |
41873 | 855 |
856 (defcustom utf-8-compose-scripts nil | |
46496 | 857 "*Non-nil means compose various scripts on decoding utf-8 text." |
41873 | 858 :group 'mule |
46496 | 859 :version "21.4" |
860 :type 'boolean) | |
41873 | 861 |
862 (defun utf-8-post-read-conversion (length) | |
863 "Compose untranslated utf-8 sequences into single characters. | |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
864 If `utf-translate-cjk-mode' is non-nil, tries to translate CJK characters. |
41873 | 865 Also compose particular scripts if `utf-8-compose-scripts' is non-nil." |
866 (save-excursion | |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
867 (save-restriction |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
868 (narrow-to-region (point) (+ (point) length)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
869 ;; Can't do eval-when-compile to insert a multibyte constant |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
870 ;; version of the string in the loop, since it's always loaded as |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
871 ;; unibyte from a byte-compiled file. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
872 (let ((range (string-as-multibyte "^\xc0-\xc3\xe1-\xf7")) |
56800
752ef76fcc08
(utf-8-post-read-conversion): If the
Kenichi Handa <handa@m17n.org>
parents:
56562
diff
changeset
|
873 (buffer-multibyte enable-multibyte-characters) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
874 hash-table ch) |
56800
752ef76fcc08
(utf-8-post-read-conversion): If the
Kenichi Handa <handa@m17n.org>
parents:
56562
diff
changeset
|
875 (set-buffer-multibyte t) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
876 (when utf-translate-cjk-mode |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
877 (if (not utf-translate-cjk-lang-env) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
878 ;; Check these characters: |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
879 ;; "U+2e80-U+33ff", "U+ff00-U+ffef" |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
880 ;; We may have to translate them to CJK charsets. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
881 (let ((range2 "$,29@(B-$,2G$,3r`(B-$,3u/(B")) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
882 (skip-chars-forward (concat range range2)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
883 (unless (eobp) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
884 (utf-translate-cjk-load-tables) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
885 (setq range (concat range range2))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
886 (setq hash-table (get 'utf-subst-table-for-decode |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
887 'translation-hash-table))))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
888 (while (and (skip-chars-forward range) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
889 (not (eobp))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
890 (setq ch (following-char)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
891 (if (< ch 256) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
892 (utf-8-compose hash-table) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
893 (if (and hash-table |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
894 (setq ch (gethash (encode-char ch 'ucs) hash-table))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
895 (progn |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
896 (insert ch) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
897 (delete-char 1)) |
56800
752ef76fcc08
(utf-8-post-read-conversion): If the
Kenichi Handa <handa@m17n.org>
parents:
56562
diff
changeset
|
898 (forward-char 1)))) |
752ef76fcc08
(utf-8-post-read-conversion): If the
Kenichi Handa <handa@m17n.org>
parents:
56562
diff
changeset
|
899 (or buffer-multibyte |
752ef76fcc08
(utf-8-post-read-conversion): If the
Kenichi Handa <handa@m17n.org>
parents:
56562
diff
changeset
|
900 (set-buffer-multibyte nil))) |
41873 | 901 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
902 (when (and utf-8-compose-scripts (> length 1)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
903 ;; These currently have definitions which cover the relevant |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
904 ;; unicodes. We could avoid loading thai-util &c by checking |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
905 ;; whether the region contains any characters with the appropriate |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
906 ;; categories. There aren't yet Unicode-based rules for Tibetan. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
907 (diacritic-compose-region (point-max) (point-min)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
908 (thai-compose-region (point-max) (point-min)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
909 (lao-compose-region (point-max) (point-min)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
910 (devanagari-compose-region (point-max) (point-min)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
911 (malayalam-compose-region (point-max) (point-min)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
912 (tamil-compose-region (point-max) (point-min))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
913 (- (point-max) (point-min))))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
914 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
915 (defun utf-8-pre-write-conversion (beg end) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
916 "Prepare for `utf-translate-cjk-mode' to encode text between BEG and END. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
917 This is used as a pre-write-conversion of utf-8 coding system." |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
918 (if (and utf-translate-cjk-mode |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
919 (not utf-translate-cjk-lang-env) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
920 (save-excursion |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
921 (goto-char beg) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
922 (re-search-forward "\\cc\\|\\cj\\|\\ch" end t))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
923 (utf-translate-cjk-load-tables)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
924 nil) |
41873 | 925 |
35542 | 926 (make-coding-system |
927 'mule-utf-8 4 ?u | |
928 "UTF-8 encoding for Emacs-supported Unicode characters. | |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
929 It supports Unicode characters of these ranges: |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
930 U+0000..U+33FF, U+E000..U+FFFF. |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
931 They correspond to these Emacs character sets: |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
932 ascii, latin-iso8859-1, mule-unicode-0100-24ff, |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
933 mule-unicode-2500-33ff, mule-unicode-e000-ffff |
35542 | 934 |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
935 On decoding (e.g. reading a file), Unicode characters not in the above |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
936 ranges are decoded into sequences of eight-bit-control and |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
937 eight-bit-graphic characters to preserve their byte sequences. The |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
938 byte sequence is preserved on i/o for valid utf-8, but not necessarily |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
939 for invalid utf-8. |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
940 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
941 On encoding (e.g. writing a file), Emacs characters not belonging to |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
942 any of the character sets listed above are encoded into the UTF-8 byte |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
943 sequence representing U+FFFD (REPLACEMENT CHARACTER)." |
35542 | 944 |
945 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) | |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
946 `((safe-charsets |
35542 | 947 ascii |
948 eight-bit-control | |
949 eight-bit-graphic | |
950 latin-iso8859-1 | |
951 mule-unicode-0100-24ff | |
952 mule-unicode-2500-33ff | |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
953 mule-unicode-e000-ffff |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
954 ,@(if utf-translate-cjk-mode |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
955 utf-translate-cjk-charsets)) |
36371
f6bb3ed752b4
(mule-utf-8): Set correct value for valid-codes property.
Kenichi Handa <handa@m17n.org>
parents:
36243
diff
changeset
|
956 (mime-charset . utf-8) |
36423
aa776838b660
(mule-utf-8): Set coding-category property to coding-category-utf-8.
Kenichi Handa <handa@m17n.org>
parents:
36371
diff
changeset
|
957 (coding-category . coding-category-utf-8) |
41873 | 958 (valid-codes (0 . 255)) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
959 (pre-write-conversion . utf-8-pre-write-conversion) |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
960 (post-read-conversion . utf-8-post-read-conversion) |
50766
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
961 (translation-table-for-encode . utf-translation-table-for-encode) |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
962 (dependency unify-8859-on-encoding-mode |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
963 unify-8859-on-decoding-mode |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
964 utf-fragment-on-decoding |
55437
6e677a935fe9
Fix references to utf-translate-cjk into utf-translate-cjk-mode.
Andreas Schwab <schwab@suse.de>
parents:
54304
diff
changeset
|
965 utf-translate-cjk-mode))) |
35542 | 966 |
967 (define-coding-system-alias 'utf-8 'mule-utf-8) | |
38436
b174db545cfd
Some fixes to follow coding conventions.
Pavel Janík <Pavel@Janik.cz>
parents:
37934
diff
changeset
|
968 |
41873 | 969 ;; I think this needs special private charsets defined for the |
970 ;; untranslated sequences, if it's going to work well. | |
971 | |
972 ;;; (defun utf-8-compose-function (pos to pattern &optional string) | |
973 ;;; (let* ((prop (get-char-property pos 'composition string)) | |
974 ;;; (l (and prop (- (cadr prop) (car prop))))) | |
975 ;;; (cond ((and l (> l (- to pos))) | |
976 ;;; (delete-region pos to)) | |
977 ;;; ((and (> (char-after pos) 224) | |
978 ;;; (< (char-after pos) 256) | |
979 ;;; (save-restriction | |
980 ;;; (narrow-to-region pos to) | |
981 ;;; (utf-8-compose))) | |
982 ;;; t)))) | |
983 | |
984 ;;; (dotimes (i 96) | |
985 ;;; (aset composition-function-table | |
986 ;;; (+ 128 i) | |
987 ;;; `((,(string-as-multibyte "[\200-\237\240-\377]") | |
988 ;;; . utf-8-compose-function)))) | |
989 | |
52401 | 990 ;;; arch-tag: b08735b7-753b-4ae6-b754-0f3efe4515c5 |
38436
b174db545cfd
Some fixes to follow coding conventions.
Pavel Janík <Pavel@Janik.cz>
parents:
37934
diff
changeset
|
991 ;;; utf-8.el ends here |