Mercurial > emacs
annotate lisp/international/utf-8.el @ 61829:bfa373ffccf6
*** empty log message ***
author | Thien-Thi Nguyen <ttn@gnuvola.org> |
---|---|
date | Mon, 25 Apr 2005 11:42:25 +0000 |
parents | aac0a33f5772 |
children | c36561fe0657 |
rev | line source |
---|---|
46496 | 1 ;;; utf-8.el --- UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*- |
35542 | 2 |
55437
6e677a935fe9
Fix references to utf-translate-cjk into utf-translate-cjk-mode.
Andreas Schwab <schwab@suse.de>
parents:
54304
diff
changeset
|
3 ;; Copyright (C) 2001, 2004 Electrotechnical Laboratory, JAPAN. |
35542 | 4 ;; Licensed to the Free Software Foundation. |
46496 | 5 ;; Copyright (C) 2001, 2002 Free Software Foundation, Inc. |
35542 | 6 |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
7 ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org> |
46496 | 8 ;; Maintainer: FSF |
36243 | 9 ;; Keywords: multilingual, Unicode, UTF-8, i18n |
35542 | 10 |
11 ;; This file is part of GNU Emacs. | |
12 | |
13 ;; GNU Emacs is free software; you can redistribute it and/or modify | |
14 ;; it under the terms of the GNU General Public License as published by | |
15 ;; the Free Software Foundation; either version 2, or (at your option) | |
16 ;; any later version. | |
17 | |
18 ;; GNU Emacs is distributed in the hope that it will be useful, | |
19 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
21 ;; GNU General Public License for more details. | |
22 | |
23 ;; You should have received a copy of the GNU General Public License | |
24 ;; along with GNU Emacs; see the file COPYING. If not, write to the | |
25 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
26 ;; Boston, MA 02111-1307, USA. | |
27 | |
28 ;;; Commentary: | |
29 | |
41873 | 30 ;; The coding-system `mule-utf-8' basically supports encoding/decoding |
31 ;; of the following character sets to and from UTF-8: | |
35542 | 32 ;; |
33 ;; ascii | |
34 ;; eight-bit-control | |
35 ;; latin-iso8859-1 | |
36 ;; mule-unicode-0100-24ff | |
37 ;; mule-unicode-2500-33ff | |
38 ;; mule-unicode-e000-ffff | |
39 ;; | |
36243 | 40 ;; On decoding, Unicode characters that do not fit into the above |
41 ;; character sets are handled as `eight-bit-control' or | |
42 ;; `eight-bit-graphic' characters to retain the information about the | |
46496 | 43 ;; original byte sequence and text properties record the corresponding |
44 ;; unicode. | |
45 ;; | |
46 ;; Fixme: note that reading and writing invalid UTF-8 may not be | |
47 ;; idempotent -- fixing that needs a new charset to represent the bytes. | |
41873 | 48 ;; |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
49 ;; Characters from other character sets can be encoded with mule-utf-8 |
48848 | 50 ;; by populating the translation table |
50179
65bb5afb37ef
(utf-fragment-on-decoding): Don't call
Kenichi Handa <handa@m17n.org>
parents:
50085
diff
changeset
|
51 ;; `utf-translation-table-for-encode'. Hash tables |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
52 ;; `utf-subst-table-for-decode' and `utf-subst-table-for-encode' are |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
53 ;; used to support encoding and decoding of about a quarter of the CJK |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
54 ;; space between U+3400 and U+DFFF. |
36243 | 55 |
54304 | 56 ;; UTF-8 is defined in RFC 3629. A sketch of the encoding is: |
35542 | 57 |
58 ;; scalar | utf-8 | |
59 ;; value | 1st byte | 2nd byte | 3rd byte | |
60 ;; --------------------+-----------+-----------+---------- | |
61 ;; 0000 0000 0xxx xxxx | 0xxx xxxx | | | |
62 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx | | |
63 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx | |
64 | |
65 ;;; Code: | |
66 | |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
67 (defvar ucs-mule-to-mule-unicode (make-char-table 'translation-table nil) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
68 "Char table mapping characters to latin-iso8859-1 or mule-unicode-*. |
46496 | 69 |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
70 If `unify-8859-on-encoding-mode' is non-nil, this table populates the |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
71 translation-table named `utf-translation-table-for-encode'.") |
46496 | 72 |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
73 (define-translation-table 'utf-translation-table-for-encode) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
74 |
46496 | 75 |
76 ;; Map Cyrillic and Greek to iso-8859 charsets, which take half the | |
77 ;; space of mule-unicode. For Latin scripts this isn't very | |
78 ;; important. Hebrew and Arabic might go here too when there's proper | |
79 ;; support for them. | |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
80 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
81 (defvar utf-fragmentation-table (make-char-table 'translation-table nil) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
82 "Char-table normally mapping non-Latin mule-unicode-* chars to iso-8859-*. |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
83 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
84 If `utf-fragment-on-decoding' is non-nil, this table populates the |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
85 translation-table named `utf-translation-table-for-decode'") |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
86 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
87 (defvar utf-defragmentation-table (make-char-table 'translation-table nil) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
88 "Char-table for reverse mapping of `utf-fragmentation-table'. |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
89 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
90 If `utf-fragment-on-decoding' is non-nil and |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
91 `unify-8859-on-encoding-mode' is nil, this table populates the |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
92 translation-table named `utf-translation-table-for-encode'") |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
93 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
94 (define-translation-table 'utf-translation-table-for-decode) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
95 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
96 |
48882 | 97 (defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq) |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
98 "Hash table mapping Emacs CJK character sets to Unicode code points. |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
99 |
55437
6e677a935fe9
Fix references to utf-translate-cjk into utf-translate-cjk-mode.
Andreas Schwab <schwab@suse.de>
parents:
54304
diff
changeset
|
100 If `utf-translate-cjk-mode' is non-nil, this table populates the |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
101 translation-hash-table named `utf-subst-table-for-encode'.") |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
102 |
48882 | 103 (define-translation-hash-table 'utf-subst-table-for-encode |
104 ucs-mule-cjk-to-unicode) | |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
105 |
48882 | 106 (defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq) |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
107 "Hash table mapping Unicode code points to Emacs CJK character sets. |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
108 |
55437
6e677a935fe9
Fix references to utf-translate-cjk into utf-translate-cjk-mode.
Andreas Schwab <schwab@suse.de>
parents:
54304
diff
changeset
|
109 If `utf-translate-cjk-mode' is non-nil, this table populates the |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
110 translation-hash-table named `utf-subst-table-for-decode'.") |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
111 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
112 (define-translation-hash-table 'utf-subst-table-for-decode |
48882 | 113 ucs-unicode-to-mule-cjk) |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
114 |
46496 | 115 (mapc |
116 (lambda (pair) | |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
117 (aset utf-fragmentation-table (car pair) (cdr pair)) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
118 (aset utf-defragmentation-table (cdr pair) (car pair))) |
46496 | 119 '((?$,1&d(B . ?,F4(B) (?$,1&e(B . ?,F5(B) (?$,1&f(B . ?,F6(B) (?$,1&h(B . ?,F8(B) (?$,1&i(B . ?,F9(B) |
120 (?$,1&j(B . ?,F:(B) (?$,1&l(B . ?,F<(B) (?$,1&n(B . ?,F>(B) (?$,1&o(B . ?,F?(B) (?$,1&p(B . ?,F@(B) | |
121 (?$,1&q(B . ?,FA(B) (?$,1&r(B . ?,FB(B) (?$,1&s(B . ?,FC(B) (?$,1&t(B . ?,FD(B) (?$,1&u(B . ?,FE(B) | |
122 (?$,1&v(B . ?,FF(B) (?$,1&w(B . ?,FG(B) (?$,1&x(B . ?,FH(B) (?$,1&y(B . ?,FI(B) (?$,1&z(B . ?,FJ(B) | |
123 (?$,1&{(B . ?,FK(B) (?$,1&|(B . ?,FL(B) (?$,1&}(B . ?,FM(B) (?$,1&~(B . ?,FN(B) (?$,1&(B . ?,FO(B) | |
124 (?$,1' (B . ?,FP(B) (?$,1'!(B . ?,FQ(B) (?$,1'#(B . ?,FS(B) (?$,1'$(B . ?,FT(B) (?$,1'%(B . ?,FU(B) | |
125 (?$,1'&(B . ?,FV(B) (?$,1''(B . ?,FW(B) (?$,1'((B . ?,FX(B) (?$,1')(B . ?,FY(B) (?$,1'*(B . ?,FZ(B) | |
126 (?$,1'+(B . ?,F[(B) (?$,1',(B . ?,F\(B) (?$,1'-(B . ?,F](B) (?$,1'.(B . ?,F^(B) (?$,1'/(B . ?,F_(B) | |
127 (?$,1'0(B . ?,F`(B) (?$,1'1(B . ?,Fa(B) (?$,1'2(B . ?,Fb(B) (?$,1'3(B . ?,Fc(B) (?$,1'4(B . ?,Fd(B) | |
128 (?$,1'5(B . ?,Fe(B) (?$,1'6(B . ?,Ff(B) (?$,1'7(B . ?,Fg(B) (?$,1'8(B . ?,Fh(B) (?$,1'9(B . ?,Fi(B) | |
129 (?$,1':(B . ?,Fj(B) (?$,1';(B . ?,Fk(B) (?$,1'<(B . ?,Fl(B) (?$,1'=(B . ?,Fm(B) (?$,1'>(B . ?,Fn(B) | |
130 (?$,1'?(B . ?,Fo(B) (?$,1'@(B . ?,Fp(B) (?$,1'A(B . ?,Fq(B) (?$,1'B(B . ?,Fr(B) (?$,1'C(B . ?,Fs(B) | |
131 (?$,1'D(B . ?,Ft(B) (?$,1'E(B . ?,Fu(B) (?$,1'F(B . ?,Fv(B) (?$,1'G(B . ?,Fw(B) (?$,1'H(B . ?,Fx(B) | |
132 (?$,1'I(B . ?,Fy(B) (?$,1'J(B . ?,Fz(B) (?$,1'K(B . ?,F{(B) (?$,1'L(B . ?,F|(B) (?$,1'M(B . ?,F}(B) | |
133 (?$,1'N(B . ?,F~(B) | |
134 | |
135 (?$,1(!(B . ?,L!(B) (?$,1("(B . ?,L"(B) (?$,1(#(B . ?,L#(B) (?$,1($(B . ?,L$(B) | |
136 (?$,1(%(B . ?,L%(B) (?$,1(&(B . ?,L&(B) (?$,1('(B . ?,L'(B) (?$,1(((B . ?,L((B) (?$,1()(B . ?,L)(B) | |
137 (?$,1(*(B . ?,L*(B) (?$,1(+(B . ?,L+(B) (?$,1(,(B . ?,L,(B) (?$,1(.(B . ?,L.(B) (?$,1(/(B . ?,L/(B) | |
138 (?$,1(0(B . ?,L0(B) (?$,1(1(B . ?,L1(B) (?$,1(2(B . ?,L2(B) (?$,1(3(B . ?,L3(B) (?$,1(4(B . ?,L4(B) | |
139 (?$,1(5(B . ?,L5(B) (?$,1(6(B . ?,L6(B) (?$,1(7(B . ?,L7(B) (?$,1(8(B . ?,L8(B) (?$,1(9(B . ?,L9(B) | |
140 (?$,1(:(B . ?,L:(B) (?$,1(;(B . ?,L;(B) (?$,1(<(B . ?,L<(B) (?$,1(=(B . ?,L=(B) (?$,1(>(B . ?,L>(B) | |
141 (?$,1(?(B . ?,L?(B) (?$,1(@(B . ?,L@(B) (?$,1(A(B . ?,LA(B) (?$,1(B(B . ?,LB(B) (?$,1(C(B . ?,LC(B) | |
142 (?$,1(D(B . ?,LD(B) (?$,1(E(B . ?,LE(B) (?$,1(F(B . ?,LF(B) (?$,1(G(B . ?,LG(B) (?$,1(H(B . ?,LH(B) | |
143 (?$,1(I(B . ?,LI(B) (?$,1(J(B . ?,LJ(B) (?$,1(K(B . ?,LK(B) (?$,1(L(B . ?,LL(B) (?$,1(M(B . ?,LM(B) | |
144 (?$,1(N(B . ?,LN(B) (?$,1(O(B . ?,LO(B) (?$,1(P(B . ?,LP(B) (?$,1(Q(B . ?,LQ(B) (?$,1(R(B . ?,LR(B) | |
145 (?$,1(S(B . ?,LS(B) (?$,1(T(B . ?,LT(B) (?$,1(U(B . ?,LU(B) (?$,1(V(B . ?,LV(B) (?$,1(W(B . ?,LW(B) | |
146 (?$,1(X(B . ?,LX(B) (?$,1(Y(B . ?,LY(B) (?$,1(Z(B . ?,LZ(B) (?$,1([(B . ?,L[(B) (?$,1(\(B . ?,L\(B) | |
147 (?$,1(](B . ?,L](B) (?$,1(^(B . ?,L^(B) (?$,1(_(B . ?,L_(B) (?$,1(`(B . ?,L`(B) (?$,1(a(B . ?,La(B) | |
148 (?$,1(b(B . ?,Lb(B) (?$,1(c(B . ?,Lc(B) (?$,1(d(B . ?,Ld(B) (?$,1(e(B . ?,Le(B) (?$,1(f(B . ?,Lf(B) | |
149 (?$,1(g(B . ?,Lg(B) (?$,1(h(B . ?,Lh(B) (?$,1(i(B . ?,Li(B) (?$,1(j(B . ?,Lj(B) (?$,1(k(B . ?,Lk(B) | |
150 (?$,1(l(B . ?,Ll(B) (?$,1(m(B . ?,Lm(B) (?$,1(n(B . ?,Ln(B) (?$,1(o(B . ?,Lo(B) (?$,1(q(B . ?,Lq(B) | |
151 (?$,1(r(B . ?,Lr(B) (?$,1(s(B . ?,Ls(B) (?$,1(t(B . ?,Lt(B) (?$,1(u(B . ?,Lu(B) (?$,1(v(B . ?,Lv(B) | |
152 (?$,1(w(B . ?,Lw(B) (?$,1(x(B . ?,Lx(B) (?$,1(y(B . ?,Ly(B) (?$,1(z(B . ?,Lz(B) (?$,1({(B . ?,L{(B) | |
153 (?$,1(|(B . ?,L|(B) (?$,1(~(B . ?,L~(B) (?$,1((B . ?,L(B))) | |
154 | |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
155 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
156 (defcustom utf-fragment-on-decoding nil |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
157 "Whether or not to decode some chars in UTF-8/16 text into iso8859 charsets. |
46496 | 158 Setting this means that the relevant Cyrillic and Greek characters are |
159 decoded into the iso8859 charsets rather than into | |
47231 | 160 mule-unicode-0100-24ff. The iso8859 charsets take half as much space |
46496 | 161 in the buffer, but using them may affect how the buffer can be re-encoded |
162 and may require a different input method to search for them, for instance. | |
163 See `unify-8859-on-decoding-mode' and `unify-8859-on-encoding-mode' | |
47231 | 164 for mechanisms to make this largely transparent. |
165 | |
166 Setting this variable outside customize has no effect." | |
46496 | 167 :set (lambda (s v) |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
168 (if v |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
169 (progn |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
170 (define-translation-table 'utf-translation-table-for-decode |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
171 utf-fragmentation-table) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
172 ;; Even if unify-8859-on-encoding-mode is off, make |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
173 ;; mule-utf-* encode characters in |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
174 ;; utf-fragmentation-table. |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
175 (unless (eq (get 'utf-translation-table-for-encode |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
176 'translation-table) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
177 ucs-mule-to-mule-unicode) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
178 (define-translation-table 'utf-translation-table-for-encode |
50179
65bb5afb37ef
(utf-fragment-on-decoding): Don't call
Kenichi Handa <handa@m17n.org>
parents:
50085
diff
changeset
|
179 utf-defragmentation-table))) |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
180 (define-translation-table 'utf-translation-table-for-decode) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
181 ;; When unify-8859-on-encoding-mode is off, be sure to make |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
182 ;; mule-utf-* disabled for characters in |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
183 ;; utf-fragmentation-table. |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
184 (unless (eq (get 'utf-translation-table-for-encode |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
185 'translation-table) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
186 ucs-mule-to-mule-unicode) |
50179
65bb5afb37ef
(utf-fragment-on-decoding): Don't call
Kenichi Handa <handa@m17n.org>
parents:
50085
diff
changeset
|
187 (define-translation-table 'utf-translation-table-for-encode))) |
46496 | 188 (set-default s v)) |
59996
aac0a33f5772
Change release version from 21.4 to 22.1 throughout.
Kim F. Storm <storm@cua.dk>
parents:
59096
diff
changeset
|
189 :version "22.1" |
46496 | 190 :type 'boolean |
191 :group 'mule) | |
192 | |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
193 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
194 (defconst utf-translate-cjk-charsets '(chinese-gb2312 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
195 chinese-big5-1 chinese-big5-2 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
196 japanese-jisx0208 japanese-jisx0212 |
57761
13239a8e9e80
(utf-translate-cjk-charsets): Add katakana-jisx0201.
Kenichi Handa <handa@m17n.org>
parents:
57737
diff
changeset
|
197 katakana-jisx0201 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
198 korean-ksc5601) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
199 "List of charsets supported by `utf-translate-cjk-mode'.") |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
200 |
57727
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
201 (defvar utf-translate-cjk-lang-env nil |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
202 "Language environment in which tables for `utf-translate-cjk-mode' are loaded. |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
203 The value nil means that the tables are not yet loaded.") |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
204 |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
205 (defvar utf-translate-cjk-unicode-range) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
206 |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
207 ;; String generated from utf-translate-cjk-unicode-range. It is |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
208 ;; suitable for an argument to skip-chars-forward. |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
209 (defvar utf-translate-cjk-unicode-range-string nil) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
210 |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
211 (defun utf-translate-cjk-set-unicode-range (range) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
212 (setq utf-translate-cjk-unicode-range range) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
213 (setq utf-translate-cjk-unicode-range-string |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
214 (let ((decode-char-no-trans |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
215 #'(lambda (x) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
216 (cond ((< x #x100) (make-char 'latin-iso8859-1 x)) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
217 ((< x #x2500) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
218 (setq x (- x #x100)) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
219 (make-char 'mule-unicode-0100-24ff |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
220 (+ (/ x 96) 32) (+ (% x 96) 32))) |
59996
aac0a33f5772
Change release version from 21.4 to 22.1 throughout.
Kim F. Storm <storm@cua.dk>
parents:
59096
diff
changeset
|
221 ((< x #x3400) |
57727
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
222 (setq x (- x #x2500)) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
223 (make-char 'mule-unicode-2500-33ff |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
224 (+ (/ x 96) 32) (+ (% x 96) 32))) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
225 (t |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
226 (setq x (- x #xe000)) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
227 (make-char 'mule-unicode-e000-ffff |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
228 (+ (/ x 96) 32) (+ (% x 96) 32)))))) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
229 ranges from to) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
230 (dolist (elt range) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
231 (setq from (max #xA0 (car elt)) to (min #xffff (cdr elt))) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
232 (if (and (>= to #x3400) (< to #xE000)) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
233 (setq to #x33FF)) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
234 (cond ((< from #x100) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
235 (if (>= to #xE000) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
236 (setq ranges (cons (cons #xE000 to) ranges) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
237 to #x33FF)) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
238 (if (>= to #x2500) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
239 (setq ranges (cons (cons #x2500 to) ranges) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
240 to #x24FF)) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
241 (if (>= to #x100) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
242 (setq ranges (cons (cons #x100 to) ranges) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
243 to #xFF))) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
244 ((< from #x2500) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
245 (if (>= to #xE000) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
246 (setq ranges (cons (cons #xE000 to) ranges) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
247 to #x33FF)) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
248 (if (>= to #x2500) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
249 (setq ranges (cons (cons #x2500 to) ranges) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
250 to #x24FF))) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
251 ((< from #x3400) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
252 (if (>= to #xE000) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
253 (setq ranges (cons (cons #xE000 to) ranges) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
254 to #x33FF)))) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
255 (if (<= from to) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
256 (setq ranges (cons (cons from to) ranges)))) |
59996
aac0a33f5772
Change release version from 21.4 to 22.1 throughout.
Kim F. Storm <storm@cua.dk>
parents:
59096
diff
changeset
|
257 (mapconcat #'(lambda (x) |
aac0a33f5772
Change release version from 21.4 to 22.1 throughout.
Kim F. Storm <storm@cua.dk>
parents:
59096
diff
changeset
|
258 (format "%c-%c" |
57727
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
259 (funcall decode-char-no-trans (car x)) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
260 (funcall decode-char-no-trans (cdr x)))) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
261 ranges ""))) |
57737
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
262 ;; This forces loading and setting of the tables for |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
263 ;; utf-translate-cjk-mode. |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
264 (setq utf-translate-cjk-lang-env nil |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
265 ucs-mule-cjk-to-unicode (make-hash-table :test 'eq) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
266 ucs-unicode-to-mule-cjk (make-hash-table :test 'eq))) |
57727
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
267 |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
268 (defcustom utf-translate-cjk-unicode-range '((#x2e80 . #xd7a3) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
269 (#xff00 . #xffef)) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
270 "List of Unicode code ranges supported by `utf-translate-cjk-mode'. |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
271 Setting this variable directly does not take effect; |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
272 use either \\[customize] or the function |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
273 `utf-translate-cjk-set-unicode-range'." |
59996
aac0a33f5772
Change release version from 21.4 to 22.1 throughout.
Kim F. Storm <storm@cua.dk>
parents:
59096
diff
changeset
|
274 :version "22.1" |
57727
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
275 :type '(repeat (cons integer integer)) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
276 :set (lambda (symbol value) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
277 (utf-translate-cjk-set-unicode-range value)) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
278 :group 'mule) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
279 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
280 ;; Return non-nil if CODE-POINT is in `utf-translate-cjk-unicode-range'. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
281 (defsubst utf-translate-cjk-substitutable-p (code-point) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
282 (let ((tail utf-translate-cjk-unicode-range) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
283 elt) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
284 (while tail |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
285 (setq elt (car tail) tail (cdr tail)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
286 (if (and (>= code-point (car elt)) (<= code-point (cdr elt))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
287 (setq tail nil) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
288 (setq elt nil))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
289 elt)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
290 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
291 (defun utf-translate-cjk-load-tables () |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
292 "Load tables for `utf-translate-cjk-mode'." |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
293 ;; Fixme: Allow the use of the CJK charsets to be |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
294 ;; customized by reordering and possible omission. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
295 (let ((redefined (< (hash-table-size ucs-mule-cjk-to-unicode) 43000))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
296 (if redefined |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
297 ;; Redefine them with realistic initial sizes and a |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
298 ;; smallish rehash size to avoid wasting significant |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
299 ;; space after they're built. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
300 (setq ucs-mule-cjk-to-unicode |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
301 (make-hash-table :test 'eq :size 43000 :rehash-size 1000) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
302 ucs-unicode-to-mule-cjk |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
303 (make-hash-table :test 'eq :size 21500 :rehash-size 1000))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
304 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
305 ;; Load the files explicitly, to avoid having to keep |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
306 ;; around the large tables they contain (as well as the |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
307 ;; ones which get built). |
59096
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
308 ;; Here we bind coding-system-for-read to nil so that coding tags |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
309 ;; in the files are respected even if the files are not yet |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
310 ;; byte-compiled |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
311 (let ((coding-system-for-read nil)) |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
312 (cond ((string= "Korean" current-language-environment) |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
313 (load "subst-jis") |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
314 (load "subst-big5") |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
315 (load "subst-gb2312") |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
316 (load "subst-ksc")) |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
317 ((string= "Chinese-BIG5" current-language-environment) |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
318 (load "subst-jis") |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
319 (load "subst-ksc") |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
320 (load "subst-gb2312") |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
321 (load "subst-big5")) |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
322 ((string= "Chinese-GB" current-language-environment) |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
323 (load "subst-jis") |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
324 (load "subst-ksc") |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
325 (load "subst-big5") |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
326 (load "subst-gb2312")) |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
327 (t |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
328 (load "subst-ksc") |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
329 (load "subst-gb2312") |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
330 (load "subst-big5") |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
331 (load "subst-jis")))) ; jis covers as much as big5, gb2312 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
332 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
333 (when redefined |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
334 (define-translation-hash-table 'utf-subst-table-for-decode |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
335 ucs-unicode-to-mule-cjk) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
336 (define-translation-hash-table 'utf-subst-table-for-encode |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
337 ucs-mule-cjk-to-unicode) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
338 (set-char-table-extra-slot (get 'utf-translation-table-for-encode |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
339 'translation-table) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
340 1 ucs-mule-cjk-to-unicode)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
341 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
342 (setq utf-translate-cjk-lang-env current-language-environment))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
343 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
344 (defun utf-lookup-subst-table-for-decode (code-point) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
345 (if (and utf-translate-cjk-mode |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
346 (not utf-translate-cjk-lang-env) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
347 (utf-translate-cjk-substitutable-p code-point)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
348 (utf-translate-cjk-load-tables)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
349 (gethash code-point |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
350 (get 'utf-subst-table-for-decode 'translation-hash-table))) |
56562
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
351 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
352 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
353 (defun utf-lookup-subst-table-for-encode (char) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
354 (if (and utf-translate-cjk-mode |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
355 (not utf-translate-cjk-lang-env) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
356 (memq (char-charset char) utf-translate-cjk-charsets)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
357 (utf-translate-cjk-load-tables)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
358 (gethash char |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
359 (get 'utf-subst-table-for-encode 'translation-hash-table))) |
56562
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
360 |
50341
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
361 (define-minor-mode utf-translate-cjk-mode |
56562
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
362 "Toggle whether UTF based coding systems de/encode CJK characters. |
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
363 If ARG is an integer, enable if ARG is positive and disable if |
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
364 zero or negative. This is a minor mode. |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
365 Enabling this allows the coding systems mule-utf-8, |
51628
abfc7d48b476
(utf-translate-cjk-mode): Fix docstring.
Kenichi Handa <handa@m17n.org>
parents:
50766
diff
changeset
|
366 mule-utf-16le and mule-utf-16be to encode characters in the charsets |
48848 | 367 `korean-ksc5601', `chinese-gb2312', `chinese-big5-1', |
368 `chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to | |
369 decode the corresponding unicodes into such characters. | |
46496 | 370 |
48848 | 371 Where the charsets overlap, the one preferred for decoding is chosen |
372 according to the language environment in effect when this option is | |
373 turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for | |
374 Chinese-Big5 and jisx for other environments. | |
375 | |
56562
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
376 This mode is on by default. If you are not interested in CJK |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
377 characters and want to avoid some overhead on encoding/decoding |
56562
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
378 by the above coding systems, you can customize the user option |
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
379 `utf-translate-cjk-mode' to nil." |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
380 :init-value t |
59996
aac0a33f5772
Change release version from 21.4 to 22.1 throughout.
Kim F. Storm <storm@cua.dk>
parents:
59096
diff
changeset
|
381 :version "22.1" |
46496 | 382 :type 'boolean |
50341
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
383 :group 'mule |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
384 :global t |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
385 (if utf-translate-cjk-mode |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
386 (progn |
50766
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
387 (define-translation-hash-table 'utf-subst-table-for-decode |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
388 ucs-unicode-to-mule-cjk) |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
389 (define-translation-hash-table 'utf-subst-table-for-encode |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
390 ucs-mule-cjk-to-unicode) |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
391 (set-char-table-extra-slot (get 'utf-translation-table-for-encode |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
392 'translation-table) |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
393 1 ucs-mule-cjk-to-unicode)) |
50549
c8525ac04d76
(utf-translate-cjk-mode): Fix incorrect
Kenichi Handa <handa@m17n.org>
parents:
50341
diff
changeset
|
394 (define-translation-hash-table 'utf-subst-table-for-decode |
c8525ac04d76
(utf-translate-cjk-mode): Fix incorrect
Kenichi Handa <handa@m17n.org>
parents:
50341
diff
changeset
|
395 (make-hash-table :test 'eq)) |
c8525ac04d76
(utf-translate-cjk-mode): Fix incorrect
Kenichi Handa <handa@m17n.org>
parents:
50341
diff
changeset
|
396 (define-translation-hash-table 'utf-subst-table-for-encode |
50766
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
397 (make-hash-table :test 'eq)) |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
398 (set-char-table-extra-slot (get 'utf-translation-table-for-encode |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
399 'translation-table) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
400 1 nil)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
401 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
402 ;; Update safe-chars of mule-utf-* coding systems. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
403 (dolist (elt (coding-system-list t)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
404 (if (string-match "^mule-utf" (symbol-name elt)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
405 (let ((safe-charsets (coding-system-get elt 'safe-charsets)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
406 (safe-chars (coding-system-get elt 'safe-chars)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
407 (need-update nil)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
408 (dolist (charset utf-translate-cjk-charsets) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
409 (unless (eq utf-translate-cjk-mode (memq charset safe-charsets)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
410 (setq safe-charsets |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
411 (if utf-translate-cjk-mode |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
412 (cons charset safe-charsets) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
413 (delq charset safe-charsets)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
414 need-update t) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
415 (aset safe-chars (make-char charset) utf-translate-cjk-mode))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
416 (when need-update |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
417 (coding-system-put elt 'safe-charsets safe-charsets) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
418 (define-coding-system-internal elt)))))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
419 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
420 (define-ccl-program ccl-mule-utf-untrans |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
421 ;; R0 is an untranslatable Unicode code-point (U+3500..U+DFFF or |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
422 ;; U+10000..U+10FFFF) or an invalid byte (#x00..#xFF). Write |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
423 ;; eight-bit-control/graphic sequence (2 to 4 chars) representing |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
424 ;; UTF-8 sequence of r0. Registers r4, r5, r6 are modified. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
425 ;; |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
426 ;; This is a subroutine because we assume that this is called very |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
427 ;; rarely (so we don't have to worry about the overhead of the |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
428 ;; call). |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
429 `(0 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
430 ((r5 = ,(charset-id 'eight-bit-control)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
431 (r6 = ,(charset-id 'eight-bit-graphic)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
432 (if (r0 < #x100) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
433 ((r4 = ((r0 >> 6) | #xC0)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
434 (write-multibyte-character r6 r4)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
435 ((if (r0 < #x10000) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
436 ((r4 = ((r0 >> 12) | #xE0)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
437 (write-multibyte-character r6 r4)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
438 ((r4 = ((r0 >> 18) | #xF0)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
439 (write-multibyte-character r6 r4) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
440 (r4 = (((r0 >> 12) & #x3F) | #x80)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
441 (if (r4 < #xA0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
442 (write-multibyte-character r5 r4) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
443 (write-multibyte-character r6 r4)))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
444 (r4 = (((r0 >> 6) & #x3F) | #x80)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
445 (if (r4 < #xA0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
446 (write-multibyte-character r5 r4) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
447 (write-multibyte-character r6 r4)))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
448 (r4 = ((r0 & #x3F) | #x80)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
449 (if (r4 < #xA0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
450 (write-multibyte-character r5 r4) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
451 (write-multibyte-character r6 r4))))) |
46496 | 452 |
35542 | 453 (define-ccl-program ccl-decode-mule-utf-8 |
454 ;; | |
455 ;; charset | bytes in utf-8 | bytes in emacs | |
456 ;; -----------------------+----------------+--------------- | |
457 ;; ascii | 1 | 1 | |
458 ;; -----------------------+----------------+--------------- | |
459 ;; eight-bit-control | 2 | 2 | |
41873 | 460 ;; eight-bit-graphic | 2 | 1 |
35542 | 461 ;; latin-iso8859-1 | 2 | 2 |
462 ;; -----------------------+----------------+--------------- | |
463 ;; mule-unicode-0100-24ff | 2 | 4 | |
464 ;; (< 0800) | | | |
465 ;; -----------------------+----------------+--------------- | |
466 ;; mule-unicode-0100-24ff | 3 | 4 | |
467 ;; (>= 0800) | | |
468 ;; mule-unicode-2500-33ff | 3 | 4 | |
469 ;; mule-unicode-e000-ffff | 3 | 4 | |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
470 ;; -----------------------+----------------+--------------- |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
471 ;; invalid byte | 1 | 2 |
35542 | 472 ;; |
473 ;; Thus magnification factor is two. | |
474 ;; | |
475 `(2 | |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
476 ((r6 = ,(charset-id 'latin-iso8859-1)) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
477 (read r0) |
37934
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
478 (loop |
35542 | 479 (if (r0 < #x80) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
480 ;; 1-byte encoding, i.e., ascii |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
481 (write-read-repeat r0)) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
482 (if (r0 < #xc2) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
483 ;; continuation byte (invalid here) or 1st byte of overlong |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
484 ;; 2-byte sequence. |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
485 ((call ccl-mule-utf-untrans) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
486 (r6 = ,(charset-id 'latin-iso8859-1)) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
487 (read r0) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
488 (repeat))) |
35542 | 489 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
490 ;; Read the 2nd byte. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
491 (read r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
492 (if ((r1 & #b11000000) != #b10000000) ; Invalid 2nd byte |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
493 ((call ccl-mule-utf-untrans) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
494 (r6 = ,(charset-id 'latin-iso8859-1)) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
495 ;; Handle it in the next loop. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
496 (r0 = r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
497 (repeat))) |
46496 | 498 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
499 (if (r0 < #xe0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
500 ;; 2-byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
501 ((r1 &= #x3F) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
502 (r1 |= ((r0 & #x1F) << 6)) |
57737
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
503 ;; Now r1 holds scalar value. We don't have to check |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
504 ;; `overlong sequence' because r0 >= 0xC2. |
46496 | 505 |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
506 (if (r1 >= 256) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
507 ;; mule-unicode-0100-24ff (< 0800) |
57737
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
508 ((r0 = r1) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
509 (lookup-integer utf-subst-table-for-decode r0 r1) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
510 (if (r7 == 0) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
511 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
512 (r1 -= #x0100) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
513 (r2 = (((r1 / 96) + 32) << 7)) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
514 (r1 %= 96) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
515 (r1 += (r2 + 32)) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
516 (translate-character |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
517 utf-translation-table-for-decode r0 r1))) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
518 (write-multibyte-character r0 r1) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
519 (read r0) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
520 (repeat)) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
521 (if (r1 >= 160) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
522 ;; latin-iso8859-1 |
57737
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
523 ((r0 = r1) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
524 (lookup-integer utf-subst-table-for-decode r0 r1) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
525 (if (r7 == 0) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
526 ((r1 -= 128) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
527 (write-multibyte-character r6 r1)) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
528 ((write-multibyte-character r0 r1))) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
529 (read r0) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
530 (repeat)) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
531 ;; eight-bit-control |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
532 ((r0 = ,(charset-id 'eight-bit-control)) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
533 (write-multibyte-character r0 r1) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
534 (read r0) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
535 (repeat)))))) |
46496 | 536 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
537 ;; Read the 3rd byte. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
538 (read r2) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
539 (if ((r2 & #b11000000) != #b10000000) ; Invalid 3rd byte |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
540 ((call ccl-mule-utf-untrans) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
541 (r0 = r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
542 (call ccl-mule-utf-untrans) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
543 (r6 = ,(charset-id 'latin-iso8859-1)) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
544 ;; Handle it in the next loop. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
545 (r0 = r2) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
546 (repeat))) |
46496 | 547 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
548 (if (r0 < #xF0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
549 ;; 3byte encoding |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
550 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
551 ((r3 = ((r0 & #xF) << 12)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
552 (r3 |= ((r1 & #x3F) << 6)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
553 (r3 |= (r2 & #x3F)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
554 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
555 (if (r3 < #x800) ; `overlong sequence' |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
556 ((call ccl-mule-utf-untrans) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
557 (r0 = r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
558 (call ccl-mule-utf-untrans) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
559 (r0 = r2) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
560 (call ccl-mule-utf-untrans) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
561 (r6 = ,(charset-id 'latin-iso8859-1)) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
562 (read r0) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
563 (repeat))) |
49598
0d8b17d428b5
Trailing whitepace deleted.
Juanma Barranquero <lekktu@gmail.com>
parents:
49028
diff
changeset
|
564 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
565 (if (r3 < #x2500) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
566 ;; mule-unicode-0100-24ff (>= 0800) |
57737
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
567 ((r0 = r3) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
568 (lookup-integer utf-subst-table-for-decode r0 r1) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
569 (if (r7 == 0) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
570 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
571 (r3 -= #x0100) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
572 (r3 //= 96) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
573 (r1 = (r7 + 32)) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
574 (r1 += ((r3 + 32) << 7)) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
575 (translate-character |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
576 utf-translation-table-for-decode r0 r1))) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
577 (write-multibyte-character r0 r1) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
578 (read r0) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
579 (repeat))) |
49598
0d8b17d428b5
Trailing whitepace deleted.
Juanma Barranquero <lekktu@gmail.com>
parents:
49028
diff
changeset
|
580 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
581 (if (r3 < #x3400) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
582 ;; mule-unicode-2500-33ff |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
583 ((r0 = r3) ; don't zap r3 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
584 (lookup-integer utf-subst-table-for-decode r0 r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
585 (if (r7 == 0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
586 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
587 (r3 -= #x2500) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
588 (r3 //= 96) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
589 (r1 = (r7 + 32)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
590 (r1 += ((r3 + 32) << 7)))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
591 (write-multibyte-character r0 r1) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
592 (read r0) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
593 (repeat))) |
46496 | 594 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
595 (if (r3 < #xE000) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
596 ;; Try to convert to CJK chars, else |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
597 ;; keep them as eight-bit-{control|graphic}. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
598 ((r0 = r3) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
599 (lookup-integer utf-subst-table-for-decode r3 r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
600 (if r7 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
601 ;; got a translation |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
602 ((write-multibyte-character r3 r1) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
603 (read r0) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
604 (repeat)) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
605 ((call ccl-mule-utf-untrans) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
606 (r6 = ,(charset-id 'latin-iso8859-1)) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
607 (read r0) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
608 (repeat))))) |
46496 | 609 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
610 ;; mule-unicode-e000-ffff |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
611 ;; Fixme: fffe and ffff are invalid. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
612 (r0 = r3) ; don't zap r3 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
613 (lookup-integer utf-subst-table-for-decode r0 r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
614 (if (r7 == 0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
615 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
616 (r3 -= #xe000) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
617 (r3 //= 96) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
618 (r1 = (r7 + 32)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
619 (r1 += ((r3 + 32) << 7)))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
620 (write-multibyte-character r0 r1) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
621 (read r0) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
622 (repeat))) |
49598
0d8b17d428b5
Trailing whitepace deleted.
Juanma Barranquero <lekktu@gmail.com>
parents:
49028
diff
changeset
|
623 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
624 ;; Read the 4th byte. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
625 (read r3) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
626 (if ((r3 & #b11000000) != #b10000000) ; Invalid 4th byte |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
627 ((call ccl-mule-utf-untrans) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
628 (r0 = r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
629 (call ccl-mule-utf-untrans) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
630 (r0 = r2) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
631 (call ccl-mule-utf-untrans) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
632 (r6 = ,(charset-id 'latin-iso8859-1)) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
633 ;; Handle it in the next loop. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
634 (r0 = r3) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
635 (repeat))) |
35542 | 636 |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
637 (if (r0 < #xF8) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
638 ;; 4-byte encoding: |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
639 ;; wwwzzzzzzyyyyyyxxxxxx = 11110www 10zzzzzz 10yyyyyy 10xxxxxx |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
640 ;; keep those bytes as eight-bit-{control|graphic} |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
641 ;; Fixme: allow lookup in utf-subst-table-for-decode. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
642 ((r4 = ((r0 & #x7) << 18)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
643 (r4 |= ((r1 & #x3F) << 12)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
644 (r4 |= ((r2 & #x3F) << 6)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
645 (r4 |= (r3 & #x3F)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
646 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
647 (if (r4 < #x10000) ; `overlong sequence' |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
648 ((call ccl-mule-utf-untrans) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
649 (r0 = r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
650 (call ccl-mule-utf-untrans) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
651 (r0 = r2) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
652 (call ccl-mule-utf-untrans) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
653 (r0 = r3) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
654 (call ccl-mule-utf-untrans)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
655 ((r0 = r4) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
656 (call ccl-mule-utf-untrans)))) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
657 |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
658 ;; Unsupported sequence. |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
659 ((call ccl-mule-utf-untrans) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
660 (r0 = r1) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
661 (call ccl-mule-utf-untrans) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
662 (r0 = r2) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
663 (call ccl-mule-utf-untrans) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
664 (r0 = r3) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
665 (call ccl-mule-utf-untrans))) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
666 (r6 = ,(charset-id 'latin-iso8859-1)) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
667 (read r0) |
50085
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
668 (repeat))) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
669 |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
670 |
50085
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
671 ;; At EOF... |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
672 (if (r0 >= 0) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
673 ;; r0 >= #x80 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
674 ((call ccl-mule-utf-untrans) |
50085
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
675 (if (r1 >= 0) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
676 ((r0 = r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
677 (call ccl-mule-utf-untrans) |
50085
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
678 (if (r2 >= 0) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
679 ((r0 = r2) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
680 (call ccl-mule-utf-untrans) |
50085
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
681 (if (r3 >= 0) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
682 ((r0 = r3) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
683 (call ccl-mule-utf-untrans)))))))))) |
35542 | 684 |
36243 | 685 "CCL program to decode UTF-8. |
36465 | 686 Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
687 mule-unicode-*, but see also `utf-fragmentation-table' and |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
688 `ucs-mule-cjk-to-unicode'. |
46496 | 689 Encodings of un-representable Unicode characters are decoded asis into |
690 eight-bit-control and eight-bit-graphic characters.") | |
35542 | 691 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
692 (define-ccl-program ccl-mule-utf-8-encode-untrans |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
693 ;; The UTF-8 decoder generates a UTF-8 sequence represented by a |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
694 ;; sequence of eight-bit-control/graphic chars for an untranslatable |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
695 ;; character and an invalid byte. |
56562
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
696 ;; |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
697 ;; This CCL parses that sequence (the first byte is already in r1), |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
698 ;; writes out the original bytes of that sequence, and sets r5 to |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
699 ;; -1. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
700 ;; |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
701 ;; If the eight-bit-control/graphic sequence is shorter than what r1 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
702 ;; suggests, it sets r5 and r6 to the last character read that |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
703 ;; should be handled by the next loop of a caller. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
704 ;; |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
705 ;; Note: For UTF-8 validation, we only check if a character is |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
706 ;; eight-bit-control/graphic or not. It may result in incorrect |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
707 ;; handling of random binary data, but such data can't be encoded |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
708 ;; by UTF-8 anyway. At least, UTF-8 decoders don't generate such |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
709 ;; a sequence even if a source contains invalid byte-sequence. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
710 `(0 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
711 (;; Read the 2nd byte. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
712 (read-multibyte-character r5 r6) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
713 (r0 = (r5 != ,(charset-id 'eight-bit-control))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
714 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) |
56562
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
715 ((write r1) ; invalid UTF-8 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
716 (r1 = -1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
717 (end))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
718 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
719 (if (r1 <= #xC3) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
720 ;; 2-byte sequence for an originally invalid byte. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
721 ((r6 &= #x3F) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
722 (r6 |= ((r1 & #x1F) << 6)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
723 (write r6) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
724 (r5 = -1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
725 (end))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
726 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
727 (write r1 r6) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
728 (r2 = r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
729 (r1 = -1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
730 ;; Read the 3rd byte. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
731 (read-multibyte-character r5 r6) |
56562
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
732 (r0 = (r5 != ,(charset-id 'eight-bit-control))) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
733 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
734 (end)) ; invalid UTF-8 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
735 (write r6) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
736 (if (r2 < #xF0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
737 ;; 3-byte sequence for an untranslated character. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
738 ((r5 = -1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
739 (end))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
740 ;; Read the 4th byte. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
741 (read-multibyte-character r5 r6) |
56562
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
742 (r0 = (r5 != ,(charset-id 'eight-bit-control))) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
743 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
744 (end)) ; invalid UTF-8 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
745 ;; 4-byte sequence for an untranslated character. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
746 (write r6) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
747 (r5 = -1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
748 (end)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
749 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
750 ;; At EOF... |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
751 ((r5 = -1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
752 (if (r1 >= 0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
753 (write r1))))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
754 |
35542 | 755 (define-ccl-program ccl-encode-mule-utf-8 |
756 `(1 | |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
757 ((r5 = -1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
758 (loop |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
759 (if (r5 < 0) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
760 (read-multibyte-character r0 r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
761 ;; Pre-read character is in r5 (charset-ID) and r6 (code-point). |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
762 ((r0 = r5) |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
763 (r1 = r6) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
764 (r5 = -1))) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
765 (translate-character utf-translation-table-for-encode r0 r1) |
35542 | 766 |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
767 (if (r0 == ,(charset-id 'ascii)) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
768 (write-repeat r1)) |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
769 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
770 (if (r0 == ,(charset-id 'latin-iso8859-1)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
771 ;; r1 scalar utf-8 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
772 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
773 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
774 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111 |
56095
4ec2da03a87c
(ccl-encode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56056
diff
changeset
|
775 ((write ((r1 >> 6) | #xc2)) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
776 (r1 &= #x3f) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
777 (r1 |= #x80) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
778 (write-repeat r1))) |
35542 | 779 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
780 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
781 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
782 ;; #x3f80 == (0011 1111 1000 0000)b |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
783 (r1 &= #x7f) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
784 (r1 += (r0 + 224)) ; 224 == -32 + #x0100 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
785 ;; now r1 holds scalar value |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
786 (if (r1 < #x0800) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
787 ;; 2byte encoding |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
788 ((write ((r1 >> 6) | #xC0)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
789 (r1 &= #x3F) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
790 (r1 |= #x80) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
791 (write-repeat r1)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
792 ;; 3byte encoding |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
793 ((write ((r1 >> 12) | #xE0)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
794 (write (((r1 & #x0FC0) >> 6) | #x80)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
795 (r1 &= #x3F) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
796 (r1 |= #x80) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
797 (write-repeat r1))))) |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
798 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
799 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
800 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
801 (r1 &= #x7f) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
802 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
803 ;; now r1 holds scalar value |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
804 (write ((r1 >> 12) | #xE0)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
805 (write (((r1 & #x0FC0) >> 6) | #x80)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
806 (r1 &= #x3F) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
807 (r1 |= #x80) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
808 (write-repeat r1))) |
35542 | 809 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
810 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
811 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
812 (r1 &= #x7f) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
813 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
814 ;; now r1 holds scalar value |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
815 (write ((r1 >> 12) | #xE0)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
816 (write (((r1 & #x0FC0) >> 6) | #x80)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
817 (r1 &= #x3F) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
818 (r1 |= #x80) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
819 (write-repeat r1))) |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
820 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
821 (if (r0 == ,(charset-id 'eight-bit-control)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
822 ;; r1 scalar utf-8 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
823 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
824 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
825 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
826 ((write #xC2) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
827 (write-repeat r1))) |
35542 | 828 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
829 (if (r0 == ,(charset-id 'eight-bit-graphic)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
830 ;; r1 scalar utf-8 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
831 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
832 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
833 ;; ff 0000 0000 1111 1111 1100 0011 1011 1111 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
834 ((r0 = (r1 >= #xC0)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
835 (r0 &= (r1 <= #xC3)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
836 (r4 = (r1 >= #xE1)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
837 (r4 &= (r1 <= #xF7)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
838 (r0 |= r4) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
839 (if r0 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
840 ((call ccl-mule-utf-8-encode-untrans) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
841 (repeat)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
842 (write-repeat r1)))) |
35542 | 843 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
844 (lookup-character utf-subst-table-for-encode r0 r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
845 (if r7 ; lookup succeeded |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
846 (if (r0 < #x800) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
847 ;; 2byte encoding |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
848 ((write ((r0 >> 6) | #xC0)) |
56095
4ec2da03a87c
(ccl-encode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56056
diff
changeset
|
849 (r0 = ((r0 & #x3F) | #x80)) |
4ec2da03a87c
(ccl-encode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56056
diff
changeset
|
850 (write-repeat r0)) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
851 ;; 3byte encoding |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
852 ((write ((r0 >> 12) | #xE0)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
853 (write (((r0 & #x0FC0) >> 6) | #x80)) |
56095
4ec2da03a87c
(ccl-encode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56056
diff
changeset
|
854 (r0 = ((r0 & #x3F) | #x80)) |
4ec2da03a87c
(ccl-encode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56056
diff
changeset
|
855 (write-repeat r0)))) |
35542 | 856 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
857 ;; Unsupported character. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
858 ;; Output U+FFFD, which is `ef bf bd' in UTF-8. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
859 (write #xef) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
860 (write #xbf) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
861 (write-repeat #xbd)))) |
46496 | 862 "CCL program to encode into UTF-8.") |
35542 | 863 |
41873 | 864 |
46496 | 865 (define-ccl-program ccl-untranslated-to-ucs |
866 `(0 | |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
867 (if (r1 == 0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
868 nil |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
869 (if (r0 <= #xC3) ; 2-byte encoding |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
870 ((r0 = ((r0 & #x3) << 6)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
871 (r0 |= (r1 & #x3F)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
872 (r1 = 2)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
873 (if (r2 == 0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
874 (r1 = 0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
875 (if (r0 < #xF0) ; 3-byte encoding, as above |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
876 ((r0 = ((r0 & #xF) << 12)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
877 (r0 |= ((r1 & #x3F) << 6)) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
878 (r0 |= (r2 & #x3F)) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
879 (r1 = 3)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
880 (if (r3 == 0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
881 (r1 = 0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
882 ((r0 = ((r0 & #x7) << 18)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
883 (r0 |= ((r1 & #x3F) << 12)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
884 (r0 |= ((r2 & #x3F) << 6)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
885 (r0 |= (r3 & #x3F)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
886 (r1 = 4)))))))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
887 "Decode 2-, 3-, or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
888 Set r1 to the byte length. r1 == 0 for invalid sequence.") |
46496 | 889 |
890 (defvar utf-8-ccl-regs (make-vector 8 0)) | |
891 | |
41873 | 892 (defsubst utf-8-untranslated-to-ucs () |
46496 | 893 "Return the UCS code for an untranslated sequence of raw bytes at point.
894 Only for 3- or 4-byte sequences." | |
895 (aset utf-8-ccl-regs 0 (or (char-after) 0)) | |
896 (aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0)) | |
897 (aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0)) | |
898 (aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0)) | |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
899 (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs)) |
41873 | 900 |
901 (defun utf-8-help-echo (window object position) | |
902 (format "Untranslated Unicode U+%04X" | |
903 (get-char-property position 'untranslated-utf-8 object))) | |
904 | |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
905 ;; We compose the untranslatable sequences into a single character, |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
906 ;; and move point to the next character. |
41873 | 907 ;; This is infelicitous for editing, because there's currently no |
908 ;; mechanism for treating compositions as atomic, but is OK for | |
46496 | 909 ;; display. They are composed to U+FFFD with help-echo which |
910 ;; indicates the unicodes they represent. This function GCs too much. | |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
911 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
912 ;; If utf-translate-cjk-mode is non-nil, this function is called with |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
913 ;; HASH-TABLE which translates CJK characters into some of CJK |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
914 ;; charsets. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
915 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
916 (defsubst utf-8-compose (hash-table) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
917 "Put a suitable composition on an untranslatable sequence at point. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
918 If HASH-TABLE is non-nil, try to translate CJK characters by it at first. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
919 Move point to the end of the sequence." |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
920 (utf-8-untranslated-to-ucs) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
921 (let ((l (aref utf-8-ccl-regs 1)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
922 ch) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
923 (if (> l 0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
924 (if (and hash-table |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
925 (setq ch (gethash (aref utf-8-ccl-regs 0) hash-table))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
926 (progn |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
927 (insert ch) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
928 (delete-region (point) (min (point-max) (+ l (point))))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
929 (setq ch (aref utf-8-ccl-regs 0)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
930 (put-text-property (point) (min (point-max) (+ l (point))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
931 'untranslated-utf-8 ch) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
932 (put-text-property (point) (min (point-max) (+ l (point))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
933 'help-echo 'utf-8-help-echo) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
934 (if (= l 2) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
935 (put-text-property (point) (min (point-max) (+ l (point))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
936 'display (format "\\%03o" ch)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
937 (compose-region (point) (+ l (point)) ?$,3u=(B)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
938 (forward-char l)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
939 (forward-char 1)))) |
41873 | 940 |
941 (defcustom utf-8-compose-scripts nil | |
46496 | 942 "*Non-nil means compose various scripts on decoding utf-8 text." |
41873 | 943 :group 'mule |
59996
aac0a33f5772
Change release version from 21.4 to 22.1 throughout.
Kim F. Storm <storm@cua.dk>
parents:
59096
diff
changeset
|
944 :version "22.1" |
46496 | 945 :type 'boolean) |
41873 | 946 |
947 (defun utf-8-post-read-conversion (length) | |
948 "Compose untranslated utf-8 sequences into single characters. | |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
949 If `utf-translate-cjk-mode' is non-nil, tries to translate CJK characters. |
41873 | 950 Also compose particular scripts if `utf-8-compose-scripts' is non-nil." |
951 (save-excursion | |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
952 (save-restriction |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
953 (narrow-to-region (point) (+ (point) length)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
954 ;; Can't do eval-when-compile to insert a multibyte constant |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
955 ;; version of the string in the loop, since it's always loaded as |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
956 ;; unibyte from a byte-compiled file. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
957 (let ((range (string-as-multibyte "^\xc0-\xc3\xe1-\xf7")) |
56800
752ef76fcc08
(utf-8-post-read-conversion): If the
Kenichi Handa <handa@m17n.org>
parents:
56562
diff
changeset
|
958 (buffer-multibyte enable-multibyte-characters) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
959 hash-table ch) |
56800
752ef76fcc08
(utf-8-post-read-conversion): If the
Kenichi Handa <handa@m17n.org>
parents:
56562
diff
changeset
|
960 (set-buffer-multibyte t) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
961 (when utf-translate-cjk-mode |
57727
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
962 (unless utf-translate-cjk-lang-env |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
963 ;; Check these characters in utf-translate-cjk-range. |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
964 ;; We may have to translate them to CJK charsets. |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
965 (skip-chars-forward |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
966 (concat range utf-translate-cjk-unicode-range-string)) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
967 (unless (eobp) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
968 (utf-translate-cjk-load-tables) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
969 (setq range |
57737
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
970 (concat range utf-translate-cjk-unicode-range-string))) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
971 (setq hash-table (get 'utf-subst-table-for-decode |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
972 'translation-hash-table)))) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
973 (while (and (skip-chars-forward range) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
974 (not (eobp))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
975 (setq ch (following-char)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
976 (if (< ch 256) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
977 (utf-8-compose hash-table) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
978 (if (and hash-table |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
979 (setq ch (gethash (encode-char ch 'ucs) hash-table))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
980 (progn |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
981 (insert ch) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
982 (delete-char 1)) |
56800
752ef76fcc08
(utf-8-post-read-conversion): If the
Kenichi Handa <handa@m17n.org>
parents:
56562
diff
changeset
|
983 (forward-char 1)))) |
752ef76fcc08
(utf-8-post-read-conversion): If the
Kenichi Handa <handa@m17n.org>
parents:
56562
diff
changeset
|
984 (or buffer-multibyte |
752ef76fcc08
(utf-8-post-read-conversion): If the
Kenichi Handa <handa@m17n.org>
parents:
56562
diff
changeset
|
985 (set-buffer-multibyte nil))) |
41873 | 986 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
987 (when (and utf-8-compose-scripts (> length 1)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
988 ;; These currently have definitions which cover the relevant |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
989 ;; unicodes. We could avoid loading thai-util &c by checking |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
990 ;; whether the region contains any characters with the appropriate |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
991 ;; categories. There aren't yet Unicode-based rules for Tibetan. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
992 (diacritic-compose-region (point-max) (point-min)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
993 (thai-compose-region (point-max) (point-min)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
994 (lao-compose-region (point-max) (point-min)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
995 (devanagari-compose-region (point-max) (point-min)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
996 (malayalam-compose-region (point-max) (point-min)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
997 (tamil-compose-region (point-max) (point-min))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
998 (- (point-max) (point-min))))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
999 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
(defun utf-8-pre-write-conversion (beg end)
  "Prepare for `utf-translate-cjk-mode' to encode text between BEG and END.
This is used as a pre-write-conversion of utf-8 coding system.

BEG and END normally delimit a region of the current buffer.  BEG may
also be a string holding the text to be encoded, in which case END is
ignored.

If `utf-translate-cjk-mode' is non-nil, `utf-translate-cjk-lang-env'
is nil, and the text contains at least one character of category `c',
`j' or `h', call `utf-translate-cjk-load-tables'.

Always return nil."
  (let ((cjk-regexp "\\cc\\|\\cj\\|\\ch"))
    (if (and utf-translate-cjk-mode
             (not utf-translate-cjk-lang-env)
             (if (stringp beg)
                 ;; `goto-char' would signal an error for a string, so
                 ;; search the string itself.
                 (string-match cjk-regexp beg)
               ;; Search the region without moving point for the caller.
               (save-excursion
                 (goto-char beg)
                 (re-search-forward cjk-regexp end t))))
        (utf-translate-cjk-load-tables)))
  nil)
41873 | 1010 |
;; Define the `mule-utf-8' coding system (type 4, mnemonic `u') and
;; make `utf-8' an alias for it.  Decoding and encoding are done by the
;; CCL programs `ccl-decode-mule-utf-8' and `ccl-encode-mule-utf-8'.
(make-coding-system
 'mule-utf-8 4 ?u
 "UTF-8 encoding for Emacs-supported Unicode characters.
It supports Unicode characters of these ranges:
U+0000..U+33FF, U+E000..U+FFFF.
They correspond to these Emacs character sets:
ascii, latin-iso8859-1, mule-unicode-0100-24ff,
mule-unicode-2500-33ff, mule-unicode-e000-ffff

On decoding (e.g. reading a file), Unicode characters not in the above
ranges are decoded into sequences of eight-bit-control and
eight-bit-graphic characters to preserve their byte sequences. The
byte sequence is preserved on i/o for valid utf-8, but not necessarily
for invalid utf-8.

On encoding (e.g. writing a file), Emacs characters not belonging to
any of the character sets listed above are encoded into the UTF-8 byte
sequence representing U+FFFD (REPLACEMENT CHARACTER)."

 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
 (list
  ;; The CJK charsets are appended only when `utf-translate-cjk-mode'
  ;; is on at the time this form is evaluated.
  (cons 'safe-charsets
        (append '(ascii
                  eight-bit-control
                  eight-bit-graphic
                  latin-iso8859-1
                  mule-unicode-0100-24ff
                  mule-unicode-2500-33ff
                  mule-unicode-e000-ffff)
                (and utf-translate-cjk-mode
                     utf-translate-cjk-charsets)))
  '(mime-charset . utf-8)
  '(coding-category . coding-category-utf-8)
  '(valid-codes (0 . 255))
  '(pre-write-conversion . utf-8-pre-write-conversion)
  '(post-read-conversion . utf-8-post-read-conversion)
  '(translation-table-for-encode . utf-translation-table-for-encode)
  '(dependency unify-8859-on-encoding-mode
               unify-8859-on-decoding-mode
               utf-fragment-on-decoding
               utf-translate-cjk-mode)))

(define-coding-system-alias 'utf-8 'mule-utf-8)
38436
b174db545cfd
Some fixes to follow coding conventions.
Pavel Janík <Pavel@Janik.cz>
parents:
37934
diff
changeset
|
1053 |
41873 | 1054 ;; I think this needs special private charsets defined for the |
1055 ;; untranslated sequences, if it's going to work well. | |
1056 | |
1057 ;;; (defun utf-8-compose-function (pos to pattern &optional string) | |
1058 ;;; (let* ((prop (get-char-property pos 'composition string)) | |
1059 ;;; (l (and prop (- (cadr prop) (car prop))))) | |
1060 ;;; (cond ((and l (> l (- to pos))) | |
1061 ;;; (delete-region pos to)) | |
1062 ;;; ((and (> (char-after pos) 224) | |
1063 ;;; (< (char-after pos) 256) | |
1064 ;;; (save-restriction | |
1065 ;;; (narrow-to-region pos to) | |
1066 ;;; (utf-8-compose))) | |
1067 ;;; t)))) | |
1068 | |
1069 ;;; (dotimes (i 96) | |
1070 ;;; (aset composition-function-table | |
1071 ;;; (+ 128 i) | |
1072 ;;; `((,(string-as-multibyte "[\200-\237\240-\377]") | |
1073 ;;; . utf-8-compose-function)))) | |
1074 | |
52401 | 1075 ;;; arch-tag: b08735b7-753b-4ae6-b754-0f3efe4515c5 |
38436
b174db545cfd
Some fixes to follow coding conventions.
Pavel Janík <Pavel@Janik.cz>
parents:
37934
diff
changeset
|
1076 ;;; utf-8.el ends here |