Mercurial > emacs
annotate lisp/international/utf-8.el @ 71710:dbbc0b93cfeb
(Forcing Redisplay, Displaying Messages, Temporary Displays, Font Selection,
Auto Faces, Font Lookup, Fringe Indicators, Display Margins, Image Descriptors,
Showing Images, Image Cache, Button Types, Making Buttons, Manipulating
Buttons, Button Buffer Commands, Display Table Format, Glyphs): Remove
@tindex.
author | Eli Zaretskii <eliz@gnu.org> |
---|---|
date | Sat, 08 Jul 2006 18:11:49 +0000 |
parents | b23c01e98a4b |
children | 43ccf7c7d312 |
rev | line source |
---|---|
46496 | 1 ;;; utf-8.el --- UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*- |
35542 | 2 |
62274 | 3 ;; Copyright (C) 2001, 2002, 2003, 2004 Free Software Foundation, Inc. |
4 ;; Copyright (C) 2001, 2002, 2003, 2004 | |
5 ;; National Institute of Advanced Industrial Science and Technology (AIST) | |
6 ;; Registration Number H14PRO021 | |
35542 | 7 |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
8 ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org> |
46496 | 9 ;; Maintainer: FSF |
36243 | 10 ;; Keywords: multilingual, Unicode, UTF-8, i18n |
35542 | 11 |
12 ;; This file is part of GNU Emacs. | |
13 | |
14 ;; GNU Emacs is free software; you can redistribute it and/or modify | |
15 ;; it under the terms of the GNU General Public License as published by | |
16 ;; the Free Software Foundation; either version 2, or (at your option) | |
17 ;; any later version. | |
18 | |
19 ;; GNU Emacs is distributed in the hope that it will be useful, | |
20 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
21 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
22 ;; GNU General Public License for more details. | |
23 | |
24 ;; You should have received a copy of the GNU General Public License | |
25 ;; along with GNU Emacs; see the file COPYING. If not, write to the | |
64085 | 26 ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, |
27 ;; Boston, MA 02110-1301, USA. | |
35542 | 28 |
29 ;;; Commentary: | |
30 | |
41873 | 31 ;; The coding-system `mule-utf-8' basically supports encoding/decoding |
32 ;; of the following character sets to and from UTF-8: | |
35542 | 33 ;; |
34 ;; ascii | |
35 ;; eight-bit-control | |
36 ;; latin-iso8859-1 | |
37 ;; mule-unicode-0100-24ff | |
38 ;; mule-unicode-2500-33ff | |
39 ;; mule-unicode-e000-ffff | |
40 ;; | |
36243 | 41 ;; On decoding, Unicode characters that do not fit into the above |
42 ;; character sets are handled as `eight-bit-control' or | |
43 ;; `eight-bit-graphic' characters to retain the information about the | |
46496 | 44 ;; original byte sequence, and text properties record the corresponding |
45 ;; Unicode code point. | |
46 ;; | |
47 ;; Fixme: note that reading and writing invalid utf-8 may not be | |
48 ;; idempotent -- fixing that needs a new charset to represent the bytes. | |
41873 | 49 ;; |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
50 ;; Characters from other character sets can be encoded with mule-utf-8 |
48848 | 51 ;; by populating the translation table |
50179
65bb5afb37ef
(utf-fragment-on-decoding): Don't call
Kenichi Handa <handa@m17n.org>
parents:
50085
diff
changeset
|
52 ;; `utf-translation-table-for-encode'. Hash tables |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
53 ;; `utf-subst-table-for-decode' and `utf-subst-table-for-encode' are |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
54 ;; used to support encoding and decoding of about a quarter of the CJK |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
55 ;; space between U+3400 and U+DFFF. |
36243 | 56 |
54304 | 57 ;; UTF-8 is defined in RFC 3629. A sketch of the encoding is: |
35542 | 58 |
59 ;; scalar | utf-8 | |
60 ;; value | 1st byte | 2nd byte | 3rd byte | |
61 ;; --------------------+-----------+-----------+---------- | |
62 ;; 0000 0000 0xxx xxxx | 0xxx xxxx | | | |
63 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx | | |
64 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx | |
65 | |
66 ;;; Code: | |
67 | |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
68 (defvar ucs-mule-to-mule-unicode (make-char-table 'translation-table nil) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
69 "Char table mapping characters to latin-iso8859-1 or mule-unicode-*. |
46496 | 70 |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
71 If `unify-8859-on-encoding-mode' is non-nil, this table populates the |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
72 translation-table named `utf-translation-table-for-encode'.") |
46496 | 73 |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
74 (define-translation-table 'utf-translation-table-for-encode) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
75 |
46496 | 76 |
77 ;; Map Cyrillic and Greek to iso-8859 charsets, which take half the | |
78 ;; space of mule-unicode. For Latin scripts this isn't very | |
79 ;; important. Hebrew and Arabic might go here too when there's proper | |
80 ;; support for them. | |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
81 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
82 (defvar utf-fragmentation-table (make-char-table 'translation-table nil) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
83 "Char-table normally mapping non-Latin mule-unicode-* chars to iso-8859-*. |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
84 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
85 If `utf-fragment-on-decoding' is non-nil, this table populates the |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
86 translation-table named `utf-translation-table-for-decode'") |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
87 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
88 (defvar utf-defragmentation-table (make-char-table 'translation-table nil) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
89 "Char-table for reverse mapping of `utf-fragmentation-table'. |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
90 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
91 If `utf-fragment-on-decoding' is non-nil and |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
92 `unify-8859-on-encoding-mode' is nil, this table populates the |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
93 translation-table named `utf-translation-table-for-encode'") |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
94 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
95 (define-translation-table 'utf-translation-table-for-decode) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
96 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
97 |
48882 | 98 (defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq) |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
99 "Hash table mapping Emacs CJK character sets to Unicode code points. |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
100 |
55437
6e677a935fe9
Fix references to utf-translate-cjk into utf-translate-cjk-mode.
Andreas Schwab <schwab@suse.de>
parents:
54304
diff
changeset
|
101 If `utf-translate-cjk-mode' is non-nil, this table populates the |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
102 translation-hash-table named `utf-subst-table-for-encode'.") |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
103 |
48882 | 104 (define-translation-hash-table 'utf-subst-table-for-encode |
105 ucs-mule-cjk-to-unicode) | |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
106 |
48882 | 107 (defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq) |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
108 "Hash table mapping Unicode code points to Emacs CJK character sets. |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
109 |
55437
6e677a935fe9
Fix references to utf-translate-cjk into utf-translate-cjk-mode.
Andreas Schwab <schwab@suse.de>
parents:
54304
diff
changeset
|
110 If `utf-translate-cjk-mode' is non-nil, this table populates the |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
111 translation-hash-table named `utf-subst-table-for-decode'.") |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
112 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
113 (define-translation-hash-table 'utf-subst-table-for-decode |
48882 | 114 ucs-unicode-to-mule-cjk) |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
115 |
46496 | 116 (mapc |
117 (lambda (pair) | |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
118 (aset utf-fragmentation-table (car pair) (cdr pair)) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
119 (aset utf-defragmentation-table (cdr pair) (car pair))) |
46496 | 120 '((?$,1&d(B . ?,F4(B) (?$,1&e(B . ?,F5(B) (?$,1&f(B . ?,F6(B) (?$,1&h(B . ?,F8(B) (?$,1&i(B . ?,F9(B) |
121 (?$,1&j(B . ?,F:(B) (?$,1&l(B . ?,F<(B) (?$,1&n(B . ?,F>(B) (?$,1&o(B . ?,F?(B) (?$,1&p(B . ?,F@(B) | |
122 (?$,1&q(B . ?,FA(B) (?$,1&r(B . ?,FB(B) (?$,1&s(B . ?,FC(B) (?$,1&t(B . ?,FD(B) (?$,1&u(B . ?,FE(B) | |
123 (?$,1&v(B . ?,FF(B) (?$,1&w(B . ?,FG(B) (?$,1&x(B . ?,FH(B) (?$,1&y(B . ?,FI(B) (?$,1&z(B . ?,FJ(B) | |
124 (?$,1&{(B . ?,FK(B) (?$,1&|(B . ?,FL(B) (?$,1&}(B . ?,FM(B) (?$,1&~(B . ?,FN(B) (?$,1&(B . ?,FO(B) | |
125 (?$,1' (B . ?,FP(B) (?$,1'!(B . ?,FQ(B) (?$,1'#(B . ?,FS(B) (?$,1'$(B . ?,FT(B) (?$,1'%(B . ?,FU(B) | |
126 (?$,1'&(B . ?,FV(B) (?$,1''(B . ?,FW(B) (?$,1'((B . ?,FX(B) (?$,1')(B . ?,FY(B) (?$,1'*(B . ?,FZ(B) | |
127 (?$,1'+(B . ?,F[(B) (?$,1',(B . ?,F\(B) (?$,1'-(B . ?,F](B) (?$,1'.(B . ?,F^(B) (?$,1'/(B . ?,F_(B) | |
128 (?$,1'0(B . ?,F`(B) (?$,1'1(B . ?,Fa(B) (?$,1'2(B . ?,Fb(B) (?$,1'3(B . ?,Fc(B) (?$,1'4(B . ?,Fd(B) | |
129 (?$,1'5(B . ?,Fe(B) (?$,1'6(B . ?,Ff(B) (?$,1'7(B . ?,Fg(B) (?$,1'8(B . ?,Fh(B) (?$,1'9(B . ?,Fi(B) | |
130 (?$,1':(B . ?,Fj(B) (?$,1';(B . ?,Fk(B) (?$,1'<(B . ?,Fl(B) (?$,1'=(B . ?,Fm(B) (?$,1'>(B . ?,Fn(B) | |
131 (?$,1'?(B . ?,Fo(B) (?$,1'@(B . ?,Fp(B) (?$,1'A(B . ?,Fq(B) (?$,1'B(B . ?,Fr(B) (?$,1'C(B . ?,Fs(B) | |
132 (?$,1'D(B . ?,Ft(B) (?$,1'E(B . ?,Fu(B) (?$,1'F(B . ?,Fv(B) (?$,1'G(B . ?,Fw(B) (?$,1'H(B . ?,Fx(B) | |
133 (?$,1'I(B . ?,Fy(B) (?$,1'J(B . ?,Fz(B) (?$,1'K(B . ?,F{(B) (?$,1'L(B . ?,F|(B) (?$,1'M(B . ?,F}(B) | |
134 (?$,1'N(B . ?,F~(B) | |
135 | |
136 (?$,1(!(B . ?,L!(B) (?$,1("(B . ?,L"(B) (?$,1(#(B . ?,L#(B) (?$,1($(B . ?,L$(B) | |
137 (?$,1(%(B . ?,L%(B) (?$,1(&(B . ?,L&(B) (?$,1('(B . ?,L'(B) (?$,1(((B . ?,L((B) (?$,1()(B . ?,L)(B) | |
138 (?$,1(*(B . ?,L*(B) (?$,1(+(B . ?,L+(B) (?$,1(,(B . ?,L,(B) (?$,1(.(B . ?,L.(B) (?$,1(/(B . ?,L/(B) | |
139 (?$,1(0(B . ?,L0(B) (?$,1(1(B . ?,L1(B) (?$,1(2(B . ?,L2(B) (?$,1(3(B . ?,L3(B) (?$,1(4(B . ?,L4(B) | |
140 (?$,1(5(B . ?,L5(B) (?$,1(6(B . ?,L6(B) (?$,1(7(B . ?,L7(B) (?$,1(8(B . ?,L8(B) (?$,1(9(B . ?,L9(B) | |
141 (?$,1(:(B . ?,L:(B) (?$,1(;(B . ?,L;(B) (?$,1(<(B . ?,L<(B) (?$,1(=(B . ?,L=(B) (?$,1(>(B . ?,L>(B) | |
142 (?$,1(?(B . ?,L?(B) (?$,1(@(B . ?,L@(B) (?$,1(A(B . ?,LA(B) (?$,1(B(B . ?,LB(B) (?$,1(C(B . ?,LC(B) | |
143 (?$,1(D(B . ?,LD(B) (?$,1(E(B . ?,LE(B) (?$,1(F(B . ?,LF(B) (?$,1(G(B . ?,LG(B) (?$,1(H(B . ?,LH(B) | |
144 (?$,1(I(B . ?,LI(B) (?$,1(J(B . ?,LJ(B) (?$,1(K(B . ?,LK(B) (?$,1(L(B . ?,LL(B) (?$,1(M(B . ?,LM(B) | |
145 (?$,1(N(B . ?,LN(B) (?$,1(O(B . ?,LO(B) (?$,1(P(B . ?,LP(B) (?$,1(Q(B . ?,LQ(B) (?$,1(R(B . ?,LR(B) | |
146 (?$,1(S(B . ?,LS(B) (?$,1(T(B . ?,LT(B) (?$,1(U(B . ?,LU(B) (?$,1(V(B . ?,LV(B) (?$,1(W(B . ?,LW(B) | |
147 (?$,1(X(B . ?,LX(B) (?$,1(Y(B . ?,LY(B) (?$,1(Z(B . ?,LZ(B) (?$,1([(B . ?,L[(B) (?$,1(\(B . ?,L\(B) | |
148 (?$,1(](B . ?,L](B) (?$,1(^(B . ?,L^(B) (?$,1(_(B . ?,L_(B) (?$,1(`(B . ?,L`(B) (?$,1(a(B . ?,La(B) | |
149 (?$,1(b(B . ?,Lb(B) (?$,1(c(B . ?,Lc(B) (?$,1(d(B . ?,Ld(B) (?$,1(e(B . ?,Le(B) (?$,1(f(B . ?,Lf(B) | |
150 (?$,1(g(B . ?,Lg(B) (?$,1(h(B . ?,Lh(B) (?$,1(i(B . ?,Li(B) (?$,1(j(B . ?,Lj(B) (?$,1(k(B . ?,Lk(B) | |
151 (?$,1(l(B . ?,Ll(B) (?$,1(m(B . ?,Lm(B) (?$,1(n(B . ?,Ln(B) (?$,1(o(B . ?,Lo(B) (?$,1(q(B . ?,Lq(B) | |
152 (?$,1(r(B . ?,Lr(B) (?$,1(s(B . ?,Ls(B) (?$,1(t(B . ?,Lt(B) (?$,1(u(B . ?,Lu(B) (?$,1(v(B . ?,Lv(B) | |
153 (?$,1(w(B . ?,Lw(B) (?$,1(x(B . ?,Lx(B) (?$,1(y(B . ?,Ly(B) (?$,1(z(B . ?,Lz(B) (?$,1({(B . ?,L{(B) | |
154 (?$,1(|(B . ?,L|(B) (?$,1(~(B . ?,L~(B) (?$,1((B . ?,L(B))) | |
155 | |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
156 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
157 (defcustom utf-fragment-on-decoding nil |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
158 "Whether or not to decode some chars in UTF-8/16 text into iso8859 charsets. |
46496 | 159 Setting this means that the relevant Cyrillic and Greek characters are |
160 decoded into the iso8859 charsets rather than into | |
47231 | 161 mule-unicode-0100-24ff. The iso8859 charsets take half as much space |
46496 | 162 in the buffer, but using them may affect how the buffer can be re-encoded |
163 and may require a different input method to search for them, for instance. | |
164 See `unify-8859-on-decoding-mode' and `unify-8859-on-encoding-mode' | |
47231 | 165 for mechanisms to make this largely transparent. |
166 | |
167 Setting this variable outside customize has no effect." | |
46496 | 168 :set (lambda (s v) |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
169 (if v |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
170 (progn |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
171 (define-translation-table 'utf-translation-table-for-decode |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
172 utf-fragmentation-table) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
173 ;; Even if unify-8859-on-encoding-mode is off, make |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
174 ;; mule-utf-* encode characters in |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
175 ;; utf-fragmentation-table. |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
176 (unless (eq (get 'utf-translation-table-for-encode |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
177 'translation-table) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
178 ucs-mule-to-mule-unicode) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
179 (define-translation-table 'utf-translation-table-for-encode |
50179
65bb5afb37ef
(utf-fragment-on-decoding): Don't call
Kenichi Handa <handa@m17n.org>
parents:
50085
diff
changeset
|
180 utf-defragmentation-table))) |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
181 (define-translation-table 'utf-translation-table-for-decode) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
182 ;; When unify-8859-on-encoding-mode is off, be sure to make |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
183 ;; mule-utf-* disabled for characters in |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
184 ;; utf-fragmentation-table. |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
185 (unless (eq (get 'utf-translation-table-for-encode |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
186 'translation-table) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
187 ucs-mule-to-mule-unicode) |
50179
65bb5afb37ef
(utf-fragment-on-decoding): Don't call
Kenichi Handa <handa@m17n.org>
parents:
50085
diff
changeset
|
188 (define-translation-table 'utf-translation-table-for-encode))) |
46496 | 189 (set-default s v)) |
59996
aac0a33f5772
Change release version from 21.4 to 22.1 throughout.
Kim F. Storm <storm@cua.dk>
parents:
59096
diff
changeset
|
190 :version "22.1" |
46496 | 191 :type 'boolean |
192 :group 'mule) | |
193 | |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
194 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
195 (defconst utf-translate-cjk-charsets '(chinese-gb2312 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
196 chinese-big5-1 chinese-big5-2 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
197 japanese-jisx0208 japanese-jisx0212 |
57761
13239a8e9e80
(utf-translate-cjk-charsets): Add katakana-jisx0201.
Kenichi Handa <handa@m17n.org>
parents:
57737
diff
changeset
|
198 katakana-jisx0201 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
199 korean-ksc5601) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
200 "List of charsets supported by `utf-translate-cjk-mode'.") |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
201 |
57727
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
202 (defvar utf-translate-cjk-lang-env nil |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
203 "Language environment in which tables for `utf-translate-cjk-mode' are loaded. |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
204 The value nil means that the tables are not yet loaded.") |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
205 |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
206 (defvar utf-translate-cjk-unicode-range) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
207 |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
208 ;; String generated from utf-translate-cjk-unicode-range. It is |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
209 ;; suitable for an argument to skip-chars-forward. |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
210 (defvar utf-translate-cjk-unicode-range-string nil) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
211 |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
212 (defun utf-translate-cjk-set-unicode-range (range) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
213 (setq utf-translate-cjk-unicode-range range) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
214 (setq utf-translate-cjk-unicode-range-string |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
215 (let ((decode-char-no-trans |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
216 #'(lambda (x) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
217 (cond ((< x #x100) (make-char 'latin-iso8859-1 x)) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
218 ((< x #x2500) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
219 (setq x (- x #x100)) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
220 (make-char 'mule-unicode-0100-24ff |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
221 (+ (/ x 96) 32) (+ (% x 96) 32))) |
59996
aac0a33f5772
Change release version from 21.4 to 22.1 throughout.
Kim F. Storm <storm@cua.dk>
parents:
59096
diff
changeset
|
222 ((< x #x3400) |
57727
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
223 (setq x (- x #x2500)) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
224 (make-char 'mule-unicode-2500-33ff |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
225 (+ (/ x 96) 32) (+ (% x 96) 32))) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
226 (t |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
227 (setq x (- x #xe000)) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
228 (make-char 'mule-unicode-e000-ffff |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
229 (+ (/ x 96) 32) (+ (% x 96) 32)))))) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
230 ranges from to) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
231 (dolist (elt range) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
232 (setq from (max #xA0 (car elt)) to (min #xffff (cdr elt))) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
233 (if (and (>= to #x3400) (< to #xE000)) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
234 (setq to #x33FF)) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
235 (cond ((< from #x100) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
236 (if (>= to #xE000) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
237 (setq ranges (cons (cons #xE000 to) ranges) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
238 to #x33FF)) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
239 (if (>= to #x2500) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
240 (setq ranges (cons (cons #x2500 to) ranges) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
241 to #x24FF)) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
242 (if (>= to #x100) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
243 (setq ranges (cons (cons #x100 to) ranges) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
244 to #xFF))) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
245 ((< from #x2500) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
246 (if (>= to #xE000) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
247 (setq ranges (cons (cons #xE000 to) ranges) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
248 to #x33FF)) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
249 (if (>= to #x2500) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
250 (setq ranges (cons (cons #x2500 to) ranges) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
251 to #x24FF))) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
252 ((< from #x3400) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
253 (if (>= to #xE000) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
254 (setq ranges (cons (cons #xE000 to) ranges) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
255 to #x33FF)))) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
256 (if (<= from to) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
257 (setq ranges (cons (cons from to) ranges)))) |
59996
aac0a33f5772
Change release version from 21.4 to 22.1 throughout.
Kim F. Storm <storm@cua.dk>
parents:
59096
diff
changeset
|
258 (mapconcat #'(lambda (x) |
aac0a33f5772
Change release version from 21.4 to 22.1 throughout.
Kim F. Storm <storm@cua.dk>
parents:
59096
diff
changeset
|
259 (format "%c-%c" |
57727
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
260 (funcall decode-char-no-trans (car x)) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
261 (funcall decode-char-no-trans (cdr x)))) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
262 ranges ""))) |
57737
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
263 ;; This forces loading and setting of the tables for |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
264 ;; utf-translate-cjk-mode. |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
265 (setq utf-translate-cjk-lang-env nil |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
266 ucs-mule-cjk-to-unicode (make-hash-table :test 'eq) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
267 ucs-unicode-to-mule-cjk (make-hash-table :test 'eq))) |
57727
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
268 |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
269 (defcustom utf-translate-cjk-unicode-range '((#x2e80 . #xd7a3) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
270 (#xff00 . #xffef)) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
271 "List of Unicode code ranges supported by `utf-translate-cjk-mode'. |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
272 Setting this variable directly does not take effect; |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
273 use either \\[customize] or the function |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
274 `utf-translate-cjk-set-unicode-range'." |
59996
aac0a33f5772
Change release version from 21.4 to 22.1 throughout.
Kim F. Storm <storm@cua.dk>
parents:
59096
diff
changeset
|
275 :version "22.1" |
57727
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
276 :type '(repeat (cons integer integer)) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
277 :set (lambda (symbol value) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
278 (utf-translate-cjk-set-unicode-range value)) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
279 :group 'mule) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
280 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
281 ;; Return non-nil if CODE-POINT is in `utf-translate-cjk-unicode-range'. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
282 (defsubst utf-translate-cjk-substitutable-p (code-point) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
283 (let ((tail utf-translate-cjk-unicode-range) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
284 elt) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
285 (while tail |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
286 (setq elt (car tail) tail (cdr tail)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
287 (if (and (>= code-point (car elt)) (<= code-point (cdr elt))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
288 (setq tail nil) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
289 (setq elt nil))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
290 elt)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
291 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
292 (defun utf-translate-cjk-load-tables () |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
293 "Load tables for `utf-translate-cjk-mode'." |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
294 ;; Fixme: Allow the use of the CJK charsets to be |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
295 ;; customized by reordering and possible omission. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
296 (let ((redefined (< (hash-table-size ucs-mule-cjk-to-unicode) 43000))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
297 (if redefined |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
298 ;; Redefine them with realistic initial sizes and a |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
299 ;; smallish rehash size to avoid wasting significant |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
300 ;; space after they're built. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
301 (setq ucs-mule-cjk-to-unicode |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
302 (make-hash-table :test 'eq :size 43000 :rehash-size 1000) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
303 ucs-unicode-to-mule-cjk |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
304 (make-hash-table :test 'eq :size 21500 :rehash-size 1000))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
305 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
306 ;; Load the files explicitly, to avoid having to keep |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
307 ;; around the large tables they contain (as well as the |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
308 ;; ones which get built). |
59096
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
309 ;; Here we bind coding-system-for-read to nil so that coding tags |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
310 ;; in the files are respected even if the files are not yet |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
311 ;; byte-compiled |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
312 (let ((coding-system-for-read nil)) |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
313 (cond ((string= "Korean" current-language-environment) |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
314 (load "subst-jis") |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
315 (load "subst-big5") |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
316 (load "subst-gb2312") |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
317 (load "subst-ksc")) |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
318 ((string= "Chinese-BIG5" current-language-environment) |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
319 (load "subst-jis") |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
320 (load "subst-ksc") |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
321 (load "subst-gb2312") |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
322 (load "subst-big5")) |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
323 ((string= "Chinese-GB" current-language-environment) |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
324 (load "subst-jis") |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
325 (load "subst-ksc") |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
326 (load "subst-big5") |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
327 (load "subst-gb2312")) |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
328 (t |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
329 (load "subst-ksc") |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
330 (load "subst-gb2312") |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
331 (load "subst-big5") |
2447136abfc1
(utf-translate-cjk-load-tables): Bind
Kenichi Handa <handa@m17n.org>
parents:
57761
diff
changeset
|
332 (load "subst-jis")))) ; jis covers as much as big5, gb2312 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
333 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
334 (when redefined |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
335 (define-translation-hash-table 'utf-subst-table-for-decode |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
336 ucs-unicode-to-mule-cjk) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
337 (define-translation-hash-table 'utf-subst-table-for-encode |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
338 ucs-mule-cjk-to-unicode) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
339 (set-char-table-extra-slot (get 'utf-translation-table-for-encode |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
340 'translation-table) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
341 1 ucs-mule-cjk-to-unicode)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
342 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
343 (setq utf-translate-cjk-lang-env current-language-environment))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
344 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
345 (defun utf-lookup-subst-table-for-decode (code-point) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
346 (if (and utf-translate-cjk-mode |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
347 (not utf-translate-cjk-lang-env) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
348 (utf-translate-cjk-substitutable-p code-point)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
349 (utf-translate-cjk-load-tables)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
350 (gethash code-point |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
351 (get 'utf-subst-table-for-decode 'translation-hash-table))) |
56562
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
352 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
353 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
354 (defun utf-lookup-subst-table-for-encode (char) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
355 (if (and utf-translate-cjk-mode |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
356 (not utf-translate-cjk-lang-env) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
357 (memq (char-charset char) utf-translate-cjk-charsets)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
358 (utf-translate-cjk-load-tables)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
359 (gethash char |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
360 (get 'utf-subst-table-for-encode 'translation-hash-table))) |
56562
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
361 |
50341
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
362 (define-minor-mode utf-translate-cjk-mode |
56562
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
363 "Toggle whether UTF based coding systems de/encode CJK characters. |
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
364 If ARG is an integer, enable if ARG is positive and disable if |
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
365 zero or negative. This is a minor mode. |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
366 Enabling this allows the coding systems mule-utf-8, |
51628
abfc7d48b476
(utf-translate-cjk-mode): Fix docstring.
Kenichi Handa <handa@m17n.org>
parents:
50766
diff
changeset
|
367 mule-utf-16le and mule-utf-16be to encode characters in the charsets |
48848 | 368 `korean-ksc5601', `chinese-gb2312', `chinese-big5-1', |
369 `chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to | |
370 decode the corresponding unicodes into such characters. | |
46496 | 371 |
48848 | 372 Where the charsets overlap, the one preferred for decoding is chosen |
373 according to the language environment in effect when this option is | |
374 turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for | |
375 Chinese-Big5 and jisx for other environments. | |
376 | |
56562
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
377 This mode is on by default. If you are not interested in CJK |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
378 characters and want to avoid some overhead on encoding/decoding |
56562
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
379 by the above coding systems, you can customize the user option |
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
380 `utf-translate-cjk-mode' to nil." |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
381 :init-value t |
59996
aac0a33f5772
Change release version from 21.4 to 22.1 throughout.
Kim F. Storm <storm@cua.dk>
parents:
59096
diff
changeset
|
382 :version "22.1" |
46496 | 383 :type 'boolean |
50341
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
384 :group 'mule |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
385 :global t |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
386 (if utf-translate-cjk-mode |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
387 (progn |
50766
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
388 (define-translation-hash-table 'utf-subst-table-for-decode |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
389 ucs-unicode-to-mule-cjk) |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
390 (define-translation-hash-table 'utf-subst-table-for-encode |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
391 ucs-mule-cjk-to-unicode) |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
392 (set-char-table-extra-slot (get 'utf-translation-table-for-encode |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
393 'translation-table) |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
394 1 ucs-mule-cjk-to-unicode)) |
50549
c8525ac04d76
(utf-translate-cjk-mode): Fix incorrect
Kenichi Handa <handa@m17n.org>
parents:
50341
diff
changeset
|
395 (define-translation-hash-table 'utf-subst-table-for-decode |
c8525ac04d76
(utf-translate-cjk-mode): Fix incorrect
Kenichi Handa <handa@m17n.org>
parents:
50341
diff
changeset
|
396 (make-hash-table :test 'eq)) |
c8525ac04d76
(utf-translate-cjk-mode): Fix incorrect
Kenichi Handa <handa@m17n.org>
parents:
50341
diff
changeset
|
397 (define-translation-hash-table 'utf-subst-table-for-encode |
50766
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
398 (make-hash-table :test 'eq)) |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
399 (set-char-table-extra-slot (get 'utf-translation-table-for-encode |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
400 'translation-table) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
401 1 nil)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
402 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
403 ;; Update safe-chars of mule-utf-* coding systems. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
404 (dolist (elt (coding-system-list t)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
405 (if (string-match "^mule-utf" (symbol-name elt)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
406 (let ((safe-charsets (coding-system-get elt 'safe-charsets)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
407 (safe-chars (coding-system-get elt 'safe-chars)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
408 (need-update nil)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
409 (dolist (charset utf-translate-cjk-charsets) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
410 (unless (eq utf-translate-cjk-mode (memq charset safe-charsets)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
411 (setq safe-charsets |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
412 (if utf-translate-cjk-mode |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
413 (cons charset safe-charsets) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
414 (delq charset safe-charsets)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
415 need-update t) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
416 (aset safe-chars (make-char charset) utf-translate-cjk-mode))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
417 (when need-update |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
418 (coding-system-put elt 'safe-charsets safe-charsets) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
419 (define-coding-system-internal elt)))))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
420 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
421 (define-ccl-program ccl-mule-utf-untrans |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
422 ;; R0 is an untranslatable Unicode code-point (U+3500..U+DFFF or |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
423 ;; U+10000..U+10FFFF) or an invalid byte (#x00..#xFF). Write |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
424 ;; eight-bit-control/graphic sequence (2 to 4 chars) representing |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
425 ;; UTF-8 sequence of r0. Registers r4, r5, r6 are modified. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
426 ;; |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
427 ;; This is a subroutine because we assume that this is called very |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
428 ;; rarely (so we don't have to worry about the overhead of the |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
429 ;; call). |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
430 `(0 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
431 ((r5 = ,(charset-id 'eight-bit-control)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
432 (r6 = ,(charset-id 'eight-bit-graphic)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
433 (if (r0 < #x100) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
434 ((r4 = ((r0 >> 6) | #xC0)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
435 (write-multibyte-character r6 r4)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
436 ((if (r0 < #x10000) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
437 ((r4 = ((r0 >> 12) | #xE0)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
438 (write-multibyte-character r6 r4)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
439 ((r4 = ((r0 >> 18) | #xF0)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
440 (write-multibyte-character r6 r4) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
441 (r4 = (((r0 >> 12) & #x3F) | #x80)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
442 (if (r4 < #xA0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
443 (write-multibyte-character r5 r4) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
444 (write-multibyte-character r6 r4)))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
445 (r4 = (((r0 >> 6) & #x3F) | #x80)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
446 (if (r4 < #xA0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
447 (write-multibyte-character r5 r4) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
448 (write-multibyte-character r6 r4)))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
449 (r4 = ((r0 & #x3F) | #x80)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
450 (if (r4 < #xA0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
451 (write-multibyte-character r5 r4) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
452 (write-multibyte-character r6 r4))))) |
46496 | 453 |
35542 | 454 (define-ccl-program ccl-decode-mule-utf-8 |
455 ;; | |
456 ;; charset | bytes in utf-8 | bytes in emacs | |
457 ;; -----------------------+----------------+--------------- | |
458 ;; ascii | 1 | 1 | |
459 ;; -----------------------+----------------+--------------- | |
460 ;; eight-bit-control | 2 | 2 | |
41873 | 461 ;; eight-bit-graphic | 2 | 1 |
35542 | 462 ;; latin-iso8859-1 | 2 | 2 |
463 ;; -----------------------+----------------+--------------- | |
464 ;; mule-unicode-0100-24ff | 2 | 4 | |
465 ;; (< 0800) | | | |
466 ;; -----------------------+----------------+--------------- | |
467 ;; mule-unicode-0100-24ff | 3 | 4 | |
468 ;; (>= 0800) | | | |
469 ;; mule-unicode-2500-33ff | 3 | 4 | |
470 ;; mule-unicode-e000-ffff | 3 | 4 | |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
471 ;; -----------------------+----------------+--------------- |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
472 ;; invalid byte | 1 | 2 |
35542 | 473 ;; |
474 ;; Thus magnification factor is two. | |
475 ;; | |
476 `(2 | |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
477 ((r6 = ,(charset-id 'latin-iso8859-1)) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
478 (read r0) |
37934
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
479 (loop |
35542 | 480 (if (r0 < #x80) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
481 ;; 1-byte encoding, i.e., ascii |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
482 (write-read-repeat r0)) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
483 (if (r0 < #xc2) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
484 ;; continuation byte (invalid here) or 1st byte of overlong |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
485 ;; 2-byte sequence. |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
486 ((call ccl-mule-utf-untrans) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
487 (r6 = ,(charset-id 'latin-iso8859-1)) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
488 (read r0) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
489 (repeat))) |
35542 | 490 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
491 ;; Read the 2nd byte. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
492 (read r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
493 (if ((r1 & #b11000000) != #b10000000) ; Invalid 2nd byte |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
494 ((call ccl-mule-utf-untrans) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
495 (r6 = ,(charset-id 'latin-iso8859-1)) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
496 ;; Handle it in the next loop. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
497 (r0 = r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
498 (repeat))) |
46496 | 499 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
500 (if (r0 < #xe0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
501 ;; 2-byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
502 ((r1 &= #x3F) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
503 (r1 |= ((r0 & #x1F) << 6)) |
57737
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
504 ;; Now r1 holds scalar value. We don't have to check |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
505 ;; `overlong sequence' because r0 >= 0xC2. |
46496 | 506 |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
507 (if (r1 >= 256) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
508 ;; mule-unicode-0100-24ff (< 0800) |
57737
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
509 ((r0 = r1) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
510 (lookup-integer utf-subst-table-for-decode r0 r1) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
511 (if (r7 == 0) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
512 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
513 (r1 -= #x0100) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
514 (r2 = (((r1 / 96) + 32) << 7)) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
515 (r1 %= 96) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
516 (r1 += (r2 + 32)) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
517 (translate-character |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
518 utf-translation-table-for-decode r0 r1))) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
519 (write-multibyte-character r0 r1) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
520 (read r0) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
521 (repeat)) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
522 (if (r1 >= 160) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
523 ;; latin-iso8859-1 |
57737
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
524 ((r0 = r1) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
525 (lookup-integer utf-subst-table-for-decode r0 r1) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
526 (if (r7 == 0) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
527 ((r1 -= 128) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
528 (write-multibyte-character r6 r1)) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
529 ((write-multibyte-character r0 r1))) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
530 (read r0) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
531 (repeat)) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
532 ;; eight-bit-control |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
533 ((r0 = ,(charset-id 'eight-bit-control)) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
534 (write-multibyte-character r0 r1) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
535 (read r0) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
536 (repeat)))))) |
46496 | 537 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
538 ;; Read the 3rd byte. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
539 (read r2) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
540 (if ((r2 & #b11000000) != #b10000000) ; Invalid 3rd byte |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
541 ((call ccl-mule-utf-untrans) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
542 (r0 = r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
543 (call ccl-mule-utf-untrans) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
544 (r6 = ,(charset-id 'latin-iso8859-1)) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
545 ;; Handle it in the next loop. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
546 (r0 = r2) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
547 (repeat))) |
46496 | 548 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
549 (if (r0 < #xF0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
550 ;; 3byte encoding |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
551 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
552 ((r3 = ((r0 & #xF) << 12)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
553 (r3 |= ((r1 & #x3F) << 6)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
554 (r3 |= (r2 & #x3F)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
555 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
556 (if (r3 < #x800) ; `overlong sequence' |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
557 ((call ccl-mule-utf-untrans) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
558 (r0 = r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
559 (call ccl-mule-utf-untrans) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
560 (r0 = r2) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
561 (call ccl-mule-utf-untrans) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
562 (r6 = ,(charset-id 'latin-iso8859-1)) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
563 (read r0) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
564 (repeat))) |
49598
0d8b17d428b5
Trailing whitepace deleted.
Juanma Barranquero <lekktu@gmail.com>
parents:
49028
diff
changeset
|
565 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
566 (if (r3 < #x2500) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
567 ;; mule-unicode-0100-24ff (>= 0800) |
57737
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
568 ((r0 = r3) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
569 (lookup-integer utf-subst-table-for-decode r0 r1) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
570 (if (r7 == 0) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
571 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
572 (r3 -= #x0100) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
573 (r3 //= 96) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
574 (r1 = (r7 + 32)) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
575 (r1 += ((r3 + 32) << 7)) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
576 (translate-character |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
577 utf-translation-table-for-decode r0 r1))) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
578 (write-multibyte-character r0 r1) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
579 (read r0) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
580 (repeat))) |
49598
0d8b17d428b5
Trailing whitepace deleted.
Juanma Barranquero <lekktu@gmail.com>
parents:
49028
diff
changeset
|
581 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
582 (if (r3 < #x3400) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
583 ;; mule-unicode-2500-33ff |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
584 ((r0 = r3) ; don't zap r3 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
585 (lookup-integer utf-subst-table-for-decode r0 r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
586 (if (r7 == 0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
587 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
588 (r3 -= #x2500) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
589 (r3 //= 96) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
590 (r1 = (r7 + 32)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
591 (r1 += ((r3 + 32) << 7)))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
592 (write-multibyte-character r0 r1) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
593 (read r0) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
594 (repeat))) |
46496 | 595 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
596 (if (r3 < #xE000) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
597 ;; Try to convert to CJK chars, else |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
598 ;; keep them as eight-bit-{control|graphic}. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
599 ((r0 = r3) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
600 (lookup-integer utf-subst-table-for-decode r3 r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
601 (if r7 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
602 ;; got a translation |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
603 ((write-multibyte-character r3 r1) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
604 (read r0) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
605 (repeat)) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
606 ((call ccl-mule-utf-untrans) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
607 (r6 = ,(charset-id 'latin-iso8859-1)) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
608 (read r0) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
609 (repeat))))) |
46496 | 610 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
611 ;; mule-unicode-e000-ffff |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
612 ;; Fixme: fffe and ffff are invalid. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
613 (r0 = r3) ; don't zap r3 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
614 (lookup-integer utf-subst-table-for-decode r0 r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
615 (if (r7 == 0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
616 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
617 (r3 -= #xe000) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
618 (r3 //= 96) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
619 (r1 = (r7 + 32)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
620 (r1 += ((r3 + 32) << 7)))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
621 (write-multibyte-character r0 r1) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
622 (read r0) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
623 (repeat))) |
49598
0d8b17d428b5
Trailing whitepace deleted.
Juanma Barranquero <lekktu@gmail.com>
parents:
49028
diff
changeset
|
624 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
625 ;; Read the 4th byte. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
626 (read r3) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
627 (if ((r3 & #b11000000) != #b10000000) ; Invalid 4th byte |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
628 ((call ccl-mule-utf-untrans) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
629 (r0 = r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
630 (call ccl-mule-utf-untrans) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
631 (r0 = r2) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
632 (call ccl-mule-utf-untrans) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
633 (r6 = ,(charset-id 'latin-iso8859-1)) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
634 ;; Handle it in the next loop. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
635 (r0 = r3) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
636 (repeat))) |
35542 | 637 |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
638 (if (r0 < #xF8) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
639 ;; 4-byte encoding: |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
640 ;; wwwzzzzzzyyyyyyxxxxxx = 11110www 10zzzzzz 10yyyyyy 10xxxxxx |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
641 ;; keep those bytes as eight-bit-{control|graphic} |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
642 ;; Fixme: allow lookup in utf-subst-table-for-decode. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
643 ((r4 = ((r0 & #x7) << 18)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
644 (r4 |= ((r1 & #x3F) << 12)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
645 (r4 |= ((r2 & #x3F) << 6)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
646 (r4 |= (r3 & #x3F)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
647 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
648 (if (r4 < #x10000) ; `overlong sequence' |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
649 ((call ccl-mule-utf-untrans) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
650 (r0 = r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
651 (call ccl-mule-utf-untrans) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
652 (r0 = r2) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
653 (call ccl-mule-utf-untrans) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
654 (r0 = r3) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
655 (call ccl-mule-utf-untrans)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
656 ((r0 = r4) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
657 (call ccl-mule-utf-untrans)))) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
658 |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
659 ;; Unsupported sequence. |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
660 ((call ccl-mule-utf-untrans) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
661 (r0 = r1) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
662 (call ccl-mule-utf-untrans) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
663 (r0 = r2) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
664 (call ccl-mule-utf-untrans) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
665 (r0 = r3) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
666 (call ccl-mule-utf-untrans))) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
667 (r6 = ,(charset-id 'latin-iso8859-1)) |
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
668 (read r0) |
50085
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
669 (repeat))) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
670 |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
671 |
50085
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
672 ;; At EOF... |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
673 (if (r0 >= 0) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
674 ;; r0 >= #x80 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
675 ((call ccl-mule-utf-untrans) |
50085
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
676 (if (r1 >= 0) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
677 ((r0 = r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
678 (call ccl-mule-utf-untrans) |
50085
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
679 (if (r2 >= 0) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
680 ((r0 = r2) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
681 (call ccl-mule-utf-untrans) |
50085
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
682 (if (r3 >= 0) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
683 ((r0 = r3) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
684 (call ccl-mule-utf-untrans)))))))))) |
35542 | 685 |
36243 | 686 "CCL program to decode UTF-8. |
36465 | 687 Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
688 mule-unicode-*, but see also `utf-fragmentation-table' and |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
689 `ucs-mule-cjk-to-unicode'. |
46496 | 690 Encodings of un-representable Unicode characters are decoded asis into |
691 eight-bit-control and eight-bit-graphic characters.") | |
35542 | 692 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
693 (define-ccl-program ccl-mule-utf-8-encode-untrans |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
694 ;; The UTF-8 decoder generates a UTF-8 sequence represented by a |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
695 ;; sequence of eight-bit-control/graphic chars for an untranslatable |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
696 ;; character and an invalid byte. |
56562
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
697 ;; |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
698 ;; This CCL parses that sequence (the first byte is already in r1), |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
699 ;; writes out the original bytes of that sequence, and sets r5 to |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
700 ;; -1. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
701 ;; |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
702 ;; If the eight-bit-control/graphic sequence is shorter than what r1 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
703 ;; suggests, it sets r5 and r6 to the last character read that |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
704 ;; should be handled by the next loop of a caller. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
705 ;; |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
706 ;; Note: For UTF-8 validation, we only check if a character is |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
707 ;; eight-bit-control/graphic or not. It may result in incorrect |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
708 ;; handling of random binary data, but such data can't be encoded |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
709 ;; by UTF-8 anyway. At least, UTF-8 decoders don't generate such |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
710 ;; a sequence even if a source contains an invalid byte sequence. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
711 `(0 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
712 (;; Read the 2nd byte. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
713 (read-multibyte-character r5 r6) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
714 (r0 = (r5 != ,(charset-id 'eight-bit-control))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
715 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) |
56562
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
716 ((write r1) ; invalid UTF-8 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
717 (r1 = -1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
718 (end))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
719 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
720 (if (r1 <= #xC3) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
721 ;; 2-byte sequence for an originally invalid byte. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
722 ((r6 &= #x3F) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
723 (r6 |= ((r1 & #x1F) << 6)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
724 (write r6) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
725 (r5 = -1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
726 (end))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
727 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
728 (write r1 r6) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
729 (r2 = r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
730 (r1 = -1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
731 ;; Read the 3rd byte. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
732 (read-multibyte-character r5 r6) |
56562
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
733 (r0 = (r5 != ,(charset-id 'eight-bit-control))) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
734 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
735 (end)) ; invalid UTF-8 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
736 (write r6) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
737 (if (r2 < #xF0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
738 ;; 3-byte sequence for an untranslated character. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
739 ((r5 = -1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
740 (end))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
741 ;; Read the 4th byte. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
742 (read-multibyte-character r5 r6) |
56562
9274a15c1400
(utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents:
56095
diff
changeset
|
743 (r0 = (r5 != ,(charset-id 'eight-bit-control))) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
744 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
745 (end)) ; invalid UTF-8 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
746 ;; 4-byte sequence for an untranslated character. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
747 (write r6) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
748 (r5 = -1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
749 (end)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
750 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
751 ;; At EOF... |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
752 ((r5 = -1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
753 (if (r1 >= 0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
754 (write r1))))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
755 |
35542 | 756 (define-ccl-program ccl-encode-mule-utf-8 |
757 `(1 | |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
758 ((r5 = -1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
759 (loop |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
760 (if (r5 < 0) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
761 (read-multibyte-character r0 r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
762 ;; Pre-read character is in r5 (charset-ID) and r6 (code-point). |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
763 ((r0 = r5) |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
764 (r1 = r6) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
765 (r5 = -1))) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
766 (translate-character utf-translation-table-for-encode r0 r1) |
35542 | 767 |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
768 (if (r0 == ,(charset-id 'ascii)) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
769 (write-repeat r1)) |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
770 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
771 (if (r0 == ,(charset-id 'latin-iso8859-1)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
772 ;; r1 scalar utf-8 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
773 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
774 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
775 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111 |
56095
4ec2da03a87c
(ccl-encode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56056
diff
changeset
|
776 ((write ((r1 >> 6) | #xc2)) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
777 (r1 &= #x3f) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
778 (r1 |= #x80) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
779 (write-repeat r1))) |
35542 | 780 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
781 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
782 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
783 ;; #x3f80 == (0011 1111 1000 0000)b |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
784 (r1 &= #x7f) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
785 (r1 += (r0 + 224)) ; 224 == -32 + #x0100 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
786 ;; now r1 holds scalar value |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
787 (if (r1 < #x0800) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
788 ;; 2byte encoding |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
789 ((write ((r1 >> 6) | #xC0)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
790 (r1 &= #x3F) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
791 (r1 |= #x80) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
792 (write-repeat r1)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
793 ;; 3byte encoding |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
794 ((write ((r1 >> 12) | #xE0)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
795 (write (((r1 & #x0FC0) >> 6) | #x80)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
796 (r1 &= #x3F) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
797 (r1 |= #x80) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
798 (write-repeat r1))))) |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
799 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
800 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
801 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
802 (r1 &= #x7f) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
803 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
804 ;; now r1 holds scalar value |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
805 (write ((r1 >> 12) | #xE0)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
806 (write (((r1 & #x0FC0) >> 6) | #x80)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
807 (r1 &= #x3F) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
808 (r1 |= #x80) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
809 (write-repeat r1))) |
35542 | 810 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
811 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
812 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
813 (r1 &= #x7f) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
814 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
815 ;; now r1 holds scalar value |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
816 (write ((r1 >> 12) | #xE0)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
817 (write (((r1 & #x0FC0) >> 6) | #x80)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
818 (r1 &= #x3F) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
819 (r1 |= #x80) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
820 (write-repeat r1))) |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
821 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
822 (if (r0 == ,(charset-id 'eight-bit-control)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
823 ;; r1 scalar utf-8 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
824 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
825 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
826 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
827 ((write #xC2) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
828 (write-repeat r1))) |
35542 | 829 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
830 (if (r0 == ,(charset-id 'eight-bit-graphic)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
831 ;; r1 scalar utf-8 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
832 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
833 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
834 ;; ff 0000 0000 1111 1111 1101 1111 1011 1111 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
835 ((r0 = (r1 >= #xC0)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
836 (r0 &= (r1 <= #xC3)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
837 (r4 = (r1 >= #xE1)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
838 (r4 &= (r1 <= #xF7)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
839 (r0 |= r4) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
840 (if r0 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
841 ((call ccl-mule-utf-8-encode-untrans) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
842 (repeat)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
843 (write-repeat r1)))) |
35542 | 844 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
845 (lookup-character utf-subst-table-for-encode r0 r1) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
846 (if r7 ; lookup succeeded |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
847 (if (r0 < #x800) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
848 ;; 2byte encoding |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
849 ((write ((r0 >> 6) | #xC0)) |
56095
4ec2da03a87c
(ccl-encode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56056
diff
changeset
|
850 (r0 = ((r0 & #x3F) | #x80)) |
4ec2da03a87c
(ccl-encode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56056
diff
changeset
|
851 (write-repeat r0)) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
852 ;; 3byte encoding |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
853 ((write ((r0 >> 12) | #xE0)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
854 (write (((r0 & #x0FC0) >> 6) | #x80)) |
56095
4ec2da03a87c
(ccl-encode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56056
diff
changeset
|
855 (r0 = ((r0 & #x3F) | #x80)) |
4ec2da03a87c
(ccl-encode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56056
diff
changeset
|
856 (write-repeat r0)))) |
35542 | 857 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
858 ;; Unsupported character. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
859 ;; Output U+FFFD, which is `ef bf bd' in UTF-8. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
860 (write #xef) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
861 (write #xbf) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
862 (write-repeat #xbd)))) |
46496 | 863 "CCL program to encode into UTF-8.") |
35542 | 864 |
41873 | 865 |
46496 | 866 (define-ccl-program ccl-untranslated-to-ucs |
867 `(0 | |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
868 (if (r1 == 0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
869 nil |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
870 (if (r0 <= #xC3) ; 2-byte encoding |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
871 ((r0 = ((r0 & #x3) << 6)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
872 (r0 |= (r1 & #x3F)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
873 (r1 = 2)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
874 (if (r2 == 0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
875 (r1 = 0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
876 (if (r0 < #xF0) ; 3-byte encoding, as above |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
877 ((r0 = ((r0 & #xF) << 12)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
878 (r0 |= ((r1 & #x3F) << 6)) |
56056
4575a565f45d
(ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents:
56037
diff
changeset
|
879 (r0 |= (r2 & #x3F)) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
880 (r1 = 3)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
881 (if (r3 == 0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
882 (r1 = 0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
883 ((r0 = ((r0 & #x7) << 18)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
884 (r0 |= ((r1 & #x3F) << 12)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
885 (r0 |= ((r2 & #x3F) << 6)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
886 (r0 |= (r3 & #x3F)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
887 (r1 = 4)))))))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
888 "Decode 2-, 3-, or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
889 Set r1 to the byte length, or to 0 for an invalid sequence.") |
46496 | 890 |
891 (defvar utf-8-ccl-regs (make-vector 8 0)) | |
892 | |
41873 | 893 (defsubst utf-8-untranslated-to-ucs () |
46496 | 894 "Return the UCS code for an untranslated sequence of raw bytes at point.
895 Only for 3- or 4-byte sequences." | |
896 (aset utf-8-ccl-regs 0 (or (char-after) 0)) | |
897 (aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0)) | |
898 (aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0)) | |
899 (aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0)) | |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
900 (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs)) |
41873 | 901 |
902 (defun utf-8-help-echo (window object position) | |
903 (format "Untranslated Unicode U+%04X" | |
904 (get-char-property position 'untranslated-utf-8 object))) | |
905 | |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
906 ;; We compose the untranslatable sequences into a single character, |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
907 ;; and move point to the next character. |
41873 | 908 ;; This is infelicitous for editing, because there's currently no |
909 ;; mechanism for treating compositions as atomic, but is OK for | |
46496 | 910 ;; display. They are composed to U+FFFD with help-echo which |
911 ;; indicates the unicodes they represent. This function GCs too much. | |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
912 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
913 ;; If utf-translate-cjk-mode is non-nil, this function is called with |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
914 ;; HASH-TABLE which translates CJK characters into some of the CJK |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
915 ;; charsets. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
916 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
917 (defsubst utf-8-compose (hash-table) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
918 "Put a suitable composition on an untranslatable sequence at point. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
919 If HASH-TABLE is non-nil, try to translate CJK characters by it at first. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
920 Move point to the end of the sequence." |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
921 (utf-8-untranslated-to-ucs) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
922 (let ((l (aref utf-8-ccl-regs 1)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
923 ch) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
924 (if (> l 0) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
925 (if (and hash-table |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
926 (setq ch (gethash (aref utf-8-ccl-regs 0) hash-table))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
927 (progn |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
928 (insert ch) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
929 (delete-region (point) (min (point-max) (+ l (point))))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
930 (setq ch (aref utf-8-ccl-regs 0)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
931 (put-text-property (point) (min (point-max) (+ l (point))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
932 'untranslated-utf-8 ch) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
933 (put-text-property (point) (min (point-max) (+ l (point))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
934 'help-echo 'utf-8-help-echo) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
935 (if (= l 2) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
936 (put-text-property (point) (min (point-max) (+ l (point))) |
66062
b23c01e98a4b
(utf-8-compose): Display an invalid UTF-8 byte with `escape-glyph'
Kenichi Handa <handa@m17n.org>
parents:
64085
diff
changeset
|
937 'display (propertize (format "\\%03o" ch) |
b23c01e98a4b
(utf-8-compose): Display an invalid UTF-8 byte with `escape-glyph'
Kenichi Handa <handa@m17n.org>
parents:
64085
diff
changeset
|
938 'face 'escape-glyph)) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
939 (compose-region (point) (+ l (point)) ?$,3u=(B)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
940 (forward-char l)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
941 (forward-char 1)))) |
41873 | 942 |
943 (defcustom utf-8-compose-scripts nil | |
46496 | 944 "*Non-nil means compose various scripts on decoding utf-8 text." |
41873 | 945 :group 'mule |
59996
aac0a33f5772
Change release version from 21.4 to 22.1 throughout.
Kim F. Storm <storm@cua.dk>
parents:
59096
diff
changeset
|
946 :version "22.1" |
46496 | 947 :type 'boolean) |
41873 | 948 |
949 (defun utf-8-post-read-conversion (length) | |
950 "Compose untranslated utf-8 sequences into single characters. | |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
951 If `utf-translate-cjk-mode' is non-nil, tries to translate CJK characters. |
41873 | 952 Also compose particular scripts if `utf-8-compose-scripts' is non-nil." |
953 (save-excursion | |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
954 (save-restriction |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
955 (narrow-to-region (point) (+ (point) length)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
956 ;; Can't do eval-when-compile to insert a multibyte constant |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
957 ;; version of the string in the loop, since it's always loaded as |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
958 ;; unibyte from a byte-compiled file. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
959 (let ((range (string-as-multibyte "^\xc0-\xc3\xe1-\xf7")) |
56800
752ef76fcc08
(utf-8-post-read-conversion): If the
Kenichi Handa <handa@m17n.org>
parents:
56562
diff
changeset
|
960 (buffer-multibyte enable-multibyte-characters) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
961 hash-table ch) |
56800
752ef76fcc08
(utf-8-post-read-conversion): If the
Kenichi Handa <handa@m17n.org>
parents:
56562
diff
changeset
|
962 (set-buffer-multibyte t) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
963 (when utf-translate-cjk-mode |
57727
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
964 (unless utf-translate-cjk-lang-env |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
965 ;; Check these characters in utf-translate-cjk-range. |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
966 ;; We may have to translate them to CJK charsets. |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
967 (skip-chars-forward |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
968 (concat range utf-translate-cjk-unicode-range-string)) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
969 (unless (eobp) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
970 (utf-translate-cjk-load-tables) |
c3945be39e09
(utf-translate-cjk-unicode-range-string):
Kenichi Handa <handa@m17n.org>
parents:
56800
diff
changeset
|
971 (setq range |
57737
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
972 (concat range utf-translate-cjk-unicode-range-string))) |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
973 (setq hash-table (get 'utf-subst-table-for-decode |
e425df7605c9
(ccl-decode-mule-utf-8): Check utf-subst-table-for-decode for more
Kenichi Handa <handa@m17n.org>
parents:
57727
diff
changeset
|
974 'translation-hash-table)))) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
975 (while (and (skip-chars-forward range) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
976 (not (eobp))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
977 (setq ch (following-char)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
978 (if (< ch 256) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
979 (utf-8-compose hash-table) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
980 (if (and hash-table |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
981 (setq ch (gethash (encode-char ch 'ucs) hash-table))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
982 (progn |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
983 (insert ch) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
984 (delete-char 1)) |
56800
752ef76fcc08
(utf-8-post-read-conversion): If the
Kenichi Handa <handa@m17n.org>
parents:
56562
diff
changeset
|
985 (forward-char 1)))) |
752ef76fcc08
(utf-8-post-read-conversion): If the
Kenichi Handa <handa@m17n.org>
parents:
56562
diff
changeset
|
986 (or buffer-multibyte |
752ef76fcc08
(utf-8-post-read-conversion): If the
Kenichi Handa <handa@m17n.org>
parents:
56562
diff
changeset
|
987 (set-buffer-multibyte nil))) |
41873 | 988 |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
989 (when (and utf-8-compose-scripts (> length 1)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
990 ;; These currently have definitions which cover the relevant |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
991 ;; unicodes. We could avoid loading thai-util &c by checking |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
992 ;; whether the region contains any characters with the appropriate |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
993 ;; categories. There aren't yet Unicode-based rules for Tibetan. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
994 (diacritic-compose-region (point-max) (point-min)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
995 (thai-compose-region (point-max) (point-min)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
996 (lao-compose-region (point-max) (point-min)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
997 (devanagari-compose-region (point-max) (point-min)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
998 (malayalam-compose-region (point-max) (point-min)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
999 (tamil-compose-region (point-max) (point-min))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
1000 (- (point-max) (point-min))))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
1001 |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
1002 (defun utf-8-pre-write-conversion (beg end) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
1003 "Prepare for `utf-translate-cjk-mode' to encode text between BEG and END. |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
1004 This is used as a pre-write-conversion of the utf-8 coding system." |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
1005 (if (and utf-translate-cjk-mode |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
1006 (not utf-translate-cjk-lang-env) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
1007 (save-excursion |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
1008 (goto-char beg) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
1009 (re-search-forward "\\cc\\|\\cj\\|\\ch" end t))) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
1010 (utf-translate-cjk-load-tables)) |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
1011 nil) |
41873 | 1012 |
35542 | 1013 (make-coding-system |
1014 'mule-utf-8 4 ?u | |
1015 "UTF-8 encoding for Emacs-supported Unicode characters. | |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1016 It supports Unicode characters of these ranges: |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1017 U+0000..U+33FF, U+E000..U+FFFF. |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1018 They correspond to these Emacs character sets: |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1019 ascii, latin-iso8859-1, mule-unicode-0100-24ff, |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1020 mule-unicode-2500-33ff, mule-unicode-e000-ffff |
35542 | 1021 |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1022 On decoding (e.g. reading a file), Unicode characters not in the above |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1023 ranges are decoded into sequences of eight-bit-control and |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1024 eight-bit-graphic characters to preserve their byte sequences. The |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1025 byte sequence is preserved on i/o for valid utf-8, but not necessarily |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1026 for invalid utf-8. |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1027 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1028 On encoding (e.g. writing a file), Emacs characters not belonging to |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1029 any of the character sets listed above are encoded into the UTF-8 byte |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1030 sequence representing U+FFFD (REPLACEMENT CHARACTER)." |
35542 | 1031 |
1032 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) | |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
1033 `((safe-charsets |
35542 | 1034 ascii |
1035 eight-bit-control | |
1036 eight-bit-graphic | |
1037 latin-iso8859-1 | |
1038 mule-unicode-0100-24ff | |
1039 mule-unicode-2500-33ff | |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
1040 mule-unicode-e000-ffff |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
1041 ,@(if utf-translate-cjk-mode |
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
1042 utf-translate-cjk-charsets)) |
36371
f6bb3ed752b4
(mule-utf-8): Set correct value for valid-codes property.
Kenichi Handa <handa@m17n.org>
parents:
36243
diff
changeset
|
1043 (mime-charset . utf-8) |
36423
aa776838b660
(mule-utf-8): Set coding-category property to coding-category-utf-8.
Kenichi Handa <handa@m17n.org>
parents:
36371
diff
changeset
|
1044 (coding-category . coding-category-utf-8) |
41873 | 1045 (valid-codes (0 . 255)) |
56037
81dbb510a1db
(utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents:
55437
diff
changeset
|
1046 (pre-write-conversion . utf-8-pre-write-conversion) |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1047 (post-read-conversion . utf-8-post-read-conversion) |
50766
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
1048 (translation-table-for-encode . utf-translation-table-for-encode) |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1049 (dependency unify-8859-on-encoding-mode |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1050 unify-8859-on-decoding-mode |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1051 utf-fragment-on-decoding |
55437
6e677a935fe9
Fix references to utf-translate-cjk into utf-translate-cjk-mode.
Andreas Schwab <schwab@suse.de>
parents:
54304
diff
changeset
|
1052 utf-translate-cjk-mode))) |
35542 | 1053 |
1054 (define-coding-system-alias 'utf-8 'mule-utf-8) | |
38436
b174db545cfd
Some fixes to follow coding conventions.
Pavel Janík <Pavel@Janik.cz>
parents:
37934
diff
changeset
|
1055 |
41873 | 1056 ;; I think this needs special private charsets defined for the |
1057 ;; untranslated sequences, if it's going to work well. | |
1058 | |
1059 ;;; (defun utf-8-compose-function (pos to pattern &optional string) | |
1060 ;;; (let* ((prop (get-char-property pos 'composition string)) | |
1061 ;;; (l (and prop (- (cadr prop) (car prop))))) | |
1062 ;;; (cond ((and l (> l (- to pos))) | |
1063 ;;; (delete-region pos to)) | |
1064 ;;; ((and (> (char-after pos) 224) | |
1065 ;;; (< (char-after pos) 256) | |
1066 ;;; (save-restriction | |
1067 ;;; (narrow-to-region pos to) | |
1068 ;;; (utf-8-compose))) | |
1069 ;;; t)))) | |
1070 | |
1071 ;;; (dotimes (i 96) | |
1072 ;;; (aset composition-function-table | |
1073 ;;; (+ 128 i) | |
1074 ;;; `((,(string-as-multibyte "[\200-\237\240-\377]") | |
1075 ;;; . utf-8-compose-function)))) | |
1076 | |
52401 | 1077 ;;; arch-tag: b08735b7-753b-4ae6-b754-0f3efe4515c5 |
38436
b174db545cfd
Some fixes to follow coding conventions.
Pavel Janík <Pavel@Janik.cz>
parents:
37934
diff
changeset
|
1078 ;;; utf-8.el ends here |