Mercurial > emacs
annotate lisp/international/utf-8.el @ 88250:61c503e4cee5
(rmail-require-mime-maybe): Simplified.
(rmail-mode-map): Rewritten using a local binding to agree
with "(elisp)Tips for Defining".
author | Alex Schroeder <alex@gnu.org> |
---|---|
date | Sat, 21 Jan 2006 01:30:30 +0000 |
parents | d7ddb3e565de |
children |
rev | line source |
---|---|
46496 | 1 ;;; utf-8.el --- UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*- |
35542 | 2 |
88155 | 3 ;; Copyright (C) 2001, 2002, 2003, 2004 Free Software Foundation, Inc. |
4 ;; Copyright (C) 2001, 2002, 2003, 2004 | |
5 ;; National Institute of Advanced Industrial Science and Technology (AIST) | |
6 ;; Registration Number H14PRO021 | |
35542 | 7 |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
8 ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org> |
46496 | 9 ;; Maintainer: FSF |
36243 | 10 ;; Keywords: multilingual, Unicode, UTF-8, i18n |
35542 | 11 |
12 ;; This file is part of GNU Emacs. | |
13 | |
14 ;; GNU Emacs is free software; you can redistribute it and/or modify | |
15 ;; it under the terms of the GNU General Public License as published by | |
16 ;; the Free Software Foundation; either version 2, or (at your option) | |
17 ;; any later version. | |
18 | |
19 ;; GNU Emacs is distributed in the hope that it will be useful, | |
20 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
21 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
22 ;; GNU General Public License for more details. | |
23 | |
24 ;; You should have received a copy of the GNU General Public License | |
25 ;; along with GNU Emacs; see the file COPYING. If not, write to the | |
88155 | 26 ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, |
27 ;; Boston, MA 02110-1301, USA. | |
35542 | 28 |
29 ;;; Commentary: | |
30 | |
41873 | 31 ;; The coding-system `mule-utf-8' basically supports encoding/decoding |
32 ;; of the following character sets to and from UTF-8: | |
35542 | 33 ;; |
34 ;; ascii | |
35 ;; eight-bit-control | |
36 ;; latin-iso8859-1 | |
37 ;; mule-unicode-0100-24ff | |
38 ;; mule-unicode-2500-33ff | |
39 ;; mule-unicode-e000-ffff | |
40 ;; | |
36243 | 41 ;; On decoding, Unicode characters that do not fit into the above |
42 ;; character sets are handled as `eight-bit-control' or | |
43 ;; `eight-bit-graphic' characters to retain the information about the | |
46496 | 44 ;; original byte sequence and text properties record the corresponding |
45 ;; unicode. | |
46 ;; | |
47 ;; Fixme: note that reading and writing invalid utf-8 may not be | |
48 ;; idempotent -- to represent the bytes to fix that needs a new charset. | |
41873 | 49 ;; |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
50 ;; Characters from other character sets can be encoded with mule-utf-8 |
48848 | 51 ;; by populating the translation table |
88155 | 52 ;; `utf-translation-table-for-encode'. Hash tables |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
53 ;; `utf-subst-table-for-decode' and `utf-subst-table-for-encode' are |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
54 ;; used to support encoding and decoding of about a quarter of the CJK |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
55 ;; space between U+3400 and U+DFFF. |
36243 | 56 |
88155 | 57 ;; UTF-8 is defined in RFC 3629. A sketch of the encoding is: |
35542 | 58 |
59 ;; scalar | utf-8 | |
60 ;; value | 1st byte | 2nd byte | 3rd byte | |
61 ;; --------------------+-----------+-----------+---------- | |
62 ;; 0000 0000 0xxx xxxx | 0xxx xxxx | | | |
63 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx | | |
64 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx | |
65 | |
66 ;;; Code: | |
67 | |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
;; Translation char-table used on encoding.  It is created empty here;
;; nothing in this part of the file fills it.
(defvar ucs-mule-to-mule-unicode (make-char-table 'translation-table nil)
  "Char table mapping characters to latin-iso8859-1 or mule-unicode-*.

If `unify-8859-on-encoding-mode' is non-nil, this table populates the
translation-table named `utf-translation-table-for-encode'.")

;; Register the named translation table with no mappings given.
(define-translation-table 'utf-translation-table-for-encode)
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
75 |
46496 | 76 |
77 ;; Map Cyrillic and Greek to iso-8859 charsets, which take half the | |
78 ;; space of mule-unicode. For Latin scripts this isn't very | |
79 ;; important. Hebrew and Arabic might go here too when there's proper | |
80 ;; support for them. | |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
81 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
;; Forward table: mule-unicode-* character -> iso-8859-* character.
;; Created empty here; the `mapc' form further down fills it.
(defvar utf-fragmentation-table (make-char-table 'translation-table nil)
  "Char-table normally mapping non-Latin mule-unicode-* chars to iso-8859-*.

If `utf-fragment-on-decoding' is non-nil, this table populates the
translation-table named `utf-translation-table-for-decode'.")

;; Reverse table: iso-8859-* character -> mule-unicode-* character.
;; Filled by the same `mapc' form as `utf-fragmentation-table'.
(defvar utf-defragmentation-table (make-char-table 'translation-table nil)
  "Char-table for reverse mapping of `utf-fragmentation-table'.

If `utf-fragment-on-decoding' is non-nil and
`unify-8859-on-encoding-mode' is nil, this table populates the
translation-table named `utf-translation-table-for-encode'.")

;; Register the named decode translation table with no mappings given.
(define-translation-table 'utf-translation-table-for-decode)
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
96 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
97 |
;; Both CJK tables start out as empty default-sized hash tables.
;; `utf-translate-cjk-load-tables' replaces them with larger ones when
;; the tables are actually loaded.
(defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq)
  "Hash table mapping Emacs CJK character sets to Unicode code points.

If `utf-translate-cjk-mode' is non-nil, this table populates the
translation-hash-table named `utf-subst-table-for-encode'.")

;; Register the encode-side hash table under its symbolic name.
(define-translation-hash-table 'utf-subst-table-for-encode
  ucs-mule-cjk-to-unicode)

(defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq)
  "Hash table mapping Unicode code points to Emacs CJK character sets.

If `utf-translate-cjk-mode' is non-nil, this table populates the
translation-hash-table named `utf-subst-table-for-decode'.")

;; Register the decode-side hash table under its symbolic name.
(define-translation-hash-table 'utf-subst-table-for-decode
  ucs-unicode-to-mule-cjk)
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
115 |
46496 | 116 (mapc |
117 (lambda (pair) | |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
118 (aset utf-fragmentation-table (car pair) (cdr pair)) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
119 (aset utf-defragmentation-table (cdr pair) (car pair))) |
46496 | 120 '((?$,1&d(B . ?,F4(B) (?$,1&e(B . ?,F5(B) (?$,1&f(B . ?,F6(B) (?$,1&h(B . ?,F8(B) (?$,1&i(B . ?,F9(B) |
121 (?$,1&j(B . ?,F:(B) (?$,1&l(B . ?,F<(B) (?$,1&n(B . ?,F>(B) (?$,1&o(B . ?,F?(B) (?$,1&p(B . ?,F@(B) | |
122 (?$,1&q(B . ?,FA(B) (?$,1&r(B . ?,FB(B) (?$,1&s(B . ?,FC(B) (?$,1&t(B . ?,FD(B) (?$,1&u(B . ?,FE(B) | |
123 (?$,1&v(B . ?,FF(B) (?$,1&w(B . ?,FG(B) (?$,1&x(B . ?,FH(B) (?$,1&y(B . ?,FI(B) (?$,1&z(B . ?,FJ(B) | |
124 (?$,1&{(B . ?,FK(B) (?$,1&|(B . ?,FL(B) (?$,1&}(B . ?,FM(B) (?$,1&~(B . ?,FN(B) (?$,1&(B . ?,FO(B) | |
125 (?$,1' (B . ?,FP(B) (?$,1'!(B . ?,FQ(B) (?$,1'#(B . ?,FS(B) (?$,1'$(B . ?,FT(B) (?$,1'%(B . ?,FU(B) | |
126 (?$,1'&(B . ?,FV(B) (?$,1''(B . ?,FW(B) (?$,1'((B . ?,FX(B) (?$,1')(B . ?,FY(B) (?$,1'*(B . ?,FZ(B) | |
127 (?$,1'+(B . ?,F[(B) (?$,1',(B . ?,F\(B) (?$,1'-(B . ?,F](B) (?$,1'.(B . ?,F^(B) (?$,1'/(B . ?,F_(B) | |
128 (?$,1'0(B . ?,F`(B) (?$,1'1(B . ?,Fa(B) (?$,1'2(B . ?,Fb(B) (?$,1'3(B . ?,Fc(B) (?$,1'4(B . ?,Fd(B) | |
129 (?$,1'5(B . ?,Fe(B) (?$,1'6(B . ?,Ff(B) (?$,1'7(B . ?,Fg(B) (?$,1'8(B . ?,Fh(B) (?$,1'9(B . ?,Fi(B) | |
130 (?$,1':(B . ?,Fj(B) (?$,1';(B . ?,Fk(B) (?$,1'<(B . ?,Fl(B) (?$,1'=(B . ?,Fm(B) (?$,1'>(B . ?,Fn(B) | |
131 (?$,1'?(B . ?,Fo(B) (?$,1'@(B . ?,Fp(B) (?$,1'A(B . ?,Fq(B) (?$,1'B(B . ?,Fr(B) (?$,1'C(B . ?,Fs(B) | |
132 (?$,1'D(B . ?,Ft(B) (?$,1'E(B . ?,Fu(B) (?$,1'F(B . ?,Fv(B) (?$,1'G(B . ?,Fw(B) (?$,1'H(B . ?,Fx(B) | |
133 (?$,1'I(B . ?,Fy(B) (?$,1'J(B . ?,Fz(B) (?$,1'K(B . ?,F{(B) (?$,1'L(B . ?,F|(B) (?$,1'M(B . ?,F}(B) | |
134 (?$,1'N(B . ?,F~(B) | |
135 | |
136 (?$,1(!(B . ?,L!(B) (?$,1("(B . ?,L"(B) (?$,1(#(B . ?,L#(B) (?$,1($(B . ?,L$(B) | |
137 (?$,1(%(B . ?,L%(B) (?$,1(&(B . ?,L&(B) (?$,1('(B . ?,L'(B) (?$,1(((B . ?,L((B) (?$,1()(B . ?,L)(B) | |
138 (?$,1(*(B . ?,L*(B) (?$,1(+(B . ?,L+(B) (?$,1(,(B . ?,L,(B) (?$,1(.(B . ?,L.(B) (?$,1(/(B . ?,L/(B) | |
139 (?$,1(0(B . ?,L0(B) (?$,1(1(B . ?,L1(B) (?$,1(2(B . ?,L2(B) (?$,1(3(B . ?,L3(B) (?$,1(4(B . ?,L4(B) | |
140 (?$,1(5(B . ?,L5(B) (?$,1(6(B . ?,L6(B) (?$,1(7(B . ?,L7(B) (?$,1(8(B . ?,L8(B) (?$,1(9(B . ?,L9(B) | |
141 (?$,1(:(B . ?,L:(B) (?$,1(;(B . ?,L;(B) (?$,1(<(B . ?,L<(B) (?$,1(=(B . ?,L=(B) (?$,1(>(B . ?,L>(B) | |
142 (?$,1(?(B . ?,L?(B) (?$,1(@(B . ?,L@(B) (?$,1(A(B . ?,LA(B) (?$,1(B(B . ?,LB(B) (?$,1(C(B . ?,LC(B) | |
143 (?$,1(D(B . ?,LD(B) (?$,1(E(B . ?,LE(B) (?$,1(F(B . ?,LF(B) (?$,1(G(B . ?,LG(B) (?$,1(H(B . ?,LH(B) | |
144 (?$,1(I(B . ?,LI(B) (?$,1(J(B . ?,LJ(B) (?$,1(K(B . ?,LK(B) (?$,1(L(B . ?,LL(B) (?$,1(M(B . ?,LM(B) | |
145 (?$,1(N(B . ?,LN(B) (?$,1(O(B . ?,LO(B) (?$,1(P(B . ?,LP(B) (?$,1(Q(B . ?,LQ(B) (?$,1(R(B . ?,LR(B) | |
146 (?$,1(S(B . ?,LS(B) (?$,1(T(B . ?,LT(B) (?$,1(U(B . ?,LU(B) (?$,1(V(B . ?,LV(B) (?$,1(W(B . ?,LW(B) | |
147 (?$,1(X(B . ?,LX(B) (?$,1(Y(B . ?,LY(B) (?$,1(Z(B . ?,LZ(B) (?$,1([(B . ?,L[(B) (?$,1(\(B . ?,L\(B) | |
148 (?$,1(](B . ?,L](B) (?$,1(^(B . ?,L^(B) (?$,1(_(B . ?,L_(B) (?$,1(`(B . ?,L`(B) (?$,1(a(B . ?,La(B) | |
149 (?$,1(b(B . ?,Lb(B) (?$,1(c(B . ?,Lc(B) (?$,1(d(B . ?,Ld(B) (?$,1(e(B . ?,Le(B) (?$,1(f(B . ?,Lf(B) | |
150 (?$,1(g(B . ?,Lg(B) (?$,1(h(B . ?,Lh(B) (?$,1(i(B . ?,Li(B) (?$,1(j(B . ?,Lj(B) (?$,1(k(B . ?,Lk(B) | |
151 (?$,1(l(B . ?,Ll(B) (?$,1(m(B . ?,Lm(B) (?$,1(n(B . ?,Ln(B) (?$,1(o(B . ?,Lo(B) (?$,1(q(B . ?,Lq(B) | |
152 (?$,1(r(B . ?,Lr(B) (?$,1(s(B . ?,Ls(B) (?$,1(t(B . ?,Lt(B) (?$,1(u(B . ?,Lu(B) (?$,1(v(B . ?,Lv(B) | |
153 (?$,1(w(B . ?,Lw(B) (?$,1(x(B . ?,Lx(B) (?$,1(y(B . ?,Ly(B) (?$,1(z(B . ?,Lz(B) (?$,1({(B . ?,L{(B) | |
154 (?$,1(|(B . ?,L|(B) (?$,1(~(B . ?,L~(B) (?$,1((B . ?,L(B))) | |
155 | |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
156 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
(defcustom utf-fragment-on-decoding nil
  "Whether or not to decode some chars in UTF-8/16 text into iso8859 charsets.
Setting this means that the relevant Cyrillic and Greek characters are
decoded into the iso8859 charsets rather than into
mule-unicode-0100-24ff. The iso8859 charsets take half as much space
in the buffer, but using them may affect how the buffer can be re-encoded
and may require a different input method to search for them, for instance.
See `unify-8859-on-decoding-mode' and `unify-8859-on-encoding-mode'
for mechanisms to make this largely transparent.

Setting this variable outside customize has no effect."
  :set (lambda (symbol value)
         ;; UNIFIED is non-nil when the encode table currently is
         ;; `ucs-mule-to-mule-unicode'; in that case the encode table
         ;; is left alone whichever way the option is set.
         (let ((unified (eq (get 'utf-translation-table-for-encode
                                 'translation-table)
                            ucs-mule-to-mule-unicode)))
           (cond
            (value
             (define-translation-table 'utf-translation-table-for-decode
               utf-fragmentation-table)
             ;; Even if unify-8859-on-encoding-mode is off, make
             ;; mule-utf-* encode characters in
             ;; utf-fragmentation-table.
             (or unified
                 (define-translation-table 'utf-translation-table-for-encode
                   utf-defragmentation-table)))
            (t
             (define-translation-table 'utf-translation-table-for-decode)
             ;; When unify-8859-on-encoding-mode is off, be sure to make
             ;; mule-utf-* disabled for characters in
             ;; utf-fragmentation-table.
             (or unified
                 (define-translation-table
                   'utf-translation-table-for-encode)))))
         (set-default symbol value))
  :version "22.1"
  :type 'boolean
  :group 'mule)
193 | |
88155 | 194 |
(defconst utf-translate-cjk-charsets '(chinese-gb2312
                                       chinese-big5-1 chinese-big5-2
                                       japanese-jisx0208 japanese-jisx0212
                                       katakana-jisx0201
                                       korean-ksc5601)
  "List of charsets supported by `utf-translate-cjk-mode'.")

(defvar utf-translate-cjk-lang-env nil
  "Language environment in which tables for `utf-translate-cjk-mode' are loaded.
The value nil means that the tables are not yet loaded.")

;; Declared without a value because
;; `utf-translate-cjk-set-unicode-range' below assigns it before the
;; `defcustom' of the same name appears further down in this file.
(defvar utf-translate-cjk-unicode-range)

;; String generated from utf-translate-cjk-unicode-range.  It is
;; suitable for an argument to skip-chars-forward.
(defvar utf-translate-cjk-unicode-range-string nil)
211 | |
(defun utf-translate-cjk-set-unicode-range (range)
  "Set the Unicode code ranges handled by `utf-translate-cjk-mode' to RANGE.
RANGE is a list of cons cells (FROM . TO), each an inclusive range of
Unicode code points.

Store RANGE in `utf-translate-cjk-unicode-range' and rebuild
`utf-translate-cjk-unicode-range-string' from it. Only the parts of
RANGE that lie between U+00A0 and U+FFFF and outside U+3400..U+DFFF
are represented in that string, because only those code points have a
latin-iso8859-1 or mule-unicode-* character.

Also reset `utf-translate-cjk-lang-env' and replace both CJK hash
tables with empty ones."
  (setq utf-translate-cjk-unicode-range range)
  (setq utf-translate-cjk-unicode-range-string
        (let ((decode-char-no-trans
               ;; Convert code point X to the latin-iso8859-1 or
               ;; mule-unicode-* character for it by arithmetic alone,
               ;; without consulting any translation table.
               #'(lambda (x)
                   (cond ((< x #x100) (make-char 'latin-iso8859-1 x))
                         ((< x #x2500)
                          (setq x (- x #x100))
                          (make-char 'mule-unicode-0100-24ff
                                     (+ (/ x 96) 32) (+ (% x 96) 32)))
                         ((< x #x3400)
                          (setq x (- x #x2500))
                          (make-char 'mule-unicode-2500-33ff
                                     (+ (/ x 96) 32) (+ (% x 96) 32)))
                         (t
                          (setq x (- x #xe000))
                          (make-char 'mule-unicode-e000-ffff
                                     (+ (/ x 96) 32) (+ (% x 96) 32))))))
              ranges from to)
          (dolist (elt range)
            (setq from (max #xA0 (car elt)) to (min #xffff (cdr elt)))
            ;; U+3400..U+DFFF has no mule-unicode-* charset.  Pull a
            ;; FROM inside that gap up to its far edge; otherwise a
            ;; range such as (#x4000 . #xF000) would reach the
            ;; converter above with a value below #xE000 in its last
            ;; branch and produce negative position codes.
            (if (and (>= from #x3400) (< from #xE000))
                (setq from #xE000))
            ;; Likewise pull a TO inside the gap down to its near edge.
            (if (and (>= to #x3400) (< to #xE000))
                (setq to #x33FF))
            ;; Split [FROM, TO] at the charset boundaries #x100, #x2500
            ;; and #xE000 so that each pushed range stays within one
            ;; charset.  The highest piece is pushed first and TO is
            ;; lowered after each push.
            (cond ((< from #x100)
                   (if (>= to #xE000)
                       (setq ranges (cons (cons #xE000 to) ranges)
                             to #x33FF))
                   (if (>= to #x2500)
                       (setq ranges (cons (cons #x2500 to) ranges)
                             to #x24FF))
                   (if (>= to #x100)
                       (setq ranges (cons (cons #x100 to) ranges)
                             to #xFF)))
                  ((< from #x2500)
                   (if (>= to #xE000)
                       (setq ranges (cons (cons #xE000 to) ranges)
                             to #x33FF))
                   (if (>= to #x2500)
                       (setq ranges (cons (cons #x2500 to) ranges)
                             to #x24FF)))
                  ((< from #x3400)
                   (if (>= to #xE000)
                       (setq ranges (cons (cons #xE000 to) ranges)
                             to #x33FF))))
            ;; Whatever remains is the lowest piece; it is empty when
            ;; the whole range fell inside the gap.
            (if (<= from to)
                (setq ranges (cons (cons from to) ranges))))
          (mapconcat #'(lambda (x)
                         (format "%c-%c"
                                 (funcall decode-char-no-trans (car x))
                                 (funcall decode-char-no-trans (cdr x))))
                     ranges "")))
  ;; This forces loading and setting of the tables for
  ;; utf-translate-cjk-mode.
  (setq utf-translate-cjk-lang-env nil
        ucs-mule-cjk-to-unicode (make-hash-table :test 'eq)
        ucs-unicode-to-mule-cjk (make-hash-table :test 'eq)))
268 | |
(defcustom utf-translate-cjk-unicode-range '((#x2e80 . #xd7a3)
                                             (#xff00 . #xffef))
  "List of Unicode code ranges supported by `utf-translate-cjk-mode'.
Setting this variable directly does not take effect;
use either \\[customize] or the function
`utf-translate-cjk-set-unicode-range'."
  :version "22.1"
  :type '(repeat (cons integer integer))
  ;; The setter ignores SYMBOL and hands VALUE to
  ;; `utf-translate-cjk-set-unicode-range', which assigns this variable
  ;; itself.
  :set (lambda (symbol value)
         (utf-translate-cjk-set-unicode-range value))
  :group 'mule)
280 | |
(defsubst utf-translate-cjk-substitutable-p (code-point)
  "Return non-nil if CODE-POINT is in `utf-translate-cjk-unicode-range'.
The value is the first (FROM . TO) element of that list whose
inclusive bounds contain CODE-POINT, or nil if there is none."
  (let ((remaining utf-translate-cjk-unicode-range)
        (hit nil))
    ;; Walk the list until a containing range is found or it runs out.
    (while (and remaining (not hit))
      (let ((candidate (car remaining)))
        (if (and (<= (car candidate) code-point)
                 (<= code-point (cdr candidate)))
            (setq hit candidate)))
      (setq remaining (cdr remaining)))
    hit))
291 | |
(defun utf-translate-cjk-load-tables ()
  "Load tables for `utf-translate-cjk-mode'."
  ;; Fixme: Allow the use of the CJK charsets to be
  ;; customized by reordering and possible omission.
  (let ((redefined (< (hash-table-size ucs-mule-cjk-to-unicode) 43000)))
    (when redefined
      ;; Redefine them with realistic initial sizes and a
      ;; smallish rehash size to avoid wasting significant
      ;; space after they're built.
      (setq ucs-mule-cjk-to-unicode
            (make-hash-table :test 'eq :size 43000 :rehash-size 1000)
            ucs-unicode-to-mule-cjk
            (make-hash-table :test 'eq :size 21500 :rehash-size 1000)))

    ;; Load the files explicitly, to avoid having to keep
    ;; around the large tables they contain (as well as the
    ;; ones which get built).
    ;; Here we bind coding-system-for-read to nil so that coding tags
    ;; in the files are respected even if the files are not yet
    ;; byte-compiled
    (let* ((coding-system-for-read nil)
           (env current-language-environment)
           ;; The load order depends on the language environment; the
           ;; file for the environment's own charset comes last.
           (files
            (cond ((string= "Korean" env)
                   '("subst-jis" "subst-big5" "subst-gb2312" "subst-ksc"))
                  ((string= "Chinese-BIG5" env)
                   '("subst-jis" "subst-ksc" "subst-gb2312" "subst-big5"))
                  ((string= "Chinese-GB" env)
                   '("subst-jis" "subst-ksc" "subst-big5" "subst-gb2312"))
                  (t
                   ;; jis covers as much as big5, gb2312
                   '("subst-ksc" "subst-gb2312" "subst-big5" "subst-jis")))))
      (dolist (file files)
        (load file)))

    ;; The hash tables were replaced above, so the named translation
    ;; hash tables and the encode table's extra slot must be pointed
    ;; at the new objects.
    (when redefined
      (define-translation-hash-table 'utf-subst-table-for-decode
        ucs-unicode-to-mule-cjk)
      (define-translation-hash-table 'utf-subst-table-for-encode
        ucs-mule-cjk-to-unicode)
      (set-char-table-extra-slot (get 'utf-translation-table-for-encode
                                      'translation-table)
                                 1 ucs-mule-cjk-to-unicode))

    (setq utf-translate-cjk-lang-env current-language-environment)))
344 | |
(defun utf-lookup-subst-table-for-decode (code-point)
  "Look up CODE-POINT in the hash table named `utf-subst-table-for-decode'.
Return the value stored there, or nil if there is none.
Before the lookup, call `utf-translate-cjk-load-tables' if
`utf-translate-cjk-mode' is on, `utf-translate-cjk-lang-env' is nil
and CODE-POINT satisfies `utf-translate-cjk-substitutable-p'."
  (when (and utf-translate-cjk-mode
             (null utf-translate-cjk-lang-env)
             (utf-translate-cjk-substitutable-p code-point))
    (utf-translate-cjk-load-tables))
  (let ((table (get 'utf-subst-table-for-decode 'translation-hash-table)))
    (gethash code-point table)))
352 | |
353 | |
(defun utf-lookup-subst-table-for-encode (char)
  "Look up CHAR in the hash table named `utf-subst-table-for-encode'.
Return the value stored there, or nil if there is none.
Before the lookup, call `utf-translate-cjk-load-tables' if
`utf-translate-cjk-mode' is on, `utf-translate-cjk-lang-env' is nil
and the charset of CHAR is in `utf-translate-cjk-charsets'."
  (when (and utf-translate-cjk-mode
             (null utf-translate-cjk-lang-env)
             (memq (char-charset char) utf-translate-cjk-charsets))
    (utf-translate-cjk-load-tables))
  (let ((table (get 'utf-subst-table-for-encode 'translation-hash-table)))
    (gethash char table)))
361 | |
(define-minor-mode utf-translate-cjk-mode
  "Toggle whether UTF based coding systems de/encode CJK characters.
If ARG is an integer, enable if ARG is positive and disable if
zero or negative. This is a minor mode.
Enabling this allows the coding systems mule-utf-8,
mule-utf-16le and mule-utf-16be to encode characters in the charsets
`korean-ksc5601', `chinese-gb2312', `chinese-big5-1',
`chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to
decode the corresponding unicodes into such characters.

Where the charsets overlap, the one preferred for decoding is chosen
according to the language environment in effect when this option is
turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for
Chinese-Big5 and jisx for other environments.

This mode is on by default. If you are not interested in CJK
characters and want to avoid some overhead on encoding/decoding
by the above coding systems, you can customize the user option
`utf-translate-cjk-mode' to nil."
  :init-value t
  :version "22.1"
  :type 'boolean
  :group 'mule
  :global t
  (if utf-translate-cjk-mode
      ;; Turning on: point the named hash tables, and extra slot 1 of
      ;; the encode translation table, at the CJK tables.
      (progn
        (define-translation-hash-table 'utf-subst-table-for-decode
          ucs-unicode-to-mule-cjk)
        (define-translation-hash-table 'utf-subst-table-for-encode
          ucs-mule-cjk-to-unicode)
        (set-char-table-extra-slot (get 'utf-translation-table-for-encode
                                        'translation-table)
                                   1 ucs-mule-cjk-to-unicode))
    ;; Turning off: install fresh empty hash tables and clear the slot.
    (define-translation-hash-table 'utf-subst-table-for-decode
      (make-hash-table :test 'eq))
    (define-translation-hash-table 'utf-subst-table-for-encode
      (make-hash-table :test 'eq))
    (set-char-table-extra-slot (get 'utf-translation-table-for-encode
                                    'translation-table)
                               1 nil))

  ;; Update safe-chars of mule-utf-* coding systems.
  (dolist (elt (coding-system-list t))
    (if (string-match "^mule-utf" (symbol-name elt))
        (let ((safe-charsets (coding-system-get elt 'safe-charsets))
              (safe-chars (coding-system-get elt 'safe-chars))
              (need-update nil))
          (dolist (charset utf-translate-cjk-charsets)
            ;; Act only when the charset's presence in SAFE-CHARSETS
            ;; disagrees with the mode.  Compare truth values: `memq'
            ;; returns a list tail, which is never `eq' to t, so
            ;; comparing the raw values would add a duplicate entry
            ;; every time the mode is enabled again.
            (unless (eq (not utf-translate-cjk-mode)
                        (not (memq charset safe-charsets)))
              (setq safe-charsets
                    (if utf-translate-cjk-mode
                        (cons charset safe-charsets)
                      (delq charset safe-charsets))
                    need-update t)
              (aset safe-chars (make-char charset) utf-translate-cjk-mode)))
          ;; Store the list back and redefine the coding system only
          ;; if something actually changed.
          (when need-update
            (coding-system-put elt 'safe-charsets safe-charsets)
            (define-coding-system-internal elt))))))
420 | |
(define-ccl-program ccl-mule-utf-untrans
  ;; R0 is an untranslatable Unicode code-point (U+3400..U+DFFF or
  ;; U+10000..U+10FFFF) or an invalid byte (#x00..#xFF).  Write
  ;; eight-bit-control/graphic sequence (2 to 4 chars) representing
  ;; UTF-8 sequence of r0.  Registers r4, r5, r6 are modified.
  ;;
  ;; This is a subroutine because we assume that this is called very
  ;; rarely (so we don't have to worry about the overhead of the
  ;; call).
  ;;
  ;; Each output byte is computed in r4.  The leading byte is always
  ;; written with r6 (eight-bit-graphic).  Every following byte is
  ;; written with r5 (eight-bit-control) when it is below #xA0 and
  ;; with r6 otherwise.
  `(0
    ((r5 = ,(charset-id 'eight-bit-control))
     (r6 = ,(charset-id 'eight-bit-graphic))
     (if (r0 < #x100)
         ;; Two-byte form: leading byte is #xC0 plus the bits above bit 5.
         ((r4 = ((r0 >> 6) | #xC0))
          (write-multibyte-character r6 r4))
       ((if (r0 < #x10000)
            ;; Three-byte form: leading byte is #xE0 plus bits 12 and up.
            ((r4 = ((r0 >> 12) | #xE0))
             (write-multibyte-character r6 r4))
          ;; Four-byte form: leading byte is #xF0 plus bits 18 and up,
          ;; followed by the byte carrying bits 12..17.
          ((r4 = ((r0 >> 18) | #xF0))
           (write-multibyte-character r6 r4)
           (r4 = (((r0 >> 12) & #x3F) | #x80))
           (if (r4 < #xA0)
               (write-multibyte-character r5 r4)
             (write-multibyte-character r6 r4))))
        ;; Three- and four-byte forms: the byte carrying bits 6..11.
        (r4 = (((r0 >> 6) & #x3F) | #x80))
        (if (r4 < #xA0)
            (write-multibyte-character r5 r4)
          (write-multibyte-character r6 r4))))
     ;; All forms: the final byte carrying bits 0..5.
     (r4 = ((r0 & #x3F) | #x80))
     (if (r4 < #xA0)
         (write-multibyte-character r5 r4)
       (write-multibyte-character r6 r4)))))
46496 | 453 |
35542 | 454 (define-ccl-program ccl-decode-mule-utf-8 |
455 ;; | |
456 ;; charset | bytes in utf-8 | bytes in emacs | |
457 ;; -----------------------+----------------+--------------- | |
458 ;; ascii | 1 | 1 | |
459 ;; -----------------------+----------------+--------------- | |
460 ;; eight-bit-control | 2 | 2 | |
41873 | 461 ;; eight-bit-graphic | 2 | 1 |
35542 | 462 ;; latin-iso8859-1 | 2 | 2 |
463 ;; -----------------------+----------------+--------------- | |
464 ;; mule-unicode-0100-24ff | 2 | 4 | |
465 ;; (< 0800) | | | |
466 ;; -----------------------+----------------+--------------- | |
467 ;; mule-unicode-0100-24ff | 3 | 4 | |
468 ;; (>= 0800) | | | |
469 ;; mule-unicode-2500-33ff | 3 | 4 | |
470 ;; mule-unicode-e000-ffff | 3 | 4 | |
88155 | 471 ;; -----------------------+----------------+--------------- |
472 ;; invalid byte | 1 | 2 | |
35542 | 473 ;; |
474 ;; Thus magnification factor is two. | |
475 ;; | |
476 `(2 | |
88155 | 477 ((r6 = ,(charset-id 'latin-iso8859-1)) |
478 (read r0) | |
37934
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
479 (loop |
35542 | 480 (if (r0 < #x80) |
88155 | 481 ;; 1-byte encoding, i.e., ascii |
482 (write-read-repeat r0)) | |
483 (if (r0 < #xc2) | |
484 ;; continuation byte (invalid here) or 1st byte of overlong | |
485 ;; 2-byte sequence. | |
486 ((call ccl-mule-utf-untrans) | |
487 (r6 = ,(charset-id 'latin-iso8859-1)) | |
488 (read r0) | |
489 (repeat))) | |
35542 | 490 |
88155 | 491 ;; Read the 2nd byte. |
492 (read r1) | |
493 (if ((r1 & #b11000000) != #b10000000) ; Invalid 2nd byte | |
494 ((call ccl-mule-utf-untrans) | |
495 (r6 = ,(charset-id 'latin-iso8859-1)) | |
496 ;; Handle it in the next loop. | |
497 (r0 = r1) | |
498 (repeat))) | |
46496 | 499 |
88155 | 500 (if (r0 < #xe0) |
501 ;; 2-byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx | |
502 ((r1 &= #x3F) | |
503 (r1 |= ((r0 & #x1F) << 6)) | |
504 ;; Now r1 holds scalar value. We don't have to check | |
505 ;; `overlong sequence' because r0 >= 0xC2. | |
46496 | 506 |
88155 | 507 (if (r1 >= 256) |
508 ;; mule-unicode-0100-24ff (< 0800) | |
509 ((r0 = r1) | |
510 (lookup-integer utf-subst-table-for-decode r0 r1) | |
511 (if (r7 == 0) | |
512 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
513 (r1 -= #x0100) | |
514 (r2 = (((r1 / 96) + 32) << 7)) | |
515 (r1 %= 96) | |
516 (r1 += (r2 + 32)) | |
517 (translate-character | |
518 utf-translation-table-for-decode r0 r1))) | |
519 (write-multibyte-character r0 r1) | |
520 (read r0) | |
521 (repeat)) | |
522 (if (r1 >= 160) | |
523 ;; latin-iso8859-1 | |
524 ((r0 = r1) | |
525 (lookup-integer utf-subst-table-for-decode r0 r1) | |
526 (if (r7 == 0) | |
527 ((r1 -= 128) | |
528 (write-multibyte-character r6 r1)) | |
529 ((write-multibyte-character r0 r1))) | |
530 (read r0) | |
531 (repeat)) | |
532 ;; eight-bit-control | |
533 ((r0 = ,(charset-id 'eight-bit-control)) | |
534 (write-multibyte-character r0 r1) | |
535 (read r0) | |
536 (repeat)))))) | |
46496 | 537 |
88155 | 538 ;; Read the 3rd byte. |
539 (read r2) | |
540 (if ((r2 & #b11000000) != #b10000000) ; Invalid 3rd byte | |
541 ((call ccl-mule-utf-untrans) | |
542 (r0 = r1) | |
543 (call ccl-mule-utf-untrans) | |
544 (r6 = ,(charset-id 'latin-iso8859-1)) | |
545 ;; Handle it in the next loop. | |
546 (r0 = r2) | |
547 (repeat))) | |
46496 | 548 |
88155 | 549 (if (r0 < #xF0) |
550 ;; 3byte encoding | |
551 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx | |
552 ((r3 = ((r0 & #xF) << 12)) | |
553 (r3 |= ((r1 & #x3F) << 6)) | |
554 (r3 |= (r2 & #x3F)) | |
555 | |
556 (if (r3 < #x800) ; `overlong sequence' | |
557 ((call ccl-mule-utf-untrans) | |
558 (r0 = r1) | |
559 (call ccl-mule-utf-untrans) | |
560 (r0 = r2) | |
561 (call ccl-mule-utf-untrans) | |
562 (r6 = ,(charset-id 'latin-iso8859-1)) | |
563 (read r0) | |
564 (repeat))) | |
49598
0d8b17d428b5
Trailing whitepace deleted.
Juanma Barranquero <lekktu@gmail.com>
parents:
49028
diff
changeset
|
565 |
88155 | 566 (if (r3 < #x2500) |
567 ;; mule-unicode-0100-24ff (>= 0800) | |
568 ((r0 = r3) | |
569 (lookup-integer utf-subst-table-for-decode r0 r1) | |
570 (if (r7 == 0) | |
571 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
572 (r3 -= #x0100) | |
573 (r3 //= 96) | |
574 (r1 = (r7 + 32)) | |
575 (r1 += ((r3 + 32) << 7)) | |
576 (translate-character | |
577 utf-translation-table-for-decode r0 r1))) | |
578 (write-multibyte-character r0 r1) | |
579 (read r0) | |
580 (repeat))) | |
49598
0d8b17d428b5
Trailing whitepace deleted.
Juanma Barranquero <lekktu@gmail.com>
parents:
49028
diff
changeset
|
581 |
88155 | 582 (if (r3 < #x3400) |
583 ;; mule-unicode-2500-33ff | |
584 ((r0 = r3) ; don't zap r3 | |
585 (lookup-integer utf-subst-table-for-decode r0 r1) | |
586 (if (r7 == 0) | |
587 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) | |
588 (r3 -= #x2500) | |
589 (r3 //= 96) | |
590 (r1 = (r7 + 32)) | |
591 (r1 += ((r3 + 32) << 7)))) | |
592 (write-multibyte-character r0 r1) | |
593 (read r0) | |
594 (repeat))) | |
46496 | 595 |
88155 | 596 (if (r3 < #xE000) |
597 ;; Try to convert to CJK chars, else | |
598 ;; keep them as eight-bit-{control|graphic}. | |
599 ((r0 = r3) | |
600 (lookup-integer utf-subst-table-for-decode r3 r1) | |
601 (if r7 | |
602 ;; got a translation | |
603 ((write-multibyte-character r3 r1) | |
604 (read r0) | |
605 (repeat)) | |
606 ((call ccl-mule-utf-untrans) | |
607 (r6 = ,(charset-id 'latin-iso8859-1)) | |
608 (read r0) | |
609 (repeat))))) | |
610 | |
611 ;; mule-unicode-e000-ffff | |
612 ;; Fixme: fffe and ffff are invalid. | |
613 (r0 = r3) ; don't zap r3 | |
614 (lookup-integer utf-subst-table-for-decode r0 r1) | |
615 (if (r7 == 0) | |
616 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) | |
617 (r3 -= #xe000) | |
618 (r3 //= 96) | |
619 (r1 = (r7 + 32)) | |
620 (r1 += ((r3 + 32) << 7)))) | |
621 (write-multibyte-character r0 r1) | |
622 (read r0) | |
623 (repeat))) | |
46496 | 624 |
88155 | 625 ;; Read the 4th byte. |
626 (read r3) | |
627 (if ((r3 & #b11000000) != #b10000000) ; Invalid 4th byte | |
628 ((call ccl-mule-utf-untrans) | |
629 (r0 = r1) | |
630 (call ccl-mule-utf-untrans) | |
631 (r0 = r2) | |
632 (call ccl-mule-utf-untrans) | |
633 (r6 = ,(charset-id 'latin-iso8859-1)) | |
634 ;; Handle it in the next loop. | |
635 (r0 = r3) | |
636 (repeat))) | |
49598
0d8b17d428b5
Trailing whitepace deleted.
Juanma Barranquero <lekktu@gmail.com>
parents:
49028
diff
changeset
|
637 |
88155 | 638 (if (r0 < #xF8) |
639 ;; 4-byte encoding: | |
640 ;; wwwzzzzzzyyyyyyxxxxxx = 11110www 10zzzzzz 10yyyyyy 10xxxxxx | |
641 ;; keep those bytes as eight-bit-{control|graphic} | |
642 ;; Fixme: allow lookup in utf-subst-table-for-decode. | |
643 ((r4 = ((r0 & #x7) << 18)) | |
644 (r4 |= ((r1 & #x3F) << 12)) | |
645 (r4 |= ((r2 & #x3F) << 6)) | |
646 (r4 |= (r3 & #x3F)) | |
35542 | 647 |
88155 | 648 (if (r4 < #x10000) ; `overlong sequence' |
649 ((call ccl-mule-utf-untrans) | |
650 (r0 = r1) | |
651 (call ccl-mule-utf-untrans) | |
652 (r0 = r2) | |
653 (call ccl-mule-utf-untrans) | |
654 (r0 = r3) | |
655 (call ccl-mule-utf-untrans)) | |
656 ((r0 = r4) | |
657 (call ccl-mule-utf-untrans)))) | |
658 | |
659 ;; Unsupported sequence. | |
660 ((call ccl-mule-utf-untrans) | |
661 (r0 = r1) | |
662 (call ccl-mule-utf-untrans) | |
663 (r0 = r2) | |
664 (call ccl-mule-utf-untrans) | |
665 (r0 = r3) | |
666 (call ccl-mule-utf-untrans))) | |
667 (r6 = ,(charset-id 'latin-iso8859-1)) | |
668 (read r0) | |
669 (repeat))) | |
670 | |
671 | |
672 ;; At EOF... | |
673 (if (r0 >= 0) | |
674 ;; r0 >= #x80 | |
675 ((call ccl-mule-utf-untrans) | |
676 (if (r1 >= 0) | |
677 ((r0 = r1) | |
678 (call ccl-mule-utf-untrans) | |
679 (if (r2 >= 0) | |
680 ((r0 = r2) | |
681 (call ccl-mule-utf-untrans) | |
682 (if (r3 >= 0) | |
683 ((r0 = r3) | |
684 (call ccl-mule-utf-untrans)))))))))) | |
35542 | 685 |
36243 | 686 "CCL program to decode UTF-8. |
36465 | 687 Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
688 mule-unicode-*, but see also `utf-fragmentation-table' and |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
689 `ucs-mule-cjk-to-unicode'. |
46496 | 690 Encodings of un-representable Unicode characters are decoded asis into |
691 eight-bit-control and eight-bit-graphic characters.") | |
35542 | 692 |
88155 | 693 (define-ccl-program ccl-mule-utf-8-encode-untrans |
694 ;; UTF-8 decoder generates a UTF-8 sequence represented by a | |
695 ;; sequence of eight-bit-control/graphic chars for an untranslatable | |
696 ;; character and an invalid byte. | |
697 ;; | |
698 ;; This CCL parses that sequence (the first byte is already in r1), | |
699 ;; writes out the original bytes of that sequence, and sets r5 to | |
700 ;; -1. | |
701 ;; | |
702 ;; If the eight-bit-control/graphic sequence is shorter than what r1 | |
703 ;; suggests, it sets r5 and r6 to the last character read that | |
704 ;; should be handled by the next loop of a caller. | |
705 ;; | |
706 ;; Note: For UTF-8 validation, we only check if a character is | |
707 ;; eight-bit-control/graphic or not. It may result in incorrect | |
708 ;; handling of random binary data, but such data can't be encoded | |
709 ;; by UTF-8 anyway. At least, UTF-8 decoders don't generate such | |
710 ;; a sequence even if a source contains an invalid byte sequence. | |
711 `(0 | |
712 (;; Read the 2nd byte. | |
713 (read-multibyte-character r5 r6) | |
;; r0 is non-zero when the character just read is not eight-bit-control;
;; combined with the test below, the branch is taken only when the
;; character is neither eight-bit-control nor eight-bit-graphic.
714 (r0 = (r5 != ,(charset-id 'eight-bit-control))) | |
715 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) | |
716 ((write r1) ; invalid UTF-8 | |
717 (r1 = -1) | |
718 (end))) | |
719 | |
720 (if (r1 <= #xC3) | |
721 ;; 2-byte sequence for an originally invalid byte. | |
722 ((r6 &= #x3F) | |
723 (r6 |= ((r1 & #x1F) << 6)) | |
724 (write r6) | |
725 (r5 = -1) | |
726 (end))) | |
727 | |
;; Longer sequence: emit the lead byte and the 2nd byte now.  The lead
;; byte is kept in r2 for the 3-byte/4-byte test below, and r1 is set
;; to -1 so that the EOF handler does not write it a second time.
728 (write r1 r6) | |
729 (r2 = r1) | |
730 (r1 = -1) | |
731 ;; Read the 3rd byte. | |
732 (read-multibyte-character r5 r6) | |
733 (r0 = (r5 != ,(charset-id 'eight-bit-control))) | |
734 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) | |
735 (end)) ; invalid UTF-8 | |
736 (write r6) | |
737 (if (r2 < #xF0) | |
738 ;; 3-byte sequence for an untranslated character. | |
739 ((r5 = -1) | |
740 (end))) | |
741 ;; Read the 4th byte. | |
742 (read-multibyte-character r5 r6) | |
743 (r0 = (r5 != ,(charset-id 'eight-bit-control))) | |
744 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) | |
745 (end)) ; invalid UTF-8 | |
746 ;; 4-byte sequence for an untranslated character. | |
747 (write r6) | |
748 (r5 = -1) | |
749 (end)) | |
750 | |
751 ;; At EOF... | |
;; No character is pending for the caller; flush the lead byte if it
;; has not been written yet (r1 is -1 once it has been).
752 ((r5 = -1) | |
753 (if (r1 >= 0) | |
754 (write r1))))) | |
755 | |
;; Encoder half of the coding system: reads Emacs characters (charset ID
;; in r0, code point in r1), maps each supported charset to its Unicode
;; scalar value and writes that value as UTF-8 bytes.  A character with
;; no mapping is written as U+FFFD.
35542 | 756 (define-ccl-program ccl-encode-mule-utf-8 |
757 `(1 | |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
758 ((r5 = -1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
759 (loop |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
760 (if (r5 < 0) |
88155 | 761 (read-multibyte-character r0 r1) |
762 ;; Pre-read character is in r5 (charset-ID) and r6 (code-point). | |
763 ((r0 = r5) | |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
764 (r1 = r6) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
765 (r5 = -1))) |
88155 | 766 (translate-character utf-translation-table-for-encode r0 r1) |
35542 | 767 |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
768 (if (r0 == ,(charset-id 'ascii)) |
88155 | 769 (write-repeat r1)) |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
770 |
88155 | 771 (if (r0 == ,(charset-id 'latin-iso8859-1)) |
772 ;; r1 scalar utf-8 | |
773 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | |
774 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000 | |
775 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111 | |
776 ((write ((r1 >> 6) | #xc2)) | |
777 (r1 &= #x3f) | |
778 (r1 |= #x80) | |
779 (write-repeat r1))) | |
35542 | 780 |
88155 | 781 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) |
782 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | |
783 ;; #x3f80 == (0011 1111 1000 0000)b | |
784 (r1 &= #x7f) | |
785 (r1 += (r0 + 224)) ; 224 == -32 + #x0100 | |
786 ;; now r1 holds scalar value | |
787 (if (r1 < #x0800) | |
788 ;; 2byte encoding | |
789 ((write ((r1 >> 6) | #xC0)) | |
790 (r1 &= #x3F) | |
791 (r1 |= #x80) | |
792 (write-repeat r1)) | |
793 ;; 3byte encoding | |
794 ((write ((r1 >> 12) | #xE0)) | |
795 (write (((r1 & #x0FC0) >> 6) | #x80)) | |
796 (r1 &= #x3F) | |
797 (r1 |= #x80) | |
798 (write-repeat r1))))) | |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
799 |
88155 | 800 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) |
801 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | |
802 (r1 &= #x7f) | |
803 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500 | |
804 ;; now r1 holds scalar value | |
805 (write ((r1 >> 12) | #xE0)) | |
806 (write (((r1 & #x0FC0) >> 6) | #x80)) | |
807 (r1 &= #x3F) | |
808 (r1 |= #x80) | |
809 (write-repeat r1))) | |
35542 | 810 |
88155 | 811 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) |
812 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | |
813 (r1 &= #x7f) | |
814 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000 | |
815 ;; now r1 holds scalar value | |
816 (write ((r1 >> 12) | #xE0)) | |
817 (write (((r1 & #x0FC0) >> 6) | #x80)) | |
818 (r1 &= #x3F) | |
819 (r1 |= #x80) | |
820 (write-repeat r1))) | |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
821 |
88155 | 822 (if (r0 == ,(charset-id 'eight-bit-control)) |
823 ;; r1 scalar utf-8 | |
824 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | |
825 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000 | |
826 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111 | |
827 ((write #xC2) | |
828 (write-repeat r1))) | |
35542 | 829 |
88155 | 830 (if (r0 == ,(charset-id 'eight-bit-graphic)) |
831 ;; r1 scalar utf-8 | |
832 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | |
833 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000 | |
834 ;; ff 0000 0000 1111 1111 1101 1111 1011 1111 | |
;; r0 ends up non-zero when r1 lies in C0..C3 or E1..F7.  Such a byte
;; is handed to ccl-mule-utf-8-encode-untrans, which reads the
;; following characters and writes the sequence back out; any other
;; byte is written unchanged.
835 ((r0 = (r1 >= #xC0)) | |
836 (r0 &= (r1 <= #xC3)) | |
837 (r4 = (r1 >= #xE1)) | |
838 (r4 &= (r1 <= #xF7)) | |
839 (r0 |= r4) | |
840 (if r0 | |
841 ((call ccl-mule-utf-8-encode-untrans) | |
842 (repeat)) | |
843 (write-repeat r1)))) | |
35542 | 844 |
88155 | 845 (lookup-character utf-subst-table-for-encode r0 r1) |
846 (if r7 ; lookup succeeded | |
847 (if (r0 < #x800) | |
848 ;; 2byte encoding | |
849 ((write ((r0 >> 6) | #xC0)) | |
850 (r0 = ((r0 & #x3F) | #x80)) | |
851 (write-repeat r0)) | |
852 ;; 3byte encoding | |
853 ((write ((r0 >> 12) | #xE0)) | |
854 (write (((r0 & #x0FC0) >> 6) | #x80)) | |
855 (r0 = ((r0 & #x3F) | #x80)) | |
856 (write-repeat r0)))) | |
35542 | 857 |
88155 | 858 ;; Unsupported character. |
859 ;; Output U+FFFD, which is `ef bf bd' in UTF-8. | |
860 (write #xef) | |
861 (write #xbf) | |
862 (write-repeat #xbd)))) | |
46496 | 863 "CCL program to encode into UTF-8.") |
35542 | 864 |
41873 | 865 |
;; Input registers: r0 holds the lead byte and r1..r3 the following
;; bytes, with 0 standing for "no byte available".  Output registers:
;; r0 holds the decoded code and r1 the number of bytes consumed.  r1
;; is 0 when a required continuation byte is missing; r0 is then left
;; as it was on entry.
(define-ccl-program ccl-untranslated-to-ucs
  `(0
    (if (r1 == 0)
        ;; No 2nd byte at all: leave r1 at 0 to signal failure.
        nil
      (if (r0 <= #xC3)                  ; 2-byte encoding
          ((r0 = ((r0 & #x3) << 6))
           (r0 |= (r1 & #x3F))
           (r1 = 2))
        (if (r2 == 0)
            (r1 = 0)
          (if (r0 < #xF0)               ; 3-byte encoding, as above
              ((r0 = ((r0 & #xF) << 12))
               (r0 |= ((r1 & #x3F) << 6))
               (r0 |= (r2 & #x3F))
               (r1 = 3))
            (if (r3 == 0)
                (r1 = 0)
              ;; 4-byte encoding
              ((r0 = ((r0 & #x7) << 18))
               (r0 |= ((r1 & #x3F) << 12))
               (r0 |= ((r2 & #x3F) << 6))
               (r0 |= (r3 & #x3F))
               (r1 = 4))))))))
  "Decode 2-, 3-, or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0.
Set r1 to the byte length.  r1 == 0 for invalid sequence.")
46496 | 890 |
;; Eight-element register vector, initially all zero, that
;; `utf-8-untranslated-to-ucs' fills in and passes to `ccl-execute';
;; `utf-8-compose' reads the results back from elements 0 and 1.
891 (defvar utf-8-ccl-regs (make-vector 8 0)) | |
892 | |
(defsubst utf-8-untranslated-to-ucs ()
  "Decode the untranslated sequence of raw bytes at point to its UCS code.
The character at point and the three characters after it are stored in
elements 0 to 3 of `utf-8-ccl-regs' (0 where the buffer has no
character), and the CCL program `ccl-untranslated-to-ucs' is run on
that vector.  Callers read the result from `utf-8-ccl-regs': element 0
is the UCS code and element 1 the length of the sequence (2, 3 or 4),
or 0 if the sequence is invalid."
  (aset utf-8-ccl-regs 0 (or (char-after) 0))
  (aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0))
  (aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0))
  (aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0))
  (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs))
41873 | 901 |
(defun utf-8-help-echo (window object position)
  "Return the help-echo text for the untranslated sequence at POSITION.
OBJECT is the buffer or string holding the text; its
`untranslated-utf-8' property at POSITION supplies the code that is
shown in hexadecimal.  WINDOW is not used."
  (let ((ucs (get-char-property position 'untranslated-utf-8 object)))
    (format "Untranslated Unicode U+%04X" ucs)))
905 | |
88155 | 906 ;; We compose the untranslatable sequences into a single character, |
907 ;; and move point to the next character. | |
41873 | 908 ;; This is infelicitous for editing, because there's currently no |
909 ;; mechanism for treating compositions as atomic, but is OK for | |
46496 | 910 ;; display. They are composed to U+FFFD with help-echo which |
911 ;; indicates the unicodes they represent. This function GCs too much. | |
88155 | 912 |
913 ;; If utf-translate-cjk-mode is non-nil, this function is called with | |
914 ;; HASH-TABLE which translates CJK characters into some of CJK | |
915 ;; charsets. | |
916 | |
917 (defsubst utf-8-compose (hash-table) | |
918 "Put a suitable composition on an untranslatable sequence at point. | |
919 If HASH-TABLE is non-nil, try to translate CJK characters by it at first. | |
920 Move point to the end of the sequence." | |
;; Decode the sequence at point; the results land in `utf-8-ccl-regs'.
921 (utf-8-untranslated-to-ucs) | |
;; `l' is the sequence length taken from register 1; the `(> l 0)'
;; test below treats 0 as "no valid sequence here".
922 (let ((l (aref utf-8-ccl-regs 1)) | |
923 ch) | |
924 (if (> l 0) | |
;; First choice: HASH-TABLE maps the decoded code (register 0) to a
;; character, which then replaces the `l' raw characters.
925 (if (and hash-table | |
926 (setq ch (gethash (aref utf-8-ccl-regs 0) hash-table))) | |
927 (progn | |
928 (insert ch) | |
929 (delete-region (point) (min (point-max) (+ l (point))))) | |
;; Otherwise keep the raw characters and record the decoded code in
;; the `untranslated-utf-8' property, which `utf-8-help-echo' reads.
930 (setq ch (aref utf-8-ccl-regs 0)) | |
931 (put-text-property (point) (min (point-max) (+ l (point))) | |
932 'untranslated-utf-8 ch) | |
933 (put-text-property (point) (min (point-max) (+ l (point))) | |
934 'help-echo 'utf-8-help-echo) | |
;; A 2-character sequence is displayed as a backslash plus three octal
;; digits in the `escape-glyph' face; longer ones are composed into a
;; single character.
;; NOTE(review): the character literal passed to `compose-region' looks
;; mis-encoded in this view (the commentary before this function says
;; U+FFFD) -- confirm against the original file.
935 (if (= l 2) | |
936 (put-text-property (point) (min (point-max) (+ l (point))) | |
937 'display (propertize (format "\\%03o" ch) | |
938 'face 'escape-glyph)) | |
939 (compose-region (point) (+ l (point)) ?$,3u=(B)) | |
940 (forward-char l)) | |
;; Not a valid sequence: step over a single character.
941 (forward-char 1)))) | |
41873 | 942 |
;; Boolean user option, off by default.  `utf-8-post-read-conversion'
;; runs its script-specific compose-region calls only when this is
;; non-nil.
943 (defcustom utf-8-compose-scripts nil | |
46496 | 944 "*Non-nil means compose various scripts on decoding utf-8 text." |
41873 | 945 :group 'mule |
88155 | 946 :version "22.1" |
46496 | 947 :type 'boolean) |
41873 | 948 |
(defun utf-8-post-read-conversion (length)
  "Compose untranslated utf-8 sequences into single characters.
If `utf-translate-cjk-mode' is non-nil, tries to translate CJK characters.
Also compose particular scripts if `utf-8-compose-scripts' is non-nil."
  ;; Operate on exactly the LENGTH characters starting at point and
  ;; return the size of that region once all replacements are done.
  (save-excursion
    (save-restriction
      (narrow-to-region (point) (+ (point) length))
      ;; The skip set cannot be an `eval-when-compile' multibyte
      ;; constant: a byte-compiled file always loads such a string as
      ;; unibyte, so it is converted at run time instead.
      (let ((skip-spec (string-as-multibyte "^\xc0-\xc3\xe1-\xf7"))
            (orig-multibyte enable-multibyte-characters)
            cjk-hash c)
        (set-buffer-multibyte t)
        (if (and utf-translate-cjk-mode
                 (not utf-translate-cjk-lang-env))
            (progn
              ;; Look for characters in utf-translate-cjk-range; they
              ;; may have to be translated to CJK charsets.
              (skip-chars-forward
               (concat skip-spec utf-translate-cjk-unicode-range-string))
              (if (not (eobp))
                  (progn
                    (utf-translate-cjk-load-tables)
                    (setq skip-spec
                          (concat skip-spec
                                  utf-translate-cjk-unicode-range-string))))
              (setq cjk-hash (get 'utf-subst-table-for-decode
                                  'translation-hash-table))))
        ;; Stop at every character that is not in the skip set.
        (while (and (skip-chars-forward skip-spec)
                    (not (eobp)))
          (setq c (following-char))
          (cond ((< c 256)
                 ;; Code below 256: let `utf-8-compose' handle the
                 ;; sequence starting here.
                 (utf-8-compose cjk-hash))
                ((and cjk-hash
                      (setq c (gethash (encode-char c 'ucs) cjk-hash)))
                 ;; The table has a replacement for this character.
                 (insert c)
                 (delete-char 1))
                (t
                 (forward-char 1))))
        ;; Put the buffer back into unibyte mode if that is how we found it.
        (unless orig-multibyte
          (set-buffer-multibyte nil)))

      (if (and utf-8-compose-scripts (> length 1))
          ;; These currently have definitions which cover the relevant
          ;; unicodes.  Loading thai-util &c could be avoided by first
          ;; checking whether the region contains characters of the
          ;; appropriate categories.  There aren't yet Unicode-based
          ;; rules for Tibetan.
          (dolist (compose-fn '(diacritic-compose-region
                                thai-compose-region
                                lao-compose-region
                                devanagari-compose-region
                                malayalam-compose-region
                                tamil-compose-region))
            (funcall compose-fn (point-max) (point-min))))
      (- (point-max) (point-min)))))
1001 | |
(defun utf-8-pre-write-conversion (beg end)
  "Prepare for `utf-translate-cjk-mode' to encode text between BEG and END.
This is used as a pre-write-conversion of utf-8 coding system.
The CJK translation tables are loaded when `utf-translate-cjk-mode' is
on, `utf-translate-cjk-lang-env' is nil, and the text between BEG and
END contains a character of category c, j or h.  Always return nil."
  (if (and utf-translate-cjk-mode
           (not utf-translate-cjk-lang-env)
           (save-excursion
             (goto-char beg)
             (re-search-forward "\\cc\\|\\cj\\|\\ch" end t)))
      (utf-translate-cjk-load-tables))
  nil)
41873 | 1012 |
;; Define the `mule-utf-8' coding system.  The pair
;; (ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) names the CCL
;; programs used for decoding and encoding; the property list adds
;; `utf-8-post-read-conversion' and `utf-8-pre-write-conversion' as
;; hooks.  `utf-8' is made an alias of it at the end.
35542 | 1013 (make-coding-system |
1014 'mule-utf-8 4 ?u | |
1015 "UTF-8 encoding for Emacs-supported Unicode characters. | |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1016 It supports Unicode characters of these ranges: |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1017 U+0000..U+33FF, U+E000..U+FFFF. |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1018 They correspond to these Emacs character sets: |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1019 ascii, latin-iso8859-1, mule-unicode-0100-24ff, |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1020 mule-unicode-2500-33ff, mule-unicode-e000-ffff |
35542 | 1021 |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1022 On decoding (e.g. reading a file), Unicode characters not in the above |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1023 ranges are decoded into sequences of eight-bit-control and |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1024 eight-bit-graphic characters to preserve their byte sequences. The |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1025 byte sequence is preserved on i/o for valid utf-8, but not necessarily |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1026 for invalid utf-8. |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1027 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1028 On encoding (e.g. writing a file), Emacs characters not belonging to |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1029 any of the character sets listed above are encoded into the UTF-8 byte |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1030 sequence representing U+FFFD (REPLACEMENT CHARACTER)." |
35542 | 1031 |
1032 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) | |
88155 | 1033 `((safe-charsets |
35542 | 1034 ascii |
1035 eight-bit-control | |
1036 eight-bit-graphic | |
1037 latin-iso8859-1 | |
1038 mule-unicode-0100-24ff | |
1039 mule-unicode-2500-33ff | |
88155 | 1040 mule-unicode-e000-ffff |
1041 ,@(if utf-translate-cjk-mode | |
1042 utf-translate-cjk-charsets)) | |
36371
f6bb3ed752b4
(mule-utf-8): Set correct value for valid-codes property.
Kenichi Handa <handa@m17n.org>
parents:
36243
diff
changeset
|
1043 (mime-charset . utf-8) |
36423
aa776838b660
(mule-utf-8): Set coding-category property to coding-category-utf-8.
Kenichi Handa <handa@m17n.org>
parents:
36371
diff
changeset
|
1044 (coding-category . coding-category-utf-8) |
41873 | 1045 (valid-codes (0 . 255)) |
88155 | 1046 (pre-write-conversion . utf-8-pre-write-conversion) |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1047 (post-read-conversion . utf-8-post-read-conversion) |
88155 | 1048 (translation-table-for-encode . utf-translation-table-for-encode) |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1049 (dependency unify-8859-on-encoding-mode |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1050 unify-8859-on-decoding-mode |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
1051 utf-fragment-on-decoding |
88155 | 1052 utf-translate-cjk-mode))) |
35542 | 1053 |
1054 (define-coding-system-alias 'utf-8 'mule-utf-8) | |
38436
b174db545cfd
Some fixes to follow coding conventions.
Pavel Janík <Pavel@Janik.cz>
parents:
37934
diff
changeset
|
1055 |
41873 | 1056 ;; I think this needs special private charsets defined for the |
1057 ;; untranslated sequences, if it's going to work well. | |
1058 | |
1059 ;;; (defun utf-8-compose-function (pos to pattern &optional string) | |
1060 ;;; (let* ((prop (get-char-property pos 'composition string)) | |
1061 ;;; (l (and prop (- (cadr prop) (car prop))))) | |
1062 ;;; (cond ((and l (> l (- to pos))) | |
1063 ;;; (delete-region pos to)) | |
1064 ;;; ((and (> (char-after pos) 224) | |
1065 ;;; (< (char-after pos) 256) | |
1066 ;;; (save-restriction | |
1067 ;;; (narrow-to-region pos to) | |
1068 ;;; (utf-8-compose))) | |
1069 ;;; t)))) | |
1070 | |
1071 ;;; (dotimes (i 96) | |
1072 ;;; (aset composition-function-table | |
1073 ;;; (+ 128 i) | |
1074 ;;; `((,(string-as-multibyte "[\200-\237\240-\377]") | |
1075 ;;; . utf-8-compose-function)))) | |
1076 | |
88155 | 1077 ;;; arch-tag: b08735b7-753b-4ae6-b754-0f3efe4515c5 |
38436
b174db545cfd
Some fixes to follow coding conventions.
Pavel Janík <Pavel@Janik.cz>
parents:
37934
diff
changeset
|
1078 ;;; utf-8.el ends here |