Mercurial > emacs
annotate lisp/international/utf-8.el @ 54736:b94de166de9d
(ethio-sera-being-called-by-w3): New
variable.
(ethio-sera-to-fidel-ethio): Check ethio-sera-being-called-by-w3
instead of sera-being-called-by-w3.
(ethio-fidel-to-sera-buffer): Likewise.
(ethio-find-file): Bind ethio-sera-being-called-by-w3 to t
instead of sera-being-called-by-w3.
(ethio-write-file): Likewise.
| author | Kenichi Handa <handa@m17n.org> |
|---|---|
| date | Mon, 05 Apr 2004 23:27:37 +0000 |
| parents | d61b01de8cdf |
| children | 6e677a935fe9 |
| rev | line source |
|---|---|
| 46496 | 1 ;;; utf-8.el --- UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*- |
| 35542 | 2 |
| 3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. | |
| 4 ;; Licensed to the Free Software Foundation. | |
| 46496 | 5 ;; Copyright (C) 2001, 2002 Free Software Foundation, Inc. |
| 35542 | 6 |
|
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
7 ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org> |
| 46496 | 8 ;; Maintainer: FSF |
| 36243 | 9 ;; Keywords: multilingual, Unicode, UTF-8, i18n |
| 35542 | 10 |
| 11 ;; This file is part of GNU Emacs. | |
| 12 | |
| 13 ;; GNU Emacs is free software; you can redistribute it and/or modify | |
| 14 ;; it under the terms of the GNU General Public License as published by | |
| 15 ;; the Free Software Foundation; either version 2, or (at your option) | |
| 16 ;; any later version. | |
| 17 | |
| 18 ;; GNU Emacs is distributed in the hope that it will be useful, | |
| 19 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 20 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 21 ;; GNU General Public License for more details. | |
| 22 | |
| 23 ;; You should have received a copy of the GNU General Public License | |
| 24 ;; along with GNU Emacs; see the file COPYING. If not, write to the | |
| 25 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
| 26 ;; Boston, MA 02111-1307, USA. | |
| 27 | |
| 28 ;;; Commentary: | |
| 29 | |
| 41873 | 30 ;; The coding-system `mule-utf-8' basically supports encoding/decoding |
| 31 ;; of the following character sets to and from UTF-8: | |
| 35542 | 32 ;; |
| 33 ;; ascii | |
| 34 ;; eight-bit-control | |
| 35 ;; latin-iso8859-1 | |
| 36 ;; mule-unicode-0100-24ff | |
| 37 ;; mule-unicode-2500-33ff | |
| 38 ;; mule-unicode-e000-ffff | |
| 39 ;; | |
| 36243 | 40 ;; On decoding, Unicode characters that do not fit into the above |
| 41 ;; character sets are handled as `eight-bit-control' or | |
| 42 ;; `eight-bit-graphic' characters to retain the information about the | |
| 46496 | 43 ;; original byte sequence and text properties record the corresponding |
| 44 ;; unicode. | |
| 45 ;; | |
| 46 ;; Fixme: note that reading and writing invalid utf-8 may not be | |
| 47 ;; idempotent -- fixing that needs a new charset to represent the bytes. | |
| 41873 | 48 ;; |
|
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
49 ;; Characters from other character sets can be encoded with mule-utf-8 |
| 48848 | 50 ;; by populating the translation table |
|
50179
65bb5afb37ef
(utf-fragment-on-decoding): Don't call
Kenichi Handa <handa@m17n.org>
parents:
50085
diff
changeset
|
51 ;; `utf-translation-table-for-encode'. Hash tables |
|
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
52 ;; `utf-subst-table-for-decode' and `utf-subst-table-for-encode' are |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
53 ;; used to support encoding and decoding of about a quarter of the CJK |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
54 ;; space between U+3400 and U+DFFF. |
| 36243 | 55 |
| 54304 | 56 ;; UTF-8 is defined in RFC 3629. A sketch of the encoding is: |
| 35542 | 57 |
| 58 ;; scalar | utf-8 | |
| 59 ;; value | 1st byte | 2nd byte | 3rd byte | |
| 60 ;; --------------------+-----------+-----------+---------- | |
| 61 ;; 0000 0000 0xxx xxxx | 0xxx xxxx | | | |
| 62 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx | | |
| 63 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx | |
| 64 | |
| 65 ;;; Code: | |
| 66 | |
|
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
67 (defvar ucs-mule-to-mule-unicode (make-char-table 'translation-table nil) |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
68 "Char table mapping characters to latin-iso8859-1 or mule-unicode-*. |
| 46496 | 69 |
|
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
70 If `unify-8859-on-encoding-mode' is non-nil, this table populates the |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
71 translation-table named `utf-translation-table-for-encode'.") |
| 46496 | 72 |
|
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
73 (define-translation-table 'utf-translation-table-for-encode) |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
74 |
| 46496 | 75 |
| 76 ;; Map Cyrillic and Greek to iso-8859 charsets, which take half the | |
| 77 ;; space of mule-unicode. For Latin scripts this isn't very | |
| 78 ;; important. Hebrew and Arabic might go here too when there's proper | |
| 79 ;; support for them. | |
|
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
80 |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
81 (defvar utf-fragmentation-table (make-char-table 'translation-table nil) |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
82 "Char-table normally mapping non-Latin mule-unicode-* chars to iso-8859-*. |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
83 |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
84 If `utf-fragment-on-decoding' is non-nil, this table populates the |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
85 translation-table named `utf-translation-table-for-decode'.") |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
86 |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
87 (defvar utf-defragmentation-table (make-char-table 'translation-table nil) |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
88 "Char-table for reverse mapping of `utf-fragmentation-table'. |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
89 |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
90 If `utf-fragment-on-decoding' is non-nil and |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
91 `unify-8859-on-encoding-mode' is nil, this table populates the |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
92 translation-table named `utf-translation-table-for-encode'.") |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
93 |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
94 (define-translation-table 'utf-translation-table-for-decode) |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
95 |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
96 |
| 48882 | 97 (defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq) |
|
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
98 "Hash table mapping Emacs CJK character sets to Unicode code points. |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
99 |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
100 If `utf-translate-cjk' is non-nil, this table populates the |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
101 translation-hash-table named `utf-subst-table-for-encode'.") |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
102 |
| 48882 | 103 (define-translation-hash-table 'utf-subst-table-for-encode |
| 104 ucs-mule-cjk-to-unicode) | |
|
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
105 |
| 48882 | 106 (defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq) |
|
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
107 "Hash table mapping Unicode code points to Emacs CJK character sets. |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
108 |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
109 If `utf-translate-cjk' is non-nil, this table populates the |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
110 translation-hash-table named `utf-subst-table-for-decode'.") |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
111 |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
112 (define-translation-hash-table 'utf-subst-table-for-decode |
| 48882 | 113 ucs-unicode-to-mule-cjk) |
|
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
114 |
| 46496 | 115 (mapc |
| 116 (lambda (pair) | |
|
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
117 (aset utf-fragmentation-table (car pair) (cdr pair)) |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
118 (aset utf-defragmentation-table (cdr pair) (car pair))) |
| 46496 | 119 '((?$,1&d(B . ?,F4(B) (?$,1&e(B . ?,F5(B) (?$,1&f(B . ?,F6(B) (?$,1&h(B . ?,F8(B) (?$,1&i(B . ?,F9(B) |
| 120 (?$,1&j(B . ?,F:(B) (?$,1&l(B . ?,F<(B) (?$,1&n(B . ?,F>(B) (?$,1&o(B . ?,F?(B) (?$,1&p(B . ?,F@(B) | |
| 121 (?$,1&q(B . ?,FA(B) (?$,1&r(B . ?,FB(B) (?$,1&s(B . ?,FC(B) (?$,1&t(B . ?,FD(B) (?$,1&u(B . ?,FE(B) | |
| 122 (?$,1&v(B . ?,FF(B) (?$,1&w(B . ?,FG(B) (?$,1&x(B . ?,FH(B) (?$,1&y(B . ?,FI(B) (?$,1&z(B . ?,FJ(B) | |
| 123 (?$,1&{(B . ?,FK(B) (?$,1&|(B . ?,FL(B) (?$,1&}(B . ?,FM(B) (?$,1&~(B . ?,FN(B) (?$,1&(B . ?,FO(B) | |
| 124 (?$,1' (B . ?,FP(B) (?$,1'!(B . ?,FQ(B) (?$,1'#(B . ?,FS(B) (?$,1'$(B . ?,FT(B) (?$,1'%(B . ?,FU(B) | |
| 125 (?$,1'&(B . ?,FV(B) (?$,1''(B . ?,FW(B) (?$,1'((B . ?,FX(B) (?$,1')(B . ?,FY(B) (?$,1'*(B . ?,FZ(B) | |
| 126 (?$,1'+(B . ?,F[(B) (?$,1',(B . ?,F\(B) (?$,1'-(B . ?,F](B) (?$,1'.(B . ?,F^(B) (?$,1'/(B . ?,F_(B) | |
| 127 (?$,1'0(B . ?,F`(B) (?$,1'1(B . ?,Fa(B) (?$,1'2(B . ?,Fb(B) (?$,1'3(B . ?,Fc(B) (?$,1'4(B . ?,Fd(B) | |
| 128 (?$,1'5(B . ?,Fe(B) (?$,1'6(B . ?,Ff(B) (?$,1'7(B . ?,Fg(B) (?$,1'8(B . ?,Fh(B) (?$,1'9(B . ?,Fi(B) | |
| 129 (?$,1':(B . ?,Fj(B) (?$,1';(B . ?,Fk(B) (?$,1'<(B . ?,Fl(B) (?$,1'=(B . ?,Fm(B) (?$,1'>(B . ?,Fn(B) | |
| 130 (?$,1'?(B . ?,Fo(B) (?$,1'@(B . ?,Fp(B) (?$,1'A(B . ?,Fq(B) (?$,1'B(B . ?,Fr(B) (?$,1'C(B . ?,Fs(B) | |
| 131 (?$,1'D(B . ?,Ft(B) (?$,1'E(B . ?,Fu(B) (?$,1'F(B . ?,Fv(B) (?$,1'G(B . ?,Fw(B) (?$,1'H(B . ?,Fx(B) | |
| 132 (?$,1'I(B . ?,Fy(B) (?$,1'J(B . ?,Fz(B) (?$,1'K(B . ?,F{(B) (?$,1'L(B . ?,F|(B) (?$,1'M(B . ?,F}(B) | |
| 133 (?$,1'N(B . ?,F~(B) | |
| 134 | |
| 135 (?$,1(!(B . ?,L!(B) (?$,1("(B . ?,L"(B) (?$,1(#(B . ?,L#(B) (?$,1($(B . ?,L$(B) | |
| 136 (?$,1(%(B . ?,L%(B) (?$,1(&(B . ?,L&(B) (?$,1('(B . ?,L'(B) (?$,1(((B . ?,L((B) (?$,1()(B . ?,L)(B) | |
| 137 (?$,1(*(B . ?,L*(B) (?$,1(+(B . ?,L+(B) (?$,1(,(B . ?,L,(B) (?$,1(.(B . ?,L.(B) (?$,1(/(B . ?,L/(B) | |
| 138 (?$,1(0(B . ?,L0(B) (?$,1(1(B . ?,L1(B) (?$,1(2(B . ?,L2(B) (?$,1(3(B . ?,L3(B) (?$,1(4(B . ?,L4(B) | |
| 139 (?$,1(5(B . ?,L5(B) (?$,1(6(B . ?,L6(B) (?$,1(7(B . ?,L7(B) (?$,1(8(B . ?,L8(B) (?$,1(9(B . ?,L9(B) | |
| 140 (?$,1(:(B . ?,L:(B) (?$,1(;(B . ?,L;(B) (?$,1(<(B . ?,L<(B) (?$,1(=(B . ?,L=(B) (?$,1(>(B . ?,L>(B) | |
| 141 (?$,1(?(B . ?,L?(B) (?$,1(@(B . ?,L@(B) (?$,1(A(B . ?,LA(B) (?$,1(B(B . ?,LB(B) (?$,1(C(B . ?,LC(B) | |
| 142 (?$,1(D(B . ?,LD(B) (?$,1(E(B . ?,LE(B) (?$,1(F(B . ?,LF(B) (?$,1(G(B . ?,LG(B) (?$,1(H(B . ?,LH(B) | |
| 143 (?$,1(I(B . ?,LI(B) (?$,1(J(B . ?,LJ(B) (?$,1(K(B . ?,LK(B) (?$,1(L(B . ?,LL(B) (?$,1(M(B . ?,LM(B) | |
| 144 (?$,1(N(B . ?,LN(B) (?$,1(O(B . ?,LO(B) (?$,1(P(B . ?,LP(B) (?$,1(Q(B . ?,LQ(B) (?$,1(R(B . ?,LR(B) | |
| 145 (?$,1(S(B . ?,LS(B) (?$,1(T(B . ?,LT(B) (?$,1(U(B . ?,LU(B) (?$,1(V(B . ?,LV(B) (?$,1(W(B . ?,LW(B) | |
| 146 (?$,1(X(B . ?,LX(B) (?$,1(Y(B . ?,LY(B) (?$,1(Z(B . ?,LZ(B) (?$,1([(B . ?,L[(B) (?$,1(\(B . ?,L\(B) | |
| 147 (?$,1(](B . ?,L](B) (?$,1(^(B . ?,L^(B) (?$,1(_(B . ?,L_(B) (?$,1(`(B . ?,L`(B) (?$,1(a(B . ?,La(B) | |
| 148 (?$,1(b(B . ?,Lb(B) (?$,1(c(B . ?,Lc(B) (?$,1(d(B . ?,Ld(B) (?$,1(e(B . ?,Le(B) (?$,1(f(B . ?,Lf(B) | |
| 149 (?$,1(g(B . ?,Lg(B) (?$,1(h(B . ?,Lh(B) (?$,1(i(B . ?,Li(B) (?$,1(j(B . ?,Lj(B) (?$,1(k(B . ?,Lk(B) | |
| 150 (?$,1(l(B . ?,Ll(B) (?$,1(m(B . ?,Lm(B) (?$,1(n(B . ?,Ln(B) (?$,1(o(B . ?,Lo(B) (?$,1(q(B . ?,Lq(B) | |
| 151 (?$,1(r(B . ?,Lr(B) (?$,1(s(B . ?,Ls(B) (?$,1(t(B . ?,Lt(B) (?$,1(u(B . ?,Lu(B) (?$,1(v(B . ?,Lv(B) | |
| 152 (?$,1(w(B . ?,Lw(B) (?$,1(x(B . ?,Lx(B) (?$,1(y(B . ?,Ly(B) (?$,1(z(B . ?,Lz(B) (?$,1({(B . ?,L{(B) | |
| 153 (?$,1(|(B . ?,L|(B) (?$,1(~(B . ?,L~(B) (?$,1((B . ?,L(B))) | |
| 154 | |
|
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
155 |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
156 (defcustom utf-fragment-on-decoding nil |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
157 "Whether or not to decode some chars in UTF-8/16 text into iso8859 charsets. |
| 46496 | 158 Setting this means that the relevant Cyrillic and Greek characters are |
| 159 decoded into the iso8859 charsets rather than into | |
| 47231 | 160 mule-unicode-0100-24ff. The iso8859 charsets take half as much space |
| 46496 | 161 in the buffer, but using them may affect how the buffer can be re-encoded |
| 162 and may require a different input method to search for them, for instance. | |
| 163 See `unify-8859-on-decoding-mode' and `unify-8859-on-encoding-mode' | |
| 47231 | 164 for mechanisms to make this largely transparent. |
| 165 | |
| 166 Setting this variable outside customize has no effect." | |
| 46496 | 167 :set (lambda (s v) |
|
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
168 (if v |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
169 (progn |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
170 (define-translation-table 'utf-translation-table-for-decode |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
171 utf-fragmentation-table) |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
172 ;; Even if unify-8859-on-encoding-mode is off, make |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
173 ;; mule-utf-* encode characters in |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
174 ;; utf-fragmentation-table. |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
175 (unless (eq (get 'utf-translation-table-for-encode |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
176 'translation-table) |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
177 ucs-mule-to-mule-unicode) |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
178 (define-translation-table 'utf-translation-table-for-encode |
|
50179
65bb5afb37ef
(utf-fragment-on-decoding): Don't call
Kenichi Handa <handa@m17n.org>
parents:
50085
diff
changeset
|
179 utf-defragmentation-table))) |
|
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
180 (define-translation-table 'utf-translation-table-for-decode) |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
181 ;; When unify-8859-on-encoding-mode is off, be sure to make |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
182 ;; mule-utf-* disabled for characters in |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
183 ;; utf-fragmentation-table. |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
184 (unless (eq (get 'utf-translation-table-for-encode |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
185 'translation-table) |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
186 ucs-mule-to-mule-unicode) |
|
50179
65bb5afb37ef
(utf-fragment-on-decoding): Don't call
Kenichi Handa <handa@m17n.org>
parents:
50085
diff
changeset
|
187 (define-translation-table 'utf-translation-table-for-encode))) |
| 46496 | 188 (set-default s v)) |
| 189 :version "21.4" | |
| 190 :type 'boolean | |
| 191 :group 'mule) | |
| 192 | |
|
50341
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
193 (define-minor-mode utf-translate-cjk-mode |
|
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
194 "Whether the UTF based coding systems should decode/encode CJK characters. |
| 48848 | 195 Enabling this loads tables which allow the coding systems mule-utf-8, |
|
51628
abfc7d48b476
(utf-translate-cjk-mode): Fix docstring.
Kenichi Handa <handa@m17n.org>
parents:
50766
diff
changeset
|
196 mule-utf-16le and mule-utf-16be to encode characters in the charsets |
| 48848 | 197 `korean-ksc5601', `chinese-gb2312', `chinese-big5-1', |
| 198 `chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to | |
| 199 decode the corresponding unicodes into such characters. | |
| 46496 | 200 |
| 48848 | 201 Where the charsets overlap, the one preferred for decoding is chosen |
| 202 according to the language environment in effect when this option is | |
| 203 turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for | |
| 204 Chinese-Big5 and jisx for other environments. | |
| 205 | |
| 206 The tables are large (over 40000 entries), so this option is not the | |
| 207 default. Also, installing them may be rather slow." | |
|
50341
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
208 :init-value nil |
| 46496 | 209 :version "21.4" |
| 210 :type 'boolean | |
| 48848 | 211 :set-after '(current-language-environment) |
|
50341
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
212 :group 'mule |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
213 :global t |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
214 (if utf-translate-cjk-mode |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
215 ;; Fixme: Allow the use of the CJK charsets to be |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
216 ;; customized by reordering and possible omission. |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
217 (progn |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
218 ;; Redefine them with realistic initial sizes and a |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
219 ;; smallish rehash size to avoid wasting significant |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
220 ;; space after they're built. |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
221 (setq ucs-mule-cjk-to-unicode |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
222 (make-hash-table :test 'eq :size 43000 :rehash-size 1000) |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
223 ucs-unicode-to-mule-cjk |
|
52284
29c8180a975d
(ucs-unicode-to-mule-cjk): Use smaller
Dave Love <fx@gnu.org>
parents:
51628
diff
changeset
|
224 (make-hash-table :test 'eq :size 21500 :rehash-size 1000)) |
|
50341
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
225 ;; Load the files explicitly, to avoid having to keep |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
226 ;; around the large tables they contain (as well as the |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
227 ;; ones which get built). |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
228 (cond |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
229 ((string= "Korean" current-language-environment) |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
230 (load "subst-jis") |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
231 (load "subst-big5") |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
232 (load "subst-gb2312") |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
233 (load "subst-ksc")) |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
234 ((string= "Chinese-BIG5" current-language-environment) |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
235 (load "subst-jis") |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
236 (load "subst-ksc") |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
237 (load "subst-gb2312") |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
238 (load "subst-big5")) |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
239 ((string= "Chinese-GB" current-language-environment) |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
240 (load "subst-jis") |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
241 (load "subst-ksc") |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
242 (load "subst-big5") |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
243 (load "subst-gb2312")) |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
244 (t |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
245 (load "subst-ksc") |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
246 (load "subst-gb2312") |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
247 (load "subst-big5") |
|
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Gro?johann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
248 (load "subst-jis"))) ; jis covers as much as big5, gb2312 |
|
50766
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
249 (define-translation-hash-table 'utf-subst-table-for-decode |
|
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
250 ucs-unicode-to-mule-cjk) |
|
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
251 (define-translation-hash-table 'utf-subst-table-for-encode |
|
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
252 ucs-mule-cjk-to-unicode) |
|
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
253 (set-char-table-extra-slot (get 'utf-translation-table-for-encode |
|
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
254 'translation-table) |
|
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
255 1 ucs-mule-cjk-to-unicode)) |
|
50549
c8525ac04d76
(utf-translate-cjk-mode): Fix incorrect
Kenichi Handa <handa@m17n.org>
parents:
50341
diff
changeset
|
256 (define-translation-hash-table 'utf-subst-table-for-decode |
|
c8525ac04d76
(utf-translate-cjk-mode): Fix incorrect
Kenichi Handa <handa@m17n.org>
parents:
50341
diff
changeset
|
257 (make-hash-table :test 'eq)) |
|
c8525ac04d76
(utf-translate-cjk-mode): Fix incorrect
Kenichi Handa <handa@m17n.org>
parents:
50341
diff
changeset
|
258 (define-translation-hash-table 'utf-subst-table-for-encode |
|
50766
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
259 (make-hash-table :test 'eq)) |
|
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
260 (set-char-table-extra-slot (get 'utf-translation-table-for-encode |
|
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
261 'translation-table) |
|
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
262 1 nil))) |
| 46496 | 263 |
| 35542 | 264 (define-ccl-program ccl-decode-mule-utf-8 |
| 265 ;; | |
| 266 ;; charset | bytes in utf-8 | bytes in emacs | |
| 267 ;; -----------------------+----------------+--------------- | |
| 268 ;; ascii | 1 | 1 | |
| 269 ;; -----------------------+----------------+--------------- | |
| 270 ;; eight-bit-control | 2 | 2 | |
| 41873 | 271 ;; eight-bit-graphic | 2 | 1 |
| 35542 | 272 ;; latin-iso8859-1 | 2 | 2 |
| 273 ;; -----------------------+----------------+--------------- | |
| 274 ;; mule-unicode-0100-24ff | 2 | 4 | |
| 275 ;; (< 0800) | | | |
| 276 ;; -----------------------+----------------+--------------- | |
| 277 ;; mule-unicode-0100-24ff | 3 | 4 | |
| 278 ;; (>= 0800) | | | |
| 279 ;; mule-unicode-2500-33ff | 3 | 4 | |
| 280 ;; mule-unicode-e000-ffff | 3 | 4 | |
| 281 ;; | |
| 282 ;; Thus the magnification factor is two. | |
| 283 ;; | |
| 284 `(2 | |
|
37934
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
285 ((r5 = ,(charset-id 'eight-bit-control)) |
|
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
286 (r6 = ,(charset-id 'eight-bit-graphic)) |
|
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
287 (loop |
|
50085
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
288 (r0 = -1) |
| 35542 | 289 (read r0) |
| 290 | |
| 291 ;; 1byte encoding, i.e., ascii | |
| 292 (if (r0 < #x80) | |
|
50085
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
293 ((write r0)) |
| 46496 | 294 (if (r0 < #xc0) ; continuation byte (invalid here) |
|
50085
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
295 ((if (r0 < #xa0) |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
296 (write-multibyte-character r5 r0) |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
297 (write-multibyte-character r6 r0))) |
| 46496 | 298 ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx |
| 299 (if (r0 < #xe0) | |
|
50085
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
300 ((r1 = -1) |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
301 (read r1) |
| 35542 | 302 |
| 46496 | 303 (if ((r1 & #b11000000) != #b10000000) |
| 304 ;; Invalid 2-byte sequence | |
|
37934
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
305 ((if (r0 < #xa0) |
|
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
306 (write-multibyte-character r5 r0) |
|
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
307 (write-multibyte-character r6 r0)) |
|
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
308 (if (r1 < #x80) |
|
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
309 (write r1) |
|
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
310 (if (r1 < #xa0) |
|
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
311 (write-multibyte-character r5 r1) |
| 46496 | 312 (write-multibyte-character r6 r1)))) |
| 313 | |
| 314 ((r3 = r0) ; save in case of overlong sequence | |
| 315 (r2 = r1) | |
| 316 (r0 &= #x1f) | |
| 317 (r0 <<= 6) | |
| 318 (r1 &= #x3f) | |
| 319 (r1 += r0) | |
| 320 ;; Now r1 holds scalar value | |
| 321 | |
| 322 (if (r1 < 128) ; `overlong sequence' | |
| 323 ((if (r3 < #xa0) | |
| 324 (write-multibyte-character r5 r3) | |
| 325 (write-multibyte-character r6 r3)) | |
| 326 (if (r2 < #x80) | |
| 327 (write r2) | |
| 328 (if (r2 < #xa0) | |
| 329 (write-multibyte-character r5 r2) | |
| 330 (write-multibyte-character r6 r2)))) | |
| 331 | |
| 332 ;; eight-bit-control | |
| 333 (if (r1 < 160) | |
| 334 ((write-multibyte-character r5 r1)) | |
| 335 | |
| 336 ;; latin-iso8859-1 | |
| 337 (if (r1 < 256) | |
| 338 ((r0 = ,(charset-id 'latin-iso8859-1)) | |
| 339 (r1 -= 128) | |
| 340 (write-multibyte-character r0 r1)) | |
| 341 | |
| 342 ;; mule-unicode-0100-24ff (< 0800) | |
| 343 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
| 344 (r1 -= #x0100) | |
| 345 (r2 = (((r1 / 96) + 32) << 7)) | |
| 346 (r1 %= 96) | |
| 347 (r1 += (r2 + 32)) | |
| 348 (translate-character | |
|
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
349 utf-translation-table-for-decode r0 r1) |
| 46496 | 350 (write-multibyte-character r0 r1)))))))) |
| 351 | |
| 352 ;; 3byte encoding | |
| 353 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx | |
| 354 (if (r0 < #xf0) | |
|
50085
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
355 ((r1 = -1) |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
356 (r2 = -1) |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
357 (read r1 r2) |
| 46496 | 358 |
| 359 ;; This is set to 1 if the encoding is invalid. | |
| 360 (r4 = 0) | |
| 361 | |
| 362 (r3 = (r1 & #b11000000)) | |
| 363 (r3 |= ((r2 >> 2) & #b00110000)) | |
| 364 (if (r3 != #b10100000) | |
| 365 (r4 = 1) | |
| 366 ((r3 = ((r0 & #x0f) << 12)) | |
| 367 (r3 += ((r1 & #x3f) << 6)) | |
| 368 (r3 += (r2 & #x3f)) | |
| 369 (if (r3 < #x0800) | |
| 370 (r4 = 1)))) | |
| 371 | |
| 372 (if (r4 != 0) | |
| 373 ;; Invalid 3-byte sequence | |
| 374 ((if (r0 < #xa0) | |
| 375 (write-multibyte-character r5 r0) | |
| 376 (write-multibyte-character r6 r0)) | |
| 377 (if (r1 < #x80) | |
| 378 (write r1) | |
| 379 (if (r1 < #xa0) | |
| 380 (write-multibyte-character r5 r1) | |
| 381 (write-multibyte-character r6 r1))) | |
| 382 (if (r2 < #x80) | |
| 383 (write r2) | |
| 384 (if (r2 < #xa0) | |
| 385 (write-multibyte-character r5 r2) | |
| 386 (write-multibyte-character r6 r2)))) | |
|
49598
0d8b17d428b5
Trailing whitepace deleted.
Juanma Barranquero <lekktu@gmail.com>
parents:
49028
diff
changeset
|
387 |
| 46496 | 388 ;; mule-unicode-0100-24ff (>= 0800) |
| 389 ((if (r3 < #x2500) | |
| 390 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
| 391 (r3 -= #x0100) | |
|
37934
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
392 (r3 //= 96) |
|
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
393 (r1 = (r7 + 32)) |
|
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
394 (r1 += ((r3 + 32) << 7)) |
| 46496 | 395 (translate-character |
|
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
396 utf-translation-table-for-decode r0 r1) |
|
37934
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
397 (write-multibyte-character r0 r1)) |
|
49598
0d8b17d428b5
Trailing whitepace deleted.
Juanma Barranquero <lekktu@gmail.com>
parents:
49028
diff
changeset
|
398 |
| 46496 | 399 ;; mule-unicode-2500-33ff |
| 400 (if (r3 < #x3400) | |
| 48848 | 401 ((r4 = r3) ; don't zap r3 |
| 402 (lookup-integer utf-subst-table-for-decode r4 r5) | |
| 403 (if r7 | |
| 404 ;; got a translation | |
| 405 ((write-multibyte-character r4 r5) | |
| 406 ;; Zapped through register starvation. | |
| 407 (r5 = ,(charset-id 'eight-bit-control))) | |
| 408 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) | |
| 409 (r3 -= #x2500) | |
| 410 (r3 //= 96) | |
| 411 (r1 = (r7 + 32)) | |
| 412 (r1 += ((r3 + 32) << 7)) | |
| 413 (write-multibyte-character r0 r1)))) | |
| 46496 | 414 |
| 415 ;; U+3400 .. U+D7FF | |
| 416 ;; Try to convert to CJK chars, else keep | |
| 417 ;; them as eight-bit-{control|graphic}. | |
| 418 (if (r3 < #xd800) | |
| 419 ((r4 = r3) ; don't zap r3 | |
|
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
420 (lookup-integer utf-subst-table-for-decode r4 r5) |
| 46496 | 421 (if r7 |
| 422 ;; got a translation | |
| 423 ((write-multibyte-character r4 r5) | |
| 424 ;; Zapped through register starvation. | |
| 425 (r5 = ,(charset-id 'eight-bit-control))) | |
| 426 ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic | |
| 427 ((r3 = r6) | |
| 428 (write-multibyte-character r3 r0) | |
| 429 (if (r1 < #xa0) | |
| 430 (r3 = r5)) | |
| 431 (write-multibyte-character r3 r1) | |
| 432 (if (r2 < #xa0) | |
| 433 (r3 = r5) | |
| 434 (r3 = r6)) | |
| 435 (write-multibyte-character r3 r2)))) | |
| 436 | |
| 437 ;; Surrogates, U+D800 .. U+DFFF | |
| 438 (if (r3 < #xe000) | |
| 439 ((r3 = r6) | |
| 440 (write-multibyte-character r3 r0) ; eight-bit-graphic | |
| 441 (if (r1 < #xa0) | |
| 442 (r3 = r5)) | |
| 443 (write-multibyte-character r3 r1) | |
| 444 (if (r2 < #xa0) | |
| 445 (r3 = r5) | |
| 446 (r3 = r6)) | |
| 447 (write-multibyte-character r3 r2)) | |
|
49598
0d8b17d428b5
Trailing whitepace deleted.
Juanma Barranquero <lekktu@gmail.com>
parents:
49028
diff
changeset
|
448 |
| 46496 | 449 ;; mule-unicode-e000-ffff |
| 450 ;; Fixme: fffe and ffff are invalid. | |
|
52725
dd3cff7e93d7
(ccl-decode-mule-utf-8): Lookup
Kenichi Handa <handa@m17n.org>
parents:
52520
diff
changeset
|
451 ((r4 = r3) ; don't zap r3 |
|
dd3cff7e93d7
(ccl-decode-mule-utf-8): Lookup
Kenichi Handa <handa@m17n.org>
parents:
52520
diff
changeset
|
452 (lookup-integer utf-subst-table-for-decode r4 r5) |
|
dd3cff7e93d7
(ccl-decode-mule-utf-8): Lookup
Kenichi Handa <handa@m17n.org>
parents:
52520
diff
changeset
|
453 (if r7 |
|
dd3cff7e93d7
(ccl-decode-mule-utf-8): Lookup
Kenichi Handa <handa@m17n.org>
parents:
52520
diff
changeset
|
454 ;; got a translation |
|
dd3cff7e93d7
(ccl-decode-mule-utf-8): Lookup
Kenichi Handa <handa@m17n.org>
parents:
52520
diff
changeset
|
455 ((write-multibyte-character r4 r5) |
|
dd3cff7e93d7
(ccl-decode-mule-utf-8): Lookup
Kenichi Handa <handa@m17n.org>
parents:
52520
diff
changeset
|
456 ;; Zapped through register starvation. |
|
dd3cff7e93d7
(ccl-decode-mule-utf-8): Lookup
Kenichi Handa <handa@m17n.org>
parents:
52520
diff
changeset
|
457 (r5 = ,(charset-id 'eight-bit-control))) |
|
dd3cff7e93d7
(ccl-decode-mule-utf-8): Lookup
Kenichi Handa <handa@m17n.org>
parents:
52520
diff
changeset
|
458 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) |
|
dd3cff7e93d7
(ccl-decode-mule-utf-8): Lookup
Kenichi Handa <handa@m17n.org>
parents:
52520
diff
changeset
|
459 (r3 -= #xe000) |
|
dd3cff7e93d7
(ccl-decode-mule-utf-8): Lookup
Kenichi Handa <handa@m17n.org>
parents:
52520
diff
changeset
|
460 (r3 //= 96) |
|
dd3cff7e93d7
(ccl-decode-mule-utf-8): Lookup
Kenichi Handa <handa@m17n.org>
parents:
52520
diff
changeset
|
461 (r1 = (r7 + 32)) |
|
dd3cff7e93d7
(ccl-decode-mule-utf-8): Lookup
Kenichi Handa <handa@m17n.org>
parents:
52520
diff
changeset
|
462 (r1 += ((r3 + 32) << 7)) |
|
dd3cff7e93d7
(ccl-decode-mule-utf-8): Lookup
Kenichi Handa <handa@m17n.org>
parents:
52520
diff
changeset
|
463 (write-multibyte-character r0 r1))))))))))) |
| 35542 | 464 |
| 46496 | 465 (if (r0 < #xfe) |
| 466 ;; 4byte encoding | |
| 467 ;; keep those bytes as eight-bit-{control|graphic} | |
|
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
468 ;; Fixme: allow lookup in utf-subst-table-for-decode. |
|
50085
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
469 ((r1 = -1) |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
470 (r2 = -1) |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
471 (r3 = -1) |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
472 (read r1 r2 r3) |
| 46496 | 473 ;; r0 >= #xf0, thus eight-bit-graphic |
| 474 (write-multibyte-character r6 r0) | |
| 475 (if (r1 < #xa0) | |
| 476 (if (r1 < #x80) ; invalid byte | |
| 477 (write r1) | |
| 478 (write-multibyte-character r5 r1)) | |
| 479 (write-multibyte-character r6 r1)) | |
| 480 (if (r2 < #xa0) | |
| 481 (if (r2 < #x80) ; invalid byte | |
| 482 (write r2) | |
| 483 (write-multibyte-character r5 r2)) | |
| 484 (write-multibyte-character r6 r2)) | |
| 485 (if (r3 < #xa0) | |
| 486 (if (r3 < #x80) ; invalid byte | |
| 487 (write r3) | |
| 488 (write-multibyte-character r5 r3)) | |
| 489 (write-multibyte-character r6 r3)) | |
| 490 (if (r0 >= #xf8) ; 5- or 6-byte encoding | |
|
50207
b49563831bd2
(ccl-decode-mule-utf-8): Fix a bug of
Kenichi Handa <handa@m17n.org>
parents:
50179
diff
changeset
|
491 ((r0 = -1) |
|
b49563831bd2
(ccl-decode-mule-utf-8): Fix a bug of
Kenichi Handa <handa@m17n.org>
parents:
50179
diff
changeset
|
492 (read r0) |
|
b49563831bd2
(ccl-decode-mule-utf-8): Fix a bug of
Kenichi Handa <handa@m17n.org>
parents:
50179
diff
changeset
|
493 (if (r0 < #xa0) |
|
b49563831bd2
(ccl-decode-mule-utf-8): Fix a bug of
Kenichi Handa <handa@m17n.org>
parents:
50179
diff
changeset
|
494 (if (r0 < #x80) ; invalid byte |
|
b49563831bd2
(ccl-decode-mule-utf-8): Fix a bug of
Kenichi Handa <handa@m17n.org>
parents:
50179
diff
changeset
|
495 (write r0) |
|
b49563831bd2
(ccl-decode-mule-utf-8): Fix a bug of
Kenichi Handa <handa@m17n.org>
parents:
50179
diff
changeset
|
496 (write-multibyte-character r5 r0)) |
|
b49563831bd2
(ccl-decode-mule-utf-8): Fix a bug of
Kenichi Handa <handa@m17n.org>
parents:
50179
diff
changeset
|
497 (write-multibyte-character r6 r0)) |
| 46496 | 498 (if (r0 >= #xfc) ; 6-byte |
|
50207
b49563831bd2
(ccl-decode-mule-utf-8): Fix a bug of
Kenichi Handa <handa@m17n.org>
parents:
50179
diff
changeset
|
499 ((r0 = -1) |
|
b49563831bd2
(ccl-decode-mule-utf-8): Fix a bug of
Kenichi Handa <handa@m17n.org>
parents:
50179
diff
changeset
|
500 (read r0) |
|
b49563831bd2
(ccl-decode-mule-utf-8): Fix a bug of
Kenichi Handa <handa@m17n.org>
parents:
50179
diff
changeset
|
501 (if (r0 < #xa0) |
|
b49563831bd2
(ccl-decode-mule-utf-8): Fix a bug of
Kenichi Handa <handa@m17n.org>
parents:
50179
diff
changeset
|
502 (if (r0 < #x80) ; invalid byte |
|
b49563831bd2
(ccl-decode-mule-utf-8): Fix a bug of
Kenichi Handa <handa@m17n.org>
parents:
50179
diff
changeset
|
503 (write r0) |
|
b49563831bd2
(ccl-decode-mule-utf-8): Fix a bug of
Kenichi Handa <handa@m17n.org>
parents:
50179
diff
changeset
|
504 (write-multibyte-character r5 r0)) |
|
b49563831bd2
(ccl-decode-mule-utf-8): Fix a bug of
Kenichi Handa <handa@m17n.org>
parents:
50179
diff
changeset
|
505 (write-multibyte-character r6 r0))))))) |
| 46496 | 506 ;; else invalid byte >= #xfe |
| 507 (write-multibyte-character r6 r0)))))) | |
|
50085
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
508 (repeat))) |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
509 |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
510 ;; At EOF... |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
511 (if (r0 >= 0) |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
512 ((if (r0 < #x80) |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
513 (write r0) |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
514 (if (r0 < #xa0) |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
515 (write-multibyte-character r5 r0) |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
516 ((write-multibyte-character r6 r0)))) |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
517 (if (r1 >= 0) |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
518 ((if (r1 < #x80) |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
519 (write r1) |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
520 (if (r1 < #xa0) |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
521 (write-multibyte-character r5 r1) |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
522 ((write-multibyte-character r6 r1)))) |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
523 (if (r2 >= 0) |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
524 ((if (r2 < #x80) |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
525 (write r2) |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
526 (if (r2 < #xa0) |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
527 (write-multibyte-character r5 r2) |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
528 ((write-multibyte-character r6 r2)))) |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
529 (if (r3 >= 0) |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
530 (if (r3 < #x80) |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
531 (write r3) |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
532 (if (r3 < #xa0) |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
533 (write-multibyte-character r5 r3) |
|
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
534 ((write-multibyte-character r6 r3)))))))))))) |
| 35542 | 535 |
| 36243 | 536 "CCL program to decode UTF-8. |
| 36465 | 537 Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
|
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
538 mule-unicode-*, but see also `utf-fragmentation-table' and |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
539 `ucs-mule-cjk-to-unicode'. |
| 46496 | 540 Encodings of un-representable Unicode characters are decoded asis into |
| 541 eight-bit-control and eight-bit-graphic characters.") | |
| 35542 | 542 |
| 543 (define-ccl-program ccl-encode-mule-utf-8 | |
| 544 `(1 | |
|
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
545 ((r5 = -1) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
546 (loop |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
547 (if (r5 < 0) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
548 ((r1 = -1) |
| 41873 | 549 (read-multibyte-character r0 r1) |
|
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
550 (translate-character utf-translation-table-for-encode r0 r1)) |
|
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
551 (;; We have already done read-multibyte-character. |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
552 (r0 = r5) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
553 (r1 = r6) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
554 (r5 = -1))) |
| 35542 | 555 |
|
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
556 (if (r0 == ,(charset-id 'ascii)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
557 (write r1) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
558 |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
559 (if (r0 == ,(charset-id 'latin-iso8859-1)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
560 ;; r1 scalar utf-8 |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
561 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
562 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000 |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
563 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111 |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
564 ((r0 = (((r1 & #x40) >> 6) | #xc2)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
565 (r1 &= #x3f) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
566 (r1 |= #x80) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
567 (write r0 r1)) |
| 35542 | 568 |
|
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
569 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
570 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
571 ;; #x3f80 == (0011 1111 1000 0000)b |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
572 (r1 &= #x7f) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
573 (r1 += (r0 + 224)) ; 224 == -32 + #x0100 |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
574 ;; now r1 holds scalar value |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
575 (if (r1 < #x0800) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
576 ;; 2byte encoding |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
577 ((r0 = (((r1 & #x07c0) >> 6) | #xc0)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
578 ;; #x07c0 == (0000 0111 1100 0000)b |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
579 (r1 &= #x3f) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
580 (r1 |= #x80) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
581 (write r0 r1)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
582 ;; 3byte encoding |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
583 ((r0 = (((r1 & #xf000) >> 12) | #xe0)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
584 (r2 = ((r1 & #x3f) | #x80)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
585 (r1 &= #x0fc0) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
586 (r1 >>= 6) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
587 (r1 |= #x80) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
588 (write r0 r1 r2)))) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
589 |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
590 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
591 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
592 (r1 &= #x7f) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
593 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500 |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
594 (r0 = (((r1 & #xf000) >> 12) | #xe0)) |
| 35542 | 595 (r2 = ((r1 & #x3f) | #x80)) |
| 596 (r1 &= #x0fc0) | |
| 597 (r1 >>= 6) | |
| 598 (r1 |= #x80) | |
|
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
599 (write r0 r1 r2)) |
| 35542 | 600 |
|
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
601 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
602 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
603 (r1 &= #x7f) |
| 46496 | 604 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000 |
|
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
605 (r0 = (((r1 & #xf000) >> 12) | #xe0)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
606 (r2 = ((r1 & #x3f) | #x80)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
607 (r1 &= #x0fc0) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
608 (r1 >>= 6) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
609 (r1 |= #x80) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
610 (write r0 r1 r2)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
611 |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
612 (if (r0 == ,(charset-id 'eight-bit-control)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
613 ;; r1 scalar utf-8 |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
614 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
615 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000 |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
616 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111 |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
617 ((write #xc2) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
618 (write r1)) |
| 35542 | 619 |
|
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
620 (if (r0 == ,(charset-id 'eight-bit-graphic)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
621 ;; r1 scalar utf-8 |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
622 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
623 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000 |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
624 ;; ff 0000 0000 1111 1111 1100 0011 1011 1111 |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
625 ((write r1) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
626 (r1 = -1) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
627 (read-multibyte-character r0 r1) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
628 (if (r0 != ,(charset-id 'eight-bit-graphic)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
629 (if (r0 != ,(charset-id 'eight-bit-control)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
630 ((r5 = r0) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
631 (r6 = r1)))) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
632 (if (r5 < 0) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
633 ((read-multibyte-character r0 r2) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
634 (if (r0 != ,(charset-id 'eight-bit-graphic)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
635 (if (r0 != ,(charset-id 'eight-bit-control)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
636 ((r5 = r0) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
637 (r6 = r2)))) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
638 (if (r5 < 0) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
639 (write r1 r2) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
640 (if (r1 < #xa0) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
641 (write r1) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
642 ((write #xc2) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
643 (write r1))))))) |
| 35542 | 644 |
|
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
645 ((lookup-character utf-subst-table-for-encode r0 r1) |
| 46496 | 646 (if r7 ; lookup succeeded |
| 647 ((r1 = (((r0 & #xf000) >> 12) | #xe0)) | |
| 648 (r2 = ((r0 & #x3f) | #x80)) | |
| 649 (r0 &= #x0fc0) | |
| 650 (r0 >>= 6) | |
| 651 (r0 |= #x80) | |
| 652 (write r1 r0 r2)) | |
| 653 ;; Unsupported character. | |
| 654 ;; Output U+FFFD, which is `ef bf bd' in UTF-8. | |
| 655 ((write #xef) | |
| 656 (write #xbf) | |
| 657 (write #xbd))))))))))) | |
|
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
658 (repeat))) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
659 (if (r1 >= #xa0) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
660 (write r1) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
661 (if (r1 >= #x80) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
662 ((write #xc2) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
663 (write r1))))) |
| 35542 | 664 |
| 46496 | 665 "CCL program to encode into UTF-8.") |
| 35542 | 666 |
| 41873 | 667 |
| 46496 | 668 (define-ccl-program ccl-untranslated-to-ucs |
| 669 `(0 | |
| 670 (if (r0 < #xf0) ; 3-byte encoding, as above (r0 is not checked to be >= #xe0 here) | |
| 671 ((r4 = 0) ; r4 is the error flag; nonzero means invalid | |
| 672 (r3 = (r1 & #b11000000)) ; top two bits of r1 | |
| 673 (r3 |= ((r2 >> 2) & #b00110000)) ; top two bits of r2, moved to bits 5-4 | |
| 674 (if (r3 != #b10100000) ; r1 and r2 must both start with #b10 | |
| 675 (r4 = 1) | |
| 676 ((r3 = ((r0 & #x0f) << 12)) ; assemble the code point in r3 | |
| 677 (r3 += ((r1 & #x3f) << 6)) | |
| 678 (r3 += (r2 & #x3f)) | |
| 679 (if (r3 < #x0800) ; reject values below #x0800 (would be an overlong 3-byte form) | |
| 680 (r4 = 1)))) | |
| 681 (if (r4 != 0) | |
| 682 (r0 = 0) | |
| 683 (r0 = r3))) | |
| 684 (if (r0 < #xf8) ; 4-byte (Mule-UCS recipe) | |
| 685 ((r4 = (r1 >> 6)) ; r1, r2, r3 are each checked in turn for top bits #b10 | |
| 686 (if (r4 != #b10) | |
| 687 (r0 = 0) | |
| 688 ((r4 = (r2 >> 6)) | |
| 689 (if (r4 != #b10) | |
| 690 (r0 = 0) | |
| 691 ((r4 = (r3 >> 6)) | |
| 692 (if (r4 != #b10) | |
| 693 (r0 = 0) | |
| 694 ((r1 = ((r1 & #x3F) << 12)) | |
| 695 (r2 = ((r2 & #x3F) << 6)) | |
| 696 (r3 &= #x3F) | |
| 697 (r0 = (((((r0 & #x07) << 18) | r1) | r2) | r3))))))))) | |
| 698 (r0 = 0)))) ; r0 >= #xf8: reported as invalid | |
| 699 "Decode 3- or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0. | |
| 700 r0 == 0 for invalid sequence.") | |
| 701 | |
| 702 (defvar utf-8-ccl-regs (make-vector 8 0)) ; register vector handed to `ccl-execute' by `utf-8-untranslated-to-ucs' | |
| 703 | |
| 41873 | 704 (defsubst utf-8-untranslated-to-ucs () |
| 46496 | 705 "Return the UCS code for an untranslated sequence of raw bytes at point. |
| 706 Only for 3- or 4-byte sequences." | |
| 707 (aset utf-8-ccl-regs 0 (or (char-after) 0)) ; a nil `char-after' (no character there) is stored as 0 | |
| 708 (aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0)) | |
| 709 (aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0)) | |
| 710 (aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0)) | |
| 711 (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs) | |
| 712 (aref utf-8-ccl-regs 0)) ; r0 holds the decoded value, 0 for an invalid sequence | |
| 41873 | 713 |
| 714 (defun utf-8-help-echo (window object position) ; builds help text from the `untranslated-utf-8' property; WINDOW is unused | |
| 715 (format "Untranslated Unicode U+%04X" | |
| 716 (get-char-property position 'untranslated-utf-8 object))) | |
| 717 | |
| 718 ;; We compose the untranslatable sequences into a single character. | |
| 719 ;; This is infelicitous for editing, because there's currently no | |
| 720 ;; mechanism for treating compositions as atomic, but is OK for | |
| 46496 | 721 ;; display. They are composed to U+FFFD with help-echo which |
| 722 ;; indicates the unicodes they represent. This function GCs too much. | |
| 41873 | 723 (defsubst utf-8-compose () |
| 724 "Put a suitable composition on an untranslatable sequence. | |
| 725 Return the sequence's length, or nil if the sequence at point is invalid." | |
| 726 (let* ((u (utf-8-untranslated-to-ucs)) | |
| 46496 | 727 (l (unless (zerop u) |
| 728 (if (>= u #x10000) ; NOTE(review): length is inferred from the decoded value, so a 4-byte sequence decoding below #x10000 would be treated as 3 bytes -- confirm | |
| 41873 | 729 4 |
| 46496 | 730 3)))) |
| 731 (when l | |
| 41873 | 732 (put-text-property (point) (min (point-max) (+ l (point))) |
| 733 'untranslated-utf-8 u) | |
| 46496 | 734 (put-text-property (point) (min (point-max) (+ l (point))) |
| 735 'help-echo 'utf-8-help-echo) | |
| 736 (compose-region (point) (+ l (point)) ?$,3u=(B) | |
| 41873 | 737 l))) |
| 738 | |
| 739 (defcustom utf-8-compose-scripts nil ; tested by `utf-8-post-read-conversion' | |
| 46496 | 740 "*Non-nil means compose various scripts on decoding utf-8 text." |
| 41873 | 741 :group 'mule |
| 46496 | 742 :version "21.4" |
| 743 :type 'boolean) | |
| 41873 | 744 |
| 745 (defun utf-8-post-read-conversion (length) ; LENGTH is presumably the character count of the just-decoded text at point -- confirm | |
| 746 "Compose untranslated utf-8 sequences into single characters. | |
| 747 Also compose particular scripts if `utf-8-compose-scripts' is non-nil." | |
| 748 (save-excursion | |
| 749 ;; Can't do eval-when-compile to insert a multibyte constant | |
| 750 ;; version of the string in the loop, since it's always loaded as | |
| 751 ;; unibyte from a byte-compiled file. | |
| 46496 | 752 (let ((range (string-as-multibyte "^\xe1-\xf7"))) ; skip set: everything except #xe1..#xf7 |
| 753 (while (and (skip-chars-forward range) ; NOTE(review): bounded by end of buffer, not by LENGTH -- confirm intended | |
| 41873 | 754 (not (eobp))) |
| 755 (forward-char (utf-8-compose))))) ; a nil result (invalid sequence) makes `forward-char' advance one character | |
| 46496 | 756 ;; Fixme: Takahashi-san implies it may not work this easily. I |
| 757 ;; asked why but didn't get a reply. -- fx | |
| 41873 | 758 (when (and utf-8-compose-scripts (> length 1)) |
| 759 ;; These currently have definitions which cover the relevant | |
| 46496 | 760 ;; unicodes. We could avoid loading thai-util &c by checking |
| 41873 | 761 ;; whether the region contains any characters with the appropriate |
| 762 ;; categories. There aren't yet Unicode-based rules for Tibetan. | |
| 763 (save-excursion (setq length (diacritic-post-read-conversion length))) | |
| 764 (save-excursion (setq length (thai-post-read-conversion length))) | |
| 765 (save-excursion (setq length (lao-post-read-conversion length))) | |
|
52520
f5d5daea4d3c
(utf-8-post-read-conversion): Call post-read-conversion functions for
Kenichi Handa <handa@m17n.org>
parents:
52401
diff
changeset
|
766 (save-excursion (setq length (devanagari-post-read-conversion length))) |
|
f5d5daea4d3c
(utf-8-post-read-conversion): Call post-read-conversion functions for
Kenichi Handa <handa@m17n.org>
parents:
52401
diff
changeset
|
767 (save-excursion (setq length (malayalam-post-read-conversion length))) |
|
f5d5daea4d3c
(utf-8-post-read-conversion): Call post-read-conversion functions for
Kenichi Handa <handa@m17n.org>
parents:
52401
diff
changeset
|
768 (save-excursion (setq length (tamil-post-read-conversion length)))) |
| 41873 | 769 length) ; value of LENGTH as last set by the script conversions above, if any ran |
| 770 | |
| 46496 | 771 ;; ucs-tables is preloaded |
| 772 ;; (defun utf-8-pre-write-conversion (beg end) | |
| 773 ;; "Semi-dummy pre-write function effectively to autoload ucs-tables." | |
|
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
774 ;; ;; Ensure translation-table is loaded. |
| 46496 | 775 ;; (require 'ucs-tables) |
| 776 ;; ;; Don't do this again. | |
| 777 ;; (coding-system-put 'mule-utf-8 'pre-write-conversion nil) | |
| 778 ;; nil) | |
| 41873 | 779 |
| 35542 | 780 (make-coding-system |
| 781 'mule-utf-8 4 ?u ; name, coding type (4 is presumably the CCL-based type -- confirm), mnemonic character | |
| 782 "UTF-8 encoding for Emacs-supported Unicode characters. | |
|
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
783 It supports Unicode characters of these ranges: |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
784 U+0000..U+33FF, U+E000..U+FFFF. |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
785 They correspond to these Emacs character sets: |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
786 ascii, latin-iso8859-1, mule-unicode-0100-24ff, |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
787 mule-unicode-2500-33ff, mule-unicode-e000-ffff |
| 35542 | 788 |
|
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
789 On decoding (e.g. reading a file), Unicode characters not in the above |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
790 ranges are decoded into sequences of eight-bit-control and |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
791 eight-bit-graphic characters to preserve their byte sequences. The |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
792 byte sequence is preserved on i/o for valid utf-8, but not necessarily |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
793 for invalid utf-8. |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
794 |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
795 On encoding (e.g. writing a file), Emacs characters not belonging to |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
796 any of the character sets listed above are encoded into the UTF-8 byte |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
797 sequence representing U+FFFD (REPLACEMENT CHARACTER)." |
| 35542 | 798 |
| 799 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) ; (decoder . encoder) pair of CCL programs | |
| 800 '((safe-charsets | |
| 801 ascii | |
| 802 eight-bit-control | |
| 803 eight-bit-graphic | |
| 804 latin-iso8859-1 | |
| 805 mule-unicode-0100-24ff | |
| 806 mule-unicode-2500-33ff | |
| 807 mule-unicode-e000-ffff) | |
|
36371
f6bb3ed752b4
(mule-utf-8): Set correct value for valid-codes property.
Kenichi Handa <handa@m17n.org>
parents:
36243
diff
changeset
|
808 (mime-charset . utf-8) |
|
36423
aa776838b660
(mule-utf-8): Set coding-category property to coding-category-utf-8.
Kenichi Handa <handa@m17n.org>
parents:
36371
diff
changeset
|
809 (coding-category . coding-category-utf-8) |
| 41873 | 810 (valid-codes (0 . 255)) |
| 46496 | 811 ;; (pre-write-conversion . utf-8-pre-write-conversion) |
|
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
812 (post-read-conversion . utf-8-post-read-conversion) |
|
50766
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
813 (translation-table-for-encode . utf-translation-table-for-encode) |
|
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
814 (dependency unify-8859-on-encoding-mode |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
815 unify-8859-on-decoding-mode |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
816 utf-fragment-on-decoding |
|
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
817 utf-translate-cjk))) |
| 35542 | 818 |
| 819 (define-coding-system-alias 'utf-8 'mule-utf-8) ; lets the name `utf-8' refer to the `mule-utf-8' coding system | |
|
38436
b174db545cfd
Some fixes to follow coding conventions.
Pavel Jan?k <Pavel@Janik.cz>
parents:
37934
diff
changeset
|
820 |
| 41873 | 821 ;; I think this needs special private charsets defined for the |
| 822 ;; untranslated sequences, if it's going to work well. | |
| 823 | |
| 824 ;;; (defun utf-8-compose-function (pos to pattern &optional string) | |
| 825 ;;; (let* ((prop (get-char-property pos 'composition string)) | |
| 826 ;;; (l (and prop (- (cadr prop) (car prop))))) | |
| 827 ;;; (cond ((and l (> l (- to pos))) | |
| 828 ;;; (delete-region pos to)) | |
| 829 ;;; ((and (> (char-after pos) 224) | |
| 830 ;;; (< (char-after pos) 256) | |
| 831 ;;; (save-restriction | |
| 832 ;;; (narrow-to-region pos to) | |
| 833 ;;; (utf-8-compose))) | |
| 834 ;;; t)))) | |
| 835 | |
| 836 ;;; (dotimes (i 96) | |
| 837 ;;; (aset composition-function-table | |
| 838 ;;; (+ 128 i) | |
| 839 ;;; `((,(string-as-multibyte "[\200-\237\240-\377]") | |
| 840 ;;; . utf-8-compose-function)))) | |
| 841 | |
| 52401 | 842 ;;; arch-tag: b08735b7-753b-4ae6-b754-0f3efe4515c5 |
|
38436
b174db545cfd
Some fixes to follow coding conventions.
Pavel Jan?k <Pavel@Janik.cz>
parents:
37934
diff
changeset
|
843 ;;; utf-8.el ends here |
