Mercurial > emacs
annotate lisp/international/utf-8.el @ 53879:e3771c262410
New file. Move original fringe related declarations
and code from dispextern.h and xdisp.c here.
Rework code to support user defined fringe bitmaps, redefining
standard bitmaps, ability to overlay user defined bitmap with
overlay arrow bitmap, and add faces to bitmaps.
(Voverflow_newline_into_fringe): Declare here.
(enum fringe_bitmap_align): New enum.
(..._bits): All bitmaps are now defined without bitswapping; that
is now done in init_fringe_once (if necessary).
(standard_bitmaps): New array with specifications for the
standard fringe bitmaps.
(fringe_faces): New array.
(valid_fringe_bitmap_id_p): New function.
(draw_fringe_bitmap_1): Rename from draw_fringe_bitmap.
(draw_fringe_bitmap): New function which draws fringe bitmap,
possibly overlaying bitmap with cursor in right fringe or the
overlay arrow in the left fringe.
(update_window_fringes): Do not handle overlay arrow here.
Compare and copy fringe bitmap faces.
(init_fringe_bitmap): New function.
(Fdefine_fringe_bitmap, Fdestroy_fringe_bitmap): New DEFUNs to
define and destroy user defined fringe bitmaps.
(Fset_fringe_bitmap_face): New DEFUN to set face for a fringe bitmap.
(Ffringe_bitmaps_at_pos): New DEFUN to read current fringe bitmaps.
(syms_of_fringe): New function. Defsubr new DEFUNs.
DEFVAR_LISP Voverflow_newline_into_fringe.
(init_fringe_once, init_fringe): New functions.
(w32_init_fringe, w32_reset_fringes) [WINDOWS_NT]: New functions.
author | Kim F. Storm <storm@cua.dk> |
---|---|
date | Sun, 08 Feb 2004 23:18:16 +0000 |
parents | dd3cff7e93d7 |
children | d61b01de8cdf |
rev | line source |
---|---|
46496 | 1 ;;; utf-8.el --- UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*- |
35542 | 2 |
3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. | |
4 ;; Licensed to the Free Software Foundation. | |
46496 | 5 ;; Copyright (C) 2001, 2002 Free Software Foundation, Inc. |
35542 | 6 |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
7 ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org> |
46496 | 8 ;; Maintainer: FSF |
36243 | 9 ;; Keywords: multilingual, Unicode, UTF-8, i18n |
35542 | 10 |
11 ;; This file is part of GNU Emacs. | |
12 | |
13 ;; GNU Emacs is free software; you can redistribute it and/or modify | |
14 ;; it under the terms of the GNU General Public License as published by | |
15 ;; the Free Software Foundation; either version 2, or (at your option) | |
16 ;; any later version. | |
17 | |
18 ;; GNU Emacs is distributed in the hope that it will be useful, | |
19 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
21 ;; GNU General Public License for more details. | |
22 | |
23 ;; You should have received a copy of the GNU General Public License | |
24 ;; along with GNU Emacs; see the file COPYING. If not, write to the | |
25 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
26 ;; Boston, MA 02111-1307, USA. | |
27 | |
28 ;;; Commentary: | |
29 | |
41873 | 30 ;; The coding-system `mule-utf-8' basically supports encoding/decoding |
31 ;; of the following character sets to and from UTF-8: | |
35542 | 32 ;; |
33 ;; ascii | |
34 ;; eight-bit-control | |
35 ;; latin-iso8859-1 | |
36 ;; mule-unicode-0100-24ff | |
37 ;; mule-unicode-2500-33ff | |
38 ;; mule-unicode-e000-ffff | |
39 ;; | |
36243 | 40 ;; On decoding, Unicode characters that do not fit into the above |
41 ;; character sets are handled as `eight-bit-control' or | |
42 ;; `eight-bit-graphic' characters to retain the information about the | |
46496 | 43 ;; original byte sequence, and text properties record the corresponding |
44 ;; Unicode code point. | |
45 ;; | |
46 ;; Fixme: note that reading and writing invalid utf-8 may not be | |
47 ;; idempotent -- fixing that needs a new charset to represent the raw bytes. | |
41873 | 48 ;; |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
49 ;; Characters from other character sets can be encoded with mule-utf-8 |
48848 | 50 ;; by populating the translation table |
50179
65bb5afb37ef
(utf-fragment-on-decoding): Don't call
Kenichi Handa <handa@m17n.org>
parents:
50085
diff
changeset
|
51 ;; `utf-translation-table-for-encode'. Hash tables |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
52 ;; `utf-subst-table-for-decode' and `utf-subst-table-for-encode' are |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
53 ;; used to support encoding and decoding of about a quarter of the CJK |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
54 ;; space between U+3400 and U+DFFF. |
36243 | 55 |
56 ;; UTF-8 is defined in RFC 3629 (which obsoletes RFC 2279). A sketch of the encoding is: | |
35542 | 57 |
58 ;; scalar | utf-8 | |
59 ;; value | 1st byte | 2nd byte | 3rd byte | |
60 ;; --------------------+-----------+-----------+---------- | |
61 ;; 0000 0000 0xxx xxxx | 0xxx xxxx | | | |
62 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx | | |
63 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx | |
64 | |
65 ;;; Code: | |
66 | |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
67 (defvar ucs-mule-to-mule-unicode (make-char-table 'translation-table nil) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
68 "Char table mapping characters to latin-iso8859-1 or mule-unicode-*. |
46496 | 69 |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
70 If `unify-8859-on-encoding-mode' is non-nil, this table populates the |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
71 translation-table named `utf-translation-table-for-encode'.") |
46496 | 72 |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
73 (define-translation-table 'utf-translation-table-for-encode) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
74 |
46496 | 75 |
76 ;; Map Cyrillic and Greek to iso-8859 charsets, which take half the | |
77 ;; space of mule-unicode. For Latin scripts this isn't very | |
78 ;; important. Hebrew and Arabic might go here too when there's proper | |
79 ;; support for them. | |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
80 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
81 (defvar utf-fragmentation-table (make-char-table 'translation-table nil) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
82 "Char-table normally mapping non-Latin mule-unicode-* chars to iso-8859-*. |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
83 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
84 If `utf-fragment-on-decoding' is non-nil, this table populates the |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
85 translation-table named `utf-translation-table-for-decode'") |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
86 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
87 (defvar utf-defragmentation-table (make-char-table 'translation-table nil) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
88 "Char-table for reverse mapping of `utf-fragmentation-table'. |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
89 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
90 If `utf-fragment-on-decoding' is non-nil and |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
91 `unify-8859-on-encoding-mode' is nil, this table populates the |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
92 translation-table named `utf-translation-table-for-encode'") |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
93 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
94 (define-translation-table 'utf-translation-table-for-decode) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
95 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
96 |
48882 | 97 (defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq) |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
98 "Hash table mapping Emacs CJK character sets to Unicode code points. |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
99 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
100 If `utf-translate-cjk' is non-nil, this table populates the |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
101 translation-hash-table named `utf-subst-table-for-encode'.") |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
102 |
48882 | 103 (define-translation-hash-table 'utf-subst-table-for-encode |
104 ucs-mule-cjk-to-unicode) | |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
105 |
48882 | 106 (defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq) |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
107 "Hash table mapping Unicode code points to Emacs CJK character sets. |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
108 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
109 If `utf-translate-cjk' is non-nil, this table populates the |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
110 translation-hash-table named `utf-subst-table-for-decode'.") |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
111 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
112 (define-translation-hash-table 'utf-subst-table-for-decode |
48882 | 113 ucs-unicode-to-mule-cjk) |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
114 |
46496 | 115 (mapc |
116 (lambda (pair) | |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
117 (aset utf-fragmentation-table (car pair) (cdr pair)) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
118 (aset utf-defragmentation-table (cdr pair) (car pair))) |
46496 | 119 '((?$,1&d(B . ?,F4(B) (?$,1&e(B . ?,F5(B) (?$,1&f(B . ?,F6(B) (?$,1&h(B . ?,F8(B) (?$,1&i(B . ?,F9(B) |
120 (?$,1&j(B . ?,F:(B) (?$,1&l(B . ?,F<(B) (?$,1&n(B . ?,F>(B) (?$,1&o(B . ?,F?(B) (?$,1&p(B . ?,F@(B) | |
121 (?$,1&q(B . ?,FA(B) (?$,1&r(B . ?,FB(B) (?$,1&s(B . ?,FC(B) (?$,1&t(B . ?,FD(B) (?$,1&u(B . ?,FE(B) | |
122 (?$,1&v(B . ?,FF(B) (?$,1&w(B . ?,FG(B) (?$,1&x(B . ?,FH(B) (?$,1&y(B . ?,FI(B) (?$,1&z(B . ?,FJ(B) | |
123 (?$,1&{(B . ?,FK(B) (?$,1&|(B . ?,FL(B) (?$,1&}(B . ?,FM(B) (?$,1&~(B . ?,FN(B) (?$,1&(B . ?,FO(B) | |
124 (?$,1' (B . ?,FP(B) (?$,1'!(B . ?,FQ(B) (?$,1'#(B . ?,FS(B) (?$,1'$(B . ?,FT(B) (?$,1'%(B . ?,FU(B) | |
125 (?$,1'&(B . ?,FV(B) (?$,1''(B . ?,FW(B) (?$,1'((B . ?,FX(B) (?$,1')(B . ?,FY(B) (?$,1'*(B . ?,FZ(B) | |
126 (?$,1'+(B . ?,F[(B) (?$,1',(B . ?,F\(B) (?$,1'-(B . ?,F](B) (?$,1'.(B . ?,F^(B) (?$,1'/(B . ?,F_(B) | |
127 (?$,1'0(B . ?,F`(B) (?$,1'1(B . ?,Fa(B) (?$,1'2(B . ?,Fb(B) (?$,1'3(B . ?,Fc(B) (?$,1'4(B . ?,Fd(B) | |
128 (?$,1'5(B . ?,Fe(B) (?$,1'6(B . ?,Ff(B) (?$,1'7(B . ?,Fg(B) (?$,1'8(B . ?,Fh(B) (?$,1'9(B . ?,Fi(B) | |
129 (?$,1':(B . ?,Fj(B) (?$,1';(B . ?,Fk(B) (?$,1'<(B . ?,Fl(B) (?$,1'=(B . ?,Fm(B) (?$,1'>(B . ?,Fn(B) | |
130 (?$,1'?(B . ?,Fo(B) (?$,1'@(B . ?,Fp(B) (?$,1'A(B . ?,Fq(B) (?$,1'B(B . ?,Fr(B) (?$,1'C(B . ?,Fs(B) | |
131 (?$,1'D(B . ?,Ft(B) (?$,1'E(B . ?,Fu(B) (?$,1'F(B . ?,Fv(B) (?$,1'G(B . ?,Fw(B) (?$,1'H(B . ?,Fx(B) | |
132 (?$,1'I(B . ?,Fy(B) (?$,1'J(B . ?,Fz(B) (?$,1'K(B . ?,F{(B) (?$,1'L(B . ?,F|(B) (?$,1'M(B . ?,F}(B) | |
133 (?$,1'N(B . ?,F~(B) | |
134 | |
135 (?$,1(!(B . ?,L!(B) (?$,1("(B . ?,L"(B) (?$,1(#(B . ?,L#(B) (?$,1($(B . ?,L$(B) | |
136 (?$,1(%(B . ?,L%(B) (?$,1(&(B . ?,L&(B) (?$,1('(B . ?,L'(B) (?$,1(((B . ?,L((B) (?$,1()(B . ?,L)(B) | |
137 (?$,1(*(B . ?,L*(B) (?$,1(+(B . ?,L+(B) (?$,1(,(B . ?,L,(B) (?$,1(.(B . ?,L.(B) (?$,1(/(B . ?,L/(B) | |
138 (?$,1(0(B . ?,L0(B) (?$,1(1(B . ?,L1(B) (?$,1(2(B . ?,L2(B) (?$,1(3(B . ?,L3(B) (?$,1(4(B . ?,L4(B) | |
139 (?$,1(5(B . ?,L5(B) (?$,1(6(B . ?,L6(B) (?$,1(7(B . ?,L7(B) (?$,1(8(B . ?,L8(B) (?$,1(9(B . ?,L9(B) | |
140 (?$,1(:(B . ?,L:(B) (?$,1(;(B . ?,L;(B) (?$,1(<(B . ?,L<(B) (?$,1(=(B . ?,L=(B) (?$,1(>(B . ?,L>(B) | |
141 (?$,1(?(B . ?,L?(B) (?$,1(@(B . ?,L@(B) (?$,1(A(B . ?,LA(B) (?$,1(B(B . ?,LB(B) (?$,1(C(B . ?,LC(B) | |
142 (?$,1(D(B . ?,LD(B) (?$,1(E(B . ?,LE(B) (?$,1(F(B . ?,LF(B) (?$,1(G(B . ?,LG(B) (?$,1(H(B . ?,LH(B) | |
143 (?$,1(I(B . ?,LI(B) (?$,1(J(B . ?,LJ(B) (?$,1(K(B . ?,LK(B) (?$,1(L(B . ?,LL(B) (?$,1(M(B . ?,LM(B) | |
144 (?$,1(N(B . ?,LN(B) (?$,1(O(B . ?,LO(B) (?$,1(P(B . ?,LP(B) (?$,1(Q(B . ?,LQ(B) (?$,1(R(B . ?,LR(B) | |
145 (?$,1(S(B . ?,LS(B) (?$,1(T(B . ?,LT(B) (?$,1(U(B . ?,LU(B) (?$,1(V(B . ?,LV(B) (?$,1(W(B . ?,LW(B) | |
146 (?$,1(X(B . ?,LX(B) (?$,1(Y(B . ?,LY(B) (?$,1(Z(B . ?,LZ(B) (?$,1([(B . ?,L[(B) (?$,1(\(B . ?,L\(B) | |
147 (?$,1(](B . ?,L](B) (?$,1(^(B . ?,L^(B) (?$,1(_(B . ?,L_(B) (?$,1(`(B . ?,L`(B) (?$,1(a(B . ?,La(B) | |
148 (?$,1(b(B . ?,Lb(B) (?$,1(c(B . ?,Lc(B) (?$,1(d(B . ?,Ld(B) (?$,1(e(B . ?,Le(B) (?$,1(f(B . ?,Lf(B) | |
149 (?$,1(g(B . ?,Lg(B) (?$,1(h(B . ?,Lh(B) (?$,1(i(B . ?,Li(B) (?$,1(j(B . ?,Lj(B) (?$,1(k(B . ?,Lk(B) | |
150 (?$,1(l(B . ?,Ll(B) (?$,1(m(B . ?,Lm(B) (?$,1(n(B . ?,Ln(B) (?$,1(o(B . ?,Lo(B) (?$,1(q(B . ?,Lq(B) | |
151 (?$,1(r(B . ?,Lr(B) (?$,1(s(B . ?,Ls(B) (?$,1(t(B . ?,Lt(B) (?$,1(u(B . ?,Lu(B) (?$,1(v(B . ?,Lv(B) | |
152 (?$,1(w(B . ?,Lw(B) (?$,1(x(B . ?,Lx(B) (?$,1(y(B . ?,Ly(B) (?$,1(z(B . ?,Lz(B) (?$,1({(B . ?,L{(B) | |
153 (?$,1(|(B . ?,L|(B) (?$,1(~(B . ?,L~(B) (?$,1((B . ?,L(B))) | |
154 | |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
155 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
156 (defcustom utf-fragment-on-decoding nil |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
157 "Whether or not to decode some chars in UTF-8/16 text into iso8859 charsets. |
46496 | 158 Setting this means that the relevant Cyrillic and Greek characters are |
159 decoded into the iso8859 charsets rather than into | |
47231 | 160 mule-unicode-0100-24ff. The iso8859 charsets take half as much space |
46496 | 161 in the buffer, but using them may affect how the buffer can be re-encoded |
162 and may require a different input method to search for them, for instance. | |
163 See `unify-8859-on-decoding-mode' and `unify-8859-on-encoding-mode' | |
47231 | 164 for mechanisms to make this largely transparent. |
165 | |
166 Setting this variable outside customize has no effect." | |
46496 | 167 :set (lambda (s v) |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
168 (if v |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
169 (progn |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
170 (define-translation-table 'utf-translation-table-for-decode |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
171 utf-fragmentation-table) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
172 ;; Even if unify-8859-on-encoding-mode is off, make |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
173 ;; mule-utf-* encode characters in |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
174 ;; utf-fragmentation-table. |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
175 (unless (eq (get 'utf-translation-table-for-encode |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
176 'translation-table) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
177 ucs-mule-to-mule-unicode) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
178 (define-translation-table 'utf-translation-table-for-encode |
50179
65bb5afb37ef
(utf-fragment-on-decoding): Don't call
Kenichi Handa <handa@m17n.org>
parents:
50085
diff
changeset
|
179 utf-defragmentation-table))) |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
180 (define-translation-table 'utf-translation-table-for-decode) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
181 ;; When unify-8859-on-encoding-mode is off, be sure to make |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
182 ;; mule-utf-* disabled for characters in |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
183 ;; utf-fragmentation-table. |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
184 (unless (eq (get 'utf-translation-table-for-encode |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
185 'translation-table) |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
186 ucs-mule-to-mule-unicode) |
50179
65bb5afb37ef
(utf-fragment-on-decoding): Don't call
Kenichi Handa <handa@m17n.org>
parents:
50085
diff
changeset
|
187 (define-translation-table 'utf-translation-table-for-encode))) |
46496 | 188 (set-default s v)) |
189 :version "21.4" | |
190 :type 'boolean | |
191 :group 'mule) | |
192 | |
50341
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
193 (define-minor-mode utf-translate-cjk-mode |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
194 "Whether the UTF based coding systems should decode/encode CJK characters. |
48848 | 195 Enabling this loads tables which allow the coding systems mule-utf-8, |
51628
abfc7d48b476
(utf-translate-cjk-mode): Fix docstring.
Kenichi Handa <handa@m17n.org>
parents:
50766
diff
changeset
|
196 mule-utf-16le and mule-utf-16be to encode characters in the charsets |
48848 | 197 `korean-ksc5601', `chinese-gb2312', `chinese-big5-1', |
198 `chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to | |
199 decode the corresponding unicodes into such characters. | |
46496 | 200 |
48848 | 201 Where the charsets overlap, the one preferred for decoding is chosen |
202 according to the language environment in effect when this option is | |
203 turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for | |
204 Chinese-Big5 and jisx for other environments. | |
205 | |
206 The tables are large (over 40000 entries), so this option is not the | |
207 default. Also, installing them may be rather slow." | |
50341
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
208 :init-value nil |
46496 | 209 :version "21.4" |
210 :type 'boolean | |
48848 | 211 :set-after '(current-language-environment) |
50341
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
212 :group 'mule |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
213 :global t |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
214 (if utf-translate-cjk-mode |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
215 ;; Fixme: Allow the use of the CJK charsets to be |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
216 ;; customized by reordering and possible omission. |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
217 (progn |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
218 ;; Redefine them with realistic initial sizes and a |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
219 ;; smallish rehash size to avoid wasting significant |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
220 ;; space after they're built. |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
221 (setq ucs-mule-cjk-to-unicode |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
222 (make-hash-table :test 'eq :size 43000 :rehash-size 1000) |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
223 ucs-unicode-to-mule-cjk |
52284
29c8180a975d
(ucs-unicode-to-mule-cjk): Use smaller
Dave Love <fx@gnu.org>
parents:
51628
diff
changeset
|
224 (make-hash-table :test 'eq :size 21500 :rehash-size 1000)) |
50341
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
225 ;; Load the files explicitly, to avoid having to keep |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
226 ;; around the large tables they contain (as well as the |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
227 ;; ones which get built). |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
228 (cond |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
229 ((string= "Korean" current-language-environment) |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
230 (load "subst-jis") |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
231 (load "subst-big5") |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
232 (load "subst-gb2312") |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
233 (load "subst-ksc")) |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
234 ((string= "Chinese-BIG5" current-language-environment) |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
235 (load "subst-jis") |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
236 (load "subst-ksc") |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
237 (load "subst-gb2312") |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
238 (load "subst-big5")) |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
239 ((string= "Chinese-GB" current-language-environment) |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
240 (load "subst-jis") |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
241 (load "subst-ksc") |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
242 (load "subst-big5") |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
243 (load "subst-gb2312")) |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
244 (t |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
245 (load "subst-ksc") |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
246 (load "subst-gb2312") |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
247 (load "subst-big5") |
f49a20cb84ed
(utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents:
50207
diff
changeset
|
248 (load "subst-jis"))) ; jis covers as much as big5, gb2312 |
50766
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
249 (define-translation-hash-table 'utf-subst-table-for-decode |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
250 ucs-unicode-to-mule-cjk) |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
251 (define-translation-hash-table 'utf-subst-table-for-encode |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
252 ucs-mule-cjk-to-unicode) |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
253 (set-char-table-extra-slot (get 'utf-translation-table-for-encode |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
254 'translation-table) |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
255 1 ucs-mule-cjk-to-unicode)) |
50549
c8525ac04d76
(utf-translate-cjk-mode): Fix incorrect
Kenichi Handa <handa@m17n.org>
parents:
50341
diff
changeset
|
256 (define-translation-hash-table 'utf-subst-table-for-decode |
c8525ac04d76
(utf-translate-cjk-mode): Fix incorrect
Kenichi Handa <handa@m17n.org>
parents:
50341
diff
changeset
|
257 (make-hash-table :test 'eq)) |
c8525ac04d76
(utf-translate-cjk-mode): Fix incorrect
Kenichi Handa <handa@m17n.org>
parents:
50341
diff
changeset
|
258 (define-translation-hash-table 'utf-subst-table-for-encode |
50766
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
259 (make-hash-table :test 'eq)) |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
260 (set-char-table-extra-slot (get 'utf-translation-table-for-encode |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
261 'translation-table) |
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
262 1 nil))) |
46496 | 263 |
35542 | 264 (define-ccl-program ccl-decode-mule-utf-8 |
265 ;; | |
266 ;; charset | bytes in utf-8 | bytes in emacs | |
267 ;; -----------------------+----------------+--------------- | |
268 ;; ascii | 1 | 1 | |
269 ;; -----------------------+----------------+--------------- | |
270 ;; eight-bit-control | 2 | 2 | |
41873 | 271 ;; eight-bit-graphic | 2 | 1 |
35542 | 272 ;; latin-iso8859-1 | 2 | 2 |
273 ;; -----------------------+----------------+--------------- | |
274 ;; mule-unicode-0100-24ff | 2 | 4 | |
275 ;; (< 0800) | | | |
276 ;; -----------------------+----------------+--------------- | |
277 ;; mule-unicode-0100-24ff | 3 | 4 | |
278 ;; (>= 0800) | | | |
279 ;; mule-unicode-2500-33ff | 3 | 4 | |
280 ;; mule-unicode-e000-ffff | 3 | 4 | |
281 ;; | |
282 ;; Thus magnification factor is two. | |
283 ;; | |
284 `(2 | |
37934
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
285 ((r5 = ,(charset-id 'eight-bit-control)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
286 (r6 = ,(charset-id 'eight-bit-graphic)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
287 (loop |
50085
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
288 (r0 = -1) |
35542 | 289 (read r0) |
290 | |
291 ;; 1byte encoding, i.e., ascii | |
292 (if (r0 < #x80) | |
50085
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
293 ((write r0)) |
46496 | 294 (if (r0 < #xc0) ; continuation byte (invalid here) |
50085
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
295 ((if (r0 < #xa0) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
296 (write-multibyte-character r5 r0) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
297 (write-multibyte-character r6 r0))) |
46496 | 298 ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx |
299 (if (r0 < #xe0) | |
50085
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
300 ((r1 = -1) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
301 (read r1) |
35542 | 302 |
46496 | 303 (if ((r1 & #b11000000) != #b10000000) |
304 ;; Invalid 2-byte sequence | |
37934
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
305 ((if (r0 < #xa0) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
306 (write-multibyte-character r5 r0) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
307 (write-multibyte-character r6 r0)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
308 (if (r1 < #x80) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
309 (write r1) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
310 (if (r1 < #xa0) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
311 (write-multibyte-character r5 r1) |
46496 | 312 (write-multibyte-character r6 r1)))) |
313 | |
314 ((r3 = r0) ; save in case of overlong sequence | |
315 (r2 = r1) | |
316 (r0 &= #x1f) | |
317 (r0 <<= 6) | |
318 (r1 &= #x3f) | |
319 (r1 += r0) | |
320 ;; Now r1 holds scalar value | |
321 | |
322 (if (r1 < 128) ; `overlong sequence' | |
323 ((if (r3 < #xa0) | |
324 (write-multibyte-character r5 r3) | |
325 (write-multibyte-character r6 r3)) | |
326 (if (r2 < #x80) | |
327 (write r2) | |
328 (if (r2 < #xa0) | |
329 (write-multibyte-character r5 r2) | |
330 (write-multibyte-character r6 r2)))) | |
331 | |
332 ;; eight-bit-control | |
333 (if (r1 < 160) | |
334 ((write-multibyte-character r5 r1)) | |
335 | |
336 ;; latin-iso8859-1 | |
337 (if (r1 < 256) | |
338 ((r0 = ,(charset-id 'latin-iso8859-1)) | |
339 (r1 -= 128) | |
340 (write-multibyte-character r0 r1)) | |
341 | |
342 ;; mule-unicode-0100-24ff (< 0800) | |
343 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
344 (r1 -= #x0100) | |
345 (r2 = (((r1 / 96) + 32) << 7)) | |
346 (r1 %= 96) | |
347 (r1 += (r2 + 32)) | |
348 (translate-character | |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
349 utf-translation-table-for-decode r0 r1) |
46496 | 350 (write-multibyte-character r0 r1)))))))) |
351 | |
352 ;; 3byte encoding | |
353 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx | |
354 (if (r0 < #xf0) | |
50085
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
355 ((r1 = -1) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
356 (r2 = -1) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
357 (read r1 r2) |
46496 | 358 |
359 ;; This is set to 1 if the encoding is invalid. | |
360 (r4 = 0) | |
361 | |
362 (r3 = (r1 & #b11000000)) | |
363 (r3 |= ((r2 >> 2) & #b00110000)) | |
364 (if (r3 != #b10100000) | |
365 (r4 = 1) | |
366 ((r3 = ((r0 & #x0f) << 12)) | |
367 (r3 += ((r1 & #x3f) << 6)) | |
368 (r3 += (r2 & #x3f)) | |
369 (if (r3 < #x0800) | |
370 (r4 = 1)))) | |
371 | |
372 (if (r4 != 0) | |
373 ;; Invalid 3-byte sequence | |
374 ((if (r0 < #xa0) | |
375 (write-multibyte-character r5 r0) | |
376 (write-multibyte-character r6 r0)) | |
377 (if (r1 < #x80) | |
378 (write r1) | |
379 (if (r1 < #xa0) | |
380 (write-multibyte-character r5 r1) | |
381 (write-multibyte-character r6 r1))) | |
382 (if (r2 < #x80) | |
383 (write r2) | |
384 (if (r2 < #xa0) | |
385 (write-multibyte-character r5 r2) | |
386 (write-multibyte-character r6 r2)))) | |
49598
0d8b17d428b5
Trailing whitepace deleted.
Juanma Barranquero <lekktu@gmail.com>
parents:
49028
diff
changeset
|
387 |
46496 | 388 ;; mule-unicode-0100-24ff (>= 0800) |
389 ((if (r3 < #x2500) | |
390 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
391 (r3 -= #x0100) | |
37934
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
392 (r3 //= 96) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
393 (r1 = (r7 + 32)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
394 (r1 += ((r3 + 32) << 7)) |
46496 | 395 (translate-character |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
396 utf-translation-table-for-decode r0 r1) |
37934
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
397 (write-multibyte-character r0 r1)) |
49598
0d8b17d428b5
Trailing whitepace deleted.
Juanma Barranquero <lekktu@gmail.com>
parents:
49028
diff
changeset
|
398 |
46496 | 399 ;; mule-unicode-2500-33ff |
400 (if (r3 < #x3400) | |
48848 | 401 ((r4 = r3) ; don't zap r3 |
402 (lookup-integer utf-subst-table-for-decode r4 r5) | |
403 (if r7 | |
404 ;; got a translation | |
405 ((write-multibyte-character r4 r5) | |
406 ;; Zapped through register starvation. | |
407 (r5 = ,(charset-id 'eight-bit-control))) | |
408 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) | |
409 (r3 -= #x2500) | |
410 (r3 //= 96) | |
411 (r1 = (r7 + 32)) | |
412 (r1 += ((r3 + 32) << 7)) | |
413 (write-multibyte-character r0 r1)))) | |
46496 | 414 |
415 ;; U+3400 .. U+D7FF | |
416 ;; Try to convert to CJK chars, else keep | |
417 ;; them as eight-bit-{control|graphic}. | |
418 (if (r3 < #xd800) | |
419 ((r4 = r3) ; don't zap r3 | |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
420 (lookup-integer utf-subst-table-for-decode r4 r5) |
46496 | 421 (if r7 |
422 ;; got a translation | |
423 ((write-multibyte-character r4 r5) | |
424 ;; Zapped through register starvation. | |
425 (r5 = ,(charset-id 'eight-bit-control))) | |
426 ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic | |
427 ((r3 = r6) | |
428 (write-multibyte-character r3 r0) | |
429 (if (r1 < #xa0) | |
430 (r3 = r5)) | |
431 (write-multibyte-character r3 r1) | |
432 (if (r2 < #xa0) | |
433 (r3 = r5) | |
434 (r3 = r6)) | |
435 (write-multibyte-character r3 r2)))) | |
436 | |
437 ;; Surrogates, U+D800 .. U+DFFF | |
438 (if (r3 < #xe000) | |
439 ((r3 = r6) | |
440 (write-multibyte-character r3 r0) ; eight-bit-graphic | |
441 (if (r1 < #xa0) | |
442 (r3 = r5)) | |
443 (write-multibyte-character r3 r1) | |
444 (if (r2 < #xa0) | |
445 (r3 = r5) | |
446 (r3 = r6)) | |
447 (write-multibyte-character r3 r2)) | |
49598
0d8b17d428b5
Trailing whitepace deleted.
Juanma Barranquero <lekktu@gmail.com>
parents:
49028
diff
changeset
|
448 |
46496 | 449 ;; mule-unicode-e000-ffff |
450 ;; Fixme: fffe and ffff are invalid. | |
52725
dd3cff7e93d7
(ccl-decode-mule-utf-8): Lookup
Kenichi Handa <handa@m17n.org>
parents:
52520
diff
changeset
|
451 ((r4 = r3) ; don't zap r3 |
dd3cff7e93d7
(ccl-decode-mule-utf-8): Lookup
Kenichi Handa <handa@m17n.org>
parents:
52520
diff
changeset
|
452 (lookup-integer utf-subst-table-for-decode r4 r5) |
dd3cff7e93d7
(ccl-decode-mule-utf-8): Lookup
Kenichi Handa <handa@m17n.org>
parents:
52520
diff
changeset
|
453 (if r7 |
dd3cff7e93d7
(ccl-decode-mule-utf-8): Lookup
Kenichi Handa <handa@m17n.org>
parents:
52520
diff
changeset
|
454 ;; got a translation |
dd3cff7e93d7
(ccl-decode-mule-utf-8): Lookup
Kenichi Handa <handa@m17n.org>
parents:
52520
diff
changeset
|
455 ((write-multibyte-character r4 r5) |
dd3cff7e93d7
(ccl-decode-mule-utf-8): Lookup
Kenichi Handa <handa@m17n.org>
parents:
52520
diff
changeset
|
456 ;; Zapped through register starvation. |
dd3cff7e93d7
(ccl-decode-mule-utf-8): Lookup
Kenichi Handa <handa@m17n.org>
parents:
52520
diff
changeset
|
457 (r5 = ,(charset-id 'eight-bit-control))) |
dd3cff7e93d7
(ccl-decode-mule-utf-8): Lookup
Kenichi Handa <handa@m17n.org>
parents:
52520
diff
changeset
|
458 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) |
dd3cff7e93d7
(ccl-decode-mule-utf-8): Lookup
Kenichi Handa <handa@m17n.org>
parents:
52520
diff
changeset
|
459 (r3 -= #xe000) |
dd3cff7e93d7
(ccl-decode-mule-utf-8): Lookup
Kenichi Handa <handa@m17n.org>
parents:
52520
diff
changeset
|
460 (r3 //= 96) |
dd3cff7e93d7
(ccl-decode-mule-utf-8): Lookup
Kenichi Handa <handa@m17n.org>
parents:
52520
diff
changeset
|
461 (r1 = (r7 + 32)) |
dd3cff7e93d7
(ccl-decode-mule-utf-8): Lookup
Kenichi Handa <handa@m17n.org>
parents:
52520
diff
changeset
|
462 (r1 += ((r3 + 32) << 7)) |
dd3cff7e93d7
(ccl-decode-mule-utf-8): Lookup
Kenichi Handa <handa@m17n.org>
parents:
52520
diff
changeset
|
463 (write-multibyte-character r0 r1))))))))))) |
35542 | 464 |
46496 | 465 (if (r0 < #xfe) |
466 ;; 4byte encoding | |
467 ;; keep those bytes as eight-bit-{control|graphic} | |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
468 ;; Fixme: allow lookup in utf-subst-table-for-decode. |
50085
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
469 ((r1 = -1) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
470 (r2 = -1) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
471 (r3 = -1) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
472 (read r1 r2 r3) |
46496 | 473 ;; r0 >= #xf0, thus eight-bit-graphic |
474 (write-multibyte-character r6 r0) | |
475 (if (r1 < #xa0) | |
476 (if (r1 < #x80) ; invalid byte | |
477 (write r1) | |
478 (write-multibyte-character r5 r1)) | |
479 (write-multibyte-character r6 r1)) | |
480 (if (r2 < #xa0) | |
481 (if (r2 < #x80) ; invalid byte | |
482 (write r2) | |
483 (write-multibyte-character r5 r2)) | |
484 (write-multibyte-character r6 r2)) | |
485 (if (r3 < #xa0) | |
486 (if (r3 < #x80) ; invalid byte | |
487 (write r3) | |
488 (write-multibyte-character r5 r3)) | |
489 (write-multibyte-character r6 r3)) | |
490 (if (r0 >= #xf8) ; 5- or 6-byte encoding | |
50207
b49563831bd2
(ccl-decode-mule-utf-8): Fix a bug of
Kenichi Handa <handa@m17n.org>
parents:
50179
diff
changeset
|
491 ((r0 = -1) |
b49563831bd2
(ccl-decode-mule-utf-8): Fix a bug of
Kenichi Handa <handa@m17n.org>
parents:
50179
diff
changeset
|
492 (read r0) |
b49563831bd2
(ccl-decode-mule-utf-8): Fix a bug of
Kenichi Handa <handa@m17n.org>
parents:
50179
diff
changeset
|
493 (if (r0 < #xa0) |
b49563831bd2
(ccl-decode-mule-utf-8): Fix a bug of
Kenichi Handa <handa@m17n.org>
parents:
50179
diff
changeset
|
494 (if (r0 < #x80) ; invalid byte |
b49563831bd2
(ccl-decode-mule-utf-8): Fix a bug of
Kenichi Handa <handa@m17n.org>
parents:
50179
diff
changeset
|
495 (write r0) |
b49563831bd2
(ccl-decode-mule-utf-8): Fix a bug of
Kenichi Handa <handa@m17n.org>
parents:
50179
diff
changeset
|
496 (write-multibyte-character r5 r0)) |
b49563831bd2
(ccl-decode-mule-utf-8): Fix a bug of
Kenichi Handa <handa@m17n.org>
parents:
50179
diff
changeset
|
497 (write-multibyte-character r6 r0)) |
46496 | 498 (if (r0 >= #xfc) ; 6-byte |
50207
b49563831bd2
(ccl-decode-mule-utf-8): Fix a bug of
Kenichi Handa <handa@m17n.org>
parents:
50179
diff
changeset
|
499 ((r0 = -1) |
b49563831bd2
(ccl-decode-mule-utf-8): Fix a bug of
Kenichi Handa <handa@m17n.org>
parents:
50179
diff
changeset
|
500 (read r0) |
b49563831bd2
(ccl-decode-mule-utf-8): Fix a bug of
Kenichi Handa <handa@m17n.org>
parents:
50179
diff
changeset
|
501 (if (r0 < #xa0) |
b49563831bd2
(ccl-decode-mule-utf-8): Fix a bug of
Kenichi Handa <handa@m17n.org>
parents:
50179
diff
changeset
|
502 (if (r0 < #x80) ; invalid byte |
b49563831bd2
(ccl-decode-mule-utf-8): Fix a bug of
Kenichi Handa <handa@m17n.org>
parents:
50179
diff
changeset
|
503 (write r0) |
b49563831bd2
(ccl-decode-mule-utf-8): Fix a bug of
Kenichi Handa <handa@m17n.org>
parents:
50179
diff
changeset
|
504 (write-multibyte-character r5 r0)) |
b49563831bd2
(ccl-decode-mule-utf-8): Fix a bug of
Kenichi Handa <handa@m17n.org>
parents:
50179
diff
changeset
|
505 (write-multibyte-character r6 r0))))))) |
46496 | 506 ;; else invalid byte >= #xfe |
507 (write-multibyte-character r6 r0)))))) | |
50085
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
508 (repeat))) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
509 |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
510 ;; At EOF... |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
511 (if (r0 >= 0) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
512 ((if (r0 < #x80) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
513 (write r0) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
514 (if (r0 < #xa0) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
515 (write-multibyte-character r5 r0) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
516 ((write-multibyte-character r6 r0)))) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
517 (if (r1 >= 0) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
518 ((if (r1 < #x80) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
519 (write r1) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
520 (if (r1 < #xa0) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
521 (write-multibyte-character r5 r1) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
522 ((write-multibyte-character r6 r1)))) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
523 (if (r2 >= 0) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
524 ((if (r2 < #x80) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
525 (write r2) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
526 (if (r2 < #xa0) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
527 (write-multibyte-character r5 r2) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
528 ((write-multibyte-character r6 r2)))) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
529 (if (r3 >= 0) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
530 (if (r3 < #x80) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
531 (write r3) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
532 (if (r3 < #xa0) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
533 (write-multibyte-character r5 r3) |
575609f03daa
(ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents:
49598
diff
changeset
|
534 ((write-multibyte-character r6 r3)))))))))))) |
35542 | 535 |
36243 | 536 "CCL program to decode UTF-8. |
36465 | 537 Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
538 mule-unicode-*, but see also `utf-fragmentation-table' and |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
539 `ucs-mule-cjk-to-unicode'. |
46496 | 540 Encodings of un-representable Unicode characters are decoded as-is into |
541 eight-bit-control and eight-bit-graphic characters.") | |
35542 | 542 |
543 (define-ccl-program ccl-encode-mule-utf-8 | |
544 `(1 | |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
545 ((r5 = -1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
546 (loop |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
547 (if (r5 < 0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
548 ((r1 = -1) |
41873 | 549 (read-multibyte-character r0 r1) |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
550 (translate-character utf-translation-table-for-encode r0 r1)) |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
551 (;; We have already done read-multibyte-character. |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
552 (r0 = r5) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
553 (r1 = r6) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
554 (r5 = -1))) |
35542 | 555 |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
556 (if (r0 == ,(charset-id 'ascii)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
557 (write r1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
558 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
559 (if (r0 == ,(charset-id 'latin-iso8859-1)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
560 ;; r1 scalar utf-8 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
561 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
562 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
563 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
564 ((r0 = (((r1 & #x40) >> 6) | #xc2)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
565 (r1 &= #x3f) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
566 (r1 |= #x80) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
567 (write r0 r1)) |
35542 | 568 |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
569 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
570 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
571 ;; #x3f80 == (0011 1111 1000 0000)b |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
572 (r1 &= #x7f) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
573 (r1 += (r0 + 224)) ; 224 == -32 + #x0100 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
574 ;; now r1 holds scalar value |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
575 (if (r1 < #x0800) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
576 ;; 2byte encoding |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
577 ((r0 = (((r1 & #x07c0) >> 6) | #xc0)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
578 ;; #x07c0 == (0000 0111 1100 0000)b |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
579 (r1 &= #x3f) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
580 (r1 |= #x80) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
581 (write r0 r1)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
582 ;; 3byte encoding |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
583 ((r0 = (((r1 & #xf000) >> 12) | #xe0)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
584 (r2 = ((r1 & #x3f) | #x80)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
585 (r1 &= #x0fc0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
586 (r1 >>= 6) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
587 (r1 |= #x80) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
588 (write r0 r1 r2)))) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
589 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
590 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
591 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
592 (r1 &= #x7f) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
593 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
594 (r0 = (((r1 & #xf000) >> 12) | #xe0)) |
35542 | 595 (r2 = ((r1 & #x3f) | #x80)) |
596 (r1 &= #x0fc0) | |
597 (r1 >>= 6) | |
598 (r1 |= #x80) | |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
599 (write r0 r1 r2)) |
35542 | 600 |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
601 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
602 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
603 (r1 &= #x7f) |
46496 | 604 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000 |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
605 (r0 = (((r1 & #xf000) >> 12) | #xe0)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
606 (r2 = ((r1 & #x3f) | #x80)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
607 (r1 &= #x0fc0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
608 (r1 >>= 6) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
609 (r1 |= #x80) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
610 (write r0 r1 r2)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
611 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
612 (if (r0 == ,(charset-id 'eight-bit-control)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
613 ;; r1 scalar utf-8 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
614 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
615 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
616 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
617 ((write #xc2) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
618 (write r1)) |
35542 | 619 |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
620 (if (r0 == ,(charset-id 'eight-bit-graphic)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
621 ;; r1 scalar utf-8 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
622 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
623 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
624 ;; ff 0000 0000 1111 1111 1100 0011 1011 1111 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
625 ((write r1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
626 (r1 = -1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
627 (read-multibyte-character r0 r1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
628 (if (r0 != ,(charset-id 'eight-bit-graphic)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
629 (if (r0 != ,(charset-id 'eight-bit-control)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
630 ((r5 = r0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
631 (r6 = r1)))) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
632 (if (r5 < 0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
633 ((read-multibyte-character r0 r2) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
634 (if (r0 != ,(charset-id 'eight-bit-graphic)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
635 (if (r0 != ,(charset-id 'eight-bit-control)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
636 ((r5 = r0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
637 (r6 = r2)))) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
638 (if (r5 < 0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
639 (write r1 r2) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
640 (if (r1 < #xa0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
641 (write r1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
642 ((write #xc2) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
643 (write r1))))))) |
35542 | 644 |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
645 ((lookup-character utf-subst-table-for-encode r0 r1) |
46496 | 646 (if r7 ; lookup succeeded |
647 ((r1 = (((r0 & #xf000) >> 12) | #xe0)) | |
648 (r2 = ((r0 & #x3f) | #x80)) | |
649 (r0 &= #x0fc0) | |
650 (r0 >>= 6) | |
651 (r0 |= #x80) | |
652 (write r1 r0 r2)) | |
653 ;; Unsupported character. | |
654 ;; Output U+FFFD, which is `ef bf bd' in UTF-8. | |
655 ((write #xef) | |
656 (write #xbf) | |
657 (write #xbd))))))))))) | |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
658 (repeat))) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
659 (if (r1 >= #xa0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
660 (write r1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
661 (if (r1 >= #x80) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
662 ((write #xc2) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
663 (write r1))))) |
35542 | 664 |
46496 | 665 "CCL program to encode into UTF-8.") |
35542 | 666 |
41873 | 667 |
46496 | 668 (define-ccl-program ccl-untranslated-to-ucs |
669 `(0 | |
670 (if (r0 < #xf0) ; 3-byte encoding, as above | |
671 ((r4 = 0) | |
672 (r3 = (r1 & #b11000000)) | |
673 (r3 |= ((r2 >> 2) & #b00110000)) | |
674 (if (r3 != #b10100000) | |
675 (r4 = 1) | |
676 ((r3 = ((r0 & #x0f) << 12)) | |
677 (r3 += ((r1 & #x3f) << 6)) | |
678 (r3 += (r2 & #x3f)) | |
679 (if (r3 < #x0800) | |
680 (r4 = 1)))) | |
681 (if (r4 != 0) | |
682 (r0 = 0) | |
683 (r0 = r3))) | |
684 (if (r0 < #xf8) ; 4-byte (Mule-UCS recipe) | |
685 ((r4 = (r1 >> 6)) | |
686 (if (r4 != #b10) | |
687 (r0 = 0) | |
688 ((r4 = (r2 >> 6)) | |
689 (if (r4 != #b10) | |
690 (r0 = 0) | |
691 ((r4 = (r3 >> 6)) | |
692 (if (r4 != #b10) | |
693 (r0 = 0) | |
694 ((r1 = ((r1 & #x3F) << 12)) | |
695 (r2 = ((r2 & #x3F) << 6)) | |
696 (r3 &= #x3F) | |
697 (r0 = (((((r0 & #x07) << 18) | r1) | r2) | r3))))))))) | |
698 (r0 = 0)))) | |
699 "Decode the 3- or 4-byte sequence in r0, r1, r2 [, r3] to its Unicode value in r0. | |
700 r0 == 0 for an invalid sequence (including any r0 >= #xf8).") | |
701 | |
702 (defvar utf-8-ccl-regs (make-vector 8 0)) | |
703 | |
41873 | 704 (defsubst utf-8-untranslated-to-ucs () |
46496 | 705 "Return the UCS code for an untranslated sequence of raw bytes at point. |
706 Only for 3- or 4-byte sequences. Return 0 if the sequence is invalid." | |
707 (aset utf-8-ccl-regs 0 (or (char-after) 0)) | |
708 (aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0)) | |
709 (aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0)) | |
710 (aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0)) | |
711 (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs) | |
712 (aref utf-8-ccl-regs 0)) | |
41873 | 713 |
714 (defun utf-8-help-echo (window object position) | |
"Return help-echo text for untranslated UTF-8 at POSITION in OBJECT. | |
The text shows the value of the `untranslated-utf-8' property there, | |
formatted as U+ followed by at least four hex digits. WINDOW is not used." | |
715 (format "Untranslated Unicode U+%04X" | |
716 (get-char-property position 'untranslated-utf-8 object))) | |
717 | |
718 ;; We compose the untranslatable sequences into a single character. | |
719 ;; This is infelicitous for editing, because there's currently no | |
720 ;; mechanism for treating compositions as atomic, but is OK for | |
46496 | 721 ;; display. They are composed to U+FFFD with help-echo which |
722 ;; indicates the unicodes they represent. This function GCs too much. | |
41873 | 723 (defsubst utf-8-compose () |
724 "Put a suitable composition on an untranslatable sequence. | |
725 Return the sequence's length (3 or 4), or nil if the sequence is invalid." | |
726 (let* ((u (utf-8-untranslated-to-ucs)) | |
46496 | 727 (l (unless (zerop u) |
728 (if (>= u #x10000) | |
41873 | 729 4 |
46496 | 730 3)))) |
731 (when l | |
41873 | 732 (put-text-property (point) (min (point-max) (+ l (point))) |
733 'untranslated-utf-8 u) | |
46496 | 734 (put-text-property (point) (min (point-max) (+ l (point))) |
735 'help-echo 'utf-8-help-echo) | |
736 (compose-region (point) (+ l (point)) ?$,3u=(B) | |
41873 | 737 l))) |
738 | |
739 (defcustom utf-8-compose-scripts nil | |
46496 | 740 "*Non-nil means compose various scripts on decoding utf-8 text." |
41873 | 741 :group 'mule |
46496 | 742 :version "21.4" |
743 :type 'boolean) | |
41873 | 744 |
745 (defun utf-8-post-read-conversion (length) | |
746 "Compose untranslated utf-8 sequences into single characters. | |
747 Also compose particular scripts if `utf-8-compose-scripts' is non-nil. | |
LENGTH is the number of characters of decoded text, starting at point. | |
Return the length of the text after conversion." | |
748 (save-excursion | |
(save-restriction | |
;; Confine the scan to the LENGTH characters just decoded; without | |
;; this the loop below would run on to the end of the buffer and | |
;; compose text that was already there. | |
(narrow-to-region (point) (+ (point) length)) | |
749 ;; Can't do eval-when-compile to insert a multibyte constant | |
750 ;; version of the string in the loop, since it's always loaded as | |
751 ;; unibyte from a byte-compiled file. | |
46496 | 752 (let ((range (string-as-multibyte "^\xe1-\xf7"))) |
753 (while (and (skip-chars-forward range) | |
41873 | 754 (not (eobp))) |
755 (forward-char (utf-8-compose)))))) | |
46496 | 756 ;; Fixme: Takahashi-san implies it may not work this easily. I |
757 ;; asked why but didn't get a reply. -- fx | |
41873 | 758 (when (and utf-8-compose-scripts (> length 1)) |
759 ;; These currently have definitions which cover the relevant | |
46496 | 760 ;; unicodes. We could avoid loading thai-util &c by checking |
41873 | 761 ;; whether the region contains any characters with the appropriate |
762 ;; categories. There aren't yet Unicode-based rules for Tibetan. | |
763 (save-excursion (setq length (diacritic-post-read-conversion length))) | |
764 (save-excursion (setq length (thai-post-read-conversion length))) | |
765 (save-excursion (setq length (lao-post-read-conversion length))) | |
52520
f5d5daea4d3c
(utf-8-post-read-conversion): Call post-read-conversion functions for
Kenichi Handa <handa@m17n.org>
parents:
52401
diff
changeset
|
766 (save-excursion (setq length (devanagari-post-read-conversion length))) |
f5d5daea4d3c
(utf-8-post-read-conversion): Call post-read-conversion functions for
Kenichi Handa <handa@m17n.org>
parents:
52401
diff
changeset
|
767 (save-excursion (setq length (malayalam-post-read-conversion length))) |
f5d5daea4d3c
(utf-8-post-read-conversion): Call post-read-conversion functions for
Kenichi Handa <handa@m17n.org>
parents:
52401
diff
changeset
|
768 (save-excursion (setq length (tamil-post-read-conversion length)))) |
41873 | 769 length) |
770 | |
46496 | 771 ;; ucs-tables is preloaded |
772 ;; (defun utf-8-pre-write-conversion (beg end) | |
773 ;; "Semi-dummy pre-write function effectively to autoload ucs-tables." | |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
774 ;; ;; Ensure translation-table is loaded. |
46496 | 775 ;; (require 'ucs-tables) |
776 ;; ;; Don't do this again. | |
777 ;; (coding-system-put 'mule-utf-8 'pre-write-conversion nil) | |
778 ;; nil) | |
41873 | 779 |
35542 | 780 (make-coding-system |
781 'mule-utf-8 4 ?u | |
782 "UTF-8 encoding for Emacs-supported Unicode characters. | |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
783 It supports Unicode characters of these ranges: |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
784 U+0000..U+33FF, U+E000..U+FFFF. |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
785 They correspond to these Emacs character sets: |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
786 ascii, latin-iso8859-1, mule-unicode-0100-24ff, |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
787 mule-unicode-2500-33ff, mule-unicode-e000-ffff |
35542 | 788 |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
789 On decoding (e.g. reading a file), Unicode characters not in the above |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
790 ranges are decoded into sequences of eight-bit-control and |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
791 eight-bit-graphic characters to preserve their byte sequences. The |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
792 byte sequence is preserved on i/o for valid utf-8, but not necessarily |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
793 for invalid utf-8. |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
794 |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
795 On encoding (e.g. writing a file), Emacs characters not belonging to |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
796 any of the character sets listed above are encoded into the UTF-8 byte |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
797 sequence representing U+FFFD (REPLACEMENT CHARACTER)." |
35542 | 798 |
799 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) | |
800 '((safe-charsets | |
801 ascii | |
802 eight-bit-control | |
803 eight-bit-graphic | |
804 latin-iso8859-1 | |
805 mule-unicode-0100-24ff | |
806 mule-unicode-2500-33ff | |
807 mule-unicode-e000-ffff) | |
36371
f6bb3ed752b4
(mule-utf-8): Set correct value for valid-codes property.
Kenichi Handa <handa@m17n.org>
parents:
36243
diff
changeset
|
808 (mime-charset . utf-8) |
36423
aa776838b660
(mule-utf-8): Set coding-category property to coding-category-utf-8.
Kenichi Handa <handa@m17n.org>
parents:
36371
diff
changeset
|
809 (coding-category . coding-category-utf-8) |
41873 | 810 (valid-codes (0 . 255)) |
46496 | 811 ;; (pre-write-conversion . utf-8-pre-write-conversion) |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
812 (post-read-conversion . utf-8-post-read-conversion) |
50766
fc9cb527333d
(utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents:
50549
diff
changeset
|
813 (translation-table-for-encode . utf-translation-table-for-encode) |
47703
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
814 (dependency unify-8859-on-encoding-mode |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
815 unify-8859-on-decoding-mode |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
816 utf-fragment-on-decoding |
6d4430dfeafc
(ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents:
47409
diff
changeset
|
817 utf-translate-cjk))) |
35542 | 818 |
819 (define-coding-system-alias 'utf-8 'mule-utf-8) | |
38436
b174db545cfd
Some fixes to follow coding conventions.
Pavel Janík <Pavel@Janik.cz>
parents:
37934
diff
changeset
|
820 |
41873 | 821 ;; I think this needs special private charsets defined for the |
822 ;; untranslated sequences, if it's going to work well. | |
823 | |
824 ;;; (defun utf-8-compose-function (pos to pattern &optional string) | |
825 ;;; (let* ((prop (get-char-property pos 'composition string)) | |
826 ;;; (l (and prop (- (cadr prop) (car prop))))) | |
827 ;;; (cond ((and l (> l (- to pos))) | |
828 ;;; (delete-region pos to)) | |
829 ;;; ((and (> (char-after pos) 224) | |
830 ;;; (< (char-after pos) 256) | |
831 ;;; (save-restriction | |
832 ;;; (narrow-to-region pos to) | |
833 ;;; (utf-8-compose))) | |
834 ;;; t)))) | |
835 | |
836 ;;; (dotimes (i 96) | |
837 ;;; (aset composition-function-table | |
838 ;;; (+ 128 i) | |
839 ;;; `((,(string-as-multibyte "[\200-\237\240-\377]") | |
840 ;;; . utf-8-compose-function)))) | |
841 | |
52401 | 842 ;;; arch-tag: b08735b7-753b-4ae6-b754-0f3efe4515c5 |
38436
b174db545cfd
Some fixes to follow coding conventions.
Pavel Janík <Pavel@Janik.cz>
parents:
37934
diff
changeset
|
843 ;;; utf-8.el ends here |