annotate lisp/international/utf-8.el @ 46676:f0b8a25b0b7d

comment
author Dave Love <fx@gnu.org>
date Wed, 24 Jul 2002 22:23:44 +0000
parents 395e5c46761b
children 2d6a05542b5b
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
1 ;;; utf-8.el --- UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*-
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
2
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
4 ;; Licensed to the Free Software Foundation.
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
5 ;; Copyright (C) 2001, 2002 Free Software Foundation, Inc.
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
6
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
7 ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org>
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
8 ;; Maintainer: FSF
36243
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
9 ;; Keywords: multilingual, Unicode, UTF-8, i18n
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
10
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
11 ;; This file is part of GNU Emacs.
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
12
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
13 ;; GNU Emacs is free software; you can redistribute it and/or modify
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
14 ;; it under the terms of the GNU General Public License as published by
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
15 ;; the Free Software Foundation; either version 2, or (at your option)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
16 ;; any later version.
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
17
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
18 ;; GNU Emacs is distributed in the hope that it will be useful,
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
19 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
20 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
21 ;; GNU General Public License for more details.
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
22
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
23 ;; You should have received a copy of the GNU General Public License
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
24 ;; along with GNU Emacs; see the file COPYING. If not, write to the
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
25 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
26 ;; Boston, MA 02111-1307, USA.
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
27
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
28 ;;; Commentary:
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
29
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
30 ;; The coding-system `mule-utf-8' basically supports encoding/decoding
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
31 ;; of the following character sets to and from UTF-8:
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
32 ;;
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
33 ;; ascii
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
34 ;; eight-bit-control
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
35 ;; latin-iso8859-1
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
36 ;; mule-unicode-0100-24ff
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
37 ;; mule-unicode-2500-33ff
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
38 ;; mule-unicode-e000-ffff
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
39 ;;
36243
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
40 ;; On decoding, Unicode characters that do not fit into the above
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
41 ;; character sets are handled as `eight-bit-control' or
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
42 ;; `eight-bit-graphic' characters to retain the information about the
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
43 ;; original byte sequence and text properties record the corresponding
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
44 ;; unicode.
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
45 ;;
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
46 ;; Fixme: note that reading and writing invalid utf-8 may not be
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
47 ;; idempotent -- fixing that needs a new charset to represent the raw bytes.
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
48 ;;
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
49 ;; Characters from other character sets can be encoded with
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
50 ;; mule-utf-8 by populating the table `ucs-mule-to-mule-unicode' and
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
51 ;; registering the translation with `register-char-codings'. Hash
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
52 ;; tables `utf-8-subst-table' and `utf-8-subst-rev-table' are used to
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
53 ;; support encoding and decoding of about a quarter of the CJK space
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
54 ;; between U+3400 and U+DFFF.
36243
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
55
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
56 ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is:
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
57
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
58 ;; scalar | utf-8
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
59 ;; value | 1st byte | 2nd byte | 3rd byte
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
60 ;; --------------------+-----------+-----------+----------
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
61 ;; 0000 0000 0xxx xxxx | 0xxx xxxx | |
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
62 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx |
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
63 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
64
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
65 ;;; Code:
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
66
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
67 (defvar ucs-mule-to-mule-unicode (make-translation-table)
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
68 "Translation table for encoding to `mule-utf-8'.")
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
69 ;; Could have been done by ucs-tables loaded before.
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
70 (unless (get 'ucs-mule-to-mule-unicode 'translation-table)
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
71 (define-translation-table 'ucs-mule-to-mule-unicode
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
72 ucs-mule-to-mule-unicode))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
73
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
74 (defvar utf-8-subst-table (make-hash-table :test 'eq))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
75 (defvar utf-8-subst-rev-table (make-hash-table :test 'eq))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
76 (define-translation-hash-table 'utf-8-subst-table utf-8-subst-table)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
77 (define-translation-hash-table 'utf-8-subst-rev-table utf-8-subst-rev-table)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
78
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
79 (defvar utf-8-translation-table-for-decode (make-translation-table)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
80 "Translation table applied after decoding utf-8 to mule-unicode.
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
81 This is only actually applied to characters which would normally be
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
82 decoded into mule-unicode-0100-24ff.")
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
83 (define-translation-table 'utf-8-translation-table-for-decode
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
84 utf-8-translation-table-for-decode)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
85
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
86 ;; Map Cyrillic and Greek to iso-8859 charsets, which take half the
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
87 ;; space of mule-unicode. For Latin scripts this isn't very
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
88 ;; important. Hebrew and Arabic might go here too when there's proper
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
89 ;; support for them.
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
90 (mapc
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
91 (lambda (pair)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
92 (aset utf-8-translation-table-for-decode (car pair) (cdr pair)))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
93 '((?$,1&d(B . ?,F4(B) (?$,1&e(B . ?,F5(B) (?$,1&f(B . ?,F6(B) (?$,1&h(B . ?,F8(B) (?$,1&i(B . ?,F9(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
94 (?$,1&j(B . ?,F:(B) (?$,1&l(B . ?,F<(B) (?$,1&n(B . ?,F>(B) (?$,1&o(B . ?,F?(B) (?$,1&p(B . ?,F@(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
95 (?$,1&q(B . ?,FA(B) (?$,1&r(B . ?,FB(B) (?$,1&s(B . ?,FC(B) (?$,1&t(B . ?,FD(B) (?$,1&u(B . ?,FE(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
96 (?$,1&v(B . ?,FF(B) (?$,1&w(B . ?,FG(B) (?$,1&x(B . ?,FH(B) (?$,1&y(B . ?,FI(B) (?$,1&z(B . ?,FJ(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
97 (?$,1&{(B . ?,FK(B) (?$,1&|(B . ?,FL(B) (?$,1&}(B . ?,FM(B) (?$,1&~(B . ?,FN(B) (?$,1&(B . ?,FO(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
98 (?$,1' (B . ?,FP(B) (?$,1'!(B . ?,FQ(B) (?$,1'#(B . ?,FS(B) (?$,1'$(B . ?,FT(B) (?$,1'%(B . ?,FU(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
99 (?$,1'&(B . ?,FV(B) (?$,1''(B . ?,FW(B) (?$,1'((B . ?,FX(B) (?$,1')(B . ?,FY(B) (?$,1'*(B . ?,FZ(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
100 (?$,1'+(B . ?,F[(B) (?$,1',(B . ?,F\(B) (?$,1'-(B . ?,F](B) (?$,1'.(B . ?,F^(B) (?$,1'/(B . ?,F_(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
101 (?$,1'0(B . ?,F`(B) (?$,1'1(B . ?,Fa(B) (?$,1'2(B . ?,Fb(B) (?$,1'3(B . ?,Fc(B) (?$,1'4(B . ?,Fd(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
102 (?$,1'5(B . ?,Fe(B) (?$,1'6(B . ?,Ff(B) (?$,1'7(B . ?,Fg(B) (?$,1'8(B . ?,Fh(B) (?$,1'9(B . ?,Fi(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
103 (?$,1':(B . ?,Fj(B) (?$,1';(B . ?,Fk(B) (?$,1'<(B . ?,Fl(B) (?$,1'=(B . ?,Fm(B) (?$,1'>(B . ?,Fn(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
104 (?$,1'?(B . ?,Fo(B) (?$,1'@(B . ?,Fp(B) (?$,1'A(B . ?,Fq(B) (?$,1'B(B . ?,Fr(B) (?$,1'C(B . ?,Fs(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
105 (?$,1'D(B . ?,Ft(B) (?$,1'E(B . ?,Fu(B) (?$,1'F(B . ?,Fv(B) (?$,1'G(B . ?,Fw(B) (?$,1'H(B . ?,Fx(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
106 (?$,1'I(B . ?,Fy(B) (?$,1'J(B . ?,Fz(B) (?$,1'K(B . ?,F{(B) (?$,1'L(B . ?,F|(B) (?$,1'M(B . ?,F}(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
107 (?$,1'N(B . ?,F~(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
108
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
109 (?$,1(!(B . ?,L!(B) (?$,1("(B . ?,L"(B) (?$,1(#(B . ?,L#(B) (?$,1($(B . ?,L$(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
110 (?$,1(%(B . ?,L%(B) (?$,1(&(B . ?,L&(B) (?$,1('(B . ?,L'(B) (?$,1(((B . ?,L((B) (?$,1()(B . ?,L)(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
111 (?$,1(*(B . ?,L*(B) (?$,1(+(B . ?,L+(B) (?$,1(,(B . ?,L,(B) (?$,1(.(B . ?,L.(B) (?$,1(/(B . ?,L/(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
112 (?$,1(0(B . ?,L0(B) (?$,1(1(B . ?,L1(B) (?$,1(2(B . ?,L2(B) (?$,1(3(B . ?,L3(B) (?$,1(4(B . ?,L4(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
113 (?$,1(5(B . ?,L5(B) (?$,1(6(B . ?,L6(B) (?$,1(7(B . ?,L7(B) (?$,1(8(B . ?,L8(B) (?$,1(9(B . ?,L9(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
114 (?$,1(:(B . ?,L:(B) (?$,1(;(B . ?,L;(B) (?$,1(<(B . ?,L<(B) (?$,1(=(B . ?,L=(B) (?$,1(>(B . ?,L>(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
115 (?$,1(?(B . ?,L?(B) (?$,1(@(B . ?,L@(B) (?$,1(A(B . ?,LA(B) (?$,1(B(B . ?,LB(B) (?$,1(C(B . ?,LC(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
116 (?$,1(D(B . ?,LD(B) (?$,1(E(B . ?,LE(B) (?$,1(F(B . ?,LF(B) (?$,1(G(B . ?,LG(B) (?$,1(H(B . ?,LH(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
117 (?$,1(I(B . ?,LI(B) (?$,1(J(B . ?,LJ(B) (?$,1(K(B . ?,LK(B) (?$,1(L(B . ?,LL(B) (?$,1(M(B . ?,LM(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
118 (?$,1(N(B . ?,LN(B) (?$,1(O(B . ?,LO(B) (?$,1(P(B . ?,LP(B) (?$,1(Q(B . ?,LQ(B) (?$,1(R(B . ?,LR(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
119 (?$,1(S(B . ?,LS(B) (?$,1(T(B . ?,LT(B) (?$,1(U(B . ?,LU(B) (?$,1(V(B . ?,LV(B) (?$,1(W(B . ?,LW(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
120 (?$,1(X(B . ?,LX(B) (?$,1(Y(B . ?,LY(B) (?$,1(Z(B . ?,LZ(B) (?$,1([(B . ?,L[(B) (?$,1(\(B . ?,L\(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
121 (?$,1(](B . ?,L](B) (?$,1(^(B . ?,L^(B) (?$,1(_(B . ?,L_(B) (?$,1(`(B . ?,L`(B) (?$,1(a(B . ?,La(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
122 (?$,1(b(B . ?,Lb(B) (?$,1(c(B . ?,Lc(B) (?$,1(d(B . ?,Ld(B) (?$,1(e(B . ?,Le(B) (?$,1(f(B . ?,Lf(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
123 (?$,1(g(B . ?,Lg(B) (?$,1(h(B . ?,Lh(B) (?$,1(i(B . ?,Li(B) (?$,1(j(B . ?,Lj(B) (?$,1(k(B . ?,Lk(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
124 (?$,1(l(B . ?,Ll(B) (?$,1(m(B . ?,Lm(B) (?$,1(n(B . ?,Ln(B) (?$,1(o(B . ?,Lo(B) (?$,1(q(B . ?,Lq(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
125 (?$,1(r(B . ?,Lr(B) (?$,1(s(B . ?,Ls(B) (?$,1(t(B . ?,Lt(B) (?$,1(u(B . ?,Lu(B) (?$,1(v(B . ?,Lv(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
126 (?$,1(w(B . ?,Lw(B) (?$,1(x(B . ?,Lx(B) (?$,1(y(B . ?,Ly(B) (?$,1(z(B . ?,Lz(B) (?$,1({(B . ?,L{(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
127 (?$,1(|(B . ?,L|(B) (?$,1(~(B . ?,L~(B) (?$,1((B . ?,L(B)))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
128
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
129 (defcustom utf-8-fragment-on-decoding nil
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
130 "Whether or not to decode some scripts in UTF-8 text into 8-bit characters.
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
131 Setting this means that the relevant Cyrillic and Greek characters are
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
132 decoded into the iso8859 charsets rather than into
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
133 mule-unicode-0100-24ff. The 8-bit characters take half as much space
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
134 in the buffer, but using them may affect how the buffer can be re-encoded
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
135 and may require a different input method to search for them, for instance.
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
136 See `unify-8859-on-decoding-mode' and `unify-8859-on-encoding-mode'
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
137 for mechanisms to make this largely transparent."
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
138 :set (lambda (s v)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
139 (if v
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
140 (define-translation-table 'utf-8-translation-table-for-decode
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
141 utf-8-translation-table-for-decode)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
142 (define-translation-table 'utf-8-translation-table-for-decode))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
143 (set-default s v))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
144 :version "21.4"
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
145 :type 'boolean
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
146 :group 'mule)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
147
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
148 (defcustom utf-8-translate-cjk nil
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
149 "Whether the `mule-utf-8' coding system should encode many CJK characters.
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
150
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
151 Enabling this loads tables which enable the coding system to encode
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
152 characters in the charsets `korean-ksc5601', `chinese-gb2312' and
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
153 `japanese-jisx0208', and to decode the corresponding unicodes into
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
154 such characters. This works by loading the library `utf-8-subst'; see
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
155 its commentary. The tables are fairly large (about 33000 entries), so this
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
156 option is not the default."
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
157 :link '(emacs-commentary-link "utf-8-subst")
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
158 :set (lambda (s v)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
159 (when v
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
160 (require 'utf-8-subst)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
161 (let ((table (make-char-table 'translation-table)))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
162 (coding-system-put 'mule-utf-8 'safe-charsets
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
163 (append (coding-system-get 'mule-utf-8
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
164 'safe-charsets)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
165 '(korean-ksc5601 chinese-gb2312
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
166 japanese-jisx0208)))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
167 (maphash (lambda (k v)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
168 (aset table k v))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
169 utf-8-subst-rev-table)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
170 (register-char-codings 'mule-utf-8 table)))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
171 (set-default s v))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
172 :version "21.4"
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
173 :type 'boolean
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
174 :group 'mule)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
175
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
176 (define-ccl-program ccl-decode-mule-utf-8
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
177 ;;
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
178 ;; charset | bytes in utf-8 | bytes in emacs
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
179 ;; -----------------------+----------------+---------------
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
180 ;; ascii | 1 | 1
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
181 ;; -----------------------+----------------+---------------
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
182 ;; eight-bit-control | 2 | 2
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
183 ;; eight-bit-graphic | 2 | 1
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
184 ;; latin-iso8859-1 | 2 | 2
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
185 ;; -----------------------+----------------+---------------
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
186 ;; mule-unicode-0100-24ff | 2 | 4
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
187 ;; (< 0800) | |
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
188 ;; -----------------------+----------------+---------------
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
189 ;; mule-unicode-0100-24ff | 3 | 4
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
190 ;; (>= 0800) | |
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
191 ;; mule-unicode-2500-33ff | 3 | 4
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
192 ;; mule-unicode-e000-ffff | 3 | 4
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
193 ;;
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
194 ;; Thus the magnification factor is two.
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
195 ;;
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
196 `(2
37934
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
197 ((r5 = ,(charset-id 'eight-bit-control))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
198 (r6 = ,(charset-id 'eight-bit-graphic))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
199 (loop
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
200 (read r0)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
201
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
202 ;; 1byte encoding, i.e., ascii
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
203 (if (r0 < #x80)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
204 (write r0)
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
205 (if (r0 < #xc0) ; continuation byte (invalid here)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
206 (if (r0 < #xa0)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
207 (write-multibyte-character r5 r0)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
208 (write-multibyte-character r6 r0))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
209 ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
210 (if (r0 < #xe0)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
211 ((read r1)
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
212
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
213 (if ((r1 & #b11000000) != #b10000000)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
214 ;; Invalid 2-byte sequence
37934
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
215 ((if (r0 < #xa0)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
216 (write-multibyte-character r5 r0)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
217 (write-multibyte-character r6 r0))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
218 (if (r1 < #x80)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
219 (write r1)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
220 (if (r1 < #xa0)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
221 (write-multibyte-character r5 r1)
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
222 (write-multibyte-character r6 r1))))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
223
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
224 ((r3 = r0) ; save in case of overlong sequence
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
225 (r2 = r1)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
226 (r0 &= #x1f)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
227 (r0 <<= 6)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
228 (r2 = r1) ; save in case of overlong sequence
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
229 (r1 &= #x3f)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
230 (r1 += r0)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
231 ;; Now r1 holds scalar value
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
232
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
233 (if (r1 < 128) ; `overlong sequence'
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
234 ((if (r3 < #xa0)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
235 (write-multibyte-character r5 r3)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
236 (write-multibyte-character r6 r3))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
237 (if (r2 < #x80)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
238 (write r2)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
239 (if (r2 < #xa0)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
240 (write-multibyte-character r5 r2)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
241 (write-multibyte-character r6 r2))))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
242
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
243 ;; eight-bit-control
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
244 (if (r1 < 160)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
245 ((write-multibyte-character r5 r1))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
246
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
247 ;; latin-iso8859-1
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
248 (if (r1 < 256)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
249 ((r0 = ,(charset-id 'latin-iso8859-1))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
250 (r1 -= 128)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
251 (write-multibyte-character r0 r1))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
252
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
253 ;; mule-unicode-0100-24ff (< 0800)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
254 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
255 (r1 -= #x0100)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
256 (r2 = (((r1 / 96) + 32) << 7))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
257 (r1 %= 96)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
258 (r1 += (r2 + 32))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
259 (translate-character
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
260 utf-8-translation-table-for-decode r0 r1)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
261 (write-multibyte-character r0 r1))))))))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
262
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
263 ;; 3byte encoding
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
264 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
265 (if (r0 < #xf0)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
266 ((read r1 r2)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
267
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
268 ;; This is set to 1 if the encoding is invalid.
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
269 (r4 = 0)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
270
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
271 (r3 = (r1 & #b11000000))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
272 (r3 |= ((r2 >> 2) & #b00110000))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
273 (if (r3 != #b10100000)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
274 (r4 = 1)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
275 ((r3 = ((r0 & #x0f) << 12))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
276 (r3 += ((r1 & #x3f) << 6))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
277 (r3 += (r2 & #x3f))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
278 (if (r3 < #x0800)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
279 (r4 = 1))))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
280
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
281 (if (r4 != 0)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
282 ;; Invalid 3-byte sequence
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
283 ((if (r0 < #xa0)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
284 (write-multibyte-character r5 r0)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
285 (write-multibyte-character r6 r0))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
286 (if (r1 < #x80)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
287 (write r1)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
288 (if (r1 < #xa0)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
289 (write-multibyte-character r5 r1)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
290 (write-multibyte-character r6 r1)))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
291 (if (r2 < #x80)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
292 (write r2)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
293 (if (r2 < #xa0)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
294 (write-multibyte-character r5 r2)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
295 (write-multibyte-character r6 r2))))
37934
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
296
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
297 ;; mule-unicode-0100-24ff (>= 0800)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
298 ((if (r3 < #x2500)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
299 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
300 (r3 -= #x0100)
37934
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
301 (r3 //= 96)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
302 (r1 = (r7 + 32))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
303 (r1 += ((r3 + 32) << 7))
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
304 (translate-character
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
305 utf-8-translation-table-for-decode r0 r1)
37934
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
306 (write-multibyte-character r0 r1))
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
307
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
308 ;; mule-unicode-2500-33ff
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
309 ;; Fixme: Perhaps allow translation via
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
310 ;; utf-8-subst-table for #x2e80 up, so that we use
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
311 ;; consistent charsets for all of CJK. Would need
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
312 ;; corresponding change to encoding tables.
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
313 (if (r3 < #x3400)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
314 ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
315 (r3 -= #x2500)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
316 (r3 //= 96)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
317 (r1 = (r7 + 32))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
318 (r1 += ((r3 + 32) << 7))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
319 (write-multibyte-character r0 r1))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
320
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
321 ;; U+3400 .. U+D7FF
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
322 ;; Try to convert to CJK chars, else keep
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
323 ;; them as eight-bit-{control|graphic}.
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
324 (if (r3 < #xd800)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
325 ((r4 = r3) ; don't zap r3
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
326 (lookup-integer utf-8-subst-table r4 r5)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
327 (if r7
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
328 ;; got a translation
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
329 ((write-multibyte-character r4 r5)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
330 ;; Zapped through register starvation.
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
331 (r5 = ,(charset-id 'eight-bit-control)))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
332 ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
333 ((r3 = r6)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
334 (write-multibyte-character r3 r0)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
335 (if (r1 < #xa0)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
336 (r3 = r5))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
337 (write-multibyte-character r3 r1)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
338 (if (r2 < #xa0)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
339 (r3 = r5)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
340 (r3 = r6))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
341 (write-multibyte-character r3 r2))))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
342
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
343 ;; Surrogates, U+D800 .. U+DFFF
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
344 (if (r3 < #xe000)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
345 ((r3 = r6)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
346 (write-multibyte-character r3 r0) ; eight-bit-graphic
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
347 (if (r1 < #xa0)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
348 (r3 = r5))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
349 (write-multibyte-character r3 r1)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
350 (if (r2 < #xa0)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
351 (r3 = r5)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
352 (r3 = r6))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
353 (write-multibyte-character r3 r2))
37934
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
354
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
355 ;; mule-unicode-e000-ffff
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
356 ;; Fixme: fffe and ffff are invalid.
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
357 ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
358 (r3 -= #xe000)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
359 (r3 //= 96)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
360 (r1 = (r7 + 32))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
361 (r1 += ((r3 + 32) << 7))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
362 (write-multibyte-character r0 r1)))))))))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
363
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
364 (if (r0 < #xfe)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
365 ;; 4byte encoding
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
366 ;; keep those bytes as eight-bit-{control|graphic}
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
367 ;; Fixme: allow lookup in utf-8-subst-table.
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
368 ((read r1 r2 r3)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
369 ;; r0 >= #xf0, thus eight-bit-graphic
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
370 (write-multibyte-character r6 r0)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
371 (if (r1 < #xa0)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
372 (if (r1 < #x80) ; invalid byte
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
373 (write r1)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
374 (write-multibyte-character r5 r1))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
375 (write-multibyte-character r6 r1))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
376 (if (r2 < #xa0)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
377 (if (r2 < #x80) ; invalid byte
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
378 (write r2)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
379 (write-multibyte-character r5 r2))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
380 (write-multibyte-character r6 r2))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
381 (if (r3 < #xa0)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
382 (if (r3 < #x80) ; invalid byte
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
383 (write r3)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
384 (write-multibyte-character r5 r3))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
385 (write-multibyte-character r6 r3))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
386 (if (r0 >= #xf8) ; 5- or 6-byte encoding
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
387 ((read r1)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
388 (if (r1 < #xa0)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
389 (if (r1 < #x80) ; invalid byte
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
390 (write r1)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
391 (write-multibyte-character r5 r1))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
392 (write-multibyte-character r6 r1))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
393 (if (r0 >= #xfc) ; 6-byte
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
394 ((read r1)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
395 (if (r1 < #xa0)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
396 (if (r1 < #x80) ; invalid byte
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
397 (write r1)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
398 (write-multibyte-character r5 r1))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
399 (write-multibyte-character r6 r1)))))))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
400 ;; else invalid byte >= #xfe
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
401 (write-multibyte-character r6 r0))))))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
402 (repeat))))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
403
36243
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
404 "CCL program to decode UTF-8.
36465
f968e313e8ad Doc fixes.
Dave Love <fx@gnu.org>
parents: 36423
diff changeset
405 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
406 mule-unicode-*, but see also `utf-8-translation-table-for-decode' and
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
407 `utf-8-subst-table'.
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
408 Encodings of un-representable Unicode characters are decoded as-is into
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
409 eight-bit-control and eight-bit-graphic characters.")
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
410
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
411 (define-ccl-program ccl-encode-mule-utf-8
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
412 `(1
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
413 ((r5 = -1)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
414 (loop
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
415 (if (r5 < 0)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
416 ((r1 = -1)
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
417 (read-multibyte-character r0 r1)
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
418 (translate-character ucs-mule-to-mule-unicode r0 r1))
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
419 (;; We have already done read-multibyte-character.
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
420 (r0 = r5)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
421 (r1 = r6)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
422 (r5 = -1)))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
423
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
424 (if (r0 == ,(charset-id 'ascii))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
425 (write r1)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
426
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
427 (if (r0 == ,(charset-id 'latin-iso8859-1))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
428 ;; r1 scalar utf-8
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
429 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
430 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
431 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
432 ((r0 = (((r1 & #x40) >> 6) | #xc2))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
433 (r1 &= #x3f)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
434 (r1 |= #x80)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
435 (write r0 r1))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
436
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
437 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
438 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
439 ;; #x3f80 == (0011 1111 1000 0000)b
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
440 (r1 &= #x7f)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
441 (r1 += (r0 + 224)) ; 224 == -32 + #x0100
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
442 ;; now r1 holds scalar value
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
443 (if (r1 < #x0800)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
444 ;; 2byte encoding
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
445 ((r0 = (((r1 & #x07c0) >> 6) | #xc0))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
446 ;; #x07c0 == (0000 0111 1100 0000)b
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
447 (r1 &= #x3f)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
448 (r1 |= #x80)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
449 (write r0 r1))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
450 ;; 3byte encoding
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
451 ((r0 = (((r1 & #xf000) >> 12) | #xe0))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
452 (r2 = ((r1 & #x3f) | #x80))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
453 (r1 &= #x0fc0)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
454 (r1 >>= 6)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
455 (r1 |= #x80)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
456 (write r0 r1 r2))))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
457
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
458 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
459 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
460 (r1 &= #x7f)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
461 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
462 (r0 = (((r1 & #xf000) >> 12) | #xe0))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
463 (r2 = ((r1 & #x3f) | #x80))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
464 (r1 &= #x0fc0)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
465 (r1 >>= 6)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
466 (r1 |= #x80)
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
467 (write r0 r1 r2))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
468
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
469 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
470 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
471 (r1 &= #x7f)
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
472 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
473 (r0 = (((r1 & #xf000) >> 12) | #xe0))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
474 (r2 = ((r1 & #x3f) | #x80))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
475 (r1 &= #x0fc0)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
476 (r1 >>= 6)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
477 (r1 |= #x80)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
478 (write r0 r1 r2))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
479
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
480 (if (r0 == ,(charset-id 'eight-bit-control))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
481 ;; r1 scalar utf-8
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
482 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
483 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
484 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
485 ((write #xc2)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
486 (write r1))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
487
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
488 (if (r0 == ,(charset-id 'eight-bit-graphic))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
489 ;; r1 scalar utf-8
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
490 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
491 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
492 ;; ff 0000 0000 1111 1111 1101 1111 1011 1111
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
493 ((write r1)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
494 (r1 = -1)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
495 (read-multibyte-character r0 r1)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
496 (if (r0 != ,(charset-id 'eight-bit-graphic))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
497 (if (r0 != ,(charset-id 'eight-bit-control))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
498 ((r5 = r0)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
499 (r6 = r1))))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
500 (if (r5 < 0)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
501 ((read-multibyte-character r0 r2)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
502 (if (r0 != ,(charset-id 'eight-bit-graphic))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
503 (if (r0 != ,(charset-id 'eight-bit-control))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
504 ((r5 = r0)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
505 (r6 = r2))))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
506 (if (r5 < 0)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
507 (write r1 r2)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
508 (if (r1 < #xa0)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
509 (write r1)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
510 ((write #xc2)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
511 (write r1)))))))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
512
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
513 ((lookup-character utf-8-subst-rev-table r0 r1)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
514 (if r7 ; lookup succeeded
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
515 ((r1 = (((r0 & #xf000) >> 12) | #xe0))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
516 (r2 = ((r0 & #x3f) | #x80))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
517 (r0 &= #x0fc0)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
518 (r0 >>= 6)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
519 (r0 |= #x80)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
520 (write r1 r0 r2))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
521 ;; Unsupported character.
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
522 ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
523 ((write #xef)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
524 (write #xbf)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
525 (write #xbd)))))))))))
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
526 (repeat)))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
527 (if (r1 >= #xa0)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
528 (write r1)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
529 (if (r1 >= #x80)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
530 ((write #xc2)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
531 (write r1)))))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
532
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
533 "CCL program to encode into UTF-8.")
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
534
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
;; Placeholder translation table so that the CCL code in this file can
;; be checked; the real contents are loaded on demand.  If the symbol
;; is already bound, the existing definition is left alone.
(or (boundp 'ucs-mule-8859-to-mule-unicode)
    (define-translation-table 'ucs-mule-8859-to-mule-unicode))
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
539
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
(define-ccl-program ccl-untranslated-to-ucs
  `(0
    (if (r0 < #xf0)                     ; 3-byte encoding
        ((r4 = 0)                       ; r4 != 0 flags an invalid sequence
         ;; Gather the top two bits of r1 and of r2 into r3; both
         ;; trailing bytes must look like 10xxxxxx.
         (r3 = (r1 & #b11000000))
         (r3 |= ((r2 >> 2) & #b00110000))
         (if (r3 != #b10100000)
             (r4 = 1)
           ((r3 = ((r0 & #x0f) << 12))
            (r3 += ((r1 & #x3f) << 6))
            (r3 += (r2 & #x3f))
            ;; Reject overlong forms: anything below U+0800 does not
            ;; need three bytes.
            (if (r3 < #x0800)
                (r4 = 1))))
         (if (r4 != 0)
             (r0 = 0)
           (r0 = r3)))
      (if (r0 < #xf8)                   ; 4-byte (Mule-UCS recipe)
          ;; Each of r1, r2 and r3 must look like 10xxxxxx.
          ((r4 = (r1 >> 6))
           (if (r4 != #b10)
               (r0 = 0)
             ((r4 = (r2 >> 6))
              (if (r4 != #b10)
                  (r0 = 0)
                ((r4 = (r3 >> 6))
                 (if (r4 != #b10)
                     (r0 = 0)
                   ((r1 = ((r1 & #x3F) << 12))
                    (r2 = ((r2 & #x3F) << 6))
                    (r3 &= #x3F)
                    (r0 = (((((r0 & #x07) << 18) | r1) | r2) | r3))
                    ;; Reject overlong forms here too, as the 3-byte
                    ;; branch does.  Callers derive the sequence
                    ;; length from the decoded value, so a 4-byte
                    ;; sequence must never yield less than U+10000.
                    (if (r0 < #x10000)
                        (r0 = 0)))))))))
        (r0 = 0))))
  "Decode 3- or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0.
r0 == 0 for invalid sequence, including overlong forms (a 3-byte
sequence below U+0800 or a 4-byte sequence below U+10000).")
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
573
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
(defvar utf-8-ccl-regs (make-vector 8 0)
  "Register vector passed to `ccl-untranslated-to-ucs'.
It is reused across calls by `utf-8-untranslated-to-ucs'.")

(defsubst utf-8-untranslated-to-ucs ()
  "Return the UCS code for an untranslated sequence of raw bytes at point.
Only for 3- or 4-byte sequences.
The four characters starting at point are loaded into registers 0-3
of `utf-8-ccl-regs' (0 stands in for a position with no character),
`ccl-untranslated-to-ucs' is run on them, and register 0 is returned."
  (let ((start (point)))
    (dotimes (i 4)
      (aset utf-8-ccl-regs i (or (char-after (+ start i)) 0))))
  (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs)
  (aref utf-8-ccl-regs 0))
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
585
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
(defun utf-8-help-echo (window object position)
  "Return help text for the untranslated UTF-8 sequence at POSITION.
OBJECT is the buffer or string holding POSITION; its
`untranslated-utf-8' property there supplies the UCS code to show.
WINDOW is ignored.
Return nil if there is no such property at POSITION, rather than
signaling a format error."
  (let ((code (get-char-property position 'untranslated-utf-8 object)))
    (when (integerp code)
      (format "Untranslated Unicode U+%04X" code))))
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
589
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
590 ;; We compose the untranslatable sequences into a single character.
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
591 ;; This is infelicitous for editing, because there's currently no
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
592 ;; mechanism for treating compositions as atomic, but is OK for
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
593 ;; display. They are composed to U+FFFD with help-echo which
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
594 ;; indicates the unicodes they represent. This function GCs too much.
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
(defsubst utf-8-compose ()
  "Put a suitable composition on an untranslatable sequence.
The sequence starting at point is decoded with
`utf-8-untranslated-to-ucs'.  If that yields a non-zero code, the
sequence gets the text properties `untranslated-utf-8' (the code) and
`help-echo' (`utf-8-help-echo') and is composed to display as U+FFFD.
Return the sequence's length (3 or 4), or nil if the decoded code is
zero, in which case nothing is changed."
  (let* ((code (utf-8-untranslated-to-ucs))
         (len (cond ((zerop code) nil)
                    ((>= code #x10000) 4)
                    (t 3))))
    (when len
      (let ((start (point))
            ;; Compute the clamped end once so that the property
            ;; changes and the composition cover the same span.
            (end (min (point-max) (+ len (point)))))
        (put-text-property start end 'untranslated-utf-8 code)
        (put-text-property start end 'help-echo 'utf-8-help-echo)
        ;; U+FFFD REPLACEMENT CHARACTER, built from its code point
        ;; instead of an encoded literal that depends on the file's
        ;; coding system surviving intact.
        (compose-region start end (decode-char 'ucs #xFFFD)))
      len)))
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
610
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
(defcustom utf-8-compose-scripts nil
  "*If non-nil, compose text in various scripts when decoding utf-8.
`utf-8-post-read-conversion' consults this option."
  :type 'boolean
  :group 'mule
  :version "21.4")
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
616
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
(defun utf-8-post-read-conversion (length)
  "Compose untranslated utf-8 sequences into single characters.
Also compose particular scripts if `utf-8-compose-scripts' is non-nil.
LENGTH is the number of characters of decoded text, which starts at
point; only that text is scanned for untranslated sequences.
Return LENGTH, as updated by the script-specific conversion functions
when they are run."
  (save-excursion
    (save-restriction
      ;; Confine the scan to the text that was just decoded, so that
      ;; text beyond it in the buffer is never touched.
      (narrow-to-region (point) (min (point-max) (+ (point) length)))
      ;; Can't do eval-when-compile to insert a multibyte constant
      ;; version of the string in the loop, since it's always loaded as
      ;; unibyte from a byte-compiled file.
      (let ((range (string-as-multibyte "^\xe1-\xf7")))
        ;; `skip-chars-forward' is called for its side effect of moving
        ;; to the next candidate lead byte.  `utf-8-compose' returns
        ;; nil for an invalid sequence, which makes `forward-char' step
        ;; over a single character.
        (while (and (skip-chars-forward range)
                    (not (eobp)))
          (forward-char (utf-8-compose))))))
  ;; Fixme: Takahashi-san implies it may not work this easily.  I
  ;; asked why but didn't get a reply.  -- fx
  (when (and utf-8-compose-scripts (> length 1))
    ;; These currently have definitions which cover the relevant
    ;; unicodes.  We could avoid loading thai-util &c by checking
    ;; whether the region contains any characters with the appropriate
    ;; categories.  There aren't yet Unicode-based rules for Tibetan.
    (save-excursion (setq length (diacritic-post-read-conversion length)))
    (save-excursion (setq length (thai-post-read-conversion length)))
    (save-excursion (setq length (lao-post-read-conversion length)))
    (save-excursion
      (setq length (in-is13194-devanagari-post-read-conversion length))))
  length)
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
641
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
642 ;; ucs-tables is preloaded
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
643 ;; (defun utf-8-pre-write-conversion (beg end)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
644 ;; "Semi-dummy pre-write function effectively to autoload ucs-tables."
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
645 ;; ;; Ensure translation table is loaded.
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
646 ;; (require 'ucs-tables)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
647 ;; ;; Don't do this again.
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
648 ;; (coding-system-put 'mule-utf-8 'pre-write-conversion nil)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
649 ;; nil)
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
650
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
;; Define the `mule-utf-8' coding system, built on the CCL decoder and
;; encoder from this file, and make `utf-8' an alias for it.
(let ((charsets '(ascii
                  eight-bit-control
                  eight-bit-graphic
                  latin-iso8859-1
                  latin-iso8859-15
                  latin-iso8859-14
                  latin-iso8859-9
                  hebrew-iso8859-8
                  greek-iso8859-7
                  cyrillic-iso8859-5
                  latin-iso8859-4
                  latin-iso8859-3
                  latin-iso8859-2
                  vietnamese-viscii-lower
                  vietnamese-viscii-upper
                  thai-tis620
                  ipa
                  ethiopic
                  indian-is13194
                  katakana-jisx0201
                  chinese-sisheng
                  lao
                  mule-unicode-0100-24ff
                  mule-unicode-2500-33ff
                  mule-unicode-e000-ffff)))
  (make-coding-system
   'mule-utf-8 4 ?u
   "UTF-8 encoding for Emacs-supported Unicode characters.
The supported Emacs character sets are the following, plus any other
characters included in the tables `ucs-mule-to-mule-unicode' and
`utf-8-subst-rev-table':
ascii
eight-bit-control
eight-bit-graphic
latin-iso8859-1
latin-iso8859-2
latin-iso8859-3
latin-iso8859-4
cyrillic-iso8859-5
greek-iso8859-7
hebrew-iso8859-8
latin-iso8859-9
latin-iso8859-14
latin-iso8859-15
mule-unicode-0100-24ff
mule-unicode-2500-33ff
mule-unicode-e000-ffff

Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF
may be decoded into korean-ksc5601, chinese-gb2312, japanese-jisx0208
\(see user option `utf-8-translate-cjk'); otherwise, sequences of
eight-bit-control and eight-bit-graphic characters are used to
preserve their byte sequences, and these are composed to display as a
single character.  Emacs characters that otherwise can't be encoded
are encoded as U+FFFD."

   '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
   `((safe-charsets ,@charsets)
     (mime-charset . utf-8)
     (coding-category . coding-category-utf-8)
     (valid-codes (0 . 255))
     ;; (pre-write-conversion . utf-8-pre-write-conversion)
     (post-read-conversion . utf-8-post-read-conversion))))

(define-coding-system-alias 'utf-8 'mule-utf-8)
38436
b174db545cfd Some fixes to follow coding conventions.
Pavel Janík <Pavel@Janik.cz>
parents: 37934
diff changeset
716
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
717 ;; I think this needs special private charsets defined for the
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
718 ;; untranslated sequences, if it's going to work well.
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
719
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
720 ;;; (defun utf-8-compose-function (pos to pattern &optional string)
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
721 ;;; (let* ((prop (get-char-property pos 'composition string))
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
722 ;;; (l (and prop (- (cadr prop) (car prop)))))
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
723 ;;; (cond ((and l (> l (- to pos)))
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
724 ;;; (delete-region pos to))
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
725 ;;; ((and (> (char-after pos) 224)
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
726 ;;; (< (char-after pos) 256)
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
727 ;;; (save-restriction
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
728 ;;; (narrow-to-region pos to)
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
729 ;;; (utf-8-compose)))
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
730 ;;; t))))
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
731
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
732 ;;; (dotimes (i 96)
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
733 ;;; (aset composition-function-table
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
734 ;;; (+ 128 i)
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
735 ;;; `((,(string-as-multibyte "[\200-\237\240-\377]")
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
736 ;;; . utf-8-compose-function))))
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
737
38436
b174db545cfd Some fixes to follow coding conventions.
Pavel Janík <Pavel@Janik.cz>
parents: 37934
diff changeset
738 ;;; utf-8.el ends here