annotate lisp/international/utf-8.el @ 56811:694cd033cd0d

Make "GNU GENERAL PUBLIC LICENSE" an appendix. Rearrange order of nodes and sections such that both "GNU GENERAL PUBLIC LICENSE" and "GNU Free Documentation License" appear at the end, as appropriate for appendices. (Acknowledgments): Use `@unnumberedsec'.
author Luc Teirlinck <teirllm@auburn.edu>
date Fri, 27 Aug 2004 23:36:38 +0000
parents 752ef76fcc08
children c3945be39e09
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
1 ;;; utf-8.el --- UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*-
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
2
55437
6e677a935fe9 Fix references to utf-translate-cjk into utf-translate-cjk-mode.
Andreas Schwab <schwab@suse.de>
parents: 54304
diff changeset
3 ;; Copyright (C) 2001, 2004 Electrotechnical Laboratory, JAPAN.
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
4 ;; Licensed to the Free Software Foundation.
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
5 ;; Copyright (C) 2001, 2002 Free Software Foundation, Inc.
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
6
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
7 ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org>
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
8 ;; Maintainer: FSF
36243
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
9 ;; Keywords: multilingual, Unicode, UTF-8, i18n
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
10
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
11 ;; This file is part of GNU Emacs.
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
12
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
13 ;; GNU Emacs is free software; you can redistribute it and/or modify
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
14 ;; it under the terms of the GNU General Public License as published by
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
15 ;; the Free Software Foundation; either version 2, or (at your option)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
16 ;; any later version.
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
17
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
18 ;; GNU Emacs is distributed in the hope that it will be useful,
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
19 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
20 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
21 ;; GNU General Public License for more details.
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
22
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
23 ;; You should have received a copy of the GNU General Public License
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
24 ;; along with GNU Emacs; see the file COPYING. If not, write to the
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
25 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
26 ;; Boston, MA 02111-1307, USA.
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
27
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
28 ;;; Commentary:
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
29
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
30 ;; The coding-system `mule-utf-8' basically supports encoding/decoding
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
31 ;; of the following character sets to and from UTF-8:
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
32 ;;
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
33 ;; ascii
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
34 ;; eight-bit-control
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
35 ;; latin-iso8859-1
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
36 ;; mule-unicode-0100-24ff
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
37 ;; mule-unicode-2500-33ff
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
38 ;; mule-unicode-e000-ffff
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
39 ;;
36243
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
40 ;; On decoding, Unicode characters that do not fit into the above
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
41 ;; character sets are handled as `eight-bit-control' or
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
42 ;; `eight-bit-graphic' characters to retain the information about the
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
43 ;; original byte sequence and text properties record the corresponding
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
44 ;; Unicode.
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
45 ;;
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
46 ;; Fixme: note that reading and writing invalid utf-8 may not be
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
47 ;; idempotent -- representing the raw bytes so as to fix that needs a new charset.
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
48 ;;
47703
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
49 ;; Characters from other character sets can be encoded with mule-utf-8
48848
4eb835c1257d (ucs-mule-cjk-to-unicode)
Dave Love <fx@gnu.org>
parents: 47720
diff changeset
50 ;; by populating the translation table
50179
65bb5afb37ef (utf-fragment-on-decoding): Don't call
Kenichi Handa <handa@m17n.org>
parents: 50085
diff changeset
51 ;; `utf-translation-table-for-encode'. Hash tables
47703
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
52 ;; `utf-subst-table-for-decode' and `utf-subst-table-for-encode' are
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
53 ;; used to support encoding and decoding of about a quarter of the CJK
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
54 ;; space between U+3400 and U+DFFF.
36243
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
55
54304
d61b01de8cdf UTF-8 is now RFC 3629.
Eli Zaretskii <eliz@gnu.org>
parents: 52725
diff changeset
56 ;; UTF-8 is defined in RFC 3629. A sketch of the encoding is:
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
57
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
58 ;; scalar | utf-8
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
59 ;; value | 1st byte | 2nd byte | 3rd byte
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
60 ;; --------------------+-----------+-----------+----------
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
61 ;; 0000 0000 0xxx xxxx | 0xxx xxxx | |
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
62 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx |
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
63 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
64
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
65 ;;; Code:
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
66
47703
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
67 (defvar ucs-mule-to-mule-unicode (make-char-table 'translation-table nil)
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
68 "Char table mapping characters to latin-iso8859-1 or mule-unicode-*.
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
69
47703
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
70 If `unify-8859-on-encoding-mode' is non-nil, this table populates the
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
71 translation-table named `utf-translation-table-for-encode'.")
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
72
47703
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
73 (define-translation-table 'utf-translation-table-for-encode)
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
74
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
75
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
76 ;; Map Cyrillic and Greek to iso-8859 charsets, which take half the
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
77 ;; space of mule-unicode. For Latin scripts this isn't very
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
78 ;; important. Hebrew and Arabic might go here too when there's proper
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
79 ;; support for them.
47703
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
80
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
81 (defvar utf-fragmentation-table (make-char-table 'translation-table nil)
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
82 "Char-table normally mapping non-Latin mule-unicode-* chars to iso-8859-*.
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
83
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
84 If `utf-fragment-on-decoding' is non-nil, this table populates the
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
85 translation-table named `utf-translation-table-for-decode'.")
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
86
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
87 (defvar utf-defragmentation-table (make-char-table 'translation-table nil)
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
88 "Char-table for reverse mapping of `utf-fragmentation-table'.
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
89
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
90 If `utf-fragment-on-decoding' is non-nil and
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
91 `unify-8859-on-encoding-mode' is nil, this table populates the
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
92 translation-table named `utf-translation-table-for-encode'.")
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
93
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
94 (define-translation-table 'utf-translation-table-for-decode)
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
95
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
96
48882
d17c0d3e36ba (ucs-mule-cjk-to-unicode)
Dave Love <fx@gnu.org>
parents: 48848
diff changeset
97 (defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq)
47703
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
98 "Hash table mapping Emacs CJK character sets to Unicode code points.
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
99
55437
6e677a935fe9 Fix references to utf-translate-cjk into utf-translate-cjk-mode.
Andreas Schwab <schwab@suse.de>
parents: 54304
diff changeset
100 If `utf-translate-cjk-mode' is non-nil, this table populates the
47703
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
101 translation-hash-table named `utf-subst-table-for-encode'.")
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
102
48882
d17c0d3e36ba (ucs-mule-cjk-to-unicode)
Dave Love <fx@gnu.org>
parents: 48848
diff changeset
103 (define-translation-hash-table 'utf-subst-table-for-encode
d17c0d3e36ba (ucs-mule-cjk-to-unicode)
Dave Love <fx@gnu.org>
parents: 48848
diff changeset
104 ucs-mule-cjk-to-unicode)
47703
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
105
48882
d17c0d3e36ba (ucs-mule-cjk-to-unicode)
Dave Love <fx@gnu.org>
parents: 48848
diff changeset
106 (defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq)
47703
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
107 "Hash table mapping Unicode code points to Emacs CJK character sets.
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
108
55437
6e677a935fe9 Fix references to utf-translate-cjk into utf-translate-cjk-mode.
Andreas Schwab <schwab@suse.de>
parents: 54304
diff changeset
109 If `utf-translate-cjk-mode' is non-nil, this table populates the
47703
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
110 translation-hash-table named `utf-subst-table-for-decode'.")
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
111
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
112 (define-translation-hash-table 'utf-subst-table-for-decode
48882
d17c0d3e36ba (ucs-mule-cjk-to-unicode)
Dave Love <fx@gnu.org>
parents: 48848
diff changeset
113 ucs-unicode-to-mule-cjk)
47703
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
114
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
115 (mapc
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
116 (lambda (pair)
47703
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
117 (aset utf-fragmentation-table (car pair) (cdr pair))
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
118 (aset utf-defragmentation-table (cdr pair) (car pair)))
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
119 '((?$,1&d(B . ?,F4(B) (?$,1&e(B . ?,F5(B) (?$,1&f(B . ?,F6(B) (?$,1&h(B . ?,F8(B) (?$,1&i(B . ?,F9(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
120 (?$,1&j(B . ?,F:(B) (?$,1&l(B . ?,F<(B) (?$,1&n(B . ?,F>(B) (?$,1&o(B . ?,F?(B) (?$,1&p(B . ?,F@(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
121 (?$,1&q(B . ?,FA(B) (?$,1&r(B . ?,FB(B) (?$,1&s(B . ?,FC(B) (?$,1&t(B . ?,FD(B) (?$,1&u(B . ?,FE(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
122 (?$,1&v(B . ?,FF(B) (?$,1&w(B . ?,FG(B) (?$,1&x(B . ?,FH(B) (?$,1&y(B . ?,FI(B) (?$,1&z(B . ?,FJ(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
123 (?$,1&{(B . ?,FK(B) (?$,1&|(B . ?,FL(B) (?$,1&}(B . ?,FM(B) (?$,1&~(B . ?,FN(B) (?$,1&(B . ?,FO(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
124 (?$,1' (B . ?,FP(B) (?$,1'!(B . ?,FQ(B) (?$,1'#(B . ?,FS(B) (?$,1'$(B . ?,FT(B) (?$,1'%(B . ?,FU(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
125 (?$,1'&(B . ?,FV(B) (?$,1''(B . ?,FW(B) (?$,1'((B . ?,FX(B) (?$,1')(B . ?,FY(B) (?$,1'*(B . ?,FZ(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
126 (?$,1'+(B . ?,F[(B) (?$,1',(B . ?,F\(B) (?$,1'-(B . ?,F](B) (?$,1'.(B . ?,F^(B) (?$,1'/(B . ?,F_(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
127 (?$,1'0(B . ?,F`(B) (?$,1'1(B . ?,Fa(B) (?$,1'2(B . ?,Fb(B) (?$,1'3(B . ?,Fc(B) (?$,1'4(B . ?,Fd(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
128 (?$,1'5(B . ?,Fe(B) (?$,1'6(B . ?,Ff(B) (?$,1'7(B . ?,Fg(B) (?$,1'8(B . ?,Fh(B) (?$,1'9(B . ?,Fi(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
129 (?$,1':(B . ?,Fj(B) (?$,1';(B . ?,Fk(B) (?$,1'<(B . ?,Fl(B) (?$,1'=(B . ?,Fm(B) (?$,1'>(B . ?,Fn(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
130 (?$,1'?(B . ?,Fo(B) (?$,1'@(B . ?,Fp(B) (?$,1'A(B . ?,Fq(B) (?$,1'B(B . ?,Fr(B) (?$,1'C(B . ?,Fs(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
131 (?$,1'D(B . ?,Ft(B) (?$,1'E(B . ?,Fu(B) (?$,1'F(B . ?,Fv(B) (?$,1'G(B . ?,Fw(B) (?$,1'H(B . ?,Fx(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
132 (?$,1'I(B . ?,Fy(B) (?$,1'J(B . ?,Fz(B) (?$,1'K(B . ?,F{(B) (?$,1'L(B . ?,F|(B) (?$,1'M(B . ?,F}(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
133 (?$,1'N(B . ?,F~(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
134
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
135 (?$,1(!(B . ?,L!(B) (?$,1("(B . ?,L"(B) (?$,1(#(B . ?,L#(B) (?$,1($(B . ?,L$(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
136 (?$,1(%(B . ?,L%(B) (?$,1(&(B . ?,L&(B) (?$,1('(B . ?,L'(B) (?$,1(((B . ?,L((B) (?$,1()(B . ?,L)(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
137 (?$,1(*(B . ?,L*(B) (?$,1(+(B . ?,L+(B) (?$,1(,(B . ?,L,(B) (?$,1(.(B . ?,L.(B) (?$,1(/(B . ?,L/(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
138 (?$,1(0(B . ?,L0(B) (?$,1(1(B . ?,L1(B) (?$,1(2(B . ?,L2(B) (?$,1(3(B . ?,L3(B) (?$,1(4(B . ?,L4(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
139 (?$,1(5(B . ?,L5(B) (?$,1(6(B . ?,L6(B) (?$,1(7(B . ?,L7(B) (?$,1(8(B . ?,L8(B) (?$,1(9(B . ?,L9(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
140 (?$,1(:(B . ?,L:(B) (?$,1(;(B . ?,L;(B) (?$,1(<(B . ?,L<(B) (?$,1(=(B . ?,L=(B) (?$,1(>(B . ?,L>(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
141 (?$,1(?(B . ?,L?(B) (?$,1(@(B . ?,L@(B) (?$,1(A(B . ?,LA(B) (?$,1(B(B . ?,LB(B) (?$,1(C(B . ?,LC(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
142 (?$,1(D(B . ?,LD(B) (?$,1(E(B . ?,LE(B) (?$,1(F(B . ?,LF(B) (?$,1(G(B . ?,LG(B) (?$,1(H(B . ?,LH(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
143 (?$,1(I(B . ?,LI(B) (?$,1(J(B . ?,LJ(B) (?$,1(K(B . ?,LK(B) (?$,1(L(B . ?,LL(B) (?$,1(M(B . ?,LM(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
144 (?$,1(N(B . ?,LN(B) (?$,1(O(B . ?,LO(B) (?$,1(P(B . ?,LP(B) (?$,1(Q(B . ?,LQ(B) (?$,1(R(B . ?,LR(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
145 (?$,1(S(B . ?,LS(B) (?$,1(T(B . ?,LT(B) (?$,1(U(B . ?,LU(B) (?$,1(V(B . ?,LV(B) (?$,1(W(B . ?,LW(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
146 (?$,1(X(B . ?,LX(B) (?$,1(Y(B . ?,LY(B) (?$,1(Z(B . ?,LZ(B) (?$,1([(B . ?,L[(B) (?$,1(\(B . ?,L\(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
147 (?$,1(](B . ?,L](B) (?$,1(^(B . ?,L^(B) (?$,1(_(B . ?,L_(B) (?$,1(`(B . ?,L`(B) (?$,1(a(B . ?,La(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
148 (?$,1(b(B . ?,Lb(B) (?$,1(c(B . ?,Lc(B) (?$,1(d(B . ?,Ld(B) (?$,1(e(B . ?,Le(B) (?$,1(f(B . ?,Lf(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
149 (?$,1(g(B . ?,Lg(B) (?$,1(h(B . ?,Lh(B) (?$,1(i(B . ?,Li(B) (?$,1(j(B . ?,Lj(B) (?$,1(k(B . ?,Lk(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
150 (?$,1(l(B . ?,Ll(B) (?$,1(m(B . ?,Lm(B) (?$,1(n(B . ?,Ln(B) (?$,1(o(B . ?,Lo(B) (?$,1(q(B . ?,Lq(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
151 (?$,1(r(B . ?,Lr(B) (?$,1(s(B . ?,Ls(B) (?$,1(t(B . ?,Lt(B) (?$,1(u(B . ?,Lu(B) (?$,1(v(B . ?,Lv(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
152 (?$,1(w(B . ?,Lw(B) (?$,1(x(B . ?,Lx(B) (?$,1(y(B . ?,Ly(B) (?$,1(z(B . ?,Lz(B) (?$,1({(B . ?,L{(B)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
153 (?$,1(|(B . ?,L|(B) (?$,1(~(B . ?,L~(B) (?$,1((B . ?,L(B)))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
154
47703
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
155
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
156 (defcustom utf-fragment-on-decoding nil
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
157 "Whether or not to decode some chars in UTF-8/16 text into iso8859 charsets.
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
158 Setting this means that the relevant Cyrillic and Greek characters are
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
159 decoded into the iso8859 charsets rather than into
47231
2d6a05542b5b *** empty log message ***
Kenichi Handa <handa@m17n.org>
parents: 46676
diff changeset
160 mule-unicode-0100-24ff. The iso8859 charsets take half as much space
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
161 in the buffer, but using them may affect how the buffer can be re-encoded
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
162 and may require a different input method to search for them, for instance.
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
163 See `unify-8859-on-decoding-mode' and `unify-8859-on-encoding-mode'
47231
2d6a05542b5b *** empty log message ***
Kenichi Handa <handa@m17n.org>
parents: 46676
diff changeset
164 for mechanisms to make this largely transparent.
2d6a05542b5b *** empty log message ***
Kenichi Handa <handa@m17n.org>
parents: 46676
diff changeset
165
2d6a05542b5b *** empty log message ***
Kenichi Handa <handa@m17n.org>
parents: 46676
diff changeset
166 Setting this variable outside Customize has no effect."
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
167 :set (lambda (s v)
47703
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
168 (if v
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
169 (progn
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
170 (define-translation-table 'utf-translation-table-for-decode
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
171 utf-fragmentation-table)
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
172 ;; Even if unify-8859-on-encoding-mode is off, make
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
173 ;; mule-utf-* encode characters in
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
174 ;; utf-fragmentation-table.
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
175 (unless (eq (get 'utf-translation-table-for-encode
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
176 'translation-table)
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
177 ucs-mule-to-mule-unicode)
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
178 (define-translation-table 'utf-translation-table-for-encode
50179
65bb5afb37ef (utf-fragment-on-decoding): Don't call
Kenichi Handa <handa@m17n.org>
parents: 50085
diff changeset
179 utf-defragmentation-table)))
47703
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
180 (define-translation-table 'utf-translation-table-for-decode)
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
181 ;; When unify-8859-on-encoding-mode is off, be sure to make
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
182 ;; mule-utf-* disabled for characters in
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
183 ;; utf-fragmentation-table.
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
184 (unless (eq (get 'utf-translation-table-for-encode
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
185 'translation-table)
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
186 ucs-mule-to-mule-unicode)
50179
65bb5afb37ef (utf-fragment-on-decoding): Don't call
Kenichi Handa <handa@m17n.org>
parents: 50085
diff changeset
187 (define-translation-table 'utf-translation-table-for-encode)))
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
188 (set-default s v))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
189 :version "21.4"
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
190 :type 'boolean
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
191 :group 'mule)
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
192
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
193
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
194 (defconst utf-translate-cjk-charsets '(chinese-gb2312
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
195 chinese-big5-1 chinese-big5-2
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
196 japanese-jisx0208 japanese-jisx0212
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
197 korean-ksc5601)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
198 "List of charsets supported by `utf-translate-cjk-mode'.")
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
199
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
200 (defconst utf-translate-cjk-unicode-range
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
201 '((#x2e80 . #xd7a3)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
202 (#xff00 . #xffef))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
203 "List of Unicode code ranges supported by `utf-translate-cjk-mode'.")
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
204
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
205 ;; Return non-nil if CODE-POINT is in `utf-translate-cjk-unicode-range'.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
206 (defsubst utf-translate-cjk-substitutable-p (code-point)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
207 (let ((tail utf-translate-cjk-unicode-range)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
208 elt)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
209 (while tail
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
210 (setq elt (car tail) tail (cdr tail))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
211 (if (and (>= code-point (car elt)) (<= code-point (cdr elt)))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
212 (setq tail nil)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
213 (setq elt nil)))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
214 elt))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
215
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
216 (defvar utf-translate-cjk-lang-env nil
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
217 "Language environment in which tables for `utf-translate-cjk-mode' are loaded.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
218 The value nil means that the tables are not yet loaded.")
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
219
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
220 (defun utf-translate-cjk-load-tables ()
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
221 "Load tables for `utf-translate-cjk-mode'."
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
222 ;; Fixme: Allow the use of the CJK charsets to be
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
223 ;; customized by reordering and possible omission.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
224 (let ((redefined (< (hash-table-size ucs-mule-cjk-to-unicode) 43000)))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
225 (if redefined
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
226 ;; Redefine them with realistic initial sizes and a
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
227 ;; smallish rehash size to avoid wasting significant
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
228 ;; space after they're built.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
229 (setq ucs-mule-cjk-to-unicode
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
230 (make-hash-table :test 'eq :size 43000 :rehash-size 1000)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
231 ucs-unicode-to-mule-cjk
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
232 (make-hash-table :test 'eq :size 21500 :rehash-size 1000)))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
233
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
234 ;; Load the files explicitly, to avoid having to keep
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
235 ;; around the large tables they contain (as well as the
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
236 ;; ones which get built).
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
237 (cond ((string= "Korean" current-language-environment)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
238 (load "subst-jis")
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
239 (load "subst-big5")
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
240 (load "subst-gb2312")
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
241 (load "subst-ksc"))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
242 ((string= "Chinese-BIG5" current-language-environment)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
243 (load "subst-jis")
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
244 (load "subst-ksc")
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
245 (load "subst-gb2312")
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
246 (load "subst-big5"))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
247 ((string= "Chinese-GB" current-language-environment)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
248 (load "subst-jis")
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
249 (load "subst-ksc")
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
250 (load "subst-big5")
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
251 (load "subst-gb2312"))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
252 (t
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
253 (load "subst-ksc")
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
254 (load "subst-gb2312")
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
255 (load "subst-big5")
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
256 (load "subst-jis"))) ; jis covers as much as big5, gb2312
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
257
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
258 (when redefined
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
259 (define-translation-hash-table 'utf-subst-table-for-decode
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
260 ucs-unicode-to-mule-cjk)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
261 (define-translation-hash-table 'utf-subst-table-for-encode
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
262 ucs-mule-cjk-to-unicode)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
263 (set-char-table-extra-slot (get 'utf-translation-table-for-encode
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
264 'translation-table)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
265 1 ucs-mule-cjk-to-unicode))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
266
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
267 (setq utf-translate-cjk-lang-env current-language-environment)))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
268
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
269 (defun utf-lookup-subst-table-for-decode (code-point)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
270 (if (and utf-translate-cjk-mode
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
271 (not utf-translate-cjk-lang-env)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
272 (utf-translate-cjk-substitutable-p code-point))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
273 (utf-translate-cjk-load-tables))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
274 (gethash code-point
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
275 (get 'utf-subst-table-for-decode 'translation-hash-table)))
56562
9274a15c1400 (utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents: 56095
diff changeset
276
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
277
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
278 (defun utf-lookup-subst-table-for-encode (char)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
279 (if (and utf-translate-cjk-mode
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
280 (not utf-translate-cjk-lang-env)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
281 (memq (char-charset char) utf-translate-cjk-charsets))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
282 (utf-translate-cjk-load-tables))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
283 (gethash char
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
284 (get 'utf-subst-table-for-encode 'translation-hash-table)))
56562
9274a15c1400 (utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents: 56095
diff changeset
285
50341
f49a20cb84ed (utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents: 50207
diff changeset
286 (define-minor-mode utf-translate-cjk-mode
56562
9274a15c1400 (utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents: 56095
diff changeset
287 "Toggle whether UTF based coding systems de/encode CJK characters.
9274a15c1400 (utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents: 56095
diff changeset
288 If ARG is an integer, enable if ARG is positive and disable if
9274a15c1400 (utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents: 56095
diff changeset
289 zero or negative. This is a minor mode.
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
290 Enabling this allows the coding systems mule-utf-8,
51628
abfc7d48b476 (utf-translate-cjk-mode): Fix docstring.
Kenichi Handa <handa@m17n.org>
parents: 50766
diff changeset
291 mule-utf-16le and mule-utf-16be to encode characters in the charsets
48848
4eb835c1257d (ucs-mule-cjk-to-unicode)
Dave Love <fx@gnu.org>
parents: 47720
diff changeset
292 `korean-ksc5601', `chinese-gb2312', `chinese-big5-1',
4eb835c1257d (ucs-mule-cjk-to-unicode)
Dave Love <fx@gnu.org>
parents: 47720
diff changeset
293 `chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to
4eb835c1257d (ucs-mule-cjk-to-unicode)
Dave Love <fx@gnu.org>
parents: 47720
diff changeset
294 decode the corresponding unicodes into such characters.
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
295
48848
4eb835c1257d (ucs-mule-cjk-to-unicode)
Dave Love <fx@gnu.org>
parents: 47720
diff changeset
296 Where the charsets overlap, the one preferred for decoding is chosen
4eb835c1257d (ucs-mule-cjk-to-unicode)
Dave Love <fx@gnu.org>
parents: 47720
diff changeset
297 according to the language environment in effect when this option is
4eb835c1257d (ucs-mule-cjk-to-unicode)
Dave Love <fx@gnu.org>
parents: 47720
diff changeset
298 turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for
4eb835c1257d (ucs-mule-cjk-to-unicode)
Dave Love <fx@gnu.org>
parents: 47720
diff changeset
299 Chinese-Big5 and jisx for other environments.
4eb835c1257d (ucs-mule-cjk-to-unicode)
Dave Love <fx@gnu.org>
parents: 47720
diff changeset
300
56562
9274a15c1400 (utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents: 56095
diff changeset
301 This mode is on by default. If you are not interested in CJK
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
302 characters and want to avoid some overhead on encoding/decoding
56562
9274a15c1400 (utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents: 56095
diff changeset
303 by the above coding systems, you can customize the user option
9274a15c1400 (utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents: 56095
diff changeset
304 `utf-translate-cjk-mode' to nil."
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
305 :init-value t
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
306 :version "21.4"
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
307 :type 'boolean
50341
f49a20cb84ed (utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents: 50207
diff changeset
308 :group 'mule
f49a20cb84ed (utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents: 50207
diff changeset
309 :global t
f49a20cb84ed (utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents: 50207
diff changeset
310 (if utf-translate-cjk-mode
f49a20cb84ed (utf-translate-cjk-mode): Minor mode,
Kai Großjohann <kgrossjo@eu.uu.net>
parents: 50207
diff changeset
311 (progn
50766
fc9cb527333d (utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents: 50549
diff changeset
312 (define-translation-hash-table 'utf-subst-table-for-decode
fc9cb527333d (utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents: 50549
diff changeset
313 ucs-unicode-to-mule-cjk)
fc9cb527333d (utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents: 50549
diff changeset
314 (define-translation-hash-table 'utf-subst-table-for-encode
fc9cb527333d (utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents: 50549
diff changeset
315 ucs-mule-cjk-to-unicode)
fc9cb527333d (utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents: 50549
diff changeset
316 (set-char-table-extra-slot (get 'utf-translation-table-for-encode
fc9cb527333d (utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents: 50549
diff changeset
317 'translation-table)
fc9cb527333d (utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents: 50549
diff changeset
318 1 ucs-mule-cjk-to-unicode))
50549
c8525ac04d76 (utf-translate-cjk-mode): Fix incorrect
Kenichi Handa <handa@m17n.org>
parents: 50341
diff changeset
319 (define-translation-hash-table 'utf-subst-table-for-decode
c8525ac04d76 (utf-translate-cjk-mode): Fix incorrect
Kenichi Handa <handa@m17n.org>
parents: 50341
diff changeset
320 (make-hash-table :test 'eq))
c8525ac04d76 (utf-translate-cjk-mode): Fix incorrect
Kenichi Handa <handa@m17n.org>
parents: 50341
diff changeset
321 (define-translation-hash-table 'utf-subst-table-for-encode
50766
fc9cb527333d (utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents: 50549
diff changeset
322 (make-hash-table :test 'eq))
fc9cb527333d (utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents: 50549
diff changeset
323 (set-char-table-extra-slot (get 'utf-translation-table-for-encode
fc9cb527333d (utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents: 50549
diff changeset
324 'translation-table)
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
325 1 nil))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
326
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
327 ;; Update safe-chars of mule-utf-* coding systems.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
328 (dolist (elt (coding-system-list t))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
329 (if (string-match "^mule-utf" (symbol-name elt))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
330 (let ((safe-charsets (coding-system-get elt 'safe-charsets))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
331 (safe-chars (coding-system-get elt 'safe-chars))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
332 (need-update nil))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
333 (dolist (charset utf-translate-cjk-charsets)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
334 (unless (eq utf-translate-cjk-mode (memq charset safe-charsets))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
335 (setq safe-charsets
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
336 (if utf-translate-cjk-mode
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
337 (cons charset safe-charsets)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
338 (delq charset safe-charsets))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
339 need-update t)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
340 (aset safe-chars (make-char charset) utf-translate-cjk-mode)))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
341 (when need-update
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
342 (coding-system-put elt 'safe-charsets safe-charsets)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
343 (define-coding-system-internal elt))))))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
344
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
345 (define-ccl-program ccl-mule-utf-untrans
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
346 ;; R0 is an untranslatable Unicode code-point (U+3500..U+DFFF or
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
347 ;; U+10000..U+10FFFF) or an invalid byte (#x00..#xFF). Write
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
348 ;; eight-bit-control/graphic sequence (2 to 4 chars) representing
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
349 ;; UTF-8 sequence of r0. Registers r4, r5, r6 are modified.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
350 ;;
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
351 ;; This is a subroutine because we assume that this is called very
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
352 ;; rarely (so we don't have to worry about the overhead of the
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
353 ;; call).
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
354 `(0
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
355 ((r5 = ,(charset-id 'eight-bit-control))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
356 (r6 = ,(charset-id 'eight-bit-graphic))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
357 (if (r0 < #x100)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
358 ((r4 = ((r0 >> 6) | #xC0))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
359 (write-multibyte-character r6 r4))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
360 ((if (r0 < #x10000)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
361 ((r4 = ((r0 >> 12) | #xE0))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
362 (write-multibyte-character r6 r4))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
363 ((r4 = ((r0 >> 18) | #xF0))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
364 (write-multibyte-character r6 r4)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
365 (r4 = (((r0 >> 12) & #x3F) | #x80))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
366 (if (r4 < #xA0)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
367 (write-multibyte-character r5 r4)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
368 (write-multibyte-character r6 r4))))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
369 (r4 = (((r0 >> 6) & #x3F) | #x80))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
370 (if (r4 < #xA0)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
371 (write-multibyte-character r5 r4)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
372 (write-multibyte-character r6 r4))))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
373 (r4 = ((r0 & #x3F) | #x80))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
374 (if (r4 < #xA0)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
375 (write-multibyte-character r5 r4)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
376 (write-multibyte-character r6 r4)))))
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
377
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
378 (define-ccl-program ccl-decode-mule-utf-8
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
379 ;;
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
380 ;; charset | bytes in utf-8 | bytes in emacs
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
381 ;; -----------------------+----------------+---------------
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
382 ;; ascii | 1 | 1
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
383 ;; -----------------------+----------------+---------------
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
384 ;; eight-bit-control | 2 | 2
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
385 ;; eight-bit-graphic | 2 | 1
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
386 ;; latin-iso8859-1 | 2 | 2
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
387 ;; -----------------------+----------------+---------------
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
388 ;; mule-unicode-0100-24ff | 2 | 4
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
389 ;; (< 0800) | |
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
390 ;; -----------------------+----------------+---------------
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
391 ;; mule-unicode-0100-24ff | 3 | 4
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
392 ;; (>= 0800) | |
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
393 ;; mule-unicode-2500-33ff | 3 | 4
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
394 ;; mule-unicode-e000-ffff | 3 | 4
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
395 ;; -----------------------+----------------+---------------
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
396 ;; invalid byte | 1 | 2
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
397 ;;
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
398 ;; Thus magnification factor is two.
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
399 ;;
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
400 `(2
56056
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
401 ((r6 = ,(charset-id 'latin-iso8859-1))
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
402 (read r0)
37934
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
403 (loop
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
404 (if (r0 < #x80)
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
405 ;; 1-byte encoding, i.e., ascii
56056
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
406 (write-read-repeat r0))
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
407 (if (r0 < #xc2)
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
408 ;; continuation byte (invalid here) or 1st byte of overlong
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
409 ;; 2-byte sequence.
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
410 ((call ccl-mule-utf-untrans)
56056
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
411 (r6 = ,(charset-id 'latin-iso8859-1))
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
412 (read r0)
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
413 (repeat)))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
414
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
415 ;; Read the 2nd byte.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
416 (read r1)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
417 (if ((r1 & #b11000000) != #b10000000) ; Invalid 2nd byte
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
418 ((call ccl-mule-utf-untrans)
56056
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
419 (r6 = ,(charset-id 'latin-iso8859-1))
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
420 ;; Handle it in the next loop.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
421 (r0 = r1)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
422 (repeat)))
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
423
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
424 (if (r0 < #xe0)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
425 ;; 2-byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
56056
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
426 ((r1 &= #x3F)
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
427 (r1 |= ((r0 & #x1F) << 6))
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
428 ;; Now r1 holds scalar value. We don't have to check
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
429 ;; `overlong sequence' because r0 >= 0xC2.
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
430
56056
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
431 (if (r1 >= 256)
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
432 ;; mule-unicode-0100-24ff (< 0800)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
433 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
434 (r1 -= #x0100)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
435 (r2 = (((r1 / 96) + 32) << 7))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
436 (r1 %= 96)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
437 (r1 += (r2 + 32))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
438 (translate-character
56056
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
439 utf-translation-table-for-decode r0 r1)
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
440 (write-multibyte-character r0 r1)
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
441 (read r0)
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
442 (repeat))
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
443 (if (r1 >= 160)
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
444 ;; latin-iso8859-1
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
445 ((r1 -= 128)
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
446 (write-multibyte-character r6 r1)
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
447 (read r0)
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
448 (repeat))
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
449 ;; eight-bit-control
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
450 ((r0 = ,(charset-id 'eight-bit-control))
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
451 (write-multibyte-character r0 r1)
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
452 (read r0)
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
453 (repeat))))))
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
454
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
455 ;; Read the 3rd byte.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
456 (read r2)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
457 (if ((r2 & #b11000000) != #b10000000) ; Invalid 3rd byte
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
458 ((call ccl-mule-utf-untrans)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
459 (r0 = r1)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
460 (call ccl-mule-utf-untrans)
56056
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
461 (r6 = ,(charset-id 'latin-iso8859-1))
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
462 ;; Handle it in the next loop.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
463 (r0 = r2)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
464 (repeat)))
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
465
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
466 (if (r0 < #xF0)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
467 ;; 3byte encoding
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
468 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
469 ((r3 = ((r0 & #xF) << 12))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
470 (r3 |= ((r1 & #x3F) << 6))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
471 (r3 |= (r2 & #x3F))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
472
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
473 (if (r3 < #x800) ; `overlong sequence'
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
474 ((call ccl-mule-utf-untrans)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
475 (r0 = r1)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
476 (call ccl-mule-utf-untrans)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
477 (r0 = r2)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
478 (call ccl-mule-utf-untrans)
56056
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
479 (r6 = ,(charset-id 'latin-iso8859-1))
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
480 (read r0)
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
481 (repeat)))
49598
0d8b17d428b5 Trailing whitepace deleted.
Juanma Barranquero <lekktu@gmail.com>
parents: 49028
diff changeset
482
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
483 (if (r3 < #x2500)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
484 ;; mule-unicode-0100-24ff (>= 0800)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
485 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
486 (r3 -= #x0100)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
487 (r3 //= 96)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
488 (r1 = (r7 + 32))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
489 (r1 += ((r3 + 32) << 7))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
490 (translate-character
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
491 utf-translation-table-for-decode r0 r1)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
492 (write-multibyte-character r0 r1)
56056
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
493 (read r0)
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
494 (repeat)))
49598
0d8b17d428b5 Trailing whitepace deleted.
Juanma Barranquero <lekktu@gmail.com>
parents: 49028
diff changeset
495
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
496 (if (r3 < #x3400)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
497 ;; mule-unicode-2500-33ff
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
498 ((r0 = r3) ; don't zap r3
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
499 (lookup-integer utf-subst-table-for-decode r0 r1)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
500 (if (r7 == 0)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
501 ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
502 (r3 -= #x2500)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
503 (r3 //= 96)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
504 (r1 = (r7 + 32))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
505 (r1 += ((r3 + 32) << 7))))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
506 (write-multibyte-character r0 r1)
56056
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
507 (read r0)
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
508 (repeat)))
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
509
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
510 (if (r3 < #xE000)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
511 ;; Try to convert to CJK chars, else
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
512 ;; keep them as eight-bit-{control|graphic}.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
513 ((r0 = r3)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
514 (lookup-integer utf-subst-table-for-decode r3 r1)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
515 (if r7
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
516 ;; got a translation
56056
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
517 ((write-multibyte-character r3 r1)
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
518 (read r0)
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
519 (repeat))
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
520 ((call ccl-mule-utf-untrans)
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
521 (r6 = ,(charset-id 'latin-iso8859-1))
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
522 (read r0)
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
523 (repeat)))))
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
524
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
525 ;; mule-unicode-e000-ffff
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
526 ;; Fixme: fffe and ffff are invalid.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
527 (r0 = r3) ; don't zap r3
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
528 (lookup-integer utf-subst-table-for-decode r0 r1)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
529 (if (r7 == 0)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
530 ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
531 (r3 -= #xe000)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
532 (r3 //= 96)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
533 (r1 = (r7 + 32))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
534 (r1 += ((r3 + 32) << 7))))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
535 (write-multibyte-character r0 r1)
56056
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
536 (read r0)
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
537 (repeat)))
49598
0d8b17d428b5 Trailing whitepace deleted.
Juanma Barranquero <lekktu@gmail.com>
parents: 49028
diff changeset
538
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
539 ;; Read the 4th byte.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
540 (read r3)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
541 (if ((r3 & #b11000000) != #b10000000) ; Invalid 4th byte
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
542 ((call ccl-mule-utf-untrans)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
543 (r0 = r1)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
544 (call ccl-mule-utf-untrans)
56056
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
545 (r0 = r2)
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
546 (call ccl-mule-utf-untrans)
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
547 (r6 = ,(charset-id 'latin-iso8859-1))
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
548 ;; Handle it in the next loop.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
549 (r0 = r3)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
550 (repeat)))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
551
56056
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
552 (if (r0 < #xF8)
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
553 ;; 4-byte encoding:
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
554 ;; wwwzzzzzzyyyyyyxxxxxx = 11110www 10zzzzzz 10yyyyyy 10xxxxxx
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
555 ;; keep those bytes as eight-bit-{control|graphic}
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
556 ;; Fixme: allow lookup in utf-subst-table-for-decode.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
557 ((r4 = ((r0 & #x7) << 18))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
558 (r4 |= ((r1 & #x3F) << 12))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
559 (r4 |= ((r2 & #x3F) << 6))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
560 (r4 |= (r3 & #x3F))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
561
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
562 (if (r4 < #x10000) ; `overlong sequence'
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
563 ((call ccl-mule-utf-untrans)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
564 (r0 = r1)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
565 (call ccl-mule-utf-untrans)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
566 (r0 = r2)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
567 (call ccl-mule-utf-untrans)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
568 (r0 = r3)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
569 (call ccl-mule-utf-untrans))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
570 ((r0 = r4)
56056
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
571 (call ccl-mule-utf-untrans))))
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
572
56056
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
573 ;; Unsupported sequence.
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
574 ((call ccl-mule-utf-untrans)
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
575 (r0 = r1)
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
576 (call ccl-mule-utf-untrans)
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
577 (r0 = r2)
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
578 (call ccl-mule-utf-untrans)
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
579 (r0 = r3)
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
580 (call ccl-mule-utf-untrans)))
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
581 (r6 = ,(charset-id 'latin-iso8859-1))
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
582 (read r0)
50085
575609f03daa (ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents: 49598
diff changeset
583 (repeat)))
575609f03daa (ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents: 49598
diff changeset
584
56056
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
585
50085
575609f03daa (ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents: 49598
diff changeset
586 ;; At EOF...
575609f03daa (ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents: 49598
diff changeset
587 (if (r0 >= 0)
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
588 ;; r0 >= #x80
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
589 ((call ccl-mule-utf-untrans)
50085
575609f03daa (ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents: 49598
diff changeset
590 (if (r1 >= 0)
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
591 ((r0 = r1)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
592 (call ccl-mule-utf-untrans)
50085
575609f03daa (ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents: 49598
diff changeset
593 (if (r2 >= 0)
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
594 ((r0 = r2)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
595 (call ccl-mule-utf-untrans)
50085
575609f03daa (ccl-decode-mule-utf-8): Don't loose
Kenichi Handa <handa@m17n.org>
parents: 49598
diff changeset
596 (if (r3 >= 0)
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
597 ((r0 = r3)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
598 (call ccl-mule-utf-untrans))))))))))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
599
36243
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
600 "CCL program to decode UTF-8.
36465
f968e313e8ad Doc fixes.
Dave Love <fx@gnu.org>
parents: 36423
diff changeset
601 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
47703
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
602 mule-unicode-*, but see also `utf-fragmentation-table' and
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
603 `ucs-mule-cjk-to-unicode'.
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
604 Encodings of un-representable Unicode characters are decoded asis into
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
605 eight-bit-control and eight-bit-graphic characters.")
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
606
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
607 (define-ccl-program ccl-mule-utf-8-encode-untrans
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
608 ;; The UTF-8 decoder generates a UTF-8 sequence represented by a
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
609 ;; sequence of eight-bit-control/graphic chars for an untranslatable
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
610 ;; character and for an invalid byte.
56562
9274a15c1400 (utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents: 56095
diff changeset
611 ;;
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
612 ;; This CCL program parses that sequence (the first byte is already
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
613 ;; in r1), writes out the original bytes of that sequence, and sets
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
614 ;; r5 to -1.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
615 ;;
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
616 ;; If the eight-bit-control/graphic sequence is shorter than what r1
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
617 ;; suggests, it leaves in r5 and r6 the last character read, which
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
618 ;; should be handled by the next loop iteration of the caller.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
619 ;;
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
620 ;; Note: For UTF-8 validation, we only check whether a character is
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
621 ;; eight-bit-control/graphic or not. This may result in incorrect
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
622 ;; handling of random binary data, but such data can't be encoded
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
623 ;; by UTF-8 anyway. At least, UTF-8 decoders don't generate such
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
624 ;; a sequence even if a source contains an invalid byte sequence.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
625 `(0
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
626 (;; Read the 2nd byte as a character: charset-ID in r5, code-point in r6.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
627 (read-multibyte-character r5 r6)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
628 (r0 = (r5 != ,(charset-id 'eight-bit-control)))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
629 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) ; neither eight-bit charset
56562
9274a15c1400 (utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents: 56095
diff changeset
630 ((write r1) ; invalid UTF-8
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
631 (r1 = -1) ; presumably marks the first byte as already written -- TODO confirm
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
632 (end))) ; r5/r6 still hold the character just read
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
633
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
634 (if (r1 <= #xC3)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
635 ;; 2-byte sequence for an originally invalid byte.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
636 ((r6 &= #x3F)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
637 (r6 |= ((r1 & #x1F) << 6)) ; combine both bytes back into the single original byte
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
638 (write r6)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
639 (r5 = -1) ; no pending character is left for the caller
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
640 (end)))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
641
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
642 (write r1 r6)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
643 (r2 = r1) ; keep the first byte for the length test below
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
644 (r1 = -1) ; first byte is written; the EOF block must not write it again
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
645 ;; Read the 3rd byte.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
646 (read-multibyte-character r5 r6)
56562
9274a15c1400 (utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents: 56095
diff changeset
647 (r0 = (r5 != ,(charset-id 'eight-bit-control)))
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
648 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
649 (end)) ; invalid UTF-8; r5/r6 keep the character for the caller
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
650 (write r6)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
651 (if (r2 < #xF0)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
652 ;; 3-byte sequence for an untranslated character.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
653 ((r5 = -1)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
654 (end)))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
655 ;; Read the 4th byte.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
656 (read-multibyte-character r5 r6)
56562
9274a15c1400 (utf-translate-cjk-mode): Doc fix.
Luc Teirlinck <teirllm@auburn.edu>
parents: 56095
diff changeset
657 (r0 = (r5 != ,(charset-id 'eight-bit-control)))
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
658 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
659 (end)) ; invalid UTF-8; r5/r6 keep the character for the caller
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
660 ;; 4-byte sequence for an untranslated character.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
661 (write r6)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
662 (r5 = -1)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
663 (end))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
664
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
665 ;; At EOF...
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
666 ((r5 = -1) ; nothing is pending for the caller
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
667 (if (r1 >= 0) ; first byte has not been written yet
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
668 (write r1)))))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
669
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
670 (define-ccl-program ccl-encode-mule-utf-8
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
671 `(1
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
672 ((r5 = -1)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
673 (loop
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
674 (if (r5 < 0)
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
675 (read-multibyte-character r0 r1)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
676 ;; Pre-read character is in r5 (charset-ID) and r6 (code-point).
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
677 ((r0 = r5)
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
678 (r1 = r6)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
679 (r5 = -1)))
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
680 (translate-character utf-translation-table-for-encode r0 r1)
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
681
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
682 (if (r0 == ,(charset-id 'ascii))
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
683 (write-repeat r1))
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
684
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
685 (if (r0 == ,(charset-id 'latin-iso8859-1))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
686 ;; r1 scalar utf-8
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
687 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
688 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
689 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111
56095
4ec2da03a87c (ccl-encode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56056
diff changeset
690 ((write ((r1 >> 6) | #xc2))
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
691 (r1 &= #x3f)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
692 (r1 |= #x80)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
693 (write-repeat r1)))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
694
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
695 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
696 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
697 ;; #x3f80 == (0011 1111 1000 0000)b
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
698 (r1 &= #x7f)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
699 (r1 += (r0 + 224)) ; 224 == -32 + #x0100
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
700 ;; now r1 holds scalar value
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
701 (if (r1 < #x0800)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
702 ;; 2byte encoding
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
703 ((write ((r1 >> 6) | #xC0))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
704 (r1 &= #x3F)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
705 (r1 |= #x80)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
706 (write-repeat r1))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
707 ;; 3byte encoding
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
708 ((write ((r1 >> 12) | #xE0))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
709 (write (((r1 & #x0FC0) >> 6) | #x80))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
710 (r1 &= #x3F)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
711 (r1 |= #x80)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
712 (write-repeat r1)))))
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
713
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
714 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
715 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
716 (r1 &= #x7f)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
717 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
718 ;; now r1 holds scalar value
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
719 (write ((r1 >> 12) | #xE0))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
720 (write (((r1 & #x0FC0) >> 6) | #x80))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
721 (r1 &= #x3F)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
722 (r1 |= #x80)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
723 (write-repeat r1)))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
724
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
725 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
726 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
727 (r1 &= #x7f)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
728 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
729 ;; now r1 holds scalar value
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
730 (write ((r1 >> 12) | #xE0))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
731 (write (((r1 & #x0FC0) >> 6) | #x80))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
732 (r1 &= #x3F)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
733 (r1 |= #x80)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
734 (write-repeat r1)))
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
735
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
736 (if (r0 == ,(charset-id 'eight-bit-control))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
737 ;; r1 scalar utf-8
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
738 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
739 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
740 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
741 ((write #xC2)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
742 (write-repeat r1)))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
743
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
744 (if (r0 == ,(charset-id 'eight-bit-graphic))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
745 ;; r1 scalar utf-8
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
746 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
747 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
748 ;; ff 0000 0000 1111 1111 1100 0011 1011 1111
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
749 ((r0 = (r1 >= #xC0))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
750 (r0 &= (r1 <= #xC3))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
751 (r4 = (r1 >= #xE1))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
752 (r4 &= (r1 <= #xF7))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
753 (r0 |= r4)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
754 (if r0
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
755 ((call ccl-mule-utf-8-encode-untrans)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
756 (repeat))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
757 (write-repeat r1))))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
758
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
759 (lookup-character utf-subst-table-for-encode r0 r1)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
760 (if r7 ; lookup succeeded
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
761 (if (r0 < #x800)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
762 ;; 2byte encoding
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
763 ((write ((r0 >> 6) | #xC0))
56095
4ec2da03a87c (ccl-encode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56056
diff changeset
764 (r0 = ((r0 & #x3F) | #x80))
4ec2da03a87c (ccl-encode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56056
diff changeset
765 (write-repeat r0))
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
766 ;; 3byte encoding
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
767 ((write ((r0 >> 12) | #xE0))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
768 (write (((r0 & #x0FC0) >> 6) | #x80))
56095
4ec2da03a87c (ccl-encode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56056
diff changeset
769 (r0 = ((r0 & #x3F) | #x80))
4ec2da03a87c (ccl-encode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56056
diff changeset
770 (write-repeat r0))))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
771
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
772 ;; Unsupported character.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
773 ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
774 (write #xef)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
775 (write #xbf)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
776 (write-repeat #xbd))))
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
777 "CCL program to encode into UTF-8.")
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
778
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
779
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
780 (define-ccl-program ccl-untranslated-to-ucs
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
781 `(0
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
782 (if (r1 == 0)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
783 nil
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
784 (if (r0 <= #xC3) ; 2-byte encoding
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
785 ((r0 = ((r0 & #x3) << 6))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
786 (r0 |= (r1 & #x3F))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
787 (r1 = 2))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
788 (if (r2 == 0)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
789 (r1 = 0)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
790 (if (r0 < #xF0) ; 3-byte encoding, as above
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
791 ((r0 = ((r0 & #xF) << 12))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
792 (r0 |= ((r1 & #x3F) << 6))
56056
4575a565f45d (ccl-decode-mule-utf-8): Fix previous change.
Kenichi Handa <handa@m17n.org>
parents: 56037
diff changeset
793 (r0 |= (r2 & #x3F))
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
794 (r1 = 3))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
795 (if (r3 == 0)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
796 (r1 = 0)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
797 ((r0 = ((r0 & #x7) << 18))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
798 (r0 |= ((r1 & #x3F) << 12))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
799 (r0 |= ((r2 & #x3F) << 6))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
800 (r0 |= (r3 & #x3F))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
801 (r1 = 4))))))))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
802 "Decode 2-, 3-, or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
803 Set r1 to the byte length. r1 == 0 for invalid sequence.")
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
804
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
805 (defvar utf-8-ccl-regs (make-vector 8 0))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
806
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
807 (defsubst utf-8-untranslated-to-ucs ()
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
808 "Return the UCS code for an untranslated sequence of raw bytes at point.
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
809 Handles 2-, 3- or 4-byte sequences."
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
810 (aset utf-8-ccl-regs 0 (or (char-after) 0))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
811 (aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
812 (aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0))
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
813 (aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0))
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
814 (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs))
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
815
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
816 (defun utf-8-help-echo (window object position)
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
817 (format "Untranslated Unicode U+%04X"
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
818 (get-char-property position 'untranslated-utf-8 object)))
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
819
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
820 ;; We compose the untranslatable sequences into a single character,
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
821 ;; and move point to the next character.
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
822 ;; This is infelicitous for editing, because there's currently no
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
823 ;; mechanism for treating compositions as atomic, but is OK for
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
824 ;; display. They are composed to U+FFFD with help-echo which
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
825 ;; indicates the unicodes they represent. This function GCs too much.
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
826
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
827 ;; If utf-translate-cjk-mode is non-nil, this function is called with
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
828 ;; HASH-TABLE which translates CJK characters into some of CJK
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
829 ;; charsets.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
830
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
831 (defsubst utf-8-compose (hash-table)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
832 "Put a suitable composition on an untranslatable sequence at point.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
833 If HASH-TABLE is non-nil, first try to translate CJK characters with it.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
834 Move point to the end of the sequence."
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
835 (utf-8-untranslated-to-ucs)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
836 (let ((l (aref utf-8-ccl-regs 1))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
837 ch)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
838 (if (> l 0)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
839 (if (and hash-table
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
840 (setq ch (gethash (aref utf-8-ccl-regs 0) hash-table)))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
841 (progn
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
842 (insert ch)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
843 (delete-region (point) (min (point-max) (+ l (point)))))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
844 (setq ch (aref utf-8-ccl-regs 0))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
845 (put-text-property (point) (min (point-max) (+ l (point)))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
846 'untranslated-utf-8 ch)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
847 (put-text-property (point) (min (point-max) (+ l (point)))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
848 'help-echo 'utf-8-help-echo)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
849 (if (= l 2)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
850 (put-text-property (point) (min (point-max) (+ l (point)))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
851 'display (format "\\%03o" ch))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
852 (compose-region (point) (+ l (point)) ?$,3u=(B))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
853 (forward-char l))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
854 (forward-char 1))))
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
855
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
856 (defcustom utf-8-compose-scripts nil
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
857 "*Non-nil means compose various scripts on decoding utf-8 text."
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
858 :group 'mule
46496
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
859 :version "21.4"
395e5c46761b (utf-8-subst-table)
Dave Love <fx@gnu.org>
parents: 44411
diff changeset
860 :type 'boolean)
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
861
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
862 (defun utf-8-post-read-conversion (length)
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
863 "Compose untranslated utf-8 sequences into single characters.
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
864 If `utf-translate-cjk-mode' is non-nil, tries to translate CJK characters.
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
865 Also compose particular scripts if `utf-8-compose-scripts' is non-nil."
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
866 (save-excursion
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
867 (save-restriction
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
868 (narrow-to-region (point) (+ (point) length))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
869 ;; Can't do eval-when-compile to insert a multibyte constant
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
870 ;; version of the string in the loop, since it's always loaded as
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
871 ;; unibyte from a byte-compiled file.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
872 (let ((range (string-as-multibyte "^\xc0-\xc3\xe1-\xf7"))
56800
752ef76fcc08 (utf-8-post-read-conversion): If the
Kenichi Handa <handa@m17n.org>
parents: 56562
diff changeset
873 (buffer-multibyte enable-multibyte-characters)
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
874 hash-table ch)
56800
752ef76fcc08 (utf-8-post-read-conversion): If the
Kenichi Handa <handa@m17n.org>
parents: 56562
diff changeset
875 (set-buffer-multibyte t)
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
876 (when utf-translate-cjk-mode
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
877 (if (not utf-translate-cjk-lang-env)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
878 ;; Check these characters:
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
879 ;; "U+2e80-U+33ff", "U+ff00-U+ffef"
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
880 ;; We may have to translate them to CJK charsets.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
881 (let ((range2 "$,29@(B-$,2G$,3r`(B-$,3u/(B"))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
882 (skip-chars-forward (concat range range2))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
883 (unless (eobp)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
884 (utf-translate-cjk-load-tables)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
885 (setq range (concat range range2)))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
886 (setq hash-table (get 'utf-subst-table-for-decode
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
887 'translation-hash-table)))))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
888 (while (and (skip-chars-forward range)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
889 (not (eobp)))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
890 (setq ch (following-char))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
891 (if (< ch 256)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
892 (utf-8-compose hash-table)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
893 (if (and hash-table
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
894 (setq ch (gethash (encode-char ch 'ucs) hash-table)))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
895 (progn
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
896 (insert ch)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
897 (delete-char 1))
56800
752ef76fcc08 (utf-8-post-read-conversion): If the
Kenichi Handa <handa@m17n.org>
parents: 56562
diff changeset
898 (forward-char 1))))
752ef76fcc08 (utf-8-post-read-conversion): If the
Kenichi Handa <handa@m17n.org>
parents: 56562
diff changeset
899 (or buffer-multibyte
752ef76fcc08 (utf-8-post-read-conversion): If the
Kenichi Handa <handa@m17n.org>
parents: 56562
diff changeset
900 (set-buffer-multibyte nil)))
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
901
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
902 (when (and utf-8-compose-scripts (> length 1))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
903 ;; These currently have definitions which cover the relevant
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
904 ;; unicodes. We could avoid loading thai-util &c by checking
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
905 ;; whether the region contains any characters with the appropriate
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
906 ;; categories. There aren't yet Unicode-based rules for Tibetan.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
907 (diacritic-compose-region (point-max) (point-min))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
908 (thai-compose-region (point-max) (point-min))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
909 (lao-compose-region (point-max) (point-min))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
910 (devanagari-compose-region (point-max) (point-min))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
911 (malayalam-compose-region (point-max) (point-min))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
912 (tamil-compose-region (point-max) (point-min)))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
913 (- (point-max) (point-min)))))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
914
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
915 (defun utf-8-pre-write-conversion (beg end)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
916 "Prepare for `utf-translate-cjk-mode' to encode text between BEG and END.
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
917 This is used as a pre-write-conversion of utf-8 coding system."
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
918 (if (and utf-translate-cjk-mode
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
919 (not utf-translate-cjk-lang-env)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
920 (save-excursion
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
921 (goto-char beg)
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
922 (re-search-forward "\\cc\\|\\cj\\|\\ch" end t)))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
923 (utf-translate-cjk-load-tables))
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
924 nil)
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
925
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
926 (make-coding-system
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
927 'mule-utf-8 4 ?u
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
928 "UTF-8 encoding for Emacs-supported Unicode characters.
47703
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
929 It supports Unicode characters of these ranges:
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
930 U+0000..U+33FF, U+E000..U+FFFF.
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
931 They correspond to these Emacs character sets:
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
932 ascii, latin-iso8859-1, mule-unicode-0100-24ff,
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
933 mule-unicode-2500-33ff, mule-unicode-e000-ffff
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
934
47703
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
935 On decoding (e.g. reading a file), Unicode characters not in the above
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
936 ranges are decoded into sequences of eight-bit-control and
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
937 eight-bit-graphic characters to preserve their byte sequences. The
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
938 byte sequence is preserved on i/o for valid utf-8, but not necessarily
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
939 for invalid utf-8.
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
940
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
941 On encoding (e.g. writing a file), Emacs characters not belonging to
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
942 any of the character sets listed above are encoded into the UTF-8 byte
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
943 sequence representing U+FFFD (REPLACEMENT CHARACTER)."
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
944
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
945 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
946 `((safe-charsets
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
947 ascii
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
948 eight-bit-control
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
949 eight-bit-graphic
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
950 latin-iso8859-1
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
951 mule-unicode-0100-24ff
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
952 mule-unicode-2500-33ff
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
953 mule-unicode-e000-ffff
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
954 ,@(if utf-translate-cjk-mode
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
955 utf-translate-cjk-charsets))
36371
f6bb3ed752b4 (mule-utf-8): Set correct value for valid-codes property.
Kenichi Handa <handa@m17n.org>
parents: 36243
diff changeset
956 (mime-charset . utf-8)
36423
aa776838b660 (mule-utf-8): Set coding-category property to coding-category-utf-8.
Kenichi Handa <handa@m17n.org>
parents: 36371
diff changeset
957 (coding-category . coding-category-utf-8)
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
958 (valid-codes (0 . 255))
56037
81dbb510a1db (utf-translate-cjk-charsets): New
Kenichi Handa <handa@m17n.org>
parents: 55437
diff changeset
959 (pre-write-conversion . utf-8-pre-write-conversion)
47703
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
960 (post-read-conversion . utf-8-post-read-conversion)
50766
fc9cb527333d (utf-translate-cjk-mode): Update the
Kenichi Handa <handa@m17n.org>
parents: 50549
diff changeset
961 (translation-table-for-encode . utf-translation-table-for-encode)
47703
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
962 (dependency unify-8859-on-encoding-mode
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
963 unify-8859-on-decoding-mode
6d4430dfeafc (ucs-mule-to-mule-unicode): Don't define
Kenichi Handa <handa@m17n.org>
parents: 47409
diff changeset
964 utf-fragment-on-decoding
55437
6e677a935fe9 Fix references to utf-translate-cjk into utf-translate-cjk-mode.
Andreas Schwab <schwab@suse.de>
parents: 54304
diff changeset
965 utf-translate-cjk-mode)))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
966
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
967 (define-coding-system-alias 'utf-8 'mule-utf-8)
38436
b174db545cfd Some fixes to follow coding conventions.
Pavel Janík <Pavel@Janik.cz>
parents: 37934
diff changeset
968
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
969 ;; I think this needs special private charsets defined for the
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
970 ;; untranslated sequences, if it's going to work well.
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
971
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
972 ;;; (defun utf-8-compose-function (pos to pattern &optional string)
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
973 ;;; (let* ((prop (get-char-property pos 'composition string))
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
974 ;;; (l (and prop (- (cadr prop) (car prop)))))
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
975 ;;; (cond ((and l (> l (- to pos)))
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
976 ;;; (delete-region pos to))
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
977 ;;; ((and (> (char-after pos) 224)
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
978 ;;; (< (char-after pos) 256)
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
979 ;;; (save-restriction
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
980 ;;; (narrow-to-region pos to)
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
981 ;;; (utf-8-compose)))
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
982 ;;; t))))
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
983
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
984 ;;; (dotimes (i 96)
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
985 ;;; (aset composition-function-table
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
986 ;;; (+ 128 i)
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
987 ;;; `((,(string-as-multibyte "[\200-\237\240-\377]")
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
988 ;;; . utf-8-compose-function))))
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
989
52401
695cf19ef79e Add arch taglines
Miles Bader <miles@gnu.org>
parents: 52284
diff changeset
990 ;;; arch-tag: b08735b7-753b-4ae6-b754-0f3efe4515c5
38436
b174db545cfd Some fixes to follow coding conventions.
Pavel Janík <Pavel@Janik.cz>
parents: 37934
diff changeset
991 ;;; utf-8.el ends here