annotate lisp/international/utf-8.el @ 37257:6bbf8e77d787

(dos-truncate-to-8+3): New function.
author Eli Zaretskii <eliz@gnu.org>
date Fri, 06 Apr 2001 19:03:00 +0000
parents b095952a8678
children 88389fa9b713
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
1 ;;; utf-8.el --- Limited UTF-8 decoding/encoding support
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
2
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
4 ;; Licensed to the Free Software Foundation.
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
5
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
6 ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org>
36243
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
7 ;; Keywords: multilingual, Unicode, UTF-8, i18n
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
8
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
9 ;; This file is part of GNU Emacs.
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
10
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
11 ;; GNU Emacs is free software; you can redistribute it and/or modify
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
12 ;; it under the terms of the GNU General Public License as published by
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
13 ;; the Free Software Foundation; either version 2, or (at your option)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
14 ;; any later version.
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
15
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
16 ;; GNU Emacs is distributed in the hope that it will be useful,
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
17 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
18 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
19 ;; GNU General Public License for more details.
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
20
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
21 ;; You should have received a copy of the GNU General Public License
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
22 ;; along with GNU Emacs; see the file COPYING. If not, write to the
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
23 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
24 ;; Boston, MA 02111-1307, USA.
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
25
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
26 ;;; Commentary:
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
27
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
28 ;; The coding-system `mule-utf-8' supports encoding/decoding of the
36243
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
29 ;; following character sets to and from UTF-8:
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
30 ;;
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
31 ;; ascii
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
32 ;; eight-bit-control
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
33 ;; latin-iso8859-1
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
34 ;; mule-unicode-0100-24ff
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
35 ;; mule-unicode-2500-33ff
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
36 ;; mule-unicode-e000-ffff
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
37 ;;
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
38 ;; Characters of other character sets cannot be encoded with
36243
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
39 ;; mule-utf-8. Note that the mule-unicode charsets currently lack
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
40 ;; case and syntax information, so things like `downcase' will only
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
41 ;; work for characters from ASCII and Latin-1.
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
42 ;;
36243
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
43 ;; On decoding, Unicode characters that do not fit into the above
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
44 ;; character sets are handled as `eight-bit-control' or
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
45 ;; `eight-bit-graphic' characters to retain the information about the
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
46 ;; original byte sequence.
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
47
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
48 ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is:
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
49
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
50 ;; scalar | utf-8
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
51 ;; value | 1st byte | 2nd byte | 3rd byte
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
52 ;; --------------------+-----------+-----------+----------
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
53 ;; 0000 0000 0xxx xxxx | 0xxx xxxx | |
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
54 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx |
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
55 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
56
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
57 ;;; Code:
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
58
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
59 (define-ccl-program ccl-decode-mule-utf-8
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
60 ;;
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
61 ;; charset | bytes in utf-8 | bytes in emacs
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
62 ;; -----------------------+----------------+---------------
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
63 ;; ascii | 1 | 1
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
64 ;; -----------------------+----------------+---------------
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
65 ;; eight-bit-control | 2 | 2
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
66 ;; latin-iso8859-1 | 2 | 2
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
67 ;; -----------------------+----------------+---------------
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
68 ;; mule-unicode-0100-24ff | 2 | 4
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
69 ;; (< 0800) | |
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
70 ;; -----------------------+----------------+---------------
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
71 ;; mule-unicode-0100-24ff | 3 | 4
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
72 ;; (>= 0800) | |
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
73 ;; mule-unicode-2500-33ff | 3 | 4
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
74 ;; mule-unicode-e000-ffff | 3 | 4
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
75 ;;
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
76 ;; Thus the magnification factor is two.
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
77 ;;
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
78 `(2
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
79 ((loop
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
80 (read r0)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
81
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
82 ;; 1byte encoding, i.e., ascii
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
83 (if (r0 < #x80)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
84 (write r0)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
85
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
86 ;; 2byte encoding
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
87 (if (r0 < #xe0)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
88 ((read r1)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
89 (r0 &= #x1f)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
90 (r0 <<= 6)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
91 (r1 &= #x3f)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
92 (r1 += r0)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
93 ;; now r1 holds scalar value
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
94
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
95 ;; eight-bit-control
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
96 (if (r1 < 160)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
97 ((r0 = ,(charset-id 'eight-bit-control))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
98 (write-multibyte-character r0 r1))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
99
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
100 ;; latin-iso8859-1
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
101 (if (r1 < 256)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
102 ((r0 = ,(charset-id 'latin-iso8859-1))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
103 (r1 -= 128)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
104 (write-multibyte-character r0 r1))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
105
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
106 ;; mule-unicode-0100-24ff (< 0800)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
107 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
108 (r1 -= #x0100)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
109 (r2 = (((r1 / 96) + 32) << 7))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
110 (r1 %= 96)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
111 (r1 += (r2 + 32))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
112 (write-multibyte-character r0 r1)))))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
113
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
114 ;; 3byte encoding
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
115 (if (r0 < #xf0)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
116 ((read r1 r2)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
117 (r3 = ((r0 & #x0f) << 12))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
118 (r3 += ((r1 & #x3f) << 6))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
119 (r3 += (r2 & #x3f))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
120 ;; now r3 holds scalar value
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
121
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
122 ;; mule-unicode-0100-24ff (>= 0800)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
123 (if (r3 < #x2500)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
124 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
125 (r3 -= #x0100)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
126 (r3 //= 96)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
127 (r1 = (r7 + 32))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
128 (r1 += ((r3 + 32) << 7))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
129 (write-multibyte-character r0 r1))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
130
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
131 ;; mule-unicode-2500-33ff
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
132 (if (r3 < #x3400)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
133 ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
134 (r3 -= #x2500)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
135 (r3 //= 96)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
136 (r1 = (r7 + 32))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
137 (r1 += ((r3 + 32) << 7))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
138 (write-multibyte-character r0 r1))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
139
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
140 ;; U+3400 .. U+DFFF
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
141 ;; keep those bytes as eight-bit-{control|graphic}
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
142 (if (r3 < #xe000)
36522
898d0f4abcad *** empty log message ***
Kenichi Handa <handa@m17n.org>
parents: 36465
diff changeset
143 (;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
144 (r3 = ,(charset-id 'eight-bit-graphic))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
145 (write-multibyte-character r3 r0)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
146 (if (r1 < #xa0)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
147 (r3 = ,(charset-id 'eight-bit-control)))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
148 (write-multibyte-character r3 r1)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
149 (if (r2 < #xa0)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
150 (r3 = ,(charset-id 'eight-bit-control))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
151 (r3 = ,(charset-id 'eight-bit-graphic)))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
152 (write-multibyte-character r3 r2))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
153
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
154 ;; mule-unicode-e000-ffff
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
155 ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
156 (r3 -= #xe000)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
157 (r3 //= 96)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
158 (r1 = (r7 + 32))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
159 (r1 += ((r3 + 32) << 7))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
160 (write-multibyte-character r0 r1))))))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
161
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
162 ;; 4byte encoding
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
163 ;; keep those bytes as eight-bit-{control|graphic}
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
164 ((read r1 r2 r3)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
165 ;; r0 >= #xf0, thus eight-bit-graphic
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
166 (r4 = ,(charset-id 'eight-bit-graphic))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
167 (write-multibyte-character r4 r0)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
168 (if (r1 < #xa0)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
169 (r4 = ,(charset-id 'eight-bit-control)))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
170 (write-multibyte-character r4 r1)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
171 (if (r2 < #xa0)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
172 (r4 = ,(charset-id 'eight-bit-control))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
173 (r4 = ,(charset-id 'eight-bit-graphic)))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
174 (write-multibyte-character r4 r2)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
175 (if (r3 < #xa0)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
176 (r4 = ,(charset-id 'eight-bit-control))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
177 (r4 = ,(charset-id 'eight-bit-graphic)))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
178 (write-multibyte-character r4 r3)))))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
179
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
180 (repeat))))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
181
36243
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
182 "CCL program to decode UTF-8.
36465
f968e313e8ad Doc fixes.
Dave Love <fx@gnu.org>
parents: 36423
diff changeset
183 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
f968e313e8ad Doc fixes.
Dave Love <fx@gnu.org>
parents: 36423
diff changeset
184 mule-unicode-*. Encodings of un-representable Unicode characters are
f968e313e8ad Doc fixes.
Dave Love <fx@gnu.org>
parents: 36423
diff changeset
185 decoded as is into eight-bit-control and eight-bit-graphic
f968e313e8ad Doc fixes.
Dave Love <fx@gnu.org>
parents: 36423
diff changeset
186 characters.")
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
187
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
188 (define-ccl-program ccl-encode-mule-utf-8
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
189 `(1
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
190 ((r5 = -1)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
191 (loop
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
192 (if (r5 < 0)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
193 ((r1 = -1)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
194 (read-multibyte-character r0 r1))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
195 (;; We have already done read-multibyte-character.
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
196 (r0 = r5)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
197 (r1 = r6)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
198 (r5 = -1)))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
199
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
200 (if (r0 == ,(charset-id 'ascii))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
201 (write r1)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
202
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
203 (if (r0 == ,(charset-id 'latin-iso8859-1))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
204 ;; r1 scalar utf-8
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
205 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
206 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
207 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
208 ((r0 = (((r1 & #x40) >> 6) | #xc2))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
209 (r1 &= #x3f)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
210 (r1 |= #x80)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
211 (write r0 r1))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
212
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
213 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
214 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
215 ;; #x3f80 == (0011 1111 1000 0000)b
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
216 (r1 &= #x7f)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
217 (r1 += (r0 + 224)) ; 224 == -32 + #x0100
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
218 ;; now r1 holds scalar value
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
219 (if (r1 < #x0800)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
220 ;; 2byte encoding
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
221 ((r0 = (((r1 & #x07c0) >> 6) | #xc0))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
222 ;; #x07c0 == (0000 0111 1100 0000)b
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
223 (r1 &= #x3f)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
224 (r1 |= #x80)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
225 (write r0 r1))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
226 ;; 3byte encoding
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
227 ((r0 = (((r1 & #xf000) >> 12) | #xe0))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
228 (r2 = ((r1 & #x3f) | #x80))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
229 (r1 &= #x0fc0)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
230 (r1 >>= 6)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
231 (r1 |= #x80)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
232 (write r0 r1 r2))))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
233
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
234 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
235 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
236 (r1 &= #x7f)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
237 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
238 (r0 = (((r1 & #xf000) >> 12) | #xe0))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
239 (r2 = ((r1 & #x3f) | #x80))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
240 (r1 &= #x0fc0)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
241 (r1 >>= 6)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
242 (r1 |= #x80)
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
243 (write r0 r1 r2))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
244
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
245 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
246 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
247 (r1 &= #x7f)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
248 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
249 (r0 = (((r1 & #xf000) >> 12) | #xe0))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
250 (r2 = ((r1 & #x3f) | #x80))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
251 (r1 &= #x0fc0)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
252 (r1 >>= 6)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
253 (r1 |= #x80)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
254 (write r0 r1 r2))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
255
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
256 (if (r0 == ,(charset-id 'eight-bit-control))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
257 ;; r1 scalar utf-8
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
258 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
259 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
260 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
261 ((write #xc2)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
262 (write r1))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
263
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
264 (if (r0 == ,(charset-id 'eight-bit-graphic))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
265 ;; r1 scalar utf-8
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
266 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
267 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
268 ;; ff 0000 0000 1111 1111 1100 0011 1011 1111
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
269 ((write r1)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
270 (r1 = -1)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
271 (read-multibyte-character r0 r1)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
272 (if (r0 != ,(charset-id 'eight-bit-graphic))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
273 (if (r0 != ,(charset-id 'eight-bit-control))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
274 ((r5 = r0)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
275 (r6 = r1))))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
276 (if (r5 < 0)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
277 ((read-multibyte-character r0 r2)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
278 (if (r0 != ,(charset-id 'eight-bit-graphic))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
279 (if (r0 != ,(charset-id 'eight-bit-control))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
280 ((r5 = r0)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
281 (r6 = r2))))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
282 (if (r5 < 0)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
283 (write r1 r2)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
284 (if (r1 < #xa0)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
285 (write r1)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
286 ((write #xc2)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
287 (write r1)))))))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
288
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
289 ;; Unsupported character.
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
290 ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
291 ((write #xef)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
292 (write #xbf)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
293 (write #xbd)))))))))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
294 (repeat)))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
295 (if (r1 >= #xa0)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
296 (write r1)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
297 (if (r1 >= #x80)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
298 ((write #xc2)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
299 (write r1)))))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
300
36243
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
301 "CCL program to encode into UTF-8.
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
302 Only characters from the charsets ascii, eight-bit-control,
36465
f968e313e8ad Doc fixes.
Dave Love <fx@gnu.org>
parents: 36423
diff changeset
303 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized.
f968e313e8ad Doc fixes.
Dave Love <fx@gnu.org>
parents: 36423
diff changeset
304 Others are encoded as U+FFFD.")
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
305
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
306 (make-coding-system
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
307 'mule-utf-8 4 ?u
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
308 "UTF-8 encoding for Emacs-supported Unicode characters.
36243
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
309 The supported Emacs character sets are:
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
310 ascii
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
311 eight-bit-control
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
312 eight-bit-graphic
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
313 latin-iso8859-1
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
314 mule-unicode-0100-24ff
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
315 mule-unicode-2500-33ff
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
316 mule-unicode-e000-ffff
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
317
36243
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
318 Unicode characters out of the ranges U+0000-U+33FF and U+E000-U+FFFF
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
319 are decoded into sequences of eight-bit-control and eight-bit-graphic
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
320 characters to preserve their byte sequences. Emacs characters out of
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
321 these ranges are encoded into U+FFFD.
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
322
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
323 Note that, currently, characters in the mule-unicode charsets have no
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
324 syntax and case information. Thus, for instance, upper- and
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
325 lower-casing commands won't work with them."
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
326
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
327 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
328 '((safe-charsets
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
329 ascii
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
330 eight-bit-control
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
331 eight-bit-graphic
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
332 latin-iso8859-1
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
333 mule-unicode-0100-24ff
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
334 mule-unicode-2500-33ff
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
335 mule-unicode-e000-ffff)
36371
f6bb3ed752b4 (mule-utf-8): Set correct value for valid-codes property.
Kenichi Handa <handa@m17n.org>
parents: 36243
diff changeset
336 (mime-charset . utf-8)
36423
aa776838b660 (mule-utf-8): Set coding-category property to coding-category-utf-8.
Kenichi Handa <handa@m17n.org>
parents: 36371
diff changeset
337 (coding-category . coding-category-utf-8)
36371
f6bb3ed752b4 (mule-utf-8): Set correct value for valid-codes property.
Kenichi Handa <handa@m17n.org>
parents: 36243
diff changeset
338 (valid-codes (0 . 255))))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
339
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
340 (define-coding-system-alias 'utf-8 'mule-utf-8)