Mercurial > emacs
annotate lisp/international/utf-8.el @ 37124:b1c1c6ab6f85
Further clarification for DEL vs BS on text terminals.
author | Richard M. Stallman <rms@gnu.org> |
---|---|
date | Sun, 01 Apr 2001 03:27:41 +0000 |
parents | b095952a8678 |
children | 88389fa9b713 |
rev | line source |
---|---|
35542 | 1 ;;; utf-8.el --- Limited UTF-8 decoding/encoding support |
2 | |
3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. | |
4 ;; Licensed to the Free Software Foundation. | |
5 | |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
6 ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org> |
36243 | 7 ;; Keywords: multilingual, Unicode, UTF-8, i18n |
35542 | 8 |
9 ;; This file is part of GNU Emacs. | |
10 | |
11 ;; GNU Emacs is free software; you can redistribute it and/or modify | |
12 ;; it under the terms of the GNU General Public License as published by | |
13 ;; the Free Software Foundation; either version 2, or (at your option) | |
14 ;; any later version. | |
15 | |
16 ;; GNU Emacs is distributed in the hope that it will be useful, | |
17 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
18 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
19 ;; GNU General Public License for more details. | |
20 | |
21 ;; You should have received a copy of the GNU General Public License | |
22 ;; along with GNU Emacs; see the file COPYING. If not, write to the | |
23 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
24 ;; Boston, MA 02111-1307, USA. | |
25 | |
26 ;;; Commentary: | |
27 | |
28 ;; The coding-system `mule-utf-8' supports encoding/decoding of the | |
36243 | 29 ;; following character sets to and from UTF-8: |
35542 | 30 ;; |
31 ;; ascii | |
32 ;; eight-bit-control | |
33 ;; latin-iso8859-1 | |
34 ;; mule-unicode-0100-24ff | |
35 ;; mule-unicode-2500-33ff | |
36 ;; mule-unicode-e000-ffff | |
37 ;; | |
38 ;; Characters of other character sets cannot be encoded with | |
36243 | 39 ;; mule-utf-8. Note that the mule-unicode charsets currently lack |
40 ;; case and syntax information, so things like `downcase' will only | |
41 ;; work for characters from ASCII and Latin-1. | |
35542 | 42 ;; |
36243 | 43 ;; On decoding, Unicode characters that do not fit into the above |
44 ;; character sets are handled as `eight-bit-control' or | |
45 ;; `eight-bit-graphic' characters to retain the information about the | |
46 ;; original byte sequence. | |
47 | |
48 ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is: | |
35542 | 49 |
50 ;; scalar | utf-8 | |
51 ;; value | 1st byte | 2nd byte | 3rd byte | |
52 ;; --------------------+-----------+-----------+---------- | |
53 ;; 0000 0000 0xxx xxxx | 0xxx xxxx | | | |
54 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx | | |
55 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx | |
56 | |
57 ;;; Code: | |
58 | |
59 (define-ccl-program ccl-decode-mule-utf-8 | |
60 ;; | |
61 ;; charset | bytes in utf-8 | bytes in emacs | |
62 ;; -----------------------+----------------+--------------- | |
63 ;; ascii | 1 | 1 | |
64 ;; -----------------------+----------------+--------------- | |
65 ;; eight-bit-control | 2 | 2 | |
66 ;; latin-iso8859-1 | 2 | 2 | |
67 ;; -----------------------+----------------+--------------- | |
68 ;; mule-unicode-0100-24ff | 2 | 4 | |
69 ;; (< 0800) | | | |
70 ;; -----------------------+----------------+--------------- | |
71 ;; mule-unicode-0100-24ff | 3 | 4 | |
72 ;; (>= 0800) | | | |
73 ;; mule-unicode-2500-33ff | 3 | 4 | |
74 ;; mule-unicode-e000-ffff | 3 | 4 | |
75 ;; | |
76 ;; Thus magnification factor is two. | |
77 ;; | |
78 `(2 | |
79 ((loop | |
80 (read r0) | |
81 | |
82 ;; 1byte encoding, i.e., ascii | |
83 (if (r0 < #x80) | |
84 (write r0) | |
85 | |
86 ;; 2byte encoding | |
87 (if (r0 < #xe0) | |
88 ((read r1) | |
89 (r0 &= #x1f) | |
90 (r0 <<= 6) | |
91 (r1 &= #x3f) | |
92 (r1 += r0) | |
93 ;; now r1 holds scalar value | |
94 | |
95 ;; eight-bit-control | |
96 (if (r1 < 160) | |
97 ((r0 = ,(charset-id 'eight-bit-control)) | |
98 (write-multibyte-character r0 r1)) | |
99 | |
100 ;; latin-iso8859-1 | |
101 (if (r1 < 256) | |
102 ((r0 = ,(charset-id 'latin-iso8859-1)) | |
103 (r1 -= 128) | |
104 (write-multibyte-character r0 r1)) | |
105 | |
106 ;; mule-unicode-0100-24ff (< 0800) | |
107 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
108 (r1 -= #x0100) | |
109 (r2 = (((r1 / 96) + 32) << 7)) | |
110 (r1 %= 96) | |
111 (r1 += (r2 + 32)) | |
112 (write-multibyte-character r0 r1))))) | |
113 | |
114 ;; 3byte encoding | |
115 (if (r0 < #xf0) | |
116 ((read r1 r2) | |
117 (r3 = ((r0 & #x0f) << 12)) | |
118 (r3 += ((r1 & #x3f) << 6)) | |
119 (r3 += (r2 & #x3f)) | |
120 ;; now r3 holds scalar value | |
121 | |
122 ;; mule-unicode-0100-24ff (>= 0800) | |
123 (if (r3 < #x2500) | |
124 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
125 (r3 -= #x0100) | |
126 (r3 //= 96) | |
127 (r1 = (r7 + 32)) | |
128 (r1 += ((r3 + 32) << 7)) | |
129 (write-multibyte-character r0 r1)) | |
130 | |
131 ;; mule-unicode-2500-33ff | |
132 (if (r3 < #x3400) | |
133 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) | |
134 (r3 -= #x2500) | |
135 (r3 //= 96) | |
136 (r1 = (r7 + 32)) | |
137 (r1 += ((r3 + 32) << 7)) | |
138 (write-multibyte-character r0 r1)) | |
139 | |
140 ;; U+3400 .. U+DFFF | |
141 ;; keep those bytes as eight-bit-{control|graphic} | |
142 (if (r3 < #xe000) | |
36522 | 143 (;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic |
35542 | 144 (r3 = ,(charset-id 'eight-bit-graphic)) |
145 (write-multibyte-character r3 r0) | |
146 (if (r1 < #xa0) | |
147 (r3 = ,(charset-id 'eight-bit-control))) | |
148 (write-multibyte-character r3 r1) | |
149 (if (r2 < #xa0) | |
150 (r3 = ,(charset-id 'eight-bit-control)) | |
151 (r3 = ,(charset-id 'eight-bit-graphic))) | |
152 (write-multibyte-character r3 r2)) | |
153 | |
154 ;; mule-unicode-e000-ffff | |
155 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) | |
156 (r3 -= #xe000) | |
157 (r3 //= 96) | |
158 (r1 = (r7 + 32)) | |
159 (r1 += ((r3 + 32) << 7)) | |
160 (write-multibyte-character r0 r1)))))) | |
161 | |
162 ;; 4byte encoding | |
163 ;; keep those bytes as eight-bit-{control|graphic} | |
164 ((read r1 r2 r3) | |
165 ;; r0 >= #xf0, thus eight-bit-graphic | |
166 (r4 = ,(charset-id 'eight-bit-graphic)) | |
167 (write-multibyte-character r4 r0) | |
168 (if (r1 < #xa0) | |
169 (r4 = ,(charset-id 'eight-bit-control))) | |
170 (write-multibyte-character r4 r1) | |
171 (if (r2 < #xa0) | |
172 (r4 = ,(charset-id 'eight-bit-control)) | |
173 (r4 = ,(charset-id 'eight-bit-graphic))) | |
174 (write-multibyte-character r4 r2) | |
175 (if (r3 < #xa0) | |
176 (r4 = ,(charset-id 'eight-bit-control)) | |
177 (r4 = ,(charset-id 'eight-bit-graphic))) | |
178 (write-multibyte-character r4 r3))))) | |
179 | |
180 (repeat)))) | |
181 | |
36243 | 182 "CCL program to decode UTF-8. |
36465 | 183 Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
184 mule-unicode-*. Encodings of un-representable Unicode characters are | |
185 decoded as-is into eight-bit-control and eight-bit-graphic | |
186 characters.") | |
35542 | 187 |
188 (define-ccl-program ccl-encode-mule-utf-8 | |
189 `(1 | |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
190 ((r5 = -1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
191 (loop |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
192 (if (r5 < 0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
193 ((r1 = -1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
194 (read-multibyte-character r0 r1)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
195 (;; We have already done read-multibyte-character. |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
196 (r0 = r5) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
197 (r1 = r6) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
198 (r5 = -1))) |
35542 | 199 |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
200 (if (r0 == ,(charset-id 'ascii)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
201 (write r1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
202 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
203 (if (r0 == ,(charset-id 'latin-iso8859-1)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
204 ;; r1 scalar utf-8 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
205 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
206 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
207 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
208 ((r0 = (((r1 & #x40) >> 6) | #xc2)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
209 (r1 &= #x3f) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
210 (r1 |= #x80) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
211 (write r0 r1)) |
35542 | 212 |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
213 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
214 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
215 ;; #x3f80 == (0011 1111 1000 0000)b |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
216 (r1 &= #x7f) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
217 (r1 += (r0 + 224)) ; 224 == -32 + #x0100 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
218 ;; now r1 holds scalar value |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
219 (if (r1 < #x0800) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
220 ;; 2byte encoding |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
221 ((r0 = (((r1 & #x07c0) >> 6) | #xc0)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
222 ;; #x07c0 == (0000 0111 1100 0000)b |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
223 (r1 &= #x3f) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
224 (r1 |= #x80) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
225 (write r0 r1)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
226 ;; 3byte encoding |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
227 ((r0 = (((r1 & #xf000) >> 12) | #xe0)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
228 (r2 = ((r1 & #x3f) | #x80)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
229 (r1 &= #x0fc0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
230 (r1 >>= 6) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
231 (r1 |= #x80) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
232 (write r0 r1 r2)))) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
233 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
234 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
235 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
236 (r1 &= #x7f) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
237 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
238 (r0 = (((r1 & #xf000) >> 12) | #xe0)) |
35542 | 239 (r2 = ((r1 & #x3f) | #x80)) |
240 (r1 &= #x0fc0) | |
241 (r1 >>= 6) | |
242 (r1 |= #x80) | |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
243 (write r0 r1 r2)) |
35542 | 244 |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
245 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
246 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
247 (r1 &= #x7f) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
248 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
249 (r0 = (((r1 & #xf000) >> 12) | #xe0)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
250 (r2 = ((r1 & #x3f) | #x80)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
251 (r1 &= #x0fc0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
252 (r1 >>= 6) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
253 (r1 |= #x80) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
254 (write r0 r1 r2)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
255 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
256 (if (r0 == ,(charset-id 'eight-bit-control)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
257 ;; r1 scalar utf-8 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
258 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
259 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
260 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
261 ((write #xc2) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
262 (write r1)) |
35542 | 263 |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
264 (if (r0 == ,(charset-id 'eight-bit-graphic)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
265 ;; r1 scalar utf-8 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
266 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
267 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
268 ;; ff 0000 0000 1111 1111 1100 0011 1011 1111 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
269 ((write r1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
270 (r1 = -1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
271 (read-multibyte-character r0 r1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
272 (if (r0 != ,(charset-id 'eight-bit-graphic)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
273 (if (r0 != ,(charset-id 'eight-bit-control)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
274 ((r5 = r0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
275 (r6 = r1)))) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
276 (if (r5 < 0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
277 ((read-multibyte-character r0 r2) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
278 (if (r0 != ,(charset-id 'eight-bit-graphic)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
279 (if (r0 != ,(charset-id 'eight-bit-control)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
280 ((r5 = r0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
281 (r6 = r2)))) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
282 (if (r5 < 0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
283 (write r1 r2) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
284 (if (r1 < #xa0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
285 (write r1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
286 ((write #xc2) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
287 (write r1))))))) |
35542 | 288 |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
289 ;; Unsupported character. |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
290 ;; Output U+FFFD, which is `ef bf bd' in UTF-8. |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
291 ((write #xef) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
292 (write #xbf) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
293 (write #xbd))))))))) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
294 (repeat))) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
295 (if (r1 >= #xa0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
296 (write r1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
297 (if (r1 >= #x80) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
298 ((write #xc2) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
299 (write r1))))) |
35542 | 300 |
36243 | 301 "CCL program to encode into UTF-8. |
302 Only characters from the charsets ascii, eight-bit-control, | |
36465 | 303 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized. |
304 Others are encoded as U+FFFD.") | |
35542 | 305 |
306 (make-coding-system | |
307 'mule-utf-8 4 ?u | |
308 "UTF-8 encoding for Emacs-supported Unicode characters. | |
36243 | 309 The supported Emacs character sets are: |
35542 | 310 ascii |
311 eight-bit-control | |
312 eight-bit-graphic | |
313 latin-iso8859-1 | |
314 mule-unicode-0100-24ff | |
315 mule-unicode-2500-33ff | |
316 mule-unicode-e000-ffff | |
317 | |
36243 | 318 Unicode characters out of the ranges U+0000-U+33FF and U+E000-U+FFFF |
319 are decoded into sequences of eight-bit-control and eight-bit-graphic | |
320 characters to preserve their byte sequences. Emacs characters out of | |
321 these ranges are encoded into U+FFFD. | |
322 | |
323 Note that, currently, characters in the mule-unicode charsets have no | |
324 syntax and case information. Thus, for instance, upper- and | |
325 lower-casing commands won't work with them." | |
35542 | 326 |
327 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) | |
328 '((safe-charsets | |
329 ascii | |
330 eight-bit-control | |
331 eight-bit-graphic | |
332 latin-iso8859-1 | |
333 mule-unicode-0100-24ff | |
334 mule-unicode-2500-33ff | |
335 mule-unicode-e000-ffff) | |
36371
f6bb3ed752b4
(mule-utf-8): Set correct value for valid-codes property.
Kenichi Handa <handa@m17n.org>
parents:
36243
diff
changeset
|
336 (mime-charset . utf-8) |
36423
aa776838b660
(mule-utf-8): Set coding-category property to coding-category-utf-8.
Kenichi Handa <handa@m17n.org>
parents:
36371
diff
changeset
|
337 (coding-category . coding-category-utf-8) |
36371
f6bb3ed752b4
(mule-utf-8): Set correct value for valid-codes property.
Kenichi Handa <handa@m17n.org>
parents:
36243
diff
changeset
|
338 (valid-codes (0 . 255)))) |
35542 | 339 |
340 (define-coding-system-alias 'utf-8 'mule-utf-8) |