Mercurial > emacs
annotate lisp/international/utf-8.el @ 37032:620f6951fd8a
*** empty log message ***
author | Gerd Moellmann <gerd@gnu.org> |
---|---|
date | Wed, 28 Mar 2001 12:06:39 +0000 |
parents | 898d0f4abcad |
children | b095952a8678 |
rev | line source |
---|---|
35542 | 1 ;;; utf-8.el --- Limited UTF-8 decoding/encoding support |
2 | |
3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. | |
4 ;; Licensed to the Free Software Foundation. | |
5 | |
36243 | 6 ;; Keywords: multilingual, Unicode, UTF-8, i18n |
35542 | 7 |
8 ;; This file is part of GNU Emacs. | |
9 | |
10 ;; GNU Emacs is free software; you can redistribute it and/or modify | |
11 ;; it under the terms of the GNU General Public License as published by | |
12 ;; the Free Software Foundation; either version 2, or (at your option) | |
13 ;; any later version. | |
14 | |
15 ;; GNU Emacs is distributed in the hope that it will be useful, | |
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
18 ;; GNU General Public License for more details. | |
19 | |
20 ;; You should have received a copy of the GNU General Public License | |
21 ;; along with GNU Emacs; see the file COPYING. If not, write to the | |
22 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
23 ;; Boston, MA 02111-1307, USA. | |
24 | |
25 ;;; Commentary: | |
26 | |
27 ;; The coding-system `mule-utf-8' supports encoding/decoding of the | |
36243 | 28 ;; following character sets to and from UTF-8: |
35542 | 29 ;; |
30 ;; ascii | |
31 ;; eight-bit-control | |
32 ;; latin-iso8859-1 | |
33 ;; mule-unicode-0100-24ff | |
34 ;; mule-unicode-2500-33ff | |
35 ;; mule-unicode-e000-ffff | |
36 ;; | |
37 ;; Characters of other character sets cannot be encoded with | |
36243 | 38 ;; mule-utf-8. Note that the mule-unicode charsets currently lack |
39 ;; case and syntax information, so things like `downcase' will only | |
40 ;; work for characters from ASCII and Latin-1. | |
35542 | 41 ;; |
36243 | 42 ;; On decoding, Unicode characters that do not fit into the above |
43 ;; character sets are handled as `eight-bit-control' or | |
44 ;; `eight-bit-graphic' characters to retain the information about the | |
45 ;; original byte sequence. | |
46 | |
47 ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is: | |
35542 | 48 |
49 ;; scalar | utf-8 | |
50 ;; value | 1st byte | 2nd byte | 3rd byte | |
51 ;; --------------------+-----------+-----------+---------- | |
52 ;; 0000 0000 0xxx xxxx | 0xxx xxxx | | | |
53 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx | | |
54 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx | |
55 | |
56 ;;; Code: | |
57 | |
58 (define-ccl-program ccl-decode-mule-utf-8 | |
59 ;; | |
60 ;; charset | bytes in utf-8 | bytes in emacs | |
61 ;; -----------------------+----------------+--------------- | |
62 ;; ascii | 1 | 1 | |
63 ;; -----------------------+----------------+--------------- | |
64 ;; eight-bit-control | 2 | 2 | |
65 ;; latin-iso8859-1 | 2 | 2 | |
66 ;; -----------------------+----------------+--------------- | |
67 ;; mule-unicode-0100-24ff | 2 | 4 | |
68 ;; (< 0800) | | | |
69 ;; -----------------------+----------------+--------------- | |
70 ;; mule-unicode-0100-24ff | 3 | 4 | |
71 ;; (>= 0800) | | | |
72 ;; mule-unicode-2500-33ff | 3 | 4 | |
73 ;; mule-unicode-e000-ffff | 3 | 4 | |
74 ;; | |
75 ;; Thus magnification factor is two. | |
76 ;; | |
77 `(2 | |
78 ((loop | |
79 (read r0) | |
80 | |
81 ;; 1byte encoding, i.e., ascii | |
82 (if (r0 < #x80) | |
83 (write r0) | |
84 | |
85 ;; 2byte encoding | |
86 (if (r0 < #xe0) | |
87 ((read r1) | |
88 (r0 &= #x1f) | |
89 (r0 <<= 6) | |
90 (r1 &= #x3f) | |
91 (r1 += r0) | |
92 ;; now r1 holds scalar value | |
93 | |
94 ;; eight-bit-control | |
95 (if (r1 < 160) | |
96 ((r0 = ,(charset-id 'eight-bit-control)) | |
97 (write-multibyte-character r0 r1)) | |
98 | |
99 ;; latin-iso8859-1 | |
100 (if (r1 < 256) | |
101 ((r0 = ,(charset-id 'latin-iso8859-1)) | |
102 (r1 -= 128) | |
103 (write-multibyte-character r0 r1)) | |
104 | |
105 ;; mule-unicode-0100-24ff (< 0800) | |
106 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
107 (r1 -= #x0100) | |
108 (r2 = (((r1 / 96) + 32) << 7)) | |
109 (r1 %= 96) | |
110 (r1 += (r2 + 32)) | |
111 (write-multibyte-character r0 r1))))) | |
112 | |
113 ;; 3byte encoding | |
114 (if (r0 < #xf0) | |
115 ((read r1 r2) | |
116 (r3 = ((r0 & #x0f) << 12)) | |
117 (r3 += ((r1 & #x3f) << 6)) | |
118 (r3 += (r2 & #x3f)) | |
119 ;; now r3 holds scalar value | |
120 | |
121 ;; mule-unicode-0100-24ff (>= 0800) | |
122 (if (r3 < #x2500) | |
123 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
124 (r3 -= #x0100) | |
125 (r3 //= 96) | |
126 (r1 = (r7 + 32)) | |
127 (r1 += ((r3 + 32) << 7)) | |
128 (write-multibyte-character r0 r1)) | |
129 | |
130 ;; mule-unicode-2500-33ff | |
131 (if (r3 < #x3400) | |
132 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) | |
133 (r3 -= #x2500) | |
134 (r3 //= 96) | |
135 (r1 = (r7 + 32)) | |
136 (r1 += ((r3 + 32) << 7)) | |
137 (write-multibyte-character r0 r1)) | |
138 | |
139 ;; U+3400 .. U+DFFF | |
140 ;; keep those bytes as eight-bit-{control|graphic} | |
141 (if (r3 < #xe000) | |
36522 | 142 (;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic |
35542 | 143 (r3 = ,(charset-id 'eight-bit-graphic)) |
144 (write-multibyte-character r3 r0) | |
145 (if (r1 < #xa0) | |
146 (r3 = ,(charset-id 'eight-bit-control))) | |
147 (write-multibyte-character r3 r1) | |
148 (if (r2 < #xa0) | |
149 (r3 = ,(charset-id 'eight-bit-control)) | |
150 (r3 = ,(charset-id 'eight-bit-graphic))) | |
151 (write-multibyte-character r3 r2)) | |
152 | |
153 ;; mule-unicode-e000-ffff | |
154 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) | |
155 (r3 -= #xe000) | |
156 (r3 //= 96) | |
157 (r1 = (r7 + 32)) | |
158 (r1 += ((r3 + 32) << 7)) | |
159 (write-multibyte-character r0 r1)))))) | |
160 | |
161 ;; 4byte encoding | |
162 ;; keep those bytes as eight-bit-{control|graphic} | |
163 ((read r1 r2 r3) | |
164 ;; r0 >= #xf0, thus eight-bit-graphic | |
165 (r4 = ,(charset-id 'eight-bit-graphic)) | |
166 (write-multibyte-character r4 r0) | |
167 (if (r1 < #xa0) | |
168 (r4 = ,(charset-id 'eight-bit-control))) | |
169 (write-multibyte-character r4 r1) | |
170 (if (r2 < #xa0) | |
171 (r4 = ,(charset-id 'eight-bit-control)) | |
172 (r4 = ,(charset-id 'eight-bit-graphic))) | |
173 (write-multibyte-character r4 r2) | |
174 (if (r3 < #xa0) | |
175 (r4 = ,(charset-id 'eight-bit-control)) | |
176 (r4 = ,(charset-id 'eight-bit-graphic))) | |
177 (write-multibyte-character r4 r3))))) | |
178 | |
179 (repeat)))) | |
180 | |
36243 | 181 "CCL program to decode UTF-8. |
36465 | 182 Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
183 mule-unicode-*. Encodings of un-representable Unicode characters are | |
184 decoded asis into eight-bit-control and eight-bit-graphic | |
185 characters.") | |
35542 | 186 |
187 (define-ccl-program ccl-encode-mule-utf-8 | |
188 `(1 | |
189 (loop | |
190 (read-multibyte-character r0 r1) | |
191 | |
192 (if (r0 == ,(charset-id 'ascii)) | |
193 (write r1) | |
194 | |
195 (if (r0 == ,(charset-id 'latin-iso8859-1)) | |
196 ;; r1 scalar utf-8 | |
197 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | |
198 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000 | |
199 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111 | |
200 ((r0 = (((r1 & #x40) >> 6) | #xc2)) | |
201 (r1 &= #x3f) | |
202 (r1 |= #x80) | |
203 (write r0 r1)) | |
204 | |
205 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) | |
206 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | |
207 ;; #x3f80 == (0011 1111 1000 0000)b | |
208 (r1 &= #x7f) | |
209 (r1 += (r0 + 224)) ; 224 == -32 + #x0100 | |
210 ;; now r1 holds scalar value | |
211 (if (r1 < #x0800) | |
212 ;; 2byte encoding | |
213 ((r0 = (((r1 & #x07c0) >> 6) | #xc0)) | |
214 ;; #x07c0 == (0000 0111 1100 0000)b | |
215 (r1 &= #x3f) | |
216 (r1 |= #x80) | |
217 (write r0 r1)) | |
218 ;; 3byte encoding | |
219 ((r0 = (((r1 & #xf000) >> 12) | #xe0)) | |
220 (r2 = ((r1 & #x3f) | #x80)) | |
221 (r1 &= #x0fc0) | |
222 (r1 >>= 6) | |
223 (r1 |= #x80) | |
224 (write r0 r1 r2)))) | |
225 | |
226 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) | |
227 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | |
228 (r1 &= #x7f) | |
229 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500 | |
230 (r0 = (((r1 & #xf000) >> 12) | #xe0)) | |
231 (r2 = ((r1 & #x3f) | #x80)) | |
232 (r1 &= #x0fc0) | |
233 (r1 >>= 6) | |
234 (r1 |= #x80) | |
235 (write r0 r1 r2)) | |
236 | |
237 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) | |
238 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | |
239 (r1 &= #x7f) | |
240 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000 | |
241 (r0 = (((r1 & #xf000) >> 12) | #xe0)) | |
242 (r2 = ((r1 & #x3f) | #x80)) | |
243 (r1 &= #x0fc0) | |
244 (r1 >>= 6) | |
245 (r1 |= #x80) | |
246 (write r0 r1 r2)) | |
247 | |
248 (if (r0 == ,(charset-id 'eight-bit-control)) | |
249 ;; r1 scalar utf-8 | |
250 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | |
251 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000 | |
252 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111 | |
253 (write r1) | |
254 | |
255 (if (r0 == ,(charset-id 'eight-bit-graphic)) | |
256 ;; r1 scalar utf-8 | |
257 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | |
258 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000 | |
259 ;; ff 0000 0000 1111 1111 1100 0011 1011 1111 | |
260 (write r1) | |
261 | |
36243 | 262 ;; Unsupported character. |
263 ;; Output U+FFFD, which is `ef bf bd' in UTF-8. | |
35542 | 264 ((write #xef) |
265 (write #xbf) | |
266 (write #xbd))))))))) | |
267 (repeat))) | |
268 | |
36243 | 269 "CCL program to encode into UTF-8. |
270 Only characters from the charsets ascii, eight-bit-control, | |
36465 | 271 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized. |
272 Others are encoded as U+FFFD.") | |
35542 | 273 |
274 (make-coding-system | |
275 'mule-utf-8 4 ?u | |
276 "UTF-8 encoding for Emacs-supported Unicode characters. | |
36243 | 277 The supported Emacs character sets are: |
35542 | 278 ascii |
279 eight-bit-control | |
280 eight-bit-graphic | |
281 latin-iso8859-1 | |
282 mule-unicode-0100-24ff | |
283 mule-unicode-2500-33ff | |
284 mule-unicode-e000-ffff | |
285 | |
36243 | 286 Unicode characters out of the ranges U+0000-U+33FF and U+E000-U+FFFF |
287 are decoded into sequences of eight-bit-control and eight-bit-graphic | |
288 characters to preserve their byte sequences. Emacs characters out of | |
289 these ranges are encoded into U+FFFD. | |
290 | |
291 Note that, currently, characters in the mule-unicode charsets have no | |
292 syntax and case information. Thus, for instance, upper- and | |
293 lower-casing commands won't work with them." | |
35542 | 294 |
295 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) | |
296 '((safe-charsets | |
297 ascii | |
298 eight-bit-control | |
299 eight-bit-graphic | |
300 latin-iso8859-1 | |
301 mule-unicode-0100-24ff | |
302 mule-unicode-2500-33ff | |
303 mule-unicode-e000-ffff) | |
36371
f6bb3ed752b4
(mule-utf-8): Set correct value for valid-codes property.
Kenichi Handa <handa@m17n.org>
parents:
36243
diff
changeset
|
304 (mime-charset . utf-8) |
36423
aa776838b660
(mule-utf-8): Set coding-category property to coding-category-utf-8.
Kenichi Handa <handa@m17n.org>
parents:
36371
diff
changeset
|
305 (coding-category . coding-category-utf-8) |
36371
f6bb3ed752b4
(mule-utf-8): Set correct value for valid-codes property.
Kenichi Handa <handa@m17n.org>
parents:
36243
diff
changeset
|
306 (valid-codes (0 . 255)))) |
35542 | 307 |
308 (define-coding-system-alias 'utf-8 'mule-utf-8) |