Mercurial > emacs
annotate lisp/international/utf-8.el @ 36381:4993e5072bcb
2001-02-24 Michael Kifer <kifer@cs.sunysb.edu>
* ediff.texi: Fixed some typos
author | Michael Kifer <kifer@cs.stonybrook.edu> |
---|---|
date | Sun, 25 Feb 2001 02:02:08 +0000 |
parents | f6bb3ed752b4 |
children | aa776838b660 |
rev | line source |
---|---|
35542 | 1 ;;; utf-8.el --- Limited UTF-8 decoding/encoding support |
2 | |
3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. | |
4 ;; Licensed to the Free Software Foundation. | |
5 | |
36243 | 6 ;; Keywords: multilingual, Unicode, UTF-8, i18n |
35542 | 7 |
8 ;; This file is part of GNU Emacs. | |
9 | |
10 ;; GNU Emacs is free software; you can redistribute it and/or modify | |
11 ;; it under the terms of the GNU General Public License as published by | |
12 ;; the Free Software Foundation; either version 2, or (at your option) | |
13 ;; any later version. | |
14 | |
15 ;; GNU Emacs is distributed in the hope that it will be useful, | |
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
18 ;; GNU General Public License for more details. | |
19 | |
20 ;; You should have received a copy of the GNU General Public License | |
21 ;; along with GNU Emacs; see the file COPYING. If not, write to the | |
22 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
23 ;; Boston, MA 02111-1307, USA. | |
24 | |
25 ;;; Commentary: | |
26 | |
27 ;; The coding-system `mule-utf-8' supports encoding/decoding of the | |
36243 | 28 ;; following character sets to and from UTF-8: |
35542 | 29 ;; |
30 ;; ascii | |
31 ;; eight-bit-control | |
32 ;; latin-iso8859-1 | |
33 ;; mule-unicode-0100-24ff | |
34 ;; mule-unicode-2500-33ff | |
35 ;; mule-unicode-e000-ffff | |
36 ;; | |
37 ;; Characters of other character sets cannot be encoded with | |
36243 | 38 ;; mule-utf-8. Note that the mule-unicode charsets currently lack |
39 ;; case and syntax information, so things like `downcase' will only | |
40 ;; work for characters from ASCII and Latin-1. | |
35542 | 41 ;; |
36243 | 42 ;; On decoding, Unicode characters that do not fit into the above |
43 ;; character sets are handled as `eight-bit-control' or | |
44 ;; `eight-bit-graphic' characters to retain the information about the | |
45 ;; original byte sequence. | |
46 | |
47 ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is: | |
35542 | 48 |
49 ;; scalar | utf-8 | |
50 ;; value | 1st byte | 2nd byte | 3rd byte | |
51 ;; --------------------+-----------+-----------+---------- | |
52 ;; 0000 0000 0xxx xxxx | 0xxx xxxx | | | |
53 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx | | |
54 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx | |
55 | |
56 ;;; Code: | |
57 | |
58 (define-ccl-program ccl-decode-mule-utf-8 | |
59 ;; | |
60 ;; charset | bytes in utf-8 | bytes in emacs | |
61 ;; -----------------------+----------------+--------------- | |
62 ;; ascii | 1 | 1 | |
63 ;; -----------------------+----------------+--------------- | |
64 ;; eight-bit-control | 2 | 2 | |
65 ;; latin-iso8859-1 | 2 | 2 | |
66 ;; -----------------------+----------------+--------------- | |
67 ;; mule-unicode-0100-24ff | 2 | 4 | |
68 ;; (< 0800) | | | |
69 ;; -----------------------+----------------+--------------- | |
70 ;; mule-unicode-0100-24ff | 3 | 4 | |
71 ;; (>= 0800) | | | |
72 ;; mule-unicode-2500-33ff | 3 | 4 | |
73 ;; mule-unicode-e000-ffff | 3 | 4 | |
74 ;; | |
75 ;; Thus magnification factor is two. | |
76 ;; | |
77 `(2 | |
78 ((loop | |
79 (read r0) | |
80 | |
81 ;; 1byte encoding, i.e., ascii | |
82 (if (r0 < #x80) | |
83 (write r0) | |
84 | |
85 ;; 2byte encoding | |
86 (if (r0 < #xe0) | |
87 ((read r1) | |
88 (r0 &= #x1f) | |
89 (r0 <<= 6) | |
90 (r1 &= #x3f) | |
91 (r1 += r0) | |
92 ;; now r1 holds scalar value | |
93 | |
94 ;; eight-bit-control | |
95 (if (r1 < 160) | |
96 ((r0 = ,(charset-id 'eight-bit-control)) | |
97 (write-multibyte-character r0 r1)) | |
98 | |
99 ;; latin-iso8859-1 | |
100 (if (r1 < 256) | |
101 ((r0 = ,(charset-id 'latin-iso8859-1)) | |
102 (r1 -= 128) | |
103 (write-multibyte-character r0 r1)) | |
104 | |
105 ;; mule-unicode-0100-24ff (< 0800) | |
106 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
107 (r1 -= #x0100) | |
108 (r2 = (((r1 / 96) + 32) << 7)) | |
109 (r1 %= 96) | |
110 (r1 += (r2 + 32)) | |
111 (write-multibyte-character r0 r1))))) | |
112 | |
113 ;; 3byte encoding | |
114 (if (r0 < #xf0) | |
115 ((read r1 r2) | |
116 (r3 = ((r0 & #x0f) << 12)) | |
117 (r3 += ((r1 & #x3f) << 6)) | |
118 (r3 += (r2 & #x3f)) | |
119 ;; now r3 holds scalar value | |
120 | |
121 ;; mule-unicode-0100-24ff (>= 0800) | |
122 (if (r3 < #x2500) | |
123 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
124 (r3 -= #x0100) | |
125 (r3 //= 96) | |
126 (r1 = (r7 + 32)) | |
127 (r1 += ((r3 + 32) << 7)) | |
128 (write-multibyte-character r0 r1)) | |
129 | |
130 ;; mule-unicode-2500-33ff | |
131 (if (r3 < #x3400) | |
132 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) | |
133 (r3 -= #x2500) | |
134 (r3 //= 96) | |
135 (r1 = (r7 + 32)) | |
136 (r1 += ((r3 + 32) << 7)) | |
137 (write-multibyte-character r0 r1)) | |
138 | |
139 ;; U+3400 .. U+DFFF | |
140 ;; keep those bytes as eight-bit-{control|graphic} | |
141 (if (r3 < #xe000) | |
142 (;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic | |
143 (r3 = ,(charset-id 'eight-bit-graphic)) | |
144 (write-multibyte-character r3 r0) | |
145 (if (r1 < #xa0) | |
146 (r3 = ,(charset-id 'eight-bit-control))) | |
147 (write-multibyte-character r3 r1) | |
148 (if (r2 < #xa0) | |
149 (r3 = ,(charset-id 'eight-bit-control)) | |
150 (r3 = ,(charset-id 'eight-bit-graphic))) | |
151 (write-multibyte-character r3 r2)) | |
152 | |
153 ;; mule-unicode-e000-ffff | |
154 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) | |
155 (r3 -= #xe000) | |
156 (r3 //= 96) | |
157 (r1 = (r7 + 32)) | |
158 (r1 += ((r3 + 32) << 7)) | |
159 (write-multibyte-character r0 r1)))))) | |
160 | |
161 ;; 4byte encoding | |
162 ;; keep those bytes as eight-bit-{control|graphic} | |
163 ((read r1 r2 r3) | |
164 ;; r0 >= #xf0, thus eight-bit-graphic | |
165 (r4 = ,(charset-id 'eight-bit-graphic)) | |
166 (write-multibyte-character r4 r0) | |
167 (if (r1 < #xa0) | |
168 (r4 = ,(charset-id 'eight-bit-control))) | |
169 (write-multibyte-character r4 r1) | |
170 (if (r2 < #xa0) | |
171 (r4 = ,(charset-id 'eight-bit-control)) | |
172 (r4 = ,(charset-id 'eight-bit-graphic))) | |
173 (write-multibyte-character r4 r2) | |
174 (if (r3 < #xa0) | |
175 (r4 = ,(charset-id 'eight-bit-control)) | |
176 (r4 = ,(charset-id 'eight-bit-graphic))) | |
177 (write-multibyte-character r4 r3))))) | |
178 | |
179 (repeat)))) | |
180 | |
36243 | 181 "CCL program to decode UTF-8. |
182 Decoding is done into the charsets ascii, eight-bit-control, | |
183 latin-iso8859-1 and mule-unicode-* only.") | |
35542 | 184 |
185 (define-ccl-program ccl-encode-mule-utf-8 | |
186 `(1 | |
187 (loop | |
188 (read-multibyte-character r0 r1) | |
189 | |
190 (if (r0 == ,(charset-id 'ascii)) | |
191 (write r1) | |
192 | |
193 (if (r0 == ,(charset-id 'latin-iso8859-1)) | |
194 ;; r1 scalar utf-8 | |
195 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | |
196 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000 | |
197 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111 | |
198 ((r0 = (((r1 & #x40) >> 6) | #xc2)) | |
199 (r1 &= #x3f) | |
200 (r1 |= #x80) | |
201 (write r0 r1)) | |
202 | |
203 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) | |
204 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | |
205 ;; #x3f80 == (0011 1111 1000 0000)b | |
206 (r1 &= #x7f) | |
207 (r1 += (r0 + 224)) ; 224 == -32 + #x0100 | |
208 ;; now r1 holds scalar value | |
209 (if (r1 < #x0800) | |
210 ;; 2byte encoding | |
211 ((r0 = (((r1 & #x07c0) >> 6) | #xc0)) | |
212 ;; #x07c0 == (0000 0111 1100 0000)b | |
213 (r1 &= #x3f) | |
214 (r1 |= #x80) | |
215 (write r0 r1)) | |
216 ;; 3byte encoding | |
217 ((r0 = (((r1 & #xf000) >> 12) | #xe0)) | |
218 (r2 = ((r1 & #x3f) | #x80)) | |
219 (r1 &= #x0fc0) | |
220 (r1 >>= 6) | |
221 (r1 |= #x80) | |
222 (write r0 r1 r2)))) | |
223 | |
224 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) | |
225 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | |
226 (r1 &= #x7f) | |
227 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500 | |
228 (r0 = (((r1 & #xf000) >> 12) | #xe0)) | |
229 (r2 = ((r1 & #x3f) | #x80)) | |
230 (r1 &= #x0fc0) | |
231 (r1 >>= 6) | |
232 (r1 |= #x80) | |
233 (write r0 r1 r2)) | |
234 | |
235 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) | |
236 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | |
237 (r1 &= #x7f) | |
238 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000 | |
239 (r0 = (((r1 & #xf000) >> 12) | #xe0)) | |
240 (r2 = ((r1 & #x3f) | #x80)) | |
241 (r1 &= #x0fc0) | |
242 (r1 >>= 6) | |
243 (r1 |= #x80) | |
244 (write r0 r1 r2)) | |
245 | |
246 (if (r0 == ,(charset-id 'eight-bit-control)) | |
247 ;; r1 scalar utf-8 | |
248 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | |
249 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000 | |
250 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111 | |
251 (write r1) | |
252 | |
253 (if (r0 == ,(charset-id 'eight-bit-graphic)) | |
254 ;; r1 scalar utf-8 | |
255 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | |
256 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000 | |
257 ;; ff 0000 0000 1111 1111 1101 1111 1011 1111 | |
258 (write r1) | |
259 | |
36243 | 260 ;; Unsupported character. |
261 ;; Output U+FFFD, which is `ef bf bd' in UTF-8. | |
35542 | 262 ((write #xef) |
263 (write #xbf) | |
264 (write #xbd))))))))) | |
265 (repeat))) | |
266 | |
36243 | 267 "CCL program to encode into UTF-8. |
268 Only characters from the charsets ascii, eight-bit-control, | |
269 latin-iso8859-1 and mule-unicode-* are recognized. Others are encoded | |
270 as U+FFFD.") | |
35542 | 271 |
272 (make-coding-system | |
273 'mule-utf-8 4 ?u | |
274 "UTF-8 encoding for Emacs-supported Unicode characters. | |
36243 | 275 The supported Emacs character sets are: |
35542 | 276 ascii |
277 eight-bit-control | |
278 eight-bit-graphic | |
279 latin-iso8859-1 | |
280 mule-unicode-0100-24ff | |
281 mule-unicode-2500-33ff | |
282 mule-unicode-e000-ffff | |
283 | |
36243 | 284 Unicode characters out of the ranges U+0000-U+33FF and U+E000-U+FFFF |
285 are decoded into sequences of eight-bit-control and eight-bit-graphic | |
286 characters to preserve their byte sequences. Emacs characters out of | |
287 these ranges are encoded into U+FFFD. | |
288 | |
289 Note that, currently, characters in the mule-unicode charsets have no | |
290 syntax and case information. Thus, for instance, upper- and | |
291 lower-casing commands won't work with them." | |
35542 | 292 |
293 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) | |
294 '((safe-charsets | |
295 ascii | |
296 eight-bit-control | |
297 eight-bit-graphic | |
298 latin-iso8859-1 | |
299 mule-unicode-0100-24ff | |
300 mule-unicode-2500-33ff | |
301 mule-unicode-e000-ffff) | |
36371
f6bb3ed752b4
(mule-utf-8): Set correct value for valid-codes property.
Kenichi Handa <handa@m17n.org>
parents:
36243
diff
changeset
|
302 (mime-charset . utf-8) |
f6bb3ed752b4
(mule-utf-8): Set correct value for valid-codes property.
Kenichi Handa <handa@m17n.org>
parents:
36243
diff
changeset
|
303 (valid-codes (0 . 255)))) |
35542 | 304 |
305 (define-coding-system-alias 'utf-8 'mule-utf-8) |