Mercurial > emacs
comparison lisp/international/utf-8.el @ 35542:e4a75e66ee46
new file
author | Kenichi Handa <handa@m17n.org> |
---|---|
date | Thu, 25 Jan 2001 11:51:29 +0000 |
parents | |
children | a05ae5420f85 |
comparison
equal
deleted
inserted
replaced
35541:b671f9509b3b | 35542:e4a75e66ee46 |
---|---|
1 ;;; utf-8.el --- Limited UTF-8 decoding/encoding support | |
2 | |
3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. | |
4 ;; Licensed to the Free Software Foundation. | |
5 | |
6 ;; Keywords: multilingual, Unicode, UTF-8 | |
7 | |
8 ;; This file is part of GNU Emacs. | |
9 | |
10 ;; GNU Emacs is free software; you can redistribute it and/or modify | |
11 ;; it under the terms of the GNU General Public License as published by | |
12 ;; the Free Software Foundation; either version 2, or (at your option) | |
13 ;; any later version. | |
14 | |
15 ;; GNU Emacs is distributed in the hope that it will be useful, | |
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
18 ;; GNU General Public License for more details. | |
19 | |
20 ;; You should have received a copy of the GNU General Public License | |
21 ;; along with GNU Emacs; see the file COPYING. If not, write to the | |
22 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
23 ;; Boston, MA 02111-1307, USA. | |
24 | |
25 ;;; Commentary: | |
26 | |
27 ;; The coding-system `mule-utf-8' supports encoding/decoding of the | |
28 ;; following character sets: | |
29 ;; | |
30 ;; ascii | |
31 ;; eight-bit-control | |
32 ;; latin-iso8859-1 | |
33 ;; mule-unicode-0100-24ff | |
34 ;; mule-unicode-2500-33ff | |
35 ;; mule-unicode-e000-ffff | |
36 ;; | |
37 ;; Characters of other character sets cannot be encoded with | |
38 ;; mule-utf-8. | |
39 ;; | |
40 ;; On decoding, Unicode characters that do not fit in the above character | |
41 ;; sets are handled as `eight-bit-control' or `eight-bit-graphic' | |
42 ;; characters to retain original information (i.e. original byte | |
43 ;; sequence). | |
44 | |
45 ;; scalar | utf-8 | |
46 ;; value | 1st byte | 2nd byte | 3rd byte | |
47 ;; --------------------+-----------+-----------+---------- | |
48 ;; 0000 0000 0xxx xxxx | 0xxx xxxx | | | |
49 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx | | |
50 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx | |
51 | |
52 ;;; Code: | |
53 | |
54 (define-ccl-program ccl-decode-mule-utf-8 | |
55 ;; Decode UTF-8 input, one byte sequence per loop pass, by the lead byte r0. | |
56 ;; charset | bytes in utf-8 | bytes in emacs | |
57 ;; -----------------------+----------------+--------------- | |
58 ;; ascii | 1 | 1 | |
59 ;; -----------------------+----------------+--------------- | |
60 ;; eight-bit-control | 2 | 2 | |
61 ;; latin-iso8859-1 | 2 | 2 | |
62 ;; -----------------------+----------------+--------------- | |
63 ;; mule-unicode-0100-24ff | 2 | 4 | |
64 ;; (< 0800) | | | |
65 ;; -----------------------+----------------+--------------- | |
66 ;; mule-unicode-0100-24ff | 3 | 4 | |
67 ;; (>= 0800) | | | |
68 ;; mule-unicode-2500-33ff | 3 | 4 | |
69 ;; mule-unicode-e000-ffff | 3 | 4 | |
70 ;; | |
71 ;; Thus the magnification factor is two (the leading 2 of the program below). | |
72 ;; | |
73 `(2 | |
74 ((loop | |
75 (read r0) | |
76 | |
77 ;; 1byte encoding, i.e., ascii | |
78 (if (r0 < #x80) | |
79 (write r0) | |
80 | |
81 ;; 2byte encoding | |
82 (if (r0 < #xe0) | |
83 ((read r1) | |
84 (r0 &= #x1f) | |
85 (r0 <<= 6) | |
86 (r1 &= #x3f) | |
87 (r1 += r0) | |
88 ;; now r1 holds scalar value | |
89 | |
90 ;; eight-bit-control | |
91 (if (r1 < 160) | |
92 ((r0 = ,(charset-id 'eight-bit-control)) | |
93 (write-multibyte-character r0 r1)) | |
94 | |
95 ;; latin-iso8859-1 | |
96 (if (r1 < 256) | |
97 ((r0 = ,(charset-id 'latin-iso8859-1)) | |
98 (r1 -= 128) | |
99 (write-multibyte-character r0 r1)) | |
100 | |
101 ;; mule-unicode-0100-24ff (< 0800) | |
102 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
103 (r1 -= #x0100) | |
104 (r2 = (((r1 / 96) + 32) << 7)) | |
105 (r1 %= 96) | |
106 (r1 += (r2 + 32)) | |
107 (write-multibyte-character r0 r1))))) | |
108 | |
109 ;; 3byte encoding | |
110 (if (r0 < #xf0) | |
111 ((read r1 r2) | |
112 (r3 = ((r0 & #x0f) << 12)) | |
113 (r3 += ((r1 & #x3f) << 6)) | |
114 (r3 += (r2 & #x3f)) | |
115 ;; now r3 holds scalar value | |
116 | |
117 ;; mule-unicode-0100-24ff (>= 0800); `//=' below leaves the quotient in r3 and the remainder in r7 | |
118 (if (r3 < #x2500) | |
119 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
120 (r3 -= #x0100) | |
121 (r3 //= 96) | |
122 (r1 = (r7 + 32)) | |
123 (r1 += ((r3 + 32) << 7)) | |
124 (write-multibyte-character r0 r1)) | |
125 | |
126 ;; mule-unicode-2500-33ff | |
127 (if (r3 < #x3400) | |
128 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) | |
129 (r3 -= #x2500) | |
130 (r3 //= 96) | |
131 (r1 = (r7 + 32)) | |
132 (r1 += ((r3 + 32) << 7)) | |
133 (write-multibyte-character r0 r1)) | |
134 | |
135 ;; U+3400 .. U+DFFF | |
136 ;; keep those bytes as eight-bit-{control|graphic} | |
137 (if (r3 < #xe000) | |
138 (;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic | |
139 (r3 = ,(charset-id 'eight-bit-graphic)) | |
140 (write-multibyte-character r3 r0) | |
141 (if (r1 < #xa0) | |
142 (r3 = ,(charset-id 'eight-bit-control))) | |
143 (write-multibyte-character r3 r1) | |
144 (if (r2 < #xa0) | |
145 (r3 = ,(charset-id 'eight-bit-control)) | |
146 (r3 = ,(charset-id 'eight-bit-graphic))) | |
147 (write-multibyte-character r3 r2)) | |
148 | |
149 ;; mule-unicode-e000-ffff | |
150 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) | |
151 (r3 -= #xe000) | |
152 (r3 //= 96) | |
153 (r1 = (r7 + 32)) | |
154 (r1 += ((r3 + 32) << 7)) | |
155 (write-multibyte-character r0 r1)))))) | |
156 | |
157 ;; 4byte encoding | |
158 ;; keep those bytes as eight-bit-{control|graphic} | |
159 ((read r1 r2 r3) | |
160 ;; r0 >= #xf0, thus eight-bit-graphic | |
161 (r4 = ,(charset-id 'eight-bit-graphic)) | |
162 (write-multibyte-character r4 r0) | |
163 (if (r1 < #xa0) | |
164 (r4 = ,(charset-id 'eight-bit-control))) | |
165 (write-multibyte-character r4 r1) | |
166 (if (r2 < #xa0) | |
167 (r4 = ,(charset-id 'eight-bit-control)) | |
168 (r4 = ,(charset-id 'eight-bit-graphic))) | |
169 (write-multibyte-character r4 r2) | |
170 (if (r3 < #xa0) | |
171 (r4 = ,(charset-id 'eight-bit-control)) | |
172 (r4 = ,(charset-id 'eight-bit-graphic))) | |
173 (write-multibyte-character r4 r3))))) | |
174 | |
175 (repeat)))) | |
176 | |
177 "CCL program to decode UTF-8 into ascii, eight-bit-control, latin-iso8859-1 and mule-unicode-*.") | |
178 | |
179 (define-ccl-program ccl-encode-mule-utf-8 | |
180 `(1 | |
181 (loop | |
182 (read-multibyte-character r0 r1) | |
183 | |
184 (if (r0 == ,(charset-id 'ascii)) | |
185 (write r1) | |
186 | |
187 (if (r0 == ,(charset-id 'latin-iso8859-1)) | |
188 ;; r1 scalar utf-8 | |
189 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | |
190 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000 | |
191 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111 | |
192 ((r0 = (((r1 & #x40) >> 6) | #xc2)) | |
193 (r1 &= #x3f) | |
194 (r1 |= #x80) | |
195 (write r0 r1)) | |
196 | |
197 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) | |
198 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | |
199 ;; #x3f80 == (0011 1111 1000 0000)b | |
200 (r1 &= #x7f) | |
201 (r1 += (r0 + 224)) ; 224 == -32 + #x0100 | |
202 ;; now r1 holds scalar value | |
203 (if (r1 < #x0800) | |
204 ;; 2byte encoding | |
205 ((r0 = (((r1 & #x07c0) >> 6) | #xc0)) | |
206 ;; #x07c0 == (0000 0111 1100 0000)b | |
207 (r1 &= #x3f) | |
208 (r1 |= #x80) | |
209 (write r0 r1)) | |
210 ;; 3byte encoding | |
211 ((r0 = (((r1 & #xf000) >> 12) | #xe0)) | |
212 (r2 = ((r1 & #x3f) | #x80)) | |
213 (r1 &= #x0fc0) | |
214 (r1 >>= 6) | |
215 (r1 |= #x80) | |
216 (write r0 r1 r2)))) | |
217 | |
218 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) | |
219 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | |
220 (r1 &= #x7f) | |
221 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500 | |
222 (r0 = (((r1 & #xf000) >> 12) | #xe0)) | |
223 (r2 = ((r1 & #x3f) | #x80)) | |
224 (r1 &= #x0fc0) | |
225 (r1 >>= 6) | |
226 (r1 |= #x80) | |
227 (write r0 r1 r2)) | |
228 | |
229 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) | |
230 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | |
231 (r1 &= #x7f) | |
232 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000 | |
233 (r0 = (((r1 & #xf000) >> 12) | #xe0)) | |
234 (r2 = ((r1 & #x3f) | #x80)) | |
235 (r1 &= #x0fc0) | |
236 (r1 >>= 6) | |
237 (r1 |= #x80) | |
238 (write r0 r1 r2)) | |
239 | |
240 (if (r0 == ,(charset-id 'eight-bit-control)) | |
241 ;; r1 scalar utf-8 -- NOTE(review): only the raw byte r1 is written below, not the 2-byte form tabulated here; confirm this is intended | |
242 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | |
243 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000 | |
244 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111 | |
245 (write r1) | |
246 | |
247 (if (r0 == ,(charset-id 'eight-bit-graphic)) | |
248 ;; r1 scalar utf-8 -- NOTE(review): only the raw byte r1 is written below, not the 2-byte form tabulated here; confirm this is intended | |
249 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | |
250 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000 | |
251 ;; ff 0000 0000 1111 1111 1100 0011 1011 1111 | |
252 (write r1) | |
253 | |
254 ;; unsupported character. | |
255 ;; output U+FFFD, which is `ef bf bd' in UTF-8 | |
256 ;; actually it never reaches here | |
257 ((write #xef) | |
258 (write #xbf) | |
259 (write #xbd))))))))) | |
260 (repeat))) | |
261 | |
262 "CCL program to encode ascii, eight-bit-control, latin-iso8859-1 and mule-unicode-*. into UTF-8.") | |
263 | |
;; Define the coding system `mule-utf-8' (mnemonic ?u), decoding with
;; ccl-decode-mule-utf-8 and encoding with ccl-encode-mule-utf-8.
264 (make-coding-system | |
265 'mule-utf-8 4 ?u | |
266 "UTF-8 encoding for Emacs-supported Unicode characters. | |
267 Supported character sets are: | |
268 ascii | |
269 eight-bit-control | |
270 eight-bit-graphic | |
271 latin-iso8859-1 | |
272 mule-unicode-0100-24ff | |
273 mule-unicode-2500-33ff | |
274 mule-unicode-e000-ffff | |
275 | |
276 Unicode characters out of these ranges are decoded | |
277 into eight-bit-control or eight-bit-graphic." | |
278 | |
279 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) | |
280 '((safe-charsets | |
281 ascii | |
282 eight-bit-control | |
283 eight-bit-graphic | |
284 latin-iso8859-1 | |
285 mule-unicode-0100-24ff | |
286 mule-unicode-2500-33ff | |
287 mule-unicode-e000-ffff) | |
288 (mime-charset . utf-8))) | |
289 | |
;; Make `utf-8' an alias for `mule-utf-8'.
290 (define-coding-system-alias 'utf-8 'mule-utf-8) |