comparison lisp/international/utf-8.el @ 35542:e4a75e66ee46

new file
author Kenichi Handa <handa@m17n.org>
date Thu, 25 Jan 2001 11:51:29 +0000
parents
children a05ae5420f85
comparison
equal deleted inserted replaced
35541:b671f9509b3b 35542:e4a75e66ee46
1 ;;; utf-8.el --- Limited UTF-8 decoding/encoding support
2
3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
4 ;; Licensed to the Free Software Foundation.
5
6 ;; Keywords: multilingual, Unicode, UTF-8
7
8 ;; This file is part of GNU Emacs.
9
10 ;; GNU Emacs is free software; you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation; either version 2, or (at your option)
13 ;; any later version.
14
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
19
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs; see the file COPYING. If not, write to the
22 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 ;; Boston, MA 02111-1307, USA.
24
25 ;;; Commentary:
26
27 ;; The coding-system `mule-utf-8' supports encoding/decoding of the
28 ;; following character sets:
29 ;;
30 ;; ascii
31 ;; eight-bit-control
32 ;; latin-iso8859-1
33 ;; mule-unicode-0100-24ff
34 ;; mule-unicode-2500-33ff
35 ;; mule-unicode-e000-ffff
36 ;;
37 ;; Characters of other character sets cannot be encoded with
38 ;; mule-utf-8.
39 ;;
40 ;; On decoding, Unicode characters that do not fit in the above
41 ;; character sets are handled as `eight-bit-control' or
42 ;; `eight-bit-graphic' characters to retain the original information
43 ;; (i.e. the original byte sequence).
44
45 ;; scalar              | utf-8
46 ;; value               | 1st byte  | 2nd byte  | 3rd byte
47 ;; --------------------+-----------+-----------+----------
48 ;; 0000 0000 0xxx xxxx | 0xxx xxxx |           |
49 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx |
50 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx
51
52 ;;; Code:
53
(define-ccl-program ccl-decode-mule-utf-8
  ;;
  ;;        charset         | bytes in utf-8 | bytes in emacs
  ;; -----------------------+----------------+---------------
  ;;         ascii          |       1        |       1
  ;; -----------------------+----------------+---------------
  ;;    eight-bit-control   |       2        |       2
  ;;     latin-iso8859-1    |       2        |       2
  ;; -----------------------+----------------+---------------
  ;; mule-unicode-0100-24ff |       2        |       4
  ;;        (< 0800)        |                |
  ;; -----------------------+----------------+---------------
  ;; mule-unicode-0100-24ff |       3        |       4
  ;;       (>= 0800)        |                |
  ;; mule-unicode-2500-33ff |       3        |       4
  ;; mule-unicode-e000-ffff |       3        |       4
  ;;
  ;; Thus magnification factor is two.
  ;;
  ;; Input that cannot be mapped to one of the charsets above is kept
  ;; byte by byte, one character per input byte: a byte below #xa0
  ;; becomes an `eight-bit-control' character, any other byte an
  ;; `eight-bit-graphic' character.  This applies to:
  ;;   - a byte in #x80..#xc1 found where a lead byte is expected
  ;;     (stray continuation byte, or a lead byte that could only
  ;;     start an overlong 2-byte form);
  ;;   - 3-byte sequences whose scalar value is below U+0800
  ;;     (overlong form) or in U+3400..U+DFFF;
  ;;   - every sequence starting with a byte >= #xf0, which is read
  ;;     as four bytes.
  ;;
  ;; NOTE(review): the second and later bytes of a sequence are not
  ;; checked to be of the form 10xx xxxx; a malformed sequence with a
  ;; valid lead byte is still decoded from whatever bits follow.
  ;;
  ;; Register usage inside the loop:
  ;;   r0          lead byte, later reused as charset id
  ;;   r1, r2, r3  following bytes / scalar value / character code
  ;;   r4          3-byte case: non-zero when the bytes must be kept
  ;;               raw; 4-byte case: charset id
  ;;   r7          remainder left by the `//=' operator
  ;;
  `(2
    ((loop
      (read r0)

      ;; 1byte encoding, i.e., ascii
      (if (r0 < #x80)
          (write r0)

        ;; #x80..#xbf are continuation bytes and #xc0/#xc1 could only
        ;; start an overlong 2-byte form, so none of them is a valid
        ;; lead byte.  Keep the byte itself and resume decoding at the
        ;; next byte instead of swallowing it.
        (if (r0 < #xc2)
            ((if (r0 < #xa0)
                 (r1 = ,(charset-id 'eight-bit-control))
               (r1 = ,(charset-id 'eight-bit-graphic)))
             (write-multibyte-character r1 r0))

          ;; 2byte encoding
          (if (r0 < #xe0)
              ((read r1)
               (r0 &= #x1f)
               (r0 <<= 6)
               (r1 &= #x3f)
               (r1 += r0)
               ;; now r1 holds scalar value

               ;; eight-bit-control
               (if (r1 < 160)
                   ((r0 = ,(charset-id 'eight-bit-control))
                    (write-multibyte-character r0 r1))

                 ;; latin-iso8859-1
                 (if (r1 < 256)
                     ((r0 = ,(charset-id 'latin-iso8859-1))
                      (r1 -= 128)
                      (write-multibyte-character r0 r1))

                   ;; mule-unicode-0100-24ff (< 0800)
                   ;; The offset from U+0100 is split into a row
                   ;; (offset / 96) and a column (offset % 96), each
                   ;; biased by 32; the code is (row << 7) + column.
                   ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
                    (r1 -= #x0100)
                    (r2 = (((r1 / 96) + 32) << 7))
                    (r1 %= 96)
                    (r1 += (r2 + 32))
                    (write-multibyte-character r0 r1)))))

            ;; 3byte encoding
            (if (r0 < #xf0)
                ((read r1 r2)
                 (r3 = ((r0 & #x0f) << 12))
                 (r3 += ((r1 & #x3f) << 6))
                 (r3 += (r2 & #x3f))
                 ;; now r3 holds scalar value

                 ;; r4 != 0 means "keep the three bytes as they are":
                 ;; either an overlong form (scalar below U+0800, which
                 ;; must not reach the `r3 -= #x0100' arithmetic below)
                 ;; or a scalar in U+3400 .. U+DFFF, for which no
                 ;; charset is available.
                 (r4 = 0)
                 (if (r3 < #x0800)
                     (r4 = 1)
                   (if (r3 >= #x3400)
                       (if (r3 < #xe000)
                           (r4 = 1))))

                 (if (r4 != 0)
                     ;; keep those bytes as eight-bit-{control|graphic}
                     (;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
                      (r3 = ,(charset-id 'eight-bit-graphic))
                      (write-multibyte-character r3 r0)
                      (if (r1 < #xa0)
                          (r3 = ,(charset-id 'eight-bit-control))
                        (r3 = ,(charset-id 'eight-bit-graphic)))
                      (write-multibyte-character r3 r1)
                      (if (r2 < #xa0)
                          (r3 = ,(charset-id 'eight-bit-control))
                        (r3 = ,(charset-id 'eight-bit-graphic)))
                      (write-multibyte-character r3 r2))

                   ;; mule-unicode-0100-24ff (>= 0800)
                   ;; `//=' leaves the quotient in r3 and the
                   ;; remainder in r7.
                   (if (r3 < #x2500)
                       ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
                        (r3 -= #x0100)
                        (r3 //= 96)
                        (r1 = (r7 + 32))
                        (r1 += ((r3 + 32) << 7))
                        (write-multibyte-character r0 r1))

                     ;; mule-unicode-2500-33ff
                     (if (r3 < #x3400)
                         ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
                          (r3 -= #x2500)
                          (r3 //= 96)
                          (r1 = (r7 + 32))
                          (r1 += ((r3 + 32) << 7))
                          (write-multibyte-character r0 r1))

                       ;; mule-unicode-e000-ffff
                       ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
                        (r3 -= #xe000)
                        (r3 //= 96)
                        (r1 = (r7 + 32))
                        (r1 += ((r3 + 32) << 7))
                        (write-multibyte-character r0 r1))))))

              ;; 4byte encoding
              ;; keep those bytes as eight-bit-{control|graphic}
              ((read r1 r2 r3)
               ;; r0 >= #xf0, thus eight-bit-graphic
               (r4 = ,(charset-id 'eight-bit-graphic))
               (write-multibyte-character r4 r0)
               (if (r1 < #xa0)
                   (r4 = ,(charset-id 'eight-bit-control))
                 (r4 = ,(charset-id 'eight-bit-graphic)))
               (write-multibyte-character r4 r1)
               (if (r2 < #xa0)
                   (r4 = ,(charset-id 'eight-bit-control))
                 (r4 = ,(charset-id 'eight-bit-graphic)))
               (write-multibyte-character r4 r2)
               (if (r3 < #xa0)
                   (r4 = ,(charset-id 'eight-bit-control))
                 (r4 = ,(charset-id 'eight-bit-graphic)))
               (write-multibyte-character r4 r3))))))

      (repeat))))

  "CCL program to decode UTF-8 into ascii, eight-bit-control, latin-iso8859-1 and mule-unicode-*.")
178
(define-ccl-program ccl-encode-mule-utf-8
  ;; Each character is read as a charset id in r0 and a code in r1.
  ;; For the mule-unicode-* charsets the masks below treat the code as
  ;; (row << 7) + column with both parts biased by 32, the inverse of
  ;; what `ccl-decode-mule-utf-8' produces.
  ;;
  ;; Register usage inside the loop:
  ;;   r0  charset id, later the first output byte
  ;;   r1  character code, then scalar value, then an output byte
  ;;   r2  last output byte of a 3-byte sequence
  ;;   r3  for mule-unicode-* charsets: (first scalar value of the
  ;;       charset) - 32; otherwise -1
  `(1
    (loop
     (read-multibyte-character r0 r1)

     (if (r0 == ,(charset-id 'ascii))
         (write r1)

       (if (r0 == ,(charset-id 'latin-iso8859-1))
           ;; r1          scalar                  utf-8
           ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
           ;; 20    0000 0000 1010 0000    1100 0010 1010 0000
           ;; 7f    0000 0000 1111 1111    1100 0011 1011 1111
           ((r0 = (((r1 & #x40) >> 6) | #xc2))
            (r1 &= #x3f)
            (r1 |= #x80)
            (write r0 r1))

         ;; Pick the scalar offset of the three mule-unicode-* charsets
         ;; so that they can share one conversion below.
         ((r3 = -1)
          (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
              (r3 = 224)                ; 224 == -32 + #x0100
            (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
                (r3 = 9440)             ; 9440 == -32 + #x2500
              (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
                  (r3 = 57312))))       ; 57312 == -32 + #xe000

          (if (r3 >= 0)
              ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
               ;; #x3f80 == (0011 1111 1000 0000)b
               (r1 &= #x7f)
               (r1 += (r0 + r3))
               ;; now r1 holds scalar value
               ;; Only mule-unicode-0100-24ff can yield a value below
               ;; #x0800; the other two offsets alone are larger.
               (if (r1 < #x0800)
                   ;; 2byte encoding
                   ((r0 = (((r1 & #x07c0) >> 6) | #xc0))
                    ;; #x07c0 == (0000 0111 1100 0000)b
                    (r1 &= #x3f)
                    (r1 |= #x80)
                    (write r0 r1))
                 ;; 3byte encoding
                 ((r0 = (((r1 & #xf000) >> 12) | #xe0))
                  (r2 = ((r1 & #x3f) | #x80))
                  (r1 &= #x0fc0)
                  (r1 >>= 6)
                  (r1 |= #x80)
                  (write r0 r1 r2))))

            (if (r0 == ,(charset-id 'eight-bit-control))
                ;; The code (#x80..#x9f) is written as one raw byte.
                ;; NOTE(review): `ccl-decode-mule-utf-8' also maps the
                ;; 2-byte sequences for U+0080..U+009F to this charset;
                ;; those come back here as a single byte, not as
                ;; `c2 xx'.  Confirm whether that is intended.
                (write r1)

              (if (r0 == ,(charset-id 'eight-bit-graphic))
                  ;; The code (#xa0..#xff) is written as one raw byte.
                  (write r1)

                ;; unsupported character.
                ;; output U+FFFD, which is `ef bf bd' in UTF-8
                ;; actually it never reaches here
                ((write #xef)
                 (write #xbf)
                 (write #xbd))))))))
     (repeat)))

  "CCL program to encode ascii, eight-bit-control, latin-iso8859-1 and mule-unicode-*. into UTF-8.")
263
;; Define the coding system `mule-utf-8'.  Its decoder and encoder are
;; the two CCL programs defined above, passed as a (DECODER . ENCODER)
;; pair.  The property list declares the character sets that can be
;; encoded safely and associates the coding system with the MIME
;; charset `utf-8'.
;; NOTE(review): the arguments `4' and `?u' are presumably the
;; coding-system type (CCL-based) and the mode-line mnemonic; confirm
;; against the `make-coding-system' documentation.
264 (make-coding-system
265 'mule-utf-8 4 ?u
266 "UTF-8 encoding for Emacs-supported Unicode characters.
267 Supported character sets are:
268 ascii
269 eight-bit-control
270 eight-bit-graphic
271 latin-iso8859-1
272 mule-unicode-0100-24ff
273 mule-unicode-2500-33ff
274 mule-unicode-e000-ffff
275
276 Unicode characters out of these ranges are decoded
277 into eight-bit-control or eight-bit-graphic."
278
279 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
280 '((safe-charsets
281 ascii
282 eight-bit-control
283 eight-bit-graphic
284 latin-iso8859-1
285 mule-unicode-0100-24ff
286 mule-unicode-2500-33ff
287 mule-unicode-e000-ffff)
288 (mime-charset . utf-8)))
289
;; Make `utf-8' an alias of `mule-utf-8'.
290 (define-coding-system-alias 'utf-8 'mule-utf-8)