35542
|
1 ;;; utf-8.el --- Limited UTF-8 decoding/encoding support
|
|
2
|
|
3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
|
|
4 ;; Licensed to the Free Software Foundation.
|
|
5
|
36243
|
6 ;; Keywords: multilingual, Unicode, UTF-8, i18n
|
35542
|
7
|
|
8 ;; This file is part of GNU Emacs.
|
|
9
|
|
10 ;; GNU Emacs is free software; you can redistribute it and/or modify
|
|
11 ;; it under the terms of the GNU General Public License as published by
|
|
12 ;; the Free Software Foundation; either version 2, or (at your option)
|
|
13 ;; any later version.
|
|
14
|
|
15 ;; GNU Emacs is distributed in the hope that it will be useful,
|
|
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
18 ;; GNU General Public License for more details.
|
|
19
|
|
20 ;; You should have received a copy of the GNU General Public License
|
|
21 ;; along with GNU Emacs; see the file COPYING. If not, write to the
|
|
22 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
|
23 ;; Boston, MA 02111-1307, USA.
|
|
24
|
|
25 ;;; Commentary:
|
|
26
|
|
27 ;; The coding-system `mule-utf-8' supports encoding/decoding of the
|
36243
|
28 ;; following character sets to and from UTF-8:
|
35542
|
29 ;;
|
|
30 ;; ascii
|
|
31 ;; eight-bit-control
|
|
32 ;; latin-iso8859-1
|
|
33 ;; mule-unicode-0100-24ff
|
|
34 ;; mule-unicode-2500-33ff
|
|
35 ;; mule-unicode-e000-ffff
|
|
36 ;;
|
|
37 ;; Characters of other character sets cannot be encoded with
|
36243
|
38 ;; mule-utf-8. Note that the mule-unicode charsets currently lack
|
|
39 ;; case and syntax information, so things like `downcase' will only
|
|
40 ;; work for characters from ASCII and Latin-1.
|
35542
|
41 ;;
|
36243
|
42 ;; On decoding, Unicode characters that do not fit into the above
|
|
43 ;; character sets are handled as `eight-bit-control' or
|
|
44 ;; `eight-bit-graphic' characters to retain the information about the
|
|
45 ;; original byte sequence.
|
|
46
|
|
47 ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is:
|
35542
|
48
|
|
49 ;; scalar | utf-8
|
|
50 ;; value | 1st byte | 2nd byte | 3rd byte
|
|
51 ;; --------------------+-----------+-----------+----------
|
|
52 ;; 0000 0000 0xxx xxxx | 0xxx xxxx | |
|
|
53 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx |
|
|
54 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx
|
|
55
|
|
56 ;;; Code:
|
|
57
|
|
(define-ccl-program ccl-decode-mule-utf-8
  ;;
  ;;        charset         | bytes in utf-8 | bytes in emacs
  ;; -----------------------+----------------+---------------
  ;;         ascii          |       1        |       1
  ;; -----------------------+----------------+---------------
  ;;    eight-bit-control   |       2        |       2
  ;;     latin-iso8859-1    |       2        |       2
  ;; -----------------------+----------------+---------------
  ;; mule-unicode-0100-24ff |       2        |       4
  ;;        (< 0800)        |                |
  ;; -----------------------+----------------+---------------
  ;; mule-unicode-0100-24ff |       3        |       4
  ;;        (>= 0800)       |                |
  ;; mule-unicode-2500-33ff |       3        |       4
  ;; mule-unicode-e000-ffff |       3        |       4
  ;;
  ;; Thus magnification factor is two.
  ;;
  ;; Register usage inside the loop:
  ;;   r0      lead byte, later the charset id of the decoded character
  ;;   r1..r3  following bytes / scalar value / character code
  ;;   r4      charset id used while copying a 4-byte sequence verbatim
  ;;   r7      remainder left behind by the `//=' operator
  ;;
  ;; A character code in one of the mule-unicode charsets is built as
  ;;   ((OFFSET / 96 + 32) << 7) + (OFFSET % 96 + 32)
  ;; where OFFSET is the scalar value minus the first code point of
  ;; the charset.
  ;;
  `(2
    ((loop
      (read r0)

      ;; 1byte encoding, i.e., ascii
      (if (r0 < #x80)
          (write r0)

        ;; #x80 <= r0 < #xc0: a continuation byte where a lead byte is
        ;; expected.  It cannot start a character, so keep the byte as
        ;; an eight-bit-{control|graphic} character instead of treating
        ;; it as a 2byte lead byte and swallowing the byte after it.
        (if (r0 < #xc0)
            ((if (r0 < #xa0)
                 (r1 = ,(charset-id 'eight-bit-control))
               (r1 = ,(charset-id 'eight-bit-graphic)))
             (write-multibyte-character r1 r0))

          ;; 2byte encoding
          (if (r0 < #xe0)
              ((read r1)
               (r0 &= #x1f)
               (r0 <<= 6)
               (r1 &= #x3f)
               (r1 += r0)
               ;; now r1 holds scalar value

               ;; eight-bit-control
               (if (r1 < 160)
                   ((r0 = ,(charset-id 'eight-bit-control))
                    (write-multibyte-character r0 r1))

                 ;; latin-iso8859-1
                 (if (r1 < 256)
                     ((r0 = ,(charset-id 'latin-iso8859-1))
                      ;; scalar a0..ff -> charset code 20..7f
                      (r1 -= 128)
                      (write-multibyte-character r0 r1))

                   ;; mule-unicode-0100-24ff (< 0800)
                   ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
                    (r1 -= #x0100)
                    (r2 = (((r1 / 96) + 32) << 7))
                    (r1 %= 96)
                    (r1 += (r2 + 32))
                    (write-multibyte-character r0 r1)))))

            ;; 3byte encoding
            (if (r0 < #xf0)
                ((read r1 r2)
                 (r3 = ((r0 & #x0f) << 12))
                 (r3 += ((r1 & #x3f) << 6))
                 (r3 += (r2 & #x3f))
                 ;; now r3 holds scalar value

                 ;; mule-unicode-0100-24ff (>= 0800)
                 (if (r3 < #x2500)
                     ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
                      (r3 -= #x0100)
                      ;; quotient stays in r3, remainder goes to r7
                      (r3 //= 96)
                      (r1 = (r7 + 32))
                      (r1 += ((r3 + 32) << 7))
                      (write-multibyte-character r0 r1))

                   ;; mule-unicode-2500-33ff
                   (if (r3 < #x3400)
                       ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
                        (r3 -= #x2500)
                        (r3 //= 96)
                        (r1 = (r7 + 32))
                        (r1 += ((r3 + 32) << 7))
                        (write-multibyte-character r0 r1))

                     ;; U+3400 .. U+DFFF
                     ;; keep those bytes as eight-bit-{control|graphic}
                     (if (r3 < #xe000)
                         (;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
                          (r3 = ,(charset-id 'eight-bit-graphic))
                          (write-multibyte-character r3 r0)
                          ;; r3 is still eight-bit-graphic here, so
                          ;; only the control case needs an assignment
                          (if (r1 < #xa0)
                              (r3 = ,(charset-id 'eight-bit-control)))
                          (write-multibyte-character r3 r1)
                          (if (r2 < #xa0)
                              (r3 = ,(charset-id 'eight-bit-control))
                            (r3 = ,(charset-id 'eight-bit-graphic)))
                          (write-multibyte-character r3 r2))

                       ;; mule-unicode-e000-ffff
                       ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
                        (r3 -= #xe000)
                        (r3 //= 96)
                        (r1 = (r7 + 32))
                        (r1 += ((r3 + 32) << 7))
                        (write-multibyte-character r0 r1))))))

              ;; 4byte encoding
              ;; keep those bytes as eight-bit-{control|graphic}
              ((read r1 r2 r3)
               ;; r0 >= #xf0, thus eight-bit-graphic
               (r4 = ,(charset-id 'eight-bit-graphic))
               (write-multibyte-character r4 r0)
               (if (r1 < #xa0)
                   (r4 = ,(charset-id 'eight-bit-control)))
               (write-multibyte-character r4 r1)
               (if (r2 < #xa0)
                   (r4 = ,(charset-id 'eight-bit-control))
                 (r4 = ,(charset-id 'eight-bit-graphic)))
               (write-multibyte-character r4 r2)
               (if (r3 < #xa0)
                   (r4 = ,(charset-id 'eight-bit-control))
                 (r4 = ,(charset-id 'eight-bit-graphic)))
               (write-multibyte-character r4 r3))))))

      (repeat))))

  "CCL program to decode UTF-8.
Decoding is done into the charsets ascii, eight-bit-control,
eight-bit-graphic, latin-iso8859-1 and mule-unicode-* only.")
|
35542
|
184
|
|
(define-ccl-program ccl-encode-mule-utf-8
  ;; Each iteration reads one character as (charset id r0, code r1),
  ;; dispatches on the charset and writes the UTF-8 bytes.
  ;;
  ;; For the mule-unicode charsets the scalar value is recovered from
  ;; the character code as
  ;;   (((CODE >> 7) & #x7f) - 32) * 96 + ((CODE & #x7f) - 32) + BASE
  ;; where BASE is the first code point of the charset.  The constant
  ;; added below is BASE - 32.
  `(1
    (loop
     (read-multibyte-character r0 r1)

     (if (r0 == ,(charset-id 'ascii))
         (write r1)

       (if (r0 == ,(charset-id 'latin-iso8859-1))
           ;;    r1          scalar                  utf-8
           ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
           ;; 20    0000 0000 1010 0000    1100 0010 1010 0000
           ;; 7f    0000 0000 1111 1111    1100 0011 1011 1111
           ;; Bit 6 of r1 equals bit 6 of the scalar value (r1 + #x80),
           ;; so it selects between the lead bytes c2 and c3.
           ((r0 = (((r1 & #x40) >> 6) | #xc2))
            (r1 &= #x3f)
            (r1 |= #x80)
            (write r0 r1))

         (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
             ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
              ;; #x3f80 == (0011 1111 1000 0000)b
              (r1 &= #x7f)
              (r1 += (r0 + 224))        ; 224 == -32 + #x0100
              ;; now r1 holds scalar value
              (if (r1 < #x0800)
                  ;; 2byte encoding
                  ((r0 = (((r1 & #x07c0) >> 6) | #xc0))
                   ;; #x07c0 == (0000 0111 1100 0000)b
                   (r1 &= #x3f)
                   (r1 |= #x80)
                   (write r0 r1))
                ;; 3byte encoding
                ((r0 = (((r1 & #xf000) >> 12) | #xe0))
                 (r2 = ((r1 & #x3f) | #x80))
                 (r1 &= #x0fc0)
                 (r1 >>= 6)
                 (r1 |= #x80)
                 (write r0 r1 r2))))

           (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
               ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
                (r1 &= #x7f)
                (r1 += (r0 + 9440))     ; 9440 == -32 + #x2500
                ;; now r1 holds scalar value; always 3byte encoding
                (r0 = (((r1 & #xf000) >> 12) | #xe0))
                (r2 = ((r1 & #x3f) | #x80))
                (r1 &= #x0fc0)
                (r1 >>= 6)
                (r1 |= #x80)
                (write r0 r1 r2))

             (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
                 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
                  (r1 &= #x7f)
                  (r1 += (r0 + 57312))  ; 57312 == -32 + #xe000
                  ;; now r1 holds scalar value; always 3byte encoding
                  (r0 = (((r1 & #xf000) >> 12) | #xe0))
                  (r2 = ((r1 & #x3f) | #x80))
                  (r1 &= #x0fc0)
                  (r1 >>= 6)
                  (r1 |= #x80)
                  (write r0 r1 r2))

               (if (r0 == ,(charset-id 'eight-bit-control))
                   ;; The byte is written as is, without a `c2' lead
                   ;; byte.  Byte sequences that the decoder kept as
                   ;; eight-bit characters (U+3400..U+DFFF and 4byte
                   ;; forms) are thereby written back unchanged.
                   ;; NOTE(review): the decoder also maps `c2 80' ..
                   ;; `c2 9f' (U+0080..U+009F) to eight-bit-control, and
                   ;; those come back out as a single byte -- confirm
                   ;; whether that is acceptable.
                   (write r1)

                 (if (r0 == ,(charset-id 'eight-bit-graphic))
                     ;; Likewise written as is: a raw byte a0..ff.
                     (write r1)

                   ;; Unsupported character.
                   ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
                   ((write #xef)
                    (write #xbf)
                    (write #xbd)))))))))
     (repeat)))

  "CCL program to encode into UTF-8.
Only characters from the charsets ascii, eight-bit-control,
eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized.
Others are encoded as U+FFFD.")
|
35542
|
271
|
|
;; Register the coding system: type 4 (CCL based), mode-line mnemonic
;; `u', with the decoder/encoder pair defined by the two
;; `define-ccl-program' forms in this file.
(make-coding-system
 'mule-utf-8 4 ?u
 "UTF-8 encoding for Emacs-supported Unicode characters.
The supported Emacs character sets are:
ascii
eight-bit-control
eight-bit-graphic
latin-iso8859-1
mule-unicode-0100-24ff
mule-unicode-2500-33ff
mule-unicode-e000-ffff

Unicode characters out of the ranges U+0000-U+33FF and U+E000-U+FFFF
are decoded into sequences of eight-bit-control and eight-bit-graphic
characters to preserve their byte sequences. Emacs characters out of
these ranges are encoded into U+FFFD.

Note that, currently, characters in the mule-unicode charsets have no
syntax and case information. Thus, for instance, upper- and
lower-casing commands won't work with them."

 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
 '((safe-charsets
    ascii
    eight-bit-control
    eight-bit-graphic
    latin-iso8859-1
    mule-unicode-0100-24ff
    mule-unicode-2500-33ff
    mule-unicode-e000-ffff)
   (mime-charset . utf-8)))

;; Make the coding system available under the name `utf-8' as well.
(define-coding-system-alias 'utf-8 'mule-utf-8)
|