35542
|
1 ;;; utf-8.el --- Limited UTF-8 decoding/encoding support
|
|
2
|
|
3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
|
|
4 ;; Licensed to the Free Software Foundation.
|
|
5
|
|
6 ;; Keywords: multilingual, Unicode, UTF-8
|
|
7
|
|
8 ;; This file is part of GNU Emacs.
|
|
9
|
|
10 ;; GNU Emacs is free software; you can redistribute it and/or modify
|
|
11 ;; it under the terms of the GNU General Public License as published by
|
|
12 ;; the Free Software Foundation; either version 2, or (at your option)
|
|
13 ;; any later version.
|
|
14
|
|
15 ;; GNU Emacs is distributed in the hope that it will be useful,
|
|
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
18 ;; GNU General Public License for more details.
|
|
19
|
|
20 ;; You should have received a copy of the GNU General Public License
|
|
21 ;; along with GNU Emacs; see the file COPYING. If not, write to the
|
|
22 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
|
23 ;; Boston, MA 02111-1307, USA.
|
|
24
|
|
25 ;;; Commentary:
|
|
26
|
|
27 ;; The coding-system `mule-utf-8' supports encoding/decoding of the
|
|
28 ;; following character sets:
|
|
29 ;;
|
|
30 ;; ascii
|
|
31 ;; eight-bit-control
|
|
32 ;; latin-iso8859-1
|
|
33 ;; mule-unicode-0100-24ff
|
|
34 ;; mule-unicode-2500-33ff
|
|
35 ;; mule-unicode-e000-ffff
|
|
36 ;;
|
|
37 ;; Characters of other character sets cannot be encoded with
|
|
38 ;; mule-utf-8.
|
|
39 ;;
|
|
40 ;; On decoding, Unicode characters that do not fit in the above character
|
|
41 ;; sets are handled as `eight-bit-control' or `eight-bit-graphic'
|
|
42 ;; characters to retain original information (i.e. original byte
|
|
43 ;; sequence).
|
|
44
|
|
45 ;; scalar | utf-8
|
|
46 ;; value | 1st byte | 2nd byte | 3rd byte
|
|
47 ;; --------------------+-----------+-----------+----------
|
|
48 ;; 0000 0000 0xxx xxxx | 0xxx xxxx | |
|
|
49 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx |
|
|
50 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx
|
|
51
|
|
52 ;;; Code:
|
|
53
|
|
(define-ccl-program ccl-decode-mule-utf-8
  ;; Decoder half of the mule-utf-8 coding system.
  ;;
  ;; Every pass of the loop reads one lead byte into r0 and uses its
  ;; value to decide how many trailing bytes belong to the sequence.
  ;;
  ;;        charset         | bytes in utf-8 | bytes in emacs
  ;; -----------------------+----------------+---------------
  ;;         ascii          |       1        |       1
  ;; -----------------------+----------------+---------------
  ;;    eight-bit-control   |       2        |       2
  ;;     latin-iso8859-1    |       2        |       2
  ;; -----------------------+----------------+---------------
  ;; mule-unicode-0100-24ff |       2        |       4
  ;;        (< 0800)        |                |
  ;; -----------------------+----------------+---------------
  ;; mule-unicode-0100-24ff |       3        |       4
  ;;        (>= 0800)       |                |
  ;; mule-unicode-2500-33ff |       3        |       4
  ;; mule-unicode-e000-ffff |       3        |       4
  ;;
  ;; Hence the buffer magnification factor of 2 below.
  ;;
  ;; For the mule-unicode-* charsets the code passed to
  ;; `write-multibyte-character' is built from the offset OFF of the
  ;; scalar value within its charset as
  ;;     (((OFF / 96) + 32) << 7) + ((OFF % 96) + 32)
  ;;
  ;; NOTE(review): no branch checks that trailing bytes have the form
  ;; 10xx xxxx, and lead bytes #x80..#xbf fall into the two-byte branch.
  `(2
    ((loop
      (read r0)

      (if (r0 < #x80)
          ;; One byte: ascii, copied through unchanged.
          (write r0)

        (if (r0 < #xe0)
            ;; Two bytes: 110y yyyy 10xx xxxx.
            ((read r1)
             ;; r1 <- scalar value: low five bits of the lead byte
             ;; above the low six bits of the trailing byte.
             (r0 = ((r0 & #x1f) << 6))
             (r1 = ((r1 & #x3f) + r0))

             (if (r1 < 160)
                 ;; Below U+00A0: eight-bit-control, code is the scalar.
                 ((r0 = ,(charset-id 'eight-bit-control))
                  (write-multibyte-character r0 r1))

               (if (r1 < 256)
                   ;; U+00A0 .. U+00FF: latin-iso8859-1, code is the
                   ;; scalar minus 128.
                   ((r0 = ,(charset-id 'latin-iso8859-1))
                    (r1 -= 128)
                    (write-multibyte-character r0 r1))

                 ;; U+0100 .. U+07FF: mule-unicode-0100-24ff.
                 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
                  (r1 -= #x0100)
                  ;; r2 <- high part of the code, from the quotient.
                  (r2 = ((r1 / 96) + 32))
                  (r2 <<= 7)
                  ;; r1 <- low part of the code, from the remainder.
                  (r1 %= 96)
                  (r1 += 32)
                  (r1 += r2)
                  (write-multibyte-character r0 r1)))))

          (if (r0 < #xf0)
              ;; Three bytes: 1110 zzzz 10yy yyyy 10xx xxxx.
              ;; r0, r1 and r2 are left untouched while r3 is built
              ;; because the raw-byte branch below writes them out.
              ((read r1 r2)
               (r3 = ((r0 & #x0f) << 12))
               (r3 |= ((r1 & #x3f) << 6))
               (r3 |= (r2 & #x3f))
               ;; r3 now holds the scalar value.

               (if (r3 < #x2500)
                   ;; mule-unicode-0100-24ff (>= 0800)
                   ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
                    (r3 -= #x0100)
                    ;; `//=' leaves the quotient in r3 and the remainder
                    ;; in r7; r7 is read on the very next statement.
                    (r3 //= 96)
                    (r1 = (r7 + 32))
                    (r3 += 32)
                    (r3 <<= 7)
                    (r1 += r3)
                    (write-multibyte-character r0 r1))

                 (if (r3 < #x3400)
                     ;; mule-unicode-2500-33ff
                     ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
                      (r3 -= #x2500)
                      (r3 //= 96)
                      (r1 = (r7 + 32))
                      (r3 += 32)
                      (r3 <<= 7)
                      (r1 += r3)
                      (write-multibyte-character r0 r1))

                   (if (r3 < #xe000)
                       ;; U+3400 .. U+DFFF: no target charset.  Emit the
                       ;; three original bytes one by one so that the
                       ;; byte sequence is retained.  The lead byte is
                       ;; in #xe0..#xef here, so it is always
                       ;; eight-bit-graphic; each trailing byte is
                       ;; eight-bit-control when below #xa0 and
                       ;; eight-bit-graphic otherwise.
                       ((r3 = ,(charset-id 'eight-bit-graphic))
                        (write-multibyte-character r3 r0)
                        (if (r1 < #xa0)
                            (r3 = ,(charset-id 'eight-bit-control))
                          (r3 = ,(charset-id 'eight-bit-graphic)))
                        (write-multibyte-character r3 r1)
                        (if (r2 < #xa0)
                            (r3 = ,(charset-id 'eight-bit-control))
                          (r3 = ,(charset-id 'eight-bit-graphic)))
                        (write-multibyte-character r3 r2))

                     ;; mule-unicode-e000-ffff
                     ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
                      (r3 -= #xe000)
                      (r3 //= 96)
                      (r1 = (r7 + 32))
                      (r3 += 32)
                      (r3 <<= 7)
                      (r1 += r3)
                      (write-multibyte-character r0 r1))))))

            ;; Four bytes (lead byte >= #xf0): no target charset.  Emit
            ;; all four original bytes, choosing the charset per byte
            ;; exactly as in the U+3400 .. U+DFFF case above.
            ((read r1 r2 r3)
             (r4 = ,(charset-id 'eight-bit-graphic))
             (write-multibyte-character r4 r0)
             (if (r1 < #xa0)
                 (r4 = ,(charset-id 'eight-bit-control))
               (r4 = ,(charset-id 'eight-bit-graphic)))
             (write-multibyte-character r4 r1)
             (if (r2 < #xa0)
                 (r4 = ,(charset-id 'eight-bit-control))
               (r4 = ,(charset-id 'eight-bit-graphic)))
             (write-multibyte-character r4 r2)
             (if (r3 < #xa0)
                 (r4 = ,(charset-id 'eight-bit-control))
               (r4 = ,(charset-id 'eight-bit-graphic)))
             (write-multibyte-character r4 r3)))))

      (repeat))))

  "CCL program to decode UTF-8 into ascii, eight-bit-control, latin-iso8859-1 and mule-unicode-*.")
|
|
178
|
|
(define-ccl-program ccl-encode-mule-utf-8
  ;; Encoder half of the mule-utf-8 coding system.
  ;;
  ;; Every pass of the loop reads one character as a charset id in r0
  ;; and a code in r1, then dispatches on the charset.  For the
  ;; mule-unicode-* charsets the code is unpacked as
  ;;     high = (code & #x3f80) >> 7,   low = code & #x7f
  ;; and the scalar value is
  ;;     ((high - 32) * 96) + (low - 32) + BASE
  ;; where BASE is the first code point of the charset.  The constants
  ;; added below are BASE - 32.
  `(1
    (loop
     (read-multibyte-character r0 r1)

     (if (r0 == ,(charset-id 'ascii))
         ;; ascii is written as a single byte.
         (write r1)

       (if (r0 == ,(charset-id 'latin-iso8859-1))
           ;; r1          scalar                  utf-8
           ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
           ;; 20    0000 0000 1010 0000    1100 0010 1010 0000
           ;; 7f    0000 0000 1111 1111    1100 0011 1011 1111
           ;;
           ;; The lead byte is #xc2, or #xc3 when bit #x40 of r1 is set.
           ((r0 = (((r1 & #x40) >> 6) | #xc2))
            (r1 = ((r1 & #x3f) | #x80))
            (write r0 r1))

         (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
             ((r0 = ((r1 & #x3f80) >> 7)) ; #x3f80 == (0011 1111 1000 0000)b
              (r0 -= 32)
              (r0 *= 96)
              (r1 &= #x7f)
              (r1 += 224)               ; 224 == -32 + #x0100
              (r1 += r0)
              ;; r1 now holds the scalar value.
              (if (r1 < #x0800)
                  ;; Two bytes: 110y yyyy 10xx xxxx
                  ((r0 = (((r1 & #x07c0) >> 6) | #xc0))
                   ;; #x07c0 == (0000 0111 1100 0000)b
                   (r1 = ((r1 & #x3f) | #x80))
                   (write r0 r1))
                ;; Three bytes: 1110 zzzz 10yy yyyy 10xx xxxx.
                ;; r2 is taken from r1 before r1 is overwritten.
                ((r0 = (((r1 & #xf000) >> 12) | #xe0))
                 (r2 = ((r1 & #x3f) | #x80))
                 (r1 = (((r1 & #x0fc0) >> 6) | #x80))
                 (write r0 r1 r2))))

           (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
               ;; Always three bytes.
               ((r0 = ((r1 & #x3f80) >> 7))
                (r0 -= 32)
                (r0 *= 96)
                (r1 &= #x7f)
                (r1 += 9440)            ; 9440 == -32 + #x2500
                (r1 += r0)
                (r0 = (((r1 & #xf000) >> 12) | #xe0))
                (r2 = ((r1 & #x3f) | #x80))
                (r1 = (((r1 & #x0fc0) >> 6) | #x80))
                (write r0 r1 r2))

             (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
                 ;; Always three bytes.
                 ((r0 = ((r1 & #x3f80) >> 7))
                  (r0 -= 32)
                  (r0 *= 96)
                  (r1 &= #x7f)
                  (r1 += 57312)         ; 57312 == -32 + #xe000
                  (r1 += r0)
                  (r0 = (((r1 & #xf000) >> 12) | #xe0))
                  (r2 = ((r1 & #x3f) | #x80))
                  (r1 = (((r1 & #x0fc0) >> 6) | #x80))
                  (write r0 r1 r2))

               (if (r0 == ,(charset-id 'eight-bit-control))
                   ;; The code is written as one raw byte, with no #xc2
                   ;; lead byte in front of it.
                   ;; NOTE(review): this hands back unchanged any byte
                   ;; that the decoder kept from a sequence it could not
                   ;; map, but it also turns a character decoded from
                   ;; `c2 80' .. `c2 9f' into a single byte -- confirm
                   ;; that this is intended.
                   (write r1)

                 (if (r0 == ,(charset-id 'eight-bit-graphic))
                     ;; Likewise written as one raw byte.
                     (write r1)

                   ;; Any other charset: emit U+FFFD, which is
                   ;; `ef bf bd' in UTF-8.  Presumably never reached.
                   ((write #xef)
                    (write #xbf)
                    (write #xbd)))))))))

     (repeat)))

  "CCL program to encode ascii, eight-bit-control, latin-iso8859-1 and mule-unicode-*. into UTF-8.")
|
|
263
|
|
;; Register the `mule-utf-8' coding system on top of the two CCL
;; programs: the decoder is the car and the encoder the cdr of the
;; pair.  The numeric type 4 and the mnemonic character ?u are passed
;; through unchanged (presumably 4 selects a CCL-based coding system;
;; see the doc string of `make-coding-system').
(make-coding-system
 'mule-utf-8
 4
 ?u
 "UTF-8 encoding for Emacs-supported Unicode characters.
Supported character sets are:
ascii
eight-bit-control
eight-bit-graphic
latin-iso8859-1
mule-unicode-0100-24ff
mule-unicode-2500-33ff
mule-unicode-e000-ffff

Unicode characters out of these ranges are decoded
into eight-bit-control or eight-bit-graphic."
 (cons 'ccl-decode-mule-utf-8 'ccl-encode-mule-utf-8)
 (list (cons 'safe-charsets
             '(ascii
               eight-bit-control
               eight-bit-graphic
               latin-iso8859-1
               mule-unicode-0100-24ff
               mule-unicode-2500-33ff
               mule-unicode-e000-ffff))
       (cons 'mime-charset 'utf-8)))

;; Make the coding system reachable under the name `utf-8' as well.
(define-coding-system-alias 'utf-8 'mule-utf-8)
|