Mercurial > emacs
comparison lisp/international/utf-8.el @ 36243:a05ae5420f85
Doc and commentary fixes.
author | Dave Love <fx@gnu.org> |
---|---|
date | Tue, 20 Feb 2001 20:55:06 +0000 |
parents | e4a75e66ee46 |
children | f6bb3ed752b4 |
comparison
equal
deleted
inserted
replaced
36242:be9b15c9bc7b | 36243:a05ae5420f85 |
---|---|
1 ;;; utf-8.el --- Limited UTF-8 decoding/encoding support | 1 ;;; utf-8.el --- Limited UTF-8 decoding/encoding support |
2 | 2 |
3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. | 3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. |
4 ;; Licensed to the Free Software Foundation. | 4 ;; Licensed to the Free Software Foundation. |
5 | 5 |
6 ;; Keywords: multilingual, Unicode, UTF-8 | 6 ;; Keywords: multilingual, Unicode, UTF-8, i18n |
7 | 7 |
8 ;; This file is part of GNU Emacs. | 8 ;; This file is part of GNU Emacs. |
9 | 9 |
10 ;; GNU Emacs is free software; you can redistribute it and/or modify | 10 ;; GNU Emacs is free software; you can redistribute it and/or modify |
11 ;; it under the terms of the GNU General Public License as published by | 11 ;; it under the terms of the GNU General Public License as published by |
23 ;; Boston, MA 02111-1307, USA. | 23 ;; Boston, MA 02111-1307, USA. |
24 | 24 |
25 ;;; Commentary: | 25 ;;; Commentary: |
26 | 26 |
27 ;; The coding-system `mule-utf-8' supports encoding/decoding of the | 27 ;; The coding-system `mule-utf-8' supports encoding/decoding of the |
28 ;; following character sets: | 28 ;; following character sets to and from UTF-8: |
29 ;; | 29 ;; |
30 ;; ascii | 30 ;; ascii |
31 ;; eight-bit-control | 31 ;; eight-bit-control |
32 ;; latin-iso8859-1 | 32 ;; latin-iso8859-1 |
33 ;; mule-unicode-0100-24ff | 33 ;; mule-unicode-0100-24ff |
34 ;; mule-unicode-2500-33ff | 34 ;; mule-unicode-2500-33ff |
35 ;; mule-unicode-e000-ffff | 35 ;; mule-unicode-e000-ffff |
36 ;; | 36 ;; |
37 ;; Characters of other character sets cannot be encoded with | 37 ;; Characters of other character sets cannot be encoded with |
38 ;; mule-utf-8. | 38 ;; mule-utf-8. Note that the mule-unicode charsets currently lack |
| 39 ;; case and syntax information, so things like `downcase' will only |
| 40 ;; work for characters from ASCII and Latin-1. |
39 ;; | 41 ;; |
40 ;; On decoding, Unicode characters that do not fit in above character | 42 ;; On decoding, Unicode characters that do not fit into the above |
41 ;; sets are handled as `eight-bit-control' or `eight-bit-graphic' | 43 ;; character sets are handled as `eight-bit-control' or |
42 ;; characters to retain original information (i.e. original byte | 44 ;; `eight-bit-graphic' characters to retain the information about the |
43 ;; sequence). | 45 ;; original byte sequence. |
| 46 |
| 47 ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is: |
44 | 48 |
45 ;; scalar | utf-8 | 49 ;; scalar | utf-8 |
46 ;; value | 1st byte | 2nd byte | 3rd byte | 50 ;; value | 1st byte | 2nd byte | 3rd byte |
47 ;; --------------------+-----------+-----------+---------- | 51 ;; --------------------+-----------+-----------+---------- |
48 ;; 0000 0000 0xxx xxxx | 0xxx xxxx | | | 52 ;; 0000 0000 0xxx xxxx | 0xxx xxxx | | |
172 (r4 = ,(charset-id 'eight-bit-graphic))) | 176 (r4 = ,(charset-id 'eight-bit-graphic))) |
173 (write-multibyte-character r4 r3))))) | 177 (write-multibyte-character r4 r3))))) |
174 | 178 |
175 (repeat)))) | 179 (repeat)))) |
176 | 180 |
177 "CCL program to decode UTF-8 into ascii, eight-bit-control, latin-iso8859-1 and mule-unicode-*.") | 181 "CCL program to decode UTF-8. |
| 182 Decoding is done into the charsets ascii, eight-bit-control, |
| 183 latin-iso8859-1 and mule-unicode-* only.") |
178 | 184 |
179 (define-ccl-program ccl-encode-mule-utf-8 | 185 (define-ccl-program ccl-encode-mule-utf-8 |
180 `(1 | 186 `(1 |
181 (loop | 187 (loop |
182 (read-multibyte-character r0 r1) | 188 (read-multibyte-character r0 r1) |
249 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | 255 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
250 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000 | 256 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000 |
251 ;; ff 0000 0000 1111 1111 1101 1111 1011 1111 | 257 ;; ff 0000 0000 1111 1111 1101 1111 1011 1111 |
252 (write r1) | 258 (write r1) |
253 | 259 |
254 ;; unsupported character. | 260 ;; Unsupported character. |
255 ;; output U+FFFD, which is `ef bf bd' in UTF-8 | 261 ;; Output U+FFFD, which is `ef bf bd' in UTF-8. |
256 ;; actually it never reach here | |
257 ((write #xef) | 262 ((write #xef) |
258 (write #xbf) | 263 (write #xbf) |
259 (write #xbd))))))))) | 264 (write #xbd))))))))) |
260 (repeat))) | 265 (repeat))) |
261 | 266 |
262 "CCL program to encode ascii, eight-bit-control, latin-iso8859-1 and mule-unicode-*. into UTF-8.") | 267 "CCL program to encode into UTF-8. |
| 268 Only characters from the charsets ascii, eight-bit-control, |
| 269 latin-iso8859-1 and mule-unicode-* are recognized. Others are encoded |
| 270 as U+FFFD.") |
263 | 271 |
264 (make-coding-system | 272 (make-coding-system |
265 'mule-utf-8 4 ?u | 273 'mule-utf-8 4 ?u |
266 "UTF-8 encoding for Emacs-supported Unicode characters. | 274 "UTF-8 encoding for Emacs-supported Unicode characters. |
267 Supported character sets are: | 275 The supported Emacs character sets are: |
268 ascii | 276 ascii |
269 eight-bit-control | 277 eight-bit-control |
270 eight-bit-graphic | 278 eight-bit-graphic |
271 latin-iso8859-1 | 279 latin-iso8859-1 |
272 mule-unicode-0100-24ff | 280 mule-unicode-0100-24ff |
273 mule-unicode-2500-33ff | 281 mule-unicode-2500-33ff |
274 mule-unicode-e000-ffff | 282 mule-unicode-e000-ffff |
275 | 283 |
276 Unicode characters out of these ranges are decoded | 284 Unicode characters out of the ranges U+0000-U+33FF and U+E000-U+FFFF |
277 into eight-bit-control or eight-bit-graphic." | 285 are decoded into sequences of eight-bit-control and eight-bit-graphic |
| 286 characters to preserve their byte sequences. Emacs characters out of |
| 287 these ranges are encoded into U+FFFD. |
| 288 |
| 289 Note that, currently, characters in the mule-unicode charsets have no |
| 290 syntax and case information. Thus, for instance, upper- and |
| 291 lower-casing commands won't work with them." |
278 | 292 |
279 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) | 293 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) |
280 '((safe-charsets | 294 '((safe-charsets |
281 ascii | 295 ascii |
282 eight-bit-control | 296 eight-bit-control |