comparison lisp/international/utf-8.el @ 36243:a05ae5420f85

Doc and commentary fixes.
author Dave Love <fx@gnu.org>
date Tue, 20 Feb 2001 20:55:06 +0000
parents e4a75e66ee46
children f6bb3ed752b4
comparison
equal deleted inserted replaced
36242:be9b15c9bc7b 36243:a05ae5420f85
1 ;;; utf-8.el --- Limited UTF-8 decoding/encoding support 1 ;;; utf-8.el --- Limited UTF-8 decoding/encoding support
2 2
3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. 3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
4 ;; Licensed to the Free Software Foundation. 4 ;; Licensed to the Free Software Foundation.
5 5
6 ;; Keywords: multilingual, Unicode, UTF-8 6 ;; Keywords: multilingual, Unicode, UTF-8, i18n
7 7
8 ;; This file is part of GNU Emacs. 8 ;; This file is part of GNU Emacs.
9 9
10 ;; GNU Emacs is free software; you can redistribute it and/or modify 10 ;; GNU Emacs is free software; you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by 11 ;; it under the terms of the GNU General Public License as published by
23 ;; Boston, MA 02111-1307, USA. 23 ;; Boston, MA 02111-1307, USA.
24 24
25 ;;; Commentary: 25 ;;; Commentary:
26 26
27 ;; The coding-system `mule-utf-8' supports encoding/decoding of the 27 ;; The coding-system `mule-utf-8' supports encoding/decoding of the
28 ;; following character sets: 28 ;; following character sets to and from UTF-8:
29 ;; 29 ;;
30 ;; ascii 30 ;; ascii
31 ;; eight-bit-control 31 ;; eight-bit-control
32 ;; latin-iso8859-1 32 ;; latin-iso8859-1
33 ;; mule-unicode-0100-24ff 33 ;; mule-unicode-0100-24ff
34 ;; mule-unicode-2500-33ff 34 ;; mule-unicode-2500-33ff
35 ;; mule-unicode-e000-ffff 35 ;; mule-unicode-e000-ffff
36 ;; 36 ;;
37 ;; Characters of other character sets cannot be encoded with 37 ;; Characters of other character sets cannot be encoded with
38 ;; mule-utf-8. 38 ;; mule-utf-8. Note that the mule-unicode charsets currently lack
39 ;; case and syntax information, so things like `downcase' will only
40 ;; work for characters from ASCII and Latin-1.
39 ;; 41 ;;
40 ;; On decoding, Unicode characters that do not fit in above character 42 ;; On decoding, Unicode characters that do not fit into the above
41 ;; sets are handled as `eight-bit-control' or `eight-bit-graphic' 43 ;; character sets are handled as `eight-bit-control' or
42 ;; characters to retain original information (i.e. original byte 44 ;; `eight-bit-graphic' characters to retain the information about the
43 ;; sequence). 45 ;; original byte sequence.
46
47 ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is:
44 48
45 ;; scalar | utf-8 49 ;; scalar | utf-8
46 ;; value | 1st byte | 2nd byte | 3rd byte 50 ;; value | 1st byte | 2nd byte | 3rd byte
47 ;; --------------------+-----------+-----------+---------- 51 ;; --------------------+-----------+-----------+----------
48 ;; 0000 0000 0xxx xxxx | 0xxx xxxx | | 52 ;; 0000 0000 0xxx xxxx | 0xxx xxxx | |
172 (r4 = ,(charset-id 'eight-bit-graphic))) 176 (r4 = ,(charset-id 'eight-bit-graphic)))
173 (write-multibyte-character r4 r3))))) 177 (write-multibyte-character r4 r3)))))
174 178
175 (repeat)))) 179 (repeat))))
176 180
177 "CCL program to decode UTF-8 into ascii, eight-bit-control, latin-iso8859-1 and mule-unicode-*.") 181 "CCL program to decode UTF-8.
182 Decoding is done into the charsets ascii, eight-bit-control,
183 latin-iso8859-1 and mule-unicode-* only.")
178 184
179 (define-ccl-program ccl-encode-mule-utf-8 185 (define-ccl-program ccl-encode-mule-utf-8
180 `(1 186 `(1
181 (loop 187 (loop
182 (read-multibyte-character r0 r1) 188 (read-multibyte-character r0 r1)
249 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx 255 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
250 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000 256 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000
251 ;; ff 0000 0000 1111 1111 1101 1111 1011 1111 257 ;; ff 0000 0000 1111 1111 1101 1111 1011 1111
252 (write r1) 258 (write r1)
253 259
254 ;; unsupported character. 260 ;; Unsupported character.
255 ;; output U+FFFD, which is `ef bf bd' in UTF-8 261 ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
256 ;; actually it never reaches here
257 ((write #xef) 262 ((write #xef)
258 (write #xbf) 263 (write #xbf)
259 (write #xbd))))))))) 264 (write #xbd)))))))))
260 (repeat))) 265 (repeat)))
261 266
262 "CCL program to encode ascii, eight-bit-control, latin-iso8859-1 and mule-unicode-*. into UTF-8.") 267 "CCL program to encode into UTF-8.
268 Only characters from the charsets ascii, eight-bit-control,
269 latin-iso8859-1 and mule-unicode-* are recognized. Others are encoded
270 as U+FFFD.")
263 271
264 (make-coding-system 272 (make-coding-system
265 'mule-utf-8 4 ?u 273 'mule-utf-8 4 ?u
266 "UTF-8 encoding for Emacs-supported Unicode characters. 274 "UTF-8 encoding for Emacs-supported Unicode characters.
267 Supported character sets are: 275 The supported Emacs character sets are:
268 ascii 276 ascii
269 eight-bit-control 277 eight-bit-control
270 eight-bit-graphic 278 eight-bit-graphic
271 latin-iso8859-1 279 latin-iso8859-1
272 mule-unicode-0100-24ff 280 mule-unicode-0100-24ff
273 mule-unicode-2500-33ff 281 mule-unicode-2500-33ff
274 mule-unicode-e000-ffff 282 mule-unicode-e000-ffff
275 283
276 Unicode characters out of these ranges are decoded 284 Unicode characters out of the ranges U+0000-U+33FF and U+E000-U+FFFF
277 into eight-bit-control or eight-bit-graphic." 285 are decoded into sequences of eight-bit-control and eight-bit-graphic
286 characters to preserve their byte sequences. Emacs characters out of
287 these ranges are encoded into U+FFFD.
288
289 Note that, currently, characters in the mule-unicode charsets have no
290 syntax and case information. Thus, for instance, upper- and
291 lower-casing commands won't work with them."
278 292
279 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) 293 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
280 '((safe-charsets 294 '((safe-charsets
281 ascii 295 ascii
282 eight-bit-control 296 eight-bit-control