Mercurial > emacs
diff lisp/international/utf-8.el @ 36243:a05ae5420f85
Doc and commentary fixes.
author | Dave Love <fx@gnu.org> |
---|---|
date | Tue, 20 Feb 2001 20:55:06 +0000 |
parents | e4a75e66ee46 |
children | f6bb3ed752b4 |
line wrap: on
line diff
--- a/lisp/international/utf-8.el Tue Feb 20 20:05:13 2001 +0000
+++ b/lisp/international/utf-8.el Tue Feb 20 20:55:06 2001 +0000
@@ -3,7 +3,7 @@
 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
 ;; Licensed to the Free Software Foundation.
 
-;; Keywords: multilingual, Unicode, UTF-8
+;; Keywords: multilingual, Unicode, UTF-8, i18n
 
 ;; This file is part of GNU Emacs.
 
@@ -25,7 +25,7 @@
 ;;; Commentary:
 
 ;; The coding-system `mule-utf-8' supports encoding/decoding of the
-;; following character sets:
+;; following character sets to and from UTF-8:
 ;;
 ;; ascii
 ;; eight-bit-control
@@ -35,12 +35,16 @@
 ;; mule-unicode-e000-ffff
 ;;
 ;; Characters of other character sets cannot be encoded with
-;; mule-utf-8.
+;; mule-utf-8. Note that the mule-unicode charsets currently lack
+;; case and syntax information, so things like `downcase' will only
+;; work for characters from ASCII and Latin-1.
 ;;
-;; On decoding, Unicode characters that do not fit in above character
-;; sets are handled as `eight-bit-control' or `eight-bit-graphic'
-;; characters to retain original information (i.e. original byte
-;; sequence).
+;; On decoding, Unicode characters that do not fit into the above
+;; character sets are handled as `eight-bit-control' or
+;; `eight-bit-graphic' characters to retain the information about the
+;; original byte sequence.
+
+;; UTF-8 is defined in RFC 2279. A sketch of the encoding is:
 
 ;; scalar | utf-8
 ;; value | 1st byte | 2nd byte | 3rd byte
@@ -174,7 +178,9 @@
 
 (repeat))))
 
- "CCL program to decode UTF-8 into ascii, eight-bit-control, latin-iso8859-1 and mule-unicode-*.")
+ "CCL program to decode UTF-8.
+Decoding is done into the charsets ascii, eight-bit-control,
+latin-iso8859-1 and mule-unicode-* only.")
 
 (define-ccl-program ccl-encode-mule-utf-8
 `(1
@@ -251,20 +257,22 @@
 ;; ff 0000 0000 1111 1111 1101 1111 1011 1111
 (write r1)
 
- ;; unsupported character.
- ;; output U+FFFD, which is `ef bf bd' in UTF-8
- ;; actually it never reach here
+ ;; Unsupported character.
+ ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
 ((write #xef)
 (write #xbf)
 (write #xbd)))))))))
 (repeat)))
 
- "CCL program to encode ascii, eight-bit-control, latin-iso8859-1 and mule-unicode-*. into UTF-8.")
+ "CCL program to encode into UTF-8.
+Only characters from the charsets ascii, eight-bit-control,
+latin-iso8859-1 and mule-unicode-* are recognized. Others are encoded
+as U+FFFD.")
 
 (make-coding-system
 'mule-utf-8 4 ?u
 "UTF-8 encoding for Emacs-supported Unicode characters.
-Supported character sets are:
+The supported Emacs character sets are:
 ascii
 eight-bit-control
 eight-bit-graphic
@@ -273,8 +281,14 @@
 mule-unicode-2500-33ff
 mule-unicode-e000-ffff
 
-Unicode characters out of these ranges are decoded
-into eight-bit-control or eight-bit-graphic."
+Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF
+are decoded into sequences of eight-bit-control and eight-bit-graphic
+characters to preserve their byte sequences. Emacs characters out of
+these ranges are encoded into U+FFFD.
+
+Note that, currently, characters in the mule-unicode charsets have no
+syntax and case information. Thus, for instance, upper- and
+lower-casing commands won't work with them."
 
 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
 '((safe-charsets