Mercurial > emacs
changeset 35542:e4a75e66ee46
new file
author | Kenichi Handa <handa@m17n.org> |
---|---|
date | Thu, 25 Jan 2001 11:51:29 +0000 |
parents | b671f9509b3b |
children | c809110e1433 |
files | lisp/international/utf-8.el |
diffstat | 1 files changed, 290 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lisp/international/utf-8.el Thu Jan 25 11:51:29 2001 +0000 @@ -0,0 +1,290 @@ +;;; utf-8.el --- Limited UTF-8 decoding/encoding support + +;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. +;; Licensed to the Free Software Foundation. + +;; Keywords: multilingual, Unicode, UTF-8 + +;; This file is part of GNU Emacs. + +;; GNU Emacs is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 2, or (at your option) +;; any later version. + +;; GNU Emacs is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GNU Emacs; see the file COPYING. If not, write to the +;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, +;; Boston, MA 02111-1307, USA. + +;;; Commentary: + +;; The coding-system `mule-utf-8' supports encoding/decoding of the +;; following character sets: +;; +;; ascii +;; eight-bit-control +;; latin-iso8859-1 +;; mule-unicode-0100-24ff +;; mule-unicode-2500-33ff +;; mule-unicode-e000-ffff +;; +;; Characters of other character sets cannot be encoded with +;; mule-utf-8. +;; +;; On decoding, Unicode characters that do not fit in above character +;; sets are handled as `eight-bit-control' or `eight-bit-graphic' +;; characters to retain original information (i.e. original byte +;; sequence). 
;; Mapping between a Unicode scalar value and its UTF-8 byte sequence
;; (only the 1-, 2- and 3-byte forms are decoded into characters):
;;
;; scalar              |               utf-8
;; value               | 1st byte  | 2nd byte  | 3rd byte
;; --------------------+-----------+-----------+----------
;; 0000 0000 0xxx xxxx | 0xxx xxxx |           |
;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx |
;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx

;;; Code:

;; Decoder.  Reads one lead byte into r0, dispatches on its value to
;; the 1-, 2-, 3- or 4-byte branch, and writes either a character of
;; one of the supported charsets or the original bytes as eight-bit-*
;; characters.
;;
;; A mule-unicode-* character code is built as
;;   ((OFFSET / 96) + 32) << 7  |  ((OFFSET % 96) + 32)
;; where OFFSET is the scalar value minus the first scalar value of
;; the charset (#x0100, #x2500 or #xe000).
;;
;; NOTE(review): lead bytes are classified only by the three
;; comparisons below, and continuation bytes are masked without being
;; checked.  So bytes #x80..#xc1 enter the 2-byte branch and bytes
;; #xf8..#xff enter the 4-byte branch -- confirm whether malformed
;; input needs stricter handling.
(define-ccl-program ccl-decode-mule-utf-8
  ;;
  ;;        charset         | bytes in utf-8 | bytes in emacs
  ;; -----------------------+----------------+---------------
  ;;         ascii          |       1        |       1
  ;; -----------------------+----------------+---------------
  ;;    eight-bit-control   |       2        |       2
  ;;     latin-iso8859-1    |       2        |       2
  ;; -----------------------+----------------+---------------
  ;; mule-unicode-0100-24ff |       2        |       4
  ;;       (< 0800)         |                |
  ;; -----------------------+----------------+---------------
  ;; mule-unicode-0100-24ff |       3        |       4
  ;;       (>= 0800)        |                |
  ;; mule-unicode-2500-33ff |       3        |       4
  ;; mule-unicode-e000-ffff |       3        |       4
  ;;
  ;; Thus magnification factor is two.
  ;;
  `(2
    ((loop
      (read r0)

      ;; 1byte encoding, i.e., ascii
      (if (r0 < #x80)
          (write r0)

        ;; 2byte encoding
        (if (r0 < #xe0)
            ((read r1)
             (r0 &= #x1f)
             (r0 <<= 6)
             (r1 &= #x3f)
             (r1 += r0)
             ;; now r1 holds scalar value

             ;; eight-bit-control
             (if (r1 < 160)
                 ((r0 = ,(charset-id 'eight-bit-control))
                  (write-multibyte-character r0 r1))

               ;; latin-iso8859-1
               (if (r1 < 256)
                   ((r0 = ,(charset-id 'latin-iso8859-1))
                    (r1 -= 128)
                    (write-multibyte-character r0 r1))

                 ;; mule-unicode-0100-24ff (< 0800)
                 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
                  (r1 -= #x0100)
                  (r2 = (((r1 / 96) + 32) << 7))
                  (r1 %= 96)
                  (r1 += (r2 + 32))
                  (write-multibyte-character r0 r1)))))

          ;; 3byte encoding
          (if (r0 < #xf0)
              ((read r1 r2)
               (r3 = ((r0 & #x0f) << 12))
               (r3 += ((r1 & #x3f) << 6))
               (r3 += (r2 & #x3f))
               ;; now r3 holds scalar value

               ;; mule-unicode-0100-24ff (>= 0800)
               (if (r3 < #x2500)
                   ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
                    (r3 -= #x0100)
                    ;; `//=' leaves the quotient in r3 and the
                    ;; remainder in r7.
                    (r3 //= 96)
                    (r1 = (r7 + 32))
                    (r1 += ((r3 + 32) << 7))
                    (write-multibyte-character r0 r1))

                 ;; mule-unicode-2500-33ff
                 (if (r3 < #x3400)
                     ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
                      (r3 -= #x2500)
                      (r3 //= 96)
                      (r1 = (r7 + 32))
                      (r1 += ((r3 + 32) << 7))
                      (write-multibyte-character r0 r1))

                   ;; U+3400 .. U+DFFF
                   ;; keep those bytes as eight-bit-{control|graphic}
                   (if (r3 < #xe000)
                       (;; #xe0 < r0 < #xf0, so r0 is eight-bit-graphic
                        (r3 = ,(charset-id 'eight-bit-graphic))
                        (write-multibyte-character r3 r0)
                        ;; r3 is still eight-bit-graphic here, so
                        ;; only the control case needs an assignment.
                        (if (r1 < #xa0)
                            (r3 = ,(charset-id 'eight-bit-control)))
                        (write-multibyte-character r3 r1)
                        (if (r2 < #xa0)
                            (r3 = ,(charset-id 'eight-bit-control))
                          (r3 = ,(charset-id 'eight-bit-graphic)))
                        (write-multibyte-character r3 r2))

                     ;; mule-unicode-e000-ffff
                     ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
                      (r3 -= #xe000)
                      (r3 //= 96)
                      (r1 = (r7 + 32))
                      (r1 += ((r3 + 32) << 7))
                      (write-multibyte-character r0 r1))))))

            ;; 4byte encoding
            ;; keep those bytes as eight-bit-{control|graphic}
            ((read r1 r2 r3)
             ;; r0 >= #xf0, thus eight-bit-graphic
             (r4 = ,(charset-id 'eight-bit-graphic))
             (write-multibyte-character r4 r0)
             (if (r1 < #xa0)
                 (r4 = ,(charset-id 'eight-bit-control)))
             (write-multibyte-character r4 r1)
             (if (r2 < #xa0)
                 (r4 = ,(charset-id 'eight-bit-control))
               (r4 = ,(charset-id 'eight-bit-graphic)))
             (write-multibyte-character r4 r2)
             (if (r3 < #xa0)
                 (r4 = ,(charset-id 'eight-bit-control))
               (r4 = ,(charset-id 'eight-bit-graphic)))
             (write-multibyte-character r4 r3)))))

      (repeat))))

  "CCL program to decode UTF-8 into ascii, eight-bit-control, latin-iso8859-1 and mule-unicode-*.")

;; Encoder.  Reads one character as (charset-id, code) into (r0, r1),
;; dispatches on the charset, and writes the UTF-8 bytes.  For the
;; mule-unicode-* charsets the scalar value is first rebuilt from the
;; two 7-bit halves of the character code, inverting the formula used
;; by `ccl-decode-mule-utf-8'.
(define-ccl-program ccl-encode-mule-utf-8
  `(1
    (loop
     (read-multibyte-character r0 r1)

     (if (r0 == ,(charset-id 'ascii))
         (write r1)

       (if (r0 == ,(charset-id 'latin-iso8859-1))
           ;; r1  scalar               utf-8
           ;;     0000 0yyy yyxx xxxx  110y yyyy 10xx xxxx
           ;; 20  0000 0000 1010 0000  1100 0010 1010 0000
           ;; 7f  0000 0000 1111 1111  1100 0011 1011 1111
           ((r0 = (((r1 & #x40) >> 6) | #xc2))
            (r1 &= #x3f)
            (r1 |= #x80)
            (write r0 r1))

         (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
             ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
              ;; #x3f80 == (0011 1111 1000 0000)b
              (r1 &= #x7f)
              (r1 += (r0 + 224))        ; 224 == -32 + #x0100
              ;; now r1 holds scalar value
              (if (r1 < #x0800)
                  ;; 2byte encoding
                  ((r0 = (((r1 & #x07c0) >> 6) | #xc0))
                   ;; #x07c0 == (0000 0111 1100 0000)b
                   (r1 &= #x3f)
                   (r1 |= #x80)
                   (write r0 r1))
                ;; 3byte encoding
                ((r0 = (((r1 & #xf000) >> 12) | #xe0))
                 (r2 = ((r1 & #x3f) | #x80))
                 (r1 &= #x0fc0)
                 (r1 >>= 6)
                 (r1 |= #x80)
                 (write r0 r1 r2))))

           (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
               ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
                (r1 &= #x7f)
                (r1 += (r0 + 9440))     ; 9440 == -32 + #x2500
                (r0 = (((r1 & #xf000) >> 12) | #xe0))
                (r2 = ((r1 & #x3f) | #x80))
                (r1 &= #x0fc0)
                (r1 >>= 6)
                (r1 |= #x80)
                (write r0 r1 r2))

             (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
                 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
                  (r1 &= #x7f)
                  (r1 += (r0 + 57312))  ; 57312 == -32 + #xe000
                  (r0 = (((r1 & #xf000) >> 12) | #xe0))
                  (r2 = ((r1 & #x3f) | #x80))
                  (r1 &= #x0fc0)
                  (r1 >>= 6)
                  (r1 |= #x80)
                  (write r0 r1 r2))

               (if (r0 == ,(charset-id 'eight-bit-control))
                   ;; The code is written back as one raw byte, so
                   ;; byte sequences that the decoder kept as
                   ;; eight-bit-* characters are restored unchanged.
                   ;;
                   ;; NOTE(review): the decoder also produces
                   ;; eight-bit-control for the two-byte sequences
                   ;; whose scalar value is below 160.  Those are
                   ;; re-encoded here as a single byte, NOT as the
                   ;; two-byte form shown in this table -- confirm
                   ;; that this asymmetry is intended.
                   ;; r1  scalar               utf-8
                   ;;     0000 0yyy yyxx xxxx  110y yyyy 10xx xxxx
                   ;; 80  0000 0000 1000 0000  1100 0010 1000 0000
                   ;; 9f  0000 0000 1001 1111  1100 0010 1001 1111
                   (write r1)

                 (if (r0 == ,(charset-id 'eight-bit-graphic))
                     ;; Written back as one raw byte, like
                     ;; eight-bit-control above.  The table shows the
                     ;; UTF-8 form of the same scalar value for
                     ;; reference only; it is not what is emitted.
                     ;; r1  scalar               utf-8
                     ;;     0000 0yyy yyxx xxxx  110y yyyy 10xx xxxx
                     ;; a0  0000 0000 1010 0000  1100 0010 1010 0000
                     ;; ff  0000 0000 1111 1111  1100 0011 1011 1111
                     (write r1)

                   ;; unsupported character.
                   ;; output U+FFFD, which is `ef bf bd' in UTF-8
                   ;; actually it never reach here
                   ((write #xef)
                    (write #xbf)
                    (write #xbd)))))))))
     (repeat)))

  "CCL program to encode ascii, eight-bit-control, latin-iso8859-1 and mule-unicode-* into UTF-8.")

;; Define the coding system itself.  The decoder and encoder are the
;; two CCL programs above; `safe-charsets' lists the charsets that
;; the encoder handles explicitly.
(make-coding-system
 'mule-utf-8 4 ?u
 "UTF-8 encoding for Emacs-supported Unicode characters.
Supported character sets are:
   ascii
   eight-bit-control
   eight-bit-graphic
   latin-iso8859-1
   mule-unicode-0100-24ff
   mule-unicode-2500-33ff
   mule-unicode-e000-ffff

Unicode characters out of these ranges are decoded
into eight-bit-control or eight-bit-graphic."

 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
 '((safe-charsets
    ascii
    eight-bit-control
    eight-bit-graphic
    latin-iso8859-1
    mule-unicode-0100-24ff
    mule-unicode-2500-33ff
    mule-unicode-e000-ffff)
   (mime-charset . utf-8)))

;; Make `utf-8' another name for `mule-utf-8'.
(define-coding-system-alias 'utf-8 'mule-utf-8)