Mercurial > emacs
annotate lisp/international/utf-8.el @ 45625:28f0b229040c
Initial revision
author | Thien-Thi Nguyen <ttn@gnuvola.org> |
---|---|
date | Mon, 03 Jun 2002 03:15:34 +0000 |
parents | c3c4e09c3eab |
children | 395e5c46761b |
rev | line source |
---|---|
44411 | 1 ;;; utf-8.el --- limited UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*- |
35542 | 2 |
3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. | |
4 ;; Licensed to the Free Software Foundation. | |
37934
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
5 ;; Copyright (C) 2001 Free Software Foundation, Inc. |
35542 | 6 |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
7 ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org> |
36243 | 8 ;; Keywords: multilingual, Unicode, UTF-8, i18n |
35542 | 9 |
10 ;; This file is part of GNU Emacs. | |
11 | |
12 ;; GNU Emacs is free software; you can redistribute it and/or modify | |
13 ;; it under the terms of the GNU General Public License as published by | |
14 ;; the Free Software Foundation; either version 2, or (at your option) | |
15 ;; any later version. | |
16 | |
17 ;; GNU Emacs is distributed in the hope that it will be useful, | |
18 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
20 ;; GNU General Public License for more details. | |
21 | |
22 ;; You should have received a copy of the GNU General Public License | |
23 ;; along with GNU Emacs; see the file COPYING. If not, write to the | |
24 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
25 ;; Boston, MA 02111-1307, USA. | |
26 | |
27 ;;; Commentary: | |
28 | |
41873 | 29 ;; The coding-system `mule-utf-8' basically supports encoding/decoding |
30 ;; of the following character sets to and from UTF-8: | |
35542 | 31 ;; |
32 ;; ascii | |
33 ;; eight-bit-control | |
34 ;; latin-iso8859-1 | |
35 ;; mule-unicode-0100-24ff | |
36 ;; mule-unicode-2500-33ff | |
37 ;; mule-unicode-e000-ffff | |
38 ;; | |
36243 | 39 ;; On decoding, Unicode characters that do not fit into the above |
40 ;; character sets are handled as `eight-bit-control' or | |
41 ;; `eight-bit-graphic' characters to retain the information about the | |
42 ;; original byte sequence. | |
41873 | 43 ;; |
44 ;; Characters from other character sets can be encoded with | |
45 ;; mule-utf-8 by populating the table `ucs-mule-to-mule-unicode' and | |
46 ;; registering the translation with `register-char-codings'. | |
36243 | 47 |
48 ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is: | |
35542 | 49 |
50 ;; scalar | utf-8 | |
51 ;; value | 1st byte | 2nd byte | 3rd byte | |
52 ;; --------------------+-----------+-----------+---------- | |
53 ;; 0000 0000 0xxx xxxx | 0xxx xxxx | | | |
54 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx | | |
55 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx | |
56 | |
57 ;;; Code: | |
58 | |
41873 | 59 (defvar ucs-mule-to-mule-unicode (make-translation-table) |
60 "Translation table for encoding to `mule-utf-8'.") | |
61 ;; The table may already have been defined if ucs-tables was loaded earlier. | |
62 (unless (get 'ucs-mule-to-mule-unicode 'translation-table) | |
63 (define-translation-table 'ucs-mule-to-mule-unicode ucs-mule-to-mule-unicode)) | |
35542 | 64 (define-ccl-program ccl-decode-mule-utf-8 |
65 ;; | |
66 ;; charset | bytes in utf-8 | bytes in emacs | |
67 ;; -----------------------+----------------+--------------- | |
68 ;; ascii | 1 | 1 | |
69 ;; -----------------------+----------------+--------------- | |
70 ;; eight-bit-control | 2 | 2 | |
41873 | 71 ;; eight-bit-graphic | 2 | 1 |
35542 | 72 ;; latin-iso8859-1 | 2 | 2 |
73 ;; -----------------------+----------------+--------------- | |
74 ;; mule-unicode-0100-24ff | 2 | 4 | |
75 ;; (< 0800) | | | |
76 ;; -----------------------+----------------+--------------- | |
77 ;; mule-unicode-0100-24ff | 3 | 4 | |
78 ;; (>= 0800) | | | |
79 ;; mule-unicode-2500-33ff | 3 | 4 | |
80 ;; mule-unicode-e000-ffff | 3 | 4 | |
81 ;; | |
82 ;; Thus magnification factor is two. | |
83 ;; | |
84 `(2 | |
37934
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
85 ((r5 = ,(charset-id 'eight-bit-control)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
86 (r6 = ,(charset-id 'eight-bit-graphic)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
87 (loop |
35542 | 88 (read r0) |
89 | |
90 ;; 1byte encoding, i.e., ascii | |
91 (if (r0 < #x80) | |
92 (write r0) | |
93 | |
37934
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
94 ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx |
35542 | 95 (if (r0 < #xe0) |
96 ((read r1) | |
97 | |
37934
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
98 (if ((r1 & #b11000000) != #b10000000) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
99 ;; Invalid 2-byte sequence |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
100 ((if (r0 < #xa0) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
101 (write-multibyte-character r5 r0) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
102 (write-multibyte-character r6 r0)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
103 (if (r1 < #x80) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
104 (write r1) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
105 (if (r1 < #xa0) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
106 (write-multibyte-character r5 r1) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
107 (write-multibyte-character r6 r1)))) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
108 |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
109 ((r0 &= #x1f) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
110 (r0 <<= 6) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
111 (r1 &= #x3f) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
112 (r1 += r0) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
113 ;; Now r1 holds scalar value |
35542 | 114 |
37934
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
115 ;; eight-bit-control |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
116 (if (r1 < 160) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
117 ((write-multibyte-character r5 r1)) |
35542 | 118 |
37934
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
119 ;; latin-iso8859-1 |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
120 (if (r1 < 256) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
121 ((r0 = ,(charset-id 'latin-iso8859-1)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
122 (r1 -= 128) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
123 (write-multibyte-character r0 r1)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
124 |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
125 ;; mule-unicode-0100-24ff (< 0800) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
126 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
127 (r1 -= #x0100) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
128 (r2 = (((r1 / 96) + 32) << 7)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
129 (r1 %= 96) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
130 (r1 += (r2 + 32)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
131 (write-multibyte-character r0 r1))))))) |
35542 | 132 |
133 ;; 3byte encoding | |
37934
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
134 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx |
35542 | 135 (if (r0 < #xf0) |
136 ((read r1 r2) | |
37934
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
137 |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
138 ;; This is set to 1 if the encoding is invalid. |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
139 (r4 = 0) |
35542 | 140 |
37934
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
141 (r3 = (r1 & #b11000000)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
142 (r3 |= ((r2 >> 2) & #b00110000)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
143 (if (r3 != #b10100000) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
144 (r4 = 1) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
145 ((r3 = ((r0 & #x0f) << 12)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
146 (r3 += ((r1 & #x3f) << 6)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
147 (r3 += (r2 & #x3f)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
148 (if (r3 < #x0800) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
149 (r4 = 1)))) |
35542 | 150 |
37934
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
151 (if (r4 != 0) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
152 ;; Invalid 3-byte sequence |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
153 ((if (r0 < #xa0) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
154 (write-multibyte-character r5 r0) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
155 (write-multibyte-character r6 r0)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
156 (if (r1 < #x80) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
157 (write r1) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
158 (if (r1 < #xa0) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
159 (write-multibyte-character r5 r1) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
160 (write-multibyte-character r6 r1))) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
161 (if (r2 < #x80) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
162 (write r2) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
163 (if (r2 < #xa0) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
164 (write-multibyte-character r5 r2) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
165 (write-multibyte-character r6 r2)))) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
166 |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
167 ;; mule-unicode-0100-24ff (>= 0800) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
168 ((if (r3 < #x2500) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
169 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
170 (r3 -= #x0100) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
171 (r3 //= 96) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
172 (r1 = (r7 + 32)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
173 (r1 += ((r3 + 32) << 7)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
174 (write-multibyte-character r0 r1)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
175 |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
176 ;; mule-unicode-2500-33ff |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
177 (if (r3 < #x3400) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
178 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
179 (r3 -= #x2500) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
180 (r3 //= 96) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
181 (r1 = (r7 + 32)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
182 (r1 += ((r3 + 32) << 7)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
183 (write-multibyte-character r0 r1)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
184 |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
185 ;; U+3400 .. U+DFFF |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
186 ;; keep those bytes as eight-bit-{control|graphic} |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
187 (if (r3 < #xe000) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
188 ( ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
189 (r3 = r6) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
190 (write-multibyte-character r3 r0) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
191 (if (r1 < #xa0) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
192 (r3 = r5)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
193 (write-multibyte-character r3 r1) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
194 (if (r2 < #xa0) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
195 (r3 = r5) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
196 (r3 = r6)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
197 (write-multibyte-character r3 r2)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
198 |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
199 ;; mule-unicode-e000-ffff |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
200 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
201 (r3 -= #xe000) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
202 (r3 //= 96) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
203 (r1 = (r7 + 32)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
204 (r1 += ((r3 + 32) << 7)) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
205 (write-multibyte-character r0 r1)))))))) |
35542 | 206 |
207 ;; 4byte encoding | |
208 ;; keep those bytes as eight-bit-{control|graphic} | |
209 ((read r1 r2 r3) | |
210 ;; r0 >= #xf0, thus eight-bit-graphic | |
37934
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
211 (write-multibyte-character r6 r0) |
35542 | 212 (if (r1 < #xa0) |
37934
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
213 (write-multibyte-character r5 r1) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
214 (write-multibyte-character r6 r1)) |
35542 | 215 (if (r2 < #xa0) |
37934
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
216 (write-multibyte-character r5 r2) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
217 (write-multibyte-character r6 r2)) |
35542 | 218 (if (r3 < #xa0) |
37934
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
219 (write-multibyte-character r5 r3) |
88389fa9b713
(ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents:
37097
diff
changeset
|
220 (write-multibyte-character r6 r3)))))) |
35542 | 221 |
222 (repeat)))) | |
223 | |
36243 | 224 "CCL program to decode UTF-8. |
36465 | 225 Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
226 mule-unicode-*. Encodings of un-representable Unicode characters are | |
227 decoded asis into eight-bit-control and eight-bit-graphic | |
228 characters.") | |
35542 | 229 |
230 (define-ccl-program ccl-encode-mule-utf-8 | |
231 `(1 | |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
232 ((r5 = -1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
233 (loop |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
234 (if (r5 < 0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
235 ((r1 = -1) |
41873 | 236 (read-multibyte-character r0 r1) |
237 (translate-character ucs-mule-to-mule-unicode r0 r1)) | |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
238 (;; We have already done read-multibyte-character. |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
239 (r0 = r5) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
240 (r1 = r6) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
241 (r5 = -1))) |
35542 | 242 |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
243 (if (r0 == ,(charset-id 'ascii)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
244 (write r1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
245 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
246 (if (r0 == ,(charset-id 'latin-iso8859-1)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
247 ;; r1 scalar utf-8 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
248 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
249 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
250 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
251 ((r0 = (((r1 & #x40) >> 6) | #xc2)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
252 (r1 &= #x3f) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
253 (r1 |= #x80) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
254 (write r0 r1)) |
35542 | 255 |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
256 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
257 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
258 ;; #x3f80 == (0011 1111 1000 0000)b |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
259 (r1 &= #x7f) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
260 (r1 += (r0 + 224)) ; 224 == -32 + #x0100 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
261 ;; now r1 holds scalar value |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
262 (if (r1 < #x0800) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
263 ;; 2byte encoding |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
264 ((r0 = (((r1 & #x07c0) >> 6) | #xc0)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
265 ;; #x07c0 == (0000 0111 1100 0000)b |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
266 (r1 &= #x3f) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
267 (r1 |= #x80) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
268 (write r0 r1)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
269 ;; 3byte encoding |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
270 ((r0 = (((r1 & #xf000) >> 12) | #xe0)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
271 (r2 = ((r1 & #x3f) | #x80)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
272 (r1 &= #x0fc0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
273 (r1 >>= 6) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
274 (r1 |= #x80) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
275 (write r0 r1 r2)))) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
276 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
277 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
278 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
279 (r1 &= #x7f) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
280 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
281 (r0 = (((r1 & #xf000) >> 12) | #xe0)) |
35542 | 282 (r2 = ((r1 & #x3f) | #x80)) |
283 (r1 &= #x0fc0) | |
284 (r1 >>= 6) | |
285 (r1 |= #x80) | |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
286 (write r0 r1 r2)) |
35542 | 287 |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
288 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
289 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
290 (r1 &= #x7f) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
291 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
292 (r0 = (((r1 & #xf000) >> 12) | #xe0)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
293 (r2 = ((r1 & #x3f) | #x80)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
294 (r1 &= #x0fc0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
295 (r1 >>= 6) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
296 (r1 |= #x80) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
297 (write r0 r1 r2)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
298 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
299 (if (r0 == ,(charset-id 'eight-bit-control)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
300 ;; r1 scalar utf-8 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
301 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
302 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
303 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
304 ((write #xc2) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
305 (write r1)) |
35542 | 306 |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
307 (if (r0 == ,(charset-id 'eight-bit-graphic)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
308 ;; r1 scalar utf-8 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
309 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
310 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
311 ;; ff 0000 0000 1111 1111 1101 1111 1011 1111 |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
312 ((write r1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
313 (r1 = -1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
314 (read-multibyte-character r0 r1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
315 (if (r0 != ,(charset-id 'eight-bit-graphic)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
316 (if (r0 != ,(charset-id 'eight-bit-control)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
317 ((r5 = r0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
318 (r6 = r1)))) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
319 (if (r5 < 0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
320 ((read-multibyte-character r0 r2) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
321 (if (r0 != ,(charset-id 'eight-bit-graphic)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
322 (if (r0 != ,(charset-id 'eight-bit-control)) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
323 ((r5 = r0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
324 (r6 = r2)))) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
325 (if (r5 < 0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
326 (write r1 r2) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
327 (if (r1 < #xa0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
328 (write r1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
329 ((write #xc2) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
330 (write r1))))))) |
35542 | 331 |
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
332 ;; Unsupported character. |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
333 ;; Output U+FFFD, which is `ef bf bd' in UTF-8. |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
334 ((write #xef) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
335 (write #xbf) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
336 (write #xbd))))))))) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
337 (repeat))) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
338 (if (r1 >= #xa0) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
339 (write r1) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
340 (if (r1 >= #x80) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
341 ((write #xc2) |
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
342 (write r1))))) |
35542 | 343 |
36243 | 344 "CCL program to encode into UTF-8. |
345 Only characters from the charsets ascii, eight-bit-control, | |
36465 | 346 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized. |
347 Others are encoded as U+FFFD.") | |
35542 | 348 |
;; Placeholder translation table so that the CCL programs referring to
;; it can be checked correctly; the actual data are loaded on demand.
;; An existing binding is left alone so real data is never zapped.
(or (boundp 'ucs-mule-8859-to-mule-unicode)
    (define-translation-table 'ucs-mule-8859-to-mule-unicode))
353 | |
(defsubst utf-8-untranslated-to-ucs ()
  "Return the UCS code point of the untranslated UTF-8 sequence at point.
The characters at point and the following positions are taken as the
bytes of a 3- or 4-byte UTF-8 sequence.  A lead byte below #xF0 selects
the 3-byte form; otherwise the 4-byte form is used.
Return nil if fewer than three characters are available at point, or
if the 4-byte form is selected and its fourth byte is missing."
  (let ((b1 (char-after))
        (b2 (char-after (1+ (point))))
        (b3 (char-after (+ 2 (point))))
        ;; The bytes of the sequence sit at offsets 0, 1, 2 and 3 from
        ;; point, so the fourth byte is at point + 3.
        (b4 (char-after (+ 3 (point)))))
    (if (and b1 b2 b3)
        (cond ((< b1 ?\xf0)
               ;; 3-byte form: 1110zzzz 10yyyyyy 10xxxxxx
               (setq b2 (lsh (logand b2 ?\x3f) 6))
               (setq b3 (logand b3 ?\x3f))
               (logior b3 (logior b2 (lsh (logand b1 ?\x0f) 12))))
              (b4
               ;; 4-byte form: 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx
               (setq b2 (lsh (logand b2 ?\x3f) 12))
               (setq b3 (lsh (logand b3 ?\x3f) 6))
               (setq b4 (logand b4 ?\x3f))
               (logior b4 (logior b3 (logior b2 (lsh (logand b1 ?\x07)
                                                     18)))))))))
370 | |
(defun utf-8-help-echo (window object position)
  "Return help text naming the untranslated Unicode character at POSITION.
The code point is read from the `untranslated-utf-8' property at
POSITION in OBJECT and shown in hexadecimal.  WINDOW is not used."
  (let ((code-point (get-char-property position 'untranslated-utf-8 object)))
    (format "Untranslated Unicode U+%04X" code-point)))
374 | |
(defvar utf-8-subst-table nil
  "If non-nil, a hash table mapping untranslatable UTF-8 to Emacs characters.
`utf-8-compose' looks up the UCS code point of an untranslated
sequence in this table and, when an entry exists, composes the
sequence to display as that character.")
377 | |
378 ;; We compose the untranslatable sequences into a single character. | |
379 ;; This is infelicitous for editing, because there's currently no | |
380 ;; mechanism for treating compositions as atomic, but is OK for | |
381 ;; display. We try to compose an appropriate character from a hash | |
382 ;; table of CJK characters to display correctly. Otherwise we use | |
383 ;; U+FFFD. What we really should have is hash table lookup from CCL | |
384 ;; so that we could do this properly. This function GCs too much. | |
(defsubst utf-8-compose ()
  "Put a suitable composition on an untranslatable sequence.
The sequence at point is decoded with `utf-8-untranslated-to-ucs'.  Its
code point is stored in the `untranslated-utf-8' text property and the
sequence is composed to display as a single character: the entry for
the code point in `utf-8-subst-table' if there is one, otherwise
U+FFFD together with a `help-echo' property naming the code point.
Return the sequence's length, or nil if no sequence could be decoded
at point."
  (let* ((u (utf-8-untranslated-to-ucs))
         ;; Code points from U+10000 up come from 4-byte sequences.
         (l (and u (if (>= u ?\x10000)
                       4
                     3)))
         ;; One end position, clamped to the accessible region, shared
         ;; by the property and composition calls below.
         (end (and l (min (point-max) (+ l (point)))))
         (subst (and utf-8-subst-table (gethash u utf-8-subst-table))))
    (when u
      (put-text-property (point) end 'untranslated-utf-8 u)
      (unless subst
        (put-text-property (point) end 'help-echo 'utf-8-help-echo)
        ;; Fall back to U+FFFD.  It is built from its code point so the
        ;; source does not depend on an encoded character literal.
        (setq subst (decode-char 'ucs #xfffd)))
      (compose-region (point) end subst)
      l)))
402 | |
(defcustom utf-8-compose-scripts nil
  "*Non-nil means compose various scripts on decoding utf-8 text."
  :group 'mule
  :type 'boolean)                       ; omitted in Emacs 21.1
407 | |
(defun utf-8-post-read-conversion (length)
  "Compose untranslated utf-8 sequences into single characters.
Also compose particular scripts if `utf-8-compose-scripts' is non-nil.
LENGTH is the number of characters of decoded text, starting at point.
Return the length of the text after conversion, as reported by the
script-specific conversion functions when they are run."
  (save-excursion
    (save-restriction
      ;; Confine the scan to the decoded text so that text following
      ;; the decoded region is never composed.
      (narrow-to-region (point) (min (point-max) (+ (point) length)))
      ;; Can't do eval-when-compile to insert a multibyte constant
      ;; version of the string in the loop, since it's always loaded as
      ;; unibyte from a byte-compiled file.
      (let ((range (string-as-multibyte "^\341-\377")))
        ;; `skip-chars-forward' always returns a number, so the loop
        ;; ends only when the end of the narrowed region is reached.
        (while (and (skip-chars-forward
                     range)
                    (not (eobp)))
          ;; `utf-8-compose' returns nil when nothing can be decoded at
          ;; point; `forward-char' then moves by one character.
          (forward-char (utf-8-compose))))))
  ;; Fixme: Takahashi-san implies it may not work this easily -- needs
  ;; checking with him.
  (when (and utf-8-compose-scripts (> length 1))
    ;; These currently have definitions which cover the relevant
    ;; Unicodes.  We could avoid loading thai-util &c by checking
    ;; whether the region contains any characters with the appropriate
    ;; categories.  There aren't yet Unicode-based rules for Tibetan.
    (save-excursion (setq length (diacritic-post-read-conversion length)))
    (save-excursion (setq length (thai-post-read-conversion length)))
    (save-excursion (setq length (lao-post-read-conversion length)))
    (save-excursion (setq length (devanagari-post-read-conversion length))))
  length)
432 | |
(defun utf-8-pre-write-conversion (beg end)
  "Load `ucs-tables', then drop this hook from `mule-utf-8'.
This semi-dummy pre-write function exists so that the translation
tables get loaded.  BEG and END are not used.  Always return nil."
  (prog1 nil
    ;; Make sure the translation table is loaded.
    (require 'ucs-tables)
    ;; Clear the property so this does not happen again.
    (coding-system-put 'mule-utf-8 'pre-write-conversion nil)))
440 | |
;; Define the `mule-utf-8' coding system on top of the two CCL programs,
;; then make `utf-8' an alias for it.
;; NOTE(review): type 4 is presumably the CCL-based coding system type;
;; confirm against the `make-coding-system' documentation.
(make-coding-system
 'mule-utf-8 4 ?u
 "UTF-8 encoding for Emacs-supported Unicode characters.
The supported Emacs character sets are the following, plus others
which may be included in the translation table
`ucs-mule-to-mule-unicode':
ascii
eight-bit-control
eight-bit-graphic
latin-iso8859-1
latin-iso8859-2
latin-iso8859-3
latin-iso8859-4
cyrillic-iso8859-5
greek-iso8859-7
hebrew-iso8859-8
latin-iso8859-9
latin-iso8859-14
latin-iso8859-15
mule-unicode-0100-24ff
mule-unicode-2500-33ff
mule-unicode-e000-ffff

Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF
are decoded into sequences of eight-bit-control and eight-bit-graphic
characters to preserve their byte sequences and composed to display as
a single character. Emacs characters that can't be encoded to these
ranges are encoded as U+FFFD."

 ;; (DECODER . ENCODER) pair of CCL programs.
 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
 '((safe-charsets
    ascii
    eight-bit-control
    eight-bit-graphic
    latin-iso8859-1
    latin-iso8859-15
    latin-iso8859-14
    latin-iso8859-9
    hebrew-iso8859-8
    greek-iso8859-7
    cyrillic-iso8859-5
    latin-iso8859-4
    latin-iso8859-3
    latin-iso8859-2
    vietnamese-viscii-lower
    vietnamese-viscii-upper
    thai-tis620
    ipa
    ethiopic
    indian-is13194
    katakana-jisx0201
    chinese-sisheng
    lao
    mule-unicode-0100-24ff
    mule-unicode-2500-33ff
    mule-unicode-e000-ffff)
   (mime-charset . utf-8)
   (coding-category . coding-category-utf-8)
   (valid-codes (0 . 255))
   ;; Hooks run around writing and reading with this coding system.
   (pre-write-conversion . utf-8-pre-write-conversion)
   (post-read-conversion . utf-8-post-read-conversion)))

(define-coding-system-alias 'utf-8 'mule-utf-8)
38436
b174db545cfd
Some fixes to follow coding conventions.
Pavel Janík <Pavel@Janik.cz>
parents:
37934
diff
changeset
|
504 |
41873 | 505 ;; I think this needs special private charsets defined for the |
506 ;; untranslated sequences, if it's going to work well. | |
507 | |
508 ;;; (defun utf-8-compose-function (pos to pattern &optional string) | |
509 ;;; (let* ((prop (get-char-property pos 'composition string)) | |
510 ;;; (l (and prop (- (cadr prop) (car prop))))) | |
511 ;;; (cond ((and l (> l (- to pos))) | |
512 ;;; (delete-region pos to)) | |
513 ;;; ((and (> (char-after pos) 224) | |
514 ;;; (< (char-after pos) 256) | |
515 ;;; (save-restriction | |
516 ;;; (narrow-to-region pos to) | |
517 ;;; (utf-8-compose))) | |
518 ;;; t)))) | |
519 | |
520 ;;; (dotimes (i 96) | |
521 ;;; (aset composition-function-table | |
522 ;;; (+ 128 i) | |
523 ;;; `((,(string-as-multibyte "[\200-\237\240-\377]") | |
524 ;;; . utf-8-compose-function)))) | |
525 | |
38436
b174db545cfd
Some fixes to follow coding conventions.
Pavel Janík <Pavel@Janik.cz>
parents:
37934
diff
changeset
|
526 ;;; utf-8.el ends here |