Mercurial > emacs
comparison lisp/international/utf-8.el @ 37934:88389fa9b713
(ccl-decode-mule-utf-8): Handle invalid UTF-8 sequences.
author | Gerd Moellmann <gerd@gnu.org> |
---|---|
date | Tue, 29 May 2001 08:38:35 +0000 |
parents | b095952a8678 |
children | b174db545cfd |
comparison
equal
deleted
inserted
replaced
37933:f7113f2126e9 | 37934:88389fa9b713 |
---|---|
1 ;;; utf-8.el --- Limited UTF-8 decoding/encoding support | 1 ;;; utf-8.el --- Limited UTF-8 decoding/encoding support |
2 | 2 |
3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. | 3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. |
4 ;; Licensed to the Free Software Foundation. | 4 ;; Licensed to the Free Software Foundation. |
5 ;; Copyright (C) 2001 Free Software Foundation, Inc. | |
5 | 6 |
6 ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org> | 7 ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org> |
7 ;; Keywords: multilingual, Unicode, UTF-8, i18n | 8 ;; Keywords: multilingual, Unicode, UTF-8, i18n |
8 | 9 |
9 ;; This file is part of GNU Emacs. | 10 ;; This file is part of GNU Emacs. |
74 ;; mule-unicode-e000-ffff | 3 | 4 | 75 ;; mule-unicode-e000-ffff | 3 | 4 |
75 ;; | 76 ;; |
76 ;; Thus magnification factor is two. | 77 ;; Thus magnification factor is two. |
77 ;; | 78 ;; |
78 `(2 | 79 `(2 |
79 ((loop | 80 ((r5 = ,(charset-id 'eight-bit-control)) |
81 (r6 = ,(charset-id 'eight-bit-graphic)) | |
82 (loop | |
80 (read r0) | 83 (read r0) |
81 | 84 |
82 ;; 1byte encoding, i.e., ascii | 85 ;; 1byte encoding, i.e., ascii |
83 (if (r0 < #x80) | 86 (if (r0 < #x80) |
84 (write r0) | 87 (write r0) |
85 | 88 |
86 ;; 2byte encoding | 89 ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx |
87 (if (r0 < #xe0) | 90 (if (r0 < #xe0) |
88 ((read r1) | 91 ((read r1) |
89 (r0 &= #x1f) | 92 |
90 (r0 <<= 6) | 93 (if ((r1 & #b11000000) != #b10000000) |
91 (r1 &= #x3f) | 94 ;; Invalid 2-byte sequence |
92 (r1 += r0) | 95 ((if (r0 < #xa0) |
93 ;; now r1 holds scalar value | 96 (write-multibyte-character r5 r0) |
94 | 97 (write-multibyte-character r6 r0)) |
95 ;; eight-bit-control | 98 (if (r1 < #x80) |
96 (if (r1 < 160) | 99 (write r1) |
97 ((r0 = ,(charset-id 'eight-bit-control)) | 100 (if (r1 < #xa0) |
98 (write-multibyte-character r0 r1)) | 101 (write-multibyte-character r5 r1) |
99 | 102 (write-multibyte-character r6 r1)))) |
100 ;; latin-iso8859-1 | 103 |
101 (if (r1 < 256) | 104 ((r0 &= #x1f) |
102 ((r0 = ,(charset-id 'latin-iso8859-1)) | 105 (r0 <<= 6) |
103 (r1 -= 128) | 106 (r1 &= #x3f) |
104 (write-multibyte-character r0 r1)) | 107 (r1 += r0) |
105 | 108 ;; Now r1 holds scalar value |
106 ;; mule-unicode-0100-24ff (< 0800) | 109 |
107 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | 110 ;; eight-bit-control |
108 (r1 -= #x0100) | 111 (if (r1 < 160) |
109 (r2 = (((r1 / 96) + 32) << 7)) | 112 ((write-multibyte-character r5 r1)) |
110 (r1 %= 96) | 113 |
111 (r1 += (r2 + 32)) | 114 ;; latin-iso8859-1 |
112 (write-multibyte-character r0 r1))))) | 115 (if (r1 < 256) |
116 ((r0 = ,(charset-id 'latin-iso8859-1)) | |
117 (r1 -= 128) | |
118 (write-multibyte-character r0 r1)) | |
119 | |
120 ;; mule-unicode-0100-24ff (< 0800) | |
121 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
122 (r1 -= #x0100) | |
123 (r2 = (((r1 / 96) + 32) << 7)) | |
124 (r1 %= 96) | |
125 (r1 += (r2 + 32)) | |
126 (write-multibyte-character r0 r1))))))) | |
113 | 127 |
114 ;; 3byte encoding | 128 ;; 3byte encoding |
129 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx | |
115 (if (r0 < #xf0) | 130 (if (r0 < #xf0) |
116 ((read r1 r2) | 131 ((read r1 r2) |
117 (r3 = ((r0 & #x0f) << 12)) | 132 |
118 (r3 += ((r1 & #x3f) << 6)) | 133 ;; This is set to 1 if the encoding is invalid. |
119 (r3 += (r2 & #x3f)) | 134 (r4 = 0) |
120 ;; now r3 holds scalar value | 135 |
121 | 136 (r3 = (r1 & #b11000000)) |
122 ;; mule-unicode-0100-24ff (>= 0800) | 137 (r3 |= ((r2 >> 2) & #b00110000)) |
123 (if (r3 < #x2500) | 138 (if (r3 != #b10100000) |
124 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | 139 (r4 = 1) |
125 (r3 -= #x0100) | 140 ((r3 = ((r0 & #x0f) << 12)) |
126 (r3 //= 96) | 141 (r3 += ((r1 & #x3f) << 6)) |
127 (r1 = (r7 + 32)) | 142 (r3 += (r2 & #x3f)) |
128 (r1 += ((r3 + 32) << 7)) | 143 (if (r3 < #x0800) |
129 (write-multibyte-character r0 r1)) | 144 (r4 = 1)))) |
130 | 145 |
131 ;; mule-unicode-2500-33ff | 146 (if (r4 != 0) |
132 (if (r3 < #x3400) | 147 ;; Invalid 3-byte sequence |
133 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) | 148 ((if (r0 < #xa0) |
134 (r3 -= #x2500) | 149 (write-multibyte-character r5 r0) |
135 (r3 //= 96) | 150 (write-multibyte-character r6 r0)) |
136 (r1 = (r7 + 32)) | 151 (if (r1 < #x80) |
137 (r1 += ((r3 + 32) << 7)) | 152 (write r1) |
138 (write-multibyte-character r0 r1)) | 153 (if (r1 < #xa0) |
139 | 154 (write-multibyte-character r5 r1) |
140 ;; U+3400 .. U+DFFF | 155 (write-multibyte-character r6 r1))) |
141 ;; keep those bytes as eight-bit-{control|graphic} | 156 (if (r2 < #x80) |
142 (if (r3 < #xe000) | 157 (write r2) |
143 (;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic | 158 (if (r2 < #xa0) |
144 (r3 = ,(charset-id 'eight-bit-graphic)) | 159 (write-multibyte-character r5 r2) |
145 (write-multibyte-character r3 r0) | 160 (write-multibyte-character r6 r2)))) |
146 (if (r1 < #xa0) | 161 |
147 (r3 = ,(charset-id 'eight-bit-control))) | 162 ;; mule-unicode-0100-24ff (>= 0800) |
148 (write-multibyte-character r3 r1) | 163 ((if (r3 < #x2500) |
149 (if (r2 < #xa0) | 164 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) |
150 (r3 = ,(charset-id 'eight-bit-control)) | 165 (r3 -= #x0100) |
151 (r3 = ,(charset-id 'eight-bit-graphic))) | 166 (r3 //= 96) |
152 (write-multibyte-character r3 r2)) | 167 (r1 = (r7 + 32)) |
153 | 168 (r1 += ((r3 + 32) << 7)) |
154 ;; mule-unicode-e000-ffff | 169 (write-multibyte-character r0 r1)) |
155 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) | 170 |
156 (r3 -= #xe000) | 171 ;; mule-unicode-2500-33ff |
157 (r3 //= 96) | 172 (if (r3 < #x3400) |
158 (r1 = (r7 + 32)) | 173 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) |
159 (r1 += ((r3 + 32) << 7)) | 174 (r3 -= #x2500) |
160 (write-multibyte-character r0 r1)))))) | 175 (r3 //= 96) |
176 (r1 = (r7 + 32)) | |
177 (r1 += ((r3 + 32) << 7)) | |
178 (write-multibyte-character r0 r1)) | |
179 | |
180 ;; U+3400 .. U+DFFF | |
181 ;; keep those bytes as eight-bit-{control|graphic} | |
182 (if (r3 < #xe000) | |
183 ( ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic | |
184 (r3 = r6) | |
185 (write-multibyte-character r3 r0) | |
186 (if (r1 < #xa0) | |
187 (r3 = r5)) | |
188 (write-multibyte-character r3 r1) | |
189 (if (r2 < #xa0) | |
190 (r3 = r5) | |
191 (r3 = r6)) | |
192 (write-multibyte-character r3 r2)) | |
193 | |
194 ;; mule-unicode-e000-ffff | |
195 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) | |
196 (r3 -= #xe000) | |
197 (r3 //= 96) | |
198 (r1 = (r7 + 32)) | |
199 (r1 += ((r3 + 32) << 7)) | |
200 (write-multibyte-character r0 r1)))))))) | |
161 | 201 |
162 ;; 4byte encoding | 202 ;; 4byte encoding |
163 ;; keep those bytes as eight-bit-{control|graphic} | 203 ;; keep those bytes as eight-bit-{control|graphic} |
164 ((read r1 r2 r3) | 204 ((read r1 r2 r3) |
165 ;; r0 > #xf0, thus eight-bit-graphic | 205 ;; r0 > #xf0, thus eight-bit-graphic |
166 (r4 = ,(charset-id 'eight-bit-graphic)) | 206 (write-multibyte-character r6 r0) |
167 (write-multibyte-character r4 r0) | |
168 (if (r1 < #xa0) | 207 (if (r1 < #xa0) |
169 (r4 = ,(charset-id 'eight-bit-control))) | 208 (write-multibyte-character r5 r1) |
170 (write-multibyte-character r4 r1) | 209 (write-multibyte-character r6 r1)) |
171 (if (r2 < #xa0) | 210 (if (r2 < #xa0) |
172 (r4 = ,(charset-id 'eight-bit-control)) | 211 (write-multibyte-character r5 r2) |
173 (r4 = ,(charset-id 'eight-bit-graphic))) | 212 (write-multibyte-character r6 r2)) |
174 (write-multibyte-character r4 r2) | |
175 (if (r3 < #xa0) | 213 (if (r3 < #xa0) |
176 (r4 = ,(charset-id 'eight-bit-control)) | 214 (write-multibyte-character r5 r3) |
177 (r4 = ,(charset-id 'eight-bit-graphic))) | 215 (write-multibyte-character r6 r3)))))) |
178 (write-multibyte-character r4 r3))))) | |
179 | 216 |
180 (repeat)))) | 217 (repeat)))) |
181 | 218 |
182 "CCL program to decode UTF-8. | 219 "CCL program to decode UTF-8. |
183 Basic decoding is done into the charsets ascii, latin-iso8859-1 and | 220 Basic decoding is done into the charsets ascii, latin-iso8859-1 and |