comparison lisp/international/utf-8.el @ 37934:88389fa9b713

(ccl-decode-mule-utf-8): Handle invalid UTF-8 sequences.
author Gerd Moellmann <gerd@gnu.org>
date Tue, 29 May 2001 08:38:35 +0000
parents b095952a8678
children b174db545cfd
comparison
equal deleted inserted replaced
37933:f7113f2126e9 37934:88389fa9b713
1 ;;; utf-8.el --- Limited UTF-8 decoding/encoding support 1 ;;; utf-8.el --- Limited UTF-8 decoding/encoding support
2 2
3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. 3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
4 ;; Licensed to the Free Software Foundation. 4 ;; Licensed to the Free Software Foundation.
5 ;; Copyright (C) 2001 Free Software Foundation, Inc.
5 6
6 ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org> 7 ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org>
7 ;; Keywords: multilingual, Unicode, UTF-8, i18n 8 ;; Keywords: multilingual, Unicode, UTF-8, i18n
8 9
9 ;; This file is part of GNU Emacs. 10 ;; This file is part of GNU Emacs.
74 ;; mule-unicode-e000-ffff | 3 | 4 75 ;; mule-unicode-e000-ffff | 3 | 4
75 ;; 76 ;;
76 ;; Thus magnification factor is two. 77 ;; Thus magnification factor is two.
77 ;; 78 ;;
78 `(2 79 `(2
79 ((loop 80 ((r5 = ,(charset-id 'eight-bit-control))
81 (r6 = ,(charset-id 'eight-bit-graphic))
82 (loop
80 (read r0) 83 (read r0)
81 84
82 ;; 1byte encoding, i.e., ascii 85 ;; 1byte encoding, i.e., ascii
83 (if (r0 < #x80) 86 (if (r0 < #x80)
84 (write r0) 87 (write r0)
85 88
86 ;; 2byte encoding 89 ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
87 (if (r0 < #xe0) 90 (if (r0 < #xe0)
88 ((read r1) 91 ((read r1)
89 (r0 &= #x1f) 92
90 (r0 <<= 6) 93 (if ((r1 & #b11000000) != #b10000000)
91 (r1 &= #x3f) 94 ;; Invalid 2-byte sequence
92 (r1 += r0) 95 ((if (r0 < #xa0)
93 ;; now r1 holds scalar value 96 (write-multibyte-character r5 r0)
94 97 (write-multibyte-character r6 r0))
95 ;; eight-bit-control 98 (if (r1 < #x80)
96 (if (r1 < 160) 99 (write r1)
97 ((r0 = ,(charset-id 'eight-bit-control)) 100 (if (r1 < #xa0)
98 (write-multibyte-character r0 r1)) 101 (write-multibyte-character r5 r1)
99 102 (write-multibyte-character r6 r1))))
100 ;; latin-iso8859-1 103
101 (if (r1 < 256) 104 ((r0 &= #x1f)
102 ((r0 = ,(charset-id 'latin-iso8859-1)) 105 (r0 <<= 6)
103 (r1 -= 128) 106 (r1 &= #x3f)
104 (write-multibyte-character r0 r1)) 107 (r1 += r0)
105 108 ;; Now r1 holds scalar value
106 ;; mule-unicode-0100-24ff (< 0800) 109
107 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) 110 ;; eight-bit-control
108 (r1 -= #x0100) 111 (if (r1 < 160)
109 (r2 = (((r1 / 96) + 32) << 7)) 112 ((write-multibyte-character r5 r1))
110 (r1 %= 96) 113
111 (r1 += (r2 + 32)) 114 ;; latin-iso8859-1
112 (write-multibyte-character r0 r1))))) 115 (if (r1 < 256)
116 ((r0 = ,(charset-id 'latin-iso8859-1))
117 (r1 -= 128)
118 (write-multibyte-character r0 r1))
119
120 ;; mule-unicode-0100-24ff (< 0800)
121 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
122 (r1 -= #x0100)
123 (r2 = (((r1 / 96) + 32) << 7))
124 (r1 %= 96)
125 (r1 += (r2 + 32))
126 (write-multibyte-character r0 r1)))))))
113 127
114 ;; 3byte encoding 128 ;; 3byte encoding
129 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
115 (if (r0 < #xf0) 130 (if (r0 < #xf0)
116 ((read r1 r2) 131 ((read r1 r2)
117 (r3 = ((r0 & #x0f) << 12)) 132
118 (r3 += ((r1 & #x3f) << 6)) 133 ;; This is set to 1 if the encoding is invalid.
119 (r3 += (r2 & #x3f)) 134 (r4 = 0)
120 ;; now r3 holds scalar value 135
121 136 (r3 = (r1 & #b11000000))
122 ;; mule-unicode-0100-24ff (>= 0800) 137 (r3 |= ((r2 >> 2) & #b00110000))
123 (if (r3 < #x2500) 138 (if (r3 != #b10100000)
124 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) 139 (r4 = 1)
125 (r3 -= #x0100) 140 ((r3 = ((r0 & #x0f) << 12))
126 (r3 //= 96) 141 (r3 += ((r1 & #x3f) << 6))
127 (r1 = (r7 + 32)) 142 (r3 += (r2 & #x3f))
128 (r1 += ((r3 + 32) << 7)) 143 (if (r3 < #x0800)
129 (write-multibyte-character r0 r1)) 144 (r4 = 1))))
130 145
131 ;; mule-unicode-2500-33ff 146 (if (r4 != 0)
132 (if (r3 < #x3400) 147 ;; Invalid 3-byte sequence
133 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) 148 ((if (r0 < #xa0)
134 (r3 -= #x2500) 149 (write-multibyte-character r5 r0)
135 (r3 //= 96) 150 (write-multibyte-character r6 r0))
136 (r1 = (r7 + 32)) 151 (if (r1 < #x80)
137 (r1 += ((r3 + 32) << 7)) 152 (write r1)
138 (write-multibyte-character r0 r1)) 153 (if (r1 < #xa0)
139 154 (write-multibyte-character r5 r1)
140 ;; U+3400 .. U+DFFF 155 (write-multibyte-character r6 r1)))
141 ;; keep those bytes as eight-bit-{control|graphic} 156 (if (r2 < #x80)
142 (if (r3 < #xe000) 157 (write r2)
143 (;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic 158 (if (r2 < #xa0)
144 (r3 = ,(charset-id 'eight-bit-graphic)) 159 (write-multibyte-character r5 r2)
145 (write-multibyte-character r3 r0) 160 (write-multibyte-character r6 r2))))
146 (if (r1 < #xa0) 161
147 (r3 = ,(charset-id 'eight-bit-control))) 162 ;; mule-unicode-0100-24ff (>= 0800)
148 (write-multibyte-character r3 r1) 163 ((if (r3 < #x2500)
149 (if (r2 < #xa0) 164 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
150 (r3 = ,(charset-id 'eight-bit-control)) 165 (r3 -= #x0100)
151 (r3 = ,(charset-id 'eight-bit-graphic))) 166 (r3 //= 96)
152 (write-multibyte-character r3 r2)) 167 (r1 = (r7 + 32))
153 168 (r1 += ((r3 + 32) << 7))
154 ;; mule-unicode-e000-ffff 169 (write-multibyte-character r0 r1))
155 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) 170
156 (r3 -= #xe000) 171 ;; mule-unicode-2500-33ff
157 (r3 //= 96) 172 (if (r3 < #x3400)
158 (r1 = (r7 + 32)) 173 ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
159 (r1 += ((r3 + 32) << 7)) 174 (r3 -= #x2500)
160 (write-multibyte-character r0 r1)))))) 175 (r3 //= 96)
176 (r1 = (r7 + 32))
177 (r1 += ((r3 + 32) << 7))
178 (write-multibyte-character r0 r1))
179
180 ;; U+3400 .. U+DFFF
181 ;; keep those bytes as eight-bit-{control|graphic}
182 (if (r3 < #xe000)
183 ( ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
184 (r3 = r6)
185 (write-multibyte-character r3 r0)
186 (if (r1 < #xa0)
187 (r3 = r5))
188 (write-multibyte-character r3 r1)
189 (if (r2 < #xa0)
190 (r3 = r5)
191 (r3 = r6))
192 (write-multibyte-character r3 r2))
193
194 ;; mule-unicode-e000-ffff
195 ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
196 (r3 -= #xe000)
197 (r3 //= 96)
198 (r1 = (r7 + 32))
199 (r1 += ((r3 + 32) << 7))
200 (write-multibyte-character r0 r1))))))))
161 201
162 ;; 4byte encoding 202 ;; 4byte encoding
163 ;; keep those bytes as eight-bit-{control|graphic} 203 ;; keep those bytes as eight-bit-{control|graphic}
164 ((read r1 r2 r3) 204 ((read r1 r2 r3)
165 ;; r0 > #xf0, thus eight-bit-graphic 205 ;; r0 > #xf0, thus eight-bit-graphic
166 (r4 = ,(charset-id 'eight-bit-graphic)) 206 (write-multibyte-character r6 r0)
167 (write-multibyte-character r4 r0)
168 (if (r1 < #xa0) 207 (if (r1 < #xa0)
169 (r4 = ,(charset-id 'eight-bit-control))) 208 (write-multibyte-character r5 r1)
170 (write-multibyte-character r4 r1) 209 (write-multibyte-character r6 r1))
171 (if (r2 < #xa0) 210 (if (r2 < #xa0)
172 (r4 = ,(charset-id 'eight-bit-control)) 211 (write-multibyte-character r5 r2)
173 (r4 = ,(charset-id 'eight-bit-graphic))) 212 (write-multibyte-character r6 r2))
174 (write-multibyte-character r4 r2)
175 (if (r3 < #xa0) 213 (if (r3 < #xa0)
176 (r4 = ,(charset-id 'eight-bit-control)) 214 (write-multibyte-character r5 r3)
177 (r4 = ,(charset-id 'eight-bit-graphic))) 215 (write-multibyte-character r6 r3))))))
178 (write-multibyte-character r4 r3)))))
179 216
180 (repeat)))) 217 (repeat))))
181 218
182 "CCL program to decode UTF-8. 219 "CCL program to decode UTF-8.
183 Basic decoding is done into the charsets ascii, latin-iso8859-1 and 220 Basic decoding is done into the charsets ascii, latin-iso8859-1 and