1347
|
1 /*
|
|
2 The contents of this file are subject to the Mozilla Public License
|
|
3 Version 1.1 (the "License"); you may not use this file except in
|
|
4 compliance with the License. You may obtain a copy of the License at
|
|
5 http://www.mozilla.org/MPL/
|
|
6
|
|
7 Software distributed under the License is distributed on an "AS IS"
|
|
8 basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
|
9 License for the specific language governing rights and limitations
|
|
10 under the License.
|
|
11
|
|
12 The Original Code is expat.
|
|
13
|
|
14 The Initial Developer of the Original Code is James Clark.
|
|
15 Portions created by James Clark are Copyright (C) 1998, 1999
|
|
16 James Clark. All Rights Reserved.
|
|
17
|
|
18 Contributor(s):
|
|
19
|
|
20 Alternatively, the contents of this file may be used under the terms
|
|
21 of the GNU General Public License (the "GPL"), in which case the
|
|
22 provisions of the GPL are applicable instead of those above. If you
|
|
23 wish to allow use of your version of this file only under the terms of
|
|
24 the GPL and not to allow others to use your version of this file under
|
|
25 the MPL, indicate your decision by deleting the provisions above and
|
|
26 replace them with the notice and other provisions required by the
|
|
27 GPL. If you do not delete the provisions above, a recipient may use
|
|
28 your version of this file under either the MPL or the GPL.
|
|
29 */
|
|
30
|
|
31 #include "xmldef.h"
|
|
32 #include "xmltok.h"
|
|
33 #include "nametab.h"
|
|
34
|
|
/* Initializer fragment naming, in a fixed order, the functions generated
   for the current PREFIX: three tokenizers, two literal tokenizers, then
   the helper functions.  Presumably this mirrors the leading members of
   ENCODING as declared in xmltok.h -- confirm before reordering. */
#define VTABLE1 \
  { PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok) }, \
  { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
  PREFIX(sameName), \
  PREFIX(nameMatchesAscii), \
  PREFIX(nameLength), \
  PREFIX(skipS), \
  PREFIX(getAtts), \
  PREFIX(charRefNumber), \
  PREFIX(predefinedEntityName), \
  PREFIX(updatePosition), \
  PREFIX(isPublicId)

/* VTABLE1 followed by the PREFIX-named UTF-8 and UTF-16 converters. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
|
|
49
|
|
/* Test the naming bit for the 16-bit character whose high byte is hi and
   whose low byte is lo.  pages[hi] selects a group of eight bitmap words;
   the top 3 bits of lo select the word and the low 5 bits select the bit.
   Yields non-zero when the bit is set.
   The mask is built from 1u so that selecting bit 31 never shifts into
   the sign bit of an int, and pages is parenthesized so that pointer
   expressions can be passed. */
#define UCS2_GET_NAMING(pages, hi, lo) \
   (namingBitmap[((pages)[(hi)] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
|
|
52
|
|
/* A 2 byte UTF-8 representation splits the character's 11 bits
   between the bottom 5 and 6 bits of the bytes.
   We need 8 bits to index into pages, 3 bits to add to that index and
   5 bits to generate the mask.
   The mask uses 1u so that bit 31 can be selected without shifting
   into the sign bit of an int. */
#define UTF8_GET_NAMING2(pages, byte) \
    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                  + ((((byte)[0]) & 3) << 1) \
                  + ((((byte)[1]) >> 5) & 1)] \
     & (1u << (((byte)[1]) & 0x1F)))

/* A 3 byte UTF-8 representation splits the character's 16 bits
   between the bottom 4, 6 and 6 bits of the bytes.
   We need 8 bits to index into pages, 3 bits to add to that index and
   5 bits to generate the mask. */
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                         + ((((byte)[1]) >> 2) & 0xF)] \
                 << 3) \
                + ((((byte)[1]) & 3) << 1) \
                + ((((byte)[2]) >> 5) & 1)] \
   & (1u << (((byte)[2]) & 0x1F)))

/* Dispatch on the sequence length n: 2 and 3 byte sequences are looked
   up in the bitmap, any other length yields 0. */
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 2 \
  ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
  : ((n) == 3 \
     ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
     : 0))
|
|
81
|
|
/* Non-zero when the 3 byte sequence at p (unsigned bytes) must be
   rejected: a lead byte of 0xED whose second byte has bit 0x20 set
   (the encodings of U+D800..U+DFFF), or the sequences EF BF BE and
   EF BF BF (U+FFFE and U+FFFF).
   p is fully parenthesized so pointer expressions such as buf + 3 work. */
#define UTF8_INVALID3(p) \
  ((*(p)) == 0xED \
  ? (((p)[1] & 0x20) != 0) \
  : ((*(p)) == 0xEF \
     ? ((p)[1] == 0xBF && ((p)[2] == 0xBF || (p)[2] == 0xBE)) \
     : 0))

/* Non-zero when the 4 byte sequence at p starts with 0xF4 and its
   second byte has bit 0x10 or 0x20 set, i.e. it encodes a value
   above U+10FFFF. */
#define UTF8_INVALID4(p) ((*(p)) == 0xF4 && ((p)[1] & 0x30) != 0)
|
|
90
|
|
91 static
|
|
92 int isNever(const ENCODING *enc, const char *p)
|
|
93 {
|
|
94 return 0;
|
|
95 }
|
|
96
|
|
97 static
|
|
98 int utf8_isName2(const ENCODING *enc, const char *p)
|
|
99 {
|
|
100 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
|
|
101 }
|
|
102
|
|
103 static
|
|
104 int utf8_isName3(const ENCODING *enc, const char *p)
|
|
105 {
|
|
106 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
|
|
107 }
|
|
108
|
|
109 #define utf8_isName4 isNever
|
|
110
|
|
111 static
|
|
112 int utf8_isNmstrt2(const ENCODING *enc, const char *p)
|
|
113 {
|
|
114 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
|
|
115 }
|
|
116
|
|
117 static
|
|
118 int utf8_isNmstrt3(const ENCODING *enc, const char *p)
|
|
119 {
|
|
120 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
|
|
121 }
|
|
122
|
|
123 #define utf8_isNmstrt4 isNever
|
|
124
|
|
125 #define utf8_isInvalid2 isNever
|
|
126
|
|
127 static
|
|
128 int utf8_isInvalid3(const ENCODING *enc, const char *p)
|
|
129 {
|
|
130 return UTF8_INVALID3((const unsigned char *)p);
|
|
131 }
|
|
132
|
|
133 static
|
|
134 int utf8_isInvalid4(const ENCODING *enc, const char *p)
|
|
135 {
|
|
136 return UTF8_INVALID4((const unsigned char *)p);
|
|
137 }
|
|
138
|
|
139 struct normal_encoding {
|
|
140 ENCODING enc;
|
|
141 unsigned char type[256];
|
|
142 #ifdef XML_MIN_SIZE
|
|
143 int (*byteType)(const ENCODING *, const char *);
|
|
144 int (*isNameMin)(const ENCODING *, const char *);
|
|
145 int (*isNmstrtMin)(const ENCODING *, const char *);
|
|
146 int (*byteToAscii)(const ENCODING *, const char *);
|
|
147 int (*charMatches)(const ENCODING *, const char *, int);
|
|
148 #endif /* XML_MIN_SIZE */
|
|
149 int (*isName2)(const ENCODING *, const char *);
|
|
150 int (*isName3)(const ENCODING *, const char *);
|
|
151 int (*isName4)(const ENCODING *, const char *);
|
|
152 int (*isNmstrt2)(const ENCODING *, const char *);
|
|
153 int (*isNmstrt3)(const ENCODING *, const char *);
|
|
154 int (*isNmstrt4)(const ENCODING *, const char *);
|
|
155 int (*isInvalid2)(const ENCODING *, const char *);
|
|
156 int (*isInvalid3)(const ENCODING *, const char *);
|
|
157 int (*isInvalid4)(const ENCODING *, const char *);
|
|
158 };
|
|
159
|
|
#ifdef XML_MIN_SIZE

/* Initializers for the XML_MIN_SIZE-only callbacks of struct
   normal_encoding, taken from the functions whose names start with E.
   The expansion ends with a comma so that further initializers
   (or nothing) can follow directly. */
#define STANDARD_VTABLE(E) \
 E ## byteType, \
 E ## isNameMin, \
 E ## isNmstrtMin, \
 E ## byteToAscii, \
 E ## charMatches,

#else

/* Without XML_MIN_SIZE those members do not exist. */
#define STANDARD_VTABLE(E) /* as nothing */

#endif
|
|
174
|
|
/* Initializers for the nine multi-byte classification callbacks of
   struct normal_encoding (name, name-start, invalid; 2, 3 and 4 bytes),
   taken from the functions whose names start with E. */
#define NORMAL_VTABLE(E) \
 E ## isName2, \
 E ## isName3, \
 E ## isName4, \
 E ## isNmstrt2, \
 E ## isNmstrt3, \
 E ## isNmstrt4, \
 E ## isInvalid2, \
 E ## isInvalid3, \
 E ## isInvalid4
|
|
185
|
|
186 static int checkCharRefNumber(int);
|
|
187
|
|
188 #include "xmltok_impl.h"
|
|
189
|
|
#ifdef XML_MIN_SIZE
/* Single-byte encodings have no minimum-size name characters beyond
   those covered by the byte type table. */
#define sb_isNameMin isNever
#define sb_isNmstrtMin isNever
/* One shared tokenizer: read the unit size from the encoding. */
#define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#define MINBPC(enc) 1
#endif
|
|
201
|
|
/* Byte type of the single byte at p, looked up in the encoding's
   256 entry type table. */
#define SB_BYTE_TYPE(enc, p) \
  (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])

#ifdef XML_MIN_SIZE
/* Function form of SB_BYTE_TYPE, for use as a byteType callback. */
static
int sb_byteType(const ENCODING *enc, const char *p)
{
  const int byteType = SB_BYTE_TYPE(enc, p);
  return byteType;
}
/* Dispatch through the encoding's byteType callback. */
#define BYTE_TYPE(enc, p) \
 (((const struct normal_encoding *)(enc))->byteType(enc, p))
#else
/* Direct table lookup. */
#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif
|
|
216
|
|
#ifdef XML_MIN_SIZE
/* Dispatch through the encoding's byteToAscii callback. */
#define BYTE_TO_ASCII(enc, p) \
 (((const struct normal_encoding *)(enc))->byteToAscii(enc, p))
/* byteToAscii callback for single-byte encodings: the byte itself. */
static
int sb_byteToAscii(const ENCODING *enc, const char *p)
{
  return *p;
}
#else
/* The byte at p.  p is parenthesized so that pointer expressions
   such as ptr + 1 dereference the intended address. */
#define BYTE_TO_ASCII(enc, p) (*(p))
#endif
|
|
228
|
|
/* Multi-byte classification: n (2, 3 or 4) is pasted onto the callback
   name, so each macro calls the matching member of struct
   normal_encoding. */
#define IS_NAME_CHAR(enc, p, n) \
 (((const struct normal_encoding *)(enc))->isName ## n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) \
 (((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
#define IS_INVALID_CHAR(enc, p, n) \
 (((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))

/* Classification of a minimum-size character that the byte type table
   does not settle. */
#ifdef XML_MIN_SIZE
#define IS_NAME_CHAR_MINBPC(enc, p) \
 (((const struct normal_encoding *)(enc))->isNameMin(enc, p))
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
 (((const struct normal_encoding *)(enc))->isNmstrtMin(enc, p))
#else
/* Never a name character in the single-byte instantiation. */
#define IS_NAME_CHAR_MINBPC(enc, p) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif
|
|
245
|
|
#ifdef XML_MIN_SIZE
/* Dispatch through the encoding's charMatches callback. */
#define CHAR_MATCHES(enc, p, c) \
 (((const struct normal_encoding *)(enc))->charMatches(enc, p, c))
/* charMatches callback for single-byte encodings. */
static
int sb_charMatches(const ENCODING *enc, const char *p, int c)
{
  return *p == c;
}
#else
/* c is an ASCII character.  Both arguments are parenthesized so that
   an expression passed as c cannot re-associate with the comparison. */
#define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
|
|
258
|
|
259 #define PREFIX(ident) normal_ ## ident
|
|
260 #include "xmltok_impl.c"
|
|
261
|
|
262 #undef MINBPC
|
|
263 #undef BYTE_TYPE
|
|
264 #undef BYTE_TO_ASCII
|
|
265 #undef CHAR_MATCHES
|
|
266 #undef IS_NAME_CHAR
|
|
267 #undef IS_NAME_CHAR_MINBPC
|
|
268 #undef IS_NMSTRT_CHAR
|
|
269 #undef IS_NMSTRT_CHAR_MINBPC
|
|
270 #undef IS_INVALID_CHAR
|
|
271
|
|
/* UTF8_cvalN is the marker value OR-ed into the first byte of an
   N byte UTF-8 sequence. */
enum {
  UTF8_cval1 = 0x00,
  UTF8_cval2 = 0xC0,
  UTF8_cval3 = 0xE0,
  UTF8_cval4 = 0xF0
};
|
|
278
|
|
279 static
|
|
280 void utf8_toUtf8(const ENCODING *enc,
|
|
281 const char **fromP, const char *fromLim,
|
|
282 char **toP, const char *toLim)
|
|
283 {
|
|
284 char *to;
|
|
285 const char *from;
|
|
286 if (fromLim - *fromP > toLim - *toP) {
|
|
287 /* Avoid copying partial characters. */
|
|
288 for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
|
|
289 if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
|
|
290 break;
|
|
291 }
|
|
292 for (to = *toP, from = *fromP; from != fromLim; from++, to++)
|
|
293 *to = *from;
|
|
294 *fromP = from;
|
|
295 *toP = to;
|
|
296 }
|
|
297
|
|
298 static
|
|
299 void utf8_toUtf16(const ENCODING *enc,
|
|
300 const char **fromP, const char *fromLim,
|
|
301 unsigned short **toP, const unsigned short *toLim)
|
|
302 {
|
|
303 unsigned short *to = *toP;
|
|
304 const char *from = *fromP;
|
|
305 while (from != fromLim && to != toLim) {
|
|
306 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
|
|
307 case BT_LEAD2:
|
|
308 *to++ = ((from[0] & 0x1f) << 6) | (from[1] & 0x3f);
|
|
309 from += 2;
|
|
310 break;
|
|
311 case BT_LEAD3:
|
|
312 *to++ = ((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f);
|
|
313 from += 3;
|
|
314 break;
|
|
315 case BT_LEAD4:
|
|
316 {
|
|
317 unsigned long n;
|
|
318 if (to + 1 == toLim)
|
|
319 break;
|
|
320 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
|
|
321 n -= 0x10000;
|
|
322 to[0] = (unsigned short)((n >> 10) | 0xD800);
|
|
323 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
|
|
324 to += 2;
|
|
325 from += 4;
|
|
326 }
|
|
327 break;
|
|
328 default:
|
|
329 *to++ = *from++;
|
|
330 break;
|
|
331 }
|
|
332 }
|
|
333 *fromP = from;
|
|
334 *toP = to;
|
|
335 }
|
|
336
|
|
337 #ifdef XML_NS
|
|
338 static const struct normal_encoding utf8_encoding_ns = {
|
|
339 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
|
|
340 {
|
|
341 #include "asciitab.h"
|
|
342 #include "utf8tab.h"
|
|
343 },
|
|
344 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
|
|
345 };
|
|
346 #endif
|
|
347
|
|
348 static const struct normal_encoding utf8_encoding = {
|
|
349 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
|
|
350 {
|
|
351 #define BT_COLON BT_NMSTRT
|
|
352 #include "asciitab.h"
|
|
353 #undef BT_COLON
|
|
354 #include "utf8tab.h"
|
|
355 },
|
|
356 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
|
|
357 };
|
|
358
|
|
359 #ifdef XML_NS
|
|
360
|
|
361 static const struct normal_encoding internal_utf8_encoding_ns = {
|
|
362 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
|
|
363 {
|
|
364 #include "iasciitab.h"
|
|
365 #include "utf8tab.h"
|
|
366 },
|
|
367 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
|
|
368 };
|
|
369
|
|
370 #endif
|
|
371
|
|
372 static const struct normal_encoding internal_utf8_encoding = {
|
|
373 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
|
|
374 {
|
|
375 #define BT_COLON BT_NMSTRT
|
|
376 #include "iasciitab.h"
|
|
377 #undef BT_COLON
|
|
378 #include "utf8tab.h"
|
|
379 },
|
|
380 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
|
|
381 };
|
|
382
|
|
383 static
|
|
384 void latin1_toUtf8(const ENCODING *enc,
|
|
385 const char **fromP, const char *fromLim,
|
|
386 char **toP, const char *toLim)
|
|
387 {
|
|
388 for (;;) {
|
|
389 unsigned char c;
|
|
390 if (*fromP == fromLim)
|
|
391 break;
|
|
392 c = (unsigned char)**fromP;
|
|
393 if (c & 0x80) {
|
|
394 if (toLim - *toP < 2)
|
|
395 break;
|
|
396 *(*toP)++ = ((c >> 6) | UTF8_cval2);
|
|
397 *(*toP)++ = ((c & 0x3f) | 0x80);
|
|
398 (*fromP)++;
|
|
399 }
|
|
400 else {
|
|
401 if (*toP == toLim)
|
|
402 break;
|
|
403 *(*toP)++ = *(*fromP)++;
|
|
404 }
|
|
405 }
|
|
406 }
|
|
407
|
|
408 static
|
|
409 void latin1_toUtf16(const ENCODING *enc,
|
|
410 const char **fromP, const char *fromLim,
|
|
411 unsigned short **toP, const unsigned short *toLim)
|
|
412 {
|
|
413 while (*fromP != fromLim && *toP != toLim)
|
|
414 *(*toP)++ = (unsigned char)*(*fromP)++;
|
|
415 }
|
|
416
|
|
417 #ifdef XML_NS
|
|
418
|
|
419 static const struct normal_encoding latin1_encoding_ns = {
|
|
420 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
|
|
421 {
|
|
422 #include "asciitab.h"
|
|
423 #include "latin1tab.h"
|
|
424 },
|
|
425 STANDARD_VTABLE(sb_)
|
|
426 };
|
|
427
|
|
428 #endif
|
|
429
|
|
430 static const struct normal_encoding latin1_encoding = {
|
|
431 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
|
|
432 {
|
|
433 #define BT_COLON BT_NMSTRT
|
|
434 #include "asciitab.h"
|
|
435 #undef BT_COLON
|
|
436 #include "latin1tab.h"
|
|
437 },
|
|
438 STANDARD_VTABLE(sb_)
|
|
439 };
|
|
440
|
|
441 static
|
|
442 void ascii_toUtf8(const ENCODING *enc,
|
|
443 const char **fromP, const char *fromLim,
|
|
444 char **toP, const char *toLim)
|
|
445 {
|
|
446 while (*fromP != fromLim && *toP != toLim)
|
|
447 *(*toP)++ = *(*fromP)++;
|
|
448 }
|
|
449
|
|
450 #ifdef XML_NS
|
|
451
|
|
452 static const struct normal_encoding ascii_encoding_ns = {
|
|
453 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
|
|
454 {
|
|
455 #include "asciitab.h"
|
|
456 /* BT_NONXML == 0 */
|
|
457 },
|
|
458 STANDARD_VTABLE(sb_)
|
|
459 };
|
|
460
|
|
461 #endif
|
|
462
|
|
463 static const struct normal_encoding ascii_encoding = {
|
|
464 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
|
|
465 {
|
|
466 #define BT_COLON BT_NMSTRT
|
|
467 #include "asciitab.h"
|
|
468 #undef BT_COLON
|
|
469 /* BT_NONXML == 0 */
|
|
470 },
|
|
471 STANDARD_VTABLE(sb_)
|
|
472 };
|
|
473
|
|
474 static int unicode_byte_type(char hi, char lo)
|
|
475 {
|
|
476 switch ((unsigned char)hi) {
|
|
477 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
|
|
478 return BT_LEAD4;
|
|
479 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
|
|
480 return BT_TRAIL;
|
|
481 case 0xFF:
|
|
482 switch ((unsigned char)lo) {
|
|
483 case 0xFF:
|
|
484 case 0xFE:
|
|
485 return BT_NONXML;
|
|
486 }
|
|
487 break;
|
|
488 }
|
|
489 return BT_NONASCII;
|
|
490 }
|
|
491
|
|
/* Define E##toUtf8, which converts 2 byte units from *fromP (up to
   fromLim) to UTF-8 at *toP (up to toLim).  GET_LO and GET_HI must be
   defined at the point of expansion and select the byte order.
   The generated function stops, with *fromP at the first unconverted
   unit, as soon as the next character does not fit in the output.
   A unit with high byte 0xD8..0xDB is combined with the unit that
   follows it into one 4 byte sequence; the following unit is read
   without checking it against fromLim. */
#define DEFINE_UTF16_TO_UTF8(E) \
static \
void E ## toUtf8(const ENCODING *enc, \
                 const char **fromP, const char *fromLim, \
                 char **toP, const char *toLim) \
{ \
  const char *from; \
  for (from = *fromP; from != fromLim; from += 2) { \
    int plane; \
    unsigned char nextLo; \
    unsigned char lo = GET_LO(from); \
    unsigned char hi = GET_HI(from); \
    switch (hi) { \
    case 0: \
      if (lo < 0x80) { \
        /* one byte */ \
        if (*toP == toLim) { \
          *fromP = from; \
          return; \
        } \
        *(*toP)++ = lo; \
        break; \
      } \
      /* fall through */ \
    case 0x1: case 0x2: case 0x3: \
    case 0x4: case 0x5: case 0x6: case 0x7: \
      /* two bytes */ \
      if (toLim - *toP < 2) { \
        *fromP = from; \
        return; \
      } \
      *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
      /* four bytes, built from this unit and the next one */ \
      if (toLim -  *toP < 4) { \
        *fromP = from; \
        return; \
      } \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      from += 2; \
      nextLo = GET_LO(from); \
      *(*toP)++ = (((lo & 0x3) << 4) \
                   | ((GET_HI(from) & 0x3) << 2) \
                   | (nextLo >> 6) \
                   | 0x80); \
      *(*toP)++ = ((nextLo & 0x3f) | 0x80); \
      break; \
    default: \
      if (toLim -  *toP < 3)  { \
        *fromP = from; \
        return; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    } \
  } \
  *fromP = from; \
}
|
|
554
|
|
/* Define E##toUtf16, which copies 2 byte units from *fromP (up to
   fromLim) into unsigned shorts at *toP (up to toLim), assembling each
   value from GET_HI and GET_LO.  Both pointers are advanced.
   When the input holds more units than the output has room for and the
   last input unit has a high byte in 0xD8..0xDF, that unit is left
   unconverted. */
#define DEFINE_UTF16_TO_UTF16(E) \
static \
void E ## toUtf16(const ENCODING *enc, \
                  const char **fromP, const char *fromLim, \
                  unsigned short **toP, const unsigned short *toLim) \
{ \
  /* Avoid copying first half only of surrogate */ \
  if (fromLim - *fromP > ((toLim - *toP) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
    fromLim -= 2; \
  while (*fromP != fromLim && *toP != toLim) { \
    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
    *fromP += 2; \
  } \
}
|
|
568
|
|
569 #define SET2(ptr, ch) \
|
|
570 (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
|
|
571 #define GET_LO(ptr) ((unsigned char)(ptr)[0])
|
|
572 #define GET_HI(ptr) ((unsigned char)(ptr)[1])
|
|
573
|
|
574 DEFINE_UTF16_TO_UTF8(little2_)
|
|
575 DEFINE_UTF16_TO_UTF16(little2_)
|
|
576
|
|
577 #undef SET2
|
|
578 #undef GET_LO
|
|
579 #undef GET_HI
|
|
580
|
|
581 #define SET2(ptr, ch) \
|
|
582 (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
|
|
583 #define GET_LO(ptr) ((unsigned char)(ptr)[1])
|
|
584 #define GET_HI(ptr) ((unsigned char)(ptr)[0])
|
|
585
|
|
586 DEFINE_UTF16_TO_UTF8(big2_)
|
|
587 DEFINE_UTF16_TO_UTF16(big2_)
|
|
588
|
|
589 #undef SET2
|
|
590 #undef GET_LO
|
|
591 #undef GET_HI
|
|
592
|
|
/* Per-character macros for little-endian 2 byte units: (p)[0] is the
   low byte and (p)[1] the high byte.  Every use of p and c is
   parenthesized so that expressions such as ptr + 2 can be passed. */

/* Byte type: table lookup when the high byte is zero, otherwise
   unicode_byte_type. */
#define LITTLE2_BYTE_TYPE(enc, p) \
 ((p)[1] == 0 \
  ? ((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
  : unicode_byte_type((p)[1], (p)[0]))
/* The low byte when the high byte is zero, otherwise -1. */
#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
/* True when the unit at p equals the character c. */
#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == (c))
/* Naming bitmap lookups for the unit at p. */
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
|
|
603
|
|
604 #ifdef XML_MIN_SIZE
|
|
605
|
|
606 static
|
|
607 int little2_byteType(const ENCODING *enc, const char *p)
|
|
608 {
|
|
609 return LITTLE2_BYTE_TYPE(enc, p);
|
|
610 }
|
|
611
|
|
612 static
|
|
613 int little2_byteToAscii(const ENCODING *enc, const char *p)
|
|
614 {
|
|
615 return LITTLE2_BYTE_TO_ASCII(enc, p);
|
|
616 }
|
|
617
|
|
618 static
|
|
619 int little2_charMatches(const ENCODING *enc, const char *p, int c)
|
|
620 {
|
|
621 return LITTLE2_CHAR_MATCHES(enc, p, c);
|
|
622 }
|
|
623
|
|
624 static
|
|
625 int little2_isNameMin(const ENCODING *enc, const char *p)
|
|
626 {
|
|
627 return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
|
|
628 }
|
|
629
|
|
630 static
|
|
631 int little2_isNmstrtMin(const ENCODING *enc, const char *p)
|
|
632 {
|
|
633 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
|
|
634 }
|
|
635
|
|
636 #undef VTABLE
|
|
637 #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
|
|
638
|
|
639 #else /* not XML_MIN_SIZE */
|
|
640
|
|
641 #undef PREFIX
|
|
642 #define PREFIX(ident) little2_ ## ident
|
|
643 #define MINBPC(enc) 2
|
|
644 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
|
|
645 #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
|
|
646 #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
|
|
647 #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
|
|
648 #define IS_NAME_CHAR(enc, p, n) 0
|
|
649 #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
|
|
650 #define IS_NMSTRT_CHAR(enc, p, n) (0)
|
|
651 #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
|
|
652
|
|
653 #include "xmltok_impl.c"
|
|
654
|
|
655 #undef MINBPC
|
|
656 #undef BYTE_TYPE
|
|
657 #undef BYTE_TO_ASCII
|
|
658 #undef CHAR_MATCHES
|
|
659 #undef IS_NAME_CHAR
|
|
660 #undef IS_NAME_CHAR_MINBPC
|
|
661 #undef IS_NMSTRT_CHAR
|
|
662 #undef IS_NMSTRT_CHAR_MINBPC
|
|
663 #undef IS_INVALID_CHAR
|
|
664
|
|
665 #endif /* not XML_MIN_SIZE */
|
|
666
|
|
667 #ifdef XML_NS
|
|
668
|
|
669 static const struct normal_encoding little2_encoding_ns = {
|
|
670 { VTABLE, 2, 0,
|
|
671 #if XML_BYTE_ORDER == 12
|
|
672 1
|
|
673 #else
|
|
674 0
|
|
675 #endif
|
|
676 },
|
|
677 {
|
|
678 #include "asciitab.h"
|
|
679 #include "latin1tab.h"
|
|
680 },
|
|
681 STANDARD_VTABLE(little2_)
|
|
682 };
|
|
683
|
|
684 #endif
|
|
685
|
|
686 static const struct normal_encoding little2_encoding = {
|
|
687 { VTABLE, 2, 0,
|
|
688 #if XML_BYTE_ORDER == 12
|
|
689 1
|
|
690 #else
|
|
691 0
|
|
692 #endif
|
|
693 },
|
|
694 {
|
|
695 #define BT_COLON BT_NMSTRT
|
|
696 #include "asciitab.h"
|
|
697 #undef BT_COLON
|
|
698 #include "latin1tab.h"
|
|
699 },
|
|
700 STANDARD_VTABLE(little2_)
|
|
701 };
|
|
702
|
|
703 #if XML_BYTE_ORDER != 21
|
|
704
|
|
705 #ifdef XML_NS
|
|
706
|
|
707 static const struct normal_encoding internal_little2_encoding_ns = {
|
|
708 { VTABLE, 2, 0, 1 },
|
|
709 {
|
|
710 #include "iasciitab.h"
|
|
711 #include "latin1tab.h"
|
|
712 },
|
|
713 STANDARD_VTABLE(little2_)
|
|
714 };
|
|
715
|
|
716 #endif
|
|
717
|
|
718 static const struct normal_encoding internal_little2_encoding = {
|
|
719 { VTABLE, 2, 0, 1 },
|
|
720 {
|
|
721 #define BT_COLON BT_NMSTRT
|
|
722 #include "iasciitab.h"
|
|
723 #undef BT_COLON
|
|
724 #include "latin1tab.h"
|
|
725 },
|
|
726 STANDARD_VTABLE(little2_)
|
|
727 };
|
|
728
|
|
729 #endif
|
|
730
|
|
731
|
|
/* Per-character macros for big-endian 2 byte units: (p)[0] is the
   high byte and (p)[1] the low byte.  Every use of p and c is
   parenthesized so that expressions such as ptr + 2 can be passed. */

/* Byte type: table lookup when the high byte is zero, otherwise
   unicode_byte_type. */
#define BIG2_BYTE_TYPE(enc, p) \
 ((p)[0] == 0 \
  ? ((const struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
  : unicode_byte_type((p)[0], (p)[1]))
/* The low byte when the high byte is zero, otherwise -1. */
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
/* True when the unit at p equals the character c. */
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == (c))
/* Naming bitmap lookups for the unit at p. */
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
|
|
742
|
|
743 #ifdef XML_MIN_SIZE
|
|
744
|
|
745 static
|
|
746 int big2_byteType(const ENCODING *enc, const char *p)
|
|
747 {
|
|
748 return BIG2_BYTE_TYPE(enc, p);
|
|
749 }
|
|
750
|
|
751 static
|
|
752 int big2_byteToAscii(const ENCODING *enc, const char *p)
|
|
753 {
|
|
754 return BIG2_BYTE_TO_ASCII(enc, p);
|
|
755 }
|
|
756
|
|
757 static
|
|
758 int big2_charMatches(const ENCODING *enc, const char *p, int c)
|
|
759 {
|
|
760 return BIG2_CHAR_MATCHES(enc, p, c);
|
|
761 }
|
|
762
|
|
763 static
|
|
764 int big2_isNameMin(const ENCODING *enc, const char *p)
|
|
765 {
|
|
766 return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
|
|
767 }
|
|
768
|
|
769 static
|
|
770 int big2_isNmstrtMin(const ENCODING *enc, const char *p)
|
|
771 {
|
|
772 return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
|
|
773 }
|
|
774
|
|
775 #undef VTABLE
|
|
776 #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
|
|
777
|
|
778 #else /* not XML_MIN_SIZE */
|
|
779
|
|
780 #undef PREFIX
|
|
781 #define PREFIX(ident) big2_ ## ident
|
|
782 #define MINBPC(enc) 2
|
|
783 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
|
|
784 #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
|
|
785 #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
|
|
786 #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
|
|
787 #define IS_NAME_CHAR(enc, p, n) 0
|
|
788 #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
|
|
789 #define IS_NMSTRT_CHAR(enc, p, n) (0)
|
|
790 #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
|
|
791
|
|
792 #include "xmltok_impl.c"
|
|
793
|
|
794 #undef MINBPC
|
|
795 #undef BYTE_TYPE
|
|
796 #undef BYTE_TO_ASCII
|
|
797 #undef CHAR_MATCHES
|
|
798 #undef IS_NAME_CHAR
|
|
799 #undef IS_NAME_CHAR_MINBPC
|
|
800 #undef IS_NMSTRT_CHAR
|
|
801 #undef IS_NMSTRT_CHAR_MINBPC
|
|
802 #undef IS_INVALID_CHAR
|
|
803
|
|
804 #endif /* not XML_MIN_SIZE */
|
|
805
|
|
806 #ifdef XML_NS
|
|
807
|
|
808 static const struct normal_encoding big2_encoding_ns = {
|
|
809 { VTABLE, 2, 0,
|
|
810 #if XML_BYTE_ORDER == 21
|
|
811 1
|
|
812 #else
|
|
813 0
|
|
814 #endif
|
|
815 },
|
|
816 {
|
|
817 #include "asciitab.h"
|
|
818 #include "latin1tab.h"
|
|
819 },
|
|
820 STANDARD_VTABLE(big2_)
|
|
821 };
|
|
822
|
|
823 #endif
|
|
824
|
|
825 static const struct normal_encoding big2_encoding = {
|
|
826 { VTABLE, 2, 0,
|
|
827 #if XML_BYTE_ORDER == 21
|
|
828 1
|
|
829 #else
|
|
830 0
|
|
831 #endif
|
|
832 },
|
|
833 {
|
|
834 #define BT_COLON BT_NMSTRT
|
|
835 #include "asciitab.h"
|
|
836 #undef BT_COLON
|
|
837 #include "latin1tab.h"
|
|
838 },
|
|
839 STANDARD_VTABLE(big2_)
|
|
840 };
|
|
841
|
|
842 #if XML_BYTE_ORDER != 12
|
|
843
|
|
844 #ifdef XML_NS
|
|
845
|
|
846 static const struct normal_encoding internal_big2_encoding_ns = {
|
|
847 { VTABLE, 2, 0, 1 },
|
|
848 {
|
|
849 #include "iasciitab.h"
|
|
850 #include "latin1tab.h"
|
|
851 },
|
|
852 STANDARD_VTABLE(big2_)
|
|
853 };
|
|
854
|
|
855 #endif
|
|
856
|
|
857 static const struct normal_encoding internal_big2_encoding = {
|
|
858 { VTABLE, 2, 0, 1 },
|
|
859 {
|
|
860 #define BT_COLON BT_NMSTRT
|
|
861 #include "iasciitab.h"
|
|
862 #undef BT_COLON
|
|
863 #include "latin1tab.h"
|
|
864 },
|
|
865 STANDARD_VTABLE(big2_)
|
|
866 };
|
|
867
|
|
868 #endif
|
|
869
|
|
870 #undef PREFIX
|
|
871
|
|
/* Compare two NUL-terminated strings, treating the letters a-z and A-Z
   as equal regardless of case.  Returns 1 when the strings are equal
   and 0 otherwise.  Only ASCII letters are folded. */
static
int streqci(const char *s1, const char *s2)
{
  char left;
  char right;
  do {
    left = *s1++;
    right = *s2++;
    /* fold lower case onto upper case */
    if ('a' <= left && left <= 'z')
      left = (char)(left + ('A' - 'a'));
    if ('a' <= right && right <= 'z')
      right = (char)(right + ('A' - 'a'));
    if (left != right)
      return 0;
  } while (left);
  return 1;
}
|
|
889
|
|
890 static
|
|
891 void initUpdatePosition(const ENCODING *enc, const char *ptr,
|
|
892 const char *end, POSITION *pos)
|
|
893 {
|
|
894 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
|
|
895 }
|
|
896
|
|
897 static
|
|
898 int toAscii(const ENCODING *enc, const char *ptr, const char *end)
|
|
899 {
|
|
900 char buf[1];
|
|
901 char *p = buf;
|
|
902 XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
|
|
903 if (p == buf)
|
|
904 return -1;
|
|
905 else
|
|
906 return buf[0];
|
|
907 }
|
|
908
|
|
/* Returns 1 if c is space (0x20), carriage return (0xD), line feed
   (0xA) or tab (0x9), and 0 for any other value. */
static
int isSpace(int c)
{
  if (c == 0x20 || c == 0xD || c == 0xA || c == 0x9)
    return 1;
  return 0;
}
|
|
921
|
|
922 /* Return 1 if there's just optional white space
|
|
923 or there's an S followed by name=val. */
|
|
924 static
|
|
925 int parsePseudoAttribute(const ENCODING *enc,
|
|
926 const char *ptr,
|
|
927 const char *end,
|
|
928 const char **namePtr,
|
|
929 const char **valPtr,
|
|
930 const char **nextTokPtr)
|
|
931 {
|
|
932 int c;
|
|
933 char open;
|
|
934 if (ptr == end) {
|
|
935 *namePtr = 0;
|
|
936 return 1;
|
|
937 }
|
|
938 if (!isSpace(toAscii(enc, ptr, end))) {
|
|
939 *nextTokPtr = ptr;
|
|
940 return 0;
|
|
941 }
|
|
942 do {
|
|
943 ptr += enc->minBytesPerChar;
|
|
944 } while (isSpace(toAscii(enc, ptr, end)));
|
|
945 if (ptr == end) {
|
|
946 *namePtr = 0;
|
|
947 return 1;
|
|
948 }
|
|
949 *namePtr = ptr;
|
|
950 for (;;) {
|
|
951 c = toAscii(enc, ptr, end);
|
|
952 if (c == -1) {
|
|
953 *nextTokPtr = ptr;
|
|
954 return 0;
|
|
955 }
|
|
956 if (c == '=')
|
|
957 break;
|
|
958 if (isSpace(c)) {
|
|
959 do {
|
|
960 ptr += enc->minBytesPerChar;
|
|
961 } while (isSpace(c = toAscii(enc, ptr, end)));
|
|
962 if (c != '=') {
|
|
963 *nextTokPtr = ptr;
|
|
964 return 0;
|
|
965 }
|
|
966 break;
|
|
967 }
|
|
968 ptr += enc->minBytesPerChar;
|
|
969 }
|
|
970 if (ptr == *namePtr) {
|
|
971 *nextTokPtr = ptr;
|
|
972 return 0;
|
|
973 }
|
|
974 ptr += enc->minBytesPerChar;
|
|
975 c = toAscii(enc, ptr, end);
|
|
976 while (isSpace(c)) {
|
|
977 ptr += enc->minBytesPerChar;
|
|
978 c = toAscii(enc, ptr, end);
|
|
979 }
|
|
980 if (c != '"' && c != '\'') {
|
|
981 *nextTokPtr = ptr;
|
|
982 return 0;
|
|
983 }
|
|
984 open = c;
|
|
985 ptr += enc->minBytesPerChar;
|
|
986 *valPtr = ptr;
|
|
987 for (;; ptr += enc->minBytesPerChar) {
|
|
988 c = toAscii(enc, ptr, end);
|
|
989 if (c == open)
|
|
990 break;
|
|
991 if (!('a' <= c && c <= 'z')
|
|
992 && !('A' <= c && c <= 'Z')
|
|
993 && !('0' <= c && c <= '9')
|
|
994 && c != '.'
|
|
995 && c != '-'
|
|
996 && c != '_') {
|
|
997 *nextTokPtr = ptr;
|
|
998 return 0;
|
|
999 }
|
|
1000 }
|
|
1001 *nextTokPtr = ptr + enc->minBytesPerChar;
|
|
1002 return 1;
|
|
1003 }
|
|
1004
|
|
1005 static
|
|
1006 int doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
|
|
1007 const char *,
|
|
1008 const char *),
|
|
1009 int isGeneralTextEntity,
|
|
1010 const ENCODING *enc,
|
|
1011 const char *ptr,
|
|
1012 const char *end,
|
|
1013 const char **badPtr,
|
|
1014 const char **versionPtr,
|
|
1015 const char **encodingName,
|
|
1016 const ENCODING **encoding,
|
|
1017 int *standalone)
|
|
1018 {
|
|
1019 const char *val = 0;
|
|
1020 const char *name = 0;
|
|
1021 ptr += 5 * enc->minBytesPerChar;
|
|
1022 end -= 2 * enc->minBytesPerChar;
|
|
1023 if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr) || !name) {
|
|
1024 *badPtr = ptr;
|
|
1025 return 0;
|
|
1026 }
|
|
1027 if (!XmlNameMatchesAscii(enc, name, "version")) {
|
|
1028 if (!isGeneralTextEntity) {
|
|
1029 *badPtr = name;
|
|
1030 return 0;
|
|
1031 }
|
|
1032 }
|
|
1033 else {
|
|
1034 if (versionPtr)
|
|
1035 *versionPtr = val;
|
|
1036 if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) {
|
|
1037 *badPtr = ptr;
|
|
1038 return 0;
|
|
1039 }
|
|
1040 if (!name) {
|
|
1041 if (isGeneralTextEntity) {
|
|
1042 /* a TextDecl must have an EncodingDecl */
|
|
1043 *badPtr = ptr;
|
|
1044 return 0;
|
|
1045 }
|
|
1046 return 1;
|
|
1047 }
|
|
1048 }
|
|
1049 if (XmlNameMatchesAscii(enc, name, "encoding")) {
|
|
1050 int c = toAscii(enc, val, end);
|
|
1051 if (!('a' <= c && c <= 'z') && !('A' <= c && c <= 'Z')) {
|
|
1052 *badPtr = val;
|
|
1053 return 0;
|
|
1054 }
|
|
1055 if (encodingName)
|
|
1056 *encodingName = val;
|
|
1057 if (encoding)
|
|
1058 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
|
|
1059 if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) {
|
|
1060 *badPtr = ptr;
|
|
1061 return 0;
|
|
1062 }
|
|
1063 if (!name)
|
|
1064 return 1;
|
|
1065 }
|
|
1066 if (!XmlNameMatchesAscii(enc, name, "standalone") || isGeneralTextEntity) {
|
|
1067 *badPtr = name;
|
|
1068 return 0;
|
|
1069 }
|
|
1070 if (XmlNameMatchesAscii(enc, val, "yes")) {
|
|
1071 if (standalone)
|
|
1072 *standalone = 1;
|
|
1073 }
|
|
1074 else if (XmlNameMatchesAscii(enc, val, "no")) {
|
|
1075 if (standalone)
|
|
1076 *standalone = 0;
|
|
1077 }
|
|
1078 else {
|
|
1079 *badPtr = val;
|
|
1080 return 0;
|
|
1081 }
|
|
1082 while (isSpace(toAscii(enc, ptr, end)))
|
|
1083 ptr += enc->minBytesPerChar;
|
|
1084 if (ptr != end) {
|
|
1085 *badPtr = ptr;
|
|
1086 return 0;
|
|
1087 }
|
|
1088 return 1;
|
|
1089 }
|
|
1090
|
|
1091 static
|
|
1092 int checkCharRefNumber(int result)
|
|
1093 {
|
|
1094 switch (result >> 8) {
|
|
1095 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
|
|
1096 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
|
|
1097 return -1;
|
|
1098 case 0:
|
|
1099 if (latin1_encoding.type[result] == BT_NONXML)
|
|
1100 return -1;
|
|
1101 break;
|
|
1102 case 0xFF:
|
|
1103 if (result == 0xFFFE || result == 0xFFFF)
|
|
1104 return -1;
|
|
1105 break;
|
|
1106 }
|
|
1107 return result;
|
|
1108 }
|
|
1109
|
|
1110 int XmlUtf8Encode(int c, char *buf)
|
|
1111 {
|
|
1112 enum {
|
|
1113 /* minN is minimum legal resulting value for N byte sequence */
|
|
1114 min2 = 0x80,
|
|
1115 min3 = 0x800,
|
|
1116 min4 = 0x10000
|
|
1117 };
|
|
1118
|
|
1119 if (c < 0)
|
|
1120 return 0;
|
|
1121 if (c < min2) {
|
|
1122 buf[0] = (c | UTF8_cval1);
|
|
1123 return 1;
|
|
1124 }
|
|
1125 if (c < min3) {
|
|
1126 buf[0] = ((c >> 6) | UTF8_cval2);
|
|
1127 buf[1] = ((c & 0x3f) | 0x80);
|
|
1128 return 2;
|
|
1129 }
|
|
1130 if (c < min4) {
|
|
1131 buf[0] = ((c >> 12) | UTF8_cval3);
|
|
1132 buf[1] = (((c >> 6) & 0x3f) | 0x80);
|
|
1133 buf[2] = ((c & 0x3f) | 0x80);
|
|
1134 return 3;
|
|
1135 }
|
|
1136 if (c < 0x110000) {
|
|
1137 buf[0] = ((c >> 18) | UTF8_cval4);
|
|
1138 buf[1] = (((c >> 12) & 0x3f) | 0x80);
|
|
1139 buf[2] = (((c >> 6) & 0x3f) | 0x80);
|
|
1140 buf[3] = ((c & 0x3f) | 0x80);
|
|
1141 return 4;
|
|
1142 }
|
|
1143 return 0;
|
|
1144 }
|
|
1145
|
|
/* Encode the character number charNum as UTF-16 code units into buf.
   Returns 1 for values below 0x10000 (stored as-is), 2 for values below
   0x110000 (stored as a surrogate pair), and 0 for anything else, in
   which case buf is left untouched. */
int XmlUtf16Encode(int charNum, unsigned short *buf)
{
  if (charNum < 0 || charNum >= 0x110000)
    return 0;

  if (charNum >= 0x10000) {
    /* Split the 20-bit offset into two 10-bit halves. */
    const int offset = charNum - 0x10000;
    buf[0] = (unsigned short)((offset >> 10) + 0xD800);
    buf[1] = (unsigned short)((offset & 0x3FF) + 0xDC00);
    return 2;
  }

  buf[0] = (unsigned short)charNum;
  return 1;
}
|
|
1162
|
|
1163 struct unknown_encoding {
|
|
1164 struct normal_encoding normal;
|
|
1165 int (*convert)(void *userData, const char *p);
|
|
1166 void *userData;
|
|
1167 unsigned short utf16[256];
|
|
1168 char utf8[256][4];
|
|
1169 };
|
|
1170
|
|
1171 int XmlSizeOfUnknownEncoding()
|
|
1172 {
|
|
1173 return sizeof(struct unknown_encoding);
|
|
1174 }
|
|
1175
|
|
1176 static
|
|
1177 int unknown_isName(const ENCODING *enc, const char *p)
|
|
1178 {
|
|
1179 int c = ((const struct unknown_encoding *)enc)
|
|
1180 ->convert(((const struct unknown_encoding *)enc)->userData, p);
|
|
1181 if (c & ~0xFFFF)
|
|
1182 return 0;
|
|
1183 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
|
|
1184 }
|
|
1185
|
|
1186 static
|
|
1187 int unknown_isNmstrt(const ENCODING *enc, const char *p)
|
|
1188 {
|
|
1189 int c = ((const struct unknown_encoding *)enc)
|
|
1190 ->convert(((const struct unknown_encoding *)enc)->userData, p);
|
|
1191 if (c & ~0xFFFF)
|
|
1192 return 0;
|
|
1193 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
|
|
1194 }
|
|
1195
|
|
1196 static
|
|
1197 int unknown_isInvalid(const ENCODING *enc, const char *p)
|
|
1198 {
|
|
1199 int c = ((const struct unknown_encoding *)enc)
|
|
1200 ->convert(((const struct unknown_encoding *)enc)->userData, p);
|
|
1201 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
|
|
1202 }
|
|
1203
|
|
1204 static
|
|
1205 void unknown_toUtf8(const ENCODING *enc,
|
|
1206 const char **fromP, const char *fromLim,
|
|
1207 char **toP, const char *toLim)
|
|
1208 {
|
|
1209 char buf[XML_UTF8_ENCODE_MAX];
|
|
1210 for (;;) {
|
|
1211 const char *utf8;
|
|
1212 int n;
|
|
1213 if (*fromP == fromLim)
|
|
1214 break;
|
|
1215 utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP];
|
|
1216 n = *utf8++;
|
|
1217 if (n == 0) {
|
|
1218 int c = ((const struct unknown_encoding *)enc)
|
|
1219 ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
|
|
1220 n = XmlUtf8Encode(c, buf);
|
|
1221 if (n > toLim - *toP)
|
|
1222 break;
|
|
1223 utf8 = buf;
|
|
1224 *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
|
|
1225 - (BT_LEAD2 - 2);
|
|
1226 }
|
|
1227 else {
|
|
1228 if (n > toLim - *toP)
|
|
1229 break;
|
|
1230 (*fromP)++;
|
|
1231 }
|
|
1232 do {
|
|
1233 *(*toP)++ = *utf8++;
|
|
1234 } while (--n != 0);
|
|
1235 }
|
|
1236 }
|
|
1237
|
|
1238 static
|
|
1239 void unknown_toUtf16(const ENCODING *enc,
|
|
1240 const char **fromP, const char *fromLim,
|
|
1241 unsigned short **toP, const unsigned short *toLim)
|
|
1242 {
|
|
1243 while (*fromP != fromLim && *toP != toLim) {
|
|
1244 unsigned short c
|
|
1245 = ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP];
|
|
1246 if (c == 0) {
|
|
1247 c = (unsigned short)((const struct unknown_encoding *)enc)
|
|
1248 ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
|
|
1249 *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
|
|
1250 - (BT_LEAD2 - 2);
|
|
1251 }
|
|
1252 else
|
|
1253 (*fromP)++;
|
|
1254 *(*toP)++ = c;
|
|
1255 }
|
|
1256 }
|
|
1257
|
|
1258 ENCODING *
|
|
1259 XmlInitUnknownEncoding(void *mem,
|
|
1260 int *table,
|
|
1261 int (*convert)(void *userData, const char *p),
|
|
1262 void *userData)
|
|
1263 {
|
|
1264 int i;
|
|
1265 struct unknown_encoding *e = mem;
|
|
1266 for (i = 0; i < sizeof(struct normal_encoding); i++)
|
|
1267 ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
|
|
1268 for (i = 0; i < 128; i++)
|
|
1269 if (latin1_encoding.type[i] != BT_OTHER
|
|
1270 && latin1_encoding.type[i] != BT_NONXML
|
|
1271 && table[i] != i)
|
|
1272 return 0;
|
|
1273 for (i = 0; i < 256; i++) {
|
|
1274 int c = table[i];
|
|
1275 if (c == -1) {
|
|
1276 e->normal.type[i] = BT_MALFORM;
|
|
1277 /* This shouldn't really get used. */
|
|
1278 e->utf16[i] = 0xFFFF;
|
|
1279 e->utf8[i][0] = 1;
|
|
1280 e->utf8[i][1] = 0;
|
|
1281 }
|
|
1282 else if (c < 0) {
|
|
1283 if (c < -4)
|
|
1284 return 0;
|
|
1285 e->normal.type[i] = BT_LEAD2 - (c + 2);
|
|
1286 e->utf8[i][0] = 0;
|
|
1287 e->utf16[i] = 0;
|
|
1288 }
|
|
1289 else if (c < 0x80) {
|
|
1290 if (latin1_encoding.type[c] != BT_OTHER
|
|
1291 && latin1_encoding.type[c] != BT_NONXML
|
|
1292 && c != i)
|
|
1293 return 0;
|
|
1294 e->normal.type[i] = latin1_encoding.type[c];
|
|
1295 e->utf8[i][0] = 1;
|
|
1296 e->utf8[i][1] = (char)c;
|
|
1297 e->utf16[i] = c == 0 ? 0xFFFF : c;
|
|
1298 }
|
|
1299 else if (checkCharRefNumber(c) < 0) {
|
|
1300 e->normal.type[i] = BT_NONXML;
|
|
1301 /* This shouldn't really get used. */
|
|
1302 e->utf16[i] = 0xFFFF;
|
|
1303 e->utf8[i][0] = 1;
|
|
1304 e->utf8[i][1] = 0;
|
|
1305 }
|
|
1306 else {
|
|
1307 if (c > 0xFFFF)
|
|
1308 return 0;
|
|
1309 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
|
|
1310 e->normal.type[i] = BT_NMSTRT;
|
|
1311 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
|
|
1312 e->normal.type[i] = BT_NAME;
|
|
1313 else
|
|
1314 e->normal.type[i] = BT_OTHER;
|
|
1315 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
|
|
1316 e->utf16[i] = c;
|
|
1317 }
|
|
1318 }
|
|
1319 e->userData = userData;
|
|
1320 e->convert = convert;
|
|
1321 if (convert) {
|
|
1322 e->normal.isName2 = unknown_isName;
|
|
1323 e->normal.isName3 = unknown_isName;
|
|
1324 e->normal.isName4 = unknown_isName;
|
|
1325 e->normal.isNmstrt2 = unknown_isNmstrt;
|
|
1326 e->normal.isNmstrt3 = unknown_isNmstrt;
|
|
1327 e->normal.isNmstrt4 = unknown_isNmstrt;
|
|
1328 e->normal.isInvalid2 = unknown_isInvalid;
|
|
1329 e->normal.isInvalid3 = unknown_isInvalid;
|
|
1330 e->normal.isInvalid4 = unknown_isInvalid;
|
|
1331 }
|
|
1332 e->normal.enc.utf8Convert = unknown_toUtf8;
|
|
1333 e->normal.enc.utf16Convert = unknown_toUtf16;
|
|
1334 return &(e->normal.enc);
|
|
1335 }
|
|
1336
|
|
/* Indices identifying the built-in encodings.
   If this enumeration is changed, getEncodingIndex and encodings
   must also be changed. */
enum {
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC = 1,
  UTF_8_ENC = 2,
  UTF_16_ENC = 3,
  UTF_16BE_ENC = 4,
  UTF_16LE_ENC = 5,
  /* must match encodingNames up to here */
  NO_ENC = 6
};
|
|
1350
|
|
1351 static
|
|
1352 int getEncodingIndex(const char *name)
|
|
1353 {
|
|
1354 static const char *encodingNames[] = {
|
|
1355 "ISO-8859-1",
|
|
1356 "US-ASCII",
|
|
1357 "UTF-8",
|
|
1358 "UTF-16",
|
|
1359 "UTF-16BE"
|
|
1360 "UTF-16LE",
|
|
1361 };
|
|
1362 int i;
|
|
1363 if (name == 0)
|
|
1364 return NO_ENC;
|
|
1365 for (i = 0; i < sizeof(encodingNames)/sizeof(encodingNames[0]); i++)
|
|
1366 if (streqci(name, encodingNames[i]))
|
|
1367 return i;
|
|
1368 return UNKNOWN_ENC;
|
|
1369 }
|
|
1370
|
|
/* The index of the encoding specified at initialization is kept in the
   isUtf16 member of the embedded initEnc; the member is reused this way
   for binary compatibility.  The macro expands to an lvalue, so it can
   be both read and assigned. */

#define INIT_ENC_INDEX(enc) ((enc)->initEnc.isUtf16)
|
|
1375
|
|
1376 /* This is what detects the encoding.
|
|
1377 encodingTable maps from encoding indices to encodings;
|
|
1378 INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding;
|
|
1379 state is XML_CONTENT_STATE if we're parsing an external text entity,
|
|
1380 and XML_PROLOG_STATE otherwise.
|
|
1381 */
|
|
1382
|
|
1383
|
|
1384 static
|
|
1385 int initScan(const ENCODING **encodingTable,
|
|
1386 const INIT_ENCODING *enc,
|
|
1387 int state,
|
|
1388 const char *ptr,
|
|
1389 const char *end,
|
|
1390 const char **nextTokPtr)
|
|
1391 {
|
|
1392 const ENCODING **encPtr;
|
|
1393
|
|
1394 if (ptr == end)
|
|
1395 return XML_TOK_NONE;
|
|
1396 encPtr = enc->encPtr;
|
|
1397 if (ptr + 1 == end) {
|
|
1398 /* only a single byte available for auto-detection */
|
|
1399 /* a well-formed document entity must have more than one byte */
|
|
1400 if (state != XML_CONTENT_STATE)
|
|
1401 return XML_TOK_PARTIAL;
|
|
1402 /* so we're parsing an external text entity... */
|
|
1403 /* if UTF-16 was externally specified, then we need at least 2 bytes */
|
|
1404 switch (INIT_ENC_INDEX(enc)) {
|
|
1405 case UTF_16_ENC:
|
|
1406 case UTF_16LE_ENC:
|
|
1407 case UTF_16BE_ENC:
|
|
1408 return XML_TOK_PARTIAL;
|
|
1409 }
|
|
1410 switch ((unsigned char)*ptr) {
|
|
1411 case 0xFE:
|
|
1412 case 0xFF:
|
|
1413 case 0xEF: /* possibly first byte of UTF-8 BOM */
|
|
1414 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
|
|
1415 && state == XML_CONTENT_STATE)
|
|
1416 break;
|
|
1417 /* fall through */
|
|
1418 case 0x00:
|
|
1419 case 0x3C:
|
|
1420 return XML_TOK_PARTIAL;
|
|
1421 }
|
|
1422 }
|
|
1423 else {
|
|
1424 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
|
|
1425 case 0xFEFF:
|
|
1426 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
|
|
1427 && state == XML_CONTENT_STATE)
|
|
1428 break;
|
|
1429 *nextTokPtr = ptr + 2;
|
|
1430 *encPtr = encodingTable[UTF_16BE_ENC];
|
|
1431 return XML_TOK_BOM;
|
|
1432 /* 00 3C is handled in the default case */
|
|
1433 case 0x3C00:
|
|
1434 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
|
|
1435 || INIT_ENC_INDEX(enc) == UTF_16_ENC)
|
|
1436 && state == XML_CONTENT_STATE)
|
|
1437 break;
|
|
1438 *encPtr = encodingTable[UTF_16LE_ENC];
|
|
1439 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
|
|
1440 case 0xFFFE:
|
|
1441 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
|
|
1442 && state == XML_CONTENT_STATE)
|
|
1443 break;
|
|
1444 *nextTokPtr = ptr + 2;
|
|
1445 *encPtr = encodingTable[UTF_16LE_ENC];
|
|
1446 return XML_TOK_BOM;
|
|
1447 case 0xEFBB:
|
|
1448 /* Maybe a UTF-8 BOM (EF BB BF) */
|
|
1449 /* If there's an explicitly specified (external) encoding
|
|
1450 of ISO-8859-1 or some flavour of UTF-16
|
|
1451 and this is an external text entity,
|
|
1452 don't look for the BOM,
|
|
1453 because it might be a legal data. */
|
|
1454 if (state == XML_CONTENT_STATE) {
|
|
1455 int e = INIT_ENC_INDEX(enc);
|
|
1456 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC || e == UTF_16_ENC)
|
|
1457 break;
|
|
1458 }
|
|
1459 if (ptr + 2 == end)
|
|
1460 return XML_TOK_PARTIAL;
|
|
1461 if ((unsigned char)ptr[2] == 0xBF) {
|
|
1462 *encPtr = encodingTable[UTF_8_ENC];
|
|
1463 return XML_TOK_BOM;
|
|
1464 }
|
|
1465 break;
|
|
1466 default:
|
|
1467 if (ptr[0] == '\0') {
|
|
1468 /* 0 isn't a legal data character. Furthermore a document entity can only
|
|
1469 start with ASCII characters. So the only way this can fail to be big-endian
|
|
1470 UTF-16 if it it's an external parsed general entity that's labelled as
|
|
1471 UTF-16LE. */
|
|
1472 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
|
|
1473 break;
|
|
1474 *encPtr = encodingTable[UTF_16BE_ENC];
|
|
1475 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
|
|
1476 }
|
|
1477 else if (ptr[1] == '\0') {
|
|
1478 /* We could recover here in the case:
|
|
1479 - parsing an external entity
|
|
1480 - second byte is 0
|
|
1481 - no externally specified encoding
|
|
1482 - no encoding declaration
|
|
1483 by assuming UTF-16LE. But we don't, because this would mean when
|
|
1484 presented just with a single byte, we couldn't reliably determine
|
|
1485 whether we needed further bytes. */
|
|
1486 if (state == XML_CONTENT_STATE)
|
|
1487 break;
|
|
1488 *encPtr = encodingTable[UTF_16LE_ENC];
|
|
1489 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
|
|
1490 }
|
|
1491 break;
|
|
1492 }
|
|
1493 }
|
|
1494 *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
|
|
1495 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
|
|
1496 }
|
|
1497
|
|
1498
|
|
1499 #define NS(x) x
|
|
1500 #define ns(x) x
|
|
1501 #include "xmltok_ns.c"
|
|
1502 #undef NS
|
|
1503 #undef ns
|
|
1504
|
|
#ifdef XML_NS

/* Second inclusion of xmltok_ns.c: NS() appends "NS" and ns() appends
   "_ns" to the name it is given. */
#define NS(x) x ## NS
#define ns(x) x ## _ns

#include "xmltok_ns.c"

#undef NS
#undef ns

/* Same as XmlInitUnknownEncoding, except that on success the byte type
   of ':' in the resulting encoding is set to BT_COLON.  Returns 0 when
   XmlInitUnknownEncoding does. */
ENCODING *
XmlInitUnknownEncodingNS(void *mem,
                         int *table,
                         int (*convert)(void *userData, const char *p),
                         void *userData)
{
  ENCODING *result = XmlInitUnknownEncoding(mem, table, convert, userData);

  if (result == 0)
    return 0;
  ((struct normal_encoding *)result)->type[':'] = BT_COLON;
  return result;
}

#endif /* XML_NS */
|