2086
|
1 /*
|
|
2 The contents of this file are subject to the Mozilla Public License
|
|
3 Version 1.1 (the "License"); you may not use this file except in
|
|
4 compliance with the License. You may obtain a copy of the License at
|
|
5 http://www.mozilla.org/MPL/
|
|
6
|
|
7 Software distributed under the License is distributed on an "AS IS"
|
|
8 basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
|
9 License for the specific language governing rights and limitations
|
|
10 under the License.
|
|
11
|
|
12 The Original Code is expat.
|
|
13
|
|
14 The Initial Developer of the Original Code is James Clark.
|
|
15 Portions created by James Clark are Copyright (C) 1998, 1999
|
|
16 James Clark. All Rights Reserved.
|
|
17
|
|
18 Contributor(s):
|
|
19
|
|
20 */
|
|
21
|
|
22 #include "xmldef.h"
|
|
23 #include "xmltok.h"
|
|
24 #include "nametab.h"
|
|
25
|
|
/* Initializer fragment for the function-pointer members of an ENCODING.
   PREFIX() must be defined at the point of expansion so that every entry
   resolves to one family's implementation (e.g. normal_prologTok).
   The first braced group holds the prolog, content and CDATA-section
   tokenizers; the second holds the attribute-value and entity-value
   tokenizers.  The remaining entries are the individual helper routines. */
#define VTABLE1 \
  { PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok) }, \
  { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
  PREFIX(sameName), \
  PREFIX(nameMatchesAscii), \
  PREFIX(nameLength), \
  PREFIX(skipS), \
  PREFIX(getAtts), \
  PREFIX(charRefNumber), \
  PREFIX(predefinedEntityName), \
  PREFIX(updatePosition), \
  PREFIX(isPublicId)

/* VTABLE1 followed by the UTF-8 and UTF-16 converters of the same family. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
|
|
40
|
|
/* Look up a 16-bit character in namingBitmap.  pages[hi] selects a group
   of 8 bitmap words (hence << 3), the top 3 bits of lo select the word
   inside that group and the bottom 5 bits of lo select the bit.
   The mask is built from an unsigned 1: bit index 31 is a legal result of
   (lo) & 0x1F, and shifting a signed 1 into the sign bit is undefined.
   The result is zero or nonzero, not necessarily 1. */
#define UCS2_GET_NAMING(pages, hi, lo) \
  (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))

/* A 2 byte UTF-8 representation splits the characters 11 bits
between the bottom 5 and 6 bits of the bytes.
We need 8 bits to index into pages, 3 bits to add to that index and
5 bits to generate the mask. */
#define UTF8_GET_NAMING2(pages, byte) \
    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                  + ((((byte)[0]) & 3) << 1) \
                  + ((((byte)[1]) >> 5) & 1)] \
     & (1u << (((byte)[1]) & 0x1F)))

/* A 3 byte UTF-8 representation splits the characters 16 bits
between the bottom 4, 6 and 6 bits of the bytes.
We need 8 bits to index into pages, 3 bits to add to that index and
5 bits to generate the mask. */
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                         + ((((byte)[1]) >> 2) & 0xF)] \
                 << 3) \
                + ((((byte)[1]) & 3) << 1) \
                + ((((byte)[2]) >> 5) & 1)] \
   & (1u << (((byte)[2]) & 0x1F)))

/* Dispatch on the sequence length n: 2- and 3-byte sequences are looked up
   in the bitmap, any other length yields 0. */
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 2 \
   ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
   : ((n) == 3 \
      ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
      : 0))
|
|
72
|
|
/* Nonzero if the 3-byte UTF-8 sequence at p must be rejected:
   - lead byte 0xED with bit 0x20 set in the second byte, i.e. an encoded
     UTF-16 surrogate (U+D800..U+DFFF);
   - the sequences EF BF BE and EF BF BF, i.e. U+FFFE and U+FFFF.
   p must point to unsigned bytes.  The argument is fully parenthesized so
   that expressions such as (buf + 1) are dereferenced correctly. */
#define UTF8_INVALID3(p) \
  (*(p) == 0xED \
  ? (((p)[1] & 0x20) != 0) \
  : (*(p) == 0xEF \
     ? ((p)[1] == 0xBF && ((p)[2] == 0xBF || (p)[2] == 0xBE)) \
     : 0))

/* Nonzero if the 4-byte UTF-8 sequence at p encodes a value above
   U+10FFFF: lead byte 0xF4 followed by a second byte of 0x90 or more. */
#define UTF8_INVALID4(p) (*(p) == 0xF4 && ((p)[1] & 0x30) != 0)
|
|
81
|
|
/* Classifier that rejects every character.  It fills the callback slots
   (see the #define aliases below) for which no character can qualify.
   Both parameters are unused. */
static
int isNever(const ENCODING *enc, const char *p)
{
  return 0;
}

/* Nonzero if the 2-byte UTF-8 sequence at p is flagged in the namePages
   bitmap.  enc is unused. */
static
int utf8_isName2(const ENCODING *enc, const char *p)
{
  return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
}

/* Nonzero if the 3-byte UTF-8 sequence at p is flagged in the namePages
   bitmap.  enc is unused. */
static
int utf8_isName3(const ENCODING *enc, const char *p)
{
  return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
}

/* No 4-byte sequence is ever accepted as a name character. */
#define utf8_isName4 isNever

/* Nonzero if the 2-byte UTF-8 sequence at p is flagged in the nmstrtPages
   bitmap.  enc is unused. */
static
int utf8_isNmstrt2(const ENCODING *enc, const char *p)
{
  return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
}

/* Nonzero if the 3-byte UTF-8 sequence at p is flagged in the nmstrtPages
   bitmap.  enc is unused. */
static
int utf8_isNmstrt3(const ENCODING *enc, const char *p)
{
  return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
}

/* No 4-byte sequence is ever accepted as a name-start character. */
#define utf8_isNmstrt4 isNever

/* No 2-byte sequence is rejected by this check. */
#define utf8_isInvalid2 isNever

/* Nonzero if the 3-byte sequence at p is rejected by UTF8_INVALID3
   (encoded surrogate, U+FFFE or U+FFFF).  enc is unused. */
static
int utf8_isInvalid3(const ENCODING *enc, const char *p)
{
  return UTF8_INVALID3((const unsigned char *)p);
}

/* Nonzero if the 4-byte sequence at p is rejected by UTF8_INVALID4
   (lead byte 0xF4 with a second byte that pushes the value past U+10FFFF).
   enc is unused. */
static
int utf8_isInvalid4(const ENCODING *enc, const char *p)
{
  return UTF8_INVALID4((const unsigned char *)p);
}
|
|
129
|
|
/* An ENCODING extended with a per-byte classification table and the
   multi-byte character callbacks.  The ENCODING member comes first, and
   code in this file casts an ENCODING pointer to this type to reach the
   extra members. */
struct normal_encoding {
  ENCODING enc;
  /* Byte type (BT_* value) for each possible byte value. */
  unsigned char type[256];
#ifdef XML_MIN_SIZE
  /* In the XML_MIN_SIZE build the per-encoding primitives are reached
     through these pointers instead of being compiled in as macros. */
  int (*byteType)(const ENCODING *, const char *);
  int (*isNameMin)(const ENCODING *, const char *);
  int (*isNmstrtMin)(const ENCODING *, const char *);
  int (*byteToAscii)(const ENCODING *, const char *);
  int (*charMatches)(const ENCODING *, const char *, int);
#endif /* XML_MIN_SIZE */
  /* Name-character tests; the numeric suffix is selected by the n argument
     of IS_NAME_CHAR (2, 3 or 4). */
  int (*isName2)(const ENCODING *, const char *);
  int (*isName3)(const ENCODING *, const char *);
  int (*isName4)(const ENCODING *, const char *);
  /* Name-start-character tests, same suffix convention. */
  int (*isNmstrt2)(const ENCODING *, const char *);
  int (*isNmstrt3)(const ENCODING *, const char *);
  int (*isNmstrt4)(const ENCODING *, const char *);
  /* Invalid-sequence tests, same suffix convention. */
  int (*isInvalid2)(const ENCODING *, const char *);
  int (*isInvalid3)(const ENCODING *, const char *);
  int (*isInvalid4)(const ENCODING *, const char *);
};
|
|
150
|
|
/* STANDARD_VTABLE(E) initializes the XML_MIN_SIZE-only members of
   struct normal_encoding with the functions prefixed by E.  It ends with a
   trailing comma so that NORMAL_VTABLE can follow it directly.  Without
   XML_MIN_SIZE those members do not exist and it expands to nothing. */
#ifdef XML_MIN_SIZE

#define STANDARD_VTABLE(E) \
 E ## byteType, \
 E ## isNameMin, \
 E ## isNmstrtMin, \
 E ## byteToAscii, \
 E ## charMatches,

#else

#define STANDARD_VTABLE(E) /* as nothing */

#endif

/* NORMAL_VTABLE(E) initializes the nine multi-byte classifier members of
   struct normal_encoding, in declaration order, with the functions (or
   macro aliases) prefixed by E. */
#define NORMAL_VTABLE(E) \
 E ## isName2, \
 E ## isName3, \
 E ## isName4, \
 E ## isNmstrt2, \
 E ## isNmstrt3, \
 E ## isNmstrt4, \
 E ## isInvalid2, \
 E ## isInvalid3, \
 E ## isInvalid4
|
|
176
|
|
177 static int checkCharRefNumber(int);
|
|
178
|
|
179 #include "xmltok_impl.h"
|
|
180
|
|
/* Configuration of the primitive macros consumed by xmltok_impl.c, followed
   by the inclusion that instantiates the normal_* routines.
   With XML_MIN_SIZE the primitives dispatch through the function pointers
   stored in struct normal_encoding; otherwise they expand inline for
   one-byte code units. */

/* Single-byte encodings have no minimum-width name tests of their own. */
#ifdef XML_MIN_SIZE
#define sb_isNameMin isNever
#define sb_isNmstrtMin isNever
#endif

#ifdef XML_MIN_SIZE
#define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#define MINBPC(enc) 1
#endif

/* Byte type of the single byte at p, read from the encoding's type table. */
#define SB_BYTE_TYPE(enc, p) \
  (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])

#ifdef XML_MIN_SIZE
/* Function form of SB_BYTE_TYPE for the byteType slot. */
static
int sb_byteType(const ENCODING *enc, const char *p)
{
  return SB_BYTE_TYPE(enc, p);
}
#define BYTE_TYPE(enc, p) \
 (((const struct normal_encoding *)(enc))->byteType(enc, p))
#else
#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif

#ifdef XML_MIN_SIZE
#define BYTE_TO_ASCII(enc, p) \
 (((const struct normal_encoding *)(enc))->byteToAscii(enc, p))
/* Returns the byte at p unchanged (as a plain char value). */
static
int sb_byteToAscii(const ENCODING *enc, const char *p)
{
  return *p;
}
#else
#define BYTE_TO_ASCII(enc, p) (*p)
#endif

/* Multi-byte classifier dispatch; n is pasted onto the member name, so it
   must be a literal 2, 3 or 4. */
#define IS_NAME_CHAR(enc, p, n) \
 (((const struct normal_encoding *)(enc))->isName ## n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) \
 (((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
#define IS_INVALID_CHAR(enc, p, n) \
 (((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))

#ifdef XML_MIN_SIZE
#define IS_NAME_CHAR_MINBPC(enc, p) \
 (((const struct normal_encoding *)(enc))->isNameMin(enc, p))
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
 (((const struct normal_encoding *)(enc))->isNmstrtMin(enc, p))
#else
#define IS_NAME_CHAR_MINBPC(enc, p) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif

#ifdef XML_MIN_SIZE
#define CHAR_MATCHES(enc, p, c) \
 (((const struct normal_encoding *)(enc))->charMatches(enc, p, c))
/* Nonzero if the byte at p equals c. */
static
int sb_charMatches(const ENCODING *enc, const char *p, int c)
{
  return *p == c;
}
#else
/* c is an ASCII character */
#define CHAR_MATCHES(enc, p, c) (*(p) == c)
#endif

/* Instantiate the tokenizer implementation with the normal_ prefix. */
#define PREFIX(ident) normal_ ## ident
#include "xmltok_impl.c"

/* Drop the primitives so later sections can define their own.  PREFIX is
   deliberately left defined. */
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
|
|
262
|
|
/* Marker bits OR-ed into the first byte of an N-byte UTF-8 sequence by the
   encoders in this file. */
enum {  /* UTF8_cvalN is value of masked first byte of N byte sequence */
  UTF8_cval1 = 0x00,
  UTF8_cval2 = 0xc0,
  UTF8_cval3 = 0xe0,
  UTF8_cval4 = 0xf0
};
|
|
269
|
|
270 static
|
|
271 void utf8_toUtf8(const ENCODING *enc,
|
|
272 const char **fromP, const char *fromLim,
|
|
273 char **toP, const char *toLim)
|
|
274 {
|
|
275 char *to;
|
|
276 const char *from;
|
|
277 if (fromLim - *fromP > toLim - *toP) {
|
|
278 /* Avoid copying partial characters. */
|
|
279 for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
|
|
280 if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
|
|
281 break;
|
|
282 }
|
|
283 for (to = *toP, from = *fromP; from != fromLim; from++, to++)
|
|
284 *to = *from;
|
|
285 *fromP = from;
|
|
286 *toP = to;
|
|
287 }
|
|
288
|
|
289 static
|
|
290 void utf8_toUtf16(const ENCODING *enc,
|
|
291 const char **fromP, const char *fromLim,
|
|
292 unsigned short **toP, const unsigned short *toLim)
|
|
293 {
|
|
294 unsigned short *to = *toP;
|
|
295 const char *from = *fromP;
|
|
296 while (from != fromLim && to != toLim) {
|
|
297 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
|
|
298 case BT_LEAD2:
|
|
299 *to++ = ((from[0] & 0x1f) << 6) | (from[1] & 0x3f);
|
|
300 from += 2;
|
|
301 break;
|
|
302 case BT_LEAD3:
|
|
303 *to++ = ((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f);
|
|
304 from += 3;
|
|
305 break;
|
|
306 case BT_LEAD4:
|
|
307 {
|
|
308 unsigned long n;
|
|
309 if (to + 1 == toLim)
|
|
310 break;
|
|
311 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
|
|
312 n -= 0x10000;
|
|
313 to[0] = (unsigned short)((n >> 10) | 0xD800);
|
|
314 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
|
|
315 to += 2;
|
|
316 from += 4;
|
|
317 }
|
|
318 break;
|
|
319 default:
|
|
320 *to++ = *from++;
|
|
321 break;
|
|
322 }
|
|
323 }
|
|
324 *fromP = from;
|
|
325 *toP = to;
|
|
326 }
|
|
327
|
|
/* UTF-8 encoding objects.  Each one combines the normal_* vtable, the
   UTF-8 converters, three integer fields (the first is minBytesPerChar = 1;
   the other two are presumably the isUtf8 / isUtf16 flags - confirm against
   xmltok.h), a 256-entry byte-type table assembled from included headers,
   and the UTF-8 classifier callbacks.
   The _ns variants exist only when XML_NS is defined and include the ASCII
   table as is; the other variants remap BT_COLON to BT_NMSTRT while the
   ASCII table is included.  The internal_ variants use iasciitab.h instead
   of asciitab.h. */
#ifdef XML_NS
static const struct normal_encoding utf8_encoding_ns = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#include "asciitab.h"
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};
#endif

static const struct normal_encoding utf8_encoding = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};

#ifdef XML_NS

static const struct normal_encoding internal_utf8_encoding_ns = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#include "iasciitab.h"
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};

#endif

static const struct normal_encoding internal_utf8_encoding = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};
|
|
373
|
|
374 static
|
|
375 void latin1_toUtf8(const ENCODING *enc,
|
|
376 const char **fromP, const char *fromLim,
|
|
377 char **toP, const char *toLim)
|
|
378 {
|
|
379 for (;;) {
|
|
380 unsigned char c;
|
|
381 if (*fromP == fromLim)
|
|
382 break;
|
|
383 c = (unsigned char)**fromP;
|
|
384 if (c & 0x80) {
|
|
385 if (toLim - *toP < 2)
|
|
386 break;
|
|
387 *(*toP)++ = ((c >> 6) | UTF8_cval2);
|
|
388 *(*toP)++ = ((c & 0x3f) | 0x80);
|
|
389 (*fromP)++;
|
|
390 }
|
|
391 else {
|
|
392 if (*toP == toLim)
|
|
393 break;
|
|
394 *(*toP)++ = *(*fromP)++;
|
|
395 }
|
|
396 }
|
|
397 }
|
|
398
|
|
399 static
|
|
400 void latin1_toUtf16(const ENCODING *enc,
|
|
401 const char **fromP, const char *fromLim,
|
|
402 unsigned short **toP, const unsigned short *toLim)
|
|
403 {
|
|
404 while (*fromP != fromLim && *toP != toLim)
|
|
405 *(*toP)++ = (unsigned char)*(*fromP)++;
|
|
406 }
|
|
407
|
|
/* Latin-1 encoding objects: normal_* vtable, Latin-1 converters,
   minBytesPerChar = 1 and a byte-type table built from asciitab.h plus
   latin1tab.h.  Only STANDARD_VTABLE is supplied, so the multi-byte
   classifier pointers are left to default (null) initialization.
   The _ns variant exists only with XML_NS; the plain variant remaps
   BT_COLON to BT_NMSTRT while the ASCII table is included. */
#ifdef XML_NS

static const struct normal_encoding latin1_encoding_ns = {
  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  {
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(sb_)
};

#endif

static const struct normal_encoding latin1_encoding = {
  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(sb_)
};
|
|
431
|
|
432 static
|
|
433 void ascii_toUtf8(const ENCODING *enc,
|
|
434 const char **fromP, const char *fromLim,
|
|
435 char **toP, const char *toLim)
|
|
436 {
|
|
437 while (*fromP != fromLim && *toP != toLim)
|
|
438 *(*toP)++ = *(*fromP)++;
|
|
439 }
|
|
440
|
|
/* US-ASCII encoding objects.  Only the lower half of the byte-type table
   is filled from asciitab.h; the remaining entries are zero-initialized,
   which the in-table note identifies as BT_NONXML.  The UTF-16 converter
   is shared with Latin-1.  The _ns variant exists only with XML_NS; the
   plain variant remaps BT_COLON to BT_NMSTRT while the table is included. */
#ifdef XML_NS

static const struct normal_encoding ascii_encoding_ns = {
  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  {
#include "asciitab.h"
/* BT_NONXML == 0 */
  },
  STANDARD_VTABLE(sb_)
};

#endif

static const struct normal_encoding ascii_encoding = {
  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
/* BT_NONXML == 0 */
  },
  STANDARD_VTABLE(sb_)
};
|
|
464
|
|
465 static int unicode_byte_type(char hi, char lo)
|
|
466 {
|
|
467 switch ((unsigned char)hi) {
|
|
468 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
|
|
469 return BT_LEAD4;
|
|
470 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
|
|
471 return BT_TRAIL;
|
|
472 case 0xFF:
|
|
473 switch ((unsigned char)lo) {
|
|
474 case 0xFF:
|
|
475 case 0xFE:
|
|
476 return BT_NONXML;
|
|
477 }
|
|
478 break;
|
|
479 }
|
|
480 return BT_NONASCII;
|
|
481 }
|
|
482
|
|
/* Define E##toUtf8, which converts UTF-16 text in [*fromP, fromLim) to
   UTF-8 in [*toP, toLim).  GET_LO and GET_HI must be defined at the point
   of expansion to pick the low and high byte of a 2-byte unit.
   Dispatch is on the high byte:
     0 with low byte < 0x80  -> 1 output byte
     0..7 otherwise          -> 2 output bytes
     0xD8..0xDB              -> 4 output bytes, built from this unit and
                                the following one (a surrogate pair)
     anything else           -> 3 output bytes
   When the next character does not fit, *fromP is set to that character
   and the function returns; otherwise *fromP ends up at fromLim.
   NOTE(review): the loop advances two bytes at a time and compares with
   != fromLim, and the surrogate branch reads the second unit without
   comparing against fromLim, so the input is assumed to be an even number
   of bytes with complete pairs - confirm that callers guarantee this. */
#define DEFINE_UTF16_TO_UTF8(E) \
static \
void E ## toUtf8(const ENCODING *enc, \
                 const char **fromP, const char *fromLim, \
                 char **toP, const char *toLim) \
{ \
  const char *from; \
  for (from = *fromP; from != fromLim; from += 2) { \
    int plane; \
    unsigned char lo2; \
    unsigned char lo = GET_LO(from); \
    unsigned char hi = GET_HI(from); \
    switch (hi) { \
    case 0: \
      if (lo < 0x80) { \
        if (*toP == toLim) { \
          *fromP = from; \
          return; \
        } \
        *(*toP)++ = lo; \
        break; \
      } \
      /* fall through */ \
    case 0x1: case 0x2: case 0x3: \
    case 0x4: case 0x5: case 0x6: case 0x7: \
      if (toLim -  *toP < 2) { \
        *fromP = from; \
        return; \
      } \
      *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    default: \
      if (toLim -  *toP < 3)  { \
        *fromP = from; \
        return; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
      if (toLim -  *toP < 4) { \
        *fromP = from; \
        return; \
      } \
      /* 4 bits following the surrogate prefix, plus 1 */ \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      /* step to the second unit of the pair */ \
      from += 2; \
      lo2 = GET_LO(from); \
      *(*toP)++ = (((lo & 0x3) << 4) \
                   | ((GET_HI(from) & 0x3) << 2) \
                   | (lo2 >> 6) \
                   | 0x80); \
      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
      break; \
    } \
  } \
  *fromP = from; \
}
|
|
545
|
|
/* Define E##toUtf16, which copies 2-byte units from [*fromP, fromLim) into
   host-order unsigned shorts in [*toP, toLim), assembling each value from
   GET_HI and GET_LO (which must be defined at the point of expansion).
   If the input holds more units than the output has room for and the
   final input unit has a high byte in 0xD8..0xDF, fromLim is first pulled
   back by one unit.  Copying stops when either range is exhausted; *fromP
   and *toP are advanced in place. */
#define DEFINE_UTF16_TO_UTF16(E) \
static \
void E ## toUtf16(const ENCODING *enc, \
                  const char **fromP, const char *fromLim, \
                  unsigned short **toP, const unsigned short *toLim) \
{ \
  /* Avoid copying first half only of surrogate */ \
  if (fromLim - *fromP > ((toLim - *toP) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
    fromLim -= 2; \
  for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
}
|
|
559
|
|
/* Instantiate the UTF-16 converters twice.  Each pass defines the byte
   accessors for one byte order, expands the two generator macros and then
   removes the accessors again.  SET2 stores a 16-bit value in that byte
   order; it is not used by the expansions visible here. */

/* Little-endian: low byte first. */
#define SET2(ptr, ch) \
  (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
#define GET_HI(ptr) ((unsigned char)(ptr)[1])

DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)

#undef SET2
#undef GET_LO
#undef GET_HI

/* Big-endian: high byte first. */
#define SET2(ptr, ch) \
  (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
#define GET_HI(ptr) ((unsigned char)(ptr)[0])

DEFINE_UTF16_TO_UTF8(big2_)
DEFINE_UTF16_TO_UTF16(big2_)

#undef SET2
#undef GET_LO
#undef GET_HI
|
|
583
|
|
/* Primitives for little-endian 2-byte units: p[0] is the low byte and
   p[1] the high byte.  A unit whose high byte is zero is classified by the
   encoding's byte-type table; any other unit by unicode_byte_type(). */
#define LITTLE2_BYTE_TYPE(enc, p) \
 ((p)[1] == 0 \
  ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
  : unicode_byte_type((p)[1], (p)[0]))
/* The low byte if the high byte is zero, otherwise -1. */
#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])

#ifdef XML_MIN_SIZE

/* XML_MIN_SIZE build: wrap the primitives in functions so they can be
   stored in struct normal_encoding.  PREFIX is not changed here, so VTABLE1
   keeps referring to the routines of the earlier instantiation. */

static
int little2_byteType(const ENCODING *enc, const char *p)
{
  return LITTLE2_BYTE_TYPE(enc, p);
}

static
int little2_byteToAscii(const ENCODING *enc, const char *p)
{
  return LITTLE2_BYTE_TO_ASCII(enc, p);
}

static
int little2_charMatches(const ENCODING *enc, const char *p, int c)
{
  return LITTLE2_CHAR_MATCHES(enc, p, c);
}

static
int little2_isNameMin(const ENCODING *enc, const char *p)
{
  return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
}

static
int little2_isNmstrtMin(const ENCODING *enc, const char *p)
{
  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
}

/* Only the converters differ from the shared routines. */
#undef VTABLE
#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16

#else /* not XML_MIN_SIZE */

/* Regular build: compile a dedicated little2_* copy of xmltok_impl.c with
   the primitives expanded inline.  The multi-byte classifiers are constant
   0 here. */

#undef PREFIX
#define PREFIX(ident) little2_ ## ident
#define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
#define IS_NAME_CHAR(enc, p, n) 0
#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)

#include "xmltok_impl.c"

#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
|
|
657
|
|
/* Little-endian UTF-16 encoding objects.  minBytesPerChar is 2.  The last
   integer field is 1 only when XML_BYTE_ORDER == 12 (presumably marking
   that the encoding matches the host's UTF-16 layout - confirm against
   xmltok.h).  The byte-type table (ASCII + Latin-1) is consulted for units
   whose high byte is zero.  The _ns variants exist only with XML_NS; the
   others remap BT_COLON to BT_NMSTRT while the ASCII table is included. */
#ifdef XML_NS

static const struct normal_encoding little2_encoding_ns = {
  { VTABLE, 2, 0,
#if XML_BYTE_ORDER == 12
    1
#else
    0
#endif
  },
  {
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_)
};

#endif

static const struct normal_encoding little2_encoding = {
  { VTABLE, 2, 0,
#if XML_BYTE_ORDER == 12
    1
#else
    0
#endif
  },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_)
};

/* The internal_ variants are omitted when XML_BYTE_ORDER is 21; they use
   iasciitab.h and always set the last integer field to 1. */
#if XML_BYTE_ORDER != 21

#ifdef XML_NS

static const struct normal_encoding internal_little2_encoding_ns = {
  { VTABLE, 2, 0, 1 },
  {
#include "iasciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_)
};

#endif

static const struct normal_encoding internal_little2_encoding = {
  { VTABLE, 2, 0, 1 },
  {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_)
};

#endif
|
|
721
|
|
722
|
|
/* Primitives for big-endian 2-byte units: p[0] is the high byte and p[1]
   the low byte.  A unit whose high byte is zero is classified by the
   encoding's byte-type table; any other unit by unicode_byte_type(). */
#define BIG2_BYTE_TYPE(enc, p) \
 ((p)[0] == 0 \
  ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
  : unicode_byte_type((p)[0], (p)[1]))
/* The low byte if the high byte is zero, otherwise -1. */
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])

#ifdef XML_MIN_SIZE

/* XML_MIN_SIZE build: wrap the primitives in functions so they can be
   stored in struct normal_encoding.  PREFIX is not changed here, so VTABLE1
   keeps referring to the routines of the earlier instantiation. */

static
int big2_byteType(const ENCODING *enc, const char *p)
{
  return BIG2_BYTE_TYPE(enc, p);
}

static
int big2_byteToAscii(const ENCODING *enc, const char *p)
{
  return BIG2_BYTE_TO_ASCII(enc, p);
}

static
int big2_charMatches(const ENCODING *enc, const char *p, int c)
{
  return BIG2_CHAR_MATCHES(enc, p, c);
}

static
int big2_isNameMin(const ENCODING *enc, const char *p)
{
  return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
}

static
int big2_isNmstrtMin(const ENCODING *enc, const char *p)
{
  return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
}

/* Only the converters differ from the shared routines. */
#undef VTABLE
#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16

#else /* not XML_MIN_SIZE */

/* Regular build: compile a dedicated big2_* copy of xmltok_impl.c with the
   primitives expanded inline.  The multi-byte classifiers are constant 0
   here. */

#undef PREFIX
#define PREFIX(ident) big2_ ## ident
#define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
#define IS_NAME_CHAR(enc, p, n) 0
#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)

#include "xmltok_impl.c"

#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
|
|
796
|
|
/* Big-endian UTF-16 encoding objects.  minBytesPerChar is 2.  The last
   integer field is 1 only when XML_BYTE_ORDER == 21 (presumably marking
   that the encoding matches the host's UTF-16 layout - confirm against
   xmltok.h).  The byte-type table (ASCII + Latin-1) is consulted for units
   whose high byte is zero.  The _ns variants exist only with XML_NS; the
   others remap BT_COLON to BT_NMSTRT while the ASCII table is included. */
#ifdef XML_NS

static const struct normal_encoding big2_encoding_ns = {
  { VTABLE, 2, 0,
#if XML_BYTE_ORDER == 21
  1
#else
  0
#endif
  },
  {
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_)
};

#endif

static const struct normal_encoding big2_encoding = {
  { VTABLE, 2, 0,
#if XML_BYTE_ORDER == 21
  1
#else
  0
#endif
  },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_)
};

/* The internal_ variants are omitted when XML_BYTE_ORDER is 12; they use
   iasciitab.h and always set the last integer field to 1. */
#if XML_BYTE_ORDER != 12

#ifdef XML_NS

static const struct normal_encoding internal_big2_encoding_ns = {
  { VTABLE, 2, 0, 1 },
  {
#include "iasciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_)
};

#endif

static const struct normal_encoding internal_big2_encoding = {
  { VTABLE, 2, 0, 1 },
  {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_)
};

#endif

/* All instantiations are done; PREFIX is no longer needed. */
#undef PREFIX
|
|
862
|
|
/* Compare two NUL-terminated strings, treating the letters a-z and A-Z as
   equal.  Returns 1 when the strings match over their whole length and 0
   otherwise. */
static
int streqci(const char *s1, const char *s2)
{
  char left;
  char right;

  do {
    left = *s1++;
    right = *s2++;
    /* fold lower case onto upper case before comparing */
    if (left >= 'a' && left <= 'z')
      left = (char)(left - 'a' + 'A');
    if (right >= 'a' && right <= 'z')
      right = (char)(right - 'a' + 'A');
    if (left != right)
      return 0;
  } while (left != '\0');

  return 1;
}
|
|
880
|
|
/* updatePosition implementation that ignores the enc argument and always
   delegates to normal_updatePosition with the UTF-8 encoding object. */
static
void initUpdatePosition(const ENCODING *enc, const char *ptr,
                        const char *end, POSITION *pos)
{
  normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
}
|
|
887
|
|
888 static
|
|
889 int toAscii(const ENCODING *enc, const char *ptr, const char *end)
|
|
890 {
|
|
891 char buf[1];
|
|
892 char *p = buf;
|
|
893 XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
|
|
894 if (p == buf)
|
|
895 return -1;
|
|
896 else
|
|
897 return buf[0];
|
|
898 }
|
|
899
|
|
/* Returns 1 if c is one of the four white space characters handled here
   (space, carriage return, line feed, tab) and 0 otherwise. */
static
int isSpace(int c)
{
  return c == 0x20 || c == 0xD || c == 0xA || c == 0x9;
}
|
|
912
|
|
/* Return 1 if there's just optional white space
   or there's an S followed by name=val.

   Scans [ptr, end) one minBytesPerChar step at a time.
   - Nothing but white space (or nothing at all): *namePtr is set to 0 and
     1 is returned; *valPtr and *nextTokPtr are not written.
   - A pseudo-attribute: *namePtr points at the name, *valPtr at the first
     character after the opening quote, *nextTokPtr just past the closing
     quote, and 1 is returned.
   - Anything else: *nextTokPtr points at the offending position and 0 is
     returned.
   The value may contain only ASCII letters, digits, '.', '-' and '_'. */
static
int parsePseudoAttribute(const ENCODING *enc,
                         const char *ptr,
                         const char *end,
                         const char **namePtr,
                         const char **valPtr,
                         const char **nextTokPtr)
{
  int c;
  char open;
  if (ptr == end) {
    *namePtr = 0;
    return 1;
  }
  /* A pseudo-attribute must be preceded by white space. */
  if (!isSpace(toAscii(enc, ptr, end))) {
    *nextTokPtr = ptr;
    return 0;
  }
  do {
    ptr += enc->minBytesPerChar;
  } while (isSpace(toAscii(enc, ptr, end)));
  if (ptr == end) {
    *namePtr = 0;
    return 1;
  }
  *namePtr = ptr;
  /* Scan the name up to '=', allowing white space before the '='. */
  for (;;) {
    c = toAscii(enc, ptr, end);
    if (c == -1) {
      *nextTokPtr = ptr;
      return 0;
    }
    if (c == '=')
      break;
    if (isSpace(c)) {
      do {
        ptr += enc->minBytesPerChar;
      } while (isSpace(c = toAscii(enc, ptr, end)));
      if (c != '=') {
        *nextTokPtr = ptr;
        return 0;
      }
      break;
    }
    ptr += enc->minBytesPerChar;
  }
  /* Reject an empty name ('=' found at the very first position). */
  if (ptr == *namePtr) {
    *nextTokPtr = ptr;
    return 0;
  }
  /* Step over '=' and any white space that follows it. */
  ptr += enc->minBytesPerChar;
  c = toAscii(enc, ptr, end);
  while (isSpace(c)) {
    ptr += enc->minBytesPerChar;
    c = toAscii(enc, ptr, end);
  }
  if (c != '"' && c != '\'') {
    *nextTokPtr = ptr;
    return 0;
  }
  /* Remember which quote opened the value; the same one must close it. */
  open = c;
  ptr += enc->minBytesPerChar;
  *valPtr = ptr;
  for (;; ptr += enc->minBytesPerChar) {
    c = toAscii(enc, ptr, end);
    if (c == open)
      break;
    /* toAscii's -1 also fails this test, so running out of input ends
       the scan with an error. */
    if (!('a' <= c && c <= 'z')
        && !('A' <= c && c <= 'Z')
        && !('0' <= c && c <= '9')
        && c != '.'
        && c != '-'
        && c != '_') {
      *nextTokPtr = ptr;
      return 0;
    }
  }
  *nextTokPtr = ptr + enc->minBytesPerChar;
  return 1;
}
|
|
995
|
|
/* Parse the pseudo-attributes of an XML or text declaration.

   encodingFinder       maps the encoding name span to an ENCODING
   isGeneralTextEntity  nonzero selects the text-declaration rules: the
                        version may be omitted, an encoding is required
                        and standalone is not allowed
   enc, ptr, end        the declaration text; the first five and the last
                        two characters are skipped (presumably the "<?xml"
                        opener and the "?>" closer - confirm with callers)
   badPtr               receives the error position on failure
   versionPtr, encodingName, encoding, standalone
                        optional outputs; each may be null and is written
                        only when the matching pseudo-attribute is present

   Returns 1 on success and 0 on failure. */
static
int doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
                                                     const char *,
                                                     const char *),
                   int isGeneralTextEntity,
                   const ENCODING *enc,
                   const char *ptr,
                   const char *end,
                   const char **badPtr,
                   const char **versionPtr,
                   const char **encodingName,
                   const ENCODING **encoding,
                   int *standalone)
{
  const char *val = 0;
  const char *name = 0;
  ptr += 5 * enc->minBytesPerChar;
  end -= 2 * enc->minBytesPerChar;
  /* At least one pseudo-attribute is required. */
  if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr) || !name) {
    *badPtr = ptr;
    return 0;
  }
  if (!XmlNameMatchesAscii(enc, name, "version")) {
    /* Only a text declaration may start with something else. */
    if (!isGeneralTextEntity) {
      *badPtr = name;
      return 0;
    }
  }
  else {
    if (versionPtr)
      *versionPtr = val;
    if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) {
      *badPtr = ptr;
      return 0;
    }
    if (!name) {
      if (isGeneralTextEntity) {
        /* a TextDecl must have an EncodingDecl */
        *badPtr = ptr;
        return 0;
      }
      return 1;
    }
  }
  if (XmlNameMatchesAscii(enc, name, "encoding")) {
    /* The encoding name must begin with an ASCII letter. */
    int c = toAscii(enc, val, end);
    if (!('a' <= c && c <= 'z') && !('A' <= c && c <= 'Z')) {
      *badPtr = val;
      return 0;
    }
    if (encodingName)
      *encodingName = val;
    /* ptr is just past the closing quote, so stepping back one character
       gives the end of the name. */
    if (encoding)
      *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
    if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) {
      *badPtr = ptr;
      return 0;
    }
    if (!name)
      return 1;
  }
  /* Whatever is left must be "standalone", and only in an XML declaration. */
  if (!XmlNameMatchesAscii(enc, name, "standalone") || isGeneralTextEntity) {
    *badPtr = name;
    return 0;
  }
  if (XmlNameMatchesAscii(enc, val, "yes")) {
    if (standalone)
      *standalone = 1;
  }
  else if (XmlNameMatchesAscii(enc, val, "no")) {
    if (standalone)
      *standalone = 0;
  }
  else {
    *badPtr = val;
    return 0;
  }
  /* Only white space may follow the standalone pseudo-attribute. */
  while (isSpace(toAscii(enc, ptr, end)))
    ptr += enc->minBytesPerChar;
  if (ptr != end) {
    *badPtr = ptr;
    return 0;
  }
  return 1;
}
|
|
1081
|
|
1082 static
|
|
1083 int checkCharRefNumber(int result)
|
|
1084 {
|
|
1085 switch (result >> 8) {
|
|
1086 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
|
|
1087 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
|
|
1088 return -1;
|
|
1089 case 0:
|
|
1090 if (latin1_encoding.type[result] == BT_NONXML)
|
|
1091 return -1;
|
|
1092 break;
|
|
1093 case 0xFF:
|
|
1094 if (result == 0xFFFE || result == 0xFFFF)
|
|
1095 return -1;
|
|
1096 break;
|
|
1097 }
|
|
1098 return result;
|
|
1099 }
|
|
1100
|
|
1101 int XmlUtf8Encode(int c, char *buf)
|
|
1102 {
|
|
1103 enum {
|
|
1104 /* minN is minimum legal resulting value for N byte sequence */
|
|
1105 min2 = 0x80,
|
|
1106 min3 = 0x800,
|
|
1107 min4 = 0x10000
|
|
1108 };
|
|
1109
|
|
1110 if (c < 0)
|
|
1111 return 0;
|
|
1112 if (c < min2) {
|
|
1113 buf[0] = (c | UTF8_cval1);
|
|
1114 return 1;
|
|
1115 }
|
|
1116 if (c < min3) {
|
|
1117 buf[0] = ((c >> 6) | UTF8_cval2);
|
|
1118 buf[1] = ((c & 0x3f) | 0x80);
|
|
1119 return 2;
|
|
1120 }
|
|
1121 if (c < min4) {
|
|
1122 buf[0] = ((c >> 12) | UTF8_cval3);
|
|
1123 buf[1] = (((c >> 6) & 0x3f) | 0x80);
|
|
1124 buf[2] = ((c & 0x3f) | 0x80);
|
|
1125 return 3;
|
|
1126 }
|
|
1127 if (c < 0x110000) {
|
|
1128 buf[0] = ((c >> 18) | UTF8_cval4);
|
|
1129 buf[1] = (((c >> 12) & 0x3f) | 0x80);
|
|
1130 buf[2] = (((c >> 6) & 0x3f) | 0x80);
|
|
1131 buf[3] = ((c & 0x3f) | 0x80);
|
|
1132 return 4;
|
|
1133 }
|
|
1134 return 0;
|
|
1135 }
|
|
1136
|
|
/* Encode the character number charNum as UTF-16 code units into buf.
   Returns 1 for values below 0x10000, 2 (a high/low surrogate pair)
   for values below 0x110000, and 0 when charNum is negative or too
   large.  Nothing is written when 0 is returned. */
int XmlUtf16Encode(int charNum, unsigned short *buf)
{
  int offset;

  if (charNum < 0 || charNum >= 0x110000)
    return 0;
  if (charNum < 0x10000) {
    buf[0] = (unsigned short)charNum;
    return 1;
  }
  /* Split the 20 bit offset above 0x10000 across the two surrogates. */
  offset = charNum - 0x10000;
  buf[0] = (unsigned short)((offset >> 10) + 0xD800);
  buf[1] = (unsigned short)((offset & 0x3FF) + 0xDC00);
  return 2;
}
|
|
1153
|
|
1154 struct unknown_encoding {
|
|
1155 struct normal_encoding normal;
|
|
1156 int (*convert)(void *userData, const char *p);
|
|
1157 void *userData;
|
|
1158 unsigned short utf16[256];
|
|
1159 char utf8[256][4];
|
|
1160 };
|
|
1161
|
|
1162 int XmlSizeOfUnknownEncoding()
|
|
1163 {
|
|
1164 return sizeof(struct unknown_encoding);
|
|
1165 }
|
|
1166
|
|
1167 static
|
|
1168 int unknown_isName(const ENCODING *enc, const char *p)
|
|
1169 {
|
|
1170 int c = ((const struct unknown_encoding *)enc)
|
|
1171 ->convert(((const struct unknown_encoding *)enc)->userData, p);
|
|
1172 if (c & ~0xFFFF)
|
|
1173 return 0;
|
|
1174 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
|
|
1175 }
|
|
1176
|
|
1177 static
|
|
1178 int unknown_isNmstrt(const ENCODING *enc, const char *p)
|
|
1179 {
|
|
1180 int c = ((const struct unknown_encoding *)enc)
|
|
1181 ->convert(((const struct unknown_encoding *)enc)->userData, p);
|
|
1182 if (c & ~0xFFFF)
|
|
1183 return 0;
|
|
1184 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
|
|
1185 }
|
|
1186
|
|
1187 static
|
|
1188 int unknown_isInvalid(const ENCODING *enc, const char *p)
|
|
1189 {
|
|
1190 int c = ((const struct unknown_encoding *)enc)
|
|
1191 ->convert(((const struct unknown_encoding *)enc)->userData, p);
|
|
1192 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
|
|
1193 }
|
|
1194
|
|
1195 static
|
|
1196 void unknown_toUtf8(const ENCODING *enc,
|
|
1197 const char **fromP, const char *fromLim,
|
|
1198 char **toP, const char *toLim)
|
|
1199 {
|
|
1200 char buf[XML_UTF8_ENCODE_MAX];
|
|
1201 for (;;) {
|
|
1202 const char *utf8;
|
|
1203 int n;
|
|
1204 if (*fromP == fromLim)
|
|
1205 break;
|
|
1206 utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP];
|
|
1207 n = *utf8++;
|
|
1208 if (n == 0) {
|
|
1209 int c = ((const struct unknown_encoding *)enc)
|
|
1210 ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
|
|
1211 n = XmlUtf8Encode(c, buf);
|
|
1212 if (n > toLim - *toP)
|
|
1213 break;
|
|
1214 utf8 = buf;
|
|
1215 *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
|
|
1216 - (BT_LEAD2 - 2);
|
|
1217 }
|
|
1218 else {
|
|
1219 if (n > toLim - *toP)
|
|
1220 break;
|
|
1221 (*fromP)++;
|
|
1222 }
|
|
1223 do {
|
|
1224 *(*toP)++ = *utf8++;
|
|
1225 } while (--n != 0);
|
|
1226 }
|
|
1227 }
|
|
1228
|
|
1229 static
|
|
1230 void unknown_toUtf16(const ENCODING *enc,
|
|
1231 const char **fromP, const char *fromLim,
|
|
1232 unsigned short **toP, const unsigned short *toLim)
|
|
1233 {
|
|
1234 while (*fromP != fromLim && *toP != toLim) {
|
|
1235 unsigned short c
|
|
1236 = ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP];
|
|
1237 if (c == 0) {
|
|
1238 c = (unsigned short)((const struct unknown_encoding *)enc)
|
|
1239 ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
|
|
1240 *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
|
|
1241 - (BT_LEAD2 - 2);
|
|
1242 }
|
|
1243 else
|
|
1244 (*fromP)++;
|
|
1245 *(*toP)++ = c;
|
|
1246 }
|
|
1247 }
|
|
1248
|
|
1249 ENCODING *
|
|
1250 XmlInitUnknownEncoding(void *mem,
|
|
1251 int *table,
|
|
1252 int (*convert)(void *userData, const char *p),
|
|
1253 void *userData)
|
|
1254 {
|
|
1255 int i;
|
|
1256 struct unknown_encoding *e = mem;
|
|
1257 for (i = 0; i < sizeof(struct normal_encoding); i++)
|
|
1258 ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
|
|
1259 for (i = 0; i < 128; i++)
|
|
1260 if (latin1_encoding.type[i] != BT_OTHER
|
|
1261 && latin1_encoding.type[i] != BT_NONXML
|
|
1262 && table[i] != i)
|
|
1263 return 0;
|
|
1264 for (i = 0; i < 256; i++) {
|
|
1265 int c = table[i];
|
|
1266 if (c == -1) {
|
|
1267 e->normal.type[i] = BT_MALFORM;
|
|
1268 /* This shouldn't really get used. */
|
|
1269 e->utf16[i] = 0xFFFF;
|
|
1270 e->utf8[i][0] = 1;
|
|
1271 e->utf8[i][1] = 0;
|
|
1272 }
|
|
1273 else if (c < 0) {
|
|
1274 if (c < -4)
|
|
1275 return 0;
|
|
1276 e->normal.type[i] = BT_LEAD2 - (c + 2);
|
|
1277 e->utf8[i][0] = 0;
|
|
1278 e->utf16[i] = 0;
|
|
1279 }
|
|
1280 else if (c < 0x80) {
|
|
1281 if (latin1_encoding.type[c] != BT_OTHER
|
|
1282 && latin1_encoding.type[c] != BT_NONXML
|
|
1283 && c != i)
|
|
1284 return 0;
|
|
1285 e->normal.type[i] = latin1_encoding.type[c];
|
|
1286 e->utf8[i][0] = 1;
|
|
1287 e->utf8[i][1] = (char)c;
|
|
1288 e->utf16[i] = c == 0 ? 0xFFFF : c;
|
|
1289 }
|
|
1290 else if (checkCharRefNumber(c) < 0) {
|
|
1291 e->normal.type[i] = BT_NONXML;
|
|
1292 /* This shouldn't really get used. */
|
|
1293 e->utf16[i] = 0xFFFF;
|
|
1294 e->utf8[i][0] = 1;
|
|
1295 e->utf8[i][1] = 0;
|
|
1296 }
|
|
1297 else {
|
|
1298 if (c > 0xFFFF)
|
|
1299 return 0;
|
|
1300 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
|
|
1301 e->normal.type[i] = BT_NMSTRT;
|
|
1302 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
|
|
1303 e->normal.type[i] = BT_NAME;
|
|
1304 else
|
|
1305 e->normal.type[i] = BT_OTHER;
|
|
1306 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
|
|
1307 e->utf16[i] = c;
|
|
1308 }
|
|
1309 }
|
|
1310 e->userData = userData;
|
|
1311 e->convert = convert;
|
|
1312 if (convert) {
|
|
1313 e->normal.isName2 = unknown_isName;
|
|
1314 e->normal.isName3 = unknown_isName;
|
|
1315 e->normal.isName4 = unknown_isName;
|
|
1316 e->normal.isNmstrt2 = unknown_isNmstrt;
|
|
1317 e->normal.isNmstrt3 = unknown_isNmstrt;
|
|
1318 e->normal.isNmstrt4 = unknown_isNmstrt;
|
|
1319 e->normal.isInvalid2 = unknown_isInvalid;
|
|
1320 e->normal.isInvalid3 = unknown_isInvalid;
|
|
1321 e->normal.isInvalid4 = unknown_isInvalid;
|
|
1322 }
|
|
1323 e->normal.enc.utf8Convert = unknown_toUtf8;
|
|
1324 e->normal.enc.utf16Convert = unknown_toUtf16;
|
|
1325 return &(e->normal.enc);
|
|
1326 }
|
|
1327
|
|
/* Indices of the built-in encodings.  getEncodingIndex and encodings
   depend on these values and their order; if this enumeration is
   changed they must be changed as well. */
enum {
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC = 1,
  UTF_8_ENC = 2,
  UTF_16_ENC = 3,
  UTF_16BE_ENC = 4,
  UTF_16LE_ENC = 5,
  /* must match encodingNames up to here */
  NO_ENC = 6
};
|
|
1341
|
|
1342 static
|
|
1343 int getEncodingIndex(const char *name)
|
|
1344 {
|
|
1345 static const char *encodingNames[] = {
|
|
1346 "ISO-8859-1",
|
|
1347 "US-ASCII",
|
|
1348 "UTF-8",
|
|
1349 "UTF-16",
|
|
1350 "UTF-16BE"
|
|
1351 "UTF-16LE",
|
|
1352 };
|
|
1353 int i;
|
|
1354 if (name == 0)
|
|
1355 return NO_ENC;
|
|
1356 for (i = 0; i < sizeof(encodingNames)/sizeof(encodingNames[0]); i++)
|
|
1357 if (streqci(name, encodingNames[i]))
|
|
1358 return i;
|
|
1359 return UNKNOWN_ENC;
|
|
1360 }
|
|
1361
|
|
/* The index of the encoding specified at initialization is kept in the
   isUtf16 member of the initial encoding; reusing that member keeps the
   structure binary compatible. */

#define INIT_ENC_INDEX(enc) ((enc)->initEnc.isUtf16)
|
|
1366
|
|
1367 /* This is what detects the encoding.
|
|
1368 encodingTable maps from encoding indices to encodings;
|
|
1369 INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding;
|
|
1370 state is XML_CONTENT_STATE if we're parsing an external text entity,
|
|
1371 and XML_PROLOG_STATE otherwise.
|
|
1372 */
|
|
1373
|
|
1374
|
|
1375 static
|
|
1376 int initScan(const ENCODING **encodingTable,
|
|
1377 const INIT_ENCODING *enc,
|
|
1378 int state,
|
|
1379 const char *ptr,
|
|
1380 const char *end,
|
|
1381 const char **nextTokPtr)
|
|
1382 {
|
|
1383 const ENCODING **encPtr;
|
|
1384
|
|
1385 if (ptr == end)
|
|
1386 return XML_TOK_NONE;
|
|
1387 encPtr = enc->encPtr;
|
|
1388 if (ptr + 1 == end) {
|
|
1389 /* only a single byte available for auto-detection */
|
|
1390 /* a well-formed document entity must have more than one byte */
|
|
1391 if (state != XML_CONTENT_STATE)
|
|
1392 return XML_TOK_PARTIAL;
|
|
1393 /* so we're parsing an external text entity... */
|
|
1394 /* if UTF-16 was externally specified, then we need at least 2 bytes */
|
|
1395 switch (INIT_ENC_INDEX(enc)) {
|
|
1396 case UTF_16_ENC:
|
|
1397 case UTF_16LE_ENC:
|
|
1398 case UTF_16BE_ENC:
|
|
1399 return XML_TOK_PARTIAL;
|
|
1400 }
|
|
1401 switch ((unsigned char)*ptr) {
|
|
1402 case 0xFE:
|
|
1403 case 0xFF:
|
|
1404 case 0xEF: /* possibly first byte of UTF-8 BOM */
|
|
1405 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
|
|
1406 && state == XML_CONTENT_STATE)
|
|
1407 break;
|
|
1408 /* fall through */
|
|
1409 case 0x00:
|
|
1410 case 0x3C:
|
|
1411 return XML_TOK_PARTIAL;
|
|
1412 }
|
|
1413 }
|
|
1414 else {
|
|
1415 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
|
|
1416 case 0xFEFF:
|
|
1417 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
|
|
1418 && state == XML_CONTENT_STATE)
|
|
1419 break;
|
|
1420 *nextTokPtr = ptr + 2;
|
|
1421 *encPtr = encodingTable[UTF_16BE_ENC];
|
|
1422 return XML_TOK_BOM;
|
|
1423 /* 00 3C is handled in the default case */
|
|
1424 case 0x3C00:
|
|
1425 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
|
|
1426 || INIT_ENC_INDEX(enc) == UTF_16_ENC)
|
|
1427 && state == XML_CONTENT_STATE)
|
|
1428 break;
|
|
1429 *encPtr = encodingTable[UTF_16LE_ENC];
|
|
1430 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
|
|
1431 case 0xFFFE:
|
|
1432 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
|
|
1433 && state == XML_CONTENT_STATE)
|
|
1434 break;
|
|
1435 *nextTokPtr = ptr + 2;
|
|
1436 *encPtr = encodingTable[UTF_16LE_ENC];
|
|
1437 return XML_TOK_BOM;
|
|
1438 case 0xEFBB:
|
|
1439 /* Maybe a UTF-8 BOM (EF BB BF) */
|
|
1440 /* If there's an explicitly specified (external) encoding
|
|
1441 of ISO-8859-1 or some flavour of UTF-16
|
|
1442 and this is an external text entity,
|
|
1443 don't look for the BOM,
|
|
1444 because it might be a legal data. */
|
|
1445 if (state == XML_CONTENT_STATE) {
|
|
1446 int e = INIT_ENC_INDEX(enc);
|
|
1447 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC || e == UTF_16_ENC)
|
|
1448 break;
|
|
1449 }
|
|
1450 if (ptr + 2 == end)
|
|
1451 return XML_TOK_PARTIAL;
|
|
1452 if ((unsigned char)ptr[2] == 0xBF) {
|
|
1453 *encPtr = encodingTable[UTF_8_ENC];
|
|
1454 return XML_TOK_BOM;
|
|
1455 }
|
|
1456 break;
|
|
1457 default:
|
|
1458 if (ptr[0] == '\0') {
|
|
1459 /* 0 isn't a legal data character. Furthermore a document entity can only
|
|
1460 start with ASCII characters. So the only way this can fail to be big-endian
|
|
1461 UTF-16 if it it's an external parsed general entity that's labelled as
|
|
1462 UTF-16LE. */
|
|
1463 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
|
|
1464 break;
|
|
1465 *encPtr = encodingTable[UTF_16BE_ENC];
|
|
1466 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
|
|
1467 }
|
|
1468 else if (ptr[1] == '\0') {
|
|
1469 /* We could recover here in the case:
|
|
1470 - parsing an external entity
|
|
1471 - second byte is 0
|
|
1472 - no externally specified encoding
|
|
1473 - no encoding declaration
|
|
1474 by assuming UTF-16LE. But we don't, because this would mean when
|
|
1475 presented just with a single byte, we couldn't reliably determine
|
|
1476 whether we needed further bytes. */
|
|
1477 if (state == XML_CONTENT_STATE)
|
|
1478 break;
|
|
1479 *encPtr = encodingTable[UTF_16LE_ENC];
|
|
1480 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
|
|
1481 }
|
|
1482 break;
|
|
1483 }
|
|
1484 }
|
3127
|
1485 *encPtr = encodingTable[(int)INIT_ENC_INDEX(enc)];
|
2086
|
1486 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
|
|
1487 }
|
|
1488
|
|
1489
|
|
1490 #define NS(x) x
|
|
1491 #define ns(x) x
|
|
1492 #include "xmltok_ns.c"
|
|
1493 #undef NS
|
|
1494 #undef ns
|
|
1495
|
|
1496 #ifdef XML_NS
|
|
1497
|
|
1498 #define NS(x) x ## NS
|
|
1499 #define ns(x) x ## _ns
|
|
1500
|
|
1501 #include "xmltok_ns.c"
|
|
1502
|
|
1503 #undef NS
|
|
1504 #undef ns
|
|
1505
|
|
1506 ENCODING *
|
|
1507 XmlInitUnknownEncodingNS(void *mem,
|
|
1508 int *table,
|
|
1509 int (*convert)(void *userData, const char *p),
|
|
1510 void *userData)
|
|
1511 {
|
|
1512 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
|
|
1513 if (enc)
|
|
1514 ((struct normal_encoding *)enc)->type[':'] = BT_COLON;
|
|
1515 return enc;
|
|
1516 }
|
|
1517
|
|
1518 #endif /* XML_NS */
|