Mercurial > pidgin.yaz
comparison plugins/jabber/xmltok.c @ 1347:afa63ac2fd84
[gaim-migrate @ 1357]
jabber for those not fortunate enough to have libjabber and libxode on their systems
committer: Tailor Script <tailor@pidgin.im>
author | Eric Warmenhoven <eric@warmenhoven.org> |
---|---|
date | Thu, 21 Dec 2000 14:54:13 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
1346:83f78eb7c472 | 1347:afa63ac2fd84 |
---|---|
1 /* | |
2 The contents of this file are subject to the Mozilla Public License | |
3 Version 1.1 (the "License"); you may not use this file except in | |
4 compliance with the License. You may obtain a copy of the License at | |
5 http://www.mozilla.org/MPL/ | |
6 | |
7 Software distributed under the License is distributed on an "AS IS" | |
8 basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the | |
9 License for the specific language governing rights and limitations | |
10 under the License. | |
11 | |
12 The Original Code is expat. | |
13 | |
14 The Initial Developer of the Original Code is James Clark. | |
15 Portions created by James Clark are Copyright (C) 1998, 1999 | |
16 James Clark. All Rights Reserved. | |
17 | |
18 Contributor(s): | |
19 | |
20 Alternatively, the contents of this file may be used under the terms | |
21 of the GNU General Public License (the "GPL"), in which case the | |
22 provisions of the GPL are applicable instead of those above. If you | |
23 wish to allow use of your version of this file only under the terms of | |
24 the GPL and not to allow others to use your version of this file under | |
25 the MPL, indicate your decision by deleting the provisions above and | |
26 replace them with the notice and other provisions required by the | |
27 GPL. If you do not delete the provisions above, a recipient may use | |
28 your version of this file under either the MPL or the GPL. | |
29 */ | |
30 | |
31 #include "xmldef.h" | |
32 #include "xmltok.h" | |
33 #include "nametab.h" | |
34 | |
/* VTABLE1 expands to the leading initializers of an ENCODING: two
   brace-enclosed groups of *Tok functions followed by the helper entry
   points, each named through whatever PREFIX() is defined at the point of
   use. */
35 #define VTABLE1 \ | |
36 { PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok) }, \ | |
37 { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \ | |
38 PREFIX(sameName), \ | |
39 PREFIX(nameMatchesAscii), \ | |
40 PREFIX(nameLength), \ | |
41 PREFIX(skipS), \ | |
42 PREFIX(getAtts), \ | |
43 PREFIX(charRefNumber), \ | |
44 PREFIX(predefinedEntityName), \ | |
45 PREFIX(updatePosition), \ | |
46 PREFIX(isPublicId) | |
47 | |
/* VTABLE is VTABLE1 plus the PREFIX()-named toUtf8 and toUtf16 converters. */
48 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16) | |
49 | |
/* Nonzero when the UCS-2 character with high byte `hi` and low byte `lo`
   has its bit set in namingBitmap.  pages[hi] selects a group of 8 words,
   the top 3 bits of `lo` select the word inside that group and the low
   5 bits select the bit.  The mask is built from an unsigned 1 so that
   selecting bit 31 is well defined. */
#define UCS2_GET_NAMING(pages, hi, lo) \
  (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
52 | |
/* A 2 byte UTF-8 representation splits the character's 11 bits
   between the bottom 5 and 6 bits of the bytes.
   We need 8 bits to index into pages, 3 bits to add to that index and
   5 bits to generate the mask.
   `byte` must point to unsigned char.  The mask uses an unsigned 1 so that
   bit 31 can be selected without shifting into the sign bit of an int. */
#define UTF8_GET_NAMING2(pages, byte) \
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                + ((((byte)[0]) & 3) << 1) \
                + ((((byte)[1]) >> 5) & 1)] \
   & (1u << (((byte)[1]) & 0x1F)))
62 | |
/* A 3 byte UTF-8 representation splits the character's 16 bits
   between the bottom 4, 6 and 6 bits of the bytes.
   We need 8 bits to index into pages, 3 bits to add to that index and
   5 bits to generate the mask.
   `byte` must point to unsigned char.  The mask uses an unsigned 1 so that
   bit 31 can be selected without shifting into the sign bit of an int. */
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                         + ((((byte)[1]) >> 2) & 0xF)] \
                 << 3) \
                + ((((byte)[1]) & 3) << 1) \
                + ((((byte)[2]) >> 5) & 1)] \
   & (1u << (((byte)[2]) & 0x1F)))
74 | |
/* Dispatch on the sequence length n: 2- and 3-byte sequences at p are looked
   up in namingBitmap through the given page table; any other length
   evaluates to 0. */
75 #define UTF8_GET_NAMING(pages, p, n) \ | |
76 ((n) == 2 \ | |
77 ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \ | |
78 : ((n) == 3 \ | |
79 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \ | |
80 : 0)) | |
81 | |
/* Nonzero when the 3-byte UTF-8 sequence at p is rejected: a lead byte of
   0xED whose second byte has bit 0x20 set (the encodings of U+D800..U+DFFF),
   or the sequences EF BF BE / EF BF BF (U+FFFE / U+FFFF).
   p must point to unsigned char.  Every use of p is parenthesized so that an
   expression argument such as `buf + 1` expands correctly. */
#define UTF8_INVALID3(p) \
  ((*(p)) == 0xED \
   ? (((p)[1] & 0x20) != 0) \
   : ((*(p)) == 0xEF \
      ? ((p)[1] == 0xBF && ((p)[2] == 0xBF || (p)[2] == 0xBE)) \
      : 0))
88 | |
/* Nonzero when the 4-byte UTF-8 sequence at p has lead byte 0xF4 and a second
   byte with either of the 0x30 bits set, i.e. encodes a value above U+10FFFF.
   p must point to unsigned char; it is parenthesized so that expression
   arguments expand correctly. */
#define UTF8_INVALID4(p) ((*(p)) == 0xF4 && ((p)[1] & 0x30) != 0)
90 | |
91 static | |
92 int isNever(const ENCODING *enc, const char *p) | |
93 { | |
94 return 0; | |
95 } | |
96 | |
97 static | |
98 int utf8_isName2(const ENCODING *enc, const char *p) | |
99 { | |
100 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p); | |
101 } | |
102 | |
103 static | |
104 int utf8_isName3(const ENCODING *enc, const char *p) | |
105 { | |
106 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p); | |
107 } | |
108 | |
109 #define utf8_isName4 isNever | |
110 | |
111 static | |
112 int utf8_isNmstrt2(const ENCODING *enc, const char *p) | |
113 { | |
114 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p); | |
115 } | |
116 | |
117 static | |
118 int utf8_isNmstrt3(const ENCODING *enc, const char *p) | |
119 { | |
120 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p); | |
121 } | |
122 | |
123 #define utf8_isNmstrt4 isNever | |
124 | |
125 #define utf8_isInvalid2 isNever | |
126 | |
127 static | |
128 int utf8_isInvalid3(const ENCODING *enc, const char *p) | |
129 { | |
130 return UTF8_INVALID3((const unsigned char *)p); | |
131 } | |
132 | |
133 static | |
134 int utf8_isInvalid4(const ENCODING *enc, const char *p) | |
135 { | |
136 return UTF8_INVALID4((const unsigned char *)p); | |
137 } | |
138 | |
/* An ENCODING extended with a per-byte classification table and the
   character-class callbacks that the scanner macros in this file call.
   `enc` is the first member; the macros below cast an ENCODING pointer to a
   struct normal_encoding pointer to reach the other members. */
139 struct normal_encoding { | |
140 ENCODING enc; | |
/* byte type (BT_* value) for each possible byte value */
141 unsigned char type[256]; | |
/* Extra indirections present only in the XML_MIN_SIZE build. */
142 #ifdef XML_MIN_SIZE | |
143 int (*byteType)(const ENCODING *, const char *); | |
144 int (*isNameMin)(const ENCODING *, const char *); | |
145 int (*isNmstrtMin)(const ENCODING *, const char *); | |
146 int (*byteToAscii)(const ENCODING *, const char *); | |
147 int (*charMatches)(const ENCODING *, const char *, int); | |
148 #endif /* XML_MIN_SIZE */ | |
/* Tests for 2-, 3- and 4-byte characters; filled in by NORMAL_VTABLE. */
149 int (*isName2)(const ENCODING *, const char *); | |
150 int (*isName3)(const ENCODING *, const char *); | |
151 int (*isName4)(const ENCODING *, const char *); | |
152 int (*isNmstrt2)(const ENCODING *, const char *); | |
153 int (*isNmstrt3)(const ENCODING *, const char *); | |
154 int (*isNmstrt4)(const ENCODING *, const char *); | |
155 int (*isInvalid2)(const ENCODING *, const char *); | |
156 int (*isInvalid3)(const ENCODING *, const char *); | |
157 int (*isInvalid4)(const ENCODING *, const char *); | |
158 }; | |
159 | |
/* STANDARD_VTABLE(E) supplies initializers for the XML_MIN_SIZE-only members
   of struct normal_encoding, using functions whose names start with E.
   The expansion ends with a comma so another initializer list can follow it
   directly.  Without XML_MIN_SIZE those members do not exist and the macro
   expands to nothing. */
160 #ifdef XML_MIN_SIZE | |
161 | |
162 #define STANDARD_VTABLE(E) \ | |
163 E ## byteType, \ | |
164 E ## isNameMin, \ | |
165 E ## isNmstrtMin, \ | |
166 E ## byteToAscii, \ | |
167 E ## charMatches, | |
168 | |
169 #else | |
170 | |
171 #define STANDARD_VTABLE(E) /* as nothing */ | |
172 | |
173 #endif | |
174 | |
/* NORMAL_VTABLE(E) supplies initializers for the nine multi-byte test
   pointers of struct normal_encoding (isName2 .. isInvalid4), in declaration
   order, using functions or aliases whose names start with E. */
175 #define NORMAL_VTABLE(E) \ | |
176 E ## isName2, \ | |
177 E ## isName3, \ | |
178 E ## isName4, \ | |
179 E ## isNmstrt2, \ | |
180 E ## isNmstrt3, \ | |
181 E ## isNmstrt4, \ | |
182 E ## isInvalid2, \ | |
183 E ## isInvalid3, \ | |
184 E ## isInvalid4 | |
185 | |
/* Forward declaration; the definition appears further down in this file. */
186 static int checkCharRefNumber(int); | |
187 | |
188 #include "xmltok_impl.h" | |
189 | |
/* In the XML_MIN_SIZE build the single-byte (sb_) tables fill the isNameMin
   and isNmstrtMin slots with isNever. */
190 #ifdef XML_MIN_SIZE | |
191 #define sb_isNameMin isNever | |
192 #define sb_isNmstrtMin isNever | |
193 #endif | |
194 | |
/* MINBPC for the first instantiation of xmltok_impl.c: read from the
   encoding under XML_MIN_SIZE, otherwise the constant 1. */
195 #ifdef XML_MIN_SIZE | |
196 #define MINBPC(enc) ((enc)->minBytesPerChar) | |
197 #else | |
198 /* minimum bytes per character */ | |
199 #define MINBPC(enc) 1 | |
200 #endif | |
201 | |
/* Classify the byte at p using the 256-entry type table of the
   normal_encoding that enc points to. */
202 #define SB_BYTE_TYPE(enc, p) \ | |
203 (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)]) | |
204 | |
/* BYTE_TYPE: under XML_MIN_SIZE it is an indirect call through the byteType
   pointer (sb_byteType performs the table lookup); otherwise it expands
   directly to the table lookup. */
205 #ifdef XML_MIN_SIZE | |
206 static | |
207 int sb_byteType(const ENCODING *enc, const char *p) | |
208 { | |
209 return SB_BYTE_TYPE(enc, p); | |
210 } | |
211 #define BYTE_TYPE(enc, p) \ | |
212 (((const struct normal_encoding *)(enc))->byteType(enc, p)) | |
213 #else | |
214 #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p) | |
215 #endif | |
216 | |
/* BYTE_TO_ASCII(enc, p) yields the byte at p.  Under XML_MIN_SIZE it is an
   indirect call through the byteToAscii pointer of the normal_encoding
   (sb_byteToAscii for single-byte tables); otherwise it reads the byte
   directly.  The argument is parenthesized so that pointer expressions such
   as `s + 1` dereference the intended address. */
#ifdef XML_MIN_SIZE
#define BYTE_TO_ASCII(enc, p) \
  (((const struct normal_encoding *)(enc))->byteToAscii(enc, p))
static
int sb_byteToAscii(const ENCODING *enc, const char *p)
{
  return *p;
}
#else
#define BYTE_TO_ASCII(enc, p) (*(p))
#endif
228 | |
/* n-byte character tests: call the isName<n> / isNmstrt<n> / isInvalid<n>
   pointer stored in the normal_encoding that enc points to (n is pasted
   into the member name). */
229 #define IS_NAME_CHAR(enc, p, n) \ | |
230 (((const struct normal_encoding *)(enc))->isName ## n(enc, p)) | |
231 #define IS_NMSTRT_CHAR(enc, p, n) \ | |
232 (((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p)) | |
233 #define IS_INVALID_CHAR(enc, p, n) \ | |
234 (((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p)) | |
235 | |
/* Tests for a character of exactly MINBPC bytes: dispatched through the
   isNameMin / isNmstrtMin pointers under XML_MIN_SIZE, constant 0
   otherwise. */
236 #ifdef XML_MIN_SIZE | |
237 #define IS_NAME_CHAR_MINBPC(enc, p) \ | |
238 (((const struct normal_encoding *)(enc))->isNameMin(enc, p)) | |
239 #define IS_NMSTRT_CHAR_MINBPC(enc, p) \ | |
240 (((const struct normal_encoding *)(enc))->isNmstrtMin(enc, p)) | |
241 #else | |
242 #define IS_NAME_CHAR_MINBPC(enc, p) (0) | |
243 #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0) | |
244 #endif | |
245 | |
/* CHAR_MATCHES(enc, p, c) is nonzero when the character at p equals c.
   Under XML_MIN_SIZE it is an indirect call through the charMatches pointer
   of the normal_encoding (sb_charMatches for single-byte tables); otherwise
   it compares the byte directly.  Both p and c are parenthesized so that
   expression arguments (for example a conditional expression for c) keep
   their intended grouping. */
#ifdef XML_MIN_SIZE
#define CHAR_MATCHES(enc, p, c) \
  (((const struct normal_encoding *)(enc))->charMatches(enc, p, c))
static
int sb_charMatches(const ENCODING *enc, const char *p, int c)
{
  return *p == c;
}
#else
/* c is an ASCII character */
#define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
258 | |
/* First inclusion of xmltok_impl.c, with PREFIX() producing normal_<ident>
   names and the macro bindings set up above in effect.  The bindings are
   removed afterwards so they can be redefined before later inclusions.
   NOTE(review): xmltok_impl.c presumably defines the PREFIX()-named
   functions listed in VTABLE1 -- confirm against that file. */
259 #define PREFIX(ident) normal_ ## ident | |
260 #include "xmltok_impl.c" | |
261 | |
262 #undef MINBPC | |
263 #undef BYTE_TYPE | |
264 #undef BYTE_TO_ASCII | |
265 #undef CHAR_MATCHES | |
266 #undef IS_NAME_CHAR | |
267 #undef IS_NAME_CHAR_MINBPC | |
268 #undef IS_NMSTRT_CHAR | |
269 #undef IS_NMSTRT_CHAR_MINBPC | |
270 #undef IS_INVALID_CHAR | |
271 | |
/* UTF8_cvalN is the value of the masked first byte of an N-byte UTF-8
   sequence, i.e. the marker bits that are OR-ed into the lead byte. */
enum {
  UTF8_cval1 = 0x00,  /* 0xxxxxxx */
  UTF8_cval2 = 0xC0,  /* 110xxxxx */
  UTF8_cval3 = 0xE0,  /* 1110xxxx */
  UTF8_cval4 = 0xF0   /* 11110xxx */
};
278 | |
279 static | |
280 void utf8_toUtf8(const ENCODING *enc, | |
281 const char **fromP, const char *fromLim, | |
282 char **toP, const char *toLim) | |
283 { | |
284 char *to; | |
285 const char *from; | |
286 if (fromLim - *fromP > toLim - *toP) { | |
287 /* Avoid copying partial characters. */ | |
288 for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--) | |
289 if (((unsigned char)fromLim[-1] & 0xc0) != 0x80) | |
290 break; | |
291 } | |
292 for (to = *toP, from = *fromP; from != fromLim; from++, to++) | |
293 *to = *from; | |
294 *fromP = from; | |
295 *toP = to; | |
296 } | |
297 | |
298 static | |
299 void utf8_toUtf16(const ENCODING *enc, | |
300 const char **fromP, const char *fromLim, | |
301 unsigned short **toP, const unsigned short *toLim) | |
302 { | |
303 unsigned short *to = *toP; | |
304 const char *from = *fromP; | |
305 while (from != fromLim && to != toLim) { | |
306 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) { | |
307 case BT_LEAD2: | |
308 *to++ = ((from[0] & 0x1f) << 6) | (from[1] & 0x3f); | |
309 from += 2; | |
310 break; | |
311 case BT_LEAD3: | |
312 *to++ = ((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f); | |
313 from += 3; | |
314 break; | |
315 case BT_LEAD4: | |
316 { | |
317 unsigned long n; | |
318 if (to + 1 == toLim) | |
319 break; | |
320 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f); | |
321 n -= 0x10000; | |
322 to[0] = (unsigned short)((n >> 10) | 0xD800); | |
323 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00); | |
324 to += 2; | |
325 from += 4; | |
326 } | |
327 break; | |
328 default: | |
329 *to++ = *from++; | |
330 break; | |
331 } | |
332 } | |
333 *fromP = from; | |
334 *toP = to; | |
335 } | |
336 | |
/* UTF-8 encoding descriptors.  Each pairs the normal_ scanners (VTABLE1) and
   the UTF-8 converters with a byte-type table built from asciitab.h followed
   by utf8tab.h, plus the sb_ / utf8_ callbacks.
   The _ns variant exists only under XML_NS and includes asciitab.h without
   touching BT_COLON; the plain variant maps BT_COLON to BT_NMSTRT while
   asciitab.h is included.
   NOTE(review): the trailing "1, 1, 0" presumably initialises
   minBytesPerChar, isUtf8 and isUtf16 of ENCODING -- confirm in xmltok.h. */
337 #ifdef XML_NS | |
338 static const struct normal_encoding utf8_encoding_ns = { | |
339 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, | |
340 { | |
341 #include "asciitab.h" | |
342 #include "utf8tab.h" | |
343 }, | |
344 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) | |
345 }; | |
346 #endif | |
347 | |
348 static const struct normal_encoding utf8_encoding = { | |
349 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, | |
350 { | |
351 #define BT_COLON BT_NMSTRT | |
352 #include "asciitab.h" | |
353 #undef BT_COLON | |
354 #include "utf8tab.h" | |
355 }, | |
356 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) | |
357 }; | |
358 | |
/* "Internal" UTF-8 descriptors: identical in shape to utf8_encoding(_ns)
   but the first half of the byte-type table comes from iasciitab.h instead
   of asciitab.h.  The _ns variant exists only under XML_NS; the plain
   variant maps BT_COLON to BT_NMSTRT while the table header is included. */
359 #ifdef XML_NS | |
360 | |
361 static const struct normal_encoding internal_utf8_encoding_ns = { | |
362 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, | |
363 { | |
364 #include "iasciitab.h" | |
365 #include "utf8tab.h" | |
366 }, | |
367 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) | |
368 }; | |
369 | |
370 #endif | |
371 | |
372 static const struct normal_encoding internal_utf8_encoding = { | |
373 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, | |
374 { | |
375 #define BT_COLON BT_NMSTRT | |
376 #include "iasciitab.h" | |
377 #undef BT_COLON | |
378 #include "utf8tab.h" | |
379 }, | |
380 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) | |
381 }; | |
382 | |
383 static | |
384 void latin1_toUtf8(const ENCODING *enc, | |
385 const char **fromP, const char *fromLim, | |
386 char **toP, const char *toLim) | |
387 { | |
388 for (;;) { | |
389 unsigned char c; | |
390 if (*fromP == fromLim) | |
391 break; | |
392 c = (unsigned char)**fromP; | |
393 if (c & 0x80) { | |
394 if (toLim - *toP < 2) | |
395 break; | |
396 *(*toP)++ = ((c >> 6) | UTF8_cval2); | |
397 *(*toP)++ = ((c & 0x3f) | 0x80); | |
398 (*fromP)++; | |
399 } | |
400 else { | |
401 if (*toP == toLim) | |
402 break; | |
403 *(*toP)++ = *(*fromP)++; | |
404 } | |
405 } | |
406 } | |
407 | |
408 static | |
409 void latin1_toUtf16(const ENCODING *enc, | |
410 const char **fromP, const char *fromLim, | |
411 unsigned short **toP, const unsigned short *toLim) | |
412 { | |
413 while (*fromP != fromLim && *toP != toLim) | |
414 *(*toP)++ = (unsigned char)*(*fromP)++; | |
415 } | |
416 | |
/* Latin-1 encoding descriptors: normal_ scanners with the latin1 converters
   and a byte-type table built from asciitab.h followed by latin1tab.h.
   Only STANDARD_VTABLE(sb_) is supplied, so the isName2 .. isInvalid4
   pointers are left to default (null) initialisation.
   The _ns variant exists only under XML_NS; the plain variant maps BT_COLON
   to BT_NMSTRT while asciitab.h is included.
   NOTE(review): the trailing "1, 0, 0" presumably initialises
   minBytesPerChar, isUtf8 and isUtf16 of ENCODING -- confirm in xmltok.h. */
417 #ifdef XML_NS | |
418 | |
419 static const struct normal_encoding latin1_encoding_ns = { | |
420 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 }, | |
421 { | |
422 #include "asciitab.h" | |
423 #include "latin1tab.h" | |
424 }, | |
425 STANDARD_VTABLE(sb_) | |
426 }; | |
427 | |
428 #endif | |
429 | |
430 static const struct normal_encoding latin1_encoding = { | |
431 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 }, | |
432 { | |
433 #define BT_COLON BT_NMSTRT | |
434 #include "asciitab.h" | |
435 #undef BT_COLON | |
436 #include "latin1tab.h" | |
437 }, | |
438 STANDARD_VTABLE(sb_) | |
439 }; | |
440 | |
441 static | |
442 void ascii_toUtf8(const ENCODING *enc, | |
443 const char **fromP, const char *fromLim, | |
444 char **toP, const char *toLim) | |
445 { | |
446 while (*fromP != fromLim && *toP != toLim) | |
447 *(*toP)++ = *(*fromP)++; | |
448 } | |
449 | |
/* ASCII encoding descriptors: normal_ scanners, ascii_toUtf8 for UTF-8
   output and latin1_toUtf16 for UTF-16 output.  The byte-type table is
   initialised from asciitab.h only; the remaining entries are left at zero,
   which the in-table comment identifies as BT_NONXML.
   The _ns variant exists only under XML_NS; the plain variant maps BT_COLON
   to BT_NMSTRT while asciitab.h is included. */
450 #ifdef XML_NS | |
451 | |
452 static const struct normal_encoding ascii_encoding_ns = { | |
453 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 }, | |
454 { | |
455 #include "asciitab.h" | |
456 /* BT_NONXML == 0 */ | |
457 }, | |
458 STANDARD_VTABLE(sb_) | |
459 }; | |
460 | |
461 #endif | |
462 | |
463 static const struct normal_encoding ascii_encoding = { | |
464 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 }, | |
465 { | |
466 #define BT_COLON BT_NMSTRT | |
467 #include "asciitab.h" | |
468 #undef BT_COLON | |
469 /* BT_NONXML == 0 */ | |
470 }, | |
471 STANDARD_VTABLE(sb_) | |
472 }; | |
473 | |
474 static int unicode_byte_type(char hi, char lo) | |
475 { | |
476 switch ((unsigned char)hi) { | |
477 case 0xD8: case 0xD9: case 0xDA: case 0xDB: | |
478 return BT_LEAD4; | |
479 case 0xDC: case 0xDD: case 0xDE: case 0xDF: | |
480 return BT_TRAIL; | |
481 case 0xFF: | |
482 switch ((unsigned char)lo) { | |
483 case 0xFF: | |
484 case 0xFE: | |
485 return BT_NONXML; | |
486 } | |
487 break; | |
488 } | |
489 return BT_NONASCII; | |
490 } | |
491 | |
/* DEFINE_UTF16_TO_UTF8(E) defines the function E##toUtf8, which converts
   2-byte code units read through GET_LO / GET_HI into UTF-8.
     - hi == 0 and lo < 0x80: one output byte;
     - hi 0x00..0x07 otherwise: two output bytes;
     - hi 0xD8..0xDB: combined with the following code unit into four
       output bytes;
     - every other hi value (including 0xDC..0xDF): three output bytes.
   Before each character the remaining output space is checked; if it is
   too small, *fromP is set to the unconverted unit and the function
   returns.  Otherwise *fromP is set to the final position after the loop.
   GET_LO and GET_HI must be defined where the macro is expanded.
   NOTE(review): in the 0xD8..0xDB case the second code unit is read after
   `from += 2` without comparing against fromLim -- callers presumably never
   pass a split surrogate pair; confirm. */
492 #define DEFINE_UTF16_TO_UTF8(E) \ | |
493 static \ | |
494 void E ## toUtf8(const ENCODING *enc, \ | |
495 const char **fromP, const char *fromLim, \ | |
496 char **toP, const char *toLim) \ | |
497 { \ | |
498 const char *from; \ | |
499 for (from = *fromP; from != fromLim; from += 2) { \ | |
500 int plane; \ | |
501 unsigned char lo2; \ | |
502 unsigned char lo = GET_LO(from); \ | |
503 unsigned char hi = GET_HI(from); \ | |
504 switch (hi) { \ | |
505 case 0: \ | |
506 if (lo < 0x80) { \ | |
507 if (*toP == toLim) { \ | |
508 *fromP = from; \ | |
509 return; \ | |
510 } \ | |
511 *(*toP)++ = lo; \ | |
512 break; \ | |
513 } \ | |
514 /* fall through */ \ | |
515 case 0x1: case 0x2: case 0x3: \ | |
516 case 0x4: case 0x5: case 0x6: case 0x7: \ | |
517 if (toLim - *toP < 2) { \ | |
518 *fromP = from; \ | |
519 return; \ | |
520 } \ | |
521 *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \ | |
522 *(*toP)++ = ((lo & 0x3f) | 0x80); \ | |
523 break; \ | |
524 default: \ | |
525 if (toLim - *toP < 3) { \ | |
526 *fromP = from; \ | |
527 return; \ | |
528 } \ | |
529 /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \ | |
530 *(*toP)++ = ((hi >> 4) | UTF8_cval3); \ | |
531 *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \ | |
532 *(*toP)++ = ((lo & 0x3f) | 0x80); \ | |
533 break; \ | |
534 case 0xD8: case 0xD9: case 0xDA: case 0xDB: \ | |
535 if (toLim - *toP < 4) { \ | |
536 *fromP = from; \ | |
537 return; \ | |
538 } \ | |
539 plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \ | |
540 *(*toP)++ = ((plane >> 2) | UTF8_cval4); \ | |
541 *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \ | |
542 from += 2; \ | |
543 lo2 = GET_LO(from); \ | |
544 *(*toP)++ = (((lo & 0x3) << 4) \ | |
545 | ((GET_HI(from) & 0x3) << 2) \ | |
546 | (lo2 >> 6) \ | |
547 | 0x80); \ | |
548 *(*toP)++ = ((lo2 & 0x3f) | 0x80); \ | |
549 break; \ | |
550 } \ | |
551 } \ | |
552 *fromP = from; \ | |
553 } | |
554 | |
/* DEFINE_UTF16_TO_UTF16(E) defines the function E##toUtf16, which copies
   2-byte code units from the input into unsigned short output units,
   assembling each unit as (hi << 8) | lo through GET_HI / GET_LO, until the
   input or the output is exhausted.  *fromP and *toP are advanced in place.
   When the input holds more bytes than twice the remaining output units and
   the last input unit has a hi byte in 0xD8..0xDF, that unit is excluded
   from this call.
   NOTE(review): the surrogate test inspects the unit at fromLim - 2, i.e.
   the end of the input rather than the point where the output fills up. */
555 #define DEFINE_UTF16_TO_UTF16(E) \ | |
556 static \ | |
557 void E ## toUtf16(const ENCODING *enc, \ | |
558 const char **fromP, const char *fromLim, \ | |
559 unsigned short **toP, const unsigned short *toLim) \ | |
560 { \ | |
561 /* Avoid copying first half only of surrogate */ \ | |
562 if (fromLim - *fromP > ((toLim - *toP) << 1) \ | |
563 && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \ | |
564 fromLim -= 2; \ | |
565 for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \ | |
566 *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \ | |
567 } | |
568 | |
/* Little-endian byte accessors (low byte first), used to instantiate
   little2_toUtf8 and little2_toUtf16; they are undefined again right after
   the instantiation. */
569 #define SET2(ptr, ch) \ | |
570 (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8))) | |
571 #define GET_LO(ptr) ((unsigned char)(ptr)[0]) | |
572 #define GET_HI(ptr) ((unsigned char)(ptr)[1]) | |
573 | |
574 DEFINE_UTF16_TO_UTF8(little2_) | |
575 DEFINE_UTF16_TO_UTF16(little2_) | |
576 | |
577 #undef SET2 | |
578 #undef GET_LO | |
579 #undef GET_HI | |
580 | |
/* Big-endian byte accessors (high byte first), used to instantiate
   big2_toUtf8 and big2_toUtf16; they are undefined again right after the
   instantiation. */
581 #define SET2(ptr, ch) \ | |
582 (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF))) | |
583 #define GET_LO(ptr) ((unsigned char)(ptr)[1]) | |
584 #define GET_HI(ptr) ((unsigned char)(ptr)[0]) | |
585 | |
586 DEFINE_UTF16_TO_UTF8(big2_) | |
587 DEFINE_UTF16_TO_UTF16(big2_) | |
588 | |
589 #undef SET2 | |
590 #undef GET_LO | |
591 #undef GET_HI | |
592 | |
/* Character tests for 2-byte little-endian input: p[0] is the low byte and
   p[1] the high byte of the code unit at p.  Every macro argument is
   parenthesized so that expression arguments keep their grouping. */

/* Byte type of the unit at p: the type table of the normal_encoding that
   enc points to when the high byte is 0, unicode_byte_type() otherwise. */
#define LITTLE2_BYTE_TYPE(enc, p) \
  ((p)[1] == 0 \
   ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
   : unicode_byte_type((p)[1], (p)[0]))
/* The low byte when the high byte is 0, otherwise -1. */
#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
/* Nonzero when the unit at p has high byte 0 and low byte equal to c. */
#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == (c))
/* namingBitmap lookups of the unit at p via namePages / nmstrtPages. */
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
603 | |
/* Binding of the scanner to little-endian 2-byte input.
   With XML_MIN_SIZE: the LITTLE2_* tests are wrapped in functions that the
   normal_ scanners reach through the normal_encoding function pointers, and
   VTABLE is redefined to pair VTABLE1 with the little2 converters (PREFIX is
   not redefined in this branch).
   Without XML_MIN_SIZE: PREFIX is switched to little2_, the scanner macros
   are bound directly to the LITTLE2_* macros with MINBPC fixed at 2, and
   xmltok_impl.c is included again.  IS_INVALID_CHAR is not defined in that
   branch, so its #undef has no effect there. */
604 #ifdef XML_MIN_SIZE | |
605 | |
606 static | |
607 int little2_byteType(const ENCODING *enc, const char *p) | |
608 { | |
609 return LITTLE2_BYTE_TYPE(enc, p); | |
610 } | |
611 | |
612 static | |
613 int little2_byteToAscii(const ENCODING *enc, const char *p) | |
614 { | |
615 return LITTLE2_BYTE_TO_ASCII(enc, p); | |
616 } | |
617 | |
618 static | |
619 int little2_charMatches(const ENCODING *enc, const char *p, int c) | |
620 { | |
621 return LITTLE2_CHAR_MATCHES(enc, p, c); | |
622 } | |
623 | |
624 static | |
625 int little2_isNameMin(const ENCODING *enc, const char *p) | |
626 { | |
627 return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p); | |
628 } | |
629 | |
630 static | |
631 int little2_isNmstrtMin(const ENCODING *enc, const char *p) | |
632 { | |
633 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p); | |
634 } | |
635 | |
636 #undef VTABLE | |
637 #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16 | |
638 | |
639 #else /* not XML_MIN_SIZE */ | |
640 | |
641 #undef PREFIX | |
642 #define PREFIX(ident) little2_ ## ident | |
643 #define MINBPC(enc) 2 | |
644 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */ | |
645 #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p) | |
646 #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p) | |
647 #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c) | |
648 #define IS_NAME_CHAR(enc, p, n) 0 | |
649 #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) | |
650 #define IS_NMSTRT_CHAR(enc, p, n) (0) | |
651 #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) | |
652 | |
653 #include "xmltok_impl.c" | |
654 | |
655 #undef MINBPC | |
656 #undef BYTE_TYPE | |
657 #undef BYTE_TO_ASCII | |
658 #undef CHAR_MATCHES | |
659 #undef IS_NAME_CHAR | |
660 #undef IS_NAME_CHAR_MINBPC | |
661 #undef IS_NMSTRT_CHAR | |
662 #undef IS_NMSTRT_CHAR_MINBPC | |
663 #undef IS_INVALID_CHAR | |
664 | |
665 #endif /* not XML_MIN_SIZE */ | |
666 | |
/* Little-endian 2-byte encoding descriptors.  The byte-type table (built
   from asciitab.h and latin1tab.h) is the one LITTLE2_BYTE_TYPE consults
   for units whose high byte is 0.  Only STANDARD_VTABLE(little2_) is
   supplied, so the isName2 .. isInvalid4 pointers are left to default
   (null) initialisation.
   The last ENCODING initializer is 1 when XML_BYTE_ORDER == 12 and 0
   otherwise.
   NOTE(review): the trailing three values presumably initialise
   minBytesPerChar, isUtf8 and isUtf16 -- confirm in xmltok.h.
   The _ns variant exists only under XML_NS; the plain variant maps BT_COLON
   to BT_NMSTRT while asciitab.h is included. */
667 #ifdef XML_NS | |
668 | |
669 static const struct normal_encoding little2_encoding_ns = { | |
670 { VTABLE, 2, 0, | |
671 #if XML_BYTE_ORDER == 12 | |
672 1 | |
673 #else | |
674 0 | |
675 #endif | |
676 }, | |
677 { | |
678 #include "asciitab.h" | |
679 #include "latin1tab.h" | |
680 }, | |
681 STANDARD_VTABLE(little2_) | |
682 }; | |
683 | |
684 #endif | |
685 | |
686 static const struct normal_encoding little2_encoding = { | |
687 { VTABLE, 2, 0, | |
688 #if XML_BYTE_ORDER == 12 | |
689 1 | |
690 #else | |
691 0 | |
692 #endif | |
693 }, | |
694 { | |
695 #define BT_COLON BT_NMSTRT | |
696 #include "asciitab.h" | |
697 #undef BT_COLON | |
698 #include "latin1tab.h" | |
699 }, | |
700 STANDARD_VTABLE(little2_) | |
701 }; | |
702 | |
/* "Internal" little-endian descriptors, compiled unless XML_BYTE_ORDER is
   21.  They differ from little2_encoding(_ns) in using iasciitab.h for the
   first half of the byte-type table and in always passing 1 as the last
   ENCODING initializer.  The _ns variant exists only under XML_NS; the plain
   variant maps BT_COLON to BT_NMSTRT while iasciitab.h is included. */
703 #if XML_BYTE_ORDER != 21 | |
704 | |
705 #ifdef XML_NS | |
706 | |
707 static const struct normal_encoding internal_little2_encoding_ns = { | |
708 { VTABLE, 2, 0, 1 }, | |
709 { | |
710 #include "iasciitab.h" | |
711 #include "latin1tab.h" | |
712 }, | |
713 STANDARD_VTABLE(little2_) | |
714 }; | |
715 | |
716 #endif | |
717 | |
718 static const struct normal_encoding internal_little2_encoding = { | |
719 { VTABLE, 2, 0, 1 }, | |
720 { | |
721 #define BT_COLON BT_NMSTRT | |
722 #include "iasciitab.h" | |
723 #undef BT_COLON | |
724 #include "latin1tab.h" | |
725 }, | |
726 STANDARD_VTABLE(little2_) | |
727 }; | |
728 | |
729 #endif | |
730 | |
731 | |
/* Character tests for 2-byte big-endian input: p[0] is the high byte and
   p[1] the low byte of the code unit at p.  Every macro argument is
   parenthesized so that expression arguments keep their grouping. */

/* Byte type of the unit at p: the type table of the normal_encoding that
   enc points to when the high byte is 0, unicode_byte_type() otherwise. */
#define BIG2_BYTE_TYPE(enc, p) \
  ((p)[0] == 0 \
   ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
   : unicode_byte_type((p)[0], (p)[1]))
/* The low byte when the high byte is 0, otherwise -1. */
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
/* Nonzero when the unit at p has high byte 0 and low byte equal to c. */
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == (c))
/* namingBitmap lookups of the unit at p via namePages / nmstrtPages. */
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
742 | |
/* Binding of the scanner to big-endian 2-byte input.
   With XML_MIN_SIZE: the BIG2_* tests are wrapped in functions that are
   reached through the normal_encoding function pointers, and VTABLE is
   redefined to pair VTABLE1 with the big2 converters (PREFIX is not
   redefined in this branch).
   Without XML_MIN_SIZE: PREFIX is switched to big2_, the scanner macros are
   bound directly to the BIG2_* macros with MINBPC fixed at 2, and
   xmltok_impl.c is included again.  IS_INVALID_CHAR is not defined in that
   branch, so its #undef has no effect there. */
743 #ifdef XML_MIN_SIZE | |
744 | |
745 static | |
746 int big2_byteType(const ENCODING *enc, const char *p) | |
747 { | |
748 return BIG2_BYTE_TYPE(enc, p); | |
749 } | |
750 | |
751 static | |
752 int big2_byteToAscii(const ENCODING *enc, const char *p) | |
753 { | |
754 return BIG2_BYTE_TO_ASCII(enc, p); | |
755 } | |
756 | |
757 static | |
758 int big2_charMatches(const ENCODING *enc, const char *p, int c) | |
759 { | |
760 return BIG2_CHAR_MATCHES(enc, p, c); | |
761 } | |
762 | |
763 static | |
764 int big2_isNameMin(const ENCODING *enc, const char *p) | |
765 { | |
766 return BIG2_IS_NAME_CHAR_MINBPC(enc, p); | |
767 } | |
768 | |
769 static | |
770 int big2_isNmstrtMin(const ENCODING *enc, const char *p) | |
771 { | |
772 return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p); | |
773 } | |
774 | |
775 #undef VTABLE | |
776 #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16 | |
777 | |
778 #else /* not XML_MIN_SIZE */ | |
779 | |
780 #undef PREFIX | |
781 #define PREFIX(ident) big2_ ## ident | |
782 #define MINBPC(enc) 2 | |
783 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */ | |
784 #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p) | |
785 #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p) | |
786 #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c) | |
787 #define IS_NAME_CHAR(enc, p, n) 0 | |
788 #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p) | |
789 #define IS_NMSTRT_CHAR(enc, p, n) (0) | |
790 #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) | |
791 | |
792 #include "xmltok_impl.c" | |
793 | |
794 #undef MINBPC | |
795 #undef BYTE_TYPE | |
796 #undef BYTE_TO_ASCII | |
797 #undef CHAR_MATCHES | |
798 #undef IS_NAME_CHAR | |
799 #undef IS_NAME_CHAR_MINBPC | |
800 #undef IS_NMSTRT_CHAR | |
801 #undef IS_NMSTRT_CHAR_MINBPC | |
802 #undef IS_INVALID_CHAR | |
803 | |
804 #endif /* not XML_MIN_SIZE */ | |
805 | |
/* Big-endian 2-byte encoding descriptors.  The byte-type table (built from
   asciitab.h and latin1tab.h) is the one BIG2_BYTE_TYPE consults for units
   whose high byte is 0.  Only STANDARD_VTABLE(big2_) is supplied, so the
   isName2 .. isInvalid4 pointers are left to default (null) initialisation.
   The last ENCODING initializer is 1 when XML_BYTE_ORDER == 21 and 0
   otherwise.
   NOTE(review): the trailing three values presumably initialise
   minBytesPerChar, isUtf8 and isUtf16 -- confirm in xmltok.h.
   The _ns variant exists only under XML_NS; the plain variant maps BT_COLON
   to BT_NMSTRT while asciitab.h is included. */
806 #ifdef XML_NS | |
807 | |
808 static const struct normal_encoding big2_encoding_ns = { | |
809 { VTABLE, 2, 0, | |
810 #if XML_BYTE_ORDER == 21 | |
811 1 | |
812 #else | |
813 0 | |
814 #endif | |
815 }, | |
816 { | |
817 #include "asciitab.h" | |
818 #include "latin1tab.h" | |
819 }, | |
820 STANDARD_VTABLE(big2_) | |
821 }; | |
822 | |
823 #endif | |
824 | |
825 static const struct normal_encoding big2_encoding = { | |
826 { VTABLE, 2, 0, | |
827 #if XML_BYTE_ORDER == 21 | |
828 1 | |
829 #else | |
830 0 | |
831 #endif | |
832 }, | |
833 { | |
834 #define BT_COLON BT_NMSTRT | |
835 #include "asciitab.h" | |
836 #undef BT_COLON | |
837 #include "latin1tab.h" | |
838 }, | |
839 STANDARD_VTABLE(big2_) | |
840 }; | |
841 | |
/* "Internal" big-endian descriptors, compiled unless XML_BYTE_ORDER is 12.
   They differ from big2_encoding(_ns) in using iasciitab.h for the first
   half of the byte-type table and in always passing 1 as the last ENCODING
   initializer.  The _ns variant exists only under XML_NS; the plain variant
   maps BT_COLON to BT_NMSTRT while iasciitab.h is included. */
842 #if XML_BYTE_ORDER != 12 | |
843 | |
844 #ifdef XML_NS | |
845 | |
846 static const struct normal_encoding internal_big2_encoding_ns = { | |
847 { VTABLE, 2, 0, 1 }, | |
848 { | |
849 #include "iasciitab.h" | |
850 #include "latin1tab.h" | |
851 }, | |
852 STANDARD_VTABLE(big2_) | |
853 }; | |
854 | |
855 #endif | |
856 | |
857 static const struct normal_encoding internal_big2_encoding = { | |
858 { VTABLE, 2, 0, 1 }, | |
859 { | |
860 #define BT_COLON BT_NMSTRT | |
861 #include "iasciitab.h" | |
862 #undef BT_COLON | |
863 #include "latin1tab.h" | |
864 }, | |
865 STANDARD_VTABLE(big2_) | |
866 }; | |
867 | |
868 #endif | |
869 | |
/* PREFIX is not needed past this point. */
870 #undef PREFIX | |
871 | |
/* Compare two NUL-terminated strings for equality, treating the ASCII
   letters a-z and A-Z as equal.  Returns 1 when the strings match over
   their whole length, 0 otherwise. */
static int
streqci(const char *s1, const char *s2)
{
  char a;
  char b;

  do {
    a = *s1++;
    b = *s2++;
    /* fold lower case onto upper case before comparing */
    if (a >= 'a' && a <= 'z')
      a = (char)(a + ('A' - 'a'));
    if (b >= 'a' && b <= 'z')
      b = (char)(b + ('A' - 'a'));
    if (a != b)
      return 0;
  } while (a != '\0');

  return 1;
}
889 | |
890 static | |
891 void initUpdatePosition(const ENCODING *enc, const char *ptr, | |
892 const char *end, POSITION *pos) | |
893 { | |
894 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos); | |
895 } | |
896 | |
897 static | |
898 int toAscii(const ENCODING *enc, const char *ptr, const char *end) | |
899 { | |
900 char buf[1]; | |
901 char *p = buf; | |
902 XmlUtf8Convert(enc, &ptr, end, &p, p + 1); | |
903 if (p == buf) | |
904 return -1; | |
905 else | |
906 return buf[0]; | |
907 } | |
908 | |
/* Returns 1 for space (0x20), carriage return (0xD), line feed (0xA) and
   tab (0x9); 0 for every other value, including -1. */
static int
isSpace(int c)
{
  return c == 0x20 || c == 0xD || c == 0xA || c == 0x9;
}
921 | |
922 /* Return 1 if there's just optional white space | |
923 or there's an S followed by name=val. */ | |
/* Parses one `name = "value"` pseudo-attribute from [ptr, end).
   Characters are examined one at a time through toAscii(), stepping by
   enc->minBytesPerChar.
   On success with an attribute: *namePtr points at the name, *valPtr at the
   first character after the opening quote, *nextTokPtr just past the
   closing quote, and 1 is returned.
   On success with nothing left (input empty, or only white space):
   *namePtr is set to 0 and 1 is returned; *valPtr and *nextTokPtr are not
   written.
   On failure: *nextTokPtr points at the offending position and 0 is
   returned. */
924 static | |
925 int parsePseudoAttribute(const ENCODING *enc, | |
926 const char *ptr, | |
927 const char *end, | |
928 const char **namePtr, | |
929 const char **valPtr, | |
930 const char **nextTokPtr) | |
931 { | |
932 int c; | |
933 char open; | |
934 if (ptr == end) { | |
935 *namePtr = 0; | |
936 return 1; | |
937 } | |
/* An attribute must be preceded by at least one white space character. */
938 if (!isSpace(toAscii(enc, ptr, end))) { | |
939 *nextTokPtr = ptr; | |
940 return 0; | |
941 } | |
942 do { | |
943 ptr += enc->minBytesPerChar; | |
944 } while (isSpace(toAscii(enc, ptr, end))); | |
945 if (ptr == end) { | |
946 *namePtr = 0; | |
947 return 1; | |
948 } | |
/* Scan the name: it runs up to '=' or to white space that is followed
   by '='. */
949 *namePtr = ptr; | |
950 for (;;) { | |
951 c = toAscii(enc, ptr, end); | |
952 if (c == -1) { | |
953 *nextTokPtr = ptr; | |
954 return 0; | |
955 } | |
956 if (c == '=') | |
957 break; | |
958 if (isSpace(c)) { | |
959 do { | |
960 ptr += enc->minBytesPerChar; | |
961 } while (isSpace(c = toAscii(enc, ptr, end))); | |
962 if (c != '=') { | |
963 *nextTokPtr = ptr; | |
964 return 0; | |
965 } | |
966 break; | |
967 } | |
968 ptr += enc->minBytesPerChar; | |
969 } | |
/* ptr is now at the '='.  If nothing was consumed since the name started
   (the name began directly with '='), the name is empty: reject it. */
970 if (ptr == *namePtr) { | |
971 *nextTokPtr = ptr; | |
972 return 0; | |
973 } | |
/* Step over '=' and any white space, then expect an opening quote. */
974 ptr += enc->minBytesPerChar; | |
975 c = toAscii(enc, ptr, end); | |
976 while (isSpace(c)) { | |
977 ptr += enc->minBytesPerChar; | |
978 c = toAscii(enc, ptr, end); | |
979 } | |
980 if (c != '"' && c != '\'') { | |
981 *nextTokPtr = ptr; | |
982 return 0; | |
983 } | |
984 open = c; | |
985 ptr += enc->minBytesPerChar; | |
986 *valPtr = ptr; | |
/* The value may contain only ASCII letters, digits, '.', '-' and '_' and
   must end with the same quote character that opened it.  A -1 from
   toAscii (end of input) fails the character test as well. */
987 for (;; ptr += enc->minBytesPerChar) { | |
988 c = toAscii(enc, ptr, end); | |
989 if (c == open) | |
990 break; | |
991 if (!('a' <= c && c <= 'z') | |
992 && !('A' <= c && c <= 'Z') | |
993 && !('0' <= c && c <= '9') | |
994 && c != '.' | |
995 && c != '-' | |
996 && c != '_') { | |
997 *nextTokPtr = ptr; | |
998 return 0; | |
999 } | |
1000 } | |
1001 *nextTokPtr = ptr + enc->minBytesPerChar; | |
1002 return 1; | |
1003 } | |
1004 | |
/* Parses the pseudo-attributes of an XML or text declaration held in
   [ptr, end), accepted in the order version, encoding, standalone.
     encodingFinder      called with (enc, start of encoding value, position
                         of its closing quote); its result is stored in
                         *encoding.
     isGeneralTextEntity nonzero: "version" may be absent, "encoding" is
                         required once "version" is present, and
                         "standalone" is rejected.
     versionPtr, encodingName, encoding, standalone
                         optional outputs; each is written only when the
                         pointer is non-null and the corresponding
                         attribute was parsed.
   Returns 1 on success.  On failure returns 0 with *badPtr set to the
   position that was rejected.
   NOTE(review): the first 5 and last 2 characters are skipped without being
   examined -- presumably the "<?xml" and "?>" delimiters, already checked by
   the caller; confirm. */
1005 static | |
1006 int doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, | |
1007 const char *, | |
1008 const char *), | |
1009 int isGeneralTextEntity, | |
1010 const ENCODING *enc, | |
1011 const char *ptr, | |
1012 const char *end, | |
1013 const char **badPtr, | |
1014 const char **versionPtr, | |
1015 const char **encodingName, | |
1016 const ENCODING **encoding, | |
1017 int *standalone) | |
1018 { | |
1019 const char *val = 0; | |
1020 const char *name = 0; | |
1021 ptr += 5 * enc->minBytesPerChar; | |
1022 end -= 2 * enc->minBytesPerChar; | |
/* At least one pseudo-attribute is required. */
1023 if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr) || !name) { | |
1024 *badPtr = ptr; | |
1025 return 0; | |
1026 } | |
/* A first attribute other than "version" is tolerated only for a general
   text entity; it is then examined as a possible "encoding" below. */
1027 if (!XmlNameMatchesAscii(enc, name, "version")) { | |
1028 if (!isGeneralTextEntity) { | |
1029 *badPtr = name; | |
1030 return 0; | |
1031 } | |
1032 } | |
1033 else { | |
1034 if (versionPtr) | |
1035 *versionPtr = val; | |
1036 if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) { | |
1037 *badPtr = ptr; | |
1038 return 0; | |
1039 } | |
1040 if (!name) { | |
1041 if (isGeneralTextEntity) { | |
1042 /* a TextDecl must have an EncodingDecl */ | |
1043 *badPtr = ptr; | |
1044 return 0; | |
1045 } | |
1046 return 1; | |
1047 } | |
1048 } | |
/* Optional "encoding": its value must start with an ASCII letter. */
1049 if (XmlNameMatchesAscii(enc, name, "encoding")) { | |
1050 int c = toAscii(enc, val, end); | |
1051 if (!('a' <= c && c <= 'z') && !('A' <= c && c <= 'Z')) { | |
1052 *badPtr = val; | |
1053 return 0; | |
1054 } | |
1055 if (encodingName) | |
1056 *encodingName = val; | |
/* ptr is just past the closing quote, so ptr - minBytesPerChar is the
   quote itself, i.e. the end of the encoding name. */
1057 if (encoding) | |
1058 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar); | |
1059 if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) { | |
1060 *badPtr = ptr; | |
1061 return 0; | |
1062 } | |
1063 if (!name) | |
1064 return 1; | |
1065 } | |
/* Whatever remains must be "standalone", which a text declaration may not
   carry. */
1066 if (!XmlNameMatchesAscii(enc, name, "standalone") || isGeneralTextEntity) { | |
1067 *badPtr = name; | |
1068 return 0; | |
1069 } | |
1070 if (XmlNameMatchesAscii(enc, val, "yes")) { | |
1071 if (standalone) | |
1072 *standalone = 1; | |
1073 } | |
1074 else if (XmlNameMatchesAscii(enc, val, "no")) { | |
1075 if (standalone) | |
1076 *standalone = 0; | |
1077 } | |
1078 else { | |
1079 *badPtr = val; | |
1080 return 0; | |
1081 } | |
/* Only white space may follow the last attribute. */
1082 while (isSpace(toAscii(enc, ptr, end))) | |
1083 ptr += enc->minBytesPerChar; | |
1084 if (ptr != end) { | |
1085 *badPtr = ptr; | |
1086 return 0; | |
1087 } | |
1088 return 1; | |
1089 } | |
1090 | |
1091 static | |
1092 int checkCharRefNumber(int result) | |
1093 { | |
1094 switch (result >> 8) { | |
1095 case 0xD8: case 0xD9: case 0xDA: case 0xDB: | |
1096 case 0xDC: case 0xDD: case 0xDE: case 0xDF: | |
1097 return -1; | |
1098 case 0: | |
1099 if (latin1_encoding.type[result] == BT_NONXML) | |
1100 return -1; | |
1101 break; | |
1102 case 0xFF: | |
1103 if (result == 0xFFFE || result == 0xFFFF) | |
1104 return -1; | |
1105 break; | |
1106 } | |
1107 return result; | |
1108 } | |
1109 | |
1110 int XmlUtf8Encode(int c, char *buf) | |
1111 { | |
1112 enum { | |
1113 /* minN is minimum legal resulting value for N byte sequence */ | |
1114 min2 = 0x80, | |
1115 min3 = 0x800, | |
1116 min4 = 0x10000 | |
1117 }; | |
1118 | |
1119 if (c < 0) | |
1120 return 0; | |
1121 if (c < min2) { | |
1122 buf[0] = (c | UTF8_cval1); | |
1123 return 1; | |
1124 } | |
1125 if (c < min3) { | |
1126 buf[0] = ((c >> 6) | UTF8_cval2); | |
1127 buf[1] = ((c & 0x3f) | 0x80); | |
1128 return 2; | |
1129 } | |
1130 if (c < min4) { | |
1131 buf[0] = ((c >> 12) | UTF8_cval3); | |
1132 buf[1] = (((c >> 6) & 0x3f) | 0x80); | |
1133 buf[2] = ((c & 0x3f) | 0x80); | |
1134 return 3; | |
1135 } | |
1136 if (c < 0x110000) { | |
1137 buf[0] = ((c >> 18) | UTF8_cval4); | |
1138 buf[1] = (((c >> 12) & 0x3f) | 0x80); | |
1139 buf[2] = (((c >> 6) & 0x3f) | 0x80); | |
1140 buf[3] = ((c & 0x3f) | 0x80); | |
1141 return 4; | |
1142 } | |
1143 return 0; | |
1144 } | |
1145 | |
/* Writes the UTF-16 encoding of charNum into buf and returns the number of
   16-bit units written: 1 for values below 0x10000, 2 (a surrogate pair)
   for values below 0x110000.  Returns 0, writing nothing, for negative
   values and for values of 0x110000 or more. */
int XmlUtf16Encode(int charNum, unsigned short *buf)
{
  if (charNum < 0 || charNum >= 0x110000)
    return 0;

  if (charNum >= 0x10000) {
    /* Split the 20-bit offset into high and low surrogate halves. */
    const int offset = charNum - 0x10000;
    buf[0] = (unsigned short)((offset >> 10) + 0xD800);
    buf[1] = (unsigned short)((offset & 0x3FF) + 0xDC00);
    return 2;
  }

  buf[0] = (unsigned short)charNum;
  return 1;
}
1162 | |
1163 struct unknown_encoding { | |
1164 struct normal_encoding normal; | |
1165 int (*convert)(void *userData, const char *p); | |
1166 void *userData; | |
1167 unsigned short utf16[256]; | |
1168 char utf8[256][4]; | |
1169 }; | |
1170 | |
1171 int XmlSizeOfUnknownEncoding() | |
1172 { | |
1173 return sizeof(struct unknown_encoding); | |
1174 } | |
1175 | |
1176 static | |
1177 int unknown_isName(const ENCODING *enc, const char *p) | |
1178 { | |
1179 int c = ((const struct unknown_encoding *)enc) | |
1180 ->convert(((const struct unknown_encoding *)enc)->userData, p); | |
1181 if (c & ~0xFFFF) | |
1182 return 0; | |
1183 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF); | |
1184 } | |
1185 | |
1186 static | |
1187 int unknown_isNmstrt(const ENCODING *enc, const char *p) | |
1188 { | |
1189 int c = ((const struct unknown_encoding *)enc) | |
1190 ->convert(((const struct unknown_encoding *)enc)->userData, p); | |
1191 if (c & ~0xFFFF) | |
1192 return 0; | |
1193 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF); | |
1194 } | |
1195 | |
1196 static | |
1197 int unknown_isInvalid(const ENCODING *enc, const char *p) | |
1198 { | |
1199 int c = ((const struct unknown_encoding *)enc) | |
1200 ->convert(((const struct unknown_encoding *)enc)->userData, p); | |
1201 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0; | |
1202 } | |
1203 | |
1204 static | |
1205 void unknown_toUtf8(const ENCODING *enc, | |
1206 const char **fromP, const char *fromLim, | |
1207 char **toP, const char *toLim) | |
1208 { | |
1209 char buf[XML_UTF8_ENCODE_MAX]; | |
1210 for (;;) { | |
1211 const char *utf8; | |
1212 int n; | |
1213 if (*fromP == fromLim) | |
1214 break; | |
1215 utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP]; | |
1216 n = *utf8++; | |
1217 if (n == 0) { | |
1218 int c = ((const struct unknown_encoding *)enc) | |
1219 ->convert(((const struct unknown_encoding *)enc)->userData, *fromP); | |
1220 n = XmlUtf8Encode(c, buf); | |
1221 if (n > toLim - *toP) | |
1222 break; | |
1223 utf8 = buf; | |
1224 *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP] | |
1225 - (BT_LEAD2 - 2); | |
1226 } | |
1227 else { | |
1228 if (n > toLim - *toP) | |
1229 break; | |
1230 (*fromP)++; | |
1231 } | |
1232 do { | |
1233 *(*toP)++ = *utf8++; | |
1234 } while (--n != 0); | |
1235 } | |
1236 } | |
1237 | |
1238 static | |
1239 void unknown_toUtf16(const ENCODING *enc, | |
1240 const char **fromP, const char *fromLim, | |
1241 unsigned short **toP, const unsigned short *toLim) | |
1242 { | |
1243 while (*fromP != fromLim && *toP != toLim) { | |
1244 unsigned short c | |
1245 = ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP]; | |
1246 if (c == 0) { | |
1247 c = (unsigned short)((const struct unknown_encoding *)enc) | |
1248 ->convert(((const struct unknown_encoding *)enc)->userData, *fromP); | |
1249 *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP] | |
1250 - (BT_LEAD2 - 2); | |
1251 } | |
1252 else | |
1253 (*fromP)++; | |
1254 *(*toP)++ = c; | |
1255 } | |
1256 } | |
1257 | |
1258 ENCODING * | |
1259 XmlInitUnknownEncoding(void *mem, | |
1260 int *table, | |
1261 int (*convert)(void *userData, const char *p), | |
1262 void *userData) | |
1263 { | |
1264 int i; | |
1265 struct unknown_encoding *e = mem; | |
1266 for (i = 0; i < sizeof(struct normal_encoding); i++) | |
1267 ((char *)mem)[i] = ((char *)&latin1_encoding)[i]; | |
1268 for (i = 0; i < 128; i++) | |
1269 if (latin1_encoding.type[i] != BT_OTHER | |
1270 && latin1_encoding.type[i] != BT_NONXML | |
1271 && table[i] != i) | |
1272 return 0; | |
1273 for (i = 0; i < 256; i++) { | |
1274 int c = table[i]; | |
1275 if (c == -1) { | |
1276 e->normal.type[i] = BT_MALFORM; | |
1277 /* This shouldn't really get used. */ | |
1278 e->utf16[i] = 0xFFFF; | |
1279 e->utf8[i][0] = 1; | |
1280 e->utf8[i][1] = 0; | |
1281 } | |
1282 else if (c < 0) { | |
1283 if (c < -4) | |
1284 return 0; | |
1285 e->normal.type[i] = BT_LEAD2 - (c + 2); | |
1286 e->utf8[i][0] = 0; | |
1287 e->utf16[i] = 0; | |
1288 } | |
1289 else if (c < 0x80) { | |
1290 if (latin1_encoding.type[c] != BT_OTHER | |
1291 && latin1_encoding.type[c] != BT_NONXML | |
1292 && c != i) | |
1293 return 0; | |
1294 e->normal.type[i] = latin1_encoding.type[c]; | |
1295 e->utf8[i][0] = 1; | |
1296 e->utf8[i][1] = (char)c; | |
1297 e->utf16[i] = c == 0 ? 0xFFFF : c; | |
1298 } | |
1299 else if (checkCharRefNumber(c) < 0) { | |
1300 e->normal.type[i] = BT_NONXML; | |
1301 /* This shouldn't really get used. */ | |
1302 e->utf16[i] = 0xFFFF; | |
1303 e->utf8[i][0] = 1; | |
1304 e->utf8[i][1] = 0; | |
1305 } | |
1306 else { | |
1307 if (c > 0xFFFF) | |
1308 return 0; | |
1309 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff)) | |
1310 e->normal.type[i] = BT_NMSTRT; | |
1311 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff)) | |
1312 e->normal.type[i] = BT_NAME; | |
1313 else | |
1314 e->normal.type[i] = BT_OTHER; | |
1315 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1); | |
1316 e->utf16[i] = c; | |
1317 } | |
1318 } | |
1319 e->userData = userData; | |
1320 e->convert = convert; | |
1321 if (convert) { | |
1322 e->normal.isName2 = unknown_isName; | |
1323 e->normal.isName3 = unknown_isName; | |
1324 e->normal.isName4 = unknown_isName; | |
1325 e->normal.isNmstrt2 = unknown_isNmstrt; | |
1326 e->normal.isNmstrt3 = unknown_isNmstrt; | |
1327 e->normal.isNmstrt4 = unknown_isNmstrt; | |
1328 e->normal.isInvalid2 = unknown_isInvalid; | |
1329 e->normal.isInvalid3 = unknown_isInvalid; | |
1330 e->normal.isInvalid4 = unknown_isInvalid; | |
1331 } | |
1332 e->normal.enc.utf8Convert = unknown_toUtf8; | |
1333 e->normal.enc.utf16Convert = unknown_toUtf16; | |
1334 return &(e->normal.enc); | |
1335 } | |
1336 | |
/* If this enumeration is changed, getEncodingIndex and encodings
   must also be changed. */
enum {
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC,
  UTF_8_ENC,
  UTF_16_ENC,
  UTF_16BE_ENC,
  UTF_16LE_ENC,
  /* must match encodingNames up to here */
  NO_ENC
};

/* Maps an encoding name to its index in the enumeration above.
   The comparison is done with streqci.  Returns NO_ENC when name is null
   and UNKNOWN_ENC when the name is not one of the built-in encodings. */
static
int getEncodingIndex(const char *name)
{
  /* One entry per enumerator from ISO_8859_1_ENC to UTF_16LE_ENC, in the
     same order.  Every entry needs its own comma: adjacent string literals
     are concatenated, which silently merges two names into one. */
  static const char *encodingNames[] = {
    "ISO-8859-1",
    "US-ASCII",
    "UTF-8",
    "UTF-16",
    "UTF-16BE",
    "UTF-16LE",
  };
  int i;
  if (name == 0)
    return NO_ENC;
  for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
    if (streqci(name, encodingNames[i]))
      return i;
  return UNKNOWN_ENC;
}
1370 | |
1371 /* For binary compatibility, we store the index of the encoding specified | |
1372 at initialization in the isUtf16 member. */ | |
1373 | |
1374 #define INIT_ENC_INDEX(enc) ((enc)->initEnc.isUtf16) | |
1375 | |
1376 /* This is what detects the encoding. | |
1377 encodingTable maps from encoding indices to encodings; | |
1378 INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding; | |
1379 state is XML_CONTENT_STATE if we're parsing an external text entity, | |
1380 and XML_PROLOG_STATE otherwise. | |
1381 */ | |
1382 | |
1383 | |
1384 static | |
1385 int initScan(const ENCODING **encodingTable, | |
1386 const INIT_ENCODING *enc, | |
1387 int state, | |
1388 const char *ptr, | |
1389 const char *end, | |
1390 const char **nextTokPtr) | |
1391 { | |
1392 const ENCODING **encPtr; | |
1393 | |
1394 if (ptr == end) | |
1395 return XML_TOK_NONE; | |
1396 encPtr = enc->encPtr; | |
1397 if (ptr + 1 == end) { | |
1398 /* only a single byte available for auto-detection */ | |
1399 /* a well-formed document entity must have more than one byte */ | |
1400 if (state != XML_CONTENT_STATE) | |
1401 return XML_TOK_PARTIAL; | |
1402 /* so we're parsing an external text entity... */ | |
1403 /* if UTF-16 was externally specified, then we need at least 2 bytes */ | |
1404 switch (INIT_ENC_INDEX(enc)) { | |
1405 case UTF_16_ENC: | |
1406 case UTF_16LE_ENC: | |
1407 case UTF_16BE_ENC: | |
1408 return XML_TOK_PARTIAL; | |
1409 } | |
1410 switch ((unsigned char)*ptr) { | |
1411 case 0xFE: | |
1412 case 0xFF: | |
1413 case 0xEF: /* possibly first byte of UTF-8 BOM */ | |
1414 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC | |
1415 && state == XML_CONTENT_STATE) | |
1416 break; | |
1417 /* fall through */ | |
1418 case 0x00: | |
1419 case 0x3C: | |
1420 return XML_TOK_PARTIAL; | |
1421 } | |
1422 } | |
1423 else { | |
1424 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) { | |
1425 case 0xFEFF: | |
1426 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC | |
1427 && state == XML_CONTENT_STATE) | |
1428 break; | |
1429 *nextTokPtr = ptr + 2; | |
1430 *encPtr = encodingTable[UTF_16BE_ENC]; | |
1431 return XML_TOK_BOM; | |
1432 /* 00 3C is handled in the default case */ | |
1433 case 0x3C00: | |
1434 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC | |
1435 || INIT_ENC_INDEX(enc) == UTF_16_ENC) | |
1436 && state == XML_CONTENT_STATE) | |
1437 break; | |
1438 *encPtr = encodingTable[UTF_16LE_ENC]; | |
1439 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); | |
1440 case 0xFFFE: | |
1441 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC | |
1442 && state == XML_CONTENT_STATE) | |
1443 break; | |
1444 *nextTokPtr = ptr + 2; | |
1445 *encPtr = encodingTable[UTF_16LE_ENC]; | |
1446 return XML_TOK_BOM; | |
1447 case 0xEFBB: | |
1448 /* Maybe a UTF-8 BOM (EF BB BF) */ | |
1449 /* If there's an explicitly specified (external) encoding | |
1450 of ISO-8859-1 or some flavour of UTF-16 | |
1451 and this is an external text entity, | |
1452 don't look for the BOM, | |
1453 because it might be a legal data. */ | |
1454 if (state == XML_CONTENT_STATE) { | |
1455 int e = INIT_ENC_INDEX(enc); | |
1456 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC || e == UTF_16_ENC) | |
1457 break; | |
1458 } | |
1459 if (ptr + 2 == end) | |
1460 return XML_TOK_PARTIAL; | |
1461 if ((unsigned char)ptr[2] == 0xBF) { | |
1462 *encPtr = encodingTable[UTF_8_ENC]; | |
1463 return XML_TOK_BOM; | |
1464 } | |
1465 break; | |
1466 default: | |
1467 if (ptr[0] == '\0') { | |
1468 /* 0 isn't a legal data character. Furthermore a document entity can only | |
1469 start with ASCII characters. So the only way this can fail to be big-endian | |
1470 UTF-16 if it it's an external parsed general entity that's labelled as | |
1471 UTF-16LE. */ | |
1472 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC) | |
1473 break; | |
1474 *encPtr = encodingTable[UTF_16BE_ENC]; | |
1475 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); | |
1476 } | |
1477 else if (ptr[1] == '\0') { | |
1478 /* We could recover here in the case: | |
1479 - parsing an external entity | |
1480 - second byte is 0 | |
1481 - no externally specified encoding | |
1482 - no encoding declaration | |
1483 by assuming UTF-16LE. But we don't, because this would mean when | |
1484 presented just with a single byte, we couldn't reliably determine | |
1485 whether we needed further bytes. */ | |
1486 if (state == XML_CONTENT_STATE) | |
1487 break; | |
1488 *encPtr = encodingTable[UTF_16LE_ENC]; | |
1489 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); | |
1490 } | |
1491 break; | |
1492 } | |
1493 } | |
1494 *encPtr = encodingTable[INIT_ENC_INDEX(enc)]; | |
1495 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); | |
1496 } | |
1497 | |
1498 | |
1499 #define NS(x) x | |
1500 #define ns(x) x | |
1501 #include "xmltok_ns.c" | |
1502 #undef NS | |
1503 #undef ns | |
1504 | |
1505 #ifdef XML_NS | |
1506 | |
1507 #define NS(x) x ## NS | |
1508 #define ns(x) x ## _ns | |
1509 | |
1510 #include "xmltok_ns.c" | |
1511 | |
1512 #undef NS | |
1513 #undef ns | |
1514 | |
1515 ENCODING * | |
1516 XmlInitUnknownEncodingNS(void *mem, | |
1517 int *table, | |
1518 int (*convert)(void *userData, const char *p), | |
1519 void *userData) | |
1520 { | |
1521 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData); | |
1522 if (enc) | |
1523 ((struct normal_encoding *)enc)->type[':'] = BT_COLON; | |
1524 return enc; | |
1525 } | |
1526 | |
1527 #endif /* XML_NS */ |