Mercurial > pidgin.yaz
comparison src/protocols/jabber/xmltok_impl.c @ 2086:424a40f12a6c
[gaim-migrate @ 2096]
moving protocols from plugins/ to src/protocols. making it so that you can select which protocols are compiled statically.
committer: Tailor Script <tailor@pidgin.im>
author | Eric Warmenhoven <eric@warmenhoven.org> |
---|---|
date | Tue, 31 Jul 2001 01:00:39 +0000 |
parents | |
children | 4e7cefc55971 |
comparison
equal
deleted
inserted
replaced
2085:7ebb4322f89b | 2086:424a40f12a6c |
---|---|
1 /* | |
2 The contents of this file are subject to the Mozilla Public License | |
3 Version 1.1 (the "License"); you may not use this file except in | |
4 compliance with the License. You may obtain a copy of the License at | |
5 http://www.mozilla.org/MPL/ | |
6 | |
7 Software distributed under the License is distributed on an "AS IS" | |
8 basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the | |
9 License for the specific language governing rights and limitations | |
10 under the License. | |
11 | |
12 The Original Code is expat. | |
13 | |
14 The Initial Developer of the Original Code is James Clark. | |
15 Portions created by James Clark are Copyright (C) 1998, 1999 | |
16 James Clark. All Rights Reserved. | |
17 | |
18 Contributor(s): | |
19 | |
20 Alternatively, the contents of this file may be used under the terms | |
21 of the GNU General Public License (the "GPL"), in which case the | |
22 provisions of the GPL are applicable instead of those above. If you | |
23 wish to allow use of your version of this file only under the terms of | |
24 the GPL and not to allow others to use your version of this file under | |
25 the MPL, indicate your decision by deleting the provisions above and | |
26 replace them with the notice and other provisions required by the | |
27 GPL. If you do not delete the provisions above, a recipient may use | |
28 your version of this file under either the MPL or the GPL. | |
29 */ | |
30 | |
/* Fallback definition: unless something earlier has already defined
   IS_INVALID_CHAR, it expands to 0, so no multi-byte character is ever
   reported as invalid by the macros and functions below. */
31 #ifndef IS_INVALID_CHAR | |
32 #define IS_INVALID_CHAR(enc, ptr, n) (0) | |
33 #endif | |
34 | |
/* INVALID_LEAD_CASE(n, ptr, nextTokPtr) expands to the switch case for the
   lead byte of an n-byte character (BT_LEADn).  It relies on the names enc
   and end being in scope at the point of use.  If fewer than n bytes remain
   it returns XML_TOK_PARTIAL_CHAR; if IS_INVALID_CHAR rejects the character
   it stores ptr in *nextTokPtr and returns XML_TOK_INVALID; otherwise it
   advances ptr by n and breaks out of the enclosing switch. */
35 #define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \ | |
36 case BT_LEAD ## n: \ | |
37 if (end - ptr < n) \ | |
38 return XML_TOK_PARTIAL_CHAR; \ | |
39 if (IS_INVALID_CHAR(enc, ptr, n)) { \ | |
40 *(nextTokPtr) = (ptr); \ | |
41 return XML_TOK_INVALID; \ | |
42 } \ | |
43 ptr += n; \ | |
44 break; | |
45 | |
/* INVALID_CASES(ptr, nextTokPtr) expands to the lead-byte cases for 2-, 3-
   and 4-byte characters followed by cases for BT_NONXML, BT_MALFORM and
   BT_TRAIL, all three of which store ptr in *nextTokPtr and return
   XML_TOK_INVALID.  It is meant to be placed directly inside a switch on
   the byte type of the character at ptr. */
46 #define INVALID_CASES(ptr, nextTokPtr) \ | |
47 INVALID_LEAD_CASE(2, ptr, nextTokPtr) \ | |
48 INVALID_LEAD_CASE(3, ptr, nextTokPtr) \ | |
49 INVALID_LEAD_CASE(4, ptr, nextTokPtr) \ | |
50 case BT_NONXML: \ | |
51 case BT_MALFORM: \ | |
52 case BT_TRAIL: \ | |
53 *(nextTokPtr) = (ptr); \ | |
54 return XML_TOK_INVALID; | |
55 | |
/* CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) expands to the switch case
   for the lead byte of an n-byte character that must be a name character.
   Fewer than n bytes remaining gives XML_TOK_PARTIAL_CHAR; a character
   rejected by IS_NAME_CHAR gives XML_TOK_INVALID with *nextTokPtr set to
   ptr; otherwise ptr is advanced by n and the enclosing switch is left. */
56 #define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \ | |
57 case BT_LEAD ## n: \ | |
58 if (end - ptr < n) \ | |
59 return XML_TOK_PARTIAL_CHAR; \ | |
60 if (!IS_NAME_CHAR(enc, ptr, n)) { \ | |
61 *nextTokPtr = ptr; \ | |
62 return XML_TOK_INVALID; \ | |
63 } \ | |
64 ptr += n; \ | |
65 break; | |
66 | |
/* CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) expands to every switch case
   that accepts one name character at ptr.  A BT_NONASCII character is first
   checked with IS_NAME_CHAR_MINBPC and, when accepted, deliberately falls
   through into the single-unit cases (BT_NMSTRT, BT_HEX, BT_DIGIT, BT_NAME,
   BT_MINUS), which advance ptr by MINBPC(enc) and break.  The 2-, 3- and
   4-byte lead cases follow.  Callers supply their own remaining cases and
   default after this macro. */
67 #define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \ | |
68 case BT_NONASCII: \ | |
69 if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \ | |
70 *nextTokPtr = ptr; \ | |
71 return XML_TOK_INVALID; \ | |
72 } \ | |
73 case BT_NMSTRT: \ | |
74 case BT_HEX: \ | |
75 case BT_DIGIT: \ | |
76 case BT_NAME: \ | |
77 case BT_MINUS: \ | |
78 ptr += MINBPC(enc); \ | |
79 break; \ | |
80 CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \ | |
81 CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \ | |
82 CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr) | |
83 | |
/* CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) expands to the switch case
   for the lead byte of an n-byte character that must be a name-start
   character.  Fewer than n bytes remaining gives XML_TOK_PARTIAL_CHAR; a
   character rejected by IS_NMSTRT_CHAR gives XML_TOK_INVALID with
   *nextTokPtr set to ptr; otherwise ptr is advanced by n and the enclosing
   switch is left. */
84 #define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \ | |
85 case BT_LEAD ## n: \ | |
86 if (end - ptr < n) \ | |
87 return XML_TOK_PARTIAL_CHAR; \ | |
88 if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \ | |
89 *nextTokPtr = ptr; \ | |
90 return XML_TOK_INVALID; \ | |
91 } \ | |
92 ptr += n; \ | |
93 break; | |
94 | |
/* CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) expands to every switch case
   that accepts one name-start character at ptr.  A BT_NONASCII character is
   first checked with IS_NMSTRT_CHAR_MINBPC and, when accepted, deliberately
   falls through into the BT_NMSTRT and BT_HEX cases, which advance ptr by
   MINBPC(enc) and break.  The 2-, 3- and 4-byte lead cases follow.  Callers
   supply their own remaining cases and default after this macro. */
95 #define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \ | |
96 case BT_NONASCII: \ | |
97 if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \ | |
98 *nextTokPtr = ptr; \ | |
99 return XML_TOK_INVALID; \ | |
100 } \ | |
101 case BT_NMSTRT: \ | |
102 case BT_HEX: \ | |
103 ptr += MINBPC(enc); \ | |
104 break; \ | |
105 CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \ | |
106 CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \ | |
107 CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr) | |
108 | |
/* Fallback definition: unless PREFIX is already defined, it leaves the
   identifier unchanged.  NOTE(review): presumably the including file defines
   PREFIX to give each encoding its own set of function names; confirm
   against the file that includes this one. */
109 #ifndef PREFIX | |
110 #define PREFIX(ident) ident | |
111 #endif | |
112 | |
113 /* ptr points to character following "<!-" */ | |
114 | |
115 static | |
116 int PREFIX(scanComment)(const ENCODING *enc, const char *ptr, const char *end, | |
117 const char **nextTokPtr) | |
118 { | |
119 if (ptr != end) { | |
120 if (!CHAR_MATCHES(enc, ptr, '-')) { | |
121 *nextTokPtr = ptr; | |
122 return XML_TOK_INVALID; | |
123 } | |
124 ptr += MINBPC(enc); | |
125 while (ptr != end) { | |
126 switch (BYTE_TYPE(enc, ptr)) { | |
127 INVALID_CASES(ptr, nextTokPtr) | |
128 case BT_MINUS: | |
129 if ((ptr += MINBPC(enc)) == end) | |
130 return XML_TOK_PARTIAL; | |
131 if (CHAR_MATCHES(enc, ptr, '-')) { | |
132 if ((ptr += MINBPC(enc)) == end) | |
133 return XML_TOK_PARTIAL; | |
134 if (!CHAR_MATCHES(enc, ptr, '>')) { | |
135 *nextTokPtr = ptr; | |
136 return XML_TOK_INVALID; | |
137 } | |
138 *nextTokPtr = ptr + MINBPC(enc); | |
139 return XML_TOK_COMMENT; | |
140 } | |
141 break; | |
142 default: | |
143 ptr += MINBPC(enc); | |
144 break; | |
145 } | |
146 } | |
147 } | |
148 return XML_TOK_PARTIAL; | |
149 } | |
150 | |
151 /* ptr points to character following "<!" */ | |
152 | |
153 static | |
154 int PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end, | |
155 const char **nextTokPtr) | |
156 { | |
157 if (ptr == end) | |
158 return XML_TOK_PARTIAL; | |
159 switch (BYTE_TYPE(enc, ptr)) { | |
160 case BT_MINUS: | |
161 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr); | |
162 case BT_LSQB: | |
163 *nextTokPtr = ptr + MINBPC(enc); | |
164 return XML_TOK_COND_SECT_OPEN; | |
165 case BT_NMSTRT: | |
166 case BT_HEX: | |
167 ptr += MINBPC(enc); | |
168 break; | |
169 default: | |
170 *nextTokPtr = ptr; | |
171 return XML_TOK_INVALID; | |
172 } | |
173 while (ptr != end) { | |
174 switch (BYTE_TYPE(enc, ptr)) { | |
175 case BT_PERCNT: | |
176 if (ptr + MINBPC(enc) == end) | |
177 return XML_TOK_PARTIAL; | |
178 /* don't allow <!ENTITY% foo "whatever"> */ | |
179 switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) { | |
180 case BT_S: case BT_CR: case BT_LF: case BT_PERCNT: | |
181 *nextTokPtr = ptr; | |
182 return XML_TOK_INVALID; | |
183 } | |
184 /* fall through */ | |
185 case BT_S: case BT_CR: case BT_LF: | |
186 *nextTokPtr = ptr; | |
187 return XML_TOK_DECL_OPEN; | |
188 case BT_NMSTRT: | |
189 case BT_HEX: | |
190 ptr += MINBPC(enc); | |
191 break; | |
192 default: | |
193 *nextTokPtr = ptr; | |
194 return XML_TOK_INVALID; | |
195 } | |
196 } | |
197 return XML_TOK_PARTIAL; | |
198 } | |
199 | |
200 static | |
201 int PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, const char *end, int *tokPtr) | |
202 { | |
203 int upper = 0; | |
204 *tokPtr = XML_TOK_PI; | |
205 if (end - ptr != MINBPC(enc)*3) | |
206 return 1; | |
207 switch (BYTE_TO_ASCII(enc, ptr)) { | |
208 case 'x': | |
209 break; | |
210 case 'X': | |
211 upper = 1; | |
212 break; | |
213 default: | |
214 return 1; | |
215 } | |
216 ptr += MINBPC(enc); | |
217 switch (BYTE_TO_ASCII(enc, ptr)) { | |
218 case 'm': | |
219 break; | |
220 case 'M': | |
221 upper = 1; | |
222 break; | |
223 default: | |
224 return 1; | |
225 } | |
226 ptr += MINBPC(enc); | |
227 switch (BYTE_TO_ASCII(enc, ptr)) { | |
228 case 'l': | |
229 break; | |
230 case 'L': | |
231 upper = 1; | |
232 break; | |
233 default: | |
234 return 1; | |
235 } | |
236 if (upper) | |
237 return 0; | |
238 *tokPtr = XML_TOK_XML_DECL; | |
239 return 1; | |
240 } | |
241 | |
242 /* ptr points to character following "<?" */ | |
243 | |
244 static | |
245 int PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end, | |
246 const char **nextTokPtr) | |
247 { | |
248 int tok; | |
249 const char *target = ptr; | |
250 if (ptr == end) | |
251 return XML_TOK_PARTIAL; | |
252 switch (BYTE_TYPE(enc, ptr)) { | |
253 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) | |
254 default: | |
255 *nextTokPtr = ptr; | |
256 return XML_TOK_INVALID; | |
257 } | |
258 while (ptr != end) { | |
259 switch (BYTE_TYPE(enc, ptr)) { | |
260 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) | |
261 case BT_S: case BT_CR: case BT_LF: | |
262 if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) { | |
263 *nextTokPtr = ptr; | |
264 return XML_TOK_INVALID; | |
265 } | |
266 ptr += MINBPC(enc); | |
267 while (ptr != end) { | |
268 switch (BYTE_TYPE(enc, ptr)) { | |
269 INVALID_CASES(ptr, nextTokPtr) | |
270 case BT_QUEST: | |
271 ptr += MINBPC(enc); | |
272 if (ptr == end) | |
273 return XML_TOK_PARTIAL; | |
274 if (CHAR_MATCHES(enc, ptr, '>')) { | |
275 *nextTokPtr = ptr + MINBPC(enc); | |
276 return tok; | |
277 } | |
278 break; | |
279 default: | |
280 ptr += MINBPC(enc); | |
281 break; | |
282 } | |
283 } | |
284 return XML_TOK_PARTIAL; | |
285 case BT_QUEST: | |
286 if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) { | |
287 *nextTokPtr = ptr; | |
288 return XML_TOK_INVALID; | |
289 } | |
290 ptr += MINBPC(enc); | |
291 if (ptr == end) | |
292 return XML_TOK_PARTIAL; | |
293 if (CHAR_MATCHES(enc, ptr, '>')) { | |
294 *nextTokPtr = ptr + MINBPC(enc); | |
295 return tok; | |
296 } | |
297 /* fall through */ | |
298 default: | |
299 *nextTokPtr = ptr; | |
300 return XML_TOK_INVALID; | |
301 } | |
302 } | |
303 return XML_TOK_PARTIAL; | |
304 } | |
305 | |
306 | |
307 static | |
308 int PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end, | |
309 const char **nextTokPtr) | |
310 { | |
311 int i; | |
312 /* CDATA[ */ | |
313 if (end - ptr < 6 * MINBPC(enc)) | |
314 return XML_TOK_PARTIAL; | |
315 for (i = 0; i < 6; i++, ptr += MINBPC(enc)) { | |
316 if (!CHAR_MATCHES(enc, ptr, "CDATA["[i])) { | |
317 *nextTokPtr = ptr; | |
318 return XML_TOK_INVALID; | |
319 } | |
320 } | |
321 *nextTokPtr = ptr; | |
322 return XML_TOK_CDATA_SECT_OPEN; | |
323 } | |
324 | |
325 static | |
326 int PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end, | |
327 const char **nextTokPtr) | |
328 { | |
329 if (ptr == end) | |
330 return XML_TOK_NONE; | |
331 if (MINBPC(enc) > 1) { | |
332 size_t n = end - ptr; | |
333 if (n & (MINBPC(enc) - 1)) { | |
334 n &= ~(MINBPC(enc) - 1); | |
335 if (n == 0) | |
336 return XML_TOK_PARTIAL; | |
337 end = ptr + n; | |
338 } | |
339 } | |
340 switch (BYTE_TYPE(enc, ptr)) { | |
341 case BT_RSQB: | |
342 ptr += MINBPC(enc); | |
343 if (ptr == end) | |
344 return XML_TOK_PARTIAL; | |
345 if (!CHAR_MATCHES(enc, ptr, ']')) | |
346 break; | |
347 ptr += MINBPC(enc); | |
348 if (ptr == end) | |
349 return XML_TOK_PARTIAL; | |
350 if (!CHAR_MATCHES(enc, ptr, '>')) { | |
351 ptr -= MINBPC(enc); | |
352 break; | |
353 } | |
354 *nextTokPtr = ptr + MINBPC(enc); | |
355 return XML_TOK_CDATA_SECT_CLOSE; | |
356 case BT_CR: | |
357 ptr += MINBPC(enc); | |
358 if (ptr == end) | |
359 return XML_TOK_PARTIAL; | |
360 if (BYTE_TYPE(enc, ptr) == BT_LF) | |
361 ptr += MINBPC(enc); | |
362 *nextTokPtr = ptr; | |
363 return XML_TOK_DATA_NEWLINE; | |
364 case BT_LF: | |
365 *nextTokPtr = ptr + MINBPC(enc); | |
366 return XML_TOK_DATA_NEWLINE; | |
367 INVALID_CASES(ptr, nextTokPtr) | |
368 default: | |
369 ptr += MINBPC(enc); | |
370 break; | |
371 } | |
372 while (ptr != end) { | |
373 switch (BYTE_TYPE(enc, ptr)) { | |
374 #define LEAD_CASE(n) \ | |
375 case BT_LEAD ## n: \ | |
376 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \ | |
377 *nextTokPtr = ptr; \ | |
378 return XML_TOK_DATA_CHARS; \ | |
379 } \ | |
380 ptr += n; \ | |
381 break; | |
382 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) | |
383 #undef LEAD_CASE | |
384 case BT_NONXML: | |
385 case BT_MALFORM: | |
386 case BT_TRAIL: | |
387 case BT_CR: | |
388 case BT_LF: | |
389 case BT_RSQB: | |
390 *nextTokPtr = ptr; | |
391 return XML_TOK_DATA_CHARS; | |
392 default: | |
393 ptr += MINBPC(enc); | |
394 break; | |
395 } | |
396 } | |
397 *nextTokPtr = ptr; | |
398 return XML_TOK_DATA_CHARS; | |
399 } | |
400 | |
401 /* ptr points to character following "</" */ | |
402 | |
403 static | |
404 int PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end, | |
405 const char **nextTokPtr) | |
406 { | |
407 if (ptr == end) | |
408 return XML_TOK_PARTIAL; | |
409 switch (BYTE_TYPE(enc, ptr)) { | |
410 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) | |
411 default: | |
412 *nextTokPtr = ptr; | |
413 return XML_TOK_INVALID; | |
414 } | |
415 while (ptr != end) { | |
416 switch (BYTE_TYPE(enc, ptr)) { | |
417 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) | |
418 case BT_S: case BT_CR: case BT_LF: | |
419 for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) { | |
420 switch (BYTE_TYPE(enc, ptr)) { | |
421 case BT_S: case BT_CR: case BT_LF: | |
422 break; | |
423 case BT_GT: | |
424 *nextTokPtr = ptr + MINBPC(enc); | |
425 return XML_TOK_END_TAG; | |
426 default: | |
427 *nextTokPtr = ptr; | |
428 return XML_TOK_INVALID; | |
429 } | |
430 } | |
431 return XML_TOK_PARTIAL; | |
432 #ifdef XML_NS | |
433 case BT_COLON: | |
434 /* no need to check qname syntax here, since end-tag must match exactly */ | |
435 ptr += MINBPC(enc); | |
436 break; | |
437 #endif | |
438 case BT_GT: | |
439 *nextTokPtr = ptr + MINBPC(enc); | |
440 return XML_TOK_END_TAG; | |
441 default: | |
442 *nextTokPtr = ptr; | |
443 return XML_TOK_INVALID; | |
444 } | |
445 } | |
446 return XML_TOK_PARTIAL; | |
447 } | |
448 | |
449 /* ptr points to character following "&#X" */ | |
450 | |
451 static | |
452 int PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, const char *end, | |
453 const char **nextTokPtr) | |
454 { | |
455 if (ptr != end) { | |
456 switch (BYTE_TYPE(enc, ptr)) { | |
457 case BT_DIGIT: | |
458 case BT_HEX: | |
459 break; | |
460 default: | |
461 *nextTokPtr = ptr; | |
462 return XML_TOK_INVALID; | |
463 } | |
464 for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) { | |
465 switch (BYTE_TYPE(enc, ptr)) { | |
466 case BT_DIGIT: | |
467 case BT_HEX: | |
468 break; | |
469 case BT_SEMI: | |
470 *nextTokPtr = ptr + MINBPC(enc); | |
471 return XML_TOK_CHAR_REF; | |
472 default: | |
473 *nextTokPtr = ptr; | |
474 return XML_TOK_INVALID; | |
475 } | |
476 } | |
477 } | |
478 return XML_TOK_PARTIAL; | |
479 } | |
480 | |
481 /* ptr points to character following "&#" */ | |
482 | |
483 static | |
484 int PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, const char *end, | |
485 const char **nextTokPtr) | |
486 { | |
487 if (ptr != end) { | |
488 if (CHAR_MATCHES(enc, ptr, 'x')) | |
489 return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); | |
490 switch (BYTE_TYPE(enc, ptr)) { | |
491 case BT_DIGIT: | |
492 break; | |
493 default: | |
494 *nextTokPtr = ptr; | |
495 return XML_TOK_INVALID; | |
496 } | |
497 for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) { | |
498 switch (BYTE_TYPE(enc, ptr)) { | |
499 case BT_DIGIT: | |
500 break; | |
501 case BT_SEMI: | |
502 *nextTokPtr = ptr + MINBPC(enc); | |
503 return XML_TOK_CHAR_REF; | |
504 default: | |
505 *nextTokPtr = ptr; | |
506 return XML_TOK_INVALID; | |
507 } | |
508 } | |
509 } | |
510 return XML_TOK_PARTIAL; | |
511 } | |
512 | |
513 /* ptr points to character following "&" */ | |
514 | |
515 static | |
516 int PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end, | |
517 const char **nextTokPtr) | |
518 { | |
519 if (ptr == end) | |
520 return XML_TOK_PARTIAL; | |
521 switch (BYTE_TYPE(enc, ptr)) { | |
522 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) | |
523 case BT_NUM: | |
524 return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); | |
525 default: | |
526 *nextTokPtr = ptr; | |
527 return XML_TOK_INVALID; | |
528 } | |
529 while (ptr != end) { | |
530 switch (BYTE_TYPE(enc, ptr)) { | |
531 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) | |
532 case BT_SEMI: | |
533 *nextTokPtr = ptr + MINBPC(enc); | |
534 return XML_TOK_ENTITY_REF; | |
535 default: | |
536 *nextTokPtr = ptr; | |
537 return XML_TOK_INVALID; | |
538 } | |
539 } | |
540 return XML_TOK_PARTIAL; | |
541 } | |
542 | |
/* Scans the attribute list of a start tag through to the end of the tag.

   Returns XML_TOK_START_TAG_WITH_ATTS or XML_TOK_EMPTY_ELEMENT_WITH_ATTS
   with *nextTokPtr just past the closing greater-than sign,
   XML_TOK_INVALID with *nextTokPtr on the offending character, or
   XML_TOK_PARTIAL when the input ends before the tag is complete.  A
   non-positive token coming back from scanRef inside an attribute value is
   returned unchanged. */
543 /* ptr points to character following first character of attribute name */ | |
544 | |
545 static | |
546 int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end, | |
547 const char **nextTokPtr) | |
548 { | |
549 #ifdef XML_NS | |
/* Set once a colon has been seen in the current attribute name; reset
   when the equals sign of that attribute is reached. */
550 int hadColon = 0; | |
551 #endif | |
/* Each pass of this loop consumes one more character of the current
   attribute name, or the whole remainder of the attribute once the name
   ends. */
552 while (ptr != end) { | |
553 switch (BYTE_TYPE(enc, ptr)) { | |
554 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) | |
555 #ifdef XML_NS | |
556 case BT_COLON: | |
/* A second colon in the same name is rejected. */
557 if (hadColon) { | |
558 *nextTokPtr = ptr; | |
559 return XML_TOK_INVALID; | |
560 } | |
561 hadColon = 1; | |
562 ptr += MINBPC(enc); | |
563 if (ptr == end) | |
564 return XML_TOK_PARTIAL; | |
/* The character after the colon must be a name-start character. */
565 switch (BYTE_TYPE(enc, ptr)) { | |
566 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) | |
567 default: | |
568 *nextTokPtr = ptr; | |
569 return XML_TOK_INVALID; | |
570 } | |
571 break; | |
572 #endif | |
573 case BT_S: case BT_CR: case BT_LF: | |
/* Whitespace after the name: skip it; only an equals sign may follow. */
574 for (;;) { | |
575 int t; | |
576 | |
577 ptr += MINBPC(enc); | |
578 if (ptr == end) | |
579 return XML_TOK_PARTIAL; | |
580 t = BYTE_TYPE(enc, ptr); | |
581 if (t == BT_EQUALS) | |
582 break; | |
583 switch (t) { | |
584 case BT_S: | |
585 case BT_LF: | |
586 case BT_CR: | |
587 break; | |
588 default: | |
589 *nextTokPtr = ptr; | |
590 return XML_TOK_INVALID; | |
591 } | |
592 } | |
593 /* fall through */ | |
594 case BT_EQUALS: | |
595 { | |
/* Byte type of the quote character that opened the attribute value. */
596 int open; | |
597 #ifdef XML_NS | |
598 hadColon = 0; | |
599 #endif | |
/* Skip whitespace after the equals sign up to the opening quote. */
600 for (;;) { | |
601 | |
602 ptr += MINBPC(enc); | |
603 if (ptr == end) | |
604 return XML_TOK_PARTIAL; | |
605 open = BYTE_TYPE(enc, ptr); | |
606 if (open == BT_QUOT || open == BT_APOS) | |
607 break; | |
608 switch (open) { | |
609 case BT_S: | |
610 case BT_LF: | |
611 case BT_CR: | |
612 break; | |
613 default: | |
614 *nextTokPtr = ptr; | |
615 return XML_TOK_INVALID; | |
616 } | |
617 } | |
618 ptr += MINBPC(enc); | |
619 /* in attribute value */ | |
/* Scan up to the quote whose byte type matches the opening one. */
620 for (;;) { | |
621 int t; | |
622 if (ptr == end) | |
623 return XML_TOK_PARTIAL; | |
624 t = BYTE_TYPE(enc, ptr); | |
625 if (t == open) | |
626 break; | |
627 switch (t) { | |
628 INVALID_CASES(ptr, nextTokPtr) | |
629 case BT_AMP: | |
630 { | |
/* scanRef receives the address of ptr as its output pointer, so on success
   ptr is moved just past the reference. */
631 int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr); | |
/* NOTE(review): presumably the error and partial token codes are the
   non-positive ones; confirm against the XML_TOK definitions. */
632 if (tok <= 0) { | |
633 if (tok == XML_TOK_INVALID) | |
634 *nextTokPtr = ptr; | |
635 return tok; | |
636 } | |
637 break; | |
638 } | |
/* A less-than sign is not allowed inside an attribute value. */
639 case BT_LT: | |
640 *nextTokPtr = ptr; | |
641 return XML_TOK_INVALID; | |
642 default: | |
643 ptr += MINBPC(enc); | |
644 break; | |
645 } | |
646 } | |
/* Step over the closing quote; only whitespace, a slash or a greater-than
   sign may come directly after it. */
647 ptr += MINBPC(enc); | |
648 if (ptr == end) | |
649 return XML_TOK_PARTIAL; | |
650 switch (BYTE_TYPE(enc, ptr)) { | |
651 case BT_S: | |
652 case BT_CR: | |
653 case BT_LF: | |
654 break; | |
/* These two gotos jump to labels inside the switch of the loop below. */
655 case BT_SOL: | |
656 goto sol; | |
657 case BT_GT: | |
658 goto gt; | |
659 default: | |
660 *nextTokPtr = ptr; | |
661 return XML_TOK_INVALID; | |
662 } | |
663 /* ptr points to the whitespace character following the closing quote */ | |
/* Skip further whitespace, then expect the first character of the next
   attribute name or the end of the tag. */
664 for (;;) { | |
665 ptr += MINBPC(enc); | |
666 if (ptr == end) | |
667 return XML_TOK_PARTIAL; | |
668 switch (BYTE_TYPE(enc, ptr)) { | |
/* A name-start character is consumed by the macro, which then breaks out
   of this switch; the two break statements after the switch leave this
   loop and the outer switch, so the outer while loop carries on with the
   rest of the new attribute name. */
669 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) | |
670 case BT_S: case BT_CR: case BT_LF: | |
671 continue; | |
672 case BT_GT: | |
673 gt: | |
674 *nextTokPtr = ptr + MINBPC(enc); | |
675 return XML_TOK_START_TAG_WITH_ATTS; | |
676 case BT_SOL: | |
677 sol: | |
/* A slash must be followed immediately by a greater-than sign. */
678 ptr += MINBPC(enc); | |
679 if (ptr == end) | |
680 return XML_TOK_PARTIAL; | |
681 if (!CHAR_MATCHES(enc, ptr, '>')) { | |
682 *nextTokPtr = ptr; | |
683 return XML_TOK_INVALID; | |
684 } | |
685 *nextTokPtr = ptr + MINBPC(enc); | |
686 return XML_TOK_EMPTY_ELEMENT_WITH_ATTS; | |
687 default: | |
688 *nextTokPtr = ptr; | |
689 return XML_TOK_INVALID; | |
690 } | |
691 break; | |
692 } | |
693 break; | |
694 } | |
695 default: | |
696 *nextTokPtr = ptr; | |
697 return XML_TOK_INVALID; | |
698 } | |
699 } | |
700 return XML_TOK_PARTIAL; | |
701 } | |
702 | |
/* Scans the markup that begins with a less-than sign and dispatches on the
   character after it: an exclamation mark leads to scanComment or
   scanCdataSection, a question mark to scanPi, a slash to scanEndTag, and a
   name-start character begins a start tag that is scanned here.

   For a start tag without attributes the result is
   XML_TOK_START_TAG_NO_ATTS or XML_TOK_EMPTY_ELEMENT_NO_ATTS with
   *nextTokPtr just past the closing greater-than sign; once an attribute
   name starts, the result of scanAtts is returned.  Malformed input gives
   XML_TOK_INVALID with *nextTokPtr on the offending character, and running
   out of input gives XML_TOK_PARTIAL. */
703 /* ptr points to character following "<" */ | |
704 | |
705 static | |
706 int PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end, | |
707 const char **nextTokPtr) | |
708 { | |
709 #ifdef XML_NS | |
/* Set once a colon has been seen in the element name. */
710 int hadColon; | |
711 #endif | |
712 if (ptr == end) | |
713 return XML_TOK_PARTIAL; | |
714 switch (BYTE_TYPE(enc, ptr)) { | |
/* A name-start character is consumed by the macro, which breaks out of the
   switch so that the start-tag loop below takes over. */
715 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) | |
716 case BT_EXCL: | |
717 if ((ptr += MINBPC(enc)) == end) | |
718 return XML_TOK_PARTIAL; | |
719 switch (BYTE_TYPE(enc, ptr)) { | |
720 case BT_MINUS: | |
721 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr); | |
722 case BT_LSQB: | |
723 return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), end, nextTokPtr); | |
724 } | |
/* Anything else after the exclamation mark is rejected here. */
725 *nextTokPtr = ptr; | |
726 return XML_TOK_INVALID; | |
727 case BT_QUEST: | |
728 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr); | |
729 case BT_SOL: | |
730 return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr); | |
731 default: | |
732 *nextTokPtr = ptr; | |
733 return XML_TOK_INVALID; | |
734 } | |
735 #ifdef XML_NS | |
736 hadColon = 0; | |
737 #endif | |
738 /* we have a start-tag */ | |
739 while (ptr != end) { | |
740 switch (BYTE_TYPE(enc, ptr)) { | |
741 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) | |
742 #ifdef XML_NS | |
743 case BT_COLON: | |
/* A second colon in the element name is rejected. */
744 if (hadColon) { | |
745 *nextTokPtr = ptr; | |
746 return XML_TOK_INVALID; | |
747 } | |
748 hadColon = 1; | |
749 ptr += MINBPC(enc); | |
750 if (ptr == end) | |
751 return XML_TOK_PARTIAL; | |
/* The character after the colon must be a name-start character. */
752 switch (BYTE_TYPE(enc, ptr)) { | |
753 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) | |
754 default: | |
755 *nextTokPtr = ptr; | |
756 return XML_TOK_INVALID; | |
757 } | |
758 break; | |
759 #endif | |
760 case BT_S: case BT_CR: case BT_LF: | |
761 { | |
/* Element name is finished.  Skip whitespace, then either close the tag
   or hand over to scanAtts. */
762 ptr += MINBPC(enc); | |
763 while (ptr != end) { | |
764 switch (BYTE_TYPE(enc, ptr)) { | |
/* A name-start character is consumed by the macro, which breaks out of
   this switch and so reaches the scanAtts call below with ptr just past
   the first character of the attribute name. */
765 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) | |
/* These gotos jump to the labels in the outer switch further down. */
766 case BT_GT: | |
767 goto gt; | |
768 case BT_SOL: | |
769 goto sol; | |
770 case BT_S: case BT_CR: case BT_LF: | |
771 ptr += MINBPC(enc); | |
772 continue; | |
773 default: | |
774 *nextTokPtr = ptr; | |
775 return XML_TOK_INVALID; | |
776 } | |
777 return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr); | |
778 } | |
779 return XML_TOK_PARTIAL; | |
780 } | |
781 case BT_GT: | |
782 gt: | |
783 *nextTokPtr = ptr + MINBPC(enc); | |
784 return XML_TOK_START_TAG_NO_ATTS; | |
785 case BT_SOL: | |
786 sol: | |
/* A slash must be followed immediately by a greater-than sign. */
787 ptr += MINBPC(enc); | |
788 if (ptr == end) | |
789 return XML_TOK_PARTIAL; | |
790 if (!CHAR_MATCHES(enc, ptr, '>')) { | |
791 *nextTokPtr = ptr; | |
792 return XML_TOK_INVALID; | |
793 } | |
794 *nextTokPtr = ptr + MINBPC(enc); | |
795 return XML_TOK_EMPTY_ELEMENT_NO_ATTS; | |
796 default: | |
797 *nextTokPtr = ptr; | |
798 return XML_TOK_INVALID; | |
799 } | |
800 } | |
801 return XML_TOK_PARTIAL; | |
802 } | |
803 | |
804 static | |
805 int PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end, | |
806 const char **nextTokPtr) | |
807 { | |
808 if (ptr == end) | |
809 return XML_TOK_NONE; | |
810 if (MINBPC(enc) > 1) { | |
811 size_t n = end - ptr; | |
812 if (n & (MINBPC(enc) - 1)) { | |
813 n &= ~(MINBPC(enc) - 1); | |
814 if (n == 0) | |
815 return XML_TOK_PARTIAL; | |
816 end = ptr + n; | |
817 } | |
818 } | |
819 switch (BYTE_TYPE(enc, ptr)) { | |
820 case BT_LT: | |
821 return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr); | |
822 case BT_AMP: | |
823 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); | |
824 case BT_CR: | |
825 ptr += MINBPC(enc); | |
826 if (ptr == end) | |
827 return XML_TOK_TRAILING_CR; | |
828 if (BYTE_TYPE(enc, ptr) == BT_LF) | |
829 ptr += MINBPC(enc); | |
830 *nextTokPtr = ptr; | |
831 return XML_TOK_DATA_NEWLINE; | |
832 case BT_LF: | |
833 *nextTokPtr = ptr + MINBPC(enc); | |
834 return XML_TOK_DATA_NEWLINE; | |
835 case BT_RSQB: | |
836 ptr += MINBPC(enc); | |
837 if (ptr == end) | |
838 return XML_TOK_TRAILING_RSQB; | |
839 if (!CHAR_MATCHES(enc, ptr, ']')) | |
840 break; | |
841 ptr += MINBPC(enc); | |
842 if (ptr == end) | |
843 return XML_TOK_TRAILING_RSQB; | |
844 if (!CHAR_MATCHES(enc, ptr, '>')) { | |
845 ptr -= MINBPC(enc); | |
846 break; | |
847 } | |
848 *nextTokPtr = ptr; | |
849 return XML_TOK_INVALID; | |
850 INVALID_CASES(ptr, nextTokPtr) | |
851 default: | |
852 ptr += MINBPC(enc); | |
853 break; | |
854 } | |
855 while (ptr != end) { | |
856 switch (BYTE_TYPE(enc, ptr)) { | |
857 #define LEAD_CASE(n) \ | |
858 case BT_LEAD ## n: \ | |
859 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \ | |
860 *nextTokPtr = ptr; \ | |
861 return XML_TOK_DATA_CHARS; \ | |
862 } \ | |
863 ptr += n; \ | |
864 break; | |
865 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) | |
866 #undef LEAD_CASE | |
867 case BT_RSQB: | |
868 if (ptr + MINBPC(enc) != end) { | |
869 if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ']')) { | |
870 ptr += MINBPC(enc); | |
871 break; | |
872 } | |
873 if (ptr + 2*MINBPC(enc) != end) { | |
874 if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), '>')) { | |
875 ptr += MINBPC(enc); | |
876 break; | |
877 } | |
878 *nextTokPtr = ptr + 2*MINBPC(enc); | |
879 return XML_TOK_INVALID; | |
880 } | |
881 } | |
882 /* fall through */ | |
883 case BT_AMP: | |
884 case BT_LT: | |
885 case BT_NONXML: | |
886 case BT_MALFORM: | |
887 case BT_TRAIL: | |
888 case BT_CR: | |
889 case BT_LF: | |
890 *nextTokPtr = ptr; | |
891 return XML_TOK_DATA_CHARS; | |
892 default: | |
893 ptr += MINBPC(enc); | |
894 break; | |
895 } | |
896 } | |
897 *nextTokPtr = ptr; | |
898 return XML_TOK_DATA_CHARS; | |
899 } | |
900 | |
901 /* ptr points to character following "%" */ | |
902 | |
903 static | |
904 int PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end, | |
905 const char **nextTokPtr) | |
906 { | |
907 if (ptr == end) | |
908 return XML_TOK_PARTIAL; | |
909 switch (BYTE_TYPE(enc, ptr)) { | |
910 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) | |
911 case BT_S: case BT_LF: case BT_CR: case BT_PERCNT: | |
912 *nextTokPtr = ptr; | |
913 return XML_TOK_PERCENT; | |
914 default: | |
915 *nextTokPtr = ptr; | |
916 return XML_TOK_INVALID; | |
917 } | |
918 while (ptr != end) { | |
919 switch (BYTE_TYPE(enc, ptr)) { | |
920 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) | |
921 case BT_SEMI: | |
922 *nextTokPtr = ptr + MINBPC(enc); | |
923 return XML_TOK_PARAM_ENTITY_REF; | |
924 default: | |
925 *nextTokPtr = ptr; | |
926 return XML_TOK_INVALID; | |
927 } | |
928 } | |
929 return XML_TOK_PARTIAL; | |
930 } | |
931 | |
932 static | |
933 int PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end, | |
934 const char **nextTokPtr) | |
935 { | |
936 if (ptr == end) | |
937 return XML_TOK_PARTIAL; | |
938 switch (BYTE_TYPE(enc, ptr)) { | |
939 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) | |
940 default: | |
941 *nextTokPtr = ptr; | |
942 return XML_TOK_INVALID; | |
943 } | |
944 while (ptr != end) { | |
945 switch (BYTE_TYPE(enc, ptr)) { | |
946 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) | |
947 case BT_CR: case BT_LF: case BT_S: | |
948 case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR: | |
949 *nextTokPtr = ptr; | |
950 return XML_TOK_POUND_NAME; | |
951 default: | |
952 *nextTokPtr = ptr; | |
953 return XML_TOK_INVALID; | |
954 } | |
955 } | |
956 return XML_TOK_PARTIAL; | |
957 } | |
958 | |
959 static | |
960 int PREFIX(scanLit)(int open, const ENCODING *enc, | |
961 const char *ptr, const char *end, | |
962 const char **nextTokPtr) | |
963 { | |
964 while (ptr != end) { | |
965 int t = BYTE_TYPE(enc, ptr); | |
966 switch (t) { | |
967 INVALID_CASES(ptr, nextTokPtr) | |
968 case BT_QUOT: | |
969 case BT_APOS: | |
970 ptr += MINBPC(enc); | |
971 if (t != open) | |
972 break; | |
973 if (ptr == end) | |
974 return XML_TOK_PARTIAL; | |
975 *nextTokPtr = ptr; | |
976 switch (BYTE_TYPE(enc, ptr)) { | |
977 case BT_S: case BT_CR: case BT_LF: | |
978 case BT_GT: case BT_PERCNT: case BT_LSQB: | |
979 return XML_TOK_LITERAL; | |
980 default: | |
981 return XML_TOK_INVALID; | |
982 } | |
983 default: | |
984 ptr += MINBPC(enc); | |
985 break; | |
986 } | |
987 } | |
988 return XML_TOK_PARTIAL; | |
989 } | |
990 | |
991 static | |
992 int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end, | |
993 const char **nextTokPtr) | |
994 { | |
995 int tok; | |
996 if (ptr == end) | |
997 return XML_TOK_NONE; | |
998 if (MINBPC(enc) > 1) { | |
999 size_t n = end - ptr; | |
1000 if (n & (MINBPC(enc) - 1)) { | |
1001 n &= ~(MINBPC(enc) - 1); | |
1002 if (n == 0) | |
1003 return XML_TOK_PARTIAL; | |
1004 end = ptr + n; | |
1005 } | |
1006 } | |
1007 switch (BYTE_TYPE(enc, ptr)) { | |
1008 case BT_QUOT: | |
1009 return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr); | |
1010 case BT_APOS: | |
1011 return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr); | |
1012 case BT_LT: | |
1013 { | |
1014 ptr += MINBPC(enc); | |
1015 if (ptr == end) | |
1016 return XML_TOK_PARTIAL; | |
1017 switch (BYTE_TYPE(enc, ptr)) { | |
1018 case BT_EXCL: | |
1019 return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr); | |
1020 case BT_QUEST: | |
1021 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr); | |
1022 case BT_NMSTRT: | |
1023 case BT_HEX: | |
1024 case BT_NONASCII: | |
1025 case BT_LEAD2: | |
1026 case BT_LEAD3: | |
1027 case BT_LEAD4: | |
1028 *nextTokPtr = ptr - MINBPC(enc); | |
1029 return XML_TOK_INSTANCE_START; | |
1030 } | |
1031 *nextTokPtr = ptr; | |
1032 return XML_TOK_INVALID; | |
1033 } | |
1034 case BT_CR: | |
1035 if (ptr + MINBPC(enc) == end) | |
1036 return XML_TOK_TRAILING_CR; | |
1037 /* fall through */ | |
1038 case BT_S: case BT_LF: | |
1039 for (;;) { | |
1040 ptr += MINBPC(enc); | |
1041 if (ptr == end) | |
1042 break; | |
1043 switch (BYTE_TYPE(enc, ptr)) { | |
1044 case BT_S: case BT_LF: | |
1045 break; | |
1046 case BT_CR: | |
1047 /* don't split CR/LF pair */ | |
1048 if (ptr + MINBPC(enc) != end) | |
1049 break; | |
1050 /* fall through */ | |
1051 default: | |
1052 *nextTokPtr = ptr; | |
1053 return XML_TOK_PROLOG_S; | |
1054 } | |
1055 } | |
1056 *nextTokPtr = ptr; | |
1057 return XML_TOK_PROLOG_S; | |
1058 case BT_PERCNT: | |
1059 return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr); | |
1060 case BT_COMMA: | |
1061 *nextTokPtr = ptr + MINBPC(enc); | |
1062 return XML_TOK_COMMA; | |
1063 case BT_LSQB: | |
1064 *nextTokPtr = ptr + MINBPC(enc); | |
1065 return XML_TOK_OPEN_BRACKET; | |
1066 case BT_RSQB: | |
1067 ptr += MINBPC(enc); | |
1068 if (ptr == end) | |
1069 return XML_TOK_PARTIAL; | |
1070 if (CHAR_MATCHES(enc, ptr, ']')) { | |
1071 if (ptr + MINBPC(enc) == end) | |
1072 return XML_TOK_PARTIAL; | |
1073 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), '>')) { | |
1074 *nextTokPtr = ptr + 2*MINBPC(enc); | |
1075 return XML_TOK_COND_SECT_CLOSE; | |
1076 } | |
1077 } | |
1078 *nextTokPtr = ptr; | |
1079 return XML_TOK_CLOSE_BRACKET; | |
1080 case BT_LPAR: | |
1081 *nextTokPtr = ptr + MINBPC(enc); | |
1082 return XML_TOK_OPEN_PAREN; | |
1083 case BT_RPAR: | |
1084 ptr += MINBPC(enc); | |
1085 if (ptr == end) | |
1086 return XML_TOK_PARTIAL; | |
1087 switch (BYTE_TYPE(enc, ptr)) { | |
1088 case BT_AST: | |
1089 *nextTokPtr = ptr + MINBPC(enc); | |
1090 return XML_TOK_CLOSE_PAREN_ASTERISK; | |
1091 case BT_QUEST: | |
1092 *nextTokPtr = ptr + MINBPC(enc); | |
1093 return XML_TOK_CLOSE_PAREN_QUESTION; | |
1094 case BT_PLUS: | |
1095 *nextTokPtr = ptr + MINBPC(enc); | |
1096 return XML_TOK_CLOSE_PAREN_PLUS; | |
1097 case BT_CR: case BT_LF: case BT_S: | |
1098 case BT_GT: case BT_COMMA: case BT_VERBAR: | |
1099 case BT_RPAR: | |
1100 *nextTokPtr = ptr; | |
1101 return XML_TOK_CLOSE_PAREN; | |
1102 } | |
1103 *nextTokPtr = ptr; | |
1104 return XML_TOK_INVALID; | |
1105 case BT_VERBAR: | |
1106 *nextTokPtr = ptr + MINBPC(enc); | |
1107 return XML_TOK_OR; | |
1108 case BT_GT: | |
1109 *nextTokPtr = ptr + MINBPC(enc); | |
1110 return XML_TOK_DECL_CLOSE; | |
1111 case BT_NUM: | |
1112 return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr); | |
1113 #define LEAD_CASE(n) \ | |
1114 case BT_LEAD ## n: \ | |
1115 if (end - ptr < n) \ | |
1116 return XML_TOK_PARTIAL_CHAR; \ | |
1117 if (IS_NMSTRT_CHAR(enc, ptr, n)) { \ | |
1118 ptr += n; \ | |
1119 tok = XML_TOK_NAME; \ | |
1120 break; \ | |
1121 } \ | |
1122 if (IS_NAME_CHAR(enc, ptr, n)) { \ | |
1123 ptr += n; \ | |
1124 tok = XML_TOK_NMTOKEN; \ | |
1125 break; \ | |
1126 } \ | |
1127 *nextTokPtr = ptr; \ | |
1128 return XML_TOK_INVALID; | |
1129 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) | |
1130 #undef LEAD_CASE | |
1131 case BT_NMSTRT: | |
1132 case BT_HEX: | |
1133 tok = XML_TOK_NAME; | |
1134 ptr += MINBPC(enc); | |
1135 break; | |
1136 case BT_DIGIT: | |
1137 case BT_NAME: | |
1138 case BT_MINUS: | |
1139 #ifdef XML_NS | |
1140 case BT_COLON: | |
1141 #endif | |
1142 tok = XML_TOK_NMTOKEN; | |
1143 ptr += MINBPC(enc); | |
1144 break; | |
1145 case BT_NONASCII: | |
1146 if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { | |
1147 ptr += MINBPC(enc); | |
1148 tok = XML_TOK_NAME; | |
1149 break; | |
1150 } | |
1151 if (IS_NAME_CHAR_MINBPC(enc, ptr)) { | |
1152 ptr += MINBPC(enc); | |
1153 tok = XML_TOK_NMTOKEN; | |
1154 break; | |
1155 } | |
1156 /* fall through */ | |
1157 default: | |
1158 *nextTokPtr = ptr; | |
1159 return XML_TOK_INVALID; | |
1160 } | |
1161 while (ptr != end) { | |
1162 switch (BYTE_TYPE(enc, ptr)) { | |
1163 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) | |
1164 case BT_GT: case BT_RPAR: case BT_COMMA: | |
1165 case BT_VERBAR: case BT_LSQB: case BT_PERCNT: | |
1166 case BT_S: case BT_CR: case BT_LF: | |
1167 *nextTokPtr = ptr; | |
1168 return tok; | |
1169 #ifdef XML_NS | |
1170 case BT_COLON: | |
1171 ptr += MINBPC(enc); | |
1172 switch (tok) { | |
1173 case XML_TOK_NAME: | |
1174 if (ptr == end) | |
1175 return XML_TOK_PARTIAL; | |
1176 tok = XML_TOK_PREFIXED_NAME; | |
1177 switch (BYTE_TYPE(enc, ptr)) { | |
1178 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) | |
1179 default: | |
1180 tok = XML_TOK_NMTOKEN; | |
1181 break; | |
1182 } | |
1183 break; | |
1184 case XML_TOK_PREFIXED_NAME: | |
1185 tok = XML_TOK_NMTOKEN; | |
1186 break; | |
1187 } | |
1188 break; | |
1189 #endif | |
1190 case BT_PLUS: | |
1191 if (tok == XML_TOK_NMTOKEN) { | |
1192 *nextTokPtr = ptr; | |
1193 return XML_TOK_INVALID; | |
1194 } | |
1195 *nextTokPtr = ptr + MINBPC(enc); | |
1196 return XML_TOK_NAME_PLUS; | |
1197 case BT_AST: | |
1198 if (tok == XML_TOK_NMTOKEN) { | |
1199 *nextTokPtr = ptr; | |
1200 return XML_TOK_INVALID; | |
1201 } | |
1202 *nextTokPtr = ptr + MINBPC(enc); | |
1203 return XML_TOK_NAME_ASTERISK; | |
1204 case BT_QUEST: | |
1205 if (tok == XML_TOK_NMTOKEN) { | |
1206 *nextTokPtr = ptr; | |
1207 return XML_TOK_INVALID; | |
1208 } | |
1209 *nextTokPtr = ptr + MINBPC(enc); | |
1210 return XML_TOK_NAME_QUESTION; | |
1211 default: | |
1212 *nextTokPtr = ptr; | |
1213 return XML_TOK_INVALID; | |
1214 } | |
1215 } | |
1216 return XML_TOK_PARTIAL; | |
1217 } | |
1218 | |
1219 static | |
1220 int PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end, | |
1221 const char **nextTokPtr) | |
1222 { | |
1223 const char *start; | |
1224 if (ptr == end) | |
1225 return XML_TOK_NONE; | |
1226 start = ptr; | |
1227 while (ptr != end) { | |
1228 switch (BYTE_TYPE(enc, ptr)) { | |
1229 #define LEAD_CASE(n) \ | |
1230 case BT_LEAD ## n: ptr += n; break; | |
1231 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) | |
1232 #undef LEAD_CASE | |
1233 case BT_AMP: | |
1234 if (ptr == start) | |
1235 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); | |
1236 *nextTokPtr = ptr; | |
1237 return XML_TOK_DATA_CHARS; | |
1238 case BT_LT: | |
1239 /* this is for inside entity references */ | |
1240 *nextTokPtr = ptr; | |
1241 return XML_TOK_INVALID; | |
1242 case BT_LF: | |
1243 if (ptr == start) { | |
1244 *nextTokPtr = ptr + MINBPC(enc); | |
1245 return XML_TOK_DATA_NEWLINE; | |
1246 } | |
1247 *nextTokPtr = ptr; | |
1248 return XML_TOK_DATA_CHARS; | |
1249 case BT_CR: | |
1250 if (ptr == start) { | |
1251 ptr += MINBPC(enc); | |
1252 if (ptr == end) | |
1253 return XML_TOK_TRAILING_CR; | |
1254 if (BYTE_TYPE(enc, ptr) == BT_LF) | |
1255 ptr += MINBPC(enc); | |
1256 *nextTokPtr = ptr; | |
1257 return XML_TOK_DATA_NEWLINE; | |
1258 } | |
1259 *nextTokPtr = ptr; | |
1260 return XML_TOK_DATA_CHARS; | |
1261 case BT_S: | |
1262 if (ptr == start) { | |
1263 *nextTokPtr = ptr + MINBPC(enc); | |
1264 return XML_TOK_ATTRIBUTE_VALUE_S; | |
1265 } | |
1266 *nextTokPtr = ptr; | |
1267 return XML_TOK_DATA_CHARS; | |
1268 default: | |
1269 ptr += MINBPC(enc); | |
1270 break; | |
1271 } | |
1272 } | |
1273 *nextTokPtr = ptr; | |
1274 return XML_TOK_DATA_CHARS; | |
1275 } | |
1276 | |
1277 static | |
1278 int PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end, | |
1279 const char **nextTokPtr) | |
1280 { | |
1281 const char *start; | |
1282 if (ptr == end) | |
1283 return XML_TOK_NONE; | |
1284 start = ptr; | |
1285 while (ptr != end) { | |
1286 switch (BYTE_TYPE(enc, ptr)) { | |
1287 #define LEAD_CASE(n) \ | |
1288 case BT_LEAD ## n: ptr += n; break; | |
1289 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) | |
1290 #undef LEAD_CASE | |
1291 case BT_AMP: | |
1292 if (ptr == start) | |
1293 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); | |
1294 *nextTokPtr = ptr; | |
1295 return XML_TOK_DATA_CHARS; | |
1296 case BT_PERCNT: | |
1297 if (ptr == start) | |
1298 return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr); | |
1299 *nextTokPtr = ptr; | |
1300 return XML_TOK_DATA_CHARS; | |
1301 case BT_LF: | |
1302 if (ptr == start) { | |
1303 *nextTokPtr = ptr + MINBPC(enc); | |
1304 return XML_TOK_DATA_NEWLINE; | |
1305 } | |
1306 *nextTokPtr = ptr; | |
1307 return XML_TOK_DATA_CHARS; | |
1308 case BT_CR: | |
1309 if (ptr == start) { | |
1310 ptr += MINBPC(enc); | |
1311 if (ptr == end) | |
1312 return XML_TOK_TRAILING_CR; | |
1313 if (BYTE_TYPE(enc, ptr) == BT_LF) | |
1314 ptr += MINBPC(enc); | |
1315 *nextTokPtr = ptr; | |
1316 return XML_TOK_DATA_NEWLINE; | |
1317 } | |
1318 *nextTokPtr = ptr; | |
1319 return XML_TOK_DATA_CHARS; | |
1320 default: | |
1321 ptr += MINBPC(enc); | |
1322 break; | |
1323 } | |
1324 } | |
1325 *nextTokPtr = ptr; | |
1326 return XML_TOK_DATA_CHARS; | |
1327 } | |
1328 | |
1329 static | |
1330 int PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end, | |
1331 const char **badPtr) | |
1332 { | |
1333 ptr += MINBPC(enc); | |
1334 end -= MINBPC(enc); | |
1335 for (; ptr != end; ptr += MINBPC(enc)) { | |
1336 switch (BYTE_TYPE(enc, ptr)) { | |
1337 case BT_DIGIT: | |
1338 case BT_HEX: | |
1339 case BT_MINUS: | |
1340 case BT_APOS: | |
1341 case BT_LPAR: | |
1342 case BT_RPAR: | |
1343 case BT_PLUS: | |
1344 case BT_COMMA: | |
1345 case BT_SOL: | |
1346 case BT_EQUALS: | |
1347 case BT_QUEST: | |
1348 case BT_CR: | |
1349 case BT_LF: | |
1350 case BT_SEMI: | |
1351 case BT_EXCL: | |
1352 case BT_AST: | |
1353 case BT_PERCNT: | |
1354 case BT_NUM: | |
1355 #ifdef XML_NS | |
1356 case BT_COLON: | |
1357 #endif | |
1358 break; | |
1359 case BT_S: | |
1360 if (CHAR_MATCHES(enc, ptr, '\t')) { | |
1361 *badPtr = ptr; | |
1362 return 0; | |
1363 } | |
1364 break; | |
1365 case BT_NAME: | |
1366 case BT_NMSTRT: | |
1367 if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f)) | |
1368 break; | |
1369 default: | |
1370 switch (BYTE_TO_ASCII(enc, ptr)) { | |
1371 case 0x24: /* $ */ | |
1372 case 0x40: /* @ */ | |
1373 break; | |
1374 default: | |
1375 *badPtr = ptr; | |
1376 return 0; | |
1377 } | |
1378 break; | |
1379 } | |
1380 } | |
1381 return 1; | |
1382 } | |
1383 | |
1384 /* This must only be called for a well-formed start-tag or empty element tag. | |
1385 Returns the number of attributes. Pointers to the first attsMax attributes | |
1386 are stored in atts. */ | |
1387 | |
1388 static | |
1389 int PREFIX(getAtts)(const ENCODING *enc, const char *ptr, | |
1390 int attsMax, ATTRIBUTE *atts) | |
1391 { | |
1392 enum { other, inName, inValue } state = inName; | |
1393 int nAtts = 0; | |
1394 int open; | |
1395 | |
1396 for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) { | |
1397 switch (BYTE_TYPE(enc, ptr)) { | |
1398 #define START_NAME \ | |
1399 if (state == other) { \ | |
1400 if (nAtts < attsMax) { \ | |
1401 atts[nAtts].name = ptr; \ | |
1402 atts[nAtts].normalized = 1; \ | |
1403 } \ | |
1404 state = inName; \ | |
1405 } | |
1406 #define LEAD_CASE(n) \ | |
1407 case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break; | |
1408 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) | |
1409 #undef LEAD_CASE | |
1410 case BT_NONASCII: | |
1411 case BT_NMSTRT: | |
1412 case BT_HEX: | |
1413 START_NAME | |
1414 break; | |
1415 #undef START_NAME | |
1416 case BT_QUOT: | |
1417 if (state != inValue) { | |
1418 if (nAtts < attsMax) | |
1419 atts[nAtts].valuePtr = ptr + MINBPC(enc); | |
1420 state = inValue; | |
1421 open = BT_QUOT; | |
1422 } | |
1423 else if (open == BT_QUOT) { | |
1424 state = other; | |
1425 if (nAtts < attsMax) | |
1426 atts[nAtts].valueEnd = ptr; | |
1427 nAtts++; | |
1428 } | |
1429 break; | |
1430 case BT_APOS: | |
1431 if (state != inValue) { | |
1432 if (nAtts < attsMax) | |
1433 atts[nAtts].valuePtr = ptr + MINBPC(enc); | |
1434 state = inValue; | |
1435 open = BT_APOS; | |
1436 } | |
1437 else if (open == BT_APOS) { | |
1438 state = other; | |
1439 if (nAtts < attsMax) | |
1440 atts[nAtts].valueEnd = ptr; | |
1441 nAtts++; | |
1442 } | |
1443 break; | |
1444 case BT_AMP: | |
1445 if (nAtts < attsMax) | |
1446 atts[nAtts].normalized = 0; | |
1447 break; | |
1448 case BT_S: | |
1449 if (state == inName) | |
1450 state = other; | |
1451 else if (state == inValue | |
1452 && nAtts < attsMax | |
1453 && atts[nAtts].normalized | |
1454 && (ptr == atts[nAtts].valuePtr | |
1455 || BYTE_TO_ASCII(enc, ptr) != ' ' | |
1456 || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ' ' | |
1457 || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open)) | |
1458 atts[nAtts].normalized = 0; | |
1459 break; | |
1460 case BT_CR: case BT_LF: | |
1461 /* This case ensures that the first attribute name is counted | |
1462 Apart from that we could just change state on the quote. */ | |
1463 if (state == inName) | |
1464 state = other; | |
1465 else if (state == inValue && nAtts < attsMax) | |
1466 atts[nAtts].normalized = 0; | |
1467 break; | |
1468 case BT_GT: | |
1469 case BT_SOL: | |
1470 if (state != inValue) | |
1471 return nAtts; | |
1472 break; | |
1473 default: | |
1474 break; | |
1475 } | |
1476 } | |
1477 /* not reached */ | |
1478 } | |
1479 | |
1480 static | |
1481 int PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr) | |
1482 { | |
1483 int result = 0; | |
1484 /* skip &# */ | |
1485 ptr += 2*MINBPC(enc); | |
1486 if (CHAR_MATCHES(enc, ptr, 'x')) { | |
1487 for (ptr += MINBPC(enc); !CHAR_MATCHES(enc, ptr, ';'); ptr += MINBPC(enc)) { | |
1488 int c = BYTE_TO_ASCII(enc, ptr); | |
1489 switch (c) { | |
1490 case '0': case '1': case '2': case '3': case '4': | |
1491 case '5': case '6': case '7': case '8': case '9': | |
1492 result <<= 4; | |
1493 result |= (c - '0'); | |
1494 break; | |
1495 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': | |
1496 result <<= 4; | |
1497 result += 10 + (c - 'A'); | |
1498 break; | |
1499 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': | |
1500 result <<= 4; | |
1501 result += 10 + (c - 'a'); | |
1502 break; | |
1503 } | |
1504 if (result >= 0x110000) | |
1505 return -1; | |
1506 } | |
1507 } | |
1508 else { | |
1509 for (; !CHAR_MATCHES(enc, ptr, ';'); ptr += MINBPC(enc)) { | |
1510 int c = BYTE_TO_ASCII(enc, ptr); | |
1511 result *= 10; | |
1512 result += (c - '0'); | |
1513 if (result >= 0x110000) | |
1514 return -1; | |
1515 } | |
1516 } | |
1517 return checkCharRefNumber(result); | |
1518 } | |
1519 | |
1520 static | |
1521 int PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr, const char *end) | |
1522 { | |
1523 switch ((end - ptr)/MINBPC(enc)) { | |
1524 case 2: | |
1525 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), 't')) { | |
1526 switch (BYTE_TO_ASCII(enc, ptr)) { | |
1527 case 'l': | |
1528 return '<'; | |
1529 case 'g': | |
1530 return '>'; | |
1531 } | |
1532 } | |
1533 break; | |
1534 case 3: | |
1535 if (CHAR_MATCHES(enc, ptr, 'a')) { | |
1536 ptr += MINBPC(enc); | |
1537 if (CHAR_MATCHES(enc, ptr, 'm')) { | |
1538 ptr += MINBPC(enc); | |
1539 if (CHAR_MATCHES(enc, ptr, 'p')) | |
1540 return '&'; | |
1541 } | |
1542 } | |
1543 break; | |
1544 case 4: | |
1545 switch (BYTE_TO_ASCII(enc, ptr)) { | |
1546 case 'q': | |
1547 ptr += MINBPC(enc); | |
1548 if (CHAR_MATCHES(enc, ptr, 'u')) { | |
1549 ptr += MINBPC(enc); | |
1550 if (CHAR_MATCHES(enc, ptr, 'o')) { | |
1551 ptr += MINBPC(enc); | |
1552 if (CHAR_MATCHES(enc, ptr, 't')) | |
1553 return '"'; | |
1554 } | |
1555 } | |
1556 break; | |
1557 case 'a': | |
1558 ptr += MINBPC(enc); | |
1559 if (CHAR_MATCHES(enc, ptr, 'p')) { | |
1560 ptr += MINBPC(enc); | |
1561 if (CHAR_MATCHES(enc, ptr, 'o')) { | |
1562 ptr += MINBPC(enc); | |
1563 if (CHAR_MATCHES(enc, ptr, 's')) | |
1564 return '\''; | |
1565 } | |
1566 } | |
1567 break; | |
1568 } | |
1569 } | |
1570 return 0; | |
1571 } | |
1572 | |
1573 static | |
1574 int PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2) | |
1575 { | |
1576 for (;;) { | |
1577 switch (BYTE_TYPE(enc, ptr1)) { | |
1578 #define LEAD_CASE(n) \ | |
1579 case BT_LEAD ## n: \ | |
1580 if (*ptr1++ != *ptr2++) \ | |
1581 return 0; | |
1582 LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2) | |
1583 #undef LEAD_CASE | |
1584 /* fall through */ | |
1585 if (*ptr1++ != *ptr2++) | |
1586 return 0; | |
1587 break; | |
1588 case BT_NONASCII: | |
1589 case BT_NMSTRT: | |
1590 #ifdef XML_NS | |
1591 case BT_COLON: | |
1592 #endif | |
1593 case BT_HEX: | |
1594 case BT_DIGIT: | |
1595 case BT_NAME: | |
1596 case BT_MINUS: | |
1597 if (*ptr2++ != *ptr1++) | |
1598 return 0; | |
1599 if (MINBPC(enc) > 1) { | |
1600 if (*ptr2++ != *ptr1++) | |
1601 return 0; | |
1602 if (MINBPC(enc) > 2) { | |
1603 if (*ptr2++ != *ptr1++) | |
1604 return 0; | |
1605 if (MINBPC(enc) > 3) { | |
1606 if (*ptr2++ != *ptr1++) | |
1607 return 0; | |
1608 } | |
1609 } | |
1610 } | |
1611 break; | |
1612 default: | |
1613 if (MINBPC(enc) == 1 && *ptr1 == *ptr2) | |
1614 return 1; | |
1615 switch (BYTE_TYPE(enc, ptr2)) { | |
1616 case BT_LEAD2: | |
1617 case BT_LEAD3: | |
1618 case BT_LEAD4: | |
1619 case BT_NONASCII: | |
1620 case BT_NMSTRT: | |
1621 #ifdef XML_NS | |
1622 case BT_COLON: | |
1623 #endif | |
1624 case BT_HEX: | |
1625 case BT_DIGIT: | |
1626 case BT_NAME: | |
1627 case BT_MINUS: | |
1628 return 0; | |
1629 default: | |
1630 return 1; | |
1631 } | |
1632 } | |
1633 } | |
1634 /* not reached */ | |
1635 } | |
1636 | |
1637 static | |
1638 int PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1, const char *ptr2) | |
1639 { | |
1640 for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) { | |
1641 if (!CHAR_MATCHES(enc, ptr1, *ptr2)) | |
1642 return 0; | |
1643 } | |
1644 switch (BYTE_TYPE(enc, ptr1)) { | |
1645 case BT_LEAD2: | |
1646 case BT_LEAD3: | |
1647 case BT_LEAD4: | |
1648 case BT_NONASCII: | |
1649 case BT_NMSTRT: | |
1650 #ifdef XML_NS | |
1651 case BT_COLON: | |
1652 #endif | |
1653 case BT_HEX: | |
1654 case BT_DIGIT: | |
1655 case BT_NAME: | |
1656 case BT_MINUS: | |
1657 return 0; | |
1658 default: | |
1659 return 1; | |
1660 } | |
1661 } | |
1662 | |
1663 static | |
1664 int PREFIX(nameLength)(const ENCODING *enc, const char *ptr) | |
1665 { | |
1666 const char *start = ptr; | |
1667 for (;;) { | |
1668 switch (BYTE_TYPE(enc, ptr)) { | |
1669 #define LEAD_CASE(n) \ | |
1670 case BT_LEAD ## n: ptr += n; break; | |
1671 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) | |
1672 #undef LEAD_CASE | |
1673 case BT_NONASCII: | |
1674 case BT_NMSTRT: | |
1675 #ifdef XML_NS | |
1676 case BT_COLON: | |
1677 #endif | |
1678 case BT_HEX: | |
1679 case BT_DIGIT: | |
1680 case BT_NAME: | |
1681 case BT_MINUS: | |
1682 ptr += MINBPC(enc); | |
1683 break; | |
1684 default: | |
1685 return ptr - start; | |
1686 } | |
1687 } | |
1688 } | |
1689 | |
1690 static | |
1691 const char *PREFIX(skipS)(const ENCODING *enc, const char *ptr) | |
1692 { | |
1693 for (;;) { | |
1694 switch (BYTE_TYPE(enc, ptr)) { | |
1695 case BT_LF: | |
1696 case BT_CR: | |
1697 case BT_S: | |
1698 ptr += MINBPC(enc); | |
1699 break; | |
1700 default: | |
1701 return ptr; | |
1702 } | |
1703 } | |
1704 } | |
1705 | |
1706 static | |
1707 void PREFIX(updatePosition)(const ENCODING *enc, | |
1708 const char *ptr, | |
1709 const char *end, | |
1710 POSITION *pos) | |
1711 { | |
1712 while (ptr != end) { | |
1713 switch (BYTE_TYPE(enc, ptr)) { | |
1714 #define LEAD_CASE(n) \ | |
1715 case BT_LEAD ## n: \ | |
1716 ptr += n; \ | |
1717 break; | |
1718 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) | |
1719 #undef LEAD_CASE | |
1720 case BT_LF: | |
1721 pos->columnNumber = (unsigned)-1; | |
1722 pos->lineNumber++; | |
1723 ptr += MINBPC(enc); | |
1724 break; | |
1725 case BT_CR: | |
1726 pos->lineNumber++; | |
1727 ptr += MINBPC(enc); | |
1728 if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF) | |
1729 ptr += MINBPC(enc); | |
1730 pos->columnNumber = (unsigned)-1; | |
1731 break; | |
1732 default: | |
1733 ptr += MINBPC(enc); | |
1734 break; | |
1735 } | |
1736 pos->columnNumber++; | |
1737 } | |
1738 } | |
1739 | |
1740 #undef DO_LEAD_CASE | |
1741 #undef MULTIBYTE_CASES | |
1742 #undef INVALID_CASES | |
1743 #undef CHECK_NAME_CASE | |
1744 #undef CHECK_NAME_CASES | |
1745 #undef CHECK_NMSTRT_CASE | |
1746 #undef CHECK_NMSTRT_CASES |