Mercurial > emacs
annotate lisp/nxml/xmltok.el @ 104300:04280efb54c1
Credit Jan Seeger for last change.
author | Chong Yidong <cyd@stupidchicken.com> |
---|---|
date | Sun, 16 Aug 2009 14:36:01 +0000 |
parents | a9dc0e7c3f2b |
children | 3367f0022cf2 |
rev | line source |
---|---|
86361 | 1 ;;; xmltok.el --- XML tokenization |
2 | |
100908 | 3 ;; Copyright (C) 2003, 2007, 2008, 2009 Free Software Foundation, Inc. |
86361 | 4 |
5 ;; Author: James Clark | |
6 ;; Keywords: XML | |
7 | |
86557 | 8 ;; This file is part of GNU Emacs. |
9 | |
94666
d495d4d5452f
Switch to recommended form of GPLv3 permissions notice.
Glenn Morris <rgm@gnu.org>
parents:
87665
diff
changeset
|
10 ;; GNU Emacs is free software: you can redistribute it and/or modify |
86557 | 11 ;; it under the terms of the GNU General Public License as published by |
94666
d495d4d5452f
Switch to recommended form of GPLv3 permissions notice.
Glenn Morris <rgm@gnu.org>
parents:
87665
diff
changeset
|
12 ;; the Free Software Foundation, either version 3 of the License, or |
d495d4d5452f
Switch to recommended form of GPLv3 permissions notice.
Glenn Morris <rgm@gnu.org>
parents:
87665
diff
changeset
|
13 ;; (at your option) any later version. |
86361 | 14 |
86557 | 15 ;; GNU Emacs is distributed in the hope that it will be useful, |
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
18 ;; GNU General Public License for more details. | |
86361 | 19 |
86557 | 20 ;; You should have received a copy of the GNU General Public License |
94666
d495d4d5452f
Switch to recommended form of GPLv3 permissions notice.
Glenn Morris <rgm@gnu.org>
parents:
87665
diff
changeset
|
21 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. |
86361 | 22 |
23 ;;; Commentary: | |
24 | |
25 ;; This implements an XML 1.0 parser. It also implements the XML | |
26 ;; Namespaces Recommendation. It is designed to be conforming, but it | |
27 ;; works a bit differently from a normal XML parser. An XML document | |
28 ;; consists of the prolog and an instance. The prolog is parsed as a | |
29 ;; single unit using `xmltok-forward-prolog'. The instance is | |
30 ;; considered as a sequence of tokens, where a token is something like | |
31 ;; a start-tag, a comment, a chunk of data or a CDATA section. The | |
32 ;; tokenization of the instance is stateless: the tokenization of one | |
33 ;; part of the instance does not depend on tokenization of the | |
34 ;; preceding part of the instance. This allows the instance to be | |
35 ;; parsed incrementally. The main entry point is `xmltok-forward': | |
36 ;; this can be called at any point in the instance provided it is | |
37 ;; between tokens. The other entry point is `xmltok-forward-special' | |
38 ;; which skips over tokens other comments, processing instructions or | |
39 ;; CDATA sections (i.e. the constructs in an instance that can contain | |
40 ;; less than signs that don't start a token). | |
41 ;; | |
42 ;; This is a non-validating XML 1.0 processor. It does not resolve | |
43 ;; parameter entities (including the external DTD subset) and it does | |
44 ;; not resolve external general entities. | |
45 ;; | |
46 ;; It is non-conformant by design in the following respects. | |
47 ;; | |
48 ;; 1. It expects the client to detect aspects of well-formedness that | |
49 ;; are not internal to a single token, specifically checking that | |
50 ;; end-tags match start-tags and that the instance contains exactly | |
51 ;; one element. | |
52 ;; | |
53 ;; 2. It expects the client to detect duplicate attributes. Detection | |
54 ;; of duplicate attributes after expansion of namespace prefixes | |
55 ;; requires the namespace processing state. Detection of duplicate | |
56 ;; attributes before expansion of namespace prefixes does not, but is | |
57 ;; redundant given that the client will do detection of duplicate | |
58 ;; attributes after expansion of namespace prefixes. | |
59 ;; | |
60 ;; 3. It allows the client to recover from well-formedness errors. | |
61 ;; This is essential for use in applications where the document is | |
62 ;; being parsed during the editing process. | |
63 ;; | |
64 ;; 4. It does not support documents that do not conform to the lexical | |
65 ;; requirements of the XML Namespaces Recommendation (e.g. a document | |
66 ;; with a colon in an entity name). | |
67 ;; | |
68 ;; There are also a number of things that have not yet been | |
69 ;; implemented that make it non-conformant. | |
70 ;; | |
71 ;; 1. It does not implement default attributes. ATTLIST declarations | |
72 ;; are parsed, but no checking is done on the content of attribute | |
73 ;; value literals specifying default attribute values, and default | |
74 ;; attribute values are not reported to the client. | |
75 ;; | |
76 ;; 2. It does not implement internal entities containing elements. If | |
77 ;; an internal entity is referenced and parsing its replacement text | |
78 ;; yields one or more tags, then it will skip the reference and | |
79 ;; report this to the client. | |
80 ;; | |
81 ;; 3. It does not check the syntax of public identifiers in the DTD. | |
82 ;; | |
83 ;; 4. It allows some non-ASCII characters in certain situations where | |
84 ;; it should not. For example, it only enforces XML 1.0's | |
85 ;; restrictions on name characters strictly for ASCII characters. The | |
86 ;; problem here is XML's character model is based squarely on Unicode, | |
87 ;; whereas Emacs's is not (as of version 21). It is not clear what | |
88 ;; the right thing to do is. | |
89 | |
90 ;;; Code: | |
91 | |
92 (defvar xmltok-type nil) | |
93 (defvar xmltok-start nil) | |
94 (defvar xmltok-name-colon nil) | |
95 (defvar xmltok-name-end nil) | |
96 (defvar xmltok-replacement nil | |
97 "String containing replacement for a character or entity reference.") | |
98 | |
99 (defvar xmltok-attributes nil | |
100 "List containing attributes of last scanned element. | |
101 Each member of the list is a vector representing an attribute, which | |
102 can be accessed using the functions `xmltok-attribute-name-start', | |
103 `xmltok-attribute-name-colon', `xmltok-attribute-name-end', | |
104 `xmltok-attribute-value-start', `xmltok-attribute-value-end', | |
105 `xmltok-attribute-raw-normalized-value', `xmltok-attribute-refs'.") | |
106 | |
107 (defvar xmltok-namespace-attributes nil | |
108 "List containing namespace declarations of last scanned element. | |
109 List has same format as `xmltok-attributes'.") | |
110 | |
111 (defvar xmltok-dtd nil | |
112 "Information about the DTD used by `xmltok-forward'. | |
113 `xmltok-forward-prolog' sets this up. | |
114 | |
115 It consists of an alist of general entity names vs definitions. The | |
116 first member of the alist is t if references to entities not in the | |
117 alist are well-formed \(e.g. because there's an external subset that | |
118 wasn't parsed). | |
119 | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
120 Each general entity name is a string. The definition is either nil, |
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
121 a symbol, a string, a cons cell. If the definition is nil, then it |
86361 | 122 means that it's an internal entity but the result of parsing it is |
123 unknown. If it is a symbol, then the symbol is either `unparsed', | |
124 meaning the entity is an unparsed entity, `external', meaning the | |
125 entity is or references an external entity, `element', meaning the | |
126 entity includes one or more elements, or `not-well-formed', meaning | |
127 the replacement text is not well-formed. If the definition is a | |
128 string, then the replacement text of the entity is that string; this | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
129 happens only during the parsing of the prolog. If the definition is |
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
130 a cons cell \(ER . AR), then ER specifies the string that results |
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
131 from referencing the entity in element content and AR is either nil, |
86361 | 132 meaning the replacement text included a <, or a string which is the |
133 normalized attribute value.") | |
134 | |
135 (defvar xmltok-dependent-regions nil | |
136 "List of descriptors of regions that a parsed token depends on. | |
137 | |
138 A token depends on a region if the region occurs after the token and a | |
139 change in the region may require the token to be reparsed. This only | |
140 happens with markup that is not well-formed. For example, if a <? | |
141 occurs without a matching ?>, then the <? is returned as a | |
142 not-well-formed token. However, this token is dependent on region | |
143 from the end of the token to the end of the buffer: if this ever | |
144 contains ?> then the buffer must be reparsed from the <?. | |
145 | |
146 A region descriptor is a list (FUN START END ARG ...), where FUN is a | |
147 function to be called when the region changes, START and END are | |
148 integers giving the start and end of the region, and ARG... are | |
149 additional arguments to be passed to FUN. FUN will be called with 5 | |
150 arguments followed by the additional arguments if any: the position of | |
151 the start of the changed area in the region, the position of the end | |
152 of the changed area in the region, the length of the changed area | |
153 before the change, the position of the start of the region, the | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
154 position of the end of the region. FUN must return non-nil if the |
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
155 region needs reparsing. FUN will be called in a `save-excursion' |
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
156 with match-data saved. |
86361 | 157 |
158 `xmltok-forward', `xmltok-forward-special' and `xmltok-forward-prolog' | |
159 may add entries to the beginning of this list, but will not clear it. | |
160 `xmltok-forward' and `xmltok-forward-special' will only add entries | |
161 when returning tokens of type not-well-formed.") | |
162 | |
163 (defvar xmltok-errors nil | |
164 "List of errors detected by `xmltok-forward' and `xmltok-forward-prolog'. | |
165 When `xmltok-forward' and `xmltok-forward-prolog' detect a | |
166 well-formedness error, they will add an entry to the beginning of this | |
167 list. Each entry is a vector [MESSAGE START END], where MESSAGE is a | |
168 string giving the error message and START and END are integers | |
169 indicating the position of the error.") | |
170 | |
171 (defmacro xmltok-save (&rest body) | |
172 `(let (xmltok-type | |
173 xmltok-start | |
174 xmltok-name-colon | |
175 xmltok-name-end | |
176 xmltok-replacement | |
177 xmltok-attributes | |
178 xmltok-namespace-attributes | |
179 xmltok-dependent-regions | |
180 xmltok-errors) | |
181 ,@body)) | |
182 | |
183 (put 'xmltok-save 'lisp-indent-function 0) | |
184 (def-edebug-spec xmltok-save t) | |
185 | |
186 (defsubst xmltok-attribute-name-start (att) | |
187 (aref att 0)) | |
188 | |
189 (defsubst xmltok-attribute-name-colon (att) | |
190 (aref att 1)) | |
191 | |
192 (defsubst xmltok-attribute-name-end (att) | |
193 (aref att 2)) | |
194 | |
195 (defsubst xmltok-attribute-value-start (att) | |
196 (aref att 3)) | |
197 | |
198 (defsubst xmltok-attribute-value-end (att) | |
199 (aref att 4)) | |
200 | |
201 (defsubst xmltok-attribute-raw-normalized-value (att) | |
202 "Return an object representing the normalized value of ATT. | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
203 This can be t indicating that the normalized value is the same as |
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
204 the buffer substring from the start to the end of the value, or nil |
86361 | 205 indicating that the value is not well-formed or a string." |
206 (aref att 5)) | |
207 | |
208 (defsubst xmltok-attribute-refs (att) | |
209 "Return a list of the entity and character references in ATT. | |
210 Each member is a vector [TYPE START END] where TYPE is either char-ref | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
211 or entity-ref and START and END are integers giving the start and end of |
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
212 the reference. Nested entity references are not included in the list." |
86361 | 213 (aref att 6)) |
214 | |
215 (defun xmltok-attribute-prefix (att) | |
216 (let ((colon (xmltok-attribute-name-colon att))) | |
217 (and colon | |
218 (buffer-substring-no-properties (xmltok-attribute-name-start att) | |
219 colon)))) | |
220 | |
221 (defun xmltok-attribute-local-name (att) | |
222 (let ((colon (xmltok-attribute-name-colon att))) | |
223 (buffer-substring-no-properties (if colon | |
224 (1+ colon) | |
225 (xmltok-attribute-name-start att)) | |
226 (xmltok-attribute-name-end att)))) | |
227 | |
228 (defun xmltok-attribute-value (att) | |
229 (let ((rnv (xmltok-attribute-raw-normalized-value att))) | |
230 (and rnv | |
231 (if (stringp rnv) | |
232 rnv | |
233 (buffer-substring-no-properties (xmltok-attribute-value-start att) | |
234 (xmltok-attribute-value-end att)))))) | |
235 | |
236 (defun xmltok-start-tag-prefix () | |
237 (and xmltok-name-colon | |
238 (buffer-substring-no-properties (1+ xmltok-start) | |
239 xmltok-name-colon))) | |
240 | |
241 (defun xmltok-start-tag-local-name () | |
242 (buffer-substring-no-properties (1+ (or xmltok-name-colon | |
243 xmltok-start)) | |
244 xmltok-name-end)) | |
245 | |
246 (defun xmltok-end-tag-prefix () | |
247 (and xmltok-name-colon | |
248 (buffer-substring-no-properties (+ 2 xmltok-start) | |
249 xmltok-name-colon))) | |
250 | |
251 (defun xmltok-end-tag-local-name () | |
252 (buffer-substring-no-properties (if xmltok-name-colon | |
253 (1+ xmltok-name-colon) | |
254 (+ 2 xmltok-start)) | |
255 xmltok-name-end)) | |
256 | |
257 (defun xmltok-start-tag-qname () | |
258 (buffer-substring-no-properties (+ xmltok-start 1) xmltok-name-end)) | |
259 | |
260 (defun xmltok-end-tag-qname () | |
261 (buffer-substring-no-properties (+ xmltok-start 2) xmltok-name-end)) | |
262 | |
263 (defsubst xmltok-make-attribute (name-begin | |
264 name-colon | |
265 name-end | |
266 &optional | |
267 value-begin | |
268 value-end | |
269 raw-normalized-value) | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
270 "Make an attribute. |
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
271 RAW-NORMALIZED-VALUE is nil if the value is not well-formed, |
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
272 t if the normalized value is the string between VALUE-BEGIN |
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
273 and VALUE-END, otherwise a STRING giving the value." |
86361 | 274 (vector name-begin |
275 name-colon | |
276 name-end | |
277 value-begin | |
278 value-end | |
279 raw-normalized-value | |
280 nil)) | |
281 | |
282 (defsubst xmltok-error-message (err) | |
283 (aref err 0)) | |
284 | |
285 (defsubst xmltok-error-start (err) | |
286 (aref err 1)) | |
287 | |
288 (defsubst xmltok-error-end (err) | |
289 (aref err 2)) | |
290 | |
291 (defsubst xmltok-make-error (message start end) | |
292 (vector message start end)) | |
293 | |
294 (defun xmltok-add-error (message &optional start end) | |
295 (setq xmltok-errors | |
296 (cons (xmltok-make-error message | |
297 (or start xmltok-start) | |
298 (or end (point))) | |
299 xmltok-errors))) | |
300 | |
301 (defun xmltok-add-dependent (fun &optional start end &rest args) | |
302 (setq xmltok-dependent-regions | |
303 (cons (cons fun | |
304 (cons (or start xmltok-start) | |
305 (cons (or end (point-max)) | |
306 args))) | |
307 xmltok-dependent-regions))) | |
308 | |
309 (defun xmltok-forward () | |
310 (setq xmltok-start (point)) | |
311 (let* ((case-fold-search nil) | |
312 (space-count (skip-chars-forward " \t\r\n")) | |
313 (ch (char-after))) | |
314 (cond ((eq ch ?\<) | |
315 (cond ((> space-count 0) | |
316 (setq xmltok-type 'space)) | |
317 (t | |
98481
c14273ed6b9f
Mario Lang <mlang at delysid.org>
Glenn Morris <rgm@gnu.org>
parents:
96496
diff
changeset
|
318 (forward-char 1) |
86361 | 319 (xmltok-scan-after-lt)))) |
320 ((eq ch ?\&) | |
321 (cond ((> space-count 0) | |
322 (setq xmltok-type 'space)) | |
323 (t | |
98481
c14273ed6b9f
Mario Lang <mlang at delysid.org>
Glenn Morris <rgm@gnu.org>
parents:
96496
diff
changeset
|
324 (forward-char 1) |
c14273ed6b9f
Mario Lang <mlang at delysid.org>
Glenn Morris <rgm@gnu.org>
parents:
96496
diff
changeset
|
325 (xmltok-scan-after-amp 'xmltok-handle-entity)))) |
86361 | 326 ((re-search-forward "[<&]\\|\\(]]>\\)" nil t) |
327 (cond ((not (match-beginning 1)) | |
328 (goto-char (match-beginning 0)) | |
329 ;; must have got a non-space char | |
330 (setq xmltok-type 'data)) | |
331 ((= (match-beginning 1) xmltok-start) | |
332 (xmltok-add-error "Found `]]>' not closing a CDATA section") | |
333 (setq xmltok-type 'not-well-formed)) | |
334 (t | |
335 (goto-char (match-beginning 0)) | |
336 (setq xmltok-type | |
337 (if (= (point) (+ xmltok-start space-count)) | |
338 'space | |
339 'data))))) | |
340 ((eq ch nil) | |
341 (setq xmltok-type | |
342 (if (> space-count 0) | |
343 'space | |
344 nil))) | |
345 (t | |
346 (goto-char (point-max)) | |
347 (setq xmltok-type 'data))))) | |
348 | |
349 (defun xmltok-forward-special (bound) | |
350 "Scan forward past the first special token starting at or after point. | |
351 Return nil if there is no special token that starts before BOUND. | |
352 CDATA sections, processing instructions and comments (and indeed | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
353 anything starting with < following by ? or !) count as special. |
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
354 Return the type of the token." |
86361 | 355 (when (re-search-forward "<[?!]" (1+ bound) t) |
356 (setq xmltok-start (match-beginning 0)) | |
357 (goto-char (1+ xmltok-start)) | |
358 (let ((case-fold-search nil)) | |
359 (xmltok-scan-after-lt)))) | |
360 | |
361 (eval-when-compile | |
362 | |
363 ;; A symbolic regexp is represented by a list whose CAR is the string | |
364 ;; containing the regexp and whose cdr is a list of symbolic names | |
365 ;; for the groups in the string. | |
366 | |
367 ;; Construct a symbolic regexp from a regexp. | |
368 (defun xmltok-r (str) | |
369 (cons str nil)) | |
370 | |
371 ;; Concatenate zero of more regexps and symbolic regexps. | |
372 (defun xmltok+ (&rest args) | |
373 (let (strs names) | |
374 (while args | |
375 (let ((arg (car args))) | |
376 (if (stringp arg) | |
377 (setq strs (cons arg strs)) | |
378 (setq strs (cons (car arg) strs)) | |
379 (setq names (cons (cdr arg) names))) | |
380 (setq args (cdr args)))) | |
381 (cons (apply 'concat (nreverse strs)) | |
382 (apply 'append (nreverse names)))))) | |
383 | |
384 (eval-when-compile | |
385 ;; Make a symbolic group named NAME from the regexp R. | |
386 ;; R may be a symbolic regexp or an ordinary regexp. | |
387 (defmacro xmltok-g (name &rest r) | |
388 (let ((sym (make-symbol "r"))) | |
389 `(let ((,sym (xmltok+ ,@r))) | |
390 (if (stringp ,sym) | |
391 (cons (concat "\\(" ,sym "\\)") (cons ',name nil)) | |
392 (cons (concat "\\(" (car ,sym) "\\)") (cons ',name (cdr ,sym))))))) | |
393 | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
394 (defun xmltok-p (&rest r) (xmltok+ "\\(?:" |
86361 | 395 (apply 'xmltok+ r) |
396 "\\)")) | |
397 | |
398 ;; Get the group index of ELEM in a LIST of symbols. | |
399 (defun xmltok-get-index (elem list) | |
400 (or elem | |
401 (error "Missing group name")) | |
402 (let ((found nil) | |
403 (i 1)) | |
404 (while list | |
405 (cond ((eq elem (car list)) | |
406 (setq found i) | |
407 (setq list nil)) | |
408 (t | |
409 (setq i (1+ i)) | |
410 (setq list (cdr list))))) | |
411 (or found | |
412 (error "Bad group name %s" elem)))) | |
413 | |
414 ;; Define a macro SYM using a symbolic regexp R. | |
415 ;; SYM can be called in three ways: | |
416 ;; (SYM regexp) | |
417 ;; expands to the regexp in R | |
418 ;; (SYM start G) | |
419 ;; expands to | |
420 ;; (match-beginning N) | |
421 ;; where N is the group index of G in R. | |
422 ;; (SYM end G) | |
423 ;; expands to | |
424 ;; (match-end N) | |
425 ;; where N is the group index of G in R. | |
426 (defmacro xmltok-defregexp (sym r) | |
427 `(defalias ',sym | |
428 (let ((r ,r)) | |
429 `(macro lambda (action &optional group-name) | |
430 (cond ((eq action 'regexp) | |
431 ,(car r)) | |
432 ((or (eq action 'start) (eq action 'beginning)) | |
433 (list 'match-beginning (xmltok-get-index group-name | |
434 ',(cdr r)))) | |
435 ((eq action 'end) | |
436 (list 'match-end (xmltok-get-index group-name | |
437 ',(cdr r)))) | |
438 ((eq action 'string) | |
439 (list 'match-string | |
440 (xmltok-get-index group-name ',(cdr r)))) | |
441 ((eq action 'string-no-properties) | |
442 (list 'match-string-no-properties | |
443 (xmltok-get-index group-name ',(cdr r)))) | |
444 (t (error "Invalid action: %s" action)))))))) | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
445 |
86361 | 446 |
447 (eval-when-compile | |
448 (let* ((or "\\|") | |
449 (open "\\(?:") | |
450 (gopen "\\(") | |
451 (close "\\)") | |
452 (name-start-char "[_[:alpha:]]") | |
453 (name-continue-not-start-char "[-.[:digit:]]") | |
454 (name-continue-char "[-._[:alnum:]]") | |
455 (* "*") | |
456 (+ "+") | |
457 (opt "?") | |
458 (question "\\?") | |
459 (s "[ \r\t\n]") | |
460 (s+ (concat s +)) | |
461 (s* (concat s *)) | |
462 (ncname (concat name-start-char name-continue-char *)) | |
463 (entity-ref | |
464 (xmltok+ (xmltok-g entity-name ncname) | |
465 (xmltok-g entity-ref-close ";") opt)) | |
466 (decimal-ref | |
467 (xmltok+ (xmltok-g decimal "[0-9]" +) | |
468 (xmltok-g decimal-ref-close ";") opt)) | |
469 (hex-ref | |
470 (xmltok+ "x" open | |
471 (xmltok-g hex "[0-9a-fA-F]" +) | |
472 (xmltok-g hex-ref-close ";") opt | |
473 close opt)) | |
474 (char-ref | |
475 (xmltok+ (xmltok-g number-sign "#") | |
476 open decimal-ref or hex-ref close opt)) | |
477 (start-tag-close | |
478 (xmltok+ open (xmltok-g start-tag-close s* ">") | |
479 or open (xmltok-g empty-tag-slash s* "/") | |
480 (xmltok-g empty-tag-close ">") opt close | |
481 or (xmltok-g start-tag-s s+) | |
482 close)) | |
483 (start-tag | |
484 (xmltok+ (xmltok-g start-tag-name | |
485 ncname (xmltok-g start-tag-colon ":" ncname) opt) | |
486 start-tag-close opt)) | |
487 (end-tag | |
488 (xmltok+ (xmltok-g end-tag-slash "/") | |
489 open (xmltok-g end-tag-name | |
490 ncname | |
491 (xmltok-g end-tag-colon ":" ncname) opt) | |
492 (xmltok-g end-tag-close s* ">") opt | |
493 close opt)) | |
494 (comment | |
495 (xmltok+ (xmltok-g markup-declaration "!") | |
496 (xmltok-g comment-first-dash "-" | |
497 (xmltok-g comment-open "-") opt) opt)) | |
498 (cdata-section | |
499 (xmltok+ "!" | |
500 (xmltok-g marked-section-open "\\[") | |
501 open "C" | |
502 open "D" | |
503 open "A" | |
504 open "T" | |
505 open "A" | |
506 (xmltok-g cdata-section-open "\\[" ) opt | |
507 close opt ; A | |
508 close opt ; T | |
509 close opt ; A | |
510 close opt ; D | |
511 close opt)) ; C | |
512 (processing-instruction | |
513 (xmltok-g processing-instruction-question question))) | |
514 | |
515 (xmltok-defregexp xmltok-ncname (xmltok+ open ncname close)) | |
516 | |
517 (xmltok-defregexp xmltok-after-amp | |
518 (xmltok+ entity-ref or char-ref)) | |
519 (xmltok-defregexp xmltok-after-lt | |
520 (xmltok+ start-tag | |
521 or end-tag | |
522 ;; cdata-section must come before comment | |
523 ;; because we treat <! as a comment | |
524 ;; and Emacs doesn't do fully greedy matching | |
525 ;; by default | |
526 or cdata-section | |
527 or comment | |
528 or processing-instruction)) | |
529 (xmltok-defregexp | |
530 xmltok-attribute | |
531 (let* ((lit1 | |
532 (xmltok+ "'" | |
533 "[^<'&\r\n\t]*" | |
534 (xmltok-g complex1 "[&\r\n\t][^<']*") opt | |
535 "'")) | |
536 (lit2 (cons (replace-regexp-in-string "'" "\"" (car lit1)) | |
537 '(complex2))) | |
538 (literal (xmltok-g literal lit1 or lit2)) | |
539 (name (xmltok+ open (xmltok-g xmlns "xmlns") or ncname close | |
540 (xmltok-g colon ":" ncname) opt))) | |
541 (xmltok+ (xmltok-g name name) | |
542 s* "=" | |
543 ;; If the literal isn't followed by what it should be, | |
544 ;; then the closing delimiter is probably really the | |
545 ;; opening delimiter of another literal, so don't | |
546 ;; absorb the literal in this case. | |
547 open s* literal start-tag-close close opt))) | |
548 (xmltok-defregexp | |
549 xmltok-xml-declaration | |
550 (let* ((literal-content "[-._:a-zA-Z0-9]+") | |
551 (literal | |
552 (concat open "\"" literal-content "\"" | |
553 or "'" literal-content "'" close)) | |
554 (version-att | |
555 (xmltok+ open | |
556 s+ (xmltok-g version-name "version") | |
557 s* "=" | |
558 s* (xmltok-g version-value literal) | |
559 close opt)) | |
560 (encoding-att | |
561 (xmltok+ open | |
562 s+ (xmltok-g encoding-name "encoding") | |
563 s* "=" | |
564 s* (xmltok-g encoding-value literal) | |
565 close opt)) | |
566 (yes-no | |
567 (concat open "yes" or "no" close)) | |
568 (standalone-att | |
569 (xmltok+ open | |
570 s+ (xmltok-g standalone-name "standalone") | |
571 s* "=" | |
572 s* (xmltok-g standalone-value | |
573 "\"" yes-no "\"" or "'" yes-no "'") | |
574 close opt))) | |
575 (xmltok+ "<" question "xml" | |
576 version-att | |
577 encoding-att | |
578 standalone-att | |
579 s* question ">"))) | |
580 (xmltok-defregexp | |
581 xmltok-prolog | |
582 (let* ((single-char (xmltok-g single-char "[[|,(\"'>]")) | |
583 (internal-subset-close (xmltok-g internal-subset-close | |
584 "][ \t\r\n]*>")) | |
585 (starts-with-close-paren | |
586 (xmltok-g close-paren | |
587 ")" | |
588 (xmltok-p | |
589 (xmltok-g close-paren-occur "[+?]") | |
590 or | |
591 (xmltok-g close-paren-star "\\*")) | |
592 opt)) | |
593 (starts-with-percent | |
594 (xmltok-g percent | |
595 "%" (xmltok-g param-entity-ref | |
596 ncname | |
597 (xmltok-g param-entity-ref-close | |
598 ";") opt) opt)) | |
599 (starts-with-nmtoken-not-name | |
600 (xmltok-g nmtoken | |
601 (xmltok-p name-continue-not-start-char or ":") | |
602 (xmltok-p name-continue-char or ":") *)) | |
603 (nmtoken-after-colon | |
604 (xmltok+ | |
605 (xmltok-p name-continue-not-start-char or ":") | |
606 (xmltok-p name-continue-char or ":") * | |
607 or | |
608 name-start-char | |
609 name-continue-char * | |
610 ":" | |
611 (xmltok-p name-continue-char or ":") *)) | |
612 (after-ncname | |
613 (xmltok+ (xmltok-g ncname-nmtoken | |
614 ":" (xmltok-p nmtoken-after-colon)) | |
615 or (xmltok-p (xmltok-g colon ":" ncname) | |
616 (xmltok-g colon-name-occur "[?+*]") opt) | |
617 or (xmltok-g ncname-occur "[?+*]") | |
618 or (xmltok-g ncname-colon ":"))) | |
619 (starts-with-name | |
620 (xmltok-g name ncname (xmltok-p after-ncname) opt)) | |
621 (starts-with-hash | |
622 (xmltok-g pound | |
623 "#" (xmltok-g hash-name ncname))) | |
624 (markup-declaration | |
625 (xmltok-g markup-declaration | |
626 "!" (xmltok-p (xmltok-g comment-first-dash "-" | |
627 (xmltok-g comment-open "-") opt) | |
628 or (xmltok-g named-markup-declaration | |
629 ncname)) opt)) | |
630 (after-lt | |
631 (xmltok+ markup-declaration | |
632 or (xmltok-g processing-instruction-question | |
633 question) | |
634 or (xmltok-g instance-start | |
635 ncname))) | |
636 (starts-with-lt (xmltok-g less-than "<" (xmltok-p after-lt) opt))) | |
637 (xmltok+ starts-with-lt | |
638 or single-char | |
639 or starts-with-close-paren | |
640 or starts-with-percent | |
641 or starts-with-name | |
642 or starts-with-nmtoken-not-name | |
643 or starts-with-hash | |
644 or internal-subset-close))))) | |
645 | |
646 (defconst xmltok-ncname-regexp (xmltok-ncname regexp)) | |
647 | |
648 (defun xmltok-scan-after-lt () | |
649 (cond ((not (looking-at (xmltok-after-lt regexp))) | |
650 (xmltok-add-error "`<' that is not markup must be entered as `<'") | |
651 (setq xmltok-type 'not-well-formed)) | |
652 (t | |
653 (goto-char (match-end 0)) | |
654 (cond ((xmltok-after-lt start start-tag-close) | |
655 (setq xmltok-name-end | |
656 (xmltok-after-lt end start-tag-name)) | |
657 (setq xmltok-name-colon | |
658 (xmltok-after-lt start start-tag-colon)) | |
659 (setq xmltok-attributes nil) | |
660 (setq xmltok-namespace-attributes nil) | |
661 (setq xmltok-type 'start-tag)) | |
662 ((xmltok-after-lt start end-tag-close) | |
663 (setq xmltok-name-end | |
664 (xmltok-after-lt end end-tag-name)) | |
665 (setq xmltok-name-colon | |
666 (xmltok-after-lt start end-tag-colon)) | |
667 (setq xmltok-type 'end-tag)) | |
668 ((xmltok-after-lt start start-tag-s) | |
669 (setq xmltok-name-end | |
670 (xmltok-after-lt end start-tag-name)) | |
671 (setq xmltok-name-colon | |
672 (xmltok-after-lt start start-tag-colon)) | |
673 (setq xmltok-namespace-attributes nil) | |
674 (setq xmltok-attributes nil) | |
675 (xmltok-scan-attributes) | |
676 xmltok-type) | |
677 ((xmltok-after-lt start empty-tag-close) | |
678 (setq xmltok-name-end | |
679 (xmltok-after-lt end start-tag-name)) | |
680 (setq xmltok-name-colon | |
681 (xmltok-after-lt start start-tag-colon)) | |
682 (setq xmltok-attributes nil) | |
683 (setq xmltok-namespace-attributes nil) | |
684 (setq xmltok-type 'empty-element)) | |
685 ((xmltok-after-lt start cdata-section-open) | |
686 (setq xmltok-type | |
687 (if (search-forward "]]>" nil t) | |
688 'cdata-section | |
689 (xmltok-add-error "No closing ]]>") | |
690 (xmltok-add-dependent 'xmltok-unclosed-reparse-p | |
691 nil | |
692 nil | |
693 "]]>") | |
694 'not-well-formed))) | |
695 ((xmltok-after-lt start processing-instruction-question) | |
696 (xmltok-scan-after-processing-instruction-open)) | |
697 ((xmltok-after-lt start comment-open) | |
698 (xmltok-scan-after-comment-open)) | |
699 ((xmltok-after-lt start empty-tag-slash) | |
700 (setq xmltok-name-end | |
701 (xmltok-after-lt end start-tag-name)) | |
702 (setq xmltok-name-colon | |
703 (xmltok-after-lt start start-tag-colon)) | |
704 (setq xmltok-attributes nil) | |
705 (setq xmltok-namespace-attributes nil) | |
706 (xmltok-add-error "Expected `/>'" (1- (point))) | |
707 (setq xmltok-type 'partial-empty-element)) | |
708 ((xmltok-after-lt start start-tag-name) | |
709 (xmltok-add-error "Missing `>'" | |
710 nil | |
711 (1+ xmltok-start)) | |
712 (setq xmltok-name-end | |
713 (xmltok-after-lt end start-tag-name)) | |
714 (setq xmltok-name-colon | |
715 (xmltok-after-lt start start-tag-colon)) | |
716 (setq xmltok-namespace-attributes nil) | |
717 (setq xmltok-attributes nil) | |
718 (setq xmltok-type 'partial-start-tag)) | |
719 ((xmltok-after-lt start end-tag-name) | |
720 (setq xmltok-name-end (xmltok-after-lt end end-tag-name)) | |
721 (setq xmltok-name-colon | |
722 (xmltok-after-lt start end-tag-colon)) | |
723 (cond ((and (not xmltok-name-colon) | |
724 (eq (char-after) ?:)) | |
725 (goto-char (1+ (point))) | |
726 (xmltok-add-error "Expected name following `:'" | |
727 (1- (point)))) | |
728 (t | |
729 (xmltok-add-error "Missing `>'" | |
730 nil | |
731 (1+ xmltok-start)))) | |
732 (setq xmltok-type 'partial-end-tag)) | |
733 ((xmltok-after-lt start end-tag-slash) | |
734 (xmltok-add-error "Expected name following `</'") | |
735 (setq xmltok-name-end nil) | |
736 (setq xmltok-name-colon nil) | |
737 (setq xmltok-type 'partial-end-tag)) | |
738 ((xmltok-after-lt start marked-section-open) | |
739 (xmltok-add-error "Expected `CDATA[' after `<!['" | |
740 xmltok-start | |
741 (+ 3 xmltok-start)) | |
742 (setq xmltok-type 'not-well-formed)) | |
743 ((xmltok-after-lt start comment-first-dash) | |
744 (xmltok-add-error "Expected `-' after `<!-'" | |
745 xmltok-start | |
746 (+ 3 xmltok-start)) | |
747 (setq xmltok-type 'not-well-formed)) | |
748 ((xmltok-after-lt start markup-declaration) | |
749 (xmltok-add-error "Expected `[CDATA[' or `--' after `<!'" | |
750 xmltok-start | |
751 (+ 2 xmltok-start)) | |
752 (setq xmltok-type 'not-well-formed)) | |
753 (t | |
754 (xmltok-add-error "Not well-formed") | |
755 (setq xmltok-type 'not-well-formed)))))) | |
756 | |
757 ;; XXX This should be unified with | |
758 ;; xmltok-scan-prolog-after-processing-instruction-open | |
759 ;; XXX maybe should include rest of line (up to any <,>) in unclosed PI | |
760 (defun xmltok-scan-after-processing-instruction-open () | |
761 (cond ((not (search-forward "?>" nil t)) | |
762 (xmltok-add-error "No closing ?>" | |
763 xmltok-start | |
764 (+ xmltok-start 2)) | |
765 (xmltok-add-dependent 'xmltok-unclosed-reparse-p | |
766 nil | |
767 nil | |
768 "?>") | |
769 (setq xmltok-type 'not-well-formed)) | |
770 (t | |
771 (cond ((not (save-excursion | |
772 (goto-char (+ 2 xmltok-start)) | |
773 (and (looking-at (xmltok-ncname regexp)) | |
774 (setq xmltok-name-end (match-end 0))))) | |
775 (setq xmltok-name-end (+ xmltok-start 2)) | |
776 (xmltok-add-error "<? not followed by name" | |
777 (+ xmltok-start 2) | |
778 (+ xmltok-start 3))) | |
779 ((not (or (memq (char-after xmltok-name-end) | |
780 '(?\n ?\t ?\r ? )) | |
781 (= xmltok-name-end (- (point) 2)))) | |
782 (xmltok-add-error "Target not followed by whitespace" | |
783 xmltok-name-end | |
784 (1+ xmltok-name-end))) | |
785 ((and (= xmltok-name-end (+ xmltok-start 5)) | |
786 (save-excursion | |
787 (goto-char (+ xmltok-start 2)) | |
788 (let ((case-fold-search t)) | |
789 (looking-at "xml")))) | |
790 (xmltok-add-error "Processing instruction target is xml" | |
791 (+ xmltok-start 2) | |
792 (+ xmltok-start 5)))) | |
793 (setq xmltok-type 'processing-instruction)))) | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
794 |
86361 | 795 (defun xmltok-scan-after-comment-open () |
796 (setq xmltok-type | |
797 (cond ((not (search-forward "--" nil t)) | |
798 (xmltok-add-error "No closing -->") | |
799 (xmltok-add-dependent 'xmltok-unclosed-reparse-p | |
800 nil | |
801 nil | |
802 ;; not --> because | |
803 ;; -- is not allowed | |
804 ;; in comments in XML | |
805 "--") | |
806 'not-well-formed) | |
807 ((eq (char-after) ?>) | |
808 (goto-char (1+ (point))) | |
809 'comment) | |
810 (t | |
811 (xmltok-add-dependent | |
812 'xmltok-semi-closed-reparse-p | |
813 nil | |
814 (point) | |
815 "--" | |
816 2) | |
817 ;; just include the <!-- in the token | |
818 (goto-char (+ xmltok-start 4)) | |
819 ;; Need do this after the goto-char because | |
820 ;; marked error should just apply to <!-- | |
821 (xmltok-add-error "First following `--' not followed by `>'") | |
822 'not-well-formed)))) | |
823 | |
824 (defun xmltok-scan-attributes () | |
825 (let ((recovering nil) | |
826 (atts-needing-normalization nil)) | |
827 (while (cond ((or (looking-at (xmltok-attribute regexp)) | |
828 ;; use non-greedy group | |
829 (when (looking-at (concat "[^<>\n]+?" | |
830 (xmltok-attribute regexp))) | |
831 (unless recovering | |
832 (xmltok-add-error "Malformed attribute" | |
833 (point) | |
834 (save-excursion | |
835 (goto-char (xmltok-attribute start | |
836 name)) | |
837 (skip-chars-backward "\r\n\t ") | |
838 (point)))) | |
839 t)) | |
840 (setq recovering nil) | |
841 (goto-char (match-end 0)) | |
842 (let ((att (xmltok-add-attribute))) | |
843 (when att | |
844 (setq atts-needing-normalization | |
845 (cons att atts-needing-normalization)))) | |
846 (cond ((xmltok-attribute start start-tag-s) t) | |
847 ((xmltok-attribute start start-tag-close) | |
848 (setq xmltok-type 'start-tag) | |
849 nil) | |
850 ((xmltok-attribute start empty-tag-close) | |
851 (setq xmltok-type 'empty-element) | |
852 nil) | |
853 ((xmltok-attribute start empty-tag-slash) | |
854 (setq xmltok-type 'partial-empty-element) | |
855 (xmltok-add-error "Expected `/>'" | |
856 (1- (point))) | |
857 nil) | |
858 ((looking-at "[ \t\r\n]*[\"']") | |
859 (goto-char (match-end 0)) | |
860 (xmltok-add-error "Missing closing delimiter" | |
861 (1- (point))) | |
862 (setq recovering t) | |
863 t) | |
864 ((looking-at "[ \t]*\\([^ \t\r\n\"'=<>/]+\\)[ \t\r\n/>]") | |
865 (goto-char (match-end 1)) | |
866 (xmltok-add-error "Attribute value not quoted" | |
867 (match-beginning 1)) | |
868 (setq recovering t) | |
869 t) | |
870 (t | |
871 (xmltok-add-error "Missing attribute value" | |
872 (1- (point))) | |
873 (setq recovering t) | |
874 t))) | |
875 ((looking-at "[^<>\n]*/>") | |
876 (let ((start (point))) | |
877 (goto-char (match-end 0)) | |
878 (unless recovering | |
879 (xmltok-add-error "Malformed empty-element" | |
880 start | |
881 (- (point) 2)))) | |
882 (setq xmltok-type 'empty-element) | |
883 nil) | |
884 ((looking-at "[^<>\n]*>") | |
885 (let ((start (point))) | |
886 (goto-char (match-end 0)) | |
887 (unless recovering | |
888 (xmltok-add-error "Malformed start-tag" | |
889 start | |
890 (1- (point))))) | |
891 (setq xmltok-type 'start-tag) | |
892 nil) | |
893 (t | |
894 (when recovering | |
895 (skip-chars-forward "^<>\n")) | |
896 (xmltok-add-error "Missing `>'" | |
897 xmltok-start | |
898 (1+ xmltok-start)) | |
899 (setq xmltok-type 'partial-start-tag) | |
900 nil))) | |
901 (while atts-needing-normalization | |
902 (xmltok-normalize-attribute (car atts-needing-normalization)) | |
903 (setq atts-needing-normalization (cdr atts-needing-normalization)))) | |
904 (setq xmltok-attributes | |
905 (nreverse xmltok-attributes)) | |
906 (setq xmltok-namespace-attributes | |
907 (nreverse xmltok-namespace-attributes))) | |
908 | |
909 (defun xmltok-add-attribute () | |
910 "Return the attribute if it needs normalizing, otherwise nil." | |
911 (let* ((needs-normalizing nil) | |
912 (att | |
913 (if (xmltok-attribute start literal) | |
914 (progn | |
915 (setq needs-normalizing | |
916 (or (xmltok-attribute start complex1) | |
917 (xmltok-attribute start complex2))) | |
918 (xmltok-make-attribute (xmltok-attribute start name) | |
919 (xmltok-attribute start colon) | |
920 (xmltok-attribute end name) | |
921 (1+ (xmltok-attribute start literal)) | |
922 (1- (xmltok-attribute end literal)) | |
923 (not needs-normalizing))) | |
924 (xmltok-make-attribute (xmltok-attribute start name) | |
925 (xmltok-attribute start colon) | |
926 (xmltok-attribute end name))))) | |
927 (if (xmltok-attribute start xmlns) | |
928 (setq xmltok-namespace-attributes | |
929 (cons att xmltok-namespace-attributes)) | |
930 (setq xmltok-attributes | |
931 (cons att xmltok-attributes))) | |
932 (and needs-normalizing | |
933 att))) | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
934 |
86361 | 935 (defun xmltok-normalize-attribute (att) |
936 (let ((end (xmltok-attribute-value-end att)) | |
937 (well-formed t) | |
938 (value-parts nil) | |
939 (refs nil)) | |
940 (save-excursion | |
941 (goto-char (xmltok-attribute-value-start att)) | |
942 (while (progn | |
943 (let ((n (skip-chars-forward "^\r\t\n&" end))) | |
944 (when (> n 0) | |
945 (setq value-parts | |
946 (cons (buffer-substring-no-properties (- (point) n) | |
947 (point)) | |
948 value-parts)))) | |
949 (when (< (point) end) | |
950 (goto-char (1+ (point))) | |
951 (cond ((eq (char-before) ?\&) | |
952 (let ((xmltok-start (1- (point))) | |
953 xmltok-type xmltok-replacement) | |
954 (xmltok-scan-after-amp | |
955 (lambda (start end) | |
956 (xmltok-handle-entity start end t))) | |
957 (cond ((or (eq xmltok-type 'char-ref) | |
958 (eq xmltok-type 'entity-ref)) | |
959 (setq refs | |
960 (cons (vector xmltok-type | |
961 xmltok-start | |
962 (point)) | |
963 refs)) | |
964 (if xmltok-replacement | |
965 (setq value-parts | |
966 (cons xmltok-replacement | |
967 value-parts)) | |
968 (setq well-formed nil))) | |
969 (t (setq well-formed nil))))) | |
970 (t (setq value-parts | |
971 (cons " " value-parts))))) | |
972 (< (point) end)))) | |
973 (when well-formed | |
974 (aset att 5 (apply 'concat (nreverse value-parts)))) | |
975 (aset att 6 (nreverse refs)))) | |
976 | |
977 (defun xmltok-scan-after-amp (entity-handler) | |
978 (cond ((not (looking-at (xmltok-after-amp regexp))) | |
979 (xmltok-add-error "`&' that is not markup must be entered as `&'") | |
980 (setq xmltok-type 'not-well-formed)) | |
981 (t | |
982 (goto-char (match-end 0)) | |
983 (cond ((xmltok-after-amp start entity-ref-close) | |
984 (funcall entity-handler | |
985 (xmltok-after-amp start entity-name) | |
986 (xmltok-after-amp end entity-name)) | |
987 (setq xmltok-type 'entity-ref)) | |
988 ((xmltok-after-amp start decimal-ref-close) | |
989 (xmltok-scan-char-ref (xmltok-after-amp start decimal) | |
990 (xmltok-after-amp end decimal) | |
991 10)) | |
992 ((xmltok-after-amp start hex-ref-close) | |
993 (xmltok-scan-char-ref (xmltok-after-amp start hex) | |
994 (xmltok-after-amp end hex) | |
995 16)) | |
996 ((xmltok-after-amp start number-sign) | |
997 (xmltok-add-error "Missing character number") | |
998 (setq xmltok-type 'not-well-formed)) | |
999 (t | |
1000 (xmltok-add-error "Missing closing `;'") | |
1001 (setq xmltok-type 'not-well-formed)))))) | |
1002 | |
1003 (defconst xmltok-entity-error-messages | |
1004 '((unparsed . "Referenced entity is unparsed") | |
1005 (not-well-formed . "Referenced entity is not well-formed") | |
1006 (external nil . "Referenced entity is external") | |
1007 (element nil . "Referenced entity contains <"))) | |
1008 | |
1009 (defun xmltok-handle-entity (start end &optional attributep) | |
1010 (let* ((name (buffer-substring-no-properties start end)) | |
1011 (name-def (assoc name xmltok-dtd)) | |
1012 (def (cdr name-def))) | |
1013 (cond ((setq xmltok-replacement (and (consp def) | |
1014 (if attributep | |
1015 (cdr def) | |
1016 (car def))))) | |
1017 ((null name-def) | |
1018 (unless (eq (car xmltok-dtd) t) | |
1019 (xmltok-add-error "Referenced entity has not been defined" | |
1020 start | |
1021 end))) | |
1022 ((and attributep (consp def)) | |
1023 (xmltok-add-error "Referenced entity contains <" | |
1024 start | |
1025 end)) | |
1026 (t | |
1027 (let ((err (cdr (assq def xmltok-entity-error-messages)))) | |
1028 (when (consp err) | |
1029 (setq err (if attributep (cdr err) (car err)))) | |
1030 (when err | |
1031 (xmltok-add-error err start end))))))) | |
1032 | |
1033 (defun xmltok-scan-char-ref (start end base) | |
1034 (setq xmltok-replacement | |
86557 | 1035 (let ((n (string-to-number (buffer-substring-no-properties start end) |
86361 | 1036 base))) |
1037 (cond ((and (integerp n) (xmltok-valid-char-p n)) | |
1038 (setq n (xmltok-unicode-to-char n)) | |
1039 (and n (string n))) | |
1040 (t | |
1041 (xmltok-add-error "Invalid character code" start end) | |
1042 nil)))) | |
1043 (setq xmltok-type 'char-ref)) | |
1044 | |
1045 (defun xmltok-char-number (start end) | |
1046 (let* ((base (if (eq (char-after (+ start 2)) ?x) | |
1047 16 | |
1048 10)) | |
86557 | 1049 (n (string-to-number |
86361 | 1050 (buffer-substring-no-properties (+ start (if (= base 16) 3 2)) |
1051 (1- end)) | |
1052 base))) | |
1053 (and (integerp n) | |
1054 (xmltok-valid-char-p n) | |
1055 n))) | |
1056 | |
1057 (defun xmltok-unclosed-reparse-p (change-start | |
1058 change-end | |
1059 pre-change-length | |
1060 start | |
1061 end | |
1062 delimiter) | |
1063 (let ((len-1 (1- (length delimiter)))) | |
1064 (goto-char (max start (- change-start len-1))) | |
1065 (search-forward delimiter (min end (+ change-end len-1)) t))) | |
1066 | |
1067 ;; Handles a <!-- with the next -- not followed by > | |
1068 | |
1069 (defun xmltok-semi-closed-reparse-p (change-start | |
1070 change-end | |
1071 pre-change-length | |
1072 start | |
1073 end | |
1074 delimiter | |
1075 delimiter-length) | |
1076 (or (<= (- end delimiter-length) change-end) | |
1077 (xmltok-unclosed-reparse-p change-start | |
1078 change-end | |
1079 pre-change-length | |
1080 start | |
1081 end | |
1082 delimiter))) | |
1083 | |
1084 (defun xmltok-valid-char-p (n) | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
1085 "Return non-nil if N is the Unicode code of a valid XML character." |
86361 | 1086 (cond ((< n #x20) (memq n '(#xA #xD #x9))) |
1087 ((< n #xD800) t) | |
1088 ((< n #xE000) nil) | |
1089 ((< n #xFFFE) t) | |
1090 (t (and (> n #xFFFF) | |
1091 (< n #x110000))))) | |
1092 | |
1093 (defun xmltok-unicode-to-char (n) | |
1094 "Return the character corresponding to Unicode scalar value N. | |
1095 Return nil if unsupported in Emacs." | |
1096 (decode-char 'ucs n)) | |
1097 | |
1098 ;;; Prolog parsing | |
1099 | |
1100 (defvar xmltok-contains-doctype nil) | |
1101 (defvar xmltok-doctype-external-subset-flag nil) | |
1102 (defvar xmltok-internal-subset-start nil) | |
1103 (defvar xmltok-had-param-entity-ref nil) | |
1104 (defvar xmltok-prolog-regions nil) | |
1105 (defvar xmltok-standalone nil | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
1106 "Non-nil if there was an XML declaration specifying standalone=\"yes\".") |
86361 | 1107 (defvar xmltok-markup-declaration-doctype-flag nil) |
1108 | |
1109 (defconst xmltok-predefined-entity-alist | |
1110 '(("lt" "<" . "<") | |
1111 ("gt" ">" . ">") | |
1112 ("amp" "&" . "&") | |
1113 ("apos" "'" . "'") | |
1114 ("quot" "\"" . "\""))) | |
1115 | |
1116 (defun xmltok-forward-prolog () | |
1117 "Move forward to the end of the XML prolog. | |
1118 | |
1119 Returns a list of vectors [TYPE START END] where TYPE is a symbol and | |
1120 START and END are integers giving the start and end of the region of | |
1121 that type. TYPE can be one of xml-declaration, | |
1122 xml-declaration-attribute-name, xml-declaration-attribute-value, | |
1123 comment, processing-instruction-left, processing-instruction-right, | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
1124 markup-declaration-open, markup-declaration-close, |
86361 | 1125 internal-subset-open, internal-subset-close, hash-name, keyword, |
1126 literal, encoding-name. | |
1127 Adds to `xmltok-errors' and `xmltok-dependent-regions' as appropriate." | |
1128 (let ((case-fold-search nil) | |
1129 xmltok-start | |
1130 xmltok-type | |
1131 xmltok-prolog-regions | |
1132 xmltok-contains-doctype | |
1133 xmltok-internal-subset-start | |
1134 xmltok-had-param-entity-ref | |
1135 xmltok-standalone | |
1136 xmltok-doctype-external-subset-flag | |
1137 xmltok-markup-declaration-doctype-flag) | |
1138 (setq xmltok-dtd xmltok-predefined-entity-alist) | |
1139 (xmltok-scan-xml-declaration) | |
1140 (xmltok-next-prolog-token) | |
1141 (while (condition-case err | |
1142 (when (xmltok-parse-prolog-item) | |
1143 (xmltok-next-prolog-token)) | |
1144 (xmltok-markup-declaration-parse-error | |
1145 (xmltok-skip-markup-declaration)))) | |
1146 (when xmltok-internal-subset-start | |
1147 (xmltok-add-error "No closing ]" | |
1148 (1- xmltok-internal-subset-start) | |
1149 xmltok-internal-subset-start)) | |
1150 (xmltok-parse-entities) | |
1151 ;; XXX prune dependent-regions for those entirely in prolog | |
1152 (nreverse xmltok-prolog-regions))) | |
1153 | |
1154 (defconst xmltok-bad-xml-decl-regexp | |
1155 "[ \t\r\n]*<\\?xml\\(?:[ \t\r\n]\\|\\?>\\)") | |
1156 | |
1157 ;;;###autoload | |
1158 (defun xmltok-get-declared-encoding-position (&optional limit) | |
1159 "Return the position of the encoding in the XML declaration at point. | |
1160 If there is a well-formed XML declaration starting at point and it | |
1161 contains an encoding declaration, then return (START . END) | |
1162 where START and END are the positions of the start and the end | |
1163 of the encoding name; if there is no encoding declaration return | |
1164 the position where and encoding declaration could be inserted. | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
1165 If there is XML that is not well-formed that looks like an XML |
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
1166 declaration, return nil. Otherwise, return t. |
86361 | 1167 If LIMIT is non-nil, then do not consider characters beyond LIMIT." |
1168 (cond ((let ((case-fold-search nil)) | |
1169 (and (looking-at (xmltok-xml-declaration regexp)) | |
1170 (or (not limit) (<= (match-end 0) limit)))) | |
1171 (let ((end (xmltok-xml-declaration end encoding-value))) | |
1172 (if end | |
1173 (cons (1+ (xmltok-xml-declaration start encoding-value)) | |
1174 (1- end)) | |
1175 (or (xmltok-xml-declaration end version-value) | |
1176 (+ (point) 5))))) | |
1177 ((not (let ((case-fold-search t)) | |
1178 (looking-at xmltok-bad-xml-decl-regexp)))))) | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
1179 |
86361 | 1180 (defun xmltok-scan-xml-declaration () |
1181 (when (looking-at (xmltok-xml-declaration regexp)) | |
1182 (xmltok-add-prolog-region 'xml-declaration (point) (match-end 0)) | |
1183 (goto-char (match-end 0)) | |
1184 (when (xmltok-xml-declaration start version-name) | |
1185 (xmltok-add-prolog-region 'xml-declaration-attribute-name | |
1186 (xmltok-xml-declaration start version-name) | |
1187 (xmltok-xml-declaration end version-name)) | |
1188 (let ((start (xmltok-xml-declaration start version-value)) | |
1189 (end (xmltok-xml-declaration end version-value))) | |
1190 (xmltok-add-prolog-region 'xml-declaration-attribute-value | |
1191 start | |
1192 end))) | |
1193 ;; XXX need to check encoding name | |
1194 ;; Should start with letter, not contain colon | |
1195 (when (xmltok-xml-declaration start encoding-name) | |
1196 (xmltok-add-prolog-region 'xml-declaration-attribute-name | |
1197 (xmltok-xml-declaration start encoding-name) | |
1198 (xmltok-xml-declaration end encoding-name)) | |
1199 (let ((start (xmltok-xml-declaration start encoding-value)) | |
1200 (end (xmltok-xml-declaration end encoding-value))) | |
1201 (xmltok-add-prolog-region 'encoding-name | |
1202 (1+ start) | |
1203 (1- end)) | |
1204 (xmltok-add-prolog-region 'xml-declaration-attribute-value | |
1205 start | |
1206 end))) | |
1207 (when (xmltok-xml-declaration start standalone-name) | |
1208 (xmltok-add-prolog-region 'xml-declaration-attribute-name | |
1209 (xmltok-xml-declaration start standalone-name) | |
1210 (xmltok-xml-declaration end standalone-name)) | |
1211 (let ((start (xmltok-xml-declaration start standalone-value)) | |
1212 (end (xmltok-xml-declaration end standalone-value))) | |
1213 (xmltok-add-prolog-region 'xml-declaration-attribute-value | |
1214 start | |
1215 end) | |
1216 (setq xmltok-standalone | |
1217 (string= (buffer-substring-no-properties (1+ start) (1- end)) | |
1218 "yes")))) | |
1219 t)) | |
1220 | |
1221 (defconst xmltok-markup-declaration-alist | |
1222 '(("ELEMENT" . xmltok-parse-element-declaration) | |
1223 ("ATTLIST" . xmltok-parse-attlist-declaration) | |
1224 ("ENTITY" . xmltok-parse-entity-declaration) | |
1225 ("NOTATION" . xmltok-parse-notation-declaration))) | |
1226 | |
1227 (defun xmltok-parse-prolog-item () | |
1228 (cond ((eq xmltok-type 'comment) | |
1229 (xmltok-add-prolog-region 'comment | |
1230 xmltok-start | |
1231 (point)) | |
1232 t) | |
1233 ((eq xmltok-type 'processing-instruction)) | |
1234 ((eq xmltok-type 'named-markup-declaration) | |
1235 (setq xmltok-markup-declaration-doctype-flag nil) | |
1236 (xmltok-add-prolog-region 'markup-declaration-open | |
1237 xmltok-start | |
1238 (point)) | |
1239 (let* ((name (buffer-substring-no-properties | |
1240 (+ xmltok-start 2) | |
1241 (point))) | |
1242 (fun (cdr (assoc name xmltok-markup-declaration-alist)))) | |
1243 (cond (fun | |
1244 (unless xmltok-internal-subset-start | |
1245 (xmltok-add-error | |
1246 "Declaration allowed only in internal subset")) | |
1247 (funcall fun)) | |
1248 ((string= name "DOCTYPE") | |
1249 (xmltok-parse-doctype)) | |
1250 (t | |
1251 (xmltok-add-error "Unknown markup declaration" | |
1252 (+ xmltok-start 2)) | |
1253 (xmltok-next-prolog-token) | |
1254 (xmltok-markup-declaration-parse-error)))) | |
1255 t) | |
1256 ((or (eq xmltok-type 'end-prolog) | |
1257 (not xmltok-type)) | |
1258 nil) | |
1259 ((eq xmltok-type 'internal-subset-close) | |
1260 (xmltok-add-prolog-region 'internal-subset-close | |
1261 xmltok-start | |
1262 (1+ xmltok-start)) | |
1263 (xmltok-add-prolog-region 'markup-declaration-close | |
1264 (1- (point)) | |
1265 (point)) | |
1266 (if xmltok-internal-subset-start | |
1267 (setq xmltok-internal-subset-start nil) | |
1268 (xmltok-add-error "]> outside internal subset")) | |
1269 t) | |
1270 ((eq xmltok-type 'param-entity-ref) | |
1271 (if xmltok-internal-subset-start | |
1272 (setq xmltok-had-param-entity-ref t) | |
1273 (xmltok-add-error "Parameter entity reference outside document type declaration")) | |
1274 t) | |
1275 ;; If we don't do this, we can get thousands of errors when | |
1276 ;; a plain text file is parsed. | |
1277 ((not xmltok-internal-subset-start) | |
1278 (when (let ((err (car xmltok-errors))) | |
1279 (or (not err) | |
1280 (<= (xmltok-error-end err) xmltok-start))) | |
1281 (goto-char xmltok-start)) | |
1282 nil) | |
1283 ((eq xmltok-type 'not-well-formed) t) | |
1284 (t | |
1285 (xmltok-add-error "Token allowed only inside markup declaration") | |
1286 t))) | |
1287 | |
1288 (defun xmltok-parse-doctype () | |
1289 (setq xmltok-markup-declaration-doctype-flag t) | |
1290 (xmltok-next-prolog-token) | |
1291 (when xmltok-internal-subset-start | |
1292 (xmltok-add-error "DOCTYPE declaration not allowed in internal subset") | |
1293 (xmltok-markup-declaration-parse-error)) | |
1294 (when xmltok-contains-doctype | |
1295 (xmltok-add-error "Duplicate DOCTYPE declaration") | |
1296 (xmltok-markup-declaration-parse-error)) | |
1297 (setq xmltok-contains-doctype t) | |
1298 (xmltok-require-token 'name 'prefixed-name) | |
1299 (xmltok-require-next-token "SYSTEM" "PUBLIC" ?\[ ?>) | |
1300 (cond ((eq xmltok-type ?\[) | |
1301 (setq xmltok-internal-subset-start (point))) | |
1302 ((eq xmltok-type ?>)) | |
1303 (t | |
1304 (setq xmltok-doctype-external-subset-flag t) | |
1305 (xmltok-parse-external-id) | |
1306 (xmltok-require-token ?\[ ?>) | |
1307 (when (eq xmltok-type ?\[) | |
1308 (setq xmltok-internal-subset-start (point)))))) | |
1309 | |
1310 (defun xmltok-parse-attlist-declaration () | |
1311 (xmltok-require-next-token 'prefixed-name 'name) | |
1312 (while (progn | |
1313 (xmltok-require-next-token ?> 'name 'prefixed-name) | |
1314 (if (eq xmltok-type ?>) | |
1315 nil | |
1316 (xmltok-require-next-token ?\( | |
1317 "CDATA" | |
1318 "ID" | |
1319 "IDREF" | |
1320 "IDREFS" | |
1321 "ENTITY" | |
1322 "ENTITIES" | |
1323 "NMTOKEN" | |
1324 "NMTOKENS" | |
1325 "NOTATION") | |
1326 (cond ((eq xmltok-type ?\() | |
1327 (xmltok-parse-nmtoken-group)) | |
1328 ((string= (xmltok-current-token-string) | |
1329 "NOTATION") | |
1330 (xmltok-require-next-token ?\() | |
1331 (xmltok-parse-nmtoken-group))) | |
1332 (xmltok-require-next-token "#IMPLIED" | |
1333 "#REQUIRED" | |
1334 "#FIXED" | |
1335 'literal) | |
1336 (when (string= (xmltok-current-token-string) "#FIXED") | |
1337 (xmltok-require-next-token 'literal)) | |
1338 t)))) | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
1339 |
86361 | 1340 (defun xmltok-parse-nmtoken-group () |
1341 (while (progn | |
1342 (xmltok-require-next-token 'nmtoken 'prefixed-name 'name) | |
1343 (xmltok-require-next-token ?| ?\)) | |
1344 (eq xmltok-type ?|)))) | |
1345 | |
1346 (defun xmltok-parse-element-declaration () | |
1347 (xmltok-require-next-token 'name 'prefixed-name) | |
1348 (xmltok-require-next-token "EMPTY" "ANY" ?\() | |
1349 (when (eq xmltok-type ?\() | |
1350 (xmltok-require-next-token "#PCDATA" | |
1351 'name | |
1352 'prefixed-name | |
1353 'name-occur | |
1354 ?\() | |
1355 (cond ((eq xmltok-type 'hash-name) | |
1356 (xmltok-require-next-token ?| ?\) 'close-paren-star) | |
1357 (while (eq xmltok-type ?|) | |
1358 (xmltok-require-next-token 'name 'prefixed-name) | |
1359 (xmltok-require-next-token 'close-paren-star ?|))) | |
1360 (t (xmltok-parse-model-group)))) | |
1361 (xmltok-require-next-token ?>)) | |
1362 | |
1363 (defun xmltok-parse-model-group () | |
1364 (xmltok-parse-model-group-member) | |
1365 (xmltok-require-next-token ?| | |
1366 ?, | |
1367 ?\) | |
1368 'close-paren-star | |
1369 'close-paren-occur) | |
1370 (when (memq xmltok-type '(?, ?|)) | |
1371 (let ((connector xmltok-type)) | |
1372 (while (progn | |
1373 (xmltok-next-prolog-token) | |
1374 (xmltok-parse-model-group-member) | |
1375 (xmltok-require-next-token connector | |
1376 ?\) | |
1377 'close-paren-star | |
1378 'close-paren-occur) | |
1379 (eq xmltok-type connector)))))) | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
1380 |
86361 | 1381 (defun xmltok-parse-model-group-member () |
1382 (xmltok-require-token 'name | |
1383 'prefixed-name | |
1384 'name-occur | |
1385 ?\() | |
1386 (when (eq xmltok-type ?\() | |
1387 (xmltok-next-prolog-token) | |
1388 (xmltok-parse-model-group))) | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
1389 |
86361 | 1390 (defun xmltok-parse-entity-declaration () |
1391 (let (paramp name) | |
1392 (xmltok-require-next-token 'name ?%) | |
1393 (when (eq xmltok-type ?%) | |
1394 (setq paramp t) | |
1395 (xmltok-require-next-token 'name)) | |
1396 (setq name (xmltok-current-token-string)) | |
1397 (xmltok-require-next-token 'literal "SYSTEM" "PUBLIC") | |
1398 (cond ((eq xmltok-type 'literal) | |
1399 (let ((replacement (xmltok-parse-entity-value))) | |
1400 (unless paramp | |
1401 (xmltok-define-entity name replacement))) | |
1402 (xmltok-require-next-token ?>)) | |
1403 (t | |
1404 (xmltok-parse-external-id) | |
1405 (if paramp | |
1406 (xmltok-require-token ?>) | |
1407 (xmltok-require-token ?> "NDATA") | |
1408 (if (eq xmltok-type ?>) | |
1409 (xmltok-define-entity name 'external) | |
1410 (xmltok-require-next-token 'name) | |
1411 (xmltok-require-next-token ?>) | |
1412 (xmltok-define-entity name 'unparsed))))))) | |
1413 | |
1414 (defun xmltok-define-entity (name value) | |
1415 (when (and (or (not xmltok-had-param-entity-ref) | |
1416 xmltok-standalone) | |
1417 (not (assoc name xmltok-dtd))) | |
1418 (setq xmltok-dtd | |
1419 (cons (cons name value) xmltok-dtd)))) | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
1420 |
86361 | 1421 (defun xmltok-parse-entity-value () |
1422 (let ((lim (1- (point))) | |
1423 (well-formed t) | |
1424 value-parts | |
1425 start) | |
1426 (save-excursion | |
1427 (goto-char (1+ xmltok-start)) | |
1428 (setq start (point)) | |
1429 (while (progn | |
1430 (skip-chars-forward "^%&" lim) | |
1431 (when (< (point) lim) | |
1432 (goto-char (1+ (point))) | |
1433 (cond ((eq (char-before) ?%) | |
1434 (xmltok-add-error "Parameter entity references are not allowed in the internal subset" | |
1435 (1- (point)) | |
1436 (point)) | |
1437 (setq well-formed nil)) | |
1438 (t | |
1439 (let ((xmltok-start (1- (point))) | |
1440 xmltok-type xmltok-replacement) | |
1441 (xmltok-scan-after-amp (lambda (start end))) | |
1442 (cond ((eq xmltok-type 'char-ref) | |
1443 (setq value-parts | |
1444 (cons (buffer-substring-no-properties | |
1445 start | |
1446 xmltok-start) | |
1447 value-parts)) | |
1448 (setq value-parts | |
1449 (cons xmltok-replacement | |
1450 value-parts)) | |
1451 (setq start (point))) | |
1452 ((eq xmltok-type 'not-well-formed) | |
1453 (setq well-formed nil)))))) | |
1454 t)))) | |
1455 (if (not well-formed) | |
1456 nil | |
1457 (apply 'concat | |
1458 (nreverse (cons (buffer-substring-no-properties start lim) | |
1459 value-parts)))))) | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
1460 |
86361 | 1461 (defun xmltok-parse-notation-declaration () |
1462 (xmltok-require-next-token 'name) | |
1463 (xmltok-require-next-token "SYSTEM" "PUBLIC") | |
1464 (let ((publicp (string= (xmltok-current-token-string) "PUBLIC"))) | |
1465 (xmltok-require-next-token 'literal) | |
1466 (cond (publicp | |
1467 (xmltok-require-next-token 'literal ?>) | |
1468 (unless (eq xmltok-type ?>) | |
1469 (xmltok-require-next-token ?>))) | |
1470 (t (xmltok-require-next-token ?>))))) | |
1471 | |
1472 (defun xmltok-parse-external-id () | |
1473 (xmltok-require-token "SYSTEM" "PUBLIC") | |
1474 (let ((publicp (string= (xmltok-current-token-string) "PUBLIC"))) | |
1475 (xmltok-require-next-token 'literal) | |
1476 (when publicp | |
1477 (xmltok-require-next-token 'literal))) | |
1478 (xmltok-next-prolog-token)) | |
1479 | |
1480 (defun xmltok-require-next-token (&rest types) | |
1481 (xmltok-next-prolog-token) | |
1482 (apply 'xmltok-require-token types)) | |
1483 | |
1484 (defun xmltok-require-token (&rest types) | |
1485 ;; XXX Generate a more helpful error message | |
1486 (while (and (not (let ((type (car types))) | |
1487 (if (stringp (car types)) | |
1488 (string= (xmltok-current-token-string) type) | |
1489 (eq type xmltok-type)))) | |
1490 (setq types (cdr types)))) | |
1491 (unless types | |
1492 (when (and xmltok-type | |
1493 (not (eq xmltok-type 'not-well-formed))) | |
1494 (xmltok-add-error "Unexpected token")) | |
1495 (xmltok-markup-declaration-parse-error)) | |
1496 (let ((region-type (xmltok-prolog-region-type (car types)))) | |
1497 (when region-type | |
1498 (xmltok-add-prolog-region region-type | |
1499 xmltok-start | |
1500 (point))))) | |
1501 | |
1502 (defun xmltok-current-token-string () | |
1503 (buffer-substring-no-properties xmltok-start (point))) | |
1504 | |
1505 (put 'xmltok-markup-declaration-parse-error | |
1506 'error-conditions | |
1507 '(error xmltok-markup-declaration-parse-error)) | |
1508 | |
1509 (put 'xmltok-markup-declaration-parse-error | |
1510 'error-message | |
1511 "Syntax error in markup declaration") | |
1512 | |
1513 (defun xmltok-markup-declaration-parse-error () | |
1514 (signal 'xmltok-markup-declaration-parse-error nil)) | |
1515 | |
1516 (defun xmltok-skip-markup-declaration () | |
1517 (while (cond ((eq xmltok-type ?>) | |
1518 (xmltok-next-prolog-token) | |
1519 nil) | |
1520 ((and xmltok-markup-declaration-doctype-flag | |
1521 (eq xmltok-type ?\[)) | |
1522 (setq xmltok-internal-subset-start (point)) | |
1523 (xmltok-next-prolog-token) | |
1524 nil) | |
1525 ((memq xmltok-type '(nil | |
1526 end-prolog | |
1527 named-markup-declaration | |
1528 comment | |
1529 processing-instruction)) | |
1530 nil) | |
1531 ((and xmltok-internal-subset-start | |
1532 (eq xmltok-type 'internal-subset-close)) | |
1533 nil) | |
1534 (t (xmltok-next-prolog-token) t))) | |
1535 xmltok-type) | |
1536 | |
1537 (defun xmltok-prolog-region-type (required) | |
1538 (cond ((cdr (assq xmltok-type | |
1539 '((literal . literal) | |
1540 (?> . markup-declaration-close) | |
1541 (?\[ . internal-subset-open) | |
1542 (hash-name . hash-name))))) | |
1543 ((and (stringp required) (eq xmltok-type 'name)) | |
1544 'keyword))) | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
1545 |
86361 | 1546 ;; Return new token type. |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
1547 |
86361 | 1548 (defun xmltok-next-prolog-token () |
1549 (skip-chars-forward " \t\r\n") | |
1550 (setq xmltok-start (point)) | |
1551 (cond ((not (and (looking-at (xmltok-prolog regexp)) | |
1552 (goto-char (match-end 0)))) | |
1553 (let ((ch (char-after))) | |
1554 (cond (ch | |
1555 (goto-char (1+ (point))) | |
1556 (xmltok-add-error "Illegal char in prolog") | |
1557 (setq xmltok-type 'not-well-formed)) | |
1558 (t (setq xmltok-type nil))))) | |
1559 ((or (xmltok-prolog start ncname-occur) | |
1560 (xmltok-prolog start colon-name-occur)) | |
1561 (setq xmltok-name-end (1- (point))) | |
1562 (setq xmltok-name-colon (xmltok-prolog start colon)) | |
1563 (setq xmltok-type 'name-occur)) | |
1564 ((xmltok-prolog start colon) | |
1565 (setq xmltok-name-end (point)) | |
1566 (setq xmltok-name-colon (xmltok-prolog start colon)) | |
1567 (unless (looking-at "[ \t\r\n>),|[%]") | |
1568 (xmltok-add-error "Missing space after name")) | |
1569 (setq xmltok-type 'prefixed-name)) | |
1570 ((or (xmltok-prolog start ncname-nmtoken) | |
1571 (xmltok-prolog start ncname-colon)) | |
1572 (unless (looking-at "[ \t\r\n>),|[%]") | |
1573 (xmltok-add-error "Missing space after name token")) | |
1574 (setq xmltok-type 'nmtoken)) | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
1575 ((xmltok-prolog start name) |
86361 | 1576 (setq xmltok-name-end (point)) |
1577 (setq xmltok-name-colon nil) | |
1578 (unless (looking-at "[ \t\r\n>),|[%]") | |
1579 (xmltok-add-error "Missing space after name")) | |
1580 (setq xmltok-type 'name)) | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
1581 ((xmltok-prolog start hash-name) |
86361 | 1582 (setq xmltok-name-end (point)) |
1583 (unless (looking-at "[ \t\r\n>)|%]") | |
1584 (xmltok-add-error "Missing space after name")) | |
1585 (setq xmltok-type 'hash-name)) | |
1586 ((xmltok-prolog start processing-instruction-question) | |
1587 (xmltok-scan-prolog-after-processing-instruction-open)) | |
1588 ((xmltok-prolog start comment-open) | |
1589 ;; XXX if not-well-formed, ignore some stuff | |
1590 (xmltok-scan-after-comment-open)) | |
1591 ((xmltok-prolog start named-markup-declaration) | |
1592 (setq xmltok-type 'named-markup-declaration)) | |
1593 ((xmltok-prolog start instance-start) | |
1594 (goto-char xmltok-start) | |
1595 (setq xmltok-type 'end-prolog)) | |
1596 ((xmltok-prolog start close-paren-star) | |
1597 (setq xmltok-type 'close-paren-star)) | |
1598 ((xmltok-prolog start close-paren-occur) | |
1599 (setq xmltok-type 'close-paren-occur)) | |
1600 ((xmltok-prolog start close-paren) | |
1601 (unless (looking-at "[ \t\r\n>,|)]") | |
1602 (xmltok-add-error "Missing space after )")) | |
1603 (setq xmltok-type ?\))) | |
1604 ((xmltok-prolog start single-char) | |
1605 (let ((ch (char-before))) | |
1606 (cond ((memq ch '(?\" ?\')) | |
1607 (xmltok-scan-prolog-literal)) | |
1608 (t (setq xmltok-type ch))))) | |
1609 ((xmltok-prolog start percent) | |
1610 (cond ((xmltok-prolog start param-entity-ref-close) | |
1611 (setq xmltok-name-end (1- (point))) | |
1612 (setq xmltok-type 'param-entity-ref)) | |
1613 ((xmltok-prolog start param-entity-ref) | |
1614 (xmltok-add-error "Missing ;") | |
1615 (setq xmltok-name-end (point)) | |
1616 (setq xmltok-type 'param-entity-ref)) | |
1617 ((looking-at "[ \t\r\n%]") | |
1618 (setq xmltok-type ?%)) | |
1619 (t | |
1620 (xmltok-add-error "Expected name after %") | |
1621 (setq xmltok-type 'not-well-formed)))) | |
1622 ((xmltok-prolog start nmtoken) | |
1623 (unless (looking-at "[ \t\r\n>),|[%]") | |
1624 (xmltok-add-error "Missing space after name token")) | |
1625 (setq xmltok-type 'nmtoken)) | |
1626 ((xmltok-prolog start internal-subset-close) | |
1627 (setq xmltok-type 'internal-subset-close)) | |
1628 ((xmltok-prolog start pound) | |
1629 (xmltok-add-error "Expected name after #") | |
1630 (setq xmltok-type 'not-well-formed)) | |
1631 ((xmltok-prolog start markup-declaration) | |
1632 (xmltok-add-error "Expected name or -- after <!") | |
1633 (setq xmltok-type 'not-well-formed)) | |
1634 ((xmltok-prolog start comment-first-dash) | |
1635 (xmltok-add-error "Expected <!--") | |
1636 (setq xmltok-type 'not-well-formed)) | |
1637 ((xmltok-prolog start less-than) | |
1638 (xmltok-add-error "Incomplete markup") | |
1639 (setq xmltok-type 'not-well-formed)) | |
1640 (t (error "Unhandled token in prolog %s" | |
1641 (match-string-no-properties 0))))) | |
1642 | |
1643 (defun xmltok-scan-prolog-literal () | |
1644 (let* ((delim (string (char-before))) | |
1645 (safe-end (save-excursion | |
1646 (skip-chars-forward (concat "^<>[]" delim)) | |
1647 (point))) | |
1648 (end (save-excursion | |
1649 (goto-char safe-end) | |
1650 (search-forward delim nil t)))) | |
1651 (or (cond ((not end) | |
1652 (xmltok-add-dependent 'xmltok-unclosed-reparse-p | |
1653 nil | |
1654 nil | |
1655 delim) | |
1656 nil) | |
1657 ((save-excursion | |
1658 (goto-char end) | |
1659 (looking-at "[ \t\r\n>%[]")) | |
1660 (goto-char end) | |
1661 (setq xmltok-type 'literal)) | |
1662 ((eq (1+ safe-end) end) | |
1663 (goto-char end) | |
1664 (xmltok-add-error (format "Missing space after %s" delim) | |
1665 safe-end) | |
1666 (setq xmltok-type 'literal)) | |
1667 (t | |
1668 (xmltok-add-dependent 'xmltok-semi-closed-reparse-p | |
1669 xmltok-start | |
1670 (1+ end) | |
1671 delim | |
1672 1) | |
1673 nil)) | |
1674 (progn | |
1675 (xmltok-add-error (format "Missing closing %s" delim)) | |
1676 (goto-char safe-end) | |
1677 (skip-chars-backward " \t\r\n") | |
1678 (setq xmltok-type 'not-well-formed))))) | |
1679 | |
1680 (defun xmltok-scan-prolog-after-processing-instruction-open () | |
1681 (cond ((not (search-forward "?>" nil t)) | |
1682 (xmltok-add-error "No closing ?>" | |
1683 xmltok-start | |
1684 (+ xmltok-start 2)) | |
1685 (xmltok-add-dependent 'xmltok-unclosed-reparse-p | |
1686 nil | |
1687 nil | |
1688 "?>") | |
1689 (setq xmltok-type 'not-well-formed)) | |
1690 (t | |
1691 (let* ((end (point)) | |
1692 (target | |
1693 (save-excursion | |
1694 (goto-char (+ xmltok-start 2)) | |
1695 (and (looking-at (xmltok-ncname regexp)) | |
1696 (or (memq (char-after (match-end 0)) | |
1697 '(?\n ?\t ?\r ? )) | |
1698 (= (match-end 0) (- end 2))) | |
1699 (match-string-no-properties 0))))) | |
1700 (cond ((not target) | |
1701 (xmltok-add-error "\ | |
1702 Processing instruction does not start with a name" | |
1703 (+ xmltok-start 2) | |
1704 (+ xmltok-start 3))) | |
1705 ((not (and (= (length target) 3) | |
1706 (let ((case-fold-search t)) | |
1707 (string-match "xml" target))))) | |
1708 ((= xmltok-start 1) | |
1709 (xmltok-add-error "Invalid XML declaration" | |
1710 xmltok-start | |
1711 (point))) | |
1712 ((save-excursion | |
1713 (goto-char xmltok-start) | |
1714 (looking-at (xmltok-xml-declaration regexp))) | |
1715 (xmltok-add-error "XML declaration not at beginning of file" | |
1716 xmltok-start | |
1717 (point))) | |
1718 (t | |
1719 (xmltok-add-error "Processing instruction has target of xml" | |
1720 (+ xmltok-start 2) | |
1721 (+ xmltok-start 5)))) | |
1722 (xmltok-add-prolog-region 'processing-instruction-left | |
1723 xmltok-start | |
1724 (+ xmltok-start | |
1725 2 | |
1726 (if target | |
1727 (length target) | |
1728 0))) | |
1729 (xmltok-add-prolog-region 'processing-instruction-right | |
1730 (if target | |
1731 (save-excursion | |
1732 (goto-char (+ xmltok-start | |
1733 (length target) | |
1734 2)) | |
1735 (skip-chars-forward " \t\r\n") | |
1736 (point)) | |
1737 (+ xmltok-start 2)) | |
1738 (point))) | |
1739 (setq xmltok-type 'processing-instruction)))) | |
1740 | |
1741 (defun xmltok-parse-entities () | |
1742 (let ((todo xmltok-dtd)) | |
1743 (when (and (or xmltok-had-param-entity-ref | |
1744 xmltok-doctype-external-subset-flag) | |
1745 (not xmltok-standalone)) | |
1746 (setq xmltok-dtd (cons t xmltok-dtd))) | |
1747 (while todo | |
1748 (xmltok-parse-entity (car todo)) | |
1749 (setq todo (cdr todo))))) | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
1750 |
86361 | 1751 (defun xmltok-parse-entity (name-def) |
1752 (let ((def (cdr name-def)) | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
1753 ;; in case its value is buffer local |
86361 | 1754 (xmltok-dtd xmltok-dtd) |
1755 buf) | |
1756 (when (stringp def) | |
1757 (if (string-match "\\`[^&<\t\r\n]*\\'" def) | |
1758 (setcdr name-def (cons def def)) | |
1759 (setcdr name-def 'not-well-formed) ; avoid infinite expansion loops | |
1760 (setq buf (get-buffer-create | |
1761 (format " *Entity %s*" (car name-def)))) | |
1762 (save-excursion | |
1763 (set-buffer buf) | |
1764 (erase-buffer) | |
1765 (insert def) | |
1766 (goto-char (point-min)) | |
1767 (setcdr name-def | |
1768 (xmltok-parse-entity-replacement))) | |
1769 (kill-buffer buf))))) | |
1770 | |
1771 (defun xmltok-parse-entity-replacement () | |
1772 (let ((def (cons "" ""))) | |
1773 (while (let* ((start (point)) | |
1774 (found (re-search-forward "[<&\t\r\n]\\|]]>" nil t)) | |
1775 (ch (and found (char-before))) | |
1776 (str (buffer-substring-no-properties | |
1777 start | |
1778 (if found | |
1779 (match-beginning 0) | |
1780 (point-max))))) | |
1781 (setq def | |
1782 (xmltok-append-entity-def def | |
1783 (cons str str))) | |
1784 (cond ((not found) nil) | |
1785 ((eq ch ?>) | |
1786 (setq def 'not-well-formed) | |
1787 nil) | |
1788 ((eq ch ?<) | |
1789 (xmltok-save | |
1790 (setq xmltok-start (1- (point))) | |
1791 (xmltok-scan-after-lt) | |
1792 (setq def | |
1793 (xmltok-append-entity-def | |
1794 def | |
1795 (cond ((memq xmltok-type | |
1796 '(start-tag | |
1797 end-tag | |
1798 empty-element)) | |
1799 'element) | |
1800 ((memq xmltok-type | |
1801 '(comment | |
1802 processing-instruction)) | |
1803 (cons "" nil)) | |
1804 ((eq xmltok-type | |
1805 'cdata-section) | |
1806 (cons (buffer-substring-no-properties | |
1807 (+ xmltok-start 9) | |
1808 (- (point) 3)) | |
1809 nil)) | |
1810 (t 'not-well-formed))))) | |
1811 t) | |
1812 ((eq ch ?&) | |
1813 (let ((xmltok-start (1- (point))) | |
1814 xmltok-type | |
1815 xmltok-replacement | |
1816 xmltok-errors) | |
1817 (xmltok-scan-after-amp 'xmltok-handle-nested-entity) | |
1818 (cond ((eq xmltok-type 'entity-ref) | |
1819 (setq def | |
1820 (xmltok-append-entity-def | |
1821 def | |
1822 xmltok-replacement))) | |
1823 ((eq xmltok-type 'char-ref) | |
1824 (setq def | |
1825 (xmltok-append-entity-def | |
1826 def | |
1827 (if xmltok-replacement | |
1828 (cons xmltok-replacement | |
1829 xmltok-replacement) | |
1830 (and xmltok-errors 'not-well-formed))))) | |
1831 (t | |
1832 (setq def 'not-well-formed)))) | |
1833 t) | |
1834 (t | |
1835 (setq def | |
1836 (xmltok-append-entity-def | |
1837 def | |
1838 (cons (match-string-no-properties 0) | |
1839 " "))) | |
1840 t)))) | |
1841 def)) | |
1842 | |
1843 (defun xmltok-handle-nested-entity (start end) | |
1844 (let* ((name-def (assoc (buffer-substring-no-properties start end) | |
1845 xmltok-dtd)) | |
1846 (def (cdr name-def))) | |
1847 (when (stringp def) | |
1848 (xmltok-parse-entity name-def) | |
1849 (setq def (cdr name-def))) | |
1850 (setq xmltok-replacement | |
1851 (cond ((null name-def) | |
1852 (if (eq (car xmltok-dtd) t) | |
1853 nil | |
1854 'not-well-formed)) | |
1855 ((eq def 'unparsed) 'not-well-formed) | |
1856 (t def))))) | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
1857 |
86361 | 1858 (defun xmltok-append-entity-def (d1 d2) |
1859 (cond ((consp d1) | |
1860 (if (consp d2) | |
1861 (cons (concat (car d1) (car d2)) | |
1862 (and (cdr d1) | |
1863 (cdr d2) | |
1864 (concat (cdr d1) (cdr d2)))) | |
1865 d2)) | |
1866 ((consp d2) d1) | |
1867 (t | |
1868 (let ((defs '(not-well-formed external element))) | |
1869 (while (not (or (eq (car defs) d1) | |
1870 (eq (car defs) d2))) | |
1871 (setq defs (cdr defs))) | |
1872 (car defs))))) | |
1873 | |
1874 (defun xmltok-add-prolog-region (type start end) | |
1875 (setq xmltok-prolog-regions | |
1876 (cons (vector type start end) | |
1877 xmltok-prolog-regions))) | |
1878 | |
1879 (defun xmltok-merge-attributes () | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
1880 "Return a list merging `xmltok-attributes' and `xmltok-namespace-attributes'. |
86361 | 1881 The members of the merged list are in order of occurrence in the |
1882 document. The list may share list structure with `xmltok-attributes' | |
1883 and `xmltok-namespace-attributes'." | |
1884 (cond ((not xmltok-namespace-attributes) | |
1885 xmltok-attributes) | |
1886 ((not xmltok-attributes) | |
1887 xmltok-namespace-attributes) | |
1888 (t | |
1889 (let ((atts1 xmltok-attributes) | |
1890 (atts2 xmltok-namespace-attributes) | |
1891 merged) | |
1892 (while (and atts1 atts2) | |
1893 (cond ((< (xmltok-attribute-name-start (car atts1)) | |
1894 (xmltok-attribute-name-start (car atts2))) | |
1895 (setq merged (cons (car atts1) merged)) | |
1896 (setq atts1 (cdr atts1))) | |
1897 (t | |
1898 (setq merged (cons (car atts2) merged)) | |
1899 (setq atts2 (cdr atts2))))) | |
1900 (setq merged (nreverse merged)) | |
1901 (cond (atts1 (setq merged (nconc merged atts1))) | |
1902 (atts2 (setq merged (nconc merged atts2)))) | |
1903 merged)))) | |
1904 | |
1905 ;;; Testing | |
1906 | |
1907 (defun xmltok-forward-test () | |
1908 (interactive) | |
1909 (if (xmltok-forward) | |
1910 (message "Scanned %s" xmltok-type) | |
1911 (message "Scanned nothing"))) | |
1912 | |
1913 (defun xmltok-next-prolog-token-test () | |
1914 (interactive) | |
1915 (if (xmltok-next-prolog-token) | |
1916 (message "Scanned %s" | |
1917 (if (integerp xmltok-type) | |
1918 (string xmltok-type) | |
1919 xmltok-type)) | |
1920 (message "Scanned end of file"))) | |
1921 | |
1922 (provide 'xmltok) | |
1923 | |
86379 | 1924 ;; arch-tag: 747e5f3a-6fc3-4f8d-bd96-89f05aa99f5e |
86361 | 1925 ;;; xmltok.el ends here |