86361
|
1 ;;; xmltok.el --- XML tokenization
|
|
2
|
87665
|
3 ;; Copyright (C) 2003, 2007, 2008 Free Software Foundation, Inc.
|
86361
|
4
|
|
5 ;; Author: James Clark
|
|
6 ;; Keywords: XML
|
|
7
|
86557
|
8 ;; This file is part of GNU Emacs.
|
|
9
|
|
10 ;; GNU Emacs is free software; you can redistribute it and/or modify
|
|
11 ;; it under the terms of the GNU General Public License as published by
|
|
12 ;; the Free Software Foundation; either version 3, or (at your option)
|
|
13 ;; any later version.
|
86361
|
14
|
86557
|
15 ;; GNU Emacs is distributed in the hope that it will be useful,
|
|
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
18 ;; GNU General Public License for more details.
|
86361
|
19
|
86557
|
20 ;; You should have received a copy of the GNU General Public License
|
|
21 ;; along with GNU Emacs; see the file COPYING. If not, write to the
|
|
22 ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
|
|
23 ;; Boston, MA 02110-1301, USA.
|
86361
|
24
|
|
25 ;;; Commentary:
|
|
26
|
|
27 ;; This implements an XML 1.0 parser. It also implements the XML
|
|
28 ;; Namespaces Recommendation. It is designed to be conforming, but it
|
|
29 ;; works a bit differently from a normal XML parser. An XML document
|
|
30 ;; consists of the prolog and an instance. The prolog is parsed as a
|
|
31 ;; single unit using `xmltok-forward-prolog'. The instance is
|
|
32 ;; considered as a sequence of tokens, where a token is something like
|
|
33 ;; a start-tag, a comment, a chunk of data or a CDATA section. The
|
|
34 ;; tokenization of the instance is stateless: the tokenization of one
|
|
35 ;; part of the instance does not depend on tokenization of the
|
|
36 ;; preceding part of the instance. This allows the instance to be
|
|
37 ;; parsed incrementally. The main entry point is `xmltok-forward':
|
|
38 ;; this can be called at any point in the instance provided it is
|
|
39 ;; between tokens. The other entry point is `xmltok-forward-special'
|
|
40 ;; which skips over tokens other comments, processing instructions or
|
|
41 ;; CDATA sections (i.e. the constructs in an instance that can contain
|
|
42 ;; less than signs that don't start a token).
|
|
43 ;;
|
|
44 ;; This is a non-validating XML 1.0 processor. It does not resolve
|
|
45 ;; parameter entities (including the external DTD subset) and it does
|
|
46 ;; not resolve external general entities.
|
|
47 ;;
|
|
48 ;; It is non-conformant by design in the following respects.
|
|
49 ;;
|
|
50 ;; 1. It expects the client to detect aspects of well-formedness that
|
|
51 ;; are not internal to a single token, specifically checking that
|
|
52 ;; end-tags match start-tags and that the instance contains exactly
|
|
53 ;; one element.
|
|
54 ;;
|
|
55 ;; 2. It expects the client to detect duplicate attributes. Detection
|
|
56 ;; of duplicate attributes after expansion of namespace prefixes
|
|
57 ;; requires the namespace processing state. Detection of duplicate
|
|
58 ;; attributes before expansion of namespace prefixes does not, but is
|
|
59 ;; redundant given that the client will do detection of duplicate
|
|
60 ;; attributes after expansion of namespace prefixes.
|
|
61 ;;
|
|
62 ;; 3. It allows the client to recover from well-formedness errors.
|
|
63 ;; This is essential for use in applications where the document is
|
|
64 ;; being parsed during the editing process.
|
|
65 ;;
|
|
66 ;; 4. It does not support documents that do not conform to the lexical
|
|
67 ;; requirements of the XML Namespaces Recommendation (e.g. a document
|
|
68 ;; with a colon in an entity name).
|
|
69 ;;
|
|
70 ;; There are also a number of things that have not yet been
|
|
71 ;; implemented that make it non-conformant.
|
|
72 ;;
|
|
73 ;; 1. It does not implement default attributes. ATTLIST declarations
|
|
74 ;; are parsed, but no checking is done on the content of attribute
|
|
75 ;; value literals specifying default attribute values, and default
|
|
76 ;; attribute values are not reported to the client.
|
|
77 ;;
|
|
78 ;; 2. It does not implement internal entities containing elements. If
|
|
79 ;; an internal entity is referenced and parsing its replacement text
|
|
80 ;; yields one or more tags, then it will skip the reference and
|
|
81 ;; report this to the client.
|
|
82 ;;
|
|
83 ;; 3. It does not check the syntax of public identifiers in the DTD.
|
|
84 ;;
|
|
85 ;; 4. It allows some non-ASCII characters in certain situations where
|
|
86 ;; it should not. For example, it only enforces XML 1.0's
|
|
87 ;; restrictions on name characters strictly for ASCII characters. The
|
|
88 ;; problem here is XML's character model is based squarely on Unicode,
|
|
89 ;; whereas Emacs's is not (as of version 21). It is not clear what
|
|
90 ;; the right thing to do is.
|
|
91
|
|
92 ;;; Code:
|
|
93
|
|
94 (defvar xmltok-type nil)
|
|
95 (defvar xmltok-start nil)
|
|
96 (defvar xmltok-name-colon nil)
|
|
97 (defvar xmltok-name-end nil)
|
|
98 (defvar xmltok-replacement nil
|
|
99 "String containing replacement for a character or entity reference.")
|
|
100
|
|
101 (defvar xmltok-attributes nil
|
|
102 "List containing attributes of last scanned element.
|
|
103 Each member of the list is a vector representing an attribute, which
|
|
104 can be accessed using the functions `xmltok-attribute-name-start',
|
|
105 `xmltok-attribute-name-colon', `xmltok-attribute-name-end',
|
|
106 `xmltok-attribute-value-start', `xmltok-attribute-value-end',
|
|
107 `xmltok-attribute-raw-normalized-value', `xmltok-attribute-refs'.")
|
|
108
|
|
109 (defvar xmltok-namespace-attributes nil
|
|
110 "List containing namespace declarations of last scanned element.
|
|
111 List has same format as `xmltok-attributes'.")
|
|
112
|
|
113 (defvar xmltok-dtd nil
|
|
114 "Information about the DTD used by `xmltok-forward'.
|
|
115 `xmltok-forward-prolog' sets this up.
|
|
116
|
|
117 It consists of an alist of general entity names vs definitions. The
|
|
118 first member of the alist is t if references to entities not in the
|
|
119 alist are well-formed \(e.g. because there's an external subset that
|
|
120 wasn't parsed).
|
|
121
|
|
122 Each general entity name is a string. The definition is either nil, a
|
|
123 symbol, a string, a cons cell. If the definition is nil, then it
|
|
124 means that it's an internal entity but the result of parsing it is
|
|
125 unknown. If it is a symbol, then the symbol is either `unparsed',
|
|
126 meaning the entity is an unparsed entity, `external', meaning the
|
|
127 entity is or references an external entity, `element', meaning the
|
|
128 entity includes one or more elements, or `not-well-formed', meaning
|
|
129 the replacement text is not well-formed. If the definition is a
|
|
130 string, then the replacement text of the entity is that string; this
|
|
131 happens only during the parsing of the prolog. If the definition is a
|
|
132 cons cell \(ER . AR), then ER specifies the string that results from
|
|
133 referencing the entity in element content and AR is either nil,
|
|
134 meaning the replacement text included a <, or a string which is the
|
|
135 normalized attribute value.")
|
|
136
|
|
137 (defvar xmltok-dependent-regions nil
|
|
138 "List of descriptors of regions that a parsed token depends on.
|
|
139
|
|
140 A token depends on a region if the region occurs after the token and a
|
|
141 change in the region may require the token to be reparsed. This only
|
|
142 happens with markup that is not well-formed. For example, if a <?
|
|
143 occurs without a matching ?>, then the <? is returned as a
|
|
144 not-well-formed token. However, this token is dependent on region
|
|
145 from the end of the token to the end of the buffer: if this ever
|
|
146 contains ?> then the buffer must be reparsed from the <?.
|
|
147
|
|
148 A region descriptor is a list (FUN START END ARG ...), where FUN is a
|
|
149 function to be called when the region changes, START and END are
|
|
150 integers giving the start and end of the region, and ARG... are
|
|
151 additional arguments to be passed to FUN. FUN will be called with 5
|
|
152 arguments followed by the additional arguments if any: the position of
|
|
153 the start of the changed area in the region, the position of the end
|
|
154 of the changed area in the region, the length of the changed area
|
|
155 before the change, the position of the start of the region, the
|
|
156 position of the end of the region. FUN must return non-nil if the
|
|
157 region needs reparsing. FUN will be called in a save-excursion with
|
|
158 match-data saved.
|
|
159
|
|
160 `xmltok-forward', `xmltok-forward-special' and `xmltok-forward-prolog'
|
|
161 may add entries to the beginning of this list, but will not clear it.
|
|
162 `xmltok-forward' and `xmltok-forward-special' will only add entries
|
|
163 when returning tokens of type not-well-formed.")
|
|
164
|
|
165 (defvar xmltok-errors nil
|
|
166 "List of errors detected by `xmltok-forward' and `xmltok-forward-prolog'.
|
|
167 When `xmltok-forward' and `xmltok-forward-prolog' detect a
|
|
168 well-formedness error, they will add an entry to the beginning of this
|
|
169 list. Each entry is a vector [MESSAGE START END], where MESSAGE is a
|
|
170 string giving the error message and START and END are integers
|
|
171 indicating the position of the error.")
|
|
172
|
|
173 (defmacro xmltok-save (&rest body)
|
|
174 `(let (xmltok-type
|
|
175 xmltok-start
|
|
176 xmltok-name-colon
|
|
177 xmltok-name-end
|
|
178 xmltok-replacement
|
|
179 xmltok-attributes
|
|
180 xmltok-namespace-attributes
|
|
181 xmltok-dependent-regions
|
|
182 xmltok-errors)
|
|
183 ,@body))
|
|
184
|
|
185 (put 'xmltok-save 'lisp-indent-function 0)
|
|
186 (def-edebug-spec xmltok-save t)
|
|
187
|
|
188 (defsubst xmltok-attribute-name-start (att)
|
|
189 (aref att 0))
|
|
190
|
|
191 (defsubst xmltok-attribute-name-colon (att)
|
|
192 (aref att 1))
|
|
193
|
|
194 (defsubst xmltok-attribute-name-end (att)
|
|
195 (aref att 2))
|
|
196
|
|
197 (defsubst xmltok-attribute-value-start (att)
|
|
198 (aref att 3))
|
|
199
|
|
200 (defsubst xmltok-attribute-value-end (att)
|
|
201 (aref att 4))
|
|
202
|
|
203 (defsubst xmltok-attribute-raw-normalized-value (att)
|
|
204 "Return an object representing the normalized value of ATT.
|
|
205 This can t indicating that the normalized value is the same as the
|
|
206 buffer substring from the start to the end of the value or nil
|
|
207 indicating that the value is not well-formed or a string."
|
|
208 (aref att 5))
|
|
209
|
|
210 (defsubst xmltok-attribute-refs (att)
|
|
211 "Return a list of the entity and character references in ATT.
|
|
212 Each member is a vector [TYPE START END] where TYPE is either char-ref
|
|
213 or entity-ref and START and END are integers giving the start and end
|
|
214 of the reference. Nested entity references are not included in the list."
|
|
215 (aref att 6))
|
|
216
|
|
217 (defun xmltok-attribute-prefix (att)
|
|
218 (let ((colon (xmltok-attribute-name-colon att)))
|
|
219 (and colon
|
|
220 (buffer-substring-no-properties (xmltok-attribute-name-start att)
|
|
221 colon))))
|
|
222
|
|
223 (defun xmltok-attribute-local-name (att)
|
|
224 (let ((colon (xmltok-attribute-name-colon att)))
|
|
225 (buffer-substring-no-properties (if colon
|
|
226 (1+ colon)
|
|
227 (xmltok-attribute-name-start att))
|
|
228 (xmltok-attribute-name-end att))))
|
|
229
|
|
230 (defun xmltok-attribute-value (att)
|
|
231 (let ((rnv (xmltok-attribute-raw-normalized-value att)))
|
|
232 (and rnv
|
|
233 (if (stringp rnv)
|
|
234 rnv
|
|
235 (buffer-substring-no-properties (xmltok-attribute-value-start att)
|
|
236 (xmltok-attribute-value-end att))))))
|
|
237
|
|
238 (defun xmltok-start-tag-prefix ()
|
|
239 (and xmltok-name-colon
|
|
240 (buffer-substring-no-properties (1+ xmltok-start)
|
|
241 xmltok-name-colon)))
|
|
242
|
|
243 (defun xmltok-start-tag-local-name ()
|
|
244 (buffer-substring-no-properties (1+ (or xmltok-name-colon
|
|
245 xmltok-start))
|
|
246 xmltok-name-end))
|
|
247
|
|
248 (defun xmltok-end-tag-prefix ()
|
|
249 (and xmltok-name-colon
|
|
250 (buffer-substring-no-properties (+ 2 xmltok-start)
|
|
251 xmltok-name-colon)))
|
|
252
|
|
253 (defun xmltok-end-tag-local-name ()
|
|
254 (buffer-substring-no-properties (if xmltok-name-colon
|
|
255 (1+ xmltok-name-colon)
|
|
256 (+ 2 xmltok-start))
|
|
257 xmltok-name-end))
|
|
258
|
|
259 (defun xmltok-start-tag-qname ()
|
|
260 (buffer-substring-no-properties (+ xmltok-start 1) xmltok-name-end))
|
|
261
|
|
262 (defun xmltok-end-tag-qname ()
|
|
263 (buffer-substring-no-properties (+ xmltok-start 2) xmltok-name-end))
|
|
264
|
|
265 (defsubst xmltok-make-attribute (name-begin
|
|
266 name-colon
|
|
267 name-end
|
|
268 &optional
|
|
269 value-begin
|
|
270 value-end
|
|
271 raw-normalized-value)
|
|
272 "Make an attribute. RAW-NORMALIZED-VALUE is nil if the value is
|
|
273 not well-formed, t if the normalized value is the string between
|
|
274 VALUE-BEGIN and VALUE-END, otherwise a STRING giving the value."
|
|
275 (vector name-begin
|
|
276 name-colon
|
|
277 name-end
|
|
278 value-begin
|
|
279 value-end
|
|
280 raw-normalized-value
|
|
281 nil))
|
|
282
|
|
283 (defsubst xmltok-error-message (err)
|
|
284 (aref err 0))
|
|
285
|
|
286 (defsubst xmltok-error-start (err)
|
|
287 (aref err 1))
|
|
288
|
|
289 (defsubst xmltok-error-end (err)
|
|
290 (aref err 2))
|
|
291
|
|
292 (defsubst xmltok-make-error (message start end)
|
|
293 (vector message start end))
|
|
294
|
|
295 (defun xmltok-add-error (message &optional start end)
|
|
296 (setq xmltok-errors
|
|
297 (cons (xmltok-make-error message
|
|
298 (or start xmltok-start)
|
|
299 (or end (point)))
|
|
300 xmltok-errors)))
|
|
301
|
|
302 (defun xmltok-add-dependent (fun &optional start end &rest args)
|
|
303 (setq xmltok-dependent-regions
|
|
304 (cons (cons fun
|
|
305 (cons (or start xmltok-start)
|
|
306 (cons (or end (point-max))
|
|
307 args)))
|
|
308 xmltok-dependent-regions)))
|
|
309
|
|
310 (defun xmltok-forward ()
|
|
311 (setq xmltok-start (point))
|
|
312 (let* ((case-fold-search nil)
|
|
313 (space-count (skip-chars-forward " \t\r\n"))
|
|
314 (ch (char-after)))
|
|
315 (cond ((eq ch ?\<)
|
|
316 (cond ((> space-count 0)
|
|
317 (setq xmltok-type 'space))
|
|
318 (t
|
|
319 (goto-char (1+ (point)))
|
|
320 (xmltok-scan-after-lt))))
|
|
321 ((eq ch ?\&)
|
|
322 (cond ((> space-count 0)
|
|
323 (setq xmltok-type 'space))
|
|
324 (t
|
|
325 (goto-char (1+ (point)))
|
|
326 (xmltok-scan-after-amp
|
|
327 (lambda (start end)
|
|
328 (xmltok-handle-entity start end))))))
|
|
329 ((re-search-forward "[<&]\\|\\(]]>\\)" nil t)
|
|
330 (cond ((not (match-beginning 1))
|
|
331 (goto-char (match-beginning 0))
|
|
332 ;; must have got a non-space char
|
|
333 (setq xmltok-type 'data))
|
|
334 ((= (match-beginning 1) xmltok-start)
|
|
335 (xmltok-add-error "Found `]]>' not closing a CDATA section")
|
|
336 (setq xmltok-type 'not-well-formed))
|
|
337 (t
|
|
338 (goto-char (match-beginning 0))
|
|
339 (setq xmltok-type
|
|
340 (if (= (point) (+ xmltok-start space-count))
|
|
341 'space
|
|
342 'data)))))
|
|
343 ((eq ch nil)
|
|
344 (setq xmltok-type
|
|
345 (if (> space-count 0)
|
|
346 'space
|
|
347 nil)))
|
|
348 (t
|
|
349 (goto-char (point-max))
|
|
350 (setq xmltok-type 'data)))))
|
|
351
|
|
352 (defun xmltok-forward-special (bound)
|
|
353 "Scan forward past the first special token starting at or after point.
|
|
354 Return nil if there is no special token that starts before BOUND.
|
|
355 CDATA sections, processing instructions and comments (and indeed
|
|
356 anything starting with < following by ? or !) count
|
|
357 as special. Return the type of the token."
|
|
358 (when (re-search-forward "<[?!]" (1+ bound) t)
|
|
359 (setq xmltok-start (match-beginning 0))
|
|
360 (goto-char (1+ xmltok-start))
|
|
361 (let ((case-fold-search nil))
|
|
362 (xmltok-scan-after-lt))))
|
|
363
|
|
364 (eval-when-compile
|
|
365
|
|
366 ;; A symbolic regexp is represented by a list whose CAR is the string
|
|
367 ;; containing the regexp and whose cdr is a list of symbolic names
|
|
368 ;; for the groups in the string.
|
|
369
|
|
370 ;; Construct a symbolic regexp from a regexp.
|
|
371 (defun xmltok-r (str)
|
|
372 (cons str nil))
|
|
373
|
|
374 ;; Concatenate zero of more regexps and symbolic regexps.
|
|
375 (defun xmltok+ (&rest args)
|
|
376 (let (strs names)
|
|
377 (while args
|
|
378 (let ((arg (car args)))
|
|
379 (if (stringp arg)
|
|
380 (setq strs (cons arg strs))
|
|
381 (setq strs (cons (car arg) strs))
|
|
382 (setq names (cons (cdr arg) names)))
|
|
383 (setq args (cdr args))))
|
|
384 (cons (apply 'concat (nreverse strs))
|
|
385 (apply 'append (nreverse names))))))
|
|
386
|
|
387 (eval-when-compile
|
|
388 ;; Make a symbolic group named NAME from the regexp R.
|
|
389 ;; R may be a symbolic regexp or an ordinary regexp.
|
|
390 (defmacro xmltok-g (name &rest r)
|
|
391 (let ((sym (make-symbol "r")))
|
|
392 `(let ((,sym (xmltok+ ,@r)))
|
|
393 (if (stringp ,sym)
|
|
394 (cons (concat "\\(" ,sym "\\)") (cons ',name nil))
|
|
395 (cons (concat "\\(" (car ,sym) "\\)") (cons ',name (cdr ,sym)))))))
|
|
396
|
|
397 (defun xmltok-p (&rest r) (xmltok+ "\\(?:"
|
|
398 (apply 'xmltok+ r)
|
|
399 "\\)"))
|
|
400
|
|
401 ;; Get the group index of ELEM in a LIST of symbols.
|
|
402 (defun xmltok-get-index (elem list)
|
|
403 (or elem
|
|
404 (error "Missing group name"))
|
|
405 (let ((found nil)
|
|
406 (i 1))
|
|
407 (while list
|
|
408 (cond ((eq elem (car list))
|
|
409 (setq found i)
|
|
410 (setq list nil))
|
|
411 (t
|
|
412 (setq i (1+ i))
|
|
413 (setq list (cdr list)))))
|
|
414 (or found
|
|
415 (error "Bad group name %s" elem))))
|
|
416
|
|
417 ;; Define a macro SYM using a symbolic regexp R.
|
|
418 ;; SYM can be called in three ways:
|
|
419 ;; (SYM regexp)
|
|
420 ;; expands to the regexp in R
|
|
421 ;; (SYM start G)
|
|
422 ;; expands to
|
|
423 ;; (match-beginning N)
|
|
424 ;; where N is the group index of G in R.
|
|
425 ;; (SYM end G)
|
|
426 ;; expands to
|
|
427 ;; (match-end N)
|
|
428 ;; where N is the group index of G in R.
|
|
429 (defmacro xmltok-defregexp (sym r)
|
|
430 `(defalias ',sym
|
|
431 (let ((r ,r))
|
|
432 `(macro lambda (action &optional group-name)
|
|
433 (cond ((eq action 'regexp)
|
|
434 ,(car r))
|
|
435 ((or (eq action 'start) (eq action 'beginning))
|
|
436 (list 'match-beginning (xmltok-get-index group-name
|
|
437 ',(cdr r))))
|
|
438 ((eq action 'end)
|
|
439 (list 'match-end (xmltok-get-index group-name
|
|
440 ',(cdr r))))
|
|
441 ((eq action 'string)
|
|
442 (list 'match-string
|
|
443 (xmltok-get-index group-name ',(cdr r))))
|
|
444 ((eq action 'string-no-properties)
|
|
445 (list 'match-string-no-properties
|
|
446 (xmltok-get-index group-name ',(cdr r))))
|
|
447 (t (error "Invalid action: %s" action))))))))
|
|
448
|
|
449
|
|
450 (eval-when-compile
|
|
451 (let* ((or "\\|")
|
|
452 (open "\\(?:")
|
|
453 (gopen "\\(")
|
|
454 (close "\\)")
|
|
455 (name-start-char "[_[:alpha:]]")
|
|
456 (name-continue-not-start-char "[-.[:digit:]]")
|
|
457 (name-continue-char "[-._[:alnum:]]")
|
|
458 (* "*")
|
|
459 (+ "+")
|
|
460 (opt "?")
|
|
461 (question "\\?")
|
|
462 (s "[ \r\t\n]")
|
|
463 (s+ (concat s +))
|
|
464 (s* (concat s *))
|
|
465 (ncname (concat name-start-char name-continue-char *))
|
|
466 (entity-ref
|
|
467 (xmltok+ (xmltok-g entity-name ncname)
|
|
468 (xmltok-g entity-ref-close ";") opt))
|
|
469 (decimal-ref
|
|
470 (xmltok+ (xmltok-g decimal "[0-9]" +)
|
|
471 (xmltok-g decimal-ref-close ";") opt))
|
|
472 (hex-ref
|
|
473 (xmltok+ "x" open
|
|
474 (xmltok-g hex "[0-9a-fA-F]" +)
|
|
475 (xmltok-g hex-ref-close ";") opt
|
|
476 close opt))
|
|
477 (char-ref
|
|
478 (xmltok+ (xmltok-g number-sign "#")
|
|
479 open decimal-ref or hex-ref close opt))
|
|
480 (start-tag-close
|
|
481 (xmltok+ open (xmltok-g start-tag-close s* ">")
|
|
482 or open (xmltok-g empty-tag-slash s* "/")
|
|
483 (xmltok-g empty-tag-close ">") opt close
|
|
484 or (xmltok-g start-tag-s s+)
|
|
485 close))
|
|
486 (start-tag
|
|
487 (xmltok+ (xmltok-g start-tag-name
|
|
488 ncname (xmltok-g start-tag-colon ":" ncname) opt)
|
|
489 start-tag-close opt))
|
|
490 (end-tag
|
|
491 (xmltok+ (xmltok-g end-tag-slash "/")
|
|
492 open (xmltok-g end-tag-name
|
|
493 ncname
|
|
494 (xmltok-g end-tag-colon ":" ncname) opt)
|
|
495 (xmltok-g end-tag-close s* ">") opt
|
|
496 close opt))
|
|
497 (comment
|
|
498 (xmltok+ (xmltok-g markup-declaration "!")
|
|
499 (xmltok-g comment-first-dash "-"
|
|
500 (xmltok-g comment-open "-") opt) opt))
|
|
501 (cdata-section
|
|
502 (xmltok+ "!"
|
|
503 (xmltok-g marked-section-open "\\[")
|
|
504 open "C"
|
|
505 open "D"
|
|
506 open "A"
|
|
507 open "T"
|
|
508 open "A"
|
|
509 (xmltok-g cdata-section-open "\\[" ) opt
|
|
510 close opt ; A
|
|
511 close opt ; T
|
|
512 close opt ; A
|
|
513 close opt ; D
|
|
514 close opt)) ; C
|
|
515 (processing-instruction
|
|
516 (xmltok-g processing-instruction-question question)))
|
|
517
|
|
518 (xmltok-defregexp xmltok-ncname (xmltok+ open ncname close))
|
|
519
|
|
520 (xmltok-defregexp xmltok-after-amp
|
|
521 (xmltok+ entity-ref or char-ref))
|
|
522 (xmltok-defregexp xmltok-after-lt
|
|
523 (xmltok+ start-tag
|
|
524 or end-tag
|
|
525 ;; cdata-section must come before comment
|
|
526 ;; because we treat <! as a comment
|
|
527 ;; and Emacs doesn't do fully greedy matching
|
|
528 ;; by default
|
|
529 or cdata-section
|
|
530 or comment
|
|
531 or processing-instruction))
|
|
532 (xmltok-defregexp
|
|
533 xmltok-attribute
|
|
534 (let* ((lit1
|
|
535 (xmltok+ "'"
|
|
536 "[^<'&\r\n\t]*"
|
|
537 (xmltok-g complex1 "[&\r\n\t][^<']*") opt
|
|
538 "'"))
|
|
539 (lit2 (cons (replace-regexp-in-string "'" "\"" (car lit1))
|
|
540 '(complex2)))
|
|
541 (literal (xmltok-g literal lit1 or lit2))
|
|
542 (name (xmltok+ open (xmltok-g xmlns "xmlns") or ncname close
|
|
543 (xmltok-g colon ":" ncname) opt)))
|
|
544 (xmltok+ (xmltok-g name name)
|
|
545 s* "="
|
|
546 ;; If the literal isn't followed by what it should be,
|
|
547 ;; then the closing delimiter is probably really the
|
|
548 ;; opening delimiter of another literal, so don't
|
|
549 ;; absorb the literal in this case.
|
|
550 open s* literal start-tag-close close opt)))
|
|
551 (xmltok-defregexp
|
|
552 xmltok-xml-declaration
|
|
553 (let* ((literal-content "[-._:a-zA-Z0-9]+")
|
|
554 (literal
|
|
555 (concat open "\"" literal-content "\""
|
|
556 or "'" literal-content "'" close))
|
|
557 (version-att
|
|
558 (xmltok+ open
|
|
559 s+ (xmltok-g version-name "version")
|
|
560 s* "="
|
|
561 s* (xmltok-g version-value literal)
|
|
562 close opt))
|
|
563 (encoding-att
|
|
564 (xmltok+ open
|
|
565 s+ (xmltok-g encoding-name "encoding")
|
|
566 s* "="
|
|
567 s* (xmltok-g encoding-value literal)
|
|
568 close opt))
|
|
569 (yes-no
|
|
570 (concat open "yes" or "no" close))
|
|
571 (standalone-att
|
|
572 (xmltok+ open
|
|
573 s+ (xmltok-g standalone-name "standalone")
|
|
574 s* "="
|
|
575 s* (xmltok-g standalone-value
|
|
576 "\"" yes-no "\"" or "'" yes-no "'")
|
|
577 close opt)))
|
|
578 (xmltok+ "<" question "xml"
|
|
579 version-att
|
|
580 encoding-att
|
|
581 standalone-att
|
|
582 s* question ">")))
|
|
583 (xmltok-defregexp
|
|
584 xmltok-prolog
|
|
585 (let* ((single-char (xmltok-g single-char "[[|,(\"'>]"))
|
|
586 (internal-subset-close (xmltok-g internal-subset-close
|
|
587 "][ \t\r\n]*>"))
|
|
588 (starts-with-close-paren
|
|
589 (xmltok-g close-paren
|
|
590 ")"
|
|
591 (xmltok-p
|
|
592 (xmltok-g close-paren-occur "[+?]")
|
|
593 or
|
|
594 (xmltok-g close-paren-star "\\*"))
|
|
595 opt))
|
|
596 (starts-with-percent
|
|
597 (xmltok-g percent
|
|
598 "%" (xmltok-g param-entity-ref
|
|
599 ncname
|
|
600 (xmltok-g param-entity-ref-close
|
|
601 ";") opt) opt))
|
|
602 (starts-with-nmtoken-not-name
|
|
603 (xmltok-g nmtoken
|
|
604 (xmltok-p name-continue-not-start-char or ":")
|
|
605 (xmltok-p name-continue-char or ":") *))
|
|
606 (nmtoken-after-colon
|
|
607 (xmltok+
|
|
608 (xmltok-p name-continue-not-start-char or ":")
|
|
609 (xmltok-p name-continue-char or ":") *
|
|
610 or
|
|
611 name-start-char
|
|
612 name-continue-char *
|
|
613 ":"
|
|
614 (xmltok-p name-continue-char or ":") *))
|
|
615 (after-ncname
|
|
616 (xmltok+ (xmltok-g ncname-nmtoken
|
|
617 ":" (xmltok-p nmtoken-after-colon))
|
|
618 or (xmltok-p (xmltok-g colon ":" ncname)
|
|
619 (xmltok-g colon-name-occur "[?+*]") opt)
|
|
620 or (xmltok-g ncname-occur "[?+*]")
|
|
621 or (xmltok-g ncname-colon ":")))
|
|
622 (starts-with-name
|
|
623 (xmltok-g name ncname (xmltok-p after-ncname) opt))
|
|
624 (starts-with-hash
|
|
625 (xmltok-g pound
|
|
626 "#" (xmltok-g hash-name ncname)))
|
|
627 (markup-declaration
|
|
628 (xmltok-g markup-declaration
|
|
629 "!" (xmltok-p (xmltok-g comment-first-dash "-"
|
|
630 (xmltok-g comment-open "-") opt)
|
|
631 or (xmltok-g named-markup-declaration
|
|
632 ncname)) opt))
|
|
633 (after-lt
|
|
634 (xmltok+ markup-declaration
|
|
635 or (xmltok-g processing-instruction-question
|
|
636 question)
|
|
637 or (xmltok-g instance-start
|
|
638 ncname)))
|
|
639 (starts-with-lt (xmltok-g less-than "<" (xmltok-p after-lt) opt)))
|
|
640 (xmltok+ starts-with-lt
|
|
641 or single-char
|
|
642 or starts-with-close-paren
|
|
643 or starts-with-percent
|
|
644 or starts-with-name
|
|
645 or starts-with-nmtoken-not-name
|
|
646 or starts-with-hash
|
|
647 or internal-subset-close)))))
|
|
648
|
|
649 (defconst xmltok-ncname-regexp (xmltok-ncname regexp))
|
|
650
|
|
651 (defun xmltok-scan-after-lt ()
|
|
652 (cond ((not (looking-at (xmltok-after-lt regexp)))
|
|
653 (xmltok-add-error "`<' that is not markup must be entered as `<'")
|
|
654 (setq xmltok-type 'not-well-formed))
|
|
655 (t
|
|
656 (goto-char (match-end 0))
|
|
657 (cond ((xmltok-after-lt start start-tag-close)
|
|
658 (setq xmltok-name-end
|
|
659 (xmltok-after-lt end start-tag-name))
|
|
660 (setq xmltok-name-colon
|
|
661 (xmltok-after-lt start start-tag-colon))
|
|
662 (setq xmltok-attributes nil)
|
|
663 (setq xmltok-namespace-attributes nil)
|
|
664 (setq xmltok-type 'start-tag))
|
|
665 ((xmltok-after-lt start end-tag-close)
|
|
666 (setq xmltok-name-end
|
|
667 (xmltok-after-lt end end-tag-name))
|
|
668 (setq xmltok-name-colon
|
|
669 (xmltok-after-lt start end-tag-colon))
|
|
670 (setq xmltok-type 'end-tag))
|
|
671 ((xmltok-after-lt start start-tag-s)
|
|
672 (setq xmltok-name-end
|
|
673 (xmltok-after-lt end start-tag-name))
|
|
674 (setq xmltok-name-colon
|
|
675 (xmltok-after-lt start start-tag-colon))
|
|
676 (setq xmltok-namespace-attributes nil)
|
|
677 (setq xmltok-attributes nil)
|
|
678 (xmltok-scan-attributes)
|
|
679 xmltok-type)
|
|
680 ((xmltok-after-lt start empty-tag-close)
|
|
681 (setq xmltok-name-end
|
|
682 (xmltok-after-lt end start-tag-name))
|
|
683 (setq xmltok-name-colon
|
|
684 (xmltok-after-lt start start-tag-colon))
|
|
685 (setq xmltok-attributes nil)
|
|
686 (setq xmltok-namespace-attributes nil)
|
|
687 (setq xmltok-type 'empty-element))
|
|
688 ((xmltok-after-lt start cdata-section-open)
|
|
689 (setq xmltok-type
|
|
690 (if (search-forward "]]>" nil t)
|
|
691 'cdata-section
|
|
692 (xmltok-add-error "No closing ]]>")
|
|
693 (xmltok-add-dependent 'xmltok-unclosed-reparse-p
|
|
694 nil
|
|
695 nil
|
|
696 "]]>")
|
|
697 'not-well-formed)))
|
|
698 ((xmltok-after-lt start processing-instruction-question)
|
|
699 (xmltok-scan-after-processing-instruction-open))
|
|
700 ((xmltok-after-lt start comment-open)
|
|
701 (xmltok-scan-after-comment-open))
|
|
702 ((xmltok-after-lt start empty-tag-slash)
|
|
703 (setq xmltok-name-end
|
|
704 (xmltok-after-lt end start-tag-name))
|
|
705 (setq xmltok-name-colon
|
|
706 (xmltok-after-lt start start-tag-colon))
|
|
707 (setq xmltok-attributes nil)
|
|
708 (setq xmltok-namespace-attributes nil)
|
|
709 (xmltok-add-error "Expected `/>'" (1- (point)))
|
|
710 (setq xmltok-type 'partial-empty-element))
|
|
711 ((xmltok-after-lt start start-tag-name)
|
|
712 (xmltok-add-error "Missing `>'"
|
|
713 nil
|
|
714 (1+ xmltok-start))
|
|
715 (setq xmltok-name-end
|
|
716 (xmltok-after-lt end start-tag-name))
|
|
717 (setq xmltok-name-colon
|
|
718 (xmltok-after-lt start start-tag-colon))
|
|
719 (setq xmltok-namespace-attributes nil)
|
|
720 (setq xmltok-attributes nil)
|
|
721 (setq xmltok-type 'partial-start-tag))
|
|
722 ((xmltok-after-lt start end-tag-name)
|
|
723 (setq xmltok-name-end (xmltok-after-lt end end-tag-name))
|
|
724 (setq xmltok-name-colon
|
|
725 (xmltok-after-lt start end-tag-colon))
|
|
726 (cond ((and (not xmltok-name-colon)
|
|
727 (eq (char-after) ?:))
|
|
728 (goto-char (1+ (point)))
|
|
729 (xmltok-add-error "Expected name following `:'"
|
|
730 (1- (point))))
|
|
731 (t
|
|
732 (xmltok-add-error "Missing `>'"
|
|
733 nil
|
|
734 (1+ xmltok-start))))
|
|
735 (setq xmltok-type 'partial-end-tag))
|
|
736 ((xmltok-after-lt start end-tag-slash)
|
|
737 (xmltok-add-error "Expected name following `</'")
|
|
738 (setq xmltok-name-end nil)
|
|
739 (setq xmltok-name-colon nil)
|
|
740 (setq xmltok-type 'partial-end-tag))
|
|
741 ((xmltok-after-lt start marked-section-open)
|
|
742 (xmltok-add-error "Expected `CDATA[' after `<!['"
|
|
743 xmltok-start
|
|
744 (+ 3 xmltok-start))
|
|
745 (setq xmltok-type 'not-well-formed))
|
|
746 ((xmltok-after-lt start comment-first-dash)
|
|
747 (xmltok-add-error "Expected `-' after `<!-'"
|
|
748 xmltok-start
|
|
749 (+ 3 xmltok-start))
|
|
750 (setq xmltok-type 'not-well-formed))
|
|
751 ((xmltok-after-lt start markup-declaration)
|
|
752 (xmltok-add-error "Expected `[CDATA[' or `--' after `<!'"
|
|
753 xmltok-start
|
|
754 (+ 2 xmltok-start))
|
|
755 (setq xmltok-type 'not-well-formed))
|
|
756 (t
|
|
757 (xmltok-add-error "Not well-formed")
|
|
758 (setq xmltok-type 'not-well-formed))))))
|
|
759
|
|
760 ;; XXX This should be unified with
|
|
761 ;; xmltok-scan-prolog-after-processing-instruction-open
|
|
762 ;; XXX maybe should include rest of line (up to any <,>) in unclosed PI
|
|
763 (defun xmltok-scan-after-processing-instruction-open ()
|
|
764 (cond ((not (search-forward "?>" nil t))
|
|
765 (xmltok-add-error "No closing ?>"
|
|
766 xmltok-start
|
|
767 (+ xmltok-start 2))
|
|
768 (xmltok-add-dependent 'xmltok-unclosed-reparse-p
|
|
769 nil
|
|
770 nil
|
|
771 "?>")
|
|
772 (setq xmltok-type 'not-well-formed))
|
|
773 (t
|
|
774 (cond ((not (save-excursion
|
|
775 (goto-char (+ 2 xmltok-start))
|
|
776 (and (looking-at (xmltok-ncname regexp))
|
|
777 (setq xmltok-name-end (match-end 0)))))
|
|
778 (setq xmltok-name-end (+ xmltok-start 2))
|
|
779 (xmltok-add-error "<? not followed by name"
|
|
780 (+ xmltok-start 2)
|
|
781 (+ xmltok-start 3)))
|
|
782 ((not (or (memq (char-after xmltok-name-end)
|
|
783 '(?\n ?\t ?\r ? ))
|
|
784 (= xmltok-name-end (- (point) 2))))
|
|
785 (xmltok-add-error "Target not followed by whitespace"
|
|
786 xmltok-name-end
|
|
787 (1+ xmltok-name-end)))
|
|
788 ((and (= xmltok-name-end (+ xmltok-start 5))
|
|
789 (save-excursion
|
|
790 (goto-char (+ xmltok-start 2))
|
|
791 (let ((case-fold-search t))
|
|
792 (looking-at "xml"))))
|
|
793 (xmltok-add-error "Processing instruction target is xml"
|
|
794 (+ xmltok-start 2)
|
|
795 (+ xmltok-start 5))))
|
|
796 (setq xmltok-type 'processing-instruction))))
|
|
797
|
|
798 (defun xmltok-scan-after-comment-open ()
|
|
799 (setq xmltok-type
|
|
800 (cond ((not (search-forward "--" nil t))
|
|
801 (xmltok-add-error "No closing -->")
|
|
802 (xmltok-add-dependent 'xmltok-unclosed-reparse-p
|
|
803 nil
|
|
804 nil
|
|
805 ;; not --> because
|
|
806 ;; -- is not allowed
|
|
807 ;; in comments in XML
|
|
808 "--")
|
|
809 'not-well-formed)
|
|
810 ((eq (char-after) ?>)
|
|
811 (goto-char (1+ (point)))
|
|
812 'comment)
|
|
813 (t
|
|
814 (xmltok-add-dependent
|
|
815 'xmltok-semi-closed-reparse-p
|
|
816 nil
|
|
817 (point)
|
|
818 "--"
|
|
819 2)
|
|
820 ;; just include the <!-- in the token
|
|
821 (goto-char (+ xmltok-start 4))
|
|
822 ;; Need do this after the goto-char because
|
|
823 ;; marked error should just apply to <!--
|
|
824 (xmltok-add-error "First following `--' not followed by `>'")
|
|
825 'not-well-formed))))
|
|
826
|
|
827 (defun xmltok-scan-attributes ()
|
|
828 (let ((recovering nil)
|
|
829 (atts-needing-normalization nil))
|
|
830 (while (cond ((or (looking-at (xmltok-attribute regexp))
|
|
831 ;; use non-greedy group
|
|
832 (when (looking-at (concat "[^<>\n]+?"
|
|
833 (xmltok-attribute regexp)))
|
|
834 (unless recovering
|
|
835 (xmltok-add-error "Malformed attribute"
|
|
836 (point)
|
|
837 (save-excursion
|
|
838 (goto-char (xmltok-attribute start
|
|
839 name))
|
|
840 (skip-chars-backward "\r\n\t ")
|
|
841 (point))))
|
|
842 t))
|
|
843 (setq recovering nil)
|
|
844 (goto-char (match-end 0))
|
|
845 (let ((att (xmltok-add-attribute)))
|
|
846 (when att
|
|
847 (setq atts-needing-normalization
|
|
848 (cons att atts-needing-normalization))))
|
|
849 (cond ((xmltok-attribute start start-tag-s) t)
|
|
850 ((xmltok-attribute start start-tag-close)
|
|
851 (setq xmltok-type 'start-tag)
|
|
852 nil)
|
|
853 ((xmltok-attribute start empty-tag-close)
|
|
854 (setq xmltok-type 'empty-element)
|
|
855 nil)
|
|
856 ((xmltok-attribute start empty-tag-slash)
|
|
857 (setq xmltok-type 'partial-empty-element)
|
|
858 (xmltok-add-error "Expected `/>'"
|
|
859 (1- (point)))
|
|
860 nil)
|
|
861 ((looking-at "[ \t\r\n]*[\"']")
|
|
862 (goto-char (match-end 0))
|
|
863 (xmltok-add-error "Missing closing delimiter"
|
|
864 (1- (point)))
|
|
865 (setq recovering t)
|
|
866 t)
|
|
867 ((looking-at "[ \t]*\\([^ \t\r\n\"'=<>/]+\\)[ \t\r\n/>]")
|
|
868 (goto-char (match-end 1))
|
|
869 (xmltok-add-error "Attribute value not quoted"
|
|
870 (match-beginning 1))
|
|
871 (setq recovering t)
|
|
872 t)
|
|
873 (t
|
|
874 (xmltok-add-error "Missing attribute value"
|
|
875 (1- (point)))
|
|
876 (setq recovering t)
|
|
877 t)))
|
|
878 ((looking-at "[^<>\n]*/>")
|
|
879 (let ((start (point)))
|
|
880 (goto-char (match-end 0))
|
|
881 (unless recovering
|
|
882 (xmltok-add-error "Malformed empty-element"
|
|
883 start
|
|
884 (- (point) 2))))
|
|
885 (setq xmltok-type 'empty-element)
|
|
886 nil)
|
|
887 ((looking-at "[^<>\n]*>")
|
|
888 (let ((start (point)))
|
|
889 (goto-char (match-end 0))
|
|
890 (unless recovering
|
|
891 (xmltok-add-error "Malformed start-tag"
|
|
892 start
|
|
893 (1- (point)))))
|
|
894 (setq xmltok-type 'start-tag)
|
|
895 nil)
|
|
896 (t
|
|
897 (when recovering
|
|
898 (skip-chars-forward "^<>\n"))
|
|
899 (xmltok-add-error "Missing `>'"
|
|
900 xmltok-start
|
|
901 (1+ xmltok-start))
|
|
902 (setq xmltok-type 'partial-start-tag)
|
|
903 nil)))
|
|
904 (while atts-needing-normalization
|
|
905 (xmltok-normalize-attribute (car atts-needing-normalization))
|
|
906 (setq atts-needing-normalization (cdr atts-needing-normalization))))
|
|
907 (setq xmltok-attributes
|
|
908 (nreverse xmltok-attributes))
|
|
909 (setq xmltok-namespace-attributes
|
|
910 (nreverse xmltok-namespace-attributes)))
|
|
911
|
|
912 (defun xmltok-add-attribute ()
|
|
913 "Return the attribute if it needs normalizing, otherwise nil."
|
|
914 (let* ((needs-normalizing nil)
|
|
915 (att
|
|
916 (if (xmltok-attribute start literal)
|
|
917 (progn
|
|
918 (setq needs-normalizing
|
|
919 (or (xmltok-attribute start complex1)
|
|
920 (xmltok-attribute start complex2)))
|
|
921 (xmltok-make-attribute (xmltok-attribute start name)
|
|
922 (xmltok-attribute start colon)
|
|
923 (xmltok-attribute end name)
|
|
924 (1+ (xmltok-attribute start literal))
|
|
925 (1- (xmltok-attribute end literal))
|
|
926 (not needs-normalizing)))
|
|
927 (xmltok-make-attribute (xmltok-attribute start name)
|
|
928 (xmltok-attribute start colon)
|
|
929 (xmltok-attribute end name)))))
|
|
930 (if (xmltok-attribute start xmlns)
|
|
931 (setq xmltok-namespace-attributes
|
|
932 (cons att xmltok-namespace-attributes))
|
|
933 (setq xmltok-attributes
|
|
934 (cons att xmltok-attributes)))
|
|
935 (and needs-normalizing
|
|
936 att)))
|
|
937
|
|
938 (defun xmltok-normalize-attribute (att)
|
|
939 (let ((end (xmltok-attribute-value-end att))
|
|
940 (well-formed t)
|
|
941 (value-parts nil)
|
|
942 (refs nil))
|
|
943 (save-excursion
|
|
944 (goto-char (xmltok-attribute-value-start att))
|
|
945 (while (progn
|
|
946 (let ((n (skip-chars-forward "^\r\t\n&" end)))
|
|
947 (when (> n 0)
|
|
948 (setq value-parts
|
|
949 (cons (buffer-substring-no-properties (- (point) n)
|
|
950 (point))
|
|
951 value-parts))))
|
|
952 (when (< (point) end)
|
|
953 (goto-char (1+ (point)))
|
|
954 (cond ((eq (char-before) ?\&)
|
|
955 (let ((xmltok-start (1- (point)))
|
|
956 xmltok-type xmltok-replacement)
|
|
957 (xmltok-scan-after-amp
|
|
958 (lambda (start end)
|
|
959 (xmltok-handle-entity start end t)))
|
|
960 (cond ((or (eq xmltok-type 'char-ref)
|
|
961 (eq xmltok-type 'entity-ref))
|
|
962 (setq refs
|
|
963 (cons (vector xmltok-type
|
|
964 xmltok-start
|
|
965 (point))
|
|
966 refs))
|
|
967 (if xmltok-replacement
|
|
968 (setq value-parts
|
|
969 (cons xmltok-replacement
|
|
970 value-parts))
|
|
971 (setq well-formed nil)))
|
|
972 (t (setq well-formed nil)))))
|
|
973 (t (setq value-parts
|
|
974 (cons " " value-parts)))))
|
|
975 (< (point) end))))
|
|
976 (when well-formed
|
|
977 (aset att 5 (apply 'concat (nreverse value-parts))))
|
|
978 (aset att 6 (nreverse refs))))
|
|
979
|
|
980 (defun xmltok-scan-after-amp (entity-handler)
|
|
981 (cond ((not (looking-at (xmltok-after-amp regexp)))
|
|
982 (xmltok-add-error "`&' that is not markup must be entered as `&'")
|
|
983 (setq xmltok-type 'not-well-formed))
|
|
984 (t
|
|
985 (goto-char (match-end 0))
|
|
986 (cond ((xmltok-after-amp start entity-ref-close)
|
|
987 (funcall entity-handler
|
|
988 (xmltok-after-amp start entity-name)
|
|
989 (xmltok-after-amp end entity-name))
|
|
990 (setq xmltok-type 'entity-ref))
|
|
991 ((xmltok-after-amp start decimal-ref-close)
|
|
992 (xmltok-scan-char-ref (xmltok-after-amp start decimal)
|
|
993 (xmltok-after-amp end decimal)
|
|
994 10))
|
|
995 ((xmltok-after-amp start hex-ref-close)
|
|
996 (xmltok-scan-char-ref (xmltok-after-amp start hex)
|
|
997 (xmltok-after-amp end hex)
|
|
998 16))
|
|
999 ((xmltok-after-amp start number-sign)
|
|
1000 (xmltok-add-error "Missing character number")
|
|
1001 (setq xmltok-type 'not-well-formed))
|
|
1002 (t
|
|
1003 (xmltok-add-error "Missing closing `;'")
|
|
1004 (setq xmltok-type 'not-well-formed))))))
|
|
1005
|
|
1006 (defconst xmltok-entity-error-messages
|
|
1007 '((unparsed . "Referenced entity is unparsed")
|
|
1008 (not-well-formed . "Referenced entity is not well-formed")
|
|
1009 (external nil . "Referenced entity is external")
|
|
1010 (element nil . "Referenced entity contains <")))
|
|
1011
|
|
1012 (defun xmltok-handle-entity (start end &optional attributep)
|
|
1013 (let* ((name (buffer-substring-no-properties start end))
|
|
1014 (name-def (assoc name xmltok-dtd))
|
|
1015 (def (cdr name-def)))
|
|
1016 (cond ((setq xmltok-replacement (and (consp def)
|
|
1017 (if attributep
|
|
1018 (cdr def)
|
|
1019 (car def)))))
|
|
1020 ((null name-def)
|
|
1021 (unless (eq (car xmltok-dtd) t)
|
|
1022 (xmltok-add-error "Referenced entity has not been defined"
|
|
1023 start
|
|
1024 end)))
|
|
1025 ((and attributep (consp def))
|
|
1026 (xmltok-add-error "Referenced entity contains <"
|
|
1027 start
|
|
1028 end))
|
|
1029 (t
|
|
1030 (let ((err (cdr (assq def xmltok-entity-error-messages))))
|
|
1031 (when (consp err)
|
|
1032 (setq err (if attributep (cdr err) (car err))))
|
|
1033 (when err
|
|
1034 (xmltok-add-error err start end)))))))
|
|
1035
|
|
1036 (defun xmltok-scan-char-ref (start end base)
|
|
1037 (setq xmltok-replacement
|
86557
|
1038 (let ((n (string-to-number (buffer-substring-no-properties start end)
|
86361
|
1039 base)))
|
|
1040 (cond ((and (integerp n) (xmltok-valid-char-p n))
|
|
1041 (setq n (xmltok-unicode-to-char n))
|
|
1042 (and n (string n)))
|
|
1043 (t
|
|
1044 (xmltok-add-error "Invalid character code" start end)
|
|
1045 nil))))
|
|
1046 (setq xmltok-type 'char-ref))
|
|
1047
|
|
1048 (defun xmltok-char-number (start end)
|
|
1049 (let* ((base (if (eq (char-after (+ start 2)) ?x)
|
|
1050 16
|
|
1051 10))
|
86557
|
1052 (n (string-to-number
|
86361
|
1053 (buffer-substring-no-properties (+ start (if (= base 16) 3 2))
|
|
1054 (1- end))
|
|
1055 base)))
|
|
1056 (and (integerp n)
|
|
1057 (xmltok-valid-char-p n)
|
|
1058 n)))
|
|
1059
|
|
1060 (defun xmltok-unclosed-reparse-p (change-start
|
|
1061 change-end
|
|
1062 pre-change-length
|
|
1063 start
|
|
1064 end
|
|
1065 delimiter)
|
|
1066 (let ((len-1 (1- (length delimiter))))
|
|
1067 (goto-char (max start (- change-start len-1)))
|
|
1068 (search-forward delimiter (min end (+ change-end len-1)) t)))
|
|
1069
|
|
1070 ;; Handles a <!-- with the next -- not followed by >
|
|
1071
|
|
1072 (defun xmltok-semi-closed-reparse-p (change-start
|
|
1073 change-end
|
|
1074 pre-change-length
|
|
1075 start
|
|
1076 end
|
|
1077 delimiter
|
|
1078 delimiter-length)
|
|
1079 (or (<= (- end delimiter-length) change-end)
|
|
1080 (xmltok-unclosed-reparse-p change-start
|
|
1081 change-end
|
|
1082 pre-change-length
|
|
1083 start
|
|
1084 end
|
|
1085 delimiter)))
|
|
1086
|
|
1087 (defun xmltok-valid-char-p (n)
|
|
1088 "Return non-nil if n is the Unicode code of a valid XML character."
|
|
1089 (cond ((< n #x20) (memq n '(#xA #xD #x9)))
|
|
1090 ((< n #xD800) t)
|
|
1091 ((< n #xE000) nil)
|
|
1092 ((< n #xFFFE) t)
|
|
1093 (t (and (> n #xFFFF)
|
|
1094 (< n #x110000)))))
|
|
1095
|
|
1096 (defun xmltok-unicode-to-char (n)
|
|
1097 "Return the character corresponding to Unicode scalar value N.
|
|
1098 Return nil if unsupported in Emacs."
|
|
1099 (decode-char 'ucs n))
|
|
1100
|
|
1101 ;;; Prolog parsing
|
|
1102
|
|
1103 (defvar xmltok-contains-doctype nil)
|
|
1104 (defvar xmltok-doctype-external-subset-flag nil)
|
|
1105 (defvar xmltok-internal-subset-start nil)
|
|
1106 (defvar xmltok-had-param-entity-ref nil)
|
|
1107 (defvar xmltok-prolog-regions nil)
|
|
1108 (defvar xmltok-standalone nil
|
|
1109 "Non-nil if there was an XML declaration specifying standalone=\"yes\",")
|
|
1110 (defvar xmltok-markup-declaration-doctype-flag nil)
|
|
1111
|
|
1112 (defconst xmltok-predefined-entity-alist
|
|
1113 '(("lt" "<" . "<")
|
|
1114 ("gt" ">" . ">")
|
|
1115 ("amp" "&" . "&")
|
|
1116 ("apos" "'" . "'")
|
|
1117 ("quot" "\"" . "\"")))
|
|
1118
|
|
1119 (defun xmltok-forward-prolog ()
|
|
1120 "Move forward to the end of the XML prolog.
|
|
1121
|
|
1122 Returns a list of vectors [TYPE START END] where TYPE is a symbol and
|
|
1123 START and END are integers giving the start and end of the region of
|
|
1124 that type. TYPE can be one of xml-declaration,
|
|
1125 xml-declaration-attribute-name, xml-declaration-attribute-value,
|
|
1126 comment, processing-instruction-left, processing-instruction-right,
|
|
1127 markup-declaration-open. markup-declaration-close,
|
|
1128 internal-subset-open, internal-subset-close, hash-name, keyword,
|
|
1129 literal, encoding-name.
|
|
1130 Adds to `xmltok-errors' and `xmltok-dependent-regions' as appropriate."
|
|
1131 (let ((case-fold-search nil)
|
|
1132 xmltok-start
|
|
1133 xmltok-type
|
|
1134 xmltok-prolog-regions
|
|
1135 xmltok-contains-doctype
|
|
1136 xmltok-internal-subset-start
|
|
1137 xmltok-had-param-entity-ref
|
|
1138 xmltok-standalone
|
|
1139 xmltok-doctype-external-subset-flag
|
|
1140 xmltok-markup-declaration-doctype-flag)
|
|
1141 (setq xmltok-dtd xmltok-predefined-entity-alist)
|
|
1142 (xmltok-scan-xml-declaration)
|
|
1143 (xmltok-next-prolog-token)
|
|
1144 (while (condition-case err
|
|
1145 (when (xmltok-parse-prolog-item)
|
|
1146 (xmltok-next-prolog-token))
|
|
1147 (xmltok-markup-declaration-parse-error
|
|
1148 (xmltok-skip-markup-declaration))))
|
|
1149 (when xmltok-internal-subset-start
|
|
1150 (xmltok-add-error "No closing ]"
|
|
1151 (1- xmltok-internal-subset-start)
|
|
1152 xmltok-internal-subset-start))
|
|
1153 (xmltok-parse-entities)
|
|
1154 ;; XXX prune dependent-regions for those entirely in prolog
|
|
1155 (nreverse xmltok-prolog-regions)))
|
|
1156
|
|
1157 (defconst xmltok-bad-xml-decl-regexp
|
|
1158 "[ \t\r\n]*<\\?xml\\(?:[ \t\r\n]\\|\\?>\\)")
|
|
1159
|
|
1160 ;;;###autoload
|
|
1161 (defun xmltok-get-declared-encoding-position (&optional limit)
|
|
1162 "Return the position of the encoding in the XML declaration at point.
|
|
1163 If there is a well-formed XML declaration starting at point and it
|
|
1164 contains an encoding declaration, then return (START . END)
|
|
1165 where START and END are the positions of the start and the end
|
|
1166 of the encoding name; if there is no encoding declaration return
|
|
1167 the position where and encoding declaration could be inserted.
|
|
1168 If there is XML that is not well-formed that looks like an XML declaration,
|
|
1169 return nil. Otherwise, return t.
|
|
1170 If LIMIT is non-nil, then do not consider characters beyond LIMIT."
|
|
1171 (cond ((let ((case-fold-search nil))
|
|
1172 (and (looking-at (xmltok-xml-declaration regexp))
|
|
1173 (or (not limit) (<= (match-end 0) limit))))
|
|
1174 (let ((end (xmltok-xml-declaration end encoding-value)))
|
|
1175 (if end
|
|
1176 (cons (1+ (xmltok-xml-declaration start encoding-value))
|
|
1177 (1- end))
|
|
1178 (or (xmltok-xml-declaration end version-value)
|
|
1179 (+ (point) 5)))))
|
|
1180 ((not (let ((case-fold-search t))
|
|
1181 (looking-at xmltok-bad-xml-decl-regexp))))))
|
|
1182
|
|
1183 (defun xmltok-scan-xml-declaration ()
|
|
1184 (when (looking-at (xmltok-xml-declaration regexp))
|
|
1185 (xmltok-add-prolog-region 'xml-declaration (point) (match-end 0))
|
|
1186 (goto-char (match-end 0))
|
|
1187 (when (xmltok-xml-declaration start version-name)
|
|
1188 (xmltok-add-prolog-region 'xml-declaration-attribute-name
|
|
1189 (xmltok-xml-declaration start version-name)
|
|
1190 (xmltok-xml-declaration end version-name))
|
|
1191 (let ((start (xmltok-xml-declaration start version-value))
|
|
1192 (end (xmltok-xml-declaration end version-value)))
|
|
1193 (xmltok-add-prolog-region 'xml-declaration-attribute-value
|
|
1194 start
|
|
1195 end)))
|
|
1196 ;; XXX need to check encoding name
|
|
1197 ;; Should start with letter, not contain colon
|
|
1198 (when (xmltok-xml-declaration start encoding-name)
|
|
1199 (xmltok-add-prolog-region 'xml-declaration-attribute-name
|
|
1200 (xmltok-xml-declaration start encoding-name)
|
|
1201 (xmltok-xml-declaration end encoding-name))
|
|
1202 (let ((start (xmltok-xml-declaration start encoding-value))
|
|
1203 (end (xmltok-xml-declaration end encoding-value)))
|
|
1204 (xmltok-add-prolog-region 'encoding-name
|
|
1205 (1+ start)
|
|
1206 (1- end))
|
|
1207 (xmltok-add-prolog-region 'xml-declaration-attribute-value
|
|
1208 start
|
|
1209 end)))
|
|
1210 (when (xmltok-xml-declaration start standalone-name)
|
|
1211 (xmltok-add-prolog-region 'xml-declaration-attribute-name
|
|
1212 (xmltok-xml-declaration start standalone-name)
|
|
1213 (xmltok-xml-declaration end standalone-name))
|
|
1214 (let ((start (xmltok-xml-declaration start standalone-value))
|
|
1215 (end (xmltok-xml-declaration end standalone-value)))
|
|
1216 (xmltok-add-prolog-region 'xml-declaration-attribute-value
|
|
1217 start
|
|
1218 end)
|
|
1219 (setq xmltok-standalone
|
|
1220 (string= (buffer-substring-no-properties (1+ start) (1- end))
|
|
1221 "yes"))))
|
|
1222 t))
|
|
1223
|
|
1224 (defconst xmltok-markup-declaration-alist
|
|
1225 '(("ELEMENT" . xmltok-parse-element-declaration)
|
|
1226 ("ATTLIST" . xmltok-parse-attlist-declaration)
|
|
1227 ("ENTITY" . xmltok-parse-entity-declaration)
|
|
1228 ("NOTATION" . xmltok-parse-notation-declaration)))
|
|
1229
|
|
1230 (defun xmltok-parse-prolog-item ()
|
|
1231 (cond ((eq xmltok-type 'comment)
|
|
1232 (xmltok-add-prolog-region 'comment
|
|
1233 xmltok-start
|
|
1234 (point))
|
|
1235 t)
|
|
1236 ((eq xmltok-type 'processing-instruction))
|
|
1237 ((eq xmltok-type 'named-markup-declaration)
|
|
1238 (setq xmltok-markup-declaration-doctype-flag nil)
|
|
1239 (xmltok-add-prolog-region 'markup-declaration-open
|
|
1240 xmltok-start
|
|
1241 (point))
|
|
1242 (let* ((name (buffer-substring-no-properties
|
|
1243 (+ xmltok-start 2)
|
|
1244 (point)))
|
|
1245 (fun (cdr (assoc name xmltok-markup-declaration-alist))))
|
|
1246 (cond (fun
|
|
1247 (unless xmltok-internal-subset-start
|
|
1248 (xmltok-add-error
|
|
1249 "Declaration allowed only in internal subset"))
|
|
1250 (funcall fun))
|
|
1251 ((string= name "DOCTYPE")
|
|
1252 (xmltok-parse-doctype))
|
|
1253 (t
|
|
1254 (xmltok-add-error "Unknown markup declaration"
|
|
1255 (+ xmltok-start 2))
|
|
1256 (xmltok-next-prolog-token)
|
|
1257 (xmltok-markup-declaration-parse-error))))
|
|
1258 t)
|
|
1259 ((or (eq xmltok-type 'end-prolog)
|
|
1260 (not xmltok-type))
|
|
1261 nil)
|
|
1262 ((eq xmltok-type 'internal-subset-close)
|
|
1263 (xmltok-add-prolog-region 'internal-subset-close
|
|
1264 xmltok-start
|
|
1265 (1+ xmltok-start))
|
|
1266 (xmltok-add-prolog-region 'markup-declaration-close
|
|
1267 (1- (point))
|
|
1268 (point))
|
|
1269 (if xmltok-internal-subset-start
|
|
1270 (setq xmltok-internal-subset-start nil)
|
|
1271 (xmltok-add-error "]> outside internal subset"))
|
|
1272 t)
|
|
1273 ((eq xmltok-type 'param-entity-ref)
|
|
1274 (if xmltok-internal-subset-start
|
|
1275 (setq xmltok-had-param-entity-ref t)
|
|
1276 (xmltok-add-error "Parameter entity reference outside document type declaration"))
|
|
1277 t)
|
|
1278 ;; If we don't do this, we can get thousands of errors when
|
|
1279 ;; a plain text file is parsed.
|
|
1280 ((not xmltok-internal-subset-start)
|
|
1281 (when (let ((err (car xmltok-errors)))
|
|
1282 (or (not err)
|
|
1283 (<= (xmltok-error-end err) xmltok-start)))
|
|
1284 (goto-char xmltok-start))
|
|
1285 nil)
|
|
1286 ((eq xmltok-type 'not-well-formed) t)
|
|
1287 (t
|
|
1288 (xmltok-add-error "Token allowed only inside markup declaration")
|
|
1289 t)))
|
|
1290
|
|
1291 (defun xmltok-parse-doctype ()
|
|
1292 (setq xmltok-markup-declaration-doctype-flag t)
|
|
1293 (xmltok-next-prolog-token)
|
|
1294 (when xmltok-internal-subset-start
|
|
1295 (xmltok-add-error "DOCTYPE declaration not allowed in internal subset")
|
|
1296 (xmltok-markup-declaration-parse-error))
|
|
1297 (when xmltok-contains-doctype
|
|
1298 (xmltok-add-error "Duplicate DOCTYPE declaration")
|
|
1299 (xmltok-markup-declaration-parse-error))
|
|
1300 (setq xmltok-contains-doctype t)
|
|
1301 (xmltok-require-token 'name 'prefixed-name)
|
|
1302 (xmltok-require-next-token "SYSTEM" "PUBLIC" ?\[ ?>)
|
|
1303 (cond ((eq xmltok-type ?\[)
|
|
1304 (setq xmltok-internal-subset-start (point)))
|
|
1305 ((eq xmltok-type ?>))
|
|
1306 (t
|
|
1307 (setq xmltok-doctype-external-subset-flag t)
|
|
1308 (xmltok-parse-external-id)
|
|
1309 (xmltok-require-token ?\[ ?>)
|
|
1310 (when (eq xmltok-type ?\[)
|
|
1311 (setq xmltok-internal-subset-start (point))))))
|
|
1312
|
|
1313 (defun xmltok-parse-attlist-declaration ()
|
|
1314 (xmltok-require-next-token 'prefixed-name 'name)
|
|
1315 (while (progn
|
|
1316 (xmltok-require-next-token ?> 'name 'prefixed-name)
|
|
1317 (if (eq xmltok-type ?>)
|
|
1318 nil
|
|
1319 (xmltok-require-next-token ?\(
|
|
1320 "CDATA"
|
|
1321 "ID"
|
|
1322 "IDREF"
|
|
1323 "IDREFS"
|
|
1324 "ENTITY"
|
|
1325 "ENTITIES"
|
|
1326 "NMTOKEN"
|
|
1327 "NMTOKENS"
|
|
1328 "NOTATION")
|
|
1329 (cond ((eq xmltok-type ?\()
|
|
1330 (xmltok-parse-nmtoken-group))
|
|
1331 ((string= (xmltok-current-token-string)
|
|
1332 "NOTATION")
|
|
1333 (xmltok-require-next-token ?\()
|
|
1334 (xmltok-parse-nmtoken-group)))
|
|
1335 (xmltok-require-next-token "#IMPLIED"
|
|
1336 "#REQUIRED"
|
|
1337 "#FIXED"
|
|
1338 'literal)
|
|
1339 (when (string= (xmltok-current-token-string) "#FIXED")
|
|
1340 (xmltok-require-next-token 'literal))
|
|
1341 t))))
|
|
1342
|
|
1343 (defun xmltok-parse-nmtoken-group ()
|
|
1344 (while (progn
|
|
1345 (xmltok-require-next-token 'nmtoken 'prefixed-name 'name)
|
|
1346 (xmltok-require-next-token ?| ?\))
|
|
1347 (eq xmltok-type ?|))))
|
|
1348
|
|
1349 (defun xmltok-parse-element-declaration ()
|
|
1350 (xmltok-require-next-token 'name 'prefixed-name)
|
|
1351 (xmltok-require-next-token "EMPTY" "ANY" ?\()
|
|
1352 (when (eq xmltok-type ?\()
|
|
1353 (xmltok-require-next-token "#PCDATA"
|
|
1354 'name
|
|
1355 'prefixed-name
|
|
1356 'name-occur
|
|
1357 ?\()
|
|
1358 (cond ((eq xmltok-type 'hash-name)
|
|
1359 (xmltok-require-next-token ?| ?\) 'close-paren-star)
|
|
1360 (while (eq xmltok-type ?|)
|
|
1361 (xmltok-require-next-token 'name 'prefixed-name)
|
|
1362 (xmltok-require-next-token 'close-paren-star ?|)))
|
|
1363 (t (xmltok-parse-model-group))))
|
|
1364 (xmltok-require-next-token ?>))
|
|
1365
|
|
1366 (defun xmltok-parse-model-group ()
|
|
1367 (xmltok-parse-model-group-member)
|
|
1368 (xmltok-require-next-token ?|
|
|
1369 ?,
|
|
1370 ?\)
|
|
1371 'close-paren-star
|
|
1372 'close-paren-occur)
|
|
1373 (when (memq xmltok-type '(?, ?|))
|
|
1374 (let ((connector xmltok-type))
|
|
1375 (while (progn
|
|
1376 (xmltok-next-prolog-token)
|
|
1377 (xmltok-parse-model-group-member)
|
|
1378 (xmltok-require-next-token connector
|
|
1379 ?\)
|
|
1380 'close-paren-star
|
|
1381 'close-paren-occur)
|
|
1382 (eq xmltok-type connector))))))
|
|
1383
|
|
1384 (defun xmltok-parse-model-group-member ()
|
|
1385 (xmltok-require-token 'name
|
|
1386 'prefixed-name
|
|
1387 'name-occur
|
|
1388 ?\()
|
|
1389 (when (eq xmltok-type ?\()
|
|
1390 (xmltok-next-prolog-token)
|
|
1391 (xmltok-parse-model-group)))
|
|
1392
|
|
1393 (defun xmltok-parse-entity-declaration ()
|
|
1394 (let (paramp name)
|
|
1395 (xmltok-require-next-token 'name ?%)
|
|
1396 (when (eq xmltok-type ?%)
|
|
1397 (setq paramp t)
|
|
1398 (xmltok-require-next-token 'name))
|
|
1399 (setq name (xmltok-current-token-string))
|
|
1400 (xmltok-require-next-token 'literal "SYSTEM" "PUBLIC")
|
|
1401 (cond ((eq xmltok-type 'literal)
|
|
1402 (let ((replacement (xmltok-parse-entity-value)))
|
|
1403 (unless paramp
|
|
1404 (xmltok-define-entity name replacement)))
|
|
1405 (xmltok-require-next-token ?>))
|
|
1406 (t
|
|
1407 (xmltok-parse-external-id)
|
|
1408 (if paramp
|
|
1409 (xmltok-require-token ?>)
|
|
1410 (xmltok-require-token ?> "NDATA")
|
|
1411 (if (eq xmltok-type ?>)
|
|
1412 (xmltok-define-entity name 'external)
|
|
1413 (xmltok-require-next-token 'name)
|
|
1414 (xmltok-require-next-token ?>)
|
|
1415 (xmltok-define-entity name 'unparsed)))))))
|
|
1416
|
|
1417 (defun xmltok-define-entity (name value)
|
|
1418 (when (and (or (not xmltok-had-param-entity-ref)
|
|
1419 xmltok-standalone)
|
|
1420 (not (assoc name xmltok-dtd)))
|
|
1421 (setq xmltok-dtd
|
|
1422 (cons (cons name value) xmltok-dtd))))
|
|
1423
|
|
1424 (defun xmltok-parse-entity-value ()
|
|
1425 (let ((lim (1- (point)))
|
|
1426 (well-formed t)
|
|
1427 value-parts
|
|
1428 start)
|
|
1429 (save-excursion
|
|
1430 (goto-char (1+ xmltok-start))
|
|
1431 (setq start (point))
|
|
1432 (while (progn
|
|
1433 (skip-chars-forward "^%&" lim)
|
|
1434 (when (< (point) lim)
|
|
1435 (goto-char (1+ (point)))
|
|
1436 (cond ((eq (char-before) ?%)
|
|
1437 (xmltok-add-error "Parameter entity references are not allowed in the internal subset"
|
|
1438 (1- (point))
|
|
1439 (point))
|
|
1440 (setq well-formed nil))
|
|
1441 (t
|
|
1442 (let ((xmltok-start (1- (point)))
|
|
1443 xmltok-type xmltok-replacement)
|
|
1444 (xmltok-scan-after-amp (lambda (start end)))
|
|
1445 (cond ((eq xmltok-type 'char-ref)
|
|
1446 (setq value-parts
|
|
1447 (cons (buffer-substring-no-properties
|
|
1448 start
|
|
1449 xmltok-start)
|
|
1450 value-parts))
|
|
1451 (setq value-parts
|
|
1452 (cons xmltok-replacement
|
|
1453 value-parts))
|
|
1454 (setq start (point)))
|
|
1455 ((eq xmltok-type 'not-well-formed)
|
|
1456 (setq well-formed nil))))))
|
|
1457 t))))
|
|
1458 (if (not well-formed)
|
|
1459 nil
|
|
1460 (apply 'concat
|
|
1461 (nreverse (cons (buffer-substring-no-properties start lim)
|
|
1462 value-parts))))))
|
|
1463
|
|
1464 (defun xmltok-parse-notation-declaration ()
|
|
1465 (xmltok-require-next-token 'name)
|
|
1466 (xmltok-require-next-token "SYSTEM" "PUBLIC")
|
|
1467 (let ((publicp (string= (xmltok-current-token-string) "PUBLIC")))
|
|
1468 (xmltok-require-next-token 'literal)
|
|
1469 (cond (publicp
|
|
1470 (xmltok-require-next-token 'literal ?>)
|
|
1471 (unless (eq xmltok-type ?>)
|
|
1472 (xmltok-require-next-token ?>)))
|
|
1473 (t (xmltok-require-next-token ?>)))))
|
|
1474
|
|
1475 (defun xmltok-parse-external-id ()
|
|
1476 (xmltok-require-token "SYSTEM" "PUBLIC")
|
|
1477 (let ((publicp (string= (xmltok-current-token-string) "PUBLIC")))
|
|
1478 (xmltok-require-next-token 'literal)
|
|
1479 (when publicp
|
|
1480 (xmltok-require-next-token 'literal)))
|
|
1481 (xmltok-next-prolog-token))
|
|
1482
|
|
1483 (defun xmltok-require-next-token (&rest types)
|
|
1484 (xmltok-next-prolog-token)
|
|
1485 (apply 'xmltok-require-token types))
|
|
1486
|
|
1487 (defun xmltok-require-token (&rest types)
|
|
1488 ;; XXX Generate a more helpful error message
|
|
1489 (while (and (not (let ((type (car types)))
|
|
1490 (if (stringp (car types))
|
|
1491 (string= (xmltok-current-token-string) type)
|
|
1492 (eq type xmltok-type))))
|
|
1493 (setq types (cdr types))))
|
|
1494 (unless types
|
|
1495 (when (and xmltok-type
|
|
1496 (not (eq xmltok-type 'not-well-formed)))
|
|
1497 (xmltok-add-error "Unexpected token"))
|
|
1498 (xmltok-markup-declaration-parse-error))
|
|
1499 (let ((region-type (xmltok-prolog-region-type (car types))))
|
|
1500 (when region-type
|
|
1501 (xmltok-add-prolog-region region-type
|
|
1502 xmltok-start
|
|
1503 (point)))))
|
|
1504
|
|
1505 (defun xmltok-current-token-string ()
|
|
1506 (buffer-substring-no-properties xmltok-start (point)))
|
|
1507
|
|
1508 (put 'xmltok-markup-declaration-parse-error
|
|
1509 'error-conditions
|
|
1510 '(error xmltok-markup-declaration-parse-error))
|
|
1511
|
|
1512 (put 'xmltok-markup-declaration-parse-error
|
|
1513 'error-message
|
|
1514 "Syntax error in markup declaration")
|
|
1515
|
|
1516 (defun xmltok-markup-declaration-parse-error ()
|
|
1517 (signal 'xmltok-markup-declaration-parse-error nil))
|
|
1518
|
|
1519 (defun xmltok-skip-markup-declaration ()
|
|
1520 (while (cond ((eq xmltok-type ?>)
|
|
1521 (xmltok-next-prolog-token)
|
|
1522 nil)
|
|
1523 ((and xmltok-markup-declaration-doctype-flag
|
|
1524 (eq xmltok-type ?\[))
|
|
1525 (setq xmltok-internal-subset-start (point))
|
|
1526 (xmltok-next-prolog-token)
|
|
1527 nil)
|
|
1528 ((memq xmltok-type '(nil
|
|
1529 end-prolog
|
|
1530 named-markup-declaration
|
|
1531 comment
|
|
1532 processing-instruction))
|
|
1533 nil)
|
|
1534 ((and xmltok-internal-subset-start
|
|
1535 (eq xmltok-type 'internal-subset-close))
|
|
1536 nil)
|
|
1537 (t (xmltok-next-prolog-token) t)))
|
|
1538 xmltok-type)
|
|
1539
|
|
1540 (defun xmltok-prolog-region-type (required)
|
|
1541 (cond ((cdr (assq xmltok-type
|
|
1542 '((literal . literal)
|
|
1543 (?> . markup-declaration-close)
|
|
1544 (?\[ . internal-subset-open)
|
|
1545 (hash-name . hash-name)))))
|
|
1546 ((and (stringp required) (eq xmltok-type 'name))
|
|
1547 'keyword)))
|
|
1548
|
|
1549 ;; Return new token type.
|
|
1550
|
|
1551 (defun xmltok-next-prolog-token ()
|
|
1552 (skip-chars-forward " \t\r\n")
|
|
1553 (setq xmltok-start (point))
|
|
1554 (cond ((not (and (looking-at (xmltok-prolog regexp))
|
|
1555 (goto-char (match-end 0))))
|
|
1556 (let ((ch (char-after)))
|
|
1557 (cond (ch
|
|
1558 (goto-char (1+ (point)))
|
|
1559 (xmltok-add-error "Illegal char in prolog")
|
|
1560 (setq xmltok-type 'not-well-formed))
|
|
1561 (t (setq xmltok-type nil)))))
|
|
1562 ((or (xmltok-prolog start ncname-occur)
|
|
1563 (xmltok-prolog start colon-name-occur))
|
|
1564 (setq xmltok-name-end (1- (point)))
|
|
1565 (setq xmltok-name-colon (xmltok-prolog start colon))
|
|
1566 (setq xmltok-type 'name-occur))
|
|
1567 ((xmltok-prolog start colon)
|
|
1568 (setq xmltok-name-end (point))
|
|
1569 (setq xmltok-name-colon (xmltok-prolog start colon))
|
|
1570 (unless (looking-at "[ \t\r\n>),|[%]")
|
|
1571 (xmltok-add-error "Missing space after name"))
|
|
1572 (setq xmltok-type 'prefixed-name))
|
|
1573 ((or (xmltok-prolog start ncname-nmtoken)
|
|
1574 (xmltok-prolog start ncname-colon))
|
|
1575 (unless (looking-at "[ \t\r\n>),|[%]")
|
|
1576 (xmltok-add-error "Missing space after name token"))
|
|
1577 (setq xmltok-type 'nmtoken))
|
|
1578 ((xmltok-prolog start name)
|
|
1579 (setq xmltok-name-end (point))
|
|
1580 (setq xmltok-name-colon nil)
|
|
1581 (unless (looking-at "[ \t\r\n>),|[%]")
|
|
1582 (xmltok-add-error "Missing space after name"))
|
|
1583 (setq xmltok-type 'name))
|
|
1584 ((xmltok-prolog start hash-name)
|
|
1585 (setq xmltok-name-end (point))
|
|
1586 (unless (looking-at "[ \t\r\n>)|%]")
|
|
1587 (xmltok-add-error "Missing space after name"))
|
|
1588 (setq xmltok-type 'hash-name))
|
|
1589 ((xmltok-prolog start processing-instruction-question)
|
|
1590 (xmltok-scan-prolog-after-processing-instruction-open))
|
|
1591 ((xmltok-prolog start comment-open)
|
|
1592 ;; XXX if not-well-formed, ignore some stuff
|
|
1593 (xmltok-scan-after-comment-open))
|
|
1594 ((xmltok-prolog start named-markup-declaration)
|
|
1595 (setq xmltok-type 'named-markup-declaration))
|
|
1596 ((xmltok-prolog start instance-start)
|
|
1597 (goto-char xmltok-start)
|
|
1598 (setq xmltok-type 'end-prolog))
|
|
1599 ((xmltok-prolog start close-paren-star)
|
|
1600 (setq xmltok-type 'close-paren-star))
|
|
1601 ((xmltok-prolog start close-paren-occur)
|
|
1602 (setq xmltok-type 'close-paren-occur))
|
|
1603 ((xmltok-prolog start close-paren)
|
|
1604 (unless (looking-at "[ \t\r\n>,|)]")
|
|
1605 (xmltok-add-error "Missing space after )"))
|
|
1606 (setq xmltok-type ?\)))
|
|
1607 ((xmltok-prolog start single-char)
|
|
1608 (let ((ch (char-before)))
|
|
1609 (cond ((memq ch '(?\" ?\'))
|
|
1610 (xmltok-scan-prolog-literal))
|
|
1611 (t (setq xmltok-type ch)))))
|
|
1612 ((xmltok-prolog start percent)
|
|
1613 (cond ((xmltok-prolog start param-entity-ref-close)
|
|
1614 (setq xmltok-name-end (1- (point)))
|
|
1615 (setq xmltok-type 'param-entity-ref))
|
|
1616 ((xmltok-prolog start param-entity-ref)
|
|
1617 (xmltok-add-error "Missing ;")
|
|
1618 (setq xmltok-name-end (point))
|
|
1619 (setq xmltok-type 'param-entity-ref))
|
|
1620 ((looking-at "[ \t\r\n%]")
|
|
1621 (setq xmltok-type ?%))
|
|
1622 (t
|
|
1623 (xmltok-add-error "Expected name after %")
|
|
1624 (setq xmltok-type 'not-well-formed))))
|
|
1625 ((xmltok-prolog start nmtoken)
|
|
1626 (unless (looking-at "[ \t\r\n>),|[%]")
|
|
1627 (xmltok-add-error "Missing space after name token"))
|
|
1628 (setq xmltok-type 'nmtoken))
|
|
1629 ((xmltok-prolog start internal-subset-close)
|
|
1630 (setq xmltok-type 'internal-subset-close))
|
|
1631 ((xmltok-prolog start pound)
|
|
1632 (xmltok-add-error "Expected name after #")
|
|
1633 (setq xmltok-type 'not-well-formed))
|
|
1634 ((xmltok-prolog start markup-declaration)
|
|
1635 (xmltok-add-error "Expected name or -- after <!")
|
|
1636 (setq xmltok-type 'not-well-formed))
|
|
1637 ((xmltok-prolog start comment-first-dash)
|
|
1638 (xmltok-add-error "Expected <!--")
|
|
1639 (setq xmltok-type 'not-well-formed))
|
|
1640 ((xmltok-prolog start less-than)
|
|
1641 (xmltok-add-error "Incomplete markup")
|
|
1642 (setq xmltok-type 'not-well-formed))
|
|
1643 (t (error "Unhandled token in prolog %s"
|
|
1644 (match-string-no-properties 0)))))
|
|
1645
|
|
1646 (defun xmltok-scan-prolog-literal ()
|
|
1647 (let* ((delim (string (char-before)))
|
|
1648 (safe-end (save-excursion
|
|
1649 (skip-chars-forward (concat "^<>[]" delim))
|
|
1650 (point)))
|
|
1651 (end (save-excursion
|
|
1652 (goto-char safe-end)
|
|
1653 (search-forward delim nil t))))
|
|
1654 (or (cond ((not end)
|
|
1655 (xmltok-add-dependent 'xmltok-unclosed-reparse-p
|
|
1656 nil
|
|
1657 nil
|
|
1658 delim)
|
|
1659 nil)
|
|
1660 ((save-excursion
|
|
1661 (goto-char end)
|
|
1662 (looking-at "[ \t\r\n>%[]"))
|
|
1663 (goto-char end)
|
|
1664 (setq xmltok-type 'literal))
|
|
1665 ((eq (1+ safe-end) end)
|
|
1666 (goto-char end)
|
|
1667 (xmltok-add-error (format "Missing space after %s" delim)
|
|
1668 safe-end)
|
|
1669 (setq xmltok-type 'literal))
|
|
1670 (t
|
|
1671 (xmltok-add-dependent 'xmltok-semi-closed-reparse-p
|
|
1672 xmltok-start
|
|
1673 (1+ end)
|
|
1674 delim
|
|
1675 1)
|
|
1676 nil))
|
|
1677 (progn
|
|
1678 (xmltok-add-error (format "Missing closing %s" delim))
|
|
1679 (goto-char safe-end)
|
|
1680 (skip-chars-backward " \t\r\n")
|
|
1681 (setq xmltok-type 'not-well-formed)))))
|
|
1682
|
|
1683 (defun xmltok-scan-prolog-after-processing-instruction-open ()
|
|
1684 (cond ((not (search-forward "?>" nil t))
|
|
1685 (xmltok-add-error "No closing ?>"
|
|
1686 xmltok-start
|
|
1687 (+ xmltok-start 2))
|
|
1688 (xmltok-add-dependent 'xmltok-unclosed-reparse-p
|
|
1689 nil
|
|
1690 nil
|
|
1691 "?>")
|
|
1692 (setq xmltok-type 'not-well-formed))
|
|
1693 (t
|
|
1694 (let* ((end (point))
|
|
1695 (target
|
|
1696 (save-excursion
|
|
1697 (goto-char (+ xmltok-start 2))
|
|
1698 (and (looking-at (xmltok-ncname regexp))
|
|
1699 (or (memq (char-after (match-end 0))
|
|
1700 '(?\n ?\t ?\r ? ))
|
|
1701 (= (match-end 0) (- end 2)))
|
|
1702 (match-string-no-properties 0)))))
|
|
1703 (cond ((not target)
|
|
1704 (xmltok-add-error "\
|
|
1705 Processing instruction does not start with a name"
|
|
1706 (+ xmltok-start 2)
|
|
1707 (+ xmltok-start 3)))
|
|
1708 ((not (and (= (length target) 3)
|
|
1709 (let ((case-fold-search t))
|
|
1710 (string-match "xml" target)))))
|
|
1711 ((= xmltok-start 1)
|
|
1712 (xmltok-add-error "Invalid XML declaration"
|
|
1713 xmltok-start
|
|
1714 (point)))
|
|
1715 ((save-excursion
|
|
1716 (goto-char xmltok-start)
|
|
1717 (looking-at (xmltok-xml-declaration regexp)))
|
|
1718 (xmltok-add-error "XML declaration not at beginning of file"
|
|
1719 xmltok-start
|
|
1720 (point)))
|
|
1721 (t
|
|
1722 (xmltok-add-error "Processing instruction has target of xml"
|
|
1723 (+ xmltok-start 2)
|
|
1724 (+ xmltok-start 5))))
|
|
1725 (xmltok-add-prolog-region 'processing-instruction-left
|
|
1726 xmltok-start
|
|
1727 (+ xmltok-start
|
|
1728 2
|
|
1729 (if target
|
|
1730 (length target)
|
|
1731 0)))
|
|
1732 (xmltok-add-prolog-region 'processing-instruction-right
|
|
1733 (if target
|
|
1734 (save-excursion
|
|
1735 (goto-char (+ xmltok-start
|
|
1736 (length target)
|
|
1737 2))
|
|
1738 (skip-chars-forward " \t\r\n")
|
|
1739 (point))
|
|
1740 (+ xmltok-start 2))
|
|
1741 (point)))
|
|
1742 (setq xmltok-type 'processing-instruction))))
|
|
1743
|
|
1744 (defun xmltok-parse-entities ()
|
|
1745 (let ((todo xmltok-dtd))
|
|
1746 (when (and (or xmltok-had-param-entity-ref
|
|
1747 xmltok-doctype-external-subset-flag)
|
|
1748 (not xmltok-standalone))
|
|
1749 (setq xmltok-dtd (cons t xmltok-dtd)))
|
|
1750 (while todo
|
|
1751 (xmltok-parse-entity (car todo))
|
|
1752 (setq todo (cdr todo)))))
|
|
1753
|
|
1754 (defun xmltok-parse-entity (name-def)
|
|
1755 (let ((def (cdr name-def))
|
|
1756 ;; in case its value is buffer local
|
|
1757 (xmltok-dtd xmltok-dtd)
|
|
1758 buf)
|
|
1759 (when (stringp def)
|
|
1760 (if (string-match "\\`[^&<\t\r\n]*\\'" def)
|
|
1761 (setcdr name-def (cons def def))
|
|
1762 (setcdr name-def 'not-well-formed) ; avoid infinite expansion loops
|
|
1763 (setq buf (get-buffer-create
|
|
1764 (format " *Entity %s*" (car name-def))))
|
|
1765 (save-excursion
|
|
1766 (set-buffer buf)
|
|
1767 (erase-buffer)
|
|
1768 (insert def)
|
|
1769 (goto-char (point-min))
|
|
1770 (setcdr name-def
|
|
1771 (xmltok-parse-entity-replacement)))
|
|
1772 (kill-buffer buf)))))
|
|
1773
|
|
1774 (defun xmltok-parse-entity-replacement ()
|
|
1775 (let ((def (cons "" "")))
|
|
1776 (while (let* ((start (point))
|
|
1777 (found (re-search-forward "[<&\t\r\n]\\|]]>" nil t))
|
|
1778 (ch (and found (char-before)))
|
|
1779 (str (buffer-substring-no-properties
|
|
1780 start
|
|
1781 (if found
|
|
1782 (match-beginning 0)
|
|
1783 (point-max)))))
|
|
1784 (setq def
|
|
1785 (xmltok-append-entity-def def
|
|
1786 (cons str str)))
|
|
1787 (cond ((not found) nil)
|
|
1788 ((eq ch ?>)
|
|
1789 (setq def 'not-well-formed)
|
|
1790 nil)
|
|
1791 ((eq ch ?<)
|
|
1792 (xmltok-save
|
|
1793 (setq xmltok-start (1- (point)))
|
|
1794 (xmltok-scan-after-lt)
|
|
1795 (setq def
|
|
1796 (xmltok-append-entity-def
|
|
1797 def
|
|
1798 (cond ((memq xmltok-type
|
|
1799 '(start-tag
|
|
1800 end-tag
|
|
1801 empty-element))
|
|
1802 'element)
|
|
1803 ((memq xmltok-type
|
|
1804 '(comment
|
|
1805 processing-instruction))
|
|
1806 (cons "" nil))
|
|
1807 ((eq xmltok-type
|
|
1808 'cdata-section)
|
|
1809 (cons (buffer-substring-no-properties
|
|
1810 (+ xmltok-start 9)
|
|
1811 (- (point) 3))
|
|
1812 nil))
|
|
1813 (t 'not-well-formed)))))
|
|
1814 t)
|
|
1815 ((eq ch ?&)
|
|
1816 (let ((xmltok-start (1- (point)))
|
|
1817 xmltok-type
|
|
1818 xmltok-replacement
|
|
1819 xmltok-errors)
|
|
1820 (xmltok-scan-after-amp 'xmltok-handle-nested-entity)
|
|
1821 (cond ((eq xmltok-type 'entity-ref)
|
|
1822 (setq def
|
|
1823 (xmltok-append-entity-def
|
|
1824 def
|
|
1825 xmltok-replacement)))
|
|
1826 ((eq xmltok-type 'char-ref)
|
|
1827 (setq def
|
|
1828 (xmltok-append-entity-def
|
|
1829 def
|
|
1830 (if xmltok-replacement
|
|
1831 (cons xmltok-replacement
|
|
1832 xmltok-replacement)
|
|
1833 (and xmltok-errors 'not-well-formed)))))
|
|
1834 (t
|
|
1835 (setq def 'not-well-formed))))
|
|
1836 t)
|
|
1837 (t
|
|
1838 (setq def
|
|
1839 (xmltok-append-entity-def
|
|
1840 def
|
|
1841 (cons (match-string-no-properties 0)
|
|
1842 " ")))
|
|
1843 t))))
|
|
1844 def))
|
|
1845
|
|
1846 (defun xmltok-handle-nested-entity (start end)
|
|
1847 (let* ((name-def (assoc (buffer-substring-no-properties start end)
|
|
1848 xmltok-dtd))
|
|
1849 (def (cdr name-def)))
|
|
1850 (when (stringp def)
|
|
1851 (xmltok-parse-entity name-def)
|
|
1852 (setq def (cdr name-def)))
|
|
1853 (setq xmltok-replacement
|
|
1854 (cond ((null name-def)
|
|
1855 (if (eq (car xmltok-dtd) t)
|
|
1856 nil
|
|
1857 'not-well-formed))
|
|
1858 ((eq def 'unparsed) 'not-well-formed)
|
|
1859 (t def)))))
|
|
1860
|
|
1861 (defun xmltok-append-entity-def (d1 d2)
|
|
1862 (cond ((consp d1)
|
|
1863 (if (consp d2)
|
|
1864 (cons (concat (car d1) (car d2))
|
|
1865 (and (cdr d1)
|
|
1866 (cdr d2)
|
|
1867 (concat (cdr d1) (cdr d2))))
|
|
1868 d2))
|
|
1869 ((consp d2) d1)
|
|
1870 (t
|
|
1871 (let ((defs '(not-well-formed external element)))
|
|
1872 (while (not (or (eq (car defs) d1)
|
|
1873 (eq (car defs) d2)))
|
|
1874 (setq defs (cdr defs)))
|
|
1875 (car defs)))))
|
|
1876
|
|
1877 (defun xmltok-add-prolog-region (type start end)
|
|
1878 (setq xmltok-prolog-regions
|
|
1879 (cons (vector type start end)
|
|
1880 xmltok-prolog-regions)))
|
|
1881
|
|
1882 (defun xmltok-merge-attributes ()
|
|
1883 "Return a list merging `xmltok-attributes' and 'xmltok-namespace-attributes'.
|
|
1884 The members of the merged list are in order of occurrence in the
|
|
1885 document. The list may share list structure with `xmltok-attributes'
|
|
1886 and `xmltok-namespace-attributes'."
|
|
1887 (cond ((not xmltok-namespace-attributes)
|
|
1888 xmltok-attributes)
|
|
1889 ((not xmltok-attributes)
|
|
1890 xmltok-namespace-attributes)
|
|
1891 (t
|
|
1892 (let ((atts1 xmltok-attributes)
|
|
1893 (atts2 xmltok-namespace-attributes)
|
|
1894 merged)
|
|
1895 (while (and atts1 atts2)
|
|
1896 (cond ((< (xmltok-attribute-name-start (car atts1))
|
|
1897 (xmltok-attribute-name-start (car atts2)))
|
|
1898 (setq merged (cons (car atts1) merged))
|
|
1899 (setq atts1 (cdr atts1)))
|
|
1900 (t
|
|
1901 (setq merged (cons (car atts2) merged))
|
|
1902 (setq atts2 (cdr atts2)))))
|
|
1903 (setq merged (nreverse merged))
|
|
1904 (cond (atts1 (setq merged (nconc merged atts1)))
|
|
1905 (atts2 (setq merged (nconc merged atts2))))
|
|
1906 merged))))
|
|
1907
|
|
1908 ;;; Testing
|
|
1909
|
|
1910 (defun xmltok-forward-test ()
|
|
1911 (interactive)
|
|
1912 (if (xmltok-forward)
|
|
1913 (message "Scanned %s" xmltok-type)
|
|
1914 (message "Scanned nothing")))
|
|
1915
|
|
1916 (defun xmltok-next-prolog-token-test ()
|
|
1917 (interactive)
|
|
1918 (if (xmltok-next-prolog-token)
|
|
1919 (message "Scanned %s"
|
|
1920 (if (integerp xmltok-type)
|
|
1921 (string xmltok-type)
|
|
1922 xmltok-type))
|
|
1923 (message "Scanned end of file")))
|
|
1924
|
|
1925 (provide 'xmltok)
|
|
1926
|
86379
|
1927 ;; arch-tag: 747e5f3a-6fc3-4f8d-bd96-89f05aa99f5e
|
86361
|
1928 ;;; xmltok.el ends here
|