Mercurial > emacs
annotate lisp/nxml/nxml-parse.el @ 112433:7b649f6eac4f
Merge from mainline.
author | Paul Eggert <eggert@cs.ucla.edu> |
---|---|
date | Sat, 22 Jan 2011 17:09:09 -0800 |
parents | ef719132ddfa |
children |
rev | line source |
---|---|
86361 | 1 ;;; nxml-parse.el --- XML parser, sharing infrastructure with nxml-mode |
2 | |
112218
376148b31b5e
Add 2011 to FSF/AIST copyright years.
Glenn Morris <rgm@gnu.org>
parents:
106815
diff
changeset
|
3 ;; Copyright (C) 2003, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc. |
86361 | 4 |
5 ;; Author: James Clark | |
6 ;; Keywords: XML | |
7 | |
86541 | 8 ;; This file is part of GNU Emacs. |
9 | |
94666
d495d4d5452f
Switch to recommended form of GPLv3 permissions notice.
Glenn Morris <rgm@gnu.org>
parents:
87665
diff
changeset
|
10 ;; GNU Emacs is free software: you can redistribute it and/or modify |
86541 | 11 ;; it under the terms of the GNU General Public License as published by |
94666
d495d4d5452f
Switch to recommended form of GPLv3 permissions notice.
Glenn Morris <rgm@gnu.org>
parents:
87665
diff
changeset
|
12 ;; the Free Software Foundation, either version 3 of the License, or |
d495d4d5452f
Switch to recommended form of GPLv3 permissions notice.
Glenn Morris <rgm@gnu.org>
parents:
87665
diff
changeset
|
13 ;; (at your option) any later version. |
86361 | 14 |
86541 | 15 ;; GNU Emacs is distributed in the hope that it will be useful, |
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
18 ;; GNU General Public License for more details. | |
86361 | 19 |
86541 | 20 ;; You should have received a copy of the GNU General Public License |
94666
d495d4d5452f
Switch to recommended form of GPLv3 permissions notice.
Glenn Morris <rgm@gnu.org>
parents:
87665
diff
changeset
|
21 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. |
86361 | 22 |
23 ;;; Commentary: | |
24 | |
25 ;; Entry point is `nxml-parse-file'. | |
26 | |
27 ;;; Code: | |
28 | |
29 (require 'nxml-util) | |
30 (require 'xmltok) | |
31 (require 'nxml-enc) | |
32 (require 'nxml-ns) | |
33 | |
34 (defvar nxml-parse-file-name nil) | |
35 | |
36 (defvar nxml-validate-function nil | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
37 "Either nil or a function called by `nxml-parse-file' to perform validation. |
86361 | 38 The function will be called once for each start-tag or end-tag. The |
39 function is passed two arguments TEXT and START-TAG. For a start-tag, | |
40 START-TAG is a list (NAME ATTRIBUTES) where NAME and ATTRIBUTES are in | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
41 the same form as returned by `nxml-parse-file'. For an end-tag, |
86361 | 42 START-TAG is nil. TEXT is a string containing the text immediately |
43 preceding the tag, or nil if there was no such text. An empty element | |
44 is treated as a start-tag followed by an end-tag. | |
45 | |
46 For a start-tag, the namespace state will be the state after | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
47 processing the namespace declarations in the start-tag. For an |
86361 | 48 end-tag, the namespace state will be the state before popping the |
49 namespace declarations for the corresponding start-tag. | |
50 | |
51 The function must return nil if no error is detected or a | |
52 cons (MESSAGE . LOCATION) where MESSAGE is a string containing | |
53 an error message and LOCATION indicates what caused the error | |
54 as follows: | |
55 | |
56 - nil indicates the tag as whole caused it; this is always allowed; | |
57 | |
58 - text indicates the text caused it; this is allowed only if | |
59 TEXT is non-nil; | |
60 | |
61 - tag-close indicates the close of the tag caused it; this is | |
62 allowed only if START-TAG is non-nil; | |
63 | |
64 - (attribute-name . N) indicates that the name of the Nth attribute | |
65 caused it; N counts from 0; this is allowed only if START-TAG is non-nil | |
66 and N must be less than the number of attributes; | |
67 | |
68 - (attribute-value . N) indicates that the value of the Nth attribute | |
69 caused it; N counts from 0; this is allowed only if START-TAG is non-nil | |
70 and N must be less than the number of attributes.") | |
71 | |
72 (defun nxml-parse-file (file) | |
73 "Parse the XML document in FILE and return it as a list. | |
74 An XML element is represented as a list (NAME ATTRIBUTES . CHILDREN). | |
75 NAME is either a string, in the case where the name does not have a | |
76 namespace, or a cons (NAMESPACE . LOCAL-NAME), where NAMESPACE is a | |
77 symbol and LOCAL-NAME is a string, in the case where the name does | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
78 have a namespace. NAMESPACE is a keyword whose name is `:URI', where |
86361 | 79 URI is the namespace name. ATTRIBUTES is an alist of attributes where |
80 each attribute has the form (NAME . VALUE), where NAME has the same | |
81 form as an element name, and VALUE is a string. A namespace | |
82 declaration is represented as an attribute whose name is | |
83 \(:http://www.w3.org/2000/xmlns/ . LOCAL-NAME). CHILDREN is a list | |
84 containing strings and child elements; CHILDREN never contains two | |
85 consecutive strings and never contains an empty string. Processing | |
86 instructions and comments are not represented. The return value is a | |
87 list representing the document element. | |
88 | |
89 If the XML document is not well-formed, an error having the condition | |
90 `nxml-file-parse-error' will be signaled; the error data will be a | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
91 list of the form \(FILE POSITION MESSAGE), where POSITION is an |
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
92 integer specifying the position where the error was detected, and |
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
93 MESSAGE is a string describing the error. |
86361 | 94 |
95 The current contents of FILE will be parsed even if there is a | |
96 modified buffer currently visiting FILE. | |
97 | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
98 If the variable `nxml-validate-function' is non-nil, it will be called |
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
99 twice for each element, and any reported error will be signaled in the |
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
100 same way as well-formedness error." |
105866
3367f0022cf2
* nxml/xsd-regexp.el (xsdre-gen-categories):
Stefan Monnier <monnier@iro.umontreal.ca>
parents:
100908
diff
changeset
|
101 (with-current-buffer (nxml-parse-find-file file) |
86361 | 102 (unwind-protect |
103 (let ((nxml-parse-file-name file)) | |
104 (nxml-parse-instance)) | |
105 (kill-buffer nil)))) | |
106 | |
107 (defun nxml-parse-find-file (file) | |
105866
3367f0022cf2
* nxml/xsd-regexp.el (xsdre-gen-categories):
Stefan Monnier <monnier@iro.umontreal.ca>
parents:
100908
diff
changeset
|
108 (with-current-buffer (get-buffer-create " *nXML Parse*") |
86361 | 109 (erase-buffer) |
110 (let ((set-auto-coding-function 'nxml-set-xml-coding)) | |
111 (insert-file-contents file)) | |
112 (current-buffer))) | |
96496
e374c747704b
Fix typos, and general docstring cleanup.
Juanma Barranquero <lekktu@gmail.com>
parents:
94666
diff
changeset
|
113 |
86361 | 114 (defun nxml-parse-instance () |
115 (let (xmltok-dtd) | |
116 (xmltok-save | |
117 (xmltok-forward-prolog) | |
118 (nxml-check-xmltok-errors) | |
119 (nxml-ns-save | |
120 (nxml-parse-instance-1))))) | |
121 | |
122 (defun nxml-parse-instance-1 () | |
123 (let* ((top (cons nil nil)) | |
124 ;; tail is a cons cell, whose cdr is nil | |
125 ;; additional elements will destructively appended to tail | |
126 (tail top) | |
127 ;; stack of tails one for each open element | |
128 tail-stack | |
129 ;; list of QNames of open elements | |
130 open-element-tags | |
131 ;; list of strings buffering a text node, in reverse order | |
132 text | |
133 ;; position of beginning of first (in buffer) string in text | |
134 text-pos) | |
135 (while (xmltok-forward) | |
136 (nxml-check-xmltok-errors) | |
137 (cond ((memq xmltok-type '(start-tag end-tag empty-element)) | |
138 (when text | |
139 (setq text (apply 'concat (nreverse text))) | |
140 (setcdr tail (cons text nil)) | |
141 (setq tail (cdr tail))) | |
142 (when (not (eq xmltok-type 'end-tag)) | |
143 (when (and (not open-element-tags) | |
144 (not (eq tail top))) | |
145 (nxml-parse-error nil "Multiple top-level elements")) | |
146 (setq open-element-tags | |
147 (cons (xmltok-start-tag-qname) | |
148 open-element-tags)) | |
149 (nxml-ns-push-state) | |
150 (let ((tag (nxml-parse-start-tag))) | |
151 (nxml-validate-tag text text-pos tag) | |
152 (setq text nil) | |
153 (setcdr tail (cons tag nil)) | |
154 (setq tail (cdr tail)) | |
155 (setq tail-stack (cons tail tail-stack)) | |
156 (setq tail (last tag)))) | |
157 (when (not (eq xmltok-type 'start-tag)) | |
158 (or (eq xmltok-type 'empty-element) | |
159 (equal (car open-element-tags) | |
160 (xmltok-end-tag-qname)) | |
161 (if open-element-tags | |
162 (nxml-parse-error nil | |
163 "Unbalanced end-tag; expected </%s>" | |
164 (car open-element-tags)) | |
165 (nxml-parse-error nil "Extra end-tag"))) | |
166 (nxml-validate-tag text text-pos nil) | |
167 (setq text nil) | |
168 (nxml-ns-pop-state) | |
169 (setq open-element-tags (cdr open-element-tags)) | |
170 (setq tail (car tail-stack)) | |
171 (setq tail-stack (cdr tail-stack))) | |
172 (setq text-pos nil)) | |
173 ((memq xmltok-type '(space data entity-ref char-ref cdata-section)) | |
174 (cond (open-element-tags | |
175 (unless text-pos | |
176 (setq text-pos xmltok-start)) | |
177 (setq text | |
178 (cons (nxml-current-text-string) text))) | |
179 ((not (eq xmltok-type 'space)) | |
180 (nxml-parse-error | |
181 nil | |
182 "%s at top-level" | |
183 (cdr (assq xmltok-type | |
184 '((data . "Text characters") | |
185 (entity-ref . "Entity reference") | |
186 (char-ref . "Character reference") | |
187 (cdata-section . "CDATA section")))))))))) | |
188 (unless (cdr top) | |
189 (nxml-parse-error (point-max) "Missing document element")) | |
190 (cadr top))) | |
191 | |
192 (defun nxml-parse-start-tag () | |
193 (let (parsed-attributes | |
194 parsed-namespace-attributes | |
195 atts att prefixes prefix ns value name) | |
196 (setq atts xmltok-namespace-attributes) | |
197 (while atts | |
198 (setq att (car atts)) | |
199 (setq value (or (xmltok-attribute-value att) | |
200 (nxml-parse-error nil "Invalid attribute value"))) | |
201 (setq ns (nxml-make-namespace value)) | |
202 (setq prefix (and (xmltok-attribute-prefix att) | |
203 (xmltok-attribute-local-name att))) | |
204 (cond ((member prefix prefixes) | |
205 (nxml-parse-error nil "Duplicate namespace declaration")) | |
206 ((not prefix) | |
207 (nxml-ns-set-default ns)) | |
208 (ns | |
209 (nxml-ns-set-prefix prefix ns)) | |
210 (t (nxml-parse-error nil "Cannot undeclare namespace prefix"))) | |
211 (setq prefixes (cons prefix prefixes)) | |
212 (setq parsed-namespace-attributes | |
213 (cons (cons (nxml-make-name nxml-xmlns-namespace-uri | |
214 (xmltok-attribute-local-name att)) | |
215 value) | |
216 parsed-namespace-attributes)) | |
217 (setq atts (cdr atts))) | |
218 (setq name | |
219 (nxml-make-name | |
220 (let ((prefix (xmltok-start-tag-prefix))) | |
221 (if prefix | |
222 (or (nxml-ns-get-prefix prefix) | |
223 (nxml-parse-error (1+ xmltok-start) | |
224 "Prefix `%s' undeclared" | |
225 prefix)) | |
226 (nxml-ns-get-default))) | |
227 (xmltok-start-tag-local-name))) | |
228 (setq atts xmltok-attributes) | |
229 (while atts | |
230 (setq att (car atts)) | |
231 (setq ns | |
232 (let ((prefix (xmltok-attribute-prefix att))) | |
233 (and prefix | |
234 (or (nxml-ns-get-prefix prefix) | |
235 (nxml-parse-error (xmltok-attribute-name-start att) | |
236 "Prefix `%s' undeclared" | |
237 prefix))))) | |
238 (setq parsed-attributes | |
239 (let ((nm (nxml-make-name ns | |
240 (xmltok-attribute-local-name att)))) | |
241 (when (assoc nm parsed-attributes) | |
242 (nxml-parse-error (xmltok-attribute-name-start att) | |
243 "Duplicate attribute")) | |
244 (cons (cons nm (or (xmltok-attribute-value att) | |
245 (nxml-parse-error nil "Invalid attribute value"))) | |
246 parsed-attributes))) | |
247 (setq atts (cdr atts))) | |
248 ;; We want to end up with the attributes followed by the | |
249 ;; the namespace attributes in the same order as | |
250 ;; xmltok-attributes and xmltok-namespace-attributes respectively. | |
251 (when parsed-namespace-attributes | |
252 (setq parsed-attributes | |
253 (nconc parsed-namespace-attributes parsed-attributes))) | |
254 (list name (nreverse parsed-attributes)))) | |
255 | |
256 (defun nxml-validate-tag (text text-pos tag) | |
257 (when nxml-validate-function | |
258 (let ((err (funcall nxml-validate-function text tag)) | |
259 pos) | |
260 (when err | |
261 (setq pos (nxml-validate-error-position (cdr err) | |
262 (and text text-pos) | |
263 tag)) | |
264 (or pos (error "Incorrect return value from %s" | |
265 nxml-validate-function)) | |
266 (nxml-parse-error pos (car err)))))) | |
267 | |
268 (defun nxml-validate-error-position (location text-pos tag) | |
269 (cond ((null location) xmltok-start) | |
270 ((eq location 'text) text-pos) | |
271 ((eq location 'tag-close) | |
272 (and tag (- (point) (if (eq xmltok-type 'empty-element ) 2 1)))) | |
273 ((consp location) | |
274 (let ((att (nth (cdr location) xmltok-attributes))) | |
275 (when (not att) | |
276 (setq att (nth (- (cdr location) (length xmltok-attributes)) | |
277 xmltok-namespace-attributes))) | |
278 (cond ((not att)) | |
279 ((eq (car location) 'attribute-name) | |
280 (xmltok-attribute-name-start att)) | |
281 ((eq (car location) 'attribute-value) | |
282 (xmltok-attribute-value-start att))))))) | |
283 | |
284 (defun nxml-make-name (ns local-name) | |
285 (if ns | |
286 (cons ns local-name) | |
287 local-name)) | |
288 | |
289 (defun nxml-current-text-string () | |
290 (cond ((memq xmltok-type '(space data)) | |
291 (buffer-substring-no-properties xmltok-start | |
292 (point))) | |
293 ((eq xmltok-type 'cdata-section) | |
294 (buffer-substring-no-properties (+ xmltok-start 9) | |
295 (- (point) 3))) | |
296 ((memq xmltok-type '(char-ref entity-ref)) | |
297 (unless xmltok-replacement | |
298 (nxml-parse-error nil | |
299 (if (eq xmltok-type 'char-ref) | |
300 "Reference to unsupported Unicode character" | |
301 "Unresolvable entity reference"))) | |
302 xmltok-replacement))) | |
303 | |
304 (defun nxml-parse-error (position &rest args) | |
305 (nxml-signal-file-parse-error nxml-parse-file-name | |
306 (or position xmltok-start) | |
307 (apply 'format args))) | |
308 | |
309 (defun nxml-check-xmltok-errors () | |
310 (when xmltok-errors | |
311 (let ((err (car (last xmltok-errors)))) | |
312 (nxml-signal-file-parse-error nxml-parse-file-name | |
313 (xmltok-error-start err) | |
314 (xmltok-error-message err))))) | |
315 | |
316 (provide 'nxml-parse) | |
317 | |
318 ;;; nxml-parse.el ends here |