30329
|
1 ;; @(#) xml.el --- XML parser
|
|
2
|
|
3 ;; Copyright (C) 2000 Free Software Foundation, Inc.
|
|
4
|
|
5 ;; Author: Emmanuel Briot <briot@gnat.com>
|
|
6 ;; Maintainer: Emmanuel Briot <briot@gnat.com>
|
|
7 ;; Keywords: xml
|
|
8
|
|
9 ;; This file is part of GNU Emacs.
|
|
10
|
|
11 ;; GNU Emacs is free software; you can redistribute it and/or modify
|
|
12 ;; it under the terms of the GNU General Public License as published by
|
|
13 ;; the Free Software Foundation; either version 2, or (at your option)
|
|
14 ;; any later version.
|
|
15
|
|
16 ;; GNU Emacs is distributed in the hope that it will be useful,
|
|
17 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
18 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
19 ;; GNU General Public License for more details.
|
|
20
|
|
21 ;; You should have received a copy of the GNU General Public License
|
|
22 ;; along with GNU Emacs; see the file COPYING. If not, write to the
|
|
23 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
|
24 ;; Boston, MA 02111-1307, USA.
|
|
25
|
|
26 ;;; Commentary:
|
|
27
|
|
28 ;; This file contains a full XML parser. It parses a file, and returns a list
|
|
29 ;; that can be used internally by any other lisp file.
|
|
30 ;; See some example in todo.el
|
|
31
|
|
32 ;;; FILE FORMAT
|
|
33
|
|
34 ;; It does not parse the DTD, if present in the XML file, but knows how to
|
|
35 ;; ignore it. The XML file is assumed to be well-formed. In case of error, the
|
|
36 ;; parsing stops and the XML file is shown where the parsing stopped.
|
|
37 ;;
|
|
38 ;; It also knows how to ignore comments, as well as the special ?xml? tag
|
|
39 ;; in the XML file.
|
|
40 ;;
|
|
41 ;; The XML file should have the following format:
|
|
42 ;; <node1 attr1="name1" attr2="name2" ...> value
|
|
43 ;; <node2 attr3="name3" attr4="name4"> value2 </node2>
|
|
44 ;; <node3 attr5="name5" attr6="name6"> value3 </node3>
|
|
45 ;; </node1>
|
|
46 ;; Of course, the name of the nodes and attributes can be anything. There can
|
|
47 ;; be any number of attributes (or none), as well as any number of children
|
|
48 ;; below the nodes.
|
|
49 ;;
|
|
50 ;; There can be only top level node, but with any number of children below.
|
|
51
|
|
52 ;;; LIST FORMAT
|
|
53
|
|
54 ;; The functions `xml-parse-file' and `xml-parse-tag' return a list with
|
|
55 ;; the following format:
|
|
56 ;;
|
|
57 ;; xml-list ::= (node node ...)
|
|
58 ;; node ::= (tag_name attribute-list . child_node_list)
|
|
59 ;; child_node_list ::= child_node child_node ...
|
|
60 ;; child_node ::= node | string
|
|
61 ;; tag_name ::= string
|
|
62 ;; attribute_list ::= (("attribute" . "value") ("attribute" . "value") ...)
|
|
63 ;; | nil
|
|
64 ;; string ::= "..."
|
|
65 ;;
|
|
66 ;; Since XML is case insensitive, tag_name is always converted to lower-cases.
|
|
67 ;; tag_name is then converted to a symbol (this is not a string, so that the
|
|
68 ;; list takes less space in memory and is faster to traverse).
|
|
69 ;;
|
|
70 ;; Some macros are provided to ease the parsing of this list
|
|
71
|
|
72 ;;; Code:
|
|
73
|
|
74 ;;*******************************************************************
|
|
75 ;;**
|
|
76 ;;** Macros to parse the list
|
|
77 ;;**
|
|
78 ;;*******************************************************************
|
|
79
|
|
80 (defmacro xml-node-name (node)
|
|
81 "Return the tag associated with NODE.
|
|
82 The tag is a lower-case symbol."
|
|
83 (list 'car node))
|
|
84
|
|
85 (defmacro xml-node-attributes (node)
|
|
86 "Return the list of attributes of NODE.
|
|
87 The list can be nil."
|
|
88 (list 'nth 1 node))
|
|
89
|
|
90 (defmacro xml-node-children (node)
|
|
91 "Return the list of children of NODE.
|
|
92 This is a list of nodes, and it can be nil."
|
|
93 (list 'cddr node))
|
|
94
|
|
95 (defun xml-get-children (node child-name)
|
|
96 "Return the children of NODE whose tag is CHILD-NAME.
|
|
97 CHILD-NAME should be a lower case symbol."
|
|
98 (let ((children (xml-node-children node))
|
|
99 match)
|
|
100 (while children
|
|
101 (if (car children)
|
|
102 (if (equal (xml-node-name (car children)) child-name)
|
|
103 (set 'match (append match (list (car children))))))
|
|
104 (set 'children (cdr children)))
|
|
105 match))
|
|
106
|
|
107 (defun xml-get-attribute (node attribute)
|
|
108 "Get from NODE the value of ATTRIBUTE.
|
|
109 An empty string is returned if the attribute was not found."
|
|
110 (if (xml-node-attributes node)
|
|
111 (let ((value (assoc attribute (xml-node-attributes node))))
|
|
112 (if value
|
|
113 (cdr value)
|
|
114 ""))
|
|
115 ""))
|
|
116
|
|
117 ;;*******************************************************************
|
|
118 ;;**
|
|
119 ;;** Creating the list
|
|
120 ;;**
|
|
121 ;;*******************************************************************
|
|
122
|
|
123 (defun xml-parse-file (file &optional parse-dtd)
|
|
124 "Parse the well-formed XML FILE.
|
|
125 Returns the top node with all its children.
|
|
126 If PARSE-DTD is non-nil, the DTD is parsed rather than skipped."
|
|
127 (find-file file)
|
|
128 (let ((xml (xml-parse-region (point-min)
|
|
129 (point-max)
|
|
130 (current-buffer)
|
|
131 parse-dtd)))
|
|
132 (kill-buffer (current-buffer))
|
|
133 xml))
|
|
134
|
|
135 (defun xml-parse-region (beg end &optional buffer parse-dtd)
|
|
136 "Parse the region from BEG to END in BUFFER.
|
|
137 If BUFFER is nil, it defaults to the current buffer.
|
|
138 Returns the XML list for the region, or raises an error if the region
|
|
139 is not a well-formed XML file.
|
|
140 If PARSE-DTD is non-nil, the DTD is parsed rather than skipped,
|
|
141 and returned as the first element of the list"
|
|
142 (let (xml result dtd)
|
|
143 (save-excursion
|
|
144 (if buffer
|
|
145 (set-buffer buffer))
|
|
146 (goto-char beg)
|
|
147 (while (< (point) end)
|
|
148 (if (search-forward "<" end t)
|
|
149 (progn
|
|
150 (forward-char -1)
|
|
151 (if (null xml)
|
|
152 (progn
|
|
153 (set 'result (xml-parse-tag end parse-dtd))
|
|
154 (cond
|
|
155 ((listp (car result))
|
|
156 (set 'dtd (car result))
|
|
157 (add-to-list 'xml (cdr result)))
|
|
158 (t
|
|
159 (add-to-list 'xml result))))
|
|
160
|
|
161 ;; translation of rule [1] of XML specifications
|
|
162 (error "XML files can have only one toplevel tag.")))
|
|
163 (goto-char end)))
|
|
164 (if parse-dtd
|
|
165 (cons dtd (reverse xml))
|
|
166 (reverse xml)))))
|
|
167
|
|
168
|
|
169 (defun xml-parse-tag (end &optional parse-dtd)
|
|
170 "Parse the tag that is just in front of point.
|
|
171 The end tag must be found before the position END in the current buffer.
|
|
172 If PARSE-DTD is non-nil, the DTD of the document, if any, is parsed and
|
|
173 returned as the first element in the list.
|
|
174 Returns one of:
|
|
175 - a list : the matching node
|
|
176 - nil : the point is not looking at a tag.
|
|
177 - a cons cell: the first element is the DTD, the second is the node"
|
|
178 (cond
|
|
179 ;; Processing instructions (like the <?xml version="1.0"?> tag at the
|
|
180 ;; beginning of a document)
|
|
181 ((looking-at "<\\?")
|
|
182 (search-forward "?>" end)
|
|
183 (skip-chars-forward " \t\n")
|
|
184 (xml-parse-tag end))
|
|
185 ;; Character data (CDATA) sections, in which no tag should be interpreted
|
|
186 ((looking-at "<!\\[CDATA\\[")
|
|
187 (let ((pos (match-end 0)))
|
|
188 (unless (search-forward "]]>" end t)
|
|
189 (error "CDATA section does not end anywhere in the document"))
|
|
190 (buffer-substring-no-properties pos (match-beginning 0))))
|
|
191 ;; DTD for the document
|
|
192 ((looking-at "<!DOCTYPE")
|
|
193 (let (dtd)
|
|
194 (if parse-dtd
|
|
195 (set 'dtd (xml-parse-dtd end))
|
|
196 (xml-skip-dtd end))
|
|
197 (skip-chars-forward " \t\n")
|
|
198 (if dtd
|
|
199 (cons dtd (xml-parse-tag end))
|
|
200 (xml-parse-tag end))))
|
|
201 ;; skip comments
|
|
202 ((looking-at "<!--")
|
|
203 (search-forward "-->" end)
|
|
204 (skip-chars-forward " \t\n")
|
|
205 (xml-parse-tag end))
|
|
206 ;; end tag
|
|
207 ((looking-at "</")
|
|
208 '())
|
|
209 ;; opening tag
|
|
210 ((looking-at "<\\([^/> \t]+\\)")
|
|
211 (let* ((node-name (match-string 1))
|
|
212 (children (list (intern (downcase node-name))))
|
|
213 pos)
|
|
214 (goto-char (match-end 1))
|
|
215
|
|
216 ;; parses the attribute list
|
|
217 (set 'children (append children (list (xml-parse-attlist end))))
|
|
218
|
|
219 ;; is this an empty element ?
|
|
220 (if (looking-at "/>")
|
|
221 (progn
|
|
222 (forward-char 2)
|
|
223 (skip-chars-forward " \t\n")
|
|
224 (append children '("")))
|
|
225
|
|
226 ;; is this a valid start tag ?
|
|
227 (if (= (char-after) ?>)
|
|
228 (progn
|
|
229 (forward-char 1)
|
|
230 (skip-chars-forward " \t\n")
|
|
231 (while (not (looking-at (concat "</" node-name ">")))
|
|
232 (cond
|
|
233 ((looking-at "</")
|
|
234 (error (concat
|
|
235 "XML: invalid syntax -- invalid end tag (expecting "
|
|
236 node-name
|
|
237 ")")))
|
|
238 ((= (char-after) ?<)
|
|
239 (set 'children (append children (list (xml-parse-tag end)))))
|
|
240 (t
|
|
241 (set 'pos (point))
|
|
242 (search-forward "<" end)
|
|
243 (forward-char -1)
|
|
244 (let ((string (buffer-substring-no-properties pos (point)))
|
|
245 (pos 0))
|
|
246
|
|
247 ;; Clean up the string (no newline characters)
|
|
248 ;; Not done, since as per XML specifications, the XML processor
|
|
249 ;; should always pass the whole string to the application.
|
|
250 ;; (while (string-match "\\s +" string pos)
|
|
251 ;; (set 'string (replace-match " " t t string))
|
|
252 ;; (set 'pos (1+ (match-beginning 0))))
|
|
253
|
|
254 (set 'children (append children
|
|
255 (list (xml-substitute-special string))))))))
|
|
256 (goto-char (match-end 0))
|
|
257 (skip-chars-forward " \t\n")
|
|
258 (if (> (point) end)
|
|
259 (error "XML: End tag for %s not found before end of region."
|
|
260 node-name))
|
|
261 children
|
|
262 )
|
|
263
|
|
264 ;; This was an invalid start tag
|
|
265 (error "XML: Invalid attribute list")
|
|
266 ))))
|
|
267 ))
|
|
268
|
|
269 (defun xml-parse-attlist (end)
|
|
270 "Return the attribute-list that point is looking at.
|
|
271 The search for attributes end at the position END in the current buffer.
|
|
272 Leaves the point on the first non-blank character after the tag."
|
|
273 (let ((attlist '())
|
|
274 name)
|
|
275 (skip-chars-forward " \t\n")
|
|
276 (while (looking-at "\\([a-zA-Z_:][a-zA-Z0-9.-_:]*\\)[ \t\n]*=[ \t\n]*")
|
|
277 (set 'name (intern (downcase (match-string 1))))
|
|
278 (goto-char (match-end 0))
|
|
279
|
|
280 ;; Do we have a string between quotes (or double-quotes),
|
|
281 ;; or a simple word ?
|
|
282 (unless (looking-at "\"\\([^\"]+\\)\"")
|
|
283 (unless (looking-at "'\\([^\"]+\\)'")
|
|
284 (error "XML: Attribute values must be given between quotes.")))
|
|
285
|
|
286 ;; Each attribute must be unique within a given element
|
|
287 (if (assoc name attlist)
|
|
288 (error "XML: each attribute must be unique within an element."))
|
|
289
|
|
290 (set 'attlist (append attlist
|
|
291 (list (cons name (match-string 1)))))
|
|
292 (goto-char (match-end 0))
|
|
293 (skip-chars-forward " \t\n")
|
|
294 (if (> (point) end)
|
|
295 (error "XML: end of attribute list not found before end of region."))
|
|
296 )
|
|
297 attlist
|
|
298 ))
|
|
299
|
|
300 ;;*******************************************************************
|
|
301 ;;**
|
|
302 ;;** The DTD (document type declaration)
|
|
303 ;;** The following functions know how to skip or parse the DTD of
|
|
304 ;;** a document
|
|
305 ;;**
|
|
306 ;;*******************************************************************
|
|
307
|
|
308 (defun xml-skip-dtd (end)
|
|
309 "Skip the DTD that point is looking at.
|
|
310 The DTD must end before the position END in the current buffer.
|
|
311 The point must be just before the starting tag of the DTD.
|
|
312 This follows the rule [28] in the XML specifications."
|
|
313 (forward-char (length "<!DOCTYPE"))
|
|
314 (if (looking-at "[ \t\n]*>")
|
|
315 (error "XML: invalid DTD (excepting name of the document)"))
|
|
316 (condition-case nil
|
|
317 (progn
|
|
318 (forward-word 1) ;; name of the document
|
|
319 (skip-chars-forward " \t\n")
|
|
320 (if (looking-at "\\[")
|
|
321 (re-search-forward "\\][ \t\n]*>" end)
|
|
322 (search-forward ">" end)))
|
|
323 (error (error "XML: No end to the DTD"))))
|
|
324
|
|
325 (defun xml-parse-dtd (end)
|
|
326 "Parse the DTD that point is looking at.
|
|
327 The DTD must end before the position END in the current buffer."
|
|
328 (let (dtd type element end-pos)
|
|
329 (forward-char (length "<!DOCTYPE"))
|
|
330 (skip-chars-forward " \t\n")
|
|
331 (if (looking-at ">")
|
|
332 (error "XML: invalid DTD (excepting name of the document)"))
|
|
333
|
|
334 ;; Get the name of the document
|
|
335 (looking-at "\\sw+")
|
|
336 (set 'dtd (list 'dtd (match-string-no-properties 0)))
|
|
337 (goto-char (match-end 0))
|
|
338
|
|
339 (skip-chars-forward " \t\n")
|
|
340
|
|
341 ;; External DTDs => don't know how to handle them yet
|
|
342 (if (looking-at "SYSTEM")
|
|
343 (error "XML: Don't know how to handle external DTDs."))
|
|
344
|
|
345 (if (not (= (char-after) ?\[))
|
|
346 (error "XML: Unknown declaration in the DTD."))
|
|
347
|
|
348 ;; Parse the rest of the DTD
|
|
349 (forward-char 1)
|
|
350 (while (and (not (looking-at "[ \t\n]*\\]"))
|
|
351 (<= (point) end))
|
|
352 (cond
|
|
353
|
|
354 ;; Translation of rule [45] of XML specifications
|
|
355 ((looking-at
|
|
356 "[\t \n]*<!ELEMENT[ \t\n]+\\([a-zA-Z0-9.%;]+\\)[ \t\n]+\\([^>]+\\)>")
|
|
357
|
|
358 (setq element (intern (downcase (match-string-no-properties 1)))
|
|
359 type (match-string-no-properties 2))
|
|
360 (set 'end-pos (match-end 0))
|
|
361
|
|
362 ;; Translation of rule [46] of XML specifications
|
|
363 (cond
|
|
364 ((string-match "^EMPTY[ \t\n]*$" type) ;; empty declaration
|
|
365 (set 'type 'empty))
|
|
366 ((string-match "^ANY[ \t\n]*$" type) ;; any type of contents
|
|
367 (set 'type 'any))
|
|
368 ((string-match "^(\\(.*\\))[ \t\n]*$" type) ;; children ([47])
|
|
369 (set 'type (xml-parse-elem-type (match-string-no-properties 1 type))))
|
|
370 ((string-match "^%[^;]+;[ \t\n]*$" type) ;; substitution
|
|
371 nil)
|
|
372 (t
|
|
373 (error "XML: Invalid element type in the DTD")))
|
|
374
|
|
375 ;; rule [45]: the element declaration must be unique
|
|
376 (if (assoc element dtd)
|
|
377 (error "XML: elements declaration must be unique in a DTD (<%s>)."
|
|
378 (symbol-name element)))
|
|
379
|
|
380 ;; Store the element in the DTD
|
|
381 (set 'dtd (append dtd (list (list element type))))
|
|
382 (goto-char end-pos)
|
|
383 )
|
|
384
|
|
385
|
|
386 (t
|
|
387 (error "XML: Invalid DTD item"))
|
|
388 )
|
|
389 )
|
|
390
|
|
391 ;; Skip the end of the DTD
|
|
392 (search-forward ">" end)
|
|
393 dtd
|
|
394 ))
|
|
395
|
|
396
|
|
397 (defun xml-parse-elem-type (string)
|
|
398 "Convert a STRING for an element type into an elisp structure."
|
|
399
|
|
400 (let (elem modifier)
|
|
401 (if (string-match "(\\([^)]+\\))\\([+*?]?\\)" string)
|
|
402 (progn
|
|
403 (setq elem (match-string 1 string)
|
|
404 modifier (match-string 2 string))
|
|
405 (if (string-match "|" elem)
|
|
406 (set 'elem (append '(choice)
|
|
407 (mapcar 'xml-parse-elem-type
|
|
408 (split-string elem "|"))))
|
|
409 (if (string-match "," elem)
|
|
410 (set 'elem (append '(seq)
|
|
411 (mapcar 'xml-parse-elem-type
|
|
412 (split-string elem ","))))
|
|
413 )))
|
|
414 (if (string-match "[ \t\n]*\\([^+*?]+\\)\\([+*?]?\\)" string)
|
|
415 (setq elem (match-string 1 string)
|
|
416 modifier (match-string 2 string))))
|
|
417
|
|
418 (if (and (stringp elem)
|
|
419 (string= elem "#PCDATA"))
|
|
420 (set 'elem 'pcdata))
|
|
421
|
|
422 (cond
|
|
423 ((string= modifier "+")
|
|
424 (list '+ elem))
|
|
425 ((string= modifier "*")
|
|
426 (list '* elem))
|
|
427 ((string= modifier "?")
|
|
428 (list '? elem))
|
|
429 (t
|
|
430 elem))))
|
|
431
|
|
432
|
|
433 ;;*******************************************************************
|
|
434 ;;**
|
|
435 ;;** Substituting special XML sequences
|
|
436 ;;**
|
|
437 ;;*******************************************************************
|
|
438
|
|
439 (defun xml-substitute-special (string)
|
|
440 "Return STRING, after subsituting special XML sequences."
|
|
441 (while (string-match "&" string)
|
|
442 (set 'string (replace-match "&" t nil string)))
|
|
443 (while (string-match "<" string)
|
|
444 (set 'string (replace-match "<" t nil string)))
|
|
445 (while (string-match ">" string)
|
|
446 (set 'string (replace-match ">" t nil string)))
|
|
447 (while (string-match "'" string)
|
|
448 (set 'string (replace-match "'" t nil string)))
|
|
449 (while (string-match """ string)
|
|
450 (set 'string (replace-match "\"" t nil string)))
|
|
451 string)
|
|
452
|
|
453 ;;*******************************************************************
|
|
454 ;;**
|
|
455 ;;** Printing a tree.
|
|
456 ;;** This function is intended mainly for debugging purposes.
|
|
457 ;;**
|
|
458 ;;*******************************************************************
|
|
459
|
|
460 (defun xml-debug-print (xml)
|
|
461 (while xml
|
|
462 (xml-debug-print-internal (car xml) "")
|
|
463 (set 'xml (cdr xml)))
|
|
464 )
|
|
465
|
|
466 (defun xml-debug-print-internal (xml &optional indent-string)
|
|
467 "Outputs the XML tree in the current buffer.
|
|
468 The first line indented with INDENT-STRING."
|
|
469 (let ((tree xml)
|
|
470 attlist)
|
|
471 (unless indent-string
|
|
472 (set 'indent-string ""))
|
|
473
|
|
474 (insert indent-string "<" (symbol-name (xml-node-name tree)))
|
|
475
|
|
476 ;; output the attribute list
|
|
477 (set 'attlist (xml-node-attributes tree))
|
|
478 (while attlist
|
|
479 (insert " ")
|
|
480 (insert (symbol-name (caar attlist)) "=\"" (cdar attlist) "\"")
|
|
481 (set 'attlist (cdr attlist)))
|
|
482
|
|
483 (insert ">")
|
|
484
|
|
485 (set 'tree (xml-node-children tree))
|
|
486
|
|
487 ;; output the children
|
|
488 (while tree
|
|
489 (cond
|
|
490 ((listp (car tree))
|
|
491 (insert "\n")
|
|
492 (xml-debug-print-internal (car tree) (concat indent-string " "))
|
|
493 )
|
|
494 ((stringp (car tree))
|
|
495 (insert (car tree))
|
|
496 )
|
|
497 (t
|
|
498 (error "Invalid XML tree")))
|
|
499 (set 'tree (cdr tree))
|
|
500 )
|
|
501
|
|
502 (insert "\n" indent-string
|
|
503 "</" (symbol-name (xml-node-name xml)) ">")
|
|
504 ))
|
|
505
|
|
506 (provide 'xml)
|
|
507
|
|
508 ;;; xml.el ends here
|