comparison lisp/xml.el @ 51102:d604b76d3bbd

Doc fixes. (xml-parse-file, xml-parse-region): Autoload. (xml-syntax-table, xml-name-regexp): New. (xml-parse-region): Narrow to region, set syntax-table and case-fold-search. Reject fewer valid documents. (xml-parse-tag): Remove arg END. Callers changed. (xml-parse-tag): Use skip-syntax-forward. Use PARSE-DTD arg properly. Don't use buffer-substring-no-properties. Don't bind case-fold-search. Fix syntax for empty elements. Hoist consing of end-of-tag regexp out of loop. (xml-parse-attlist): Remove arg. Callers changed. Use skip-syntax-forward, replace-regexp-in-string, forward-sexp. Allow non-ASCII names. (xml-skip-dtd): Remove arg. Callers changed. Change matching code. (xml-parse-dtd): Grok external DTDs. Allow non-ASCII. Don't use match-string-no-properties. (xml-ucs-to-string): Deleted. (xml-substitute-entity): New. (xml-substitute-special): Use it. (xml-debug-print-internal): Simplify insertions. (xml-parse-file): Avoid finding file in xml-mode.
author Dave Love <fx@gnu.org>
date Mon, 19 May 2003 16:01:39 +0000
parents 575aa6820adc
children aac5eaf1454e
comparison
equal deleted inserted replaced
51101:821f85e23a1f 51102:d604b76d3bbd
1 ;;; xml.el --- XML parser 1 ;;; xml.el --- XML parser
2 2
3 ;; Copyright (C) 2000, 2001 Free Software Foundation, Inc. 3 ;; Copyright (C) 2000, 2001, 2003 Free Software Foundation, Inc.
4 4
5 ;; Author: Emmanuel Briot <briot@gnat.com> 5 ;; Author: Emmanuel Briot <briot@gnat.com>
6 ;; Maintainer: Emmanuel Briot <briot@gnat.com> 6 ;; Maintainer: FSF
7 ;; Keywords: xml 7 ;; Keywords: xml, data
8 8
9 ;; This file is part of GNU Emacs. 9 ;; This file is part of GNU Emacs.
10 10
11 ;; GNU Emacs is free software; you can redistribute it and/or modify 11 ;; GNU Emacs is free software; you can redistribute it and/or modify
12 ;; it under the terms of the GNU General Public License as published by 12 ;; it under the terms of the GNU General Public License as published by
23 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, 23 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 ;; Boston, MA 02111-1307, USA. 24 ;; Boston, MA 02111-1307, USA.
25 25
26 ;;; Commentary: 26 ;;; Commentary:
27 27
28 ;; This file contains a full XML parser. It parses a file, and returns a list 28 ;; This file contains a somewhat incomplete non-validating XML parser. It
29 ;; that can be used internally by any other lisp file. 29 ;; parses a file, and returns a list that can be used internally by
30 ;; See some example in todo.el 30 ;; any other lisp libraries.
31 31
32 ;;; FILE FORMAT 32 ;;; FILE FORMAT
33 33
34 ;; It does not parse the DTD, if present in the XML file, but knows how to 34 ;; The document type declaration may either be ignored or (optionally)
35 ;; ignore it. The XML file is assumed to be well-formed. In case of error, the 35 ;; parsed, but currently the parsing will only accept element
36 ;; parsing stops and the XML file is shown where the parsing stopped. 36 ;; declarations. The XML file is assumed to be well-formed. In case
37 ;; of error, the parsing stops and the XML file is shown where the
38 ;; parsing stopped.
37 ;; 39 ;;
38 ;; It also knows how to ignore comments, as well as the special ?xml? tag 40 ;; It also knows how to ignore comments and processing instructions.
39 ;; in the XML file.
40 ;; 41 ;;
41 ;; The XML file should have the following format: 42 ;; The XML file should have the following format:
42 ;; <node1 attr1="name1" attr2="name2" ...>value 43 ;; <node1 attr1="name1" attr2="name2" ...>value
43 ;; <node2 attr3="name3" attr4="name4">value2</node2> 44 ;; <node2 attr3="name3" attr4="name4">value2</node2>
44 ;; <node3 attr5="name5" attr6="name6">value3</node3> 45 ;; <node3 attr5="name5" attr6="name6">value3</node3>
61 ;; tag_name ::= string 62 ;; tag_name ::= string
62 ;; attribute_list ::= (("attribute" . "value") ("attribute" . "value") ...) 63 ;; attribute_list ::= (("attribute" . "value") ("attribute" . "value") ...)
63 ;; | nil 64 ;; | nil
64 ;; string ::= "..." 65 ;; string ::= "..."
65 ;; 66 ;;
66 ;; Some macros are provided to ease the parsing of this list 67 ;; Some macros are provided to ease the parsing of this list.
68 ;; Whitespace is preserved. Fixme: There should be a tree-walker that
69 ;; can remove it.
67 70
68 ;;; Code: 71 ;;; Code:
72
73 ;; Note that {buffer-substring,match-string}-no-properties were
74 ;; formerly used in several places, but that removes composition info.
69 75
70 ;;******************************************************************* 76 ;;*******************************************************************
71 ;;** 77 ;;**
72 ;;** Macros to parse the list 78 ;;** Macros to parse the list
73 ;;** 79 ;;**
112 ;;** 118 ;;**
113 ;;** Creating the list 119 ;;** Creating the list
114 ;;** 120 ;;**
115 ;;******************************************************************* 121 ;;*******************************************************************
116 122
123 ;;;###autoload
117 (defun xml-parse-file (file &optional parse-dtd) 124 (defun xml-parse-file (file &optional parse-dtd)
118 "Parse the well-formed XML FILE. 125 "Parse the well-formed XML file FILE.
119 If FILE is already edited, this will keep the buffer alive. 126 If FILE is already visited, use its buffer and don't kill it.
120 Returns the top node with all its children. 127 Returns the top node with all its children.
121 If PARSE-DTD is non-nil, the DTD is parsed rather than skipped." 128 If PARSE-DTD is non-nil, the DTD is parsed rather than skipped."
122 (let ((keep)) 129 (let ((keep))
123 (if (get-file-buffer file) 130 (if (get-file-buffer file)
124 (progn 131 (progn
125 (set-buffer (get-file-buffer file)) 132 (set-buffer (get-file-buffer file))
126 (setq keep (point))) 133 (setq keep (point)))
127 (find-file file)) 134 (let (auto-mode-alist) ; no need for xml-mode
135 (find-file file)))
128 136
129 (let ((xml (xml-parse-region (point-min) 137 (let ((xml (xml-parse-region (point-min)
130 (point-max) 138 (point-max)
131 (current-buffer) 139 (current-buffer)
132 parse-dtd))) 140 parse-dtd)))
133 (if keep 141 (if keep
134 (goto-char keep) 142 (goto-char keep)
135 (kill-buffer (current-buffer))) 143 (kill-buffer (current-buffer)))
136 xml))) 144 xml)))
137 145
146 ;; Note that this is setup so that we can do whitespace-skipping with
147 ;; `(skip-syntax-forward " ")', inter alia. Previously this was slow
148 ;; compared with `re-search-forward', but that has been fixed. Also
149 ;; note that the standard syntax table contains other characters with
150 ;; whitespace syntax, like NBSP, but they are invalid in contexts in
151 ;; which we might skip whitespace -- specifically, they're not
152 ;; NameChars [XML 4].
153
154 (defvar xml-syntax-table
155 (let ((table (make-syntax-table)))
156 ;; Get space syntax correct per XML [3].
157 (dotimes (c 31)
158 (modify-syntax-entry c "." table)) ; all are space in standard table
159 (dolist (c '(?\t ?\n ?\r)) ; these should be space
160 (modify-syntax-entry c " " table))
161 ;; For skipping attributes.
162 (modify-syntax-entry ?\" "\"" table)
163 (modify-syntax-entry ?' "\"" table)
164 ;; Non-alnum name chars should be symbol constituents (`-' and `_'
165 ;; are OK by default).
166 (modify-syntax-entry ?. "_" table)
167 (modify-syntax-entry ?: "_" table)
168 ;; XML [89]
169 (dolist (c '(#x00B7 #x02D0 #x02D1 #x0387 #x0640 #x0E46 #x0EC6 #x3005
170 #x3031 #x3032 #x3033 #x3034 #x3035 #x309D #x309E #x30FC
171 #x30FD #x30FE))
172 (modify-syntax-entry (decode-char 'ucs c) "w" table))
173 ;; Fixme: rest of [4]
174 table)
175 "Syntax table used by `xml-parse-region'.")
176
177 ;; XML [5]
178 ;; Note that [:alpha:] matches all multibyte chars with word syntax.
179 (defconst xml-name-regexp "[[:alpha:]_:][[:alnum:]._:-]*")
180
181 ;; Fixme: This needs re-writing to deal with the XML grammar properly, i.e.
182 ;; document ::= prolog element Misc*
183 ;; prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
184
185 ;;;###autoload
138 (defun xml-parse-region (beg end &optional buffer parse-dtd) 186 (defun xml-parse-region (beg end &optional buffer parse-dtd)
139 "Parse the region from BEG to END in BUFFER. 187 "Parse the region from BEG to END in BUFFER.
140 If BUFFER is nil, it defaults to the current buffer. 188 If BUFFER is nil, it defaults to the current buffer.
141 Returns the XML list for the region, or raises an error if the region 189 Returns the XML list for the region, or raises an error if the region
142 is not a well-formed XML file. 190 is not a well-formed XML file.
143 If PARSE-DTD is non-nil, the DTD is parsed rather than skipped, 191 If PARSE-DTD is non-nil, the DTD is parsed rather than skipped,
144 and returned as the first element of the list" 192 and returned as the first element of the list."
145 (let (xml result dtd) 193 (save-restriction
146 (save-excursion 194 (narrow-to-region beg end)
147 (if buffer 195 ;; Use fixed syntax table to ensure regexp char classes and syntax
148 (set-buffer buffer)) 196 ;; specs DTRT.
149 (goto-char beg) 197 (with-syntax-table (standard-syntax-table)
150 (while (< (point) end) 198 (let ((case-fold-search nil) ; XML is case-sensitive.
151 (if (search-forward "<" end t) 199 xml result dtd)
152 (progn 200 (save-excursion
153 (forward-char -1) 201 (if buffer
154 (if (null xml) 202 (set-buffer buffer))
155 (progn 203 (goto-char (point-min))
156 (setq result (xml-parse-tag end parse-dtd)) 204 (while (not (eobp))
205 (if (search-forward "<" nil t)
206 (progn
207 (forward-char -1)
208 (if xml
209 ;; translation of rule [1] of XML specifications
210 (error "XML files can have only one toplevel tag")
211 (setq result (xml-parse-tag parse-dtd))
157 (cond 212 (cond
158 ((null result)) 213 ((null result))
159 ((listp (car result)) 214 ((listp (car result))
160 (setq dtd (car result)) 215 (setq dtd (car result))
161 (add-to-list 'xml (cdr result))) 216 (if (cdr result) ; possible leading comment
217 (add-to-list 'xml (cdr result))))
162 (t 218 (t
163 (add-to-list 'xml result)))) 219 (add-to-list 'xml result)))))
164 220 (goto-char (point-max))))
165 ;; translation of rule [1] of XML specifications 221 (if parse-dtd
166 (error "XML files can have only one toplevel tag"))) 222 (cons dtd (nreverse xml))
167 (goto-char end))) 223 (nreverse xml)))))))
168 (if parse-dtd 224
169 (cons dtd (reverse xml)) 225
170 (reverse xml))))) 226 (defun xml-parse-tag (&optional parse-dtd)
171 227 "Parse the tag at point.
172
173 (defun xml-parse-tag (end &optional parse-dtd)
174 "Parse the tag that is just in front of point.
175 The end tag must be found before the position END in the current buffer.
176 If PARSE-DTD is non-nil, the DTD of the document, if any, is parsed and 228 If PARSE-DTD is non-nil, the DTD of the document, if any, is parsed and
177 returned as the first element in the list. 229 returned as the first element in the list.
178 Returns one of: 230 Returns one of:
179 - a list : the matching node 231 - a list : the matching node
180 - nil : the point is not looking at a tag. 232 - nil : the point is not looking at a tag.
181 - a cons cell: the first element is the DTD, the second is the node" 233 - a pair : the first element is the DTD, the second is the node."
182 (cond 234 (cond
183 ;; Processing instructions (like the <?xml version="1.0"?> tag at the 235 ;; Processing instructions (like the <?xml version="1.0"?> tag at the
184 ;; beginning of a document) 236 ;; beginning of a document).
185 ((looking-at "<\\?") 237 ((looking-at "<\\?")
186 (search-forward "?>" end) 238 (search-forward "?>")
187 (goto-char (- (re-search-forward "[^ \t\n\r]") 1)) 239 (skip-syntax-forward " ")
188 (xml-parse-tag end)) 240 (xml-parse-tag parse-dtd))
189 ;; Character data (CDATA) sections, in which no tag should be interpreted 241 ;; Character data (CDATA) sections, in which no tag should be interpreted
190 ((looking-at "<!\\[CDATA\\[") 242 ((looking-at "<!\\[CDATA\\[")
191 (let ((pos (match-end 0))) 243 (let ((pos (match-end 0)))
192 (unless (search-forward "]]>" end t) 244 (unless (search-forward "]]>" nil t)
193 (error "CDATA section does not end anywhere in the document")) 245 (error "CDATA section does not end anywhere in the document"))
194 (buffer-substring-no-properties pos (match-beginning 0)))) 246 (buffer-substring pos (match-beginning 0))))
195 ;; DTD for the document 247 ;; DTD for the document
196 ((looking-at "<!DOCTYPE") 248 ((looking-at "<!DOCTYPE")
197 (let (dtd) 249 (let (dtd)
198 (if parse-dtd 250 (if parse-dtd
199 (setq dtd (xml-parse-dtd end)) 251 (setq dtd (xml-parse-dtd))
200 (xml-skip-dtd end)) 252 (xml-skip-dtd))
201 (goto-char (- (re-search-forward "[^ \t\n\r]") 1)) 253 (skip-syntax-forward " ")
202 (if dtd 254 (if dtd
203 (cons dtd (xml-parse-tag end)) 255 (cons dtd (xml-parse-tag))
204 (xml-parse-tag end)))) 256 (xml-parse-tag))))
205 ;; skip comments 257 ;; skip comments
206 ((looking-at "<!--") 258 ((looking-at "<!--")
207 (search-forward "-->" end) 259 (search-forward "-->")
208 nil) 260 nil)
209 ;; end tag 261 ;; end tag
210 ((looking-at "</") 262 ((looking-at "</")
211 '()) 263 '())
212 ;; opening tag 264 ;; opening tag
213 ((looking-at "<\\([^/> \t\n\r]+\\)") 265 ((looking-at "<\\([^/>[:space:]]+\\)")
214 (goto-char (match-end 1)) 266 (goto-char (match-end 1))
215 (let* ((case-fold-search nil) ;; XML is case-sensitive. 267 (let* ((node-name (match-string 1))
216 (node-name (match-string 1))
217 ;; Parse the attribute list. 268 ;; Parse the attribute list.
218 (children (list (xml-parse-attlist end) (intern node-name))) 269 (children (list (xml-parse-attlist) (intern node-name)))
219 pos) 270 pos)
220 271
221 ;; is this an empty element ? 272 ;; is this an empty element ?
222 (if (looking-at "/[ \t\n\r]*>") 273 (if (looking-at "/>")
223 (progn 274 (progn
224 (forward-char 2) 275 (forward-char 2)
276 ;; Fixme: Inconsistent with the nil content returned from
277 ;; `<tag></tag>'.
225 (nreverse (cons '("") children))) 278 (nreverse (cons '("") children)))
226 279
227 ;; is this a valid start tag ? 280 ;; is this a valid start tag ?
228 (if (eq (char-after) ?>) 281 (if (eq (char-after) ?>)
229 (progn 282 (progn
230 (forward-char 1) 283 (forward-char 1)
231 ;; Now check that we have the right end-tag. Note that this 284 ;; Now check that we have the right end-tag. Note that this
232 ;; one might contain spaces after the tag name 285 ;; one might contain spaces after the tag name
233 (while (not (looking-at (concat "</" node-name "[ \t\n\r]*>"))) 286 (let ((end (concat "</" node-name "\\s-*>")))
234 (cond 287 (while (not (looking-at end))
235 ((looking-at "</") 288 (cond
236 (error (concat 289 ((looking-at "</")
237 "XML: invalid syntax -- invalid end tag (expecting " 290 (error "XML: Invalid end tag (expecting %s) at pos %d"
238 node-name 291 node-name (point)))
239 ") at pos " (number-to-string (point))))) 292 ((= (char-after) ?<)
240 ((= (char-after) ?<) 293 (let ((tag (xml-parse-tag)))
241 (let ((tag (xml-parse-tag end))) 294 (when tag
242 (when tag 295 (push tag children))))
243 (push tag children)))) 296 (t
244 (t 297 (setq pos (point))
245 (setq pos (point)) 298 (search-forward "<")
246 (search-forward "<" end) 299 (forward-char -1)
247 (forward-char -1) 300 (let ((string (buffer-substring pos (point)))
248 (let ((string (buffer-substring-no-properties pos (point))) 301 (pos 0))
249 (pos 0)) 302
250 303 ;; Clean up the string. As per XML
251 ;; Clean up the string. As per XML 304 ;; specifications, the XML processor should
252 ;; specifications, the XML processor should 305 ;; always pass the whole string to the
253 ;; always pass the whole string to the 306 ;; application. But \r's should be replaced:
254 ;; application. But \r's should be replaced: 307 ;; http://www.w3.org/TR/2000/REC-xml-20001006#sec-line-ends
255 ;; http://www.w3.org/TR/2000/REC-xml-20001006#sec-line-ends 308 (while (string-match "\r\n?" string pos)
256 (while (string-match "\r\n?" string pos) 309 (setq string (replace-match "\n" t t string))
257 (setq string (replace-match "\n" t t string)) 310 (setq pos (1+ (match-beginning 0))))
258 (setq pos (1+ (match-beginning 0)))) 311
259 312 (setq string (xml-substitute-special string))
260 (setq string (xml-substitute-special string)) 313 (setq children
261 (setq children 314 (if (stringp (car children))
262 (if (stringp (car children)) 315 ;; The two strings were separated by a comment.
263 ;; The two strings were separated by a comment. 316 (cons (concat (car children) string)
264 (cons (concat (car children) string) 317 (cdr children))
265 (cdr children)) 318 (cons string children))))))))
266 (cons string children))))))) 319
267 (goto-char (match-end 0)) 320 (goto-char (match-end 0))
268 (if (> (point) end)
269 (error "XML: End tag for %s not found before end of region"
270 node-name))
271 (nreverse children)) 321 (nreverse children))
272
273 ;; This was an invalid start tag 322 ;; This was an invalid start tag
274 (error "XML: Invalid attribute list") 323 (error "XML: Invalid attribute list")))))
275 ))))
276 (t ;; This is not a tag. 324 (t ;; This is not a tag.
277 (error "XML: Invalid character")) 325 (error "XML: Invalid character"))))
278 )) 326
279 327 (defun xml-parse-attlist ()
280 (defun xml-parse-attlist (end) 328 "Return the attribute-list after point.
281 "Return the attribute-list that point is looking at. 329 Leave point at the first non-blank character after the tag."
282 The search for attributes end at the position END in the current buffer.
283 Leaves the point on the first non-blank character after the tag."
284 (let ((attlist ()) 330 (let ((attlist ())
285 start-pos name) 331 start-pos name)
286 (goto-char (- (re-search-forward "[^ \t\n\r]") 1)) 332 (skip-syntax-forward " ")
287 (while (looking-at "\\([a-zA-Z_:][-a-zA-Z0-9._:]*\\)[ \t\n\r]*=[ \t\n\r]*") 333 (while (looking-at (eval-when-compile
334 (concat "\\(" xml-name-regexp "\\)\\s-*=\\s-*")))
288 (setq name (intern (match-string 1))) 335 (setq name (intern (match-string 1)))
289 (goto-char (match-end 0)) 336 (goto-char (match-end 0))
290 337
291 ;; See also: http://www.w3.org/TR/2000/REC-xml-20001006#AVNormalize 338 ;; See also: http://www.w3.org/TR/2000/REC-xml-20001006#AVNormalize
292 339
302 (if (assoc name attlist) 349 (if (assoc name attlist)
303 (error "XML: each attribute must be unique within an element")) 350 (error "XML: each attribute must be unique within an element"))
304 351
305 ;; Multiple whitespace characters should be replaced with a single one 352 ;; Multiple whitespace characters should be replaced with a single one
306 ;; in the attributes 353 ;; in the attributes
307 (let ((string (match-string-no-properties 1)) 354 (let ((string (match-string 1))
308 (pos 0)) 355 (pos 0))
309 (while (string-match "[ \t\n\r]+" string pos) 356 (replace-regexp-in-string "\\s-\\{2,\\}" " " string)
310 (setq string (replace-match " " t nil string))
311 (setq pos (1+ (match-beginning 0))))
312 (push (cons name (xml-substitute-special string)) attlist)) 357 (push (cons name (xml-substitute-special string)) attlist))
313 358
314 (goto-char start-pos) 359 (goto-char start-pos)
315 (if (looking-at "\"\\([^\"]*\\)\"") 360 (forward-sexp) ; we have string syntax
316 (goto-char (match-end 0)) 361
317 (if (looking-at "'\\([^']*\\)'") 362 (skip-syntax-forward " "))
318 (goto-char (match-end 0))))
319
320 (goto-char (- (re-search-forward "[^ \t\n\r]") 1))
321 (if (> (point) end)
322 (error "XML: end of attribute list not found before end of region")))
323 (nreverse attlist))) 363 (nreverse attlist)))
324 364
325 ;;******************************************************************* 365 ;;*******************************************************************
326 ;;** 366 ;;**
327 ;;** The DTD (document type declaration) 367 ;;** The DTD (document type declaration)
328 ;;** The following functions know how to skip or parse the DTD of 368 ;;** The following functions know how to skip or parse the DTD of
329 ;;** a document 369 ;;** a document
330 ;;** 370 ;;**
331 ;;******************************************************************* 371 ;;*******************************************************************
332 372
333 (defun xml-skip-dtd (end) 373 ;; Fixme: This fails at least if the DTD contains conditional sections.
334 "Skip the DTD that point is looking at. 374
335 The DTD must end before the position END in the current buffer. 375 (defun xml-skip-dtd ()
336 The point must be just before the starting tag of the DTD. 376 "Skip the DTD at point.
337 This follows the rule [28] in the XML specifications." 377 This follows the rule [28] in the XML specifications."
338 (forward-char (length "<!DOCTYPE")) 378 (forward-char (length "<!DOCTYPE"))
339 (if (looking-at "[ \t\n\r]*>") 379 (if (looking-at "\\s-*>")
340 (error "XML: invalid DTD (excepting name of the document)")) 380 (error "XML: invalid DTD (excepting name of the document)"))
341 (condition-case nil 381 (condition-case nil
342 (progn 382 (progn
343 (forward-word 1) 383 (forward-sexp)
344 (goto-char (- (re-search-forward "[ \t\n\r]") 1)) 384 (skip-syntax-forward " ")
345 (goto-char (- (re-search-forward "[^ \t\n\r]") 1))
346 (if (looking-at "\\[") 385 (if (looking-at "\\[")
347 (re-search-forward "\\][ \t\n\r]*>" end) 386 (re-search-forward "]\\s-*>")
348 (search-forward ">" end))) 387 (search-forward ">")))
349 (error (error "XML: No end to the DTD")))) 388 (error (error "XML: No end to the DTD"))))
350 389
351 (defun xml-parse-dtd (end) 390 (defun xml-parse-dtd ()
352 "Parse the DTD that point is looking at. 391 "Parse the DTD at point."
353 The DTD must end before the position END in the current buffer." 392 (forward-char (eval-when-compile (length "<!DOCTYPE")))
354 (forward-char (length "<!DOCTYPE")) 393 (skip-syntax-forward " ")
355 (goto-char (- (re-search-forward "[^ \t\n\r]") 1))
356 (if (looking-at ">") 394 (if (looking-at ">")
357 (error "XML: invalid DTD (excepting name of the document)")) 395 (error "XML: invalid DTD (excepting name of the document)"))
358 396
359 ;; Get the name of the document 397 ;; Get the name of the document
360 (looking-at "\\sw+") 398 (looking-at xml-name-regexp)
361 (let ((dtd (list (match-string-no-properties 0) 'dtd)) 399 (let ((dtd (list (match-string 0) 'dtd))
362 type element end-pos) 400 type element end-pos)
363 (goto-char (match-end 0)) 401 (goto-char (match-end 0))
364 402
365 (goto-char (- (re-search-forward "[^ \t\n\r]") 1)) 403 (skip-syntax-forward " ")
366 404 ;; XML [75]
367 ;; External DTDs => don't know how to handle them yet 405 (cond ((looking-at "PUBLIC\\s-+")
368 (if (looking-at "SYSTEM") 406 (goto-char (match-end 0))
369 (error "XML: Don't know how to handle external DTDs")) 407 (unless (or (re-search-forward
370 408 "\\=\"\\([[:space:][:alnum:]-'()+,./:=?;!*#@$_%]*\\)\""
371 (if (not (= (char-after) ?\[)) 409 nil t)
372 (error "XML: Unknown declaration in the DTD")) 410 (re-search-forward
373 411 "\\='\\([[:space:][:alnum:]-()+,./:=?;!*#@$_%]*\\)'"
374 ;; Parse the rest of the DTD 412 nil t))
375 (forward-char 1) 413 (error "XML: missing public id"))
376 (while (and (not (looking-at "[ \t\n\r]*\\]")) 414 (let ((pubid (match-string 1)))
377 (<= (point) end)) 415 (unless (or (re-search-forward "\\='\\([^']*\\)'" nil t)
378 (cond 416 (re-search-forward "\\=\"\\([^\"]*\\)\"" nil t))
379 417 (error "XML: missing system id"))
380 ;; Translation of rule [45] of XML specifications 418 (push (list pubid (match-string 1) 'public) dtd)))
381 ((looking-at 419 ((looking-at "SYSTEM\\s-+")
382 "[ \t\n\r]*<!ELEMENT[ \t\n\r]+\\([a-zA-Z0-9.%;]+\\)[ \t\n\r]+\\([^>]+\\)>") 420 (goto-char (match-end 0))
383 421 (unless (or (re-search-forward "\\='\\([^']*\\)'" nil t)
384 (setq element (intern (match-string-no-properties 1)) 422 (re-search-forward "\\=\"\\([^\"]*\\)\"" nil t))
385 type (match-string-no-properties 2)) 423 (error "XML: missing system id"))
386 (setq end-pos (match-end 0)) 424 (push (list (match-string 1) 'system) dtd)))
387 425 (skip-syntax-forward " ")
388 ;; Translation of rule [46] of XML specifications 426 (if (eq ?> (char-after))
389 (cond 427 (forward-char)
390 ((string-match "^EMPTY[ \t\n\r]*$" type) ;; empty declaration 428 (skip-syntax-forward " ")
391 (setq type 'empty)) 429 (if (not (eq (char-after) ?\[))
392 ((string-match "^ANY[ \t\n\r]*$" type) ;; any type of contents 430 (error "XML: bad DTD")
393 (setq type 'any)) 431 (forward-char)
394 ((string-match "^(\\(.*\\))[ \t\n\r]*$" type) ;; children ([47]) 432 ;; Parse the rest of the DTD
395 (setq type (xml-parse-elem-type (match-string-no-properties 1 type)))) 433 ;; Fixme: Deal with ENTITY, ATTLIST, NOTATION, PIs.
396 ((string-match "^%[^;]+;[ \t\n\r]*$" type) ;; substitution 434 (while (not (looking-at "\\s-*\\]"))
397 nil) 435 (skip-syntax-forward " ")
398 (t 436 (cond
399 (error "XML: Invalid element type in the DTD"))) 437
400 438 ;; Translation of rule [45] of XML specifications
401 ;; rule [45]: the element declaration must be unique 439 ((looking-at
402 (if (assoc element dtd) 440 "<!ELEMENT\\s-+\\([[:alnum:].%;]+\\)\\s-+\\([^>]+\\)>")
403 (error "XML: elements declaration must be unique in a DTD (<%s>)" 441
404 (symbol-name element))) 442 (setq element (intern (match-string 1))
405 443 type (match-string-no-properties 2))
406 ;; Store the element in the DTD 444 (setq end-pos (match-end 0))
407 (push (list element type) dtd) 445
408 (goto-char end-pos)) 446 ;; Translation of rule [46] of XML specifications
409 447 (cond
410 448 ((string-match "^EMPTY[ \t\n\r]*$" type) ;; empty declaration
411 (t 449 (setq type 'empty))
412 (error "XML: Invalid DTD item")) 450 ((string-match "^ANY[ \t\n\r]*$" type) ;; any type of contents
413 ) 451 (setq type 'any))
414 ) 452 ((string-match "^(\\(.*\\))[ \t\n\r]*$" type) ;; children ([47])
415 453 (setq type (xml-parse-elem-type (match-string 1 type))))
416 ;; Skip the end of the DTD 454 ((string-match "^%[^;]+;[ \t\n\r]*$" type) ;; substitution
417 (search-forward ">" end) 455 nil)
456 (t
457 (error "XML: Invalid element type in the DTD")))
458
459 ;; rule [45]: the element declaration must be unique
460 (if (assoc element dtd)
461 (error "XML: element declarations must be unique in a DTD (<%s>)"
462 (symbol-name element)))
463
464 ;; Store the element in the DTD
465 (push (list element type) dtd)
466 (goto-char end-pos))
467 ((looking-at "<!--")
468 (search-forward "-->"))
469
470 (t
471 (error "XML: Invalid DTD item")))
472
473 ;; Skip the end of the DTD
474 (search-forward ">"))))
418 (nreverse dtd))) 475 (nreverse dtd)))
419 476
420 477
421 (defun xml-parse-elem-type (string) 478 (defun xml-parse-elem-type (string)
422 "Convert a STRING for an element type into an elisp structure." 479 "Convert element type STRING into a Lisp structure."
423 480
424 (let (elem modifier) 481 (let (elem modifier)
425 (if (string-match "(\\([^)]+\\))\\([+*?]?\\)" string) 482 (if (string-match "(\\([^)]+\\))\\([+*?]?\\)" string)
426 (progn 483 (progn
427 (setq elem (match-string 1 string) 484 (setq elem (match-string 1 string)
431 (mapcar 'xml-parse-elem-type 488 (mapcar 'xml-parse-elem-type
432 (split-string elem "|")))) 489 (split-string elem "|"))))
433 (if (string-match "," elem) 490 (if (string-match "," elem)
434 (setq elem (cons 'seq 491 (setq elem (cons 'seq
435 (mapcar 'xml-parse-elem-type 492 (mapcar 'xml-parse-elem-type
436 (split-string elem ",")))) 493 (split-string elem ",")))))))
437 )))
438 (if (string-match "[ \t\n\r]*\\([^+*?]+\\)\\([+*?]?\\)" string) 494 (if (string-match "[ \t\n\r]*\\([^+*?]+\\)\\([+*?]?\\)" string)
439 (setq elem (match-string 1 string) 495 (setq elem (match-string 1 string)
440 modifier (match-string 2 string)))) 496 modifier (match-string 2 string))))
441 497
442 (if (and (stringp elem) (string= elem "#PCDATA")) 498 (if (and (stringp elem) (string= elem "#PCDATA"))
452 (t 508 (t
453 elem)))) 509 elem))))
454 510
455 ;;******************************************************************* 511 ;;*******************************************************************
456 ;;** 512 ;;**
457 ;;** Converting code points to strings
458 ;;**
459 ;;*******************************************************************
460
461 (defun xml-ucs-to-string (codepoint)
462 "Return a string representation of CODEPOINT. If it can't be
463 converted, return '?'."
464 (cond ((boundp 'decode-char)
465 (char-to-string (decode-char 'ucs codepoint)))
466 ((and (< codepoint 128)
467 (> codepoint 31))
468 (char-to-string codepoint))
469 (t "?"))) ; FIXME: There's gotta be a better way to
470 ; designate an unknown character.
471
472 ;;*******************************************************************
473 ;;**
474 ;;** Substituting special XML sequences 513 ;;** Substituting special XML sequences
475 ;;** 514 ;;**
476 ;;******************************************************************* 515 ;;*******************************************************************
477 516
517 (eval-when-compile
518 (defvar str)) ; dynamic from replace-regexp-in-string
519
520 ;; Fixme: Take declared entities from the DTD when they're available.
521 (defun xml-substitute-entity (match)
522 "Subroutine of xml-substitute-special."
523 (save-match-data
524 (let ((match1 (match-string 1 str)))
525 (cond ((string= match1 "lt") "<")
526 ((string= match1 "gt") ">")
527 ((string= match1 "apos") "'")
528 ((string= match1 "quot") "\"")
529 ((string= match1 "amp") "&")
530 ((and (string-match "#\\([0-9]+\\)" match1)
531 (let ((c (decode-char
532 'ucs
533 (string-to-number (match-string 1 match1)))))
534 (if c (string c))))) ; else unrepresentable
535 ((and (string-match "#x\\([[:xdigit:]]+\\)" match1)
536 (let ((c (decode-char
537 'ucs
538 (string-to-number (match-string 1 match1) 16))))
539 (if c (string c)))))
540 ;; Default to asis. Arguably, unrepresentable code points
541 ;; might be best replaced with U+FFFD.
542 (t match)))))
543
478 (defun xml-substitute-special (string) 544 (defun xml-substitute-special (string)
479 "Return STRING, after subsituting special XML sequences." 545 "Return STRING, after subsituting entity references."
480 (while (string-match "&lt;" string) 546 ;; This originally made repeated passes through the string from the
481 (setq string (replace-match "<" t nil string))) 547 ;; beginning, which isn't correct, since then either "&amp;amp;" or
482 (while (string-match "&gt;" string) 548 ;; "&#38;amp;" won't DTRT.
483 (setq string (replace-match ">" t nil string))) 549 (replace-regexp-in-string "&\\([^;]+\\);"
484 (while (string-match "&apos;" string) 550 #'xml-substitute-entity string t t))
485 (setq string (replace-match "'" t nil string)))
486 (while (string-match "&quot;" string)
487 (setq string (replace-match "\"" t nil string)))
488 (while (string-match "&#\\([0-9]+\\);" string)
489 (setq string (replace-match (xml-ucs-to-string
490 (string-to-number
491 (match-string-no-properties 1 string)))
492 t nil string)))
493 (while (string-match "&#x\\([0-9a-fA-F]+\\);" string)
494 (setq string (replace-match (xml-ucs-to-string
495 (string-to-number
496 (match-string-no-properties 1 string)
497 16))
498 t nil string)))
499
500 ;; This goes last so it doesn't confuse the matches above.
501 (while (string-match "&amp;" string)
502 (setq string (replace-match "&" t nil string)))
503 string)
504 551
505 ;;******************************************************************* 552 ;;*******************************************************************
506 ;;** 553 ;;**
507 ;;** Printing a tree. 554 ;;** Printing a tree.
508 ;;** This function is intended mainly for debugging purposes. 555 ;;** This function is intended mainly for debugging purposes.
513 (dolist (node xml) 560 (dolist (node xml)
514 (xml-debug-print-internal node ""))) 561 (xml-debug-print-internal node "")))
515 562
516 (defun xml-debug-print-internal (xml indent-string) 563 (defun xml-debug-print-internal (xml indent-string)
517 "Outputs the XML tree in the current buffer. 564 "Outputs the XML tree in the current buffer.
518 The first line indented with INDENT-STRING." 565 The first line is indented with INDENT-STRING."
519 (let ((tree xml) 566 (let ((tree xml)
520 attlist) 567 attlist)
521 (insert indent-string "<" (symbol-name (xml-node-name tree))) 568 (insert indent-string ?< (symbol-name (xml-node-name tree)))
522 569
523 ;; output the attribute list 570 ;; output the attribute list
524 (setq attlist (xml-node-attributes tree)) 571 (setq attlist (xml-node-attributes tree))
525 (while attlist 572 (while attlist
526 (insert " ") 573 (insert ?\ (symbol-name (caar attlist)) "=\"" (cdar attlist) ?\")
527 (insert (symbol-name (caar attlist)) "=\"" (cdar attlist) "\"")
528 (setq attlist (cdr attlist))) 574 (setq attlist (cdr attlist)))
529 575
530 (insert ">") 576 (insert ?>)
531 577
532 (setq tree (xml-node-children tree)) 578 (setq tree (xml-node-children tree))
533 579
534 ;; output the children 580 ;; output the children
535 (dolist (node tree) 581 (dolist (node tree)
536 (cond 582 (cond
537 ((listp node) 583 ((listp node)
538 (insert "\n") 584 (insert ?\n)
539 (xml-debug-print-internal node (concat indent-string " "))) 585 (xml-debug-print-internal node (concat indent-string " ")))
540 ((stringp node) (insert node)) 586 ((stringp node) (insert node))
541 (t 587 (t
542 (error "Invalid XML tree")))) 588 (error "Invalid XML tree"))))
543 589
544 (insert "\n" indent-string 590 (insert ?\n indent-string
545 "</" (symbol-name (xml-node-name xml)) ">"))) 591 ?< ?/ (symbol-name (xml-node-name xml)) ?>)))
546 592
547 (provide 'xml) 593 (provide 'xml)
548 594
549 ;;; xml.el ends here 595 ;;; xml.el ends here