Mercurial > emacs
comparison lisp/xml.el @ 51102:d604b76d3bbd
Doc fixes.
(xml-parse-file, xml-parse-region): Autoload.
(xml-syntax-table, xml-name-regexp): New.
(xml-parse-region): Narrow to region, set syntax-table and
case-fold-search. Reject fewer valid documents.
(xml-parse-tag): Remove arg END. Callers changed.
(xml-parse-tag): Use skip-syntax-forward. Use PARSE-DTD arg
properly. Don't use buffer-substring-no-properties. Don't bind
case-fold-search. Fix syntax for empty elements. Hoist consing
of end-of-tag regexp out of loop.
(xml-parse-attlist): Remove arg. Callers changed. Use
skip-syntax-forward, replace-regexp-in-string, forward-sexp.
Allow non-ASCII names.
(xml-skip-dtd): Remove arg. Callers changed. Change matching
code.
(xml-parse-dtd): Grok external DTDs. Allow non-ASCII. Don't use
match-string-no-properties.
(xml-ucs-to-string): Deleted.
(xml-substitute-entity): New.
(xml-substitute-special): Use it.
(xml-debug-print-internal): Simplify insertions.
(xml-parse-file): Avoid finding file in xml-mode.
author | Dave Love <fx@gnu.org> |
---|---|
date | Mon, 19 May 2003 16:01:39 +0000 |
parents | 575aa6820adc |
children | aac5eaf1454e |
comparison
equal
deleted
inserted
replaced
51101:821f85e23a1f | 51102:d604b76d3bbd |
---|---|
1 ;;; xml.el --- XML parser | 1 ;;; xml.el --- XML parser |
2 | 2 |
3 ;; Copyright (C) 2000, 2001 Free Software Foundation, Inc. | 3 ;; Copyright (C) 2000, 2001, 2003 Free Software Foundation, Inc. |
4 | 4 |
5 ;; Author: Emmanuel Briot <briot@gnat.com> | 5 ;; Author: Emmanuel Briot <briot@gnat.com> |
6 ;; Maintainer: Emmanuel Briot <briot@gnat.com> | 6 ;; Maintainer: FSF |
7 ;; Keywords: xml | 7 ;; Keywords: xml, data |
8 | 8 |
9 ;; This file is part of GNU Emacs. | 9 ;; This file is part of GNU Emacs. |
10 | 10 |
11 ;; GNU Emacs is free software; you can redistribute it and/or modify | 11 ;; GNU Emacs is free software; you can redistribute it and/or modify |
12 ;; it under the terms of the GNU General Public License as published by | 12 ;; it under the terms of the GNU General Public License as published by |
23 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, | 23 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
24 ;; Boston, MA 02111-1307, USA. | 24 ;; Boston, MA 02111-1307, USA. |
25 | 25 |
26 ;;; Commentary: | 26 ;;; Commentary: |
27 | 27 |
28 ;; This file contains a full XML parser. It parses a file, and returns a list | 28 ;; This file contains a somewhat incomplete non-validating XML parser. It |
29 ;; that can be used internally by any other lisp file. | 29 ;; parses a file, and returns a list that can be used internally by |
30 ;; See some example in todo.el | 30 ;; any other lisp libraries. |
31 | 31 |
32 ;;; FILE FORMAT | 32 ;;; FILE FORMAT |
33 | 33 |
34 ;; It does not parse the DTD, if present in the XML file, but knows how to | 34 ;; The document type declaration may either be ignored or (optionally) |
35 ;; ignore it. The XML file is assumed to be well-formed. In case of error, the | 35 ;; parsed, but currently the parsing will only accept element |
36 ;; parsing stops and the XML file is shown where the parsing stopped. | 36 ;; declarations. The XML file is assumed to be well-formed. In case |
37 ;; of error, the parsing stops and the XML file is shown where the | |
38 ;; parsing stopped. | |
37 ;; | 39 ;; |
38 ;; It also knows how to ignore comments, as well as the special ?xml? tag | 40 ;; It also knows how to ignore comments and processing instructions. |
39 ;; in the XML file. | |
40 ;; | 41 ;; |
41 ;; The XML file should have the following format: | 42 ;; The XML file should have the following format: |
42 ;; <node1 attr1="name1" attr2="name2" ...>value | 43 ;; <node1 attr1="name1" attr2="name2" ...>value |
43 ;; <node2 attr3="name3" attr4="name4">value2</node2> | 44 ;; <node2 attr3="name3" attr4="name4">value2</node2> |
44 ;; <node3 attr5="name5" attr6="name6">value3</node3> | 45 ;; <node3 attr5="name5" attr6="name6">value3</node3> |
61 ;; tag_name ::= string | 62 ;; tag_name ::= string |
62 ;; attribute_list ::= (("attribute" . "value") ("attribute" . "value") ...) | 63 ;; attribute_list ::= (("attribute" . "value") ("attribute" . "value") ...) |
63 ;; | nil | 64 ;; | nil |
64 ;; string ::= "..." | 65 ;; string ::= "..." |
65 ;; | 66 ;; |
66 ;; Some macros are provided to ease the parsing of this list | 67 ;; Some macros are provided to ease the parsing of this list. |
68 ;; Whitespace is preserved. Fixme: There should be a tree-walker that | |
69 ;; can remove it. | |
67 | 70 |
68 ;;; Code: | 71 ;;; Code: |
72 | |
73 ;; Note that {buffer-substring,match-string}-no-properties were | |
74 ;; formerly used in several places, but that removes composition info. | |
69 | 75 |
70 ;;******************************************************************* | 76 ;;******************************************************************* |
71 ;;** | 77 ;;** |
72 ;;** Macros to parse the list | 78 ;;** Macros to parse the list |
73 ;;** | 79 ;;** |
112 ;;** | 118 ;;** |
113 ;;** Creating the list | 119 ;;** Creating the list |
114 ;;** | 120 ;;** |
115 ;;******************************************************************* | 121 ;;******************************************************************* |
116 | 122 |
123 ;;;###autoload | |
117 (defun xml-parse-file (file &optional parse-dtd) | 124 (defun xml-parse-file (file &optional parse-dtd) |
118 "Parse the well-formed XML FILE. | 125 "Parse the well-formed XML file FILE. |
119 If FILE is already edited, this will keep the buffer alive. | 126 If FILE is already visited, use its buffer and don't kill it. |
120 Returns the top node with all its children. | 127 Returns the top node with all its children. |
121 If PARSE-DTD is non-nil, the DTD is parsed rather than skipped." | 128 If PARSE-DTD is non-nil, the DTD is parsed rather than skipped." |
122 (let ((keep)) | 129 (let ((keep)) |
123 (if (get-file-buffer file) | 130 (if (get-file-buffer file) |
124 (progn | 131 (progn |
125 (set-buffer (get-file-buffer file)) | 132 (set-buffer (get-file-buffer file)) |
126 (setq keep (point))) | 133 (setq keep (point))) |
127 (find-file file)) | 134 (let (auto-mode-alist) ; no need for xml-mode |
135 (find-file file))) | |
128 | 136 |
129 (let ((xml (xml-parse-region (point-min) | 137 (let ((xml (xml-parse-region (point-min) |
130 (point-max) | 138 (point-max) |
131 (current-buffer) | 139 (current-buffer) |
132 parse-dtd))) | 140 parse-dtd))) |
133 (if keep | 141 (if keep |
134 (goto-char keep) | 142 (goto-char keep) |
135 (kill-buffer (current-buffer))) | 143 (kill-buffer (current-buffer))) |
136 xml))) | 144 xml))) |
137 | 145 |
146 ;; Note that this is setup so that we can do whitespace-skipping with | |
147 ;; `(skip-syntax-forward " ")', inter alia. Previously this was slow | |
148 ;; compared with `re-search-forward', but that has been fixed. Also | |
149 ;; note that the standard syntax table contains other characters with | |
150 ;; whitespace syntax, like NBSP, but they are invalid in contexts in | |
151 ;; which we might skip whitespace -- specifically, they're not | |
152 ;; NameChars [XML 4]. | |
153 | |
154 (defvar xml-syntax-table | |
155 (let ((table (make-syntax-table))) | |
156 ;; Get space syntax correct per XML [3]. | |
157 (dotimes (c 31) | |
158 (modify-syntax-entry c "." table)) ; all are space in standard table | |
159 (dolist (c '(?\t ?\n ?\r)) ; these should be space | |
160 (modify-syntax-entry c " " table)) | |
161 ;; For skipping attributes. | |
162 (modify-syntax-entry ?\" "\"" table) | |
163 (modify-syntax-entry ?' "\"" table) | |
164 ;; Non-alnum name chars should be symbol constituents (`-' and `_' | |
165 ;; are OK by default). | |
166 (modify-syntax-entry ?. "_" table) | |
167 (modify-syntax-entry ?: "_" table) | |
168 ;; XML [89] | |
169 (dolist (c '(#x00B7 #x02D0 #x02D1 #x0387 #x0640 #x0E46 #x0EC6 #x3005 | |
170 #x3031 #x3032 #x3033 #x3034 #x3035 #x309D #x309E #x30FC | |
171 #x30FD #x30FE)) | |
172 (modify-syntax-entry (decode-char 'ucs c) "w" table)) | |
173 ;; Fixme: rest of [4] | |
174 table) | |
175 "Syntax table used by `xml-parse-region'.") | |
176 | |
177 ;; XML [5] | |
178 ;; Note that [:alpha:] matches all multibyte chars with word syntax. | |
179 (defconst xml-name-regexp "[[:alpha:]_:][[:alnum:]._:-]*") | |
180 | |
181 ;; Fixme: This needs re-writing to deal with the XML grammar properly, i.e. | |
182 ;; document ::= prolog element Misc* | |
183 ;; prolog ::= XMLDecl? Misc* (doctypedecl Misc*)? | |
184 | |
185 ;;;###autoload | |
138 (defun xml-parse-region (beg end &optional buffer parse-dtd) | 186 (defun xml-parse-region (beg end &optional buffer parse-dtd) |
139 "Parse the region from BEG to END in BUFFER. | 187 "Parse the region from BEG to END in BUFFER. |
140 If BUFFER is nil, it defaults to the current buffer. | 188 If BUFFER is nil, it defaults to the current buffer. |
141 Returns the XML list for the region, or raises an error if the region | 189 Returns the XML list for the region, or raises an error if the region |
142 is not a well-formed XML file. | 190 is not a well-formed XML file. |
143 If PARSE-DTD is non-nil, the DTD is parsed rather than skipped, | 191 If PARSE-DTD is non-nil, the DTD is parsed rather than skipped, |
144 and returned as the first element of the list" | 192 and returned as the first element of the list." |
145 (let (xml result dtd) | 193 (save-restriction |
146 (save-excursion | 194 (narrow-to-region beg end) |
147 (if buffer | 195 ;; Use fixed syntax table to ensure regexp char classes and syntax |
148 (set-buffer buffer)) | 196 ;; specs DTRT. |
149 (goto-char beg) | 197 (with-syntax-table (standard-syntax-table) |
150 (while (< (point) end) | 198 (let ((case-fold-search nil) ; XML is case-sensitive. |
151 (if (search-forward "<" end t) | 199 xml result dtd) |
152 (progn | 200 (save-excursion |
153 (forward-char -1) | 201 (if buffer |
154 (if (null xml) | 202 (set-buffer buffer)) |
155 (progn | 203 (goto-char (point-min)) |
156 (setq result (xml-parse-tag end parse-dtd)) | 204 (while (not (eobp)) |
205 (if (search-forward "<" nil t) | |
206 (progn | |
207 (forward-char -1) | |
208 (if xml | |
209 ;; translation of rule [1] of XML specifications | |
210 (error "XML files can have only one toplevel tag") | |
211 (setq result (xml-parse-tag parse-dtd)) | |
157 (cond | 212 (cond |
158 ((null result)) | 213 ((null result)) |
159 ((listp (car result)) | 214 ((listp (car result)) |
160 (setq dtd (car result)) | 215 (setq dtd (car result)) |
161 (add-to-list 'xml (cdr result))) | 216 (if (cdr result) ; possible leading comment |
217 (add-to-list 'xml (cdr result)))) | |
162 (t | 218 (t |
163 (add-to-list 'xml result)))) | 219 (add-to-list 'xml result))))) |
164 | 220 (goto-char (point-max)))) |
165 ;; translation of rule [1] of XML specifications | 221 (if parse-dtd |
166 (error "XML files can have only one toplevel tag"))) | 222 (cons dtd (nreverse xml)) |
167 (goto-char end))) | 223 (nreverse xml))))))) |
168 (if parse-dtd | 224 |
169 (cons dtd (reverse xml)) | 225 |
170 (reverse xml))))) | 226 (defun xml-parse-tag (&optional parse-dtd) |
171 | 227 "Parse the tag at point. |
172 | |
173 (defun xml-parse-tag (end &optional parse-dtd) | |
174 "Parse the tag that is just in front of point. | |
175 The end tag must be found before the position END in the current buffer. | |
176 If PARSE-DTD is non-nil, the DTD of the document, if any, is parsed and | 228 If PARSE-DTD is non-nil, the DTD of the document, if any, is parsed and |
177 returned as the first element in the list. | 229 returned as the first element in the list. |
178 Returns one of: | 230 Returns one of: |
179 - a list : the matching node | 231 - a list : the matching node |
180 - nil : the point is not looking at a tag. | 232 - nil : the point is not looking at a tag. |
181 - a cons cell: the first element is the DTD, the second is the node" | 233 - a pair : the first element is the DTD, the second is the node." |
182 (cond | 234 (cond |
183 ;; Processing instructions (like the <?xml version="1.0"?> tag at the | 235 ;; Processing instructions (like the <?xml version="1.0"?> tag at the |
184 ;; beginning of a document) | 236 ;; beginning of a document). |
185 ((looking-at "<\\?") | 237 ((looking-at "<\\?") |
186 (search-forward "?>" end) | 238 (search-forward "?>") |
187 (goto-char (- (re-search-forward "[^ \t\n\r]") 1)) | 239 (skip-syntax-forward " ") |
188 (xml-parse-tag end)) | 240 (xml-parse-tag parse-dtd)) |
189 ;; Character data (CDATA) sections, in which no tag should be interpreted | 241 ;; Character data (CDATA) sections, in which no tag should be interpreted |
190 ((looking-at "<!\\[CDATA\\[") | 242 ((looking-at "<!\\[CDATA\\[") |
191 (let ((pos (match-end 0))) | 243 (let ((pos (match-end 0))) |
192 (unless (search-forward "]]>" end t) | 244 (unless (search-forward "]]>" nil t) |
193 (error "CDATA section does not end anywhere in the document")) | 245 (error "CDATA section does not end anywhere in the document")) |
194 (buffer-substring-no-properties pos (match-beginning 0)))) | 246 (buffer-substring pos (match-beginning 0)))) |
195 ;; DTD for the document | 247 ;; DTD for the document |
196 ((looking-at "<!DOCTYPE") | 248 ((looking-at "<!DOCTYPE") |
197 (let (dtd) | 249 (let (dtd) |
198 (if parse-dtd | 250 (if parse-dtd |
199 (setq dtd (xml-parse-dtd end)) | 251 (setq dtd (xml-parse-dtd)) |
200 (xml-skip-dtd end)) | 252 (xml-skip-dtd)) |
201 (goto-char (- (re-search-forward "[^ \t\n\r]") 1)) | 253 (skip-syntax-forward " ") |
202 (if dtd | 254 (if dtd |
203 (cons dtd (xml-parse-tag end)) | 255 (cons dtd (xml-parse-tag)) |
204 (xml-parse-tag end)))) | 256 (xml-parse-tag)))) |
205 ;; skip comments | 257 ;; skip comments |
206 ((looking-at "<!--") | 258 ((looking-at "<!--") |
207 (search-forward "-->" end) | 259 (search-forward "-->") |
208 nil) | 260 nil) |
209 ;; end tag | 261 ;; end tag |
210 ((looking-at "</") | 262 ((looking-at "</") |
211 '()) | 263 '()) |
212 ;; opening tag | 264 ;; opening tag |
213 ((looking-at "<\\([^/> \t\n\r]+\\)") | 265 ((looking-at "<\\([^/>[:space:]]+\\)") |
214 (goto-char (match-end 1)) | 266 (goto-char (match-end 1)) |
215 (let* ((case-fold-search nil) ;; XML is case-sensitive. | 267 (let* ((node-name (match-string 1)) |
216 (node-name (match-string 1)) | |
217 ;; Parse the attribute list. | 268 ;; Parse the attribute list. |
218 (children (list (xml-parse-attlist end) (intern node-name))) | 269 (children (list (xml-parse-attlist) (intern node-name))) |
219 pos) | 270 pos) |
220 | 271 |
221 ;; is this an empty element ? | 272 ;; is this an empty element ? |
222 (if (looking-at "/[ \t\n\r]*>") | 273 (if (looking-at "/>") |
223 (progn | 274 (progn |
224 (forward-char 2) | 275 (forward-char 2) |
276 ;; Fixme: Inconsistent with the nil content returned from | |
277 ;; `<tag></tag>'. | |
225 (nreverse (cons '("") children))) | 278 (nreverse (cons '("") children))) |
226 | 279 |
227 ;; is this a valid start tag ? | 280 ;; is this a valid start tag ? |
228 (if (eq (char-after) ?>) | 281 (if (eq (char-after) ?>) |
229 (progn | 282 (progn |
230 (forward-char 1) | 283 (forward-char 1) |
231 ;; Now check that we have the right end-tag. Note that this | 284 ;; Now check that we have the right end-tag. Note that this |
232 ;; one might contain spaces after the tag name | 285 ;; one might contain spaces after the tag name |
233 (while (not (looking-at (concat "</" node-name "[ \t\n\r]*>"))) | 286 (let ((end (concat "</" node-name "\\s-*>"))) |
234 (cond | 287 (while (not (looking-at end)) |
235 ((looking-at "</") | 288 (cond |
236 (error (concat | 289 ((looking-at "</") |
237 "XML: invalid syntax -- invalid end tag (expecting " | 290 (error "XML: Invalid end tag (expecting %s) at pos %d" |
238 node-name | 291 node-name (point))) |
239 ") at pos " (number-to-string (point))))) | 292 ((= (char-after) ?<) |
240 ((= (char-after) ?<) | 293 (let ((tag (xml-parse-tag))) |
241 (let ((tag (xml-parse-tag end))) | 294 (when tag |
242 (when tag | 295 (push tag children)))) |
243 (push tag children)))) | 296 (t |
244 (t | 297 (setq pos (point)) |
245 (setq pos (point)) | 298 (search-forward "<") |
246 (search-forward "<" end) | 299 (forward-char -1) |
247 (forward-char -1) | 300 (let ((string (buffer-substring pos (point))) |
248 (let ((string (buffer-substring-no-properties pos (point))) | 301 (pos 0)) |
249 (pos 0)) | 302 |
250 | 303 ;; Clean up the string. As per XML |
251 ;; Clean up the string. As per XML | 304 ;; specifications, the XML processor should |
252 ;; specifications, the XML processor should | 305 ;; always pass the whole string to the |
253 ;; always pass the whole string to the | 306 ;; application. But \r's should be replaced: |
254 ;; application. But \r's should be replaced: | 307 ;; http://www.w3.org/TR/2000/REC-xml-20001006#sec-line-ends |
255 ;; http://www.w3.org/TR/2000/REC-xml-20001006#sec-line-ends | 308 (while (string-match "\r\n?" string pos) |
256 (while (string-match "\r\n?" string pos) | 309 (setq string (replace-match "\n" t t string)) |
257 (setq string (replace-match "\n" t t string)) | 310 (setq pos (1+ (match-beginning 0)))) |
258 (setq pos (1+ (match-beginning 0)))) | 311 |
259 | 312 (setq string (xml-substitute-special string)) |
260 (setq string (xml-substitute-special string)) | 313 (setq children |
261 (setq children | 314 (if (stringp (car children)) |
262 (if (stringp (car children)) | 315 ;; The two strings were separated by a comment. |
263 ;; The two strings were separated by a comment. | 316 (cons (concat (car children) string) |
264 (cons (concat (car children) string) | 317 (cdr children)) |
265 (cdr children)) | 318 (cons string children)))))))) |
266 (cons string children))))))) | 319 |
267 (goto-char (match-end 0)) | 320 (goto-char (match-end 0)) |
268 (if (> (point) end) | |
269 (error "XML: End tag for %s not found before end of region" | |
270 node-name)) | |
271 (nreverse children)) | 321 (nreverse children)) |
272 | |
273 ;; This was an invalid start tag | 322 ;; This was an invalid start tag |
274 (error "XML: Invalid attribute list") | 323 (error "XML: Invalid attribute list"))))) |
275 )))) | |
276 (t ;; This is not a tag. | 324 (t ;; This is not a tag. |
277 (error "XML: Invalid character")) | 325 (error "XML: Invalid character")))) |
278 )) | 326 |
279 | 327 (defun xml-parse-attlist () |
280 (defun xml-parse-attlist (end) | 328 "Return the attribute-list after point. |
281 "Return the attribute-list that point is looking at. | 329 Leave point at the first non-blank character after the tag." |
282 The search for attributes end at the position END in the current buffer. | |
283 Leaves the point on the first non-blank character after the tag." | |
284 (let ((attlist ()) | 330 (let ((attlist ()) |
285 start-pos name) | 331 start-pos name) |
286 (goto-char (- (re-search-forward "[^ \t\n\r]") 1)) | 332 (skip-syntax-forward " ") |
287 (while (looking-at "\\([a-zA-Z_:][-a-zA-Z0-9._:]*\\)[ \t\n\r]*=[ \t\n\r]*") | 333 (while (looking-at (eval-when-compile |
334 (concat "\\(" xml-name-regexp "\\)\\s-*=\\s-*"))) | |
288 (setq name (intern (match-string 1))) | 335 (setq name (intern (match-string 1))) |
289 (goto-char (match-end 0)) | 336 (goto-char (match-end 0)) |
290 | 337 |
291 ;; See also: http://www.w3.org/TR/2000/REC-xml-20001006#AVNormalize | 338 ;; See also: http://www.w3.org/TR/2000/REC-xml-20001006#AVNormalize |
292 | 339 |
302 (if (assoc name attlist) | 349 (if (assoc name attlist) |
303 (error "XML: each attribute must be unique within an element")) | 350 (error "XML: each attribute must be unique within an element")) |
304 | 351 |
305 ;; Multiple whitespace characters should be replaced with a single one | 352 ;; Multiple whitespace characters should be replaced with a single one |
306 ;; in the attributes | 353 ;; in the attributes |
307 (let ((string (match-string-no-properties 1)) | 354 (let ((string (match-string 1)) |
308 (pos 0)) | 355 (pos 0)) |
309 (while (string-match "[ \t\n\r]+" string pos) | 356 (replace-regexp-in-string "\\s-\\{2,\\}" " " string) |
310 (setq string (replace-match " " t nil string)) | |
311 (setq pos (1+ (match-beginning 0)))) | |
312 (push (cons name (xml-substitute-special string)) attlist)) | 357 (push (cons name (xml-substitute-special string)) attlist)) |
313 | 358 |
314 (goto-char start-pos) | 359 (goto-char start-pos) |
315 (if (looking-at "\"\\([^\"]*\\)\"") | 360 (forward-sexp) ; we have string syntax |
316 (goto-char (match-end 0)) | 361 |
317 (if (looking-at "'\\([^']*\\)'") | 362 (skip-syntax-forward " ")) |
318 (goto-char (match-end 0)))) | |
319 | |
320 (goto-char (- (re-search-forward "[^ \t\n\r]") 1)) | |
321 (if (> (point) end) | |
322 (error "XML: end of attribute list not found before end of region"))) | |
323 (nreverse attlist))) | 363 (nreverse attlist))) |
324 | 364 |
325 ;;******************************************************************* | 365 ;;******************************************************************* |
326 ;;** | 366 ;;** |
327 ;;** The DTD (document type declaration) | 367 ;;** The DTD (document type declaration) |
328 ;;** The following functions know how to skip or parse the DTD of | 368 ;;** The following functions know how to skip or parse the DTD of |
329 ;;** a document | 369 ;;** a document |
330 ;;** | 370 ;;** |
331 ;;******************************************************************* | 371 ;;******************************************************************* |
332 | 372 |
333 (defun xml-skip-dtd (end) | 373 ;; Fixme: This fails at least if the DTD contains conditional sections. |
334 "Skip the DTD that point is looking at. | 374 |
335 The DTD must end before the position END in the current buffer. | 375 (defun xml-skip-dtd () |
336 The point must be just before the starting tag of the DTD. | 376 "Skip the DTD at point. |
337 This follows the rule [28] in the XML specifications." | 377 This follows the rule [28] in the XML specifications." |
338 (forward-char (length "<!DOCTYPE")) | 378 (forward-char (length "<!DOCTYPE")) |
339 (if (looking-at "[ \t\n\r]*>") | 379 (if (looking-at "\\s-*>") |
340 (error "XML: invalid DTD (excepting name of the document)")) | 380 (error "XML: invalid DTD (excepting name of the document)")) |
341 (condition-case nil | 381 (condition-case nil |
342 (progn | 382 (progn |
343 (forward-word 1) | 383 (forward-sexp) |
344 (goto-char (- (re-search-forward "[ \t\n\r]") 1)) | 384 (skip-syntax-forward " ") |
345 (goto-char (- (re-search-forward "[^ \t\n\r]") 1)) | |
346 (if (looking-at "\\[") | 385 (if (looking-at "\\[") |
347 (re-search-forward "\\][ \t\n\r]*>" end) | 386 (re-search-forward "]\\s-*>") |
348 (search-forward ">" end))) | 387 (search-forward ">"))) |
349 (error (error "XML: No end to the DTD")))) | 388 (error (error "XML: No end to the DTD")))) |
350 | 389 |
351 (defun xml-parse-dtd (end) | 390 (defun xml-parse-dtd () |
352 "Parse the DTD that point is looking at. | 391 "Parse the DTD at point." |
353 The DTD must end before the position END in the current buffer." | 392 (forward-char (eval-when-compile (length "<!DOCTYPE"))) |
354 (forward-char (length "<!DOCTYPE")) | 393 (skip-syntax-forward " ") |
355 (goto-char (- (re-search-forward "[^ \t\n\r]") 1)) | |
356 (if (looking-at ">") | 394 (if (looking-at ">") |
357 (error "XML: invalid DTD (excepting name of the document)")) | 395 (error "XML: invalid DTD (excepting name of the document)")) |
358 | 396 |
359 ;; Get the name of the document | 397 ;; Get the name of the document |
360 (looking-at "\\sw+") | 398 (looking-at xml-name-regexp) |
361 (let ((dtd (list (match-string-no-properties 0) 'dtd)) | 399 (let ((dtd (list (match-string 0) 'dtd)) |
362 type element end-pos) | 400 type element end-pos) |
363 (goto-char (match-end 0)) | 401 (goto-char (match-end 0)) |
364 | 402 |
365 (goto-char (- (re-search-forward "[^ \t\n\r]") 1)) | 403 (skip-syntax-forward " ") |
366 | 404 ;; XML [75] |
367 ;; External DTDs => don't know how to handle them yet | 405 (cond ((looking-at "PUBLIC\\s-+") |
368 (if (looking-at "SYSTEM") | 406 (goto-char (match-end 0)) |
369 (error "XML: Don't know how to handle external DTDs")) | 407 (unless (or (re-search-forward |
370 | 408 "\\=\"\\([[:space:][:alnum:]-'()+,./:=?;!*#@$_%]*\\)\"" |
371 (if (not (= (char-after) ?\[)) | 409 nil t) |
372 (error "XML: Unknown declaration in the DTD")) | 410 (re-search-forward |
373 | 411 "\\='\\([[:space:][:alnum:]-()+,./:=?;!*#@$_%]*\\)'" |
374 ;; Parse the rest of the DTD | 412 nil t)) |
375 (forward-char 1) | 413 (error "XML: missing public id")) |
376 (while (and (not (looking-at "[ \t\n\r]*\\]")) | 414 (let ((pubid (match-string 1))) |
377 (<= (point) end)) | 415 (unless (or (re-search-forward "\\='\\([^']*\\)'" nil t) |
378 (cond | 416 (re-search-forward "\\=\"\\([^\"]*\\)\"" nil t)) |
379 | 417 (error "XML: missing system id")) |
380 ;; Translation of rule [45] of XML specifications | 418 (push (list pubid (match-string 1) 'public) dtd))) |
381 ((looking-at | 419 ((looking-at "SYSTEM\\s-+") |
382 "[ \t\n\r]*<!ELEMENT[ \t\n\r]+\\([a-zA-Z0-9.%;]+\\)[ \t\n\r]+\\([^>]+\\)>") | 420 (goto-char (match-end 0)) |
383 | 421 (unless (or (re-search-forward "\\='\\([^']*\\)'" nil t) |
384 (setq element (intern (match-string-no-properties 1)) | 422 (re-search-forward "\\=\"\\([^\"]*\\)\"" nil t)) |
385 type (match-string-no-properties 2)) | 423 (error "XML: missing system id")) |
386 (setq end-pos (match-end 0)) | 424 (push (list (match-string 1) 'system) dtd))) |
387 | 425 (skip-syntax-forward " ") |
388 ;; Translation of rule [46] of XML specifications | 426 (if (eq ?> (char-after)) |
389 (cond | 427 (forward-char) |
390 ((string-match "^EMPTY[ \t\n\r]*$" type) ;; empty declaration | 428 (skip-syntax-forward " ") |
391 (setq type 'empty)) | 429 (if (not (eq (char-after) ?\[)) |
392 ((string-match "^ANY[ \t\n\r]*$" type) ;; any type of contents | 430 (error "XML: bad DTD") |
393 (setq type 'any)) | 431 (forward-char) |
394 ((string-match "^(\\(.*\\))[ \t\n\r]*$" type) ;; children ([47]) | 432 ;; Parse the rest of the DTD |
395 (setq type (xml-parse-elem-type (match-string-no-properties 1 type)))) | 433 ;; Fixme: Deal with ENTITY, ATTLIST, NOTATION, PIs. |
396 ((string-match "^%[^;]+;[ \t\n\r]*$" type) ;; substitution | 434 (while (not (looking-at "\\s-*\\]")) |
397 nil) | 435 (skip-syntax-forward " ") |
398 (t | 436 (cond |
399 (error "XML: Invalid element type in the DTD"))) | 437 |
400 | 438 ;; Translation of rule [45] of XML specifications |
401 ;; rule [45]: the element declaration must be unique | 439 ((looking-at |
402 (if (assoc element dtd) | 440 "<!ELEMENT\\s-+\\([[:alnum:].%;]+\\)\\s-+\\([^>]+\\)>") |
403 (error "XML: elements declaration must be unique in a DTD (<%s>)" | 441 |
404 (symbol-name element))) | 442 (setq element (intern (match-string 1)) |
405 | 443 type (match-string-no-properties 2)) |
406 ;; Store the element in the DTD | 444 (setq end-pos (match-end 0)) |
407 (push (list element type) dtd) | 445 |
408 (goto-char end-pos)) | 446 ;; Translation of rule [46] of XML specifications |
409 | 447 (cond |
410 | 448 ((string-match "^EMPTY[ \t\n\r]*$" type) ;; empty declaration |
411 (t | 449 (setq type 'empty)) |
412 (error "XML: Invalid DTD item")) | 450 ((string-match "^ANY[ \t\n\r]*$" type) ;; any type of contents |
413 ) | 451 (setq type 'any)) |
414 ) | 452 ((string-match "^(\\(.*\\))[ \t\n\r]*$" type) ;; children ([47]) |
415 | 453 (setq type (xml-parse-elem-type (match-string 1 type)))) |
416 ;; Skip the end of the DTD | 454 ((string-match "^%[^;]+;[ \t\n\r]*$" type) ;; substitution |
417 (search-forward ">" end) | 455 nil) |
456 (t | |
457 (error "XML: Invalid element type in the DTD"))) | |
458 | |
459 ;; rule [45]: the element declaration must be unique | |
460 (if (assoc element dtd) | |
461 (error "XML: element declarations must be unique in a DTD (<%s>)" | |
462 (symbol-name element))) | |
463 | |
464 ;; Store the element in the DTD | |
465 (push (list element type) dtd) | |
466 (goto-char end-pos)) | |
467 ((looking-at "<!--") | |
468 (search-forward "-->")) | |
469 | |
470 (t | |
471 (error "XML: Invalid DTD item"))) | |
472 | |
473 ;; Skip the end of the DTD | |
474 (search-forward ">")))) | |
418 (nreverse dtd))) | 475 (nreverse dtd))) |
419 | 476 |
420 | 477 |
421 (defun xml-parse-elem-type (string) | 478 (defun xml-parse-elem-type (string) |
422 "Convert a STRING for an element type into an elisp structure." | 479 "Convert element type STRING into a Lisp structure." |
423 | 480 |
424 (let (elem modifier) | 481 (let (elem modifier) |
425 (if (string-match "(\\([^)]+\\))\\([+*?]?\\)" string) | 482 (if (string-match "(\\([^)]+\\))\\([+*?]?\\)" string) |
426 (progn | 483 (progn |
427 (setq elem (match-string 1 string) | 484 (setq elem (match-string 1 string) |
431 (mapcar 'xml-parse-elem-type | 488 (mapcar 'xml-parse-elem-type |
432 (split-string elem "|")))) | 489 (split-string elem "|")))) |
433 (if (string-match "," elem) | 490 (if (string-match "," elem) |
434 (setq elem (cons 'seq | 491 (setq elem (cons 'seq |
435 (mapcar 'xml-parse-elem-type | 492 (mapcar 'xml-parse-elem-type |
436 (split-string elem ",")))) | 493 (split-string elem ","))))))) |
437 ))) | |
438 (if (string-match "[ \t\n\r]*\\([^+*?]+\\)\\([+*?]?\\)" string) | 494 (if (string-match "[ \t\n\r]*\\([^+*?]+\\)\\([+*?]?\\)" string) |
439 (setq elem (match-string 1 string) | 495 (setq elem (match-string 1 string) |
440 modifier (match-string 2 string)))) | 496 modifier (match-string 2 string)))) |
441 | 497 |
442 (if (and (stringp elem) (string= elem "#PCDATA")) | 498 (if (and (stringp elem) (string= elem "#PCDATA")) |
452 (t | 508 (t |
453 elem)))) | 509 elem)))) |
454 | 510 |
455 ;;******************************************************************* | 511 ;;******************************************************************* |
456 ;;** | 512 ;;** |
457 ;;** Converting code points to strings | |
458 ;;** | |
459 ;;******************************************************************* | |
460 | |
461 (defun xml-ucs-to-string (codepoint) | |
462 "Return a string representation of CODEPOINT. If it can't be | |
463 converted, return '?'." | |
464 (cond ((boundp 'decode-char) | |
465 (char-to-string (decode-char 'ucs codepoint))) | |
466 ((and (< codepoint 128) | |
467 (> codepoint 31)) | |
468 (char-to-string codepoint)) | |
469 (t "?"))) ; FIXME: There's gotta be a better way to | |
470 ; designate an unknown character. | |
471 | |
472 ;;******************************************************************* | |
473 ;;** | |
474 ;;** Substituting special XML sequences | 513 ;;** Substituting special XML sequences |
475 ;;** | 514 ;;** |
476 ;;******************************************************************* | 515 ;;******************************************************************* |
477 | 516 |
517 (eval-when-compile | |
518 (defvar str)) ; dynamic from replace-regexp-in-string | |
519 | |
520 ;; Fixme: Take declared entities from the DTD when they're available. | |
521 (defun xml-substitute-entity (match) | |
522 "Subroutine of xml-substitute-special." | |
523 (save-match-data | |
524 (let ((match1 (match-string 1 str))) | |
525 (cond ((string= match1 "lt") "<") | |
526 ((string= match1 "gt") ">") | |
527 ((string= match1 "apos") "'") | |
528 ((string= match1 "quot") "\"") | |
529 ((string= match1 "amp") "&") | |
530 ((and (string-match "#\\([0-9]+\\)" match1) | |
531 (let ((c (decode-char | |
532 'ucs | |
533 (string-to-number (match-string 1 match1))))) | |
534 (if c (string c))))) ; else unrepresentable | |
535 ((and (string-match "#x\\([[:xdigit:]]+\\)" match1) | |
536 (let ((c (decode-char | |
537 'ucs | |
538 (string-to-number (match-string 1 match1) 16)))) | |
539 (if c (string c))))) | |
540 ;; Default to asis. Arguably, unrepresentable code points | |
541 ;; might be best replaced with U+FFFD. | |
542 (t match))))) | |
543 | |
478 (defun xml-substitute-special (string) | 544 (defun xml-substitute-special (string) |
479 "Return STRING, after subsituting special XML sequences." | 545 "Return STRING, after subsituting entity references." |
480 (while (string-match "<" string) | 546 ;; This originally made repeated passes through the string from the |
481 (setq string (replace-match "<" t nil string))) | 547 ;; beginning, which isn't correct, since then either "&amp;" or |
482 (while (string-match ">" string) | 548 ;; "&amp;" won't DTRT. |
483 (setq string (replace-match ">" t nil string))) | 549 (replace-regexp-in-string "&\\([^;]+\\);" |
484 (while (string-match "'" string) | 550 #'xml-substitute-entity string t t)) |
485 (setq string (replace-match "'" t nil string))) | |
486 (while (string-match """ string) | |
487 (setq string (replace-match "\"" t nil string))) | |
488 (while (string-match "&#\\([0-9]+\\);" string) | |
489 (setq string (replace-match (xml-ucs-to-string | |
490 (string-to-number | |
491 (match-string-no-properties 1 string))) | |
492 t nil string))) | |
493 (while (string-match "&#x\\([0-9a-fA-F]+\\);" string) | |
494 (setq string (replace-match (xml-ucs-to-string | |
495 (string-to-number | |
496 (match-string-no-properties 1 string) | |
497 16)) | |
498 t nil string))) | |
499 | |
500 ;; This goes last so it doesn't confuse the matches above. | |
501 (while (string-match "&" string) | |
502 (setq string (replace-match "&" t nil string))) | |
503 string) | |
504 | 551 |
505 ;;******************************************************************* | 552 ;;******************************************************************* |
506 ;;** | 553 ;;** |
507 ;;** Printing a tree. | 554 ;;** Printing a tree. |
508 ;;** This function is intended mainly for debugging purposes. | 555 ;;** This function is intended mainly for debugging purposes. |
513 (dolist (node xml) | 560 (dolist (node xml) |
514 (xml-debug-print-internal node ""))) | 561 (xml-debug-print-internal node ""))) |
515 | 562 |
516 (defun xml-debug-print-internal (xml indent-string) | 563 (defun xml-debug-print-internal (xml indent-string) |
517 "Outputs the XML tree in the current buffer. | 564 "Outputs the XML tree in the current buffer. |
518 The first line indented with INDENT-STRING." | 565 The first line is indented with INDENT-STRING." |
519 (let ((tree xml) | 566 (let ((tree xml) |
520 attlist) | 567 attlist) |
521 (insert indent-string "<" (symbol-name (xml-node-name tree))) | 568 (insert indent-string ?< (symbol-name (xml-node-name tree))) |
522 | 569 |
523 ;; output the attribute list | 570 ;; output the attribute list |
524 (setq attlist (xml-node-attributes tree)) | 571 (setq attlist (xml-node-attributes tree)) |
525 (while attlist | 572 (while attlist |
526 (insert " ") | 573 (insert ?\ (symbol-name (caar attlist)) "=\"" (cdar attlist) ?\") |
527 (insert (symbol-name (caar attlist)) "=\"" (cdar attlist) "\"") | |
528 (setq attlist (cdr attlist))) | 574 (setq attlist (cdr attlist))) |
529 | 575 |
530 (insert ">") | 576 (insert ?>) |
531 | 577 |
532 (setq tree (xml-node-children tree)) | 578 (setq tree (xml-node-children tree)) |
533 | 579 |
534 ;; output the children | 580 ;; output the children |
535 (dolist (node tree) | 581 (dolist (node tree) |
536 (cond | 582 (cond |
537 ((listp node) | 583 ((listp node) |
538 (insert "\n") | 584 (insert ?\n) |
539 (xml-debug-print-internal node (concat indent-string " "))) | 585 (xml-debug-print-internal node (concat indent-string " "))) |
540 ((stringp node) (insert node)) | 586 ((stringp node) (insert node)) |
541 (t | 587 (t |
542 (error "Invalid XML tree")))) | 588 (error "Invalid XML tree")))) |
543 | 589 |
544 (insert "\n" indent-string | 590 (insert ?\n indent-string |
545 "</" (symbol-name (xml-node-name xml)) ">"))) | 591 ?< ?/ (symbol-name (xml-node-name xml)) ?>))) |
546 | 592 |
547 (provide 'xml) | 593 (provide 'xml) |
548 | 594 |
549 ;;; xml.el ends here | 595 ;;; xml.el ends here |