diff lisp/nxml/nxml-parse.el @ 86361:38f93f3d00a2

Initial merge of nxml
author Mark A. Hershberger <mah@everybody.org>
date Fri, 23 Nov 2007 06:58:00 +0000
parents
children 2ac1a9b70580
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lisp/nxml/nxml-parse.el	Fri Nov 23 06:58:00 2007 +0000
@@ -0,0 +1,320 @@
+;;; nxml-parse.el --- XML parser, sharing infrastructure with nxml-mode
+
+;; Copyright (C) 2003 Free Software Foundation, Inc.
+
+;; Author: James Clark
+;; Keywords: XML
+
+;; This program is free software; you can redistribute it and/or
+;; modify it under the terms of the GNU General Public License as
+;; published by the Free Software Foundation; either version 2 of
+;; the License, or (at your option) any later version.
+
+;; This program is distributed in the hope that it will be
+;; useful, but WITHOUT ANY WARRANTY; without even the implied
+;; warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+;; PURPOSE.  See the GNU General Public License for more details.
+
+;; You should have received a copy of the GNU General Public
+;; License along with this program; if not, write to the Free
+;; Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+;; MA 02111-1307 USA
+
+;;; Commentary:
+
+;; Entry point is `nxml-parse-file'.
+
+;;; Code:
+
+(require 'nxml-util)
+(require 'xmltok)
+(require 'nxml-enc)
+(require 'nxml-ns)
+
+(defvar nxml-parse-file-name nil
+  "Name of the file currently being parsed by `nxml-parse-file'.
+It is bound for the duration of a parse and passed along as the file
+name whenever a parse error is signaled.")
+
+(defvar nxml-validate-function nil
+  "Nil or a function to be called by `nxml-parse-file' to perform validation.
+The function will be called once for each start-tag or end-tag.  The
+function is passed two arguments TEXT and START-TAG.  For a start-tag,
+START-TAG is a list (NAME ATTRIBUTES) where NAME and ATTRIBUTES are in
+the same form as returned by `nxml-parse-file'.  For an end-tag,
+START-TAG is nil.  TEXT is a string containing the text immediately
+preceding the tag, or nil if there was no such text.  An empty element
+is treated as a start-tag followed by an end-tag.
+
+For a start-tag, the namespace state will be the state after
+processing the namespace declarations in the start-tag.  For an
+end-tag, the namespace state will be the state before popping the
+namespace declarations for the corresponding start-tag.
+
+The function must return nil if no error is detected or a
+cons (MESSAGE . LOCATION) where MESSAGE is a string containing
+an error message and LOCATION indicates what caused the error
+as follows:
+
+- nil indicates the tag as a whole caused it; this is always allowed;
+
+- text indicates the text caused it; this is allowed only if
+TEXT is non-nil;
+
+- tag-close indicates the close of the tag caused it; this is
+allowed only if START-TAG is non-nil;
+
+- (attribute-name . N) indicates that the name of the Nth attribute
+caused it; N counts from 0; this is allowed only if START-TAG is non-nil
+and N must be less than the number of attributes;
+
+- (attribute-value . N) indicates that the value of the Nth attribute
+caused it; N counts from 0; this is allowed only if START-TAG is non-nil
+and N must be less than the number of attributes.")
+
+(defun nxml-parse-file (file)
+  "Parse the XML document in FILE and return it as a list.
+An XML element is represented as a list (NAME ATTRIBUTES . CHILDREN).
+NAME is either a string, in the case where the name does not have a
+namespace, or a cons (NAMESPACE . LOCAL-NAME), where NAMESPACE is a
+symbol and LOCAL-NAME is a string, in the case where the name does
+have a namespace.  NAMESPACE is a keyword whose name is `:URI', where
+URI is the namespace name.  ATTRIBUTES is an alist of attributes where
+each attribute has the form (NAME . VALUE), where NAME has the same
+form as an element name, and VALUE is a string.  A namespace
+declaration is represented as an attribute whose name is
+\(:http://www.w3.org/2000/xmlns/ . LOCAL-NAME).  CHILDREN is a list
+containing strings and child elements; CHILDREN never contains two
+consecutive strings and never contains an empty string.  Processing
+instructions and comments are not represented.  The return value is a
+list representing the document element.
+
+If the XML document is not well-formed, an error having the condition
+`nxml-file-parse-error' will be signaled; the error data will be a
+list of the form (FILE POSITION MESSAGE), where POSITION is an integer
+specifying the position where the error was detected, and MESSAGE is a
+string describing the error.
+
+The current contents of FILE will be parsed even if there is a
+modified buffer currently visiting FILE.
+
+If the variable `nxml-validate-function' is non-nil, it will be
+called twice for each element, and any reported error will be signaled
+in the same way as a well-formedness error."
+  (let ((parse-buffer (nxml-parse-find-file file)))
+    (unwind-protect
+        ;; `with-current-buffer' restores the caller's current buffer on
+        ;; both normal and non-local exit.
+        (with-current-buffer parse-buffer
+          (let ((nxml-parse-file-name file))
+            (nxml-parse-instance)))
+      ;; Kill the parse buffer by identity rather than "whatever buffer
+      ;; is current": the validation function runs arbitrary code and
+      ;; may leave a different buffer current, which must not be killed
+      ;; in place of the temporary one.
+      (when (buffer-live-p parse-buffer)
+        (kill-buffer parse-buffer)))))
+
+(defun nxml-parse-find-file (file)
+  "Read FILE into a newly created buffer and return that buffer.
+The file is inserted with `set-auto-coding-function' bound to
+`nxml-set-xml-coding'.  The current buffer is left unchanged.
+
+A new buffer is generated for every call, so a nested parse cannot
+overwrite the text of a parse that is still in progress.  If reading
+FILE signals an error, the buffer is killed before the error
+propagates; otherwise the caller is responsible for killing it."
+  (let ((buffer (generate-new-buffer " *nXML Parse*"))
+        (loaded nil))
+    (unwind-protect
+        (with-current-buffer buffer
+          (let ((set-auto-coding-function 'nxml-set-xml-coding))
+            (insert-file-contents file))
+          (setq loaded t)
+          buffer)
+      ;; Do not leave a stray buffer behind when FILE cannot be read.
+      (unless loaded
+        (kill-buffer buffer)))))
+      
+(defun nxml-parse-instance ()
+  "Parse the document in the current buffer; return the document element.
+The prolog is consumed with `xmltok-forward-prolog' and any tokenizer
+errors are reported through `nxml-check-xmltok-errors' before the
+instance itself is handed to `nxml-parse-instance-1'.  `xmltok-dtd'
+is bound to nil for the duration of the parse."
+  (let ((xmltok-dtd nil))
+    ;; `xmltok-save' and `nxml-ns-save' come from the required libraries;
+    ;; presumably they keep tokenizer and namespace state local to this
+    ;; parse -- confirm against xmltok.el and nxml-ns.el.
+    (xmltok-save
+      (xmltok-forward-prolog)
+      (nxml-check-xmltok-errors)
+      (nxml-ns-save
+        (nxml-parse-instance-1)))))
+
+(defun nxml-parse-instance-1 ()
+  "Parse the document instance and return the document element.
+Tokens are obtained by calling `xmltok-forward' until it returns nil.
+The return value has the form (NAME ATTRIBUTES . CHILDREN) described
+in `nxml-parse-file'.  `nxml-validate-tag' is called for every
+start-tag and end-tag; an empty element counts as both.
+
+An error is reported through `nxml-parse-error' for an unbalanced or
+extra end-tag, for more than one top-level element, for anything but
+whitespace at top level, for elements that are still open when the
+tokens run out, and for a missing document element."
+  (let* ((top (cons nil nil))
+         ;; TAIL is the last cons of the child list under construction;
+         ;; its cdr is nil and new children are appended destructively.
+         (tail top)
+         ;; Stack of saved tails, one for each open element.
+         tail-stack
+         ;; QNames of the open elements, innermost first.
+         open-element-tags
+         ;; Strings making up the pending text node, in reverse order.
+         text
+         ;; Buffer position where the pending text node begins.
+         text-pos)
+    (while (xmltok-forward)
+      (nxml-check-xmltok-errors)
+      (cond ((memq xmltok-type '(start-tag end-tag empty-element))
+             ;; Flush the pending text node into the current child list.
+             (when text
+               (setq text (apply 'concat (nreverse text)))
+               (if (equal text "")
+                   ;; For example an empty CDATA section contributes "".
+                   ;; The documented result never contains an empty
+                   ;; string, so treat this as "no text".
+                   (setq text nil)
+                 (setcdr tail (cons text nil))
+                 (setq tail (cdr tail))))
+             ;; Opening half: start-tag or empty-element.
+             (when (not (eq xmltok-type 'end-tag))
+               ;; Once the root has been closed TAIL no longer is TOP,
+               ;; so a second root is recognized here.
+               (when (and (not open-element-tags)
+                          (not (eq tail top)))
+                 (nxml-parse-error nil "Multiple top-level elements"))
+               (setq open-element-tags
+                     (cons (xmltok-start-tag-qname)
+                           open-element-tags))
+               (nxml-ns-push-state)
+               (let ((tag (nxml-parse-start-tag)))
+                 (nxml-validate-tag text text-pos tag)
+                 (setq text nil)
+                 ;; Append the element to its parent, remember where the
+                 ;; parent's list continues, and descend into the
+                 ;; element's own (so far empty) child list.
+                 (setcdr tail (cons tag nil))
+                 (setq tail (cdr tail))
+                 (setq tail-stack (cons tail tail-stack))
+                 (setq tail (last tag))))
+             ;; Closing half: end-tag or empty-element.
+             (when (not (eq xmltok-type 'start-tag))
+               (or (eq xmltok-type 'empty-element)
+                   (equal (car open-element-tags)
+                          (xmltok-end-tag-qname))
+                   (if open-element-tags
+                       (nxml-parse-error nil
+                                         "Unbalanced end-tag; expected </%s>"
+                                         (car open-element-tags))
+                     (nxml-parse-error nil "Extra end-tag")))
+               (nxml-validate-tag text text-pos nil)
+               (setq text nil)
+               (nxml-ns-pop-state)
+               (setq open-element-tags (cdr open-element-tags))
+               (setq tail (car tail-stack))
+               (setq tail-stack (cdr tail-stack)))
+             (setq text-pos nil))
+            ((memq xmltok-type '(space data entity-ref char-ref cdata-section))
+             (cond (open-element-tags
+                    (unless text-pos
+                      (setq text-pos xmltok-start))
+                    (setq text
+                          (cons (nxml-current-text-string) text)))
+                   ;; Outside the document element only whitespace is
+                   ;; tolerated; it is simply dropped.
+                   ((not (eq xmltok-type 'space))
+                    (nxml-parse-error
+                     nil
+                     "%s at top-level"
+                     (cdr (assq xmltok-type
+                                '((data . "Text characters")
+                                  (entity-ref . "Entity reference")
+                                  (char-ref . "Character reference")
+                                  (cdata-section . "CDATA section"))))))))))
+    ;; Running out of tokens while elements are still open means at
+    ;; least one end-tag is missing; the document is not well-formed.
+    (when open-element-tags
+      (nxml-parse-error (point-max)
+                        "Missing end-tag; expected </%s>"
+                        (car open-element-tags)))
+    (unless (cdr top)
+      (nxml-parse-error (point-max) "Missing document element"))
+    (cadr top)))
+
+(defun nxml-parse-start-tag ()
+  "Return the start-tag just scanned as a list (NAME ATTRIBUTES).
+The namespace declarations in `xmltok-namespace-attributes' are
+applied first, through `nxml-ns-set-default' and `nxml-ns-set-prefix',
+so that the element name and the attribute names are resolved against
+the updated namespace state.  ATTRIBUTES lists the ordinary attributes
+followed by the namespace declarations, each group in the order of
+`xmltok-attributes' and `xmltok-namespace-attributes' respectively.
+
+Errors are reported through `nxml-parse-error' for an invalid
+attribute value, a duplicate namespace declaration or attribute, an
+attempt to undeclare a prefix, and an undeclared prefix."
+  (let (ns-decls       ; parsed namespace declarations, newest first
+        plain-atts     ; parsed ordinary attributes, newest first
+        seen-prefixes  ; prefixes declared so far; nil is the default
+        element-name)
+    ;; Pass 1: namespace declarations.
+    (dolist (decl xmltok-namespace-attributes)
+      (let* ((uri (or (xmltok-attribute-value decl)
+                      (nxml-parse-error nil "Invalid attribute value")))
+             (ns (nxml-make-namespace uri))
+             ;; For a prefixed declaration the declared prefix is the
+             ;; attribute's local name; otherwise this is nil.
+             (declared (and (xmltok-attribute-prefix decl)
+                            (xmltok-attribute-local-name decl))))
+        (cond ((member declared seen-prefixes)
+               (nxml-parse-error nil "Duplicate namespace declaration"))
+              ((null declared)
+               (nxml-ns-set-default ns))
+              (ns
+               (nxml-ns-set-prefix declared ns))
+              (t
+               (nxml-parse-error nil "Cannot undeclare namespace prefix")))
+        (push declared seen-prefixes)
+        (push (cons (nxml-make-name nxml-xmlns-namespace-uri
+                                    (xmltok-attribute-local-name decl))
+                    uri)
+              ns-decls)))
+    ;; The element name: a prefixed name needs a declared prefix, an
+    ;; unprefixed one takes the default namespace.
+    (setq element-name
+          (nxml-make-name
+           (let ((tag-prefix (xmltok-start-tag-prefix)))
+             (cond (tag-prefix
+                    (or (nxml-ns-get-prefix tag-prefix)
+                        (nxml-parse-error (1+ xmltok-start)
+                                          "Prefix `%s' undeclared"
+                                          tag-prefix)))
+                   (t (nxml-ns-get-default))))
+           (xmltok-start-tag-local-name)))
+    ;; Pass 2: ordinary attributes.  An unprefixed attribute is in no
+    ;; namespace.
+    (dolist (att xmltok-attributes)
+      (let* ((att-prefix (xmltok-attribute-prefix att))
+             (att-ns (and att-prefix
+                          (or (nxml-ns-get-prefix att-prefix)
+                              (nxml-parse-error
+                               (xmltok-attribute-name-start att)
+                               "Prefix `%s' undeclared"
+                               att-prefix))))
+             (att-name (nxml-make-name att-ns
+                                       (xmltok-attribute-local-name att))))
+        (when (assoc att-name plain-atts)
+          (nxml-parse-error (xmltok-attribute-name-start att)
+                            "Duplicate attribute"))
+        (push (cons att-name
+                    (or (xmltok-attribute-value att)
+                        (nxml-parse-error nil "Invalid attribute value")))
+              plain-atts)))
+    ;; Both lists were accumulated newest first; put each back into
+    ;; document order, ordinary attributes before namespace declarations.
+    (list element-name
+          (nconc (nreverse plain-atts) (nreverse ns-decls)))))
+
+(defun nxml-validate-tag (text text-pos tag)
+  "Call `nxml-validate-function', if non-nil, for the current tag.
+TEXT is the text immediately preceding the tag, or nil, and TEXT-POS
+is the buffer position where that text starts.  TAG is the parsed
+start-tag, or nil for an end-tag.
+
+If the function returns non-nil, its value must be a cons
+\(MESSAGE . LOCATION).  LOCATION is translated to a buffer position by
+`nxml-validate-error-position' and MESSAGE is reported verbatim through
+`nxml-parse-error'.  A return value that is not a cons, or whose
+LOCATION cannot be translated, signals a plain error."
+  (when nxml-validate-function
+    (let ((err (funcall nxml-validate-function text tag))
+          pos)
+      (when err
+        (unless (consp err)
+          (error "Incorrect return value from %s"
+                 nxml-validate-function))
+        (setq pos (nxml-validate-error-position (cdr err)
+                                                (and text text-pos)
+                                                tag))
+        (or pos (error "Incorrect return value from %s"
+                       nxml-validate-function))
+        ;; MESSAGE is data, not a format control string: a `%' in it
+        ;; must be reported literally rather than interpreted.
+        (nxml-parse-error pos "%s" (car err))))))
+
+(defun nxml-validate-error-position (location text-pos tag)
+  "Return the buffer position designated by LOCATION, or nil if invalid.
+LOCATION is the cdr of the value returned by `nxml-validate-function':
+
+- nil designates the start of the current token, `xmltok-start';
+- `text' designates TEXT-POS, which is nil when there was no text;
+- `tag-close' designates the position one character before point, or
+  two for an empty element; it is valid only if TAG is non-nil;
+- (attribute-name . N) and (attribute-value . N) designate the start
+  of the name or value of the Nth attribute, counting from 0 through
+  `xmltok-attributes' and then `xmltok-namespace-attributes'.
+
+Nil is returned for any other LOCATION, including an N that is not a
+non-negative integer or that is out of range, so that the caller can
+diagnose an incorrect return value."
+  (cond ((null location) xmltok-start)
+        ((eq location 'text) text-pos)
+        ((eq location 'tag-close)
+         (and tag (- (point) (if (eq xmltok-type 'empty-element) 2 1))))
+        ((consp location)
+         (let* ((n (cdr location))
+                ;; Ordinary attributes are numbered first; indices past
+                ;; them continue into the namespace declarations.
+                (att (and (natnump n)
+                          (or (nth n xmltok-attributes)
+                              (nth (- n (length xmltok-attributes))
+                                   xmltok-namespace-attributes)))))
+           (cond ((null att) nil)       ; no such attribute
+                 ((eq (car location) 'attribute-name)
+                  (xmltok-attribute-name-start att))
+                 ((eq (car location) 'attribute-value)
+                  (xmltok-attribute-value-start att)))))))
+
+(defun nxml-make-name (ns local-name)
+  "Return the representation of the name LOCAL-NAME in namespace NS.
+When NS is nil the result is LOCAL-NAME itself; otherwise it is the
+cons (NS . LOCAL-NAME)."
+  (cond ((null ns) local-name)
+        (t (cons ns local-name))))
+
+(defun nxml-current-text-string ()
+  "Return the text contributed by the current token as a string.
+For `space' and `data' tokens this is the buffer text from
+`xmltok-start' to point.  For a `cdata-section' the first 9 and the
+last 3 characters of that text are dropped.  For `char-ref' and
+`entity-ref' it is `xmltok-replacement'; when that is nil an error is
+reported through `nxml-parse-error'.  Any other token type yields nil."
+  (let ((kind xmltok-type))
+    (cond ((eq kind 'cdata-section)
+           ;; Skip the 9-character opener and the 3-character closer.
+           (let ((content-start (+ xmltok-start 9))
+                 (content-end (- (point) 3)))
+             (buffer-substring-no-properties content-start content-end)))
+          ((or (eq kind 'space) (eq kind 'data))
+           (buffer-substring-no-properties xmltok-start (point)))
+          ((or (eq kind 'char-ref) (eq kind 'entity-ref))
+           (when (null xmltok-replacement)
+             (nxml-parse-error nil
+                               (if (eq kind 'char-ref)
+                                   "Reference to unsupported Unicode character"
+                                 "Unresolvable entity reference")))
+           xmltok-replacement))))
+
+(defun nxml-parse-error (position &rest args)
+  "Report a parse error in the file named by `nxml-parse-file-name'.
+POSITION is the buffer position of the error; when it is nil,
+`xmltok-start' is used instead.  ARGS are a format control string and
+its arguments, as for `format'.  The file name, the position and the
+formatted message are handed to `nxml-signal-file-parse-error'."
+  (let ((where (if position position xmltok-start))
+        (text (apply 'format args)))
+    (nxml-signal-file-parse-error nxml-parse-file-name where text)))
+
+(defun nxml-check-xmltok-errors ()
+  "Report a tokenizer error if `xmltok-errors' is non-nil.
+The last element of `xmltok-errors' is the one reported; its start
+position and message are handed to `nxml-signal-file-parse-error'
+together with `nxml-parse-file-name'.  Return nil when there are no
+errors."
+  (and xmltok-errors
+       ;; Presumably the list is built by pushing, which would make its
+       ;; last element the earliest error -- confirm in xmltok.el.
+       (let ((reported (car (last xmltok-errors))))
+         (nxml-signal-file-parse-error nxml-parse-file-name
+                                       (xmltok-error-start reported)
+                                       (xmltok-error-message reported)))))
+
+(provide 'nxml-parse)
+
+;;; nxml-parse.el ends here