comparison lisp/xml.el @ 76507:d1760553962c

* xml.el (xml-parse-tag, xml-parse-string, xml-parse-attlist) (xml-parse-dtd, xml-parse-elem-type, xml-substitute-special): Return to use of the -no-properties variants. There was consensus on emacs-devel that the speed of these variants was prefered since we are usually parsing files (from the internet or on disk) instead of XML created in Emacs.
author Chong Yidong <cyd@stupidchicken.com>
date Sat, 17 Mar 2007 18:55:52 +0000
parents e3694f1cb928
children 9355f9b7bbff c0409ee15cee
comparison
equal deleted inserted replaced
76506:c6e92edb9f1e 76507:d1760553962c
74 ;; * more complete DOCTYPE parsing 74 ;; * more complete DOCTYPE parsing
75 ;; * pi support 75 ;; * pi support
76 76
77 ;;; Code: 77 ;;; Code:
78 78
79 ;; Note that {buffer-substring,match-string}-no-properties were 79 ;; Note that buffer-substring and match-string were formerly used in
80 ;; formerly used in several places, but that removes composition info. 80 ;; several places, because the -no-properties variants remove
81 ;; composition info. However, after some discussion on emacs-devel,
82 ;; the consensus was that the speed of the -no-properties variants was
83 ;; a worthwhile tradeoff especially since we're usually parsing files
84 ;; instead of hand-crafted XML.
81 85
82 ;;******************************************************************* 86 ;;*******************************************************************
83 ;;** 87 ;;**
84 ;;** Macros to parse the list 88 ;;** Macros to parse the list
85 ;;** 89 ;;**
404 ((looking-at "<!\\[CDATA\\[") 408 ((looking-at "<!\\[CDATA\\[")
405 (let ((pos (match-end 0))) 409 (let ((pos (match-end 0)))
406 (unless (search-forward "]]>" nil t) 410 (unless (search-forward "]]>" nil t)
407 (error "XML: (Not Well Formed) CDATA section does not end anywhere in the document")) 411 (error "XML: (Not Well Formed) CDATA section does not end anywhere in the document"))
408 (concat 412 (concat
409 (buffer-substring pos (match-beginning 0)) 413 (buffer-substring-no-properties pos (match-beginning 0))
410 (xml-parse-string)))) 414 (xml-parse-string))))
411 ;; DTD for the document 415 ;; DTD for the document
412 ((looking-at "<!DOCTYPE") 416 ((looking-at "<!DOCTYPE")
413 (let ((dtd (xml-parse-dtd parse-ns))) 417 (let ((dtd (xml-parse-dtd parse-ns)))
414 (skip-syntax-forward " ") 418 (skip-syntax-forward " ")
425 ;; opening tag 429 ;; opening tag
426 ((looking-at "<\\([^/>[:space:]]+\\)") 430 ((looking-at "<\\([^/>[:space:]]+\\)")
427 (goto-char (match-end 1)) 431 (goto-char (match-end 1))
428 432
429 ;; Parse this node 433 ;; Parse this node
430 (let* ((node-name (match-string 1)) 434 (let* ((node-name (match-string-no-properties 1))
431 ;; Parse the attribute list. 435 ;; Parse the attribute list.
432 (attrs (xml-parse-attlist xml-ns)) 436 (attrs (xml-parse-attlist xml-ns))
433 children pos) 437 children pos)
434 438
435 ;; add the xmlns:* attrs to our cache 439 ;; add the xmlns:* attrs to our cache
478 482
479 (goto-char (match-end 0)) 483 (goto-char (match-end 0))
480 (nreverse children))) 484 (nreverse children)))
481 ;; This was an invalid start tag (Expected ">", but didn't see it.) 485 ;; This was an invalid start tag (Expected ">", but didn't see it.)
482 (error "XML: (Well-Formed) Couldn't parse tag: %s" 486 (error "XML: (Well-Formed) Couldn't parse tag: %s"
483 (buffer-substring (- (point) 10) (+ (point) 1))))))) 487 (buffer-substring-no-properties (- (point) 10) (+ (point) 1)))))))
484 (t ;; (Not one of PI, CDATA, Comment, End tag, or Start tag) 488 (t ;; (Not one of PI, CDATA, Comment, End tag, or Start tag)
485 (unless xml-sub-parser ; Usually, we error out. 489 (unless xml-sub-parser ; Usually, we error out.
486 (error "XML: (Well-Formed) Invalid character")) 490 (error "XML: (Well-Formed) Invalid character"))
487 491
488 ;; However, if we're parsing incrementally, then we need to deal 492 ;; However, if we're parsing incrementally, then we need to deal
493 "Parse the next whatever. Could be a string, or an element." 497 "Parse the next whatever. Could be a string, or an element."
494 (let* ((pos (point)) 498 (let* ((pos (point))
495 (string (progn (if (search-forward "<" nil t) 499 (string (progn (if (search-forward "<" nil t)
496 (forward-char -1) 500 (forward-char -1)
497 (goto-char (point-max))) 501 (goto-char (point-max)))
498 (buffer-substring pos (point))))) 502 (buffer-substring-no-properties pos (point)))))
499 ;; Clean up the string. As per XML specifications, the XML 503 ;; Clean up the string. As per XML specifications, the XML
500 ;; processor should always pass the whole string to the 504 ;; processor should always pass the whole string to the
501 ;; application. But \r's should be replaced: 505 ;; application. But \r's should be replaced:
502 ;; http://www.w3.org/TR/2000/REC-xml-20001006#sec-line-ends 506 ;; http://www.w3.org/TR/2000/REC-xml-20001006#sec-line-ends
503 (setq pos 0) 507 (setq pos 0)
514 end-pos name) 518 end-pos name)
515 (skip-syntax-forward " ") 519 (skip-syntax-forward " ")
516 (while (looking-at (eval-when-compile 520 (while (looking-at (eval-when-compile
517 (concat "\\(" xml-name-regexp "\\)\\s-*=\\s-*"))) 521 (concat "\\(" xml-name-regexp "\\)\\s-*=\\s-*")))
518 (setq end-pos (match-end 0)) 522 (setq end-pos (match-end 0))
519 (setq name (xml-maybe-do-ns (match-string 1) nil xml-ns)) 523 (setq name (xml-maybe-do-ns (match-string-no-properties 1) nil xml-ns))
520 (goto-char end-pos) 524 (goto-char end-pos)
521 525
522 ;; See also: http://www.w3.org/TR/2000/REC-xml-20001006#AVNormalize 526 ;; See also: http://www.w3.org/TR/2000/REC-xml-20001006#AVNormalize
523 527
524 ;; Do we have a string between quotes (or double-quotes), 528 ;; Do we have a string between quotes (or double-quotes),
533 (if (assoc name attlist) 537 (if (assoc name attlist)
534 (error "XML: (Not Well-Formed) Each attribute must be unique within an element")) 538 (error "XML: (Not Well-Formed) Each attribute must be unique within an element"))
535 539
536 ;; Multiple whitespace characters should be replaced with a single one 540 ;; Multiple whitespace characters should be replaced with a single one
537 ;; in the attributes 541 ;; in the attributes
538 (let ((string (match-string 1)) 542 (let ((string (match-string-no-properties 1))
539 (pos 0)) 543 (pos 0))
540 (replace-regexp-in-string "\\s-\\{2,\\}" " " string) 544 (replace-regexp-in-string "\\s-\\{2,\\}" " " string)
541 (let ((expansion (xml-substitute-special string))) 545 (let ((expansion (xml-substitute-special string)))
542 (unless (stringp expansion) 546 (unless (stringp expansion)
543 ; We say this is the constraint. It is acctually that 547 ; We say this is the constraint. It is acctually that
573 xml-validating-parser) 577 xml-validating-parser)
574 (error "XML: (Validity) Invalid DTD (expecting name of the document)")) 578 (error "XML: (Validity) Invalid DTD (expecting name of the document)"))
575 579
576 ;; Get the name of the document 580 ;; Get the name of the document
577 (looking-at xml-name-regexp) 581 (looking-at xml-name-regexp)
578 (let ((dtd (list (match-string 0) 'dtd)) 582 (let ((dtd (list (match-string-no-properties 0) 'dtd))
579 type element end-pos) 583 type element end-pos)
580 (goto-char (match-end 0)) 584 (goto-char (match-end 0))
581 585
582 (skip-syntax-forward " ") 586 (skip-syntax-forward " ")
583 ;; XML [75] 587 ;; XML [75]
588 nil t) 592 nil t)
589 (re-search-forward 593 (re-search-forward
590 "\\='\\([[:space:][:alnum:]-()+,./:=?;!*#@$_%]*\\)'" 594 "\\='\\([[:space:][:alnum:]-()+,./:=?;!*#@$_%]*\\)'"
591 nil t)) 595 nil t))
592 (error "XML: Missing Public ID")) 596 (error "XML: Missing Public ID"))
593 (let ((pubid (match-string 1))) 597 (let ((pubid (match-string-no-properties 1)))
594 (skip-syntax-forward " ") 598 (skip-syntax-forward " ")
595 (unless (or (re-search-forward "\\='\\([^']*\\)'" nil t) 599 (unless (or (re-search-forward "\\='\\([^']*\\)'" nil t)
596 (re-search-forward "\\=\"\\([^\"]*\\)\"" nil t)) 600 (re-search-forward "\\=\"\\([^\"]*\\)\"" nil t))
597 (error "XML: Missing System ID")) 601 (error "XML: Missing System ID"))
598 (push (list pubid (match-string 1) 'public) dtd))) 602 (push (list pubid (match-string-no-properties 1) 'public) dtd)))
599 ((looking-at "SYSTEM\\s-+") 603 ((looking-at "SYSTEM\\s-+")
600 (goto-char (match-end 0)) 604 (goto-char (match-end 0))
601 (unless (or (re-search-forward "\\='\\([^']*\\)'" nil t) 605 (unless (or (re-search-forward "\\='\\([^']*\\)'" nil t)
602 (re-search-forward "\\=\"\\([^\"]*\\)\"" nil t)) 606 (re-search-forward "\\=\"\\([^\"]*\\)\"" nil t))
603 (error "XML: Missing System ID")) 607 (error "XML: Missing System ID"))
604 (push (list (match-string 1) 'system) dtd))) 608 (push (list (match-string-no-properties 1) 'system) dtd)))
605 (skip-syntax-forward " ") 609 (skip-syntax-forward " ")
606 (if (eq ?> (char-after)) 610 (if (eq ?> (char-after))
607 (forward-char) 611 (forward-char)
608 (if (not (eq (char-after) ?\[)) 612 (if (not (eq (char-after) ?\[))
609 (error "XML: Bad DTD") 613 (error "XML: Bad DTD")
616 620
617 ;; Translation of rule [45] of XML specifications 621 ;; Translation of rule [45] of XML specifications
618 ((looking-at 622 ((looking-at
619 "<!ELEMENT\\s-+\\([[:alnum:].%;]+\\)\\s-+\\([^>]+\\)>") 623 "<!ELEMENT\\s-+\\([[:alnum:].%;]+\\)\\s-+\\([^>]+\\)>")
620 624
621 (setq element (match-string 1) 625 (setq element (match-string-no-properties 1)
622 type (match-string-no-properties 2)) 626 type (match-string-no-properties 2))
623 (setq end-pos (match-end 0)) 627 (setq end-pos (match-end 0))
624 628
625 ;; Translation of rule [46] of XML specifications 629 ;; Translation of rule [46] of XML specifications
626 (cond 630 (cond
627 ((string-match "^EMPTY[ \t\n\r]*$" type) ;; empty declaration 631 ((string-match "^EMPTY[ \t\n\r]*$" type) ;; empty declaration
628 (setq type 'empty)) 632 (setq type 'empty))
629 ((string-match "^ANY[ \t\n\r]*$" type) ;; any type of contents 633 ((string-match "^ANY[ \t\n\r]*$" type) ;; any type of contents
630 (setq type 'any)) 634 (setq type 'any))
631 ((string-match "^(\\(.*\\))[ \t\n\r]*$" type) ;; children ([47]) 635 ((string-match "^(\\(.*\\))[ \t\n\r]*$" type) ;; children ([47])
632 (setq type (xml-parse-elem-type (match-string 1 type)))) 636 (setq type (xml-parse-elem-type (match-string-no-properties 1 type))))
633 ((string-match "^%[^;]+;[ \t\n\r]*$" type) ;; substitution 637 ((string-match "^%[^;]+;[ \t\n\r]*$" type) ;; substitution
634 nil) 638 nil)
635 (t 639 (t
636 (if xml-validating-parser 640 (if xml-validating-parser
637 (error "XML: (Validity) Invalid element type in the DTD")))) 641 (error "XML: (Validity) Invalid element type in the DTD"))))
657 ((looking-at "<!--") 661 ((looking-at "<!--")
658 (search-forward "-->")) 662 (search-forward "-->"))
659 ((looking-at (concat "<!ENTITY[ \t\n\r]*\\(" xml-name-re 663 ((looking-at (concat "<!ENTITY[ \t\n\r]*\\(" xml-name-re
660 "\\)[ \t\n\r]*\\(" xml-entity-value-re 664 "\\)[ \t\n\r]*\\(" xml-entity-value-re
661 "\\)[ \t\n\r]*>")) 665 "\\)[ \t\n\r]*>"))
662 (let ((name (match-string 1)) 666 (let ((name (match-string-no-properties 1))
663 (value (substring (match-string 2) 1 667 (value (substring (match-string-no-properties 2) 1
664 (- (length (match-string 2)) 1)))) 668 (- (length (match-string-no-properties 2)) 1))))
665 (goto-char (match-end 0)) 669 (goto-char (match-end 0))
666 (setq xml-entity-alist 670 (setq xml-entity-alist
667 (append xml-entity-alist 671 (append xml-entity-alist
668 (list (cons name 672 (list (cons name
669 (with-temp-buffer 673 (with-temp-buffer
679 "\\)[ \t\n\r]+PUBLIC[ \t\n\r]+" 683 "\\)[ \t\n\r]+PUBLIC[ \t\n\r]+"
680 "\"[- \r\na-zA-Z0-9'()+,./:=?;!*#@$_%]*\"" 684 "\"[- \r\na-zA-Z0-9'()+,./:=?;!*#@$_%]*\""
681 "\\|'[- \r\na-zA-Z0-9()+,./:=?;!*#@$_%]*'" 685 "\\|'[- \r\na-zA-Z0-9()+,./:=?;!*#@$_%]*'"
682 "[ \t\n\r]+\\(\"[^\"]*\"\\|'[^']*'\\)" 686 "[ \t\n\r]+\\(\"[^\"]*\"\\|'[^']*'\\)"
683 "[ \t\n\r]*>"))) 687 "[ \t\n\r]*>")))
684 (let ((name (match-string 1)) 688 (let ((name (match-string-no-properties 1))
685 (file (substring (match-string 2) 1 689 (file (substring (match-string-no-properties 2) 1
686 (- (length (match-string 2)) 1)))) 690 (- (length (match-string-no-properties 2)) 1))))
687 (goto-char (match-end 0)) 691 (goto-char (match-end 0))
688 (setq xml-entity-alist 692 (setq xml-entity-alist
689 (append xml-entity-alist 693 (append xml-entity-alist
690 (list (cons name (with-temp-buffer 694 (list (cons name (with-temp-buffer
691 (insert-file-contents file) 695 (insert-file-contents file)
720 "Convert element type STRING into a Lisp structure." 724 "Convert element type STRING into a Lisp structure."
721 725
722 (let (elem modifier) 726 (let (elem modifier)
723 (if (string-match "(\\([^)]+\\))\\([+*?]?\\)" string) 727 (if (string-match "(\\([^)]+\\))\\([+*?]?\\)" string)
724 (progn 728 (progn
725 (setq elem (match-string 1 string) 729 (setq elem (match-string-no-properties 1 string)
726 modifier (match-string 2 string)) 730 modifier (match-string-no-properties 2 string))
727 (if (string-match "|" elem) 731 (if (string-match "|" elem)
728 (setq elem (cons 'choice 732 (setq elem (cons 'choice
729 (mapcar 'xml-parse-elem-type 733 (mapcar 'xml-parse-elem-type
730 (split-string elem "|")))) 734 (split-string elem "|"))))
731 (if (string-match "," elem) 735 (if (string-match "," elem)
732 (setq elem (cons 'seq 736 (setq elem (cons 'seq
733 (mapcar 'xml-parse-elem-type 737 (mapcar 'xml-parse-elem-type
734 (split-string elem ","))))))) 738 (split-string elem ",")))))))
735 (if (string-match "[ \t\n\r]*\\([^+*?]+\\)\\([+*?]?\\)" string) 739 (if (string-match "[ \t\n\r]*\\([^+*?]+\\)\\([+*?]?\\)" string)
736 (setq elem (match-string 1 string) 740 (setq elem (match-string-no-properties 1 string)
737 modifier (match-string 2 string)))) 741 modifier (match-string-no-properties 2 string))))
738 742
739 (if (and (stringp elem) (string= elem "#PCDATA")) 743 (if (and (stringp elem) (string= elem "#PCDATA"))
740 (setq elem 'pcdata)) 744 (setq elem 'pcdata))
741 745
742 (cond 746 (cond
763 767
764 (let ((point 0) 768 (let ((point 0)
765 children end-point) 769 children end-point)
766 (while (string-match "&\\([^;]*\\);" string point) 770 (while (string-match "&\\([^;]*\\);" string point)
767 (setq end-point (match-end 0)) 771 (setq end-point (match-end 0))
768 (let* ((this-part (match-string 1 string)) 772 (let* ((this-part (match-string-no-properties 1 string))
769 (prev-part (substring string point (match-beginning 0))) 773 (prev-part (substring string point (match-beginning 0)))
770 (entity (assoc this-part xml-entity-alist)) 774 (entity (assoc this-part xml-entity-alist))
771 (expansion 775 (expansion
772 (cond ((string-match "#\\([0-9]+\\)" this-part) 776 (cond ((string-match "#\\([0-9]+\\)" this-part)
773 (let ((c (decode-char 777 (let ((c (decode-char
774 'ucs 778 'ucs
775 (string-to-number (match-string 1 this-part))))) 779 (string-to-number (match-string-no-properties 1 this-part)))))
776 (if c (string c)))) 780 (if c (string c))))
777 ((string-match "#x\\([[:xdigit:]]+\\)" this-part) 781 ((string-match "#x\\([[:xdigit:]]+\\)" this-part)
778 (let ((c (decode-char 782 (let ((c (decode-char
779 'ucs 783 'ucs
780 (string-to-number (match-string 1 this-part) 16)))) 784 (string-to-number (match-string-no-properties 1 this-part) 16))))
781 (if c (string c)))) 785 (if c (string c))))
782 (entity 786 (entity
783 (cdr entity)) 787 (cdr entity))
784 ((eq (length this-part) 0) 788 ((eq (length this-part) 0)
785 (error "XML: (Not Well-Formed) No entity given")) 789 (error "XML: (Not Well-Formed) No entity given"))