Mercurial > emacs
diff lisp/progmodes/ebnf-ebx.el @ 54623:d6b491b74eae
Parser for EBNF used to specify XML (EBNFX)
author | Vinicius Jose Latorre <viniciusjl@ig.com.br> |
---|---|
date | Sun, 28 Mar 2004 22:48:32 +0000 |
parents | |
children | 6e794586bfc8 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lisp/progmodes/ebnf-ebx.el Sun Mar 28 22:48:32 2004 +0000 @@ -0,0 +1,673 @@ +;;; ebnf-ebx.el --- parser for EBNF used to specify XML (EBNFX) + +;; Copyright (C) 2004 Free Sofware Foundation, Inc. + +;; Author: Vinicius Jose Latorre <viniciusjl@ig.com.br> +;; Maintainer: Vinicius Jose Latorre <viniciusjl@ig.com.br> +;; Time-stamp: <2004/03/22 08:53:21 vinicius> +;; Keywords: wp, ebnf, PostScript +;; Version: 1.0 + +;; This file is part of GNU Emacs. + +;; GNU Emacs is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 2, or (at your option) +;; any later version. + +;; GNU Emacs is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GNU Emacs; see the file COPYING. If not, write to the +;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, +;; Boston, MA 02111-1307, USA. + +;;; Commentary: + +;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; +;; This is part of ebnf2ps package. +;; +;; This package defines a parser for EBNF used to specify XML (EBNFX). +;; +;; See ebnf2ps.el for documentation. +;; +;; +;; EBNFX Syntax +;; ------------ +;; +;; See the URL: +;; `http://www.w3.org/TR/2004/REC-xml-20040204/#sec-notation' +;; (Extensible Markup Language (XML) 1.0 (Third Edition)) +;; +;; +;; rule ::= symbol '::=' expression +;; /* rules are separated by at least one blank line. */ +;; +;; expression ::= concatenation ('|' concatenation)* +;; +;; concatenation ::= exception* +;; +;; exception ::= term ('-' term)? +;; +;; term ::= factor ('*' | '+' | '?')? +;; +;; factor ::= hex-char+ +;; | '[' '^'? ( char ( '-' char )? )+ ']' +;; | '"' 'string' '"' +;; | "'" "string" "'" +;; | '(' expression ')' +;; | symbol +;; +;; symbol ::= 'upper or lower case letter' +;; ('upper or lower case letter' | '-' | '_')* +;; /* upper and lower 8-bit accentuated characters are included */ +;; +;; hex-char ::= '#x' [0-9A-Fa-f]+ +;; +;; char ::= hex-char | 'any character except control characters' +;; /* 8-bit accentuated characters are included */ +;; +;; any-char ::= char | 'newline' | 'tab' +;; +;; ignore ::= '[' ('wfc' | 'WFC' | 'vc' | 'VC') ':' ( any-char - ']' )* ']' +;; +;; comment ::= '/*' ( any-char - '*/' ) '*/' +;; +;; +;; Below is the Notation section extracted from the URL cited above. +;; +;; 6 Notation +;; +;; The formal grammar of XML is given in this specification using a simple +;; Extended Backus-Naur Form (EBNF) notation. Each rule in the grammar defines +;; one symbol, in the form +;; +;; symbol ::= expression +;; +;; Symbols are written with an initial capital letter if they are the start +;; symbol of a regular language, otherwise with an initial lowercase letter. +;; Literal strings are quoted. +;; +;; Within the expression on the right-hand side of a rule, the following +;; expressions are used to match strings of one or more characters: +;; +;; #xN +;; +;; where N is a hexadecimal integer, the expression matches the character +;; whose number (code point) in ISO/IEC 10646 is N. The number of leading +;; zeros in the #xN form is insignificant. +;; +;; [a-zA-Z], [#xN-#xN] +;; +;; matches any Char with a value in the range(s) indicated (inclusive). +;; +;; [abc], [#xN#xN#xN] +;; +;; matches any Char with a value among the characters enumerated. +;; Enumerations and ranges can be mixed in one set of brackets. +;; +;; [^a-z], [^#xN-#xN] +;; +;; matches any Char with a value outside the range indicated. +;; +;; [^abc], [^#xN#xN#xN] +;; +;; matches any Char with a value not among the characters given. +;; Enumerations and ranges of forbidden values can be mixed in one set of +;; brackets. +;; +;; "string" +;; +;; matches a literal string matching that given inside the double quotes. +;; +;; 'string' +;; +;; matches a literal string matching that given inside the single quotes. +;; +;; These symbols may be combined to match more complex patterns as follows, +;; where A and B represent simple expressions: +;; +;; (expression) +;; +;; expression is treated as a unit and may be combined as described in this +;; list. +;; +;; A? +;; +;; matches A or nothing; optional A. +;; +;; A B +;; +;; matches A followed by B. This operator has higher precedence than +;; alternation; thus A B | C D is identical to (A B) | (C D). +;; +;; A | B +;; +;; matches A or B. +;; +;; A - B +;; +;; matches any string that matches A but does not match B. +;; +;; A+ +;; +;; matches one or more occurrences of A. Concatenation has higher +;; precedence than alternation; thus A+ | B+ is identical to (A+) | (B+). +;; +;; A* +;; +;; matches zero or more occurrences of A. Concatenation has higher +;; precedence than alternation; thus A* | B* is identical to (A*) | (B*). +;; +;; Other notations used in the productions are: +;; +;; /* ... */ +;; +;; comment. +;; +;; [ wfc: ... ] +;; +;; well-formedness constraint; this identifies by name a constraint on +;; well-formed documents associated with a production. +;; +;; [ vc: ... ] +;; +;; validity constraint; this identifies by name a constraint on valid +;; documents associated with a production. +;; +;; +;; Differences Between EBNFX And ebnf2ps EBNFX +;; ------------------------------------------- +;; +;; Besides the characters that EBNFX accepts, ebnf2ps EBNFX accepts also the +;; underscore (_) and minus (-) for rule name and european 8-bit accentuated +;; characters (from \240 to \377) for rule name, string and comment. Also +;; rule name can start with upper case letter. +;; +;; +;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; Code: + + +(require 'ebnf-otz) + + +(defvar ebnf-ebx-lex nil + "Value returned by `ebnf-ebx-lex' function.") + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Syntactic analyzer + + +;;; rulelist ::= rule+ + +(defun ebnf-ebx-parser (start) + "EBNFX parser." + (let ((total (+ (- ebnf-limit start) 1)) + (bias (1- start)) + (origin (point)) + rule-list token rule) + (goto-char start) + (setq token (ebnf-ebx-lex)) + (and (eq token 'end-of-input) + (error "Invalid EBNFX file format")) + (and (eq token 'end-of-rule) + (setq token (ebnf-ebx-lex))) + (while (not (eq token 'end-of-input)) + (ebnf-message-float + "Parsing...%s%%" + (/ (* (- (point) bias) 100.0) total)) + (setq token (ebnf-ebx-rule token) + rule (cdr token) + token (car token)) + (or (ebnf-add-empty-rule-list rule) + (setq rule-list (cons rule rule-list)))) + (goto-char origin) + rule-list)) + + +;;; rule ::= symbol '::=' expression + + +(defun ebnf-ebx-rule (token) + (let ((name ebnf-ebx-lex) + (action ebnf-action) + elements) + (setq ebnf-action nil) + (or (eq token 'non-terminal) + (error "Invalid rule name")) + (setq token (ebnf-ebx-lex)) + (or (eq token 'production) + (error "Invalid rule: missing `::='")) + (setq elements (ebnf-ebx-expression)) + (or (memq (car elements) '(end-of-rule end-of-input)) + (error "Invalid rule: there is no end of rule")) + (setq elements (cdr elements)) + (ebnf-eps-add-production name) + (cons (ebnf-ebx-lex) + (ebnf-make-production name elements action)))) + + +;; expression ::= concatenation ('|' concatenation)* + + +(defun ebnf-ebx-expression () + (let (body concatenation) + (while (eq (car (setq concatenation + (ebnf-ebx-concatenation (ebnf-ebx-lex)))) + 'alternative) + (setq body (cons (cdr concatenation) body))) + (ebnf-token-alternative body concatenation))) + + +;; concatenation ::= exception* + + +(defun ebnf-ebx-concatenation (token) + (let ((term (ebnf-ebx-exception token)) + seq) + (or (setq token (car term) + term (cdr term)) + (error "Empty element")) + (setq seq (cons term seq)) + (while (setq term (ebnf-ebx-exception token) + token (car term) + term (cdr term)) + (setq seq (cons term seq))) + (cons token + (if (= (length seq) 1) + ;; sequence with only one element + (car seq) + ;; a real sequence + (ebnf-make-sequence (nreverse seq)))))) + + +;;; exception ::= term ('-' term)? + + +(defun ebnf-ebx-exception (token) + (let ((term (ebnf-ebx-term token))) + (if (eq (car term) 'exception) + (let ((except (ebnf-ebx-term (ebnf-ebx-lex)))) + (cons (car except) + (ebnf-make-except (cdr term) (cdr except)))) + term))) + + + +;;; term ::= factor ('*' | '+' | '?')? + + +(defun ebnf-ebx-term (token) + (let ((factor (ebnf-ebx-factor token))) + (when factor + (setq token (ebnf-ebx-lex)) + (cond ((eq token 'zero-or-more) + (setq factor (ebnf-make-zero-or-more factor) + token (ebnf-ebx-lex))) + ((eq token 'one-or-more) + (setq factor (ebnf-make-one-or-more factor) + token (ebnf-ebx-lex))) + ((eq token 'optional) + (setq factor (ebnf-token-optional factor) + token (ebnf-ebx-lex))))) + (cons token factor))) + + +;;; factor ::= hex-char+ +;;; | '[' '^'? ( char ( '-' char )? )+ ']' +;;; | '"' 'string' '"' +;;; | "'" "string" "'" +;;; | '(' expression ')' +;;; | symbol +;;; +;;; symbol ::= 'upper or lower case letter' +;;; ('upper or lower case letter' | '-' | '_')* +;;; /* upper and lower 8-bit accentuated characters are included */ +;;; +;;; hex-char ::= '#x' [0-9A-Fa-f]+ +;;; +;;; char ::= hex-char | 'any character except control characters' +;;; /* 8-bit accentuated characters are included */ +;;; +;;; any-char ::= char | 'newline' | 'tab' + + +(defun ebnf-ebx-factor (token) + (cond + ;; terminal + ((eq token 'terminal) + (ebnf-make-terminal ebnf-ebx-lex)) + ;; non-terminal + ((eq token 'non-terminal) + (ebnf-make-non-terminal ebnf-ebx-lex)) + ;; group + ((eq token 'begin-group) + (let ((body (ebnf-ebx-expression))) + (or (eq (car body) 'end-group) + (error "Missing `)'")) + (cdr body))) + ;; no element + (t + nil) + )) + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Lexical analyzer + + +(defconst ebnf-ebx-token-table (make-vector 256 'error) + "Vector used to map characters to a lexical token.") + + +(defun ebnf-ebx-initialize () + "Initialize EBNFX token table." + ;; control character & control 8-bit character are set to `error' + (let ((char ?\101)) + ;; printable character: A-Z + (while (< char ?\133) + (aset ebnf-ebx-token-table char 'non-terminal) + (setq char (1+ char))) + ;; printable character: a-z + (setq char ?\141) + (while (< char ?\173) + (aset ebnf-ebx-token-table char 'non-terminal) + (setq char (1+ char))) + ;; European 8-bit accentuated characters: + (setq char ?\240) + (while (< char ?\400) + (aset ebnf-ebx-token-table char 'non-terminal) + (setq char (1+ char))) + ;; Override end of line characters: + (aset ebnf-ebx-token-table ?\n 'end-of-rule) ; [NL] linefeed + (aset ebnf-ebx-token-table ?\r 'end-of-rule) ; [CR] carriage return + ;; Override space characters: + (aset ebnf-ebx-token-table ?\013 'space) ; [VT] vertical tab + (aset ebnf-ebx-token-table ?\t 'space) ; [HT] horizontal tab + (aset ebnf-ebx-token-table ?\ 'space) ; [SP] space + ;; Override form feed character: + (aset ebnf-ebx-token-table ?\f 'form-feed) ; [FF] form feed + ;; Override other lexical characters: + (aset ebnf-ebx-token-table ?# 'hash) + (aset ebnf-ebx-token-table ?\" 'double-quote) + (aset ebnf-ebx-token-table ?\' 'single-quote) + (aset ebnf-ebx-token-table ?\( 'begin-group) + (aset ebnf-ebx-token-table ?\) 'end-group) + (aset ebnf-ebx-token-table ?- 'exception) + (aset ebnf-ebx-token-table ?: 'colon) + (aset ebnf-ebx-token-table ?\[ 'begin-square) + (aset ebnf-ebx-token-table ?| 'alternative) + (aset ebnf-ebx-token-table ?* 'zero-or-more) + (aset ebnf-ebx-token-table ?+ 'one-or-more) + (aset ebnf-ebx-token-table ?\? 'optional) + ;; Override comment character: + (aset ebnf-ebx-token-table ?/ 'comment))) + + +;; replace the range "\240-\377" (see `ebnf-range-regexp'). +(defconst ebnf-ebx-non-terminal-chars + (ebnf-range-regexp "-_A-Za-z" ?\240 ?\377)) +(defconst ebnf-ebx-non-terminal-letter-chars + (ebnf-range-regexp "A-Za-z" ?\240 ?\377)) + + +(defun ebnf-ebx-lex () + "Lexical analyser for EBNFX. + +Return a lexical token. + +See documentation for variable `ebnf-ebx-lex'." + (if (>= (point) ebnf-limit) + 'end-of-input + (let (token) + ;; skip spaces and comments + (while (if (> (following-char) 255) + (progn + (setq token 'error) + nil) + (setq token (aref ebnf-ebx-token-table (following-char))) + (cond + ((eq token 'space) + (skip-chars-forward " \013\t" ebnf-limit) + (< (point) ebnf-limit)) + ((eq token 'comment) + (ebnf-ebx-skip-comment)) + ((eq token 'form-feed) + (forward-char) + (setq ebnf-action 'form-feed)) + ((eq token 'end-of-rule) + (ebnf-ebx-skip-end-of-rule)) + ((and (eq token 'begin-square) + (let ((case-fold-search t)) + (looking-at "\\[\\(wfc\\|vc\\):"))) + (ebnf-ebx-skip-constraint)) + (t nil) + ))) + (cond + ;; end of input + ((>= (point) ebnf-limit) + 'end-of-input) + ;; error + ((eq token 'error) + (error "Illegal character")) + ;; end of rule + ((eq token 'end-of-rule) + 'end-of-rule) + ;; terminal: #x [0-9A-Fa-f]+ + ((eq token 'hash) + (setq ebnf-ebx-lex (ebnf-ebx-character)) + 'terminal) + ;; terminal: "string" + ((eq token 'double-quote) + (setq ebnf-ebx-lex (ebnf-ebx-string ?\")) + 'terminal) + ;; terminal: 'string' + ((eq token 'single-quote) + (setq ebnf-ebx-lex (ebnf-ebx-string ?\')) + 'terminal) + ;; terminal: [ ^? ( char ( - char )? )+ ] + ((eq token 'begin-square) + (setq ebnf-ebx-lex (ebnf-ebx-range)) + 'terminal) + ;; non-terminal: NAME + ((eq token 'non-terminal) + (setq ebnf-ebx-lex + (ebnf-buffer-substring ebnf-ebx-non-terminal-chars)) + 'non-terminal) + ;; colon: ::= + ((eq token 'colon) + (or (looking-at "::=") + (error "Missing `::=' token")) + (forward-char 3) + 'production) + ;; miscellaneous: (, ), *, +, ?, |, - + (t + (forward-char) + token) + )))) + + +;; replace the range "\177-\237" (see `ebnf-range-regexp'). +(defconst ebnf-ebx-constraint-chars + (ebnf-range-regexp "^\000-\010\016-\037]" ?\177 ?\237)) + + +(defun ebnf-ebx-skip-constraint () + (or (> (skip-chars-forward ebnf-ebx-constraint-chars ebnf-limit) 0) + (error "Invalid character")) + (or (= (following-char) ?\]) + (error "Missing end of constraint `]'")) + (forward-char) + t) + + + +(defun ebnf-ebx-skip-end-of-rule () + (let (eor-p) + (while (progn + ;; end of rule ==> 2 or more consecutive end of lines + (setq eor-p (or (> (skip-chars-forward "\r\n" ebnf-limit) 1) + eor-p)) + ;; skip spaces + (skip-chars-forward " \013\t" ebnf-limit) + ;; skip comments + (and (= (following-char) ?/) + (ebnf-ebx-skip-comment)))) + (not eor-p))) + + +;; replace the range "\177-\237" (see `ebnf-range-regexp'). +(defconst ebnf-ebx-comment-chars + (ebnf-range-regexp "^\000-\010\016-\037\\*" ?\177 ?\237)) +(defconst ebnf-ebx-filename-chars + (ebnf-range-regexp "^\000-\037\\*" ?\177 ?\237)) + + +(defun ebnf-ebx-skip-comment () + (forward-char) + (or (= (following-char) ?*) + (error "Invalid beginning of comment")) + (forward-char) + (cond + ;; open EPS file + ((and ebnf-eps-executing (= (following-char) ?\[)) + (ebnf-eps-add-context (ebnf-ebx-eps-filename))) + ;; close EPS file + ((and ebnf-eps-executing (= (following-char) ?\])) + (ebnf-eps-remove-context (ebnf-ebx-eps-filename))) + ;; any other action in comment + (t + (setq ebnf-action (aref ebnf-comment-table (following-char)))) + ) + (while (progn + (skip-chars-forward ebnf-ebx-comment-chars ebnf-limit) + (or (= (following-char) ?*) + (error "Missing end of comment")) + (forward-char) + (and (/= (following-char) ?/) + (< (point) ebnf-limit)))) + ;; check for a valid end of comment + (and (>= (point) ebnf-limit) + (error "Missing end of comment")) + (forward-char) + t) + + +(defun ebnf-ebx-eps-filename () + (forward-char) + (let (fname nchar) + (while (progn + (setq fname + (concat fname + (ebnf-buffer-substring ebnf-ebx-filename-chars))) + (and (< (point) ebnf-limit) + (> (setq nchar (skip-chars-forward "*" ebnf-limit)) 0) + (< (point) ebnf-limit) + (/= (following-char) ?/))) + (setq fname (concat fname (make-string nchar ?*)) + nchar nil)) + (if (or (not nchar) (= nchar 0)) + fname + (and (< (point) ebnf-limit) + (= (following-char) ?/) + (setq nchar (1- nchar))) + (concat fname (make-string nchar ?*))))) + + +;; replace the range "\240-\377" (see `ebnf-range-regexp'). +(defconst ebnf-ebx-double-string-chars + (ebnf-range-regexp "\t -!#-~" ?\240 ?\377)) +(defconst ebnf-ebx-single-string-chars + (ebnf-range-regexp "\t -&(-~" ?\240 ?\377)) + + +(defun ebnf-ebx-string (delim) + (buffer-substring-no-properties + (progn + (forward-char) + (point)) + (progn + (skip-chars-forward (if (= delim ?\") + ebnf-ebx-double-string-chars + ebnf-ebx-single-string-chars) + ebnf-limit) + (or (= (following-char) delim) + (error "Missing string delimiter `%c'" delim)) + (prog1 + (point) + (forward-char))))) + + +(defun ebnf-ebx-character () + ;; #x [0-9A-Fa-f]+ + (buffer-substring-no-properties + (point) + (progn + (ebnf-ebx-hex-character) + (point)))) + + +(defun ebnf-ebx-range () + ;; [ ^? ( char ( - char )? )+ ] + (buffer-substring-no-properties + (point) + (progn + (forward-char) + (and (= (following-char) ?^) + (forward-char)) + (and (= (following-char) ?-) + (forward-char)) + (while (progn + (ebnf-ebx-any-character) + (when (= (following-char) ?-) + (forward-char) + (ebnf-ebx-any-character)) + (and (/= (following-char) ?\]) + (< (point) ebnf-limit)))) + (and (>= (point) ebnf-limit) + (error "Missing end of character range `]'")) + (forward-char) + (point)))) + + +(defun ebnf-ebx-any-character () + (let ((char (following-char))) + (cond ((= char ?#) + (ebnf-ebx-hex-character t)) + ((or (and (<= ?\ char) (<= char ?\")) ; # + (and (<= ?$ char) (<= char ?,)) ; - + (and (<= ?. char) (<= char ?\\)) ; ] + (and (<= ?^ char) (<= char ?~)) + (and (<= ?\240 char) (<= char ?\377))) + (forward-char)) + (t + (error "Invalid character `%c'" char))))) + + +(defun ebnf-ebx-hex-character (&optional no-error) + ;; #x [0-9A-Fa-f]+ + (forward-char) + (if (/= (following-char) ?x) + (or no-error + (error "Invalid hexadecimal character")) + (forward-char) + (or (> (skip-chars-forward "0-9A-Fa-f" ebnf-limit) 0) + (error "Invalid hexadecimal character")))) + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + +(provide 'ebnf-ebx) + + +;;; arch-tag: +;;; ebnf-ebx.el ends here