Mercurial > emacs
changeset 104413:6524f06f3a75
cedet/semantic/lex.el: New file.
author | Chong Yidong <cyd@stupidchicken.com> |
---|---|
date | Fri, 28 Aug 2009 15:02:24 +0000 |
parents | 05443eb58935 |
children | b2b72cdd9d90 |
files | lisp/cedet/semantic/lex.el |
diffstat | 1 files changed, 2089 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lisp/cedet/semantic/lex.el Fri Aug 28 15:02:24 2009 +0000 @@ -0,0 +1,2089 @@ +;;; semantic-lex.el --- Lexical Analyzer builder + +;;; Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, +;;; 2007, 2008, 2009 Free Software Foundation, Inc. + +;; Author: Eric M. Ludlam <zappo@gnu.org> + +;; This file is part of GNU Emacs. + +;; GNU Emacs is free software: you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation, either version 3 of the License, or +;; (at your option) any later version. + +;; GNU Emacs is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. + +;;; Commentary: +;; +;; This file handles the creation of lexical analyzers for different +;; languages in Emacs Lisp. The purpose of a lexical analyzer is to +;; convert a buffer into a list of lexical tokens. Each token +;; contains the token class (such as 'number, 'symbol, 'IF, etc) and +;; the location in the buffer it was found. Optionally, a token also +;; contains a string representing what is at the designated buffer +;; location. +;; +;; Tokens are pushed onto a token stream, which is basically a list of +;; all the lexical tokens from the analyzed region. The token stream +;; is then handed to the grammar which parsers the file. +;; +;;; How it works +;; +;; Each analyzer specifies a condition and forms. These conditions +;; and forms are assembled into a function by `define-lex' that does +;; the lexical analysis. +;; +;; In the lexical analyzer created with `define-lex', each condition +;; is tested for a given point. When the conditin is true, the forms +;; run. +;; +;; The forms can push a lexical token onto the token stream. The +;; analyzer forms also must move the current analyzer point. If the +;; analyzer point is moved without pushing a token, then tne matched +;; syntax is effectively ignored, or skipped. +;; +;; Thus, starting at the beginning of a region to be analyzed, each +;; condition is tested. One will match, and a lexical token might be +;; pushed, and the point is moved to the end of the lexical token +;; identified. At the new position, the process occurs again until +;; the end of the specified region is reached. +;; +;;; How to use semantic-lex +;; +;; To create a lexer for a language, use the `define-lex' macro. +;; +;; The `define-lex' macro accepts a list of lexical analyzers. Each +;; analyzer is created with `define-lex-analyzer', or one of the +;; derivitive macros. A single analyzer defines a regular expression +;; to match text in a buffer, and a short segment of code to create +;; one lexical token. +;; +;; Each analyzer has a NAME, DOC, a CONDITION, and possibly some +;; FORMS. The NAME is the name used in `define-lex'. The DOC +;; describes what the analyzer should do. +;; +;; The CONDITION evaluates the text at the current point in the +;; current buffer. If CONDITION is true, then the FORMS will be +;; executed. +;; +;; The purpose of the FORMS is to push new lexical tokens onto the +;; list of tokens for the current buffer, and to move point after the +;; matched text. +;; +;; Some macros for creating one analyzer are: +;; +;; define-lex-analyzer - A generic analyzer associating any style of +;; condition to forms. +;; define-lex-regex-analyzer - Matches a regular expression. +;; define-lex-simple-regex-analyzer - Matches a regular expressions, +;; and pushes the match. +;; define-lex-block-analyzer - Matches list syntax, and defines +;; handles open/close delimiters. +;; +;; These macros are used by the grammar compiler when lexical +;; information is specified in a grammar: +;; define-lex- * -type-analyzer - Matches syntax specified in +;; a grammar, and pushes one token for it. The * would +;; be `sexp' for things like lists or strings, and +;; `string' for things that need to match some special +;; string, such as "\\." where a literal match is needed. +;; +;;; Lexical Tables +;; +;; There are tables of different symbols managed in semantic-lex.el. +;; They are: +;; +;; Lexical keyword table - A Table of symbols declared in a grammar +;; file with the %keyword declaration. +;; Keywords are used by `semantic-lex-symbol-or-keyword' +;; to create lexical tokens based on the keyword. +;; +;; Lexical type table - A table of symbols declared in a grammer +;; file with the %type declaration. +;; The grammar compiler uses the type table to create new +;; lexical analyzers. These analyzers are then used to when +;; a new lexical analyzer is made for a language. +;; +;;; Lexical Types +;; +;; A lexical type defines a kind of lexical analyzer that will be +;; automatically generated from a grammar file based on some +;; predetermined attributes. For now these two attributes are +;; recognized : +;; +;; * matchdatatype : define the kind of lexical analyzer. That is : +;; +;; - regexp : define a regexp analyzer (see +;; `define-lex-regex-type-analyzer') +;; +;; - string : define a string analyzer (see +;; `define-lex-string-type-analyzer') +;; +;; - block : define a block type analyzer (see +;; `define-lex-block-type-analyzer') +;; +;; - sexp : define a sexp analyzer (see +;; `define-lex-sexp-type-analyzer') +;; +;; - keyword : define a keyword analyzer (see +;; `define-lex-keyword-type-analyzer') +;; +;; * syntax : define the syntax that matches a syntactic +;; expression. When syntax is matched the corresponding type +;; analyzer is entered and the resulting match data will be +;; interpreted based on the kind of analyzer (see matchdatatype +;; above). +;; +;; The following lexical types are predefined : +;; +;; +-------------+---------------+--------------------------------+ +;; | type | matchdatatype | syntax | +;; +-------------+---------------+--------------------------------+ +;; | punctuation | string | "\\(\\s.\\|\\s$\\|\\s'\\)+" | +;; | keyword | keyword | "\\(\\sw\\|\\s_\\)+" | +;; | symbol | regexp | "\\(\\sw\\|\\s_\\)+" | +;; | string | sexp | "\\s\"" | +;; | number | regexp | semantic-lex-number-expression | +;; | block | block | "\\s(\\|\\s)" | +;; +-------------+---------------+--------------------------------+ +;; +;; In a grammar you must use a %type expression to automatically generate +;; the corresponding analyzers of that type. +;; +;; Here is an example to auto-generate punctuation analyzers +;; with 'matchdatatype and 'syntax predefined (see table above) +;; +;; %type <punctuation> ;; will auto-generate this kind of analyzers +;; +;; It is equivalent to write : +;; +;; %type <punctuation> syntax "\\(\\s.\\|\\s$\\|\\s'\\)+" matchdatatype string +;; +;; ;; Some punctuations based on the type defines above +;; +;; %token <punctuation> NOT "!" +;; %token <punctuation> NOTEQ "!=" +;; %token <punctuation> MOD "%" +;; %token <punctuation> MODEQ "%=" +;; + +;;; On the Semantic 1.x lexer +;; +;; In semantic 1.x, the lexical analyzer was an all purpose routine. +;; To boost efficiency, the analyzer is now a series of routines that +;; are constructed at build time into a single routine. This will +;; eliminate unneeded if statements to speed the lexer. + +(require 'semantic/fw) +;;; Code: + +;;; Compatibility +;; +(eval-and-compile + (if (not (fboundp 'with-syntax-table)) + +;; Copied from Emacs 21 for compatibility with released Emacses. +(defmacro with-syntax-table (table &rest body) + "With syntax table of current buffer set to a copy of TABLE, evaluate BODY. +The syntax table of the current buffer is saved, BODY is evaluated, and the +saved table is restored, even in case of an abnormal exit. +Value is what BODY returns." + (let ((old-table (make-symbol "table")) + (old-buffer (make-symbol "buffer"))) + `(let ((,old-table (syntax-table)) + (,old-buffer (current-buffer))) + (unwind-protect + (progn + (set-syntax-table (copy-syntax-table ,table)) + ,@body) + (save-current-buffer + (set-buffer ,old-buffer) + (set-syntax-table ,old-table)))))) + +)) + +;;; Semantic 2.x lexical analysis +;; +(defun semantic-lex-map-symbols (fun table &optional property) + "Call function FUN on every symbol in TABLE. +If optional PROPERTY is non-nil, call FUN only on every symbol which +as a PROPERTY value. FUN receives a symbol as argument." + (if (arrayp table) + (mapatoms + #'(lambda (symbol) + (if (or (null property) (get symbol property)) + (funcall fun symbol))) + table))) + +;;; Lexical keyword table handling. +;; +;; These keywords are keywords defined for using in a grammar with the +;; %keyword declaration, and are not keywords used in Emacs Lisp. + +(defvar semantic-flex-keywords-obarray nil + "Buffer local keyword obarray for the lexical analyzer. +These keywords are matched explicitly, and converted into special symbols.") +(make-variable-buffer-local 'semantic-flex-keywords-obarray) + +(defmacro semantic-lex-keyword-invalid (name) + "Signal that NAME is an invalid keyword name." + `(signal 'wrong-type-argument '(semantic-lex-keyword-p ,name))) + +(defsubst semantic-lex-keyword-symbol (name) + "Return keyword symbol with NAME or nil if not found." + (and (arrayp semantic-flex-keywords-obarray) + (stringp name) + (intern-soft name semantic-flex-keywords-obarray))) + +(defsubst semantic-lex-keyword-p (name) + "Return non-nil if a keyword with NAME exists in the keyword table. +Return nil otherwise." + (and (setq name (semantic-lex-keyword-symbol name)) + (symbol-value name))) + +(defsubst semantic-lex-keyword-set (name value) + "Set value of keyword with NAME to VALUE and return VALUE." + (set (intern name semantic-flex-keywords-obarray) value)) + +(defsubst semantic-lex-keyword-value (name) + "Return value of keyword with NAME. +Signal an error if a keyword with NAME does not exist." + (let ((keyword (semantic-lex-keyword-symbol name))) + (if keyword + (symbol-value keyword) + (semantic-lex-keyword-invalid name)))) + +(defsubst semantic-lex-keyword-put (name property value) + "For keyword with NAME, set its PROPERTY to VALUE." + (let ((keyword (semantic-lex-keyword-symbol name))) + (if keyword + (put keyword property value) + (semantic-lex-keyword-invalid name)))) + +(defsubst semantic-lex-keyword-get (name property) + "For keyword with NAME, return its PROPERTY value." + (let ((keyword (semantic-lex-keyword-symbol name))) + (if keyword + (get keyword property) + (semantic-lex-keyword-invalid name)))) + +(defun semantic-lex-make-keyword-table (specs &optional propspecs) + "Convert keyword SPECS into an obarray and return it. +SPECS must be a list of (NAME . TOKSYM) elements, where: + + NAME is the name of the keyword symbol to define. + TOKSYM is the lexical token symbol of that keyword. + +If optional argument PROPSPECS is non nil, then interpret it, and +apply those properties. +PROPSPECS must be a list of (NAME PROPERTY VALUE) elements." + ;; Create the symbol hash table + (let ((semantic-flex-keywords-obarray (make-vector 13 0)) + spec) + ;; fill it with stuff + (while specs + (setq spec (car specs) + specs (cdr specs)) + (semantic-lex-keyword-set (car spec) (cdr spec))) + ;; Apply all properties + (while propspecs + (setq spec (car propspecs) + propspecs (cdr propspecs)) + (semantic-lex-keyword-put (car spec) (nth 1 spec) (nth 2 spec))) + semantic-flex-keywords-obarray)) + +(defsubst semantic-lex-map-keywords (fun &optional property) + "Call function FUN on every lexical keyword. +If optional PROPERTY is non-nil, call FUN only on every keyword which +as a PROPERTY value. FUN receives a lexical keyword as argument." + (semantic-lex-map-symbols + fun semantic-flex-keywords-obarray property)) + +(defun semantic-lex-keywords (&optional property) + "Return a list of lexical keywords. +If optional PROPERTY is non-nil, return only keywords which have a +PROPERTY set." + (let (keywords) + (semantic-lex-map-keywords + #'(lambda (symbol) (setq keywords (cons symbol keywords))) + property) + keywords)) + +;;; Type table handling. +;; +;; The lexical type table manages types that occur in a grammar file +;; with the %type declaration. Types represent different syntaxes. +;; See code for `semantic-lex-preset-default-types' for the classic +;; types of syntax. +(defvar semantic-lex-types-obarray nil + "Buffer local types obarray for the lexical analyzer.") +(make-variable-buffer-local 'semantic-lex-types-obarray) + +(defmacro semantic-lex-type-invalid (type) + "Signal that TYPE is an invalid lexical type name." + `(signal 'wrong-type-argument '(semantic-lex-type-p ,type))) + +(defsubst semantic-lex-type-symbol (type) + "Return symbol with TYPE or nil if not found." + (and (arrayp semantic-lex-types-obarray) + (stringp type) + (intern-soft type semantic-lex-types-obarray))) + +(defsubst semantic-lex-type-p (type) + "Return non-nil if a symbol with TYPE name exists." + (and (setq type (semantic-lex-type-symbol type)) + (symbol-value type))) + +(defsubst semantic-lex-type-set (type value) + "Set value of symbol with TYPE name to VALUE and return VALUE." + (set (intern type semantic-lex-types-obarray) value)) + +(defsubst semantic-lex-type-value (type &optional noerror) + "Return value of symbol with TYPE name. +If optional argument NOERROR is non-nil return nil if a symbol with +TYPE name does not exist. Otherwise signal an error." + (let ((sym (semantic-lex-type-symbol type))) + (if sym + (symbol-value sym) + (unless noerror + (semantic-lex-type-invalid type))))) + +(defsubst semantic-lex-type-put (type property value &optional add) + "For symbol with TYPE name, set its PROPERTY to VALUE. +If optional argument ADD is non-nil, create a new symbol with TYPE +name if it does not already exist. Otherwise signal an error." + (let ((sym (semantic-lex-type-symbol type))) + (unless sym + (or add (semantic-lex-type-invalid type)) + (semantic-lex-type-set type nil) + (setq sym (semantic-lex-type-symbol type))) + (put sym property value))) + +(defsubst semantic-lex-type-get (type property &optional noerror) + "For symbol with TYPE name, return its PROPERTY value. +If optional argument NOERROR is non-nil return nil if a symbol with +TYPE name does not exist. Otherwise signal an error." + (let ((sym (semantic-lex-type-symbol type))) + (if sym + (get sym property) + (unless noerror + (semantic-lex-type-invalid type))))) + +(defun semantic-lex-preset-default-types () + "Install useful default properties for well known types." + (semantic-lex-type-put "punctuation" 'matchdatatype 'string t) + (semantic-lex-type-put "punctuation" 'syntax "\\(\\s.\\|\\s$\\|\\s'\\)+") + (semantic-lex-type-put "keyword" 'matchdatatype 'keyword t) + (semantic-lex-type-put "keyword" 'syntax "\\(\\sw\\|\\s_\\)+") + (semantic-lex-type-put "symbol" 'matchdatatype 'regexp t) + (semantic-lex-type-put "symbol" 'syntax "\\(\\sw\\|\\s_\\)+") + (semantic-lex-type-put "string" 'matchdatatype 'sexp t) + (semantic-lex-type-put "string" 'syntax "\\s\"") + (semantic-lex-type-put "number" 'matchdatatype 'regexp t) + (semantic-lex-type-put "number" 'syntax 'semantic-lex-number-expression) + (semantic-lex-type-put "block" 'matchdatatype 'block t) + (semantic-lex-type-put "block" 'syntax "\\s(\\|\\s)") + ) + +(defun semantic-lex-make-type-table (specs &optional propspecs) + "Convert type SPECS into an obarray and return it. +SPECS must be a list of (TYPE . TOKENS) elements, where: + + TYPE is the name of the type symbol to define. + TOKENS is an list of (TOKSYM . MATCHER) elements, where: + + TOKSYM is any lexical token symbol. + MATCHER is a string or regexp a text must match to be a such + lexical token. + +If optional argument PROPSPECS is non nil, then interpret it, and +apply those properties. +PROPSPECS must be a list of (TYPE PROPERTY VALUE)." + ;; Create the symbol hash table + (let* ((semantic-lex-types-obarray (make-vector 13 0)) + spec type tokens token alist default) + ;; fill it with stuff + (while specs + (setq spec (car specs) + specs (cdr specs) + type (car spec) + tokens (cdr spec) + default nil + alist nil) + (while tokens + (setq token (car tokens) + tokens (cdr tokens)) + (if (cdr token) + (setq alist (cons token alist)) + (setq token (car token)) + (if default + (message + "*Warning* default value of <%s> tokens changed to %S, was %S" + type default token)) + (setq default token))) + ;; Ensure the default matching spec is the first one. + (semantic-lex-type-set type (cons default (nreverse alist)))) + ;; Install useful default types & properties + (semantic-lex-preset-default-types) + ;; Apply all properties + (while propspecs + (setq spec (car propspecs) + propspecs (cdr propspecs)) + ;; Create the type if necessary. + (semantic-lex-type-put (car spec) (nth 1 spec) (nth 2 spec) t)) + semantic-lex-types-obarray)) + +(defsubst semantic-lex-map-types (fun &optional property) + "Call function FUN on every lexical type. +If optional PROPERTY is non-nil, call FUN only on every type symbol +which as a PROPERTY value. FUN receives a type symbol as argument." + (semantic-lex-map-symbols + fun semantic-lex-types-obarray property)) + +(defun semantic-lex-types (&optional property) + "Return a list of lexical type symbols. +If optional PROPERTY is non-nil, return only type symbols which have +PROPERTY set." + (let (types) + (semantic-lex-map-types + #'(lambda (symbol) (setq types (cons symbol types))) + property) + types)) + +;;; Lexical Analyzer framework settings +;; + +(defvar semantic-lex-analyzer 'semantic-flex + "The lexical analyzer used for a given buffer. +See `semantic-lex' for documentation. +For compatibility with Semantic 1.x it defaults to `semantic-flex'.") +(make-variable-buffer-local 'semantic-lex-analyzer) + +(defvar semantic-lex-tokens + '( + (bol) + (charquote) + (close-paren) + (comment) + (newline) + (open-paren) + (punctuation) + (semantic-list) + (string) + (symbol) + (whitespace) + ) + "An alist of of semantic token types. +As of December 2001 (semantic 1.4beta13), this variable is not used in +any code. The only use is to refer to the doc-string from elsewhere. + +The key to this alist is the symbol representing token type that +\\[semantic-flex] returns. These are + + - bol: Empty string matching a beginning of line. + This token is produced with + `semantic-lex-beginning-of-line'. + + - charquote: String sequences that match `\\s\\+' regexp. + This token is produced with `semantic-lex-charquote'. + + - close-paren: Characters that match `\\s)' regexp. + These are typically `)', `}', `]', etc. + This token is produced with + `semantic-lex-close-paren'. + + - comment: A comment chunk. These token types are not + produced by default. + This token is produced with `semantic-lex-comments'. + Comments are ignored with `semantic-lex-ignore-comments'. + Comments are treated as whitespace with + `semantic-lex-comments-as-whitespace'. + + - newline Characters matching `\\s-*\\(\n\\|\\s>\\)' regexp. + This token is produced with `semantic-lex-newline'. + + - open-paren: Characters that match `\\s(' regexp. + These are typically `(', `{', `[', etc. + If `semantic-lex-paren-or-list' is used, + then `open-paren' is not usually generated unless + the `depth' argument to \\[semantic-lex] is + greater than 0. + This token is always produced if the analyzer + `semantic-lex-open-paren' is used. + + - punctuation: Characters matching `{\\(\\s.\\|\\s$\\|\\s'\\)' + regexp. + This token is produced with `semantic-lex-punctuation'. + Always specify this analyzer after the comment + analyzer. + + - semantic-list: String delimited by matching parenthesis, braces, + etc. that the lexer skipped over, because the + `depth' parameter to \\[semantic-flex] was not high + enough. + This token is produced with `semantic-lex-paren-or-list'. + + - string: Quoted strings, i.e., string sequences that start + and end with characters matching `\\s\"' + regexp. The lexer relies on @code{forward-sexp} to + find the matching end. + This token is produced with `semantic-lex-string'. + + - symbol: String sequences that match `\\(\\sw\\|\\s_\\)+' + regexp. + This token is produced with + `semantic-lex-symbol-or-keyword'. Always add this analyzer + after `semantic-lex-number', or other analyzers that + match its regular expression. + + - whitespace: Characters that match `\\s-+' regexp. + This token is produced with `semantic-lex-whitespace'.") + +(defvar semantic-lex-syntax-modifications nil + "Changes to the syntax table for this buffer. +These changes are active only while the buffer is being flexed. +This is a list where each element has the form: + (CHAR CLASS) +CHAR is the char passed to `modify-syntax-entry', +and CLASS is the string also passed to `modify-syntax-entry' to define +what syntax class CHAR has.") +(make-variable-buffer-local 'semantic-lex-syntax-modifications) + +(defvar semantic-lex-syntax-table nil + "Syntax table used by lexical analysis. +See also `semantic-lex-syntax-modifications'.") +(make-variable-buffer-local 'semantic-lex-syntax-table) + +(defvar semantic-lex-comment-regex nil + "Regular expression for identifying comment start during lexical analysis. +This may be automatically set when semantic initializes in a mode, but +may need to be overriden for some special languages.") +(make-variable-buffer-local 'semantic-lex-comment-regex) + +(defvar semantic-lex-number-expression + ;; This expression was written by David Ponce for Java, and copied + ;; here for C and any other similar language. + (eval-when-compile + (concat "\\(" + "\\<[0-9]+[.][0-9]+\\([eE][-+]?[0-9]+\\)?[fFdD]?\\>" + "\\|" + "\\<[0-9]+[.][eE][-+]?[0-9]+[fFdD]?\\>" + "\\|" + "\\<[0-9]+[.][fFdD]\\>" + "\\|" + "\\<[0-9]+[.]" + "\\|" + "[.][0-9]+\\([eE][-+]?[0-9]+\\)?[fFdD]?\\>" + "\\|" + "\\<[0-9]+[eE][-+]?[0-9]+[fFdD]?\\>" + "\\|" + "\\<0[xX][0-9a-fA-F]+[lL]?\\>" + "\\|" + "\\<[0-9]+[lLfFdD]?\\>" + "\\)" + )) + "Regular expression for matching a number. +If this value is nil, no number extraction is done during lex. +This expression tries to match C and Java like numbers. + +DECIMAL_LITERAL: + [1-9][0-9]* + ; +HEX_LITERAL: + 0[xX][0-9a-fA-F]+ + ; +OCTAL_LITERAL: + 0[0-7]* + ; +INTEGER_LITERAL: + <DECIMAL_LITERAL>[lL]? + | <HEX_LITERAL>[lL]? + | <OCTAL_LITERAL>[lL]? + ; +EXPONENT: + [eE][+-]?[09]+ + ; +FLOATING_POINT_LITERAL: + [0-9]+[.][0-9]*<EXPONENT>?[fFdD]? + | [.][0-9]+<EXPONENT>?[fFdD]? + | [0-9]+<EXPONENT>[fFdD]? + | [0-9]+<EXPONENT>?[fFdD] + ;") +(make-variable-buffer-local 'semantic-lex-number-expression) + +(defvar semantic-lex-depth 0 + "Default lexing depth. +This specifies how many lists to create tokens in.") +(make-variable-buffer-local 'semantic-lex-depth) + +(defvar semantic-lex-unterminated-syntax-end-function + (lambda (syntax syntax-start lex-end) lex-end) + "Function called when unterminated syntax is encountered. +This should be set to one function. That function should take three +parameters. The SYNTAX, or type of syntax which is unterminated. +SYNTAX-START where the broken syntax begins. +LEX-END is where the lexical analysis was asked to end. +This function can be used for languages that can intelligently fix up +broken syntax, or the exit lexical analysis via `throw' or `signal' +when finding unterminated syntax.") + +;;; Interactive testing commands + +(defun semantic-lex-test (arg) + "Test the semantic lexer in the current buffer. +If universal argument ARG, then try the whole buffer." + (interactive "P") + (let* ((start (current-time)) + (result (semantic-lex + (if arg (point-min) (point)) + (point-max))) + (end (current-time))) + (message "Elapsed Time: %.2f seconds." + (semantic-elapsed-time start end)) + (pop-to-buffer "*Lexer Output*") + (require 'pp) + (erase-buffer) + (insert (pp-to-string result)) + (goto-char (point-min)) + )) + +(defun semantic-lex-test-full-depth (arg) + "Test the semantic lexer in the current buffer parsing through lists. +Usually the lexer parses +If universal argument ARG, then try the whole buffer." + (interactive "P") + (let* ((start (current-time)) + (result (semantic-lex + (if arg (point-min) (point)) + (point-max) + 100)) + (end (current-time))) + (message "Elapsed Time: %.2f seconds." + (semantic-elapsed-time start end)) + (pop-to-buffer "*Lexer Output*") + (require 'pp) + (erase-buffer) + (insert (pp-to-string result)) + (goto-char (point-min)) + )) + +(defun semantic-lex-test-region (beg end) + "Test the semantic lexer in the current buffer. +Analyze the area between BEG and END." + (interactive "r") + (let ((result (semantic-lex beg end))) + (pop-to-buffer "*Lexer Output*") + (require 'pp) + (erase-buffer) + (insert (pp-to-string result)) + (goto-char (point-min)) + )) + +(defvar semantic-lex-debug nil + "When non-nil, debug the local lexical analyzer.") + +(defun semantic-lex-debug (arg) + "Debug the semantic lexer in the current buffer. +Argument ARG specifies of the analyze the whole buffer, or start at point. +While engaged, each token identified by the lexer will be highlighted +in the target buffer A description of the current token will be +displayed in the minibuffer. Press SPC to move to the next lexical token." + (interactive "P") + (require 'semantic/debug) + (let ((semantic-lex-debug t)) + (semantic-lex-test arg))) + +(defun semantic-lex-highlight-token (token) + "Highlight the lexical TOKEN. +TOKEN is a lexical token with a START And END position. +Return the overlay." + (let ((o (semantic-make-overlay (semantic-lex-token-start token) + (semantic-lex-token-end token)))) + (semantic-overlay-put o 'face 'highlight) + o)) + +(defsubst semantic-lex-debug-break (token) + "Break during lexical analysis at TOKEN." + (when semantic-lex-debug + (let ((o nil)) + (unwind-protect + (progn + (when token + (setq o (semantic-lex-highlight-token token))) + (semantic-read-event + (format "%S :: SPC - continue" token)) + ) + (when o + (semantic-overlay-delete o)))))) + +;;; Lexical analyzer creation +;; +;; Code for creating a lex function from lists of analyzers. +;; +;; A lexical analyzer is created from a list of individual analyzers. +;; Each individual analyzer specifies a single match, and code that +;; goes with it. +;; +;; Creation of an analyzer assembles these analyzers into a new function +;; with the behaviors of all the individual analyzers. +;; +(defmacro semantic-lex-one-token (analyzers) + "Calculate one token from the current buffer at point. +Uses locally bound variables from `define-lex'. +Argument ANALYZERS is the list of analyzers being used." + (cons 'cond (mapcar #'symbol-value analyzers))) + +(defvar semantic-lex-end-point nil + "The end point as tracked through lexical functions.") + +(defvar semantic-lex-current-depth nil + "The current depth as tracked through lexical functions.") + +(defvar semantic-lex-maximum-depth nil + "The maximum depth of parenthisis as tracked through lexical functions.") + +(defvar semantic-lex-token-stream nil + "The current token stream we are collecting.") + +(defvar semantic-lex-analysis-bounds nil + "The bounds of the current analysis.") + +(defvar semantic-lex-block-streams nil + "Streams of tokens inside collapsed blocks. +This is an alist of (ANCHOR . STREAM) elements where ANCHOR is the +start position of the block, and STREAM is the list of tokens in that +block.") + +(defvar semantic-lex-reset-hooks nil + "List of hooks major-modes use to reset lexical analyzers. +Hooks are called with START and END values for the current lexical pass. +Should be set with `add-hook'specifying a LOCAL option.") + +;; Stack of nested blocks. +(defvar semantic-lex-block-stack nil) +;;(defvar semantic-lex-timeout 5 +;; "*Number of sections of lexing before giving up.") + +(defmacro define-lex (name doc &rest analyzers) + "Create a new lexical analyzer with NAME. +DOC is a documentation string describing this analyzer. +ANALYZERS are small code snippets of analyzers to use when +building the new NAMED analyzer. Only use analyzers which +are written to be used in `define-lex'. +Each analyzer should be an analyzer created with `define-lex-analyzer'. +Note: The order in which analyzers are listed is important. +If two analyzers can match the same text, it is important to order the +analyzers so that the one you want to match first occurs first. For +example, it is good to put a numbe analyzer in front of a symbol +analyzer which might mistake a number for as a symbol." + `(defun ,name (start end &optional depth length) + ,(concat doc "\nSee `semantic-lex' for more information.") + ;; Make sure the state of block parsing starts over. + (setq semantic-lex-block-streams nil) + ;; Allow specialty reset items. + (run-hook-with-args 'semantic-lex-reset-hooks start end) + ;; Lexing state. + (let* (;(starttime (current-time)) + (starting-position (point)) + (semantic-lex-token-stream nil) + (semantic-lex-block-stack nil) + (tmp-start start) + (semantic-lex-end-point start) + (semantic-lex-current-depth 0) + ;; Use the default depth when not specified. + (semantic-lex-maximum-depth + (or depth semantic-lex-depth)) + ;; Bounds needed for unterminated syntax + (semantic-lex-analysis-bounds (cons start end)) + ;; This entry prevents text properties from + ;; confusing our lexical analysis. See Emacs 22 (CVS) + ;; version of C++ mode with template hack text properties. + (parse-sexp-lookup-properties nil) + ) + ;; Maybe REMOVE THIS LATER. + ;; Trying to find incremental parser bug. + (when (> end (point-max)) + (error ,(format "%s: end (%%d) > point-max (%%d)" name) + end (point-max))) + (with-syntax-table semantic-lex-syntax-table + (goto-char start) + (while (and (< (point) end) + (or (not length) + (<= (length semantic-lex-token-stream) length))) + (semantic-lex-one-token ,analyzers) + (when (eq semantic-lex-end-point tmp-start) + (error ,(format "%s: endless loop at %%d, after %%S" name) + tmp-start (car semantic-lex-token-stream))) + (setq tmp-start semantic-lex-end-point) + (goto-char semantic-lex-end-point) + ;;(when (> (semantic-elapsed-time starttime (current-time)) + ;; semantic-lex-timeout) + ;; (error "Timeout during lex at char %d" (point))) + (semantic-throw-on-input 'lex) + (semantic-lex-debug-break (car semantic-lex-token-stream)) + )) + ;; Check that there is no unterminated block. + (when semantic-lex-block-stack + (let* ((last (pop semantic-lex-block-stack)) + (blk last)) + (while blk + (message + ,(format "%s: `%%s' block from %%S is unterminated" name) + (car blk) (cadr blk)) + (setq blk (pop semantic-lex-block-stack))) + (semantic-lex-unterminated-syntax-detected (car last)))) + ;; Return to where we started. + ;; Do not wrap in protective stuff so that if there is an error + ;; thrown, the user knows where. + (goto-char starting-position) + ;; Return the token stream + (nreverse semantic-lex-token-stream)))) + +;;; Collapsed block tokens delimited by any tokens. +;; +(defun semantic-lex-start-block (syntax) + "Mark the last read token as the beginning of a SYNTAX block." + (if (or (not semantic-lex-maximum-depth) + (< semantic-lex-current-depth semantic-lex-maximum-depth)) + (setq semantic-lex-current-depth (1+ semantic-lex-current-depth)) + (push (list syntax (car semantic-lex-token-stream)) + semantic-lex-block-stack))) + +(defun semantic-lex-end-block (syntax) + "Process the end of a previously marked SYNTAX block. +That is, collapse the tokens inside that block, including the +beginning and end of block tokens, into a high level block token of +class SYNTAX. +The token at beginning of block is the one marked by a previous call +to `semantic-lex-start-block'. The current token is the end of block. +The collapsed tokens are saved in `semantic-lex-block-streams'." + (if (null semantic-lex-block-stack) + (setq semantic-lex-current-depth (1- semantic-lex-current-depth)) + (let* ((stream semantic-lex-token-stream) + (blk (pop semantic-lex-block-stack)) + (bstream (cdr blk)) + (first (car bstream)) + (last (pop stream)) ;; The current token mark the EOBLK + tok) + (if (not (eq (car blk) syntax)) + ;; SYNTAX doesn't match the syntax of the current block in + ;; the stack. So we encountered the end of the SYNTAX block + ;; before the end of the current one in the stack which is + ;; signaled unterminated. + (semantic-lex-unterminated-syntax-detected (car blk)) + ;; Move tokens found inside the block from the main stream + ;; into a separate block stream. + (while (and stream (not (eq (setq tok (pop stream)) first))) + (push tok bstream)) + ;; The token marked as beginning of block was not encountered. + ;; This should not happen! + (or (eq tok first) + (error "Token %S not found at beginning of block `%s'" + first syntax)) + ;; Save the block stream for future reuse, to avoid to redo + ;; the lexical analysis of the block content! + ;; Anchor the block stream with its start position, so we can + ;; use: (cdr (assq start semantic-lex-block-streams)) to + ;; quickly retrieve the lexical stream associated to a block. + (setcar blk (semantic-lex-token-start first)) + (setcdr blk (nreverse bstream)) + (push blk semantic-lex-block-streams) + ;; In the main stream, replace the tokens inside the block by + ;; a high level block token of class SYNTAX. + (setq semantic-lex-token-stream stream) + (semantic-lex-push-token + (semantic-lex-token + syntax (car blk) (semantic-lex-token-end last))) + )))) + +;;; Lexical token API +;; +;; Functions for accessing parts of a token. Use these functions +;; instead of accessing the list structure directly because the +;; contents of the lexical may change. +;; +(defmacro semantic-lex-token (symbol start end &optional str) + "Create a lexical token. +SYMBOL is a symbol representing the class of syntax found. +START and END define the bounds of the token in the current buffer. +Optional STR is the string for the token iff the the bounds +in the buffer do not cover the string they represent. (As from +macro expansion.)" + ;; This if statement checks the existance of a STR argument at + ;; compile time, where STR is some symbol or constant. If the + ;; variable STr (runtime) is nil, this will make an incorrect decision. + ;; + ;; It is like this to maintain the original speed of the compiled + ;; code. + (if str + `(cons ,symbol (cons ,str (cons ,start ,end))) + `(cons ,symbol (cons ,start ,end)))) + +(defun semantic-lex-token-p (thing) + "Return non-nil if THING is a semantic lex token. +This is an exhaustively robust check." + (and (consp thing) + (symbolp (car thing)) + (or (and (numberp (nth 1 thing)) + (numberp (nthcdr 2 thing))) + (and (stringp (nth 1 thing)) + (numberp (nth 2 thing)) + (numberp (nthcdr 3 thing))) + )) + ) + +(defun semantic-lex-token-with-text-p (thing) + "Return non-nil if THING is a semantic lex token. +This is an exhaustively robust check." + (and (consp thing) + (symbolp (car thing)) + (= (length thing) 4) + (stringp (nth 1 thing)) + (numberp (nth 2 thing)) + (numberp (nth 3 thing))) + ) + +(defun semantic-lex-token-without-text-p (thing) + "Return non-nil if THING is a semantic lex token. +This is an exhaustively robust check." + (and (consp thing) + (symbolp (car thing)) + (= (length thing) 3) + (numberp (nth 1 thing)) + (numberp (nth 2 thing))) + ) + +(defun semantic-lex-expand-block-specs (specs) + "Expand block specifications SPECS into a Lisp form. +SPECS is a list of (BLOCK BEGIN END) elements where BLOCK, BEGIN, and +END are token class symbols that indicate to produce one collapsed +BLOCK token from tokens found between BEGIN and END ones. +BLOCK must be a non-nil symbol, and at least one of the BEGIN or END +symbols must be non-nil too. +When BEGIN is non-nil, generate a call to `semantic-lex-start-block' +when a BEGIN token class is encountered. +When END is non-nil, generate a call to `semantic-lex-end-block' when +an END token class is encountered." + (let ((class (make-symbol "class")) + (form nil)) + (dolist (spec specs) + (when (car spec) + (when (nth 1 spec) + (push `((eq ',(nth 1 spec) ,class) + (semantic-lex-start-block ',(car spec))) + form)) + (when (nth 2 spec) + (push `((eq ',(nth 2 spec) ,class) + (semantic-lex-end-block ',(car spec))) + form)))) + (when form + `((let ((,class (semantic-lex-token-class + (car semantic-lex-token-stream)))) + (cond ,@(nreverse form)))) + ))) + +(defmacro semantic-lex-push-token (token &rest blockspecs) + "Push TOKEN in the lexical analyzer token stream. +Return the lexical analysis current end point. +If optional arguments BLOCKSPECS is non-nil, it specifies to process +collapsed block tokens. See `semantic-lex-expand-block-specs' for +more details. +This macro should only be called within the bounds of +`define-lex-analyzer'. It changes the values of the lexical analyzer +variables `token-stream' and `semantic-lex-end-point'. If you need to +move `semantic-lex-end-point' somewhere else, just modify this +variable after calling `semantic-lex-push-token'." + `(progn + (push ,token semantic-lex-token-stream) + ,@(semantic-lex-expand-block-specs blockspecs) + (setq semantic-lex-end-point + (semantic-lex-token-end (car semantic-lex-token-stream))) + )) + +(defsubst semantic-lex-token-class (token) + "Fetch the class of the lexical token TOKEN. +See also the function `semantic-lex-token'." + (car token)) + +(defsubst semantic-lex-token-bounds (token) + "Fetch the start and end locations of the lexical token TOKEN. +Return a pair (START . END)." + (if (not (numberp (car (cdr token)))) + (cdr (cdr token)) + (cdr token))) + +(defsubst semantic-lex-token-start (token) + "Fetch the start position of the lexical token TOKEN. +See also the function `semantic-lex-token'." + (car (semantic-lex-token-bounds token))) + +(defsubst semantic-lex-token-end (token) + "Fetch the end position of the lexical token TOKEN. +See also the function `semantic-lex-token'." + (cdr (semantic-lex-token-bounds token))) + +(defsubst semantic-lex-token-text (token) + "Fetch the text associated with the lexical token TOKEN. +See also the function `semantic-lex-token'." + (if (stringp (car (cdr token))) + (car (cdr token)) + (buffer-substring-no-properties + (semantic-lex-token-start token) + (semantic-lex-token-end token)))) + +(defun semantic-lex-init () + "Initialize any lexical state for this buffer." + (unless semantic-lex-comment-regex + (setq semantic-lex-comment-regex + (if comment-start-skip + (concat "\\(\\s<\\|" comment-start-skip "\\)") + "\\(\\s<\\)"))) + ;; Setup the lexer syntax-table + (setq semantic-lex-syntax-table (copy-syntax-table (syntax-table))) + (dolist (mod semantic-lex-syntax-modifications) + (modify-syntax-entry + (car mod) (nth 1 mod) semantic-lex-syntax-table))) + +(define-overloadable-function semantic-lex (start end &optional depth length) + "Lexically analyze text in the current buffer between START and END. +Optional argument DEPTH indicates at what level to scan over entire +lists. The last argument, LENGTH specifies that `semantic-lex' +should only return LENGTH tokens. The return value is a token stream. +Each element is a list, such of the form + (symbol start-expression . end-expression) +where SYMBOL denotes the token type. +See `semantic-lex-tokens' variable for details on token types. END +does not mark the end of the text scanned, only the end of the +beginning of text scanned. Thus, if a string extends past END, the +end of the return token will be larger than END. To truly restrict +scanning, use `narrow-to-region'." + (funcall semantic-lex-analyzer start end depth length)) + +(defsubst semantic-lex-buffer (&optional depth) + "Lex the current buffer. +Optional argument DEPTH is the depth to scan into lists." + (semantic-lex (point-min) (point-max) depth)) + +(defsubst semantic-lex-list (semlist depth) + "Lex the body of SEMLIST to DEPTH." + (semantic-lex (semantic-lex-token-start semlist) + (semantic-lex-token-end semlist) + depth)) + +;;; Analyzer creation macros +;; +;; An individual analyzer is a condition and code that goes with it. +;; +;; Created analyzers become variables with the code associated with them +;; as the symbol value. These analyzers are assembled into a lexer +;; to create new lexical analyzers. +;; +(defsubst semantic-lex-unterminated-syntax-detected (syntax) + "Inside a lexical analyzer, use this when unterminated syntax was found. +Argument SYNTAX indicates the type of syntax that is unterminated. +The job of this function is to move (point) to a new logical location +so that analysis can continue, if possible." + (goto-char + (funcall semantic-lex-unterminated-syntax-end-function + syntax + (car semantic-lex-analysis-bounds) + (cdr semantic-lex-analysis-bounds) + )) + (setq semantic-lex-end-point (point))) + +(defcustom semantic-lex-debug-analyzers nil + "Non nil means to debug analyzers with syntax protection. +Only in effect if `debug-on-error' is also non-nil." + :group 'semantic + :type 'boolean) + +(defmacro semantic-lex-unterminated-syntax-protection (syntax &rest forms) + "For SYNTAX, execute FORMS with protection for unterminated syntax. +If FORMS throws an error, treat this as a syntax problem, and +execute the unterminated syntax code. FORMS should return a position. +Irreguardless of an error, the cursor should be moved to the end of +the desired syntax, and a position returned. +If `debug-on-error' is set, errors are not caught, so that you can +debug them. +Avoid using a large FORMS since it is duplicated." + `(if (and debug-on-error semantic-lex-debug-analyzers) + (progn ,@forms) + (condition-case nil + (progn ,@forms) + (error + (semantic-lex-unterminated-syntax-detected ,syntax))))) +(put 'semantic-lex-unterminated-syntax-protection + 'lisp-indent-function 1) + +(defmacro define-lex-analyzer (name doc condition &rest forms) + "Create a single lexical analyzer NAME with DOC. +When an analyzer is called, the current buffer and point are +positioned in a buffer at the location to be analyzed. +CONDITION is an expression which returns t if FORMS should be run. +Within the bounds of CONDITION and FORMS, the use of backquote +can be used to evaluate expressions at compile time. +While forms are running, the following variables will be locally bound: + `semantic-lex-analysis-bounds' - The bounds of the current analysis. + of the form (START . END) + `semantic-lex-maximum-depth' - The maximum depth of semantic-list + for the current analysis. + `semantic-lex-current-depth' - The current depth of `semantic-list' that has + been decended. + `semantic-lex-end-point' - End Point after match. + Analyzers should set this to a buffer location if their + match string does not represent the end of the matched text. + `semantic-lex-token-stream' - The token list being collected. + Add new lexical tokens to this list. +Proper action in FORMS is to move the value of `semantic-lex-end-point' to +after the location of the analyzed entry, and to add any discovered tokens +at the beginning of `semantic-lex-token-stream'. +This can be done by using `semantic-lex-push-token'." + `(eval-and-compile + (defvar ,name nil ,doc) + (defun ,name nil) + ;; Do this part separately so that re-evaluation rebuilds this code. + (setq ,name '(,condition ,@forms)) + ;; Build a single lexical analyzer function, so the doc for + ;; function help is automatically provided, and perhaps the + ;; function could be useful for testing and debugging one + ;; analyzer. + (fset ',name (lambda () ,doc + (let ((semantic-lex-token-stream nil) + (semantic-lex-end-point (point)) + (semantic-lex-analysis-bounds + (cons (point) (point-max))) + (semantic-lex-current-depth 0) + (semantic-lex-maximum-depth + semantic-lex-depth) + ) + (when ,condition ,@forms) + semantic-lex-token-stream))) + )) + +(defmacro define-lex-regex-analyzer (name doc regexp &rest forms) + "Create a lexical analyzer with NAME and DOC that will match REGEXP. +FORMS are evaluated upon a successful match. +See `define-lex-analyzer' for more about analyzers." + `(define-lex-analyzer ,name + ,doc + (looking-at ,regexp) + ,@forms + )) + +(defmacro define-lex-simple-regex-analyzer (name doc regexp toksym + &optional index + &rest forms) + "Create a lexical analyzer with NAME and DOC that match REGEXP. +TOKSYM is the symbol to use when creating a semantic lexical token. +INDEX is the index into the match that defines the bounds of the token. +Index should be a plain integer, and not specified in the macro as an +expression. +FORMS are evaluated upon a successful match BEFORE the new token is +created. It is valid to ignore FORMS. +See `define-lex-analyzer' for more about analyzers." + `(define-lex-analyzer ,name + ,doc + (looking-at ,regexp) + ,@forms + (semantic-lex-push-token + (semantic-lex-token ,toksym + (match-beginning ,(or index 0)) + (match-end ,(or index 0)))) + )) + +(defmacro define-lex-block-analyzer (name doc spec1 &rest specs) + "Create a lexical analyzer NAME for paired delimiters blocks. +It detects a paired delimiters block or the corresponding open or +close delimiter depending on the value of the variable +`semantic-lex-current-depth'. DOC is the documentation string of the lexical +analyzer. SPEC1 and SPECS specify the token symbols and open, close +delimiters used. Each SPEC has the form: + +\(BLOCK-SYM (OPEN-DELIM OPEN-SYM) (CLOSE-DELIM CLOSE-SYM)) + +where BLOCK-SYM is the symbol returned in a block token. OPEN-DELIM +and CLOSE-DELIM are respectively the open and close delimiters +identifying a block. OPEN-SYM and CLOSE-SYM are respectively the +symbols returned in open and close tokens." + (let ((specs (cons spec1 specs)) + spec open olist clist) + (while specs + (setq spec (car specs) + specs (cdr specs) + open (nth 1 spec) + ;; build alist ((OPEN-DELIM OPEN-SYM BLOCK-SYM) ...) + olist (cons (list (car open) (cadr open) (car spec)) olist) + ;; build alist ((CLOSE-DELIM CLOSE-SYM) ...) + clist (cons (nth 2 spec) clist))) + `(define-lex-analyzer ,name + ,doc + (and + (looking-at "\\(\\s(\\|\\s)\\)") + (let ((text (match-string 0)) match) + (cond + ((setq match (assoc text ',olist)) + (if (or (not semantic-lex-maximum-depth) + (< semantic-lex-current-depth semantic-lex-maximum-depth)) + (progn + (setq semantic-lex-current-depth (1+ semantic-lex-current-depth)) + (semantic-lex-push-token + (semantic-lex-token + (nth 1 match) + (match-beginning 0) (match-end 0)))) + (semantic-lex-push-token + (semantic-lex-token + (nth 2 match) + (match-beginning 0) + (save-excursion + (semantic-lex-unterminated-syntax-protection (nth 2 match) + (forward-list 1) + (point))) + )) + )) + ((setq match (assoc text ',clist)) + (setq semantic-lex-current-depth (1- semantic-lex-current-depth)) + (semantic-lex-push-token + (semantic-lex-token + (nth 1 match) + (match-beginning 0) (match-end 0))))))) + ))) + +;;; Analyzers +;; +;; Pre-defined common analyzers. +;; +(define-lex-analyzer semantic-lex-default-action + "The default action when no other lexical actions match text. +This action will just throw an error." + t + (error "Unmatched Text during Lexical Analysis")) + +(define-lex-analyzer semantic-lex-beginning-of-line + "Detect and create a beginning of line token (BOL)." + (and (bolp) + ;; Just insert a (bol N . N) token in the token stream, + ;; without moving the point. N is the point at the + ;; beginning of line. + (semantic-lex-push-token (semantic-lex-token 'bol (point) (point))) + nil) ;; CONTINUE + ;; We identify and add the BOL token onto the stream, but since + ;; semantic-lex-end-point doesn't move, we always fail CONDITION, and have no + ;; FORMS body. + nil) + +(define-lex-simple-regex-analyzer semantic-lex-newline + "Detect and create newline tokens." + "\\s-*\\(\n\\|\\s>\\)" 'newline 1) + +(define-lex-regex-analyzer semantic-lex-newline-as-whitespace + "Detect and create newline tokens. +Use this ONLY if newlines are not whitespace characters (such as when +they are comment end characters) AND when you want whitespace tokens." + "\\s-*\\(\n\\|\\s>\\)" + ;; Language wants whitespaces. Create a token for it. + (if (eq (semantic-lex-token-class (car semantic-lex-token-stream)) + 'whitespace) + ;; Merge whitespace tokens together if they are adjacent. Two + ;; whitespace tokens may be sperated by a comment which is not in + ;; the token stream. + (setcdr (semantic-lex-token-bounds (car semantic-lex-token-stream)) + (match-end 0)) + (semantic-lex-push-token + (semantic-lex-token + 'whitespace (match-beginning 0) (match-end 0))))) + +(define-lex-regex-analyzer semantic-lex-ignore-newline + "Detect and ignore newline tokens. +Use this ONLY if newlines are not whitespace characters (such as when +they are comment end characters)." + "\\s-*\\(\n\\|\\s>\\)" + (setq semantic-lex-end-point (match-end 0))) + +(define-lex-regex-analyzer semantic-lex-whitespace + "Detect and create whitespace tokens." + ;; catch whitespace when needed + "\\s-+" + ;; Language wants whitespaces. Create a token for it. + (if (eq (semantic-lex-token-class (car semantic-lex-token-stream)) + 'whitespace) + ;; Merge whitespace tokens together if they are adjacent. Two + ;; whitespace tokens may be sperated by a comment which is not in + ;; the token stream. + (progn + (setq semantic-lex-end-point (match-end 0)) + (setcdr (semantic-lex-token-bounds (car semantic-lex-token-stream)) + semantic-lex-end-point)) + (semantic-lex-push-token + (semantic-lex-token + 'whitespace (match-beginning 0) (match-end 0))))) + +(define-lex-regex-analyzer semantic-lex-ignore-whitespace + "Detect and skip over whitespace tokens." + ;; catch whitespace when needed + "\\s-+" + ;; Skip over the detected whitespace, do not create a token for it. + (setq semantic-lex-end-point (match-end 0))) + +(define-lex-simple-regex-analyzer semantic-lex-number + "Detect and create number tokens. +See `semantic-lex-number-expression' for details on matching numbers, +and number formats." + semantic-lex-number-expression 'number) + +(define-lex-regex-analyzer semantic-lex-symbol-or-keyword + "Detect and create symbol and keyword tokens." + "\\(\\sw\\|\\s_\\)+" + (semantic-lex-push-token + (semantic-lex-token + (or (semantic-lex-keyword-p (match-string 0)) 'symbol) + (match-beginning 0) (match-end 0)))) + +(define-lex-simple-regex-analyzer semantic-lex-charquote + "Detect and create charquote tokens." + ;; Character quoting characters (ie, \n as newline) + "\\s\\+" 'charquote) + +(define-lex-simple-regex-analyzer semantic-lex-punctuation + "Detect and create punctuation tokens." + "\\(\\s.\\|\\s$\\|\\s'\\)" 'punctuation) + +(define-lex-analyzer semantic-lex-punctuation-type + "Detect and create a punctuation type token. +Recognized punctuations are defined in the current table of lexical +types, as the value of the `punctuation' token type." + (and (looking-at "\\(\\s.\\|\\s$\\|\\s'\\)+") + (let* ((key (match-string 0)) + (pos (match-beginning 0)) + (end (match-end 0)) + (len (- end pos)) + (lst (semantic-lex-type-value "punctuation" t)) + (def (car lst)) ;; default lexical symbol or nil + (lst (cdr lst)) ;; alist of (LEX-SYM . PUNCT-STRING) + (elt nil)) + (if lst + ;; Starting with the longest one, search if the + ;; punctuation string is defined for this language. + (while (and (> len 0) (not (setq elt (rassoc key lst)))) + (setq len (1- len) + key (substring key 0 len)))) + (if elt ;; Return the punctuation token found + (semantic-lex-push-token + (semantic-lex-token (car elt) pos (+ pos len))) + (if def ;; Return a default generic token + (semantic-lex-push-token + (semantic-lex-token def pos end)) + ;; Nothing match + ))))) + +(define-lex-regex-analyzer semantic-lex-paren-or-list + "Detect open parenthesis. +Return either a paren token or a semantic list token depending on +`semantic-lex-current-depth'." + "\\s(" + (if (or (not semantic-lex-maximum-depth) + (< semantic-lex-current-depth semantic-lex-maximum-depth)) + (progn + (setq semantic-lex-current-depth (1+ semantic-lex-current-depth)) + (semantic-lex-push-token + (semantic-lex-token + 'open-paren (match-beginning 0) (match-end 0)))) + (semantic-lex-push-token + (semantic-lex-token + 'semantic-list (match-beginning 0) + (save-excursion + (semantic-lex-unterminated-syntax-protection 'semantic-list + (forward-list 1) + (point)) + ))) + )) + +(define-lex-simple-regex-analyzer semantic-lex-open-paren + "Detect and create an open parenthisis token." + "\\s(" 'open-paren 0 (setq semantic-lex-current-depth (1+ semantic-lex-current-depth))) + +(define-lex-simple-regex-analyzer semantic-lex-close-paren + "Detect and create a close paren token." + "\\s)" 'close-paren 0 (setq semantic-lex-current-depth (1- semantic-lex-current-depth))) + +(define-lex-regex-analyzer semantic-lex-string + "Detect and create a string token." + "\\s\"" + ;; Zing to the end of this string. + (semantic-lex-push-token + (semantic-lex-token + 'string (point) + (save-excursion + (semantic-lex-unterminated-syntax-protection 'string + (forward-sexp 1) + (point)) + )))) + +(define-lex-regex-analyzer semantic-lex-comments + "Detect and create a comment token." + semantic-lex-comment-regex + (save-excursion + (forward-comment 1) + ;; Generate newline token if enabled + (if (bolp) (backward-char 1)) + (setq semantic-lex-end-point (point)) + ;; Language wants comments or want them as whitespaces, + ;; link them together. + (if (eq (semantic-lex-token-class (car semantic-lex-token-stream)) 'comment) + (setcdr (semantic-lex-token-bounds (car semantic-lex-token-stream)) + semantic-lex-end-point) + (semantic-lex-push-token + (semantic-lex-token + 'comment (match-beginning 0) semantic-lex-end-point))))) + +(define-lex-regex-analyzer semantic-lex-comments-as-whitespace + "Detect comments and create a whitespace token." + semantic-lex-comment-regex + (save-excursion + (forward-comment 1) + ;; Generate newline token if enabled + (if (bolp) (backward-char 1)) + (setq semantic-lex-end-point (point)) + ;; Language wants comments or want them as whitespaces, + ;; link them together. + (if (eq (semantic-lex-token-class (car semantic-lex-token-stream)) 'whitespace) + (setcdr (semantic-lex-token-bounds (car semantic-lex-token-stream)) + semantic-lex-end-point) + (semantic-lex-push-token + (semantic-lex-token + 'whitespace (match-beginning 0) semantic-lex-end-point))))) + +(define-lex-regex-analyzer semantic-lex-ignore-comments + "Detect and create a comment token." + semantic-lex-comment-regex + (let ((comment-start-point (point))) + (forward-comment 1) + (if (eq (point) comment-start-point) + ;; In this case our start-skip string failed + ;; to work properly. Lets try and move over + ;; whatever white space we matched to begin + ;; with. + (skip-syntax-forward "-.'" + (save-excursion + (end-of-line) + (point))) + ;; We may need to back up so newlines or whitespace is generated. + (if (bolp) + (backward-char 1))) + (if (eq (point) comment-start-point) + (error "Strange comment syntax prevents lexical analysis")) + (setq semantic-lex-end-point (point)))) + +;;; Comment lexer +;; +;; Predefined lexers that could be used instead of creating new +;; analyers. + +(define-lex semantic-comment-lexer + "A simple lexical analyzer that handles comments. +This lexer will only return comment tokens. It is the default lexer +used by `semantic-find-doc-snarf-comment' to snarf up the comment at +point." + semantic-lex-ignore-whitespace + semantic-lex-ignore-newline + semantic-lex-comments + semantic-lex-default-action) + +;;; Test Lexer +;; +(define-lex semantic-simple-lexer + "A simple lexical analyzer that handles simple buffers. +This lexer ignores comments and whitespace, and will return +syntax as specified by the syntax table." + semantic-lex-ignore-whitespace + semantic-lex-ignore-newline + semantic-lex-number + semantic-lex-symbol-or-keyword + semantic-lex-charquote + semantic-lex-paren-or-list + semantic-lex-close-paren + semantic-lex-string + semantic-lex-ignore-comments + semantic-lex-punctuation + semantic-lex-default-action) + +;;; Analyzers generated from grammar. +;; +;; Some analyzers are hand written. Analyzers created with these +;; functions are generated from the grammar files. + +(defmacro define-lex-keyword-type-analyzer (name doc syntax) + "Define a keyword type analyzer NAME with DOC string. +SYNTAX is the regexp that matches a keyword syntactic expression." + (let ((key (make-symbol "key"))) + `(define-lex-analyzer ,name + ,doc + (and (looking-at ,syntax) + (let ((,key (semantic-lex-keyword-p (match-string 0)))) + (when ,key + (semantic-lex-push-token + (semantic-lex-token + ,key (match-beginning 0) (match-end 0))))))) + )) + +(defmacro define-lex-sexp-type-analyzer (name doc syntax token) + "Define a sexp type analyzer NAME with DOC string. +SYNTAX is the regexp that matches the beginning of the s-expression. +TOKEN is the lexical token returned when SYNTAX matches." + `(define-lex-regex-analyzer ,name + ,doc + ,syntax + (semantic-lex-push-token + (semantic-lex-token + ,token (point) + (save-excursion + (semantic-lex-unterminated-syntax-protection ,token + (forward-sexp 1) + (point)))))) + ) + +(defmacro define-lex-regex-type-analyzer (name doc syntax matches default) + "Define a regexp type analyzer NAME with DOC string. +SYNTAX is the regexp that matches a syntactic expression. +MATCHES is an alist of lexical elements used to refine the syntactic +expression. +DEFAULT is the default lexical token returned when no MATCHES." + (if matches + (let* ((val (make-symbol "val")) + (lst (make-symbol "lst")) + (elt (make-symbol "elt")) + (pos (make-symbol "pos")) + (end (make-symbol "end"))) + `(define-lex-analyzer ,name + ,doc + (and (looking-at ,syntax) + (let* ((,val (match-string 0)) + (,pos (match-beginning 0)) + (,end (match-end 0)) + (,lst ,matches) + ,elt) + (while (and ,lst (not ,elt)) + (if (string-match (cdar ,lst) ,val) + (setq ,elt (caar ,lst)) + (setq ,lst (cdr ,lst)))) + (semantic-lex-push-token + (semantic-lex-token (or ,elt ,default) ,pos ,end)))) + )) + `(define-lex-simple-regex-analyzer ,name + ,doc + ,syntax ,default) + )) + +(defmacro define-lex-string-type-analyzer (name doc syntax matches default) + "Define a string type analyzer NAME with DOC string. +SYNTAX is the regexp that matches a syntactic expression. +MATCHES is an alist of lexical elements used to refine the syntactic +expression. +DEFAULT is the default lexical token returned when no MATCHES." + (if matches + (let* ((val (make-symbol "val")) + (lst (make-symbol "lst")) + (elt (make-symbol "elt")) + (pos (make-symbol "pos")) + (end (make-symbol "end")) + (len (make-symbol "len"))) + `(define-lex-analyzer ,name + ,doc + (and (looking-at ,syntax) + (let* ((,val (match-string 0)) + (,pos (match-beginning 0)) + (,end (match-end 0)) + (,len (- ,end ,pos)) + (,lst ,matches) + ,elt) + ;; Starting with the longest one, search if a lexical + ;; value match a token defined for this language. + (while (and (> ,len 0) (not (setq ,elt (rassoc ,val ,lst)))) + (setq ,len (1- ,len) + ,val (substring ,val 0 ,len))) + (when ,elt ;; Adjust token end position. + (setq ,elt (car ,elt) + ,end (+ ,pos ,len))) + (semantic-lex-push-token + (semantic-lex-token (or ,elt ,default) ,pos ,end)))) + )) + `(define-lex-simple-regex-analyzer ,name + ,doc + ,syntax ,default) + )) + +(defmacro define-lex-block-type-analyzer (name doc syntax matches) + "Define a block type analyzer NAME with DOC string. + +SYNTAX is the regexp that matches block delimiters, typically the +open (`\\\\s(') and close (`\\\\s)') parenthesis syntax classes. + +MATCHES is a pair (OPEN-SPECS . CLOSE-SPECS) that defines blocks. + + OPEN-SPECS is a list of (OPEN-DELIM OPEN-TOKEN BLOCK-TOKEN) elements + where: + + OPEN-DELIM is a string: the block open delimiter character. + + OPEN-TOKEN is the lexical token class associated to the OPEN-DELIM + delimiter. + + BLOCK-TOKEN is the lexical token class associated to the block + that starts at the OPEN-DELIM delimiter. + + CLOSE-SPECS is a list of (CLOSE-DELIM CLOSE-TOKEN) elements where: + + CLOSE-DELIM is a string: the block end delimiter character. + + CLOSE-TOKEN is the lexical token class associated to the + CLOSE-DELIM delimiter. + +Each element in OPEN-SPECS must have a corresponding element in +CLOSE-SPECS. + +The lexer will return a BLOCK-TOKEN token when the value of +`semantic-lex-current-depth' is greater than or equal to the maximum +depth of parenthesis tracking (see also the function `semantic-lex'). +Otherwise it will return OPEN-TOKEN and CLOSE-TOKEN tokens. + +TO DO: Put the following in the developer's guide and just put a +reference here. + +In the grammar: + +The value of a block token must be a string that contains a readable +sexp of the form: + + \"(OPEN-TOKEN CLOSE-TOKEN)\" + +OPEN-TOKEN and CLOSE-TOKEN represent the block delimiters, and must be +lexical tokens of respectively `open-paren' and `close-paren' types. +Their value is the corresponding delimiter character as a string. + +Here is a small example to analyze a parenthesis block: + + %token <block> PAREN_BLOCK \"(LPAREN RPAREN)\" + %token <open-paren> LPAREN \"(\" + %token <close-paren> RPAREN \")\" + +When the lexer encounters the open-paren delimiter \"(\": + + - If the maximum depth of parenthesis tracking is not reached (that + is, current depth < max depth), it returns a (LPAREN start . end) + token, then continue analysis inside the block. Later, when the + corresponding close-paren delimiter \")\" will be encountered, it + will return a (RPAREN start . end) token. + + - If the maximum depth of parenthesis tracking is reached (current + depth >= max depth), it returns the whole parenthesis block as + a (PAREN_BLOCK start . end) token." + (let* ((val (make-symbol "val")) + (lst (make-symbol "lst")) + (elt (make-symbol "elt"))) + `(define-lex-analyzer ,name + ,doc + (and + (looking-at ,syntax) ;; "\\(\\s(\\|\\s)\\)" + (let ((,val (match-string 0)) + (,lst ,matches) + ,elt) + (cond + ((setq ,elt (assoc ,val (car ,lst))) + (if (or (not semantic-lex-maximum-depth) + (< semantic-lex-current-depth semantic-lex-maximum-depth)) + (progn + (setq semantic-lex-current-depth (1+ semantic-lex-current-depth)) + (semantic-lex-push-token + (semantic-lex-token + (nth 1 ,elt) + (match-beginning 0) (match-end 0)))) + (semantic-lex-push-token + (semantic-lex-token + (nth 2 ,elt) + (match-beginning 0) + (save-excursion + (semantic-lex-unterminated-syntax-protection (nth 2 ,elt) + (forward-list 1) + (point))))))) + ((setq ,elt (assoc ,val (cdr ,lst))) + (setq semantic-lex-current-depth (1- semantic-lex-current-depth)) + (semantic-lex-push-token + (semantic-lex-token + (nth 1 ,elt) + (match-beginning 0) (match-end 0)))) + )))) + )) + +;;; Lexical Safety +;; +;; The semantic lexers, unlike other lexers, can throw errors on +;; unbalanced syntax. Since editing is all about changeging test +;; we need to provide a convenient way to protect against syntactic +;; inequalities. + +(defmacro semantic-lex-catch-errors (symbol &rest forms) + "Using SYMBOL, execute FORMS catching lexical errors. +If FORMS results in a call to the parser that throws a lexical error, +the error will be caught here without the buffer's cache being thrown +out of date. +If there is an error, the syntax that failed is returned. +If there is no error, then the last value of FORMS is returned." + (let ((ret (make-symbol "ret")) + (syntax (make-symbol "syntax")) + (start (make-symbol "start")) + (end (make-symbol "end"))) + `(let* ((semantic-lex-unterminated-syntax-end-function + (lambda (,syntax ,start ,end) + (throw ',symbol ,syntax))) + ;; Delete the below when semantic-flex is fully retired. + (semantic-flex-unterminated-syntax-end-function + semantic-lex-unterminated-syntax-end-function) + (,ret (catch ',symbol + (save-excursion + ,@forms + nil)))) + ;; Great Sadness. Assume that FORMS execute within the + ;; confines of the current buffer only! Mark this thing + ;; unparseable iff the special symbol was thrown. This + ;; will prevent future calls from parsing, but will allow + ;; then to still return the cache. + (when ,ret + ;; Leave this message off. If an APP using this fcn wants + ;; a message, they can do it themselves. This cleans up + ;; problems with the idle scheduler obscuring useful data. + ;;(message "Buffer not currently parsable (%S)." ,ret) + (semantic-parse-tree-unparseable)) + ,ret))) +(put 'semantic-lex-catch-errors 'lisp-indent-function 1) + + +;;; Interfacing with edebug +;; +(add-hook + 'edebug-setup-hook + #'(lambda () + + (def-edebug-spec define-lex + (&define name stringp (&rest symbolp)) + ) + (def-edebug-spec define-lex-analyzer + (&define name stringp form def-body) + ) + (def-edebug-spec define-lex-regex-analyzer + (&define name stringp form def-body) + ) + (def-edebug-spec define-lex-simple-regex-analyzer + (&define name stringp form symbolp [ &optional form ] def-body) + ) + (def-edebug-spec define-lex-block-analyzer + (&define name stringp form (&rest form)) + ) + (def-edebug-spec semantic-lex-catch-errors + (symbolp def-body) + ) + + )) + +;;; Compatibility with Semantic 1.x lexical analysis +;; +;; NOTE: DELETE THIS SOMEDAY SOON + +(semantic-alias-obsolete 'semantic-flex-start 'semantic-lex-token-start) +(semantic-alias-obsolete 'semantic-flex-end 'semantic-lex-token-end) +(semantic-alias-obsolete 'semantic-flex-text 'semantic-lex-token-text) +(semantic-alias-obsolete 'semantic-flex-make-keyword-table 'semantic-lex-make-keyword-table) +(semantic-alias-obsolete 'semantic-flex-keyword-p 'semantic-lex-keyword-p) +(semantic-alias-obsolete 'semantic-flex-keyword-put 'semantic-lex-keyword-put) +(semantic-alias-obsolete 'semantic-flex-keyword-get 'semantic-lex-keyword-get) +(semantic-alias-obsolete 'semantic-flex-map-keywords 'semantic-lex-map-keywords) +(semantic-alias-obsolete 'semantic-flex-keywords 'semantic-lex-keywords) +(semantic-alias-obsolete 'semantic-flex-buffer 'semantic-lex-buffer) +(semantic-alias-obsolete 'semantic-flex-list 'semantic-lex-list) + +;; This simple scanner uses the syntax table to generate a stream of +;; simple tokens of the form: +;; +;; (SYMBOL START . END) +;; +;; Where symbol is the type of thing it is. START and END mark that +;; objects boundary. + +(defvar semantic-flex-tokens semantic-lex-tokens + "An alist of of semantic token types. +See variable `semantic-lex-tokens'.") + +(defvar semantic-flex-unterminated-syntax-end-function + (lambda (syntax syntax-start flex-end) flex-end) + "Function called when unterminated syntax is encountered. +This should be set to one function. That function should take three +parameters. The SYNTAX, or type of syntax which is unterminated. +SYNTAX-START where the broken syntax begins. +FLEX-END is where the lexical analysis was asked to end. +This function can be used for languages that can intelligently fix up +broken syntax, or the exit lexical analysis via `throw' or `signal' +when finding unterminated syntax.") + +(defvar semantic-flex-extensions nil + "Buffer local extensions to the lexical analyzer. +This should contain an alist with a key of a regex and a data element of +a function. The function should both move point, and return a lexical +token of the form: + ( TYPE START . END) +nil is also a valid return value. +TYPE can be any type of symbol, as long as it doesn't occur as a +nonterminal in the language definition.") +(make-variable-buffer-local 'semantic-flex-extensions) + +(defvar semantic-flex-syntax-modifications nil + "Changes to the syntax table for this buffer. +These changes are active only while the buffer is being flexed. +This is a list where each element has the form: + (CHAR CLASS) +CHAR is the char passed to `modify-syntax-entry', +and CLASS is the string also passed to `modify-syntax-entry' to define +what syntax class CHAR has.") +(make-variable-buffer-local 'semantic-flex-syntax-modifications) + +(defvar semantic-ignore-comments t + "Default comment handling. +t means to strip comments when flexing. Nil means to keep comments +as part of the token stream.") +(make-variable-buffer-local 'semantic-ignore-comments) + +(defvar semantic-flex-enable-newlines nil + "When flexing, report 'newlines as syntactic elements. +Useful for languages where the newline is a special case terminator. +Only set this on a per mode basis, not globally.") +(make-variable-buffer-local 'semantic-flex-enable-newlines) + +(defvar semantic-flex-enable-whitespace nil + "When flexing, report 'whitespace as syntactic elements. +Useful for languages where the syntax is whitespace dependent. +Only set this on a per mode basis, not globally.") +(make-variable-buffer-local 'semantic-flex-enable-whitespace) + +(defvar semantic-flex-enable-bol nil + "When flexing, report beginning of lines as syntactic elements. +Useful for languages like python which are indentation sensitive. +Only set this on a per mode basis, not globally.") +(make-variable-buffer-local 'semantic-flex-enable-bol) + +(defvar semantic-number-expression semantic-lex-number-expression + "See variable `semantic-lex-number-expression'.") +(make-variable-buffer-local 'semantic-number-expression) + +(defvar semantic-flex-depth 0 + "Default flexing depth. +This specifies how many lists to create tokens in.") +(make-variable-buffer-local 'semantic-flex-depth) + +(defun semantic-flex (start end &optional depth length) + "Using the syntax table, do something roughly equivalent to flex. +Semantically check between START and END. Optional argument DEPTH +indicates at what level to scan over entire lists. +The return value is a token stream. Each element is a list, such of +the form (symbol start-expression . end-expression) where SYMBOL +denotes the token type. +See `semantic-flex-tokens' variable for details on token types. +END does not mark the end of the text scanned, only the end of the +beginning of text scanned. Thus, if a string extends past END, the +end of the return token will be larger than END. To truly restrict +scanning, use `narrow-to-region'. +The last argument, LENGTH specifies that `semantic-flex' should only +return LENGTH tokens." + (message "`semantic-flex' is an obsolete function. Use `define-lex' to create lexers.") + (if (not semantic-flex-keywords-obarray) + (setq semantic-flex-keywords-obarray [ nil ])) + (let ((ts nil) + (pos (point)) + (ep nil) + (curdepth 0) + (cs (if comment-start-skip + (concat "\\(\\s<\\|" comment-start-skip "\\)") + (concat "\\(\\s<\\)"))) + (newsyntax (copy-syntax-table (syntax-table))) + (mods semantic-flex-syntax-modifications) + ;; Use the default depth if it is not specified. + (depth (or depth semantic-flex-depth))) + ;; Update the syntax table + (while mods + (modify-syntax-entry (car (car mods)) (car (cdr (car mods))) newsyntax) + (setq mods (cdr mods))) + (with-syntax-table newsyntax + (goto-char start) + (while (and (< (point) end) (or (not length) (<= (length ts) length))) + (cond + ;; catch beginning of lines when needed. + ;; Must be done before catching any other tokens! + ((and semantic-flex-enable-bol + (bolp) + ;; Just insert a (bol N . N) token in the token stream, + ;; without moving the point. N is the point at the + ;; beginning of line. + (setq ts (cons (cons 'bol (cons (point) (point))) ts)) + nil)) ;; CONTINUE + ;; special extensions, includes whitespace, nl, etc. + ((and semantic-flex-extensions + (let ((fe semantic-flex-extensions) + (r nil)) + (while fe + (if (looking-at (car (car fe))) + (setq ts (cons (funcall (cdr (car fe))) ts) + r t + fe nil + ep (point))) + (setq fe (cdr fe))) + (if (and r (not (car ts))) (setq ts (cdr ts))) + r))) + ;; catch newlines when needed + ((looking-at "\\s-*\\(\n\\|\\s>\\)") + (if semantic-flex-enable-newlines + (setq ep (match-end 1) + ts (cons (cons 'newline + (cons (match-beginning 1) ep)) + ts)))) + ;; catch whitespace when needed + ((looking-at "\\s-+") + (if semantic-flex-enable-whitespace + ;; Language wants whitespaces, link them together. + (if (eq (car (car ts)) 'whitespace) + (setcdr (cdr (car ts)) (match-end 0)) + (setq ts (cons (cons 'whitespace + (cons (match-beginning 0) + (match-end 0))) + ts))))) + ;; numbers + ((and semantic-number-expression + (looking-at semantic-number-expression)) + (setq ts (cons (cons 'number + (cons (match-beginning 0) + (match-end 0))) + ts))) + ;; symbols + ((looking-at "\\(\\sw\\|\\s_\\)+") + (setq ts (cons (cons + ;; Get info on if this is a keyword or not + (or (semantic-flex-keyword-p (match-string 0)) + 'symbol) + (cons (match-beginning 0) (match-end 0))) + ts))) + ;; Character quoting characters (ie, \n as newline) + ((looking-at "\\s\\+") + (setq ts (cons (cons 'charquote + (cons (match-beginning 0) (match-end 0))) + ts))) + ;; Open parens, or semantic-lists. + ((looking-at "\\s(") + (if (or (not depth) (< curdepth depth)) + (progn + (setq curdepth (1+ curdepth)) + (setq ts (cons (cons 'open-paren + (cons (match-beginning 0) (match-end 0))) + ts))) + (setq ts (cons + (cons 'semantic-list + (cons (match-beginning 0) + (save-excursion + (condition-case nil + (forward-list 1) + ;; This case makes flex robust + ;; to broken lists. + (error + (goto-char + (funcall + semantic-flex-unterminated-syntax-end-function + 'semantic-list + start end)))) + (setq ep (point))))) + ts)))) + ;; Close parens + ((looking-at "\\s)") + (setq ts (cons (cons 'close-paren + (cons (match-beginning 0) (match-end 0))) + ts)) + (setq curdepth (1- curdepth))) + ;; String initiators + ((looking-at "\\s\"") + ;; Zing to the end of this string. + (setq ts (cons (cons 'string + (cons (match-beginning 0) + (save-excursion + (condition-case nil + (forward-sexp 1) + ;; This case makes flex + ;; robust to broken strings. + (error + (goto-char + (funcall + semantic-flex-unterminated-syntax-end-function + 'string + start end)))) + (setq ep (point))))) + ts))) + ;; comments + ((looking-at cs) + (if (and semantic-ignore-comments + (not semantic-flex-enable-whitespace)) + ;; If the language doesn't deal with comments nor + ;; whitespaces, ignore them here. + (let ((comment-start-point (point))) + (forward-comment 1) + (if (eq (point) comment-start-point) + ;; In this case our start-skip string failed + ;; to work properly. Lets try and move over + ;; whatever white space we matched to begin + ;; with. + (skip-syntax-forward "-.'" + (save-excursion + (end-of-line) + (point))) + ;;(forward-comment 1) + ;; Generate newline token if enabled + (if (and semantic-flex-enable-newlines + (bolp)) + (backward-char 1))) + (if (eq (point) comment-start-point) + (error "Strange comment syntax prevents lexical analysis")) + (setq ep (point))) + (let ((tk (if semantic-ignore-comments 'whitespace 'comment))) + (save-excursion + (forward-comment 1) + ;; Generate newline token if enabled + (if (and semantic-flex-enable-newlines + (bolp)) + (backward-char 1)) + (setq ep (point))) + ;; Language wants comments or want them as whitespaces, + ;; link them together. + (if (eq (car (car ts)) tk) + (setcdr (cdr (car ts)) ep) + (setq ts (cons (cons tk (cons (match-beginning 0) ep)) + ts)))))) + ;; punctuation + ((looking-at "\\(\\s.\\|\\s$\\|\\s'\\)") + (setq ts (cons (cons 'punctuation + (cons (match-beginning 0) (match-end 0))) + ts))) + ;; unknown token + (t + (error "What is that?"))) + (goto-char (or ep (match-end 0))) + (setq ep nil))) + ;; maybe catch the last beginning of line when needed + (and semantic-flex-enable-bol + (= (point) end) + (bolp) + (setq ts (cons (cons 'bol (cons (point) (point))) ts))) + (goto-char pos) + ;;(message "Flexing muscles...done") + (nreverse ts))) + +(provide 'semantic/lex) + +;;; semantic-lex.el ends here