changeset 18149:2fec7f622b82

emit charsets after strings so that the final regexp finds the longest match.
author Simon Marshall <simon@gnu.org>
date Fri, 06 Jun 1997 07:10:24 +0000
parents c6e694b6de26
children 960597385f1c
files lisp/emacs-lisp/regexp-opt.el
diffstat 1 files changed, 45 insertions(+), 37 deletions(-) [+]
line wrap: on
line diff
--- a/lisp/emacs-lisp/regexp-opt.el	Fri Jun 06 05:33:13 1997 +0000
+++ b/lisp/emacs-lisp/regexp-opt.el	Fri Jun 06 07:10:24 1997 +0000
@@ -4,7 +4,7 @@
 
 ;; Author: Simon Marshall <simon@gnu.ai.mit.edu>
 ;; Keywords: strings, regexps
-;; Version: 1.04.01
+;; Version: 1.05
 
 ;; This file is part of GNU Emacs.
 
@@ -27,10 +27,14 @@
 
 ;; The "opt" in "regexp-opt" stands for "optim\\(al\\|i\\(se\\|ze\\)\\)".
 ;;
-;; This package generates a regexp from a given list of strings (that matches
-;; one of those strings) that is equivalent to but more efficient than:
+;; This package generates a regexp from a given list of strings (which matches
+;; one of those strings) so that the regexp generated by:
 ;;
-;; (mapconcat 'identity (mapcar 'regexp-quote strings) "\\|")
+;; (regexp-opt strings)
+;;
+;; is equivalent to, but more efficient than, the regexp generated by:
+;;
+;; (mapconcat 'regexp-quote strings "\\|")
 ;;
 ;; For example:
 ;;
@@ -42,10 +46,9 @@
 ;;   (concat "(" (regexp-opt strings t) "\\>"))
 ;;  => "(\\(c\\(atch\\|ond\\(ition-case\\)?\\)\\|if\\|let\\*?\\|prog[12n]\\|save-\\(current-buffer\\|excursion\\|match-data\\|restriction\\|window-excursion\\)\\|throw\\|un\\(less\\|wind-protect\\)\\|wh\\(en\\|ile\\)\\)\\>"
 ;;
-;; Searching using the above example `regexp-opt' regexp is significantly
-;; faster than searching using the equivalent `mapconcat' regexp, taking
-;; approximately two-thirds of the time.
-;;
+;; Searching using the above example `regexp-opt' regexp takes approximately
+;; two-thirds of the time taken using the equivalent `mapconcat' regexp.
+
 ;; Since this package was written to produce efficient regexps, not regexps
 ;; efficiently, it is probably not a good idea to in-line too many calls in
 ;; your code, unless you use the following trick with `eval-when-compile':
@@ -62,7 +65,15 @@
 ;; (defvar definition-regexp
 ;;   "^(\\(def\\(alias\\|const\\|macro\\|subst\\|un\\|var\\)\\)\\>")
 ;;
-;; Originally written for font-lock.el, from an idea from Stig's hl319.el.
+;; Note that if you use this trick for all instances of `regexp-opt' and
+;; `regexp-opt-depth' in your code, regexp-opt.el would only have to be loaded
+;; at compile time.  But note also that using this trick means that should
+;; regexp-opt.el be changed, perhaps to fix a bug or to add a feature to
+;; improve the efficiency of `regexp-opt' regexps, you would have to recompile
+;; your code for such changes to have effect in your code.
+
+;; Originally written for font-lock.el, from an idea from Stig's hl319.el, with
+;; thanks for ideas also to Michael Ernst, Bob Glickstein and Dan Nicolaescu.
 ;; Please don't tell me that it doesn't produce optimal regexps; I know that
 ;; already.  For example, the above explanation for the meaning of "opt" would
 ;; be more efficient as "optim\\(al\\|i[sz]e\\)", but this requires complex
@@ -73,15 +84,16 @@
 ;;;###autoload
 (defun regexp-opt (strings &optional paren)
   "Return a regexp to match a string in STRINGS.
+Each string should be unique in STRINGS and should not contain any regexps.
 If optional PAREN non-nil, ensure that the returned regexp is enclosed by at
 least one regexp grouping construct.
-Each string in STRINGS should be unique and should not contain any regexps.
 The returned regexp is typically more efficient than the equivalent regexp:
 
- (mapconcat 'identity (mapcar 'regexp-quote STRINGS) \"\\\\|\")
+ (let ((open-paren (if PAREN \"\\\\(\" \"\")) (close-paren (if PAREN \"\\\\)\" \"\")))
+   (concat open-paren (mapconcat 'regexp-quote STRINGS \"\\\\|\") close-paren))
 
-but typically contains regexp grouping constructs.  Use `regexp-opt-depth' to
-count them."
+but typically contains more regexp grouping constructs.
+Use `regexp-opt-depth' to count them."
   (save-match-data
     ;; Recurse on the sorted list.
     (let ((max-lisp-eval-depth (* 1024 1024))
@@ -165,12 +177,12 @@
 		    close-group)))
 	 ;;
 	 ;; If there are several one-character strings, remove them and recurse
-	 ;; on the rest.
+	 ;; on the rest (first so the final regexp finds the longest match).
 	 ((> (length letters) 1)
 	  (let ((rest (let ((completion-regexp-list '("^..+$")))
 			(all-completions "" (mapcar 'list strings)))))
 	    (concat open-group
-		    (regexp-opt-charset letters) "\\|" (regexp-opt-group rest)
+		    (regexp-opt-group rest) "\\|" (regexp-opt-charset letters)
 		    close-group)))
 	 ;;
 	 ;; Otherwise, divide the list into those that start with a particular
@@ -196,30 +208,26 @@
 	 (bracket "") (dash "") (caret ""))
     ;;
     ;; Make a character map but extract character set meta characters.
-    (let (char)
-      (while chars
-	(setq char (string-to-char (pop chars)))
-	(cond ((eq char ?\])
-	       (setq bracket "]"))
-	      ((eq char ?^)
-	       (setq caret "^"))
-	      ((eq char ?-)
-	       (setq dash "-"))
-	      (t
-	       (aset charmap char t)))))
+    (dolist (char (mapcar 'string-to-char chars))
+      (case char
+	(?\]
+	 (setq bracket "]"))
+	(?^
+	 (setq caret "^"))
+	(?-
+	 (setq dash "-"))
+	(otherwise
+	 (aset charmap char t))))
     ;;
     ;; Make a character set from the map using ranges where applicable.
-    (let ((elt 0) start)
-      (while (< elt charwidth)
-	(when (aref charmap elt)
-	  (setq start (1+ elt))
-	  (while (and (< start charwidth) (aref charmap start))
-	    (incf start))
-	  (if (< (- start elt) 4)
-	      (setq charset (format "%s%c" charset elt))
-	    (setq charset (format "%s%c-%c" charset elt (1- start))
-		  elt start)))
-	(incf elt)))
+    (dotimes (elt charwidth)
+      (when (aref charmap elt)
+	(let ((start elt))
+	  (while (and (< elt charwidth) (aref charmap elt))
+	    (incf elt))
+	  (if (> (- elt start) 3)
+	      (setq charset (format "%s%c-%c" charset start (1- elt)))
+	    (setq charset (format "%s%c" charset (setq elt start)))))))
     ;;
     ;; Make sure a caret is not first and a dash is first or last.
     (if (and (string-equal charset "") (string-equal bracket ""))