# HG changeset patch # User Gerd Moellmann # Date 939423909 0 # Node ID 6f591e2d9c0de2b01546d4a1c17e373d4bd7010e # Parent f52741f10a0a4da3386526c392cc8a7979b5cce1 (regexp-opt-try-suffix): New function. (regexp-opt-group): Use it to get common suffixes in STRINGS. If STRINGS is nil, return "" rather than nil. diff -r f52741f10a0a -r 6f591e2d9c0d lisp/emacs-lisp/regexp-opt.el --- a/lisp/emacs-lisp/regexp-opt.el Fri Oct 08 21:54:44 1999 +0000 +++ b/lisp/emacs-lisp/regexp-opt.el Fri Oct 08 23:05:09 1999 +0000 @@ -1,9 +1,10 @@ ;;; regexp-opt.el --- generate efficient regexps to match strings. -;; Copyright (C) 1994, 1995, 1996, 1997 Free Software Foundation, Inc. +;; Copyright (C) 1994, 95, 96, 97, 98, 1999 Free Software Foundation, Inc. ;; Author: Simon Marshall ;; Keywords: strings, regexps +;; Version: 1.07 ;; This file is part of GNU Emacs. @@ -24,7 +25,7 @@ ;;; Commentary: -;; The "opt" in "regexp-opt" stands for "optim\\(al\\|i\\(se\\|ze\\)\\)". +;; The "opt" in "regexp-opt" stands for "optim\\(al\\|i[sz]e\\)". ;; ;; This package generates a regexp from a given list of strings (which matches ;; one of those strings) so that the regexp generated by: @@ -47,6 +48,17 @@ ;; ;; Searching using the above example `regexp-opt' regexp takes approximately ;; two-thirds of the time taken using the equivalent `mapconcat' regexp. +;; +;; Note that this package will also find common suffix strings if this does not +;; increase the number of grouping constructs. For example: +;; +;; (regexp-opt '("these" "those")) +;; => "th[eo]se" +;; +;; but: +;; +;; (regexp-opt '("barfly" "housefly")) +;; => "barfly\\|housefly" rather than "\\(bar\\|house\\)fly" ;; Since this package was written to produce efficient regexps, not regexps ;; efficiently, it is probably not a good idea to in-line too many calls in @@ -72,13 +84,12 @@ ;; your code for such changes to have effect in your code. ;; Originally written for font-lock.el, from an idea from Stig's hl319.el, with -;; thanks for ideas also to Michael Ernst, Bob Glickstein and Dan Nicolaescu. -;; Please don't tell me that it doesn't produce optimal regexps; I know that -;; already. For example, the above explanation for the meaning of "opt" would -;; be more efficient as "optim\\(al\\|i[sz]e\\)", but this requires complex -;; forward looking. But (ideas or) code to improve things (are) is welcome. +;; thanks for ideas also to Michael Ernst, Bob Glickstein, Dan Nicolaescu and +;; Stefan Monnier. +;; No doubt `regexp-opt' doesn't always produce optimal regexps, so code, ideas +;; or any other information to improve things are welcome. -;;; Code: +;;; Code. ;;;###autoload (defun regexp-opt (strings &optional paren) @@ -128,9 +139,9 @@ ;; If LAX non-nil, don't output parentheses if it doesn't require them. ;; Merges keywords to avoid backtracking in Emacs' regexp matcher. ;; - ;; The basic idea is to find the shortest common prefix, remove it and - ;; recurse. If there is no prefix, we divide the list into two so that (at - ;; least) one half will have at least a one-character common prefix. + ;; The basic idea is to find the shortest common prefix or suffix, remove it + ;; and recurse. If there is no prefix, we divide the list into two so that + ;; (at least) one half will have at least a one-character common prefix. ;; ;; Also we delay the addition of grouping parenthesis as long as possible ;; until we're sure we need them, and try to remove one-character sequences @@ -139,11 +150,15 @@ (let* ((open-group (if paren "\\(" "")) (close-group (if paren "\\)" "")) (open-charset (if lax "" open-group)) - (close-charset (if lax "" close-group))) + (close-charset (if lax "" close-group)) + (open-presuf open-charset) + (close-presuf close-charset)) (cond - ;; Protect against user-stupidity... could call error here - ((null strings) - nil) + ;; + ;; If there are no strings, just return the empty string. + ((= (length strings) 0) + "") + ;; ;; If there is only one string, just return it. ((= (length strings) 1) (if (= (length (car strings)) 1) @@ -157,7 +172,7 @@ close-charset)) ;; ;; If all are one-character strings, just return a character set. - ((= (length strings) (apply '+ (mapcar 'length strings))) + ((= (apply 'max (mapcar 'length strings)) 1) (concat open-charset (regexp-opt-charset strings) close-charset)) @@ -165,17 +180,30 @@ ;; We have a list of different length strings. (t (let ((prefix (try-completion "" (mapcar 'list strings))) + (suffix (regexp-opt-try-suffix strings)) (letters (let ((completion-regexp-list '("^.$"))) (all-completions "" (mapcar 'list strings))))) (cond ;; ;; If there is a common prefix, remove it and recurse on the suffixes. ((> (length prefix) 0) - (let* ((length (length prefix)) - (suffixes (mapcar (lambda (s) (substring s length)) strings))) - (concat open-group + (let* ((end (length prefix)) + (suffixes (mapcar (lambda (s) (substring s end)) strings))) + (concat open-presuf (regexp-quote prefix) (regexp-opt-group suffixes t t) - close-group))) + close-presuf))) + ;; + ;; If there is a common suffix, remove it and recurse on the prefixes. + ((> (length suffix) (if lax + 0 + (- (apply 'max (mapcar 'length strings)) 2))) + (let* ((end (- (length suffix))) + (prefixes (sort (mapcar (lambda (s) (substring s 0 end)) + strings) + 'string-lessp))) + (concat open-presuf + (regexp-opt-group prefixes t t) (regexp-quote suffix) + close-presuf))) ;; ;; If there are several one-character strings, remove them and recurse ;; on the rest (first so the final regexp finds the longest match). @@ -235,6 +263,16 @@ (concat "[" dash caret "]") (concat "[" bracket charset caret dash "]")))) +(defun regexp-opt-try-suffix (strings) + ;; + ;; Return common suffix of each string in STRINGS. See `try-completion'. + ;; + (let* ((chars (mapcar (lambda (s) (mapcar 'identity s)) strings)) + (srahc (mapcar 'reverse chars)) + (sgnirts (mapcar (lambda (c) (mapconcat 'char-to-string c "")) srahc)) + (xiffus (try-completion "" (mapcar 'list sgnirts)))) + (mapconcat 'char-to-string (reverse (mapcar 'identity xiffus)) ""))) + (provide 'regexp-opt) ;;; regexp-opt.el ends here