changeset 98557:73eaaf9adee6

(rx-constituents): Change `anything' to call rx-anything. Change `not-wordchar' assignment to "\\W" from "[^[:word:]]". (rx-group-if): New function. (rx-parent): New variable. (rx-and, rx-or): Put shy groups only when necessary. (rx-bracket): Remove. (rx-anything): New function. (rx-any-delete-from-range, rx-any-condense-range) (rx-check-any-string): New functions. (rx-check-any): Return result as a list. Don't convert chars to strings. Don't prepend "\\" to "^". Don't search for close bracket. Check char category string. Call rx-form instead of rx-to-string. (rx-any): Rebuid to complete the function. (rx-check-not): Fix char category regexp pattern string. Call rx-form instead of rx-to-string. (rx-not): Call rx-form instead of rx-to-string. Convert "[^]" to "[^^]". Call regexp-quote for one char string when not called from rx-not. Add "\\w", and toggle to upcase. Add the case of "\\[SCBW]" to toggle. (rx-=, rx->=, rx -**, rx-repeat, rx-submatch): Call rx-form instead of rx-to-string. (rx-kleene): Call rx-form instead of rx-to-string. Call rx-group-if to adjust putting of shy groups. (rx-atomic-p): Make check more precisely. (rx-eval, rx-greedy): Call rx-form instead of rx-to-string. (rx-regexp): Call rx-group-if. (rx-form): New function. (rx-to-string): Call rx-form, rx-group-if. Refine definition of NO-GROUP.
author Chong Yidong <cyd@stupidchicken.com>
date Tue, 07 Oct 2008 18:08:26 +0000
parents de7d89f7c0f7
children f0a18da65f51
files lisp/emacs-lisp/rx.el
diffstat 1 files changed, 274 insertions(+), 105 deletions(-) [+]
line wrap: on
line diff
--- a/lisp/emacs-lisp/rx.el	Tue Oct 07 18:08:16 2008 +0000
+++ b/lisp/emacs-lisp/rx.el	Tue Oct 07 18:08:26 2008 +0000
@@ -118,7 +118,7 @@
     (|			. or)		; SRE
     (not-newline	. ".")
     (nonl		. not-newline)	; SRE
-    (anything		. "\\(?:.\\|\n\\)")
+    (anything		. (rx-anything 0 nil))
     (any		. (rx-any 1 nil rx-check-any)) ; inconsistent with SRE
     (in			. any)
     (char		. any)		; sregex
@@ -206,8 +206,7 @@
     (upper-case		. upper)	 ; SRE
     (word		. "[[:word:]]")	 ; inconsistent with SRE
     (wordchar		. word)		 ; sregex
-    (not-wordchar	. "[^[:word:]]") ; sregex (use \\W?)
-    )
+    (not-wordchar	. "\\W"))
   "Alist of sexp form regexp constituents.
 Each element of the alist has the form (SYMBOL . DEFN).
 SYMBOL is a valid constituent of sexp regular expressions.
@@ -332,82 +331,237 @@
 		 (car form) type-pred))))))
 
 
+(defun rx-group-if (regexp group)
+  "Put shy groups around REGEXP if seemingly necessary when GROUP
+is non-nil."
+  (cond
+   ;; for some repetition
+   ((eq group '*) (if (rx-atomic-p regexp) (setq group nil)))
+   ;; for concatenation
+   ((eq group ':)
+    (if (rx-atomic-p
+	 (if (string-match
+	      "\\(?:[?*+]\\??\\|\\\\{[0-9]*,?[0-9]*\\\\}\\)\\'" regexp)
+	     (substring regexp 0 (match-beginning 0))
+	   regexp))
+	(setq group nil)))
+   ;; for OR
+   ((eq group '|) (setq group nil))
+   ;; do anyway
+   ((eq group t))
+   ((rx-atomic-p regexp t) (setq group nil)))
+  (if group
+      (concat "\\(?:" regexp "\\)")
+    regexp))
+
+
+(defvar rx-parent)
+;; dynamically bound in some functions.
+
+
 (defun rx-and (form)
   "Parse and produce code from FORM.
 FORM is of the form `(and FORM1 ...)'."
   (rx-check form)
-  (concat "\\(?:"
-	  (mapconcat
-	   (function (lambda (x) (rx-to-string x 'no-group)))
-	   (cdr form) nil)
-	  "\\)"))
+  (rx-group-if
+   (mapconcat (lambda (x) (rx-form x ':)) (cdr form) nil)
+   (and (memq rx-parent '(* t)) rx-parent)))
 
 
 (defun rx-or (form)
   "Parse and produce code from FORM, which is `(or FORM1 ...)'."
   (rx-check form)
-  (let ((all-args-strings t))
-    (dolist (arg (cdr form))
-      (unless (stringp arg)
-	(setq all-args-strings nil)))
-    (concat "\\(?:"
-	    (if all-args-strings
-		(regexp-opt (cdr form))
-	      (mapconcat #'rx-to-string (cdr form) "\\|"))
-	    "\\)")))
+  (rx-group-if
+   (if (memq nil (mapcar 'stringp (cdr form)))
+       (mapconcat (lambda (x) (rx-form x '|)) (cdr form) "\\|")
+     (regexp-opt (cdr form)))
+   (and (memq rx-parent '(: * t)) rx-parent)))
+
+
+(defun rx-anything (form)
+  "Match any character."
+  (if (consp form)
+      (error "rx `anythng' syntax error: %s" form))
+  (rx-or (list 'or 'not-newline ?\n)))
 
 
-(defvar rx-bracket)		       ; dynamically bound in `rx-any'
+(defun rx-any-delete-from-range (char ranges)
+  "Delete by side effect character CHAR from RANGES.
+Only both edges of each range is checked."
+  (let (m)
+    (cond
+     ((memq char ranges) (setq ranges (delq char ranges)))
+     ((setq m (assq char ranges))
+      (if (eq (1+ char) (cdr m))
+	  (setcar (memq m ranges) (1+ char))
+	(setcar m (1+ char))))
+     ((setq m (rassq char ranges))
+      (if (eq (1- char) (car m))
+	  (setcar (memq m ranges) (1- char))
+	(setcdr m (1- char)))))
+    ranges))
+
+    
+(defun rx-any-condense-range (args)
+  "Condense by side effect ARGS as range for Rx `any'."
+  (let (str
+	l)
+    ;; set STR list of all strings
+    ;; set L list of all ranges
+    (mapc (lambda (e) (cond ((stringp e) (push e str))
+			    ((numberp e) (push (cons e e) l))
+			    (t (push e l))))
+	  args)
+    ;; condense overlapped ranges in L
+    (let ((tail (setq l (sort l #'car-less-than-car)))
+	  d)
+      (while (setq d (cdr tail))
+	(if (>= (cdar tail) (1- (caar d)))
+	    (progn
+	      (setcdr (car tail) (max (cdar tail) (cdar d)))
+	      (setcdr tail (cdr d)))
+	  (setq tail d))))
+    ;; Separate small ranges to single number, and delete dups.
+    (nconc
+     (apply #'nconc
+	    (mapcar (lambda (e)
+		      (cond
+		       ((= (car e) (cdr e)) (list (car e)))
+		       ;; ((= (1+ (car e)) (cdr e)) (list (car e) (cdr e)))
+		       ((list e))))
+		    l))
+     (delete-dups str))))
+
+
+(defun rx-check-any-string (str)
+  "Check string argument STR for Rx `any'."
+  (let ((i 0)
+	c1 c2 l)
+    (if (= 0 (length str))
+	(error "String arg for Rx `any' must not be empty"))
+    (while (string-match ".-." str i)
+      ;; string before range: convert it to characters
+      (if (< i (match-beginning 0))
+	  (setq l (nconc
+		   l
+		   (append (substring str i (match-beginning 0)) nil))))
+      ;; range
+      (setq i (match-end 0)
+	    c1 (aref str (match-beginning 0))
+	    c2 (aref str (1- i)))
+      (cond
+       ((< c1 c2) (setq l (nconc l (list (cons c1 c2)))))
+       ((= c1 c2) (setq l (nconc l (list c1))))))
+    ;; rest?
+    (if (< i (length str))
+	(setq l (nconc l (append (substring str i) nil))))
+    l))
+
 
 (defun rx-check-any (arg)
    "Check arg ARG for Rx `any'."
-   (if (integerp arg)
-       (setq arg (string arg)))
-   (when (stringp arg)
-     (if (zerop (length arg))
-	 (error "String arg for Rx `any' must not be empty"))
-     ;; Quote ^ at start; don't bother to check whether this is first arg.
-     (if (eq ?^ (aref arg 0))
-	 (setq arg (concat "\\" arg)))
-     ;; Remove ] and set flag for adding it to start of overall result.
-     (when (string-match "\\]" arg)
-       (setq arg (replace-regexp-in-string "\\]" "" arg)
-	     rx-bracket "]")))
-   (when (symbolp arg)
+   (cond
+    ((integerp arg) (list arg))
+    ((symbolp arg)
      (let ((translation (condition-case nil
-			    (rx-to-string arg 'no-group)
+			    (rx-form arg)
 			  (error nil))))
-       (unless translation (error "Invalid char class `%s' in Rx `any'" arg))
-       (setq arg (substring translation 1 -1)))) ; strip outer brackets
-   ;; sregex compatibility
-   (when (and (integerp (car-safe arg))
-	      (integerp (cdr-safe arg)))
-     (setq arg (string (car arg) ?- (cdr arg))))
-   (unless (stringp arg)
-     (error "rx `any' requires string, character, char pair or char class args"))
-   arg)
+       (if (or (null translation)
+	       (null (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'" translation)))
+	   (error "Invalid char class `%s' in Rx `any'" arg))
+       (list (substring translation 1 -1)))) ; strip outer brackets
+    ((and (integerp (car-safe arg)) (integerp (cdr-safe arg)))
+     (list arg))
+    ((stringp arg) (rx-check-any-string arg))
+    ((error
+      "rx `any' requires string, character, char pair or char class args"))))
+
 
 (defun rx-any (form)
   "Parse and produce code from FORM, which is `(any ARG ...)'.
 ARG is optional."
   (rx-check form)
-  (let* ((rx-bracket nil)
-	 (args (mapcar #'rx-check-any (cdr form)))) ; side-effects `rx-bracket'
-    ;; If there was a ?- in the form, move it to the front to avoid
-    ;; accidental range.
-    (if (member "-" args)
-	(setq args (cons "-" (delete "-" args))))
-    (apply #'concat "[" rx-bracket (append args '("]")))))
+  (let* ((args (rx-any-condense-range
+		(apply
+		 #'nconc
+		 (mapcar #'rx-check-any (cdr form)))))
+	 m
+	 s)
+    (cond
+     ;; single close bracket
+     ;;	 => "[]...-]" or "[]...--.]"
+     ((memq ?\] args)
+      ;; set ] at the beginning
+      (setq args (cons ?\] (delq ?\] args)))
+      ;; set - at the end
+      (if (or (memq ?- args) (assq ?- args))
+	  (setq args (nconc (rx-any-delete-from-range ?- args)
+			    (list ?-)))))
+     ;; close bracket starts a range
+     ;;  => "[]-....-]" or "[]-.--....]"
+     ((setq m (assq ?\] args))
+      ;; bring it to the beginning
+      (setq args (cons m (delq m args)))
+      (cond ((memq ?- args)
+	     ;; to the end
+	     (setq args (nconc (delq ?- args) (list ?-))))
+	    ((setq m (assq ?- args))
+	     ;; next to the bracket's range, make the second range
+	     (setcdr args (cons m (delq m args))))))
+     ;; bracket in the end range
+     ;;	 => "[]...-]"
+     ((setq m (rassq ?\] args))
+      ;; set ] at the beginning
+      (setq args (cons ?\] (rx-any-delete-from-range ?\] args)))
+      ;; set - at the end
+      (if (or (memq ?- args) (assq ?- args))
+	  (setq args (nconc (rx-any-delete-from-range ?- args)
+			    (list ?-)))))
+     ;; {no close bracket appears}
+     ;;
+     ;; bring single bar to the beginning
+     ((memq ?- args)
+      (setq args (cons ?- (delq ?- args))))
+     ;; bar start a range, bring it to the beginning
+     ((setq m (assq ?- args))
+      (setq args (cons m (delq m args))))
+     ;;
+     ;; hat at the beginning?
+     ((or (eq (car args) ?^) (eq (car-safe (car args)) ?^))
+      (setq args (if (cdr args)
+		     `(,(cadr args) ,(car args) ,@(cddr args))
+		   (nconc (rx-any-delete-from-range ?^ args)
+			  (list ?^))))))
+    ;; some 1-char?
+    (if (and (null (cdr args)) (numberp (car args))
+	     (or (= 1 (length
+		       (setq s (regexp-quote (string (car args))))))
+		 (and (equal (car args) ?^) ;; unnecessary predicate?
+		      (null (eq rx-parent '!)))))
+	s
+      (concat "["
+	      (mapconcat
+	       (lambda (e) (cond
+			    ((numberp e) (string e))
+			    ((consp e)
+			     (if (and (= (1+ (car e)) (cdr e))
+				      (null (memq (car e) '(?\] ?-))))
+				 (string (car e) (cdr e))
+			       (string (car e) ?- (cdr e))))
+			    (e)))
+	       args
+	       nil)
+	      "]"))))
 
 
 (defun rx-check-not (arg)
   "Check arg ARG for Rx `not'."
   (unless (or (and (symbolp arg)
-		   (string-match "\\`\\[\\[:[-a-z]:\\]\\]\\'"
+		   (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'"
 				 (condition-case nil
-				     (rx-to-string arg 'no-group)
+				     (rx-form arg)
 				   (error ""))))
-	      (eq arg 'word-boundary)
+	      (eq arg 'word-boundary) 
 	      (and (consp arg)
 		   (memq (car arg) '(not any in syntax category))))
     (error "rx `not' syntax error: %s" arg))
@@ -417,16 +571,22 @@
 (defun rx-not (form)
   "Parse and produce code from FORM.  FORM is `(not ...)'."
   (rx-check form)
-  (let ((result (rx-to-string (cadr form) 'no-group))
+  (let ((result (rx-form (cadr form) '!))
 	case-fold-search)
     (cond ((string-match "\\`\\[^" result)
-	   (if (= (length result) 4)
-	       (substring result 2 3)
-	     (concat "[" (substring result 2))))
+	   (cond
+	    ((equal result "[^]") "[^^]")
+	    ((and (= (length result) 4) (null (eq rx-parent '!)))
+	     (regexp-quote (substring result 2 3)))
+	    ((concat "[" (substring result 2)))))
 	  ((eq ?\[ (aref result 0))
 	   (concat "[^" (substring result 1)))
-	  ((string-match "\\`\\\\[scb]" result)
-	   (concat (capitalize (substring result 0 2)) (substring result 2)))
+	  ((string-match "\\`\\\\[scbw]" result)
+	   (concat (upcase (substring result 0 2))
+		   (substring result 2)))
+	  ((string-match "\\`\\\\[SCBW]" result)
+	   (concat (downcase (substring result 0 2))
+		   (substring result 2)))
 	  (t
 	   (concat "[^" result "]")))))
 
@@ -464,7 +624,7 @@
   (unless (and (integerp (nth 1 form))
 	       (> (nth 1 form) 0))
     (error "rx `=' requires positive integer first arg"))
-  (format "%s\\{%d\\}" (rx-to-string (nth 2 form)) (nth 1 form)))
+  (format "%s\\{%d\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
 
 
 (defun rx->= (form)
@@ -474,14 +634,14 @@
   (unless (and (integerp (nth 1 form))
 	       (> (nth 1 form) 0))
     (error "rx `>=' requires positive integer first arg"))
-  (format "%s\\{%d,\\}" (rx-to-string (nth 2 form)) (nth 1 form)))
+  (format "%s\\{%d,\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
 
 
 (defun rx-** (form)
   "Parse and produce code from FORM `(** N M ...)'."
   (rx-check form)
   (setq form (cons 'repeat (cdr (rx-trans-forms form 2))))
-  (rx-to-string form))
+  (rx-form form '*))
 
 
 (defun rx-repeat (form)
@@ -492,7 +652,7 @@
 	 (unless (and (integerp (nth 1 form))
 		      (> (nth 1 form) 0))
 	   (error "rx `repeat' requires positive integer first arg"))
-	 (format "%s\\{%d\\}" (rx-to-string (nth 2 form)) (nth 1 form)))
+	 (format "%s\\{%d\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
 	((or (not (integerp (nth 2 form)))
 	     (< (nth 2 form) 0)
 	     (not (integerp (nth 1 form)))
@@ -500,16 +660,14 @@
 	     (< (nth 2 form) (nth 1 form)))
 	 (error "rx `repeat' range error"))
 	(t
-	 (format "%s\\{%d,%d\\}" (rx-to-string (nth 3 form))
+	 (format "%s\\{%d,%d\\}" (rx-form (nth 3 form) '*)
 		 (nth 1 form) (nth 2 form)))))
 
 
 (defun rx-submatch (form)
   "Parse and produce code from FORM, which is `(submatch ...)'."
-  (concat "\\("
-	  (mapconcat (function (lambda (x) (rx-to-string x 'no-group)))
-		     (cdr form) nil)
-	  "\\)"))
+  (concat "\\(" (mapconcat #'rx-form (cdr form) nil) "\\)"))
+
 
 (defun rx-backref (form)
   "Parse and produce code from FORM, which is `(backref N)'."
@@ -531,19 +689,19 @@
 is non-nil."
   (rx-check form)
   (setq form (rx-trans-forms form))
-  (let ((suffix (cond ((memq (car form) '(* + ? )) "")
+  (let ((suffix (cond ((memq (car form) '(* + ?\s)) "")
 		      ((memq (car form) '(*? +? ??)) "?")
 		      (rx-greedy-flag "")
 		      (t "?")))
 	(op (cond ((memq (car form) '(* *? 0+ zero-or-more)) "*")
 		  ((memq (car form) '(+ +? 1+ one-or-more))  "+")
-		  (t "?")))
-	(result (rx-to-string (cadr form) 'no-group)))
-    (if (not (rx-atomic-p result))
-	(setq result (concat "\\(?:" result "\\)")))
-    (concat result op suffix)))
+		  (t "?"))))
+    (rx-group-if
+     (concat (rx-form (cadr form) '*) op suffix)
+     (and (memq rx-parent '(t *)) rx-parent))))
 
-(defun rx-atomic-p (r)
+
+(defun rx-atomic-p (r &optional lax)
   "Return non-nil if regexp string R is atomic.
 An atomic regexp R is one such that a suffix operator
 appended to R will apply to all of R.  For example, \"a\"
@@ -568,13 +726,14 @@
 negatives would require a theoretic specification of the set
 of all atomic regexps."
   (let ((l (length r)))
-    (or (equal l 1)
-	(and (>= l 6)
-	     (equal (substring r 0 2) "\\(")
-	     (equal (substring r -2) "\\)"))
-	(and (>= l 2)
-	     (equal (substring r 0 1) "[")
-	     (equal (substring r -1) "]")))))
+    (cond
+     ((<= l 1))
+     ((= l 2) (= (aref r 0) ?\\))
+     ((= l 3) (string-match "\\`\\(?:\\\\[cCsS_]\\|\\[[^^]\\]\\)" r))
+     ((null lax)
+      (cond
+       ((string-match "\\`\\[^?\]?\\(?:\\[:[a-z]+:]\\|[^\]]\\)*\\]\\'" r))
+       ((string-match "\\`\\\\(\\(?:[^\\]\\|\\\\[^\)]\\)*\\\\)\\'" r)))))))
 
 
 (defun rx-syntax (form)
@@ -612,7 +771,7 @@
 (defun rx-eval (form)
   "Parse and produce code from FORM, which is `(eval FORM)'."
   (rx-check form)
-  (rx-to-string (eval (cadr form))))
+  (rx-form (eval (cadr form)) rx-parent))
 
 
 (defun rx-greedy (form)
@@ -622,13 +781,41 @@
 '(maximal-match FORM1)', greedy operators will be used."
   (rx-check form)
   (let ((rx-greedy-flag (eq (car form) 'maximal-match)))
-    (rx-to-string (cadr form))))
+    (rx-form (cadr form) rx-parent)))
 
 
 (defun rx-regexp (form)
   "Parse and produce code from FORM, which is `(regexp STRING)'."
   (rx-check form)
-  (concat "\\(?:" (cadr form) "\\)"))
+  (rx-group-if (cadr form) rx-parent))
+
+
+(defun rx-form (form &optional rx-parent)
+  "Parse and produce code for regular expression FORM.
+FORM is a regular expression in sexp form.
+RX-PARENT shows which type of expression calls and controls putting of
+shy groups around the result and some more in other functions."
+  (if (stringp form)
+      (rx-group-if (regexp-quote form)
+		   (if (and (eq rx-parent '*) (< 1 (length form)))
+		       rx-parent))
+    (cond ((integerp form)
+	   (regexp-quote (char-to-string form)))
+	  ((symbolp form)
+	   (let ((info (rx-info form)))
+	     (cond ((stringp info)
+		    info)
+		   ((null info)
+		    (error "Unknown rx form `%s'" form))
+		   (t
+		    (funcall (nth 0 info) form)))))
+	  ((consp form)
+	   (let ((info (rx-info (car form))))
+	     (unless (consp info)
+	       (error "Unknown rx form `%s'" (car form)))
+	     (funcall (nth 0 info) form)))
+	  (t
+	   (error "rx syntax error at `%s'" form)))))
 
 
 ;;;###autoload
@@ -636,28 +823,7 @@
   "Parse and produce code for regular expression FORM.
 FORM is a regular expression in sexp form.
 NO-GROUP non-nil means don't put shy groups around the result."
-  (cond ((stringp form)
-	 (regexp-quote form))
-	((integerp form)
-	 (regexp-quote (char-to-string form)))
-	((symbolp form)
-	 (let ((info (rx-info form)))
-	   (cond ((stringp info)
-		  info)
-		 ((null info)
-		  (error "Unknown rx form `%s'" form))
-		 (t
-		  (funcall (nth 0 info) form)))))
-	((consp form)
-	 (let ((info (rx-info (car form))))
-	   (unless (consp info)
-	     (error "Unknown rx form `%s'" (car form)))
-	   (let ((result (funcall (nth 0 info) form)))
-	     (if (or no-group (string-match "\\`\\\\[(]" result))
-		 result
-	       (concat "\\(?:" result "\\)")))))
-	(t
-	 (error "rx syntax error at `%s'" form))))
+  (rx-group-if (rx-form form) (null no-group)))
 
 
 ;;;###autoload
@@ -878,6 +1044,9 @@
      like `and', but makes the match accessible with `match-end',
      `match-beginning', and `match-string'.
 
+`(group SEXP1 SEXP2 ...)'
+     another name for `submatch'.
+
 `(or SEXP1 SEXP2 ...)'
 `(| SEXP1 SEXP2 ...)'
      matches anything that matches SEXP1 or SEXP2, etc.  If all