Mercurial > emacs
annotate lisp/emacs-lisp/rx.el @ 84142:cabf50644487
Move to ../doc/emacs/, misc/
author | Glenn Morris <rgm@gnu.org> |
---|---|
date | Thu, 06 Sep 2007 04:35:22 +0000 |
parents | bdeef0472e21 |
children | 107ccd98fa12 bdb3fe0ba9fa |
rev | line source |
---|---|
39516 | 1 ;;; rx.el --- sexp notation for regular expressions |
2 | |
68648
067115a6e738
Update years in copyright notice; nfc.
Thien-Thi Nguyen <ttn@gnuvola.org>
parents:
64751
diff
changeset
|
3 ;; Copyright (C) 2001, 2002, 2003, 2004, 2005, |
75346 | 4 ;; 2006, 2007 Free Software Foundation, Inc. |
39516 | 5 |
6 ;; Author: Gerd Moellmann <gerd@gnu.org> | |
7 ;; Maintainer: FSF | |
8 ;; Keywords: strings, regexps, extensions | |
9 | |
10 ;; This file is part of GNU Emacs. | |
11 | |
12 ;; GNU Emacs is free software; you can redistribute it and/or modify | |
13 ;; it under the terms of the GNU General Public License as published by | |
78217
935157c0b596
Switch license to GPLv3 or later.
Glenn Morris <rgm@gnu.org>
parents:
77829
diff
changeset
|
14 ;; the Free Software Foundation; either version 3, or (at your option) |
39516 | 15 ;; any later version. |
16 | |
17 ;; GNU Emacs is distributed in the hope that it will be useful, | |
18 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
20 ;; GNU General Public License for more details. | |
21 | |
22 ;; You should have received a copy of the GNU General Public License | |
23 ;; along with GNU Emacs; see the file COPYING. If not, write to the | |
64085 | 24 ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, |
25 ;; Boston, MA 02110-1301, USA. | |
39516 | 26 |
27 ;;; Commentary: | |
28 | |
29 ;; This is another implementation of sexp-form regular expressions. | |
30 ;; It was unfortunately written without being aware of the Sregex | |
31 ;; package coming with Emacs, but as things stand, Rx completely | |
32 ;; covers all regexp features, which Sregex doesn't, doesn't suffer | |
33 ;; from the bugs mentioned in the commentary section of Sregex, and | |
34 ;; uses a nicer syntax (IMHO, of course :-). | |
35 | |
55102 | 36 ;; This significantly extended version of the original is almost |
37 ;; compatible with Sregex. The only incompatibility I (fx) know of is | |
38 ;; that the `repeat' form can't have multiple regexp args. | |
39 | |
40 ;; Now alternative forms are provided for a degree of compatibility | |
41 ;; with Shivers' attempted definitive SRE notation | |
42 ;; <URL:http://www.ai.mit.edu/~/shivers/sre.txt>. SRE forms not | |
43 ;; catered for include: dsm, uncase, w/case, w/nocase, ,@<exp>, | |
44 ;; ,<exp>, (word ...), word+, posix-string, and character class forms. | |
45 ;; Some forms are inconsistent with SRE, either for historical reasons | |
46 ;; or because of the implementation -- simple translation into Emacs | |
47 ;; regexp strings. These include: any, word. Also, case-sensitivity | |
48 ;; and greediness are controlled by variables external to the regexp, | |
49 ;; and you need to feed the forms to the `posix-' functions to get | |
50 ;; SRE's POSIX semantics. There are probably more difficulties. | |
51 | |
39516 | 52 ;; Rx translates a sexp notation for regular expressions into the |
53 ;; usual string notation. The translation can be done at compile-time | |
54 ;; by using the `rx' macro. It can be done at run-time by calling | |
55 ;; function `rx-to-string'. See the documentation of `rx' for a | |
56 ;; complete description of the sexp notation. | |
57 ;; | |
58 ;; Some examples of string regexps and their sexp counterparts: | |
59 ;; | |
60 ;; "^[a-z]*" | |
61 ;; (rx (and line-start (0+ (in "a-z")))) | |
62 ;; | |
63 ;; "\n[^ \t]" | |
64 ;; (rx (and "\n" (not blank))), or | |
65 ;; (rx (and "\n" (not (any " \t")))) | |
66 ;; | |
67 ;; "\\*\\*\\* EOOH \\*\\*\\*\n" | |
68 ;; (rx "*** EOOH ***\n") | |
69 ;; | |
70 ;; "\\<\\(catch\\|finally\\)\\>[^_]" | |
71 ;; (rx (and word-start (submatch (or "catch" "finally")) word-end | |
72 ;; (not (any ?_)))) | |
73 ;; | |
74 ;; "[ \t\n]*:\\([^:]+\\|$\\)" | |
75 ;; (rx (and (zero-or-more (in " \t\n")) ":" | |
76 ;; (submatch (or line-end (one-or-more (not (any ?:))))))) | |
77 ;; | |
78 ;; "^content-transfer-encoding:\\(\n?[\t ]\\)*quoted-printable\\(\n?[\t ]\\)*" | |
79 ;; (rx (and line-start | |
80 ;; "content-transfer-encoding:" | |
48938
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
81 ;; (* (? ?\n) blank) |
39516 | 82 ;; "quoted-printable" |
48938
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
83 ;; (* (? ?\n) blank))) |
39516 | 84 ;; |
85 ;; (concat "^\\(?:" something-else "\\)") | |
86 ;; (rx (and line-start (eval something-else))), statically or | |
87 ;; (rx-to-string `(and line-start ,something-else)), dynamically. | |
88 ;; | |
89 ;; (regexp-opt '(STRING1 STRING2 ...)) | |
90 ;; (rx (or STRING1 STRING2 ...)), or in other words, `or' automatically | |
91 ;; calls `regexp-opt' as needed. | |
92 ;; | |
93 ;; "^;;\\s-*\n\\|^\n" | |
94 ;; (rx (or (and line-start ";;" (0+ space) ?\n) | |
95 ;; (and line-start ?\n))) | |
96 ;; | |
97 ;; "\\$[I]d: [^ ]+ \\([^ ]+\\) " | |
49598
0d8b17d428b5
Trailing whitepace deleted.
Juanma Barranquero <lekktu@gmail.com>
parents:
48938
diff
changeset
|
98 ;; (rx (and "$Id: " |
0d8b17d428b5
Trailing whitepace deleted.
Juanma Barranquero <lekktu@gmail.com>
parents:
48938
diff
changeset
|
99 ;; (1+ (not (in " "))) |
39516 | 100 ;; " " |
101 ;; (submatch (1+ (not (in " ")))) | |
48938
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
102 ;; " ")) |
39516 | 103 ;; |
104 ;; "\\\\\\\\\\[\\w+" | |
105 ;; (rx (and ?\\ ?\\ ?\[ (1+ word))) | |
106 ;; | |
107 ;; etc. | |
108 | |
109 ;;; History: | |
49598
0d8b17d428b5
Trailing whitepace deleted.
Juanma Barranquero <lekktu@gmail.com>
parents:
48938
diff
changeset
|
110 ;; |
39516 | 111 |
112 ;;; Code: | |
113 | |
(defconst rx-constituents
  '(;; --- Sequencing and alternation ---
    (and . (rx-and 1 nil))
    (seq . and)                         ; SRE
    (: . and)                           ; SRE
    (sequence . and)                    ; sregex
    (or . (rx-or 1 nil))
    (| . or)                            ; SRE

    ;; --- Single characters and character sets ---
    (not-newline . ".")
    (nonl . not-newline)                ; SRE
    (anything . "\\(?:.\\|\n\\)")
    (any . (rx-any 1 nil rx-check-any)) ; inconsistent with SRE
    (in . any)
    (char . any)                        ; sregex
    (not-char . (rx-not-char 1 nil rx-check-any)) ; sregex
    (not . (rx-not 1 1 rx-check-not))

    ;; --- Counted repetition ---
    ;; Partially consistent with sregex, whose `repeat' is like our
    ;; `**'.  (`repeat' with optional max arg and multiple sexp forms
    ;; is ambiguous.)
    (repeat . (rx-repeat 2 3))
    (= . (rx-= 2 nil))                  ; SRE
    (>= . (rx->= 2 nil))                ; SRE
    (** . (rx-** 2 nil))                ; SRE

    ;; --- Groups ---
    (submatch . (rx-submatch 1 nil))    ; SRE
    (group . submatch)

    ;; --- Kleene-style operators and their aliases ---
    (zero-or-more . (rx-kleene 1 nil))
    (one-or-more . (rx-kleene 1 nil))
    (zero-or-one . (rx-kleene 1 nil))
    (\? . zero-or-one)                  ; SRE
    (\?? . zero-or-one)
    (* . zero-or-more)                  ; SRE
    (*? . zero-or-more)
    (0+ . zero-or-more)
    (+ . one-or-more)                   ; SRE
    (+? . one-or-more)
    (1+ . one-or-more)
    (optional . zero-or-one)
    (opt . zero-or-one)                 ; sregex
    (minimal-match . (rx-greedy 1 1))
    (maximal-match . (rx-greedy 1 1))
    (backref . (rx-backref 1 1 rx-check-backref))

    ;; --- Anchors and boundaries ---
    (line-start . "^")
    (bol . line-start)                  ; SRE
    (line-end . "$")
    (eol . line-end)                    ; SRE
    (string-start . "\\`")
    (bos . string-start)                ; SRE
    (bot . string-start)                ; sregex
    (string-end . "\\'")
    (eos . string-end)                  ; SRE
    (eot . string-end)                  ; sregex
    (buffer-start . "\\`")
    (buffer-end . "\\'")
    (point . "\\=")
    (word-start . "\\<")
    (bow . word-start)                  ; SRE
    (word-end . "\\>")
    (eow . word-end)                    ; SRE
    (word-boundary . "\\b")
    (not-word-boundary . "\\B")         ; sregex
    (symbol-start . "\\_<")
    (symbol-end . "\\_>")

    ;; --- Syntax, category, and escape hatches ---
    (syntax . (rx-syntax 1 1))
    (not-syntax . (rx-not-syntax 1 1))  ; sregex
    (category . (rx-category 1 1 rx-check-category))
    (eval . (rx-eval 1 1))
    (regexp . (rx-regexp 1 1 stringp))

    ;; --- Named character classes ---
    (digit . "[[:digit:]]")
    (numeric . digit)                   ; SRE
    (num . digit)                       ; SRE
    (control . "[[:cntrl:]]")           ; SRE
    (cntrl . control)                   ; SRE
    (hex-digit . "[[:xdigit:]]")        ; SRE
    (hex . hex-digit)                   ; SRE
    (xdigit . hex-digit)                ; SRE
    (blank . "[[:blank:]]")             ; SRE
    (graphic . "[[:graph:]]")           ; SRE
    (graph . graphic)                   ; SRE
    (printing . "[[:print:]]")          ; SRE
    (print . printing)                  ; SRE
    (alphanumeric . "[[:alnum:]]")      ; SRE
    (alnum . alphanumeric)              ; SRE
    (letter . "[[:alpha:]]")
    (alphabetic . letter)               ; SRE
    (alpha . letter)                    ; SRE
    (ascii . "[[:ascii:]]")             ; SRE
    (nonascii . "[[:nonascii:]]")
    (lower . "[[:lower:]]")             ; SRE
    (lower-case . lower)                ; SRE
    (punctuation . "[[:punct:]]")       ; SRE
    (punct . punctuation)               ; SRE
    (space . "[[:space:]]")             ; SRE
    (whitespace . space)                ; SRE
    (white . space)                     ; SRE
    (upper . "[[:upper:]]")             ; SRE
    (upper-case . upper)                ; SRE
    (word . "[[:word:]]")               ; inconsistent with SRE
    (wordchar . word)                   ; sregex
    (not-wordchar . "[^[:word:]]")      ; sregex (use \\W?)
    )
  "Alist describing every constituent usable in a sexp regexp.
Elements look like (SYMBOL . DEFN), SYMBOL being the constituent's name.
DEFN takes one of three shapes:

- a string: SYMBOL translates directly into that regexp string;
- a symbol: SYMBOL is an alias, and the definition of DEFN is looked
  up in this alist in turn, recursively;
- a list (FUNCTION MIN-ARGS MAX-ARGS PREDICATE): FUNCTION generates
  the code for a form headed by SYMBOL.  Such a form must have at
  least MIN-ARGS and at most MAX-ARGS arguments, where a MAX-ARGS of
  nil means there is no upper limit.  When PREDICATE is given, every
  argument of the form must satisfy it.")
224 | |
225 | |
(defconst rx-syntax
  '((whitespace         . ?-)
    (punctuation        . ?.)
    (word               . ?w)
    (symbol             . ?_)
    (open-parenthesis   . ?\()
    (close-parenthesis  . ?\))
    (expression-prefix  . ?\')
    (string-quote       . ?\")
    (paired-delimiter   . ?$)
    (escape             . ?\\)
    (character-quote    . ?/)
    (comment-start      . ?<)
    (comment-end        . ?>)
    (string-delimiter   . ?|)
    (comment-delimiter  . ?!))
  "Alist translating Rx syntax-class names into syntax characters.
Every element is a pair (SYMBOL . CHAR).  SYMBOL is a name accepted in
the form `(syntax SYMBOL)'; CHAR is the syntax character that stands
for it after \\s or \\S in a regular expression.")
247 | |
248 | |
(defconst rx-categories
  '((consonant . ?0)
    (base-vowel . ?1)
    (upper-diacritical-mark . ?2)
    (lower-diacritical-mark . ?3)
    (tone-mark . ?4)
    (symbol . ?5)
    (digit . ?6)
    (vowel-modifying-diacritical-mark . ?7)
    (vowel-sign . ?8)
    (semivowel-lower . ?9)
    (not-at-end-of-line . ?<)
    (not-at-beginning-of-line . ?>)
    (alpha-numeric-two-byte . ?A)
    (chinese-two-byte . ?C)
    ;; Historical misspelling of `chinese-two-byte'.  Kept so that
    ;; existing `(category chinse-two-byte)' forms keep working.
    (chinse-two-byte . ?C)
    (greek-two-byte . ?G)
    (japanese-hiragana-two-byte . ?H)
    (indian-two-byte . ?I)
    (japanese-katakana-two-byte . ?K)
    (korean-hangul-two-byte . ?N)
    (cyrillic-two-byte . ?Y)
    (combining-diacritic . ?^)
    (ascii . ?a)
    (arabic . ?b)
    (chinese . ?c)
    (ethiopic . ?e)
    (greek . ?g)
    (korean . ?h)
    (indian . ?i)
    (japanese . ?j)
    (japanese-katakana . ?k)
    (latin . ?l)
    (lao . ?o)
    (tibetan . ?q)
    (japanese-roman . ?r)
    (thai . ?t)
    (vietnamese . ?v)
    (hebrew . ?w)
    (cyrillic . ?y)
    (can-break . ?|))
  "Alist mapping symbols to category characters.
Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid
symbol in `(category SYMBOL)', and CHAR is the category character
corresponding to SYMBOL, as it would be used with `\\c' or `\\C' in
regular expression strings.

The entry `chinse-two-byte' is an old misspelling of
`chinese-two-byte'; both map to the same category character.")
294 | |
295 | |
(defvar rx-greedy-flag t
  "Whether `zero-or-one', `zero-or-more' and `one-or-more' are greedy.
Non-nil means produce greedy regular expressions for those forms.
Dynamically bound.")
299 | |
300 | |
(defun rx-info (op)
  "Look up the parsing/code generation info for OP in `rx-constituents'.
Two characters are accepted in place of symbols: the space character
\(ASCII 32) is looked up as the symbol `?', and the character `?' is
looked up as the symbol `??'.
Alias entries (whose definition is itself a symbol) are followed until
a non-symbol definition is reached.  The result is that definition, or
nil when the chain ends at a symbol with no entry."
  (let ((entry (cond ((eq op ?\s) '\?)
                     ((eq op ??) '\??)
                     (t op))))
    ;; Each step replaces an alias by its definition; a string, a list
    ;; or nil ends the chase.
    (while (and entry (symbolp entry))
      (setq entry (cdr (assq entry rx-constituents))))
    entry))
49598
0d8b17d428b5
Trailing whitepace deleted.
Juanma Barranquero <lekktu@gmail.com>
parents:
48938
diff
changeset
|
311 |
39516 | 312 |
(defun rx-check (form)
  "Validate FORM against the parsing info of its head (see `rx-info').
Signal an error if FORM is not a list, if it has fewer arguments than
the minimum or more than the maximum recorded for its head, or if a
predicate is recorded and some argument fails it.  Return nil when
FORM passes."
  (if (not (listp form))
      (error "rx `%s' needs argument(s)" form))
  (let* ((head (car form))
         (info (rx-info head))
         (count (1- (length form)))     ; arguments, excluding the head
         (lower (nth 1 info))
         (upper (nth 2 info))
         (predicate (nth 3 info)))
    (if (and lower (< count lower))
        (error "rx form `%s' requires at least %d args"
               head lower))
    (if (and upper (> count upper))
        (error "rx form `%s' accepts at most %d args"
               head upper))
    (if predicate
        (dolist (argument (cdr form))
          (or (funcall predicate argument)
              (error "rx form `%s' requires args satisfying `%s'"
                     head predicate))))))
335 | |
336 | |
(defun rx-and (form)
  "Translate FORM, which is `(and FORM1 ...)', into a regexp string.
Each sub-form is translated with `rx-to-string' (passing `no-group'),
the results are concatenated in order, and the whole is wrapped in a
shy group."
  (rx-check form)
  (let ((pieces (mapcar (lambda (sub-form)
                          (rx-to-string sub-form 'no-group))
                        (cdr form))))
    (concat "\\(?:" (apply #'concat pieces) "\\)")))
39516 | 346 |
347 | |
(defun rx-or (form)
  "Translate FORM, which is `(or FORM1 ...)', into a regexp string.
When every alternative is a literal string, `regexp-opt' builds the
alternation.  Otherwise each alternative is translated with
`rx-to-string' and the results are joined with \"\\\\|\".  Either way
the result is enclosed in a shy group."
  (rx-check form)
  (let* ((alternatives (cdr form))
         (only-strings (not (memq nil (mapcar #'stringp alternatives)))))
    (concat "\\(?:"
            (if only-strings
                (regexp-opt alternatives)
              (mapconcat #'rx-to-string alternatives "\\|"))
            "\\)")))
39516 | 360 |
361 | |
55103
93f6ab2a0eb5
(rx-syntax): Move sregex style syntax to code.
Stefan Monnier <monnier@iro.umontreal.ca>
parents:
55102
diff
changeset
|
(defvar rx-bracket)                     ; dynamically bound in `rx-any'

(defun rx-check-any (arg)
  "Validate ARG, one argument of an Rx `any' form, and normalize it.
ARG may be a character, a non-empty string, a char-class symbol, or a
pair (FROM . TO) of characters.  Return the string to splice between
the brackets of the resulting character alternative; signal an error
for anything else.

Side effect: when ARG contains \"]\", that character is removed from
the returned string and `rx-bracket' is set to \"]\" instead."
  ;; A lone character is handled like a one-character string.
  (when (integerp arg)
    (setq arg (string arg)))
  (cond
   ((stringp arg)
    (when (zerop (length arg))
      (error "String arg for Rx `any' must not be empty"))
    ;; Quote ^ at start; don't bother to check whether this is first arg.
    ;; NOTE(review): a backslash is normally literal inside a bracket
    ;; expression, so this may add `\\' to the set as well -- confirm.
    (when (eq ?^ (aref arg 0))
      (setq arg (concat "\\" arg)))
    ;; Remove ] and set flag for adding it to start of overall result.
    (when (string-match "\\]" arg)
      (setq arg (replace-regexp-in-string "\\]" "" arg)
            rx-bracket "]")))
   ((symbolp arg)
    ;; A char-class name: translate it, treating any error as "unknown".
    (let ((translation (condition-case nil
                           (rx-to-string arg 'no-group)
                         (error nil))))
      (if (null translation)
          (error "Invalid char class `%s' in Rx `any'" arg))
      (setq arg (substring translation 1 -1)))) ; strip outer brackets
   ;; sregex compatibility: (FROM . TO) becomes the range "FROM-TO".
   ((and (integerp (car-safe arg))
         (integerp (cdr-safe arg)))
    (setq arg (string (car arg) ?- (cdr arg)))))
  (if (stringp arg)
      arg
    (error "rx `any' requires string, character, char pair or char class args")))
39516 | 391 |
(defun rx-any (form)
  "Translate FORM, which is `(any ARG ...)', into a bracket expression.
Every ARG is normalized by `rx-check-any'.  A \"]\" found by that
function is emitted right after the opening bracket, and an argument
that is exactly \"-\" is moved in front of the others so that it cannot
form an accidental range."
  (rx-check form)
  (let* ((rx-bracket nil)
         ;; `rx-check-any' sets `rx-bracket' as a side effect.
         (pieces (mapcar #'rx-check-any (cdr form))))
    (when (member "-" pieces)
      (setq pieces (cons "-" (delete "-" pieces))))
    (concat "[" rx-bracket (mapconcat #'identity pieces "") "]")))
39516 | 403 |
404 | |
53992
c5c237251824
(rx-check, rx-check-any, rx-check-not)
Eli Zaretskii <eliz@is.elta.co.il>
parents:
53974
diff
changeset
|
(defun rx-check-not (arg)
  "Check arg ARG for Rx `not'.
ARG is acceptable when it is one of:
- a symbol that `rx-to-string' translates into a bracketed character
  class such as \"[[:digit:]]\";
- the symbol `word-boundary';
- a list whose head is `not', `any', `in', `syntax' or `category'.
Return t for an acceptable ARG and signal an error otherwise."
  (unless (or (and (symbolp arg)
                   ;; The class name between the colons has one or more
                   ;; characters, hence the `+'.  Without it no real
                   ;; class name such as \"digit\" could ever match.
                   (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'"
                                 ;; An untranslatable symbol counts as
                                 ;; \"not a character class\".
                                 (condition-case nil
                                     (rx-to-string arg 'no-group)
                                   (error ""))))
              (eq arg 'word-boundary)
              (and (consp arg)
                   (memq (car arg) '(not any in syntax category))))
    (error "rx `not' syntax error: %s" arg))
  t)
39516 | 417 |
418 | |
(defun rx-not (form)
  "Parse and produce code from FORM.  FORM is `(not ...)'.
The argument is translated with `rx-to-string' and the resulting
regexp string is then negated:
- \"[^...]\" loses its negation; a single remaining character is
  returned quoted, so that a regexp metacharacter stays literal;
- \"[...]\" becomes \"[^...]\";
- a leading \\s, \\c or \\b becomes \\S, \\C or \\B, and vice versa;
- anything else is wrapped as \"[^...]\"."
  (rx-check form)
  (let ((result (rx-to-string (cadr form) 'no-group))
        ;; Bound to nil so the \\s versus \\S tests below are
        ;; case-sensitive.
        case-fold-search)
    (cond ((string-match "\\`\\[^" result)
           (if (= (length result) 4)
               ;; \"[^X]\" un-negates to the bare character X.  Quote it:
               ;; X may be special outside brackets (e.g. \".\" or \"*\").
               (regexp-quote (substring result 2 3))
             (concat "[" (substring result 2))))
          ((eq ?\[ (aref result 0))
           (concat "[^" (substring result 1)))
          ((string-match "\\`\\\\[scb]" result)
           (concat (capitalize (substring result 0 2)) (substring result 2)))
          ;; Negating an already negated backslash construct restores
          ;; the positive form instead of falling into the generic case.
          ((string-match "\\`\\\\[SCB]" result)
           (concat (downcase (substring result 0 2)) (substring result 2)))
          (t
           (concat "[^" result "]")))))
434 | |
435 | |
(defun rx-not-char (form)
  "Translate FORM, which is `(not-char ...)', into a regexp string.
The arguments are re-wrapped as `(not (in ...))' and handed to `rx-not'."
  (rx-check form)
  (rx-not (list 'not (cons 'in (cdr form)))))
440 | |
441 | |
(defun rx-not-syntax (form)
  "Translate FORM, which is `(not-syntax SYNTAX)', into a regexp string.
The argument is re-wrapped as `(not (syntax SYNTAX))' and handed to
`rx-not'."
  (rx-check form)
  (rx-not (list 'not (cons 'syntax (cdr form)))))
446 | |
447 | |
(defun rx-trans-forms (form &optional skip)
  "Collapse the trailing sub-forms of FORM into a single `and' form.
A form (HEAD REST ...) becomes (HEAD (and REST ...)).
SKIP, if non-nil, is the number of items after the head to leave
alone, so with SKIP equal to 1 `(= N REST ...)' becomes
`(= N (and REST ...))'.
FORM itself is returned when exactly one sub-form follows the skipped
items.  Otherwise a modified copy is returned and FORM is left
untouched."
  (let* ((offset (or skip 0))
         (trailing (nthcdr (1+ offset) form)))
    (if (= 1 (length trailing))
        form
      ;; Work on a copy so the caller's list is never modified.
      (let ((copy (copy-sequence form)))
        (setcdr (nthcdr offset copy) (list (cons 'and trailing)))
        copy))))
460 | |
461 | |
(defun rx-= (form)
  "Translate FORM, which is `(= N ...)', into a regexp string.
The sub-forms after N are first grouped by `rx-trans-forms'.  N must
be a positive integer.  The result matches exactly N repetitions."
  (rx-check form)
  (let* ((normalized (rx-trans-forms form 1))
         (count (nth 1 normalized)))
    (if (not (and (integerp count)
                  (> count 0)))
        (error "rx `=' requires positive integer first arg"))
    (format "%s\\{%d\\}" (rx-to-string (nth 2 normalized)) count)))


(defun rx->= (form)
  "Translate FORM, which is `(>= N ...)', into a regexp string.
The sub-forms after N are first grouped by `rx-trans-forms'.  N must
be a positive integer.  The result matches N or more repetitions."
  (rx-check form)
  (let* ((normalized (rx-trans-forms form 1))
         (count (nth 1 normalized)))
    (if (not (and (integerp count)
                  (> count 0)))
        (error "rx `>=' requires positive integer first arg"))
    (format "%s\\{%d,\\}" (rx-to-string (nth 2 normalized)) count)))
480 | |
481 | |
(defun rx-** (form)
  "Translate FORM, which is `(** N M ...)', into a regexp string.
The sub-forms after N and M are grouped by `rx-trans-forms', the head
is replaced by `repeat', and the new form is passed to `rx-to-string'."
  (rx-check form)
  (rx-to-string (cons 'repeat (cdr (rx-trans-forms form 2)))))
487 | |
488 | |
(defun rx-repeat (form)
  "Translate FORM into a counted-repetition regexp string.
FORM is either `(repeat N FORM1)', matching exactly N repetitions, or
`(repeat N M FORM1)', matching between N and M repetitions.
In the first shape N must be a positive integer.  In the second, N and
M must be non-negative integers with M not smaller than N."
  (rx-check form)
  (let ((low (nth 1 form)))
    (if (= (length form) 3)
        ;; (repeat N FORM1)
        (progn
          (unless (and (integerp low)
                       (> low 0))
            (error "rx `repeat' requires positive integer first arg"))
          (format "%s\\{%d\\}" (rx-to-string (nth 2 form)) low))
      ;; (repeat N M FORM1)
      (let ((high (nth 2 form)))
        (unless (and (integerp high)
                     (>= high 0)
                     (integerp low)
                     (>= low 0)
                     (>= high low))
          (error "rx `repeat' range error"))
        (format "%s\\{%d,%d\\}" (rx-to-string (nth 3 form))
                low high)))))
507 | |
508 | |
(defun rx-submatch (form)
  "Translate FORM, which is `(submatch ...)', into a regexp string.
Each sub-form is translated with `rx-to-string' (passing `no-group'),
the results are concatenated in order, and the whole is wrapped in a
capturing group."
  (let ((pieces (mapcar (lambda (sub-form)
                          (rx-to-string sub-form 'no-group))
                        (cdr form))))
    (concat "\\(" (apply #'concat pieces) "\\)")))
39516 | 515 |
53992
c5c237251824
(rx-check, rx-check-any, rx-check-not)
Eli Zaretskii <eliz@is.elta.co.il>
parents:
53974
diff
changeset
|
(defun rx-backref (form)
  "Translate FORM, which is `(backref N)', into a regexp string.
The result is a backslash followed by N in decimal."
  (rx-check form)
  (let ((group-number (nth 1 form)))
    (format "\\%d" group-number)))
c5c237251824
(rx-check, rx-check-any, rx-check-not)
Eli Zaretskii <eliz@is.elta.co.il>
parents:
53974
diff
changeset
|
520 |
c5c237251824
(rx-check, rx-check-any, rx-check-not)
Eli Zaretskii <eliz@is.elta.co.il>
parents:
53974
diff
changeset
|
(defun rx-check-backref (arg)
  "Validate ARG, the argument of an Rx `backref' form.
Return t when ARG is an integer between 1 and 9 inclusive, and signal
an error otherwise."
  (unless (and (integerp arg)
               (<= 1 arg)
               (<= arg 9))
    (error "rx `backref' requires numeric 1<=arg<=9: %s" arg))
  t)
c5c237251824
(rx-check, rx-check-any, rx-check-not)
Eli Zaretskii <eliz@is.elta.co.il>
parents:
53974
diff
changeset
|
525 |
(defun rx-kleene (form)
  "Translate a repetition FORM into a regexp string.
FORM is `(OP FORM1 ...)', OP being `zero-or-one', `zero-or-more',
`one-or-more' or one of their aliases.  Several sub-forms are first
grouped into one by `rx-trans-forms'.
Greediness depends on OP:
- `*', `+' and `?' always produce a greedy regexp;
- `*?', `+?' and `??' always produce a non-greedy regexp;
- any other OP produces a greedy regexp if `rx-greedy-flag' is non-nil
  and a non-greedy one otherwise.
The translated sub-form is wrapped in a shy group unless `rx-atomic-p'
accepts it."
  (rx-check form)
  (let* ((normalized (rx-trans-forms form))
         (head (car normalized))
         ;; The reader turns `?' followed by a space into the space
         ;; character and `??' into the character `?', so those two
         ;; operators arrive here as characters rather than symbols.
         (laziness (cond ((memq head '(* + ?\s)) "")
                         ((memq head '(*? +? ??)) "?")
                         (rx-greedy-flag "")
                         (t "?")))
         (operator (cond ((memq head '(* *? 0+ zero-or-more)) "*")
                         ((memq head '(+ +? 1+ one-or-more)) "+")
                         (t "?")))
         (body (rx-to-string (cadr normalized) 'no-group)))
    (concat (if (rx-atomic-p body)
                body
              (concat "\\(?:" body "\\)"))
            operator
            laziness)))
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
547 |
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
548 (defun rx-atomic-p (r) |
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
549 "Return non-nil if regexp string R is atomic. |
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
550 An atomic regexp R is one such that a suffix operator |
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
551 appended to R will apply to all of R. For example, \"a\" |
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
552 \"[abc]\" and \"\\(ab\\|ab*c\\)\" are atomic and \"ab\", |
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
553 \"[ab]c\", and \"ab\\|ab*c\" are not atomic. |
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
554 |
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
555 This function may return false negatives, but it will not |
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
556 return false positives. It is nevertheless useful in |
78474
88c9f4e4160e
Replace `iff' in doc-strings and comments.
Glenn Morris <rgm@gnu.org>
parents:
78217
diff
changeset
|
557 situations where an efficiency shortcut can be taken only if a |
48938
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
558 regexp is atomic. The function can be improved to detect |
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
559 more cases of atomic regexps. Presently, this function |
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
560 detects the following categories of atomic regexp; |
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
561 |
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
562 a group or shy group: \\(...\\) |
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
563 a character class: [...] |
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
564 a single character: a |
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
565 |
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
566 On the other hand, false negatives will be returned for |
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
567 regexps that are atomic but end in operators, such as |
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
568 \"a+\". I think these are rare. Probably such cases could |
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
569 be detected without much effort. A guarantee of no false |
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
570 negatives would require a theoretic specification of the set |
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
571 of all atomic regexps." |
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
572 (let ((l (length r))) |
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
573 (or (equal l 1) |
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
574 (and (>= l 6) |
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
575 (equal (substring r 0 2) "\\(") |
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
576 (equal (substring r -2) "\\)")) |
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
577 (and (>= l 2) |
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
578 (equal (substring r 0 1) "[") |
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
579 (equal (substring r -1) "]"))))) |
39516 | 580 |
581 | |
(defun rx-syntax (form)
  "Translate FORM, a list `(syntax SYMBOL)', into a `\\sC' regexp string.
SYMBOL is first looked up as a key in the alist `rx-syntax'.  For
sregex compatibility, a symbol whose name is a single character is
also accepted when that character is the value of an entry in
`rx-syntax'.  Signal an error if SYMBOL matches neither way."
  (rx-check form)
  (let* ((sym (cadr form))
         (entry (or (assq sym rx-syntax)
                    ;; sregex style: the symbol is named by the syntax
                    ;; character itself, so search the alist by value.
                    (let ((name (symbol-name sym)))
                      (and (= 1 (length name))
                           (rassq (aref name 0) rx-syntax))))))
    (if entry
        (format "\\s%c" (cdr entry))
      (error "Unknown rx syntax `%s'" sym))))
595 | |
596 | |
(defun rx-check-category (form)
  "Validate FORM, the argument of a `(category FORM)' construct.
FORM is acceptable if it is an integer, or if it is a key of the
alist `rx-categories' whose associated value is non-nil.
Return t if FORM is acceptable; otherwise signal an error."
  (cond ((integerp form) t)
        ((cdr (assq form rx-categories)) t)
        (t (error "Unknown category `%s'" form))))
49598
0d8b17d428b5
Trailing whitepace deleted.
Juanma Barranquero <lekktu@gmail.com>
parents:
48938
diff
changeset
|
603 |
39516 | 604 |
(defun rx-category (form)
  "Translate FORM, a list `(category CATEGORY)', into a `\\cC' regexp string.
An integer CATEGORY is used directly as the category character;
any other CATEGORY is looked up in the alist `rx-categories'."
  (rx-check form)
  (let ((arg (cadr form)))
    (format "\\c%c"
            (if (integerp arg)
                arg
              (cdr (assq arg rx-categories))))))
612 | |
613 | |
(defun rx-eval (form)
  "Translate FORM, a list `(eval EXPR)', into a regexp string.
EXPR is evaluated with `eval', and its value is then translated by
`rx-to-string' like any other rx form."
  (rx-check form)
  (rx-to-string (eval (nth 1 form))))
618 | |
619 | |
(defun rx-greedy (form)
  "Translate FORM, a greediness wrapper, into a regexp string.
FORM is `(minimal-match FORM1)' or `(maximal-match FORM1)'.
FORM1 is translated by `rx-to-string' with `rx-greedy-flag' bound
to t when the head of FORM is `maximal-match', so that greedy
`*', `+' and `?' operators are used, and to nil otherwise, so that
their non-greedy versions are used."
  (rx-check form)
  ;; `rx-greedy-flag' is rebound only for the translation of FORM1.
  (let ((rx-greedy-flag (eq 'maximal-match (car form))))
    (rx-to-string (nth 1 form))))
628 | |
629 | |
(defun rx-regexp (form)
  "Translate FORM, a list `(regexp STRING)', into a regexp string.
STRING is inserted as is, without quoting, enclosed in a shy group."
  (rx-check form)
  (let ((regexp (cadr form)))
    (concat "\\(?:" regexp "\\)")))
634 | |
635 | |
636 ;;;###autoload | |
(defun rx-to-string (form &optional no-group)
  "Parse and produce code for regular expression FORM.
FORM is a regular expression in sexp form.
NO-GROUP non-nil means don't put shy groups around the result.

A string or character FORM is matched literally: it is passed
through `regexp-quote'.  A symbol FORM, or the head of a list
FORM, is looked up with `rx-info'; an unknown form signals an
error, as does a FORM of any other type.

This function preserves the match data."
  (cond ((stringp form)
         (regexp-quote form))
        ((integerp form)
         (regexp-quote (char-to-string form)))
        ((symbolp form)
         (let ((info (rx-info form)))
           (cond ((stringp info)
                  info)
                 ((null info)
                  (error "Unknown rx form `%s'" form))
                 (t
                  (funcall (nth 0 info) form)))))
        ((consp form)
         (let ((info (rx-info (car form))))
           (unless (consp info)
             (error "Unknown rx form `%s'" (car form)))
           (let ((result (funcall (nth 0 info) form)))
             (if (or no-group
                     ;; A result that already starts with `\(' is not
                     ;; wrapped again.  Protect the caller's match
                     ;; data, which `string-match' would overwrite.
                     (save-match-data
                       (string-match "\\`\\\\[(]" result)))
                 result
               (concat "\\(?:" result "\\)")))))
        (t
         (error "rx syntax error at `%s'" form))))
39516 | 663 |
664 | |
665 ;;;###autoload | |
(defmacro rx (&rest regexps)
  "Translate regular expressions REGEXPS in sexp form to a regexp string.
REGEXPS is a non-empty sequence of forms of the sort listed below.
See also `rx-to-string' for how to do such a translation at run-time.

The following are valid subforms of regular expressions in sexp
notation.

STRING
     matches string STRING literally.

CHAR
     matches character CHAR literally.

`not-newline', `nonl'
     matches any character except a newline.

`anything'
     matches any character

`(any SET ...)'
`(in SET ...)'
`(char SET ...)'
     matches any character in SET ....  SET may be a character or string.
     Ranges of characters can be specified as `A-Z' in strings.
     Ranges may also be specified as conses like `(?A . ?Z)'.

     SET may also be the name of a character class: `digit',
     `control', `hex-digit', `blank', `graph', `print', `alnum',
     `alpha', `ascii', `nonascii', `lower', `punct', `space', `upper',
     `word', or one of their synonyms.

`(not (any SET ...))'
     matches any character not in SET ...

`line-start', `bol'
     matches the empty string, but only at the beginning of a line
     in the text being matched

`line-end', `eol'
     is similar to `line-start' but matches only at the end of a line

`string-start', `bos', `bot'
     matches the empty string, but only at the beginning of the
     string being matched against.

`string-end', `eos', `eot'
     matches the empty string, but only at the end of the
     string being matched against.

`buffer-start'
     matches the empty string, but only at the beginning of the
     buffer being matched against.  Actually equivalent to `string-start'.

`buffer-end'
     matches the empty string, but only at the end of the
     buffer being matched against.  Actually equivalent to `string-end'.

`point'
     matches the empty string, but only at point.

`word-start', `bow'
     matches the empty string, but only at the beginning of a word.

`word-end', `eow'
     matches the empty string, but only at the end of a word.

`word-boundary'
     matches the empty string, but only at the beginning or end of a
     word.

`(not word-boundary)'
`not-word-boundary'
     matches the empty string, but not at the beginning or end of a
     word.

`symbol-start'
     matches the empty string, but only at the beginning of a symbol.

`symbol-end'
     matches the empty string, but only at the end of a symbol.

`digit', `numeric', `num'
     matches 0 through 9.

`control', `cntrl'
     matches ASCII control characters.

`hex-digit', `hex', `xdigit'
     matches 0 through 9, a through f and A through F.

`blank'
     matches space and tab only.

`graphic', `graph'
     matches graphic characters--everything except ASCII control chars,
     space, and DEL.

`printing', `print'
     matches printing characters--everything except ASCII control chars
     and DEL.

`alphanumeric', `alnum'
     matches letters and digits.  (But at present, for multibyte characters,
     it matches anything that has word syntax.)

`letter', `alphabetic', `alpha'
     matches letters.  (But at present, for multibyte characters,
     it matches anything that has word syntax.)

`ascii'
     matches ASCII (unibyte) characters.

`nonascii'
     matches non-ASCII (multibyte) characters.

`lower', `lower-case'
     matches anything lower-case.

`upper', `upper-case'
     matches anything upper-case.

`punctuation', `punct'
     matches punctuation.  (But at present, for multibyte characters,
     it matches anything that has non-word syntax.)

`space', `whitespace', `white'
     matches anything that has whitespace syntax.

`word', `wordchar'
     matches anything that has word syntax.

`not-wordchar'
     matches anything that has non-word syntax.

`(syntax SYNTAX)'
     matches a character with syntax SYNTAX.  SYNTAX must be one
     of the following symbols, or a symbol corresponding to the syntax
     character, e.g. `\\.' for `\\s.'.

     `whitespace'          (\\s- in string notation)
     `punctuation'         (\\s.)
     `word'                (\\sw)
     `symbol'              (\\s_)
     `open-parenthesis'    (\\s()
     `close-parenthesis'   (\\s))
     `expression-prefix'   (\\s')
     `string-quote'        (\\s\")
     `paired-delimiter'    (\\s$)
     `escape'              (\\s\\)
     `character-quote'     (\\s/)
     `comment-start'       (\\s<)
     `comment-end'         (\\s>)
     `string-delimiter'    (\\s|)
     `comment-delimiter'   (\\s!)

`(not (syntax SYNTAX))'
     matches a character that doesn't have syntax SYNTAX.

`(category CATEGORY)'
     matches a character with category CATEGORY.  CATEGORY must be
     either a character to use for C, or one of the following symbols.

     `consonant'                          (\\c0 in string notation)
     `base-vowel'                         (\\c1)
     `upper-diacritical-mark'             (\\c2)
     `lower-diacritical-mark'             (\\c3)
     `tone-mark'                          (\\c4)
     `symbol'                             (\\c5)
     `digit'                              (\\c6)
     `vowel-modifying-diacritical-mark'   (\\c7)
     `vowel-sign'                         (\\c8)
     `semivowel-lower'                    (\\c9)
     `not-at-end-of-line'                 (\\c<)
     `not-at-beginning-of-line'           (\\c>)
     `alpha-numeric-two-byte'             (\\cA)
     `chinse-two-byte'                    (\\cC)
     `greek-two-byte'                     (\\cG)
     `japanese-hiragana-two-byte'         (\\cH)
     `indian-tow-byte'                    (\\cI)
     `japanese-katakana-two-byte'         (\\cK)
     `korean-hangul-two-byte'             (\\cN)
     `cyrillic-two-byte'                  (\\cY)
     `combining-diacritic'                (\\c^)
     `ascii'                              (\\ca)
     `arabic'                             (\\cb)
     `chinese'                            (\\cc)
     `ethiopic'                           (\\ce)
     `greek'                              (\\cg)
     `korean'                             (\\ch)
     `indian'                             (\\ci)
     `japanese'                           (\\cj)
     `japanese-katakana'                  (\\ck)
     `latin'                              (\\cl)
     `lao'                                (\\co)
     `tibetan'                            (\\cq)
     `japanese-roman'                     (\\cr)
     `thai'                               (\\ct)
     `vietnamese'                         (\\cv)
     `hebrew'                             (\\cw)
     `cyrillic'                           (\\cy)
     `can-break'                          (\\c|)

`(not (category CATEGORY))'
     matches a character that doesn't have category CATEGORY.

`(and SEXP1 SEXP2 ...)'
`(: SEXP1 SEXP2 ...)'
`(seq SEXP1 SEXP2 ...)'
`(sequence SEXP1 SEXP2 ...)'
     matches what SEXP1 matches, followed by what SEXP2 matches, etc.

`(submatch SEXP1 SEXP2 ...)'
`(group SEXP1 SEXP2 ...)'
     like `and', but makes the match accessible with `match-end',
     `match-beginning', and `match-string'.

`(or SEXP1 SEXP2 ...)'
`(| SEXP1 SEXP2 ...)'
     matches anything that matches SEXP1 or SEXP2, etc.  If all
     args are strings, use `regexp-opt' to optimize the resulting
     regular expression.

`(minimal-match SEXP)'
     produce a non-greedy regexp for SEXP.  Normally, regexps matching
     zero or more occurrences of something are \"greedy\" in that they
     match as much as they can, as long as the overall regexp can
     still match.  A non-greedy regexp matches as little as possible.

`(maximal-match SEXP)'
     produce a greedy regexp for SEXP.  This is the default.

Below, `SEXP ...' represents a sequence of regexp forms, treated as if
enclosed in `(and ...)'.

`(zero-or-more SEXP ...)'
`(0+ SEXP ...)'
     matches zero or more occurrences of what SEXP ... matches.

`(* SEXP ...)'
     like `zero-or-more', but always produces a greedy regexp, independent
     of `rx-greedy-flag'.

`(*? SEXP ...)'
     like `zero-or-more', but always produces a non-greedy regexp,
     independent of `rx-greedy-flag'.

`(one-or-more SEXP ...)'
`(1+ SEXP ...)'
     matches one or more occurrences of SEXP ...

`(+ SEXP ...)'
     like `one-or-more', but always produces a greedy regexp.

`(+? SEXP ...)'
     like `one-or-more', but always produces a non-greedy regexp.

`(zero-or-one SEXP ...)'
`(optional SEXP ...)'
`(opt SEXP ...)'
     matches zero or one occurrences of SEXP ...

`(? SEXP ...)'
     like `zero-or-one', but always produces a greedy regexp.

`(?? SEXP ...)'
     like `zero-or-one', but always produces a non-greedy regexp.

`(repeat N SEXP)'
`(= N SEXP ...)'
     matches N occurrences.

`(>= N SEXP ...)'
     matches N or more occurrences.

`(repeat N M SEXP)'
`(** N M SEXP ...)'
     matches N to M occurrences.

`(backref N)'
     matches what was matched previously by submatch N.

`(eval FORM)'
     evaluate FORM and insert result.  If result is a string,
     `regexp-quote' it.

`(regexp REGEXP)'
     include REGEXP in string notation in the result."
  ;; The translation runs when the macro is expanded, so the
  ;; expansion is the finished regexp string itself.
  (if (null regexps)
      (error "No regexp")
    (rx-to-string (if (cdr regexps)
                      ;; Several forms are treated as one implicit sequence.
                      `(and ,@regexps)
                    (car regexps))
                  t)))
969 | |
970 ;; ;; sregex.el replacement | |
39516 | 971 |
55102 | 972 ;; ;;;###autoload (provide 'sregex) |
973 ;; ;;;###autoload (autoload 'sregex "rx") | |
974 ;; (defalias 'sregex 'rx-to-string) | |
975 ;; ;;;###autoload (autoload 'sregexq "rx" nil nil 'macro) | |
976 ;; (defalias 'sregexq 'rx) | |
977 | |
39516 | 978 (provide 'rx) |
979 | |
60930
a6ae354aa8ef
(rx-constituents): Add symbol-start and symbol-end.
Stefan Monnier <monnier@iro.umontreal.ca>
parents:
55103
diff
changeset
|
980 ;; arch-tag: 12d01a63-0008-42bb-ab8c-1c7d63be370b |
39516 | 981 ;;; rx.el ends here |