Mercurial > emacs
annotate lisp/emacs-lisp/rx.el @ 53370:3f0eafd05a7b
Document the change in insert-for-yank.
author | Eli Zaretskii <eliz@is.elta.co.il> |
---|---|
date | Mon, 29 Dec 2003 11:46:03 +0000 |
parents | d5c1eeaa97e2 |
children | 818e19ae4c5a |
rev | line source |
---|---|
39516 | 1 ;;; rx.el --- sexp notation for regular expressions |
2 | |
3 ;; Copyright (C) 2001 Free Software Foundation, Inc. | |
4 | |
5 ;; Author: Gerd Moellmann <gerd@gnu.org> | |
6 ;; Maintainer: FSF | |
7 ;; Keywords: strings, regexps, extensions | |
8 | |
9 ;; This file is part of GNU Emacs. | |
10 | |
11 ;; GNU Emacs is free software; you can redistribute it and/or modify | |
12 ;; it under the terms of the GNU General Public License as published by | |
13 ;; the Free Software Foundation; either version 2, or (at your option) | |
14 ;; any later version. | |
15 | |
16 ;; GNU Emacs is distributed in the hope that it will be useful, | |
17 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
18 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
19 ;; GNU General Public License for more details. | |
20 | |
21 ;; You should have received a copy of the GNU General Public License | |
22 ;; along with GNU Emacs; see the file COPYING. If not, write to the | |
23 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
24 ;; Boston, MA 02111-1307, USA. | |
25 | |
26 ;;; Commentary: | |
27 | |
28 ;; This is another implementation of sexp-form regular expressions. | |
29 ;; It was unfortunately written without being aware of the Sregex | |
30 ;; package coming with Emacs, but as things stand, Rx completely | |
31 ;; covers all regexp features, which Sregex doesn't, doesn't suffer | |
32 ;; from the bugs mentioned in the commentary section of Sregex, and | |
33 ;; uses a nicer syntax (IMHO, of course :-). | |
34 | |
35 ;; Rx translates a sexp notation for regular expressions into the | |
36 ;; usual string notation. The translation can be done at compile-time | |
37 ;; by using the `rx' macro. It can be done at run-time by calling | |
38 ;; function `rx-to-string'. See the documentation of `rx' for a | |
39 ;; complete description of the sexp notation. | |
40 ;; | |
41 ;; Some examples of string regexps and their sexp counterparts: | |
42 ;; | |
43 ;; "^[a-z]*" | |
44 ;; (rx (and line-start (0+ (in "a-z")))) | |
45 ;; | |
46 ;; "\n[^ \t]" | |
47 ;; (rx (and "\n" (not blank))), or | |
48 ;; (rx (and "\n" (not (any " \t")))) | |
49 ;; | |
50 ;; "\\*\\*\\* EOOH \\*\\*\\*\n" | |
51 ;; (rx "*** EOOH ***\n") | |
52 ;; | |
53 ;; "\\<\\(catch\\|finally\\)\\>[^_]" | |
54 ;; (rx (and word-start (submatch (or "catch" "finally")) word-end | |
55 ;; (not (any ?_)))) | |
56 ;; | |
57 ;; "[ \t\n]*:\\([^:]+\\|$\\)" | |
58 ;; (rx (and (zero-or-more (in " \t\n")) ":" | |
59 ;; (submatch (or line-end (one-or-more (not (any ?:))))))) | |
60 ;; | |
61 ;; "^content-transfer-encoding:\\(\n?[\t ]\\)*quoted-printable\\(\n?[\t ]\\)*" | |
62 ;; (rx (and line-start | |
63 ;; "content-transfer-encoding:" | |
48938
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
64 ;; (+ (? ?\n)) blank |
39516 | 65 ;; "quoted-printable" |
48938
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
66 ;; (+ (? ?\n)) blank)) |
39516 | 67 ;; |
68 ;; (concat "^\\(?:" something-else "\\)") | |
69 ;; (rx (and line-start (eval something-else))), statically or | |
70 ;; (rx-to-string `(and line-start ,something-else)), dynamically. | |
71 ;; | |
72 ;; (regexp-opt '(STRING1 STRING2 ...)) | |
73 ;; (rx (or STRING1 STRING2 ...)), or in other words, `or' automatically | |
74 ;; calls `regexp-opt' as needed. | |
75 ;; | |
76 ;; "^;;\\s-*\n\\|^\n" | |
77 ;; (rx (or (and line-start ";;" (0+ space) ?\n) | |
78 ;; (and line-start ?\n))) | |
79 ;; | |
80 ;; "\\$[I]d: [^ ]+ \\([^ ]+\\) " | |
49598
0d8b17d428b5
Trailing whitepace deleted.
Juanma Barranquero <lekktu@gmail.com>
parents:
48938
diff
changeset
|
81 ;; (rx (and "$Id: " |
0d8b17d428b5
Trailing whitepace deleted.
Juanma Barranquero <lekktu@gmail.com>
parents:
48938
diff
changeset
|
82 ;; (1+ (not (in " "))) |
39516 | 83 ;; " " |
84 ;; (submatch (1+ (not (in " ")))) | |
48938
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
85 ;; " ")) |
39516 | 86 ;; |
87 ;; "\\\\\\\\\\[\\w+" | |
88 ;; (rx (and ?\\ ?\\ ?\[ (1+ word))) | |
89 ;; | |
90 ;; etc. | |
91 | |
92 ;;; History: | |
49598
0d8b17d428b5
Trailing whitepace deleted.
Juanma Barranquero <lekktu@gmail.com>
parents:
48938
diff
changeset
|
93 ;; |
39516 | 94 |
95 ;;; Code: | |
96 | |
97 | |
(defconst rx-constituents
  '((and		. (rx-and 1 nil))
    (or			. (rx-or 1 nil))
    (not-newline	. ".")
    ;; Wrapped in a shy group: a bare alternation would swallow its
    ;; neighbours when spliced into a sequence such as (and "a" anything "b").
    (anything		. "\\(?:.\\|\n\\)")
    (any		. (rx-any 1 1 rx-check-any))
    (in			. any)
    (not		. (rx-not 1 1 rx-check-not))
    (repeat		. (rx-repeat 2 3))
    (submatch		. (rx-submatch 1 nil))
    (group		. submatch)
    (zero-or-more	. (rx-kleene 1 1))
    (one-or-more	. (rx-kleene 1 1))
    (zero-or-one	. (rx-kleene 1 1))
    (\?			. zero-or-one)
    (\??		. zero-or-one)
    (*			. zero-or-more)
    (*?			. zero-or-more)
    (0+			. zero-or-more)
    (+			. one-or-more)
    (+?			. one-or-more)
    (1+			. one-or-more)
    (optional		. zero-or-one)
    (minimal-match	. (rx-greedy 1 1))
    (maximal-match	. (rx-greedy 1 1))
    (line-start		. "^")
    (line-end		. "$")
    (string-start	. "\\`")
    (string-end		. "\\'")
    (buffer-start	. "\\`")
    (buffer-end		. "\\'")
    (point		. "\\=")
    (word-start		. "\\<")
    (word-end		. "\\>")
    (word-boundary	. "\\b")
    (syntax		. (rx-syntax 1 1))
    (category		. (rx-category 1 1 rx-check-category))
    (eval		. (rx-eval 1 1))
    (regexp		. (rx-regexp 1 1 stringp))
    (digit		. "[[:digit:]]")
    (control		. "[[:cntrl:]]")
    (hex-digit		. "[[:xdigit:]]")
    (blank		. "[[:blank:]]")
    (graphic		. "[[:graph:]]")
    (printing		. "[[:print:]]")
    (alphanumeric	. "[[:alnum:]]")
    (letter		. "[[:alpha:]]")
    (ascii		. "[[:ascii:]]")
    (nonascii		. "[[:nonascii:]]")
    (lower		. "[[:lower:]]")
    (punctuation	. "[[:punct:]]")
    (space		. "[[:space:]]")
    (upper		. "[[:upper:]]")
    (word		. "[[:word:]]"))
  "Alist of sexp form regexp constituents.
Each element of the alist has the form (SYMBOL . DEFN).
SYMBOL is a valid constituent of sexp regular expressions.
If DEFN is a string, SYMBOL is translated into DEFN verbatim.
If DEFN is a symbol, it is an alias: the definition of DEFN is
looked up in this alist, recursively.
Otherwise, DEFN must be a list (FUNCTION MIN-ARGS MAX-ARGS PREDICATE).
FUNCTION is called with the whole form to produce the regexp string
for SYMBOL.  MIN-ARGS and MAX-ARGS are the minimum and maximum
number of arguments the function-form sexp constituent SYMBOL may
have in sexp regular expressions.  MAX-ARGS nil means no limit.
PREDICATE, if specified, means that all arguments must satisfy
PREDICATE.")
163 | |
164 | |
(defconst rx-syntax
  ;; Written as (SYMBOL CHAR) rows and converted to dotted pairs below.
  (mapcar (lambda (row) (cons (car row) (cadr row)))
          '((whitespace        ?-)
            (punctuation       ?.)
            (word              ?w)
            (symbol            ?_)
            (open-parenthesis  ?\()
            (close-parenthesis ?\))
            (expression-prefix ?\')
            (string-quote      ?\")
            (paired-delimiter  ?$)
            (escape            ?\\)
            (character-quote   ?/)
            (comment-start     ?<)
            (comment-end       ?>)))
  "Alist mapping Rx syntax symbols to syntax characters.
Each entry is a pair (SYMBOL . CHAR).  SYMBOL is a name accepted in
`(syntax SYMBOL)'; CHAR is the syntax designator character for
SYMBOL, i.e. the character that follows \\s or \\S in a regular
expression.")
184 | |
185 | |
(defconst rx-categories
  '((consonant			. ?0)
    (base-vowel			. ?1)
    (upper-diacritical-mark	. ?2)
    (lower-diacritical-mark	. ?3)
    (tone-mark			. ?4)
    (symbol			. ?5)
    (digit			. ?6)
    (vowel-modifying-diacritical-mark . ?7)
    (vowel-sign			. ?8)
    (semivowel-lower		. ?9)
    (not-at-end-of-line		. ?<)
    (not-at-beginning-of-line	. ?>)
    (alpha-numeric-two-byte	. ?A)
    (chinese-two-byte		. ?C)
    ;; Misspelled legacy name, kept so existing callers keep working.
    (chinse-two-byte		. ?C)
    (greek-two-byte		. ?G)
    (japanese-hiragana-two-byte . ?H)
    (indian-two-byte		. ?I)
    (japanese-katakana-two-byte . ?K)
    (korean-hangul-two-byte	. ?N)
    (cyrillic-two-byte		. ?Y)
    (ascii			. ?a)
    (arabic			. ?b)
    (chinese			. ?c)
    (ethiopic			. ?e)
    (greek			. ?g)
    (korean			. ?h)
    (indian			. ?i)
    (japanese			. ?j)
    (japanese-katakana		. ?k)
    (latin			. ?l)
    (lao			. ?o)
    (tibetan			. ?q)
    (japanese-roman		. ?r)
    (thai			. ?t)
    (vietnamese			. ?v)
    (hebrew			. ?w)
    (cyrillic			. ?y)
    (can-break			. ?|))
  "Alist mapping symbols to category characters.
Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid
symbol in `(category SYMBOL)', and CHAR is the category character
corresponding to SYMBOL, as it would be used with `\\c' or `\\C' in
regular expression strings.

`chinese-two-byte' is the correctly spelled name for category C;
the older spelling `chinse-two-byte' is still accepted.")
230 | |
231 | |
(defvar rx-greedy-flag t
  "Non-nil means produce greedy regular expressions.
This affects the forms `zero-or-one', `zero-or-more', and
`one-or-more' (and their aliases that do not force greediness).
Dynamically bound.")
235 | |
236 | |
(defun rx-info (op)
  "Return parsing/code generation info for OP.
The character 32 (space) is treated as the symbol `?', and the
character `?' as the symbol `??'.  Symbol definitions found in
`rx-constituents' are followed until a non-symbol definition is
reached; that definition (a string or a list) is returned.  Return
nil if OP, or an alias it leads to, has no entry."
  (let ((info (cond ((eq op 32) '\?)	; 32 is the space character
		    ((eq op ??) '\??)
		    (t op))))
    ;; Chase aliases: a symbol definition names another constituent.
    (while (and info (symbolp info))
      (setq info (cdr (assq info rx-constituents))))
    info))
49598
0d8b17d428b5
Trailing whitepace deleted.
Juanma Barranquero <lekktu@gmail.com>
parents:
48938
diff
changeset
|
247 |
39516 | 248 |
(defun rx-check (form)
  "Check FORM according to its car's parsing info.
The info comes from `rx-info'.  Signal an error if FORM has fewer
arguments than the declared minimum, more than the declared
maximum, or if a predicate is declared and some argument fails it.
Return nil when FORM passes."
  (let* ((head (car form))
	 (spec (rx-info head))
	 (count (1- (length form)))
	 (lower (nth 1 spec))
	 (upper (nth 2 spec))
	 (predicate (nth 3 spec)))
    (if (and lower (> lower count))
	(error "Rx form `%s' requires at least %d args"
	       head lower))
    (if (and upper (< upper count))
	(error "Rx form `%s' accepts at most %d args"
	       head upper))
    ;; Only walk the arguments when a predicate was declared.
    (let ((rest (and predicate (cdr form))))
      (while rest
	(or (funcall predicate (car rest))
	    (error "Rx form `%s' requires args satisfying `%s'"
		   head predicate))
	(setq rest (cdr rest))))))
269 | |
270 | |
(defun rx-and (form)
  "Parse and produce code from FORM.
FORM is of the form `(and FORM1 ...)'.  Each sub-form is translated
with `rx-to-string' (without its own shy group) and the pieces are
concatenated inside one shy group."
  (rx-check form)
  (let ((pieces (mapcar (lambda (sub) (rx-to-string sub 'no-group))
			(cdr form))))
    (concat "\\(?:" (apply #'concat pieces) "\\)")))
39516 | 280 |
281 | |
(defun rx-or (form)
  "Parse and produce code from FORM, which is `(or FORM1 ...)'.
When every alternative is a string, the alternation is built with
`regexp-opt'; otherwise each alternative is translated with
`rx-to-string' and the results are joined with \"\\\\|\".  The
result is always wrapped in a shy group."
  (rx-check form)
  (let* ((alternatives (cdr form))
	 ;; Tail starting at the first non-string alternative, or nil.
	 (non-string (let ((tail alternatives))
		       (while (and tail (stringp (car tail)))
			 (setq tail (cdr tail)))
		       tail)))
    (concat "\\(?:"
	    (if non-string
		(mapconcat #'rx-to-string alternatives "\\|")
	      (regexp-opt alternatives))
	    "\\)")))
39516 | 294 |
295 | |
(defun rx-quote-for-set (string)
  "Transform STRING for use in a character set.
If STRING contains `]', remove every occurrence and put a single
`]' at the front, the only position where it is taken literally.
If STRING starts with a `^' followed by at least one more
character, move the `^' to the end so it does not negate the set.
Return the transformed string."
  (when (string-match "]" string)
    ;; Keep all other characters: only the `]' characters are relocated.
    (setq string (concat "]" (replace-regexp-in-string "]" "" string t t))))
  (when (string-match "\\`^\\(\\(?:.\\|\n\\)+\\)\\'" string)
    (setq string (concat (substring string 1) "^")))
  string)
307 | |
308 | |
(defun rx-check-any (arg)
  "Check arg ARG for Rx `any'.
Return t if ARG is an integer (a character) or a non-empty string;
signal an error otherwise."
  (if (integerp arg)
      t
    (if (not (stringp arg))
	(error "Rx `any' requires string or character arg")
      (if (zerop (length arg))
	  (error "String arg for Rx `any' must not be empty")
	t))))
317 | |
318 | |
(defun rx-any (form)
  "Parse and produce code from FORM, which is `(any SET)'.
SET is a character or a non-empty string (validated by `rx-check').
A character, or a string of length one, is returned as a
one-character string; a longer string becomes a bracket expression
built with `rx-quote-for-set'."
  (rx-check form)
  (let ((set (nth 1 form)))
    ;; NOTE(review): the single-character results are not passed through
    ;; `regexp-quote', so e.g. (any \".\") yields \".\"; `rx-not' relies on
    ;; the raw character, so changing this needs a coordinated fix.
    (if (integerp set)
	(char-to-string set)
      (if (= 1 (length set))
	  set
	(concat "[" (rx-quote-for-set set) "]")))))
331 | |
332 | |
(defun rx-check-not (form)
  "Check the argument FORM of a `(not FORM)' expression.
FORM is accepted if it is one of the character-class symbols,
the symbol `word-boundary', or a list whose car is `not', `any',
`in', `syntax' or `category'.  Return t if FORM is acceptable;
signal an error otherwise."
  (unless (or (memq form
		    '(digit control hex-digit blank graphic printing
			    alphanumeric letter ascii nonascii lower
			    punctuation space upper word
			    ;; `rx-not' maps \\b to \\B, so allow it here.
			    word-boundary))
	      (and (consp form)
		   (memq (car form) '(not any in syntax category))))
    (error "Rx `not' syntax error: %s" form))
  t)
343 | |
344 | |
(defun rx-not (form)
  "Parse and produce code from FORM.  FORM is `(not ...)'.
The argument is translated with `rx-to-string' and the resulting
regexp string is negated textually:
  \"[^...]\" becomes \"[...]\" (or the bare character when the set
  holds a single character), \"[...]\" becomes \"[^...]\",
  \\s/\\S, \\c/\\C and \\b/\\B are swapped,
  anything else is wrapped as \"[^...]\"."
  (rx-check form)
  (let ((result (rx-to-string (cadr form) 'no-group))
	;; The tests below tell \\s from \\S, \\c from \\C and \\b from \\B,
	;; so they must not be subject to case folding.
	(case-fold-search nil))
    (cond ((string-match "\\`\\[^" result)
	   (if (= (length result) 4)
	       (substring result 2 3)
	     (concat "[" (substring result 2))))
	  ((string-match "\\`\\[" result)
	   (concat "[^" (substring result 1)))
	  ((string-match "\\`\\\\s." result)
	   (concat "\\S" (substring result 2)))
	  ((string-match "\\`\\\\S." result)
	   (concat "\\s" (substring result 2)))
	  ((string-match "\\`\\\\c." result)
	   (concat "\\C" (substring result 2)))
	  ((string-match "\\`\\\\C." result)
	   (concat "\\c" (substring result 2)))
	  ((string-match "\\`\\\\B" result)
	   (concat "\\b" (substring result 2)))
	  ((string-match "\\`\\\\b" result)
	   (concat "\\B" (substring result 2)))
	  (t
	   (concat "[^" result "]")))))
369 | |
370 | |
(defun rx-repeat (form)
  "Parse and produce code from FORM.
FORM is either `(repeat N FORM1)' or `(repeat N M FORM1)'.
N must be a positive integer in the first shape; in the second
shape N and M must be non-negative integers with N <= M.  Signal
an error otherwise.  The translation of FORM1 is wrapped in a shy
group unless `rx-atomic-p' says the interval operator would
already apply to all of it."
  (rx-check form)
  (let ((operand
	 ;; Translate SEXP so that a postfix operator covers all of it;
	 ;; e.g. the string \"ab\" must become \"\\\\(?:ab\\\\)\", not \"ab\".
	 (lambda (sexp)
	   (let ((regexp (rx-to-string sexp)))
	     (if (rx-atomic-p regexp)
		 regexp
	       (concat "\\(?:" regexp "\\)"))))))
    (cond ((= (length form) 3)
	   (unless (and (integerp (nth 1 form))
			(> (nth 1 form) 0))
	     (error "Rx `repeat' requires positive integer first arg"))
	   (format "%s\\{%d\\}" (funcall operand (nth 2 form)) (nth 1 form)))
	  ((or (not (integerp (nth 2 form)))
	       (< (nth 2 form) 0)
	       (not (integerp (nth 1 form)))
	       (< (nth 1 form) 0)
	       (< (nth 2 form) (nth 1 form)))
	   (error "Rx `repeat' range error"))
	  (t
	   (format "%s\\{%d,%d\\}" (funcall operand (nth 3 form))
		   (nth 1 form) (nth 2 form))))))
389 | |
390 | |
(defun rx-submatch (form)
  "Parse and produce code from FORM, which is `(submatch ...)'.
The sub-forms are translated with `rx-to-string' (without their
own shy groups), concatenated, and enclosed in a capturing group."
  (let ((pieces (mapcar (lambda (sub) (rx-to-string sub 'no-group))
			(cdr form))))
    (concat "\\(" (apply #'concat pieces) "\\)")))
39516 | 397 |
(defun rx-kleene (form)
  "Parse and produce code from FORM.
FORM is `(OP FORM1)', where OP is one of the `zero-or-one',
`zero-or-more' etc. operators.
If OP is one of `*', `+', `?', produce a greedy regexp.
If OP is one of `*?', `+?', `??', produce a non-greedy regexp.
If OP is anything else, produce a greedy regexp if `rx-greedy-flag'
is non-nil.
The translation of FORM1 is wrapped in a shy group unless
`rx-atomic-p' reports it as atomic."
  (rx-check form)
  (let* ((operator (car form))
	 ;; `(? X)' is read as the space character (32) followed by X, and
	 ;; `(?? X)' as the character `?' (63), hence the integers below.
	 (lazy (cond ((memq operator '(* + 32)) nil)
		     ((memq operator '(*? +? 63)) t)
		     (t (not rx-greedy-flag))))
	 (postfix (cond ((memq operator '(zero-or-more 0+ * *?)) "*")
			((memq operator '(one-or-more 1+ + +?)) "+")
			(t "?")))
	 (regexp (rx-to-string (cadr form) 'no-group)))
    (concat (if (rx-atomic-p regexp)
		regexp
	      (concat "\\(?:" regexp "\\)"))
	    postfix
	    (if lazy "?" ""))))
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
418 |
05f00479612c
(rx-and): Generate a shy group.
Richard M. Stallman <rms@gnu.org>
parents:
47257
diff
changeset
|
(defun rx-atomic-p (r)
  "Return non-nil if regexp string R is atomic.
An atomic regexp R is one such that a suffix operator
appended to R will apply to all of R.  For example, \"a\"
\"[abc]\" and \"\\(ab\\|ab*c\\)\" are atomic and \"ab\",
\"[ab]c\", and \"ab\\|ab*c\" are not atomic.

The test is purely textual.  R is reported as atomic when it is

  a single character:       a
  a group or shy group:     at least six characters, starting
                            with \\( and ending with \\)
  a character class:        starting with [ and ending with ]

Atomic regexps outside these shapes (for instance ones ending
in an operator, such as \"a+\") are reported as non-atomic, so
callers may add a redundant group around them."
  (let ((len (length r)))
    (cond ((= len 1) t)
	  ;; \\( ... \\)
	  ((and (> len 5)
		(string= "\\(" (substring r 0 2))
		(string= "\\)" (substring r (- len 2))))
	   t)
	  ;; [ ... ]
	  ((and (> len 1)
		(string= "[" (substring r 0 1))
		(string= "]" (substring r (1- len))))
	   t)
	  (t nil))))
39516 | 451 |
452 | |
(defun rx-syntax (form)
  "Parse and produce code from FORM, which is `(syntax SYMBOL)'.
SYMBOL is looked up in the alist `rx-syntax'; signal an error if
it has no entry there.  Return \"\\\\s\" followed by the syntax
character found."
  (rx-check form)
  (let* ((name (nth 1 form))
	 (entry (or (assq name rx-syntax)
		    (error "Unknown rx syntax `%s'" name))))
    (format "\\s%c" (cdr entry))))
460 | |
461 | |
(defun rx-check-category (form)
  "Check the argument FORM of a `(category FORM)'.
Return t if FORM is an integer (a category character) or a symbol
with an entry in `rx-categories'; signal an error otherwise."
  (if (or (integerp form)
	  (cdr (assq form rx-categories)))
      t
    (error "Unknown category `%s'" form)))
49598
0d8b17d428b5
Trailing whitepace deleted.
Juanma Barranquero <lekktu@gmail.com>
parents:
48938
diff
changeset
|
468 |
39516 | 469 |
(defun rx-category (form)
  "Parse and produce code from FORM, which is `(category SYMBOL)'.
The argument is either a category character, used as is, or a
symbol that is looked up in `rx-categories'.  Return \"\\\\c\"
followed by the category character."
  (rx-check form)
  (let ((category (nth 1 form)))
    (format "\\c%c"
	    (if (integerp category)
		category
	      (cdr (assq category rx-categories))))))
477 | |
478 | |
(defun rx-eval (form)
  "Parse and produce code from FORM, which is `(eval FORM)'.
The argument is evaluated with `eval' and the value is then
translated with `rx-to-string'."
  (rx-check form)
  (let ((value (eval (nth 1 form))))
    (rx-to-string value)))
483 | |
484 | |
(defun rx-greedy (form)
  "Parse and produce code from FORM.
If FORM is `(minimal-match FORM1)', non-greedy versions of `*',
`+', and `?' operators will be used in FORM1.  If FORM is
`(maximal-match FORM1)', greedy operators will be used.
This works by binding `rx-greedy-flag' around the translation of
FORM1."
  (rx-check form)
  (let* ((operator (car form))
	 (rx-greedy-flag (eq operator 'maximal-match)))
    (rx-to-string (nth 1 form))))
493 | |
494 | |
(defun rx-regexp (form)
  "Parse and produce code from FORM, which is `(regexp STRING)'.
STRING is inserted unchanged, enclosed in a shy group."
  (rx-check form)
  (let ((raw (nth 1 form)))
    (concat "\\(?:" raw "\\)")))
499 | |
500 | |
501 ;;;###autoload | |
(defun rx-to-string (form &optional no-group)
  "Parse and produce code for regular expression FORM.
FORM is a regular expression in sexp form.
NO-GROUP non-nil means don't put shy groups around the result.

Strings and characters are quoted with `regexp-quote'.  Symbols
and lists are translated according to `rx-info'; a list result
that does not already start with a group is wrapped in a shy
group unless NO-GROUP is non-nil.  Signal an error for unknown
forms or for any other kind of object."
  (cond ((or (stringp form) (integerp form))
	 (regexp-quote (if (integerp form)
			   (char-to-string form)
			 form)))
	((consp form)
	 (let ((handler (rx-info (car form))))
	   (if (not (consp handler))
	       (error "Unknown Rx form `%s'" (car form)))
	   (let ((regexp (funcall (car handler) form)))
	     ;; Add a shy group only when asked to and when the result
	     ;; does not already begin with an opening group.
	     (if (and (not no-group)
		      (not (string-match "\\`\\\\[(]" regexp)))
		 (concat "\\(?:" regexp "\\)")
	       regexp))))
	((symbolp form)
	 (let ((definition (rx-info form)))
	   (cond ((null definition)
		  (error "Unknown Rx form `%s'" form))
		 ((stringp definition)
		  definition)
		 (t
		  (funcall (car definition) form)))))
	(t
	 (error "Rx syntax error at `%s'" form))))
528 | |
529 | |
530 ;;;###autoload | |
(defmacro rx (regexp)
  "Translate a regular expression REGEXP in sexp form to a regexp string.
The macro expands into a call to `rx-to-string' on the quoted
REGEXP.  See also `rx-to-string' for how to do such a translation
at run-time.

The following are valid subforms of regular expressions in sexp
notation.

STRING
     matches string STRING literally.

CHAR
     matches character CHAR literally.

`not-newline'
     matches any character except a newline.

`anything'
     matches any character.

`(any SET)'
     matches any character in SET.  SET may be a character or string.
     Ranges of characters can be specified as `A-Z' in strings.

`(in SET)'
     like `any'.

`(not (any SET))'
     matches any character not in SET.

`line-start'
     matches the empty string, but only at the beginning of a line
     in the text being matched.

`line-end'
     is similar to `line-start' but matches only at the end of a line.

`string-start'
     matches the empty string, but only at the beginning of the
     string being matched against.

`string-end'
     matches the empty string, but only at the end of the
     string being matched against.

`buffer-start'
     matches the empty string, but only at the beginning of the
     buffer being matched against.

`buffer-end'
     matches the empty string, but only at the end of the
     buffer being matched against.

`point'
     matches the empty string, but only at point.

`word-start'
     matches the empty string, but only at the beginning of a word.

`word-end'
     matches the empty string, but only at the end of a word.

`word-boundary'
     matches the empty string, but only at the beginning or end of a
     word.

`(not word-boundary)'
     matches the empty string, but not at the beginning or end of a
     word.

`digit'
     matches 0 through 9.

`control'
     matches ASCII control characters.

`hex-digit'
     matches 0 through 9, a through f and A through F.

`blank'
     matches space and tab only.

`graphic'
     matches graphic characters--everything except ASCII control chars,
     space, and DEL.

`printing'
     matches printing characters--everything except ASCII control chars
     and DEL.

`alphanumeric'
     matches letters and digits.  (But at present, for multibyte characters,
     it matches anything that has word syntax.)

`letter'
     matches letters.  (But at present, for multibyte characters,
     it matches anything that has word syntax.)

`ascii'
     matches ASCII (unibyte) characters.

`nonascii'
     matches non-ASCII (multibyte) characters.

`lower'
     matches anything lower-case.

`upper'
     matches anything upper-case.

`punctuation'
     matches punctuation.  (But at present, for multibyte characters,
     it matches anything that has non-word syntax.)

`space'
     matches anything that has whitespace syntax.

`word'
     matches anything that has word syntax.

`(syntax SYNTAX)'
     matches a character with syntax SYNTAX.  SYNTAX must be one
     of the following symbols.

     `whitespace'		(\\s- in string notation)
     `punctuation'		(\\s.)
     `word'			(\\sw)
     `symbol'			(\\s_)
     `open-parenthesis'		(\\s()
     `close-parenthesis'	(\\s))
     `expression-prefix'	(\\s')
     `string-quote'		(\\s\")
     `paired-delimiter'		(\\s$)
     `escape'			(\\s\\)
     `character-quote'		(\\s/)
     `comment-start'		(\\s<)
     `comment-end'		(\\s>)

`(not (syntax SYNTAX))'
     matches a character that does not have syntax SYNTAX.

`(category CATEGORY)'
     matches a character with category CATEGORY.  CATEGORY must be
     either a character to use for C, or one of the following symbols.

     `consonant'			(\\c0 in string notation)
     `base-vowel'			(\\c1)
     `upper-diacritical-mark'		(\\c2)
     `lower-diacritical-mark'		(\\c3)
     `tone-mark'			(\\c4)
     `symbol'				(\\c5)
     `digit'				(\\c6)
     `vowel-modifying-diacritical-mark'	(\\c7)
     `vowel-sign'			(\\c8)
     `semivowel-lower'			(\\c9)
     `not-at-end-of-line'		(\\c<)
     `not-at-beginning-of-line'		(\\c>)
     `alpha-numeric-two-byte'		(\\cA)
     `chinse-two-byte'			(\\cC)
     `greek-two-byte'			(\\cG)
     `japanese-hiragana-two-byte'	(\\cH)
     `indian-two-byte'			(\\cI)
     `japanese-katakana-two-byte'	(\\cK)
     `korean-hangul-two-byte'		(\\cN)
     `cyrillic-two-byte'		(\\cY)
     `ascii'				(\\ca)
     `arabic'				(\\cb)
     `chinese'				(\\cc)
     `ethiopic'				(\\ce)
     `greek'				(\\cg)
     `korean'				(\\ch)
     `indian'				(\\ci)
     `japanese'				(\\cj)
     `japanese-katakana'		(\\ck)
     `latin'				(\\cl)
     `lao'				(\\co)
     `tibetan'				(\\cq)
     `japanese-roman'			(\\cr)
     `thai'				(\\ct)
     `vietnamese'			(\\cv)
     `hebrew'				(\\cw)
     `cyrillic'				(\\cy)
     `can-break'			(\\c|)

`(not (category CATEGORY))'
     matches a character that does not have category CATEGORY.

`(and SEXP1 SEXP2 ...)'
     matches what SEXP1 matches, followed by what SEXP2 matches, etc.

`(submatch SEXP1 SEXP2 ...)'
     like `and', but makes the match accessible with `match-end',
     `match-beginning', and `match-string'.

`(group SEXP1 SEXP2 ...)'
     another name for `submatch'.

`(or SEXP1 SEXP2 ...)'
     matches anything that matches SEXP1 or SEXP2, etc.  If all
     args are strings, use `regexp-opt' to optimize the resulting
     regular expression.

`(minimal-match SEXP)'
     produce a non-greedy regexp for SEXP.  Normally, regexps matching
     zero or more occurrences of something are \"greedy\" in that they
     match as much as they can, as long as the overall regexp can
     still match.  A non-greedy regexp matches as little as possible.

`(maximal-match SEXP)'
     produce a greedy regexp for SEXP.  This is the default.

`(zero-or-more SEXP)'
     matches zero or more occurrences of what SEXP matches.

`(0+ SEXP)'
     like `zero-or-more'.

`(* SEXP)'
     like `zero-or-more', but always produces a greedy regexp.

`(*? SEXP)'
     like `zero-or-more', but always produces a non-greedy regexp.

`(one-or-more SEXP)'
     matches one or more occurrences of what SEXP matches.

`(1+ SEXP)'
     like `one-or-more'.

`(+ SEXP)'
     like `one-or-more', but always produces a greedy regexp.

`(+? SEXP)'
     like `one-or-more', but always produces a non-greedy regexp.

`(zero-or-one SEXP)'
     matches zero or one occurrences of what SEXP matches.

`(optional SEXP)'
     like `zero-or-one'.

`(? SEXP)'
     like `zero-or-one', but always produces a greedy regexp.

`(?? SEXP)'
     like `zero-or-one', but always produces a non-greedy regexp.

`(repeat N SEXP)'
     matches N occurrences of what SEXP matches.

`(repeat N M SEXP)'
     matches N to M occurrences of what SEXP matches.

`(eval FORM)'
     evaluate FORM and insert result.  If result is a string,
     `regexp-quote' it.

`(regexp REGEXP)'
     include REGEXP in string notation in the result."

  `(rx-to-string ',regexp))
792 | |
793 | |
794 (provide 'rx) | |
795 | |
52401 | 796 ;;; arch-tag: 12d01a63-0008-42bb-ab8c-1c7d63be370b |
39516 | 797 ;;; rx.el ends here |