Mercurial > emacs
comparison lisp/emacs-lisp/sregex.el @ 22537:7947a4ea28a8
Initial revision
author | Dan Nicolaescu <done@ece.arizona.edu> |
---|---|
date | Mon, 22 Jun 1998 02:03:41 +0000 |
parents | |
children | 6e0af7097929 |
comparison
equal
deleted
inserted
replaced
22536:e4bcb7cb0038 | 22537:7947a4ea28a8 |
---|---|
1 ;;; sregex.el --- symbolic regular expressions | |
2 | |
3 ;; Copyright (C) 1997, 1998 Free Software Foundation, Inc. | |
4 | |
5 ;; Author: Bob Glickstein <bobg+sregex@zanshin.com> | |
6 ;; Maintainer: Bob Glickstein <bobg+sregex@zanshin.com> | |
7 | |
8 ;; This file is part of GNU Emacs. | |
9 | |
10 ;; GNU Emacs is free software; you can redistribute it and/or modify | |
11 ;; it under the terms of the GNU General Public License as published by | |
12 ;; the Free Software Foundation; either version 2, or (at your option) | |
13 ;; any later version. | |
14 | |
15 ;; GNU Emacs is distributed in the hope that it will be useful, | |
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
18 ;; GNU General Public License for more details. | |
19 | |
20 ;; You should have received a copy of the GNU General Public License | |
21 ;; along with GNU Emacs; see the file COPYING. If not, write to the | |
22 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
23 ;; Boston, MA 02111-1307, USA. | |
24 | |
25 ;;; Commentary: | |
26 | |
27 ;; This package allows you to write regular expressions using a | |
28 ;; totally new, Lisp-like syntax. | |
29 | |
30 ;; A "symbolic regular expression" (sregex for short) is a Lisp form | |
31 ;; that, when evaluated, produces the string form of the specified | |
32 ;; regular expression. Here's a simple example: | |
33 | |
34 ;; (sregexq (or "Bob" "Robert")) => "Bob\\|Robert" | |
35 | |
36 ;; As you can see, an sregex is specified by placing one or more | |
37 ;; special clauses in a call to `sregexq'. The clause in this case is | |
38 ;; the `or' of two strings (not to be confused with the Lisp function | |
39 ;; `or'). The list of allowable clauses appears below. | |
40 | |
41 ;; With sregex, it is never necessary to "escape" magic characters | |
42 ;; that are meant to be taken literally; that happens automatically. | |
43 ;; For example: | |
44 | |
45 ;; (sregexq "M*A*S*H") => "M\\*A\\*S\\*H" | |
46 | |
47 ;; It is also unnecessary to "group" parts of the expression together | |
48 ;; to overcome operator precedence; that also happens automatically. | |
49 ;; For example: | |
50 | |
51 ;; (sregexq (opt (or "Bob" "Robert"))) => "\\(Bob\\|Robert\\)?" | |
52 | |
53 ;; It *is* possible to group parts of the expression in order to refer | |
54 ;; to them with numbered backreferences: | |
55 | |
56 ;; (sregexq (group (or "Go" "Run")) | |
57 ;; ", Spot, " | |
58 ;; (backref 1)) => "\\(Go\\|Run\\), Spot, \\1" | |
59 | |
60 ;; If `sregexq' needs to introduce its own grouping parentheses, it | |
61 ;; will automatically renumber your backreferences: | |
62 | |
63 ;; (sregexq (opt "resent-") | |
64 ;; (group (or "to" "cc" "bcc")) | |
65 ;; ": " | |
66 ;; (backref 1)) => "\\(resent-\\)?\\(to\\|cc\\|bcc\\): \\2" | |
67 | |
68 ;; `sregexq' is a macro. Each time it is used, it constructs a simple | |
69 ;; Lisp expression that then invokes a moderately complex engine to | |
70 ;; interpret the sregex and render the string form. Because of this, | |
71 ;; I don't recommend sprinkling calls to `sregexq' throughout your | |
72 ;; code, the way one normally does with string regexes (which are | |
73 ;; cheap to evaluate). Instead, it's wiser to precompute the regexes | |
74 ;; you need wherever possible instead of repeatedly constructing the | |
75 ;; same ones over and over. Example: | |
76 | |
77 ;; (let ((field-regex (sregexq (opt "resent-") | |
78 ;; (or "to" "cc" "bcc")))) | |
79 ;; ... | |
80 ;; (while ... | |
81 ;; ... | |
82 ;; (re-search-forward field-regex ...) | |
83 ;; ...)) | |
84 | |
85 ;; The arguments to `sregexq' are automatically quoted, but the | |
86 ;; flipside of this is that it is not straightforward to include | |
87 ;; computed (i.e., non-constant) values in `sregexq' expressions. So | |
88 ;; `sregex' is a function that is like `sregexq' but which does not | |
89 ;; automatically quote its values. Literal sregex clauses must be | |
90 ;; explicitly quoted like so: | |
91 | |
92 ;; (sregex '(or "Bob" "Robert")) => "Bob\\|Robert" | |
93 | |
94 ;; but computed clauses can be included easily, allowing for the reuse | |
95 ;; of common clauses: | |
96 | |
97 ;; (let ((dotstar '(0+ any)) | |
98 ;; (whitespace '(1+ (syntax ?-))) | |
99 ;; (digits '(1+ (char (?0 . ?9))))) | |
100 ;; (sregex 'bol dotstar ":" whitespace digits)) => "^.*:\\s-+[0-9]+" | |
101 | |
102 ;; This package also provides sregex-specific versions of the Emacs | |
103 ;; functions `replace-match', `match-string', | |
104 ;; `match-string-no-properties', `match-beginning', `match-end', and | |
105 ;; `match-data'. In each case, the sregex version's name begins with | |
106 ;; `sregex-' and takes one additional optional parameter, an sregex | |
107 ;; "info" object. Each of these functions is concerned with numbered | |
108 ;; submatches. Since sregex may renumber submatches, alternate | |
109 ;; versions of these functions are needed that know how to adjust the | |
110 ;; supplied number. | |
111 | |
112 ;; The sregex info object for the most recently evaluated sregex can | |
113 ;; be obtained with `sregex-info'; so if you precompute your sregexes | |
114 ;; and you plan to use `replace-match' or one of the others with it, | |
115 ;; you need to record the info object for later use: | |
116 | |
117 ;; (let* ((regex (sregexq (opt "resent-") | |
118 ;; (group (or "to" "cc" "bcc")) | |
119 ;; ":")) | |
120 ;; (regex-info (sregex-info))) | |
121 ;; ... | |
122 ;; (if (re-search-forward regex ...) | |
123 ;; (let ((which (sregex-match-string 1 nil regex-info))) | |
124 ;; ...))) | |
125 | |
126 ;; In this example, `regex' is "\\(resent-\\)?\\(to\\|cc\\|bcc\\):", | |
127 ;; so the call to (sregex-match-string 1 ...) is automatically turned | |
128 ;; into a call to (match-string 2 ...). | |
129 | |
130 ;; If the sregex info argument to `sregex-replace-match', | |
131 ;; `sregex-match-string', `sregex-match-string-no-properties', | |
132 ;; `sregex-match-beginning', `sregex-match-end', or | |
133 ;; `sregex-match-data' is omitted, the current value of (sregex-info) | |
134 ;; is used. | |
135 | |
136 ;; You can do your own sregex submatch renumbering with | |
137 ;; `sregex-backref-num'. | |
138 | |
139 ;; Finally, `sregex-save-match-data' is like `save-match-data' but | |
140 ;; also saves and restores the information maintained by | |
141 ;; `sregex-info'. | |
142 | |
143 ;; To use this package in a Lisp program, simply (require 'sregex). | |
144 | |
145 ;; Here are the clauses allowed in an `sregex' or `sregexq' | |
146 ;; expression: | |
147 | |
148 ;; - a string | |
149 ;; This stands for the literal string. If it contains | |
150 ;; metacharacters, they will be escaped in the resulting regex | |
151 ;; (using `regexp-quote'). | |
152 | |
153 ;; - the symbol `any' | |
154 ;; This stands for ".", a regex matching any character except | |
155 ;; newline. | |
156 | |
157 ;; - the symbol `bol' | |
158 ;; Stands for "^", matching the empty string at the beginning of a line | |
159 | |
160 ;; - the symbol `eol' | |
161 ;; Stands for "$", matching the empty string at the end of a line | |
162 | |
163 ;; - (group CLAUSE ...) | |
164 ;; Groups the given CLAUSEs using "\\(" and "\\)". | |
165 | |
166 ;; - (sequence CLAUSE ...) | |
167 | |
168 ;; Groups the given CLAUSEs; may or may not use "\\(" and "\\)". | |
169 ;; Clauses groups by `sequence' do not count for purposes of | |
170 ;; numbering backreferences. Use `sequence' in situations like | |
171 ;; this: | |
172 | |
173 ;; (sregexq (or "dog" "cat" | |
174 ;; (sequence (opt "sea ") "monkey"))) | |
175 ;; => "dog\\|cat\\|\\(sea \\)?monkey" | |
176 | |
177 ;; where a single `or' alternate needs to contain multiple | |
178 ;; subclauses. | |
179 | |
180 ;; - (backref N) | |
181 ;; Matches the same string previously matched by the Nth "group" in | |
182 ;; the same sregex. N is a positive integer. In the resulting | |
183 ;; regex, N may be adjusted to account for automatically introduced | |
184 ;; groups. | |
185 | |
186 ;; - (or CLAUSE ...) | |
187 ;; Matches any one of the CLAUSEs by separating them with "\\|". | |
188 | |
189 ;; - (0+ CLAUSE ...) | |
190 ;; Concatenates the given CLAUSEs and matches zero or more | |
191 ;; occurrences by appending "*". | |
192 | |
193 ;; - (1+ CLAUSE ...) | |
194 ;; Concatenates the given CLAUSEs and matches one or more | |
195 ;; occurrences by appending "+". | |
196 | |
197 ;; - (opt CLAUSE ...) | |
198 ;; Concatenates the given CLAUSEs and matches zero or one occurrence | |
199 ;; by appending "?". | |
200 | |
201 ;; - (repeat MIN MAX CLAUSE ...) | |
202 ;; Concatenates the given CLAUSEs and constructs a regex matching at | |
203 ;; least MIN occurrences and at most MAX occurrences. MIN must be a | |
204 ;; non-negative integer. MAX must be a non-negative integer greater | |
205 ;; than or equal to MIN; or MAX can be nil to mean "infinity." | |
206 | |
207 ;; - (char CHAR-CLAUSE ...) | |
208 ;; Creates a "character class" matching one character from the given | |
209 ;; set. See below for how to construct a CHAR-CLAUSE. | |
210 | |
211 ;; - (not-char CHAR-CLAUSE ...) | |
212 ;; Creates a "character class" matching any one character not in the | |
213 ;; given set. See below for how to construct a CHAR-CLAUSE. | |
214 | |
215 ;; - the symbol `bot' | |
216 ;; Stands for "\\`", matching the empty string at the beginning of | |
217 ;; text (beginning of a string or of a buffer). | |
218 | |
219 ;; - the symbol `eot' | |
220 ;; Stands for "\\'", matching the empty string at the end of text. | |
221 | |
222 ;; - the symbol `point' | |
223 ;; Stands for "\\=", matching the empty string at point. | |
224 | |
225 ;; - the symbol `word-boundary' | |
226 ;; Stands for "\\b", matching the empty string at the beginning or | |
227 ;; end of a word. | |
228 | |
229 ;; - the symbol `not-word-boundary' | |
230 ;; Stands for "\\B", matching the empty string not at the beginning | |
231 ;; or end of a word. | |
232 | |
233 ;; - the symbol `bow' | |
234 ;; Stands for "\\<", matching the empty string at the beginning of a | |
235 ;; word. | |
236 | |
237 ;; - the symbol `eow' | |
238 ;; Stands for "\\>", matching the empty string at the end of a word. | |
239 | |
240 ;; - the symbol `wordchar' | |
241 ;; Stands for the regex "\\w", matching a word-constituent character | |
242 ;; (as determined by the current syntax table) | |
243 | |
244 ;; - the symbol `not-wordchar' | |
245 ;; Stands for the regex "\\W", matching a non-word-constituent | |
246 ;; character. | |
247 | |
248 ;; - (syntax CODE) | |
249 ;; Stands for the regex "\\sCODE", where CODE is a syntax table code | |
250 ;; (a single character). Matches any character with the requested | |
251 ;; syntax. | |
252 | |
253 ;; - (not-syntax CODE) | |
254 ;; Stands for the regex "\\SCODE", where CODE is a syntax table code | |
255 ;; (a single character). Matches any character without the | |
256 ;; requested syntax. | |
257 | |
258 ;; - (regex REGEX) | |
259 ;; This is a "trapdoor" for including ordinary regular expression | |
260 ;; strings in the result. Some regular expressions are clearer when | |
261 ;; written the old way: "[a-z]" vs. (sregexq (char (?a . ?z))), for | |
262 ;; instance. However, see the note under "Bugs," below. | |
263 | |
264 ;; Each CHAR-CLAUSE that is passed to (char ...) and (not-char ...) | |
265 ;; has one of the following forms: | |
266 | |
267 ;; - a character | |
268 ;; Adds that character to the set. | |
269 | |
270 ;; - a string | |
271 ;; Adds all the characters in the string to the set. | |
272 | |
273 ;; - A pair (MIN . MAX) | |
274 ;; Where MIN and MAX are characters, adds the range of characters | |
275 ;; from MIN through MAX to the set. | |
276 | |
277 ;;; To do: | |
278 | |
279 ;; Make (sregexq (or "a" (sequence "b" "c"))) return "a\\|bc" instead | |
280 ;; of "a\\|\\(bc\\)" | |
281 | |
282 ;; An earlier version of this package could optionally translate the | |
283 ;; symbolic regex into other languages' syntaxes, e.g. Perl. For | |
284 ;; instance, with Perl syntax selected, (sregexq (or "ab" "cd")) would | |
285 ;; yield "ab|cd" instead of "ab\\|cd". It might be useful to restore | |
286 ;; such a facility. | |
287 | |
288 ;;; Bugs: | |
289 | |
290 ;; The (regex REGEX) form can confuse the code that distinguishes | |
291 ;; introduced groups from user-specified groups. Try to avoid using | |
292 ;; grouping within a `regex' form. Failing that, try to avoid using | |
293 ;; backrefs if you're using `regex'. | |
294 | |
295 ;;; Code: | |
296 | |
297 (defsubst sregex--value-unitp (val) (nth 0 val)) | |
298 (defsubst sregex--value-groups (val) (nth 1 val)) | |
299 (defsubst sregex--value-tree (val) (nth 2 val)) | |
300 | |
301 (defun sregex--make-value (unitp groups tree) | |
302 (list unitp groups tree)) | |
303 | |
304 (defvar sregex--current-sregex nil | |
305 "Global state for `sregex-info'.") | |
306 | |
307 (defun sregex-info () | |
308 "Return extra information about the latest call to `sregex'. | |
309 This extra information is needed in order to adjust user-requested | |
310 backreference numbers to numbers suitable for the generated regexp. | |
311 See e.g. `sregex-match-string' and `sregex-backref-num'." | |
312 sregex--current-sregex) | |
313 | |
314 ; (require 'advice) | |
315 ; (defadvice save-match-data (around sregex-save-match-data protect) | |
316 ; (let ((sregex--saved-sregex sregex--current-sregex)) | |
317 ; (unwind-protect | |
318 ; ad-do-it | |
319 ; (setq sregex--current-sregex sregex--saved-sregex)))) | |
320 (defmacro sregex-save-match-data (&rest forms) | |
321 "Like `save-match-data', but also saves and restores `sregex-info' data." | |
322 `(let ((sregex--saved-sregex sregex--current-sregex)) | |
323 (unwind-protect | |
324 (save-match-data ,@forms) | |
325 (setq sregex--current-sregex sregex--saved-sregex)))) | |
326 | |
327 (defun sregex-replace-match (replacement | |
328 &optional fixedcase literal string subexp sregex) | |
329 "Like `replace-match', for a regexp made with `sregex'. | |
330 This takes one additional optional argument, the `sregex' info, which | |
331 can be obtained with `sregex-info'. The SUBEXP argument is adjusted | |
332 to allow for \"introduced groups\". If the extra argument is omitted | |
333 or nil, it defaults to the current value of (sregex-info)." | |
334 (replace-match replacement fixedcase literal string | |
335 (and subexp | |
336 (sregex-backref-num | |
337 subexp | |
338 (sregex--value-groups (or sregex | |
339 sregex--current-sregex)))))) | |
340 | |
341 (defun sregex-match-string (count &optional in-string sregex) | |
342 "Like `match-string', for a regexp made with `sregex'. | |
343 This takes one additional optional argument, the `sregex' info, which | |
344 can be obtained with `sregex-info'. The COUNT argument is adjusted to | |
345 allow for \"introduced groups\". If the extra argument is omitted or | |
346 nil, it defaults to the current value of (sregex-info)." | |
347 (match-string (and count | |
348 (sregex-backref-num | |
349 count | |
350 (sregex--value-groups (or sregex | |
351 sregex--current-sregex)))) | |
352 in-string)) | |
353 | |
354 (defun sregex-match-string-no-properties (count &optional in-string sregex) | |
355 "Like `match-string-no-properties', for a regexp made with `sregex'. | |
356 This takes one additional optional argument, the `sregex' info, which | |
357 can be obtained with `sregex-info'. The COUNT argument is adjusted to | |
358 allow for \"introduced groups\". If the extra argument is omitted or | |
359 nil, it defaults to the current value of (sregex-info)." | |
360 (match-string-no-properties | |
361 (and count | |
362 (sregex-backref-num | |
363 count | |
364 (sregex--value-groups (or sregex | |
365 sregex--current-sregex)))) | |
366 in-string)) | |
367 | |
368 (defun sregex-match-beginning (count &optional sregex) | |
369 "Like `match-beginning', for a regexp made with `sregex'. | |
370 This takes one additional optional argument, the `sregex' info, which | |
371 can be obtained with `sregex-info'. The COUNT argument is adjusted to | |
372 allow for \"introduced groups\". If the extra argument is omitted or | |
373 nil, it defaults to the current value of (sregex-info)." | |
374 (match-beginning (sregex-backref-num | |
375 count | |
376 (sregex--value-groups (or sregex | |
377 sregex--current-sregex))))) | |
378 | |
379 (defun sregex-match-end (count &optional sregex) | |
380 "Like `match-end', for a regexp made with `sregex'. | |
381 This takes one additional optional argument, the `sregex' info, which | |
382 can be obtained with `sregex-info'. The COUNT argument is adjusted to | |
383 allow for \"introduced groups\". If the extra argument is omitted or | |
384 nil, it defaults to the current value of (sregex-info)." | |
385 (match-end (sregex-backref-num | |
386 count | |
387 (sregex--value-groups (or sregex | |
388 sregex--current-sregex))))) | |
389 | |
390 (defun sregex-match-data (&optional sregex) | |
391 "Like `match-data', for a regexp made with `sregex'. | |
392 This takes one additional optional argument, the `sregex' info, which | |
393 can be obtained with `sregex-info'. \"Introduced groups\" are removed | |
394 from the result. If the extra argument is omitted or nil, it defaults | |
395 to the current value of (sregex-info)." | |
396 (let* ((data (match-data)) | |
397 (groups (sregex--value-groups (or sregex | |
398 sregex--current-sregex))) | |
399 (result (list (car (cdr data)) | |
400 (car data)))) | |
401 (setq data (cdr (cdr data))) | |
402 (while data | |
403 (if (car groups) | |
404 (setq result (append (list (car (cdr data)) | |
405 (car data)) | |
406 result))) | |
407 (setq groups (cdr groups) | |
408 data (cdr (cdr data)))) | |
409 (reverse result))) | |
410 | |
411 (defun sregex--render-tree (tree sregex) | |
412 (let ((key (car tree))) | |
413 (cond ((eq key 'str) | |
414 (cdr tree)) | |
415 ((eq key 'or) | |
416 (mapconcat '(lambda (x) | |
417 (sregex--render-tree x sregex)) | |
418 (cdr tree) | |
419 "\\|")) | |
420 ((eq key 'sequence) | |
421 (apply 'concat | |
422 (mapcar '(lambda (x) | |
423 (sregex--render-tree x sregex)) | |
424 (cdr tree)))) | |
425 ((eq key 'group) | |
426 (concat "\\(" | |
427 (sregex--render-tree (cdr tree) sregex) | |
428 "\\)")) | |
429 ((eq key 'opt) | |
430 (concat (sregex--render-tree (cdr tree) sregex) | |
431 "?")) | |
432 ((eq key '0+) | |
433 (concat (sregex--render-tree (cdr tree) sregex) | |
434 "*")) | |
435 ((eq key '1+) | |
436 (concat (sregex--render-tree (cdr tree) sregex) | |
437 "+")) | |
438 ((eq key 'backref) | |
439 (let ((num (sregex-backref-num (cdr tree) sregex))) | |
440 (if (> num 9) | |
441 (error "sregex: backref number %d too high after adjustment" | |
442 num) | |
443 (concat "\\" (int-to-string num))))) | |
444 (t (error "sregex internal error: unknown tree type %S" | |
445 key))))) | |
446 | |
447 (defun sregex (&rest exps) | |
448 "Symbolic regular expression interpreter. | |
449 This is exactly like `sregexq' (q.v.) except that it evaluates all its | |
450 arguments, so literal sregex clauses must be quoted. For example: | |
451 | |
452 (sregex '(or \"Bob\" \"Robert\")) => \"Bob\\\\|Robert\" | |
453 | |
454 An argument-evaluating sregex interpreter lets you reuse sregex | |
455 subexpressions: | |
456 | |
457 (let ((dotstar '(0+ any)) | |
458 (whitespace '(1+ (syntax ?-))) | |
459 (digits '(1+ (char (?0 . ?9))))) | |
460 (sregex 'bol dotstar \":\" whitespace digits)) => \"^.*:\\\\s-+[0-9]+\"" | |
461 (progn | |
462 (setq sregex--current-sregex (sregex--sequence exps nil)) | |
463 (sregex--render-tree (sregex--value-tree sregex--current-sregex) | |
464 sregex--current-sregex))) | |
465 | |
466 (defmacro sregexq (&rest exps) | |
467 "Symbolic regular expression interpreter. | |
468 This macro allows you to specify a regular expression (regexp) in | |
469 symbolic form, and converts it into the string form required by Emacs's | |
470 regex functions such as `re-search-forward' and `looking-at'. Here is | |
471 a simple example: | |
472 | |
473 (sregexq (or \"Bob\" \"Robert\")) => \"Bob\\\\|Robert\" | |
474 | |
475 As you can see, an sregex is specified by placing one or more special | |
476 clauses in a call to `sregexq'. The clause in this case is the `or' | |
477 of two strings (not to be confused with the Lisp function `or'). The | |
478 list of allowable clauses appears below. | |
479 | |
480 With `sregex', it is never necessary to \"escape\" magic characters | |
481 that are meant to be taken literally; that happens automatically. | |
482 For example: | |
483 | |
484 (sregexq \"M*A*S*H\") => \"M\\\\*A\\\\*S\\\\*H\" | |
485 | |
486 It is also unnecessary to \"group\" parts of the expression together | |
487 to overcome operator precedence; that also happens automatically. | |
488 For example: | |
489 | |
490 (sregexq (opt (or \"Bob\" \"Robert\"))) => \"\\\\(Bob\\\\|Robert\\\\)?\" | |
491 | |
492 It *is* possible to group parts of the expression in order to refer | |
493 to them with numbered backreferences: | |
494 | |
495 (sregexq (group (or \"Go\" \"Run\")) | |
496 \", Spot, \" | |
497 (backref 1)) => \"\\\\(Go\\\\|Run\\\\), Spot, \\\\1\" | |
498 | |
499 If `sregexq' needs to introduce its own grouping parentheses, it will | |
500 automatically renumber your backreferences: | |
501 | |
502 (sregexq (opt \"resent-\") | |
503 (group (or \"to\" \"cc\" \"bcc\")) | |
504 \": \" | |
505 (backref 1)) => \"\\\\(resent-\\\\)?\\\\(to\\\\|cc\\\\|bcc\\\\): \\\\2\" | |
506 | |
507 `sregexq' is a macro. Each time it is used, it constructs a simple | |
508 Lisp expression that then invokes a moderately complex engine to | |
509 interpret the sregex and render the string form. Because of this, I | |
510 don't recommend sprinkling calls to `sregexq' throughout your code, | |
511 the way one normally does with string regexes (which are cheap to | |
512 evaluate). Instead, it's wiser to precompute the regexes you need | |
513 wherever possible instead of repeatedly constructing the same ones | |
514 over and over. Example: | |
515 | |
516 (let ((field-regex (sregexq (opt \"resent-\") | |
517 (or \"to\" \"cc\" \"bcc\")))) | |
518 ... | |
519 (while ... | |
520 ... | |
521 (re-search-forward field-regex ...) | |
522 ...)) | |
523 | |
524 The arguments to `sregexq' are automatically quoted, but the | |
525 flipside of this is that it is not straightforward to include | |
526 computed (i.e., non-constant) values in `sregexq' expressions. So | |
527 `sregex' is a function that is like `sregexq' but which does not | |
528 automatically quote its values. Literal sregex clauses must be | |
529 explicitly quoted like so: | |
530 | |
531 (sregex '(or \"Bob\" \"Robert\")) => \"Bob\\\\|Robert\" | |
532 | |
533 but computed clauses can be included easily, allowing for the reuse | |
534 of common clauses: | |
535 | |
536 (let ((dotstar '(0+ any)) | |
537 (whitespace '(1+ (syntax ?-))) | |
538 (digits '(1+ (char (?0 . ?9))))) | |
539 (sregex 'bol dotstar \":\" whitespace digits)) => \"^.*:\\\\s-+[0-9]+\" | |
540 | |
541 Here are the clauses allowed in an `sregex' or `sregexq' expression: | |
542 | |
543 - a string | |
544 This stands for the literal string. If it contains | |
545 metacharacters, they will be escaped in the resulting regex | |
546 (using `regexp-quote'). | |
547 | |
548 - the symbol `any' | |
549 This stands for \".\", a regex matching any character except | |
550 newline. | |
551 | |
552 - the symbol `bol' | |
553 Stands for \"^\", matching the empty string at the beginning of a line | |
554 | |
555 - the symbol `eol' | |
556 Stands for \"$\", matching the empty string at the end of a line | |
557 | |
558 - (group CLAUSE ...) | |
559 Groups the given CLAUSEs using \"\\\\(\" and \"\\\\)\". | |
560 | |
561 - (sequence CLAUSE ...) | |
562 | |
563 Groups the given CLAUSEs; may or may not use \"\\\\(\" and \"\\\\)\". | |
564 Clauses groups by `sequence' do not count for purposes of | |
565 numbering backreferences. Use `sequence' in situations like | |
566 this: | |
567 | |
568 (sregexq (or \"dog\" \"cat\" | |
569 (sequence (opt \"sea \") \"monkey\"))) | |
570 => \"dog\\\\|cat\\\\|\\\\(sea \\\\)?monkey\" | |
571 | |
572 where a single `or' alternate needs to contain multiple | |
573 subclauses. | |
574 | |
575 - (backref N) | |
576 Matches the same string previously matched by the Nth \"group\" in | |
577 the same sregex. N is a positive integer. In the resulting | |
578 regex, N may be adjusted to account for automatically introduced | |
579 groups. | |
580 | |
581 - (or CLAUSE ...) | |
582 Matches any one of the CLAUSEs by separating them with \"\\\\|\". | |
583 | |
584 - (0+ CLAUSE ...) | |
585 Concatenates the given CLAUSEs and matches zero or more | |
586 occurrences by appending \"*\". | |
587 | |
588 - (1+ CLAUSE ...) | |
589 Concatenates the given CLAUSEs and matches one or more | |
590 occurrences by appending \"+\". | |
591 | |
592 - (opt CLAUSE ...) | |
593 Concatenates the given CLAUSEs and matches zero or one occurrence | |
594 by appending \"?\". | |
595 | |
596 - (repeat MIN MAX CLAUSE ...) | |
597 Concatenates the given CLAUSEs and constructs a regex matching at | |
598 least MIN occurrences and at most MAX occurrences. MIN must be a | |
599 non-negative integer. MAX must be a non-negative integer greater | |
600 than or equal to MIN; or MAX can be nil to mean \"infinity.\" | |
601 | |
602 - (char CHAR-CLAUSE ...) | |
603 Creates a \"character class\" matching one character from the given | |
604 set. See below for how to construct a CHAR-CLAUSE. | |
605 | |
606 - (not-char CHAR-CLAUSE ...) | |
607 Creates a \"character class\" matching any one character not in the | |
608 given set. See below for how to construct a CHAR-CLAUSE. | |
609 | |
610 - the symbol `bot' | |
611 Stands for \"\\\\`\", matching the empty string at the beginning of | |
612 text (beginning of a string or of a buffer). | |
613 | |
614 - the symbol `eot' | |
615 Stands for \"\\\\'\", matching the empty string at the end of text. | |
616 | |
617 - the symbol `point' | |
618 Stands for \"\\\\=\", matching the empty string at point. | |
619 | |
620 - the symbol `word-boundary' | |
621 Stands for \"\\\\b\", matching the empty string at the beginning or | |
622 end of a word. | |
623 | |
624 - the symbol `not-word-boundary' | |
625 Stands for \"\\\\B\", matching the empty string not at the beginning | |
626 or end of a word. | |
627 | |
628 - the symbol `bow' | |
629 Stands for \"\\\\\\=<\", matching the empty string at the beginning of a | |
630 word. | |
631 | |
632 - the symbol `eow' | |
633 Stands for \"\\\\\\=>\", matching the empty string at the end of a word. | |
634 | |
635 - the symbol `wordchar' | |
636 Stands for the regex \"\\\\w\", matching a word-constituent character | |
637 (as determined by the current syntax table) | |
638 | |
639 - the symbol `not-wordchar' | |
640 Stands for the regex \"\\\\W\", matching a non-word-constituent | |
641 character. | |
642 | |
643 - (syntax CODE) | |
644 Stands for the regex \"\\\\sCODE\", where CODE is a syntax table code | |
645 (a single character). Matches any character with the requested | |
646 syntax. | |
647 | |
648 - (not-syntax CODE) | |
649 Stands for the regex \"\\\\SCODE\", where CODE is a syntax table code | |
650 (a single character). Matches any character without the | |
651 requested syntax. | |
652 | |
653 - (regex REGEX) | |
654 This is a \"trapdoor\" for including ordinary regular expression | |
655 strings in the result. Some regular expressions are clearer when | |
656 written the old way: \"[a-z]\" vs. (sregexq (char (?a . ?z))), for | |
657 instance. However, using this can confuse the code that | |
658 distinguishes introduced groups from user-specified groups. Avoid | |
659 using grouping within a `regex' form. Failing that, avoid using | |
660 backrefs if you're using `regex'. | |
661 | |
662 Each CHAR-CLAUSE that is passed to (char ...) and (not-char ...) | |
663 has one of the following forms: | |
664 | |
665 - a character | |
666 Adds that character to the set. | |
667 | |
668 - a string | |
669 Adds all the characters in the string to the set. | |
670 | |
671 - A pair (MIN . MAX) | |
672 Where MIN and MAX are characters, adds the range of characters | |
673 from MIN through MAX to the set." | |
674 `(apply 'sregex ',exps)) | |
675 | |
676 (defun sregex--engine (exp combine) | |
677 (let* ((val (cond ((stringp exp) | |
678 (sregex--make-value (or (not (eq combine 'suffix)) | |
679 (= (length exp) 1)) | |
680 nil | |
681 (cons 'str | |
682 (regexp-quote exp)))) | |
683 ((symbolp exp) | |
684 (funcall (intern (concat "sregex--" | |
685 (symbol-name exp))) | |
686 combine)) | |
687 ((consp exp) | |
688 (funcall (intern (concat "sregex--" | |
689 (symbol-name (car exp)))) | |
690 (cdr exp) | |
691 combine)) | |
692 (t (error "Invalid expression: %s" exp)))) | |
693 (unitp (sregex--value-unitp val)) | |
694 (groups (sregex--value-groups val)) | |
695 (tree (sregex--value-tree val))) | |
696 (if (and combine (not unitp)) | |
697 (sregex--make-value t | |
698 (cons nil groups) | |
699 (cons 'group tree)) | |
700 (sregex--make-value unitp groups tree)))) | |
701 | |
702 (defun sregex--sequence (exps combine) | |
703 (if (= (length exps) 1) | |
704 (sregex--engine (car exps) combine) | |
705 (let ((groups nil) | |
706 (trees nil)) ;grows in reverse | |
707 (while exps | |
708 (let ((val (sregex--engine (car exps) 'concat))) | |
709 (setq groups (append groups | |
710 (sregex--value-groups val)) | |
711 trees (cons (sregex--value-tree val) trees) | |
712 exps (cdr exps)))) | |
713 (setq trees (nreverse trees)) | |
714 (if (eq combine 'suffix) | |
715 (sregex--make-value t | |
716 (cons nil groups) | |
717 (cons 'group | |
718 (cons 'sequence trees))) | |
719 (sregex--make-value (not (eq combine 'suffix)) | |
720 groups | |
721 (cons 'sequence trees)))))) | |
722 | |
723 (defun sregex--group (exps combine) | |
724 (let ((val (sregex--sequence exps nil))) | |
725 (sregex--make-value t | |
726 (cons t (sregex--value-groups val)) | |
727 (cons 'group (sregex--value-tree val))))) | |
728 | |
729 (defun sregex-backref-num (n &optional sregex) | |
730 "Adjust backreference number N according to SREGEX. | |
731 When `sregex' introduces parenthesized groups that the user didn't ask | |
732 for, the numbering of the groups that the user *did* ask for gets all | |
733 out of whack. This function accounts for introduced groups. Example: | |
734 | |
735 (sregexq (opt \"ab\") | |
736 (group (or \"c\" \"d\"))) => \"\\\\(ab\\\\)?\\\\(c\\\\|d\\\\)\" | |
737 (setq info (sregex-info)) | |
738 (sregex-backref-num 1 info) => 2 | |
739 | |
740 The SREGEX parameter is optional and defaults to the current value of | |
741 `sregex-info'." | |
742 (let ((groups (sregex--value-groups (or sregex | |
743 sregex--current-sregex))) | |
744 (result 0)) | |
745 (while (and groups (> n 0)) | |
746 (if (car groups) | |
747 (setq n (1- n))) | |
748 (setq result (1+ result) | |
749 groups (cdr groups))) | |
750 result)) | |
751 | |
752 (defun sregex--backref (exps combine) | |
753 (sregex--make-value t nil (cons 'backref (car exps)))) | |
754 | |
755 (defun sregex--any (combine) | |
756 (sregex--make-value t nil '(str . "."))) | |
757 | |
758 (defun sregex--opt (exps combine) | |
759 (let ((val (sregex--sequence exps 'suffix))) | |
760 (sregex--make-value t | |
761 (sregex--value-groups val) | |
762 (cons 'opt (sregex--value-tree val))))) | |
763 | |
764 (defun sregex--0+ (exps combine) | |
765 (let ((val (sregex--sequence exps 'suffix))) | |
766 (sregex--make-value t | |
767 (sregex--value-groups val) | |
768 (cons '0+ (sregex--value-tree val))))) | |
769 (defun sregex--1+ (exps combine) | |
770 (let ((val (sregex--sequence exps 'suffix))) | |
771 (sregex--make-value t | |
772 (sregex--value-groups val) | |
773 (cons '1+ (sregex--value-tree val))))) | |
774 | |
775 (defun sregex--repeat (exps combine) | |
776 (let ((min (or (car exps) 0)) | |
777 (max (car (cdr exps)))) | |
778 (setq exps (cdr (cdr exps))) | |
779 (cond ((zerop min) | |
780 (cond ((equal max 0) ;degenerate | |
781 (sregex--make-value t nil nil)) | |
782 ((equal max 1) | |
783 (sregex--opt exps combine)) | |
784 ((not max) | |
785 (sregex--0+ exps combine)) | |
786 (t (sregex--sequence (make-list max | |
787 (cons 'opt exps)) | |
788 combine)))) | |
789 ((= min 1) | |
790 (cond ((equal max 1) | |
791 (sregex--sequence exps combine)) | |
792 ((not max) | |
793 (sregex--1+ exps combine)) | |
794 (t (sregex--sequence (append exps | |
795 (make-list (1- max) | |
796 (cons 'opt exps))) | |
797 combine)))) | |
798 (t (sregex--sequence (append exps | |
799 (list (append (list 'repeat | |
800 (1- min) | |
801 (and max | |
802 (1- max))) | |
803 exps))) | |
804 combine))))) | |
805 | |
806 (defun sregex--or (exps combine) | |
807 (if (= (length exps) 1) | |
808 (sregex--engine (car exps) combine) | |
809 (let ((groups nil) | |
810 (trees nil)) | |
811 (while exps | |
812 (let ((val (sregex--engine (car exps) 'or))) | |
813 (setq groups (append groups | |
814 (sregex--value-groups val)) | |
815 trees (cons (sregex--value-tree val) trees) | |
816 exps (cdr exps)))) | |
817 (sregex--make-value (eq combine 'or) | |
818 groups | |
819 (cons 'or (nreverse trees)))))) | |
820 | |
821 (defmacro sregex--char-range-aux () | |
822 '(if start | |
823 (let (startc endc) | |
824 (if (and (<= 32 start) | |
825 (<= start 127)) | |
826 (setq startc (char-to-string start) | |
827 endc (char-to-string end)) | |
828 (setq startc (format "\\%03o" start) | |
829 endc (format "\\%03o" end))) | |
830 (if (> end start) | |
831 (if (> end (+ start 1)) | |
832 (setq class (concat class startc "-" endc)) | |
833 (setq class (concat class startc endc))) | |
834 (setq class (concat class startc)))))) | |
835 | |
836 (defmacro sregex--char-range (rstart rend) | |
837 `(let ((i ,rstart) | |
838 start end) | |
839 (while (<= i ,rend) | |
840 (if (aref chars i) | |
841 (progn | |
842 (if start | |
843 (setq end i) | |
844 (setq start i | |
845 end i)) | |
846 (aset chars i nil)) | |
847 (sregex--char-range-aux) | |
848 (setq start nil | |
849 end nil)) | |
850 (setq i (1+ i))) | |
851 (sregex--char-range-aux))) | |
852 | |
853 (defun sregex--char-aux (complement args) | |
854 (let ((chars (make-vector 256 nil))) | |
855 (while args | |
856 (let ((arg (car args))) | |
857 (cond ((integerp arg) | |
858 (aset chars arg t)) | |
859 ((stringp arg) | |
860 (mapcar (function | |
861 (lambda (c) | |
862 (aset chars c t))) | |
863 arg)) | |
864 ((consp arg) | |
865 (let ((start (car arg)) | |
866 (end (cdr arg))) | |
867 (if (> start end) | |
868 (let ((tmp start)) | |
869 (setq start end | |
870 end tmp))) | |
871 ;; now start <= end | |
872 (let ((i start)) | |
873 (while (<= i end) | |
874 (aset chars i t) | |
875 (setq i (1+ i)))))))) | |
876 (setq args (cdr args))) | |
877 ;; now chars is a map of the characters in the class | |
878 (let ((class "") | |
879 (caret (aref chars ?^))) | |
880 (aset chars ?^ nil) | |
881 (if (aref chars ?\]) | |
882 (progn | |
883 (setq class (concat class "]")) | |
884 (aset chars ?\] nil))) | |
885 (if (aref chars ?-) | |
886 (progn | |
887 (setq class (concat class "-")) | |
888 (aset chars ?- nil))) | |
889 (if (aref chars ?\\) | |
890 (progn | |
891 (setq class (concat class "\\\\")) | |
892 (aset chars ?\\ nil))) | |
893 | |
894 (sregex--char-range ?A ?Z) | |
895 (sregex--char-range ?a ?z) | |
896 (sregex--char-range ?0 ?9) | |
897 | |
898 (let ((i 32)) | |
899 (while (< i 128) | |
900 (if (aref chars i) | |
901 (progn | |
902 (setq class (concat class (char-to-string i))) | |
903 (aset chars i nil))) | |
904 (setq i (1+ i)))) | |
905 | |
906 (sregex--char-range 0 31) | |
907 (sregex--char-range 128 255) | |
908 | |
909 (let ((i 0)) | |
910 (while (< i 256) | |
911 (if (aref chars i) | |
912 (setq class (concat class (format "\\%03o" i)))) | |
913 (setq i (1+ i)))) | |
914 | |
915 (if caret | |
916 (setq class (concat class "^"))) | |
917 (concat "[" (if complement "^") class "]")))) | |
918 | |
919 (defun sregex--char (exps combine) | |
920 (sregex--make-value t nil (cons 'str (sregex--char-aux nil exps)))) | |
921 (defun sregex--not-char (exps combine) | |
922 (sregex--make-value t nil (cons 'str (sregex--char-aux t exps)))) | |
923 | |
924 (defun sregex--bol (combine) | |
925 (sregex--make-value t nil '(str . "^"))) | |
926 (defun sregex--eol (combine) | |
927 (sregex--make-value t nil '(str . "$"))) | |
928 | |
929 (defun sregex--wordchar (combine) | |
930 (sregex--make-value t nil '(str . "\\w"))) | |
931 (defun sregex--not-wordchar (combine) | |
932 (sregex--make-value t nil '(str . "\\W"))) | |
933 | |
934 (defun sregex--syntax (exps combine) | |
935 (sregex--make-value t nil (cons 'str (format "\\s%c" (car exps))))) | |
936 (defun sregex--not-syntax (exps combine) | |
937 (sregex--make-value t nil (cons 'str (format "\\S%c" (car exps))))) | |
938 | |
939 (defun sregex--bot (combine) | |
940 (sregex--make-value t nil (cons 'str "\\`"))) | |
941 (defun sregex--eot (combine) | |
942 (sregex--make-value t nil (cons 'str "\\'"))) | |
943 | |
944 (defun sregex--point (combine) | |
945 (sregex--make-value t nil '(str . "\\="))) | |
946 | |
947 (defun sregex--word-boundary (combine) | |
948 (sregex--make-value t nil '(str . "\\b"))) | |
949 (defun sregex--not-word-boundary (combine) | |
950 (sregex--make-value t nil '(str . "\\B"))) | |
951 | |
952 (defun sregex--bow (combine) | |
953 (sregex--make-value t nil '(str . "\\<"))) | |
954 (defun sregex--eow (combine) | |
955 (sregex--make-value t nil '(str . "\\>"))) | |
956 | |
957 | |
958 ;; trapdoor - usage discouraged | |
959 (defun sregex--regex (exps combine) | |
960 (sregex--make-value nil nil (car exps))) | |
961 | |
962 (provide 'sregex) | |
963 | |
964 ;;; sregex.el ends here | |
965 |