22537
|
1 ;;; sregex.el --- symbolic regular expressions
|
|
2
|
|
3 ;; Copyright (C) 1997, 1998 Free Software Foundation, Inc.
|
|
4
|
|
5 ;; Author: Bob Glickstein <bobg+sregex@zanshin.com>
|
|
6 ;; Maintainer: Bob Glickstein <bobg+sregex@zanshin.com>
|
|
7
|
|
8 ;; This file is part of GNU Emacs.
|
|
9
|
|
10 ;; GNU Emacs is free software; you can redistribute it and/or modify
|
|
11 ;; it under the terms of the GNU General Public License as published by
|
|
12 ;; the Free Software Foundation; either version 2, or (at your option)
|
|
13 ;; any later version.
|
|
14
|
|
15 ;; GNU Emacs is distributed in the hope that it will be useful,
|
|
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
18 ;; GNU General Public License for more details.
|
|
19
|
|
20 ;; You should have received a copy of the GNU General Public License
|
|
21 ;; along with GNU Emacs; see the file COPYING. If not, write to the
|
|
22 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
|
23 ;; Boston, MA 02111-1307, USA.
|
|
24
|
|
25 ;;; Commentary:
|
|
26
|
|
27 ;; This package allows you to write regular expressions using a
|
|
28 ;; totally new, Lisp-like syntax.
|
|
29
|
|
30 ;; A "symbolic regular expression" (sregex for short) is a Lisp form
|
|
31 ;; that, when evaluated, produces the string form of the specified
|
|
32 ;; regular expression. Here's a simple example:
|
|
33
|
|
34 ;; (sregexq (or "Bob" "Robert")) => "Bob\\|Robert"
|
|
35
|
|
36 ;; As you can see, an sregex is specified by placing one or more
|
|
37 ;; special clauses in a call to `sregexq'. The clause in this case is
|
|
38 ;; the `or' of two strings (not to be confused with the Lisp function
|
|
39 ;; `or'). The list of allowable clauses appears below.
|
|
40
|
|
41 ;; With sregex, it is never necessary to "escape" magic characters
|
|
42 ;; that are meant to be taken literally; that happens automatically.
|
|
43 ;; For example:
|
|
44
|
|
45 ;; (sregexq "M*A*S*H") => "M\\*A\\*S\\*H"
|
|
46
|
|
47 ;; It is also unnecessary to "group" parts of the expression together
|
|
48 ;; to overcome operator precedence; that also happens automatically.
|
|
49 ;; For example:
|
|
50
|
|
51 ;; (sregexq (opt (or "Bob" "Robert"))) => "\\(Bob\\|Robert\\)?"
|
|
52
|
|
53 ;; It *is* possible to group parts of the expression in order to refer
|
|
54 ;; to them with numbered backreferences:
|
|
55
|
|
56 ;; (sregexq (group (or "Go" "Run"))
|
|
57 ;; ", Spot, "
|
|
58 ;; (backref 1)) => "\\(Go\\|Run\\), Spot, \\1"
|
|
59
|
|
60 ;; If `sregexq' needs to introduce its own grouping parentheses, it
|
|
61 ;; will automatically renumber your backreferences:
|
|
62
|
|
63 ;; (sregexq (opt "resent-")
|
|
64 ;; (group (or "to" "cc" "bcc"))
|
|
65 ;; ": "
|
|
66 ;; (backref 1)) => "\\(resent-\\)?\\(to\\|cc\\|bcc\\): \\2"
|
|
67
|
|
68 ;; `sregexq' is a macro. Each time it is used, it constructs a simple
|
|
69 ;; Lisp expression that then invokes a moderately complex engine to
|
|
70 ;; interpret the sregex and render the string form. Because of this,
|
|
71 ;; I don't recommend sprinkling calls to `sregexq' throughout your
|
|
72 ;; code, the way one normally does with string regexes (which are
|
|
73 ;; cheap to evaluate). Instead, it's wiser to precompute the regexes
|
|
74 ;; you need wherever possible instead of repeatedly constructing the
|
|
75 ;; same ones over and over. Example:
|
|
76
|
|
77 ;; (let ((field-regex (sregexq (opt "resent-")
|
|
78 ;; (or "to" "cc" "bcc"))))
|
|
79 ;; ...
|
|
80 ;; (while ...
|
|
81 ;; ...
|
|
82 ;; (re-search-forward field-regex ...)
|
|
83 ;; ...))
|
|
84
|
|
85 ;; The arguments to `sregexq' are automatically quoted, but the
|
|
86 ;; flipside of this is that it is not straightforward to include
|
|
87 ;; computed (i.e., non-constant) values in `sregexq' expressions. So
|
|
88 ;; `sregex' is a function that is like `sregexq' but which does not
|
|
89 ;; automatically quote its values. Literal sregex clauses must be
|
|
90 ;; explicitly quoted like so:
|
|
91
|
|
92 ;; (sregex '(or "Bob" "Robert")) => "Bob\\|Robert"
|
|
93
|
|
94 ;; but computed clauses can be included easily, allowing for the reuse
|
|
95 ;; of common clauses:
|
|
96
|
|
97 ;; (let ((dotstar '(0+ any))
|
|
98 ;; (whitespace '(1+ (syntax ?-)))
|
|
99 ;; (digits '(1+ (char (?0 . ?9)))))
|
|
100 ;; (sregex 'bol dotstar ":" whitespace digits)) => "^.*:\\s-+[0-9]+"
|
|
101
|
|
102 ;; This package also provides sregex-specific versions of the Emacs
|
|
103 ;; functions `replace-match', `match-string',
|
|
104 ;; `match-string-no-properties', `match-beginning', `match-end', and
|
|
105 ;; `match-data'. In each case, the sregex version's name begins with
|
|
106 ;; `sregex-' and takes one additional optional parameter, an sregex
|
|
107 ;; "info" object. Each of these functions is concerned with numbered
|
|
108 ;; submatches. Since sregex may renumber submatches, alternate
|
|
109 ;; versions of these functions are needed that know how to adjust the
|
|
110 ;; supplied number.
|
|
111
|
|
112 ;; The sregex info object for the most recently evaluated sregex can
|
|
113 ;; be obtained with `sregex-info'; so if you precompute your sregexes
|
|
114 ;; and you plan to use `replace-match' or one of the others with it,
|
|
115 ;; you need to record the info object for later use:
|
|
116
|
|
117 ;; (let* ((regex (sregexq (opt "resent-")
|
|
118 ;; (group (or "to" "cc" "bcc"))
|
|
119 ;; ":"))
|
|
120 ;; (regex-info (sregex-info)))
|
|
121 ;; ...
|
|
122 ;; (if (re-search-forward regex ...)
|
|
123 ;; (let ((which (sregex-match-string 1 nil regex-info)))
|
|
124 ;; ...)))
|
|
125
|
|
126 ;; In this example, `regex' is "\\(resent-\\)?\\(to\\|cc\\|bcc\\):",
|
|
127 ;; so the call to (sregex-match-string 1 ...) is automatically turned
|
|
128 ;; into a call to (match-string 2 ...).
|
|
129
|
|
130 ;; If the sregex info argument to `sregex-replace-match',
|
|
131 ;; `sregex-match-string', `sregex-match-string-no-properties',
|
|
132 ;; `sregex-match-beginning', `sregex-match-end', or
|
|
133 ;; `sregex-match-data' is omitted, the current value of (sregex-info)
|
|
134 ;; is used.
|
|
135
|
|
136 ;; You can do your own sregex submatch renumbering with
|
|
137 ;; `sregex-backref-num'.
|
|
138
|
|
139 ;; Finally, `sregex-save-match-data' is like `save-match-data' but
|
|
140 ;; also saves and restores the information maintained by
|
|
141 ;; `sregex-info'.
|
|
142
|
|
143 ;; To use this package in a Lisp program, simply (require 'sregex).
|
|
144
|
|
145 ;; Here are the clauses allowed in an `sregex' or `sregexq'
|
|
146 ;; expression:
|
|
147
|
|
148 ;; - a string
|
|
149 ;; This stands for the literal string. If it contains
|
|
150 ;; metacharacters, they will be escaped in the resulting regex
|
|
151 ;; (using `regexp-quote').
|
|
152
|
|
153 ;; - the symbol `any'
|
|
154 ;; This stands for ".", a regex matching any character except
|
|
155 ;; newline.
|
|
156
|
|
157 ;; - the symbol `bol'
|
|
158 ;; Stands for "^", matching the empty string at the beginning of a line
|
|
159
|
|
160 ;; - the symbol `eol'
|
|
161 ;; Stands for "$", matching the empty string at the end of a line
|
|
162
|
|
163 ;; - (group CLAUSE ...)
|
|
164 ;; Groups the given CLAUSEs using "\\(" and "\\)".
|
|
165
|
|
166 ;; - (sequence CLAUSE ...)
|
|
167
|
|
168 ;; Groups the given CLAUSEs; may or may not use "\\(" and "\\)".
|
|
169 ;; Clauses groups by `sequence' do not count for purposes of
|
|
170 ;; numbering backreferences. Use `sequence' in situations like
|
|
171 ;; this:
|
|
172
|
|
173 ;; (sregexq (or "dog" "cat"
|
|
174 ;; (sequence (opt "sea ") "monkey")))
|
|
175 ;; => "dog\\|cat\\|\\(sea \\)?monkey"
|
|
176
|
|
177 ;; where a single `or' alternate needs to contain multiple
|
|
178 ;; subclauses.
|
|
179
|
|
180 ;; - (backref N)
|
|
181 ;; Matches the same string previously matched by the Nth "group" in
|
|
182 ;; the same sregex. N is a positive integer. In the resulting
|
|
183 ;; regex, N may be adjusted to account for automatically introduced
|
|
184 ;; groups.
|
|
185
|
|
186 ;; - (or CLAUSE ...)
|
|
187 ;; Matches any one of the CLAUSEs by separating them with "\\|".
|
|
188
|
|
189 ;; - (0+ CLAUSE ...)
|
|
190 ;; Concatenates the given CLAUSEs and matches zero or more
|
|
191 ;; occurrences by appending "*".
|
|
192
|
|
193 ;; - (1+ CLAUSE ...)
|
|
194 ;; Concatenates the given CLAUSEs and matches one or more
|
|
195 ;; occurrences by appending "+".
|
|
196
|
|
197 ;; - (opt CLAUSE ...)
|
|
198 ;; Concatenates the given CLAUSEs and matches zero or one occurrence
|
|
199 ;; by appending "?".
|
|
200
|
|
201 ;; - (repeat MIN MAX CLAUSE ...)
|
|
202 ;; Concatenates the given CLAUSEs and constructs a regex matching at
|
|
203 ;; least MIN occurrences and at most MAX occurrences. MIN must be a
|
|
204 ;; non-negative integer. MAX must be a non-negative integer greater
|
|
205 ;; than or equal to MIN; or MAX can be nil to mean "infinity."
|
|
206
|
|
207 ;; - (char CHAR-CLAUSE ...)
|
|
208 ;; Creates a "character class" matching one character from the given
|
|
209 ;; set. See below for how to construct a CHAR-CLAUSE.
|
|
210
|
|
211 ;; - (not-char CHAR-CLAUSE ...)
|
|
212 ;; Creates a "character class" matching any one character not in the
|
|
213 ;; given set. See below for how to construct a CHAR-CLAUSE.
|
|
214
|
|
215 ;; - the symbol `bot'
|
|
216 ;; Stands for "\\`", matching the empty string at the beginning of
|
|
217 ;; text (beginning of a string or of a buffer).
|
|
218
|
|
219 ;; - the symbol `eot'
|
|
220 ;; Stands for "\\'", matching the empty string at the end of text.
|
|
221
|
|
222 ;; - the symbol `point'
|
|
223 ;; Stands for "\\=", matching the empty string at point.
|
|
224
|
|
225 ;; - the symbol `word-boundary'
|
|
226 ;; Stands for "\\b", matching the empty string at the beginning or
|
|
227 ;; end of a word.
|
|
228
|
|
229 ;; - the symbol `not-word-boundary'
|
|
230 ;; Stands for "\\B", matching the empty string not at the beginning
|
|
231 ;; or end of a word.
|
|
232
|
|
233 ;; - the symbol `bow'
|
|
234 ;; Stands for "\\<", matching the empty string at the beginning of a
|
|
235 ;; word.
|
|
236
|
|
237 ;; - the symbol `eow'
|
|
238 ;; Stands for "\\>", matching the empty string at the end of a word.
|
|
239
|
|
240 ;; - the symbol `wordchar'
|
|
241 ;; Stands for the regex "\\w", matching a word-constituent character
|
|
242 ;; (as determined by the current syntax table)
|
|
243
|
|
244 ;; - the symbol `not-wordchar'
|
|
245 ;; Stands for the regex "\\W", matching a non-word-constituent
|
|
246 ;; character.
|
|
247
|
|
248 ;; - (syntax CODE)
|
|
249 ;; Stands for the regex "\\sCODE", where CODE is a syntax table code
|
|
250 ;; (a single character). Matches any character with the requested
|
|
251 ;; syntax.
|
|
252
|
|
253 ;; - (not-syntax CODE)
|
|
254 ;; Stands for the regex "\\SCODE", where CODE is a syntax table code
|
|
255 ;; (a single character). Matches any character without the
|
|
256 ;; requested syntax.
|
|
257
|
|
258 ;; - (regex REGEX)
|
|
259 ;; This is a "trapdoor" for including ordinary regular expression
|
|
260 ;; strings in the result. Some regular expressions are clearer when
|
|
261 ;; written the old way: "[a-z]" vs. (sregexq (char (?a . ?z))), for
|
|
262 ;; instance. However, see the note under "Bugs," below.
|
|
263
|
|
264 ;; Each CHAR-CLAUSE that is passed to (char ...) and (not-char ...)
|
|
265 ;; has one of the following forms:
|
|
266
|
|
267 ;; - a character
|
|
268 ;; Adds that character to the set.
|
|
269
|
|
270 ;; - a string
|
|
271 ;; Adds all the characters in the string to the set.
|
|
272
|
|
273 ;; - A pair (MIN . MAX)
|
|
274 ;; Where MIN and MAX are characters, adds the range of characters
|
|
275 ;; from MIN through MAX to the set.
|
|
276
|
|
277 ;;; To do:
|
|
278
|
|
279 ;; Make (sregexq (or "a" (sequence "b" "c"))) return "a\\|bc" instead
|
|
280 ;; of "a\\|\\(bc\\)"
|
|
281
|
|
282 ;; An earlier version of this package could optionally translate the
|
|
283 ;; symbolic regex into other languages' syntaxes, e.g. Perl. For
|
|
284 ;; instance, with Perl syntax selected, (sregexq (or "ab" "cd")) would
|
|
285 ;; yield "ab|cd" instead of "ab\\|cd". It might be useful to restore
|
|
286 ;; such a facility.
|
|
287
|
|
288 ;;; Bugs:
|
|
289
|
|
290 ;; The (regex REGEX) form can confuse the code that distinguishes
|
|
291 ;; introduced groups from user-specified groups. Try to avoid using
|
|
292 ;; grouping within a `regex' form. Failing that, try to avoid using
|
|
293 ;; backrefs if you're using `regex'.
|
|
294
|
|
295 ;;; Code:
|
|
296
|
|
297 (defsubst sregex--value-unitp (val) (nth 0 val))
|
|
298 (defsubst sregex--value-groups (val) (nth 1 val))
|
|
299 (defsubst sregex--value-tree (val) (nth 2 val))
|
|
300
|
|
301 (defun sregex--make-value (unitp groups tree)
|
|
302 (list unitp groups tree))
|
|
303
|
|
304 (defvar sregex--current-sregex nil
|
|
305 "Global state for `sregex-info'.")
|
|
306
|
|
307 (defun sregex-info ()
|
|
308 "Return extra information about the latest call to `sregex'.
|
|
309 This extra information is needed in order to adjust user-requested
|
|
310 backreference numbers to numbers suitable for the generated regexp.
|
|
311 See e.g. `sregex-match-string' and `sregex-backref-num'."
|
|
312 sregex--current-sregex)
|
|
313
|
|
314 ; (require 'advice)
|
|
315 ; (defadvice save-match-data (around sregex-save-match-data protect)
|
|
316 ; (let ((sregex--saved-sregex sregex--current-sregex))
|
|
317 ; (unwind-protect
|
|
318 ; ad-do-it
|
|
319 ; (setq sregex--current-sregex sregex--saved-sregex))))
|
|
320 (defmacro sregex-save-match-data (&rest forms)
|
|
321 "Like `save-match-data', but also saves and restores `sregex-info' data."
|
|
322 `(let ((sregex--saved-sregex sregex--current-sregex))
|
|
323 (unwind-protect
|
|
324 (save-match-data ,@forms)
|
|
325 (setq sregex--current-sregex sregex--saved-sregex))))
|
|
326
|
|
327 (defun sregex-replace-match (replacement
|
|
328 &optional fixedcase literal string subexp sregex)
|
|
329 "Like `replace-match', for a regexp made with `sregex'.
|
|
330 This takes one additional optional argument, the `sregex' info, which
|
|
331 can be obtained with `sregex-info'. The SUBEXP argument is adjusted
|
|
332 to allow for \"introduced groups\". If the extra argument is omitted
|
|
333 or nil, it defaults to the current value of (sregex-info)."
|
|
334 (replace-match replacement fixedcase literal string
|
|
335 (and subexp
|
|
336 (sregex-backref-num
|
|
337 subexp
|
|
338 (sregex--value-groups (or sregex
|
|
339 sregex--current-sregex))))))
|
|
340
|
|
341 (defun sregex-match-string (count &optional in-string sregex)
|
|
342 "Like `match-string', for a regexp made with `sregex'.
|
|
343 This takes one additional optional argument, the `sregex' info, which
|
|
344 can be obtained with `sregex-info'. The COUNT argument is adjusted to
|
|
345 allow for \"introduced groups\". If the extra argument is omitted or
|
|
346 nil, it defaults to the current value of (sregex-info)."
|
|
347 (match-string (and count
|
|
348 (sregex-backref-num
|
|
349 count
|
|
350 (sregex--value-groups (or sregex
|
|
351 sregex--current-sregex))))
|
|
352 in-string))
|
|
353
|
|
354 (defun sregex-match-string-no-properties (count &optional in-string sregex)
|
|
355 "Like `match-string-no-properties', for a regexp made with `sregex'.
|
|
356 This takes one additional optional argument, the `sregex' info, which
|
|
357 can be obtained with `sregex-info'. The COUNT argument is adjusted to
|
|
358 allow for \"introduced groups\". If the extra argument is omitted or
|
|
359 nil, it defaults to the current value of (sregex-info)."
|
|
360 (match-string-no-properties
|
|
361 (and count
|
|
362 (sregex-backref-num
|
|
363 count
|
|
364 (sregex--value-groups (or sregex
|
|
365 sregex--current-sregex))))
|
|
366 in-string))
|
|
367
|
|
368 (defun sregex-match-beginning (count &optional sregex)
|
|
369 "Like `match-beginning', for a regexp made with `sregex'.
|
|
370 This takes one additional optional argument, the `sregex' info, which
|
|
371 can be obtained with `sregex-info'. The COUNT argument is adjusted to
|
|
372 allow for \"introduced groups\". If the extra argument is omitted or
|
|
373 nil, it defaults to the current value of (sregex-info)."
|
|
374 (match-beginning (sregex-backref-num
|
|
375 count
|
|
376 (sregex--value-groups (or sregex
|
|
377 sregex--current-sregex)))))
|
|
378
|
|
379 (defun sregex-match-end (count &optional sregex)
|
|
380 "Like `match-end', for a regexp made with `sregex'.
|
|
381 This takes one additional optional argument, the `sregex' info, which
|
|
382 can be obtained with `sregex-info'. The COUNT argument is adjusted to
|
|
383 allow for \"introduced groups\". If the extra argument is omitted or
|
|
384 nil, it defaults to the current value of (sregex-info)."
|
|
385 (match-end (sregex-backref-num
|
|
386 count
|
|
387 (sregex--value-groups (or sregex
|
|
388 sregex--current-sregex)))))
|
|
389
|
|
390 (defun sregex-match-data (&optional sregex)
|
|
391 "Like `match-data', for a regexp made with `sregex'.
|
|
392 This takes one additional optional argument, the `sregex' info, which
|
|
393 can be obtained with `sregex-info'. \"Introduced groups\" are removed
|
|
394 from the result. If the extra argument is omitted or nil, it defaults
|
|
395 to the current value of (sregex-info)."
|
|
396 (let* ((data (match-data))
|
|
397 (groups (sregex--value-groups (or sregex
|
|
398 sregex--current-sregex)))
|
|
399 (result (list (car (cdr data))
|
|
400 (car data))))
|
|
401 (setq data (cdr (cdr data)))
|
|
402 (while data
|
|
403 (if (car groups)
|
|
404 (setq result (append (list (car (cdr data))
|
|
405 (car data))
|
|
406 result)))
|
|
407 (setq groups (cdr groups)
|
|
408 data (cdr (cdr data))))
|
|
409 (reverse result)))
|
|
410
|
|
411 (defun sregex--render-tree (tree sregex)
|
|
412 (let ((key (car tree)))
|
|
413 (cond ((eq key 'str)
|
|
414 (cdr tree))
|
|
415 ((eq key 'or)
|
|
416 (mapconcat '(lambda (x)
|
|
417 (sregex--render-tree x sregex))
|
|
418 (cdr tree)
|
|
419 "\\|"))
|
|
420 ((eq key 'sequence)
|
|
421 (apply 'concat
|
|
422 (mapcar '(lambda (x)
|
|
423 (sregex--render-tree x sregex))
|
|
424 (cdr tree))))
|
|
425 ((eq key 'group)
|
|
426 (concat "\\("
|
|
427 (sregex--render-tree (cdr tree) sregex)
|
|
428 "\\)"))
|
|
429 ((eq key 'opt)
|
|
430 (concat (sregex--render-tree (cdr tree) sregex)
|
|
431 "?"))
|
|
432 ((eq key '0+)
|
|
433 (concat (sregex--render-tree (cdr tree) sregex)
|
|
434 "*"))
|
|
435 ((eq key '1+)
|
|
436 (concat (sregex--render-tree (cdr tree) sregex)
|
|
437 "+"))
|
|
438 ((eq key 'backref)
|
|
439 (let ((num (sregex-backref-num (cdr tree) sregex)))
|
|
440 (if (> num 9)
|
|
441 (error "sregex: backref number %d too high after adjustment"
|
|
442 num)
|
|
443 (concat "\\" (int-to-string num)))))
|
|
444 (t (error "sregex internal error: unknown tree type %S"
|
|
445 key)))))
|
|
446
|
|
447 (defun sregex (&rest exps)
|
|
448 "Symbolic regular expression interpreter.
|
|
449 This is exactly like `sregexq' (q.v.) except that it evaluates all its
|
|
450 arguments, so literal sregex clauses must be quoted. For example:
|
|
451
|
|
452 (sregex '(or \"Bob\" \"Robert\")) => \"Bob\\\\|Robert\"
|
|
453
|
|
454 An argument-evaluating sregex interpreter lets you reuse sregex
|
|
455 subexpressions:
|
|
456
|
|
457 (let ((dotstar '(0+ any))
|
|
458 (whitespace '(1+ (syntax ?-)))
|
|
459 (digits '(1+ (char (?0 . ?9)))))
|
|
460 (sregex 'bol dotstar \":\" whitespace digits)) => \"^.*:\\\\s-+[0-9]+\""
|
|
461 (progn
|
|
462 (setq sregex--current-sregex (sregex--sequence exps nil))
|
|
463 (sregex--render-tree (sregex--value-tree sregex--current-sregex)
|
|
464 sregex--current-sregex)))
|
|
465
|
|
466 (defmacro sregexq (&rest exps)
|
|
467 "Symbolic regular expression interpreter.
|
|
468 This macro allows you to specify a regular expression (regexp) in
|
|
469 symbolic form, and converts it into the string form required by Emacs's
|
|
470 regex functions such as `re-search-forward' and `looking-at'. Here is
|
|
471 a simple example:
|
|
472
|
|
473 (sregexq (or \"Bob\" \"Robert\")) => \"Bob\\\\|Robert\"
|
|
474
|
|
475 As you can see, an sregex is specified by placing one or more special
|
|
476 clauses in a call to `sregexq'. The clause in this case is the `or'
|
|
477 of two strings (not to be confused with the Lisp function `or'). The
|
|
478 list of allowable clauses appears below.
|
|
479
|
|
480 With `sregex', it is never necessary to \"escape\" magic characters
|
|
481 that are meant to be taken literally; that happens automatically.
|
|
482 For example:
|
|
483
|
|
484 (sregexq \"M*A*S*H\") => \"M\\\\*A\\\\*S\\\\*H\"
|
|
485
|
|
486 It is also unnecessary to \"group\" parts of the expression together
|
|
487 to overcome operator precedence; that also happens automatically.
|
|
488 For example:
|
|
489
|
|
490 (sregexq (opt (or \"Bob\" \"Robert\"))) => \"\\\\(Bob\\\\|Robert\\\\)?\"
|
|
491
|
|
492 It *is* possible to group parts of the expression in order to refer
|
|
493 to them with numbered backreferences:
|
|
494
|
|
495 (sregexq (group (or \"Go\" \"Run\"))
|
|
496 \", Spot, \"
|
|
497 (backref 1)) => \"\\\\(Go\\\\|Run\\\\), Spot, \\\\1\"
|
|
498
|
|
499 If `sregexq' needs to introduce its own grouping parentheses, it will
|
|
500 automatically renumber your backreferences:
|
|
501
|
|
502 (sregexq (opt \"resent-\")
|
|
503 (group (or \"to\" \"cc\" \"bcc\"))
|
|
504 \": \"
|
|
505 (backref 1)) => \"\\\\(resent-\\\\)?\\\\(to\\\\|cc\\\\|bcc\\\\): \\\\2\"
|
|
506
|
|
507 `sregexq' is a macro. Each time it is used, it constructs a simple
|
|
508 Lisp expression that then invokes a moderately complex engine to
|
|
509 interpret the sregex and render the string form. Because of this, I
|
|
510 don't recommend sprinkling calls to `sregexq' throughout your code,
|
|
511 the way one normally does with string regexes (which are cheap to
|
|
512 evaluate). Instead, it's wiser to precompute the regexes you need
|
|
513 wherever possible instead of repeatedly constructing the same ones
|
|
514 over and over. Example:
|
|
515
|
|
516 (let ((field-regex (sregexq (opt \"resent-\")
|
|
517 (or \"to\" \"cc\" \"bcc\"))))
|
|
518 ...
|
|
519 (while ...
|
|
520 ...
|
|
521 (re-search-forward field-regex ...)
|
|
522 ...))
|
|
523
|
|
524 The arguments to `sregexq' are automatically quoted, but the
|
|
525 flipside of this is that it is not straightforward to include
|
|
526 computed (i.e., non-constant) values in `sregexq' expressions. So
|
|
527 `sregex' is a function that is like `sregexq' but which does not
|
|
528 automatically quote its values. Literal sregex clauses must be
|
|
529 explicitly quoted like so:
|
|
530
|
|
531 (sregex '(or \"Bob\" \"Robert\")) => \"Bob\\\\|Robert\"
|
|
532
|
|
533 but computed clauses can be included easily, allowing for the reuse
|
|
534 of common clauses:
|
|
535
|
|
536 (let ((dotstar '(0+ any))
|
|
537 (whitespace '(1+ (syntax ?-)))
|
|
538 (digits '(1+ (char (?0 . ?9)))))
|
|
539 (sregex 'bol dotstar \":\" whitespace digits)) => \"^.*:\\\\s-+[0-9]+\"
|
|
540
|
|
541 Here are the clauses allowed in an `sregex' or `sregexq' expression:
|
|
542
|
|
543 - a string
|
|
544 This stands for the literal string. If it contains
|
|
545 metacharacters, they will be escaped in the resulting regex
|
|
546 (using `regexp-quote').
|
|
547
|
|
548 - the symbol `any'
|
|
549 This stands for \".\", a regex matching any character except
|
|
550 newline.
|
|
551
|
|
552 - the symbol `bol'
|
|
553 Stands for \"^\", matching the empty string at the beginning of a line
|
|
554
|
|
555 - the symbol `eol'
|
|
556 Stands for \"$\", matching the empty string at the end of a line
|
|
557
|
|
558 - (group CLAUSE ...)
|
|
559 Groups the given CLAUSEs using \"\\\\(\" and \"\\\\)\".
|
|
560
|
|
561 - (sequence CLAUSE ...)
|
|
562
|
|
563 Groups the given CLAUSEs; may or may not use \"\\\\(\" and \"\\\\)\".
|
|
564 Clauses groups by `sequence' do not count for purposes of
|
|
565 numbering backreferences. Use `sequence' in situations like
|
|
566 this:
|
|
567
|
|
568 (sregexq (or \"dog\" \"cat\"
|
|
569 (sequence (opt \"sea \") \"monkey\")))
|
|
570 => \"dog\\\\|cat\\\\|\\\\(sea \\\\)?monkey\"
|
|
571
|
|
572 where a single `or' alternate needs to contain multiple
|
|
573 subclauses.
|
|
574
|
|
575 - (backref N)
|
|
576 Matches the same string previously matched by the Nth \"group\" in
|
|
577 the same sregex. N is a positive integer. In the resulting
|
|
578 regex, N may be adjusted to account for automatically introduced
|
|
579 groups.
|
|
580
|
|
581 - (or CLAUSE ...)
|
|
582 Matches any one of the CLAUSEs by separating them with \"\\\\|\".
|
|
583
|
|
584 - (0+ CLAUSE ...)
|
|
585 Concatenates the given CLAUSEs and matches zero or more
|
|
586 occurrences by appending \"*\".
|
|
587
|
|
588 - (1+ CLAUSE ...)
|
|
589 Concatenates the given CLAUSEs and matches one or more
|
|
590 occurrences by appending \"+\".
|
|
591
|
|
592 - (opt CLAUSE ...)
|
|
593 Concatenates the given CLAUSEs and matches zero or one occurrence
|
|
594 by appending \"?\".
|
|
595
|
|
596 - (repeat MIN MAX CLAUSE ...)
|
|
597 Concatenates the given CLAUSEs and constructs a regex matching at
|
|
598 least MIN occurrences and at most MAX occurrences. MIN must be a
|
|
599 non-negative integer. MAX must be a non-negative integer greater
|
|
600 than or equal to MIN; or MAX can be nil to mean \"infinity.\"
|
|
601
|
|
602 - (char CHAR-CLAUSE ...)
|
|
603 Creates a \"character class\" matching one character from the given
|
|
604 set. See below for how to construct a CHAR-CLAUSE.
|
|
605
|
|
606 - (not-char CHAR-CLAUSE ...)
|
|
607 Creates a \"character class\" matching any one character not in the
|
|
608 given set. See below for how to construct a CHAR-CLAUSE.
|
|
609
|
|
610 - the symbol `bot'
|
|
611 Stands for \"\\\\`\", matching the empty string at the beginning of
|
|
612 text (beginning of a string or of a buffer).
|
|
613
|
|
614 - the symbol `eot'
|
|
615 Stands for \"\\\\'\", matching the empty string at the end of text.
|
|
616
|
|
617 - the symbol `point'
|
|
618 Stands for \"\\\\=\", matching the empty string at point.
|
|
619
|
|
620 - the symbol `word-boundary'
|
|
621 Stands for \"\\\\b\", matching the empty string at the beginning or
|
|
622 end of a word.
|
|
623
|
|
624 - the symbol `not-word-boundary'
|
|
625 Stands for \"\\\\B\", matching the empty string not at the beginning
|
|
626 or end of a word.
|
|
627
|
|
628 - the symbol `bow'
|
|
629 Stands for \"\\\\\\=<\", matching the empty string at the beginning of a
|
|
630 word.
|
|
631
|
|
632 - the symbol `eow'
|
|
633 Stands for \"\\\\\\=>\", matching the empty string at the end of a word.
|
|
634
|
|
635 - the symbol `wordchar'
|
|
636 Stands for the regex \"\\\\w\", matching a word-constituent character
|
|
637 (as determined by the current syntax table)
|
|
638
|
|
639 - the symbol `not-wordchar'
|
|
640 Stands for the regex \"\\\\W\", matching a non-word-constituent
|
|
641 character.
|
|
642
|
|
643 - (syntax CODE)
|
|
644 Stands for the regex \"\\\\sCODE\", where CODE is a syntax table code
|
|
645 (a single character). Matches any character with the requested
|
|
646 syntax.
|
|
647
|
|
648 - (not-syntax CODE)
|
|
649 Stands for the regex \"\\\\SCODE\", where CODE is a syntax table code
|
|
650 (a single character). Matches any character without the
|
|
651 requested syntax.
|
|
652
|
|
653 - (regex REGEX)
|
|
654 This is a \"trapdoor\" for including ordinary regular expression
|
|
655 strings in the result. Some regular expressions are clearer when
|
|
656 written the old way: \"[a-z]\" vs. (sregexq (char (?a . ?z))), for
|
|
657 instance. However, using this can confuse the code that
|
|
658 distinguishes introduced groups from user-specified groups. Avoid
|
|
659 using grouping within a `regex' form. Failing that, avoid using
|
|
660 backrefs if you're using `regex'.
|
|
661
|
|
662 Each CHAR-CLAUSE that is passed to (char ...) and (not-char ...)
|
|
663 has one of the following forms:
|
|
664
|
|
665 - a character
|
|
666 Adds that character to the set.
|
|
667
|
|
668 - a string
|
|
669 Adds all the characters in the string to the set.
|
|
670
|
|
671 - A pair (MIN . MAX)
|
|
672 Where MIN and MAX are characters, adds the range of characters
|
|
673 from MIN through MAX to the set."
|
|
674 `(apply 'sregex ',exps))
|
|
675
|
|
676 (defun sregex--engine (exp combine)
|
|
677 (let* ((val (cond ((stringp exp)
|
|
678 (sregex--make-value (or (not (eq combine 'suffix))
|
|
679 (= (length exp) 1))
|
|
680 nil
|
|
681 (cons 'str
|
|
682 (regexp-quote exp))))
|
|
683 ((symbolp exp)
|
|
684 (funcall (intern (concat "sregex--"
|
|
685 (symbol-name exp)))
|
|
686 combine))
|
|
687 ((consp exp)
|
|
688 (funcall (intern (concat "sregex--"
|
|
689 (symbol-name (car exp))))
|
|
690 (cdr exp)
|
|
691 combine))
|
|
692 (t (error "Invalid expression: %s" exp))))
|
|
693 (unitp (sregex--value-unitp val))
|
|
694 (groups (sregex--value-groups val))
|
|
695 (tree (sregex--value-tree val)))
|
|
696 (if (and combine (not unitp))
|
|
697 (sregex--make-value t
|
|
698 (cons nil groups)
|
|
699 (cons 'group tree))
|
|
700 (sregex--make-value unitp groups tree))))
|
|
701
|
|
702 (defun sregex--sequence (exps combine)
|
|
703 (if (= (length exps) 1)
|
|
704 (sregex--engine (car exps) combine)
|
|
705 (let ((groups nil)
|
|
706 (trees nil)) ;grows in reverse
|
|
707 (while exps
|
|
708 (let ((val (sregex--engine (car exps) 'concat)))
|
|
709 (setq groups (append groups
|
|
710 (sregex--value-groups val))
|
|
711 trees (cons (sregex--value-tree val) trees)
|
|
712 exps (cdr exps))))
|
|
713 (setq trees (nreverse trees))
|
|
714 (if (eq combine 'suffix)
|
|
715 (sregex--make-value t
|
|
716 (cons nil groups)
|
|
717 (cons 'group
|
|
718 (cons 'sequence trees)))
|
|
719 (sregex--make-value (not (eq combine 'suffix))
|
|
720 groups
|
|
721 (cons 'sequence trees))))))
|
|
722
|
|
723 (defun sregex--group (exps combine)
|
|
724 (let ((val (sregex--sequence exps nil)))
|
|
725 (sregex--make-value t
|
|
726 (cons t (sregex--value-groups val))
|
|
727 (cons 'group (sregex--value-tree val)))))
|
|
728
|
|
729 (defun sregex-backref-num (n &optional sregex)
|
|
730 "Adjust backreference number N according to SREGEX.
|
|
731 When `sregex' introduces parenthesized groups that the user didn't ask
|
|
732 for, the numbering of the groups that the user *did* ask for gets all
|
|
733 out of whack. This function accounts for introduced groups. Example:
|
|
734
|
|
735 (sregexq (opt \"ab\")
|
|
736 (group (or \"c\" \"d\"))) => \"\\\\(ab\\\\)?\\\\(c\\\\|d\\\\)\"
|
|
737 (setq info (sregex-info))
|
|
738 (sregex-backref-num 1 info) => 2
|
|
739
|
|
740 The SREGEX parameter is optional and defaults to the current value of
|
|
741 `sregex-info'."
|
|
742 (let ((groups (sregex--value-groups (or sregex
|
|
743 sregex--current-sregex)))
|
|
744 (result 0))
|
|
745 (while (and groups (> n 0))
|
|
746 (if (car groups)
|
|
747 (setq n (1- n)))
|
|
748 (setq result (1+ result)
|
|
749 groups (cdr groups)))
|
|
750 result))
|
|
751
|
|
752 (defun sregex--backref (exps combine)
|
|
753 (sregex--make-value t nil (cons 'backref (car exps))))
|
|
754
|
|
755 (defun sregex--any (combine)
|
|
756 (sregex--make-value t nil '(str . ".")))
|
|
757
|
|
758 (defun sregex--opt (exps combine)
|
|
759 (let ((val (sregex--sequence exps 'suffix)))
|
|
760 (sregex--make-value t
|
|
761 (sregex--value-groups val)
|
|
762 (cons 'opt (sregex--value-tree val)))))
|
|
763
|
|
764 (defun sregex--0+ (exps combine)
|
|
765 (let ((val (sregex--sequence exps 'suffix)))
|
|
766 (sregex--make-value t
|
|
767 (sregex--value-groups val)
|
|
768 (cons '0+ (sregex--value-tree val)))))
|
|
769 (defun sregex--1+ (exps combine)
|
|
770 (let ((val (sregex--sequence exps 'suffix)))
|
|
771 (sregex--make-value t
|
|
772 (sregex--value-groups val)
|
|
773 (cons '1+ (sregex--value-tree val)))))
|
|
774
|
|
775 (defun sregex--repeat (exps combine)
|
|
776 (let ((min (or (car exps) 0))
|
|
777 (max (car (cdr exps))))
|
|
778 (setq exps (cdr (cdr exps)))
|
|
779 (cond ((zerop min)
|
|
780 (cond ((equal max 0) ;degenerate
|
|
781 (sregex--make-value t nil nil))
|
|
782 ((equal max 1)
|
|
783 (sregex--opt exps combine))
|
|
784 ((not max)
|
|
785 (sregex--0+ exps combine))
|
|
786 (t (sregex--sequence (make-list max
|
|
787 (cons 'opt exps))
|
|
788 combine))))
|
|
789 ((= min 1)
|
|
790 (cond ((equal max 1)
|
|
791 (sregex--sequence exps combine))
|
|
792 ((not max)
|
|
793 (sregex--1+ exps combine))
|
|
794 (t (sregex--sequence (append exps
|
|
795 (make-list (1- max)
|
|
796 (cons 'opt exps)))
|
|
797 combine))))
|
|
798 (t (sregex--sequence (append exps
|
|
799 (list (append (list 'repeat
|
|
800 (1- min)
|
|
801 (and max
|
|
802 (1- max)))
|
|
803 exps)))
|
|
804 combine)))))
|
|
805
|
|
806 (defun sregex--or (exps combine)
|
|
807 (if (= (length exps) 1)
|
|
808 (sregex--engine (car exps) combine)
|
|
809 (let ((groups nil)
|
|
810 (trees nil))
|
|
811 (while exps
|
|
812 (let ((val (sregex--engine (car exps) 'or)))
|
|
813 (setq groups (append groups
|
|
814 (sregex--value-groups val))
|
|
815 trees (cons (sregex--value-tree val) trees)
|
|
816 exps (cdr exps))))
|
|
817 (sregex--make-value (eq combine 'or)
|
|
818 groups
|
|
819 (cons 'or (nreverse trees))))))
|
|
820
|
|
821 (defmacro sregex--char-range-aux ()
|
|
822 '(if start
|
|
823 (let (startc endc)
|
|
824 (if (and (<= 32 start)
|
|
825 (<= start 127))
|
|
826 (setq startc (char-to-string start)
|
|
827 endc (char-to-string end))
|
|
828 (setq startc (format "\\%03o" start)
|
|
829 endc (format "\\%03o" end)))
|
|
830 (if (> end start)
|
|
831 (if (> end (+ start 1))
|
|
832 (setq class (concat class startc "-" endc))
|
|
833 (setq class (concat class startc endc)))
|
|
834 (setq class (concat class startc))))))
|
|
835
|
|
836 (defmacro sregex--char-range (rstart rend)
|
|
837 `(let ((i ,rstart)
|
|
838 start end)
|
|
839 (while (<= i ,rend)
|
|
840 (if (aref chars i)
|
|
841 (progn
|
|
842 (if start
|
|
843 (setq end i)
|
|
844 (setq start i
|
|
845 end i))
|
|
846 (aset chars i nil))
|
|
847 (sregex--char-range-aux)
|
|
848 (setq start nil
|
|
849 end nil))
|
|
850 (setq i (1+ i)))
|
|
851 (sregex--char-range-aux)))
|
|
852
|
|
853 (defun sregex--char-aux (complement args)
|
|
854 (let ((chars (make-vector 256 nil)))
|
|
855 (while args
|
|
856 (let ((arg (car args)))
|
|
857 (cond ((integerp arg)
|
|
858 (aset chars arg t))
|
|
859 ((stringp arg)
|
|
860 (mapcar (function
|
|
861 (lambda (c)
|
|
862 (aset chars c t)))
|
|
863 arg))
|
|
864 ((consp arg)
|
|
865 (let ((start (car arg))
|
|
866 (end (cdr arg)))
|
|
867 (if (> start end)
|
|
868 (let ((tmp start))
|
|
869 (setq start end
|
|
870 end tmp)))
|
|
871 ;; now start <= end
|
|
872 (let ((i start))
|
|
873 (while (<= i end)
|
|
874 (aset chars i t)
|
|
875 (setq i (1+ i))))))))
|
|
876 (setq args (cdr args)))
|
|
877 ;; now chars is a map of the characters in the class
|
|
878 (let ((class "")
|
|
879 (caret (aref chars ?^)))
|
|
880 (aset chars ?^ nil)
|
|
881 (if (aref chars ?\])
|
|
882 (progn
|
|
883 (setq class (concat class "]"))
|
|
884 (aset chars ?\] nil)))
|
|
885 (if (aref chars ?-)
|
|
886 (progn
|
|
887 (setq class (concat class "-"))
|
|
888 (aset chars ?- nil)))
|
|
889 (if (aref chars ?\\)
|
|
890 (progn
|
|
891 (setq class (concat class "\\\\"))
|
|
892 (aset chars ?\\ nil)))
|
|
893
|
|
894 (sregex--char-range ?A ?Z)
|
|
895 (sregex--char-range ?a ?z)
|
|
896 (sregex--char-range ?0 ?9)
|
|
897
|
|
898 (let ((i 32))
|
|
899 (while (< i 128)
|
|
900 (if (aref chars i)
|
|
901 (progn
|
|
902 (setq class (concat class (char-to-string i)))
|
|
903 (aset chars i nil)))
|
|
904 (setq i (1+ i))))
|
|
905
|
|
906 (sregex--char-range 0 31)
|
|
907 (sregex--char-range 128 255)
|
|
908
|
|
909 (let ((i 0))
|
|
910 (while (< i 256)
|
|
911 (if (aref chars i)
|
|
912 (setq class (concat class (format "\\%03o" i))))
|
|
913 (setq i (1+ i))))
|
|
914
|
|
915 (if caret
|
|
916 (setq class (concat class "^")))
|
|
917 (concat "[" (if complement "^") class "]"))))
|
|
918
|
|
919 (defun sregex--char (exps combine)
|
|
920 (sregex--make-value t nil (cons 'str (sregex--char-aux nil exps))))
|
|
921 (defun sregex--not-char (exps combine)
|
|
922 (sregex--make-value t nil (cons 'str (sregex--char-aux t exps))))
|
|
923
|
|
924 (defun sregex--bol (combine)
|
|
925 (sregex--make-value t nil '(str . "^")))
|
|
926 (defun sregex--eol (combine)
|
|
927 (sregex--make-value t nil '(str . "$")))
|
|
928
|
|
929 (defun sregex--wordchar (combine)
|
|
930 (sregex--make-value t nil '(str . "\\w")))
|
|
931 (defun sregex--not-wordchar (combine)
|
|
932 (sregex--make-value t nil '(str . "\\W")))
|
|
933
|
|
934 (defun sregex--syntax (exps combine)
|
|
935 (sregex--make-value t nil (cons 'str (format "\\s%c" (car exps)))))
|
|
936 (defun sregex--not-syntax (exps combine)
|
|
937 (sregex--make-value t nil (cons 'str (format "\\S%c" (car exps)))))
|
|
938
|
|
939 (defun sregex--bot (combine)
|
|
940 (sregex--make-value t nil (cons 'str "\\`")))
|
|
941 (defun sregex--eot (combine)
|
|
942 (sregex--make-value t nil (cons 'str "\\'")))
|
|
943
|
|
944 (defun sregex--point (combine)
|
|
945 (sregex--make-value t nil '(str . "\\=")))
|
|
946
|
|
947 (defun sregex--word-boundary (combine)
|
|
948 (sregex--make-value t nil '(str . "\\b")))
|
|
949 (defun sregex--not-word-boundary (combine)
|
|
950 (sregex--make-value t nil '(str . "\\B")))
|
|
951
|
|
952 (defun sregex--bow (combine)
|
|
953 (sregex--make-value t nil '(str . "\\<")))
|
|
954 (defun sregex--eow (combine)
|
|
955 (sregex--make-value t nil '(str . "\\>")))
|
|
956
|
|
957
|
|
958 ;; trapdoor - usage discouraged
|
|
959 (defun sregex--regex (exps combine)
|
|
960 (sregex--make-value nil nil (car exps)))
|
|
961
|
|
962 (provide 'sregex)
|
|
963
|
|
964 ;;; sregex.el ends here
|
|
965
|