comparison lisp/emacs-lisp/sregex.el @ 22537:7947a4ea28a8

Initial revision
author Dan Nicolaescu <done@ece.arizona.edu>
date Mon, 22 Jun 1998 02:03:41 +0000
parents
children 6e0af7097929
comparison
equal deleted inserted replaced
22536:e4bcb7cb0038 22537:7947a4ea28a8
1 ;;; sregex.el --- symbolic regular expressions
2
3 ;; Copyright (C) 1997, 1998 Free Software Foundation, Inc.
4
5 ;; Author: Bob Glickstein <bobg+sregex@zanshin.com>
6 ;; Maintainer: Bob Glickstein <bobg+sregex@zanshin.com>
7
8 ;; This file is part of GNU Emacs.
9
10 ;; GNU Emacs is free software; you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation; either version 2, or (at your option)
13 ;; any later version.
14
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
19
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs; see the file COPYING. If not, write to the
22 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 ;; Boston, MA 02111-1307, USA.
24
25 ;;; Commentary:
26
27 ;; This package allows you to write regular expressions using a
28 ;; totally new, Lisp-like syntax.
29
30 ;; A "symbolic regular expression" (sregex for short) is a Lisp form
31 ;; that, when evaluated, produces the string form of the specified
32 ;; regular expression. Here's a simple example:
33
34 ;; (sregexq (or "Bob" "Robert")) => "Bob\\|Robert"
35
36 ;; As you can see, an sregex is specified by placing one or more
37 ;; special clauses in a call to `sregexq'. The clause in this case is
38 ;; the `or' of two strings (not to be confused with the Lisp function
39 ;; `or'). The list of allowable clauses appears below.
40
41 ;; With sregex, it is never necessary to "escape" magic characters
42 ;; that are meant to be taken literally; that happens automatically.
43 ;; For example:
44
45 ;; (sregexq "M*A*S*H") => "M\\*A\\*S\\*H"
46
47 ;; It is also unnecessary to "group" parts of the expression together
48 ;; to overcome operator precedence; that also happens automatically.
49 ;; For example:
50
51 ;; (sregexq (opt (or "Bob" "Robert"))) => "\\(Bob\\|Robert\\)?"
52
53 ;; It *is* possible to group parts of the expression in order to refer
54 ;; to them with numbered backreferences:
55
56 ;; (sregexq (group (or "Go" "Run"))
57 ;; ", Spot, "
58 ;; (backref 1)) => "\\(Go\\|Run\\), Spot, \\1"
59
60 ;; If `sregexq' needs to introduce its own grouping parentheses, it
61 ;; will automatically renumber your backreferences:
62
63 ;; (sregexq (opt "resent-")
64 ;; (group (or "to" "cc" "bcc"))
65 ;; ": "
66 ;; (backref 1)) => "\\(resent-\\)?\\(to\\|cc\\|bcc\\): \\2"
67
68 ;; `sregexq' is a macro. Each time it is used, it constructs a simple
69 ;; Lisp expression that then invokes a moderately complex engine to
70 ;; interpret the sregex and render the string form. Because of this,
71 ;; I don't recommend sprinkling calls to `sregexq' throughout your
72 ;; code, the way one normally does with string regexes (which are
73 ;; cheap to evaluate). Instead, it's wiser to precompute the regexes
74 ;; you need wherever possible instead of repeatedly constructing the
75 ;; same ones over and over. Example:
76
77 ;; (let ((field-regex (sregexq (opt "resent-")
78 ;; (or "to" "cc" "bcc"))))
79 ;; ...
80 ;; (while ...
81 ;; ...
82 ;; (re-search-forward field-regex ...)
83 ;; ...))
84
85 ;; The arguments to `sregexq' are automatically quoted, but the
86 ;; flipside of this is that it is not straightforward to include
87 ;; computed (i.e., non-constant) values in `sregexq' expressions. So
88 ;; `sregex' is a function that is like `sregexq' but which does not
89 ;; automatically quote its values. Literal sregex clauses must be
90 ;; explicitly quoted like so:
91
92 ;; (sregex '(or "Bob" "Robert")) => "Bob\\|Robert"
93
94 ;; but computed clauses can be included easily, allowing for the reuse
95 ;; of common clauses:
96
97 ;; (let ((dotstar '(0+ any))
98 ;; (whitespace '(1+ (syntax ?-)))
99 ;; (digits '(1+ (char (?0 . ?9)))))
100 ;; (sregex 'bol dotstar ":" whitespace digits)) => "^.*:\\s-+[0-9]+"
101
102 ;; This package also provides sregex-specific versions of the Emacs
103 ;; functions `replace-match', `match-string',
104 ;; `match-string-no-properties', `match-beginning', `match-end', and
105 ;; `match-data'. In each case, the sregex version's name begins with
106 ;; `sregex-' and takes one additional optional parameter, an sregex
107 ;; "info" object. Each of these functions is concerned with numbered
108 ;; submatches. Since sregex may renumber submatches, alternate
109 ;; versions of these functions are needed that know how to adjust the
110 ;; supplied number.
111
112 ;; The sregex info object for the most recently evaluated sregex can
113 ;; be obtained with `sregex-info'; so if you precompute your sregexes
114 ;; and you plan to use `replace-match' or one of the others with it,
115 ;; you need to record the info object for later use:
116
117 ;; (let* ((regex (sregexq (opt "resent-")
118 ;; (group (or "to" "cc" "bcc"))
119 ;; ":"))
120 ;; (regex-info (sregex-info)))
121 ;; ...
122 ;; (if (re-search-forward regex ...)
123 ;; (let ((which (sregex-match-string 1 nil regex-info)))
124 ;; ...)))
125
126 ;; In this example, `regex' is "\\(resent-\\)?\\(to\\|cc\\|bcc\\):",
127 ;; so the call to (sregex-match-string 1 ...) is automatically turned
128 ;; into a call to (match-string 2 ...).
129
130 ;; If the sregex info argument to `sregex-replace-match',
131 ;; `sregex-match-string', `sregex-match-string-no-properties',
132 ;; `sregex-match-beginning', `sregex-match-end', or
133 ;; `sregex-match-data' is omitted, the current value of (sregex-info)
134 ;; is used.
135
136 ;; You can do your own sregex submatch renumbering with
137 ;; `sregex-backref-num'.
138
139 ;; Finally, `sregex-save-match-data' is like `save-match-data' but
140 ;; also saves and restores the information maintained by
141 ;; `sregex-info'.
142
143 ;; To use this package in a Lisp program, simply (require 'sregex).
144
145 ;; Here are the clauses allowed in an `sregex' or `sregexq'
146 ;; expression:
147
148 ;; - a string
149 ;; This stands for the literal string. If it contains
150 ;; metacharacters, they will be escaped in the resulting regex
151 ;; (using `regexp-quote').
152
153 ;; - the symbol `any'
154 ;; This stands for ".", a regex matching any character except
155 ;; newline.
156
157 ;; - the symbol `bol'
158 ;; Stands for "^", matching the empty string at the beginning of a line
159
160 ;; - the symbol `eol'
161 ;; Stands for "$", matching the empty string at the end of a line
162
163 ;; - (group CLAUSE ...)
164 ;; Groups the given CLAUSEs using "\\(" and "\\)".
165
166 ;; - (sequence CLAUSE ...)
167
168 ;; Groups the given CLAUSEs; may or may not use "\\(" and "\\)".
169 ;; Clauses groups by `sequence' do not count for purposes of
170 ;; numbering backreferences. Use `sequence' in situations like
171 ;; this:
172
173 ;; (sregexq (or "dog" "cat"
174 ;; (sequence (opt "sea ") "monkey")))
175 ;; => "dog\\|cat\\|\\(sea \\)?monkey"
176
177 ;; where a single `or' alternate needs to contain multiple
178 ;; subclauses.
179
180 ;; - (backref N)
181 ;; Matches the same string previously matched by the Nth "group" in
182 ;; the same sregex. N is a positive integer. In the resulting
183 ;; regex, N may be adjusted to account for automatically introduced
184 ;; groups.
185
186 ;; - (or CLAUSE ...)
187 ;; Matches any one of the CLAUSEs by separating them with "\\|".
188
189 ;; - (0+ CLAUSE ...)
190 ;; Concatenates the given CLAUSEs and matches zero or more
191 ;; occurrences by appending "*".
192
193 ;; - (1+ CLAUSE ...)
194 ;; Concatenates the given CLAUSEs and matches one or more
195 ;; occurrences by appending "+".
196
197 ;; - (opt CLAUSE ...)
198 ;; Concatenates the given CLAUSEs and matches zero or one occurrence
199 ;; by appending "?".
200
201 ;; - (repeat MIN MAX CLAUSE ...)
202 ;; Concatenates the given CLAUSEs and constructs a regex matching at
203 ;; least MIN occurrences and at most MAX occurrences. MIN must be a
204 ;; non-negative integer. MAX must be a non-negative integer greater
205 ;; than or equal to MIN; or MAX can be nil to mean "infinity."
206
207 ;; - (char CHAR-CLAUSE ...)
208 ;; Creates a "character class" matching one character from the given
209 ;; set. See below for how to construct a CHAR-CLAUSE.
210
211 ;; - (not-char CHAR-CLAUSE ...)
212 ;; Creates a "character class" matching any one character not in the
213 ;; given set. See below for how to construct a CHAR-CLAUSE.
214
215 ;; - the symbol `bot'
216 ;; Stands for "\\`", matching the empty string at the beginning of
217 ;; text (beginning of a string or of a buffer).
218
219 ;; - the symbol `eot'
220 ;; Stands for "\\'", matching the empty string at the end of text.
221
222 ;; - the symbol `point'
223 ;; Stands for "\\=", matching the empty string at point.
224
225 ;; - the symbol `word-boundary'
226 ;; Stands for "\\b", matching the empty string at the beginning or
227 ;; end of a word.
228
229 ;; - the symbol `not-word-boundary'
230 ;; Stands for "\\B", matching the empty string not at the beginning
231 ;; or end of a word.
232
233 ;; - the symbol `bow'
234 ;; Stands for "\\<", matching the empty string at the beginning of a
235 ;; word.
236
237 ;; - the symbol `eow'
238 ;; Stands for "\\>", matching the empty string at the end of a word.
239
240 ;; - the symbol `wordchar'
241 ;; Stands for the regex "\\w", matching a word-constituent character
242 ;; (as determined by the current syntax table)
243
244 ;; - the symbol `not-wordchar'
245 ;; Stands for the regex "\\W", matching a non-word-constituent
246 ;; character.
247
248 ;; - (syntax CODE)
249 ;; Stands for the regex "\\sCODE", where CODE is a syntax table code
250 ;; (a single character). Matches any character with the requested
251 ;; syntax.
252
253 ;; - (not-syntax CODE)
254 ;; Stands for the regex "\\SCODE", where CODE is a syntax table code
255 ;; (a single character). Matches any character without the
256 ;; requested syntax.
257
258 ;; - (regex REGEX)
259 ;; This is a "trapdoor" for including ordinary regular expression
260 ;; strings in the result. Some regular expressions are clearer when
261 ;; written the old way: "[a-z]" vs. (sregexq (char (?a . ?z))), for
262 ;; instance. However, see the note under "Bugs," below.
263
264 ;; Each CHAR-CLAUSE that is passed to (char ...) and (not-char ...)
265 ;; has one of the following forms:
266
267 ;; - a character
268 ;; Adds that character to the set.
269
270 ;; - a string
271 ;; Adds all the characters in the string to the set.
272
273 ;; - A pair (MIN . MAX)
274 ;; Where MIN and MAX are characters, adds the range of characters
275 ;; from MIN through MAX to the set.
276
277 ;;; To do:
278
279 ;; Make (sregexq (or "a" (sequence "b" "c"))) return "a\\|bc" instead
280 ;; of "a\\|\\(bc\\)"
281
282 ;; An earlier version of this package could optionally translate the
283 ;; symbolic regex into other languages' syntaxes, e.g. Perl. For
284 ;; instance, with Perl syntax selected, (sregexq (or "ab" "cd")) would
285 ;; yield "ab|cd" instead of "ab\\|cd". It might be useful to restore
286 ;; such a facility.
287
288 ;;; Bugs:
289
290 ;; The (regex REGEX) form can confuse the code that distinguishes
291 ;; introduced groups from user-specified groups. Try to avoid using
292 ;; grouping within a `regex' form. Failing that, try to avoid using
293 ;; backrefs if you're using `regex'.
294
295 ;;; Code:
296
297 (defsubst sregex--value-unitp (val) (nth 0 val))
298 (defsubst sregex--value-groups (val) (nth 1 val))
299 (defsubst sregex--value-tree (val) (nth 2 val))
300
301 (defun sregex--make-value (unitp groups tree)
302 (list unitp groups tree))
303
304 (defvar sregex--current-sregex nil
305 "Global state for `sregex-info'.")
306
307 (defun sregex-info ()
308 "Return extra information about the latest call to `sregex'.
309 This extra information is needed in order to adjust user-requested
310 backreference numbers to numbers suitable for the generated regexp.
311 See e.g. `sregex-match-string' and `sregex-backref-num'."
312 sregex--current-sregex)
313
314 ; (require 'advice)
315 ; (defadvice save-match-data (around sregex-save-match-data protect)
316 ; (let ((sregex--saved-sregex sregex--current-sregex))
317 ; (unwind-protect
318 ; ad-do-it
319 ; (setq sregex--current-sregex sregex--saved-sregex))))
320 (defmacro sregex-save-match-data (&rest forms)
321 "Like `save-match-data', but also saves and restores `sregex-info' data."
322 `(let ((sregex--saved-sregex sregex--current-sregex))
323 (unwind-protect
324 (save-match-data ,@forms)
325 (setq sregex--current-sregex sregex--saved-sregex))))
326
327 (defun sregex-replace-match (replacement
328 &optional fixedcase literal string subexp sregex)
329 "Like `replace-match', for a regexp made with `sregex'.
330 This takes one additional optional argument, the `sregex' info, which
331 can be obtained with `sregex-info'. The SUBEXP argument is adjusted
332 to allow for \"introduced groups\". If the extra argument is omitted
333 or nil, it defaults to the current value of (sregex-info)."
334 (replace-match replacement fixedcase literal string
335 (and subexp
336 (sregex-backref-num
337 subexp
338 (sregex--value-groups (or sregex
339 sregex--current-sregex))))))
340
341 (defun sregex-match-string (count &optional in-string sregex)
342 "Like `match-string', for a regexp made with `sregex'.
343 This takes one additional optional argument, the `sregex' info, which
344 can be obtained with `sregex-info'. The COUNT argument is adjusted to
345 allow for \"introduced groups\". If the extra argument is omitted or
346 nil, it defaults to the current value of (sregex-info)."
347 (match-string (and count
348 (sregex-backref-num
349 count
350 (sregex--value-groups (or sregex
351 sregex--current-sregex))))
352 in-string))
353
354 (defun sregex-match-string-no-properties (count &optional in-string sregex)
355 "Like `match-string-no-properties', for a regexp made with `sregex'.
356 This takes one additional optional argument, the `sregex' info, which
357 can be obtained with `sregex-info'. The COUNT argument is adjusted to
358 allow for \"introduced groups\". If the extra argument is omitted or
359 nil, it defaults to the current value of (sregex-info)."
360 (match-string-no-properties
361 (and count
362 (sregex-backref-num
363 count
364 (sregex--value-groups (or sregex
365 sregex--current-sregex))))
366 in-string))
367
368 (defun sregex-match-beginning (count &optional sregex)
369 "Like `match-beginning', for a regexp made with `sregex'.
370 This takes one additional optional argument, the `sregex' info, which
371 can be obtained with `sregex-info'. The COUNT argument is adjusted to
372 allow for \"introduced groups\". If the extra argument is omitted or
373 nil, it defaults to the current value of (sregex-info)."
374 (match-beginning (sregex-backref-num
375 count
376 (sregex--value-groups (or sregex
377 sregex--current-sregex)))))
378
379 (defun sregex-match-end (count &optional sregex)
380 "Like `match-end', for a regexp made with `sregex'.
381 This takes one additional optional argument, the `sregex' info, which
382 can be obtained with `sregex-info'. The COUNT argument is adjusted to
383 allow for \"introduced groups\". If the extra argument is omitted or
384 nil, it defaults to the current value of (sregex-info)."
385 (match-end (sregex-backref-num
386 count
387 (sregex--value-groups (or sregex
388 sregex--current-sregex)))))
389
390 (defun sregex-match-data (&optional sregex)
391 "Like `match-data', for a regexp made with `sregex'.
392 This takes one additional optional argument, the `sregex' info, which
393 can be obtained with `sregex-info'. \"Introduced groups\" are removed
394 from the result. If the extra argument is omitted or nil, it defaults
395 to the current value of (sregex-info)."
396 (let* ((data (match-data))
397 (groups (sregex--value-groups (or sregex
398 sregex--current-sregex)))
399 (result (list (car (cdr data))
400 (car data))))
401 (setq data (cdr (cdr data)))
402 (while data
403 (if (car groups)
404 (setq result (append (list (car (cdr data))
405 (car data))
406 result)))
407 (setq groups (cdr groups)
408 data (cdr (cdr data))))
409 (reverse result)))
410
411 (defun sregex--render-tree (tree sregex)
412 (let ((key (car tree)))
413 (cond ((eq key 'str)
414 (cdr tree))
415 ((eq key 'or)
416 (mapconcat '(lambda (x)
417 (sregex--render-tree x sregex))
418 (cdr tree)
419 "\\|"))
420 ((eq key 'sequence)
421 (apply 'concat
422 (mapcar '(lambda (x)
423 (sregex--render-tree x sregex))
424 (cdr tree))))
425 ((eq key 'group)
426 (concat "\\("
427 (sregex--render-tree (cdr tree) sregex)
428 "\\)"))
429 ((eq key 'opt)
430 (concat (sregex--render-tree (cdr tree) sregex)
431 "?"))
432 ((eq key '0+)
433 (concat (sregex--render-tree (cdr tree) sregex)
434 "*"))
435 ((eq key '1+)
436 (concat (sregex--render-tree (cdr tree) sregex)
437 "+"))
438 ((eq key 'backref)
439 (let ((num (sregex-backref-num (cdr tree) sregex)))
440 (if (> num 9)
441 (error "sregex: backref number %d too high after adjustment"
442 num)
443 (concat "\\" (int-to-string num)))))
444 (t (error "sregex internal error: unknown tree type %S"
445 key)))))
446
447 (defun sregex (&rest exps)
448 "Symbolic regular expression interpreter.
449 This is exactly like `sregexq' (q.v.) except that it evaluates all its
450 arguments, so literal sregex clauses must be quoted. For example:
451
452 (sregex '(or \"Bob\" \"Robert\")) => \"Bob\\\\|Robert\"
453
454 An argument-evaluating sregex interpreter lets you reuse sregex
455 subexpressions:
456
457 (let ((dotstar '(0+ any))
458 (whitespace '(1+ (syntax ?-)))
459 (digits '(1+ (char (?0 . ?9)))))
460 (sregex 'bol dotstar \":\" whitespace digits)) => \"^.*:\\\\s-+[0-9]+\""
461 (progn
462 (setq sregex--current-sregex (sregex--sequence exps nil))
463 (sregex--render-tree (sregex--value-tree sregex--current-sregex)
464 sregex--current-sregex)))
465
466 (defmacro sregexq (&rest exps)
467 "Symbolic regular expression interpreter.
468 This macro allows you to specify a regular expression (regexp) in
469 symbolic form, and converts it into the string form required by Emacs's
470 regex functions such as `re-search-forward' and `looking-at'. Here is
471 a simple example:
472
473 (sregexq (or \"Bob\" \"Robert\")) => \"Bob\\\\|Robert\"
474
475 As you can see, an sregex is specified by placing one or more special
476 clauses in a call to `sregexq'. The clause in this case is the `or'
477 of two strings (not to be confused with the Lisp function `or'). The
478 list of allowable clauses appears below.
479
480 With `sregex', it is never necessary to \"escape\" magic characters
481 that are meant to be taken literally; that happens automatically.
482 For example:
483
484 (sregexq \"M*A*S*H\") => \"M\\\\*A\\\\*S\\\\*H\"
485
486 It is also unnecessary to \"group\" parts of the expression together
487 to overcome operator precedence; that also happens automatically.
488 For example:
489
490 (sregexq (opt (or \"Bob\" \"Robert\"))) => \"\\\\(Bob\\\\|Robert\\\\)?\"
491
492 It *is* possible to group parts of the expression in order to refer
493 to them with numbered backreferences:
494
495 (sregexq (group (or \"Go\" \"Run\"))
496 \", Spot, \"
497 (backref 1)) => \"\\\\(Go\\\\|Run\\\\), Spot, \\\\1\"
498
499 If `sregexq' needs to introduce its own grouping parentheses, it will
500 automatically renumber your backreferences:
501
502 (sregexq (opt \"resent-\")
503 (group (or \"to\" \"cc\" \"bcc\"))
504 \": \"
505 (backref 1)) => \"\\\\(resent-\\\\)?\\\\(to\\\\|cc\\\\|bcc\\\\): \\\\2\"
506
507 `sregexq' is a macro. Each time it is used, it constructs a simple
508 Lisp expression that then invokes a moderately complex engine to
509 interpret the sregex and render the string form. Because of this, I
510 don't recommend sprinkling calls to `sregexq' throughout your code,
511 the way one normally does with string regexes (which are cheap to
512 evaluate). Instead, it's wiser to precompute the regexes you need
513 wherever possible instead of repeatedly constructing the same ones
514 over and over. Example:
515
516 (let ((field-regex (sregexq (opt \"resent-\")
517 (or \"to\" \"cc\" \"bcc\"))))
518 ...
519 (while ...
520 ...
521 (re-search-forward field-regex ...)
522 ...))
523
524 The arguments to `sregexq' are automatically quoted, but the
525 flipside of this is that it is not straightforward to include
526 computed (i.e., non-constant) values in `sregexq' expressions. So
527 `sregex' is a function that is like `sregexq' but which does not
528 automatically quote its values. Literal sregex clauses must be
529 explicitly quoted like so:
530
531 (sregex '(or \"Bob\" \"Robert\")) => \"Bob\\\\|Robert\"
532
533 but computed clauses can be included easily, allowing for the reuse
534 of common clauses:
535
536 (let ((dotstar '(0+ any))
537 (whitespace '(1+ (syntax ?-)))
538 (digits '(1+ (char (?0 . ?9)))))
539 (sregex 'bol dotstar \":\" whitespace digits)) => \"^.*:\\\\s-+[0-9]+\"
540
541 Here are the clauses allowed in an `sregex' or `sregexq' expression:
542
543 - a string
544 This stands for the literal string. If it contains
545 metacharacters, they will be escaped in the resulting regex
546 (using `regexp-quote').
547
548 - the symbol `any'
549 This stands for \".\", a regex matching any character except
550 newline.
551
552 - the symbol `bol'
553 Stands for \"^\", matching the empty string at the beginning of a line
554
555 - the symbol `eol'
556 Stands for \"$\", matching the empty string at the end of a line
557
558 - (group CLAUSE ...)
559 Groups the given CLAUSEs using \"\\\\(\" and \"\\\\)\".
560
561 - (sequence CLAUSE ...)
562
563 Groups the given CLAUSEs; may or may not use \"\\\\(\" and \"\\\\)\".
564 Clauses groups by `sequence' do not count for purposes of
565 numbering backreferences. Use `sequence' in situations like
566 this:
567
568 (sregexq (or \"dog\" \"cat\"
569 (sequence (opt \"sea \") \"monkey\")))
570 => \"dog\\\\|cat\\\\|\\\\(sea \\\\)?monkey\"
571
572 where a single `or' alternate needs to contain multiple
573 subclauses.
574
575 - (backref N)
576 Matches the same string previously matched by the Nth \"group\" in
577 the same sregex. N is a positive integer. In the resulting
578 regex, N may be adjusted to account for automatically introduced
579 groups.
580
581 - (or CLAUSE ...)
582 Matches any one of the CLAUSEs by separating them with \"\\\\|\".
583
584 - (0+ CLAUSE ...)
585 Concatenates the given CLAUSEs and matches zero or more
586 occurrences by appending \"*\".
587
588 - (1+ CLAUSE ...)
589 Concatenates the given CLAUSEs and matches one or more
590 occurrences by appending \"+\".
591
592 - (opt CLAUSE ...)
593 Concatenates the given CLAUSEs and matches zero or one occurrence
594 by appending \"?\".
595
596 - (repeat MIN MAX CLAUSE ...)
597 Concatenates the given CLAUSEs and constructs a regex matching at
598 least MIN occurrences and at most MAX occurrences. MIN must be a
599 non-negative integer. MAX must be a non-negative integer greater
600 than or equal to MIN; or MAX can be nil to mean \"infinity.\"
601
602 - (char CHAR-CLAUSE ...)
603 Creates a \"character class\" matching one character from the given
604 set. See below for how to construct a CHAR-CLAUSE.
605
606 - (not-char CHAR-CLAUSE ...)
607 Creates a \"character class\" matching any one character not in the
608 given set. See below for how to construct a CHAR-CLAUSE.
609
610 - the symbol `bot'
611 Stands for \"\\\\`\", matching the empty string at the beginning of
612 text (beginning of a string or of a buffer).
613
614 - the symbol `eot'
615 Stands for \"\\\\'\", matching the empty string at the end of text.
616
617 - the symbol `point'
618 Stands for \"\\\\=\", matching the empty string at point.
619
620 - the symbol `word-boundary'
621 Stands for \"\\\\b\", matching the empty string at the beginning or
622 end of a word.
623
624 - the symbol `not-word-boundary'
625 Stands for \"\\\\B\", matching the empty string not at the beginning
626 or end of a word.
627
628 - the symbol `bow'
629 Stands for \"\\\\\\=<\", matching the empty string at the beginning of a
630 word.
631
632 - the symbol `eow'
633 Stands for \"\\\\\\=>\", matching the empty string at the end of a word.
634
635 - the symbol `wordchar'
636 Stands for the regex \"\\\\w\", matching a word-constituent character
637 (as determined by the current syntax table)
638
639 - the symbol `not-wordchar'
640 Stands for the regex \"\\\\W\", matching a non-word-constituent
641 character.
642
643 - (syntax CODE)
644 Stands for the regex \"\\\\sCODE\", where CODE is a syntax table code
645 (a single character). Matches any character with the requested
646 syntax.
647
648 - (not-syntax CODE)
649 Stands for the regex \"\\\\SCODE\", where CODE is a syntax table code
650 (a single character). Matches any character without the
651 requested syntax.
652
653 - (regex REGEX)
654 This is a \"trapdoor\" for including ordinary regular expression
655 strings in the result. Some regular expressions are clearer when
656 written the old way: \"[a-z]\" vs. (sregexq (char (?a . ?z))), for
657 instance. However, using this can confuse the code that
658 distinguishes introduced groups from user-specified groups. Avoid
659 using grouping within a `regex' form. Failing that, avoid using
660 backrefs if you're using `regex'.
661
662 Each CHAR-CLAUSE that is passed to (char ...) and (not-char ...)
663 has one of the following forms:
664
665 - a character
666 Adds that character to the set.
667
668 - a string
669 Adds all the characters in the string to the set.
670
671 - A pair (MIN . MAX)
672 Where MIN and MAX are characters, adds the range of characters
673 from MIN through MAX to the set."
674 `(apply 'sregex ',exps))
675
676 (defun sregex--engine (exp combine)
677 (let* ((val (cond ((stringp exp)
678 (sregex--make-value (or (not (eq combine 'suffix))
679 (= (length exp) 1))
680 nil
681 (cons 'str
682 (regexp-quote exp))))
683 ((symbolp exp)
684 (funcall (intern (concat "sregex--"
685 (symbol-name exp)))
686 combine))
687 ((consp exp)
688 (funcall (intern (concat "sregex--"
689 (symbol-name (car exp))))
690 (cdr exp)
691 combine))
692 (t (error "Invalid expression: %s" exp))))
693 (unitp (sregex--value-unitp val))
694 (groups (sregex--value-groups val))
695 (tree (sregex--value-tree val)))
696 (if (and combine (not unitp))
697 (sregex--make-value t
698 (cons nil groups)
699 (cons 'group tree))
700 (sregex--make-value unitp groups tree))))
701
702 (defun sregex--sequence (exps combine)
703 (if (= (length exps) 1)
704 (sregex--engine (car exps) combine)
705 (let ((groups nil)
706 (trees nil)) ;grows in reverse
707 (while exps
708 (let ((val (sregex--engine (car exps) 'concat)))
709 (setq groups (append groups
710 (sregex--value-groups val))
711 trees (cons (sregex--value-tree val) trees)
712 exps (cdr exps))))
713 (setq trees (nreverse trees))
714 (if (eq combine 'suffix)
715 (sregex--make-value t
716 (cons nil groups)
717 (cons 'group
718 (cons 'sequence trees)))
719 (sregex--make-value (not (eq combine 'suffix))
720 groups
721 (cons 'sequence trees))))))
722
723 (defun sregex--group (exps combine)
724 (let ((val (sregex--sequence exps nil)))
725 (sregex--make-value t
726 (cons t (sregex--value-groups val))
727 (cons 'group (sregex--value-tree val)))))
728
729 (defun sregex-backref-num (n &optional sregex)
730 "Adjust backreference number N according to SREGEX.
731 When `sregex' introduces parenthesized groups that the user didn't ask
732 for, the numbering of the groups that the user *did* ask for gets all
733 out of whack. This function accounts for introduced groups. Example:
734
735 (sregexq (opt \"ab\")
736 (group (or \"c\" \"d\"))) => \"\\\\(ab\\\\)?\\\\(c\\\\|d\\\\)\"
737 (setq info (sregex-info))
738 (sregex-backref-num 1 info) => 2
739
740 The SREGEX parameter is optional and defaults to the current value of
741 `sregex-info'."
742 (let ((groups (sregex--value-groups (or sregex
743 sregex--current-sregex)))
744 (result 0))
745 (while (and groups (> n 0))
746 (if (car groups)
747 (setq n (1- n)))
748 (setq result (1+ result)
749 groups (cdr groups)))
750 result))
751
752 (defun sregex--backref (exps combine)
753 (sregex--make-value t nil (cons 'backref (car exps))))
754
755 (defun sregex--any (combine)
756 (sregex--make-value t nil '(str . ".")))
757
758 (defun sregex--opt (exps combine)
759 (let ((val (sregex--sequence exps 'suffix)))
760 (sregex--make-value t
761 (sregex--value-groups val)
762 (cons 'opt (sregex--value-tree val)))))
763
764 (defun sregex--0+ (exps combine)
765 (let ((val (sregex--sequence exps 'suffix)))
766 (sregex--make-value t
767 (sregex--value-groups val)
768 (cons '0+ (sregex--value-tree val)))))
769 (defun sregex--1+ (exps combine)
770 (let ((val (sregex--sequence exps 'suffix)))
771 (sregex--make-value t
772 (sregex--value-groups val)
773 (cons '1+ (sregex--value-tree val)))))
774
775 (defun sregex--repeat (exps combine)
776 (let ((min (or (car exps) 0))
777 (max (car (cdr exps))))
778 (setq exps (cdr (cdr exps)))
779 (cond ((zerop min)
780 (cond ((equal max 0) ;degenerate
781 (sregex--make-value t nil nil))
782 ((equal max 1)
783 (sregex--opt exps combine))
784 ((not max)
785 (sregex--0+ exps combine))
786 (t (sregex--sequence (make-list max
787 (cons 'opt exps))
788 combine))))
789 ((= min 1)
790 (cond ((equal max 1)
791 (sregex--sequence exps combine))
792 ((not max)
793 (sregex--1+ exps combine))
794 (t (sregex--sequence (append exps
795 (make-list (1- max)
796 (cons 'opt exps)))
797 combine))))
798 (t (sregex--sequence (append exps
799 (list (append (list 'repeat
800 (1- min)
801 (and max
802 (1- max)))
803 exps)))
804 combine)))))
805
806 (defun sregex--or (exps combine)
807 (if (= (length exps) 1)
808 (sregex--engine (car exps) combine)
809 (let ((groups nil)
810 (trees nil))
811 (while exps
812 (let ((val (sregex--engine (car exps) 'or)))
813 (setq groups (append groups
814 (sregex--value-groups val))
815 trees (cons (sregex--value-tree val) trees)
816 exps (cdr exps))))
817 (sregex--make-value (eq combine 'or)
818 groups
819 (cons 'or (nreverse trees))))))
820
821 (defmacro sregex--char-range-aux ()
822 '(if start
823 (let (startc endc)
824 (if (and (<= 32 start)
825 (<= start 127))
826 (setq startc (char-to-string start)
827 endc (char-to-string end))
828 (setq startc (format "\\%03o" start)
829 endc (format "\\%03o" end)))
830 (if (> end start)
831 (if (> end (+ start 1))
832 (setq class (concat class startc "-" endc))
833 (setq class (concat class startc endc)))
834 (setq class (concat class startc))))))
835
836 (defmacro sregex--char-range (rstart rend)
837 `(let ((i ,rstart)
838 start end)
839 (while (<= i ,rend)
840 (if (aref chars i)
841 (progn
842 (if start
843 (setq end i)
844 (setq start i
845 end i))
846 (aset chars i nil))
847 (sregex--char-range-aux)
848 (setq start nil
849 end nil))
850 (setq i (1+ i)))
851 (sregex--char-range-aux)))
852
853 (defun sregex--char-aux (complement args)
854 (let ((chars (make-vector 256 nil)))
855 (while args
856 (let ((arg (car args)))
857 (cond ((integerp arg)
858 (aset chars arg t))
859 ((stringp arg)
860 (mapcar (function
861 (lambda (c)
862 (aset chars c t)))
863 arg))
864 ((consp arg)
865 (let ((start (car arg))
866 (end (cdr arg)))
867 (if (> start end)
868 (let ((tmp start))
869 (setq start end
870 end tmp)))
871 ;; now start <= end
872 (let ((i start))
873 (while (<= i end)
874 (aset chars i t)
875 (setq i (1+ i))))))))
876 (setq args (cdr args)))
877 ;; now chars is a map of the characters in the class
878 (let ((class "")
879 (caret (aref chars ?^)))
880 (aset chars ?^ nil)
881 (if (aref chars ?\])
882 (progn
883 (setq class (concat class "]"))
884 (aset chars ?\] nil)))
885 (if (aref chars ?-)
886 (progn
887 (setq class (concat class "-"))
888 (aset chars ?- nil)))
889 (if (aref chars ?\\)
890 (progn
891 (setq class (concat class "\\\\"))
892 (aset chars ?\\ nil)))
893
894 (sregex--char-range ?A ?Z)
895 (sregex--char-range ?a ?z)
896 (sregex--char-range ?0 ?9)
897
898 (let ((i 32))
899 (while (< i 128)
900 (if (aref chars i)
901 (progn
902 (setq class (concat class (char-to-string i)))
903 (aset chars i nil)))
904 (setq i (1+ i))))
905
906 (sregex--char-range 0 31)
907 (sregex--char-range 128 255)
908
909 (let ((i 0))
910 (while (< i 256)
911 (if (aref chars i)
912 (setq class (concat class (format "\\%03o" i))))
913 (setq i (1+ i))))
914
915 (if caret
916 (setq class (concat class "^")))
917 (concat "[" (if complement "^") class "]"))))
918
919 (defun sregex--char (exps combine)
920 (sregex--make-value t nil (cons 'str (sregex--char-aux nil exps))))
921 (defun sregex--not-char (exps combine)
922 (sregex--make-value t nil (cons 'str (sregex--char-aux t exps))))
923
924 (defun sregex--bol (combine)
925 (sregex--make-value t nil '(str . "^")))
926 (defun sregex--eol (combine)
927 (sregex--make-value t nil '(str . "$")))
928
929 (defun sregex--wordchar (combine)
930 (sregex--make-value t nil '(str . "\\w")))
931 (defun sregex--not-wordchar (combine)
932 (sregex--make-value t nil '(str . "\\W")))
933
934 (defun sregex--syntax (exps combine)
935 (sregex--make-value t nil (cons 'str (format "\\s%c" (car exps)))))
936 (defun sregex--not-syntax (exps combine)
937 (sregex--make-value t nil (cons 'str (format "\\S%c" (car exps)))))
938
939 (defun sregex--bot (combine)
940 (sregex--make-value t nil (cons 'str "\\`")))
941 (defun sregex--eot (combine)
942 (sregex--make-value t nil (cons 'str "\\'")))
943
944 (defun sregex--point (combine)
945 (sregex--make-value t nil '(str . "\\=")))
946
947 (defun sregex--word-boundary (combine)
948 (sregex--make-value t nil '(str . "\\b")))
949 (defun sregex--not-word-boundary (combine)
950 (sregex--make-value t nil '(str . "\\B")))
951
952 (defun sregex--bow (combine)
953 (sregex--make-value t nil '(str . "\\<")))
954 (defun sregex--eow (combine)
955 (sregex--make-value t nil '(str . "\\>")))
956
957
958 ;; trapdoor - usage discouraged
959 (defun sregex--regex (exps combine)
960 (sregex--make-value nil nil (car exps)))
961
962 (provide 'sregex)
963
964 ;;; sregex.el ends here
965