22537
|
1 ;;; sregex.el --- symbolic regular expressions
|
|
2
|
64751
|
3 ;; Copyright (C) 1997, 1998, 2000, 2002, 2003, 2004,
|
|
4 ;; 2005 Free Software Foundation, Inc.
|
22537
|
5
|
|
6 ;; Author: Bob Glickstein <bobg+sregex@zanshin.com>
|
|
7 ;; Maintainer: Bob Glickstein <bobg+sregex@zanshin.com>
|
29212
|
8 ;; Keywords: extensions
|
22537
|
9
|
|
10 ;; This file is part of GNU Emacs.
|
|
11
|
|
12 ;; GNU Emacs is free software; you can redistribute it and/or modify
|
|
13 ;; it under the terms of the GNU General Public License as published by
|
|
14 ;; the Free Software Foundation; either version 2, or (at your option)
|
|
15 ;; any later version.
|
|
16
|
|
17 ;; GNU Emacs is distributed in the hope that it will be useful,
|
|
18 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
19 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
20 ;; GNU General Public License for more details.
|
|
21
|
|
22 ;; You should have received a copy of the GNU General Public License
|
|
23 ;; along with GNU Emacs; see the file COPYING. If not, write to the
|
64085
|
24 ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
|
|
25 ;; Boston, MA 02110-1301, USA.
|
22537
|
26
|
|
27 ;;; Commentary:
|
|
28
|
|
29 ;; This package allows you to write regular expressions using a
|
|
30 ;; totally new, Lisp-like syntax.
|
|
31
|
|
32 ;; A "symbolic regular expression" (sregex for short) is a Lisp form
|
|
33 ;; that, when evaluated, produces the string form of the specified
|
|
34 ;; regular expression. Here's a simple example:
|
|
35
|
|
36 ;; (sregexq (or "Bob" "Robert")) => "\\(?:Bob\\|Robert\\)"
|
|
37
|
|
38 ;; As you can see, an sregex is specified by placing one or more
|
|
39 ;; special clauses in a call to `sregexq'. The clause in this case is
|
|
40 ;; the `or' of two strings (not to be confused with the Lisp function
|
|
41 ;; `or'). The list of allowable clauses appears below.
|
|
42
|
|
43 ;; With sregex, it is never necessary to "escape" magic characters
|
|
44 ;; that are meant to be taken literally; that happens automatically.
|
|
45 ;; For example:
|
|
46
|
|
47 ;; (sregexq "M*A*S*H") => "M\\*A\\*S\\*H"
|
|
48
|
|
49 ;; It is also unnecessary to "group" parts of the expression together
|
|
50 ;; to overcome operator precedence; that also happens automatically.
|
|
51 ;; For example:
|
|
52
|
29069
|
53 ;; (sregexq (opt (or "Bob" "Robert"))) => "\\(?:Bob\\|Robert\\)?"
|
22537
|
54
|
|
55 ;; It *is* possible to group parts of the expression in order to refer
|
|
56 ;; to them with numbered backreferences:
|
|
57
|
|
58 ;; (sregexq (group (or "Go" "Run"))
|
|
59 ;; ", Spot, "
|
|
60 ;; (backref 1)) => "\\(\\(?:Go\\|Run\\)\\), Spot, \\1"
|
|
61
|
|
62 ;; `sregexq' is a macro. Each time it is used, it constructs a simple
|
|
63 ;; Lisp expression that then invokes a moderately complex engine to
|
|
64 ;; interpret the sregex and render the string form. Because of this,
|
|
65 ;; I don't recommend sprinkling calls to `sregexq' throughout your
|
|
66 ;; code, the way one normally does with string regexes (which are
|
|
67 ;; cheap to evaluate). Instead, it's wiser to precompute the regexes
|
|
68 ;; you need wherever possible instead of repeatedly constructing the
|
|
69 ;; same ones over and over. Example:
|
|
70
|
|
71 ;; (let ((field-regex (sregexq (opt "resent-")
|
|
72 ;; (or "to" "cc" "bcc"))))
|
|
73 ;; ...
|
|
74 ;; (while ...
|
|
75 ;; ...
|
|
76 ;; (re-search-forward field-regex ...)
|
|
77 ;; ...))
|
|
78
|
|
79 ;; The arguments to `sregexq' are automatically quoted, but the
|
|
80 ;; flipside of this is that it is not straightforward to include
|
|
81 ;; computed (i.e., non-constant) values in `sregexq' expressions. So
|
|
82 ;; `sregex' is a function that is like `sregexq' but which does not
|
|
83 ;; automatically quote its values. Literal sregex clauses must be
|
|
84 ;; explicitly quoted like so:
|
|
85
|
|
86 ;; (sregex '(or "Bob" "Robert")) => "\\(?:Bob\\|Robert\\)"
|
|
87
|
|
88 ;; but computed clauses can be included easily, allowing for the reuse
|
|
89 ;; of common clauses:
|
|
90
|
|
91 ;; (let ((dotstar '(0+ any))
|
|
92 ;; (whitespace '(1+ (syntax ?-)))
|
|
93 ;; (digits '(1+ (char (?0 . ?9)))))
|
|
94 ;; (sregex 'bol dotstar ":" whitespace digits)) => "^.*:\\s-+[0-9]+"
|
|
95
|
|
96 ;; To use this package in a Lisp program, simply (require 'sregex).
|
|
97
|
|
98 ;; Here are the clauses allowed in an `sregex' or `sregexq'
|
|
99 ;; expression:
|
|
100
|
|
101 ;; - a string
|
|
102 ;; This stands for the literal string. If it contains
|
|
103 ;; metacharacters, they will be escaped in the resulting regex
|
|
104 ;; (using `regexp-quote').
|
|
105
|
|
106 ;; - the symbol `any'
|
|
107 ;; This stands for ".", a regex matching any character except
|
|
108 ;; newline.
|
|
109
|
|
110 ;; - the symbol `bol'
|
|
111 ;; Stands for "^", matching the empty string at the beginning of a line
|
|
112
|
|
113 ;; - the symbol `eol'
|
|
114 ;; Stands for "$", matching the empty string at the end of a line
|
|
115
|
|
116 ;; - (group CLAUSE ...)
|
|
117 ;; Groups the given CLAUSEs using "\\(" and "\\)".
|
|
118
|
|
119 ;; - (sequence CLAUSE ...)
|
|
120
|
29069
|
121 ;; Groups the given CLAUSEs; may or may not use "\\(?:" and "\\)".
|
|
122 ;; Clauses grouped by `sequence' do not count for purposes of
|
22537
|
123 ;; numbering backreferences. Use `sequence' in situations like
|
|
124 ;; this:
|
|
125
|
|
126 ;; (sregexq (or "dog" "cat"
|
|
127 ;; (sequence (opt "sea ") "monkey")))
|
29069
|
128 ;; => "\\(?:dog\\|cat\\|\\(?:sea \\)?monkey\\)"
|
22537
|
129
|
|
130 ;; where a single `or' alternate needs to contain multiple
|
|
131 ;; subclauses.
|
|
132
|
|
133 ;; - (backref N)
|
|
134 ;; Matches the same string previously matched by the Nth "group" in
|
29069
|
135 ;; the same sregex. N is a positive integer.
|
22537
|
136
|
|
137 ;; - (or CLAUSE ...)
|
|
138 ;; Matches any one of the CLAUSEs by separating them with "\\|".
|
|
139
|
|
140 ;; - (0+ CLAUSE ...)
|
|
141 ;; Concatenates the given CLAUSEs and matches zero or more
|
|
142 ;; occurrences by appending "*".
|
|
143
|
|
144 ;; - (1+ CLAUSE ...)
|
|
145 ;; Concatenates the given CLAUSEs and matches one or more
|
|
146 ;; occurrences by appending "+".
|
|
147
|
|
148 ;; - (opt CLAUSE ...)
|
|
149 ;; Concatenates the given CLAUSEs and matches zero or one occurrence
|
|
150 ;; by appending "?".
|
|
151
|
|
152 ;; - (repeat MIN MAX CLAUSE ...)
|
|
153 ;; Concatenates the given CLAUSEs and constructs a regex matching at
|
|
154 ;; least MIN occurrences and at most MAX occurrences. MIN must be a
|
|
155 ;; non-negative integer. MAX must be a non-negative integer greater
|
|
156 ;; than or equal to MIN; or MAX can be nil to mean "infinity."
|
|
157
|
|
158 ;; - (char CHAR-CLAUSE ...)
|
|
159 ;; Creates a "character class" matching one character from the given
|
|
160 ;; set. See below for how to construct a CHAR-CLAUSE.
|
|
161
|
|
162 ;; - (not-char CHAR-CLAUSE ...)
|
|
163 ;; Creates a "character class" matching any one character not in the
|
|
164 ;; given set. See below for how to construct a CHAR-CLAUSE.
|
|
165
|
|
166 ;; - the symbol `bot'
|
|
167 ;; Stands for "\\`", matching the empty string at the beginning of
|
|
168 ;; text (beginning of a string or of a buffer).
|
|
169
|
|
170 ;; - the symbol `eot'
|
|
171 ;; Stands for "\\'", matching the empty string at the end of text.
|
|
172
|
|
173 ;; - the symbol `point'
|
|
174 ;; Stands for "\\=", matching the empty string at point.
|
|
175
|
|
176 ;; - the symbol `word-boundary'
|
|
177 ;; Stands for "\\b", matching the empty string at the beginning or
|
|
178 ;; end of a word.
|
|
179
|
|
180 ;; - the symbol `not-word-boundary'
|
|
181 ;; Stands for "\\B", matching the empty string not at the beginning
|
|
182 ;; or end of a word.
|
|
183
|
|
184 ;; - the symbol `bow'
|
|
185 ;; Stands for "\\<", matching the empty string at the beginning of a
|
|
186 ;; word.
|
|
187
|
|
188 ;; - the symbol `eow'
|
|
189 ;; Stands for "\\>", matching the empty string at the end of a word.
|
|
190
|
|
191 ;; - the symbol `wordchar'
|
|
192 ;; Stands for the regex "\\w", matching a word-constituent character
|
|
193 ;; (as determined by the current syntax table)
|
|
194
|
|
195 ;; - the symbol `not-wordchar'
|
|
196 ;; Stands for the regex "\\W", matching a non-word-constituent
|
|
197 ;; character.
|
|
198
|
|
199 ;; - (syntax CODE)
|
|
200 ;; Stands for the regex "\\sCODE", where CODE is a syntax table code
|
|
201 ;; (a single character). Matches any character with the requested
|
|
202 ;; syntax.
|
|
203
|
|
204 ;; - (not-syntax CODE)
|
|
205 ;; Stands for the regex "\\SCODE", where CODE is a syntax table code
|
|
206 ;; (a single character). Matches any character without the
|
|
207 ;; requested syntax.
|
|
208
|
|
209 ;; - (regex REGEX)
|
|
210 ;; This is a "trapdoor" for including ordinary regular expression
|
|
211 ;; strings in the result. Some regular expressions are clearer when
|
|
212 ;; written the old way: "[a-z]" vs. (sregexq (char (?a . ?z))), for
|
|
213 ;; instance.
|
|
214
|
|
215 ;; Each CHAR-CLAUSE that is passed to (char ...) and (not-char ...)
|
|
216 ;; has one of the following forms:
|
|
217
|
|
218 ;; - a character
|
|
219 ;; Adds that character to the set.
|
|
220
|
|
221 ;; - a string
|
|
222 ;; Adds all the characters in the string to the set.
|
|
223
|
|
224 ;; - A pair (MIN . MAX)
|
|
225 ;; Where MIN and MAX are characters, adds the range of characters
|
|
226 ;; from MIN through MAX to the set.
|
|
227
|
|
228 ;;; To do:
|
|
229
|
|
230 ;; An earlier version of this package could optionally translate the
|
|
231 ;; symbolic regex into other languages' syntaxes, e.g. Perl. For
|
|
232 ;; instance, with Perl syntax selected, (sregexq (or "ab" "cd")) would
|
|
233 ;; yield "ab|cd" instead of "ab\\|cd". It might be useful to restore
|
|
234 ;; such a facility.
|
|
235
|
29069
|
236 ;; - handle multibyte chars in sregex--char-aux
|
|
237 ;; - add support for character classes ([:blank:], ...)
|
|
238 ;; - add support for non-greedy operators *? and +?
|
|
239 ;; - bug: (sregexq (opt (opt ?a))) returns "a??" which is a non-greedy "a?"
|
22537
|
240
|
29069
|
241 ;;; Bugs:
|
22537
|
242
|
|
243 ;;; Code:
|
|
244
|
29069
|
245 (eval-when-compile (require 'cl))
|
22537
|
246
|
29069
|
;; Compatibility code for when we didn't have shy-groups

(defvar sregex--current-sregex nil
  "Compatibility variable; its default value is nil.")

(defun sregex-info ()
  "Compatibility stub.  Always return nil."
  nil)

(defmacro sregex-save-match-data (&rest forms)
  "Run FORMS inside `save-match-data'."
  `(save-match-data ,@forms))

(defun sregex-replace-match (r &optional f l str subexp x)
  "Call `replace-match' with R, F, L, STR and SUBEXP.
The trailing argument X is accepted and ignored."
  (replace-match r f l str subexp))

(defun sregex-match-string (c &optional i x)
  "Return `match-string' for C and I.
The trailing argument X is accepted and ignored."
  (match-string c i))

(defun sregex-match-string-no-properties (count &optional in-string sregex)
  "Return `match-string-no-properties' for COUNT and IN-STRING.
The trailing argument SREGEX is accepted and ignored."
  (match-string-no-properties count in-string))

(defun sregex-match-beginning (count &optional sregex)
  "Return `match-beginning' for COUNT.
The trailing argument SREGEX is accepted and ignored."
  (match-beginning count))

(defun sregex-match-end (count &optional sregex)
  "Return `match-end' for COUNT.
The trailing argument SREGEX is accepted and ignored."
  (match-end count))

(defun sregex-match-data (&optional sregex)
  "Return the current `match-data'.
The argument SREGEX is accepted and ignored."
  (match-data))

(defun sregex-backref-num (n &optional sregex)
  "Return N unchanged.
The trailing argument SREGEX is accepted and ignored."
  n)
|
22537
|
260
|
|
261
|
|
(defun sregex (&rest exps)
  "Symbolic regular expression interpreter.
This is exactly like `sregexq' (q.v.) except that it evaluates all its
arguments, so literal sregex clauses must be quoted.  For example:

  (sregex '(or \"Bob\" \"Robert\"))  =>  \"\\\\(?:Bob\\\\|Robert\\\\)\"

An argument-evaluating sregex interpreter lets you reuse sregex
subexpressions:

  (let ((dotstar '(0+ any))
        (whitespace '(1+ (syntax ?-)))
        (digits '(1+ (char (?0 . ?9)))))
    (sregex 'bol dotstar \":\" whitespace digits))  =>  \"^.*:\\\\s-+[0-9]+\""
  ;; The argument list is rendered as one sequence with no enclosing
  ;; context (COMBINE is nil).
  (sregex--sequence exps nil))
|
22537
|
277
|
|
(defmacro sregexq (&rest exps)
  "Symbolic regular expression interpreter.
This macro allows you to specify a regular expression (regexp) in
symbolic form, and converts it into the string form required by Emacs's
regex functions such as `re-search-forward' and `looking-at'.  Here is
a simple example:

  (sregexq (or \"Bob\" \"Robert\"))  =>  \"\\\\(?:Bob\\\\|Robert\\\\)\"

As you can see, an sregex is specified by placing one or more special
clauses in a call to `sregexq'.  The clause in this case is the `or'
of two strings (not to be confused with the Lisp function `or').  The
list of allowable clauses appears below.

With `sregex', it is never necessary to \"escape\" magic characters
that are meant to be taken literally; that happens automatically.
For example:

  (sregexq \"M*A*S*H\")  =>  \"M\\\\*A\\\\*S\\\\*H\"

It is also unnecessary to \"group\" parts of the expression together
to overcome operator precedence; that also happens automatically.
For example:

  (sregexq (opt (or \"Bob\" \"Robert\")))  =>  \"\\\\(?:Bob\\\\|Robert\\\\)?\"

It *is* possible to group parts of the expression in order to refer
to them with numbered backreferences:

  (sregexq (group (or \"Go\" \"Run\"))
           \", Spot, \"
           (backref 1))  =>  \"\\\\(\\\\(?:Go\\\\|Run\\\\)\\\\), Spot, \\\\1\"

Any grouping parentheses that `sregexq' introduces on its own are
non-capturing (\"shy\") groups, so your backreferences keep the
numbers you gave them:

  (sregexq (opt \"resent-\")
           (group (or \"to\" \"cc\" \"bcc\"))
           \": \"
           (backref 1))  =>  \"\\\\(?:resent-\\\\)?\\\\(\\\\(?:to\\\\|cc\\\\|bcc\\\\)\\\\): \\\\1\"

`sregexq' is a macro.  Each time it is used, it constructs a simple
Lisp expression that then invokes a moderately complex engine to
interpret the sregex and render the string form.  Because of this, I
don't recommend sprinkling calls to `sregexq' throughout your code,
the way one normally does with string regexes (which are cheap to
evaluate).  Instead, it's wiser to precompute the regexes you need
wherever possible instead of repeatedly constructing the same ones
over and over.  Example:

  (let ((field-regex (sregexq (opt \"resent-\")
                              (or \"to\" \"cc\" \"bcc\"))))
    ...
    (while ...
      ...
      (re-search-forward field-regex ...)
      ...))

The arguments to `sregexq' are automatically quoted, but the
flipside of this is that it is not straightforward to include
computed (i.e., non-constant) values in `sregexq' expressions.  So
`sregex' is a function that is like `sregexq' but which does not
automatically quote its values.  Literal sregex clauses must be
explicitly quoted like so:

  (sregex '(or \"Bob\" \"Robert\"))  =>  \"\\\\(?:Bob\\\\|Robert\\\\)\"

but computed clauses can be included easily, allowing for the reuse
of common clauses:

  (let ((dotstar '(0+ any))
        (whitespace '(1+ (syntax ?-)))
        (digits '(1+ (char (?0 . ?9)))))
    (sregex 'bol dotstar \":\" whitespace digits))  =>  \"^.*:\\\\s-+[0-9]+\"

Here are the clauses allowed in an `sregex' or `sregexq' expression:

- a string
  This stands for the literal string.  If it contains
  metacharacters, they will be escaped in the resulting regex
  (using `regexp-quote').

- the symbol `any'
  This stands for \".\", a regex matching any character except
  newline.

- the symbol `bol'
  Stands for \"^\", matching the empty string at the beginning of a line

- the symbol `eol'
  Stands for \"$\", matching the empty string at the end of a line

- (group CLAUSE ...)
  Groups the given CLAUSEs using \"\\\\(\" and \"\\\\)\".

- (sequence CLAUSE ...)

  Groups the given CLAUSEs; may or may not use \"\\\\(?:\" and \"\\\\)\".
  Clauses grouped by `sequence' do not count for purposes of
  numbering backreferences.  Use `sequence' in situations like
  this:

    (sregexq (or \"dog\" \"cat\"
                 (sequence (opt \"sea \") \"monkey\")))
                                 =>  \"\\\\(?:dog\\\\|cat\\\\|\\\\(?:sea \\\\)?monkey\\\\)\"

  where a single `or' alternate needs to contain multiple
  subclauses.

- (backref N)
  Matches the same string previously matched by the Nth \"group\" in
  the same sregex.  N is a positive integer.

- (or CLAUSE ...)
  Matches any one of the CLAUSEs by separating them with \"\\\\|\".
  The alternation is normally wrapped in \"\\\\(?:\" and \"\\\\)\".

- (0+ CLAUSE ...)
  Concatenates the given CLAUSEs and matches zero or more
  occurrences by appending \"*\".

- (1+ CLAUSE ...)
  Concatenates the given CLAUSEs and matches one or more
  occurrences by appending \"+\".

- (opt CLAUSE ...)
  Concatenates the given CLAUSEs and matches zero or one occurrence
  by appending \"?\".

- (repeat MIN MAX CLAUSE ...)
  Concatenates the given CLAUSEs and constructs a regex matching at
  least MIN occurrences and at most MAX occurrences.  MIN must be a
  non-negative integer.  MAX must be a non-negative integer greater
  than or equal to MIN; or MAX can be nil to mean \"infinity.\"

- (char CHAR-CLAUSE ...)
  Creates a \"character class\" matching one character from the given
  set.  See below for how to construct a CHAR-CLAUSE.

- (not-char CHAR-CLAUSE ...)
  Creates a \"character class\" matching any one character not in the
  given set.  See below for how to construct a CHAR-CLAUSE.

- the symbol `bot'
  Stands for \"\\\\`\", matching the empty string at the beginning of
  text (beginning of a string or of a buffer).

- the symbol `eot'
  Stands for \"\\\\'\", matching the empty string at the end of text.

- the symbol `point'
  Stands for \"\\\\\\==\", matching the empty string at point.

- the symbol `word-boundary'
  Stands for \"\\\\b\", matching the empty string at the beginning or
  end of a word.

- the symbol `not-word-boundary'
  Stands for \"\\\\B\", matching the empty string not at the beginning
  or end of a word.

- the symbol `bow'
  Stands for \"\\\\\\=<\", matching the empty string at the beginning of a
  word.

- the symbol `eow'
  Stands for \"\\\\\\=>\", matching the empty string at the end of a word.

- the symbol `wordchar'
  Stands for the regex \"\\\\w\", matching a word-constituent character
  (as determined by the current syntax table)

- the symbol `not-wordchar'
  Stands for the regex \"\\\\W\", matching a non-word-constituent
  character.

- (syntax CODE)
  Stands for the regex \"\\\\sCODE\", where CODE is a syntax table code
  (a single character).  Matches any character with the requested
  syntax.

- (not-syntax CODE)
  Stands for the regex \"\\\\SCODE\", where CODE is a syntax table code
  (a single character).  Matches any character without the
  requested syntax.

- (regex REGEX)
  This is a \"trapdoor\" for including ordinary regular expression
  strings in the result.  Some regular expressions are clearer when
  written the old way: \"[a-z]\" vs. (sregexq (char (?a . ?z))), for
  instance.

Each CHAR-CLAUSE that is passed to (char ...) and (not-char ...)
has one of the following forms:

- a character
  Adds that character to the set.

- a string
  Adds all the characters in the string to the set.

- A pair (MIN . MAX)
  Where MIN and MAX are characters, adds the range of characters
  from MIN through MAX to the set."
  ;; Expand to a call of the function `sregex' on the quoted,
  ;; unevaluated clause list.
  `(apply 'sregex ',exps))
|
|
482
|
|
(defun sregex--engine (exp combine)
  "Translate the single sregex clause EXP into a regexp string.
COMBINE is the context the result will be placed in; it is passed on
to clause handlers.  For a string EXP, a COMBINE of `suffix' causes a
string whose length is not 1 to be wrapped in \"\\\\(?:\" and \"\\\\)\".

A string is quoted with `regexp-quote'.  A symbol is looked up in a
fixed table of named constructs.  A list (NAME ARG ...) is handled by
calling the function `sregex--NAME' with (ARG ...) and COMBINE.

Signal an error for anything else, including an unknown symbol and a
list whose head does not name an existing `sregex--NAME' function."
  (cond
   ((stringp exp)
    (let ((quoted (regexp-quote exp)))
      ;; A one-character string is already a single atom for a
      ;; following postfix operator; anything else needs a shy group.
      (if (and (eq combine 'suffix)
               (/= (length exp) 1))
          (concat "\\(?:" quoted "\\)")
        quoted)))
   ((symbolp exp)
    (or (cdr (assq exp '((any . ".")
                         (bol . "^")
                         (eol . "$")
                         (wordchar . "\\w")
                         (not-wordchar . "\\W")
                         (bot . "\\`")
                         (eot . "\\'")
                         (point . "\\=")
                         (word-boundary . "\\b")
                         (not-word-boundary . "\\B")
                         (bow . "\\<")
                         (eow . "\\>"))))
        (error "Invalid expression: %s" exp)))
   ((consp exp)
    ;; `intern-soft' avoids creating a new symbol for a misspelled
    ;; clause name; a missing handler is reported like any other
    ;; invalid expression instead of as a bare `void-function'.
    (let ((handler (and (symbolp (car exp))
                        (intern-soft (concat "sregex--"
                                             (symbol-name (car exp)))))))
      (unless (and handler (fboundp handler))
        (error "Invalid expression: %s" exp))
      (funcall handler (cdr exp) combine)))
   (t (error "Invalid expression: %s" exp))))
|
22537
|
511
|
|
(defun sregex--sequence (exps combine)
  "Render the clauses EXPS one after another as a single regexp string.
When EXPS has exactly one element it is handed to `sregex--engine'
with COMBINE unchanged.  Otherwise every clause is rendered in
`concat' context and the pieces are joined; the result is wrapped in
\"\\\\(?:\" and \"\\\\)\" only when COMBINE is `suffix'."
  (cond
   ((= (length exps) 1)
    (sregex--engine (car exps) combine))
   (t
    (let ((joined (mapconcat (lambda (clause)
                               (sregex--engine clause 'concat))
                             exps
                             "")))
      (cond
       ((eq combine 'suffix) (concat "\\(?:" joined "\\)"))
       (t joined))))))
|
22537
|
520
|
|
(defun sregex--or (exps combine)
  "Render the clauses EXPS as alternatives joined by \"\\\\|\".
When EXPS has exactly one element it is handed to `sregex--engine'
with COMBINE unchanged.  Otherwise every alternative is rendered in
`or' context, and the joined result is wrapped in \"\\\\(?:\" and
\"\\\\)\" unless COMBINE is `or'."
  (cond
   ((= (length exps) 1)
    (sregex--engine (car exps) combine))
   (t
    (let ((alternation (mapconcat (lambda (clause)
                                    (sregex--engine clause 'or))
                                  exps
                                  "\\|")))
      ;; Only an enclosing `or' can take the bare alternation; every
      ;; other context, including nil, gets a shy group.
      (if (eq combine 'or)
          alternation
        (concat "\\(?:" alternation "\\)"))))))
|
|
529
|
|
(defun sregex--group (exps combine)
  "Render the clauses EXPS as a sequence inside \"\\\\(\" and \"\\\\)\".
The sequence is rendered with a nil context.  COMBINE is ignored."
  (let ((inner (sregex--sequence exps nil)))
    (concat "\\(" inner "\\)")))
|
|
531
|
|
(defun sregex--backref (exps combine)
  "Render a backreference to group number (car EXPS).
The group number must be an integer from 1 to 9, because an Emacs
regexp backreference is a backslash followed by a single digit.
Signal an error otherwise.  COMBINE is ignored."
  (let ((n (car exps)))
    ;; A larger number would be emitted as \"\\\\1\" followed by literal
    ;; digits, silently matching something else.
    (unless (and (integerp n) (<= 1 n 9))
      (error "Invalid backref number: %S" n))
    (concat "\\" (number-to-string n))))
|
|
(defun sregex--opt (exps combine)
  "Render the clauses EXPS followed by the postfix operator \"?\".
EXPS is rendered as a sequence in `suffix' context.  When COMBINE is
itself `suffix', the whole result is wrapped in \"\\\\(?:\" and \"\\\\)\"
so that a second postfix operator applies to it rather than forming a
non-greedy operator such as \"??\"."
  (let ((re (concat (sregex--sequence exps 'suffix) "?")))
    (if (eq combine 'suffix)
        (concat "\\(?:" re "\\)")
      re)))

(defun sregex--0+ (exps combine)
  "Render the clauses EXPS followed by the postfix operator \"*\".
EXPS is rendered as a sequence in `suffix' context.  When COMBINE is
itself `suffix', the whole result is wrapped in \"\\\\(?:\" and \"\\\\)\"
so that a second postfix operator applies to it rather than forming a
non-greedy operator such as \"*?\"."
  (let ((re (concat (sregex--sequence exps 'suffix) "*")))
    (if (eq combine 'suffix)
        (concat "\\(?:" re "\\)")
      re)))

(defun sregex--1+ (exps combine)
  "Render the clauses EXPS followed by the postfix operator \"+\".
EXPS is rendered as a sequence in `suffix' context.  When COMBINE is
itself `suffix', the whole result is wrapped in \"\\\\(?:\" and \"\\\\)\"
so that a second postfix operator applies to it rather than forming a
non-greedy operator such as \"+?\"."
  (let ((re (concat (sregex--sequence exps 'suffix) "+")))
    (if (eq combine 'suffix)
        (concat "\\(?:" re "\\)")
      re)))
|
|
536
|
|
(defun sregex--char (exps combine)
  "Render the CHAR-CLAUSEs in EXPS as a character class.
Delegates to `sregex--char-aux' without complementing.  COMBINE is
ignored."
  (let ((complement nil))
    (sregex--char-aux complement exps)))

(defun sregex--not-char (exps combine)
  "Render the CHAR-CLAUSEs in EXPS as a complemented character class.
Delegates to `sregex--char-aux' with complementing requested.
COMBINE is ignored."
  (let ((complement t))
    (sregex--char-aux complement exps)))
|
22537
|
539
|
29069
|
(defun sregex--syntax (exps combine)
  "Render \"\\\\s\" followed by the character (car EXPS).
COMBINE is ignored."
  (let ((code (car exps)))
    (format "\\s%c" code)))

(defun sregex--not-syntax (exps combine)
  "Render \"\\\\S\" followed by the character (car EXPS).
COMBINE is ignored."
  (let ((code (car exps)))
    (format "\\S%c" code)))
|
|
542
|
|
(defun sregex--regex (exps combine)
  "Return the raw regexp string (car EXPS), unquoted.
When COMBINE is non-nil the string is wrapped in \"\\\\(?:\" and
\"\\\\)\"; when COMBINE is nil it is returned as is."
  (let ((raw (car exps)))
    (cond
     (combine (concat "\\(?:" raw "\\)"))
     (t raw))))
|
22537
|
545
|
29069
|
(defun sregex--repeat (exps combine)
  "Render a bounded repetition from EXPS, which is (MIN MAX CLAUSE ...).
The CLAUSEs are rendered as a sequence in `suffix' context and
followed by \"\\\\{MIN,MAX\\\\}\".  A nil MIN is treated as 0; a nil MAX
leaves the upper bound empty.  When COMBINE is `suffix', the whole
result is wrapped in \"\\\\(?:\" and \"\\\\)\" so that a following postfix
operator applies to the repetition as a unit."
  (let* ((min (or (car exps) 0))
         (max (cadr exps))
         (body (sregex--sequence (cddr exps) 'suffix))
         (re (concat body
                     "\\{" (number-to-string min) ","
                     ;; `concat' treats nil as an empty sequence.
                     (and max (number-to-string max))
                     "\\}")))
    (if (eq combine 'suffix)
        (concat "\\(?:" re "\\)")
      re)))
|
|
553
|
|
(defun sregex--char-range (start end)
  "Return bracket-expression text for the characters START through END.
A span of more than three characters is written as \"START-END\".
Spans of three or two characters are written out in full.  If END is
not greater than START, only START is returned."
  (let ((lo (char-to-string start))
        (hi (char-to-string end))
        (gap (- end start)))
    (cond
     ((> gap 2) (concat lo "-" hi))
     ((> gap 1) (concat lo (char-to-string (1+ start)) hi))
     ((> gap 0) (concat lo hi))
     (t lo))))
|
22537
|
562
|
|
(defun sregex--char-aux (complement args)
  "Build a character-class regexp from the CHAR-CLAUSEs in ARGS.
Each element of ARGS is a character, a string (all of its characters
are added), or a pair (MIN . MAX) naming an inclusive range; a pair
given in reverse order is swapped.  Membership is tracked in a
256-element bool-vector, so characters outside 0..255 are not
supported.

If COMPLEMENT is non-nil the class is negated with a leading \"^\".
When COMPLEMENT is nil and the class text is a single character, that
character is returned through `regexp-quote' instead of in brackets."
  ;; regex-opt does the same, we should join effort.
  (let ((chars (make-bool-vector 256 nil))) ; Yeah, right!
    (dolist (arg args)
      (cond ((integerp arg) (aset chars arg t))
            ;; `mapc', not `mapcar': only the side effect is wanted.
            ((stringp arg) (mapc (lambda (c) (aset chars c t)) arg))
            ((consp arg)
             (let ((start (car arg))
                   (end (cdr arg)))
               (when (> start end)
                 (let ((tmp start)) (setq start end) (setq end tmp)))
               ;; now start <= end
               (let ((i start))
                 (while (<= i end)
                   (aset chars i t)
                   (setq i (1+ i))))))))
    ;; now chars is a map of the characters in the class
    ;; "^", "-" and "]" are pulled out of the map and placed by hand:
    ;; "]" goes first, "^" and "-" go last.
    (let ((caret (aref chars ?^))
          (dash (aref chars ?-))
          (class (if (aref chars ?\]) "]" "")))
      (aset chars ?^ nil)
      (aset chars ?- nil)
      (aset chars ?\] nil)

      ;; Collapse each run of consecutive members into range text.
      (let (start end)
        (dotimes (i 256)
          (if (aref chars i)
              (progn
                (unless start (setq start i))
                (setq end i))
            (when start
              (setq class (concat class (sregex--char-range start end)))
              (setq start nil))))
        ;; Flush a run that extends through the last index.
        (if start
            (setq class (concat class (sregex--char-range start end)))))

      ;; With nothing before them, "-" is put ahead of "^" so that the
      ;; caret is not the first character of the class.
      (if (> (length class) 0)
          (setq class (concat class (if caret "^") (if dash "-")))
        (setq class (concat class (if dash "-") (if caret "^"))))
      (if (and (not complement) (= (length class) 1))
          (regexp-quote class)
        (concat "[" (if complement "^") class "]")))))
|
22537
|
606
|
|
607 (provide 'sregex)
|
|
608
|
52401
|
609 ;;; arch-tag: 460c1f5a-eb6e-42ec-a451-ffac78bdf492
|
22537
|
610 ;;; sregex.el ends here
|