39516
|
1 ;;; rx.el --- sexp notation for regular expressions
|
|
2
|
|
3 ;; Copyright (C) 2001 Free Software Foundation, Inc.
|
|
4
|
|
5 ;; Author: Gerd Moellmann <gerd@gnu.org>
|
|
6 ;; Maintainer: FSF
|
|
7 ;; Keywords: strings, regexps, extensions
|
|
8
|
|
9 ;; This file is part of GNU Emacs.
|
|
10
|
|
11 ;; GNU Emacs is free software; you can redistribute it and/or modify
|
|
12 ;; it under the terms of the GNU General Public License as published by
|
|
13 ;; the Free Software Foundation; either version 2, or (at your option)
|
|
14 ;; any later version.
|
|
15
|
|
16 ;; GNU Emacs is distributed in the hope that it will be useful,
|
|
17 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
18 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
19 ;; GNU General Public License for more details.
|
|
20
|
|
21 ;; You should have received a copy of the GNU General Public License
|
|
22 ;; along with GNU Emacs; see the file COPYING. If not, write to the
|
|
23 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
|
24 ;; Boston, MA 02111-1307, USA.
|
|
25
|
|
26 ;;; Commentary:
|
|
27
|
|
28 ;; This is another implementation of sexp-form regular expressions.
|
|
29 ;; It was unfortunately written without being aware of the Sregex
|
|
30 ;; package coming with Emacs, but as things stand, Rx completely
|
|
31 ;; covers all regexp features, which Sregex doesn't, doesn't suffer
|
|
32 ;; from the bugs mentioned in the commentary section of Sregex, and
|
|
33 ;; uses a nicer syntax (IMHO, of course :-).
|
|
34
|
|
35 ;; Rx translates a sexp notation for regular expressions into the
|
|
36 ;; usual string notation. The translation can be done at compile-time
|
|
37 ;; by using the `rx' macro. It can be done at run-time by calling
|
|
38 ;; function `rx-to-string'. See the documentation of `rx' for a
|
|
39 ;; complete description of the sexp notation.
|
|
40 ;;
|
|
41 ;; Some examples of string regexps and their sexp counterparts:
|
|
42 ;;
|
|
43 ;; "^[a-z]*"
|
|
44 ;; (rx (and line-start (0+ (in "a-z"))))
|
|
45 ;;
|
|
46 ;; "\n[^ \t]"
|
|
47 ;; (rx (and "\n" (not blank))), or
|
|
48 ;; (rx (and "\n" (not (any " \t"))))
|
|
49 ;;
|
|
50 ;; "\\*\\*\\* EOOH \\*\\*\\*\n"
|
|
51 ;; (rx "*** EOOH ***\n")
|
|
52 ;;
|
|
53 ;; "\\<\\(catch\\|finally\\)\\>[^_]"
|
|
54 ;; (rx (and word-start (submatch (or "catch" "finally")) word-end
|
|
55 ;; (not (any ?_))))
|
|
56 ;;
|
|
57 ;; "[ \t\n]*:\\([^:]+\\|$\\)"
|
|
58 ;; (rx (and (zero-or-more (in " \t\n")) ":"
|
|
59 ;; (submatch (or line-end (one-or-more (not (any ?:)))))))
|
|
60 ;;
|
|
61 ;; "^content-transfer-encoding:\\(\n?[\t ]\\)*quoted-printable\\(\n?[\t ]\\)*"
|
|
62 ;; (rx (and line-start
|
|
63 ;; "content-transfer-encoding:"
|
|
64 ;; (* (and (? ?\n) blank))
|
|
65 ;; "quoted-printable"
|
|
66 ;; (* (and (? ?\n) blank))))
|
|
67 ;;
|
|
68 ;; (concat "^\\(?:" something-else "\\)")
|
|
69 ;; (rx (and line-start (eval something-else))), statically or
|
|
70 ;; (rx-to-string `(and line-start ,something-else)), dynamically.
|
|
71 ;;
|
|
72 ;; (regexp-opt '(STRING1 STRING2 ...))
|
|
73 ;; (rx (or STRING1 STRING2 ...)), or in other words, `or' automatically
|
|
74 ;; calls `regexp-opt' as needed.
|
|
75 ;;
|
|
76 ;; "^;;\\s-*\n\\|^\n"
|
|
77 ;; (rx (or (and line-start ";;" (0+ space) ?\n)
|
|
78 ;; (and line-start ?\n)))
|
|
79 ;;
|
|
80 ;; "\\$[I]d: [^ ]+ \\([^ ]+\\) "
|
|
81 ;; (rx (and "$Id: "
|
|
82 ;; (1+ (not (in " ")))
|
|
83 ;; " "
|
|
84 ;; (submatch (1+ (not (in " "))))
|
|
85 ;; " ")))
|
|
86 ;;
|
|
87 ;; "\\\\\\\\\\[\\w+"
|
|
88 ;; (rx (and ?\\ ?\\ ?\[ (1+ word)))
|
|
89 ;;
|
|
90 ;; etc.
|
|
91
|
|
92 ;;; History:
|
|
93 ;;
|
|
94
|
|
95 ;;; Code:
|
|
96
|
|
97
|
|
(defconst rx-constituents
  '((and . (rx-and 1 nil))
    (or . (rx-or 1 nil))
    (not-newline . ".")
    ;; Kept inside a shy group: as a bare alternation it would absorb
    ;; its neighbors when spliced into a sequence, e.g. the form
    ;; (and "a" anything "b") would otherwise become "a.\\|\nb".
    (anything . "\\(?:.\\|\n\\)")
    (any . (rx-any 1 1 rx-check-any))
    (in . any)
    (not . (rx-not 1 1 rx-check-not))
    (repeat . (rx-repeat 2 3))
    (submatch . (rx-submatch 1 nil))
    (group . submatch)
    (zero-or-more . (rx-kleene 1 1))
    (one-or-more . (rx-kleene 1 1))
    (zero-or-one . (rx-kleene 1 1))
    ;; `\?' and `\??' are the symbols named "?" and "??"; `rx-info'
    ;; maps the characters the reader produces for them onto these.
    (\? . zero-or-one)
    (\?? . zero-or-one)
    (* . zero-or-more)
    (*? . zero-or-more)
    (0+ . zero-or-more)
    (+ . one-or-more)
    (+? . one-or-more)
    (1+ . one-or-more)
    (optional . zero-or-one)
    (minimal-match . (rx-greedy 1 1))
    (maximal-match . (rx-greedy 1 1))
    (line-start . "^")
    (line-end . "$")
    (string-start . "\\`")
    (string-end . "\\'")
    (buffer-start . "\\`")
    (buffer-end . "\\'")
    (point . "\\=")
    (word-start . "\\<")
    (word-end . "\\>")
    (word-boundary . "\\b")
    (syntax . (rx-syntax 1 1))
    (category . (rx-category 1 1 rx-check-category))
    (eval . (rx-eval 1 1))
    (regexp . (rx-regexp 1 1 stringp))
    (digit . "[[:digit:]]")
    (control . "[[:cntrl:]]")
    (hex-digit . "[[:xdigit:]]")
    (blank . "[[:blank:]]")
    (graphic . "[[:graph:]]")
    (printing . "[[:print:]]")
    (alphanumeric . "[[:alnum:]]")
    (letter . "[[:alpha:]]")
    (ascii . "[[:ascii:]]")
    (nonascii . "[[:nonascii:]]")
    (lower . "[[:lower:]]")
    (punctuation . "[[:punct:]]")
    (space . "[[:space:]]")
    (upper . "[[:upper:]]")
    (word . "[[:word:]]"))
  "Alist of sexp form regexp constituents.
Each element of the alist has the form (SYMBOL . DEFN).
SYMBOL is a valid constituent of sexp regular expressions.
If DEFN is a string, SYMBOL is translated into DEFN.
If DEFN is a symbol, use the definition of DEFN, recursively.
Otherwise, DEFN must be a list (FUNCTION MIN-ARGS MAX-ARGS PREDICATE).
FUNCTION is used to produce code for SYMBOL.  MIN-ARGS and MAX-ARGS
are the minimum and maximum number of arguments the function-form
sexp constituent SYMBOL may have in sexp regular expressions.
MAX-ARGS nil means no limit.  PREDICATE, if specified, means that
all arguments must satisfy PREDICATE.")
|
|
163
|
|
164
|
|
(defconst rx-syntax
  '((whitespace        . ?-)
    (punctuation       . ?.)
    (word              . ?w)
    (symbol            . ?_)
    (open-parenthesis  . ?\()
    (close-parenthesis . ?\))
    (expression-prefix . ?\')
    (string-quote      . ?\")
    (paired-delimiter  . ?$)
    (escape            . ?\\)
    (character-quote   . ?/)
    (comment-start     . ?<)
    (comment-end       . ?>))
  "Alist translating Rx syntax-class names into syntax characters.
Every element is a pair (SYMBOL . CHAR).  SYMBOL is a name accepted
in the sexp form `(syntax SYMBOL)'; CHAR is the character that
designates the same syntax class after \\s or \\S in a regular
expression string.")
|
|
184
|
|
185
|
|
(defconst rx-categories
  '((consonant . ?0)
    (base-vowel . ?1)
    (upper-diacritical-mark . ?2)
    (lower-diacritical-mark . ?3)
    (tone-mark . ?4)
    (symbol . ?5)
    (digit . ?6)
    (vowel-modifying-diacritical-mark . ?7)
    (vowel-sign . ?8)
    (semivowel-lower . ?9)
    (not-at-end-of-line . ?<)
    (not-at-beginning-of-line . ?>)
    (alpha-numeric-two-byte . ?A)
    (chinese-two-byte . ?C)
    ;; Historical misspelling of `chinese-two-byte'; kept so that
    ;; existing callers using the old name continue to work.
    (chinse-two-byte . ?C)
    (greek-two-byte . ?G)
    (japanese-hiragana-two-byte . ?H)
    (indian-two-byte . ?I)
    (japanese-katakana-two-byte . ?K)
    (korean-hangul-two-byte . ?N)
    (cyrillic-two-byte . ?Y)
    (ascii . ?a)
    (arabic . ?b)
    (chinese . ?c)
    (ethiopic . ?e)
    (greek . ?g)
    (korean . ?h)
    (indian . ?i)
    (japanese . ?j)
    (japanese-katakana . ?k)
    (latin . ?l)
    (lao . ?o)
    (tibetan . ?q)
    (japanese-roman . ?r)
    (thai . ?t)
    (vietnamese . ?v)
    (hebrew . ?w)
    (cyrillic . ?y)
    (can-break . ?|))
  "Alist mapping symbols to category characters.
Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid
symbol in `(category SYMBOL)', and CHAR is the category character
corresponding to SYMBOL, as it would be used with `\\c' or `\\C' in
regular expression strings.

`chinse-two-byte' is a misspelled alias of `chinese-two-byte' that
is retained for backward compatibility.")
|
|
230
|
|
231
|
|
(defvar rx-greedy-flag t
  "Whether `zero-or-one', `zero-or-more' and `one-or-more' are greedy.
A non-nil value makes those operators produce greedy regular
expressions; nil makes them produce non-greedy ones.  The variable
is intended to be bound dynamically.")
|
|
235
|
|
236
|
|
(defun rx-info (op)
  "Look up the parsing/code generation info registered for OP.
OP is normally a symbol.  Two characters are accepted as well,
because that is what the Lisp reader produces for `?' and `??' at
the head of a form: the space character (ASCII 32) stands for the
symbol `?', and the character `?' (ASCII 63) stands for the symbol
`??'.

Symbol definitions in `rx-constituents' are followed until a
non-symbol definition is found, which is returned.  The result is
nil when OP, or an alias on the way, has no entry."
  (let ((defn (cond ((eq op 32) '\?)    ; the space character
                    ((eq op 63) '\??)   ; the character `?'
                    (t op))))
    ;; Chase aliases such as `in' -> `any' until we reach a string, a
    ;; (FUNCTION MIN-ARGS MAX-ARGS PREDICATE) list, or nil.
    (while (and defn (symbolp defn))
      (setq defn (cdr (assq defn rx-constituents))))
    defn))
|
|
247
|
|
248
|
|
(defun rx-check (form)
  "Validate FORM against the parsing info registered for its car.
The info is obtained with `rx-info'.  Signal an error if FORM has
fewer arguments than the registered minimum, more than the
registered maximum, or an argument for which the registered
predicate returns nil.  A limit or predicate that is nil is not
checked.  Return nil when FORM passes."
  (let* ((head (car form))
         (arguments (cdr form))
         (info (rx-info head))
         (given (length arguments))
         (fewest (nth 1 info))
         (most (nth 2 info))
         (predicate (nth 3 info)))
    (if (and fewest (< given fewest))
        (error "Rx form `%s' requires at least %d args"
               head fewest))
    (if (and most (> given most))
        (error "Rx form `%s' accepts at most %d args"
               head most))
    (when predicate
      ;; Every argument has to satisfy the predicate.
      (let ((pending arguments))
        (while pending
          (or (funcall predicate (car pending))
              (error "Rx form `%s' requires args satisfying `%s'"
                     head predicate))
          (setq pending (cdr pending)))))))
|
|
269
|
|
270
|
|
(defun rx-and (form)
  "Translate FORM, which is `(and FORM1 ...)', into a regexp string.
After FORM has been checked with `rx-check', each of FORM1 ... is
translated with `rx-to-string' and the results are concatenated in
order."
  (rx-check form)
  (apply #'concat (mapcar #'rx-to-string (cdr form))))
|
|
276
|
|
277
|
|
(defun rx-or (form)
  "Translate FORM, which is `(or FORM1 ...)', into a regexp string.
When every alternative is a string, pass them all to `regexp-opt'.
Otherwise translate each alternative with `rx-to-string' and join
the results with `\\|'."
  (rx-check form)
  (let ((alternatives (cdr form)))
    (if (memq nil (mapcar #'stringp alternatives))
        ;; At least one alternative is not a plain string.
        (mapconcat #'rx-to-string alternatives "\\|")
      (regexp-opt alternatives))))
|
|
288
|
|
289
|
|
(defun rx-quote-for-set (string)
  "Rearrange STRING so it can be placed between `[' and `]'.
If STRING contains a `]' anywhere but at its start, every `]' is
removed and a single `]' is put at the front, the only position
where it does not close the set.
If the resulting string is longer than one character and starts
with `^', that `^' is moved to the end so that it does not negate
the set.
Return the rearranged string; STRING itself is not modified."
  (when (and (> (length string) 1)
             (string-match "]" string 1))
    ;; Rebuild the string without any `]' and put a single one first.
    ;; Members of a set are unordered, and a second `]' would only
    ;; terminate the set early.
    (setq string
          (concat "]"
                  (mapconcat (lambda (char)
                               (if (eq char ?\])
                                   ""
                                 (char-to-string char)))
                             string ""))))
  (when (and (> (length string) 1)
             (eq (aref string 0) ?^))
    (setq string (concat (substring string 1) "^")))
  string)
|
|
301
|
|
302
|
|
(defun rx-check-any (arg)
  "Validate ARG, the argument of an Rx `any' form.
Return t if ARG is an integer (a character) or a non-empty string.
Signal an error if ARG is the empty string or has any other type."
  (if (integerp arg)
      t
    (unless (stringp arg)
      (error "Rx `any' requires string or character arg"))
    (when (string= arg "")
      (error "String arg for Rx `any' must not be empty"))
    t))
|
|
311
|
|
312
|
|
(defun rx-any (form)
  "Parse and produce code from FORM, which is `(any SET)'.
SET is a character or a non-empty string.
A SET made of a single character yields that character, quoted
with `regexp-quote' so that characters which are special in
regular expressions (`.', `*', `^', ...) match themselves.
Any other SET yields a bracket expression whose contents are
arranged by `rx-quote-for-set'."
  (rx-check form)
  (let* ((arg (cadr form))
         (set (if (integerp arg) (char-to-string arg) arg)))
    (if (= (length set) 1)
        (regexp-quote set)
      (concat "[" (rx-quote-for-set set) "]"))))

(defun rx-check-not (form)
  "Check the argument FORM of a `(not FORM)' construct.
FORM is accepted if it is one of the character class symbols
listed below, the symbol `word-boundary', or a list whose car is
one of `not', `any', `in', `syntax' or `category'.
Return t if FORM is accepted, otherwise signal an error."
  (unless (or (memq form
                    '(digit control hex-digit blank graphic printing
                      alphanumeric letter ascii nonascii lower
                      punctuation space upper word
                      word-boundary))
              (and (consp form)
                   (memq (car form) '(not any in syntax category))))
    (error "Rx `not' syntax error: %s" form))
  t)

(defun rx-not (form)
  "Parse and produce code from FORM.  FORM is `(not ...)'.
The argument is translated with `rx-to-string' and the resulting
string is then negated:
  a quoted single character `\\X' becomes `[^X]',
  `[^...]' becomes `[...]' (or just the character when the set
  holds a single one) and `[...]' becomes `[^...]',
  `\\s', `\\c' and `\\b' are exchanged with `\\S', `\\C' and `\\B',
  anything else is wrapped as `[^...]'."
  (rx-check form)
  (let ((result (rx-to-string (cadr form) 'no-group))
        ;; The branches below tell `\\s' from `\\S', `\\c' from `\\C'
        ;; and `\\b' from `\\B', so matching must be case-sensitive.
        (case-fold-search nil))
    (cond ((and (= (length result) 2)
                (eq (aref result 0) ?\\)
                (string= result (regexp-quote (substring result 1))))
           ;; A single character that `regexp-quote' had to escape.
           ;; No escaping is needed (or allowed) inside brackets.
           (concat "[^" (substring result 1) "]"))
          ((string-match "\\`\\[^" result)
           (if (= (length result) 4)
               ;; `[^X]': the negation is the plain character X, which
               ;; must be quoted again outside of brackets.
               (regexp-quote (substring result 2 3))
             (concat "[" (substring result 2))))
          ((string-match "\\`\\[" result)
           (concat "[^" (substring result 1)))
          ((string-match "\\`\\\\s." result)
           (concat "\\S" (substring result 2)))
          ((string-match "\\`\\\\S." result)
           (concat "\\s" (substring result 2)))
          ((string-match "\\`\\\\c." result)
           (concat "\\C" (substring result 2)))
          ((string-match "\\`\\\\C." result)
           (concat "\\c" (substring result 2)))
          ((string-match "\\`\\\\B" result)
           (concat "\\b" (substring result 2)))
          ((string-match "\\`\\\\b" result)
           (concat "\\B" (substring result 2)))
          (t
           (concat "[^" result "]")))))
|
|
363
|
|
364
|
|
(defun rx-repeat (form)
  "Parse and produce code from FORM.
FORM is either `(repeat N FORM1)' or `(repeat N M FORM1)'.
With a single count, N must be a positive integer and the result
matches exactly N occurrences of FORM1.  With two counts, N and M
must be non-negative integers with N not greater than M, and the
result matches N to M occurrences.  An error is signaled if the
counts do not satisfy these conditions.
The translation of FORM1 is always enclosed in a shy group so that
the repetition applies to all of it, not just to its last
character."
  (rx-check form)
  (cond ((= (length form) 3)
         (unless (and (integerp (nth 1 form))
                      (> (nth 1 form) 0))
           (error "Rx `repeat' requires positive integer first arg"))
         (format "\\(?:%s\\)\\{%d\\}"
                 (rx-to-string (nth 2 form) 'no-group)
                 (nth 1 form)))
        ((or (not (integerp (nth 2 form)))
             (< (nth 2 form) 0)
             (not (integerp (nth 1 form)))
             (< (nth 1 form) 0)
             (< (nth 2 form) (nth 1 form)))
         (error "Rx `repeat' range error"))
        (t
         (format "\\(?:%s\\)\\{%d,%d\\}"
                 (rx-to-string (nth 3 form) 'no-group)
                 (nth 1 form) (nth 2 form)))))
|
|
383
|
|
384
|
|
(defun rx-submatch (form)
  "Parse and produce code from FORM, which is `(submatch ...)'.
FORM is first checked with `rx-check', like every other
constituent.  Its arguments are then translated with `rx-to-string',
concatenated, and enclosed in a capturing group."
  (rx-check form)
  (concat "\\(" (mapconcat #'rx-to-string (cdr form) nil) "\\)"))
|
|
388
|
|
389
|
|
(defun rx-kleene (form)
  "Translate the repetition FORM, which is `(OP FORM1)'.
OP is one of the `zero-or-one', `zero-or-more', `one-or-more'
operators or one of their aliases.
The operators `*', `+' and `?' always yield a greedy regexp.
The operators `*?', `+?' and `??' always yield a non-greedy regexp.
Every other operator yields a greedy regexp if `rx-greedy-flag' is
non-nil, and a non-greedy one otherwise.
The translation of FORM1 is enclosed in a shy group."
  (rx-check form)
  (let* ((operator (car form))
         (op (cond ((memq operator '(* *? 0+ zero-or-more)) "*")
                   ((memq operator '(+ +? 1+ one-or-more)) "+")
                   (t "?")))
         ;; The Lisp reader turns a leading `?' into the space
         ;; character (32) and a leading `??' into the character `?'
         ;; (63), so those operators show up here as integers.
         (suffix (cond ((memq operator '(* + 32)) "")
                       ((memq operator '(*? +? 63)) "?")
                       (rx-greedy-flag "")
                       (t "?"))))
    (format "\\(?:%s\\)%s%s"
            (rx-to-string (cadr form) 'no-group)
            op
            suffix)))
|
|
408
|
|
409
|
|
(defun rx-syntax (form)
  "Translate FORM, which is `(syntax SYMBOL)', into a regexp string.
SYMBOL is looked up in the alist `rx-syntax'.  The result is `\\s'
followed by the syntax character found there.  Signal an error if
SYMBOL has no entry."
  (rx-check form)
  (let* ((name (cadr form))
         (entry (assq name rx-syntax)))
    (if entry
        (format "\\s%c" (cdr entry))
      (error "Unknown rx syntax `%s'" name))))
|
|
417
|
|
418
|
|
(defun rx-check-category (form)
  "Validate FORM, the argument of a `(category FORM)' construct.
Return t if FORM is an integer (a category character) or a symbol
that has an entry in `rx-categories'.  Signal an error otherwise."
  (if (or (integerp form)
          (cdr (assq form rx-categories)))
      t
    (error "Unknown category `%s'" form)))
|
|
425
|
|
426
|
|
(defun rx-category (form)
  "Translate FORM, which is `(category CATEGORY)', into a regexp string.
CATEGORY is either an integer, which is used directly as the
category character, or a symbol whose category character is taken
from `rx-categories'.  The result is `\\c' followed by that
character."
  (rx-check form)
  (let ((category (cadr form)))
    (format "\\c%c"
            (if (integerp category)
                category
              (cdr (assq category rx-categories))))))
|
|
434
|
|
435
|
|
(defun rx-eval (form)
  "Translate FORM, which is `(eval EXPR)', into a regexp string.
EXPR is evaluated with `eval', and the value it yields is then
translated with `rx-to-string'."
  (rx-check form)
  (let ((value (eval (cadr form))))
    (rx-to-string value)))
|
|
440
|
|
441
|
|
(defun rx-greedy (form)
  "Translate FORM with a specific setting of `rx-greedy-flag'.
FORM is `(minimal-match FORM1)' or `(maximal-match FORM1)'.
While FORM1 is translated with `rx-to-string', `rx-greedy-flag' is
bound to t for `maximal-match' and to nil for any other operator,
which selects greedy or non-greedy versions of the operators that
consult that flag."
  (rx-check form)
  (let* ((operator (car form))
         (rx-greedy-flag (eq operator 'maximal-match)))
    (rx-to-string (cadr form))))
|
|
450
|
|
451
|
|
(defun rx-regexp (form)
  "Translate FORM, which is `(regexp STRING)', into a regexp string.
STRING is already in string notation; it is not quoted, only
enclosed in a shy group."
  (rx-check form)
  (let ((regexp (cadr form)))
    (concat "\\(?:" regexp "\\)")))
|
|
456
|
|
457
|
|
458 ;;;###autoload
|
|
(defun rx-to-string (form &optional no-group)
  "Translate the sexp regular expression FORM into a regexp string.
A string or a character is quoted with `regexp-quote'.
A symbol is replaced by its string definition from `rx-info', or
passed to the function registered for it.
A list is passed to the function registered for its car; the
result is enclosed in a shy group unless NO-GROUP is non-nil or
the result already begins with a group.
Signal an error for unknown symbols or operators, and for any
other kind of FORM."
  (cond ((stringp form)
         (regexp-quote form))
        ((integerp form)
         (regexp-quote (char-to-string form)))
        ((symbolp form)
         (let ((defn (rx-info form)))
           (cond ((null defn)
                  (error "Unknown Rx form `%s'" form))
                 ((stringp defn)
                  defn)
                 (t
                  (funcall (car defn) form)))))
        ((consp form)
         (let ((defn (rx-info (car form))))
           (if (not (consp defn))
               (error "Unknown Rx form `%s'" (car form)))
           (let ((regexp (funcall (car defn) form)))
             (cond (no-group
                    regexp)
                   ;; Already starts with `\\(': leave it alone.
                   ((string-match "\\`\\\\[(]" regexp)
                    regexp)
                   (t
                    (concat "\\(?:" regexp "\\)"))))))
        (t
         (error "Rx syntax error at `%s'" form))))
|
|
485
|
|
486
|
|
487 ;;;###autoload
|
|
(defmacro rx (regexp)
  "Translate a regular expression REGEXP in sexp form to a regexp string.
The macro expands into a call to `rx-to-string' with REGEXP quoted.
See also `rx-to-string' for how to do such a translation at run-time.

The following are valid subforms of regular expressions in sexp
notation.

STRING
     matches string STRING literally.

CHAR
     matches character CHAR literally.

`not-newline'
     matches any character except a newline.

`anything'
     matches any character

`(any SET)'
     matches any character in SET.  SET may be a character or string.
     Ranges of characters can be specified as `A-Z' in strings.

`(in SET)'
     like `any'.

`(not (any SET))'
     matches any character not in SET

`line-start'
     matches the empty string, but only at the beginning of a line
     in the text being matched

`line-end'
     is similar to `line-start' but matches only at the end of a line

`string-start'
     matches the empty string, but only at the beginning of the
     string being matched against.

`string-end'
     matches the empty string, but only at the end of the
     string being matched against.

`buffer-start'
     matches the empty string, but only at the beginning of the
     buffer being matched against.

`buffer-end'
     matches the empty string, but only at the end of the
     buffer being matched against.

`point'
     matches the empty string, but only at point.

`word-start'
     matches the empty string, but only at the beginning of a word.

`word-end'
     matches the empty string, but only at the end of a word.

`word-boundary'
     matches the empty string, but only at the beginning or end of a
     word.

`(not word-boundary)'
     matches the empty string, but not at the beginning or end of a
     word.

`digit'
     matches 0 through 9.

`control'
     matches ASCII control characters.

`hex-digit'
     matches 0 through 9, a through f and A through F.

`blank'
     matches space and tab only.

`graphic'
     matches graphic characters--everything except ASCII control chars,
     space, and DEL.

`printing'
     matches printing characters--everything except ASCII control chars
     and DEL.

`alphanumeric'
     matches letters and digits.  (But at present, for multibyte characters,
     it matches anything that has word syntax.)

`letter'
     matches letters.  (But at present, for multibyte characters,
     it matches anything that has word syntax.)

`ascii'
     matches ASCII (unibyte) characters.

`nonascii'
     matches non-ASCII (multibyte) characters.

`lower'
     matches anything lower-case.

`upper'
     matches anything upper-case.

`punctuation'
     matches punctuation.  (But at present, for multibyte characters,
     it matches anything that has non-word syntax.)

`space'
     matches anything that has whitespace syntax.

`word'
     matches anything that has word syntax.

`(syntax SYNTAX)'
     matches a character with syntax SYNTAX.  SYNTAX must be one
     of the following symbols.

     `whitespace'		(\\s- in string notation)
     `punctuation'		(\\s.)
     `word'			(\\sw)
     `symbol'			(\\s_)
     `open-parenthesis'		(\\s()
     `close-parenthesis'	(\\s))
     `expression-prefix'	(\\s')
     `string-quote'		(\\s\")
     `paired-delimiter'		(\\s$)
     `escape'			(\\s\\)
     `character-quote'		(\\s/)
     `comment-start'		(\\s<)
     `comment-end'		(\\s>)

`(not (syntax SYNTAX))'
     matches a character that does not have syntax SYNTAX.

`(category CATEGORY)'
     matches a character with category CATEGORY.  CATEGORY must be
     either a category character (as written after \\c in string
     notation), or one of the following symbols.

     `consonant'			(\\c0 in string notation)
     `base-vowel'			(\\c1)
     `upper-diacritical-mark'		(\\c2)
     `lower-diacritical-mark'		(\\c3)
     `tone-mark'			(\\c4)
     `symbol'				(\\c5)
     `digit'				(\\c6)
     `vowel-modifying-diacritical-mark'	(\\c7)
     `vowel-sign'			(\\c8)
     `semivowel-lower'			(\\c9)
     `not-at-end-of-line'		(\\c<)
     `not-at-beginning-of-line'		(\\c>)
     `alpha-numeric-two-byte'		(\\cA)
     `chinse-two-byte'			(\\cC)
     `greek-two-byte'			(\\cG)
     `japanese-hiragana-two-byte'	(\\cH)
     `indian-two-byte'			(\\cI)
     `japanese-katakana-two-byte'	(\\cK)
     `korean-hangul-two-byte'		(\\cN)
     `cyrillic-two-byte'		(\\cY)
     `ascii'				(\\ca)
     `arabic'				(\\cb)
     `chinese'				(\\cc)
     `ethiopic'				(\\ce)
     `greek'				(\\cg)
     `korean'				(\\ch)
     `indian'				(\\ci)
     `japanese'				(\\cj)
     `japanese-katakana'		(\\ck)
     `latin'				(\\cl)
     `lao'				(\\co)
     `tibetan'				(\\cq)
     `japanese-roman'			(\\cr)
     `thai'				(\\ct)
     `vietnamese'			(\\cv)
     `hebrew'				(\\cw)
     `cyrillic'				(\\cy)
     `can-break'			(\\c|)

`(not (category CATEGORY))'
     matches a character that does not have category CATEGORY.

`(and SEXP1 SEXP2 ...)'
     matches what SEXP1 matches, followed by what SEXP2 matches, etc.

`(submatch SEXP1 SEXP2 ...)'
     like `and', but makes the match accessible with `match-end',
     `match-beginning', and `match-string'.

`(group SEXP1 SEXP2 ...)'
     another name for `submatch'.

`(or SEXP1 SEXP2 ...)'
     matches anything that matches SEXP1 or SEXP2, etc.  If all
     args are strings, use `regexp-opt' to optimize the resulting
     regular expression.

`(minimal-match SEXP)'
     produce a non-greedy regexp for SEXP.  Normally, regexps matching
     zero or more occurrences of something are \"greedy\" in that they
     match as much as they can, as long as the overall regexp can
     still match.  A non-greedy regexp matches as little as possible.

`(maximal-match SEXP)'
     produce a greedy regexp for SEXP.  This is the default.

`(zero-or-more SEXP)'
     matches zero or more occurrences of what SEXP matches.

`(0+ SEXP)'
     like `zero-or-more'.

`(* SEXP)'
     like `zero-or-more', but always produces a greedy regexp.

`(*? SEXP)'
     like `zero-or-more', but always produces a non-greedy regexp.

`(one-or-more SEXP)'
     matches one or more occurrences of what SEXP matches.

`(1+ SEXP)'
     like `one-or-more'.

`(+ SEXP)'
     like `one-or-more', but always produces a greedy regexp.

`(+? SEXP)'
     like `one-or-more', but always produces a non-greedy regexp.

`(zero-or-one SEXP)'
     matches zero or one occurrences of what SEXP matches.

`(optional SEXP)'
     like `zero-or-one'.

`(? SEXP)'
     like `zero-or-one', but always produces a greedy regexp.

`(?? SEXP)'
     like `zero-or-one', but always produces a non-greedy regexp.

`(repeat N SEXP)'
     matches N occurrences of what SEXP matches.

`(repeat N M SEXP)'
     matches N to M occurrences of what SEXP matches.

`(eval FORM)'
     evaluate FORM and insert result.  If result is a string,
     `regexp-quote' it.

`(regexp REGEXP)'
     include REGEXP in string notation in the result."

  `(rx-to-string ',regexp))
|
|
749
|
|
750
|
|
751 (provide 'rx)
|
|
752
|
|
753 ;;; rx.el ends here
|