Mercurial > emacs
annotate src/regex.c @ 4413:5a00cec8e9b0
(fill-region-as-paragraph): When we take one word
after the fill column, don't stop at period with just one space.
When checking whether at beginning of line, if no fill prefix,
ignore intervening whitespace.
author | Richard M. Stallman <rms@gnu.org> |
---|---|
date | Mon, 02 Aug 1993 05:55:56 +0000 |
parents | d423b5ec9091 |
children | df5046c4e18b |
rev | line source |
---|---|
1155 | 1 /* Extended regular expression matching and search library, |
2454 | 2 version 0.12. |
1155 | 3 (Implements POSIX draft P10003.2/D11.2, except for |
4 internationalization features.) | |
5 | |
1738 | 6 Copyright (C) 1993 Free Software Foundation, Inc. |
1155 | 7 |
8 This program is free software; you can redistribute it and/or modify | |
9 it under the terms of the GNU General Public License as published by | |
10 the Free Software Foundation; either version 2, or (at your option) | |
11 any later version. | |
12 | |
13 This program is distributed in the hope that it will be useful, | |
14 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 GNU General Public License for more details. | |
17 | |
18 You should have received a copy of the GNU General Public License | |
19 along with this program; if not, write to the Free Software | |
20 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ | |
21 | |
22 /* AIX requires this to be the first thing in the file. */ | |
23 #if defined (_AIX) && !defined (REGEX_MALLOC) | |
24 #pragma alloca | |
25 #endif | |
26 | |
27 #define _GNU_SOURCE | |
28 | |
29 /* We need this for `regex.h', and perhaps for the Emacs include files. */ | |
30 #include <sys/types.h> | |
31 | |
1669 | 32 #ifdef HAVE_CONFIG_H |
1645
fb092d69da76
*** empty log message ***
David J. MacKenzie <djm@gnu.org>
parents:
1642
diff
changeset
|
33 #include "config.h" |
fb092d69da76
*** empty log message ***
David J. MacKenzie <djm@gnu.org>
parents:
1642
diff
changeset
|
34 #endif |
fb092d69da76
*** empty log message ***
David J. MacKenzie <djm@gnu.org>
parents:
1642
diff
changeset
|
35 |
1155 | 36 /* The `emacs' switch turns on certain matching commands |
37 that make sense only in Emacs. */ | |
38 #ifdef emacs | |
39 | |
40 #include "lisp.h" | |
41 #include "buffer.h" | |
42 #include "syntax.h" | |
43 | |
44 /* Emacs uses `NULL' as a predicate. */ | |
45 #undef NULL | |
46 | |
47 #else /* not emacs */ | |
48 | |
3766 | 49 #ifdef STDC_HEADERS |
50 #include <stdlib.h> | |
51 #else | |
52 char *malloc (); | |
53 char *realloc (); | |
54 #endif | |
55 | |
56 | |
1155 | 57 /* We used to test for `BSTRING' here, but only GCC and Emacs define |
58 `BSTRING', as far as I know, and neither of them use this code. */ | |
1641
47ae0840b2b9
*** empty log message ***
David J. MacKenzie <djm@gnu.org>
parents:
1637
diff
changeset
|
59 #if HAVE_STRING_H || STDC_HEADERS |
1155 | 60 #include <string.h> |
1637 | 61 #ifndef bcmp |
1155 | 62 #define bcmp(s1, s2, n) memcmp ((s1), (s2), (n)) |
1637 | 63 #endif |
64 #ifndef bcopy | |
1155 | 65 #define bcopy(s, d, n) memcpy ((d), (s), (n)) |
1637 | 66 #endif |
67 #ifndef bzero | |
1155 | 68 #define bzero(s, n) memset ((s), 0, (n)) |
1637 | 69 #endif |
1155 | 70 #else |
71 #include <strings.h> | |
72 #endif | |
73 | |
74 /* Define the syntax stuff for \<, \>, etc. */ | |
75 | |
76 /* This must be nonzero for the wordchar and notwordchar pattern | |
77 commands in re_match_2. */ | |
78 #ifndef Sword | |
79 #define Sword 1 | |
80 #endif | |
81 | |
82 #ifdef SYNTAX_TABLE | |
83 | |
84 extern char *re_syntax_table; | |
85 | |
86 #else /* not SYNTAX_TABLE */ | |
87 | |
88 /* How many characters in the character set. */ | |
89 #define CHAR_SET_SIZE 256 | |
90 | |
91 static char re_syntax_table[CHAR_SET_SIZE]; | |
92 | |
93 static void | |
94 init_syntax_once () | |
95 { | |
96 register int c; | |
97 static int done = 0; | |
98 | |
99 if (done) | |
100 return; | |
101 | |
102 bzero (re_syntax_table, sizeof re_syntax_table); | |
103 | |
104 for (c = 'a'; c <= 'z'; c++) | |
105 re_syntax_table[c] = Sword; | |
106 | |
107 for (c = 'A'; c <= 'Z'; c++) | |
108 re_syntax_table[c] = Sword; | |
109 | |
110 for (c = '0'; c <= '9'; c++) | |
111 re_syntax_table[c] = Sword; | |
112 | |
113 re_syntax_table['_'] = Sword; | |
114 | |
115 done = 1; | |
116 } | |
117 | |
118 #endif /* not SYNTAX_TABLE */ | |
119 | |
120 #define SYNTAX(c) re_syntax_table[c] | |
121 | |
122 #endif /* not emacs */ | |
123 | |
124 /* Get the interface, including the syntax bits. */ | |
125 #include "regex.h" | |
126 | |
127 /* isalpha etc. are used for the character classes. */ | |
128 #include <ctype.h> | |
1668 | 129 |
2465 | 130 /* Jim Meyering writes: |
131 | |
132 "... Some ctype macros are valid only for character codes that | |
133 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when | |
134 using /bin/cc or gcc but without giving an ansi option). So, all | |
135 ctype uses should be through macros like ISPRINT... If | |
136 STDC_HEADERS is defined, then autoconf has verified that the ctype | |
137 macros don't need to be guarded with references to isascii. ... | |
138 Defining isascii to 1 should let any compiler worth its salt | |
139 eliminate the && through constant folding." */ | |
140 #if ! defined (isascii) || defined (STDC_HEADERS) | |
141 #undef isascii | |
1668 | 142 #define isascii(c) 1 |
143 #endif | |
144 | |
145 #ifdef isblank | |
146 #define ISBLANK(c) (isascii (c) && isblank (c)) | |
147 #else | |
148 #define ISBLANK(c) ((c) == ' ' || (c) == '\t') | |
1155 | 149 #endif |
1668 | 150 #ifdef isgraph |
151 #define ISGRAPH(c) (isascii (c) && isgraph (c)) | |
152 #else | |
153 #define ISGRAPH(c) (isascii (c) && isprint (c) && !isspace (c)) | |
1155 | 154 #endif |
155 | |
1668 | 156 #define ISPRINT(c) (isascii (c) && isprint (c)) |
157 #define ISDIGIT(c) (isascii (c) && isdigit (c)) | |
158 #define ISALNUM(c) (isascii (c) && isalnum (c)) | |
159 #define ISALPHA(c) (isascii (c) && isalpha (c)) | |
160 #define ISCNTRL(c) (isascii (c) && iscntrl (c)) | |
161 #define ISLOWER(c) (isascii (c) && islower (c)) | |
162 #define ISPUNCT(c) (isascii (c) && ispunct (c)) | |
163 #define ISSPACE(c) (isascii (c) && isspace (c)) | |
164 #define ISUPPER(c) (isascii (c) && isupper (c)) | |
165 #define ISXDIGIT(c) (isascii (c) && isxdigit (c)) | |
166 | |
1155 | 167 #ifndef NULL |
168 #define NULL 0 | |
169 #endif | |
170 | |
171 /* We remove any previous definition of `SIGN_EXTEND_CHAR', | |
172 since ours (we hope) works properly with all combinations of | |
173 machines, compilers, `char' and `unsigned char' argument types. | |
174 (Per Bothner suggested the basic approach.) */ | |
175 #undef SIGN_EXTEND_CHAR | |
176 #if __STDC__ | |
177 #define SIGN_EXTEND_CHAR(c) ((signed char) (c)) | |
1637 | 178 #else /* not __STDC__ */ |
1155 | 179 /* As in Harbison and Steele. */ |
180 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128) | |
181 #endif | |
182 | |
183 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we | |
184 use `alloca' instead of `malloc'. This is because using malloc in | |
185 re_search* or re_match* could cause memory leaks when C-g is used in | |
186 Emacs; also, malloc is slower and causes storage fragmentation. On | |
187 the other hand, malloc is more portable, and easier to debug. | |
188 | |
189 Because we sometimes use alloca, some routines have to be macros, | |
190 not functions -- `alloca'-allocated space disappears at the end of the | |
191 function it is called in. */ | |
192 | |
193 #ifdef REGEX_MALLOC | |
194 | |
195 #define REGEX_ALLOCATE malloc | |
196 #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize) | |
197 | |
198 #else /* not REGEX_MALLOC */ | |
199 | |
200 /* Emacs already defines alloca, sometimes. */ | |
201 #ifndef alloca | |
202 | |
203 /* Make alloca work the best possible way. */ | |
204 #ifdef __GNUC__ | |
205 #define alloca __builtin_alloca | |
206 #else /* not __GNUC__ */ | |
207 #if HAVE_ALLOCA_H | |
208 #include <alloca.h> | |
209 #else /* not __GNUC__ or HAVE_ALLOCA_H */ | |
210 #ifndef _AIX /* Already did AIX, up at the top. */ | |
211 char *alloca (); | |
212 #endif /* not _AIX */ | |
213 #endif /* not HAVE_ALLOCA_H */ | |
214 #endif /* not __GNUC__ */ | |
215 | |
216 #endif /* not alloca */ | |
217 | |
218 #define REGEX_ALLOCATE alloca | |
219 | |
220 /* Assumes a `char *destination' variable. */ | |
221 #define REGEX_REALLOCATE(source, osize, nsize) \ | |
222 (destination = (char *) alloca (nsize), \ | |
223 bcopy (source, destination, osize), \ | |
224 destination) | |
225 | |
226 #endif /* not REGEX_MALLOC */ | |
227 | |
228 | |
229 /* True if `size1' is non-NULL and PTR is pointing anywhere inside | |
230 `string1' or just past its end. This works if PTR is NULL, which is | |
231 a good thing. */ | |
232 #define FIRST_STRING_P(ptr) \ | |
233 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1) | |
234 | |
235 /* (Re)Allocate N items of type T using malloc, or fail. */ | |
236 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t))) | |
237 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t))) | |
2949 | 238 #define RETALLOC_IF(addr, n, t) \ |
239 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t) | |
1155 | 240 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t))) |
241 | |
242 #define BYTEWIDTH 8 /* In bits. */ | |
243 | |
244 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0)) | |
245 | |
246 #define MAX(a, b) ((a) > (b) ? (a) : (b)) | |
247 #define MIN(a, b) ((a) < (b) ? (a) : (b)) | |
248 | |
249 typedef char boolean; | |
250 #define false 0 | |
251 #define true 1 | |
252 | |
253 /* These are the command codes that appear in compiled regular | |
254 expressions. Some opcodes are followed by argument bytes. A | |
255 command code can specify any interpretation whatsoever for its | |
256 arguments. Zero bytes may appear in the compiled regular expression. | |
257 | |
258 The value of `exactn' is needed in search.c (search_buffer) in Emacs. | |
259 So regex.h defines a symbol `RE_EXACTN_VALUE' to be 1; the value of | |
260 `exactn' we use here must also be 1. */ | |
261 | |
262 typedef enum | |
263 { | |
264 no_op = 0, | |
265 | |
266 /* Followed by one byte giving n, then by n literal bytes. */ | |
267 exactn = 1, | |
268 | |
269 /* Matches any (more or less) character. */ | |
270 anychar, | |
271 | |
272 /* Matches any one char belonging to specified set. First | |
273 following byte is number of bitmap bytes. Then come bytes | |
274 for a bitmap saying which chars are in. Bits in each byte | |
275 are ordered low-bit-first. A character is in the set if its | |
276 bit is 1. A character too large to have a bit in the map is | |
277 automatically not in the set. */ | |
278 charset, | |
279 | |
280 /* Same parameters as charset, but match any character that is | |
281 not one of those specified. */ | |
282 charset_not, | |
283 | |
284 /* Start remembering the text that is matched, for storing in a | |
285 register. Followed by one byte with the register number, in | |
286 the range 0 to one less than the pattern buffer's re_nsub | |
287 field. Then followed by one byte with the number of groups | |
288 inner to this one. (This last has to be part of the | |
289 start_memory only because we need it in the on_failure_jump | |
290 of re_match_2.) */ | |
291 start_memory, | |
292 | |
293 /* Stop remembering the text that is matched and store it in a | |
294 memory register. Followed by one byte with the register | |
295 number, in the range 0 to one less than `re_nsub' in the | |
296 pattern buffer, and one byte with the number of inner groups, | |
297 just like `start_memory'. (We need the number of inner | |
298 groups here because we don't have any easy way of finding the | |
299 corresponding start_memory when we're at a stop_memory.) */ | |
300 stop_memory, | |
301 | |
302 /* Match a duplicate of something remembered. Followed by one | |
303 byte containing the register number. */ | |
304 duplicate, | |
305 | |
306 /* Fail unless at beginning of line. */ | |
307 begline, | |
308 | |
309 /* Fail unless at end of line. */ | |
310 endline, | |
311 | |
312 /* Succeeds if at beginning of buffer (if emacs) or at beginning | |
313 of string to be matched (if not). */ | |
314 begbuf, | |
315 | |
316 /* Analogously, for end of buffer/string. */ | |
317 endbuf, | |
318 | |
319 /* Followed by two byte relative address to which to jump. */ | |
320 jump, | |
321 | |
322 /* Same as jump, but marks the end of an alternative. */ | |
323 jump_past_alt, | |
324 | |
325 /* Followed by two-byte relative address of place to resume at | |
326 in case of failure. */ | |
327 on_failure_jump, | |
328 | |
329 /* Like on_failure_jump, but pushes a placeholder instead of the | |
330 current string position when executed. */ | |
331 on_failure_keep_string_jump, | |
332 | |
333 /* Throw away latest failure point and then jump to following | |
334 two-byte relative address. */ | |
335 pop_failure_jump, | |
336 | |
337 /* Change to pop_failure_jump if know won't have to backtrack to | |
338 match; otherwise change to jump. This is used to jump | |
339 back to the beginning of a repeat. If what follows this jump | |
340 clearly won't match what the repeat does, such that we can be | |
341 sure that there is no use backtracking out of repetitions | |
342 already matched, then we change it to a pop_failure_jump. | |
343 Followed by two-byte address. */ | |
344 maybe_pop_jump, | |
345 | |
346 /* Jump to following two-byte address, and push a dummy failure | |
347 point. This failure point will be thrown away if an attempt | |
348 is made to use it for a failure. A `+' construct makes this | |
349 before the first repeat. Also used as an intermediary kind | |
350 of jump when compiling an alternative. */ | |
351 dummy_failure_jump, | |
352 | |
353 /* Push a dummy failure point and continue. Used at the end of | |
354 alternatives. */ | |
355 push_dummy_failure, | |
356 | |
357 /* Followed by two-byte relative address and two-byte number n. | |
358 After matching N times, jump to the address upon failure. */ | |
359 succeed_n, | |
360 | |
361 /* Followed by two-byte relative address, and two-byte number n. | |
362 Jump to the address N times, then fail. */ | |
363 jump_n, | |
364 | |
365 /* Set the following two-byte relative address to the | |
366 subsequent two-byte number. The address *includes* the two | |
367 bytes of number. */ | |
368 set_number_at, | |
369 | |
370 wordchar, /* Matches any word-constituent character. */ | |
371 notwordchar, /* Matches any char that is not a word-constituent. */ | |
372 | |
373 wordbeg, /* Succeeds if at word beginning. */ | |
374 wordend, /* Succeeds if at word end. */ | |
375 | |
376 wordbound, /* Succeeds if at a word boundary. */ | |
377 notwordbound /* Succeeds if not at a word boundary. */ | |
378 | |
379 #ifdef emacs | |
380 ,before_dot, /* Succeeds if before point. */ | |
381 at_dot, /* Succeeds if at point. */ | |
382 after_dot, /* Succeeds if after point. */ | |
383 | |
384 /* Matches any character whose syntax is specified. Followed by | |
385 a byte which contains a syntax code, e.g., Sword. */ | |
386 syntaxspec, | |
387 | |
388 /* Matches any character whose syntax is not that specified. */ | |
389 notsyntaxspec | |
390 #endif /* emacs */ | |
391 } re_opcode_t; | |
392 | |
393 /* Common operations on the compiled pattern. */ | |
394 | |
395 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */ | |
396 | |
397 #define STORE_NUMBER(destination, number) \ | |
398 do { \ | |
399 (destination)[0] = (number) & 0377; \ | |
400 (destination)[1] = (number) >> 8; \ | |
401 } while (0) | |
402 | |
403 /* Same as STORE_NUMBER, except increment DESTINATION to | |
404 the byte after where the number is stored. Therefore, DESTINATION | |
405 must be an lvalue. */ | |
406 | |
407 #define STORE_NUMBER_AND_INCR(destination, number) \ | |
408 do { \ | |
409 STORE_NUMBER (destination, number); \ | |
410 (destination) += 2; \ | |
411 } while (0) | |
412 | |
413 /* Put into DESTINATION a number stored in two contiguous bytes starting | |
414 at SOURCE. */ | |
415 | |
416 #define EXTRACT_NUMBER(destination, source) \ | |
417 do { \ | |
418 (destination) = *(source) & 0377; \ | |
419 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \ | |
420 } while (0) | |
421 | |
422 #ifdef DEBUG | |
423 static void | |
424 extract_number (dest, source) | |
425 int *dest; | |
426 unsigned char *source; | |
427 { | |
428 int temp = SIGN_EXTEND_CHAR (*(source + 1)); | |
429 *dest = *source & 0377; | |
430 *dest += temp << 8; | |
431 } | |
432 | |
433 #ifndef EXTRACT_MACROS /* To debug the macros. */ | |
434 #undef EXTRACT_NUMBER | |
435 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src) | |
436 #endif /* not EXTRACT_MACROS */ | |
437 | |
438 #endif /* DEBUG */ | |
439 | |
440 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number. | |
441 SOURCE must be an lvalue. */ | |
442 | |
443 #define EXTRACT_NUMBER_AND_INCR(destination, source) \ | |
444 do { \ | |
445 EXTRACT_NUMBER (destination, source); \ | |
446 (source) += 2; \ | |
447 } while (0) | |
448 | |
449 #ifdef DEBUG | |
450 static void | |
451 extract_number_and_incr (destination, source) | |
452 int *destination; | |
453 unsigned char **source; | |
454 { | |
455 extract_number (destination, *source); | |
456 *source += 2; | |
457 } | |
458 | |
459 #ifndef EXTRACT_MACROS | |
460 #undef EXTRACT_NUMBER_AND_INCR | |
461 #define EXTRACT_NUMBER_AND_INCR(dest, src) \ | |
462 extract_number_and_incr (&dest, &src) | |
463 #endif /* not EXTRACT_MACROS */ | |
464 | |
465 #endif /* DEBUG */ | |
466 | |
467 /* If DEBUG is defined, Regex prints many voluminous messages about what | |
468 it is doing (if the variable `debug' is nonzero). If linked with the | |
469 main program in `iregex.c', you can enter patterns and strings | |
470 interactively. And if linked with the main program in `main.c' and | |
471 the other test files, you can run the already-written tests. */ | |
472 | |
473 #ifdef DEBUG | |
474 | |
475 /* We use standard I/O for debugging. */ | |
476 #include <stdio.h> | |
477 | |
478 /* It is useful to test things that ``must'' be true when debugging. */ | |
479 #include <assert.h> | |
480 | |
481 static int debug = 0; | |
482 | |
483 #define DEBUG_STATEMENT(e) e | |
484 #define DEBUG_PRINT1(x) if (debug) printf (x) | |
485 #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2) | |
486 #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3) | |
1637 | 487 #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4) |
1155 | 488 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \ |
489 if (debug) print_partial_compiled_pattern (s, e) | |
490 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \ | |
491 if (debug) print_double_string (w, s1, sz1, s2, sz2) | |
492 | |
493 | |
494 extern void printchar (); | |
495 | |
496 /* Print the fastmap in human-readable form. */ | |
497 | |
498 void | |
499 print_fastmap (fastmap) | |
500 char *fastmap; | |
501 { | |
502 unsigned was_a_range = 0; | |
503 unsigned i = 0; | |
504 | |
505 while (i < (1 << BYTEWIDTH)) | |
506 { | |
507 if (fastmap[i++]) | |
508 { | |
509 was_a_range = 0; | |
510 printchar (i - 1); | |
511 while (i < (1 << BYTEWIDTH) && fastmap[i]) | |
512 { | |
513 was_a_range = 1; | |
514 i++; | |
515 } | |
516 if (was_a_range) | |
517 { | |
518 printf ("-"); | |
519 printchar (i - 1); | |
520 } | |
521 } | |
522 } | |
523 putchar ('\n'); | |
524 } | |
525 | |
526 | |
527 /* Print a compiled pattern string in human-readable form, starting at | |
528 the START pointer into it and ending just before the pointer END. */ | |
529 | |
530 void | |
531 print_partial_compiled_pattern (start, end) | |
532 unsigned char *start; | |
533 unsigned char *end; | |
534 { | |
535 int mcnt, mcnt2; | |
536 unsigned char *p = start; | |
537 unsigned char *pend = end; | |
538 | |
539 if (start == NULL) | |
540 { | |
541 printf ("(null)\n"); | |
542 return; | |
543 } | |
544 | |
545 /* Loop over pattern commands. */ | |
546 while (p < pend) | |
547 { | |
2615 | 548 printf ("%d:\t", p - start); |
549 | |
1155 | 550 switch ((re_opcode_t) *p++) |
551 { | |
552 case no_op: | |
553 printf ("/no_op"); | |
554 break; | |
555 | |
556 case exactn: | |
557 mcnt = *p++; | |
558 printf ("/exactn/%d", mcnt); | |
559 do | |
560 { | |
561 putchar ('/'); | |
562 printchar (*p++); | |
563 } | |
564 while (--mcnt); | |
565 break; | |
566 | |
567 case start_memory: | |
568 mcnt = *p++; | |
569 printf ("/start_memory/%d/%d", mcnt, *p++); | |
570 break; | |
571 | |
572 case stop_memory: | |
573 mcnt = *p++; | |
574 printf ("/stop_memory/%d/%d", mcnt, *p++); | |
575 break; | |
576 | |
577 case duplicate: | |
578 printf ("/duplicate/%d", *p++); | |
579 break; | |
580 | |
581 case anychar: | |
582 printf ("/anychar"); | |
583 break; | |
584 | |
585 case charset: | |
586 case charset_not: | |
587 { | |
2615 | 588 register int c, last = -100; |
589 register int in_range = 0; | |
590 | |
591 printf ("/charset [%s", | |
592 (re_opcode_t) *(p - 1) == charset_not ? "^" : ""); | |
1155 | 593 |
594 assert (p + *p < pend); | |
595 | |
2615 | 596 for (c = 0; c < 256; c++) |
597 if (c / 8 < *p | |
598 && (p[1 + (c/8)] & (1 << (c % 8)))) | |
599 { | |
600 /* Are we starting a range? */ | |
601 if (last + 1 == c && ! in_range) | |
602 { | |
603 putchar ('-'); | |
604 in_range = 1; | |
605 } | |
606 /* Have we broken a range? */ | |
607 else if (last + 1 != c && in_range) | |
1155 | 608 { |
2615 | 609 printchar (last); |
610 in_range = 0; | |
611 } | |
1155 | 612 |
2615 | 613 if (! in_range) |
614 printchar (c); | |
615 | |
616 last = c; | |
1155 | 617 } |
2615 | 618 |
619 if (in_range) | |
620 printchar (last); | |
621 | |
622 putchar (']'); | |
623 | |
1155 | 624 p += 1 + *p; |
625 } | |
2615 | 626 break; |
1155 | 627 |
628 case begline: | |
629 printf ("/begline"); | |
630 break; | |
631 | |
632 case endline: | |
633 printf ("/endline"); | |
634 break; | |
635 | |
636 case on_failure_jump: | |
637 extract_number_and_incr (&mcnt, &p); | |
2615 | 638 printf ("/on_failure_jump to %d", p + mcnt - start); |
1155 | 639 break; |
640 | |
641 case on_failure_keep_string_jump: | |
642 extract_number_and_incr (&mcnt, &p); | |
2615 | 643 printf ("/on_failure_keep_string_jump to %d", p + mcnt - start); |
1155 | 644 break; |
645 | |
646 case dummy_failure_jump: | |
647 extract_number_and_incr (&mcnt, &p); | |
2615 | 648 printf ("/dummy_failure_jump to %d", p + mcnt - start); |
1155 | 649 break; |
650 | |
651 case push_dummy_failure: | |
652 printf ("/push_dummy_failure"); | |
653 break; | |
654 | |
655 case maybe_pop_jump: | |
656 extract_number_and_incr (&mcnt, &p); | |
2615 | 657 printf ("/maybe_pop_jump to %d", p + mcnt - start); |
1155 | 658 break; |
659 | |
660 case pop_failure_jump: | |
661 extract_number_and_incr (&mcnt, &p); | |
2615 | 662 printf ("/pop_failure_jump to %d", p + mcnt - start); |
1155 | 663 break; |
664 | |
665 case jump_past_alt: | |
666 extract_number_and_incr (&mcnt, &p); | |
2615 | 667 printf ("/jump_past_alt to %d", p + mcnt - start); |
1155 | 668 break; |
669 | |
670 case jump: | |
671 extract_number_and_incr (&mcnt, &p); | |
2615 | 672 printf ("/jump to %d", p + mcnt - start); |
1155 | 673 break; |
674 | |
675 case succeed_n: | |
676 extract_number_and_incr (&mcnt, &p); | |
677 extract_number_and_incr (&mcnt2, &p); | |
2615 | 678 printf ("/succeed_n to %d, %d times", p + mcnt - start, mcnt2); |
1155 | 679 break; |
680 | |
681 case jump_n: | |
682 extract_number_and_incr (&mcnt, &p); | |
683 extract_number_and_incr (&mcnt2, &p); | |
2615 | 684 printf ("/jump_n to %d, %d times", p + mcnt - start, mcnt2); |
1155 | 685 break; |
686 | |
687 case set_number_at: | |
688 extract_number_and_incr (&mcnt, &p); | |
689 extract_number_and_incr (&mcnt2, &p); | |
2615 | 690 printf ("/set_number_at location %d to %d", p + mcnt - start, mcnt2); |
1155 | 691 break; |
692 | |
693 case wordbound: | |
694 printf ("/wordbound"); | |
695 break; | |
696 | |
697 case notwordbound: | |
698 printf ("/notwordbound"); | |
699 break; | |
700 | |
701 case wordbeg: | |
702 printf ("/wordbeg"); | |
703 break; | |
704 | |
705 case wordend: | |
706 printf ("/wordend"); | |
707 | |
708 #ifdef emacs | |
709 case before_dot: | |
710 printf ("/before_dot"); | |
711 break; | |
712 | |
713 case at_dot: | |
714 printf ("/at_dot"); | |
715 break; | |
716 | |
717 case after_dot: | |
718 printf ("/after_dot"); | |
719 break; | |
720 | |
721 case syntaxspec: | |
722 printf ("/syntaxspec"); | |
723 mcnt = *p++; | |
724 printf ("/%d", mcnt); | |
725 break; | |
726 | |
727 case notsyntaxspec: | |
728 printf ("/notsyntaxspec"); | |
729 mcnt = *p++; | |
730 printf ("/%d", mcnt); | |
731 break; | |
732 #endif /* emacs */ | |
733 | |
734 case wordchar: | |
735 printf ("/wordchar"); | |
736 break; | |
737 | |
738 case notwordchar: | |
739 printf ("/notwordchar"); | |
740 break; | |
741 | |
742 case begbuf: | |
743 printf ("/begbuf"); | |
744 break; | |
745 | |
746 case endbuf: | |
747 printf ("/endbuf"); | |
748 break; | |
749 | |
750 default: | |
751 printf ("?%d", *(p-1)); | |
752 } | |
2615 | 753 |
754 putchar ('\n'); | |
1155 | 755 } |
2615 | 756 |
757 printf ("%d:\tend of pattern.\n", p - start); | |
1155 | 758 } |
759 | |
760 | |
761 void | |
762 print_compiled_pattern (bufp) | |
763 struct re_pattern_buffer *bufp; | |
764 { | |
765 unsigned char *buffer = bufp->buffer; | |
766 | |
767 print_partial_compiled_pattern (buffer, buffer + bufp->used); | |
768 printf ("%d bytes used/%d bytes allocated.\n", bufp->used, bufp->allocated); | |
769 | |
770 if (bufp->fastmap_accurate && bufp->fastmap) | |
771 { | |
772 printf ("fastmap: "); | |
773 print_fastmap (bufp->fastmap); | |
774 } | |
775 | |
776 printf ("re_nsub: %d\t", bufp->re_nsub); | |
777 printf ("regs_alloc: %d\t", bufp->regs_allocated); | |
778 printf ("can_be_null: %d\t", bufp->can_be_null); | |
779 printf ("newline_anchor: %d\n", bufp->newline_anchor); | |
780 printf ("no_sub: %d\t", bufp->no_sub); | |
781 printf ("not_bol: %d\t", bufp->not_bol); | |
782 printf ("not_eol: %d\t", bufp->not_eol); | |
783 printf ("syntax: %d\n", bufp->syntax); | |
784 /* Perhaps we should print the translate table? */ | |
785 } | |
786 | |
787 | |
788 void | |
789 print_double_string (where, string1, size1, string2, size2) | |
790 const char *where; | |
791 const char *string1; | |
792 const char *string2; | |
793 int size1; | |
794 int size2; | |
795 { | |
796 unsigned this_char; | |
797 | |
798 if (where == NULL) | |
799 printf ("(null)"); | |
800 else | |
801 { | |
802 if (FIRST_STRING_P (where)) | |
803 { | |
804 for (this_char = where - string1; this_char < size1; this_char++) | |
805 printchar (string1[this_char]); | |
806 | |
807 where = string2; | |
808 } | |
809 | |
810 for (this_char = where - string2; this_char < size2; this_char++) | |
811 printchar (string2[this_char]); | |
812 } | |
813 } | |
814 | |
815 #else /* not DEBUG */ | |
816 | |
817 #undef assert | |
818 #define assert(e) | |
819 | |
820 #define DEBUG_STATEMENT(e) | |
821 #define DEBUG_PRINT1(x) | |
822 #define DEBUG_PRINT2(x1, x2) | |
823 #define DEBUG_PRINT3(x1, x2, x3) | |
1637 | 824 #define DEBUG_PRINT4(x1, x2, x3, x4) |
1155 | 825 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) |
826 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) | |
827 | |
828 #endif /* not DEBUG */ | |
829 | |
830 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can | |
831 also be assigned to arbitrarily: each pattern buffer stores its own | |
832 syntax, so it can be changed between regex compilations. */ | |
833 reg_syntax_t re_syntax_options = RE_SYNTAX_EMACS; | |
834 | |
835 | |
836 /* Specify the precise syntax of regexps for compilation. This provides | |
837 for compatibility for various utilities which historically have | |
838 different, incompatible syntaxes. | |
839 | |
840 The argument SYNTAX is a bit mask comprised of the various bits | |
841 defined in regex.h. We return the old syntax. */ | |
842 | |
843 reg_syntax_t | |
844 re_set_syntax (syntax) | |
845 reg_syntax_t syntax; | |
846 { | |
847 reg_syntax_t ret = re_syntax_options; | |
848 | |
849 re_syntax_options = syntax; | |
850 return ret; | |
851 } | |
852 | |
853 /* This table gives an error message for each of the error codes listed | |
854 in regex.h. Obviously the order here has to be same as there. */ | |
855 | |
856 static const char *re_error_msg[] = | |
857 { NULL, /* REG_NOERROR */ | |
858 "No match", /* REG_NOMATCH */ | |
859 "Invalid regular expression", /* REG_BADPAT */ | |
860 "Invalid collation character", /* REG_ECOLLATE */ | |
861 "Invalid character class name", /* REG_ECTYPE */ | |
862 "Trailing backslash", /* REG_EESCAPE */ | |
863 "Invalid back reference", /* REG_ESUBREG */ | |
864 "Unmatched [ or [^", /* REG_EBRACK */ | |
865 "Unmatched ( or \\(", /* REG_EPAREN */ | |
866 "Unmatched \\{", /* REG_EBRACE */ | |
867 "Invalid content of \\{\\}", /* REG_BADBR */ | |
868 "Invalid range end", /* REG_ERANGE */ | |
869 "Memory exhausted", /* REG_ESPACE */ | |
870 "Invalid preceding regular expression", /* REG_BADRPT */ | |
871 "Premature end of regular expression", /* REG_EEND */ | |
872 "Regular expression too big", /* REG_ESIZE */ | |
873 "Unmatched ) or \\)", /* REG_ERPAREN */ | |
874 }; | |
875 | |
2949 | 876 /* Avoiding alloca during matching, to placate r_alloc. */ |
877 | |
2952 | 878 /* Define MATCH_MAY_ALLOCATE if we need to make sure that the |
2949 | 879 searching and matching functions should not call alloca. On some |
880 systems, alloca is implemented in terms of malloc, and if we're | |
881 using the relocating allocator routines, then malloc could cause a | |
882 relocation, which might (if the strings being searched are in the | |
883 ralloc heap) shift the data out from underneath the regexp | |
3614 | 884 routines. |
885 | |
886 Here's another reason to avoid allocation: Emacs insists on | |
887 processing input from X in a signal handler; processing X input may | |
888 call malloc; if input arrives while a matching routine is calling | |
889 malloc, then we're scrod. But Emacs can't just block input while | |
890 calling matching routines; then we don't notice interrupts when | |
891 they come in. So, Emacs blocks input around all regexp calls | |
892 except the matching calls, which it leaves unprotected, in the | |
893 faith that they will not malloc. */ | |
2952 | 894 |
895 /* Normally, this is fine. */ | |
896 #define MATCH_MAY_ALLOCATE | |
897 | |
898 /* But under some circumstances, it's not. */ | |
3614 | 899 #if defined (emacs) || (defined (REL_ALLOC) && defined (C_ALLOCA)) |
2952 | 900 #undef MATCH_MAY_ALLOCATE |
2949 | 901 #endif |
902 | |
903 | |
904 /* Failure stack declarations and macros; both re_compile_fastmap and | |
905 re_match_2 use a failure stack. These have to be macros because of | |
906 REGEX_ALLOCATE. */ | |
907 | |
908 | |
909 /* Number of failure points for which to initially allocate space | |
910 when matching. If this number is exceeded, we allocate more | |
911 space, so it is not a hard limit. */ | |
912 #ifndef INIT_FAILURE_ALLOC | |
913 #define INIT_FAILURE_ALLOC 5 | |
914 #endif | |
915 | |
916 /* Roughly the maximum number of failure points on the stack. Would be | |
917 exactly that if always used MAX_FAILURE_SPACE each time we failed. | |
918 This is a variable only so users of regex can assign to it; we never | |
919 change it ourselves. */ | |
920 int re_max_failures = 2000; | |
921 | |
922 typedef const unsigned char *fail_stack_elt_t; | |
923 | |
924 typedef struct | |
925 { | |
926 fail_stack_elt_t *stack; | |
927 unsigned size; | |
928 unsigned avail; /* Offset of next open position. */ | |
929 } fail_stack_type; | |
930 | |
931 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0) | |
932 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0) | |
933 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size) | |
934 #define FAIL_STACK_TOP() (fail_stack.stack[fail_stack.avail]) | |
935 | |
936 | |
937 /* Initialize `fail_stack'. Do `return -2' if the alloc fails. */ | |
938 | |
2952 | 939 #ifdef MATCH_MAY_ALLOCATE |
2949 | 940 #define INIT_FAIL_STACK() \ |
941 do { \ | |
942 fail_stack.stack = (fail_stack_elt_t *) \ | |
943 REGEX_ALLOCATE (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \ | |
944 \ | |
945 if (fail_stack.stack == NULL) \ | |
946 return -2; \ | |
947 \ | |
948 fail_stack.size = INIT_FAILURE_ALLOC; \ | |
949 fail_stack.avail = 0; \ | |
950 } while (0) | |
951 #else | |
952 #define INIT_FAIL_STACK() \ | |
953 do { \ | |
954 fail_stack.avail = 0; \ | |
955 } while (0) | |
956 #endif | |
957 | |
958 | |
959 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items. | |
960 | |
961 Return 1 if succeeds, and 0 if either ran out of memory | |
962 allocating space for it or it was already too large. | |
963 | |
964 REGEX_REALLOCATE requires `destination' be declared. */ | |
965 | |
966 #define DOUBLE_FAIL_STACK(fail_stack) \ | |
967 ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \ | |
968 ? 0 \ | |
969 : ((fail_stack).stack = (fail_stack_elt_t *) \ | |
970 REGEX_REALLOCATE ((fail_stack).stack, \ | |
971 (fail_stack).size * sizeof (fail_stack_elt_t), \ | |
972 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \ | |
973 \ | |
974 (fail_stack).stack == NULL \ | |
975 ? 0 \ | |
976 : ((fail_stack).size <<= 1, \ | |
977 1))) | |
978 | |
979 | |
980 /* Push PATTERN_OP on FAIL_STACK. | |
981 | |
982 Return 1 if was able to do so and 0 if ran out of memory allocating | |
983 space to do so. */ | |
984 #define PUSH_PATTERN_OP(pattern_op, fail_stack) \ | |
985 ((FAIL_STACK_FULL () \ | |
986 && !DOUBLE_FAIL_STACK (fail_stack)) \ | |
987 ? 0 \ | |
988 : ((fail_stack).stack[(fail_stack).avail++] = pattern_op, \ | |
989 1)) | |
990 | |
991 /* This pushes an item onto the failure stack. Must be a four-byte | |
992 value. Assumes the variable `fail_stack'. Probably should only | |
993 be called from within `PUSH_FAILURE_POINT'. */ | |
994 #define PUSH_FAILURE_ITEM(item) \ | |
995 fail_stack.stack[fail_stack.avail++] = (fail_stack_elt_t) item | |
996 | |
997 /* The complement operation. Assumes `fail_stack' is nonempty. */ | |
998 #define POP_FAILURE_ITEM() fail_stack.stack[--fail_stack.avail] | |
999 | |
1000 /* Used to omit pushing failure point id's when we're not debugging. */ | |
1001 #ifdef DEBUG | |
1002 #define DEBUG_PUSH PUSH_FAILURE_ITEM | |
1003 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_ITEM () | |
1004 #else | |
1005 #define DEBUG_PUSH(item) | |
1006 #define DEBUG_POP(item_addr) | |
1007 #endif | |
1008 | |
1009 | |
1010 /* Push the information about the state we will need | |
1011 if we ever fail back to it. | |
1012 | |
1013 Requires variables fail_stack, regstart, regend, reg_info, and | |
1014 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be | |
1015 declared. | |
1016 | |
1017 Does `return FAILURE_CODE' if runs out of memory. */ | |
1018 | |
1019 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \ | |
1020 do { \ | |
1021 char *destination; \ | |
1022 /* Must be int, so when we don't save any registers, the arithmetic \ | |
1023 of 0 + -1 isn't done as unsigned. */ \ | |
1024 int this_reg; \ | |
1025 \ | |
1026 DEBUG_STATEMENT (failure_id++); \ | |
1027 DEBUG_STATEMENT (nfailure_points_pushed++); \ | |
1028 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \ | |
1029 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\ | |
1030 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\ | |
1031 \ | |
1032 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \ | |
1033 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \ | |
1034 \ | |
1035 /* Ensure we have enough space allocated for what we will push. */ \ | |
1036 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \ | |
1037 { \ | |
1038 if (!DOUBLE_FAIL_STACK (fail_stack)) \ | |
1039 return failure_code; \ | |
1040 \ | |
1041 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \ | |
1042 (fail_stack).size); \ | |
1043 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\ | |
1044 } \ | |
1045 \ | |
1046 /* Push the info, starting with the registers. */ \ | |
1047 DEBUG_PRINT1 ("\n"); \ | |
1048 \ | |
1049 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \ | |
1050 this_reg++) \ | |
1051 { \ | |
1052 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \ | |
1053 DEBUG_STATEMENT (num_regs_pushed++); \ | |
1054 \ | |
1055 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \ | |
1056 PUSH_FAILURE_ITEM (regstart[this_reg]); \ | |
1057 \ | |
1058 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \ | |
1059 PUSH_FAILURE_ITEM (regend[this_reg]); \ | |
1060 \ | |
1061 DEBUG_PRINT2 (" info: 0x%x\n ", reg_info[this_reg]); \ | |
1062 DEBUG_PRINT2 (" match_null=%d", \ | |
1063 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \ | |
1064 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \ | |
1065 DEBUG_PRINT2 (" matched_something=%d", \ | |
1066 MATCHED_SOMETHING (reg_info[this_reg])); \ | |
1067 DEBUG_PRINT2 (" ever_matched=%d", \ | |
1068 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \ | |
1069 DEBUG_PRINT1 ("\n"); \ | |
1070 PUSH_FAILURE_ITEM (reg_info[this_reg].word); \ | |
1071 } \ | |
1072 \ | |
1073 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg);\ | |
1074 PUSH_FAILURE_ITEM (lowest_active_reg); \ | |
1075 \ | |
1076 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg);\ | |
1077 PUSH_FAILURE_ITEM (highest_active_reg); \ | |
1078 \ | |
1079 DEBUG_PRINT2 (" Pushing pattern 0x%x: ", pattern_place); \ | |
1080 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \ | |
1081 PUSH_FAILURE_ITEM (pattern_place); \ | |
1082 \ | |
1083 DEBUG_PRINT2 (" Pushing string 0x%x: `", string_place); \ | |
1084 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \ | |
1085 size2); \ | |
1086 DEBUG_PRINT1 ("'\n"); \ | |
1087 PUSH_FAILURE_ITEM (string_place); \ | |
1088 \ | |
1089 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \ | |
1090 DEBUG_PUSH (failure_id); \ | |
1091 } while (0) | |
1092 | |
1093 /* This is the number of items that are pushed and popped on the stack | |
1094 for each register. */ | |
1095 #define NUM_REG_ITEMS 3 | |
1096 | |
1097 /* Individual items aside from the registers. */ | |
1098 #ifdef DEBUG | |
1099 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */ | |
1100 #else | |
1101 #define NUM_NONREG_ITEMS 4 | |
1102 #endif | |
1103 | |
1104 /* We push at most this many items on the stack. */ | |
1105 #define MAX_FAILURE_ITEMS ((num_regs - 1) * NUM_REG_ITEMS + NUM_NONREG_ITEMS) | |
1106 | |
1107 /* We actually push this many items. */ | |
1108 #define NUM_FAILURE_ITEMS \ | |
1109 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \ | |
1110 + NUM_NONREG_ITEMS) | |
1111 | |
1112 /* How many items can still be added to the stack without overflowing it. */ | |
1113 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail) | |
1114 | |
1115 | |
1116 /* Pops what PUSH_FAIL_STACK pushes. | |
1117 | |
1118 We restore into the parameters, all of which should be lvalues: | |
1119 STR -- the saved data position. | |
1120 PAT -- the saved pattern position. | |
1121 LOW_REG, HIGH_REG -- the highest and lowest active registers. | |
1122 REGSTART, REGEND -- arrays of string positions. | |
1123 REG_INFO -- array of information about each subexpression. | |
1124 | |
1125 Also assumes the variables `fail_stack' and (if debugging), `bufp', | |
1126 `pend', `string1', `size1', `string2', and `size2'. */ | |
1127 | |
1128 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\ | |
1129 { \ | |
1130 DEBUG_STATEMENT (fail_stack_elt_t failure_id;) \ | |
1131 int this_reg; \ | |
1132 const unsigned char *string_temp; \ | |
1133 \ | |
1134 assert (!FAIL_STACK_EMPTY ()); \ | |
1135 \ | |
1136 /* Remove failure points and point to how many regs pushed. */ \ | |
1137 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \ | |
1138 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \ | |
1139 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \ | |
1140 \ | |
1141 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \ | |
1142 \ | |
1143 DEBUG_POP (&failure_id); \ | |
1144 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \ | |
1145 \ | |
1146 /* If the saved string location is NULL, it came from an \ | |
1147 on_failure_keep_string_jump opcode, and we want to throw away the \ | |
1148 saved NULL, thus retaining our current position in the string. */ \ | |
1149 string_temp = POP_FAILURE_ITEM (); \ | |
1150 if (string_temp != NULL) \ | |
1151 str = (const char *) string_temp; \ | |
1152 \ | |
1153 DEBUG_PRINT2 (" Popping string 0x%x: `", str); \ | |
1154 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \ | |
1155 DEBUG_PRINT1 ("'\n"); \ | |
1156 \ | |
1157 pat = (unsigned char *) POP_FAILURE_ITEM (); \ | |
1158 DEBUG_PRINT2 (" Popping pattern 0x%x: ", pat); \ | |
1159 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \ | |
1160 \ | |
1161 /* Restore register info. */ \ | |
1162 high_reg = (unsigned) POP_FAILURE_ITEM (); \ | |
1163 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \ | |
1164 \ | |
1165 low_reg = (unsigned) POP_FAILURE_ITEM (); \ | |
1166 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \ | |
1167 \ | |
1168 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \ | |
1169 { \ | |
1170 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \ | |
1171 \ | |
1172 reg_info[this_reg].word = POP_FAILURE_ITEM (); \ | |
1173 DEBUG_PRINT2 (" info: 0x%x\n", reg_info[this_reg]); \ | |
1174 \ | |
1175 regend[this_reg] = (const char *) POP_FAILURE_ITEM (); \ | |
1176 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \ | |
1177 \ | |
1178 regstart[this_reg] = (const char *) POP_FAILURE_ITEM (); \ | |
1179 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \ | |
1180 } \ | |
1181 \ | |
1182 DEBUG_STATEMENT (nfailure_points_popped++); \ | |
1183 } /* POP_FAILURE_POINT */ | |
1184 | |
1185 | |
1186 | |
1187 /* Structure for per-register (a.k.a. per-group) information. | |
1188 This must not be longer than one word, because we push this value | |
1189 onto the failure stack. Other register information, such as the | |
1190 starting and ending positions (which are addresses), and the list of | |
1191 inner groups (which is a bits list) are maintained in separate | |
1192 variables. | |
1193 | |
1194 We are making a (strictly speaking) nonportable assumption here: that | |
1195 the compiler will pack our bit fields into something that fits into | |
1196 the type of `word', i.e., is something that fits into one item on the | |
1197 failure stack. */ | |
1198 typedef union | |
1199 { | |
1200 fail_stack_elt_t word; | |
1201 struct | |
1202 { | |
1203 /* This field is one if this group can match the empty string, | |
1204 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */ | |
1205 #define MATCH_NULL_UNSET_VALUE 3 | |
1206 unsigned match_null_string_p : 2; | |
1207 unsigned is_active : 1; | |
1208 unsigned matched_something : 1; | |
1209 unsigned ever_matched_something : 1; | |
1210 } bits; | |
1211 } register_info_type; | |
1212 | |
1213 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p) | |
1214 #define IS_ACTIVE(R) ((R).bits.is_active) | |
1215 #define MATCHED_SOMETHING(R) ((R).bits.matched_something) | |
1216 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something) | |
1217 | |
1218 | |
1219 /* Call this when have matched a real character; it sets `matched' flags | |
1220 for the subexpressions which we are currently inside. Also records | |
1221 that those subexprs have matched. */ | |
1222 #define SET_REGS_MATCHED() \ | |
1223 do \ | |
1224 { \ | |
1225 unsigned r; \ | |
1226 for (r = lowest_active_reg; r <= highest_active_reg; r++) \ | |
1227 { \ | |
1228 MATCHED_SOMETHING (reg_info[r]) \ | |
1229 = EVER_MATCHED_SOMETHING (reg_info[r]) \ | |
1230 = 1; \ | |
1231 } \ | |
1232 } \ | |
1233 while (0) | |
1234 | |
1235 | |
1236 /* Registers are set to a sentinel when they haven't yet matched. */ | |
1237 #define REG_UNSET_VALUE ((char *) -1) | |
1238 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE) | |
1239 | |
1240 | |
1241 | |
2952 | 1242 /* How do we implement a missing MATCH_MAY_ALLOCATE? |
2949 | 1243 We make the fail stack a global thing, and then grow it to |
1244 re_max_failures when we compile. */ | |
2952 | 1245 #ifndef MATCH_MAY_ALLOCATE |
2949 | 1246 static fail_stack_type fail_stack; |
1247 | |
1248 static const char ** regstart, ** regend; | |
1249 static const char ** old_regstart, ** old_regend; | |
1250 static const char **best_regstart, **best_regend; | |
1251 static register_info_type *reg_info; | |
1252 static const char **reg_dummy; | |
1253 static register_info_type *reg_info_dummy; | |
1254 #endif | |
1255 | |
1256 | |
1155 | 1257 /* Subroutine declarations and macros for regex_compile. */ |
1258 | |
1259 static void store_op1 (), store_op2 (); | |
1260 static void insert_op1 (), insert_op2 (); | |
1261 static boolean at_begline_loc_p (), at_endline_loc_p (); | |
1262 static boolean group_in_compile_stack (); | |
1263 static reg_errcode_t compile_range (); | |
1264 | |
1265 /* Fetch the next character in the uncompiled pattern---translating it | |
1266 if necessary. Also cast from a signed character in the constant | |
1267 string passed to us by the user to an unsigned char that we can use | |
1268 as an array index (in, e.g., `translate'). */ | |
1269 #define PATFETCH(c) \ | |
1270 do {if (p == pend) return REG_EEND; \ | |
1271 c = (unsigned char) *p++; \ | |
1272 if (translate) c = translate[c]; \ | |
1273 } while (0) | |
1274 | |
1275 /* Fetch the next character in the uncompiled pattern, with no | |
1276 translation. */ | |
1277 #define PATFETCH_RAW(c) \ | |
1278 do {if (p == pend) return REG_EEND; \ | |
1279 c = (unsigned char) *p++; \ | |
1280 } while (0) | |
1281 | |
1282 /* Go backwards one character in the pattern. */ | |
1283 #define PATUNFETCH p-- | |
1284 | |
1285 | |
1286 /* If `translate' is non-null, return translate[D], else just D. We | |
1287 cast the subscript to translate because some data is declared as | |
1288 `char *', to avoid warnings when a string constant is passed. But | |
1289 when we use a character as a subscript we must make it unsigned. */ | |
1290 #define TRANSLATE(d) (translate ? translate[(unsigned char) (d)] : (d)) | |
1291 | |
1292 | |
1293 /* Macros for outputting the compiled pattern into `buffer'. */ | |
1294 | |
1295 /* If the buffer isn't allocated when it comes in, use this. */ | |
1296 #define INIT_BUF_SIZE 32 | |
1297 | |
1298 /* Make sure we have at least N more bytes of space in buffer. */ | |
1299 #define GET_BUFFER_SPACE(n) \ | |
1300 while (b - bufp->buffer + (n) > bufp->allocated) \ | |
1301 EXTEND_BUFFER () | |
1302 | |
1303 /* Make sure we have one more byte of buffer space and then add C to it. */ | |
1304 #define BUF_PUSH(c) \ | |
1305 do { \ | |
1306 GET_BUFFER_SPACE (1); \ | |
1307 *b++ = (unsigned char) (c); \ | |
1308 } while (0) | |
1309 | |
1310 | |
1311 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */ | |
1312 #define BUF_PUSH_2(c1, c2) \ | |
1313 do { \ | |
1314 GET_BUFFER_SPACE (2); \ | |
1315 *b++ = (unsigned char) (c1); \ | |
1316 *b++ = (unsigned char) (c2); \ | |
1317 } while (0) | |
1318 | |
1319 | |
1320 /* As with BUF_PUSH_2, except for three bytes. */ | |
1321 #define BUF_PUSH_3(c1, c2, c3) \ | |
1322 do { \ | |
1323 GET_BUFFER_SPACE (3); \ | |
1324 *b++ = (unsigned char) (c1); \ | |
1325 *b++ = (unsigned char) (c2); \ | |
1326 *b++ = (unsigned char) (c3); \ | |
1327 } while (0) | |
1328 | |
1329 | |
1330 /* Store a jump with opcode OP at LOC to location TO. We store a | |
1331 relative address offset by the three bytes the jump itself occupies. */ | |
1332 #define STORE_JUMP(op, loc, to) \ | |
1333 store_op1 (op, loc, (to) - (loc) - 3) | |
1334 | |
1335 /* Likewise, for a two-argument jump. */ | |
1336 #define STORE_JUMP2(op, loc, to, arg) \ | |
1337 store_op2 (op, loc, (to) - (loc) - 3, arg) | |
1338 | |
1339 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */ | |
1340 #define INSERT_JUMP(op, loc, to) \ | |
1341 insert_op1 (op, loc, (to) - (loc) - 3, b) | |
1342 | |
1343 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */ | |
1344 #define INSERT_JUMP2(op, loc, to, arg) \ | |
1345 insert_op2 (op, loc, (to) - (loc) - 3, arg, b) | |
1346 | |
1347 | |
1348 /* This is not an arbitrary limit: the arguments which represent offsets | |
1349 into the pattern are two bytes long. So if 2^16 bytes turns out to | |
1350 be too small, many things would have to change. */ | |
1351 #define MAX_BUF_SIZE (1L << 16) | |
1352 | |
1353 | |
1354 /* Extend the buffer by twice its current size via realloc and | |
1355 reset the pointers that pointed into the old block to point to the | |
1356 correct places in the new one. If extending the buffer results in it | |
1357 being larger than MAX_BUF_SIZE, then flag memory exhausted. */ | |
1358 #define EXTEND_BUFFER() \ | |
1359 do { \ | |
1360 unsigned char *old_buffer = bufp->buffer; \ | |
1361 if (bufp->allocated == MAX_BUF_SIZE) \ | |
1362 return REG_ESIZE; \ | |
1363 bufp->allocated <<= 1; \ | |
1364 if (bufp->allocated > MAX_BUF_SIZE) \ | |
1365 bufp->allocated = MAX_BUF_SIZE; \ | |
1366 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\ | |
1367 if (bufp->buffer == NULL) \ | |
1368 return REG_ESPACE; \ | |
1369 /* If the buffer moved, move all the pointers into it. */ \ | |
1370 if (old_buffer != bufp->buffer) \ | |
1371 { \ | |
1372 b = (b - old_buffer) + bufp->buffer; \ | |
1373 begalt = (begalt - old_buffer) + bufp->buffer; \ | |
1374 if (fixup_alt_jump) \ | |
1375 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\ | |
1376 if (laststart) \ | |
1377 laststart = (laststart - old_buffer) + bufp->buffer; \ | |
1378 if (pending_exact) \ | |
1379 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \ | |
1380 } \ | |
1381 } while (0) | |
1382 | |
1383 | |
1384 /* Since we have one byte reserved for the register number argument to | |
1385 {start,stop}_memory, the maximum number of groups we can report | |
1386 things about is what fits in that byte. */ | |
1387 #define MAX_REGNUM 255 | |
1388 | |
1389 /* But patterns can have more than `MAX_REGNUM' registers. We just | |
1390 ignore the excess. */ | |
1391 typedef unsigned regnum_t; | |
1392 | |
1393 | |
1394 /* Macros for the compile stack. */ | |
1395 | |
1396 /* Since offsets can go either forwards or backwards, this type needs to | |
1397 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */ | |
1398 typedef int pattern_offset_t; | |
1399 | |
1400 typedef struct | |
1401 { | |
1402 pattern_offset_t begalt_offset; | |
1403 pattern_offset_t fixup_alt_jump; | |
1404 pattern_offset_t inner_group_offset; | |
1405 pattern_offset_t laststart_offset; | |
1406 regnum_t regnum; | |
1407 } compile_stack_elt_t; | |
1408 | |
1409 | |
1410 typedef struct | |
1411 { | |
1412 compile_stack_elt_t *stack; | |
1413 unsigned size; | |
1414 unsigned avail; /* Offset of next open position. */ | |
1415 } compile_stack_type; | |
1416 | |
1417 | |
1418 #define INIT_COMPILE_STACK_SIZE 32 | |
1419 | |
1420 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0) | |
1421 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size) | |
1422 | |
1423 /* The next available element. */ | |
1424 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) | |
1425 | |
1426 | |
1427 /* Set the bit for character C in a list. */ | |
1428 #define SET_LIST_BIT(c) \ | |
1429 (b[((unsigned char) (c)) / BYTEWIDTH] \ | |
1430 |= 1 << (((unsigned char) c) % BYTEWIDTH)) | |
1431 | |
1432 | |
1433 /* Get the next unsigned number in the uncompiled pattern. */ | |
1434 #define GET_UNSIGNED_NUMBER(num) \ | |
1435 { if (p != pend) \ | |
1436 { \ | |
1437 PATFETCH (c); \ | |
1668 | 1438 while (ISDIGIT (c)) \ |
1155 | 1439 { \ |
1440 if (num < 0) \ | |
1441 num = 0; \ | |
1442 num = num * 10 + c - '0'; \ | |
1443 if (p == pend) \ | |
1444 break; \ | |
1445 PATFETCH (c); \ | |
1446 } \ | |
1447 } \ | |
1448 } | |
1449 | |
1450 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */ | |
1451 | |
1452 #define IS_CHAR_CLASS(string) \ | |
1453 (STREQ (string, "alpha") || STREQ (string, "upper") \ | |
1454 || STREQ (string, "lower") || STREQ (string, "digit") \ | |
1455 || STREQ (string, "alnum") || STREQ (string, "xdigit") \ | |
1456 || STREQ (string, "space") || STREQ (string, "print") \ | |
1457 || STREQ (string, "punct") || STREQ (string, "graph") \ | |
1458 || STREQ (string, "cntrl") || STREQ (string, "blank")) | |
1459 | |
1460 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. | |
1461 Returns one of error codes defined in `regex.h', or zero for success. | |
1462 | |
1463 Assumes the `allocated' (and perhaps `buffer') and `translate' | |
1464 fields are set in BUFP on entry. | |
1465 | |
1466 If it succeeds, results are put in BUFP (if it returns an error, the | |
1467 contents of BUFP are undefined): | |
1468 `buffer' is the compiled pattern; | |
1469 `syntax' is set to SYNTAX; | |
1470 `used' is set to the length of the compiled pattern; | |
1637 | 1471 `fastmap_accurate' is zero; |
1472 `re_nsub' is the number of subexpressions in PATTERN; | |
1473 `not_bol' and `not_eol' are zero; | |
1155 | 1474 |
1475 The `fastmap' and `newline_anchor' fields are neither | |
1476 examined nor set. */ | |
1477 | |
1478 static reg_errcode_t | |
1479 regex_compile (pattern, size, syntax, bufp) | |
1480 const char *pattern; | |
1481 int size; | |
1482 reg_syntax_t syntax; | |
1483 struct re_pattern_buffer *bufp; | |
1484 { | |
1485 /* We fetch characters from PATTERN here. Even though PATTERN is | |
1486 `char *' (i.e., signed), we declare these variables as unsigned, so | |
1487 they can be reliably used as array indices. */ | |
1488 register unsigned char c, c1; | |
1489 | |
1490 /* A random tempory spot in PATTERN. */ | |
1491 const char *p1; | |
1492 | |
1493 /* Points to the end of the buffer, where we should append. */ | |
1494 register unsigned char *b; | |
1495 | |
1496 /* Keeps track of unclosed groups. */ | |
1497 compile_stack_type compile_stack; | |
1498 | |
1499 /* Points to the current (ending) position in the pattern. */ | |
1500 const char *p = pattern; | |
1501 const char *pend = pattern + size; | |
1502 | |
1503 /* How to translate the characters in the pattern. */ | |
1504 char *translate = bufp->translate; | |
1505 | |
1506 /* Address of the count-byte of the most recently inserted `exactn' | |
1507 command. This makes it possible to tell if a new exact-match | |
1508 character can be added to that command or if the character requires | |
1509 a new `exactn' command. */ | |
1510 unsigned char *pending_exact = 0; | |
1511 | |
1512 /* Address of start of the most recently finished expression. | |
1513 This tells, e.g., postfix * where to find the start of its | |
1514 operand. Reset at the beginning of groups and alternatives. */ | |
1515 unsigned char *laststart = 0; | |
1516 | |
1517 /* Address of beginning of regexp, or inside of last group. */ | |
1518 unsigned char *begalt; | |
1519 | |
1520 /* Place in the uncompiled pattern (i.e., the {) to | |
1521 which to go back if the interval is invalid. */ | |
1522 const char *beg_interval; | |
1523 | |
1524 /* Address of the place where a forward jump should go to the end of | |
1525 the containing expression. Each alternative of an `or' -- except the | |
1526 last -- ends with a forward jump of this sort. */ | |
1527 unsigned char *fixup_alt_jump = 0; | |
1528 | |
1529 /* Counts open-groups as they are encountered. Remembered for the | |
1530 matching close-group on the compile stack, so the same register | |
1531 number is put in the stop_memory as the start_memory. */ | |
1532 regnum_t regnum = 0; | |
1533 | |
1534 #ifdef DEBUG | |
1535 DEBUG_PRINT1 ("\nCompiling pattern: "); | |
1536 if (debug) | |
1537 { | |
1538 unsigned debug_count; | |
1539 | |
1540 for (debug_count = 0; debug_count < size; debug_count++) | |
1541 printchar (pattern[debug_count]); | |
1542 putchar ('\n'); | |
1543 } | |
1544 #endif /* DEBUG */ | |
1545 | |
1546 /* Initialize the compile stack. */ | |
1547 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t); | |
1548 if (compile_stack.stack == NULL) | |
1549 return REG_ESPACE; | |
1550 | |
1551 compile_stack.size = INIT_COMPILE_STACK_SIZE; | |
1552 compile_stack.avail = 0; | |
1553 | |
1554 /* Initialize the pattern buffer. */ | |
1555 bufp->syntax = syntax; | |
1556 bufp->fastmap_accurate = 0; | |
1557 bufp->not_bol = bufp->not_eol = 0; | |
1558 | |
1559 /* Set `used' to zero, so that if we return an error, the pattern | |
1560 printer (for debugging) will think there's no pattern. We reset it | |
1561 at the end. */ | |
1562 bufp->used = 0; | |
1563 | |
1564 /* Always count groups, whether or not bufp->no_sub is set. */ | |
1565 bufp->re_nsub = 0; | |
1566 | |
1567 #if !defined (emacs) && !defined (SYNTAX_TABLE) | |
1568 /* Initialize the syntax table. */ | |
1569 init_syntax_once (); | |
1570 #endif | |
1571 | |
1572 if (bufp->allocated == 0) | |
1573 { | |
1574 if (bufp->buffer) | |
1575 { /* If zero allocated, but buffer is non-null, try to realloc | |
1576 enough space. This loses if buffer's address is bogus, but | |
1577 that is the user's responsibility. */ | |
1578 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char); | |
1579 } | |
1580 else | |
1581 { /* Caller did not allocate a buffer. Do it for them. */ | |
1582 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char); | |
1583 } | |
1584 if (!bufp->buffer) return REG_ESPACE; | |
1585 | |
1586 bufp->allocated = INIT_BUF_SIZE; | |
1587 } | |
1588 | |
1589 begalt = b = bufp->buffer; | |
1590 | |
1591 /* Loop through the uncompiled pattern until we're at the end. */ | |
1592 while (p != pend) | |
1593 { | |
1594 PATFETCH (c); | |
1595 | |
1596 switch (c) | |
1597 { | |
1598 case '^': | |
1599 { | |
1600 if ( /* If at start of pattern, it's an operator. */ | |
1601 p == pattern + 1 | |
1602 /* If context independent, it's an operator. */ | |
1603 || syntax & RE_CONTEXT_INDEP_ANCHORS | |
1604 /* Otherwise, depends on what's come before. */ | |
1605 || at_begline_loc_p (pattern, p, syntax)) | |
1606 BUF_PUSH (begline); | |
1607 else | |
1608 goto normal_char; | |
1609 } | |
1610 break; | |
1611 | |
1612 | |
1613 case '$': | |
1614 { | |
1615 if ( /* If at end of pattern, it's an operator. */ | |
1616 p == pend | |
1617 /* If context independent, it's an operator. */ | |
1618 || syntax & RE_CONTEXT_INDEP_ANCHORS | |
1619 /* Otherwise, depends on what's next. */ | |
1620 || at_endline_loc_p (p, pend, syntax)) | |
1621 BUF_PUSH (endline); | |
1622 else | |
1623 goto normal_char; | |
1624 } | |
1625 break; | |
1626 | |
1627 | |
1628 case '+': | |
1629 case '?': | |
1630 if ((syntax & RE_BK_PLUS_QM) | |
1631 || (syntax & RE_LIMITED_OPS)) | |
1632 goto normal_char; | |
1633 handle_plus: | |
1634 case '*': | |
1635 /* If there is no previous pattern... */ | |
1636 if (!laststart) | |
1637 { | |
1638 if (syntax & RE_CONTEXT_INVALID_OPS) | |
1639 return REG_BADRPT; | |
1640 else if (!(syntax & RE_CONTEXT_INDEP_OPS)) | |
1641 goto normal_char; | |
1642 } | |
1643 | |
1644 { | |
1645 /* Are we optimizing this jump? */ | |
1646 boolean keep_string_p = false; | |
1647 | |
1648 /* 1 means zero (many) matches is allowed. */ | |
1649 char zero_times_ok = 0, many_times_ok = 0; | |
1650 | |
1651 /* If there is a sequence of repetition chars, collapse it | |
1652 down to just one (the right one). We can't combine | |
1653 interval operators with these because of, e.g., `a{2}*', | |
1654 which should only match an even number of `a's. */ | |
1655 | |
1656 for (;;) | |
1657 { | |
1658 zero_times_ok |= c != '+'; | |
1659 many_times_ok |= c != '?'; | |
1660 | |
1661 if (p == pend) | |
1662 break; | |
1663 | |
1664 PATFETCH (c); | |
1665 | |
1666 if (c == '*' | |
1667 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?'))) | |
1668 ; | |
1669 | |
1670 else if (syntax & RE_BK_PLUS_QM && c == '\\') | |
1671 { | |
1672 if (p == pend) return REG_EESCAPE; | |
1673 | |
1674 PATFETCH (c1); | |
1675 if (!(c1 == '+' || c1 == '?')) | |
1676 { | |
1677 PATUNFETCH; | |
1678 PATUNFETCH; | |
1679 break; | |
1680 } | |
1681 | |
1682 c = c1; | |
1683 } | |
1684 else | |
1685 { | |
1686 PATUNFETCH; | |
1687 break; | |
1688 } | |
1689 | |
1690 /* If we get here, we found another repeat character. */ | |
1691 } | |
1692 | |
1693 /* Star, etc. applied to an empty pattern is equivalent | |
1694 to an empty pattern. */ | |
1695 if (!laststart) | |
1696 break; | |
1697 | |
1698 /* Now we know whether or not zero matches is allowed | |
1699 and also whether or not two or more matches is allowed. */ | |
1700 if (many_times_ok) | |
1701 { /* More than one repetition is allowed, so put in at the | |
1702 end a backward relative jump from `b' to before the next | |
1703 jump we're going to put in below (which jumps from | |
1704 laststart to after this jump). | |
1705 | |
1706 But if we are at the `*' in the exact sequence `.*\n', | |
1707 insert an unconditional jump backwards to the ., | |
1708 instead of the beginning of the loop. This way we only | |
1709 push a failure point once, instead of every time | |
1710 through the loop. */ | |
1711 assert (p - 1 > pattern); | |
1712 | |
1713 /* Allocate the space for the jump. */ | |
1714 GET_BUFFER_SPACE (3); | |
1715 | |
1716 /* We know we are not at the first character of the pattern, | |
1717 because laststart was nonzero. And we've already | |
1718 incremented `p', by the way, to be the character after | |
1719 the `*'. Do we have to do something analogous here | |
1720 for null bytes, because of RE_DOT_NOT_NULL? */ | |
1721 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.') | |
2453 | 1722 && zero_times_ok |
1155 | 1723 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n') |
1724 && !(syntax & RE_DOT_NEWLINE)) | |
1725 { /* We have .*\n. */ | |
1726 STORE_JUMP (jump, b, laststart); | |
1727 keep_string_p = true; | |
1728 } | |
1729 else | |
1730 /* Anything else. */ | |
1731 STORE_JUMP (maybe_pop_jump, b, laststart - 3); | |
1732 | |
1733 /* We've added more stuff to the buffer. */ | |
1734 b += 3; | |
1735 } | |
1736 | |
1737 /* On failure, jump from laststart to b + 3, which will be the | |
1738 end of the buffer after this jump is inserted. */ | |
1739 GET_BUFFER_SPACE (3); | |
1740 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump | |
1741 : on_failure_jump, | |
1742 laststart, b + 3); | |
1743 pending_exact = 0; | |
1744 b += 3; | |
1745 | |
1746 if (!zero_times_ok) | |
1747 { | |
1748 /* At least one repetition is required, so insert a | |
1749 `dummy_failure_jump' before the initial | |
1750 `on_failure_jump' instruction of the loop. This | |
1751 effects a skip over that instruction the first time | |
1752 we hit that loop. */ | |
1753 GET_BUFFER_SPACE (3); | |
1754 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6); | |
1755 b += 3; | |
1756 } | |
1757 } | |
1758 break; | |
1759 | |
1760 | |
1761 case '.': | |
1762 laststart = b; | |
1763 BUF_PUSH (anychar); | |
1764 break; | |
1765 | |
1766 | |
1767 case '[': | |
1768 { | |
1769 boolean had_char_class = false; | |
1770 | |
1771 if (p == pend) return REG_EBRACK; | |
1772 | |
1773 /* Ensure that we have enough space to push a charset: the | |
1774 opcode, the length count, and the bitset; 34 bytes in all. */ | |
1775 GET_BUFFER_SPACE (34); | |
1776 | |
1777 laststart = b; | |
1778 | |
1779 /* We test `*p == '^' twice, instead of using an if | |
1780 statement, so we only need one BUF_PUSH. */ | |
1781 BUF_PUSH (*p == '^' ? charset_not : charset); | |
1782 if (*p == '^') | |
1783 p++; | |
1784 | |
1785 /* Remember the first position in the bracket expression. */ | |
1786 p1 = p; | |
1787 | |
1788 /* Push the number of bytes in the bitmap. */ | |
1789 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH); | |
1790 | |
1791 /* Clear the whole map. */ | |
1792 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH); | |
1793 | |
1794 /* charset_not matches newline according to a syntax bit. */ | |
1795 if ((re_opcode_t) b[-2] == charset_not | |
1796 && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) | |
1797 SET_LIST_BIT ('\n'); | |
1798 | |
1799 /* Read in characters and ranges, setting map bits. */ | |
1800 for (;;) | |
1801 { | |
1802 if (p == pend) return REG_EBRACK; | |
1803 | |
1804 PATFETCH (c); | |
1805 | |
1806 /* \ might escape characters inside [...] and [^...]. */ | |
1807 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') | |
1808 { | |
1809 if (p == pend) return REG_EESCAPE; | |
1810 | |
1811 PATFETCH (c1); | |
1812 SET_LIST_BIT (c1); | |
1813 continue; | |
1814 } | |
1815 | |
1816 /* Could be the end of the bracket expression. If it's | |
1817 not (i.e., when the bracket expression is `[]' so | |
1818 far), the ']' character bit gets set way below. */ | |
1819 if (c == ']' && p != p1 + 1) | |
1820 break; | |
1821 | |
1822 /* Look ahead to see if it's a range when the last thing | |
1823 was a character class. */ | |
1824 if (had_char_class && c == '-' && *p != ']') | |
1825 return REG_ERANGE; | |
1826 | |
1827 /* Look ahead to see if it's a range when the last thing | |
1828 was a character: if this is a hyphen not at the | |
1829 beginning or the end of a list, then it's the range | |
1830 operator. */ | |
1831 if (c == '-' | |
1832 && !(p - 2 >= pattern && p[-2] == '[') | |
1833 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') | |
1834 && *p != ']') | |
1835 { | |
1836 reg_errcode_t ret | |
1837 = compile_range (&p, pend, translate, syntax, b); | |
1838 if (ret != REG_NOERROR) return ret; | |
1839 } | |
1840 | |
1841 else if (p[0] == '-' && p[1] != ']') | |
1842 { /* This handles ranges made up of characters only. */ | |
1843 reg_errcode_t ret; | |
1844 | |
1845 /* Move past the `-'. */ | |
1846 PATFETCH (c1); | |
1847 | |
1848 ret = compile_range (&p, pend, translate, syntax, b); | |
1849 if (ret != REG_NOERROR) return ret; | |
1850 } | |
1851 | |
1852 /* See if we're at the beginning of a possible character | |
1853 class. */ | |
1854 | |
1855 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') | |
1856 { /* Leave room for the null. */ | |
1857 char str[CHAR_CLASS_MAX_LENGTH + 1]; | |
1858 | |
1859 PATFETCH (c); | |
1860 c1 = 0; | |
1861 | |
1862 /* If pattern is `[[:'. */ | |
1863 if (p == pend) return REG_EBRACK; | |
1864 | |
1865 for (;;) | |
1866 { | |
1867 PATFETCH (c); | |
1868 if (c == ':' || c == ']' || p == pend | |
1869 || c1 == CHAR_CLASS_MAX_LENGTH) | |
1870 break; | |
1871 str[c1++] = c; | |
1872 } | |
1873 str[c1] = '\0'; | |
1874 | |
1875 /* If isn't a word bracketed by `[:' and:`]': | |
1876 undo the ending character, the letters, and leave | |
1877 the leading `:' and `[' (but set bits for them). */ | |
1878 if (c == ':' && *p == ']') | |
1879 { | |
1880 int ch; | |
1881 boolean is_alnum = STREQ (str, "alnum"); | |
1882 boolean is_alpha = STREQ (str, "alpha"); | |
1883 boolean is_blank = STREQ (str, "blank"); | |
1884 boolean is_cntrl = STREQ (str, "cntrl"); | |
1885 boolean is_digit = STREQ (str, "digit"); | |
1886 boolean is_graph = STREQ (str, "graph"); | |
1887 boolean is_lower = STREQ (str, "lower"); | |
1888 boolean is_print = STREQ (str, "print"); | |
1889 boolean is_punct = STREQ (str, "punct"); | |
1890 boolean is_space = STREQ (str, "space"); | |
1891 boolean is_upper = STREQ (str, "upper"); | |
1892 boolean is_xdigit = STREQ (str, "xdigit"); | |
1893 | |
1894 if (!IS_CHAR_CLASS (str)) return REG_ECTYPE; | |
1895 | |
1896 /* Throw away the ] at the end of the character | |
1897 class. */ | |
1898 PATFETCH (c); | |
1899 | |
1900 if (p == pend) return REG_EBRACK; | |
1901 | |
1902 for (ch = 0; ch < 1 << BYTEWIDTH; ch++) | |
1903 { | |
1668 | 1904 if ( (is_alnum && ISALNUM (ch)) |
1905 || (is_alpha && ISALPHA (ch)) | |
1906 || (is_blank && ISBLANK (ch)) | |
1907 || (is_cntrl && ISCNTRL (ch)) | |
1908 || (is_digit && ISDIGIT (ch)) | |
1909 || (is_graph && ISGRAPH (ch)) | |
1910 || (is_lower && ISLOWER (ch)) | |
1911 || (is_print && ISPRINT (ch)) | |
1912 || (is_punct && ISPUNCT (ch)) | |
1913 || (is_space && ISSPACE (ch)) | |
1914 || (is_upper && ISUPPER (ch)) | |
1915 || (is_xdigit && ISXDIGIT (ch))) | |
1155 | 1916 SET_LIST_BIT (ch); |
1917 } | |
1918 had_char_class = true; | |
1919 } | |
1920 else | |
1921 { | |
1922 c1++; | |
1923 while (c1--) | |
1924 PATUNFETCH; | |
1925 SET_LIST_BIT ('['); | |
1926 SET_LIST_BIT (':'); | |
1927 had_char_class = false; | |
1928 } | |
1929 } | |
1930 else | |
1931 { | |
1932 had_char_class = false; | |
1933 SET_LIST_BIT (c); | |
1934 } | |
1935 } | |
1936 | |
1937 /* Discard any (non)matching list bytes that are all 0 at the | |
1938 end of the map. Decrease the map-length byte too. */ | |
1939 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0) | |
1940 b[-1]--; | |
1941 b += b[-1]; | |
1942 } | |
1943 break; | |
1944 | |
1945 | |
1946 case '(': | |
1947 if (syntax & RE_NO_BK_PARENS) | |
1948 goto handle_open; | |
1949 else | |
1950 goto normal_char; | |
1951 | |
1952 | |
1953 case ')': | |
1954 if (syntax & RE_NO_BK_PARENS) | |
1955 goto handle_close; | |
1956 else | |
1957 goto normal_char; | |
1958 | |
1959 | |
1960 case '\n': | |
1961 if (syntax & RE_NEWLINE_ALT) | |
1962 goto handle_alt; | |
1963 else | |
1964 goto normal_char; | |
1965 | |
1966 | |
1967 case '|': | |
1968 if (syntax & RE_NO_BK_VBAR) | |
1969 goto handle_alt; | |
1970 else | |
1971 goto normal_char; | |
1972 | |
1973 | |
1974 case '{': | |
1975 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES) | |
1976 goto handle_interval; | |
1977 else | |
1978 goto normal_char; | |
1979 | |
1980 | |
1981 case '\\': | |
1982 if (p == pend) return REG_EESCAPE; | |
1983 | |
1984 /* Do not translate the character after the \, so that we can | |
1985 distinguish, e.g., \B from \b, even if we normally would | |
1986 translate, e.g., B to b. */ | |
1987 PATFETCH_RAW (c); | |
1988 | |
1989 switch (c) | |
1990 { | |
1991 case '(': | |
1992 if (syntax & RE_NO_BK_PARENS) | |
1993 goto normal_backslash; | |
1994 | |
1995 handle_open: | |
1996 bufp->re_nsub++; | |
1997 regnum++; | |
1998 | |
1999 if (COMPILE_STACK_FULL) | |
2000 { | |
2001 RETALLOC (compile_stack.stack, compile_stack.size << 1, | |
2002 compile_stack_elt_t); | |
2003 if (compile_stack.stack == NULL) return REG_ESPACE; | |
2004 | |
2005 compile_stack.size <<= 1; | |
2006 } | |
2007 | |
2008 /* These are the values to restore when we hit end of this | |
2009 group. They are all relative offsets, so that if the | |
2010 whole pattern moves because of realloc, they will still | |
2011 be valid. */ | |
2012 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer; | |
2013 COMPILE_STACK_TOP.fixup_alt_jump | |
2014 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0; | |
2015 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer; | |
2016 COMPILE_STACK_TOP.regnum = regnum; | |
2017 | |
2018 /* We will eventually replace the 0 with the number of | |
2019 groups inner to this one. But do not push a | |
2020 start_memory for groups beyond the last one we can | |
2021 represent in the compiled pattern. */ | |
2022 if (regnum <= MAX_REGNUM) | |
2023 { | |
2024 COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2; | |
2025 BUF_PUSH_3 (start_memory, regnum, 0); | |
2026 } | |
2027 | |
2028 compile_stack.avail++; | |
2029 | |
2030 fixup_alt_jump = 0; | |
2031 laststart = 0; | |
2032 begalt = b; | |
2453 | 2033 /* If we've reached MAX_REGNUM groups, then this open |
2034 won't actually generate any code, so we'll have to | |
2035 clear pending_exact explicitly. */ | |
2036 pending_exact = 0; | |
1155 | 2037 break; |
2038 | |
2039 | |
2040 case ')': | |
2041 if (syntax & RE_NO_BK_PARENS) goto normal_backslash; | |
2042 | |
2043 if (COMPILE_STACK_EMPTY) | |
2044 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) | |
2045 goto normal_backslash; | |
2046 else | |
2047 return REG_ERPAREN; | |
2048 | |
2049 handle_close: | |
2050 if (fixup_alt_jump) | |
2051 { /* Push a dummy failure point at the end of the | |
2052 alternative for a possible future | |
2053 `pop_failure_jump' to pop. See comments at | |
2054 `push_dummy_failure' in `re_match_2'. */ | |
2055 BUF_PUSH (push_dummy_failure); | |
2056 | |
2057 /* We allocated space for this jump when we assigned | |
2058 to `fixup_alt_jump', in the `handle_alt' case below. */ | |
2059 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1); | |
2060 } | |
2061 | |
2062 /* See similar code for backslashed left paren above. */ | |
2063 if (COMPILE_STACK_EMPTY) | |
2064 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) | |
2065 goto normal_char; | |
2066 else | |
2067 return REG_ERPAREN; | |
2068 | |
2069 /* Since we just checked for an empty stack above, this | |
2070 ``can't happen''. */ | |
2071 assert (compile_stack.avail != 0); | |
2072 { | |
2073 /* We don't just want to restore into `regnum', because | |
2074 later groups should continue to be numbered higher, | |
2075 as in `(ab)c(de)' -- the second group is #2. */ | |
2076 regnum_t this_group_regnum; | |
2077 | |
2078 compile_stack.avail--; | |
2079 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset; | |
2080 fixup_alt_jump | |
2081 = COMPILE_STACK_TOP.fixup_alt_jump | |
2082 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1 | |
2083 : 0; | |
2084 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset; | |
2085 this_group_regnum = COMPILE_STACK_TOP.regnum; | |
2453 | 2086 /* If we've reached MAX_REGNUM groups, then this open |
2087 won't actually generate any code, so we'll have to | |
2088 clear pending_exact explicitly. */ | |
2089 pending_exact = 0; | |
1155 | 2090 |
2091 /* We're at the end of the group, so now we know how many | |
2092 groups were inside this one. */ | |
2093 if (this_group_regnum <= MAX_REGNUM) | |
2094 { | |
2095 unsigned char *inner_group_loc | |
2096 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset; | |
2097 | |
2098 *inner_group_loc = regnum - this_group_regnum; | |
2099 BUF_PUSH_3 (stop_memory, this_group_regnum, | |
2100 regnum - this_group_regnum); | |
2101 } | |
2102 } | |
2103 break; | |
2104 | |
2105 | |
2106 case '|': /* `\|'. */ | |
2107 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR) | |
2108 goto normal_backslash; | |
2109 handle_alt: | |
2110 if (syntax & RE_LIMITED_OPS) | |
2111 goto normal_char; | |
2112 | |
2113 /* Insert before the previous alternative a jump which | |
2114 jumps to this alternative if the former fails. */ | |
2115 GET_BUFFER_SPACE (3); | |
2116 INSERT_JUMP (on_failure_jump, begalt, b + 6); | |
2117 pending_exact = 0; | |
2118 b += 3; | |
2119 | |
2120 /* The alternative before this one has a jump after it | |
2121 which gets executed if it gets matched. Adjust that | |
2122 jump so it will jump to this alternative's analogous | |
2123 jump (put in below, which in turn will jump to the next | |
2124 (if any) alternative's such jump, etc.). The last such | |
2125 jump jumps to the correct final destination. A picture: | |
2126 _____ _____ | |
2127 | | | | | |
2128 | v | v | |
2129 a | b | c | |
2130 | |
1637 | 2131 If we are at `b', then fixup_alt_jump right now points to a |
2132 three-byte space after `a'. We'll put in the jump, set | |
2133 fixup_alt_jump to right after `b', and leave behind three | |
2134 bytes which we'll fill in when we get to after `c'. */ | |
1155 | 2135 |
2136 if (fixup_alt_jump) | |
2137 STORE_JUMP (jump_past_alt, fixup_alt_jump, b); | |
2138 | |
2139 /* Mark and leave space for a jump after this alternative, | |
2140 to be filled in later either by next alternative or | |
2141 when know we're at the end of a series of alternatives. */ | |
2142 fixup_alt_jump = b; | |
2143 GET_BUFFER_SPACE (3); | |
2144 b += 3; | |
2145 | |
2146 laststart = 0; | |
2147 begalt = b; | |
2148 break; | |
2149 | |
2150 | |
2151 case '{': | |
2152 /* If \{ is a literal. */ | |
2153 if (!(syntax & RE_INTERVALS) | |
2154 /* If we're at `\{' and it's not the open-interval | |
2155 operator. */ | |
2156 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) | |
2157 || (p - 2 == pattern && p == pend)) | |
2158 goto normal_backslash; | |
2159 | |
2160 handle_interval: | |
2161 { | |
2162 /* If got here, then the syntax allows intervals. */ | |
2163 | |
2164 /* At least (most) this many matches must be made. */ | |
2165 int lower_bound = -1, upper_bound = -1; | |
2166 | |
2167 beg_interval = p - 1; | |
2168 | |
2169 if (p == pend) | |
2170 { | |
2171 if (syntax & RE_NO_BK_BRACES) | |
2172 goto unfetch_interval; | |
2173 else | |
2174 return REG_EBRACE; | |
2175 } | |
2176 | |
2177 GET_UNSIGNED_NUMBER (lower_bound); | |
2178 | |
2179 if (c == ',') | |
2180 { | |
2181 GET_UNSIGNED_NUMBER (upper_bound); | |
2182 if (upper_bound < 0) upper_bound = RE_DUP_MAX; | |
2183 } | |
2184 else | |
2185 /* Interval such as `{1}' => match exactly once. */ | |
2186 upper_bound = lower_bound; | |
2187 | |
2188 if (lower_bound < 0 || upper_bound > RE_DUP_MAX | |
2189 || lower_bound > upper_bound) | |
2190 { | |
2191 if (syntax & RE_NO_BK_BRACES) | |
2192 goto unfetch_interval; | |
2193 else | |
2194 return REG_BADBR; | |
2195 } | |
2196 | |
2197 if (!(syntax & RE_NO_BK_BRACES)) | |
2198 { | |
2199 if (c != '\\') return REG_EBRACE; | |
2200 | |
2201 PATFETCH (c); | |
2202 } | |
2203 | |
2204 if (c != '}') | |
2205 { | |
2206 if (syntax & RE_NO_BK_BRACES) | |
2207 goto unfetch_interval; | |
2208 else | |
2209 return REG_BADBR; | |
2210 } | |
2211 | |
2212 /* We just parsed a valid interval. */ | |
2213 | |
2214 /* If it's invalid to have no preceding re. */ | |
2215 if (!laststart) | |
2216 { | |
2217 if (syntax & RE_CONTEXT_INVALID_OPS) | |
2218 return REG_BADRPT; | |
2219 else if (syntax & RE_CONTEXT_INDEP_OPS) | |
2220 laststart = b; | |
2221 else | |
2222 goto unfetch_interval; | |
2223 } | |
2224 | |
2225 /* If the upper bound is zero, don't want to succeed at | |
2226 all; jump from `laststart' to `b + 3', which will be | |
2227 the end of the buffer after we insert the jump. */ | |
2228 if (upper_bound == 0) | |
2229 { | |
2230 GET_BUFFER_SPACE (3); | |
2231 INSERT_JUMP (jump, laststart, b + 3); | |
2232 b += 3; | |
2233 } | |
2234 | |
2235 /* Otherwise, we have a nontrivial interval. When | |
2236 we're all done, the pattern will look like: | |
2237 set_number_at <jump count> <upper bound> | |
2238 set_number_at <succeed_n count> <lower bound> | |
2239 succeed_n <after jump addr> <succed_n count> | |
2240 <body of loop> | |
2241 jump_n <succeed_n addr> <jump count> | |
2242 (The upper bound and `jump_n' are omitted if | |
2243 `upper_bound' is 1, though.) */ | |
2244 else | |
2245 { /* If the upper bound is > 1, we need to insert | |
2246 more at the end of the loop. */ | |
2247 unsigned nbytes = 10 + (upper_bound > 1) * 10; | |
2248 | |
2249 GET_BUFFER_SPACE (nbytes); | |
2250 | |
2251 /* Initialize lower bound of the `succeed_n', even | |
2252 though it will be set during matching by its | |
2253 attendant `set_number_at' (inserted next), | |
2254 because `re_compile_fastmap' needs to know. | |
2255 Jump to the `jump_n' we might insert below. */ | |
2256 INSERT_JUMP2 (succeed_n, laststart, | |
2257 b + 5 + (upper_bound > 1) * 5, | |
2258 lower_bound); | |
2259 b += 5; | |
2260 | |
2261 /* Code to initialize the lower bound. Insert | |
2262 before the `succeed_n'. The `5' is the last two | |
2263 bytes of this `set_number_at', plus 3 bytes of | |
2264 the following `succeed_n'. */ | |
2265 insert_op2 (set_number_at, laststart, 5, lower_bound, b); | |
2266 b += 5; | |
2267 | |
2268 if (upper_bound > 1) | |
2269 { /* More than one repetition is allowed, so | |
2270 append a backward jump to the `succeed_n' | |
2271 that starts this interval. | |
2272 | |
2273 When we've reached this during matching, | |
2274 we'll have matched the interval once, so | |
2275 jump back only `upper_bound - 1' times. */ | |
2276 STORE_JUMP2 (jump_n, b, laststart + 5, | |
2277 upper_bound - 1); | |
2278 b += 5; | |
2279 | |
2280 /* The location we want to set is the second | |
2281 parameter of the `jump_n'; that is `b-2' as | |
2282 an absolute address. `laststart' will be | |
2283 the `set_number_at' we're about to insert; | |
2284 `laststart+3' the number to set, the source | |
2285 for the relative address. But we are | |
2286 inserting into the middle of the pattern -- | |
2287 so everything is getting moved up by 5. | |
2288 Conclusion: (b - 2) - (laststart + 3) + 5, | |
2289 i.e., b - laststart. | |
2290 | |
2291 We insert this at the beginning of the loop | |
2292 so that if we fail during matching, we'll | |
2293 reinitialize the bounds. */ | |
2294 insert_op2 (set_number_at, laststart, b - laststart, | |
2295 upper_bound - 1, b); | |
2296 b += 5; | |
2297 } | |
2298 } | |
2299 pending_exact = 0; | |
2300 beg_interval = NULL; | |
2301 } | |
2302 break; | |
2303 | |
2304 unfetch_interval: | |
2305 /* If an invalid interval, match the characters as literals. */ | |
2306 assert (beg_interval); | |
2307 p = beg_interval; | |
2308 beg_interval = NULL; | |
2309 | |
2310 /* normal_char and normal_backslash need `c'. */ | |
2311 PATFETCH (c); | |
2312 | |
2313 if (!(syntax & RE_NO_BK_BRACES)) | |
2314 { | |
2315 if (p > pattern && p[-1] == '\\') | |
2316 goto normal_backslash; | |
2317 } | |
2318 goto normal_char; | |
2319 | |
2320 #ifdef emacs | |
2321 /* There is no way to specify the before_dot and after_dot | |
2322 operators. rms says this is ok. --karl */ | |
2323 case '=': | |
2324 BUF_PUSH (at_dot); | |
2325 break; | |
2326 | |
2327 case 's': | |
2328 laststart = b; | |
2329 PATFETCH (c); | |
2330 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]); | |
2331 break; | |
2332 | |
2333 case 'S': | |
2334 laststart = b; | |
2335 PATFETCH (c); | |
2336 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]); | |
2337 break; | |
2338 #endif /* emacs */ | |
2339 | |
2340 | |
2341 case 'w': | |
2342 laststart = b; | |
2343 BUF_PUSH (wordchar); | |
2344 break; | |
2345 | |
2346 | |
2347 case 'W': | |
2348 laststart = b; | |
2349 BUF_PUSH (notwordchar); | |
2350 break; | |
2351 | |
2352 | |
2353 case '<': | |
2354 BUF_PUSH (wordbeg); | |
2355 break; | |
2356 | |
2357 case '>': | |
2358 BUF_PUSH (wordend); | |
2359 break; | |
2360 | |
2361 case 'b': | |
2362 BUF_PUSH (wordbound); | |
2363 break; | |
2364 | |
2365 case 'B': | |
2366 BUF_PUSH (notwordbound); | |
2367 break; | |
2368 | |
2369 case '`': | |
2370 BUF_PUSH (begbuf); | |
2371 break; | |
2372 | |
2373 case '\'': | |
2374 BUF_PUSH (endbuf); | |
2375 break; | |
2376 | |
2377 case '1': case '2': case '3': case '4': case '5': | |
2378 case '6': case '7': case '8': case '9': | |
2379 if (syntax & RE_NO_BK_REFS) | |
2380 goto normal_char; | |
2381 | |
2382 c1 = c - '0'; | |
2383 | |
2384 if (c1 > regnum) | |
2385 return REG_ESUBREG; | |
2386 | |
2387 /* Can't back reference to a subexpression if inside of it. */ | |
2388 if (group_in_compile_stack (compile_stack, c1)) | |
2389 goto normal_char; | |
2390 | |
2391 laststart = b; | |
2392 BUF_PUSH_2 (duplicate, c1); | |
2393 break; | |
2394 | |
2395 | |
2396 case '+': | |
2397 case '?': | |
2398 if (syntax & RE_BK_PLUS_QM) | |
2399 goto handle_plus; | |
2400 else | |
2401 goto normal_backslash; | |
2402 | |
2403 default: | |
2404 normal_backslash: | |
2405 /* You might think it would be useful for \ to mean | |
2406 not to translate; but if we don't translate it | |
2407 it will never match anything. */ | |
2408 c = TRANSLATE (c); | |
2409 goto normal_char; | |
2410 } | |
2411 break; | |
2412 | |
2413 | |
2414 default: | |
2415 /* Expects the character in `c'. */ | |
2416 normal_char: | |
2417 /* If no exactn currently being built. */ | |
2418 if (!pending_exact | |
2419 | |
2420 /* If last exactn not at current position. */ | |
2421 || pending_exact + *pending_exact + 1 != b | |
2422 | |
2423 /* We have only one byte following the exactn for the count. */ | |
2424 || *pending_exact == (1 << BYTEWIDTH) - 1 | |
2425 | |
2426 /* If followed by a repetition operator. */ | |
2427 || *p == '*' || *p == '^' | |
2428 || ((syntax & RE_BK_PLUS_QM) | |
2429 ? *p == '\\' && (p[1] == '+' || p[1] == '?') | |
2430 : (*p == '+' || *p == '?')) | |
2431 || ((syntax & RE_INTERVALS) | |
2432 && ((syntax & RE_NO_BK_BRACES) | |
2433 ? *p == '{' | |
2434 : (p[0] == '\\' && p[1] == '{')))) | |
2435 { | |
2436 /* Start building a new exactn. */ | |
2437 | |
2438 laststart = b; | |
2439 | |
2440 BUF_PUSH_2 (exactn, 0); | |
2441 pending_exact = b - 1; | |
2442 } | |
2443 | |
2444 BUF_PUSH (c); | |
2445 (*pending_exact)++; | |
2446 break; | |
2447 } /* switch (c) */ | |
2448 } /* while p != pend */ | |
2449 | |
2450 | |
2451 /* Through the pattern now. */ | |
2452 | |
2453 if (fixup_alt_jump) | |
2454 STORE_JUMP (jump_past_alt, fixup_alt_jump, b); | |
2455 | |
2456 if (!COMPILE_STACK_EMPTY) | |
2457 return REG_EPAREN; | |
2458 | |
2459 free (compile_stack.stack); | |
2460 | |
2461 /* We have succeeded; set the length of the buffer. */ | |
2462 bufp->used = b - bufp->buffer; | |
2463 | |
2464 #ifdef DEBUG | |
2465 if (debug) | |
2466 { | |
2615 | 2467 DEBUG_PRINT1 ("\nCompiled pattern: \n"); |
1155 | 2468 print_compiled_pattern (bufp); |
2469 } | |
2470 #endif /* DEBUG */ | |
2471 | |
2952 | 2472 #ifndef MATCH_MAY_ALLOCATE |
2949 | 2473 /* Initialize the failure stack to the largest possible stack. This |
2474 isn't necessary unless we're trying to avoid calling alloca in | |
2475 the search and match routines. */ | |
2476 { | |
2477 int num_regs = bufp->re_nsub + 1; | |
2478 | |
2479 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size | |
2480 is strictly greater than re_max_failures, the largest possible stack | |
2481 is 2 * re_max_failures failure points. */ | |
2482 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS); | |
2483 if (fail_stack.stack) | |
2484 fail_stack.stack = | |
2485 (fail_stack_elt_t *) realloc (fail_stack.stack, | |
2486 (fail_stack.size | |
2487 * sizeof (fail_stack_elt_t))); | |
2488 else | |
2489 fail_stack.stack = | |
2490 (fail_stack_elt_t *) malloc (fail_stack.size | |
2491 * sizeof (fail_stack_elt_t)); | |
2492 | |
2493 /* Initialize some other variables the matcher uses. */ | |
2494 RETALLOC_IF (regstart, num_regs, const char *); | |
2495 RETALLOC_IF (regend, num_regs, const char *); | |
2496 RETALLOC_IF (old_regstart, num_regs, const char *); | |
2497 RETALLOC_IF (old_regend, num_regs, const char *); | |
2498 RETALLOC_IF (best_regstart, num_regs, const char *); | |
2499 RETALLOC_IF (best_regend, num_regs, const char *); | |
2500 RETALLOC_IF (reg_info, num_regs, register_info_type); | |
2501 RETALLOC_IF (reg_dummy, num_regs, const char *); | |
2502 RETALLOC_IF (reg_info_dummy, num_regs, register_info_type); | |
2503 } | |
2504 #endif | |
2505 | |
1155 | 2506 return REG_NOERROR; |
2507 } /* regex_compile */ | |
2508 | |
2509 /* Subroutines for `regex_compile'. */ | |
2510 | |
2511 /* Store OP at LOC followed by two-byte integer parameter ARG. */ | |
2512 | |
2513 static void | |
2514 store_op1 (op, loc, arg) | |
2515 re_opcode_t op; | |
2516 unsigned char *loc; | |
2517 int arg; | |
2518 { | |
2519 *loc = (unsigned char) op; | |
2520 STORE_NUMBER (loc + 1, arg); | |
2521 } | |
2522 | |
2523 | |
2524 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */ | |
2525 | |
2526 static void | |
2527 store_op2 (op, loc, arg1, arg2) | |
2528 re_opcode_t op; | |
2529 unsigned char *loc; | |
2530 int arg1, arg2; | |
2531 { | |
2532 *loc = (unsigned char) op; | |
2533 STORE_NUMBER (loc + 1, arg1); | |
2534 STORE_NUMBER (loc + 3, arg2); | |
2535 } | |
2536 | |
2537 | |
2538 /* Copy the bytes from LOC to END to open up three bytes of space at LOC | |
2539 for OP followed by two-byte integer parameter ARG. */ | |
2540 | |
2541 static void | |
2542 insert_op1 (op, loc, arg, end) | |
2543 re_opcode_t op; | |
2544 unsigned char *loc; | |
2545 int arg; | |
2546 unsigned char *end; | |
2547 { | |
2548 register unsigned char *pfrom = end; | |
2549 register unsigned char *pto = end + 3; | |
2550 | |
2551 while (pfrom != loc) | |
2552 *--pto = *--pfrom; | |
2553 | |
2554 store_op1 (op, loc, arg); | |
2555 } | |
2556 | |
2557 | |
2558 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */ | |
2559 | |
2560 static void | |
2561 insert_op2 (op, loc, arg1, arg2, end) | |
2562 re_opcode_t op; | |
2563 unsigned char *loc; | |
2564 int arg1, arg2; | |
2565 unsigned char *end; | |
2566 { | |
2567 register unsigned char *pfrom = end; | |
2568 register unsigned char *pto = end + 5; | |
2569 | |
2570 while (pfrom != loc) | |
2571 *--pto = *--pfrom; | |
2572 | |
2573 store_op2 (op, loc, arg1, arg2); | |
2574 } | |
2575 | |
2576 | |
2577 /* P points to just after a ^ in PATTERN. Return true if that ^ comes | |
2578 after an alternative or a begin-subexpression. We assume there is at | |
2579 least one character before the ^. */ | |
2580 | |
2581 static boolean | |
2582 at_begline_loc_p (pattern, p, syntax) | |
2583 const char *pattern, *p; | |
2584 reg_syntax_t syntax; | |
2585 { | |
2586 const char *prev = p - 2; | |
2587 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\'; | |
2588 | |
2589 return | |
2590 /* After a subexpression? */ | |
2591 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash)) | |
2592 /* After an alternative? */ | |
2593 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash)); | |
2594 } | |
2595 | |
2596 | |
2597 /* The dual of at_begline_loc_p. This one is for $. We assume there is | |
2598 at least one character after the $, i.e., `P < PEND'. */ | |
2599 | |
2600 static boolean | |
2601 at_endline_loc_p (p, pend, syntax) | |
2602 const char *p, *pend; | |
2603 int syntax; | |
2604 { | |
2605 const char *next = p; | |
2606 boolean next_backslash = *next == '\\'; | |
2607 const char *next_next = p + 1 < pend ? p + 1 : NULL; | |
2608 | |
2609 return | |
2610 /* Before a subexpression? */ | |
2611 (syntax & RE_NO_BK_PARENS ? *next == ')' | |
2612 : next_backslash && next_next && *next_next == ')') | |
2613 /* Before an alternative? */ | |
2614 || (syntax & RE_NO_BK_VBAR ? *next == '|' | |
2615 : next_backslash && next_next && *next_next == '|'); | |
2616 } | |
2617 | |
2618 | |
2619 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and | |
2620 false if it's not. */ | |
2621 | |
2622 static boolean | |
2623 group_in_compile_stack (compile_stack, regnum) | |
2624 compile_stack_type compile_stack; | |
2625 regnum_t regnum; | |
2626 { | |
2627 int this_element; | |
2628 | |
2629 for (this_element = compile_stack.avail - 1; | |
2630 this_element >= 0; | |
2631 this_element--) | |
2632 if (compile_stack.stack[this_element].regnum == regnum) | |
2633 return true; | |
2634 | |
2635 return false; | |
2636 } | |
2637 | |
2638 | |
2639 /* Read the ending character of a range (in a bracket expression) from the | |
2640 uncompiled pattern *P_PTR (which ends at PEND). We assume the | |
2641 starting character is in `P[-2]'. (`P[-1]' is the character `-'.) | |
2642 Then we set the translation of all bits between the starting and | |
2643 ending characters (inclusive) in the compiled pattern B. | |
2644 | |
2645 Return an error code. | |
2646 | |
2647 We use these short variable names so we can use the same macros as | |
2648 `regex_compile' itself. */ | |
2649 | |
2650 static reg_errcode_t | |
2651 compile_range (p_ptr, pend, translate, syntax, b) | |
2652 const char **p_ptr, *pend; | |
2653 char *translate; | |
2654 reg_syntax_t syntax; | |
2655 unsigned char *b; | |
2656 { | |
2657 unsigned this_char; | |
2658 | |
2659 const char *p = *p_ptr; | |
1689 | 2660 int range_start, range_end; |
1155 | 2661 |
2662 if (p == pend) | |
2663 return REG_ERANGE; | |
2664 | |
1689 | 2665 /* Even though the pattern is a signed `char *', we need to fetch |
2666 with unsigned char *'s; if the high bit of the pattern character | |
2667 is set, the range endpoints will be negative if we fetch using a | |
2668 signed char *. | |
2669 | |
2670 We also want to fetch the endpoints without translating them; the | |
2671 appropriate translation is done in the bit-setting loop below. */ | |
2672 range_start = ((unsigned char *) p)[-2]; | |
2673 range_end = ((unsigned char *) p)[0]; | |
1155 | 2674 |
2675 /* Have to increment the pointer into the pattern string, so the | |
2676 caller isn't still at the ending character. */ | |
2677 (*p_ptr)++; | |
2678 | |
2679 /* If the start is after the end, the range is empty. */ | |
2680 if (range_start > range_end) | |
2681 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; | |
2682 | |
2683 /* Here we see why `this_char' has to be larger than an `unsigned | |
2684 char' -- the range is inclusive, so if `range_end' == 0xff | |
2685 (assuming 8-bit characters), we would otherwise go into an infinite | |
2686 loop, since all characters <= 0xff. */ | |
2687 for (this_char = range_start; this_char <= range_end; this_char++) | |
2688 { | |
2689 SET_LIST_BIT (TRANSLATE (this_char)); | |
2690 } | |
2691 | |
2692 return REG_NOERROR; | |
2693 } | |
2694 | |
2695 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in | |
2696 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible | |
2697 characters can start a string that matches the pattern. This fastmap | |
2698 is used by re_search to skip quickly over impossible starting points. | |
2699 | |
2700 The caller must supply the address of a (1 << BYTEWIDTH)-byte data | |
2701 area as BUFP->fastmap. | |
2702 | |
2703 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in | |
2704 the pattern buffer. | |
2705 | |
2706 Returns 0 if we succeed, -2 if an internal error. */ | |
2707 | |
2708 int | |
2709 re_compile_fastmap (bufp) | |
2710 struct re_pattern_buffer *bufp; | |
2711 { | |
2712 int j, k; | |
2952 | 2713 #ifdef MATCH_MAY_ALLOCATE |
1155 | 2714 fail_stack_type fail_stack; |
2949 | 2715 #endif |
1155 | 2716 #ifndef REGEX_MALLOC |
2717 char *destination; | |
2718 #endif | |
2719 /* We don't push any register information onto the failure stack. */ | |
2720 unsigned num_regs = 0; | |
2721 | |
2722 register char *fastmap = bufp->fastmap; | |
2723 unsigned char *pattern = bufp->buffer; | |
2724 unsigned long size = bufp->used; | |
2725 const unsigned char *p = pattern; | |
2726 register unsigned char *pend = pattern + size; | |
2727 | |
2728 /* Assume that each path through the pattern can be null until | |
2729 proven otherwise. We set this false at the bottom of switch | |
2730 statement, to which we get only if a particular path doesn't | |
2731 match the empty string. */ | |
2732 boolean path_can_be_null = true; | |
2733 | |
2734 /* We aren't doing a `succeed_n' to begin with. */ | |
2735 boolean succeed_n_p = false; | |
2736 | |
2737 assert (fastmap != NULL && p != NULL); | |
2738 | |
2739 INIT_FAIL_STACK (); | |
2740 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */ | |
2741 bufp->fastmap_accurate = 1; /* It will be when we're done. */ | |
2742 bufp->can_be_null = 0; | |
2743 | |
2744 while (p != pend || !FAIL_STACK_EMPTY ()) | |
2745 { | |
2746 if (p == pend) | |
2747 { | |
2748 bufp->can_be_null |= path_can_be_null; | |
2749 | |
2750 /* Reset for next path. */ | |
2751 path_can_be_null = true; | |
2752 | |
2753 p = fail_stack.stack[--fail_stack.avail]; | |
2754 } | |
2755 | |
2756 /* We should never be about to go beyond the end of the pattern. */ | |
2757 assert (p < pend); | |
2758 | |
2759 #ifdef SWITCH_ENUM_BUG | |
2760 switch ((int) ((re_opcode_t) *p++)) | |
2761 #else | |
2762 switch ((re_opcode_t) *p++) | |
2763 #endif | |
2764 { | |
2765 | |
2766 /* I guess the idea here is to simply not bother with a fastmap | |
2767 if a backreference is used, since it's too hard to figure out | |
2768 the fastmap for the corresponding group. Setting | |
2769 `can_be_null' stops `re_search_2' from using the fastmap, so | |
2770 that is all we do. */ | |
2771 case duplicate: | |
2772 bufp->can_be_null = 1; | |
2773 return 0; | |
2774 | |
2775 | |
2776 /* Following are the cases which match a character. These end | |
2777 with `break'. */ | |
2778 | |
2779 case exactn: | |
2780 fastmap[p[1]] = 1; | |
2781 break; | |
2782 | |
2783 | |
2784 case charset: | |
2785 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) | |
2786 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) | |
2787 fastmap[j] = 1; | |
2788 break; | |
2789 | |
2790 | |
2791 case charset_not: | |
2792 /* Chars beyond end of map must be allowed. */ | |
2793 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++) | |
2794 fastmap[j] = 1; | |
2795 | |
2796 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) | |
2797 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) | |
2798 fastmap[j] = 1; | |
2799 break; | |
2800 | |
2801 | |
2802 case wordchar: | |
2803 for (j = 0; j < (1 << BYTEWIDTH); j++) | |
2804 if (SYNTAX (j) == Sword) | |
2805 fastmap[j] = 1; | |
2806 break; | |
2807 | |
2808 | |
2809 case notwordchar: | |
2810 for (j = 0; j < (1 << BYTEWIDTH); j++) | |
2811 if (SYNTAX (j) != Sword) | |
2812 fastmap[j] = 1; | |
2813 break; | |
2814 | |
2815 | |
2816 case anychar: | |
2817 /* `.' matches anything ... */ | |
2818 for (j = 0; j < (1 << BYTEWIDTH); j++) | |
2819 fastmap[j] = 1; | |
2820 | |
2821 /* ... except perhaps newline. */ | |
2822 if (!(bufp->syntax & RE_DOT_NEWLINE)) | |
2823 fastmap['\n'] = 0; | |
2824 | |
2825 /* Return if we have already set `can_be_null'; if we have, | |
2826 then the fastmap is irrelevant. Something's wrong here. */ | |
2827 else if (bufp->can_be_null) | |
2828 return 0; | |
2829 | |
2830 /* Otherwise, have to check alternative paths. */ | |
2831 break; | |
2832 | |
2833 | |
2834 #ifdef emacs | |
2835 case syntaxspec: | |
2836 k = *p++; | |
2837 for (j = 0; j < (1 << BYTEWIDTH); j++) | |
2838 if (SYNTAX (j) == (enum syntaxcode) k) | |
2839 fastmap[j] = 1; | |
2840 break; | |
2841 | |
2842 | |
2843 case notsyntaxspec: | |
2844 k = *p++; | |
2845 for (j = 0; j < (1 << BYTEWIDTH); j++) | |
2846 if (SYNTAX (j) != (enum syntaxcode) k) | |
2847 fastmap[j] = 1; | |
2848 break; | |
2849 | |
2850 | |
2851 /* All cases after this match the empty string. These end with | |
2852 `continue'. */ | |
2853 | |
2854 | |
2855 case before_dot: | |
2856 case at_dot: | |
2857 case after_dot: | |
2858 continue; | |
2859 #endif /* not emacs */ | |
2860 | |
2861 | |
2862 case no_op: | |
2863 case begline: | |
2864 case endline: | |
2865 case begbuf: | |
2866 case endbuf: | |
2867 case wordbound: | |
2868 case notwordbound: | |
2869 case wordbeg: | |
2870 case wordend: | |
2871 case push_dummy_failure: | |
2872 continue; | |
2873 | |
2874 | |
2875 case jump_n: | |
2876 case pop_failure_jump: | |
2877 case maybe_pop_jump: | |
2878 case jump: | |
2879 case jump_past_alt: | |
2880 case dummy_failure_jump: | |
2881 EXTRACT_NUMBER_AND_INCR (j, p); | |
2882 p += j; | |
2883 if (j > 0) | |
2884 continue; | |
2885 | |
2886 /* Jump backward implies we just went through the body of a | |
2887 loop and matched nothing. Opcode jumped to should be | |
2888 `on_failure_jump' or `succeed_n'. Just treat it like an | |
2889 ordinary jump. For a * loop, it has pushed its failure | |
2890 point already; if so, discard that as redundant. */ | |
2891 if ((re_opcode_t) *p != on_failure_jump | |
2892 && (re_opcode_t) *p != succeed_n) | |
2893 continue; | |
2894 | |
2895 p++; | |
2896 EXTRACT_NUMBER_AND_INCR (j, p); | |
2897 p += j; | |
2898 | |
2899 /* If what's on the stack is where we are now, pop it. */ | |
2900 if (!FAIL_STACK_EMPTY () | |
2901 && fail_stack.stack[fail_stack.avail - 1] == p) | |
2902 fail_stack.avail--; | |
2903 | |
2904 continue; | |
2905 | |
2906 | |
2907 case on_failure_jump: | |
2908 case on_failure_keep_string_jump: | |
2909 handle_on_failure_jump: | |
2910 EXTRACT_NUMBER_AND_INCR (j, p); | |
2911 | |
2912 /* For some patterns, e.g., `(a?)?', `p+j' here points to the | |
2913 end of the pattern. We don't want to push such a point, | |
2914 since when we restore it above, entering the switch will | |
2915 increment `p' past the end of the pattern. We don't need | |
2916 to push such a point since we obviously won't find any more | |
2917 fastmap entries beyond `pend'. Such a pattern can match | |
2918 the null string, though. */ | |
2919 if (p + j < pend) | |
2920 { | |
2921 if (!PUSH_PATTERN_OP (p + j, fail_stack)) | |
2922 return -2; | |
2923 } | |
2924 else | |
2925 bufp->can_be_null = 1; | |
2926 | |
2927 if (succeed_n_p) | |
2928 { | |
2929 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */ | |
2930 succeed_n_p = false; | |
2931 } | |
2932 | |
2933 continue; | |
2934 | |
2935 | |
2936 case succeed_n: | |
2937 /* Get to the number of times to succeed. */ | |
2938 p += 2; | |
2939 | |
2940 /* Increment p past the n for when k != 0. */ | |
2941 EXTRACT_NUMBER_AND_INCR (k, p); | |
2942 if (k == 0) | |
2943 { | |
2944 p -= 4; | |
2945 succeed_n_p = true; /* Spaghetti code alert. */ | |
2946 goto handle_on_failure_jump; | |
2947 } | |
2948 continue; | |
2949 | |
2950 | |
2951 case set_number_at: | |
2952 p += 4; | |
2953 continue; | |
2954 | |
2955 | |
2956 case start_memory: | |
2957 case stop_memory: | |
2958 p += 2; | |
2959 continue; | |
2960 | |
2961 | |
2962 default: | |
2963 abort (); /* We have listed all the cases. */ | |
2964 } /* switch *p++ */ | |
2965 | |
2966 /* Getting here means we have found the possible starting | |
2967 characters for one path of the pattern -- and that the empty | |
2968 string does not match. We need not follow this path further. | |
2969 Instead, look at the next alternative (remembered on the | |
2970 stack), or quit if no more. The test at the top of the loop | |
2971 does these things. */ | |
2972 path_can_be_null = false; | |
2973 p = pend; | |
2974 } /* while p */ | |
2975 | |
2976 /* Set `can_be_null' for the last path (also the first path, if the | |
2977 pattern is empty). */ | |
2978 bufp->can_be_null |= path_can_be_null; | |
2979 return 0; | |
2980 } /* re_compile_fastmap */ | |
2981 | |
2982 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and | |
2983 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use | |
2984 this memory for recording register information. STARTS and ENDS | |
2985 must be allocated using the malloc library routine, and must each | |
2986 be at least NUM_REGS * sizeof (regoff_t) bytes long. | |
2987 | |
2988 If NUM_REGS == 0, then subsequent matches should allocate their own | |
2989 register data. | |
2990 | |
2991 Unless this function is called, the first search or match using | |
2992 PATTERN_BUFFER will allocate its own register data, without | |
2993 freeing the old data. */ | |
2994 | |
2995 void | |
2996 re_set_registers (bufp, regs, num_regs, starts, ends) | |
2997 struct re_pattern_buffer *bufp; | |
2998 struct re_registers *regs; | |
2999 unsigned num_regs; | |
3000 regoff_t *starts, *ends; | |
3001 { | |
3002 if (num_regs) | |
3003 { | |
3004 bufp->regs_allocated = REGS_REALLOCATE; | |
3005 regs->num_regs = num_regs; | |
3006 regs->start = starts; | |
3007 regs->end = ends; | |
3008 } | |
3009 else | |
3010 { | |
3011 bufp->regs_allocated = REGS_UNALLOCATED; | |
3012 regs->num_regs = 0; | |
3013 regs->start = regs->end = (regoff_t) 0; | |
3014 } | |
3015 } | |
3016 | |
3017 /* Searching routines. */ | |
3018 | |
3019 /* Like re_search_2, below, but only one string is specified, and | |
3020 doesn't let you say where to stop matching. */ | |
3021 | |
3022 int | |
3023 re_search (bufp, string, size, startpos, range, regs) | |
3024 struct re_pattern_buffer *bufp; | |
3025 const char *string; | |
3026 int size, startpos, range; | |
3027 struct re_registers *regs; | |
3028 { | |
3029 return re_search_2 (bufp, NULL, 0, string, size, startpos, range, | |
3030 regs, size); | |
3031 } | |
3032 | |
3033 | |
3034 /* Using the compiled pattern in BUFP->buffer, first tries to match the | |
3035 virtual concatenation of STRING1 and STRING2, starting first at index | |
3036 STARTPOS, then at STARTPOS + 1, and so on. | |
3037 | |
3038 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively. | |
3039 | |
3040 RANGE is how far to scan while trying to match. RANGE = 0 means try | |
3041 only at STARTPOS; in general, the last start tried is STARTPOS + | |
3042 RANGE. | |
3043 | |
3044 In REGS, return the indices of the virtual concatenation of STRING1 | |
3045 and STRING2 that matched the entire BUFP->buffer and its contained | |
3046 subexpressions. | |
3047 | |
3048 Do not consider matching one past the index STOP in the virtual | |
3049 concatenation of STRING1 and STRING2. | |
3050 | |
3051 We return either the position in the strings at which the match was | |
3052 found, -1 if no match, or -2 if error (such as failure | |
3053 stack overflow). */ | |
3054 | |
3055 int | |
3056 re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) | |
3057 struct re_pattern_buffer *bufp; | |
3058 const char *string1, *string2; | |
3059 int size1, size2; | |
3060 int startpos; | |
3061 int range; | |
3062 struct re_registers *regs; | |
3063 int stop; | |
3064 { | |
3065 int val; | |
3066 register char *fastmap = bufp->fastmap; | |
3067 register char *translate = bufp->translate; | |
3068 int total_size = size1 + size2; | |
3069 int endpos = startpos + range; | |
3070 | |
3071 /* Check for out-of-range STARTPOS. */ | |
3072 if (startpos < 0 || startpos > total_size) | |
3073 return -1; | |
3074 | |
3075 /* Fix up RANGE if it might eventually take us outside | |
3076 the virtual concatenation of STRING1 and STRING2. */ | |
3077 if (endpos < -1) | |
3078 range = -1 - startpos; | |
3079 else if (endpos > total_size) | |
3080 range = total_size - startpos; | |
3081 | |
3082 /* If the search isn't to be a backwards one, don't waste time in a | |
1637 | 3083 search for a pattern that must be anchored. */ |
3084 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0) | |
1155 | 3085 { |
3086 if (startpos > 0) | |
3087 return -1; | |
3088 else | |
3089 range = 1; | |
3090 } | |
3091 | |
1637 | 3092 /* Update the fastmap now if not correct already. */ |
3093 if (fastmap && !bufp->fastmap_accurate) | |
3094 if (re_compile_fastmap (bufp) == -2) | |
3095 return -2; | |
3096 | |
3097 /* Loop through the string, looking for a place to start matching. */ | |
1155 | 3098 for (;;) |
3099 { | |
3100 /* If a fastmap is supplied, skip quickly over characters that | |
3101 cannot be the start of a match. If the pattern can match the | |
3102 null string, however, we don't need to skip characters; we want | |
3103 the first null string. */ | |
3104 if (fastmap && startpos < total_size && !bufp->can_be_null) | |
3105 { | |
3106 if (range > 0) /* Searching forwards. */ | |
3107 { | |
3108 register const char *d; | |
3109 register int lim = 0; | |
3110 int irange = range; | |
3111 | |
3112 if (startpos < size1 && startpos + range >= size1) | |
3113 lim = range - (size1 - startpos); | |
3114 | |
3115 d = (startpos >= size1 ? string2 - size1 : string1) + startpos; | |
3116 | |
3117 /* Written out as an if-else to avoid testing `translate' | |
3118 inside the loop. */ | |
3119 if (translate) | |
3120 while (range > lim | |
2078 | 3121 && !fastmap[(unsigned char) |
3122 translate[(unsigned char) *d++]]) | |
1155 | 3123 range--; |
3124 else | |
3125 while (range > lim && !fastmap[(unsigned char) *d++]) | |
3126 range--; | |
3127 | |
3128 startpos += irange - range; | |
3129 } | |
3130 else /* Searching backwards. */ | |
3131 { | |
3132 register char c = (size1 == 0 || startpos >= size1 | |
3133 ? string2[startpos - size1] | |
3134 : string1[startpos]); | |
3135 | |
1637 | 3136 if (!fastmap[(unsigned char) TRANSLATE (c)]) |
1155 | 3137 goto advance; |
3138 } | |
3139 } | |
3140 | |
3141 /* If can't match the null string, and that's all we have left, fail. */ | |
3142 if (range >= 0 && startpos == total_size && fastmap | |
3143 && !bufp->can_be_null) | |
3144 return -1; | |
3145 | |
3146 val = re_match_2 (bufp, string1, size1, string2, size2, | |
3147 startpos, regs, stop); | |
3148 if (val >= 0) | |
3149 return startpos; | |
3150 | |
3151 if (val == -2) | |
3152 return -2; | |
3153 | |
3154 advance: | |
3155 if (!range) | |
3156 break; | |
3157 else if (range > 0) | |
3158 { | |
3159 range--; | |
3160 startpos++; | |
3161 } | |
3162 else | |
3163 { | |
3164 range++; | |
3165 startpos--; | |
3166 } | |
3167 } | |
3168 return -1; | |
3169 } /* re_search_2 */ | |
3170 | |
3171 /* Declarations and macros for re_match_2. */ | |
3172 | |
3173 static int bcmp_translate (); | |
3174 static boolean alt_match_null_string_p (), | |
3175 common_op_match_null_string_p (), | |
3176 group_match_null_string_p (); | |
3177 | |
3178 /* This converts PTR, a pointer into one of the search strings `string1' | |
3179 and `string2' into an offset from the beginning of that string. */ | |
3180 #define POINTER_TO_OFFSET(ptr) \ | |
3181 (FIRST_STRING_P (ptr) ? (ptr) - string1 : (ptr) - string2 + size1) | |
3182 | |
3183 /* Macros for dealing with the split strings in re_match_2. */ | |
3184 | |
3185 #define MATCHING_IN_FIRST_STRING (dend == end_match_1) | |
3186 | |
3187 /* Call before fetching a character with *d. This switches over to | |
3188 string2 if necessary. */ | |
3189 #define PREFETCH() \ | |
3190 while (d == dend) \ | |
3191 { \ | |
3192 /* End of string2 => fail. */ \ | |
3193 if (dend == end_match_2) \ | |
3194 goto fail; \ | |
3195 /* End of string1 => advance to string2. */ \ | |
3196 d = string2; \ | |
3197 dend = end_match_2; \ | |
3198 } | |
3199 | |
3200 | |
3201 /* Test if at very beginning or at very end of the virtual concatenation | |
3202 of `string1' and `string2'. If only one string, it's `string2'. */ | |
1637 | 3203 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2) |
3204 #define AT_STRINGS_END(d) ((d) == end2) | |
1155 | 3205 |
3206 | |
3207 /* Test if D points to a character which is word-constituent. We have | |
3208 two special cases to check for: if past the end of string1, look at | |
3209 the first character in string2; and if before the beginning of | |
1637 | 3210 string2, look at the last character in string1. */ |
3211 #define WORDCHAR_P(d) \ | |
1155 | 3212 (SYNTAX ((d) == end1 ? *string2 \ |
1637 | 3213 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \ |
3214 == Sword) | |
1155 | 3215 |
3216 /* Test if the character before D and the one at D differ with respect | |
3217 to being word-constituent. */ | |
3218 #define AT_WORD_BOUNDARY(d) \ | |
1637 | 3219 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \ |
3220 || WORDCHAR_P (d - 1) != WORDCHAR_P (d)) | |
1155 | 3221 |
3222 | |
3223 /* Free everything we malloc. */ | |
2952 | 3224 #ifdef MATCH_MAY_ALLOCATE |
1155 | 3225 #ifdef REGEX_MALLOC |
3226 #define FREE_VAR(var) if (var) free (var); var = NULL | |
3227 #define FREE_VARIABLES() \ | |
3228 do { \ | |
3229 FREE_VAR (fail_stack.stack); \ | |
3230 FREE_VAR (regstart); \ | |
3231 FREE_VAR (regend); \ | |
3232 FREE_VAR (old_regstart); \ | |
3233 FREE_VAR (old_regend); \ | |
3234 FREE_VAR (best_regstart); \ | |
3235 FREE_VAR (best_regend); \ | |
3236 FREE_VAR (reg_info); \ | |
3237 FREE_VAR (reg_dummy); \ | |
3238 FREE_VAR (reg_info_dummy); \ | |
3239 } while (0) | |
3240 #else /* not REGEX_MALLOC */ | |
3241 /* Some MIPS systems (at least) want this to free alloca'd storage. */ | |
3242 #define FREE_VARIABLES() alloca (0) | |
3243 #endif /* not REGEX_MALLOC */ | |
2952 | 3244 #else |
3245 #define FREE_VARIABLES() /* Do nothing! */ | |
3246 #endif /* not MATCH_MAY_ALLOCATE */ | |
1155 | 3247 |
3248 /* These values must meet several constraints. They must not be valid | |
3249 register values; since we have a limit of 255 registers (because | |
3250 we use only one byte in the pattern for the register number), we can | |
3251 use numbers larger than 255. They must differ by 1, because of | |
3252 NUM_FAILURE_ITEMS above. And the value for the lowest register must | |
3253 be larger than the value for the highest register, so we do not try | |
3254 to actually save any registers when none are active. */ | |
3255 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH) | |
3256 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1) | |
3257 | |
3258 /* Matching routines. */ | |
3259 | |
3260 #ifndef emacs /* Emacs never uses this. */ | |
3261 /* re_match is like re_match_2 except it takes only a single string. */ | |
3262 | |
3263 int | |
3264 re_match (bufp, string, size, pos, regs) | |
3265 struct re_pattern_buffer *bufp; | |
3266 const char *string; | |
3267 int size, pos; | |
3268 struct re_registers *regs; | |
3269 { | |
3270 return re_match_2 (bufp, NULL, 0, string, size, pos, regs, size); | |
3271 } | |
3272 #endif /* not emacs */ | |
3273 | |
3274 | |
3275 /* re_match_2 matches the compiled pattern in BUFP against the | |
3276 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 | |
3277 and SIZE2, respectively). We start matching at POS, and stop | |
3278 matching at STOP. | |
3279 | |
3280 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we | |
3281 store offsets for the substring each group matched in REGS. See the | |
3282 documentation for exactly how many groups we fill. | |
3283 | |
3284 We return -1 if no match, -2 if an internal error (such as the | |
3285 failure stack overflowing). Otherwise, we return the length of the | |
3286 matched substring. */ | |
3287 | |
3288 int | |
3289 re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop) | |
3290 struct re_pattern_buffer *bufp; | |
3291 const char *string1, *string2; | |
3292 int size1, size2; | |
3293 int pos; | |
3294 struct re_registers *regs; | |
3295 int stop; | |
3296 { | |
3297 /* General temporaries. */ | |
3298 int mcnt; | |
3299 unsigned char *p1; | |
3300 | |
3301 /* Just past the end of the corresponding string. */ | |
3302 const char *end1, *end2; | |
3303 | |
3304 /* Pointers into string1 and string2, just past the last characters in | |
3305 each to consider matching. */ | |
3306 const char *end_match_1, *end_match_2; | |
3307 | |
3308 /* Where we are in the data, and the end of the current string. */ | |
3309 const char *d, *dend; | |
3310 | |
3311 /* Where we are in the pattern, and the end of the pattern. */ | |
3312 unsigned char *p = bufp->buffer; | |
3313 register unsigned char *pend = p + bufp->used; | |
3314 | |
3315 /* We use this to map every character in the string. */ | |
3316 char *translate = bufp->translate; | |
3317 | |
3318 /* Failure point stack. Each place that can handle a failure further | |
3319 down the line pushes a failure point on this stack. It consists of | |
3320 restart, regend, and reg_info for all registers corresponding to | |
3321 the subexpressions we're currently inside, plus the number of such | |
3322 registers, and, finally, two char *'s. The first char * is where | |
3323 to resume scanning the pattern; the second one is where to resume | |
3324 scanning the strings. If the latter is zero, the failure point is | |
3325 a ``dummy''; if a failure happens and the failure point is a dummy, | |
3326 it gets discarded and the next next one is tried. */ | |
2952 | 3327 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */ |
1155 | 3328 fail_stack_type fail_stack; |
2949 | 3329 #endif |
1155 | 3330 #ifdef DEBUG |
3331 static unsigned failure_id = 0; | |
1637 | 3332 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0; |
1155 | 3333 #endif |
3334 | |
3335 /* We fill all the registers internally, independent of what we | |
3336 return, for use in backreferences. The number here includes | |
3337 an element for register zero. */ | |
3338 unsigned num_regs = bufp->re_nsub + 1; | |
3339 | |
3340 /* The currently active registers. */ | |
3341 unsigned lowest_active_reg = NO_LOWEST_ACTIVE_REG; | |
3342 unsigned highest_active_reg = NO_HIGHEST_ACTIVE_REG; | |
3343 | |
3344 /* Information on the contents of registers. These are pointers into | |
3345 the input strings; they record just what was matched (on this | |
3346 attempt) by a subexpression part of the pattern, that is, the | |
3347 regnum-th regstart pointer points to where in the pattern we began | |
3348 matching and the regnum-th regend points to right after where we | |
3349 stopped matching the regnum-th subexpression. (The zeroth register | |
3350 keeps track of what the whole pattern matches.) */ | |
2952 | 3351 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ |
1155 | 3352 const char **regstart, **regend; |
2949 | 3353 #endif |
1155 | 3354 |
3355 /* If a group that's operated upon by a repetition operator fails to | |
3356 match anything, then the register for its start will need to be | |
3357 restored because it will have been set to wherever in the string we | |
3358 are when we last see its open-group operator. Similarly for a | |
3359 register's end. */ | |
2952 | 3360 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ |
1155 | 3361 const char **old_regstart, **old_regend; |
2949 | 3362 #endif |
1155 | 3363 |
3364 /* The is_active field of reg_info helps us keep track of which (possibly | |
3365 nested) subexpressions we are currently in. The matched_something | |
3366 field of reg_info[reg_num] helps us tell whether or not we have | |
3367 matched any of the pattern so far this time through the reg_num-th | |
3368 subexpression. These two fields get reset each time through any | |
3369 loop their register is in. */ | |
2952 | 3370 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */ |
1155 | 3371 register_info_type *reg_info; |
2949 | 3372 #endif |
1155 | 3373 |
3374 /* The following record the register info as found in the above | |
3375 variables when we find a match better than any we've seen before. | |
3376 This happens as we backtrack through the failure points, which in | |
3377 turn happens only if we have not yet matched the entire string. */ | |
3378 unsigned best_regs_set = false; | |
2952 | 3379 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ |
1155 | 3380 const char **best_regstart, **best_regend; |
2949 | 3381 #endif |
1155 | 3382 |
3383 /* Logically, this is `best_regend[0]'. But we don't want to have to | |
3384 allocate space for that if we're not allocating space for anything | |
3385 else (see below). Also, we never need info about register 0 for | |
3386 any of the other register vectors, and it seems rather a kludge to | |
3387 treat `best_regend' differently than the rest. So we keep track of | |
3388 the end of the best match so far in a separate variable. We | |
3389 initialize this to NULL so that when we backtrack the first time | |
3390 and need to test it, it's not garbage. */ | |
3391 const char *match_end = NULL; | |
3392 | |
3393 /* Used when we pop values we don't care about. */ | |
2952 | 3394 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ |
1155 | 3395 const char **reg_dummy; |
3396 register_info_type *reg_info_dummy; | |
2949 | 3397 #endif |
1155 | 3398 |
3399 #ifdef DEBUG | |
3400 /* Counts the total number of registers pushed. */ | |
3401 unsigned num_regs_pushed = 0; | |
3402 #endif | |
3403 | |
3404 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n"); | |
3405 | |
3406 INIT_FAIL_STACK (); | |
3407 | |
2952 | 3408 #ifdef MATCH_MAY_ALLOCATE |
1155 | 3409 /* Do not bother to initialize all the register variables if there are |
3410 no groups in the pattern, as it takes a fair amount of time. If | |
3411 there are groups, we include space for register 0 (the whole | |
3412 pattern), even though we never use it, since it simplifies the | |
3413 array indexing. We should fix this. */ | |
3414 if (bufp->re_nsub) | |
3415 { | |
3416 regstart = REGEX_TALLOC (num_regs, const char *); | |
3417 regend = REGEX_TALLOC (num_regs, const char *); | |
3418 old_regstart = REGEX_TALLOC (num_regs, const char *); | |
3419 old_regend = REGEX_TALLOC (num_regs, const char *); | |
3420 best_regstart = REGEX_TALLOC (num_regs, const char *); | |
3421 best_regend = REGEX_TALLOC (num_regs, const char *); | |
3422 reg_info = REGEX_TALLOC (num_regs, register_info_type); | |
3423 reg_dummy = REGEX_TALLOC (num_regs, const char *); | |
3424 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type); | |
3425 | |
3426 if (!(regstart && regend && old_regstart && old_regend && reg_info | |
3427 && best_regstart && best_regend && reg_dummy && reg_info_dummy)) | |
3428 { | |
3429 FREE_VARIABLES (); | |
3430 return -2; | |
3431 } | |
3432 } | |
2949 | 3433 #if defined (REGEX_MALLOC) |
1155 | 3434 else |
3435 { | |
3436 /* We must initialize all our variables to NULL, so that | |
1637 | 3437 `FREE_VARIABLES' doesn't try to free them. */ |
1155 | 3438 regstart = regend = old_regstart = old_regend = best_regstart |
3439 = best_regend = reg_dummy = NULL; | |
3440 reg_info = reg_info_dummy = (register_info_type *) NULL; | |
3441 } | |
3442 #endif /* REGEX_MALLOC */ | |
2952 | 3443 #endif /* MATCH_MAY_ALLOCATE */ |
1155 | 3444 |
3445 /* The starting position is bogus. */ | |
3446 if (pos < 0 || pos > size1 + size2) | |
3447 { | |
3448 FREE_VARIABLES (); | |
3449 return -1; | |
3450 } | |
3451 | |
3452 /* Initialize subexpression text positions to -1 to mark ones that no | |
3453 start_memory/stop_memory has been seen for. Also initialize the | |
3454 register information struct. */ | |
3455 for (mcnt = 1; mcnt < num_regs; mcnt++) | |
3456 { | |
3457 regstart[mcnt] = regend[mcnt] | |
3458 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE; | |
3459 | |
3460 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE; | |
3461 IS_ACTIVE (reg_info[mcnt]) = 0; | |
3462 MATCHED_SOMETHING (reg_info[mcnt]) = 0; | |
3463 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0; | |
3464 } | |
3465 | |
3466 /* We move `string1' into `string2' if the latter's empty -- but not if | |
3467 `string1' is null. */ | |
3468 if (size2 == 0 && string1 != NULL) | |
3469 { | |
3470 string2 = string1; | |
3471 size2 = size1; | |
3472 string1 = 0; | |
3473 size1 = 0; | |
3474 } | |
3475 end1 = string1 + size1; | |
3476 end2 = string2 + size2; | |
3477 | |
3478 /* Compute where to stop matching, within the two strings. */ | |
3479 if (stop <= size1) | |
3480 { | |
3481 end_match_1 = string1 + stop; | |
3482 end_match_2 = string2; | |
3483 } | |
3484 else | |
3485 { | |
3486 end_match_1 = end1; | |
3487 end_match_2 = string2 + stop - size1; | |
3488 } | |
3489 | |
3490 /* `p' scans through the pattern as `d' scans through the data. | |
3491 `dend' is the end of the input string that `d' points within. `d' | |
3492 is advanced into the following input string whenever necessary, but | |
3493 this happens before fetching; therefore, at the beginning of the | |
3494 loop, `d' can be pointing at the end of a string, but it cannot | |
3495 equal `string2'. */ | |
3496 if (size1 > 0 && pos <= size1) | |
3497 { | |
3498 d = string1 + pos; | |
3499 dend = end_match_1; | |
3500 } | |
3501 else | |
3502 { | |
3503 d = string2 + pos - size1; | |
3504 dend = end_match_2; | |
3505 } | |
3506 | |
3507 DEBUG_PRINT1 ("The compiled pattern is: "); | |
3508 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend); | |
3509 DEBUG_PRINT1 ("The string to match is: `"); | |
3510 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2); | |
3511 DEBUG_PRINT1 ("'\n"); | |
3512 | |
3513 /* This loops over pattern commands. It exits by returning from the | |
3514 function if the match is complete, or it drops through if the match | |
3515 fails at this starting point in the input data. */ | |
3516 for (;;) | |
3517 { | |
3518 DEBUG_PRINT2 ("\n0x%x: ", p); | |
3519 | |
3520 if (p == pend) | |
3521 { /* End of pattern means we might have succeeded. */ | |
1637 | 3522 DEBUG_PRINT1 ("end of pattern ... "); |
3523 | |
3524 /* If we haven't matched the entire string, and we want the | |
3525 longest match, try backtracking. */ | |
1155 | 3526 if (d != end_match_2) |
3527 { | |
3528 DEBUG_PRINT1 ("backtracking.\n"); | |
3529 | |
3530 if (!FAIL_STACK_EMPTY ()) | |
3531 { /* More failure points to try. */ | |
3532 boolean same_str_p = (FIRST_STRING_P (match_end) | |
3533 == MATCHING_IN_FIRST_STRING); | |
3534 | |
3535 /* If exceeds best match so far, save it. */ | |
3536 if (!best_regs_set | |
3537 || (same_str_p && d > match_end) | |
3538 || (!same_str_p && !MATCHING_IN_FIRST_STRING)) | |
3539 { | |
3540 best_regs_set = true; | |
3541 match_end = d; | |
3542 | |
3543 DEBUG_PRINT1 ("\nSAVING match as best so far.\n"); | |
3544 | |
3545 for (mcnt = 1; mcnt < num_regs; mcnt++) | |
3546 { | |
3547 best_regstart[mcnt] = regstart[mcnt]; | |
3548 best_regend[mcnt] = regend[mcnt]; | |
3549 } | |
3550 } | |
3551 goto fail; | |
3552 } | |
3553 | |
3554 /* If no failure points, don't restore garbage. */ | |
3555 else if (best_regs_set) | |
3556 { | |
3557 restore_best_regs: | |
3558 /* Restore best match. It may happen that `dend == | |
3559 end_match_1' while the restored d is in string2. | |
3560 For example, the pattern `x.*y.*z' against the | |
3561 strings `x-' and `y-z-', if the two strings are | |
3562 not consecutive in memory. */ | |
1637 | 3563 DEBUG_PRINT1 ("Restoring best registers.\n"); |
3564 | |
1155 | 3565 d = match_end; |
3566 dend = ((d >= string1 && d <= end1) | |
3567 ? end_match_1 : end_match_2); | |
3568 | |
3569 for (mcnt = 1; mcnt < num_regs; mcnt++) | |
3570 { | |
3571 regstart[mcnt] = best_regstart[mcnt]; | |
3572 regend[mcnt] = best_regend[mcnt]; | |
3573 } | |
3574 } | |
3575 } /* d != end_match_2 */ | |
3576 | |
1637 | 3577 DEBUG_PRINT1 ("Accepting match.\n"); |
1155 | 3578 |
3579 /* If caller wants register contents data back, do it. */ | |
3580 if (regs && !bufp->no_sub) | |
3581 { | |
3582 /* Have the register data arrays been allocated? */ | |
3583 if (bufp->regs_allocated == REGS_UNALLOCATED) | |
3584 { /* No. So allocate them with malloc. We need one | |
3585 extra element beyond `num_regs' for the `-1' marker | |
3586 GNU code uses. */ | |
3587 regs->num_regs = MAX (RE_NREGS, num_regs + 1); | |
3588 regs->start = TALLOC (regs->num_regs, regoff_t); | |
3589 regs->end = TALLOC (regs->num_regs, regoff_t); | |
3590 if (regs->start == NULL || regs->end == NULL) | |
3591 return -2; | |
3592 bufp->regs_allocated = REGS_REALLOCATE; | |
3593 } | |
3594 else if (bufp->regs_allocated == REGS_REALLOCATE) | |
3595 { /* Yes. If we need more elements than were already | |
3596 allocated, reallocate them. If we need fewer, just | |
3597 leave it alone. */ | |
3598 if (regs->num_regs < num_regs + 1) | |
3599 { | |
3600 regs->num_regs = num_regs + 1; | |
3601 RETALLOC (regs->start, regs->num_regs, regoff_t); | |
3602 RETALLOC (regs->end, regs->num_regs, regoff_t); | |
3603 if (regs->start == NULL || regs->end == NULL) | |
3604 return -2; | |
3605 } | |
3606 } | |
3607 else | |
2465 | 3608 { |
3609 /* These braces fend off a "empty body in an else-statement" | |
3610 warning under GCC when assert expands to nothing. */ | |
3611 assert (bufp->regs_allocated == REGS_FIXED); | |
3612 } | |
1155 | 3613 |
3614 /* Convert the pointer data in `regstart' and `regend' to | |
3615 indices. Register zero has to be set differently, | |
3616 since we haven't kept track of any info for it. */ | |
3617 if (regs->num_regs > 0) | |
3618 { | |
3619 regs->start[0] = pos; | |
3620 regs->end[0] = (MATCHING_IN_FIRST_STRING ? d - string1 | |
3621 : d - string2 + size1); | |
3622 } | |
3623 | |
3624 /* Go through the first `min (num_regs, regs->num_regs)' | |
3625 registers, since that is all we initialized. */ | |
3626 for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++) | |
3627 { | |
3628 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt])) | |
3629 regs->start[mcnt] = regs->end[mcnt] = -1; | |
3630 else | |
3631 { | |
3632 regs->start[mcnt] = POINTER_TO_OFFSET (regstart[mcnt]); | |
3633 regs->end[mcnt] = POINTER_TO_OFFSET (regend[mcnt]); | |
3634 } | |
3635 } | |
3636 | |
3637 /* If the regs structure we return has more elements than | |
3638 were in the pattern, set the extra elements to -1. If | |
3639 we (re)allocated the registers, this is the case, | |
3640 because we always allocate enough to have at least one | |
3641 -1 at the end. */ | |
3642 for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++) | |
3643 regs->start[mcnt] = regs->end[mcnt] = -1; | |
3644 } /* regs && !bufp->no_sub */ | |
3645 | |
3646 FREE_VARIABLES (); | |
1637 | 3647 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n", |
3648 nfailure_points_pushed, nfailure_points_popped, | |
3649 nfailure_points_pushed - nfailure_points_popped); | |
3650 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed); | |
1155 | 3651 |
3652 mcnt = d - pos - (MATCHING_IN_FIRST_STRING | |
3653 ? string1 | |
3654 : string2 - size1); | |
3655 | |
3656 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt); | |
3657 | |
3658 return mcnt; | |
3659 } | |
3660 | |
3661 /* Otherwise match next pattern command. */ | |
3662 #ifdef SWITCH_ENUM_BUG | |
3663 switch ((int) ((re_opcode_t) *p++)) | |
3664 #else | |
3665 switch ((re_opcode_t) *p++) | |
3666 #endif | |
3667 { | |
3668 /* Ignore these. Used to ignore the n of succeed_n's which | |
3669 currently have n == 0. */ | |
3670 case no_op: | |
3671 DEBUG_PRINT1 ("EXECUTING no_op.\n"); | |
3672 break; | |
3673 | |
3674 | |
3675 /* Match the next n pattern characters exactly. The following | |
3676 byte in the pattern defines n, and the n bytes after that | |
3677 are the characters to match. */ | |
3678 case exactn: | |
3679 mcnt = *p++; | |
3680 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt); | |
3681 | |
3682 /* This is written out as an if-else so we don't waste time | |
3683 testing `translate' inside the loop. */ | |
3684 if (translate) | |
3685 { | |
3686 do | |
3687 { | |
3688 PREFETCH (); | |
3689 if (translate[(unsigned char) *d++] != (char) *p++) | |
3690 goto fail; | |
3691 } | |
3692 while (--mcnt); | |
3693 } | |
3694 else | |
3695 { | |
3696 do | |
3697 { | |
3698 PREFETCH (); | |
3699 if (*d++ != (char) *p++) goto fail; | |
3700 } | |
3701 while (--mcnt); | |
3702 } | |
3703 SET_REGS_MATCHED (); | |
3704 break; | |
3705 | |
3706 | |
3707 /* Match any character except possibly a newline or a null. */ | |
3708 case anychar: | |
3709 DEBUG_PRINT1 ("EXECUTING anychar.\n"); | |
3710 | |
3711 PREFETCH (); | |
3712 | |
3713 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n') | |
3714 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000')) | |
3715 goto fail; | |
3716 | |
3717 SET_REGS_MATCHED (); | |
3718 DEBUG_PRINT2 (" Matched `%d'.\n", *d); | |
3719 d++; | |
3720 break; | |
3721 | |
3722 | |
3723 case charset: | |
3724 case charset_not: | |
3725 { | |
3726 register unsigned char c; | |
3727 boolean not = (re_opcode_t) *(p - 1) == charset_not; | |
3728 | |
3729 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : ""); | |
3730 | |
3731 PREFETCH (); | |
3732 c = TRANSLATE (*d); /* The character to match. */ | |
3733 | |
3734 /* Cast to `unsigned' instead of `unsigned char' in case the | |
3735 bit list is a full 32 bytes long. */ | |
3736 if (c < (unsigned) (*p * BYTEWIDTH) | |
3737 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) | |
3738 not = !not; | |
3739 | |
3740 p += 1 + *p; | |
3741 | |
3742 if (!not) goto fail; | |
3743 | |
3744 SET_REGS_MATCHED (); | |
3745 d++; | |
3746 break; | |
3747 } | |
3748 | |
3749 | |
3750 /* The beginning of a group is represented by start_memory. | |
3751 The arguments are the register number in the next byte, and the | |
3752 number of groups inner to this one in the next. The text | |
3753 matched within the group is recorded (in the internal | |
3754 registers data structure) under the register number. */ | |
3755 case start_memory: | |
3756 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]); | |
3757 | |
3758 /* Find out if this group can match the empty string. */ | |
3759 p1 = p; /* To send to group_match_null_string_p. */ | |
3760 | |
3761 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE) | |
3762 REG_MATCH_NULL_STRING_P (reg_info[*p]) | |
3763 = group_match_null_string_p (&p1, pend, reg_info); | |
3764 | |
3765 /* Save the position in the string where we were the last time | |
3766 we were at this open-group operator in case the group is | |
3767 operated upon by a repetition operator, e.g., with `(a*)*b' | |
3768 against `ab'; then we want to ignore where we are now in | |
3769 the string in case this attempt to match fails. */ | |
3770 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) | |
3771 ? REG_UNSET (regstart[*p]) ? d : regstart[*p] | |
3772 : regstart[*p]; | |
3773 DEBUG_PRINT2 (" old_regstart: %d\n", | |
3774 POINTER_TO_OFFSET (old_regstart[*p])); | |
3775 | |
3776 regstart[*p] = d; | |
3777 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p])); | |
3778 | |
3779 IS_ACTIVE (reg_info[*p]) = 1; | |
3780 MATCHED_SOMETHING (reg_info[*p]) = 0; | |
3781 | |
3782 /* This is the new highest active register. */ | |
3783 highest_active_reg = *p; | |
3784 | |
3785 /* If nothing was active before, this is the new lowest active | |
3786 register. */ | |
3787 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) | |
3788 lowest_active_reg = *p; | |
3789 | |
3790 /* Move past the register number and inner group count. */ | |
3791 p += 2; | |
3792 break; | |
3793 | |
3794 | |
3795 /* The stop_memory opcode represents the end of a group. Its | |
3796 arguments are the same as start_memory's: the register | |
3797 number, and the number of inner groups. */ | |
3798 case stop_memory: | |
3799 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]); | |
3800 | |
3801 /* We need to save the string position the last time we were at | |
3802 this close-group operator in case the group is operated | |
3803 upon by a repetition operator, e.g., with `((a*)*(b*)*)*' | |
3804 against `aba'; then we want to ignore where we are now in | |
3805 the string in case this attempt to match fails. */ | |
3806 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) | |
3807 ? REG_UNSET (regend[*p]) ? d : regend[*p] | |
3808 : regend[*p]; | |
3809 DEBUG_PRINT2 (" old_regend: %d\n", | |
3810 POINTER_TO_OFFSET (old_regend[*p])); | |
3811 | |
3812 regend[*p] = d; | |
3813 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p])); | |
3814 | |
3815 /* This register isn't active anymore. */ | |
3816 IS_ACTIVE (reg_info[*p]) = 0; | |
3817 | |
3818 /* If this was the only register active, nothing is active | |
3819 anymore. */ | |
3820 if (lowest_active_reg == highest_active_reg) | |
3821 { | |
3822 lowest_active_reg = NO_LOWEST_ACTIVE_REG; | |
3823 highest_active_reg = NO_HIGHEST_ACTIVE_REG; | |
3824 } | |
3825 else | |
3826 { /* We must scan for the new highest active register, since | |
3827 it isn't necessarily one less than now: consider | |
3828 (a(b)c(d(e)f)g). When group 3 ends, after the f), the | |
3829 new highest active register is 1. */ | |
3830 unsigned char r = *p - 1; | |
3831 while (r > 0 && !IS_ACTIVE (reg_info[r])) | |
3832 r--; | |
3833 | |
3834 /* If we end up at register zero, that means that we saved | |
3835 the registers as the result of an `on_failure_jump', not | |
3836 a `start_memory', and we jumped to past the innermost | |
3837 `stop_memory'. For example, in ((.)*) we save | |
3838 registers 1 and 2 as a result of the *, but when we pop | |
3839 back to the second ), we are at the stop_memory 1. | |
3840 Thus, nothing is active. */ | |
3841 if (r == 0) | |
3842 { | |
3843 lowest_active_reg = NO_LOWEST_ACTIVE_REG; | |
3844 highest_active_reg = NO_HIGHEST_ACTIVE_REG; | |
3845 } | |
3846 else | |
3847 highest_active_reg = r; | |
3848 } | |
3849 | |
3850 /* If just failed to match something this time around with a | |
3851 group that's operated on by a repetition operator, try to | |
1637 | 3852 force exit from the ``loop'', and restore the register |
1155 | 3853 information for this group that we had before trying this |
3854 last match. */ | |
3855 if ((!MATCHED_SOMETHING (reg_info[*p]) | |
3856 || (re_opcode_t) p[-3] == start_memory) | |
3857 && (p + 2) < pend) | |
3858 { | |
3859 boolean is_a_jump_n = false; | |
3860 | |
3861 p1 = p + 2; | |
3862 mcnt = 0; | |
3863 switch ((re_opcode_t) *p1++) | |
3864 { | |
3865 case jump_n: | |
3866 is_a_jump_n = true; | |
3867 case pop_failure_jump: | |
3868 case maybe_pop_jump: | |
3869 case jump: | |
3870 case dummy_failure_jump: | |
3871 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
3872 if (is_a_jump_n) | |
3873 p1 += 2; | |
3874 break; | |
3875 | |
3876 default: | |
3877 /* do nothing */ ; | |
3878 } | |
3879 p1 += mcnt; | |
3880 | |
3881 /* If the next operation is a jump backwards in the pattern | |
3882 to an on_failure_jump right before the start_memory | |
3883 corresponding to this stop_memory, exit from the loop | |
3884 by forcing a failure after pushing on the stack the | |
3885 on_failure_jump's jump in the pattern, and d. */ | |
3886 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump | |
3887 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p) | |
3888 { | |
3889 /* If this group ever matched anything, then restore | |
3890 what its registers were before trying this last | |
3891 failed match, e.g., with `(a*)*b' against `ab' for | |
3892 regstart[1], and, e.g., with `((a*)*(b*)*)*' | |
3893 against `aba' for regend[3]. | |
3894 | |
3895 Also restore the registers for inner groups for, | |
3896 e.g., `((a*)(b*))*' against `aba' (register 3 would | |
3897 otherwise get trashed). */ | |
3898 | |
3899 if (EVER_MATCHED_SOMETHING (reg_info[*p])) | |
3900 { | |
3901 unsigned r; | |
3902 | |
3903 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0; | |
3904 | |
3905 /* Restore this and inner groups' (if any) registers. */ | |
3906 for (r = *p; r < *p + *(p + 1); r++) | |
3907 { | |
3908 regstart[r] = old_regstart[r]; | |
3909 | |
3910 /* xx why this test? */ | |
3911 if ((int) old_regend[r] >= (int) regstart[r]) | |
3912 regend[r] = old_regend[r]; | |
3913 } | |
3914 } | |
3915 p1++; | |
3916 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
3917 PUSH_FAILURE_POINT (p1 + mcnt, d, -2); | |
3918 | |
3919 goto fail; | |
3920 } | |
3921 } | |
3922 | |
3923 /* Move past the register number and the inner group count. */ | |
3924 p += 2; | |
3925 break; | |
3926 | |
3927 | |
3928 /* \<digit> has been turned into a `duplicate' command which is | |
3929 followed by the numeric value of <digit> as the register number. */ | |
3930 case duplicate: | |
3931 { | |
3932 register const char *d2, *dend2; | |
3933 int regno = *p++; /* Get which register to match against. */ | |
3934 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno); | |
3935 | |
3936 /* Can't back reference a group which we've never matched. */ | |
3937 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno])) | |
3938 goto fail; | |
3939 | |
3940 /* Where in input to try to start matching. */ | |
3941 d2 = regstart[regno]; | |
3942 | |
3943 /* Where to stop matching; if both the place to start and | |
3944 the place to stop matching are in the same string, then | |
3945 set to the place to stop, otherwise, for now have to use | |
3946 the end of the first string. */ | |
3947 | |
3948 dend2 = ((FIRST_STRING_P (regstart[regno]) | |
3949 == FIRST_STRING_P (regend[regno])) | |
3950 ? regend[regno] : end_match_1); | |
3951 for (;;) | |
3952 { | |
3953 /* If necessary, advance to next segment in register | |
3954 contents. */ | |
3955 while (d2 == dend2) | |
3956 { | |
3957 if (dend2 == end_match_2) break; | |
3958 if (dend2 == regend[regno]) break; | |
3959 | |
3960 /* End of string1 => advance to string2. */ | |
3961 d2 = string2; | |
3962 dend2 = regend[regno]; | |
3963 } | |
3964 /* At end of register contents => success */ | |
3965 if (d2 == dend2) break; | |
3966 | |
3967 /* If necessary, advance to next segment in data. */ | |
3968 PREFETCH (); | |
3969 | |
3970 /* How many characters left in this segment to match. */ | |
3971 mcnt = dend - d; | |
3972 | |
3973 /* Want how many consecutive characters we can match in | |
3974 one shot, so, if necessary, adjust the count. */ | |
3975 if (mcnt > dend2 - d2) | |
3976 mcnt = dend2 - d2; | |
3977 | |
3978 /* Compare that many; failure if mismatch, else move | |
3979 past them. */ | |
3980 if (translate | |
3981 ? bcmp_translate (d, d2, mcnt, translate) | |
3982 : bcmp (d, d2, mcnt)) | |
3983 goto fail; | |
3984 d += mcnt, d2 += mcnt; | |
3985 } | |
3986 } | |
3987 break; | |
3988 | |
3989 | |
3990 /* begline matches the empty string at the beginning of the string | |
3991 (unless `not_bol' is set in `bufp'), and, if | |
3992 `newline_anchor' is set, after newlines. */ | |
3993 case begline: | |
3994 DEBUG_PRINT1 ("EXECUTING begline.\n"); | |
3995 | |
1637 | 3996 if (AT_STRINGS_BEG (d)) |
1155 | 3997 { |
3998 if (!bufp->not_bol) break; | |
3999 } | |
4000 else if (d[-1] == '\n' && bufp->newline_anchor) | |
4001 { | |
4002 break; | |
4003 } | |
4004 /* In all other cases, we fail. */ | |
4005 goto fail; | |
4006 | |
4007 | |
4008 /* endline is the dual of begline. */ | |
4009 case endline: | |
4010 DEBUG_PRINT1 ("EXECUTING endline.\n"); | |
4011 | |
1637 | 4012 if (AT_STRINGS_END (d)) |
1155 | 4013 { |
4014 if (!bufp->not_eol) break; | |
4015 } | |
4016 | |
4017 /* We have to ``prefetch'' the next character. */ | |
4018 else if ((d == end1 ? *string2 : *d) == '\n' | |
4019 && bufp->newline_anchor) | |
4020 { | |
4021 break; | |
4022 } | |
4023 goto fail; | |
4024 | |
4025 | |
4026 /* Match at the very beginning of the data. */ | |
4027 case begbuf: | |
4028 DEBUG_PRINT1 ("EXECUTING begbuf.\n"); | |
1637 | 4029 if (AT_STRINGS_BEG (d)) |
1155 | 4030 break; |
4031 goto fail; | |
4032 | |
4033 | |
4034 /* Match at the very end of the data. */ | |
4035 case endbuf: | |
4036 DEBUG_PRINT1 ("EXECUTING endbuf.\n"); | |
1637 | 4037 if (AT_STRINGS_END (d)) |
1155 | 4038 break; |
4039 goto fail; | |
4040 | |
4041 | |
4042 /* on_failure_keep_string_jump is used to optimize `.*\n'. It | |
4043 pushes NULL as the value for the string on the stack. Then | |
4044 `pop_failure_point' will keep the current value for the | |
4045 string, instead of restoring it. To see why, consider | |
4046 matching `foo\nbar' against `.*\n'. The .* matches the foo; | |
4047 then the . fails against the \n. But the next thing we want | |
4048 to do is match the \n against the \n; if we restored the | |
4049 string value, we would be back at the foo. | |
4050 | |
4051 Because this is used only in specific cases, we don't need to | |
4052 check all the things that `on_failure_jump' does, to make | |
4053 sure the right things get saved on the stack. Hence we don't | |
4054 share its code. The only reason to push anything on the | |
4055 stack at all is that otherwise we would have to change | |
4056 `anychar's code to do something besides goto fail in this | |
4057 case; that seems worse than this. */ | |
4058 case on_failure_keep_string_jump: | |
4059 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump"); | |
4060 | |
4061 EXTRACT_NUMBER_AND_INCR (mcnt, p); | |
4062 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt); | |
4063 | |
4064 PUSH_FAILURE_POINT (p + mcnt, NULL, -2); | |
4065 break; | |
4066 | |
4067 | |
4068 /* Uses of on_failure_jump: | |
4069 | |
4070 Each alternative starts with an on_failure_jump that points | |
4071 to the beginning of the next alternative. Each alternative | |
4072 except the last ends with a jump that in effect jumps past | |
4073 the rest of the alternatives. (They really jump to the | |
4074 ending jump of the following alternative, because tensioning | |
4075 these jumps is a hassle.) | |
4076 | |
4077 Repeats start with an on_failure_jump that points past both | |
4078 the repetition text and either the following jump or | |
4079 pop_failure_jump back to this on_failure_jump. */ | |
4080 case on_failure_jump: | |
4081 on_failure: | |
4082 DEBUG_PRINT1 ("EXECUTING on_failure_jump"); | |
4083 | |
4084 EXTRACT_NUMBER_AND_INCR (mcnt, p); | |
4085 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt); | |
4086 | |
4087 /* If this on_failure_jump comes right before a group (i.e., | |
4088 the original * applied to a group), save the information | |
4089 for that group and all inner ones, so that if we fail back | |
4090 to this point, the group's information will be correct. | |
1637 | 4091 For example, in \(a*\)*\1, we need the preceding group, |
1155 | 4092 and in \(\(a*\)b*\)\2, we need the inner group. */ |
4093 | |
4094 /* We can't use `p' to check ahead because we push | |
4095 a failure point to `p + mcnt' after we do this. */ | |
4096 p1 = p; | |
4097 | |
4098 /* We need to skip no_op's before we look for the | |
4099 start_memory in case this on_failure_jump is happening as | |
4100 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1 | |
4101 against aba. */ | |
4102 while (p1 < pend && (re_opcode_t) *p1 == no_op) | |
4103 p1++; | |
4104 | |
4105 if (p1 < pend && (re_opcode_t) *p1 == start_memory) | |
4106 { | |
4107 /* We have a new highest active register now. This will | |
4108 get reset at the start_memory we are about to get to, | |
4109 but we will have saved all the registers relevant to | |
4110 this repetition op, as described above. */ | |
4111 highest_active_reg = *(p1 + 1) + *(p1 + 2); | |
4112 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) | |
4113 lowest_active_reg = *(p1 + 1); | |
4114 } | |
4115 | |
4116 DEBUG_PRINT1 (":\n"); | |
4117 PUSH_FAILURE_POINT (p + mcnt, d, -2); | |
4118 break; | |
4119 | |
4120 | |
1637 | 4121 /* A smart repeat ends with `maybe_pop_jump'. |
4122 We change it to either `pop_failure_jump' or `jump'. */ | |
1155 | 4123 case maybe_pop_jump: |
4124 EXTRACT_NUMBER_AND_INCR (mcnt, p); | |
4125 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt); | |
4126 { | |
4127 register unsigned char *p2 = p; | |
4128 | |
4129 /* Compare the beginning of the repeat with what in the | |
4130 pattern follows its end. If we can establish that there | |
4131 is nothing that they would both match, i.e., that we | |
4132 would have to backtrack because of (as in, e.g., `a*a') | |
4133 then we can change to pop_failure_jump, because we'll | |
4134 never have to backtrack. | |
4135 | |
4136 This is not true in the case of alternatives: in | |
4137 `(a|ab)*' we do need to backtrack to the `ab' alternative | |
4138 (e.g., if the string was `ab'). But instead of trying to | |
4139 detect that here, the alternative has put on a dummy | |
4140 failure point which is what we will end up popping. */ | |
4141 | |
3541
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4142 /* Skip over open/close-group commands. |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4143 If what follows this loop is a ...+ construct, |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4144 look at what begins its body, since we will have to |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4145 match at least one of that. */ |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4146 while (1) |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4147 { |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4148 if (p2 + 2 < pend |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4149 && ((re_opcode_t) *p2 == stop_memory |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4150 || (re_opcode_t) *p2 == start_memory)) |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4151 p2 += 3; |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4152 else if (p2 + 6 < pend |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4153 && (re_opcode_t) *p2 == dummy_failure_jump) |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4154 p2 += 6; |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4155 else |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4156 break; |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4157 } |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4158 |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4159 p1 = p + mcnt; |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4160 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4161 to the `maybe_finalize_jump' of this case. Examine what |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4162 follows. */ |
1155 | 4163 |
4164 /* If we're at the end of the pattern, we can change. */ | |
4165 if (p2 == pend) | |
1669 | 4166 { |
4167 /* Consider what happens when matching ":\(.*\)" | |
4168 against ":/". I don't really understand this code | |
4169 yet. */ | |
1155 | 4170 p[-3] = (unsigned char) pop_failure_jump; |
1669 | 4171 DEBUG_PRINT1 |
4172 (" End of pattern: change to `pop_failure_jump'.\n"); | |
1155 | 4173 } |
4174 | |
4175 else if ((re_opcode_t) *p2 == exactn | |
4176 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline)) | |
4177 { | |
4178 register unsigned char c | |
4179 = *p2 == (unsigned char) endline ? '\n' : p2[2]; | |
3541
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4180 |
1155 | 4181 if ((re_opcode_t) p1[3] == exactn && p1[5] != c) |
1637 | 4182 { |
4183 p[-3] = (unsigned char) pop_failure_jump; | |
4184 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n", | |
4185 c, p1[5]); | |
4186 } | |
4187 | |
1155 | 4188 else if ((re_opcode_t) p1[3] == charset |
4189 || (re_opcode_t) p1[3] == charset_not) | |
4190 { | |
4191 int not = (re_opcode_t) p1[3] == charset_not; | |
4192 | |
4193 if (c < (unsigned char) (p1[4] * BYTEWIDTH) | |
4194 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) | |
4195 not = !not; | |
4196 | |
4197 /* `not' is equal to 1 if c would match, which means | |
4198 that we can't change to pop_failure_jump. */ | |
4199 if (!not) | |
4200 { | |
4201 p[-3] = (unsigned char) pop_failure_jump; | |
1637 | 4202 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); |
1155 | 4203 } |
4204 } | |
4205 } | |
3541
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4206 else if ((re_opcode_t) *p2 == charset) |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4207 { |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4208 register unsigned char c |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4209 = *p2 == (unsigned char) endline ? '\n' : p2[2]; |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4210 |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4211 if ((re_opcode_t) p1[3] == exactn |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4212 && ! (p2[1] * BYTEWIDTH > p1[4] |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4213 && (p2[1 + p1[4] / BYTEWIDTH] |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4214 & (1 << (p1[4] % BYTEWIDTH))))) |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4215 { |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4216 p[-3] = (unsigned char) pop_failure_jump; |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4217 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n", |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4218 c, p1[5]); |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4219 } |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4220 |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4221 else if ((re_opcode_t) p1[3] == charset_not) |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4222 { |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4223 int idx; |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4224 /* We win if the charset_not inside the loop |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4225 lists every character listed in the charset after. */ |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4226 for (idx = 0; idx < p2[1]; idx++) |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4227 if (! (p2[2 + idx] == 0 |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4228 || (idx < p1[4] |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4229 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0)))) |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4230 break; |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4231 |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4232 if (idx == p2[1]) |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4233 { |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4234 p[-3] = (unsigned char) pop_failure_jump; |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4235 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4236 } |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4237 } |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4238 else if ((re_opcode_t) p1[3] == charset) |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4239 { |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4240 int idx; |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4241 /* We win if the charset inside the loop |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4242 has no overlap with the one after the loop. */ |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4243 for (idx = 0; idx < p2[1] && idx < p1[4]; idx++) |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4244 if ((p2[2 + idx] & p1[5 + idx]) != 0) |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4245 break; |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4246 |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4247 if (idx == p2[1] || idx == p1[4]) |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4248 { |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4249 p[-3] = (unsigned char) pop_failure_jump; |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4250 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4251 } |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4252 } |
cb4aa2f13edd
*** empty log message ***
Richard M. Stallman <rms@gnu.org>
parents:
2952
diff
changeset
|
4253 } |
1155 | 4254 } |
4255 p -= 2; /* Point at relative address again. */ | |
4256 if ((re_opcode_t) p[-1] != pop_failure_jump) | |
4257 { | |
4258 p[-1] = (unsigned char) jump; | |
1637 | 4259 DEBUG_PRINT1 (" Match => jump.\n"); |
1155 | 4260 goto unconditional_jump; |
4261 } | |
4262 /* Note fall through. */ | |
4263 | |
4264 | |
4265 /* The end of a simple repeat has a pop_failure_jump back to | |
4266 its matching on_failure_jump, where the latter will push a | |
4267 failure point. The pop_failure_jump takes off failure | |
4268 points put on by this pop_failure_jump's matching | |
4269 on_failure_jump; we got through the pattern to here from the | |
4270 matching on_failure_jump, so didn't fail. */ | |
4271 case pop_failure_jump: | |
4272 { | |
4273 /* We need to pass separate storage for the lowest and | |
4274 highest registers, even though we don't care about the | |
4275 actual values. Otherwise, we will restore only one | |
4276 register from the stack, since lowest will == highest in | |
4277 `pop_failure_point'. */ | |
4278 unsigned dummy_low_reg, dummy_high_reg; | |
4279 unsigned char *pdummy; | |
4280 const char *sdummy; | |
4281 | |
4282 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n"); | |
4283 POP_FAILURE_POINT (sdummy, pdummy, | |
4284 dummy_low_reg, dummy_high_reg, | |
4285 reg_dummy, reg_dummy, reg_info_dummy); | |
4286 } | |
4287 /* Note fall through. */ | |
4288 | |
4289 | |
4290 /* Unconditionally jump (without popping any failure points). */ | |
4291 case jump: | |
4292 unconditional_jump: | |
4293 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */ | |
4294 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt); | |
4295 p += mcnt; /* Do the jump. */ | |
4296 DEBUG_PRINT2 ("(to 0x%x).\n", p); | |
4297 break; | |
4298 | |
4299 | |
4300 /* We need this opcode so we can detect where alternatives end | |
4301 in `group_match_null_string_p' et al. */ | |
4302 case jump_past_alt: | |
4303 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n"); | |
4304 goto unconditional_jump; | |
4305 | |
4306 | |
4307 /* Normally, the on_failure_jump pushes a failure point, which | |
4308 then gets popped at pop_failure_jump. We will end up at | |
4309 pop_failure_jump, also, and with a pattern of, say, `a+', we | |
4310 are skipping over the on_failure_jump, so we have to push | |
4311 something meaningless for pop_failure_jump to pop. */ | |
4312 case dummy_failure_jump: | |
4313 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n"); | |
4314 /* It doesn't matter what we push for the string here. What | |
4315 the code at `fail' tests is the value for the pattern. */ | |
4316 PUSH_FAILURE_POINT (0, 0, -2); | |
4317 goto unconditional_jump; | |
4318 | |
4319 | |
4320 /* At the end of an alternative, we need to push a dummy failure | |
1637 | 4321 point in case we are followed by a `pop_failure_jump', because |
1155 | 4322 we don't want the failure point for the alternative to be |
4323 popped. For example, matching `(a|ab)*' against `aab' | |
4324 requires that we match the `ab' alternative. */ | |
4325 case push_dummy_failure: | |
4326 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n"); | |
4327 /* See comments just above at `dummy_failure_jump' about the | |
4328 two zeroes. */ | |
4329 PUSH_FAILURE_POINT (0, 0, -2); | |
4330 break; | |
4331 | |
4332 /* Have to succeed matching what follows at least n times. | |
4333 After that, handle like `on_failure_jump'. */ | |
4334 case succeed_n: | |
4335 EXTRACT_NUMBER (mcnt, p + 2); | |
4336 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt); | |
4337 | |
4338 assert (mcnt >= 0); | |
4339 /* Originally, this is how many times we HAVE to succeed. */ | |
4340 if (mcnt > 0) | |
4341 { | |
4342 mcnt--; | |
4343 p += 2; | |
4344 STORE_NUMBER_AND_INCR (p, mcnt); | |
4345 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p, mcnt); | |
4346 } | |
4347 else if (mcnt == 0) | |
4348 { | |
4349 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2); | |
4350 p[2] = (unsigned char) no_op; | |
4351 p[3] = (unsigned char) no_op; | |
4352 goto on_failure; | |
4353 } | |
4354 break; | |
4355 | |
4356 case jump_n: | |
4357 EXTRACT_NUMBER (mcnt, p + 2); | |
4358 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt); | |
4359 | |
4360 /* Originally, this is how many times we CAN jump. */ | |
4361 if (mcnt) | |
4362 { | |
4363 mcnt--; | |
4364 STORE_NUMBER (p + 2, mcnt); | |
4365 goto unconditional_jump; | |
4366 } | |
4367 /* If don't have to jump any more, skip over the rest of command. */ | |
4368 else | |
4369 p += 4; | |
4370 break; | |
4371 | |
4372 case set_number_at: | |
4373 { | |
4374 DEBUG_PRINT1 ("EXECUTING set_number_at.\n"); | |
4375 | |
4376 EXTRACT_NUMBER_AND_INCR (mcnt, p); | |
4377 p1 = p + mcnt; | |
4378 EXTRACT_NUMBER_AND_INCR (mcnt, p); | |
4379 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt); | |
4380 STORE_NUMBER (p1, mcnt); | |
4381 break; | |
4382 } | |
4383 | |
4384 case wordbound: | |
4385 DEBUG_PRINT1 ("EXECUTING wordbound.\n"); | |
4386 if (AT_WORD_BOUNDARY (d)) | |
4387 break; | |
4388 goto fail; | |
4389 | |
4390 case notwordbound: | |
4391 DEBUG_PRINT1 ("EXECUTING notwordbound.\n"); | |
4392 if (AT_WORD_BOUNDARY (d)) | |
4393 goto fail; | |
4394 break; | |
4395 | |
4396 case wordbeg: | |
4397 DEBUG_PRINT1 ("EXECUTING wordbeg.\n"); | |
1637 | 4398 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1))) |
1155 | 4399 break; |
4400 goto fail; | |
4401 | |
4402 case wordend: | |
4403 DEBUG_PRINT1 ("EXECUTING wordend.\n"); | |
1637 | 4404 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1) |
4405 && (!WORDCHAR_P (d) || AT_STRINGS_END (d))) | |
1155 | 4406 break; |
4407 goto fail; | |
4408 | |
4409 #ifdef emacs | |
4410 #ifdef emacs19 | |
4411 case before_dot: | |
4412 DEBUG_PRINT1 ("EXECUTING before_dot.\n"); | |
4413 if (PTR_CHAR_POS ((unsigned char *) d) >= point) | |
4414 goto fail; | |
4415 break; | |
4416 | |
4417 case at_dot: | |
4418 DEBUG_PRINT1 ("EXECUTING at_dot.\n"); | |
4419 if (PTR_CHAR_POS ((unsigned char *) d) != point) | |
4420 goto fail; | |
4421 break; | |
4422 | |
4423 case after_dot: | |
4424 DEBUG_PRINT1 ("EXECUTING after_dot.\n"); | |
4425 if (PTR_CHAR_POS ((unsigned char *) d) <= point) | |
4426 goto fail; | |
4427 break; | |
4428 #else /* not emacs19 */ | |
4429 case at_dot: | |
4430 DEBUG_PRINT1 ("EXECUTING at_dot.\n"); | |
4431 if (PTR_CHAR_POS ((unsigned char *) d) + 1 != point) | |
4432 goto fail; | |
4433 break; | |
4434 #endif /* not emacs19 */ | |
4435 | |
4436 case syntaxspec: | |
4437 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt); | |
4438 mcnt = *p++; | |
4439 goto matchsyntax; | |
4440 | |
4441 case wordchar: | |
1637 | 4442 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n"); |
1155 | 4443 mcnt = (int) Sword; |
4444 matchsyntax: | |
4445 PREFETCH (); | |
1637 | 4446 if (SYNTAX (*d++) != (enum syntaxcode) mcnt) |
4447 goto fail; | |
1155 | 4448 SET_REGS_MATCHED (); |
4449 break; | |
4450 | |
4451 case notsyntaxspec: | |
4452 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt); | |
4453 mcnt = *p++; | |
4454 goto matchnotsyntax; | |
4455 | |
4456 case notwordchar: | |
1637 | 4457 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n"); |
1155 | 4458 mcnt = (int) Sword; |
1637 | 4459 matchnotsyntax: |
1155 | 4460 PREFETCH (); |
1637 | 4461 if (SYNTAX (*d++) == (enum syntaxcode) mcnt) |
4462 goto fail; | |
1155 | 4463 SET_REGS_MATCHED (); |
4464 break; | |
4465 | |
4466 #else /* not emacs */ | |
4467 case wordchar: | |
4468 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n"); | |
4469 PREFETCH (); | |
1637 | 4470 if (!WORDCHAR_P (d)) |
1155 | 4471 goto fail; |
4472 SET_REGS_MATCHED (); | |
1637 | 4473 d++; |
1155 | 4474 break; |
4475 | |
4476 case notwordchar: | |
4477 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n"); | |
4478 PREFETCH (); | |
1637 | 4479 if (WORDCHAR_P (d)) |
1155 | 4480 goto fail; |
4481 SET_REGS_MATCHED (); | |
1637 | 4482 d++; |
1155 | 4483 break; |
4484 #endif /* not emacs */ | |
4485 | |
4486 default: | |
4487 abort (); | |
4488 } | |
4489 continue; /* Successfully executed one pattern command; keep going. */ | |
4490 | |
4491 | |
4492 /* We goto here if a matching operation fails. */ | |
4493 fail: | |
4494 if (!FAIL_STACK_EMPTY ()) | |
4495 { /* A restart point is known. Restore to that state. */ | |
4496 DEBUG_PRINT1 ("\nFAIL:\n"); | |
4497 POP_FAILURE_POINT (d, p, | |
4498 lowest_active_reg, highest_active_reg, | |
4499 regstart, regend, reg_info); | |
4500 | |
4501 /* If this failure point is a dummy, try the next one. */ | |
4502 if (!p) | |
4503 goto fail; | |
4504 | |
4505 /* If we failed to the end of the pattern, don't examine *p. */ | |
4506 assert (p <= pend); | |
4507 if (p < pend) | |
4508 { | |
4509 boolean is_a_jump_n = false; | |
4510 | |
4511 /* If failed to a backwards jump that's part of a repetition | |
4512 loop, need to pop this failure point and use the next one. */ | |
4513 switch ((re_opcode_t) *p) | |
4514 { | |
4515 case jump_n: | |
4516 is_a_jump_n = true; | |
4517 case maybe_pop_jump: | |
4518 case pop_failure_jump: | |
4519 case jump: | |
4520 p1 = p + 1; | |
4521 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
4522 p1 += mcnt; | |
4523 | |
4524 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n) | |
4525 || (!is_a_jump_n | |
4526 && (re_opcode_t) *p1 == on_failure_jump)) | |
4527 goto fail; | |
4528 break; | |
4529 default: | |
4530 /* do nothing */ ; | |
4531 } | |
4532 } | |
4533 | |
4534 if (d >= string1 && d <= end1) | |
4535 dend = end_match_1; | |
4536 } | |
4537 else | |
4538 break; /* Matching at this starting point really fails. */ | |
4539 } /* for (;;) */ | |
4540 | |
4541 if (best_regs_set) | |
4542 goto restore_best_regs; | |
4543 | |
4544 FREE_VARIABLES (); | |
4545 | |
4546 return -1; /* Failure to match. */ | |
4547 } /* re_match_2 */ | |
4548 | |
4549 /* Subroutine definitions for re_match_2. */ | |
4550 | |
4551 | |
4552 /* We are passed P pointing to a register number after a start_memory. | |
4553 | |
4554 Return true if the pattern up to the corresponding stop_memory can | |
4555 match the empty string, and false otherwise. | |
4556 | |
4557 If we find the matching stop_memory, sets P to point to one past its number. | |
4558 Otherwise, sets P to an undefined byte less than or equal to END. | |
4559 | |
4560 We don't handle duplicates properly (yet). */ | |
4561 | |
4562 static boolean | |
4563 group_match_null_string_p (p, end, reg_info) | |
4564 unsigned char **p, *end; | |
4565 register_info_type *reg_info; | |
4566 { | |
4567 int mcnt; | |
4568 /* Point to after the args to the start_memory. */ | |
4569 unsigned char *p1 = *p + 2; | |
4570 | |
4571 while (p1 < end) | |
4572 { | |
4573 /* Skip over opcodes that can match nothing, and return true or | |
4574 false, as appropriate, when we get to one that can't, or to the | |
4575 matching stop_memory. */ | |
4576 | |
4577 switch ((re_opcode_t) *p1) | |
4578 { | |
4579 /* Could be either a loop or a series of alternatives. */ | |
4580 case on_failure_jump: | |
4581 p1++; | |
4582 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
4583 | |
4584 /* If the next operation is not a jump backwards in the | |
4585 pattern. */ | |
4586 | |
4587 if (mcnt >= 0) | |
4588 { | |
4589 /* Go through the on_failure_jumps of the alternatives, | |
4590 seeing if any of the alternatives cannot match nothing. | |
4591 The last alternative starts with only a jump, | |
4592 whereas the rest start with on_failure_jump and end | |
4593 with a jump, e.g., here is the pattern for `a|b|c': | |
4594 | |
4595 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6 | |
4596 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3 | |
4597 /exactn/1/c | |
4598 | |
4599 So, we have to first go through the first (n-1) | |
4600 alternatives and then deal with the last one separately. */ | |
4601 | |
4602 | |
4603 /* Deal with the first (n-1) alternatives, which start | |
4604 with an on_failure_jump (see above) that jumps to right | |
4605 past a jump_past_alt. */ | |
4606 | |
4607 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt) | |
4608 { | |
4609 /* `mcnt' holds how many bytes long the alternative | |
4610 is, including the ending `jump_past_alt' and | |
4611 its number. */ | |
4612 | |
4613 if (!alt_match_null_string_p (p1, p1 + mcnt - 3, | |
4614 reg_info)) | |
4615 return false; | |
4616 | |
4617 /* Move to right after this alternative, including the | |
4618 jump_past_alt. */ | |
4619 p1 += mcnt; | |
4620 | |
4621 /* Break if it's the beginning of an n-th alternative | |
4622 that doesn't begin with an on_failure_jump. */ | |
4623 if ((re_opcode_t) *p1 != on_failure_jump) | |
4624 break; | |
4625 | |
4626 /* Still have to check that it's not an n-th | |
4627 alternative that starts with an on_failure_jump. */ | |
4628 p1++; | |
4629 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
4630 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt) | |
4631 { | |
4632 /* Get to the beginning of the n-th alternative. */ | |
4633 p1 -= 3; | |
4634 break; | |
4635 } | |
4636 } | |
4637 | |
4638 /* Deal with the last alternative: go back and get number | |
4639 of the `jump_past_alt' just before it. `mcnt' contains | |
4640 the length of the alternative. */ | |
4641 EXTRACT_NUMBER (mcnt, p1 - 2); | |
4642 | |
4643 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info)) | |
4644 return false; | |
4645 | |
4646 p1 += mcnt; /* Get past the n-th alternative. */ | |
4647 } /* if mcnt > 0 */ | |
4648 break; | |
4649 | |
4650 | |
4651 case stop_memory: | |
4652 assert (p1[1] == **p); | |
4653 *p = p1 + 2; | |
4654 return true; | |
4655 | |
4656 | |
4657 default: | |
4658 if (!common_op_match_null_string_p (&p1, end, reg_info)) | |
4659 return false; | |
4660 } | |
4661 } /* while p1 < end */ | |
4662 | |
4663 return false; | |
4664 } /* group_match_null_string_p */ | |
4665 | |
4666 | |
4667 /* Similar to group_match_null_string_p, but doesn't deal with alternatives: | |
4668 It expects P to be the first byte of a single alternative and END one | |
4669 byte past the last. The alternative can contain groups. */ | |
4670 | |
4671 static boolean | |
4672 alt_match_null_string_p (p, end, reg_info) | |
4673 unsigned char *p, *end; | |
4674 register_info_type *reg_info; | |
4675 { | |
4676 int mcnt; | |
4677 unsigned char *p1 = p; | |
4678 | |
4679 while (p1 < end) | |
4680 { | |
4681 /* Skip over opcodes that can match nothing, and break when we get | |
4682 to one that can't. */ | |
4683 | |
4684 switch ((re_opcode_t) *p1) | |
4685 { | |
4686 /* It's a loop. */ | |
4687 case on_failure_jump: | |
4688 p1++; | |
4689 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
4690 p1 += mcnt; | |
4691 break; | |
4692 | |
4693 default: | |
4694 if (!common_op_match_null_string_p (&p1, end, reg_info)) | |
4695 return false; | |
4696 } | |
4697 } /* while p1 < end */ | |
4698 | |
4699 return true; | |
4700 } /* alt_match_null_string_p */ | |
4701 | |
4702 | |
4703 /* Deals with the ops common to group_match_null_string_p and | |
4704 alt_match_null_string_p. | |
4705 | |
4706 Sets P to one after the op and its arguments, if any. */ | |
4707 | |
4708 static boolean | |
4709 common_op_match_null_string_p (p, end, reg_info) | |
4710 unsigned char **p, *end; | |
4711 register_info_type *reg_info; | |
4712 { | |
4713 int mcnt; | |
4714 boolean ret; | |
4715 int reg_no; | |
4716 unsigned char *p1 = *p; | |
4717 | |
4718 switch ((re_opcode_t) *p1++) | |
4719 { | |
4720 case no_op: | |
4721 case begline: | |
4722 case endline: | |
4723 case begbuf: | |
4724 case endbuf: | |
4725 case wordbeg: | |
4726 case wordend: | |
4727 case wordbound: | |
4728 case notwordbound: | |
4729 #ifdef emacs | |
4730 case before_dot: | |
4731 case at_dot: | |
4732 case after_dot: | |
4733 #endif | |
4734 break; | |
4735 | |
4736 case start_memory: | |
4737 reg_no = *p1; | |
4738 assert (reg_no > 0 && reg_no <= MAX_REGNUM); | |
4739 ret = group_match_null_string_p (&p1, end, reg_info); | |
4740 | |
4741 /* Have to set this here in case we're checking a group which | |
4742 contains a group and a back reference to it. */ | |
4743 | |
4744 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE) | |
4745 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret; | |
4746 | |
4747 if (!ret) | |
4748 return false; | |
4749 break; | |
4750 | |
4751 /* If this is an optimized succeed_n for zero times, make the jump. */ | |
4752 case jump: | |
4753 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
4754 if (mcnt >= 0) | |
4755 p1 += mcnt; | |
4756 else | |
4757 return false; | |
4758 break; | |
4759 | |
4760 case succeed_n: | |
4761 /* Get to the number of times to succeed. */ | |
4762 p1 += 2; | |
4763 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
4764 | |
4765 if (mcnt == 0) | |
4766 { | |
4767 p1 -= 4; | |
4768 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
4769 p1 += mcnt; | |
4770 } | |
4771 else | |
4772 return false; | |
4773 break; | |
4774 | |
4775 case duplicate: | |
4776 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1])) | |
4777 return false; | |
4778 break; | |
4779 | |
4780 case set_number_at: | |
4781 p1 += 4; | |
4782 | |
4783 default: | |
4784 /* All other opcodes mean we cannot match the empty string. */ | |
4785 return false; | |
4786 } | |
4787 | |
4788 *p = p1; | |
4789 return true; | |
4790 } /* common_op_match_null_string_p */ | |
4791 | |
4792 | |
4793 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN | |
4794 bytes; nonzero otherwise. */ | |
4795 | |
4796 static int | |
4797 bcmp_translate (s1, s2, len, translate) | |
4798 unsigned char *s1, *s2; | |
4799 register int len; | |
4800 char *translate; | |
4801 { | |
4802 register unsigned char *p1 = s1, *p2 = s2; | |
4803 while (len) | |
4804 { | |
4805 if (translate[*p1++] != translate[*p2++]) return 1; | |
4806 len--; | |
4807 } | |
4808 return 0; | |
4809 } | |
4810 | |
4811 /* Entry points for GNU code. */ | |
4812 | |
4813 /* re_compile_pattern is the GNU regular expression compiler: it | |
4814 compiles PATTERN (of length SIZE) and puts the result in BUFP. | |
4815 Returns 0 if the pattern was valid, otherwise an error string. | |
4816 | |
4817 Assumes the `allocated' (and perhaps `buffer') and `translate' fields | |
4818 are set in BUFP on entry. | |
4819 | |
4820 We call regex_compile to do the actual compilation. */ | |
4821 | |
4822 const char * | |
4823 re_compile_pattern (pattern, length, bufp) | |
4824 const char *pattern; | |
4825 int length; | |
4826 struct re_pattern_buffer *bufp; | |
4827 { | |
4828 reg_errcode_t ret; | |
4829 | |
4830 /* GNU code is written to assume at least RE_NREGS registers will be set | |
4831 (and at least one extra will be -1). */ | |
4832 bufp->regs_allocated = REGS_UNALLOCATED; | |
4833 | |
4834 /* And GNU code determines whether or not to get register information | |
4835 by passing null for the REGS argument to re_match, etc., not by | |
4836 setting no_sub. */ | |
4837 bufp->no_sub = 0; | |
4838 | |
4839 /* Match anchors at newline. */ | |
4840 bufp->newline_anchor = 1; | |
4841 | |
4842 ret = regex_compile (pattern, length, re_syntax_options, bufp); | |
4843 | |
4844 return re_error_msg[(int) ret]; | |
4845 } | |
4846 | |
4847 /* Entry points compatible with 4.2 BSD regex library. We don't define | |
4848 them if this is an Emacs or POSIX compilation. */ | |
4849 | |
4850 #if !defined (emacs) && !defined (_POSIX_SOURCE) | |
4851 | |
4852 /* BSD has one and only one pattern buffer. */ | |
4853 static struct re_pattern_buffer re_comp_buf; | |
4854 | |
4855 char * | |
4856 re_comp (s) | |
4857 const char *s; | |
4858 { | |
4859 reg_errcode_t ret; | |
4860 | |
4861 if (!s) | |
4862 { | |
4863 if (!re_comp_buf.buffer) | |
4864 return "No previous regular expression"; | |
4865 return 0; | |
4866 } | |
4867 | |
4868 if (!re_comp_buf.buffer) | |
4869 { | |
4870 re_comp_buf.buffer = (unsigned char *) malloc (200); | |
4871 if (re_comp_buf.buffer == NULL) | |
4872 return "Memory exhausted"; | |
4873 re_comp_buf.allocated = 200; | |
4874 | |
4875 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH); | |
4876 if (re_comp_buf.fastmap == NULL) | |
4877 return "Memory exhausted"; | |
4878 } | |
4879 | |
4880 /* Since `re_exec' always passes NULL for the `regs' argument, we | |
4881 don't need to initialize the pattern buffer fields which affect it. */ | |
4882 | |
4883 /* Match anchors at newlines. */ | |
4884 re_comp_buf.newline_anchor = 1; | |
4885 | |
4886 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf); | |
4887 | |
4888 /* Yes, we're discarding `const' here. */ | |
4889 return (char *) re_error_msg[(int) ret]; | |
4890 } | |
4891 | |
4892 | |
4893 int | |
4894 re_exec (s) | |
4895 const char *s; | |
4896 { | |
4897 const int len = strlen (s); | |
4898 return | |
4899 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0); | |
4900 } | |
4901 #endif /* not emacs and not _POSIX_SOURCE */ | |
4902 | |
4903 /* POSIX.2 functions. Don't define these for Emacs. */ | |
4904 | |
4905 #ifndef emacs | |
4906 | |
4907 /* regcomp takes a regular expression as a string and compiles it. | |
4908 | |
4909 PREG is a regex_t *. We do not expect any fields to be initialized, | |
4910 since POSIX says we shouldn't. Thus, we set | |
4911 | |
4912 `buffer' to the compiled pattern; | |
4913 `used' to the length of the compiled pattern; | |
4914 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the | |
4915 REG_EXTENDED bit in CFLAGS is set; otherwise, to | |
4916 RE_SYNTAX_POSIX_BASIC; | |
4917 `newline_anchor' to REG_NEWLINE being set in CFLAGS; | |
4918 `fastmap' and `fastmap_accurate' to zero; | |
4919 `re_nsub' to the number of subexpressions in PATTERN. | |
4920 | |
4921 PATTERN is the address of the pattern string. | |
4922 | |
4923 CFLAGS is a series of bits which affect compilation. | |
4924 | |
4925 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we | |
4926 use POSIX basic syntax. | |
4927 | |
4928 If REG_NEWLINE is set, then . and [^...] don't match newline. | |
4929 Also, regexec will try a match beginning after every newline. | |
4930 | |
4931 If REG_ICASE is set, then we considers upper- and lowercase | |
4932 versions of letters to be equivalent when matching. | |
4933 | |
4934 If REG_NOSUB is set, then when PREG is passed to regexec, that | |
4935 routine will report only success or failure, and nothing about the | |
4936 registers. | |
4937 | |
4938 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for | |
4939 the return codes and their meanings.) */ | |
4940 | |
4941 int | |
4942 regcomp (preg, pattern, cflags) | |
4943 regex_t *preg; | |
4944 const char *pattern; | |
4945 int cflags; | |
4946 { | |
4947 reg_errcode_t ret; | |
4948 unsigned syntax | |
1642
340feb030df1
*** empty log message ***
David J. MacKenzie <djm@gnu.org>
parents:
1641
diff
changeset
|
4949 = (cflags & REG_EXTENDED) ? |
340feb030df1
*** empty log message ***
David J. MacKenzie <djm@gnu.org>
parents:
1641
diff
changeset
|
4950 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC; |
1155 | 4951 |
4952 /* regex_compile will allocate the space for the compiled pattern. */ | |
4953 preg->buffer = 0; | |
1642
340feb030df1
*** empty log message ***
David J. MacKenzie <djm@gnu.org>
parents:
1641
diff
changeset
|
4954 preg->allocated = 0; |
2758 | 4955 preg->used = 0; |
1155 | 4956 |
4957 /* Don't bother to use a fastmap when searching. This simplifies the | |
4958 REG_NEWLINE case: if we used a fastmap, we'd have to put all the | |
4959 characters after newlines into the fastmap. This way, we just try | |
4960 every character. */ | |
4961 preg->fastmap = 0; | |
4962 | |
4963 if (cflags & REG_ICASE) | |
4964 { | |
4965 unsigned i; | |
4966 | |
4967 preg->translate = (char *) malloc (CHAR_SET_SIZE); | |
4968 if (preg->translate == NULL) | |
4969 return (int) REG_ESPACE; | |
4970 | |
4971 /* Map uppercase characters to corresponding lowercase ones. */ | |
4972 for (i = 0; i < CHAR_SET_SIZE; i++) | |
1668 | 4973 preg->translate[i] = ISUPPER (i) ? tolower (i) : i; |
1155 | 4974 } |
4975 else | |
4976 preg->translate = NULL; | |
4977 | |
4978 /* If REG_NEWLINE is set, newlines are treated differently. */ | |
4979 if (cflags & REG_NEWLINE) | |
4980 { /* REG_NEWLINE implies neither . nor [^...] match newline. */ | |
4981 syntax &= ~RE_DOT_NEWLINE; | |
4982 syntax |= RE_HAT_LISTS_NOT_NEWLINE; | |
4983 /* It also changes the matching behavior. */ | |
4984 preg->newline_anchor = 1; | |
4985 } | |
4986 else | |
4987 preg->newline_anchor = 0; | |
4988 | |
4989 preg->no_sub = !!(cflags & REG_NOSUB); | |
4990 | |
4991 /* POSIX says a null character in the pattern terminates it, so we | |
4992 can use strlen here in compiling the pattern. */ | |
4993 ret = regex_compile (pattern, strlen (pattern), syntax, preg); | |
4994 | |
4995 /* POSIX doesn't distinguish between an unmatched open-group and an | |
4996 unmatched close-group: both are REG_EPAREN. */ | |
4997 if (ret == REG_ERPAREN) ret = REG_EPAREN; | |
4998 | |
4999 return (int) ret; | |
5000 } | |
5001 | |
5002 | |
5003 /* regexec searches for a given pattern, specified by PREG, in the | |
5004 string STRING. | |
5005 | |
5006 If NMATCH is zero or REG_NOSUB was set in the cflags argument to | |
5007 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at | |
5008 least NMATCH elements, and we set them to the offsets of the | |
5009 corresponding matched substrings. | |
5010 | |
5011 EFLAGS specifies `execution flags' which affect matching: if | |
5012 REG_NOTBOL is set, then ^ does not match at the beginning of the | |
5013 string; if REG_NOTEOL is set, then $ does not match at the end. | |
5014 | |
5015 We return 0 if we find a match and REG_NOMATCH if not. */ | |
5016 | |
5017 int | |
5018 regexec (preg, string, nmatch, pmatch, eflags) | |
5019 const regex_t *preg; | |
5020 const char *string; | |
5021 size_t nmatch; | |
5022 regmatch_t pmatch[]; | |
5023 int eflags; | |
5024 { | |
5025 int ret; | |
5026 struct re_registers regs; | |
5027 regex_t private_preg; | |
5028 int len = strlen (string); | |
5029 boolean want_reg_info = !preg->no_sub && nmatch > 0; | |
5030 | |
5031 private_preg = *preg; | |
5032 | |
5033 private_preg.not_bol = !!(eflags & REG_NOTBOL); | |
5034 private_preg.not_eol = !!(eflags & REG_NOTEOL); | |
5035 | |
5036 /* The user has told us exactly how many registers to return | |
5037 information about, via `nmatch'. We have to pass that on to the | |
5038 matching routines. */ | |
5039 private_preg.regs_allocated = REGS_FIXED; | |
5040 | |
5041 if (want_reg_info) | |
5042 { | |
5043 regs.num_regs = nmatch; | |
5044 regs.start = TALLOC (nmatch, regoff_t); | |
5045 regs.end = TALLOC (nmatch, regoff_t); | |
5046 if (regs.start == NULL || regs.end == NULL) | |
5047 return (int) REG_NOMATCH; | |
5048 } | |
5049 | |
5050 /* Perform the searching operation. */ | |
5051 ret = re_search (&private_preg, string, len, | |
5052 /* start: */ 0, /* range: */ len, | |
5053 want_reg_info ? ®s : (struct re_registers *) 0); | |
5054 | |
5055 /* Copy the register information to the POSIX structure. */ | |
5056 if (want_reg_info) | |
5057 { | |
5058 if (ret >= 0) | |
5059 { | |
5060 unsigned r; | |
5061 | |
5062 for (r = 0; r < nmatch; r++) | |
5063 { | |
5064 pmatch[r].rm_so = regs.start[r]; | |
5065 pmatch[r].rm_eo = regs.end[r]; | |
5066 } | |
5067 } | |
5068 | |
5069 /* If we needed the temporary register info, free the space now. */ | |
5070 free (regs.start); | |
5071 free (regs.end); | |
5072 } | |
5073 | |
5074 /* We want zero return to mean success, unlike `re_search'. */ | |
5075 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH; | |
5076 } | |
5077 | |
5078 | |
5079 /* Returns a message corresponding to an error code, ERRCODE, returned | |
1637 | 5080 from either regcomp or regexec. We don't use PREG here. */ |
1155 | 5081 |
5082 size_t | |
5083 regerror (errcode, preg, errbuf, errbuf_size) | |
5084 int errcode; | |
5085 const regex_t *preg; | |
5086 char *errbuf; | |
5087 size_t errbuf_size; | |
5088 { | |
1738 | 5089 const char *msg; |
5090 size_t msg_size; | |
5091 | |
5092 if (errcode < 0 | |
5093 || errcode >= (sizeof (re_error_msg) / sizeof (re_error_msg[0]))) | |
5094 /* Only error codes returned by the rest of the code should be passed | |
5095 to this routine. If we are given anything else, or if other regex | |
5096 code generates an invalid error code, then the program has a bug. | |
5097 Dump core so we can fix it. */ | |
5098 abort (); | |
5099 | |
2453 | 5100 msg = re_error_msg[errcode]; |
5101 | |
5102 /* POSIX doesn't require that we do anything in this case, but why | |
5103 not be nice. */ | |
5104 if (! msg) | |
5105 msg = "Success"; | |
5106 | |
1738 | 5107 msg_size = strlen (msg) + 1; /* Includes the null. */ |
1155 | 5108 |
5109 if (errbuf_size != 0) | |
5110 { | |
5111 if (msg_size > errbuf_size) | |
5112 { | |
5113 strncpy (errbuf, msg, errbuf_size - 1); | |
5114 errbuf[errbuf_size - 1] = 0; | |
5115 } | |
5116 else | |
5117 strcpy (errbuf, msg); | |
5118 } | |
5119 | |
5120 return msg_size; | |
5121 } | |
5122 | |
5123 | |
5124 /* Free dynamically allocated space used by PREG. */ | |
5125 | |
5126 void | |
5127 regfree (preg) | |
5128 regex_t *preg; | |
5129 { | |
5130 if (preg->buffer != NULL) | |
5131 free (preg->buffer); | |
5132 preg->buffer = NULL; | |
5133 | |
5134 preg->allocated = 0; | |
5135 preg->used = 0; | |
5136 | |
5137 if (preg->fastmap != NULL) | |
5138 free (preg->fastmap); | |
5139 preg->fastmap = NULL; | |
5140 preg->fastmap_accurate = 0; | |
5141 | |
5142 if (preg->translate != NULL) | |
5143 free (preg->translate); | |
5144 preg->translate = NULL; | |
5145 } | |
5146 | |
5147 #endif /* not emacs */ | |
5148 | |
5149 /* | |
5150 Local variables: | |
5151 make-backup-files: t | |
5152 version-control: t | |
5153 trim-versions-without-asking: nil | |
5154 End: | |
5155 */ |