comparison src/regex.c @ 28380:5478842aea4c

(analyse_first): New function obtained by ripping out most of re_compile_fastmap and generalizing it a little bit so that it can also just return whether a given (sub)pattern can match the empty string or not. (regex_compile): Use `analyse_first' to decide whether the loop-check needs to be done or not for *, +, *? and +? (the loop check is costly for non-greedy repetition). (re_compile_fastmap): Delegate the actual work to `analyse_first'.
author Stefan Monnier <monnier@iro.umontreal.ca>
date Wed, 29 Mar 2000 04:01:25 +0000
parents bc86be15099e
children 975fe3d8922e
comparison
equal deleted inserted replaced
28379:b3a689c74cde 28380:5478842aea4c
18 along with this program; if not, write to the Free Software 18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, 19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
20 USA. */ 20 USA. */
21 21
22 /* TODO: 22 /* TODO:
23 - use analyze_first to optimize non-empty loops
24 - clean up multibyte issues 23 - clean up multibyte issues
25 - structure the opcode space into opcode+flag. 24 - structure the opcode space into opcode+flag.
26 - merge with glic's regex.[ch] 25 - merge with glibc's regex.[ch]
27 26 */
28 That's it for now -sm */
29 27
30 /* AIX requires this to be the first thing in the file. */ 28 /* AIX requires this to be the first thing in the file. */
31 #if defined (_AIX) && !defined (REGEX_MALLOC) 29 #if defined (_AIX) && !defined (REGEX_MALLOC)
32 #pragma alloca 30 #pragma alloca
33 #endif 31 #endif
1540 reg_syntax_t syntax)); 1538 reg_syntax_t syntax));
1541 static boolean at_endline_loc_p _RE_ARGS((const unsigned char *p, 1539 static boolean at_endline_loc_p _RE_ARGS((const unsigned char *p,
1542 const unsigned char *pend, 1540 const unsigned char *pend,
1543 reg_syntax_t syntax)); 1541 reg_syntax_t syntax));
1544 static unsigned char *skip_one_char _RE_ARGS((unsigned char *p)); 1542 static unsigned char *skip_one_char _RE_ARGS((unsigned char *p));
1543 static int analyse_first _RE_ARGS((unsigned char *p, unsigned char *pend,
1544 char *fastmap, const int multibyte));
1545 1545
1546 /* Fetch the next character in the uncompiled pattern---translating it 1546 /* Fetch the next character in the uncompiled pattern---translating it
1547 if necessary. Also cast from a signed character in the constant 1547 if necessary. Also cast from a signed character in the constant
1548 string passed to us by the user to an unsigned char that we can use 1548 string passed to us by the user to an unsigned char that we can use
1549 as an array index (in, e.g., `translate'). */ 1549 as an array index (in, e.g., `translate'). */
2132 { 2132 {
2133 if (many_times_ok) 2133 if (many_times_ok)
2134 { 2134 {
2135 boolean simple = skip_one_char (laststart) == b; 2135 boolean simple = skip_one_char (laststart) == b;
2136 unsigned int startoffset = 0; 2136 unsigned int startoffset = 0;
2137 re_opcode_t ofj =
2138 (simple || !analyse_first (laststart, b, NULL, 0)) ?
2139 on_failure_jump : on_failure_jump_loop;
2137 assert (skip_one_char (laststart) <= b); 2140 assert (skip_one_char (laststart) <= b);
2138 2141
2139 if (!zero_times_ok && simple) 2142 if (!zero_times_ok && simple)
2140 { /* Since simple * loops can be made faster by using 2143 { /* Since simple * loops can be made faster by using
2141 on_failure_keep_string_jump, we turn simple P+ 2144 on_failure_keep_string_jump, we turn simple P+
2150 } 2153 }
2151 2154
2152 GET_BUFFER_SPACE (6); 2155 GET_BUFFER_SPACE (6);
2153 if (!zero_times_ok) 2156 if (!zero_times_ok)
2154 /* A + loop. */ 2157 /* A + loop. */
2155 STORE_JUMP (on_failure_jump_loop, b, b + 6); 2158 STORE_JUMP (ofj, b, b + 6);
2156 else 2159 else
2157 /* Simple * loops can use on_failure_keep_string_jump 2160 /* Simple * loops can use on_failure_keep_string_jump
2158 depending on what follows. But since we don't know 2161 depending on what follows. But since we don't know
2159 that yet, we leave the decision up to 2162 that yet, we leave the decision up to
2160 on_failure_jump_smart. */ 2163 on_failure_jump_smart. */
2161 INSERT_JUMP (simple ? on_failure_jump_smart 2164 INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
2162 : on_failure_jump_loop,
2163 laststart + startoffset, b + 6); 2165 laststart + startoffset, b + 6);
2164 b += 3; 2166 b += 3;
2165 STORE_JUMP (jump, b, laststart + startoffset); 2167 STORE_JUMP (jump, b, laststart + startoffset);
2166 b += 3; 2168 b += 3;
2167 } 2169 }
2178 { /* I wish the greedy and non-greedy cases could be merged. */ 2180 { /* I wish the greedy and non-greedy cases could be merged. */
2179 2181
2180 GET_BUFFER_SPACE (7); /* We might use less. */ 2182 GET_BUFFER_SPACE (7); /* We might use less. */
2181 if (many_times_ok) 2183 if (many_times_ok)
2182 { 2184 {
2185 boolean emptyp = analyse_first (laststart, b, NULL, 0);
2186
2183 /* The non-greedy multiple match looks like a repeat..until: 2187 /* The non-greedy multiple match looks like a repeat..until:
2184 we only need a conditional jump at the end of the loop */ 2188 we only need a conditional jump at the end of the loop */
2185 BUF_PUSH (no_op); 2189 if (emptyp) BUF_PUSH (no_op);
2186 STORE_JUMP (on_failure_jump_nastyloop, b, laststart); 2190 STORE_JUMP (emptyp ? on_failure_jump_nastyloop
2191 : on_failure_jump, b, laststart);
2187 b += 3; 2192 b += 3;
2188 if (zero_times_ok) 2193 if (zero_times_ok)
2189 { 2194 {
2190 /* The repeat...until naturally matches one or more. 2195 /* The repeat...until naturally matches one or more.
2191 To also match zero times, we need to first jump to 2196 To also match zero times, we need to first jump to
3266 return true; 3271 return true;
3267 3272
3268 return false; 3273 return false;
3269 } 3274 }
3270 3275
3271 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in 3276 /* analyse_first.
3272 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible 3277 If fastmap is non-NULL, go through the pattern and fill fastmap
3273 characters can start a string that matches the pattern. This fastmap 3278 with all the possible leading chars. If fastmap is NULL, don't
3274 is used by re_search to skip quickly over impossible starting points. 3279 bother filling it up (obviously) and only return whether the
3275 3280 pattern could potentially match the empty string.
3276 Character codes above (1 << BYTEWIDTH) are not represented in the 3281
3277 fastmap, but the leading codes are represented. Thus, the fastmap 3282 Return 1 if p..pend might match the empty string.
3278 indicates which character sets could start a match. 3283 Return 0 if p..pend matches at least one char.
3279 3284 Return -1 if p..pend matches at least one char, but fastmap was not
3280 The caller must supply the address of a (1 << BYTEWIDTH)-byte data 3285 updated accurately.
3281 area as BUFP->fastmap. 3286 Return -2 if an error occurred. */
3282 3287
3283 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in 3288 static int
3284 the pattern buffer. 3289 analyse_first (p, pend, fastmap, multibyte)
3285 3290 unsigned char *p, *pend;
3286 Returns 0 if we succeed, -2 if an internal error. */ 3291 char *fastmap;
3287 3292 const int multibyte;
3288 int
3289 re_compile_fastmap (bufp)
3290 struct re_pattern_buffer *bufp;
3291 { 3293 {
3292 int j, k; 3294 int j, k;
3293 boolean not; 3295 boolean not;
3294 #ifdef MATCH_MAY_ALLOCATE 3296 #ifdef MATCH_MAY_ALLOCATE
3295 fail_stack_type fail_stack; 3297 fail_stack_type fail_stack;
3296 #endif 3298 #endif
3297 #ifndef REGEX_MALLOC 3299 #ifndef REGEX_MALLOC
3298 char *destination; 3300 char *destination;
3299 #endif 3301 #endif
3300 3302
3301 register char *fastmap = bufp->fastmap;
3302 unsigned char *pattern = bufp->buffer;
3303 unsigned long size = bufp->used;
3304 unsigned char *p = pattern;
3305 register unsigned char *pend = pattern + size;
3306 const boolean multibyte = bufp->multibyte;
3307
3308 #if defined (REL_ALLOC) && defined (REGEX_MALLOC) 3303 #if defined (REL_ALLOC) && defined (REGEX_MALLOC)
3309 /* This holds the pointer to the failure stack, when 3304 /* This holds the pointer to the failure stack, when
3310 it is allocated relocatably. */ 3305 it is allocated relocatably. */
3311 fail_stack_elt_t *failure_stack_ptr; 3306 fail_stack_elt_t *failure_stack_ptr;
3312 #endif 3307 #endif
3319 3314
3320 /* If all elements for base leading-codes in fastmap is set, this 3315 /* If all elements for base leading-codes in fastmap is set, this
3321 flag is set true. */ 3316 flag is set true. */
3322 boolean match_any_multibyte_characters = false; 3317 boolean match_any_multibyte_characters = false;
3323 3318
3324 assert (fastmap != NULL && p != NULL); 3319 assert (p);
3325 3320
3326 INIT_FAIL_STACK (); 3321 INIT_FAIL_STACK ();
3327 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3328 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3329 bufp->can_be_null = 0;
3330 3322
3331 /* The loop below works as follows: 3323 /* The loop below works as follows:
3332 - It has a working-list kept in the PATTERN_STACK and which basically 3324 - It has a working-list kept in the PATTERN_STACK and which basically
3333 starts by only containing a pointer to the first operation. 3325 starts by only containing a pointer to the first operation.
3334 - If the opcode we're looking at is a match against some set of 3326 - If the opcode we're looking at is a match against some set of
3342 We guarantee termination by ignoring backward jumps (more or less), 3334 We guarantee termination by ignoring backward jumps (more or less),
3343 so that `p' is monotonically increasing. More to the point, we 3335 so that `p' is monotonically increasing. More to the point, we
3344 never set `p' (or push) anything `<= p1'. */ 3336 never set `p' (or push) anything `<= p1'. */
3345 3337
3346 /* If can_be_null is set, then the fastmap will not be used anyway. */ 3338 /* If can_be_null is set, then the fastmap will not be used anyway. */
3347 while (!bufp->can_be_null) 3339 while (1)
3348 { 3340 {
3349 /* `p1' is used as a marker of how far back a `on_failure_jump' 3341 /* `p1' is used as a marker of how far back a `on_failure_jump'
3350 can go without being ignored. It is normally equal to `p' 3342 can go without being ignored. It is normally equal to `p'
3351 (which prevents any backward `on_failure_jump') except right 3343 (which prevents any backward `on_failure_jump') except right
3352 after a plain `jump', to allow patterns such as: 3344 after a plain `jump', to allow patterns such as:
3354 3..9: <body> 3346 3..9: <body>
3355 10: on_failure_jump 3 3347 10: on_failure_jump 3
3356 as used for the *? operator. */ 3348 as used for the *? operator. */
3357 unsigned char *p1 = p; 3349 unsigned char *p1 = p;
3358 3350
3359 if (p >= pend || *p == succeed) 3351 if (p >= pend)
3360 { 3352 {
3353 if (path_can_be_null)
3354 return (RESET_FAIL_STACK (), 1);
3355
3361 /* We have reached the (effective) end of pattern. */ 3356 /* We have reached the (effective) end of pattern. */
3362 if (!PATTERN_STACK_EMPTY ()) 3357 if (PATTERN_STACK_EMPTY ())
3363 { 3358 return (RESET_FAIL_STACK (), 0);
3364 bufp->can_be_null |= path_can_be_null; 3359
3365 3360 p = (unsigned char*) POP_PATTERN_OP ();
3366 /* Reset for next path. */ 3361 path_can_be_null = true;
3367 path_can_be_null = true; 3362 continue;
3368
3369 p = (unsigned char*) POP_PATTERN_OP ();
3370
3371 continue;
3372 }
3373 else
3374 break;
3375 } 3363 }
3376 3364
3377 /* We should never be about to go beyond the end of the pattern. */ 3365 /* We should never be about to go beyond the end of the pattern. */
3378 assert (p < pend); 3366 assert (p < pend);
3379 3367
3380 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++)) 3368 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3381 { 3369 {
3370 case succeed:
3371 p = pend;
3372 continue;
3382 3373
3383 case duplicate: 3374 case duplicate:
3384 /* If the first character has to match a backreference, that means 3375 /* If the first character has to match a backreference, that means
3385 that the group was empty (since it already matched). Since this 3376 that the group was empty (since it already matched). Since this
3386 is the only case that interests us here, we can assume that the 3377 is the only case that interests us here, we can assume that the
3391 3382
3392 /* Following are the cases which match a character. These end 3383 /* Following are the cases which match a character. These end
3393 with `break'. */ 3384 with `break'. */
3394 3385
3395 case exactn: 3386 case exactn:
3396 fastmap[p[1]] = 1; 3387 if (fastmap) fastmap[p[1]] = 1;
3397 break; 3388 break;
3398 3389
3399 3390
3400 case anychar: 3391 case anychar:
3401 /* We could put all the chars except for \n (and maybe \0) 3392 /* We could put all the chars except for \n (and maybe \0)
3402 but we don't bother since it is generally not worth it. */ 3393 but we don't bother since it is generally not worth it. */
3403 bufp->can_be_null = 1; 3394 if (!fastmap) break;
3404 continue; 3395 return (RESET_FAIL_STACK (), -1);
3405 3396
3406 3397
3407 case charset_not: 3398 case charset_not:
3408 /* Chars beyond end of bitmap are possible matches. 3399 /* Chars beyond end of bitmap are possible matches.
3409 All the single-byte codes can occur in multibyte buffers. 3400 All the single-byte codes can occur in multibyte buffers.
3474 fastmap[j] = 1; 3465 fastmap[j] = 1;
3475 break; 3466 break;
3476 #else /* emacs */ 3467 #else /* emacs */
3477 /* This match depends on text properties. These end with 3468 /* This match depends on text properties. These end with
3478 aborting optimizations. */ 3469 aborting optimizations. */
3479 bufp->can_be_null = 1; 3470 return (RESET_FAIL_STACK (), -1);
3480 continue;
3481 3471
3482 case categoryspec: 3472 case categoryspec:
3483 case notcategoryspec: 3473 case notcategoryspec:
3484 if (!fastmap) break; 3474 if (!fastmap) break;
3485 not = (re_opcode_t)p[-1] == notcategoryspec; 3475 not = (re_opcode_t)p[-1] == notcategoryspec;
3591 does these things. */ 3581 does these things. */
3592 path_can_be_null = false; 3582 path_can_be_null = false;
3593 p = pend; 3583 p = pend;
3594 } /* while p */ 3584 } /* while p */
3595 3585
3596 /* Set `can_be_null' for the last path (also the first path, if the 3586 return (RESET_FAIL_STACK (), 0);
3597 pattern is empty). */ 3587 } /* analyse_first */
3598 bufp->can_be_null |= path_can_be_null; 3588
3599 RESET_FAIL_STACK (); 3589 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
3590 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
3591 characters can start a string that matches the pattern. This fastmap
3592 is used by re_search to skip quickly over impossible starting points.
3593
3594 Character codes above (1 << BYTEWIDTH) are not represented in the
3595 fastmap, but the leading codes are represented. Thus, the fastmap
3596 indicates which character sets could start a match.
3597
3598 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3599 area as BUFP->fastmap.
3600
3601 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
3602 the pattern buffer.
3603
3604 Returns 0 if we succeed, -2 if an internal error. */
3605
3606 int
3607 re_compile_fastmap (bufp)
3608 struct re_pattern_buffer *bufp;
3609 {
3610 char *fastmap = bufp->fastmap;
3611 int analysis;
3612
3613 assert (fastmap && bufp->buffer);
3614
3615 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3616 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3617
3618 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
3619 fastmap, bufp->multibyte);
3620 if (analysis < -1)
3621 return analysis;
3622 bufp->can_be_null = (analysis != 0);
3600 return 0; 3623 return 0;
3601 } /* re_compile_fastmap */ 3624 } /* re_compile_fastmap */
3602 3625
3603 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and 3626 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3604 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use 3627 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use