Mercurial > emacs
comparison src/regex.c @ 28380:5478842aea4c
(analyse_first): New function obtained by ripping out most
of re_compile_fastmap and generalizing it a little bit so that it
can also just return whether a given (sub)pattern can match the empty
string or not.
(regex_compile): Use `analyse_first' to decide whether the loop-check
needs to be done or not for *, +, *? and +? (the loop check is costly
for non-greedy repetition).
(re_compile_fastmap): Delegate the actual work to `analyse_first'.
author | Stefan Monnier <monnier@iro.umontreal.ca> |
---|---|
date | Wed, 29 Mar 2000 04:01:25 +0000 |
parents | bc86be15099e |
children | 975fe3d8922e |
comparison
equal
deleted
inserted
replaced
28379:b3a689c74cde | 28380:5478842aea4c |
---|---|
18 along with this program; if not, write to the Free Software | 18 along with this program; if not, write to the Free Software |
19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, | 19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, |
20 USA. */ | 20 USA. */ |
21 | 21 |
22 /* TODO: | 22 /* TODO: |
23 - use analyze_first to optimize non-empty loops | |
24 - clean up multibyte issues | 23 - clean up multibyte issues |
25 - structure the opcode space into opcode+flag. | 24 - structure the opcode space into opcode+flag. |
26 - merge with glic's regex.[ch] | 25 - merge with glibc's regex.[ch] |
27 | 26 */ |
28 That's it for now -sm */ | |
29 | 27 |
30 /* AIX requires this to be the first thing in the file. */ | 28 /* AIX requires this to be the first thing in the file. */ |
31 #if defined (_AIX) && !defined (REGEX_MALLOC) | 29 #if defined (_AIX) && !defined (REGEX_MALLOC) |
32 #pragma alloca | 30 #pragma alloca |
33 #endif | 31 #endif |
1540 reg_syntax_t syntax)); | 1538 reg_syntax_t syntax)); |
1541 static boolean at_endline_loc_p _RE_ARGS((const unsigned char *p, | 1539 static boolean at_endline_loc_p _RE_ARGS((const unsigned char *p, |
1542 const unsigned char *pend, | 1540 const unsigned char *pend, |
1543 reg_syntax_t syntax)); | 1541 reg_syntax_t syntax)); |
1544 static unsigned char *skip_one_char _RE_ARGS((unsigned char *p)); | 1542 static unsigned char *skip_one_char _RE_ARGS((unsigned char *p)); |
1543 static int analyse_first _RE_ARGS((unsigned char *p, unsigned char *pend, | |
1544 char *fastmap, const int multibyte)); | |
1545 | 1545 |
1546 /* Fetch the next character in the uncompiled pattern---translating it | 1546 /* Fetch the next character in the uncompiled pattern---translating it |
1547 if necessary. Also cast from a signed character in the constant | 1547 if necessary. Also cast from a signed character in the constant |
1548 string passed to us by the user to an unsigned char that we can use | 1548 string passed to us by the user to an unsigned char that we can use |
1549 as an array index (in, e.g., `translate'). */ | 1549 as an array index (in, e.g., `translate'). */ |
2132 { | 2132 { |
2133 if (many_times_ok) | 2133 if (many_times_ok) |
2134 { | 2134 { |
2135 boolean simple = skip_one_char (laststart) == b; | 2135 boolean simple = skip_one_char (laststart) == b; |
2136 unsigned int startoffset = 0; | 2136 unsigned int startoffset = 0; |
2137 re_opcode_t ofj = | |
2138 (simple || !analyse_first (laststart, b, NULL, 0)) ? | |
2139 on_failure_jump : on_failure_jump_loop; | |
2137 assert (skip_one_char (laststart) <= b); | 2140 assert (skip_one_char (laststart) <= b); |
2138 | 2141 |
2139 if (!zero_times_ok && simple) | 2142 if (!zero_times_ok && simple) |
2140 { /* Since simple * loops can be made faster by using | 2143 { /* Since simple * loops can be made faster by using |
2141 on_failure_keep_string_jump, we turn simple P+ | 2144 on_failure_keep_string_jump, we turn simple P+ |
2150 } | 2153 } |
2151 | 2154 |
2152 GET_BUFFER_SPACE (6); | 2155 GET_BUFFER_SPACE (6); |
2153 if (!zero_times_ok) | 2156 if (!zero_times_ok) |
2154 /* A + loop. */ | 2157 /* A + loop. */ |
2155 STORE_JUMP (on_failure_jump_loop, b, b + 6); | 2158 STORE_JUMP (ofj, b, b + 6); |
2156 else | 2159 else |
2157 /* Simple * loops can use on_failure_keep_string_jump | 2160 /* Simple * loops can use on_failure_keep_string_jump |
2158 depending on what follows. But since we don't know | 2161 depending on what follows. But since we don't know |
2159 that yet, we leave the decision up to | 2162 that yet, we leave the decision up to |
2160 on_failure_jump_smart. */ | 2163 on_failure_jump_smart. */ |
2161 INSERT_JUMP (simple ? on_failure_jump_smart | 2164 INSERT_JUMP (simple ? on_failure_jump_smart : ofj, |
2162 : on_failure_jump_loop, | |
2163 laststart + startoffset, b + 6); | 2165 laststart + startoffset, b + 6); |
2164 b += 3; | 2166 b += 3; |
2165 STORE_JUMP (jump, b, laststart + startoffset); | 2167 STORE_JUMP (jump, b, laststart + startoffset); |
2166 b += 3; | 2168 b += 3; |
2167 } | 2169 } |
2178 { /* I wish the greedy and non-greedy cases could be merged. */ | 2180 { /* I wish the greedy and non-greedy cases could be merged. */ |
2179 | 2181 |
2180 GET_BUFFER_SPACE (7); /* We might use less. */ | 2182 GET_BUFFER_SPACE (7); /* We might use less. */ |
2181 if (many_times_ok) | 2183 if (many_times_ok) |
2182 { | 2184 { |
2185 boolean emptyp = analyse_first (laststart, b, NULL, 0); | |
2186 | |
2183 /* The non-greedy multiple match looks like a repeat..until: | 2187 /* The non-greedy multiple match looks like a repeat..until: |
2184 we only need a conditional jump at the end of the loop */ | 2188 we only need a conditional jump at the end of the loop */ |
2185 BUF_PUSH (no_op); | 2189 if (emptyp) BUF_PUSH (no_op); |
2186 STORE_JUMP (on_failure_jump_nastyloop, b, laststart); | 2190 STORE_JUMP (emptyp ? on_failure_jump_nastyloop |
2191 : on_failure_jump, b, laststart); | |
2187 b += 3; | 2192 b += 3; |
2188 if (zero_times_ok) | 2193 if (zero_times_ok) |
2189 { | 2194 { |
2190 /* The repeat...until naturally matches one or more. | 2195 /* The repeat...until naturally matches one or more. |
2191 To also match zero times, we need to first jump to | 2196 To also match zero times, we need to first jump to |
3266 return true; | 3271 return true; |
3267 | 3272 |
3268 return false; | 3273 return false; |
3269 } | 3274 } |
3270 | 3275 |
3271 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in | 3276 /* analyse_first. |
3272 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible | 3277 If fastmap is non-NULL, go through the pattern and fill fastmap |
3273 characters can start a string that matches the pattern. This fastmap | 3278 with all the possible leading chars. If fastmap is NULL, don't |
3274 is used by re_search to skip quickly over impossible starting points. | 3279 bother filling it up (obviously) and only return whether the |
3275 | 3280 pattern could potentially match the empty string. |
3276 Character codes above (1 << BYTEWIDTH) are not represented in the | 3281 |
3277 fastmap, but the leading codes are represented. Thus, the fastmap | 3282 Return 1 if p..pend might match the empty string. |
3278 indicates which character sets could start a match. | 3283 Return 0 if p..pend matches at least one char. |
3279 | 3284 Return -1 if p..pend matches at least one char, but fastmap was not |
3280 The caller must supply the address of a (1 << BYTEWIDTH)-byte data | 3285 updated accurately. |
3281 area as BUFP->fastmap. | 3286 Return -2 if an error occurred. */ |
3282 | 3287 |
3283 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in | 3288 static int |
3284 the pattern buffer. | 3289 analyse_first (p, pend, fastmap, multibyte) |
3285 | 3290 unsigned char *p, *pend; |
3286 Returns 0 if we succeed, -2 if an internal error. */ | 3291 char *fastmap; |
3287 | 3292 const int multibyte; |
3288 int | |
3289 re_compile_fastmap (bufp) | |
3290 struct re_pattern_buffer *bufp; | |
3291 { | 3293 { |
3292 int j, k; | 3294 int j, k; |
3293 boolean not; | 3295 boolean not; |
3294 #ifdef MATCH_MAY_ALLOCATE | 3296 #ifdef MATCH_MAY_ALLOCATE |
3295 fail_stack_type fail_stack; | 3297 fail_stack_type fail_stack; |
3296 #endif | 3298 #endif |
3297 #ifndef REGEX_MALLOC | 3299 #ifndef REGEX_MALLOC |
3298 char *destination; | 3300 char *destination; |
3299 #endif | 3301 #endif |
3300 | 3302 |
3301 register char *fastmap = bufp->fastmap; | |
3302 unsigned char *pattern = bufp->buffer; | |
3303 unsigned long size = bufp->used; | |
3304 unsigned char *p = pattern; | |
3305 register unsigned char *pend = pattern + size; | |
3306 const boolean multibyte = bufp->multibyte; | |
3307 | |
3308 #if defined (REL_ALLOC) && defined (REGEX_MALLOC) | 3303 #if defined (REL_ALLOC) && defined (REGEX_MALLOC) |
3309 /* This holds the pointer to the failure stack, when | 3304 /* This holds the pointer to the failure stack, when |
3310 it is allocated relocatably. */ | 3305 it is allocated relocatably. */ |
3311 fail_stack_elt_t *failure_stack_ptr; | 3306 fail_stack_elt_t *failure_stack_ptr; |
3312 #endif | 3307 #endif |
3319 | 3314 |
3320 /* If all elements for base leading-codes in fastmap is set, this | 3315 /* If all elements for base leading-codes in fastmap is set, this |
3321 flag is set true. */ | 3316 flag is set true. */ |
3322 boolean match_any_multibyte_characters = false; | 3317 boolean match_any_multibyte_characters = false; |
3323 | 3318 |
3324 assert (fastmap != NULL && p != NULL); | 3319 assert (p); |
3325 | 3320 |
3326 INIT_FAIL_STACK (); | 3321 INIT_FAIL_STACK (); |
3327 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */ | |
3328 bufp->fastmap_accurate = 1; /* It will be when we're done. */ | |
3329 bufp->can_be_null = 0; | |
3330 | 3322 |
3331 /* The loop below works as follows: | 3323 /* The loop below works as follows: |
3332 - It has a working-list kept in the PATTERN_STACK and which basically | 3324 - It has a working-list kept in the PATTERN_STACK and which basically |
3333 starts by only containing a pointer to the first operation. | 3325 starts by only containing a pointer to the first operation. |
3334 - If the opcode we're looking at is a match against some set of | 3326 - If the opcode we're looking at is a match against some set of |
3342 We guarantee termination by ignoring backward jumps (more or less), | 3334 We guarantee termination by ignoring backward jumps (more or less), |
3343 so that `p' is monotonically increasing. More to the point, we | 3335 so that `p' is monotonically increasing. More to the point, we |
3344 never set `p' (or push) anything `<= p1'. */ | 3336 never set `p' (or push) anything `<= p1'. */ |
3345 | 3337 |
3346 /* If can_be_null is set, then the fastmap will not be used anyway. */ | 3338 /* If can_be_null is set, then the fastmap will not be used anyway. */ |
3347 while (!bufp->can_be_null) | 3339 while (1) |
3348 { | 3340 { |
3349 /* `p1' is used as a marker of how far back a `on_failure_jump' | 3341 /* `p1' is used as a marker of how far back a `on_failure_jump' |
3350 can go without being ignored. It is normally equal to `p' | 3342 can go without being ignored. It is normally equal to `p' |
3351 (which prevents any backward `on_failure_jump') except right | 3343 (which prevents any backward `on_failure_jump') except right |
3352 after a plain `jump', to allow patterns such as: | 3344 after a plain `jump', to allow patterns such as: |
3354 3..9: <body> | 3346 3..9: <body> |
3355 10: on_failure_jump 3 | 3347 10: on_failure_jump 3 |
3356 as used for the *? operator. */ | 3348 as used for the *? operator. */ |
3357 unsigned char *p1 = p; | 3349 unsigned char *p1 = p; |
3358 | 3350 |
3359 if (p >= pend || *p == succeed) | 3351 if (p >= pend) |
3360 { | 3352 { |
3353 if (path_can_be_null) | |
3354 return (RESET_FAIL_STACK (), 1); | |
3355 | |
3361 /* We have reached the (effective) end of pattern. */ | 3356 /* We have reached the (effective) end of pattern. */ |
3362 if (!PATTERN_STACK_EMPTY ()) | 3357 if (PATTERN_STACK_EMPTY ()) |
3363 { | 3358 return (RESET_FAIL_STACK (), 0); |
3364 bufp->can_be_null |= path_can_be_null; | 3359 |
3365 | 3360 p = (unsigned char*) POP_PATTERN_OP (); |
3366 /* Reset for next path. */ | 3361 path_can_be_null = true; |
3367 path_can_be_null = true; | 3362 continue; |
3368 | |
3369 p = (unsigned char*) POP_PATTERN_OP (); | |
3370 | |
3371 continue; | |
3372 } | |
3373 else | |
3374 break; | |
3375 } | 3363 } |
3376 | 3364 |
3377 /* We should never be about to go beyond the end of the pattern. */ | 3365 /* We should never be about to go beyond the end of the pattern. */ |
3378 assert (p < pend); | 3366 assert (p < pend); |
3379 | 3367 |
3380 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++)) | 3368 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++)) |
3381 { | 3369 { |
3370 case succeed: | |
3371 p = pend; | |
3372 continue; | |
3382 | 3373 |
3383 case duplicate: | 3374 case duplicate: |
3384 /* If the first character has to match a backreference, that means | 3375 /* If the first character has to match a backreference, that means |
3385 that the group was empty (since it already matched). Since this | 3376 that the group was empty (since it already matched). Since this |
3386 is the only case that interests us here, we can assume that the | 3377 is the only case that interests us here, we can assume that the |
3391 | 3382 |
3392 /* Following are the cases which match a character. These end | 3383 /* Following are the cases which match a character. These end |
3393 with `break'. */ | 3384 with `break'. */ |
3394 | 3385 |
3395 case exactn: | 3386 case exactn: |
3396 fastmap[p[1]] = 1; | 3387 if (fastmap) fastmap[p[1]] = 1; |
3397 break; | 3388 break; |
3398 | 3389 |
3399 | 3390 |
3400 case anychar: | 3391 case anychar: |
3401 /* We could put all the chars except for \n (and maybe \0) | 3392 /* We could put all the chars except for \n (and maybe \0) |
3402 but we don't bother since it is generally not worth it. */ | 3393 but we don't bother since it is generally not worth it. */ |
3403 bufp->can_be_null = 1; | 3394 if (!fastmap) break; |
3404 continue; | 3395 return (RESET_FAIL_STACK (), -1); |
3405 | 3396 |
3406 | 3397 |
3407 case charset_not: | 3398 case charset_not: |
3408 /* Chars beyond end of bitmap are possible matches. | 3399 /* Chars beyond end of bitmap are possible matches. |
3409 All the single-byte codes can occur in multibyte buffers. | 3400 All the single-byte codes can occur in multibyte buffers. |
3474 fastmap[j] = 1; | 3465 fastmap[j] = 1; |
3475 break; | 3466 break; |
3476 #else /* emacs */ | 3467 #else /* emacs */ |
3477 /* This match depends on text properties. These end with | 3468 /* This match depends on text properties. These end with |
3478 aborting optimizations. */ | 3469 aborting optimizations. */ |
3479 bufp->can_be_null = 1; | 3470 return (RESET_FAIL_STACK (), -1); |
3480 continue; | |
3481 | 3471 |
3482 case categoryspec: | 3472 case categoryspec: |
3483 case notcategoryspec: | 3473 case notcategoryspec: |
3484 if (!fastmap) break; | 3474 if (!fastmap) break; |
3485 not = (re_opcode_t)p[-1] == notcategoryspec; | 3475 not = (re_opcode_t)p[-1] == notcategoryspec; |
3591 does these things. */ | 3581 does these things. */ |
3592 path_can_be_null = false; | 3582 path_can_be_null = false; |
3593 p = pend; | 3583 p = pend; |
3594 } /* while p */ | 3584 } /* while p */ |
3595 | 3585 |
3596 /* Set `can_be_null' for the last path (also the first path, if the | 3586 return (RESET_FAIL_STACK (), 0); |
3597 pattern is empty). */ | 3587 } /* analyse_first */ |
3598 bufp->can_be_null |= path_can_be_null; | 3588 |
3599 RESET_FAIL_STACK (); | 3589 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in |
3590 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible | |
3591 characters can start a string that matches the pattern. This fastmap | |
3592 is used by re_search to skip quickly over impossible starting points. | |
3593 | |
3594 Character codes above (1 << BYTEWIDTH) are not represented in the | |
3595 fastmap, but the leading codes are represented. Thus, the fastmap | |
3596 indicates which character sets could start a match. | |
3597 | |
3598 The caller must supply the address of a (1 << BYTEWIDTH)-byte data | |
3599 area as BUFP->fastmap. | |
3600 | |
3601 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in | |
3602 the pattern buffer. | |
3603 | |
3604 Returns 0 if we succeed, -2 if an internal error. */ | |
3605 | |
3606 int | |
3607 re_compile_fastmap (bufp) | |
3608 struct re_pattern_buffer *bufp; | |
3609 { | |
3610 char *fastmap = bufp->fastmap; | |
3611 int analysis; | |
3612 | |
3613 assert (fastmap && bufp->buffer); | |
3614 | |
3615 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */ | |
3616 bufp->fastmap_accurate = 1; /* It will be when we're done. */ | |
3617 | |
3618 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used, | |
3619 fastmap, bufp->multibyte); | |
3620 if (analysis < -1) | |
3621 return analysis; | |
3622 bufp->can_be_null = (analysis != 0); | |
3600 return 0; | 3623 return 0; |
3601 } /* re_compile_fastmap */ | 3624 } /* re_compile_fastmap */ |
3602 | 3625 |
3603 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and | 3626 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and |
3604 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use | 3627 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use |