Mercurial > emacs
comparison src/regex.c @ 28203:c10ee0e6982b
(RE_STRING_CHAR): New macro.
(GET_CHAR_AFER_2): Remove.
(RE_TRANSLATE, RE_TRANSLATE_P): New macros moved from regex.h.
(enum re_opcode_t): Remove on_failure_jump_exclusive.
(print_partial_compiled_pattern, re_compile_fastmap)
(re_match_2_internal): Remove on_failure_jump_exclusive.
(regex_compile): Turn optimizable P+ loops into PP*, so that the
optimization only need to work for * (ie. can use of_keep_string_jump).
Remove the special case for .*\n since it is now covered by the general
optimization.
(re_search_2): Don't bother with `room'.
(skip_one_char): New function.
(skip_noops): Simplify since `memory' is not needed any more.
(mutually_exclusive_p): Restructure slightly to use `switch' and
add handling for "all" remaining cases.
(re_match_2_internal): Change on_failure_jump_smart to use
on_failure_keep_string_jump (and redirect the end-of-loop jump)
rather than on_failure_jump_exclusive.
author | Stefan Monnier <monnier@iro.umontreal.ca> |
---|---|
date | Sun, 19 Mar 2000 23:22:06 +0000 |
parents | c314d747a819 |
children | f955117a1fcd |
comparison
equal
deleted
inserted
replaced
28202:118ca0109252 | 28203:c10ee0e6982b |
---|---|
78 #include "category.h" | 78 #include "category.h" |
79 | 79 |
80 #define malloc xmalloc | 80 #define malloc xmalloc |
81 #define realloc xrealloc | 81 #define realloc xrealloc |
82 #define free xfree | 82 #define free xfree |
83 | |
84 #define RE_STRING_CHAR(p, s) \ | |
85 (multibyte ? (STRING_CHAR (p, s)) : (*(p))) | |
83 | 86 |
84 #else /* not emacs */ | 87 #else /* not emacs */ |
85 | 88 |
86 /* If we are not linking with Emacs proper, | 89 /* If we are not linking with Emacs proper, |
87 we can't use the relocating allocator | 90 we can't use the relocating allocator |
185 #define CHAR_HEAD_P(p) (1) | 188 #define CHAR_HEAD_P(p) (1) |
186 #define SINGLE_BYTE_CHAR_P(c) (1) | 189 #define SINGLE_BYTE_CHAR_P(c) (1) |
187 #define SAME_CHARSET_P(c1, c2) (1) | 190 #define SAME_CHARSET_P(c1, c2) (1) |
188 #define MULTIBYTE_FORM_LENGTH(p, s) (1) | 191 #define MULTIBYTE_FORM_LENGTH(p, s) (1) |
189 #define STRING_CHAR(p, s) (*(p)) | 192 #define STRING_CHAR(p, s) (*(p)) |
193 #define RE_STRING_CHAR STRING_CHAR | |
190 #define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p)) | 194 #define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p)) |
191 #define GET_CHAR_AFTER_2(c, p, str1, end1, str2, end2) \ | |
192 (c = ((p) == (end1) ? *(str2) : *(p))) | |
193 #define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \ | 195 #define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \ |
194 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1))) | 196 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1))) |
195 #endif /* not emacs */ | 197 #endif /* not emacs */ |
198 | |
199 #ifndef RE_TRANSLATE | |
200 #define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C]) | |
201 #define RE_TRANSLATE_P(TBL) (TBL) | |
202 #endif | |
196 | 203 |
197 /* Get the interface, including the syntax bits. */ | 204 /* Get the interface, including the syntax bits. */ |
198 #include "regex.h" | 205 #include "regex.h" |
199 | 206 |
200 /* isalpha etc. are used for the character classes. */ | 207 /* isalpha etc. are used for the character classes. */ |
517 | 524 |
518 /* Like on_failure_jump, but pushes a placeholder instead of the | 525 /* Like on_failure_jump, but pushes a placeholder instead of the |
519 current string position when executed. */ | 526 current string position when executed. */ |
520 on_failure_keep_string_jump, | 527 on_failure_keep_string_jump, |
521 | 528 |
522 /* Like `on_failure_jump', except that it assumes that the | |
523 pattern following it is mutually exclusive with the pattern | |
524 at the destination of the jump: if one matches something, | |
525 the other won't match at all. | |
526 Always used via `on_failure_jump_smart'. */ | |
527 on_failure_jump_exclusive, | |
528 | |
529 /* Just like `on_failure_jump', except that it checks that we | 529 /* Just like `on_failure_jump', except that it checks that we |
530 don't get stuck in an infinite loop (matching an empty string | 530 don't get stuck in an infinite loop (matching an empty string |
531 indefinitely). */ | 531 indefinitely). */ |
532 on_failure_jump_loop, | 532 on_failure_jump_loop, |
533 | 533 |
534 /* A smart `on_failure_jump' used for greedy * and + operators. | 534 /* A smart `on_failure_jump' used for greedy * and + operators. |
535 It analyses the loop before which it is put and if the | 535 It analyses the loop before which it is put and if the |
536 loop does not require backtracking, it changes itself to | 536 loop does not require backtracking, it changes itself to |
537 `on_failure_jump_exclusive', else it just defaults to | 537 `on_failure_keep_string_jump' and short-circuits the loop, |
538 changing itself into `on_failure_jump_loop'. */ | 538 else it just defaults to changing itself into `on_failure_jump'. |
539 It assumes that it is pointing to just past a `jump'. */ | |
539 on_failure_jump_smart, | 540 on_failure_jump_smart, |
540 | 541 |
541 /* Followed by two-byte relative address and two-byte number n. | 542 /* Followed by two-byte relative address and two-byte number n. |
542 After matching N times, jump to the address upon failure. */ | 543 After matching N times, jump to the address upon failure. */ |
543 succeed_n, | 544 succeed_n, |
939 break; | 940 break; |
940 | 941 |
941 case on_failure_keep_string_jump: | 942 case on_failure_keep_string_jump: |
942 extract_number_and_incr (&mcnt, &p); | 943 extract_number_and_incr (&mcnt, &p); |
943 printf ("/on_failure_keep_string_jump to %d", p + mcnt - start); | 944 printf ("/on_failure_keep_string_jump to %d", p + mcnt - start); |
944 break; | |
945 | |
946 case on_failure_jump_exclusive: | |
947 extract_number_and_incr (&mcnt, &p); | |
948 printf ("/on_failure_jump_exclusive to %d", p + mcnt - start); | |
949 break; | 945 break; |
950 | 946 |
951 case on_failure_jump_loop: | 947 case on_failure_jump_loop: |
952 extract_number_and_incr (&mcnt, &p); | 948 extract_number_and_incr (&mcnt, &p); |
953 printf ("/on_failure_jump_loop to %d", p + mcnt - start); | 949 printf ("/on_failure_jump_loop to %d", p + mcnt - start); |
1530 const unsigned char *p, | 1526 const unsigned char *p, |
1531 reg_syntax_t syntax)); | 1527 reg_syntax_t syntax)); |
1532 static boolean at_endline_loc_p _RE_ARGS((const unsigned char *p, | 1528 static boolean at_endline_loc_p _RE_ARGS((const unsigned char *p, |
1533 const unsigned char *pend, | 1529 const unsigned char *pend, |
1534 reg_syntax_t syntax)); | 1530 reg_syntax_t syntax)); |
1531 static unsigned char *skip_one_char _RE_ARGS((unsigned char *p)); | |
1535 | 1532 |
1536 /* Fetch the next character in the uncompiled pattern---translating it | 1533 /* Fetch the next character in the uncompiled pattern---translating it |
1537 if necessary. Also cast from a signed character in the constant | 1534 if necessary. Also cast from a signed character in the constant |
1538 string passed to us by the user to an unsigned char that we can use | 1535 string passed to us by the user to an unsigned char that we can use |
1539 as an array index (in, e.g., `translate'). */ | 1536 as an array index (in, e.g., `translate'). */ |
2070 else if (!(syntax & RE_CONTEXT_INDEP_OPS)) | 2067 else if (!(syntax & RE_CONTEXT_INDEP_OPS)) |
2071 goto normal_char; | 2068 goto normal_char; |
2072 } | 2069 } |
2073 | 2070 |
2074 { | 2071 { |
2075 /* Are we optimizing this jump? */ | |
2076 boolean keep_string_p = false; | |
2077 | |
2078 /* 1 means zero (many) matches is allowed. */ | 2072 /* 1 means zero (many) matches is allowed. */ |
2079 boolean zero_times_ok = 0, many_times_ok = 0; | 2073 boolean zero_times_ok = 0, many_times_ok = 0; |
2080 boolean greedy = 1; | 2074 boolean greedy = 1; |
2081 | 2075 |
2082 /* If there is a sequence of repetition chars, collapse it | 2076 /* If there is a sequence of repetition chars, collapse it |
2127 /* If we get here, we found another repeat character. */ | 2121 /* If we get here, we found another repeat character. */ |
2128 } | 2122 } |
2129 | 2123 |
2130 /* Star, etc. applied to an empty pattern is equivalent | 2124 /* Star, etc. applied to an empty pattern is equivalent |
2131 to an empty pattern. */ | 2125 to an empty pattern. */ |
2132 if (!laststart) | 2126 if (!laststart || laststart == b) |
2133 break; | 2127 break; |
2134 | 2128 |
2135 /* Now we know whether or not zero matches is allowed | 2129 /* Now we know whether or not zero matches is allowed |
2136 and also whether or not two or more matches is allowed. */ | 2130 and also whether or not two or more matches is allowed. */ |
2137 if (greedy) | 2131 if (greedy) |
2138 { | 2132 { |
2139 if (many_times_ok) | 2133 if (many_times_ok) |
2140 { /* More than one repetition is allowed, so put in at the | 2134 { |
2141 end a backward relative jump from `b' to before the next | 2135 boolean simple = skip_one_char (laststart) == b; |
2142 jump we're going to put in below (which jumps from | 2136 unsigned int startoffset = 0; |
2143 laststart to after this jump). | 2137 assert (skip_one_char (laststart) <= b); |
2144 | 2138 |
2145 But if we are at the `*' in the exact sequence `.*\n', | 2139 if (!zero_times_ok && simple) |
2146 insert an unconditional jump backwards to the ., | 2140 { /* Since simple * loops can be made faster by using |
2147 instead of the beginning of the loop. This way we only | 2141 on_failure_keep_string_jump, we turn simple P+ |
2148 push a failure point once, instead of every time | 2142 into PP* if P is simple. */ |
2149 through the loop. */ | 2143 unsigned char *p1, *p2; |
2150 assert (p - 1 > pattern); | 2144 startoffset = b - laststart; |
2151 | 2145 GET_BUFFER_SPACE (startoffset); |
2152 /* Allocate the space for the jump. */ | 2146 p1 = b; p2 = laststart; |
2153 GET_BUFFER_SPACE (3); | 2147 while (p2 < p1) |
2154 | 2148 *b++ = *p2++; |
2155 /* We know we are not at the first character of the pattern, | 2149 zero_times_ok = 1; |
2156 because laststart was nonzero. And we've already | |
2157 incremented `p', by the way, to be the character after | |
2158 the `*'. Do we have to do something analogous here | |
2159 for null bytes, because of RE_DOT_NOT_NULL? */ | |
2160 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.') | |
2161 && zero_times_ok | |
2162 && p < pend | |
2163 && TRANSLATE (*p) == TRANSLATE ('\n') | |
2164 && !(syntax & RE_DOT_NEWLINE)) | |
2165 { /* We have .*\n. */ | |
2166 STORE_JUMP (jump, b, laststart); | |
2167 keep_string_p = true; | |
2168 } | 2150 } |
2151 | |
2152 GET_BUFFER_SPACE (6); | |
2153 if (!zero_times_ok) | |
2154 /* A + loop. */ | |
2155 STORE_JUMP (on_failure_jump_loop, b, b + 6); | |
2169 else | 2156 else |
2170 STORE_JUMP (jump, b, laststart - 3); | 2157 /* Simple * loops can use on_failure_keep_string_jump |
2171 | 2158 depending on what follows. But since we don't know |
2172 /* We've added more stuff to the buffer. */ | 2159 that yet, we leave the decision up to |
2160 on_failure_jump_smart. */ | |
2161 INSERT_JUMP (simple ? on_failure_jump_smart | |
2162 : on_failure_jump_loop, | |
2163 laststart + startoffset, b + 6); | |
2173 b += 3; | 2164 b += 3; |
2174 } | 2165 STORE_JUMP (jump, b, laststart + startoffset); |
2175 | |
2176 /* On failure, jump from laststart to b + 3, which will be the | |
2177 end of the buffer after this jump is inserted. */ | |
2178 GET_BUFFER_SPACE (3); | |
2179 if (!zero_times_ok) | |
2180 { | |
2181 assert (many_times_ok); | |
2182 INSERT_JUMP (on_failure_jump_smart, b - 3, b + 3); | |
2183 pending_exact = 0; | |
2184 b += 3; | 2166 b += 3; |
2185 } | 2167 } |
2186 else | 2168 else |
2187 { | 2169 { |
2188 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump | 2170 /* A simple ? pattern. */ |
2189 : !many_times_ok ? | 2171 assert (zero_times_ok); |
2190 on_failure_jump : on_failure_jump_smart, | 2172 GET_BUFFER_SPACE (3); |
2191 laststart, b + 3); | 2173 INSERT_JUMP (on_failure_jump, laststart, b + 3); |
2192 pending_exact = 0; | |
2193 b += 3; | 2174 b += 3; |
2194 } | 2175 } |
2195 } | 2176 } |
2196 else /* not greedy */ | 2177 else /* not greedy */ |
2197 { /* I wish the greedy and non-greedy cases could be merged. */ | 2178 { /* I wish the greedy and non-greedy cases could be merged. */ |
2222 INSERT_JUMP (on_failure_jump, laststart, laststart + 6); | 2203 INSERT_JUMP (on_failure_jump, laststart, laststart + 6); |
2223 b += 3; | 2204 b += 3; |
2224 } | 2205 } |
2225 } | 2206 } |
2226 } | 2207 } |
2208 pending_exact = 0; | |
2227 break; | 2209 break; |
2228 | 2210 |
2229 | 2211 |
2230 case '.': | 2212 case '.': |
2231 laststart = b; | 2213 laststart = b; |
3215 static boolean | 3197 static boolean |
3216 at_begline_loc_p (pattern, p, syntax) | 3198 at_begline_loc_p (pattern, p, syntax) |
3217 const unsigned char *pattern, *p; | 3199 const unsigned char *pattern, *p; |
3218 reg_syntax_t syntax; | 3200 reg_syntax_t syntax; |
3219 { | 3201 { |
3220 re_char *prev = p - 2; | 3202 const unsigned char *prev = p - 2; |
3221 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\'; | 3203 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\'; |
3222 | 3204 |
3223 return | 3205 return |
3224 /* After a subexpression? */ | 3206 /* After a subexpression? */ |
3225 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash)) | 3207 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash)) |
3234 static boolean | 3216 static boolean |
3235 at_endline_loc_p (p, pend, syntax) | 3217 at_endline_loc_p (p, pend, syntax) |
3236 const unsigned char *p, *pend; | 3218 const unsigned char *p, *pend; |
3237 reg_syntax_t syntax; | 3219 reg_syntax_t syntax; |
3238 { | 3220 { |
3239 re_char *next = p; | 3221 const unsigned char *next = p; |
3240 boolean next_backslash = *next == '\\'; | 3222 boolean next_backslash = *next == '\\'; |
3241 re_char *next_next = p + 1 < pend ? p + 1 : 0; | 3223 const unsigned char *next_next = p + 1 < pend ? p + 1 : 0; |
3242 | 3224 |
3243 return | 3225 return |
3244 /* Before a subexpression? */ | 3226 /* Before a subexpression? */ |
3245 (syntax & RE_NO_BK_PARENS ? *next == ')' | 3227 (syntax & RE_NO_BK_PARENS ? *next == ')' |
3246 : next_backslash && next_next && *next_next == ')') | 3228 : next_backslash && next_next && *next_next == ')') |
3661 p += j; | 3643 p += j; |
3662 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p)) | 3644 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p)) |
3663 { | 3645 { |
3664 case on_failure_jump: | 3646 case on_failure_jump: |
3665 case on_failure_keep_string_jump: | 3647 case on_failure_keep_string_jump: |
3666 case on_failure_jump_exclusive: | |
3667 case on_failure_jump_loop: | 3648 case on_failure_jump_loop: |
3668 case on_failure_jump_smart: | 3649 case on_failure_jump_smart: |
3669 p++; | 3650 p++; |
3670 break; | 3651 break; |
3671 default: | 3652 default: |
3675 to jump back to "just after here". */ | 3656 to jump back to "just after here". */ |
3676 /* Fallthrough */ | 3657 /* Fallthrough */ |
3677 | 3658 |
3678 case on_failure_jump: | 3659 case on_failure_jump: |
3679 case on_failure_keep_string_jump: | 3660 case on_failure_keep_string_jump: |
3680 case on_failure_jump_exclusive: | |
3681 case on_failure_jump_loop: | 3661 case on_failure_jump_loop: |
3682 case on_failure_jump_smart: | 3662 case on_failure_jump_smart: |
3683 handle_on_failure_jump: | 3663 handle_on_failure_jump: |
3684 EXTRACT_NUMBER_AND_INCR (j, p); | 3664 EXTRACT_NUMBER_AND_INCR (j, p); |
3685 | 3665 |
3985 | 3965 |
3986 startpos += irange - range; | 3966 startpos += irange - range; |
3987 } | 3967 } |
3988 else /* Searching backwards. */ | 3968 else /* Searching backwards. */ |
3989 { | 3969 { |
3990 int room = (startpos >= size1 | 3970 buf_ch = STRING_CHAR (d, (startpos >= size1 |
3991 ? size2 + size1 - startpos | 3971 ? size2 + size1 - startpos |
3992 : size1 - startpos); | 3972 : size1 - startpos)); |
3993 | |
3994 buf_ch = STRING_CHAR (d, room); | |
3995 if (RE_TRANSLATE_P (translate)) | 3973 if (RE_TRANSLATE_P (translate)) |
3996 buf_ch = RE_TRANSLATE (translate, buf_ch); | 3974 buf_ch = RE_TRANSLATE (translate, buf_ch); |
3997 | 3975 |
3998 if (! (buf_ch >= 0400 | 3976 if (! (buf_ch >= 0400 |
3999 || fastmap[buf_ch])) | 3977 || fastmap[buf_ch])) |
4151 #endif /* not MATCH_MAY_ALLOCATE */ | 4129 #endif /* not MATCH_MAY_ALLOCATE */ |
4152 | 4130 |
4153 | 4131 |
4154 /* Optimization routines. */ | 4132 /* Optimization routines. */ |
4155 | 4133 |
4134 /* If the operation is a match against one or more chars, | |
4135 return a pointer to the next operation, else return NULL. */ | |
4136 static unsigned char * | |
4137 skip_one_char (p) | |
4138 unsigned char *p; | |
4139 { | |
4140 switch (SWITCH_ENUM_CAST (*p++)) | |
4141 { | |
4142 case anychar: | |
4143 break; | |
4144 | |
4145 case exactn: | |
4146 p += *p + 1; | |
4147 break; | |
4148 | |
4149 case charset_not: | |
4150 case charset: | |
4151 if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1)) | |
4152 { | |
4153 int mcnt; | |
4154 p = CHARSET_RANGE_TABLE (p - 1); | |
4155 EXTRACT_NUMBER_AND_INCR (mcnt, p); | |
4156 p = CHARSET_RANGE_TABLE_END (p, mcnt); | |
4157 } | |
4158 else | |
4159 p += 1 + CHARSET_BITMAP_SIZE (p - 1); | |
4160 break; | |
4161 | |
4162 #ifdef emacs | |
4163 case syntaxspec: | |
4164 case notsyntaxspec: | |
4165 case categoryspec: | |
4166 case notcategoryspec: | |
4167 #endif /* emacs */ | |
4168 p++; | |
4169 break; | |
4170 | |
4171 default: | |
4172 p = NULL; | |
4173 } | |
4174 return p; | |
4175 } | |
4176 | |
4177 | |
4156 /* Jump over non-matching operations. */ | 4178 /* Jump over non-matching operations. */ |
4157 static unsigned char * | 4179 static unsigned char * |
4158 skip_noops (p, pend, memory) | 4180 skip_noops (p, pend) |
4159 unsigned char *p, *pend; | 4181 unsigned char *p, *pend; |
4160 int memory; | |
4161 { | 4182 { |
4162 int mcnt; | 4183 int mcnt; |
4163 while (p < pend) | 4184 while (p < pend) |
4164 { | 4185 { |
4165 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p)) | 4186 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p)) |
4166 { | 4187 { |
4167 case start_memory: | 4188 case start_memory: |
4168 if (!memory) | |
4169 return p; | |
4170 case stop_memory: | 4189 case stop_memory: |
4171 p += 2; break; | 4190 p += 2; break; |
4172 case no_op: | 4191 case no_op: |
4173 p += 1; break; | 4192 p += 1; break; |
4174 case jump: | 4193 case jump: |
4188 static int | 4207 static int |
4189 mutually_exclusive_p (bufp, p1, p2) | 4208 mutually_exclusive_p (bufp, p1, p2) |
4190 struct re_pattern_buffer *bufp; | 4209 struct re_pattern_buffer *bufp; |
4191 unsigned char *p1, *p2; | 4210 unsigned char *p1, *p2; |
4192 { | 4211 { |
4193 int multibyte = bufp->multibyte; | 4212 re_opcode_t op2; |
4213 const boolean multibyte = bufp->multibyte; | |
4194 unsigned char *pend = bufp->buffer + bufp->used; | 4214 unsigned char *pend = bufp->buffer + bufp->used; |
4195 | 4215 |
4196 assert (p1 >= bufp->buffer && p1 <= pend | 4216 assert (p1 >= bufp->buffer && p1 < pend |
4197 && p2 >= bufp->buffer && p2 <= pend); | 4217 && p2 >= bufp->buffer && p2 <= pend); |
4198 | 4218 |
4199 /* Skip over open/close-group commands. | 4219 /* Skip over open/close-group commands. |
4200 If what follows this loop is a ...+ construct, | 4220 If what follows this loop is a ...+ construct, |
4201 look at what begins its body, since we will have to | 4221 look at what begins its body, since we will have to |
4202 match at least one of that. */ | 4222 match at least one of that. */ |
4203 p2 = skip_noops (p2, pend, 1); | 4223 p2 = skip_noops (p2, pend); |
4204 /* The same skip can be done for p1, except that skipping over | 4224 /* The same skip can be done for p1, except that this function |
4205 start_memory is not a good idea (if there's a group inside | 4225 is only used in the case where p1 is a simple match operator. */ |
4206 the loop delimited by on_failure_jump_exclusive, then it | 4226 /* p1 = skip_noops (p1, pend); */ |
4207 can't optimize the push away (it still works properly, but | 4227 |
4208 slightly slower rather than faster)). */ | 4228 assert (p1 >= bufp->buffer && p1 < pend |
4209 p1 = skip_noops (p1, pend, 0); | 4229 && p2 >= bufp->buffer && p2 <= pend); |
4210 | 4230 |
4211 /* If we're at the end of the pattern, we can change. */ | 4231 op2 = p2 == pend ? succeed : *p2; |
4212 if (p2 == pend) | 4232 |
4233 switch (SWITCH_ENUM_CAST (op2)) | |
4213 { | 4234 { |
4214 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p1)) | 4235 case succeed: |
4236 case endbuf: | |
4237 /* If we're at the end of the pattern, we can change. */ | |
4238 if (skip_one_char (p1)) | |
4215 { | 4239 { |
4216 case anychar: | |
4217 case charset_not: | |
4218 case charset: | |
4219 case exactn: | |
4220 DEBUG_PRINT1 (" End of pattern: fast loop.\n"); | 4240 DEBUG_PRINT1 (" End of pattern: fast loop.\n"); |
4221 return 1; | 4241 return 1; |
4222 default: | |
4223 return 0; | |
4224 } | 4242 } |
4225 } | 4243 break; |
4226 | 4244 |
4227 else if ((re_opcode_t) *p2 == exactn | 4245 case endline: |
4228 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline)) | 4246 if (!bufp->newline_anchor) |
4229 { | 4247 break; |
4230 register unsigned int c | 4248 /* Fallthrough */ |
4231 = *p2 == (unsigned char) endline ? '\n' : p2[2]; | 4249 case exactn: |
4232 | 4250 { |
4233 if ((re_opcode_t) *p1 == exactn) | 4251 register unsigned int c |
4234 { | 4252 = (re_opcode_t) *p2 == endline ? '\n' |
4235 if (!(multibyte /* && (c != '\n') */ | 4253 : RE_STRING_CHAR(p2 + 2, pend - p2 - 2); |
4236 && BASE_LEADING_CODE_P (c)) | 4254 |
4237 ? c != p1[2] | 4255 if ((re_opcode_t) *p1 == exactn) |
4238 : (STRING_CHAR (&p2[2], pend - &p2[2]) | 4256 { |
4239 != STRING_CHAR (&p1[2], pend - &p1[2]))) | 4257 if (c != RE_STRING_CHAR (p1 + 2, pend - p1 - 2)) |
4240 { | 4258 { |
4241 DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]); | 4259 DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]); |
4242 return 1; | 4260 return 1; |
4243 } | 4261 } |
4244 } | 4262 } |
4245 | 4263 |
4246 else if ((re_opcode_t) *p1 == charset | 4264 else if ((re_opcode_t) *p1 == charset |
4247 || (re_opcode_t) *p1 == charset_not) | 4265 || (re_opcode_t) *p1 == charset_not) |
4248 { | 4266 { |
4249 int not = (re_opcode_t) *p1 == charset_not; | 4267 int not = (re_opcode_t) *p1 == charset_not; |
4250 | 4268 |
4251 if (multibyte /* && (c != '\n') */ | 4269 /* Test if C is listed in charset (or charset_not) |
4252 && BASE_LEADING_CODE_P (c)) | 4270 at `p1'. */ |
4253 c = STRING_CHAR (&p2[2], pend - &p2[2]); | 4271 if (SINGLE_BYTE_CHAR_P (c)) |
4254 | 4272 { |
4255 /* Test if C is listed in charset (or charset_not) | 4273 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH |
4256 at `p1'. */ | 4274 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) |
4257 if (SINGLE_BYTE_CHAR_P (c)) | 4275 not = !not; |
4258 { | 4276 } |
4259 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH | 4277 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1)) |
4260 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) | 4278 CHARSET_LOOKUP_RANGE_TABLE (not, c, p1); |
4261 not = !not; | 4279 |
4262 } | 4280 /* `not' is equal to 1 if c would match, which means |
4263 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1)) | 4281 that we can't change to pop_failure_jump. */ |
4264 CHARSET_LOOKUP_RANGE_TABLE (not, c, p1); | 4282 if (!not) |
4265 | 4283 { |
4266 /* `not' is equal to 1 if c would match, which means | 4284 DEBUG_PRINT1 (" No match => fast loop.\n"); |
4267 that we can't change to pop_failure_jump. */ | 4285 return 1; |
4268 if (!not) | 4286 } |
4269 { | 4287 } |
4270 DEBUG_PRINT1 (" No match => fast loop.\n"); | 4288 else if ((re_opcode_t) *p1 == anychar |
4271 return 1; | 4289 && c == '\n') |
4272 } | 4290 { |
4273 } | 4291 DEBUG_PRINT1 (" . != \\n => fast loop.\n"); |
4274 else if ((re_opcode_t) *p1 == anychar | 4292 return 1; |
4275 && c == '\n') | 4293 } |
4276 { | 4294 } |
4277 DEBUG_PRINT1 (" . != \\n => fast loop.\n"); | 4295 break; |
4278 return 1; | 4296 |
4279 } | 4297 case charset: |
4280 } | 4298 case charset_not: |
4281 else if ((re_opcode_t) *p2 == charset | 4299 { |
4282 || (re_opcode_t) *p2 == charset_not) | 4300 if ((re_opcode_t) *p1 == exactn) |
4283 { | 4301 /* Reuse the code above. */ |
4284 if ((re_opcode_t) *p1 == exactn) | 4302 return mutually_exclusive_p (bufp, p2, p1); |
4285 /* Reuse the code above. */ | |
4286 return mutually_exclusive_p (bufp, p2, p1); | |
4287 | 4303 |
4288 | 4304 |
4289 /* It is hard to list up all the character in charset | 4305 /* It is hard to list up all the character in charset |
4290 P2 if it includes multibyte character. Give up in | 4306 P2 if it includes multibyte character. Give up in |
4291 such case. */ | 4307 such case. */ |
4330 if (! (p2[2 + idx] == 0 | 4346 if (! (p2[2 + idx] == 0 |
4331 || (idx < CHARSET_BITMAP_SIZE (p1) | 4347 || (idx < CHARSET_BITMAP_SIZE (p1) |
4332 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0)))) | 4348 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0)))) |
4333 break; | 4349 break; |
4334 | 4350 |
4335 if (idx == p2[1]) | 4351 if (idx == p2[1]) |
4336 { | 4352 { |
4337 DEBUG_PRINT1 (" No match => fast loop.\n"); | 4353 DEBUG_PRINT1 (" No match => fast loop.\n"); |
4338 return 1; | 4354 return 1; |
4339 } | 4355 } |
4340 } | 4356 } |
4341 } | 4357 } |
4358 } | |
4359 | |
4360 #ifdef emacs | |
4361 case wordend: | |
4362 case notsyntaxspec: | |
4363 return ((re_opcode_t) *p1 == syntaxspec | |
4364 && p1[1] == (op2 == wordend ? Sword : p2[1])); | |
4365 | |
4366 case wordbeg: | |
4367 case syntaxspec: | |
4368 return ((re_opcode_t) *p1 == notsyntaxspec | |
4369 && p1[1] == (op2 == wordend ? Sword : p2[1])); | |
4370 | |
4371 case wordbound: | |
4372 return (((re_opcode_t) *p1 == notsyntaxspec | |
4373 || (re_opcode_t) *p1 == syntaxspec) | |
4374 && p1[1] == Sword); | |
4375 | |
4376 case categoryspec: | |
4377 return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]); | |
4378 case notcategoryspec: | |
4379 return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]); | |
4380 #endif /* emacs */ | |
4381 | |
4382 default: | |
4383 ; | |
4342 } | 4384 } |
4343 | 4385 |
4344 /* Safe default. */ | 4386 /* Safe default. */ |
4345 return 0; | 4387 return 0; |
4346 } | 4388 } |
5155 mcnt, p + mcnt); | 5197 mcnt, p + mcnt); |
5156 | 5198 |
5157 PUSH_FAILURE_POINT (p - 3, NULL); | 5199 PUSH_FAILURE_POINT (p - 3, NULL); |
5158 break; | 5200 break; |
5159 | 5201 |
5160 case on_failure_jump_exclusive: | 5202 |
5161 EXTRACT_NUMBER_AND_INCR (mcnt, p); | 5203 /* Simple loop detecting on_failure_jump: just check on the |
5162 DEBUG_PRINT3 ("EXECUTING on_failure_jump_exclusive %d (to %p):\n", | 5204 failure stack if the same spot was already hit earlier. */ |
5163 mcnt, p + mcnt); | |
5164 | |
5165 if (! FAIL_STACK_EMPTY () | |
5166 && FAILURE_PAT (TOP_FAILURE_HANDLE ()) == (p - 3) | |
5167 && fail_stack.avail == fail_stack.frame) | |
5168 { | |
5169 /* We are trying to push failure F2 onto the stack but there | |
5170 is already a failure F1 pushed from the same instruction. | |
5171 Between F1 and now, something has matched (else this is an | |
5172 improper use of on_failure_jump_exclusive), so that we know | |
5173 that the fail-destination of F1 cannot match, hence we can | |
5174 pop F1 before pushing F2. Instead of doing this pop/push, | |
5175 we manually turn F1 into F2. | |
5176 `fail_stack.avail == fail_stack.frame' makes sure | |
5177 that popping F1 doesn't involve registers, else | |
5178 this optimization cannot be done so trivially. */ | |
5179 assert (FAILURE_STR (TOP_FAILURE_HANDLE ()) != d); | |
5180 FAILURE_STR (TOP_FAILURE_HANDLE ()) = d; | |
5181 } | |
5182 else | |
5183 PUSH_FAILURE_POINT (p - 3, d); | |
5184 break; | |
5185 | |
5186 case on_failure_jump_loop: | 5205 case on_failure_jump_loop: |
5187 on_failure: | 5206 on_failure: |
5188 EXTRACT_NUMBER_AND_INCR (mcnt, p); | 5207 EXTRACT_NUMBER_AND_INCR (mcnt, p); |
5189 DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n", | 5208 DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n", |
5190 mcnt, p + mcnt); | 5209 mcnt, p + mcnt); |
5213 mcnt, p + mcnt); | 5232 mcnt, p + mcnt); |
5214 | 5233 |
5215 PUSH_FAILURE_POINT (p -3, d); | 5234 PUSH_FAILURE_POINT (p -3, d); |
5216 break; | 5235 break; |
5217 | 5236 |
5218 /* This operation is used for greedy * and +. | 5237 /* This operation is used for greedy *. |
5219 Compare the beginning of the repeat with what in the | 5238 Compare the beginning of the repeat with what in the |
5220 pattern follows its end. If we can establish that there | 5239 pattern follows its end. If we can establish that there |
5221 is nothing that they would both match, i.e., that we | 5240 is nothing that they would both match, i.e., that we |
5222 would have to backtrack because of (as in, e.g., `a*a') | 5241 would have to backtrack because of (as in, e.g., `a*a') |
5223 then we can use a non-backtracking loop based on | 5242 then we can use a non-backtracking loop based on |
5224 on_failure_jump_exclusive instead of on_failure_jump_loop. */ | 5243 on_failure_keep_string_jump instead of on_failure_jump. */ |
5225 case on_failure_jump_smart: | 5244 case on_failure_jump_smart: |
5226 QUIT; | 5245 QUIT; |
5227 EXTRACT_NUMBER_AND_INCR (mcnt, p); | 5246 EXTRACT_NUMBER_AND_INCR (mcnt, p); |
5228 DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n", | 5247 DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n", |
5229 mcnt, p + mcnt); | 5248 mcnt, p + mcnt); |
5232 unsigned char *p2 = p + mcnt; /* Destination of the jump. */ | 5251 unsigned char *p2 = p + mcnt; /* Destination of the jump. */ |
5233 | 5252 |
5234 p -= 3; /* Reset so that we will re-execute the | 5253 p -= 3; /* Reset so that we will re-execute the |
5235 instruction once it's been changed. */ | 5254 instruction once it's been changed. */ |
5236 | 5255 |
5256 EXTRACT_NUMBER (mcnt, p2 - 2); | |
5257 | |
5258 /* Ensure this is a indeed the trivial kind of loop | |
5259 we are expecting. */ | |
5260 assert (skip_one_char (p1) == p2 - 3); | |
5261 assert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p); | |
5237 DEBUG_STATEMENT (debug += 2); | 5262 DEBUG_STATEMENT (debug += 2); |
5238 if (mutually_exclusive_p (bufp, p1, p2)) | 5263 if (mutually_exclusive_p (bufp, p1, p2)) |
5239 { | 5264 { |
5240 /* Use a fast `on_failure_keep_string_jump' loop. */ | 5265 /* Use a fast `on_failure_keep_string_jump' loop. */ |
5241 *p = (unsigned char) on_failure_jump_exclusive; | 5266 DEBUG_PRINT1 (" smart exclusive => fast loop.\n"); |
5242 /* STORE_NUMBER (p2 - 2, mcnt + 3); */ | 5267 *p = (unsigned char) on_failure_keep_string_jump; |
5268 STORE_NUMBER (p2 - 2, mcnt + 3); | |
5243 } | 5269 } |
5244 else | 5270 else |
5245 { | 5271 { |
5246 /* Default to a safe `on_failure_jump' loop. */ | 5272 /* Default to a safe `on_failure_jump' loop. */ |
5247 DEBUG_PRINT1 (" smart default => slow loop.\n"); | 5273 DEBUG_PRINT1 (" smart default => slow loop.\n"); |
5248 *p = (unsigned char) on_failure_jump_loop; | 5274 *p = (unsigned char) on_failure_jump; |
5249 } | 5275 } |
5250 DEBUG_STATEMENT (debug -= 2); | 5276 DEBUG_STATEMENT (debug -= 2); |
5251 } | 5277 } |
5252 break; | 5278 break; |
5253 | 5279 |
5598 { | 5624 { |
5599 case on_failure_keep_string_jump: | 5625 case on_failure_keep_string_jump: |
5600 assert (str == NULL); | 5626 assert (str == NULL); |
5601 goto continue_failure_jump; | 5627 goto continue_failure_jump; |
5602 | 5628 |
5603 case on_failure_jump_exclusive: | |
5604 /* If something has matched, the alternative will not match, | |
5605 so we might as well keep popping right away. */ | |
5606 if (0 /* d != str && d != string2 */) /* Don't bother. -sm */ | |
5607 /* (d == string2 && str == end1) => (d =~ str) */ | |
5608 goto fail; | |
5609 /* Fallthrough */ | |
5610 | |
5611 case on_failure_jump_loop: | 5629 case on_failure_jump_loop: |
5612 case on_failure_jump: | 5630 case on_failure_jump: |
5613 case succeed_n: | 5631 case succeed_n: |
5614 d = str; | 5632 d = str; |
5615 continue_failure_jump: | 5633 continue_failure_jump: |