Mercurial > emacs
comparison src/regex.c @ 28473:975fe3d8922e
* regex.c (PTR_TO_OFFSET) [!emacs]: Remove.
(RE_MULTIBYTE_P, RE_STRING_CHAR_AND_LENGTH): New macros.
(GET_CHAR_BEFORE_2): Moved from charset.h, and fixed a minor bug that
occurred when we are between str1 and str2.
(MAX_MULTIBYTE_LENGTH, CHAR_STRING) [!emacs]: Provide trivial default.
(PATFETCH): Use `TRANSLATE'.
(PATFETCH_RAW): Fetch multibyte char if applicable.
(PATUNFETCH): Remove.
(regex_compile): Rely on PATFETCH to do most of the multibyte magic.
When writing a char, write it directly into the pattern buffer rather
than going needlessly through a temp char-array.
(re_match_2_internal): Similarly, rely on RE_STRING_CHAR to do the
multibyte magic and remove the useless `#ifdef emacs'.
(bcmp_translate): Don't compare as multibyte chars when in a unibyte
buffer.
* regex.h (struct re_pattern_buffer): Make field `multibyte'
conditional on `emacs'.
* charset.h (GET_CHAR_BEFORE_2): Moved to regex.c.
author | Stefan Monnier <monnier@iro.umontreal.ca> |
---|---|
date | Sun, 02 Apr 2000 23:56:46 +0000 |
parents | 5478842aea4c |
children | d40a7f046efe |
comparison
equal
deleted
inserted
replaced
28472:bae9218986ac | 28473:975fe3d8922e |
---|---|
18 along with this program; if not, write to the Free Software | 18 along with this program; if not, write to the Free Software |
19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, | 19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, |
20 USA. */ | 20 USA. */ |
21 | 21 |
22 /* TODO: | 22 /* TODO: |
23 - clean up multibyte issues | |
24 - structure the opcode space into opcode+flag. | 23 - structure the opcode space into opcode+flag. |
25 - merge with glibc's regex.[ch] | 24 - merge with glibc's regex.[ch] |
26 */ | 25 */ |
27 | 26 |
28 /* AIX requires this to be the first thing in the file. */ | 27 /* AIX requires this to be the first thing in the file. */ |
35 | 34 |
36 #ifdef emacs | 35 #ifdef emacs |
37 /* Converts the pointer to the char to BEG-based offset from the start. */ | 36 /* Converts the pointer to the char to BEG-based offset from the start. */ |
38 #define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d)) | 37 #define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d)) |
39 #define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object))) | 38 #define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object))) |
40 #else | |
41 #define PTR_TO_OFFSET(d) 0 | |
42 #endif | 39 #endif |
43 | 40 |
44 #ifdef HAVE_CONFIG_H | 41 #ifdef HAVE_CONFIG_H |
45 #include <config.h> | 42 #include <config.h> |
46 #endif | 43 #endif |
77 | 74 |
78 #define malloc xmalloc | 75 #define malloc xmalloc |
79 #define realloc xrealloc | 76 #define realloc xrealloc |
80 #define free xfree | 77 #define free xfree |
81 | 78 |
79 #define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte) | |
82 #define RE_STRING_CHAR(p, s) \ | 80 #define RE_STRING_CHAR(p, s) \ |
83 (multibyte ? (STRING_CHAR (p, s)) : (*(p))) | 81 (multibyte ? (STRING_CHAR (p, s)) : (*(p))) |
82 #define RE_STRING_CHAR_AND_LENGTH(p, s, len) \ | |
83 (multibyte ? (STRING_CHAR_AND_LENGTH (p, s, len)) : ((len) = 1, *(p))) | |
84 | |
85 /* Set C a (possibly multibyte) character before P. P points into a | |
86 string which is the virtual concatenation of STR1 (which ends at | |
87 END1) or STR2 (which ends at END2). */ | |
88 #define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \ | |
89 do { \ | |
90 if (multibyte) \ | |
91 { \ | |
92 re_char *dtemp = (p) == (str2) ? (end1) : (p); \ | |
93 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \ | |
94 while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \ | |
95 c = STRING_CHAR (dtemp, (p) - dtemp); \ | |
96 } \ | |
97 else \ | |
98 (c = ((p) == (str2) ? (end1) : (p))[-1]); \ | |
99 } while (0) | |
100 | |
84 | 101 |
85 #else /* not emacs */ | 102 #else /* not emacs */ |
86 | 103 |
87 /* If we are not linking with Emacs proper, | 104 /* If we are not linking with Emacs proper, |
88 we can't use the relocating allocator | 105 we can't use the relocating allocator |
179 | 196 |
180 /* Dummy macros for non-Emacs environments. */ | 197 /* Dummy macros for non-Emacs environments. */ |
181 #define BASE_LEADING_CODE_P(c) (0) | 198 #define BASE_LEADING_CODE_P(c) (0) |
182 #define CHAR_CHARSET(c) 0 | 199 #define CHAR_CHARSET(c) 0 |
183 #define CHARSET_LEADING_CODE_BASE(c) 0 | 200 #define CHARSET_LEADING_CODE_BASE(c) 0 |
201 #define MAX_MULTIBYTE_LENGTH 1 | |
202 #define RE_MULTIBYTE_P(x) 0 | |
184 #define WORD_BOUNDARY_P(c1, c2) (0) | 203 #define WORD_BOUNDARY_P(c1, c2) (0) |
185 #define CHAR_HEAD_P(p) (1) | 204 #define CHAR_HEAD_P(p) (1) |
186 #define SINGLE_BYTE_CHAR_P(c) (1) | 205 #define SINGLE_BYTE_CHAR_P(c) (1) |
187 #define SAME_CHARSET_P(c1, c2) (1) | 206 #define SAME_CHARSET_P(c1, c2) (1) |
188 #define MULTIBYTE_FORM_LENGTH(p, s) (1) | 207 #define MULTIBYTE_FORM_LENGTH(p, s) (1) |
189 #define STRING_CHAR(p, s) (*(p)) | 208 #define STRING_CHAR(p, s) (*(p)) |
190 #define RE_STRING_CHAR STRING_CHAR | 209 #define RE_STRING_CHAR STRING_CHAR |
210 #define CHAR_STRING(c, s) (*(s) = (c), 1) | |
191 #define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p)) | 211 #define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p)) |
212 #define RE_STRING_CHAR_AND_LENGTH STRING_CHAR_AND_LENGTH | |
192 #define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \ | 213 #define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \ |
193 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1))) | 214 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1))) |
194 #endif /* not emacs */ | 215 #endif /* not emacs */ |
195 | 216 |
196 #ifndef RE_TRANSLATE | 217 #ifndef RE_TRANSLATE |
1548 string passed to us by the user to an unsigned char that we can use | 1569 string passed to us by the user to an unsigned char that we can use |
1549 as an array index (in, e.g., `translate'). */ | 1570 as an array index (in, e.g., `translate'). */ |
1550 #define PATFETCH(c) \ | 1571 #define PATFETCH(c) \ |
1551 do { \ | 1572 do { \ |
1552 PATFETCH_RAW (c); \ | 1573 PATFETCH_RAW (c); \ |
1553 if (RE_TRANSLATE_P (translate)) c = RE_TRANSLATE (translate, c); \ | 1574 c = TRANSLATE (c); \ |
1554 } while (0) | 1575 } while (0) |
1555 | 1576 |
1556 /* Fetch the next character in the uncompiled pattern, with no | 1577 /* Fetch the next character in the uncompiled pattern, with no |
1557 translation. */ | 1578 translation. */ |
1558 #define PATFETCH_RAW(c) \ | 1579 #define PATFETCH_RAW(c) \ |
1559 do {if (p == pend) return REG_EEND; \ | 1580 do { \ |
1560 c = *p++; \ | 1581 int len; \ |
1582 if (p == pend) return REG_EEND; \ | |
1583 c = RE_STRING_CHAR_AND_LENGTH (p, pend - p, len); \ | |
1584 p += len; \ | |
1561 } while (0) | 1585 } while (0) |
1562 | |
1563 /* Go backwards one character in the pattern. */ | |
1564 #define PATUNFETCH p-- | |
1565 | 1586 |
1566 | 1587 |
1567 /* If `translate' is non-null, return translate[D], else just D. We | 1588 /* If `translate' is non-null, return translate[D], else just D. We |
1568 cast the subscript to translate because some data is declared as | 1589 cast the subscript to translate because some data is declared as |
1569 `char *', to avoid warnings when a string constant is passed. But | 1590 `char *', to avoid warnings when a string constant is passed. But |
1955 regnum_t regnum = 0; | 1976 regnum_t regnum = 0; |
1956 | 1977 |
1957 /* Work area for range table of charset. */ | 1978 /* Work area for range table of charset. */ |
1958 struct range_table_work_area range_table_work; | 1979 struct range_table_work_area range_table_work; |
1959 | 1980 |
1981 /* If the object matched can contain multibyte characters. */ | |
1982 const boolean multibyte = RE_MULTIBYTE_P (bufp); | |
1983 | |
1960 #ifdef DEBUG | 1984 #ifdef DEBUG |
1961 debug++; | 1985 debug++; |
1962 DEBUG_PRINT1 ("\nCompiling pattern: "); | 1986 DEBUG_PRINT1 ("\nCompiling pattern: "); |
1963 if (debug > 0) | 1987 if (debug > 0) |
1964 { | 1988 { |
1991 at the end. */ | 2015 at the end. */ |
1992 bufp->used = 0; | 2016 bufp->used = 0; |
1993 | 2017 |
1994 /* Always count groups, whether or not bufp->no_sub is set. */ | 2018 /* Always count groups, whether or not bufp->no_sub is set. */ |
1995 bufp->re_nsub = 0; | 2019 bufp->re_nsub = 0; |
1996 | |
1997 #ifdef emacs | |
1998 /* bufp->multibyte is set before regex_compile is called, so don't alter | |
1999 it. */ | |
2000 #else /* not emacs */ | |
2001 /* Nothing is recognized as a multibyte character. */ | |
2002 bufp->multibyte = 0; | |
2003 #endif | |
2004 | 2020 |
2005 #if !defined (emacs) && !defined (SYNTAX_TABLE) | 2021 #if !defined (emacs) && !defined (SYNTAX_TABLE) |
2006 /* Initialize the syntax table. */ | 2022 /* Initialize the syntax table. */ |
2007 init_syntax_once (); | 2023 init_syntax_once (); |
2008 #endif | 2024 #endif |
2252 SET_LIST_BIT ('\n'); | 2268 SET_LIST_BIT ('\n'); |
2253 | 2269 |
2254 /* Read in characters and ranges, setting map bits. */ | 2270 /* Read in characters and ranges, setting map bits. */ |
2255 for (;;) | 2271 for (;;) |
2256 { | 2272 { |
2257 int len; | |
2258 boolean escaped_char = false; | 2273 boolean escaped_char = false; |
2274 const unsigned char *p2 = p; | |
2259 | 2275 |
2260 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | 2276 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); |
2261 | 2277 |
2262 PATFETCH (c); | 2278 PATFETCH (c); |
2263 | 2279 |
2272 else | 2288 else |
2273 { | 2289 { |
2274 /* Could be the end of the bracket expression. If it's | 2290 /* Could be the end of the bracket expression. If it's |
2275 not (i.e., when the bracket expression is `[]' so | 2291 not (i.e., when the bracket expression is `[]' so |
2276 far), the ']' character bit gets set way below. */ | 2292 far), the ']' character bit gets set way below. */ |
2277 if (c == ']' && p != p1 + 1) | 2293 if (c == ']' && p2 != p1) |
2278 break; | 2294 break; |
2279 } | 2295 } |
2280 | 2296 |
2281 /* If C indicates start of multibyte char, get the | |
2282 actual character code in C, and set the pattern | |
2283 pointer P to the next character boundary. */ | |
2284 if (bufp->multibyte && BASE_LEADING_CODE_P (c)) | |
2285 { | |
2286 PATUNFETCH; | |
2287 c = STRING_CHAR_AND_LENGTH (p, pend - p, len); | |
2288 p += len; | |
2289 } | |
2290 /* What should we do for the character which is | 2297 /* What should we do for the character which is |
2291 greater than 0x7F, but not BASE_LEADING_CODE_P? | 2298 greater than 0x7F, but not BASE_LEADING_CODE_P? |
2292 XXX */ | 2299 XXX */ |
2293 | 2300 |
2294 /* See if we're at the beginning of a possible character | 2301 /* See if we're at the beginning of a possible character |
2295 class. */ | 2302 class. */ |
2296 | 2303 |
2297 else if (!escaped_char && | 2304 if (!escaped_char && |
2298 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') | 2305 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') |
2299 { | 2306 { |
2300 /* Leave room for the null. */ | 2307 /* Leave room for the null. */ |
2301 char str[CHAR_CLASS_MAX_LENGTH + 1]; | 2308 char str[CHAR_CLASS_MAX_LENGTH + 1]; |
2302 const unsigned char *class_beg; | 2309 const unsigned char *class_beg; |
2303 | 2310 |
2356 just set a flag. Exceptions are is_blank, | 2363 just set a flag. Exceptions are is_blank, |
2357 is_digit, is_cntrl, and is_xdigit, since | 2364 is_digit, is_cntrl, and is_xdigit, since |
2358 they can only match ASCII characters. We | 2365 they can only match ASCII characters. We |
2359 don't need to handle them for multibyte. */ | 2366 don't need to handle them for multibyte. */ |
2360 | 2367 |
2361 if (bufp->multibyte) | 2368 if (multibyte) |
2362 { | 2369 { |
2363 int bit = 0; | 2370 int bit = 0; |
2364 | 2371 |
2365 if (is_alnum) bit = BIT_ALNUM; | 2372 if (is_alnum) bit = BIT_ALNUM; |
2366 if (is_alpha) bit = BIT_ALPHA; | 2373 if (is_alpha) bit = BIT_ALPHA; |
2433 /* Discard the `-'. */ | 2440 /* Discard the `-'. */ |
2434 PATFETCH (c1); | 2441 PATFETCH (c1); |
2435 | 2442 |
2436 /* Fetch the character which ends the range. */ | 2443 /* Fetch the character which ends the range. */ |
2437 PATFETCH (c1); | 2444 PATFETCH (c1); |
2438 if (bufp->multibyte && BASE_LEADING_CODE_P (c1)) | |
2439 { | |
2440 PATUNFETCH; | |
2441 c1 = STRING_CHAR_AND_LENGTH (p, pend - p, len); | |
2442 p += len; | |
2443 } | |
2444 | 2445 |
2445 if (SINGLE_BYTE_CHAR_P (c) | 2446 if (SINGLE_BYTE_CHAR_P (c) |
2446 && ! SINGLE_BYTE_CHAR_P (c1)) | 2447 && ! SINGLE_BYTE_CHAR_P (c1)) |
2447 { | 2448 { |
2448 /* Handle a range such as \177-\377 in multibyte mode. | 2449 /* Handle a range such as \177-\377 in multibyte mode. |
3026 | 3027 |
3027 | 3028 |
3028 default: | 3029 default: |
3029 /* Expects the character in `c'. */ | 3030 /* Expects the character in `c'. */ |
3030 normal_char: | 3031 normal_char: |
3031 p1 = p - 1; /* P1 points the head of C. */ | |
3032 #ifdef emacs | |
3033 if (bufp->multibyte) | |
3034 { | |
3035 c = STRING_CHAR (p1, pend - p1); | |
3036 c = TRANSLATE (c); | |
3037 /* Set P to the next character boundary. */ | |
3038 p += MULTIBYTE_FORM_LENGTH (p1, pend - p1) - 1; | |
3039 } | |
3040 #endif | |
3041 /* If no exactn currently being built. */ | 3032 /* If no exactn currently being built. */ |
3042 if (!pending_exact | 3033 if (!pending_exact |
3043 | 3034 |
3044 /* If last exactn not at current position. */ | 3035 /* If last exactn not at current position. */ |
3045 || pending_exact + *pending_exact + 1 != b | 3036 || pending_exact + *pending_exact + 1 != b |
3046 | 3037 |
3047 /* We have only one byte following the exactn for the count. */ | 3038 /* We have only one byte following the exactn for the count. */ |
3048 || *pending_exact >= (1 << BYTEWIDTH) - (p - p1) | 3039 || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH |
3049 | 3040 |
3050 /* If followed by a repetition operator. */ | 3041 /* If followed by a repetition operator. */ |
3051 || (p != pend && (*p == '*' || *p == '^')) | 3042 || (p != pend && (*p == '*' || *p == '^')) |
3052 || ((syntax & RE_BK_PLUS_QM) | 3043 || ((syntax & RE_BK_PLUS_QM) |
3053 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?') | 3044 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?') |
3063 | 3054 |
3064 BUF_PUSH_2 (exactn, 0); | 3055 BUF_PUSH_2 (exactn, 0); |
3065 pending_exact = b - 1; | 3056 pending_exact = b - 1; |
3066 } | 3057 } |
3067 | 3058 |
3068 #ifdef emacs | 3059 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH); |
3069 if (! SINGLE_BYTE_CHAR_P (c)) | 3060 { |
3070 { | 3061 int len = CHAR_STRING (c, b); |
3071 unsigned char str[MAX_MULTIBYTE_LENGTH]; | 3062 b += len; |
3072 int i = CHAR_STRING (c, str); | 3063 (*pending_exact) += len; |
3073 int j; | 3064 } |
3074 for (j = 0; j < i; j++) | 3065 |
3075 { | |
3076 BUF_PUSH (str[j]); | |
3077 (*pending_exact)++; | |
3078 } | |
3079 } | |
3080 else | |
3081 #endif | |
3082 { | |
3083 BUF_PUSH (c); | |
3084 (*pending_exact)++; | |
3085 } | |
3086 break; | 3066 break; |
3087 } /* switch (c) */ | 3067 } /* switch (c) */ |
3088 } /* while p != pend */ | 3068 } /* while p != pend */ |
3089 | 3069 |
3090 | 3070 |
3614 | 3594 |
3615 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */ | 3595 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */ |
3616 bufp->fastmap_accurate = 1; /* It will be when we're done. */ | 3596 bufp->fastmap_accurate = 1; /* It will be when we're done. */ |
3617 | 3597 |
3618 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used, | 3598 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used, |
3619 fastmap, bufp->multibyte); | 3599 fastmap, RE_MULTIBYTE_P (bufp)); |
3620 if (analysis < -1) | 3600 if (analysis < -1) |
3621 return analysis; | 3601 return analysis; |
3622 bufp->can_be_null = (analysis != 0); | 3602 bufp->can_be_null = (analysis != 0); |
3623 return 0; | 3603 return 0; |
3624 } /* re_compile_fastmap */ | 3604 } /* re_compile_fastmap */ |
3721 int total_size = size1 + size2; | 3701 int total_size = size1 + size2; |
3722 int endpos = startpos + range; | 3702 int endpos = startpos + range; |
3723 int anchored_start = 0; | 3703 int anchored_start = 0; |
3724 | 3704 |
3725 /* Nonzero if we have to concern multibyte character. */ | 3705 /* Nonzero if we have to concern multibyte character. */ |
3726 const boolean multibyte = bufp->multibyte; | 3706 const boolean multibyte = RE_MULTIBYTE_P (bufp); |
3727 | 3707 |
3728 /* Check for out-of-range STARTPOS. */ | 3708 /* Check for out-of-range STARTPOS. */ |
3729 if (startpos < 0 || startpos > total_size) | 3709 if (startpos < 0 || startpos > total_size) |
3730 return -1; | 3710 return -1; |
3731 | 3711 |
3848 | 3828 |
3849 startpos += irange - range; | 3829 startpos += irange - range; |
3850 } | 3830 } |
3851 else /* Searching backwards. */ | 3831 else /* Searching backwards. */ |
3852 { | 3832 { |
3853 buf_ch = STRING_CHAR (d, (startpos >= size1 | 3833 int room = (startpos >= size1 |
3854 ? size2 + size1 - startpos | 3834 ? size2 + size1 - startpos |
3855 : size1 - startpos)); | 3835 : size1 - startpos); |
3856 if (RE_TRANSLATE_P (translate)) | 3836 buf_ch = RE_STRING_CHAR (d, room); |
3857 buf_ch = RE_TRANSLATE (translate, buf_ch); | 3837 buf_ch = TRANSLATE (buf_ch); |
3858 | 3838 |
3859 if (! (buf_ch >= 0400 | 3839 if (! (buf_ch >= 0400 |
3860 || fastmap[buf_ch])) | 3840 || fastmap[buf_ch])) |
3861 goto advance; | 3841 goto advance; |
3862 } | 3842 } |
3938 return -1; | 3918 return -1; |
3939 } /* re_search_2 */ | 3919 } /* re_search_2 */ |
3940 | 3920 |
3941 /* Declarations and macros for re_match_2. */ | 3921 /* Declarations and macros for re_match_2. */ |
3942 | 3922 |
3943 static int bcmp_translate (); | 3923 static int bcmp_translate _RE_ARGS((re_char *s1, re_char *s2, |
3924 register int len, | |
3925 RE_TRANSLATE_TYPE translate, | |
3926 const int multibyte)); | |
3944 | 3927 |
3945 /* This converts PTR, a pointer into one of the search strings `string1' | 3928 /* This converts PTR, a pointer into one of the search strings `string1' |
3946 and `string2' into an offset from the beginning of that string. */ | 3929 and `string2' into an offset from the beginning of that string. */ |
3947 #define POINTER_TO_OFFSET(ptr) \ | 3930 #define POINTER_TO_OFFSET(ptr) \ |
3948 (FIRST_STRING_P (ptr) \ | 3931 (FIRST_STRING_P (ptr) \ |
4091 mutually_exclusive_p (bufp, p1, p2) | 4074 mutually_exclusive_p (bufp, p1, p2) |
4092 struct re_pattern_buffer *bufp; | 4075 struct re_pattern_buffer *bufp; |
4093 unsigned char *p1, *p2; | 4076 unsigned char *p1, *p2; |
4094 { | 4077 { |
4095 re_opcode_t op2; | 4078 re_opcode_t op2; |
4096 const boolean multibyte = bufp->multibyte; | 4079 const boolean multibyte = RE_MULTIBYTE_P (bufp); |
4097 unsigned char *pend = bufp->buffer + bufp->used; | 4080 unsigned char *pend = bufp->buffer + bufp->used; |
4098 | 4081 |
4099 assert (p1 >= bufp->buffer && p1 < pend | 4082 assert (p1 >= bufp->buffer && p1 < pend |
4100 && p2 >= bufp->buffer && p2 <= pend); | 4083 && p2 >= bufp->buffer && p2 <= pend); |
4101 | 4084 |
4371 | 4354 |
4372 /* We use this to map every character in the string. */ | 4355 /* We use this to map every character in the string. */ |
4373 RE_TRANSLATE_TYPE translate = bufp->translate; | 4356 RE_TRANSLATE_TYPE translate = bufp->translate; |
4374 | 4357 |
4375 /* Nonzero if we have to concern multibyte character. */ | 4358 /* Nonzero if we have to concern multibyte character. */ |
4376 const boolean multibyte = bufp->multibyte; | 4359 const boolean multibyte = RE_MULTIBYTE_P (bufp); |
4377 | 4360 |
4378 /* Failure point stack. Each place that can handle a failure further | 4361 /* Failure point stack. Each place that can handle a failure further |
4379 down the line pushes a failure point on this stack. It consists of | 4362 down the line pushes a failure point on this stack. It consists of |
4380 regstart, and regend for all registers corresponding to | 4363 regstart, and regend for all registers corresponding to |
4381 the subexpressions we're currently inside, plus the number of such | 4364 the subexpressions we're currently inside, plus the number of such |
4719 | 4702 |
4720 /* This is written out as an if-else so we don't waste time | 4703 /* This is written out as an if-else so we don't waste time |
4721 testing `translate' inside the loop. */ | 4704 testing `translate' inside the loop. */ |
4722 if (RE_TRANSLATE_P (translate)) | 4705 if (RE_TRANSLATE_P (translate)) |
4723 { | 4706 { |
4724 #ifdef emacs | |
4725 if (multibyte) | 4707 if (multibyte) |
4726 do | 4708 do |
4727 { | 4709 { |
4728 int pat_charlen, buf_charlen; | 4710 int pat_charlen, buf_charlen; |
4729 unsigned int pat_ch, buf_ch; | 4711 unsigned int pat_ch, buf_ch; |
4743 d += buf_charlen; | 4725 d += buf_charlen; |
4744 mcnt -= pat_charlen; | 4726 mcnt -= pat_charlen; |
4745 } | 4727 } |
4746 while (mcnt > 0); | 4728 while (mcnt > 0); |
4747 else | 4729 else |
4748 #endif /* not emacs */ | |
4749 do | 4730 do |
4750 { | 4731 { |
4751 PREFETCH (); | 4732 PREFETCH (); |
4752 if (RE_TRANSLATE (translate, *d) != *p++) | 4733 if (RE_TRANSLATE (translate, *d) != *p++) |
4753 { | 4734 { |
4781 unsigned int buf_ch; | 4762 unsigned int buf_ch; |
4782 | 4763 |
4783 DEBUG_PRINT1 ("EXECUTING anychar.\n"); | 4764 DEBUG_PRINT1 ("EXECUTING anychar.\n"); |
4784 | 4765 |
4785 PREFETCH (); | 4766 PREFETCH (); |
4786 | 4767 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen); |
4787 #ifdef emacs | |
4788 if (multibyte) | |
4789 buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen); | |
4790 else | |
4791 #endif /* not emacs */ | |
4792 { | |
4793 buf_ch = *d; | |
4794 buf_charlen = 1; | |
4795 } | |
4796 | |
4797 buf_ch = TRANSLATE (buf_ch); | 4768 buf_ch = TRANSLATE (buf_ch); |
4798 | 4769 |
4799 if ((!(bufp->syntax & RE_DOT_NEWLINE) | 4770 if ((!(bufp->syntax & RE_DOT_NEWLINE) |
4800 && buf_ch == '\n') | 4771 && buf_ch == '\n') |
4801 || ((bufp->syntax & RE_DOT_NOT_NULL) | 4772 || ((bufp->syntax & RE_DOT_NOT_NULL) |
4826 in the initial byte-length of the command. */ | 4797 in the initial byte-length of the command. */ |
4827 int count = 0; | 4798 int count = 0; |
4828 | 4799 |
4829 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : ""); | 4800 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : ""); |
4830 | 4801 |
4831 PREFETCH (); | |
4832 c = *d; | |
4833 | |
4834 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]); | 4802 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]); |
4835 | 4803 |
4836 #ifdef emacs | |
4837 if (range_table_exists) | 4804 if (range_table_exists) |
4838 { | 4805 { |
4839 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */ | 4806 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */ |
4840 EXTRACT_NUMBER_AND_INCR (count, range_table); | 4807 EXTRACT_NUMBER_AND_INCR (count, range_table); |
4841 } | 4808 } |
4842 | 4809 |
4843 if (multibyte && BASE_LEADING_CODE_P (c)) | 4810 PREFETCH (); |
4844 c = STRING_CHAR_AND_LENGTH (d, dend - d, len); | 4811 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len); |
4845 #endif /* emacs */ | 4812 c = TRANSLATE (c); /* The character to match. */ |
4846 | 4813 |
4847 if (SINGLE_BYTE_CHAR_P (c)) | 4814 if (SINGLE_BYTE_CHAR_P (c)) |
4848 { /* Lookup bitmap. */ | 4815 { /* Lookup bitmap. */ |
4849 c = TRANSLATE (c); /* The character to match. */ | |
4850 len = 1; | |
4851 | |
4852 /* Cast to `unsigned' instead of `unsigned char' in | 4816 /* Cast to `unsigned' instead of `unsigned char' in |
4853 case the bit list is a full 32 bytes long. */ | 4817 case the bit list is a full 32 bytes long. */ |
4854 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH) | 4818 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH) |
4855 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) | 4819 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) |
4856 not = !not; | 4820 not = !not; |
4992 mcnt = dend2 - d2; | 4956 mcnt = dend2 - d2; |
4993 | 4957 |
4994 /* Compare that many; failure if mismatch, else move | 4958 /* Compare that many; failure if mismatch, else move |
4995 past them. */ | 4959 past them. */ |
4996 if (RE_TRANSLATE_P (translate) | 4960 if (RE_TRANSLATE_P (translate) |
4997 ? bcmp_translate (d, d2, mcnt, translate) | 4961 ? bcmp_translate (d, d2, mcnt, translate, multibyte) |
4998 : bcmp (d, d2, mcnt)) | 4962 : bcmp (d, d2, mcnt)) |
4999 { | 4963 { |
5000 d = dfail; | 4964 d = dfail; |
5001 goto fail; | 4965 goto fail; |
5002 } | 4966 } |
5261 { | 5225 { |
5262 /* C1 is the character before D, S1 is the syntax of C1, C2 | 5226 /* C1 is the character before D, S1 is the syntax of C1, C2 |
5263 is the character at D, and S2 is the syntax of C2. */ | 5227 is the character at D, and S2 is the syntax of C2. */ |
5264 int c1, c2, s1, s2; | 5228 int c1, c2, s1, s2; |
5265 #ifdef emacs | 5229 #ifdef emacs |
5266 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d - 1)); | 5230 int offset = PTR_TO_OFFSET (d - 1); |
5231 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); | |
5267 UPDATE_SYNTAX_TABLE (charpos); | 5232 UPDATE_SYNTAX_TABLE (charpos); |
5268 #endif | 5233 #endif |
5269 /* FIXME: This does a STRING_CHAR even for unibyte buffers. */ | |
5270 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); | 5234 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); |
5271 s1 = SYNTAX (c1); | 5235 s1 = SYNTAX (c1); |
5272 #ifdef emacs | 5236 #ifdef emacs |
5273 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1); | 5237 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1); |
5274 #endif | 5238 #endif |
5275 PREFETCH (); | 5239 PREFETCH (); |
5276 /* FIXME: This does a STRING_CHAR even for unibyte buffers. */ | 5240 c2 = RE_STRING_CHAR (d, dend - d); |
5277 c2 = STRING_CHAR (d, dend - d); | |
5278 s2 = SYNTAX (c2); | 5241 s2 = SYNTAX (c2); |
5279 | 5242 |
5280 if (/* Case 2: Only one of S1 and S2 is Sword. */ | 5243 if (/* Case 2: Only one of S1 and S2 is Sword. */ |
5281 ((s1 == Sword) != (s2 == Sword)) | 5244 ((s1 == Sword) != (s2 == Sword)) |
5282 /* Case 3: Both of S1 and S2 are Sword, and macro | 5245 /* Case 3: Both of S1 and S2 are Sword, and macro |
5301 { | 5264 { |
5302 /* C1 is the character before D, S1 is the syntax of C1, C2 | 5265 /* C1 is the character before D, S1 is the syntax of C1, C2 |
5303 is the character at D, and S2 is the syntax of C2. */ | 5266 is the character at D, and S2 is the syntax of C2. */ |
5304 int c1, c2, s1, s2; | 5267 int c1, c2, s1, s2; |
5305 #ifdef emacs | 5268 #ifdef emacs |
5306 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)); | 5269 int offset = PTR_TO_OFFSET (d); |
5270 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); | |
5307 UPDATE_SYNTAX_TABLE (charpos); | 5271 UPDATE_SYNTAX_TABLE (charpos); |
5308 #endif | 5272 #endif |
5309 PREFETCH (); | 5273 PREFETCH (); |
5310 /* FIXME: This does a STRING_CHAR even for unibyte buffers. */ | 5274 c2 = RE_STRING_CHAR (d, dend - d); |
5311 c2 = STRING_CHAR (d, dend - d); | |
5312 s2 = SYNTAX (c2); | 5275 s2 = SYNTAX (c2); |
5313 | 5276 |
5314 /* Case 2: S2 is not Sword. */ | 5277 /* Case 2: S2 is not Sword. */ |
5315 if (s2 != Sword) | 5278 if (s2 != Sword) |
5316 goto fail; | 5279 goto fail; |
5344 { | 5307 { |
5345 /* C1 is the character before D, S1 is the syntax of C1, C2 | 5308 /* C1 is the character before D, S1 is the syntax of C1, C2 |
5346 is the character at D, and S2 is the syntax of C2. */ | 5309 is the character at D, and S2 is the syntax of C2. */ |
5347 int c1, c2, s1, s2; | 5310 int c1, c2, s1, s2; |
5348 #ifdef emacs | 5311 #ifdef emacs |
5349 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d) - 1); | 5312 int offset = PTR_TO_OFFSET (d) - 1; |
5313 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); | |
5350 UPDATE_SYNTAX_TABLE (charpos); | 5314 UPDATE_SYNTAX_TABLE (charpos); |
5351 #endif | 5315 #endif |
5352 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); | 5316 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); |
5353 s1 = SYNTAX (c1); | 5317 s1 = SYNTAX (c1); |
5354 | 5318 |
5358 | 5322 |
5359 /* Case 3: D is not at the end of string ... */ | 5323 /* Case 3: D is not at the end of string ... */ |
5360 if (!AT_STRINGS_END (d)) | 5324 if (!AT_STRINGS_END (d)) |
5361 { | 5325 { |
5362 PREFETCH (); | 5326 PREFETCH (); |
5363 /* FIXME: This does a STRING_CHAR even for unibyte buffers. */ | 5327 c2 = RE_STRING_CHAR (d, dend - d); |
5364 c2 = STRING_CHAR (d, dend - d); | |
5365 #ifdef emacs | 5328 #ifdef emacs |
5366 UPDATE_SYNTAX_TABLE_FORWARD (charpos); | 5329 UPDATE_SYNTAX_TABLE_FORWARD (charpos); |
5367 #endif | 5330 #endif |
5368 s2 = SYNTAX (c2); | 5331 s2 = SYNTAX (c2); |
5369 | 5332 |
5381 mcnt = *p++; | 5344 mcnt = *p++; |
5382 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt); | 5345 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt); |
5383 PREFETCH (); | 5346 PREFETCH (); |
5384 #ifdef emacs | 5347 #ifdef emacs |
5385 { | 5348 { |
5386 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)); | 5349 int offset = PTR_TO_OFFSET (d); |
5350 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset); | |
5387 UPDATE_SYNTAX_TABLE (pos1); | 5351 UPDATE_SYNTAX_TABLE (pos1); |
5388 } | 5352 } |
5389 #endif | 5353 #endif |
5390 { | 5354 { |
5391 int c, len; | 5355 int c, len; |
5392 | 5356 |
5393 if (multibyte) | 5357 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len); |
5394 /* we must concern about multibyte form, ... */ | |
5395 c = STRING_CHAR_AND_LENGTH (d, dend - d, len); | |
5396 else | |
5397 /* everything should be handled as ASCII, even though it | |
5398 looks like multibyte form. */ | |
5399 c = *d, len = 1; | |
5400 | 5358 |
5401 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not) | 5359 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not) |
5402 goto fail; | 5360 goto fail; |
5403 d += len; | 5361 d += len; |
5404 } | 5362 } |
5429 mcnt = *p++; | 5387 mcnt = *p++; |
5430 DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n", not?"not":"", mcnt); | 5388 DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n", not?"not":"", mcnt); |
5431 PREFETCH (); | 5389 PREFETCH (); |
5432 { | 5390 { |
5433 int c, len; | 5391 int c, len; |
5434 | 5392 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len); |
5435 if (multibyte) | |
5436 c = STRING_CHAR_AND_LENGTH (d, dend - d, len); | |
5437 else | |
5438 c = *d, len = 1; | |
5439 | 5393 |
5440 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not) | 5394 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not) |
5441 goto fail; | 5395 goto fail; |
5442 d += len; | 5396 d += len; |
5443 } | 5397 } |
5510 | 5464 |
5511 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN | 5465 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN |
5512 bytes; nonzero otherwise. */ | 5466 bytes; nonzero otherwise. */ |
5513 | 5467 |
5514 static int | 5468 static int |
5515 bcmp_translate (s1, s2, len, translate) | 5469 bcmp_translate (s1, s2, len, translate, multibyte) |
5516 unsigned char *s1, *s2; | 5470 re_char *s1, *s2; |
5517 register int len; | 5471 register int len; |
5518 RE_TRANSLATE_TYPE translate; | 5472 RE_TRANSLATE_TYPE translate; |
5473 const int multibyte; | |
5519 { | 5474 { |
5520 register unsigned char *p1 = s1, *p2 = s2; | 5475 register re_char *p1 = s1, *p2 = s2; |
5521 unsigned char *p1_end = s1 + len; | 5476 re_char *p1_end = s1 + len; |
5522 unsigned char *p2_end = s2 + len; | 5477 re_char *p2_end = s2 + len; |
5523 | 5478 |
5524 while (p1 != p1_end && p2 != p2_end) | 5479 while (p1 != p1_end && p2 != p2_end) |
5525 { | 5480 { |
5526 int p1_charlen, p2_charlen; | 5481 int p1_charlen, p2_charlen; |
5527 int p1_ch, p2_ch; | 5482 int p1_ch, p2_ch; |
5528 | 5483 |
5529 /* FIXME: This assumes `multibyte = true'. */ | 5484 p1_ch = RE_STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen); |
5530 p1_ch = STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen); | 5485 p2_ch = RE_STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen); |
5531 p2_ch = STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen); | |
5532 | 5486 |
5533 if (RE_TRANSLATE (translate, p1_ch) | 5487 if (RE_TRANSLATE (translate, p1_ch) |
5534 != RE_TRANSLATE (translate, p2_ch)) | 5488 != RE_TRANSLATE (translate, p2_ch)) |
5535 return 1; | 5489 return 1; |
5536 | 5490 |