comparison src/regex.c @ 28473:975fe3d8922e

* regex.c (PTR_TO_OFFSET) [!emacs]: Remove. (RE_MULTIBYTE_P, RE_STRING_CHAR_AND_LENGTH): New macros. (GET_CHAR_BEFORE_2): Moved from charset.h plus fixed minor bug when we are between str1 and str2. (MAX_MULTIBYTE_LENGTH, CHAR_STRING) [!emacs]: Provide trivial default. (PATFETCH): Use `TRANSLATE'. (PATFETCH_RAW): Fetch multibyte char if applicable. (PATUNFETCH): Remove. (regex_compile): Rely on PATFETCH to do most of the multibyte magic. When writing a char, write it directly into the pattern buffer rather than going needlessly through a temp char-array. (re_match_2_internal): Similarly, rely on RE_STRING_CHAR to do the multibyte magic and remove the useless `#ifdef emacs'. (bcmp_translate): Don't compare as multibyte chars when in a unibyte buffer. * regex.h (struct re_pattern_buffer): Make field `multibyte' conditional on `emacs'. * charset.h (GET_CHAR_BEFORE_2): Moved to regex.c.
author Stefan Monnier <monnier@iro.umontreal.ca>
date Sun, 02 Apr 2000 23:56:46 +0000
parents 5478842aea4c
children d40a7f046efe
comparison
equal deleted inserted replaced
28472:bae9218986ac 28473:975fe3d8922e
18 along with this program; if not, write to the Free Software 18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, 19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
20 USA. */ 20 USA. */
21 21
22 /* TODO: 22 /* TODO:
23 - clean up multibyte issues
24 - structure the opcode space into opcode+flag. 23 - structure the opcode space into opcode+flag.
25 - merge with glibc's regex.[ch] 24 - merge with glibc's regex.[ch]
26 */ 25 */
27 26
28 /* AIX requires this to be the first thing in the file. */ 27 /* AIX requires this to be the first thing in the file. */
35 34
36 #ifdef emacs 35 #ifdef emacs
37 /* Converts the pointer to the char to BEG-based offset from the start. */ 36 /* Converts the pointer to the char to BEG-based offset from the start. */
38 #define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d)) 37 #define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
39 #define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object))) 38 #define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
40 #else
41 #define PTR_TO_OFFSET(d) 0
42 #endif 39 #endif
43 40
44 #ifdef HAVE_CONFIG_H 41 #ifdef HAVE_CONFIG_H
45 #include <config.h> 42 #include <config.h>
46 #endif 43 #endif
77 74
78 #define malloc xmalloc 75 #define malloc xmalloc
79 #define realloc xrealloc 76 #define realloc xrealloc
80 #define free xfree 77 #define free xfree
81 78
79 #define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
82 #define RE_STRING_CHAR(p, s) \ 80 #define RE_STRING_CHAR(p, s) \
83 (multibyte ? (STRING_CHAR (p, s)) : (*(p))) 81 (multibyte ? (STRING_CHAR (p, s)) : (*(p)))
82 #define RE_STRING_CHAR_AND_LENGTH(p, s, len) \
83 (multibyte ? (STRING_CHAR_AND_LENGTH (p, s, len)) : ((len) = 1, *(p)))
84
85 /* Set C to the (possibly multibyte) character before P. P points into a
86 string which is the virtual concatenation of STR1 (which ends at
87 END1) and STR2 (which ends at END2). */
88 #define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
89 do { \
90 if (multibyte) \
91 { \
92 re_char *dtemp = (p) == (str2) ? (end1) : (p); \
93 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
94 while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \
95 c = STRING_CHAR (dtemp, (p) - dtemp); \
96 } \
97 else \
98 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
99 } while (0)
100
84 101
85 #else /* not emacs */ 102 #else /* not emacs */
86 103
87 /* If we are not linking with Emacs proper, 104 /* If we are not linking with Emacs proper,
88 we can't use the relocating allocator 105 we can't use the relocating allocator
179 196
180 /* Dummy macros for non-Emacs environments. */ 197 /* Dummy macros for non-Emacs environments. */
181 #define BASE_LEADING_CODE_P(c) (0) 198 #define BASE_LEADING_CODE_P(c) (0)
182 #define CHAR_CHARSET(c) 0 199 #define CHAR_CHARSET(c) 0
183 #define CHARSET_LEADING_CODE_BASE(c) 0 200 #define CHARSET_LEADING_CODE_BASE(c) 0
201 #define MAX_MULTIBYTE_LENGTH 1
202 #define RE_MULTIBYTE_P(x) 0
184 #define WORD_BOUNDARY_P(c1, c2) (0) 203 #define WORD_BOUNDARY_P(c1, c2) (0)
185 #define CHAR_HEAD_P(p) (1) 204 #define CHAR_HEAD_P(p) (1)
186 #define SINGLE_BYTE_CHAR_P(c) (1) 205 #define SINGLE_BYTE_CHAR_P(c) (1)
187 #define SAME_CHARSET_P(c1, c2) (1) 206 #define SAME_CHARSET_P(c1, c2) (1)
188 #define MULTIBYTE_FORM_LENGTH(p, s) (1) 207 #define MULTIBYTE_FORM_LENGTH(p, s) (1)
189 #define STRING_CHAR(p, s) (*(p)) 208 #define STRING_CHAR(p, s) (*(p))
190 #define RE_STRING_CHAR STRING_CHAR 209 #define RE_STRING_CHAR STRING_CHAR
210 #define CHAR_STRING(c, s) (*(s) = (c), 1)
191 #define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p)) 211 #define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p))
212 #define RE_STRING_CHAR_AND_LENGTH STRING_CHAR_AND_LENGTH
192 #define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \ 213 #define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
193 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1))) 214 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
194 #endif /* not emacs */ 215 #endif /* not emacs */
195 216
196 #ifndef RE_TRANSLATE 217 #ifndef RE_TRANSLATE
1548 string passed to us by the user to an unsigned char that we can use 1569 string passed to us by the user to an unsigned char that we can use
1549 as an array index (in, e.g., `translate'). */ 1570 as an array index (in, e.g., `translate'). */
1550 #define PATFETCH(c) \ 1571 #define PATFETCH(c) \
1551 do { \ 1572 do { \
1552 PATFETCH_RAW (c); \ 1573 PATFETCH_RAW (c); \
1553 if (RE_TRANSLATE_P (translate)) c = RE_TRANSLATE (translate, c); \ 1574 c = TRANSLATE (c); \
1554 } while (0) 1575 } while (0)
1555 1576
1556 /* Fetch the next character in the uncompiled pattern, with no 1577 /* Fetch the next character in the uncompiled pattern, with no
1557 translation. */ 1578 translation. */
1558 #define PATFETCH_RAW(c) \ 1579 #define PATFETCH_RAW(c) \
1559 do {if (p == pend) return REG_EEND; \ 1580 do { \
1560 c = *p++; \ 1581 int len; \
1582 if (p == pend) return REG_EEND; \
1583 c = RE_STRING_CHAR_AND_LENGTH (p, pend - p, len); \
1584 p += len; \
1561 } while (0) 1585 } while (0)
1562
1563 /* Go backwards one character in the pattern. */
1564 #define PATUNFETCH p--
1565 1586
1566 1587
1567 /* If `translate' is non-null, return translate[D], else just D. We 1588 /* If `translate' is non-null, return translate[D], else just D. We
1568 cast the subscript to translate because some data is declared as 1589 cast the subscript to translate because some data is declared as
1569 `char *', to avoid warnings when a string constant is passed. But 1590 `char *', to avoid warnings when a string constant is passed. But
1955 regnum_t regnum = 0; 1976 regnum_t regnum = 0;
1956 1977
1957 /* Work area for range table of charset. */ 1978 /* Work area for range table of charset. */
1958 struct range_table_work_area range_table_work; 1979 struct range_table_work_area range_table_work;
1959 1980
1981 /* Nonzero if the object matched can contain multibyte characters. */
1982 const boolean multibyte = RE_MULTIBYTE_P (bufp);
1983
1960 #ifdef DEBUG 1984 #ifdef DEBUG
1961 debug++; 1985 debug++;
1962 DEBUG_PRINT1 ("\nCompiling pattern: "); 1986 DEBUG_PRINT1 ("\nCompiling pattern: ");
1963 if (debug > 0) 1987 if (debug > 0)
1964 { 1988 {
1991 at the end. */ 2015 at the end. */
1992 bufp->used = 0; 2016 bufp->used = 0;
1993 2017
1994 /* Always count groups, whether or not bufp->no_sub is set. */ 2018 /* Always count groups, whether or not bufp->no_sub is set. */
1995 bufp->re_nsub = 0; 2019 bufp->re_nsub = 0;
1996
1997 #ifdef emacs
1998 /* bufp->multibyte is set before regex_compile is called, so don't alter
1999 it. */
2000 #else /* not emacs */
2001 /* Nothing is recognized as a multibyte character. */
2002 bufp->multibyte = 0;
2003 #endif
2004 2020
2005 #if !defined (emacs) && !defined (SYNTAX_TABLE) 2021 #if !defined (emacs) && !defined (SYNTAX_TABLE)
2006 /* Initialize the syntax table. */ 2022 /* Initialize the syntax table. */
2007 init_syntax_once (); 2023 init_syntax_once ();
2008 #endif 2024 #endif
2252 SET_LIST_BIT ('\n'); 2268 SET_LIST_BIT ('\n');
2253 2269
2254 /* Read in characters and ranges, setting map bits. */ 2270 /* Read in characters and ranges, setting map bits. */
2255 for (;;) 2271 for (;;)
2256 { 2272 {
2257 int len;
2258 boolean escaped_char = false; 2273 boolean escaped_char = false;
2274 const unsigned char *p2 = p;
2259 2275
2260 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2276 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2261 2277
2262 PATFETCH (c); 2278 PATFETCH (c);
2263 2279
2272 else 2288 else
2273 { 2289 {
2274 /* Could be the end of the bracket expression. If it's 2290 /* Could be the end of the bracket expression. If it's
2275 not (i.e., when the bracket expression is `[]' so 2291 not (i.e., when the bracket expression is `[]' so
2276 far), the ']' character bit gets set way below. */ 2292 far), the ']' character bit gets set way below. */
2277 if (c == ']' && p != p1 + 1) 2293 if (c == ']' && p2 != p1)
2278 break; 2294 break;
2279 } 2295 }
2280 2296
2281 /* If C indicates start of multibyte char, get the
2282 actual character code in C, and set the pattern
2283 pointer P to the next character boundary. */
2284 if (bufp->multibyte && BASE_LEADING_CODE_P (c))
2285 {
2286 PATUNFETCH;
2287 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
2288 p += len;
2289 }
2290 /* What should we do for the character which is 2297 /* What should we do for the character which is
2291 greater than 0x7F, but not BASE_LEADING_CODE_P? 2298 greater than 0x7F, but not BASE_LEADING_CODE_P?
2292 XXX */ 2299 XXX */
2293 2300
2294 /* See if we're at the beginning of a possible character 2301 /* See if we're at the beginning of a possible character
2295 class. */ 2302 class. */
2296 2303
2297 else if (!escaped_char && 2304 if (!escaped_char &&
2298 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') 2305 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2299 { 2306 {
2300 /* Leave room for the null. */ 2307 /* Leave room for the null. */
2301 char str[CHAR_CLASS_MAX_LENGTH + 1]; 2308 char str[CHAR_CLASS_MAX_LENGTH + 1];
2302 const unsigned char *class_beg; 2309 const unsigned char *class_beg;
2303 2310
2356 just set a flag. Exceptions are is_blank, 2363 just set a flag. Exceptions are is_blank,
2357 is_digit, is_cntrl, and is_xdigit, since 2364 is_digit, is_cntrl, and is_xdigit, since
2358 they can only match ASCII characters. We 2365 they can only match ASCII characters. We
2359 don't need to handle them for multibyte. */ 2366 don't need to handle them for multibyte. */
2360 2367
2361 if (bufp->multibyte) 2368 if (multibyte)
2362 { 2369 {
2363 int bit = 0; 2370 int bit = 0;
2364 2371
2365 if (is_alnum) bit = BIT_ALNUM; 2372 if (is_alnum) bit = BIT_ALNUM;
2366 if (is_alpha) bit = BIT_ALPHA; 2373 if (is_alpha) bit = BIT_ALPHA;
2433 /* Discard the `-'. */ 2440 /* Discard the `-'. */
2434 PATFETCH (c1); 2441 PATFETCH (c1);
2435 2442
2436 /* Fetch the character which ends the range. */ 2443 /* Fetch the character which ends the range. */
2437 PATFETCH (c1); 2444 PATFETCH (c1);
2438 if (bufp->multibyte && BASE_LEADING_CODE_P (c1))
2439 {
2440 PATUNFETCH;
2441 c1 = STRING_CHAR_AND_LENGTH (p, pend - p, len);
2442 p += len;
2443 }
2444 2445
2445 if (SINGLE_BYTE_CHAR_P (c) 2446 if (SINGLE_BYTE_CHAR_P (c)
2446 && ! SINGLE_BYTE_CHAR_P (c1)) 2447 && ! SINGLE_BYTE_CHAR_P (c1))
2447 { 2448 {
2448 /* Handle a range such as \177-\377 in multibyte mode. 2449 /* Handle a range such as \177-\377 in multibyte mode.
3026 3027
3027 3028
3028 default: 3029 default:
3029 /* Expects the character in `c'. */ 3030 /* Expects the character in `c'. */
3030 normal_char: 3031 normal_char:
3031 p1 = p - 1; /* P1 points the head of C. */
3032 #ifdef emacs
3033 if (bufp->multibyte)
3034 {
3035 c = STRING_CHAR (p1, pend - p1);
3036 c = TRANSLATE (c);
3037 /* Set P to the next character boundary. */
3038 p += MULTIBYTE_FORM_LENGTH (p1, pend - p1) - 1;
3039 }
3040 #endif
3041 /* If no exactn currently being built. */ 3032 /* If no exactn currently being built. */
3042 if (!pending_exact 3033 if (!pending_exact
3043 3034
3044 /* If last exactn not at current position. */ 3035 /* If last exactn not at current position. */
3045 || pending_exact + *pending_exact + 1 != b 3036 || pending_exact + *pending_exact + 1 != b
3046 3037
3047 /* We have only one byte following the exactn for the count. */ 3038 /* We have only one byte following the exactn for the count. */
3048 || *pending_exact >= (1 << BYTEWIDTH) - (p - p1) 3039 || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
3049 3040
3050 /* If followed by a repetition operator. */ 3041 /* If followed by a repetition operator. */
3051 || (p != pend && (*p == '*' || *p == '^')) 3042 || (p != pend && (*p == '*' || *p == '^'))
3052 || ((syntax & RE_BK_PLUS_QM) 3043 || ((syntax & RE_BK_PLUS_QM)
3053 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?') 3044 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3063 3054
3064 BUF_PUSH_2 (exactn, 0); 3055 BUF_PUSH_2 (exactn, 0);
3065 pending_exact = b - 1; 3056 pending_exact = b - 1;
3066 } 3057 }
3067 3058
3068 #ifdef emacs 3059 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
3069 if (! SINGLE_BYTE_CHAR_P (c)) 3060 {
3070 { 3061 int len = CHAR_STRING (c, b);
3071 unsigned char str[MAX_MULTIBYTE_LENGTH]; 3062 b += len;
3072 int i = CHAR_STRING (c, str); 3063 (*pending_exact) += len;
3073 int j; 3064 }
3074 for (j = 0; j < i; j++) 3065
3075 {
3076 BUF_PUSH (str[j]);
3077 (*pending_exact)++;
3078 }
3079 }
3080 else
3081 #endif
3082 {
3083 BUF_PUSH (c);
3084 (*pending_exact)++;
3085 }
3086 break; 3066 break;
3087 } /* switch (c) */ 3067 } /* switch (c) */
3088 } /* while p != pend */ 3068 } /* while p != pend */
3089 3069
3090 3070
3614 3594
3615 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */ 3595 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3616 bufp->fastmap_accurate = 1; /* It will be when we're done. */ 3596 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3617 3597
3618 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used, 3598 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
3619 fastmap, bufp->multibyte); 3599 fastmap, RE_MULTIBYTE_P (bufp));
3620 if (analysis < -1) 3600 if (analysis < -1)
3621 return analysis; 3601 return analysis;
3622 bufp->can_be_null = (analysis != 0); 3602 bufp->can_be_null = (analysis != 0);
3623 return 0; 3603 return 0;
3624 } /* re_compile_fastmap */ 3604 } /* re_compile_fastmap */
3721 int total_size = size1 + size2; 3701 int total_size = size1 + size2;
3722 int endpos = startpos + range; 3702 int endpos = startpos + range;
3723 int anchored_start = 0; 3703 int anchored_start = 0;
3724 3704
3725 /* Nonzero if we have to be concerned with multibyte characters. */ 3705 /* Nonzero if we have to be concerned with multibyte characters. */
3726 const boolean multibyte = bufp->multibyte; 3706 const boolean multibyte = RE_MULTIBYTE_P (bufp);
3727 3707
3728 /* Check for out-of-range STARTPOS. */ 3708 /* Check for out-of-range STARTPOS. */
3729 if (startpos < 0 || startpos > total_size) 3709 if (startpos < 0 || startpos > total_size)
3730 return -1; 3710 return -1;
3731 3711
3848 3828
3849 startpos += irange - range; 3829 startpos += irange - range;
3850 } 3830 }
3851 else /* Searching backwards. */ 3831 else /* Searching backwards. */
3852 { 3832 {
3853 buf_ch = STRING_CHAR (d, (startpos >= size1 3833 int room = (startpos >= size1
3854 ? size2 + size1 - startpos 3834 ? size2 + size1 - startpos
3855 : size1 - startpos)); 3835 : size1 - startpos);
3856 if (RE_TRANSLATE_P (translate)) 3836 buf_ch = RE_STRING_CHAR (d, room);
3857 buf_ch = RE_TRANSLATE (translate, buf_ch); 3837 buf_ch = TRANSLATE (buf_ch);
3858 3838
3859 if (! (buf_ch >= 0400 3839 if (! (buf_ch >= 0400
3860 || fastmap[buf_ch])) 3840 || fastmap[buf_ch]))
3861 goto advance; 3841 goto advance;
3862 } 3842 }
3938 return -1; 3918 return -1;
3939 } /* re_search_2 */ 3919 } /* re_search_2 */
3940 3920
3941 /* Declarations and macros for re_match_2. */ 3921 /* Declarations and macros for re_match_2. */
3942 3922
3943 static int bcmp_translate (); 3923 static int bcmp_translate _RE_ARGS((re_char *s1, re_char *s2,
3924 register int len,
3925 RE_TRANSLATE_TYPE translate,
3926 const int multibyte));
3944 3927
3945 /* This converts PTR, a pointer into one of the search strings `string1' 3928 /* This converts PTR, a pointer into one of the search strings `string1'
3946 and `string2' into an offset from the beginning of that string. */ 3929 and `string2' into an offset from the beginning of that string. */
3947 #define POINTER_TO_OFFSET(ptr) \ 3930 #define POINTER_TO_OFFSET(ptr) \
3948 (FIRST_STRING_P (ptr) \ 3931 (FIRST_STRING_P (ptr) \
4091 mutually_exclusive_p (bufp, p1, p2) 4074 mutually_exclusive_p (bufp, p1, p2)
4092 struct re_pattern_buffer *bufp; 4075 struct re_pattern_buffer *bufp;
4093 unsigned char *p1, *p2; 4076 unsigned char *p1, *p2;
4094 { 4077 {
4095 re_opcode_t op2; 4078 re_opcode_t op2;
4096 const boolean multibyte = bufp->multibyte; 4079 const boolean multibyte = RE_MULTIBYTE_P (bufp);
4097 unsigned char *pend = bufp->buffer + bufp->used; 4080 unsigned char *pend = bufp->buffer + bufp->used;
4098 4081
4099 assert (p1 >= bufp->buffer && p1 < pend 4082 assert (p1 >= bufp->buffer && p1 < pend
4100 && p2 >= bufp->buffer && p2 <= pend); 4083 && p2 >= bufp->buffer && p2 <= pend);
4101 4084
4371 4354
4372 /* We use this to map every character in the string. */ 4355 /* We use this to map every character in the string. */
4373 RE_TRANSLATE_TYPE translate = bufp->translate; 4356 RE_TRANSLATE_TYPE translate = bufp->translate;
4374 4357
4375 /* Nonzero if we have to be concerned with multibyte characters. */ 4358 /* Nonzero if we have to be concerned with multibyte characters. */
4376 const boolean multibyte = bufp->multibyte; 4359 const boolean multibyte = RE_MULTIBYTE_P (bufp);
4377 4360
4378 /* Failure point stack. Each place that can handle a failure further 4361 /* Failure point stack. Each place that can handle a failure further
4379 down the line pushes a failure point on this stack. It consists of 4362 down the line pushes a failure point on this stack. It consists of
4380 regstart, and regend for all registers corresponding to 4363 regstart, and regend for all registers corresponding to
4381 the subexpressions we're currently inside, plus the number of such 4364 the subexpressions we're currently inside, plus the number of such
4719 4702
4720 /* This is written out as an if-else so we don't waste time 4703 /* This is written out as an if-else so we don't waste time
4721 testing `translate' inside the loop. */ 4704 testing `translate' inside the loop. */
4722 if (RE_TRANSLATE_P (translate)) 4705 if (RE_TRANSLATE_P (translate))
4723 { 4706 {
4724 #ifdef emacs
4725 if (multibyte) 4707 if (multibyte)
4726 do 4708 do
4727 { 4709 {
4728 int pat_charlen, buf_charlen; 4710 int pat_charlen, buf_charlen;
4729 unsigned int pat_ch, buf_ch; 4711 unsigned int pat_ch, buf_ch;
4743 d += buf_charlen; 4725 d += buf_charlen;
4744 mcnt -= pat_charlen; 4726 mcnt -= pat_charlen;
4745 } 4727 }
4746 while (mcnt > 0); 4728 while (mcnt > 0);
4747 else 4729 else
4748 #endif /* not emacs */
4749 do 4730 do
4750 { 4731 {
4751 PREFETCH (); 4732 PREFETCH ();
4752 if (RE_TRANSLATE (translate, *d) != *p++) 4733 if (RE_TRANSLATE (translate, *d) != *p++)
4753 { 4734 {
4781 unsigned int buf_ch; 4762 unsigned int buf_ch;
4782 4763
4783 DEBUG_PRINT1 ("EXECUTING anychar.\n"); 4764 DEBUG_PRINT1 ("EXECUTING anychar.\n");
4784 4765
4785 PREFETCH (); 4766 PREFETCH ();
4786 4767 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
4787 #ifdef emacs
4788 if (multibyte)
4789 buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
4790 else
4791 #endif /* not emacs */
4792 {
4793 buf_ch = *d;
4794 buf_charlen = 1;
4795 }
4796
4797 buf_ch = TRANSLATE (buf_ch); 4768 buf_ch = TRANSLATE (buf_ch);
4798 4769
4799 if ((!(bufp->syntax & RE_DOT_NEWLINE) 4770 if ((!(bufp->syntax & RE_DOT_NEWLINE)
4800 && buf_ch == '\n') 4771 && buf_ch == '\n')
4801 || ((bufp->syntax & RE_DOT_NOT_NULL) 4772 || ((bufp->syntax & RE_DOT_NOT_NULL)
4826 in the initial byte-length of the command. */ 4797 in the initial byte-length of the command. */
4827 int count = 0; 4798 int count = 0;
4828 4799
4829 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : ""); 4800 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
4830 4801
4831 PREFETCH ();
4832 c = *d;
4833
4834 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]); 4802 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
4835 4803
4836 #ifdef emacs
4837 if (range_table_exists) 4804 if (range_table_exists)
4838 { 4805 {
4839 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */ 4806 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
4840 EXTRACT_NUMBER_AND_INCR (count, range_table); 4807 EXTRACT_NUMBER_AND_INCR (count, range_table);
4841 } 4808 }
4842 4809
4843 if (multibyte && BASE_LEADING_CODE_P (c)) 4810 PREFETCH ();
4844 c = STRING_CHAR_AND_LENGTH (d, dend - d, len); 4811 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
4845 #endif /* emacs */ 4812 c = TRANSLATE (c); /* The character to match. */
4846 4813
4847 if (SINGLE_BYTE_CHAR_P (c)) 4814 if (SINGLE_BYTE_CHAR_P (c))
4848 { /* Lookup bitmap. */ 4815 { /* Lookup bitmap. */
4849 c = TRANSLATE (c); /* The character to match. */
4850 len = 1;
4851
4852 /* Cast to `unsigned' instead of `unsigned char' in 4816 /* Cast to `unsigned' instead of `unsigned char' in
4853 case the bit list is a full 32 bytes long. */ 4817 case the bit list is a full 32 bytes long. */
4854 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH) 4818 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
4855 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) 4819 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4856 not = !not; 4820 not = !not;
4992 mcnt = dend2 - d2; 4956 mcnt = dend2 - d2;
4993 4957
4994 /* Compare that many; failure if mismatch, else move 4958 /* Compare that many; failure if mismatch, else move
4995 past them. */ 4959 past them. */
4996 if (RE_TRANSLATE_P (translate) 4960 if (RE_TRANSLATE_P (translate)
4997 ? bcmp_translate (d, d2, mcnt, translate) 4961 ? bcmp_translate (d, d2, mcnt, translate, multibyte)
4998 : bcmp (d, d2, mcnt)) 4962 : bcmp (d, d2, mcnt))
4999 { 4963 {
5000 d = dfail; 4964 d = dfail;
5001 goto fail; 4965 goto fail;
5002 } 4966 }
5261 { 5225 {
5262 /* C1 is the character before D, S1 is the syntax of C1, C2 5226 /* C1 is the character before D, S1 is the syntax of C1, C2
5263 is the character at D, and S2 is the syntax of C2. */ 5227 is the character at D, and S2 is the syntax of C2. */
5264 int c1, c2, s1, s2; 5228 int c1, c2, s1, s2;
5265 #ifdef emacs 5229 #ifdef emacs
5266 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d - 1)); 5230 int offset = PTR_TO_OFFSET (d - 1);
5231 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5267 UPDATE_SYNTAX_TABLE (charpos); 5232 UPDATE_SYNTAX_TABLE (charpos);
5268 #endif 5233 #endif
5269 /* FIXME: This does a STRING_CHAR even for unibyte buffers. */
5270 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); 5234 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5271 s1 = SYNTAX (c1); 5235 s1 = SYNTAX (c1);
5272 #ifdef emacs 5236 #ifdef emacs
5273 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1); 5237 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
5274 #endif 5238 #endif
5275 PREFETCH (); 5239 PREFETCH ();
5276 /* FIXME: This does a STRING_CHAR even for unibyte buffers. */ 5240 c2 = RE_STRING_CHAR (d, dend - d);
5277 c2 = STRING_CHAR (d, dend - d);
5278 s2 = SYNTAX (c2); 5241 s2 = SYNTAX (c2);
5279 5242
5280 if (/* Case 2: Only one of S1 and S2 is Sword. */ 5243 if (/* Case 2: Only one of S1 and S2 is Sword. */
5281 ((s1 == Sword) != (s2 == Sword)) 5244 ((s1 == Sword) != (s2 == Sword))
5282 /* Case 3: Both of S1 and S2 are Sword, and macro 5245 /* Case 3: Both of S1 and S2 are Sword, and macro
5301 { 5264 {
5302 /* C1 is the character before D, S1 is the syntax of C1, C2 5265 /* C1 is the character before D, S1 is the syntax of C1, C2
5303 is the character at D, and S2 is the syntax of C2. */ 5266 is the character at D, and S2 is the syntax of C2. */
5304 int c1, c2, s1, s2; 5267 int c1, c2, s1, s2;
5305 #ifdef emacs 5268 #ifdef emacs
5306 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)); 5269 int offset = PTR_TO_OFFSET (d);
5270 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5307 UPDATE_SYNTAX_TABLE (charpos); 5271 UPDATE_SYNTAX_TABLE (charpos);
5308 #endif 5272 #endif
5309 PREFETCH (); 5273 PREFETCH ();
5310 /* FIXME: This does a STRING_CHAR even for unibyte buffers. */ 5274 c2 = RE_STRING_CHAR (d, dend - d);
5311 c2 = STRING_CHAR (d, dend - d);
5312 s2 = SYNTAX (c2); 5275 s2 = SYNTAX (c2);
5313 5276
5314 /* Case 2: S2 is not Sword. */ 5277 /* Case 2: S2 is not Sword. */
5315 if (s2 != Sword) 5278 if (s2 != Sword)
5316 goto fail; 5279 goto fail;
5344 { 5307 {
5345 /* C1 is the character before D, S1 is the syntax of C1, C2 5308 /* C1 is the character before D, S1 is the syntax of C1, C2
5346 is the character at D, and S2 is the syntax of C2. */ 5309 is the character at D, and S2 is the syntax of C2. */
5347 int c1, c2, s1, s2; 5310 int c1, c2, s1, s2;
5348 #ifdef emacs 5311 #ifdef emacs
5349 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d) - 1); 5312 int offset = PTR_TO_OFFSET (d) - 1;
5313 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5350 UPDATE_SYNTAX_TABLE (charpos); 5314 UPDATE_SYNTAX_TABLE (charpos);
5351 #endif 5315 #endif
5352 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); 5316 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5353 s1 = SYNTAX (c1); 5317 s1 = SYNTAX (c1);
5354 5318
5358 5322
5359 /* Case 3: D is not at the end of string ... */ 5323 /* Case 3: D is not at the end of string ... */
5360 if (!AT_STRINGS_END (d)) 5324 if (!AT_STRINGS_END (d))
5361 { 5325 {
5362 PREFETCH (); 5326 PREFETCH ();
5363 /* FIXME: This does a STRING_CHAR even for unibyte buffers. */ 5327 c2 = RE_STRING_CHAR (d, dend - d);
5364 c2 = STRING_CHAR (d, dend - d);
5365 #ifdef emacs 5328 #ifdef emacs
5366 UPDATE_SYNTAX_TABLE_FORWARD (charpos); 5329 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
5367 #endif 5330 #endif
5368 s2 = SYNTAX (c2); 5331 s2 = SYNTAX (c2);
5369 5332
5381 mcnt = *p++; 5344 mcnt = *p++;
5382 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt); 5345 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt);
5383 PREFETCH (); 5346 PREFETCH ();
5384 #ifdef emacs 5347 #ifdef emacs
5385 { 5348 {
5386 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)); 5349 int offset = PTR_TO_OFFSET (d);
5350 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5387 UPDATE_SYNTAX_TABLE (pos1); 5351 UPDATE_SYNTAX_TABLE (pos1);
5388 } 5352 }
5389 #endif 5353 #endif
5390 { 5354 {
5391 int c, len; 5355 int c, len;
5392 5356
5393 if (multibyte) 5357 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
5394 /* we must concern about multibyte form, ... */
5395 c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
5396 else
5397 /* everything should be handled as ASCII, even though it
5398 looks like multibyte form. */
5399 c = *d, len = 1;
5400 5358
5401 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not) 5359 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
5402 goto fail; 5360 goto fail;
5403 d += len; 5361 d += len;
5404 } 5362 }
5429 mcnt = *p++; 5387 mcnt = *p++;
5430 DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n", not?"not":"", mcnt); 5388 DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n", not?"not":"", mcnt);
5431 PREFETCH (); 5389 PREFETCH ();
5432 { 5390 {
5433 int c, len; 5391 int c, len;
5434 5392 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
5435 if (multibyte)
5436 c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
5437 else
5438 c = *d, len = 1;
5439 5393
5440 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not) 5394 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
5441 goto fail; 5395 goto fail;
5442 d += len; 5396 d += len;
5443 } 5397 }
5510 5464
5511 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN 5465 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
5512 bytes; nonzero otherwise. */ 5466 bytes; nonzero otherwise. */
5513 5467
5514 static int 5468 static int
5515 bcmp_translate (s1, s2, len, translate) 5469 bcmp_translate (s1, s2, len, translate, multibyte)
5516 unsigned char *s1, *s2; 5470 re_char *s1, *s2;
5517 register int len; 5471 register int len;
5518 RE_TRANSLATE_TYPE translate; 5472 RE_TRANSLATE_TYPE translate;
5473 const int multibyte;
5519 { 5474 {
5520 register unsigned char *p1 = s1, *p2 = s2; 5475 register re_char *p1 = s1, *p2 = s2;
5521 unsigned char *p1_end = s1 + len; 5476 re_char *p1_end = s1 + len;
5522 unsigned char *p2_end = s2 + len; 5477 re_char *p2_end = s2 + len;
5523 5478
5524 while (p1 != p1_end && p2 != p2_end) 5479 while (p1 != p1_end && p2 != p2_end)
5525 { 5480 {
5526 int p1_charlen, p2_charlen; 5481 int p1_charlen, p2_charlen;
5527 int p1_ch, p2_ch; 5482 int p1_ch, p2_ch;
5528 5483
5529 /* FIXME: This assumes `multibyte = true'. */ 5484 p1_ch = RE_STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen);
5530 p1_ch = STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen); 5485 p2_ch = RE_STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen);
5531 p2_ch = STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen);
5532 5486
5533 if (RE_TRANSLATE (translate, p1_ch) 5487 if (RE_TRANSLATE (translate, p1_ch)
5534 != RE_TRANSLATE (translate, p2_ch)) 5488 != RE_TRANSLATE (translate, p2_ch))
5535 return 1; 5489 return 1;
5536 5490