Mercurial > emacs
changeset 21348:64590f10c605
(compile_range): Unused function deleted.
(regex_compile): Special handling for range \177-\377.
(regex_compile): Cast args to TRANSLATE to unsigned char.
(re_search_2): Fix forward scan handling multibyte.
Recognize that nonascii characters are not in the fastmap.
Handle fetching multibyte characters for backward scan.
(re_match_2_internal): Handle multibyte and translation
in exactn and anychar.
(bcmp_translate): Handle multibyte chars for translation.
(TRANSLATE): Don't cast to unsigned char.
(PATFETCH): Use RE_TRANSLATE to translate.
author | Richard M. Stallman <rms@gnu.org> |
---|---|
date | Fri, 03 Apr 1998 07:33:13 +0000 |
parents | aca24aa07fb4 |
children | ede1a6e9e122 |
files | src/regex.c |
diffstat | 1 files changed, 137 insertions(+), 101 deletions(-) [+] |
line wrap: on
line diff
--- a/src/regex.c Fri Apr 03 05:34:10 1998 +0000 +++ b/src/regex.c Fri Apr 03 07:33:13 1998 +0000 @@ -168,7 +168,7 @@ #define SYNTAX(c) re_syntax_table[c] -/* Dummy macro for non emacs environments. */ +/* Dummy macros for non-Emacs environments. */ #define BASE_LEADING_CODE_P(c) (0) #define WORD_BOUNDARY_P(c1, c2) (0) #define CHAR_HEAD_P(p) (1) @@ -1539,7 +1539,7 @@ #define PATFETCH(c) \ do {if (p == pend) return REG_EEND; \ c = (unsigned char) *p++; \ - if (translate) c = (unsigned char) translate[c]; \ + if (translate) c = RE_TRANSLATE (translate, c); \ } while (0) #endif @@ -1560,7 +1560,7 @@ when we use a character as a subscript we must make it unsigned. */ #ifndef TRANSLATE #define TRANSLATE(d) \ - (translate ? (unsigned char) RE_TRANSLATE (translate, (unsigned char) (d)) : (d)) + (translate ? (unsigned) RE_TRANSLATE (translate, (unsigned) (d)) : (d)) #endif @@ -2107,9 +2107,10 @@ incremented `p', by the way, to be the character after the `*'. Do we have to do something analogous here for null bytes, because of RE_DOT_NOT_NULL? */ - if (TRANSLATE (*(p - 2)) == TRANSLATE ('.') + if (TRANSLATE ((unsigned char)*(p - 2)) == TRANSLATE ('.') && zero_times_ok - && p < pend && TRANSLATE (*p) == TRANSLATE ('\n') + && p < pend + && TRANSLATE ((unsigned char)*p) == TRANSLATE ('\n') && !(syntax & RE_DOT_NEWLINE)) { /* We have .*\n. */ STORE_JUMP (jump, b, laststart); @@ -2333,7 +2334,18 @@ p += len; } - if (!SAME_CHARSET_P (c, c1)) + if (SINGLE_BYTE_CHAR_P (c) + && ! SINGLE_BYTE_CHAR_P (c1)) + { + /* Handle a range such as \177-\377 in multibyte mode. + Split that into two ranges,, + the low one ending at 0237, and the high one + starting at ...040. 
*/ + int c1_base = (c1 & ~0177) | 040; + SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1); + c1 = 0237; + } + else if (!SAME_CHARSET_P (c, c1)) FREE_STACK_RETURN (REG_ERANGE); } else @@ -2359,8 +2371,8 @@ for (this_char = range_start; this_char <= range_end; this_char++) SET_LIST_BIT (TRANSLATE (this_char)); + } } - } else /* ... into range table. */ SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1); @@ -2913,8 +2925,8 @@ /* Here, C may translated, therefore C may not equal to *P1. */ while (1) { - BUF_PUSH (c); - (*pending_exact)++; + BUF_PUSH (c); + (*pending_exact)++; if (++p1 == p) break; @@ -3121,64 +3133,6 @@ return false; } - - -/* Read the ending character of a range (in a bracket expression) from the - uncompiled pattern *P_PTR (which ends at PEND). We assume the - starting character is in `P[-2]'. (`P[-1]' is the character `-'.) - Then we set the translation of all bits between the starting and - ending characters (inclusive) in the compiled pattern B. - - Return an error code. - - We use these short variable names so we can use the same macros as - `regex_compile' itself. */ - -static reg_errcode_t -compile_range (p_ptr, pend, translate, syntax, b) - const char **p_ptr, *pend; - RE_TRANSLATE_TYPE translate; - reg_syntax_t syntax; - unsigned char *b; -{ - unsigned this_char; - - const char *p = *p_ptr; - int range_start, range_end; - - if (p == pend) - return REG_ERANGE; - - /* Even though the pattern is a signed `char *', we need to fetch - with unsigned char *'s; if the high bit of the pattern character - is set, the range endpoints will be negative if we fetch using a - signed char *. - - We also want to fetch the endpoints without translating them; the - appropriate translation is done in the bit-setting loop below. */ - /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. 
*/ - range_start = ((const unsigned char *) p)[-2]; - range_end = ((const unsigned char *) p)[0]; - - /* Have to increment the pointer into the pattern string, so the - caller isn't still at the ending character. */ - (*p_ptr)++; - - /* If the start is after the end, the range is empty. */ - if (range_start > range_end) - return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; - - /* Here we see why `this_char' has to be larger than an `unsigned - char' -- the range is inclusive, so if `range_end' == 0xff - (assuming 8-bit characters), we would otherwise go into an infinite - loop, since all characters <= 0xff. */ - for (this_char = range_start; this_char <= range_end; this_char++) - { - SET_LIST_BIT (TRANSLATE (this_char)); - } - - return REG_NOERROR; -} /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible @@ -3812,24 +3766,45 @@ the first null string. */ if (fastmap && startpos < total_size && !bufp->can_be_null) { + register const char *d; + register unsigned int buf_ch; + + d = POS_ADDR_VSTRING (startpos); + if (range > 0) /* Searching forwards. */ { - register const char *d; register int lim = 0; int irange = range; if (startpos < size1 && startpos + range >= size1) lim = range - (size1 - startpos); - d = POS_ADDR_VSTRING (startpos); - /* Written out as an if-else to avoid testing `translate' inside the loop. 
*/ if (translate) - while (range > lim - && !fastmap[(unsigned char) - RE_TRANSLATE (translate, (unsigned char) *d++)]) - range--; + { + if (multibyte) + while (range > lim) + { + int buf_charlen; + + buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim, + buf_charlen); + + buf_ch = RE_TRANSLATE (translate, buf_ch); + if (buf_ch >= 0400 + || fastmap[buf_ch]) + break; + + range -= buf_charlen; + d += buf_charlen; + } + else + while (range > lim + && !fastmap[(unsigned char) + RE_TRANSLATE (translate, (unsigned char) *d++)]) + range--; + } else while (range > lim && !fastmap[(unsigned char) *d++]) range--; @@ -3838,11 +3813,16 @@ } else /* Searching backwards. */ { - register char c = (size1 == 0 || startpos >= size1 - ? string2[startpos - size1] - : string1[startpos]); - - if (!fastmap[(unsigned char) TRANSLATE (c)]) + int room = (size1 == 0 || startpos >= size1 + ? size2 + size1 - startpos + : size1 - startpos); + + buf_ch = STRING_CHAR (d, room); + if (translate) + buf_ch = RE_TRANSLATE (translate, buf_ch); + + if (! (buf_ch >= 0400 + || fastmap[buf_ch])) goto advance; } } @@ -4515,14 +4495,36 @@ testing `translate' inside the loop. */ if (translate) { - do - { - PREFETCH (); - if ((unsigned char) RE_TRANSLATE (translate, (unsigned char) *d++) - != (unsigned char) *p++) - goto fail; - } - while (--mcnt); +#ifdef emacs + if (multibyte) + do + { + int pat_charlen, buf_charlen; + int pat_ch, buf_ch; + + PREFETCH (); + pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen); + buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen); + + if (RE_TRANSLATE (translate, buf_ch) + != pat_ch) + goto fail; + + p += pat_charlen; + d += buf_charlen; + mcnt -= pat_charlen; + } + while (mcnt > 0); + else +#endif /* not emacs */ + do + { + PREFETCH (); + if ((unsigned char) RE_TRANSLATE (translate, (unsigned char) *d++) + != (unsigned char) *p++) + goto fail; + } + while (--mcnt); } else { @@ -4539,17 +4541,36 @@ /* Match any character except possibly a newline or a null. 
*/ case anychar: - DEBUG_PRINT1 ("EXECUTING anychar.\n"); - - PREFETCH (); - - if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n') - || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000')) - goto fail; - - SET_REGS_MATCHED (); - DEBUG_PRINT2 (" Matched `%d'.\n", *d); - d += multibyte ? MULTIBYTE_FORM_LENGTH (d, dend - d) : 1; + { + int buf_charlen; + int buf_ch; + + DEBUG_PRINT1 ("EXECUTING anychar.\n"); + + PREFETCH (); + +#ifdef emacs + if (multibyte) + buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen); + else +#endif /* not emacs */ + { + buf_ch = *d; + buf_charlen = 1; + } + + buf_ch = TRANSLATE (buf_ch); + + if ((!(bufp->syntax & RE_DOT_NEWLINE) + && buf_ch == '\n') + || ((bufp->syntax & RE_DOT_NOT_NULL) + && buf_ch == '\000')) + goto fail; + + SET_REGS_MATCHED (); + DEBUG_PRINT2 (" Matched `%d'.\n", *d); + d += buf_charlen; + } break; @@ -5926,12 +5947,27 @@ RE_TRANSLATE_TYPE translate; { register unsigned char *p1 = s1, *p2 = s2; - while (len) + unsigned char *p1_end = s1 + len; + unsigned char *p2_end = s2 + len; + + while (p1 != p1_end && p2 != p2_end) { - if (RE_TRANSLATE (translate, *p1++) != RE_TRANSLATE (translate, *p2++)) + int p1_charlen, p2_charlen; + int p1_ch, p2_ch; + + p1_ch = STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen); + p2_ch = STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen); + + if (RE_TRANSLATE (translate, p1_ch) + != RE_TRANSLATE (translate, p2_ch)) return 1; - len--; + + p1 += p1_charlen, p2 += p2_charlen; } + + if (p1 != p1_end || p2 != p2_end) + return 1; + return 0; }