# HG changeset patch # User Stefan Monnier # Date 1030141196 0 # Node ID c299d527a088436d417b2227352f8b7aa86ddc8d # Parent 724e287f33a3a6e7b3819cd4384d10704c3f2060 (PATFETCH): Remove the translating fetch. (PATFETCH_RAW): Rename to PATFETCH. (set_image_of_range): New fun. (SET_RANGE_TABLE_WORK_AREA): Use it. (regex_compile): Don't translate the pattern chars so eagerly. Only do it when inserting an `exactn' bytecode or when handling a char-range. (mutually_exclusive_p): Avoid empty statement. diff -r 724e287f33a3 -r c299d527a088 src/regex.c --- a/src/regex.c Fri Aug 23 19:52:29 2002 +0000 +++ b/src/regex.c Fri Aug 23 22:19:56 2002 +0000 @@ -19,7 +19,9 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ -/* TODO: +/* BUGS: + - (x?)*y\1z should match both xxxxyxz and xxxyz. + TODO: - structure the opcode space into opcode+flag. - merge with glibc's regex.[ch]. - replace (succeed_n + jump_n + set_number_at) with something that doesn't @@ -1682,17 +1684,9 @@ static int analyse_first _RE_ARGS ((re_char *p, re_char *pend, char *fastmap, const int multibyte)); -/* Fetch the next character in the uncompiled pattern---translating it - if necessary. */ -#define PATFETCH(c) \ - do { \ - PATFETCH_RAW (c); \ - c = TRANSLATE (c); \ - } while (0) - /* Fetch the next character in the uncompiled pattern, with no translation. */ -#define PATFETCH_RAW(c) \ +#define PATFETCH(c) \ do { \ int len; \ if (p == pend) return REG_EEND; \ @@ -1914,12 +1908,13 @@ #define BIT_UPPER 0x10 #define BIT_MULTIBYTE 0x20 -/* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */ -#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \ - do { \ - EXTEND_RANGE_TABLE_WORK_AREA ((work_area), 2); \ - (work_area).table[(work_area).used++] = (range_start); \ - (work_area).table[(work_area).used++] = (range_end); \ +/* Set a range START..END to WORK_AREA. + The range is passed through TRANSLATE, so START and END + should be untranslated. */ +#define SET_RANGE_TABLE_WORK_AREA(work_area, start, end) \ + do { \ + EXTEND_RANGE_TABLE_WORK_AREA ((work_area), 2); \ + set_image_of_range (&work_area, start, end, translate); \ } while (0) /* Free allocated memory for WORK_AREA. */ @@ -2077,6 +2072,31 @@ } #endif + + +/* We need to find the image of the range start..end when passed through + TRANSLATE. This is not necessarily TRANSLATE(start)..TRANSLATE(end) + and is not even necessarily contiguous. + We approximate it with the smallest contiguous range that contains + all the chars we need. */ +static void +set_image_of_range (work_area, start, end, translate) + RE_TRANSLATE_TYPE translate; + struct range_table_work_area *work_area; + re_wchar_t start, end; +{ + re_wchar_t cmin = TRANSLATE (start), cmax = TRANSLATE (end); + if (RE_TRANSLATE_P (translate)) + for (; start <= end; start++) + { + re_wchar_t c = TRANSLATE (start); + cmin = MIN (cmin, c); + cmax = MAX (cmax, c); + } + work_area->table[work_area->used++] = (cmin); + work_area->table[work_area->used++] = (cmax); +} + /* Explicit quit checking is only used on NTemacs. */ #if defined WINDOWSNT && defined emacs && defined QUIT extern int immediate_quit; @@ -2525,6 +2545,10 @@ if (p == pend) FREE_STACK_RETURN (REG_EBRACK); + /* Don't translate yet. The range TRANSLATE(X..Y) cannot + always be determined from TRANSLATE(X) and TRANSLATE(Y) + So the translation is done later in a loop. Example: + (let ((case-fold-search t)) (string-match "[A-_]" "A")) */ PATFETCH (c); /* \ might escape characters inside [...] and [^...]. */ @@ -2584,7 +2608,7 @@ them). */ if (c == ':' && *p == ']') { - int ch; + re_wchar_t ch; re_wctype_t cc; cc = re_wctype (str); @@ -2653,8 +2677,8 @@ starting at the smallest character in the charset of C1 and ending at C1. */ int charset = CHAR_CHARSET (c1); - int c2 = MAKE_CHAR (charset, 0, 0); - + re_wchar_t c2 = MAKE_CHAR (charset, 0, 0); + SET_RANGE_TABLE_WORK_AREA (range_table_work, c2, c1); c1 = 0377; @@ -2672,7 +2696,7 @@ /* ... into bitmap. */ { re_wchar_t this_char; - int range_start = c, range_end = c1; + re_wchar_t range_start = c, range_end = c1; /* If the start is after the end, the range is empty. */ if (range_start > range_end) @@ -2769,7 +2793,7 @@ /* Do not translate the character after the \, so that we can distinguish, e.g., \B from \b, even if we normally would translate, e.g., B to b. */ - PATFETCH_RAW (c); + PATFETCH (c); switch (c) { @@ -3129,13 +3153,13 @@ case 'c': laststart = b; - PATFETCH_RAW (c); + PATFETCH (c); BUF_PUSH_2 (categoryspec, c); break; case 'C': laststart = b; - PATFETCH_RAW (c); + PATFETCH (c); BUF_PUSH_2 (notcategoryspec, c); break; #endif /* emacs */ @@ -3225,7 +3249,6 @@ /* You might think it would be useful for \ to mean not to translate; but if we don't translate it it will never match anything. */ - c = TRANSLATE (c); goto normal_char; } break; @@ -3234,7 +3257,7 @@ default: /* Expects the character in `c'. */ normal_char: - /* If no exactn currently being built. */ + /* If no exactn currently being built. */ if (!pending_exact /* If last exactn not at current position. */ @@ -3265,6 +3288,7 @@ { int len; + c = TRANSLATE (c); if (multibyte) len = CHAR_STRING (c, b); else @@ -4427,7 +4451,7 @@ they don't overlap. The union of the two sets of excluded chars should cover all possible chars, which, as a matter of fact, is virtually impossible in multibyte buffers. */ - ; + break; } break;