Mercurial > emacs
changeset 89130:e18339404909
(search_buffer): Fix case-fold-search of multibyte
characters.
(boyer_moore): Rename the last argument to char_high_bits.
author | Kenichi Handa <handa@m17n.org> |
---|---|
date | Tue, 01 Oct 2002 01:30:13 +0000 |
parents | ed1d9d1cfc71 |
children | 13bec264e345 |
files | src/search.c |
diffstat | 1 files changed, 23 insertions(+), 32 deletions(-) [+] |
line wrap: on
line diff
--- a/src/search.c Fri Sep 27 04:54:47 2002 +0000 +++ b/src/search.c Tue Oct 01 01:30:13 2002 +0000 @@ -1106,7 +1106,12 @@ unsigned char *patbuf; int multibyte = !NILP (current_buffer->enable_multibyte_characters); unsigned char *base_pat = XSTRING (string)->data; - int charset_base = -1; + /* High bits of char, calculated by (CHAR & ~0x3F). Characters + of the same high bits have the same sequence of bytes except + the last. To do the BM search, all characters in STRING must + have the same high bits (including their case + translations). */ + int char_high_bits = -1; int boyer_moore_ok = 1; /* MULTIBYTE says whether the text to be searched is multibyte. @@ -1147,16 +1152,15 @@ /* Copy and optionally translate the pattern. */ len = raw_pattern_size; len_byte = raw_pattern_size_byte; - patbuf = (unsigned char *) alloca (len_byte); + patbuf = (unsigned char *) alloca (len * MAX_MULTIBYTE_LENGTH); pat = patbuf; base_pat = raw_pattern; if (multibyte) { while (--len >= 0) { - unsigned char str[MAX_MULTIBYTE_LENGTH]; int c, translated, inverse; - int in_charlen, charlen; + int in_charlen; /* If we got here and the RE flag is set, it's because we're dealing with a regexp known to be trivial, so the backslash @@ -1172,23 +1176,6 @@ /* Translate the character, if requested. */ TRANSLATE (translated, trt, c); - /* If translation changed the byte-length, go back - to the original character. */ - charlen = CHAR_STRING (translated, str); - if (in_charlen != charlen) - { - translated = c; - charlen = CHAR_STRING (c, str); - } - - /* If we are searching for something strange, - an invalid multibyte code, don't use boyer-moore. */ - if (! ASCII_BYTE_P (translated) - && (charlen == 1 /* 8bit code */ - || charlen != in_charlen /* invalid multibyte code */ - )) - boyer_moore_ok = 0; - TRANSLATE (inverse, inverse_trt, c); /* Did this char actually get translated? @@ -1197,18 +1184,22 @@ { /* Keep track of which character set row contains the characters that need translation. 
*/ - int charset_base_code = c & ~0x3F; - if (charset_base == -1) - charset_base = charset_base_code; - else if (charset_base != charset_base_code) + int this_high_bit = c & ~0x3F; + int trt_high_bit = ((inverse != c ? inverse : translated) + & ~0x3F); + + if (this_high_bit != trt_high_bit) + boyer_moore_ok = 0; + else if (char_high_bits == -1) + char_high_bits = this_high_bit; + else if (char_high_bits != this_high_bit) /* If two different rows appear, needing translation, then we cannot use boyer_moore search. */ boyer_moore_ok = 0; } /* Store this character into the translated pattern. */ - bcopy (str, pat, charlen); - pat += charlen; + CHAR_STRING_ADVANCE (translated, pat); base_pat += in_charlen; len_byte -= in_charlen; } @@ -1216,7 +1207,7 @@ else { /* Unibyte buffer. */ - charset_base = 0; + char_high_bits = 0; while (--len >= 0) { int c, translated; @@ -1242,7 +1233,7 @@ if (boyer_moore_ok) return boyer_moore (n, pat, len, len_byte, trt, inverse_trt, pos, pos_byte, lim, lim_byte, - charset_base); + char_high_bits); else return simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte); @@ -1475,7 +1466,7 @@ static int boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, - pos, pos_byte, lim, lim_byte, charset_base) + pos, pos_byte, lim, lim_byte, char_high_bits) int n; unsigned char *base_pat; int len, len_byte; @@ -1483,7 +1474,7 @@ Lisp_Object inverse_trt; int pos, pos_byte; int lim, lim_byte; - int charset_base; + int char_high_bits; { int direction = ((n > 0) ? 1 : -1); register int dirlen; @@ -1584,7 +1575,7 @@ while (! CHAR_HEAD_P (*charstart)) charstart--; untranslated = STRING_CHAR (charstart, ptr - charstart + 1); - if (charset_base == (untranslated & ~0x3F)) + if (char_high_bits == (untranslated & ~0x3F)) { TRANSLATE (ch, trt, untranslated); if (! CHAR_HEAD_P (*ptr))