comparison src/search.c @ 61189:91ba6c641a60

(looking_at_1): Use current_buffer->case_canon_table, not DOWNCASE_TABLE. (string_match_1): Likewise. (fast_c_string_match_ignore_case): Use Vascii_canon_table, not Vascii_downcase_table. (fast_string_match_ignore_case): Likewise. (search_buffer): Fix checking of boyer-moore usability. (boyer_moore): Calculate translate_prev_byte1/2/3 in advance. No need of tranlating characters in PAT. Fix calculation of simple_translate.
author Kenichi Handa <handa@m17n.org>
date Fri, 01 Apr 2005 01:05:46 +0000
parents 47729f2cb184
children 0015a00ccb5a 7a3341d65a12
comparison
equal deleted inserted replaced
61188:a96e8bdd2b37 61189:91ba6c641a60
291 save_search_regs (); 291 save_search_regs ();
292 292
293 CHECK_STRING (string); 293 CHECK_STRING (string);
294 bufp = compile_pattern (string, &search_regs, 294 bufp = compile_pattern (string, &search_regs,
295 (!NILP (current_buffer->case_fold_search) 295 (!NILP (current_buffer->case_fold_search)
296 ? DOWNCASE_TABLE : Qnil), 296 ? current_buffer->case_canon_table : Qnil),
297 posix, 297 posix,
298 !NILP (current_buffer->enable_multibyte_characters)); 298 !NILP (current_buffer->enable_multibyte_characters));
299 299
300 immediate_quit = 1; 300 immediate_quit = 1;
301 QUIT; /* Do a pending quit right away, to avoid paradoxical behavior */ 301 QUIT; /* Do a pending quit right away, to avoid paradoxical behavior */
397 pos_byte = string_char_to_byte (string, pos); 397 pos_byte = string_char_to_byte (string, pos);
398 } 398 }
399 399
400 bufp = compile_pattern (regexp, &search_regs, 400 bufp = compile_pattern (regexp, &search_regs,
401 (!NILP (current_buffer->case_fold_search) 401 (!NILP (current_buffer->case_fold_search)
402 ? DOWNCASE_TABLE : Qnil), 402 ? current_buffer->case_canon_table : Qnil),
403 posix, 403 posix,
404 STRING_MULTIBYTE (string)); 404 STRING_MULTIBYTE (string));
405 immediate_quit = 1; 405 immediate_quit = 1;
406 re_match_object = string; 406 re_match_object = string;
407 407
497 int len = strlen (string); 497 int len = strlen (string);
498 498
499 regexp = string_make_unibyte (regexp); 499 regexp = string_make_unibyte (regexp);
500 re_match_object = Qt; 500 re_match_object = Qt;
501 bufp = compile_pattern (regexp, 0, 501 bufp = compile_pattern (regexp, 0,
502 Vascii_downcase_table, 0, 502 Vascii_canon_table, 0,
503 0); 503 0);
504 immediate_quit = 1; 504 immediate_quit = 1;
505 val = re_search (bufp, string, len, 0, len, 0); 505 val = re_search (bufp, string, len, 0, len, 0);
506 immediate_quit = 0; 506 immediate_quit = 0;
507 return val; 507 return val;
514 Lisp_Object regexp, string; 514 Lisp_Object regexp, string;
515 { 515 {
516 int val; 516 int val;
517 struct re_pattern_buffer *bufp; 517 struct re_pattern_buffer *bufp;
518 518
519 bufp = compile_pattern (regexp, 0, Vascii_downcase_table, 519 bufp = compile_pattern (regexp, 0, Vascii_canon_table,
520 0, STRING_MULTIBYTE (string)); 520 0, STRING_MULTIBYTE (string));
521 immediate_quit = 1; 521 immediate_quit = 1;
522 re_match_object = string; 522 re_match_object = string;
523 523
524 val = re_search (bufp, (char *) SDATA (string), 524 val = re_search (bufp, (char *) SDATA (string),
1173 int raw_pattern_size; 1173 int raw_pattern_size;
1174 int raw_pattern_size_byte; 1174 int raw_pattern_size_byte;
1175 unsigned char *patbuf; 1175 unsigned char *patbuf;
1176 int multibyte = !NILP (current_buffer->enable_multibyte_characters); 1176 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
1177 unsigned char *base_pat = SDATA (string); 1177 unsigned char *base_pat = SDATA (string);
1178 int charset_base = -1; 1178 /* Set to nozero if we find a non-ASCII char that need
1179 translation. */
1180 int charset_base = 0;
1179 int boyer_moore_ok = 1; 1181 int boyer_moore_ok = 1;
1180 1182
1181 /* MULTIBYTE says whether the text to be searched is multibyte. 1183 /* MULTIBYTE says whether the text to be searched is multibyte.
1182 We must convert PATTERN to match that, or we will not really 1184 We must convert PATTERN to match that, or we will not really
1183 find things right. */ 1185 find things right. */
1219 patbuf = (unsigned char *) alloca (len_byte); 1221 patbuf = (unsigned char *) alloca (len_byte);
1220 pat = patbuf; 1222 pat = patbuf;
1221 base_pat = raw_pattern; 1223 base_pat = raw_pattern;
1222 if (multibyte) 1224 if (multibyte)
1223 { 1225 {
1226 /* Fill patbuf by translated characters in STRING while
1227 checking if we can use boyer-moore search. If TRT is
1228 non-nil, we can use boyer-moore search only if TRT can be
1229 represented by the byte array of 256 elements. For that,
1230 all non-ASCII case-equivalents of all case-senstive
1231 characters in STRING must belong to the same charset and
1232 row. */
1233
1224 while (--len >= 0) 1234 while (--len >= 0)
1225 { 1235 {
1226 unsigned char str[MAX_MULTIBYTE_LENGTH]; 1236 unsigned char str_base[MAX_MULTIBYTE_LENGTH], *str;
1227 int c, translated, inverse; 1237 int c, translated, inverse;
1228 int in_charlen, charlen; 1238 int in_charlen, charlen;
1229 1239
1230 /* If we got here and the RE flag is set, it's because we're 1240 /* If we got here and the RE flag is set, it's because we're
1231 dealing with a regexp known to be trivial, so the backslash 1241 dealing with a regexp known to be trivial, so the backslash
1232 just quotes the next character. */ 1242 just quotes the next character. */
1233 if (RE && *base_pat == '\\') 1243 if (RE && *base_pat == '\\')
1234 { 1244 {
1235 len--; 1245 len--;
1246 raw_pattern_size--;
1236 len_byte--; 1247 len_byte--;
1237 base_pat++; 1248 base_pat++;
1238 } 1249 }
1239 1250
1240 c = STRING_CHAR_AND_LENGTH (base_pat, len_byte, in_charlen); 1251 c = STRING_CHAR_AND_LENGTH (base_pat, len_byte, in_charlen);
1241 1252
1242 /* Translate the character, if requested. */ 1253 if (NILP (trt))
1243 TRANSLATE (translated, trt, c);
1244 /* If translation changed the byte-length, go back
1245 to the original character. */
1246 charlen = CHAR_STRING (translated, str);
1247 if (in_charlen != charlen)
1248 { 1254 {
1249 translated = c; 1255 str = base_pat;
1250 charlen = CHAR_STRING (c, str); 1256 charlen = in_charlen;
1251 } 1257 }
1252 1258 else
1253 /* If we are searching for something strange,
1254 an invalid multibyte code, don't use boyer-moore. */
1255 if (! ASCII_BYTE_P (translated)
1256 && (charlen == 1 /* 8bit code */
1257 || charlen != in_charlen /* invalid multibyte code */
1258 ))
1259 boyer_moore_ok = 0;
1260
1261 TRANSLATE (inverse, inverse_trt, c);
1262
1263 /* Did this char actually get translated?
1264 Would any other char get translated into it? */
1265 if (translated != c || inverse != c)
1266 { 1259 {
1267 /* Keep track of which character set row 1260 /* Translate the character. */
1268 contains the characters that need translation. */ 1261 TRANSLATE (translated, trt, c);
1269 int charset_base_code = c & ~CHAR_FIELD3_MASK; 1262 charlen = CHAR_STRING (translated, str_base);
1270 int inverse_charset_base = inverse & ~CHAR_FIELD3_MASK; 1263 str = str_base;
1271 1264
1272 if (charset_base_code != inverse_charset_base) 1265 /* Check if C has any other case-equivalents. */
1273 boyer_moore_ok = 0; 1266 TRANSLATE (inverse, inverse_trt, c);
1274 else if (charset_base == -1) 1267 /* If so, check if we can use boyer-moore. */
1275 charset_base = charset_base_code; 1268 if (c != inverse && boyer_moore_ok)
1276 else if (charset_base != charset_base_code) 1269 {
1277 /* If two different rows appear, needing translation, 1270 /* Check if all equivalents belong to the same
1278 then we cannot use boyer_moore search. */ 1271 charset & row. Note that the check of C
1279 boyer_moore_ok = 0; 1272 itself is done by the last iteration. Note
1273 also that we don't have to check ASCII
1274 characters because boyer-moore search can
1275 always handle their translation. */
1276 while (1)
1277 {
1278 if (! ASCII_BYTE_P (inverse))
1279 {
1280 if (SINGLE_BYTE_CHAR_P (inverse))
1281 {
1282 /* Boyer-moore search can't handle a
1283 translation of an eight-bit
1284 character. */
1285 boyer_moore_ok = 0;
1286 break;
1287 }
1288 else if (charset_base == 0)
1289 charset_base = inverse & ~CHAR_FIELD3_MASK;
1290 else if ((inverse & ~CHAR_FIELD3_MASK)
1291 != charset_base)
1292 {
1293 boyer_moore_ok = 0;
1294 break;
1295 }
1296 }
1297 if (c == inverse)
1298 break;
1299 TRANSLATE (inverse, inverse_trt, inverse);
1300 }
1301 }
1280 } 1302 }
1281 1303
1282 /* Store this character into the translated pattern. */ 1304 /* Store this character into the translated pattern. */
1283 bcopy (str, pat, charlen); 1305 bcopy (str, pat, charlen);
1284 pat += charlen; 1306 pat += charlen;
1298 dealing with a regexp known to be trivial, so the backslash 1320 dealing with a regexp known to be trivial, so the backslash
1299 just quotes the next character. */ 1321 just quotes the next character. */
1300 if (RE && *base_pat == '\\') 1322 if (RE && *base_pat == '\\')
1301 { 1323 {
1302 len--; 1324 len--;
1325 raw_pattern_size--;
1303 base_pat++; 1326 base_pat++;
1304 } 1327 }
1305 c = *base_pat++; 1328 c = *base_pat++;
1306 TRANSLATE (translated, trt, c); 1329 TRANSLATE (translated, trt, c);
1307 *pat++ = translated; 1330 *pat++ = translated;
1531 return -n; 1554 return -n;
1532 else 1555 else
1533 return n; 1556 return n;
1534 } 1557 }
1535 1558
1536 /* Do Boyer-Moore search N times for the string PAT, 1559 /* Do Boyer-Moore search N times for the string BASE_PAT,
1537 whose length is LEN/LEN_BYTE, 1560 whose length is LEN/LEN_BYTE,
1538 from buffer position POS/POS_BYTE until LIM/LIM_BYTE. 1561 from buffer position POS/POS_BYTE until LIM/LIM_BYTE.
1539 DIRECTION says which direction we search in. 1562 DIRECTION says which direction we search in.
1540 TRT and INVERSE_TRT are translation tables. 1563 TRT and INVERSE_TRT are translation tables.
1541 1564 Characters in PAT are already translated by TRT.
1542 This kind of search works if all the characters in PAT that have 1565
1543 nontrivial translation are the same aside from the last byte. This 1566 This kind of search works if all the characters in BASE_PAT that
1544 makes it possible to translate just the last byte of a character, 1567 have nontrivial translation are the same aside from the last byte.
1545 and do so after just a simple test of the context. 1568 This makes it possible to translate just the last byte of a
1569 character, and do so after just a simple test of the context.
1570 CHARSET_BASE is nonzero iff there is such a non-ASCII character.
1546 1571
1547 If that criterion is not satisfied, do not call this function. */ 1572 If that criterion is not satisfied, do not call this function. */
1548 1573
1549 static int 1574 static int
1550 boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, 1575 boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
1567 register int i, j; 1592 register int i, j;
1568 unsigned char *pat, *pat_end; 1593 unsigned char *pat, *pat_end;
1569 int multibyte = ! NILP (current_buffer->enable_multibyte_characters); 1594 int multibyte = ! NILP (current_buffer->enable_multibyte_characters);
1570 1595
1571 unsigned char simple_translate[0400]; 1596 unsigned char simple_translate[0400];
1572 int translate_prev_byte = 0; 1597 /* These are set to the preceding bytes of a byte to be translated
1573 int translate_anteprev_byte = 0; 1598 if charset_base is nonzero. As the maximum byte length of a
1599 multibyte character is 4, we have to check at most three previous
1600 bytes. */
1601 int translate_prev_byte1 = 0;
1602 int translate_prev_byte2 = 0;
1603 int translate_prev_byte3 = 0;
1574 1604
1575 #ifdef C_ALLOCA 1605 #ifdef C_ALLOCA
1576 int BM_tab_space[0400]; 1606 int BM_tab_space[0400];
1577 BM_tab = &BM_tab_space[0]; 1607 BM_tab = &BM_tab_space[0];
1578 #else 1608 #else
1634 occur in the pattern. Others don't matter anyway! */ 1664 occur in the pattern. Others don't matter anyway! */
1635 bzero (simple_translate, sizeof simple_translate); 1665 bzero (simple_translate, sizeof simple_translate);
1636 for (i = 0; i < 0400; i++) 1666 for (i = 0; i < 0400; i++)
1637 simple_translate[i] = i; 1667 simple_translate[i] = i;
1638 1668
1669 if (charset_base)
1670 {
1671 /* Setup translate_prev_byte1/2/3 from CHARSET_BASE. Only a
1672 byte following them are the target of translation. */
1673 int sample_char = charset_base | 0x20;
1674 unsigned char str[MAX_MULTIBYTE_LENGTH];
1675 int len = CHAR_STRING (sample_char, str);
1676
1677 translate_prev_byte1 = str[len - 2];
1678 if (len > 2)
1679 {
1680 translate_prev_byte2 = str[len - 3];
1681 if (len > 3)
1682 translate_prev_byte3 = str[len - 4];
1683 }
1684 }
1685
1639 i = 0; 1686 i = 0;
1640 while (i != infinity) 1687 while (i != infinity)
1641 { 1688 {
1642 unsigned char *ptr = base_pat + i; 1689 unsigned char *ptr = base_pat + i;
1643 i += direction; 1690 i += direction;
1644 if (i == dirlen) 1691 if (i == dirlen)
1645 i = infinity; 1692 i = infinity;
1646 if (! NILP (trt)) 1693 if (! NILP (trt))
1647 { 1694 {
1648 int ch; 1695 /* If the byte currently looking at is a head of a character
1649 int untranslated; 1696 to check case-equivalents, set CH to that character. An
1650 int this_translated = 1; 1697 ASCII character and a non-ASCII character matching with
1651 1698 CHARSET_BASE are to be checked. */
1652 if (multibyte 1699 int ch = -1;
1653 /* Is *PTR the last byte of a character? */ 1700
1654 && (pat_end - ptr == 1 || CHAR_HEAD_P (ptr[1]))) 1701 if (ASCII_BYTE_P (*ptr) || ! multibyte)
1702 ch = *ptr;
1703 else if (charset_base && CHAR_HEAD_P (*ptr))
1655 { 1704 {
1656 unsigned char *charstart = ptr; 1705 ch = STRING_CHAR (ptr, pat_end - ptr);
1657 while (! CHAR_HEAD_P (*charstart)) 1706 if (charset_base != (ch & ~CHAR_FIELD3_MASK))
1658 charstart--; 1707 ch = -1;
1659 untranslated = STRING_CHAR (charstart, ptr - charstart + 1);
1660 if (charset_base == (untranslated & ~CHAR_FIELD3_MASK))
1661 {
1662 TRANSLATE (ch, trt, untranslated);
1663 if (! CHAR_HEAD_P (*ptr))
1664 {
1665 translate_prev_byte = ptr[-1];
1666 if (! CHAR_HEAD_P (translate_prev_byte))
1667 translate_anteprev_byte = ptr[-2];
1668 }
1669 }
1670 else
1671 {
1672 this_translated = 0;
1673 ch = *ptr;
1674 }
1675 } 1708 }
1676 else if (!multibyte) 1709
1677 TRANSLATE (ch, trt, *ptr); 1710 j = *ptr;
1678 else
1679 {
1680 ch = *ptr;
1681 this_translated = 0;
1682 }
1683
1684 if (ch > 0400)
1685 j = ((unsigned char) ch) | 0200;
1686 else
1687 j = (unsigned char) ch;
1688
1689 if (i == infinity) 1711 if (i == infinity)
1690 stride_for_teases = BM_tab[j]; 1712 stride_for_teases = BM_tab[j];
1691 1713
1692 BM_tab[j] = dirlen - i; 1714 BM_tab[j] = dirlen - i;
1693 /* A translation table is accompanied by its inverse -- see */ 1715 /* A translation table is accompanied by its inverse -- see */
1694 /* comment following downcase_table for details */ 1716 /* comment following downcase_table for details */
1695 if (this_translated) 1717 if (ch >= 0)
1696 { 1718 {
1697 int starting_ch = ch; 1719 int starting_ch = ch;
1698 int starting_j = j; 1720 int starting_j;
1721
1722 if (ch > 0400)
1723 starting_j = ((unsigned char) ch) | 0200;
1724 else
1725 starting_j = (unsigned char) ch;
1699 while (1) 1726 while (1)
1700 { 1727 {
1701 TRANSLATE (ch, inverse_trt, ch); 1728 TRANSLATE (ch, inverse_trt, ch);
1702 if (ch > 0400) 1729 if (ch > 0400)
1703 j = ((unsigned char) ch) | 0200; 1730 j = ((unsigned char) ch) | 0200;
1819 /* Translate only the last byte of a character. */ 1846 /* Translate only the last byte of a character. */
1820 if (! multibyte 1847 if (! multibyte
1821 || ((cursor == tail_end_ptr 1848 || ((cursor == tail_end_ptr
1822 || CHAR_HEAD_P (cursor[1])) 1849 || CHAR_HEAD_P (cursor[1]))
1823 && (CHAR_HEAD_P (cursor[0]) 1850 && (CHAR_HEAD_P (cursor[0])
1824 || (translate_prev_byte == cursor[-1] 1851 /* Check if this is the last byte of
1825 && (CHAR_HEAD_P (translate_prev_byte) 1852 a translable character. */
1826 || translate_anteprev_byte == cursor[-2]))))) 1853 || (translate_prev_byte1 == cursor[-1]
1854 && (CHAR_HEAD_P (translate_prev_byte1)
1855 || (translate_prev_byte2 == cursor[-2]
1856 && (CHAR_HEAD_P (translate_prev_byte2)
1857 || (translate_prev_byte3 == cursor[-3]))))))))
1827 ch = simple_translate[*cursor]; 1858 ch = simple_translate[*cursor];
1828 else 1859 else
1829 ch = *cursor; 1860 ch = *cursor;
1830 if (pat[i] != ch) 1861 if (pat[i] != ch)
1831 break; 1862 break;
1899 /* Translate only the last byte of a character. */ 1930 /* Translate only the last byte of a character. */
1900 if (! multibyte 1931 if (! multibyte
1901 || ((ptr == tail_end_ptr 1932 || ((ptr == tail_end_ptr
1902 || CHAR_HEAD_P (ptr[1])) 1933 || CHAR_HEAD_P (ptr[1]))
1903 && (CHAR_HEAD_P (ptr[0]) 1934 && (CHAR_HEAD_P (ptr[0])
1904 || (translate_prev_byte == ptr[-1] 1935 /* Check if this is the last byte of a
1905 && (CHAR_HEAD_P (translate_prev_byte) 1936 translable character. */
1906 || translate_anteprev_byte == ptr[-2]))))) 1937 || (translate_prev_byte1 == ptr[-1]
1938 && (CHAR_HEAD_P (translate_prev_byte1)
1939 || (translate_prev_byte2 == ptr[-2]
1940 && (CHAR_HEAD_P (translate_prev_byte2)
1941 || translate_prev_byte3 == ptr[-3])))))))
1907 ch = simple_translate[*ptr]; 1942 ch = simple_translate[*ptr];
1908 else 1943 else
1909 ch = *ptr; 1944 ch = *ptr;
1910 if (pat[i] != ch) 1945 if (pat[i] != ch)
1911 break; 1946 break;