Mercurial > emacs
comparison src/search.c @ 61189:91ba6c641a60
(looking_at_1): Use current_buffer->case_canon_table,
not DOWNCASE_TABLE.
(string_match_1): Likewise.
(fast_c_string_match_ignore_case): Use Vascii_canon_table, not
Vascii_downcase_table.
(fast_string_match_ignore_case): Likewise.
(search_buffer): Fix checking of boyer-moore usability.
(boyer_moore): Calculate translate_prev_byte1/2/3 in advance. No
need of tranlating characters in PAT. Fix calculation of
simple_translate.
author | Kenichi Handa <handa@m17n.org> |
---|---|
date | Fri, 01 Apr 2005 01:05:46 +0000 |
parents | 47729f2cb184 |
children | 0015a00ccb5a 7a3341d65a12 |
comparison
equal
deleted
inserted
replaced
61188:a96e8bdd2b37 | 61189:91ba6c641a60 |
---|---|
291 save_search_regs (); | 291 save_search_regs (); |
292 | 292 |
293 CHECK_STRING (string); | 293 CHECK_STRING (string); |
294 bufp = compile_pattern (string, &search_regs, | 294 bufp = compile_pattern (string, &search_regs, |
295 (!NILP (current_buffer->case_fold_search) | 295 (!NILP (current_buffer->case_fold_search) |
296 ? DOWNCASE_TABLE : Qnil), | 296 ? current_buffer->case_canon_table : Qnil), |
297 posix, | 297 posix, |
298 !NILP (current_buffer->enable_multibyte_characters)); | 298 !NILP (current_buffer->enable_multibyte_characters)); |
299 | 299 |
300 immediate_quit = 1; | 300 immediate_quit = 1; |
301 QUIT; /* Do a pending quit right away, to avoid paradoxical behavior */ | 301 QUIT; /* Do a pending quit right away, to avoid paradoxical behavior */ |
397 pos_byte = string_char_to_byte (string, pos); | 397 pos_byte = string_char_to_byte (string, pos); |
398 } | 398 } |
399 | 399 |
400 bufp = compile_pattern (regexp, &search_regs, | 400 bufp = compile_pattern (regexp, &search_regs, |
401 (!NILP (current_buffer->case_fold_search) | 401 (!NILP (current_buffer->case_fold_search) |
402 ? DOWNCASE_TABLE : Qnil), | 402 ? current_buffer->case_canon_table : Qnil), |
403 posix, | 403 posix, |
404 STRING_MULTIBYTE (string)); | 404 STRING_MULTIBYTE (string)); |
405 immediate_quit = 1; | 405 immediate_quit = 1; |
406 re_match_object = string; | 406 re_match_object = string; |
407 | 407 |
497 int len = strlen (string); | 497 int len = strlen (string); |
498 | 498 |
499 regexp = string_make_unibyte (regexp); | 499 regexp = string_make_unibyte (regexp); |
500 re_match_object = Qt; | 500 re_match_object = Qt; |
501 bufp = compile_pattern (regexp, 0, | 501 bufp = compile_pattern (regexp, 0, |
502 Vascii_downcase_table, 0, | 502 Vascii_canon_table, 0, |
503 0); | 503 0); |
504 immediate_quit = 1; | 504 immediate_quit = 1; |
505 val = re_search (bufp, string, len, 0, len, 0); | 505 val = re_search (bufp, string, len, 0, len, 0); |
506 immediate_quit = 0; | 506 immediate_quit = 0; |
507 return val; | 507 return val; |
514 Lisp_Object regexp, string; | 514 Lisp_Object regexp, string; |
515 { | 515 { |
516 int val; | 516 int val; |
517 struct re_pattern_buffer *bufp; | 517 struct re_pattern_buffer *bufp; |
518 | 518 |
519 bufp = compile_pattern (regexp, 0, Vascii_downcase_table, | 519 bufp = compile_pattern (regexp, 0, Vascii_canon_table, |
520 0, STRING_MULTIBYTE (string)); | 520 0, STRING_MULTIBYTE (string)); |
521 immediate_quit = 1; | 521 immediate_quit = 1; |
522 re_match_object = string; | 522 re_match_object = string; |
523 | 523 |
524 val = re_search (bufp, (char *) SDATA (string), | 524 val = re_search (bufp, (char *) SDATA (string), |
1173 int raw_pattern_size; | 1173 int raw_pattern_size; |
1174 int raw_pattern_size_byte; | 1174 int raw_pattern_size_byte; |
1175 unsigned char *patbuf; | 1175 unsigned char *patbuf; |
1176 int multibyte = !NILP (current_buffer->enable_multibyte_characters); | 1176 int multibyte = !NILP (current_buffer->enable_multibyte_characters); |
1177 unsigned char *base_pat = SDATA (string); | 1177 unsigned char *base_pat = SDATA (string); |
1178 int charset_base = -1; | 1178 /* Set to nozero if we find a non-ASCII char that need |
1179 translation. */ | |
1180 int charset_base = 0; | |
1179 int boyer_moore_ok = 1; | 1181 int boyer_moore_ok = 1; |
1180 | 1182 |
1181 /* MULTIBYTE says whether the text to be searched is multibyte. | 1183 /* MULTIBYTE says whether the text to be searched is multibyte. |
1182 We must convert PATTERN to match that, or we will not really | 1184 We must convert PATTERN to match that, or we will not really |
1183 find things right. */ | 1185 find things right. */ |
1219 patbuf = (unsigned char *) alloca (len_byte); | 1221 patbuf = (unsigned char *) alloca (len_byte); |
1220 pat = patbuf; | 1222 pat = patbuf; |
1221 base_pat = raw_pattern; | 1223 base_pat = raw_pattern; |
1222 if (multibyte) | 1224 if (multibyte) |
1223 { | 1225 { |
1226 /* Fill patbuf by translated characters in STRING while | |
1227 checking if we can use boyer-moore search. If TRT is | |
1228 non-nil, we can use boyer-moore search only if TRT can be | |
1229 represented by the byte array of 256 elements. For that, | |
1230 all non-ASCII case-equivalents of all case-senstive | |
1231 characters in STRING must belong to the same charset and | |
1232 row. */ | |
1233 | |
1224 while (--len >= 0) | 1234 while (--len >= 0) |
1225 { | 1235 { |
1226 unsigned char str[MAX_MULTIBYTE_LENGTH]; | 1236 unsigned char str_base[MAX_MULTIBYTE_LENGTH], *str; |
1227 int c, translated, inverse; | 1237 int c, translated, inverse; |
1228 int in_charlen, charlen; | 1238 int in_charlen, charlen; |
1229 | 1239 |
1230 /* If we got here and the RE flag is set, it's because we're | 1240 /* If we got here and the RE flag is set, it's because we're |
1231 dealing with a regexp known to be trivial, so the backslash | 1241 dealing with a regexp known to be trivial, so the backslash |
1232 just quotes the next character. */ | 1242 just quotes the next character. */ |
1233 if (RE && *base_pat == '\\') | 1243 if (RE && *base_pat == '\\') |
1234 { | 1244 { |
1235 len--; | 1245 len--; |
1246 raw_pattern_size--; | |
1236 len_byte--; | 1247 len_byte--; |
1237 base_pat++; | 1248 base_pat++; |
1238 } | 1249 } |
1239 | 1250 |
1240 c = STRING_CHAR_AND_LENGTH (base_pat, len_byte, in_charlen); | 1251 c = STRING_CHAR_AND_LENGTH (base_pat, len_byte, in_charlen); |
1241 | 1252 |
1242 /* Translate the character, if requested. */ | 1253 if (NILP (trt)) |
1243 TRANSLATE (translated, trt, c); | |
1244 /* If translation changed the byte-length, go back | |
1245 to the original character. */ | |
1246 charlen = CHAR_STRING (translated, str); | |
1247 if (in_charlen != charlen) | |
1248 { | 1254 { |
1249 translated = c; | 1255 str = base_pat; |
1250 charlen = CHAR_STRING (c, str); | 1256 charlen = in_charlen; |
1251 } | 1257 } |
1252 | 1258 else |
1253 /* If we are searching for something strange, | |
1254 an invalid multibyte code, don't use boyer-moore. */ | |
1255 if (! ASCII_BYTE_P (translated) | |
1256 && (charlen == 1 /* 8bit code */ | |
1257 || charlen != in_charlen /* invalid multibyte code */ | |
1258 )) | |
1259 boyer_moore_ok = 0; | |
1260 | |
1261 TRANSLATE (inverse, inverse_trt, c); | |
1262 | |
1263 /* Did this char actually get translated? | |
1264 Would any other char get translated into it? */ | |
1265 if (translated != c || inverse != c) | |
1266 { | 1259 { |
1267 /* Keep track of which character set row | 1260 /* Translate the character. */ |
1268 contains the characters that need translation. */ | 1261 TRANSLATE (translated, trt, c); |
1269 int charset_base_code = c & ~CHAR_FIELD3_MASK; | 1262 charlen = CHAR_STRING (translated, str_base); |
1270 int inverse_charset_base = inverse & ~CHAR_FIELD3_MASK; | 1263 str = str_base; |
1271 | 1264 |
1272 if (charset_base_code != inverse_charset_base) | 1265 /* Check if C has any other case-equivalents. */ |
1273 boyer_moore_ok = 0; | 1266 TRANSLATE (inverse, inverse_trt, c); |
1274 else if (charset_base == -1) | 1267 /* If so, check if we can use boyer-moore. */ |
1275 charset_base = charset_base_code; | 1268 if (c != inverse && boyer_moore_ok) |
1276 else if (charset_base != charset_base_code) | 1269 { |
1277 /* If two different rows appear, needing translation, | 1270 /* Check if all equivalents belong to the same |
1278 then we cannot use boyer_moore search. */ | 1271 charset & row. Note that the check of C |
1279 boyer_moore_ok = 0; | 1272 itself is done by the last iteration. Note |
1273 also that we don't have to check ASCII | |
1274 characters because boyer-moore search can | |
1275 always handle their translation. */ | |
1276 while (1) | |
1277 { | |
1278 if (! ASCII_BYTE_P (inverse)) | |
1279 { | |
1280 if (SINGLE_BYTE_CHAR_P (inverse)) | |
1281 { | |
1282 /* Boyer-moore search can't handle a | |
1283 translation of an eight-bit | |
1284 character. */ | |
1285 boyer_moore_ok = 0; | |
1286 break; | |
1287 } | |
1288 else if (charset_base == 0) | |
1289 charset_base = inverse & ~CHAR_FIELD3_MASK; | |
1290 else if ((inverse & ~CHAR_FIELD3_MASK) | |
1291 != charset_base) | |
1292 { | |
1293 boyer_moore_ok = 0; | |
1294 break; | |
1295 } | |
1296 } | |
1297 if (c == inverse) | |
1298 break; | |
1299 TRANSLATE (inverse, inverse_trt, inverse); | |
1300 } | |
1301 } | |
1280 } | 1302 } |
1281 | 1303 |
1282 /* Store this character into the translated pattern. */ | 1304 /* Store this character into the translated pattern. */ |
1283 bcopy (str, pat, charlen); | 1305 bcopy (str, pat, charlen); |
1284 pat += charlen; | 1306 pat += charlen; |
1298 dealing with a regexp known to be trivial, so the backslash | 1320 dealing with a regexp known to be trivial, so the backslash |
1299 just quotes the next character. */ | 1321 just quotes the next character. */ |
1300 if (RE && *base_pat == '\\') | 1322 if (RE && *base_pat == '\\') |
1301 { | 1323 { |
1302 len--; | 1324 len--; |
1325 raw_pattern_size--; | |
1303 base_pat++; | 1326 base_pat++; |
1304 } | 1327 } |
1305 c = *base_pat++; | 1328 c = *base_pat++; |
1306 TRANSLATE (translated, trt, c); | 1329 TRANSLATE (translated, trt, c); |
1307 *pat++ = translated; | 1330 *pat++ = translated; |
1531 return -n; | 1554 return -n; |
1532 else | 1555 else |
1533 return n; | 1556 return n; |
1534 } | 1557 } |
1535 | 1558 |
1536 /* Do Boyer-Moore search N times for the string PAT, | 1559 /* Do Boyer-Moore search N times for the string BASE_PAT, |
1537 whose length is LEN/LEN_BYTE, | 1560 whose length is LEN/LEN_BYTE, |
1538 from buffer position POS/POS_BYTE until LIM/LIM_BYTE. | 1561 from buffer position POS/POS_BYTE until LIM/LIM_BYTE. |
1539 DIRECTION says which direction we search in. | 1562 DIRECTION says which direction we search in. |
1540 TRT and INVERSE_TRT are translation tables. | 1563 TRT and INVERSE_TRT are translation tables. |
1541 | 1564 Characters in PAT are already translated by TRT. |
1542 This kind of search works if all the characters in PAT that have | 1565 |
1543 nontrivial translation are the same aside from the last byte. This | 1566 This kind of search works if all the characters in BASE_PAT that |
1544 makes it possible to translate just the last byte of a character, | 1567 have nontrivial translation are the same aside from the last byte. |
1545 and do so after just a simple test of the context. | 1568 This makes it possible to translate just the last byte of a |
1569 character, and do so after just a simple test of the context. | |
1570 CHARSET_BASE is nonzero iff there is such a non-ASCII character. | |
1546 | 1571 |
1547 If that criterion is not satisfied, do not call this function. */ | 1572 If that criterion is not satisfied, do not call this function. */ |
1548 | 1573 |
1549 static int | 1574 static int |
1550 boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, | 1575 boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, |
1567 register int i, j; | 1592 register int i, j; |
1568 unsigned char *pat, *pat_end; | 1593 unsigned char *pat, *pat_end; |
1569 int multibyte = ! NILP (current_buffer->enable_multibyte_characters); | 1594 int multibyte = ! NILP (current_buffer->enable_multibyte_characters); |
1570 | 1595 |
1571 unsigned char simple_translate[0400]; | 1596 unsigned char simple_translate[0400]; |
1572 int translate_prev_byte = 0; | 1597 /* These are set to the preceding bytes of a byte to be translated |
1573 int translate_anteprev_byte = 0; | 1598 if charset_base is nonzero. As the maximum byte length of a |
1599 multibyte character is 4, we have to check at most three previous | |
1600 bytes. */ | |
1601 int translate_prev_byte1 = 0; | |
1602 int translate_prev_byte2 = 0; | |
1603 int translate_prev_byte3 = 0; | |
1574 | 1604 |
1575 #ifdef C_ALLOCA | 1605 #ifdef C_ALLOCA |
1576 int BM_tab_space[0400]; | 1606 int BM_tab_space[0400]; |
1577 BM_tab = &BM_tab_space[0]; | 1607 BM_tab = &BM_tab_space[0]; |
1578 #else | 1608 #else |
1634 occur in the pattern. Others don't matter anyway! */ | 1664 occur in the pattern. Others don't matter anyway! */ |
1635 bzero (simple_translate, sizeof simple_translate); | 1665 bzero (simple_translate, sizeof simple_translate); |
1636 for (i = 0; i < 0400; i++) | 1666 for (i = 0; i < 0400; i++) |
1637 simple_translate[i] = i; | 1667 simple_translate[i] = i; |
1638 | 1668 |
1669 if (charset_base) | |
1670 { | |
1671 /* Setup translate_prev_byte1/2/3 from CHARSET_BASE. Only a | |
1672 byte following them are the target of translation. */ | |
1673 int sample_char = charset_base | 0x20; | |
1674 unsigned char str[MAX_MULTIBYTE_LENGTH]; | |
1675 int len = CHAR_STRING (sample_char, str); | |
1676 | |
1677 translate_prev_byte1 = str[len - 2]; | |
1678 if (len > 2) | |
1679 { | |
1680 translate_prev_byte2 = str[len - 3]; | |
1681 if (len > 3) | |
1682 translate_prev_byte3 = str[len - 4]; | |
1683 } | |
1684 } | |
1685 | |
1639 i = 0; | 1686 i = 0; |
1640 while (i != infinity) | 1687 while (i != infinity) |
1641 { | 1688 { |
1642 unsigned char *ptr = base_pat + i; | 1689 unsigned char *ptr = base_pat + i; |
1643 i += direction; | 1690 i += direction; |
1644 if (i == dirlen) | 1691 if (i == dirlen) |
1645 i = infinity; | 1692 i = infinity; |
1646 if (! NILP (trt)) | 1693 if (! NILP (trt)) |
1647 { | 1694 { |
1648 int ch; | 1695 /* If the byte currently looking at is a head of a character |
1649 int untranslated; | 1696 to check case-equivalents, set CH to that character. An |
1650 int this_translated = 1; | 1697 ASCII character and a non-ASCII character matching with |
1651 | 1698 CHARSET_BASE are to be checked. */ |
1652 if (multibyte | 1699 int ch = -1; |
1653 /* Is *PTR the last byte of a character? */ | 1700 |
1654 && (pat_end - ptr == 1 || CHAR_HEAD_P (ptr[1]))) | 1701 if (ASCII_BYTE_P (*ptr) || ! multibyte) |
1702 ch = *ptr; | |
1703 else if (charset_base && CHAR_HEAD_P (*ptr)) | |
1655 { | 1704 { |
1656 unsigned char *charstart = ptr; | 1705 ch = STRING_CHAR (ptr, pat_end - ptr); |
1657 while (! CHAR_HEAD_P (*charstart)) | 1706 if (charset_base != (ch & ~CHAR_FIELD3_MASK)) |
1658 charstart--; | 1707 ch = -1; |
1659 untranslated = STRING_CHAR (charstart, ptr - charstart + 1); | |
1660 if (charset_base == (untranslated & ~CHAR_FIELD3_MASK)) | |
1661 { | |
1662 TRANSLATE (ch, trt, untranslated); | |
1663 if (! CHAR_HEAD_P (*ptr)) | |
1664 { | |
1665 translate_prev_byte = ptr[-1]; | |
1666 if (! CHAR_HEAD_P (translate_prev_byte)) | |
1667 translate_anteprev_byte = ptr[-2]; | |
1668 } | |
1669 } | |
1670 else | |
1671 { | |
1672 this_translated = 0; | |
1673 ch = *ptr; | |
1674 } | |
1675 } | 1708 } |
1676 else if (!multibyte) | 1709 |
1677 TRANSLATE (ch, trt, *ptr); | 1710 j = *ptr; |
1678 else | |
1679 { | |
1680 ch = *ptr; | |
1681 this_translated = 0; | |
1682 } | |
1683 | |
1684 if (ch > 0400) | |
1685 j = ((unsigned char) ch) | 0200; | |
1686 else | |
1687 j = (unsigned char) ch; | |
1688 | |
1689 if (i == infinity) | 1711 if (i == infinity) |
1690 stride_for_teases = BM_tab[j]; | 1712 stride_for_teases = BM_tab[j]; |
1691 | 1713 |
1692 BM_tab[j] = dirlen - i; | 1714 BM_tab[j] = dirlen - i; |
1693 /* A translation table is accompanied by its inverse -- see */ | 1715 /* A translation table is accompanied by its inverse -- see */ |
1694 /* comment following downcase_table for details */ | 1716 /* comment following downcase_table for details */ |
1695 if (this_translated) | 1717 if (ch >= 0) |
1696 { | 1718 { |
1697 int starting_ch = ch; | 1719 int starting_ch = ch; |
1698 int starting_j = j; | 1720 int starting_j; |
1721 | |
1722 if (ch > 0400) | |
1723 starting_j = ((unsigned char) ch) | 0200; | |
1724 else | |
1725 starting_j = (unsigned char) ch; | |
1699 while (1) | 1726 while (1) |
1700 { | 1727 { |
1701 TRANSLATE (ch, inverse_trt, ch); | 1728 TRANSLATE (ch, inverse_trt, ch); |
1702 if (ch > 0400) | 1729 if (ch > 0400) |
1703 j = ((unsigned char) ch) | 0200; | 1730 j = ((unsigned char) ch) | 0200; |
1819 /* Translate only the last byte of a character. */ | 1846 /* Translate only the last byte of a character. */ |
1820 if (! multibyte | 1847 if (! multibyte |
1821 || ((cursor == tail_end_ptr | 1848 || ((cursor == tail_end_ptr |
1822 || CHAR_HEAD_P (cursor[1])) | 1849 || CHAR_HEAD_P (cursor[1])) |
1823 && (CHAR_HEAD_P (cursor[0]) | 1850 && (CHAR_HEAD_P (cursor[0]) |
1824 || (translate_prev_byte == cursor[-1] | 1851 /* Check if this is the last byte of |
1825 && (CHAR_HEAD_P (translate_prev_byte) | 1852 a translable character. */ |
1826 || translate_anteprev_byte == cursor[-2]))))) | 1853 || (translate_prev_byte1 == cursor[-1] |
1854 && (CHAR_HEAD_P (translate_prev_byte1) | |
1855 || (translate_prev_byte2 == cursor[-2] | |
1856 && (CHAR_HEAD_P (translate_prev_byte2) | |
1857 || (translate_prev_byte3 == cursor[-3])))))))) | |
1827 ch = simple_translate[*cursor]; | 1858 ch = simple_translate[*cursor]; |
1828 else | 1859 else |
1829 ch = *cursor; | 1860 ch = *cursor; |
1830 if (pat[i] != ch) | 1861 if (pat[i] != ch) |
1831 break; | 1862 break; |
1899 /* Translate only the last byte of a character. */ | 1930 /* Translate only the last byte of a character. */ |
1900 if (! multibyte | 1931 if (! multibyte |
1901 || ((ptr == tail_end_ptr | 1932 || ((ptr == tail_end_ptr |
1902 || CHAR_HEAD_P (ptr[1])) | 1933 || CHAR_HEAD_P (ptr[1])) |
1903 && (CHAR_HEAD_P (ptr[0]) | 1934 && (CHAR_HEAD_P (ptr[0]) |
1904 || (translate_prev_byte == ptr[-1] | 1935 /* Check if this is the last byte of a |
1905 && (CHAR_HEAD_P (translate_prev_byte) | 1936 translable character. */ |
1906 || translate_anteprev_byte == ptr[-2]))))) | 1937 || (translate_prev_byte1 == ptr[-1] |
1938 && (CHAR_HEAD_P (translate_prev_byte1) | |
1939 || (translate_prev_byte2 == ptr[-2] | |
1940 && (CHAR_HEAD_P (translate_prev_byte2) | |
1941 || translate_prev_byte3 == ptr[-3]))))))) | |
1907 ch = simple_translate[*ptr]; | 1942 ch = simple_translate[*ptr]; |
1908 else | 1943 else |
1909 ch = *ptr; | 1944 ch = *ptr; |
1910 if (pat[i] != ch) | 1945 if (pat[i] != ch) |
1911 break; | 1946 break; |