Mercurial > emacs
comparison src/search.c @ 89483:2f877ed80fa6
*** empty log message ***
author | Kenichi Handa <handa@m17n.org> |
---|---|
date | Mon, 08 Sep 2003 12:53:41 +0000 |
parents | 375f2633d815 9bd823b992df |
children | 885b083d5599 |
comparison
equal
deleted
inserted
replaced
88123:375f2633d815 | 89483:2f877ed80fa6 |
---|---|
22 #include <config.h> | 22 #include <config.h> |
23 #include "lisp.h" | 23 #include "lisp.h" |
24 #include "syntax.h" | 24 #include "syntax.h" |
25 #include "category.h" | 25 #include "category.h" |
26 #include "buffer.h" | 26 #include "buffer.h" |
27 #include "charset.h" | 27 #include "character.h" |
28 #include "region-cache.h" | 28 #include "region-cache.h" |
29 #include "commands.h" | 29 #include "commands.h" |
30 #include "blockinput.h" | 30 #include "blockinput.h" |
31 #include "intervals.h" | 31 #include "intervals.h" |
32 | 32 |
102 values that will result from matching this pattern. | 102 values that will result from matching this pattern. |
103 If it is 0, we should compile the pattern not to record any | 103 If it is 0, we should compile the pattern not to record any |
104 subexpression bounds. | 104 subexpression bounds. |
105 POSIX is nonzero if we want full backtracking (POSIX style) | 105 POSIX is nonzero if we want full backtracking (POSIX style) |
106 for this pattern. 0 means backtrack only enough to get a valid match. | 106 for this pattern. 0 means backtrack only enough to get a valid match. |
107 MULTIBYTE is nonzero if we want to handle multibyte characters in | 107 MULTIBYTE is nonzero iff a target of match is a multibyte buffer or |
108 PATTERN. 0 means all multibyte characters are recognized just as | 108 string. */ |
109 sequences of binary data. */ | |
110 | 109 |
111 static void | 110 static void |
112 compile_pattern_1 (cp, pattern, translate, regp, posix, multibyte) | 111 compile_pattern_1 (cp, pattern, translate, regp, posix, multibyte) |
113 struct regexp_cache *cp; | 112 struct regexp_cache *cp; |
114 Lisp_Object pattern; | 113 Lisp_Object pattern; |
115 Lisp_Object translate; | 114 Lisp_Object translate; |
116 struct re_registers *regp; | 115 struct re_registers *regp; |
117 int posix; | 116 int posix; |
118 int multibyte; | 117 int multibyte; |
119 { | 118 { |
120 unsigned char *raw_pattern; | |
121 int raw_pattern_size; | |
122 char *val; | 119 char *val; |
123 reg_syntax_t old; | 120 reg_syntax_t old; |
124 | |
125 /* MULTIBYTE says whether the text to be searched is multibyte. | |
126 We must convert PATTERN to match that, or we will not really | |
127 find things right. */ | |
128 | |
129 if (multibyte == STRING_MULTIBYTE (pattern)) | |
130 { | |
131 raw_pattern = (unsigned char *) SDATA (pattern); | |
132 raw_pattern_size = SBYTES (pattern); | |
133 } | |
134 else if (multibyte) | |
135 { | |
136 raw_pattern_size = count_size_as_multibyte (SDATA (pattern), | |
137 SCHARS (pattern)); | |
138 raw_pattern = (unsigned char *) alloca (raw_pattern_size + 1); | |
139 copy_text (SDATA (pattern), raw_pattern, | |
140 SCHARS (pattern), 0, 1); | |
141 } | |
142 else | |
143 { | |
144 /* Converting multibyte to single-byte. | |
145 | |
146 ??? Perhaps this conversion should be done in a special way | |
147 by subtracting nonascii-insert-offset from each non-ASCII char, | |
148 so that only the multibyte chars which really correspond to | |
149 the chosen single-byte character set can possibly match. */ | |
150 raw_pattern_size = SCHARS (pattern); | |
151 raw_pattern = (unsigned char *) alloca (raw_pattern_size + 1); | |
152 copy_text (SDATA (pattern), raw_pattern, | |
153 SBYTES (pattern), 1, 0); | |
154 } | |
155 | 121 |
156 cp->regexp = Qnil; | 122 cp->regexp = Qnil; |
157 cp->buf.translate = (! NILP (translate) ? translate : make_number (0)); | 123 cp->buf.translate = (! NILP (translate) ? translate : make_number (0)); |
158 cp->posix = posix; | 124 cp->posix = posix; |
159 cp->buf.multibyte = multibyte; | 125 cp->buf.multibyte = STRING_MULTIBYTE (pattern); |
126 cp->buf.target_multibyte = multibyte; | |
160 BLOCK_INPUT; | 127 BLOCK_INPUT; |
161 old = re_set_syntax (RE_SYNTAX_EMACS | 128 old = re_set_syntax (RE_SYNTAX_EMACS |
162 | (posix ? 0 : RE_NO_POSIX_BACKTRACKING)); | 129 | (posix ? 0 : RE_NO_POSIX_BACKTRACKING)); |
163 val = (char *) re_compile_pattern ((char *)raw_pattern, | 130 val = (char *) re_compile_pattern ((char *) SDATA (pattern), |
164 raw_pattern_size, &cp->buf); | 131 SBYTES (pattern), &cp->buf); |
165 re_set_syntax (old); | 132 re_set_syntax (old); |
166 UNBLOCK_INPUT; | 133 UNBLOCK_INPUT; |
167 if (val) | 134 if (val) |
168 Fsignal (Qinvalid_regexp, Fcons (build_string (val), Qnil)); | 135 Fsignal (Qinvalid_regexp, Fcons (build_string (val), Qnil)); |
169 | 136 |
220 if (SCHARS (cp->regexp) == SCHARS (pattern) | 187 if (SCHARS (cp->regexp) == SCHARS (pattern) |
221 && STRING_MULTIBYTE (cp->regexp) == STRING_MULTIBYTE (pattern) | 188 && STRING_MULTIBYTE (cp->regexp) == STRING_MULTIBYTE (pattern) |
222 && !NILP (Fstring_equal (cp->regexp, pattern)) | 189 && !NILP (Fstring_equal (cp->regexp, pattern)) |
223 && EQ (cp->buf.translate, (! NILP (translate) ? translate : make_number (0))) | 190 && EQ (cp->buf.translate, (! NILP (translate) ? translate : make_number (0))) |
224 && cp->posix == posix | 191 && cp->posix == posix |
225 && cp->buf.multibyte == multibyte) | 192 && cp->buf.target_multibyte == multibyte) |
226 break; | 193 break; |
227 | 194 |
228 /* If we're at the end of the cache, compile into the nil cell | 195 /* If we're at the end of the cache, compile into the nil cell |
229 we found, or the last (least recently used) cell with a | 196 we found, or the last (least recently used) cell with a |
230 string value. */ | 197 string value. */ |
1138 int raw_pattern_size; | 1105 int raw_pattern_size; |
1139 int raw_pattern_size_byte; | 1106 int raw_pattern_size_byte; |
1140 unsigned char *patbuf; | 1107 unsigned char *patbuf; |
1141 int multibyte = !NILP (current_buffer->enable_multibyte_characters); | 1108 int multibyte = !NILP (current_buffer->enable_multibyte_characters); |
1142 unsigned char *base_pat = SDATA (string); | 1109 unsigned char *base_pat = SDATA (string); |
1143 int charset_base = -1; | 1110 /* High bits of char; 0 for ASCII characters, (CHAR & ~0x3F) |
1111 otherwise. Characters of the same high bits have the same | |
1112 sequence of bytes but last. To do the BM search, all | |
1113 characters in STRING must have the same high bits (including | |
1114 their case translations). */ | |
1115 int char_high_bits = -1; | |
1144 int boyer_moore_ok = 1; | 1116 int boyer_moore_ok = 1; |
1145 | 1117 |
1146 /* MULTIBYTE says whether the text to be searched is multibyte. | 1118 /* MULTIBYTE says whether the text to be searched is multibyte. |
1147 We must convert PATTERN to match that, or we will not really | 1119 We must convert PATTERN to match that, or we will not really |
1148 find things right. */ | 1120 find things right. */ |
1179 } | 1151 } |
1180 | 1152 |
1181 /* Copy and optionally translate the pattern. */ | 1153 /* Copy and optionally translate the pattern. */ |
1182 len = raw_pattern_size; | 1154 len = raw_pattern_size; |
1183 len_byte = raw_pattern_size_byte; | 1155 len_byte = raw_pattern_size_byte; |
1184 patbuf = (unsigned char *) alloca (len_byte); | 1156 patbuf = (unsigned char *) alloca (len * MAX_MULTIBYTE_LENGTH); |
1185 pat = patbuf; | 1157 pat = patbuf; |
1186 base_pat = raw_pattern; | 1158 base_pat = raw_pattern; |
1187 if (multibyte) | 1159 if (multibyte) |
1188 { | 1160 { |
1189 while (--len >= 0) | 1161 while (--len >= 0) |
1190 { | 1162 { |
1191 unsigned char str[MAX_MULTIBYTE_LENGTH]; | |
1192 int c, translated, inverse; | 1163 int c, translated, inverse; |
1193 int in_charlen, charlen; | 1164 int in_charlen; |
1194 | 1165 |
1195 /* If we got here and the RE flag is set, it's because we're | 1166 /* If we got here and the RE flag is set, it's because we're |
1196 dealing with a regexp known to be trivial, so the backslash | 1167 dealing with a regexp known to be trivial, so the backslash |
1197 just quotes the next character. */ | 1168 just quotes the next character. */ |
1198 if (RE && *base_pat == '\\') | 1169 if (RE && *base_pat == '\\') |
1204 | 1175 |
1205 c = STRING_CHAR_AND_LENGTH (base_pat, len_byte, in_charlen); | 1176 c = STRING_CHAR_AND_LENGTH (base_pat, len_byte, in_charlen); |
1206 | 1177 |
1207 /* Translate the character, if requested. */ | 1178 /* Translate the character, if requested. */ |
1208 TRANSLATE (translated, trt, c); | 1179 TRANSLATE (translated, trt, c); |
1209 /* If translation changed the byte-length, go back | |
1210 to the original character. */ | |
1211 charlen = CHAR_STRING (translated, str); | |
1212 if (in_charlen != charlen) | |
1213 { | |
1214 translated = c; | |
1215 charlen = CHAR_STRING (c, str); | |
1216 } | |
1217 | |
1218 /* If we are searching for something strange, | |
1219 an invalid multibyte code, don't use boyer-moore. */ | |
1220 if (! ASCII_BYTE_P (translated) | |
1221 && (charlen == 1 /* 8bit code */ | |
1222 || charlen != in_charlen /* invalid multibyte code */ | |
1223 )) | |
1224 boyer_moore_ok = 0; | |
1225 | |
1226 TRANSLATE (inverse, inverse_trt, c); | 1180 TRANSLATE (inverse, inverse_trt, c); |
1227 | 1181 |
1228 /* Did this char actually get translated? | 1182 /* Did this char actually get translated? |
1229 Would any other char get translated into it? */ | 1183 Would any other char get translated into it? */ |
1230 if (translated != c || inverse != c) | 1184 if (translated != c || inverse != c) |
1231 { | 1185 { |
1232 /* Keep track of which character set row | 1186 /* Keep track of which character set row |
1233 contains the characters that need translation. */ | 1187 contains the characters that need translation. */ |
1234 int charset_base_code = c & ~CHAR_FIELD3_MASK; | 1188 int this_high_bit = ASCII_CHAR_P (c) ? 0 : (c & ~0x3F); |
1235 int inverse_charset_base = inverse & ~CHAR_FIELD3_MASK; | 1189 int c1 = inverse != c ? inverse : translated; |
1236 | 1190 int trt_high_bit = ASCII_CHAR_P (c1) ? 0 : (c1 & ~0x3F); |
1237 if (charset_base_code != inverse_charset_base) | 1191 |
1192 if (this_high_bit != trt_high_bit) | |
1238 boyer_moore_ok = 0; | 1193 boyer_moore_ok = 0; |
1239 else if (charset_base == -1) | 1194 else if (char_high_bits == -1) |
1240 charset_base = charset_base_code; | 1195 char_high_bits = this_high_bit; |
1241 else if (charset_base != charset_base_code) | 1196 else if (char_high_bits != this_high_bit) |
1242 /* If two different rows appear, needing translation, | 1197 /* If two different rows appear, needing translation, |
1243 then we cannot use boyer_moore search. */ | 1198 then we cannot use boyer_moore search. */ |
1244 boyer_moore_ok = 0; | 1199 boyer_moore_ok = 0; |
1245 } | 1200 } |
1246 | 1201 |
1247 /* Store this character into the translated pattern. */ | 1202 /* Store this character into the translated pattern. */ |
1248 bcopy (str, pat, charlen); | 1203 CHAR_STRING_ADVANCE (translated, pat); |
1249 pat += charlen; | |
1250 base_pat += in_charlen; | 1204 base_pat += in_charlen; |
1251 len_byte -= in_charlen; | 1205 len_byte -= in_charlen; |
1252 } | 1206 } |
1253 } | 1207 } |
1254 else | 1208 else |
1255 { | 1209 { |
1256 /* Unibyte buffer. */ | 1210 /* Unibyte buffer. */ |
1257 charset_base = 0; | 1211 char_high_bits = 0; |
1258 while (--len >= 0) | 1212 while (--len >= 0) |
1259 { | 1213 { |
1260 int c, translated; | 1214 int c, translated; |
1261 | 1215 |
1262 /* If we got here and the RE flag is set, it's because we're | 1216 /* If we got here and the RE flag is set, it's because we're |
1278 pat = base_pat = patbuf; | 1232 pat = base_pat = patbuf; |
1279 | 1233 |
1280 if (boyer_moore_ok) | 1234 if (boyer_moore_ok) |
1281 return boyer_moore (n, pat, len, len_byte, trt, inverse_trt, | 1235 return boyer_moore (n, pat, len, len_byte, trt, inverse_trt, |
1282 pos, pos_byte, lim, lim_byte, | 1236 pos, pos_byte, lim, lim_byte, |
1283 charset_base); | 1237 char_high_bits); |
1284 else | 1238 else |
1285 return simple_search (n, pat, len, len_byte, trt, | 1239 return simple_search (n, pat, len, len_byte, trt, |
1286 pos, pos_byte, lim, lim_byte); | 1240 pos, pos_byte, lim, lim_byte); |
1287 } | 1241 } |
1288 } | 1242 } |
1511 | 1465 |
1512 If that criterion is not satisfied, do not call this function. */ | 1466 If that criterion is not satisfied, do not call this function. */ |
1513 | 1467 |
1514 static int | 1468 static int |
1515 boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, | 1469 boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, |
1516 pos, pos_byte, lim, lim_byte, charset_base) | 1470 pos, pos_byte, lim, lim_byte, char_high_bits) |
1517 int n; | 1471 int n; |
1518 unsigned char *base_pat; | 1472 unsigned char *base_pat; |
1519 int len, len_byte; | 1473 int len, len_byte; |
1520 Lisp_Object trt; | 1474 Lisp_Object trt; |
1521 Lisp_Object inverse_trt; | 1475 Lisp_Object inverse_trt; |
1522 int pos, pos_byte; | 1476 int pos, pos_byte; |
1523 int lim, lim_byte; | 1477 int lim, lim_byte; |
1524 int charset_base; | 1478 int char_high_bits; |
1525 { | 1479 { |
1526 int direction = ((n > 0) ? 1 : -1); | 1480 int direction = ((n > 0) ? 1 : -1); |
1527 register int dirlen; | 1481 register int dirlen; |
1528 int infinity, limit, stride_for_teases = 0; | 1482 int infinity, limit, stride_for_teases = 0; |
1529 register int *BM_tab; | 1483 register int *BM_tab; |
1620 { | 1574 { |
1621 unsigned char *charstart = ptr; | 1575 unsigned char *charstart = ptr; |
1622 while (! CHAR_HEAD_P (*charstart)) | 1576 while (! CHAR_HEAD_P (*charstart)) |
1623 charstart--; | 1577 charstart--; |
1624 untranslated = STRING_CHAR (charstart, ptr - charstart + 1); | 1578 untranslated = STRING_CHAR (charstart, ptr - charstart + 1); |
1625 if (charset_base == (untranslated & ~CHAR_FIELD3_MASK)) | 1579 if (char_high_bits |
1580 == (ASCII_CHAR_P (untranslated) ? 0 : untranslated & ~0x3F)) | |
1626 { | 1581 { |
1627 TRANSLATE (ch, trt, untranslated); | 1582 TRANSLATE (ch, trt, untranslated); |
1628 if (! CHAR_HEAD_P (*ptr)) | 1583 if (! CHAR_HEAD_P (*ptr)) |
1629 { | 1584 { |
1630 translate_prev_byte = ptr[-1]; | 1585 translate_prev_byte = ptr[-1]; |
1644 { | 1599 { |
1645 ch = *ptr; | 1600 ch = *ptr; |
1646 this_translated = 0; | 1601 this_translated = 0; |
1647 } | 1602 } |
1648 | 1603 |
1649 if (ch > 0400) | 1604 if (this_translated |
1650 j = ((unsigned char) ch) | 0200; | 1605 && ch >= 0200) |
1606 j = (ch & 0x3F) | 0200; | |
1651 else | 1607 else |
1652 j = (unsigned char) ch; | 1608 j = (unsigned char) ch; |
1653 | 1609 |
1654 if (i == infinity) | 1610 if (i == infinity) |
1655 stride_for_teases = BM_tab[j]; | 1611 stride_for_teases = BM_tab[j]; |
1662 int starting_ch = ch; | 1618 int starting_ch = ch; |
1663 int starting_j = j; | 1619 int starting_j = j; |
1664 while (1) | 1620 while (1) |
1665 { | 1621 { |
1666 TRANSLATE (ch, inverse_trt, ch); | 1622 TRANSLATE (ch, inverse_trt, ch); |
1667 if (ch > 0400) | 1623 if (ch > 0200) |
1668 j = ((unsigned char) ch) | 0200; | 1624 j = (ch & 0x3F) | 0200; |
1669 else | 1625 else |
1670 j = (unsigned char) ch; | 1626 j = (unsigned char) ch; |
1671 | 1627 |
1672 /* For all the characters that map into CH, | 1628 /* For all the characters that map into CH, |
1673 set up simple_translate to map the last byte | 1629 set up simple_translate to map the last byte |
1956 | 1912 |
1957 for (i = 0, i_byte = 0; i < len; ) | 1913 for (i = 0, i_byte = 0; i < len; ) |
1958 { | 1914 { |
1959 int c; | 1915 int c; |
1960 | 1916 |
1961 FETCH_STRING_CHAR_ADVANCE (c, string, i, i_byte); | 1917 FETCH_STRING_CHAR_AS_MULTIBYTE_ADVANCE (c, string, i, i_byte); |
1962 | 1918 |
1963 if (SYNTAX (c) != Sword) | 1919 if (SYNTAX (c) != Sword) |
1964 { | 1920 { |
1965 punct_count++; | 1921 punct_count++; |
1966 if (i > 0 && SYNTAX (prev_c) == Sword) | 1922 if (i > 0 && SYNTAX (prev_c) == Sword) |
1991 for (i = 0, i_byte = 0; i < len; ) | 1947 for (i = 0, i_byte = 0; i < len; ) |
1992 { | 1948 { |
1993 int c; | 1949 int c; |
1994 int i_byte_orig = i_byte; | 1950 int i_byte_orig = i_byte; |
1995 | 1951 |
1996 FETCH_STRING_CHAR_ADVANCE (c, string, i, i_byte); | 1952 FETCH_STRING_CHAR_AS_MULTIBYTE_ADVANCE (c, string, i, i_byte); |
1997 | 1953 |
1998 if (SYNTAX (c) == Sword) | 1954 if (SYNTAX (c) == Sword) |
1999 { | 1955 { |
2000 bcopy (SDATA (string) + i_byte_orig, o, | 1956 bcopy (SDATA (string) + i_byte_orig, o, |
2001 i_byte - i_byte_orig); | 1957 i_byte - i_byte_orig); |
2275 | 2231 |
2276 while (pos < last) | 2232 while (pos < last) |
2277 { | 2233 { |
2278 if (NILP (string)) | 2234 if (NILP (string)) |
2279 { | 2235 { |
2280 c = FETCH_CHAR (pos_byte); | 2236 c = FETCH_CHAR_AS_MULTIBYTE (pos_byte); |
2281 INC_BOTH (pos, pos_byte); | 2237 INC_BOTH (pos, pos_byte); |
2282 } | 2238 } |
2283 else | 2239 else |
2284 FETCH_STRING_CHAR_ADVANCE (c, string, pos, pos_byte); | 2240 FETCH_STRING_CHAR_AS_MULTIBYTE_ADVANCE (c, string, pos, pos_byte); |
2285 | 2241 |
2286 if (LOWERCASEP (c)) | 2242 if (LOWERCASEP (c)) |
2287 { | 2243 { |
2288 /* Cannot be all caps if any original char is lower case */ | 2244 /* Cannot be all caps if any original char is lower case */ |
2289 | 2245 |
2443 int buf_multibyte = !NILP (current_buffer->enable_multibyte_characters); | 2399 int buf_multibyte = !NILP (current_buffer->enable_multibyte_characters); |
2444 int str_multibyte = STRING_MULTIBYTE (newtext); | 2400 int str_multibyte = STRING_MULTIBYTE (newtext); |
2445 Lisp_Object rev_tbl; | 2401 Lisp_Object rev_tbl; |
2446 int really_changed = 0; | 2402 int really_changed = 0; |
2447 | 2403 |
2448 rev_tbl= (!buf_multibyte && CHAR_TABLE_P (Vnonascii_translation_table) | 2404 rev_tbl = Qnil; |
2449 ? Fchar_table_extra_slot (Vnonascii_translation_table, | |
2450 make_number (0)) | |
2451 : Qnil); | |
2452 | 2405 |
2453 substed_alloc_size = length * 2 + 100; | 2406 substed_alloc_size = length * 2 + 100; |
2454 substed = (unsigned char *) xmalloc (substed_alloc_size + 1); | 2407 substed = (unsigned char *) xmalloc (substed_alloc_size + 1); |
2455 substed_len = 0; | 2408 substed_len = 0; |
2456 | 2409 |
2489 | 2442 |
2490 if (str_multibyte) | 2443 if (str_multibyte) |
2491 { | 2444 { |
2492 FETCH_STRING_CHAR_ADVANCE_NO_CHECK (c, newtext, | 2445 FETCH_STRING_CHAR_ADVANCE_NO_CHECK (c, newtext, |
2493 pos, pos_byte); | 2446 pos, pos_byte); |
2494 if (!buf_multibyte && !SINGLE_BYTE_CHAR_P (c)) | 2447 if (!buf_multibyte && !ASCII_CHAR_P (c)) |
2495 c = multibyte_char_to_unibyte (c, rev_tbl); | 2448 c = multibyte_char_to_unibyte (c, rev_tbl); |
2496 } | 2449 } |
2497 else | 2450 else |
2498 { | 2451 { |
2499 c = SREF (newtext, pos_byte++); | 2452 c = SREF (newtext, pos_byte++); |