comparison src/search.c @ 89483:2f877ed80fa6

*** empty log message ***
author Kenichi Handa <handa@m17n.org>
date Mon, 08 Sep 2003 12:53:41 +0000
parents 375f2633d815 9bd823b992df
children 885b083d5599
comparison
equal deleted inserted replaced
88123:375f2633d815 89483:2f877ed80fa6
22 #include <config.h> 22 #include <config.h>
23 #include "lisp.h" 23 #include "lisp.h"
24 #include "syntax.h" 24 #include "syntax.h"
25 #include "category.h" 25 #include "category.h"
26 #include "buffer.h" 26 #include "buffer.h"
27 #include "charset.h" 27 #include "character.h"
28 #include "region-cache.h" 28 #include "region-cache.h"
29 #include "commands.h" 29 #include "commands.h"
30 #include "blockinput.h" 30 #include "blockinput.h"
31 #include "intervals.h" 31 #include "intervals.h"
32 32
102 values that will result from matching this pattern. 102 values that will result from matching this pattern.
103 If it is 0, we should compile the pattern not to record any 103 If it is 0, we should compile the pattern not to record any
104 subexpression bounds. 104 subexpression bounds.
105 POSIX is nonzero if we want full backtracking (POSIX style) 105 POSIX is nonzero if we want full backtracking (POSIX style)
106 for this pattern. 0 means backtrack only enough to get a valid match. 106 for this pattern. 0 means backtrack only enough to get a valid match.
107 MULTIBYTE is nonzero if we want to handle multibyte characters in 107 MULTIBYTE is nonzero iff a target of match is a multibyte buffer or
108 PATTERN. 0 means all multibyte characters are recognized just as 108 string. */
109 sequences of binary data. */
110 109
111 static void 110 static void
112 compile_pattern_1 (cp, pattern, translate, regp, posix, multibyte) 111 compile_pattern_1 (cp, pattern, translate, regp, posix, multibyte)
113 struct regexp_cache *cp; 112 struct regexp_cache *cp;
114 Lisp_Object pattern; 113 Lisp_Object pattern;
115 Lisp_Object translate; 114 Lisp_Object translate;
116 struct re_registers *regp; 115 struct re_registers *regp;
117 int posix; 116 int posix;
118 int multibyte; 117 int multibyte;
119 { 118 {
120 unsigned char *raw_pattern;
121 int raw_pattern_size;
122 char *val; 119 char *val;
123 reg_syntax_t old; 120 reg_syntax_t old;
124
125 /* MULTIBYTE says whether the text to be searched is multibyte.
126 We must convert PATTERN to match that, or we will not really
127 find things right. */
128
129 if (multibyte == STRING_MULTIBYTE (pattern))
130 {
131 raw_pattern = (unsigned char *) SDATA (pattern);
132 raw_pattern_size = SBYTES (pattern);
133 }
134 else if (multibyte)
135 {
136 raw_pattern_size = count_size_as_multibyte (SDATA (pattern),
137 SCHARS (pattern));
138 raw_pattern = (unsigned char *) alloca (raw_pattern_size + 1);
139 copy_text (SDATA (pattern), raw_pattern,
140 SCHARS (pattern), 0, 1);
141 }
142 else
143 {
144 /* Converting multibyte to single-byte.
145
146 ??? Perhaps this conversion should be done in a special way
147 by subtracting nonascii-insert-offset from each non-ASCII char,
148 so that only the multibyte chars which really correspond to
149 the chosen single-byte character set can possibly match. */
150 raw_pattern_size = SCHARS (pattern);
151 raw_pattern = (unsigned char *) alloca (raw_pattern_size + 1);
152 copy_text (SDATA (pattern), raw_pattern,
153 SBYTES (pattern), 1, 0);
154 }
155 121
156 cp->regexp = Qnil; 122 cp->regexp = Qnil;
157 cp->buf.translate = (! NILP (translate) ? translate : make_number (0)); 123 cp->buf.translate = (! NILP (translate) ? translate : make_number (0));
158 cp->posix = posix; 124 cp->posix = posix;
159 cp->buf.multibyte = multibyte; 125 cp->buf.multibyte = STRING_MULTIBYTE (pattern);
126 cp->buf.target_multibyte = multibyte;
160 BLOCK_INPUT; 127 BLOCK_INPUT;
161 old = re_set_syntax (RE_SYNTAX_EMACS 128 old = re_set_syntax (RE_SYNTAX_EMACS
162 | (posix ? 0 : RE_NO_POSIX_BACKTRACKING)); 129 | (posix ? 0 : RE_NO_POSIX_BACKTRACKING));
163 val = (char *) re_compile_pattern ((char *)raw_pattern, 130 val = (char *) re_compile_pattern ((char *) SDATA (pattern),
164 raw_pattern_size, &cp->buf); 131 SBYTES (pattern), &cp->buf);
165 re_set_syntax (old); 132 re_set_syntax (old);
166 UNBLOCK_INPUT; 133 UNBLOCK_INPUT;
167 if (val) 134 if (val)
168 Fsignal (Qinvalid_regexp, Fcons (build_string (val), Qnil)); 135 Fsignal (Qinvalid_regexp, Fcons (build_string (val), Qnil));
169 136
220 if (SCHARS (cp->regexp) == SCHARS (pattern) 187 if (SCHARS (cp->regexp) == SCHARS (pattern)
221 && STRING_MULTIBYTE (cp->regexp) == STRING_MULTIBYTE (pattern) 188 && STRING_MULTIBYTE (cp->regexp) == STRING_MULTIBYTE (pattern)
222 && !NILP (Fstring_equal (cp->regexp, pattern)) 189 && !NILP (Fstring_equal (cp->regexp, pattern))
223 && EQ (cp->buf.translate, (! NILP (translate) ? translate : make_number (0))) 190 && EQ (cp->buf.translate, (! NILP (translate) ? translate : make_number (0)))
224 && cp->posix == posix 191 && cp->posix == posix
225 && cp->buf.multibyte == multibyte) 192 && cp->buf.target_multibyte == multibyte)
226 break; 193 break;
227 194
228 /* If we're at the end of the cache, compile into the nil cell 195 /* If we're at the end of the cache, compile into the nil cell
229 we found, or the last (least recently used) cell with a 196 we found, or the last (least recently used) cell with a
230 string value. */ 197 string value. */
1138 int raw_pattern_size; 1105 int raw_pattern_size;
1139 int raw_pattern_size_byte; 1106 int raw_pattern_size_byte;
1140 unsigned char *patbuf; 1107 unsigned char *patbuf;
1141 int multibyte = !NILP (current_buffer->enable_multibyte_characters); 1108 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
1142 unsigned char *base_pat = SDATA (string); 1109 unsigned char *base_pat = SDATA (string);
1143 int charset_base = -1; 1110 /* High bits of char; 0 for ASCII characters, (CHAR & ~0x3F)
1111 otherwise. Characters of the same high bits have the same
1112 sequence of bytes but last. To do the BM search, all
1113 characters in STRING must have the same high bits (including
1114 their case translations). */
1115 int char_high_bits = -1;
1144 int boyer_moore_ok = 1; 1116 int boyer_moore_ok = 1;
1145 1117
1146 /* MULTIBYTE says whether the text to be searched is multibyte. 1118 /* MULTIBYTE says whether the text to be searched is multibyte.
1147 We must convert PATTERN to match that, or we will not really 1119 We must convert PATTERN to match that, or we will not really
1148 find things right. */ 1120 find things right. */
1179 } 1151 }
1180 1152
1181 /* Copy and optionally translate the pattern. */ 1153 /* Copy and optionally translate the pattern. */
1182 len = raw_pattern_size; 1154 len = raw_pattern_size;
1183 len_byte = raw_pattern_size_byte; 1155 len_byte = raw_pattern_size_byte;
1184 patbuf = (unsigned char *) alloca (len_byte); 1156 patbuf = (unsigned char *) alloca (len * MAX_MULTIBYTE_LENGTH);
1185 pat = patbuf; 1157 pat = patbuf;
1186 base_pat = raw_pattern; 1158 base_pat = raw_pattern;
1187 if (multibyte) 1159 if (multibyte)
1188 { 1160 {
1189 while (--len >= 0) 1161 while (--len >= 0)
1190 { 1162 {
1191 unsigned char str[MAX_MULTIBYTE_LENGTH];
1192 int c, translated, inverse; 1163 int c, translated, inverse;
1193 int in_charlen, charlen; 1164 int in_charlen;
1194 1165
1195 /* If we got here and the RE flag is set, it's because we're 1166 /* If we got here and the RE flag is set, it's because we're
1196 dealing with a regexp known to be trivial, so the backslash 1167 dealing with a regexp known to be trivial, so the backslash
1197 just quotes the next character. */ 1168 just quotes the next character. */
1198 if (RE && *base_pat == '\\') 1169 if (RE && *base_pat == '\\')
1204 1175
1205 c = STRING_CHAR_AND_LENGTH (base_pat, len_byte, in_charlen); 1176 c = STRING_CHAR_AND_LENGTH (base_pat, len_byte, in_charlen);
1206 1177
1207 /* Translate the character, if requested. */ 1178 /* Translate the character, if requested. */
1208 TRANSLATE (translated, trt, c); 1179 TRANSLATE (translated, trt, c);
1209 /* If translation changed the byte-length, go back
1210 to the original character. */
1211 charlen = CHAR_STRING (translated, str);
1212 if (in_charlen != charlen)
1213 {
1214 translated = c;
1215 charlen = CHAR_STRING (c, str);
1216 }
1217
1218 /* If we are searching for something strange,
1219 an invalid multibyte code, don't use boyer-moore. */
1220 if (! ASCII_BYTE_P (translated)
1221 && (charlen == 1 /* 8bit code */
1222 || charlen != in_charlen /* invalid multibyte code */
1223 ))
1224 boyer_moore_ok = 0;
1225
1226 TRANSLATE (inverse, inverse_trt, c); 1180 TRANSLATE (inverse, inverse_trt, c);
1227 1181
1228 /* Did this char actually get translated? 1182 /* Did this char actually get translated?
1229 Would any other char get translated into it? */ 1183 Would any other char get translated into it? */
1230 if (translated != c || inverse != c) 1184 if (translated != c || inverse != c)
1231 { 1185 {
1232 /* Keep track of which character set row 1186 /* Keep track of which character set row
1233 contains the characters that need translation. */ 1187 contains the characters that need translation. */
1234 int charset_base_code = c & ~CHAR_FIELD3_MASK; 1188 int this_high_bit = ASCII_CHAR_P (c) ? 0 : (c & ~0x3F);
1235 int inverse_charset_base = inverse & ~CHAR_FIELD3_MASK; 1189 int c1 = inverse != c ? inverse : translated;
1236 1190 int trt_high_bit = ASCII_CHAR_P (c1) ? 0 : (c1 & ~0x3F);
1237 if (charset_base_code != inverse_charset_base) 1191
1192 if (this_high_bit != trt_high_bit)
1238 boyer_moore_ok = 0; 1193 boyer_moore_ok = 0;
1239 else if (charset_base == -1) 1194 else if (char_high_bits == -1)
1240 charset_base = charset_base_code; 1195 char_high_bits = this_high_bit;
1241 else if (charset_base != charset_base_code) 1196 else if (char_high_bits != this_high_bit)
1242 /* If two different rows appear, needing translation, 1197 /* If two different rows appear, needing translation,
1243 then we cannot use boyer_moore search. */ 1198 then we cannot use boyer_moore search. */
1244 boyer_moore_ok = 0; 1199 boyer_moore_ok = 0;
1245 } 1200 }
1246 1201
1247 /* Store this character into the translated pattern. */ 1202 /* Store this character into the translated pattern. */
1248 bcopy (str, pat, charlen); 1203 CHAR_STRING_ADVANCE (translated, pat);
1249 pat += charlen;
1250 base_pat += in_charlen; 1204 base_pat += in_charlen;
1251 len_byte -= in_charlen; 1205 len_byte -= in_charlen;
1252 } 1206 }
1253 } 1207 }
1254 else 1208 else
1255 { 1209 {
1256 /* Unibyte buffer. */ 1210 /* Unibyte buffer. */
1257 charset_base = 0; 1211 char_high_bits = 0;
1258 while (--len >= 0) 1212 while (--len >= 0)
1259 { 1213 {
1260 int c, translated; 1214 int c, translated;
1261 1215
1262 /* If we got here and the RE flag is set, it's because we're 1216 /* If we got here and the RE flag is set, it's because we're
1278 pat = base_pat = patbuf; 1232 pat = base_pat = patbuf;
1279 1233
1280 if (boyer_moore_ok) 1234 if (boyer_moore_ok)
1281 return boyer_moore (n, pat, len, len_byte, trt, inverse_trt, 1235 return boyer_moore (n, pat, len, len_byte, trt, inverse_trt,
1282 pos, pos_byte, lim, lim_byte, 1236 pos, pos_byte, lim, lim_byte,
1283 charset_base); 1237 char_high_bits);
1284 else 1238 else
1285 return simple_search (n, pat, len, len_byte, trt, 1239 return simple_search (n, pat, len, len_byte, trt,
1286 pos, pos_byte, lim, lim_byte); 1240 pos, pos_byte, lim, lim_byte);
1287 } 1241 }
1288 } 1242 }
1511 1465
1512 If that criterion is not satisfied, do not call this function. */ 1466 If that criterion is not satisfied, do not call this function. */
1513 1467
1514 static int 1468 static int
1515 boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, 1469 boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
1516 pos, pos_byte, lim, lim_byte, charset_base) 1470 pos, pos_byte, lim, lim_byte, char_high_bits)
1517 int n; 1471 int n;
1518 unsigned char *base_pat; 1472 unsigned char *base_pat;
1519 int len, len_byte; 1473 int len, len_byte;
1520 Lisp_Object trt; 1474 Lisp_Object trt;
1521 Lisp_Object inverse_trt; 1475 Lisp_Object inverse_trt;
1522 int pos, pos_byte; 1476 int pos, pos_byte;
1523 int lim, lim_byte; 1477 int lim, lim_byte;
1524 int charset_base; 1478 int char_high_bits;
1525 { 1479 {
1526 int direction = ((n > 0) ? 1 : -1); 1480 int direction = ((n > 0) ? 1 : -1);
1527 register int dirlen; 1481 register int dirlen;
1528 int infinity, limit, stride_for_teases = 0; 1482 int infinity, limit, stride_for_teases = 0;
1529 register int *BM_tab; 1483 register int *BM_tab;
1620 { 1574 {
1621 unsigned char *charstart = ptr; 1575 unsigned char *charstart = ptr;
1622 while (! CHAR_HEAD_P (*charstart)) 1576 while (! CHAR_HEAD_P (*charstart))
1623 charstart--; 1577 charstart--;
1624 untranslated = STRING_CHAR (charstart, ptr - charstart + 1); 1578 untranslated = STRING_CHAR (charstart, ptr - charstart + 1);
1625 if (charset_base == (untranslated & ~CHAR_FIELD3_MASK)) 1579 if (char_high_bits
1580 == (ASCII_CHAR_P (untranslated) ? 0 : untranslated & ~0x3F))
1626 { 1581 {
1627 TRANSLATE (ch, trt, untranslated); 1582 TRANSLATE (ch, trt, untranslated);
1628 if (! CHAR_HEAD_P (*ptr)) 1583 if (! CHAR_HEAD_P (*ptr))
1629 { 1584 {
1630 translate_prev_byte = ptr[-1]; 1585 translate_prev_byte = ptr[-1];
1644 { 1599 {
1645 ch = *ptr; 1600 ch = *ptr;
1646 this_translated = 0; 1601 this_translated = 0;
1647 } 1602 }
1648 1603
1649 if (ch > 0400) 1604 if (this_translated
1650 j = ((unsigned char) ch) | 0200; 1605 && ch >= 0200)
1606 j = (ch & 0x3F) | 0200;
1651 else 1607 else
1652 j = (unsigned char) ch; 1608 j = (unsigned char) ch;
1653 1609
1654 if (i == infinity) 1610 if (i == infinity)
1655 stride_for_teases = BM_tab[j]; 1611 stride_for_teases = BM_tab[j];
1662 int starting_ch = ch; 1618 int starting_ch = ch;
1663 int starting_j = j; 1619 int starting_j = j;
1664 while (1) 1620 while (1)
1665 { 1621 {
1666 TRANSLATE (ch, inverse_trt, ch); 1622 TRANSLATE (ch, inverse_trt, ch);
1667 if (ch > 0400) 1623 if (ch > 0200)
1668 j = ((unsigned char) ch) | 0200; 1624 j = (ch & 0x3F) | 0200;
1669 else 1625 else
1670 j = (unsigned char) ch; 1626 j = (unsigned char) ch;
1671 1627
1672 /* For all the characters that map into CH, 1628 /* For all the characters that map into CH,
1673 set up simple_translate to map the last byte 1629 set up simple_translate to map the last byte
1956 1912
1957 for (i = 0, i_byte = 0; i < len; ) 1913 for (i = 0, i_byte = 0; i < len; )
1958 { 1914 {
1959 int c; 1915 int c;
1960 1916
1961 FETCH_STRING_CHAR_ADVANCE (c, string, i, i_byte); 1917 FETCH_STRING_CHAR_AS_MULTIBYTE_ADVANCE (c, string, i, i_byte);
1962 1918
1963 if (SYNTAX (c) != Sword) 1919 if (SYNTAX (c) != Sword)
1964 { 1920 {
1965 punct_count++; 1921 punct_count++;
1966 if (i > 0 && SYNTAX (prev_c) == Sword) 1922 if (i > 0 && SYNTAX (prev_c) == Sword)
1991 for (i = 0, i_byte = 0; i < len; ) 1947 for (i = 0, i_byte = 0; i < len; )
1992 { 1948 {
1993 int c; 1949 int c;
1994 int i_byte_orig = i_byte; 1950 int i_byte_orig = i_byte;
1995 1951
1996 FETCH_STRING_CHAR_ADVANCE (c, string, i, i_byte); 1952 FETCH_STRING_CHAR_AS_MULTIBYTE_ADVANCE (c, string, i, i_byte);
1997 1953
1998 if (SYNTAX (c) == Sword) 1954 if (SYNTAX (c) == Sword)
1999 { 1955 {
2000 bcopy (SDATA (string) + i_byte_orig, o, 1956 bcopy (SDATA (string) + i_byte_orig, o,
2001 i_byte - i_byte_orig); 1957 i_byte - i_byte_orig);
2275 2231
2276 while (pos < last) 2232 while (pos < last)
2277 { 2233 {
2278 if (NILP (string)) 2234 if (NILP (string))
2279 { 2235 {
2280 c = FETCH_CHAR (pos_byte); 2236 c = FETCH_CHAR_AS_MULTIBYTE (pos_byte);
2281 INC_BOTH (pos, pos_byte); 2237 INC_BOTH (pos, pos_byte);
2282 } 2238 }
2283 else 2239 else
2284 FETCH_STRING_CHAR_ADVANCE (c, string, pos, pos_byte); 2240 FETCH_STRING_CHAR_AS_MULTIBYTE_ADVANCE (c, string, pos, pos_byte);
2285 2241
2286 if (LOWERCASEP (c)) 2242 if (LOWERCASEP (c))
2287 { 2243 {
2288 /* Cannot be all caps if any original char is lower case */ 2244 /* Cannot be all caps if any original char is lower case */
2289 2245
2443 int buf_multibyte = !NILP (current_buffer->enable_multibyte_characters); 2399 int buf_multibyte = !NILP (current_buffer->enable_multibyte_characters);
2444 int str_multibyte = STRING_MULTIBYTE (newtext); 2400 int str_multibyte = STRING_MULTIBYTE (newtext);
2445 Lisp_Object rev_tbl; 2401 Lisp_Object rev_tbl;
2446 int really_changed = 0; 2402 int really_changed = 0;
2447 2403
2448 rev_tbl= (!buf_multibyte && CHAR_TABLE_P (Vnonascii_translation_table) 2404 rev_tbl = Qnil;
2449 ? Fchar_table_extra_slot (Vnonascii_translation_table,
2450 make_number (0))
2451 : Qnil);
2452 2405
2453 substed_alloc_size = length * 2 + 100; 2406 substed_alloc_size = length * 2 + 100;
2454 substed = (unsigned char *) xmalloc (substed_alloc_size + 1); 2407 substed = (unsigned char *) xmalloc (substed_alloc_size + 1);
2455 substed_len = 0; 2408 substed_len = 0;
2456 2409
2489 2442
2490 if (str_multibyte) 2443 if (str_multibyte)
2491 { 2444 {
2492 FETCH_STRING_CHAR_ADVANCE_NO_CHECK (c, newtext, 2445 FETCH_STRING_CHAR_ADVANCE_NO_CHECK (c, newtext,
2493 pos, pos_byte); 2446 pos, pos_byte);
2494 if (!buf_multibyte && !SINGLE_BYTE_CHAR_P (c)) 2447 if (!buf_multibyte && !ASCII_CHAR_P (c))
2495 c = multibyte_char_to_unibyte (c, rev_tbl); 2448 c = multibyte_char_to_unibyte (c, rev_tbl);
2496 } 2449 }
2497 else 2450 else
2498 { 2451 {
2499 c = SREF (newtext, pos_byte++); 2452 c = SREF (newtext, pos_byte++);