Mercurial > emacs
comparison src/search.c @ 20898:f69969e35e78
(simple_search): Call set_search_regs.
(boyer_moore): New arg CHARSET_BASE says which chars
to use the translate tables for.
(search_buffer): Properly test which chars participate in translation.
(TRANSLATE): New arg OUT. Handle non-integer in TRT.
All calls changed.
author | Richard M. Stallman <rms@gnu.org> |
---|---|
date | Sat, 14 Feb 1998 08:43:17 +0000 |
parents | 4fac9830041a |
children | eda7e44ef9d9 |
comparison legend: equal, deleted, inserted, replaced
20897:77deec174f57 | 20898:f69969e35e78 |
---|---|
970 (if searching backward) or the end (if searching forward). | 970 (if searching backward) or the end (if searching forward). |
971 | 971 |
972 POSIX is nonzero if we want full backtracking (POSIX style) | 972 POSIX is nonzero if we want full backtracking (POSIX style) |
973 for this pattern. 0 means backtrack only enough to get a valid match. */ | 973 for this pattern. 0 means backtrack only enough to get a valid match. */ |
974 | 974 |
975 #define TRANSLATE(trt, d) \ | 975 #define TRANSLATE(out, trt, d) \ |
976 (! NILP (trt) ? XINT (Faref (trt, make_number (d))) : (d)) | 976 do \ |
977 { \ | |
978 if (! NILP (trt)) \ | |
979 { \ | |
980 Lisp_Object temp; \ | |
981 temp = Faref (trt, make_number (d)); \ | |
982 if (INTEGERP (temp)) \ | |
983 out = XINT (temp); \ | |
984 else \ | |
985 out = d; \ | |
986 } \ | |
987 else \ | |
988 out = d; \ | |
989 } \ | |
990 while (0) | |
977 | 991 |
978 static int | 992 static int |
979 search_buffer (string, pos, pos_byte, lim, lim_byte, n, | 993 search_buffer (string, pos, pos_byte, lim, lim_byte, n, |
980 RE, trt, inverse_trt, posix) | 994 RE, trt, inverse_trt, posix) |
981 Lisp_Object string; | 995 Lisp_Object string; |
1163 if (multibyte) | 1177 if (multibyte) |
1164 { | 1178 { |
1165 while (--len >= 0) | 1179 while (--len >= 0) |
1166 { | 1180 { |
1167 unsigned char workbuf[4], *str; | 1181 unsigned char workbuf[4], *str; |
1168 int c, translated; | 1182 int c, translated, inverse; |
1169 int in_charlen, charlen; | 1183 int in_charlen, charlen; |
1170 | 1184 |
1171 /* If we got here and the RE flag is set, it's because we're | 1185 /* If we got here and the RE flag is set, it's because we're |
1172 dealing with a regexp known to be trivial, so the backslash | 1186 dealing with a regexp known to be trivial, so the backslash |
1173 just quotes the next character. */ | 1187 just quotes the next character. */ |
1178 base_pat++; | 1192 base_pat++; |
1179 } | 1193 } |
1180 | 1194 |
1181 c = STRING_CHAR_AND_LENGTH (base_pat, len_byte, in_charlen); | 1195 c = STRING_CHAR_AND_LENGTH (base_pat, len_byte, in_charlen); |
1182 /* Translate the character, if requested. */ | 1196 /* Translate the character, if requested. */ |
1183 translated = TRANSLATE (trt, c); | 1197 TRANSLATE (translated, trt, c); |
1184 /* If translation changed the byte-length, go back | 1198 /* If translation changed the byte-length, go back |
1185 to the original character. */ | 1199 to the original character. */ |
1186 charlen = CHAR_STRING (translated, workbuf, str); | 1200 charlen = CHAR_STRING (translated, workbuf, str); |
1187 if (in_charlen != charlen) | 1201 if (in_charlen != charlen) |
1188 { | 1202 { |
1189 translated = c; | 1203 translated = c; |
1190 charlen = CHAR_STRING (c, workbuf, str); | 1204 charlen = CHAR_STRING (c, workbuf, str); |
1191 } | 1205 } |
1192 | 1206 |
1207 TRANSLATE (inverse, inverse_trt, c); | |
1208 | |
1193 /* Did this char actually get translated? | 1209 /* Did this char actually get translated? |
1194 Would any other char get translated into it? */ | 1210 Would any other char get translated into it? */ |
1195 if (translated != c | 1211 if (translated != c || inverse != c) |
1196 || TRANSLATE (inverse_trt, c) != c) | |
1197 { | 1212 { |
1198 /* Keep track of which character set row | 1213 /* Keep track of which character set row |
1199 contains the characters that need translation. */ | 1214 contains the characters that need translation. */ |
1200 int charset_base_code = c & ~0xff; | 1215 int charset_base_code = c & ~0xff; |
1201 if (charset_base == -1) | 1216 if (charset_base == -1) |
1204 /* If two different rows appear, needing translation, | 1219 /* If two different rows appear, needing translation, |
1205 then we cannot use boyer_moore search. */ | 1220 then we cannot use boyer_moore search. */ |
1206 simple = 0; | 1221 simple = 0; |
1207 /* ??? Handa: this must do simple = 0 | 1222 /* ??? Handa: this must do simple = 0 |
1208 if c is a composite character. */ | 1223 if c is a composite character. */ |
1209 } | 1224 } |
1210 | 1225 |
1211 /* Store this character into the translated pattern. */ | 1226 /* Store this character into the translated pattern. */ |
1212 bcopy (str, pat, charlen); | 1227 bcopy (str, pat, charlen); |
1213 pat += charlen; | 1228 pat += charlen; |
1214 base_pat += in_charlen; | 1229 base_pat += in_charlen; |
1217 } | 1232 } |
1218 else | 1233 else |
1219 { | 1234 { |
1220 while (--len >= 0) | 1235 while (--len >= 0) |
1221 { | 1236 { |
1222 int c, translated; | 1237 int c, translated, inverse; |
1223 | 1238 |
1224 /* If we got here and the RE flag is set, it's because we're | 1239 /* If we got here and the RE flag is set, it's because we're |
1225 dealing with a regexp known to be trivial, so the backslash | 1240 dealing with a regexp known to be trivial, so the backslash |
1226 just quotes the next character. */ | 1241 just quotes the next character. */ |
1227 if (RE && *base_pat == '\\') | 1242 if (RE && *base_pat == '\\') |
1228 { | 1243 { |
1229 len--; | 1244 len--; |
1230 base_pat++; | 1245 base_pat++; |
1231 } | 1246 } |
1232 c = *base_pat++; | 1247 c = *base_pat++; |
1233 translated = TRANSLATE (trt, c); | 1248 TRANSLATE (translated, trt, c); |
1249 TRANSLATE (inverse, inverse_trt, c); | |
1234 | 1250 |
1235 /* Did this char actually get translated? | 1251 /* Did this char actually get translated? |
1236 Would any other char get translated into it? */ | 1252 Would any other char get translated into it? */ |
1237 if (translated != c | 1253 if (translated != c || inverse != c) |
1238 || TRANSLATE (inverse_trt, c) != c) | |
1239 { | 1254 { |
1240 /* Keep track of which character set row | 1255 /* Keep track of which character set row |
1241 contains the characters that need translation. */ | 1256 contains the characters that need translation. */ |
1242 int charset_base_code = c & ~0xff; | 1257 int charset_base_code = c & ~0xff; |
1243 if (charset_base == -1) | 1258 if (charset_base == -1) |
1244 charset_base = charset_base_code; | 1259 charset_base = charset_base_code; |
1245 else if (charset_base != charset_base_code) | 1260 else if (charset_base != charset_base_code) |
1246 /* If two different rows appear, needing translation, | 1261 /* If two different rows appear, needing translation, |
1247 then we cannot use boyer_moore search. */ | 1262 then we cannot use boyer_moore search. */ |
1248 simple = 0; | 1263 simple = 0; |
1249 } | 1264 } |
1250 *pat++ = translated; | 1265 *pat++ = translated; |
1251 } | 1266 } |
1252 } | 1267 } |
1253 | 1268 |
1254 len_byte = pat - patbuf; | 1269 len_byte = pat - patbuf; |
1255 len = raw_pattern_size; | 1270 len = raw_pattern_size; |
1256 pat = base_pat = patbuf; | 1271 pat = base_pat = patbuf; |
1257 | 1272 |
1258 if (simple) | 1273 if (simple) |
1259 return boyer_moore (n, pat, len, len_byte, trt, inverse_trt, | 1274 return boyer_moore (n, pat, len, len_byte, trt, inverse_trt, |
1260 pos, pos_byte, lim, lim_byte); | 1275 pos, pos_byte, lim, lim_byte, |
1276 charset_base); | |
1261 else | 1277 else |
1262 return simple_search (n, pat, len, len_byte, trt, | 1278 return simple_search (n, pat, len, len_byte, trt, |
1263 pos, pos_byte, lim, lim_byte); | 1279 pos, pos_byte, lim, lim_byte); |
1264 } | 1280 } |
1265 } | 1281 } |
1314 buf_ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (this_pos_byte), | 1330 buf_ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (this_pos_byte), |
1315 ZV_BYTE - this_pos_byte, | 1331 ZV_BYTE - this_pos_byte, |
1316 buf_charlen); | 1332 buf_charlen); |
1317 this_pos_byte += buf_charlen; | 1333 this_pos_byte += buf_charlen; |
1318 this_pos++; | 1334 this_pos++; |
1319 buf_ch = TRANSLATE (trt, buf_ch); | 1335 TRANSLATE (buf_ch, trt, buf_ch); |
1320 | 1336 |
1321 if (buf_ch != pat_ch) | 1337 if (buf_ch != pat_ch) |
1322 break; | 1338 break; |
1323 } | 1339 } |
1324 | 1340 |
1351 { | 1367 { |
1352 int pat_ch = *p++; | 1368 int pat_ch = *p++; |
1353 int buf_ch = FETCH_BYTE (this_pos); | 1369 int buf_ch = FETCH_BYTE (this_pos); |
1354 this_len--; | 1370 this_len--; |
1355 this_pos++; | 1371 this_pos++; |
1356 buf_ch = TRANSLATE (trt, buf_ch); | 1372 TRANSLATE (buf_ch, trt, buf_ch); |
1357 | 1373 |
1358 if (buf_ch != pat_ch) | 1374 if (buf_ch != pat_ch) |
1359 break; | 1375 break; |
1360 } | 1376 } |
1361 | 1377 |
1399 buf_ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (this_pos_byte), | 1415 buf_ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (this_pos_byte), |
1400 ZV_BYTE - this_pos_byte, | 1416 ZV_BYTE - this_pos_byte, |
1401 buf_charlen); | 1417 buf_charlen); |
1402 this_pos_byte += buf_charlen; | 1418 this_pos_byte += buf_charlen; |
1403 this_pos++; | 1419 this_pos++; |
1404 buf_ch = TRANSLATE (trt, buf_ch); | 1420 TRANSLATE (buf_ch, trt, buf_ch); |
1405 | 1421 |
1406 if (buf_ch != pat_ch) | 1422 if (buf_ch != pat_ch) |
1407 break; | 1423 break; |
1408 } | 1424 } |
1409 | 1425 |
1436 { | 1452 { |
1437 int pat_ch = *p++; | 1453 int pat_ch = *p++; |
1438 int buf_ch = FETCH_BYTE (this_pos); | 1454 int buf_ch = FETCH_BYTE (this_pos); |
1439 this_len--; | 1455 this_len--; |
1440 this_pos++; | 1456 this_pos++; |
1441 buf_ch = TRANSLATE (trt, buf_ch); | 1457 TRANSLATE (buf_ch, trt, buf_ch); |
1442 | 1458 |
1443 if (buf_ch != pat_ch) | 1459 if (buf_ch != pat_ch) |
1444 break; | 1460 break; |
1445 } | 1461 } |
1446 | 1462 |
1456 n++; | 1472 n++; |
1457 } | 1473 } |
1458 | 1474 |
1459 stop: | 1475 stop: |
1460 if (n == 0) | 1476 if (n == 0) |
1461 return pos; | 1477 { |
1478 set_search_regs (multibyte ? pos_byte : pos, len_byte); | |
1479 | |
1480 return pos; | |
1481 } | |
1462 else if (n > 0) | 1482 else if (n > 0) |
1463 return -n; | 1483 return -n; |
1464 else | 1484 else |
1465 return n; | 1485 return n; |
1466 } | 1486 } |
1478 | 1498 |
1479 If that criterion is not satisfied, do not call this function. */ | 1499 If that criterion is not satisfied, do not call this function. */ |
1480 | 1500 |
1481 static int | 1501 static int |
1482 boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, | 1502 boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, |
1483 pos, pos_byte, lim, lim_byte) | 1503 pos, pos_byte, lim, lim_byte, charset_base) |
1484 int n; | 1504 int n; |
1485 unsigned char *base_pat; | 1505 unsigned char *base_pat; |
1486 int len, len_byte; | 1506 int len, len_byte; |
1487 Lisp_Object trt; | 1507 Lisp_Object trt; |
1488 Lisp_Object inverse_trt; | 1508 Lisp_Object inverse_trt; |
1489 int pos, pos_byte; | 1509 int pos, pos_byte; |
1490 int lim, lim_byte; | 1510 int lim, lim_byte; |
1511 int charset_base; | |
1491 { | 1512 { |
1492 int direction = ((n > 0) ? 1 : -1); | 1513 int direction = ((n > 0) ? 1 : -1); |
1493 register int dirlen; | 1514 register int dirlen; |
1494 int infinity, limit, k, stride_for_teases; | 1515 int infinity, limit, k, stride_for_teases; |
1495 register int *BM_tab; | 1516 register int *BM_tab; |
1570 if (i == dirlen) | 1591 if (i == dirlen) |
1571 i = infinity; | 1592 i = infinity; |
1572 if (! NILP (trt)) | 1593 if (! NILP (trt)) |
1573 { | 1594 { |
1574 int ch; | 1595 int ch; |
1596 int untranslated; | |
1575 int this_translated = 1; | 1597 int this_translated = 1; |
1576 | 1598 |
1577 if (multibyte | 1599 if (multibyte |
1578 && (ptr + 1 == pat + len_byte || CHAR_HEAD_P (ptr[1]))) | 1600 && (ptr + 1 == pat + len_byte || CHAR_HEAD_P (ptr[1]))) |
1579 { | 1601 { |
1580 unsigned char *charstart = ptr; | 1602 unsigned char *charstart = ptr; |
1581 while (! CHAR_HEAD_P (*charstart)) | 1603 while (! CHAR_HEAD_P (*charstart)) |
1582 charstart--; | 1604 charstart--; |
1583 if (! CHAR_HEAD_P (*ptr)) | 1605 untranslated = STRING_CHAR (charstart, ptr - charstart + 1); |
1606 TRANSLATE (ch, trt, untranslated); | |
1607 if (charset_base == (ch & ~0xff)) | |
1584 { | 1608 { |
1585 translate_prev_byte = ptr[-1]; | 1609 if (! CHAR_HEAD_P (*ptr)) |
1586 if (! CHAR_HEAD_P (translate_prev_byte)) | 1610 { |
1587 translate_anteprev_byte = ptr[-2]; | 1611 translate_prev_byte = ptr[-1]; |
1612 if (! CHAR_HEAD_P (translate_prev_byte)) | |
1613 translate_anteprev_byte = ptr[-2]; | |
1614 } | |
1588 } | 1615 } |
1589 ch = STRING_CHAR (charstart, ptr - charstart + 1); | 1616 else |
1590 ch = TRANSLATE (trt, ch); | 1617 this_translated = 0; |
1591 } | 1618 } |
1592 else if (!multibyte) | 1619 else if (!multibyte) |
1593 ch = TRANSLATE (trt, *ptr); | 1620 TRANSLATE (ch, trt, *ptr); |
1594 else | 1621 else |
1595 { | 1622 { |
1596 ch = *ptr; | 1623 ch = *ptr; |
1597 this_translated = 0; | 1624 this_translated = 0; |
1598 } | 1625 } |
1604 /* A translation table is accompanied by its inverse -- see */ | 1631 /* A translation table is accompanied by its inverse -- see */ |
1605 /* comment following downcase_table for details */ | 1632 /* comment following downcase_table for details */ |
1606 if (this_translated) | 1633 if (this_translated) |
1607 while (1) | 1634 while (1) |
1608 { | 1635 { |
1609 ch = TRANSLATE (inverse_trt, ch); | 1636 TRANSLATE (ch, inverse_trt, ch); |
1610 /* For all the characters that map into K, | 1637 /* For all the characters that map into K, |
1611 set up simple_translate to map them into K. */ | 1638 set up simple_translate to map them into K. */ |
1612 simple_translate[(unsigned char) ch] = k; | 1639 simple_translate[(unsigned char) ch] = k; |
1613 if ((unsigned char) ch == k) | 1640 if ((unsigned char) ch == k) |
1614 break; | 1641 break; |