comparison src/search.c @ 20898:f69969e35e78

(simple_search): Call set_search_regs.
(boyer_moore): New arg CHARSET_BASE says which chars to use the translate tables for.
(search_buffer): Properly test which chars participate in translation.
(TRANSLATE): New arg OUT. Handle non-integer in TRT. All calls changed.
author Richard M. Stallman <rms@gnu.org>
date Sat, 14 Feb 1998 08:43:17 +0000
parents 4fac9830041a
children eda7e44ef9d9
comparison
equal deleted inserted replaced
20897:77deec174f57 20898:f69969e35e78
970 (if searching backward) or the end (if searching forward). 970 (if searching backward) or the end (if searching forward).
971 971
972 POSIX is nonzero if we want full backtracking (POSIX style) 972 POSIX is nonzero if we want full backtracking (POSIX style)
973 for this pattern. 0 means backtrack only enough to get a valid match. */ 973 for this pattern. 0 means backtrack only enough to get a valid match. */
974 974
975 #define TRANSLATE(trt, d) \ 975 #define TRANSLATE(out, trt, d) \
976 (! NILP (trt) ? XINT (Faref (trt, make_number (d))) : (d)) 976 do \
977 { \
978 if (! NILP (trt)) \
979 { \
980 Lisp_Object temp; \
981 temp = Faref (trt, make_number (d)); \
982 if (INTEGERP (temp)) \
983 out = XINT (temp); \
984 else \
985 out = d; \
986 } \
987 else \
988 out = d; \
989 } \
990 while (0)
977 991
978 static int 992 static int
979 search_buffer (string, pos, pos_byte, lim, lim_byte, n, 993 search_buffer (string, pos, pos_byte, lim, lim_byte, n,
980 RE, trt, inverse_trt, posix) 994 RE, trt, inverse_trt, posix)
981 Lisp_Object string; 995 Lisp_Object string;
1163 if (multibyte) 1177 if (multibyte)
1164 { 1178 {
1165 while (--len >= 0) 1179 while (--len >= 0)
1166 { 1180 {
1167 unsigned char workbuf[4], *str; 1181 unsigned char workbuf[4], *str;
1168 int c, translated; 1182 int c, translated, inverse;
1169 int in_charlen, charlen; 1183 int in_charlen, charlen;
1170 1184
1171 /* If we got here and the RE flag is set, it's because we're 1185 /* If we got here and the RE flag is set, it's because we're
1172 dealing with a regexp known to be trivial, so the backslash 1186 dealing with a regexp known to be trivial, so the backslash
1173 just quotes the next character. */ 1187 just quotes the next character. */
1178 base_pat++; 1192 base_pat++;
1179 } 1193 }
1180 1194
1181 c = STRING_CHAR_AND_LENGTH (base_pat, len_byte, in_charlen); 1195 c = STRING_CHAR_AND_LENGTH (base_pat, len_byte, in_charlen);
1182 /* Translate the character, if requested. */ 1196 /* Translate the character, if requested. */
1183 translated = TRANSLATE (trt, c); 1197 TRANSLATE (translated, trt, c);
1184 /* If translation changed the byte-length, go back 1198 /* If translation changed the byte-length, go back
1185 to the original character. */ 1199 to the original character. */
1186 charlen = CHAR_STRING (translated, workbuf, str); 1200 charlen = CHAR_STRING (translated, workbuf, str);
1187 if (in_charlen != charlen) 1201 if (in_charlen != charlen)
1188 { 1202 {
1189 translated = c; 1203 translated = c;
1190 charlen = CHAR_STRING (c, workbuf, str); 1204 charlen = CHAR_STRING (c, workbuf, str);
1191 } 1205 }
1192 1206
1207 TRANSLATE (inverse, inverse_trt, c);
1208
1193 /* Did this char actually get translated? 1209 /* Did this char actually get translated?
1194 Would any other char get translated into it? */ 1210 Would any other char get translated into it? */
1195 if (translated != c 1211 if (translated != c || inverse != c)
1196 || TRANSLATE (inverse_trt, c) != c)
1197 { 1212 {
1198 /* Keep track of which character set row 1213 /* Keep track of which character set row
1199 contains the characters that need translation. */ 1214 contains the characters that need translation. */
1200 int charset_base_code = c & ~0xff; 1215 int charset_base_code = c & ~0xff;
1201 if (charset_base == -1) 1216 if (charset_base == -1)
1204 /* If two different rows appear, needing translation, 1219 /* If two different rows appear, needing translation,
1205 then we cannot use boyer_moore search. */ 1220 then we cannot use boyer_moore search. */
1206 simple = 0; 1221 simple = 0;
1207 /* ??? Handa: this must do simple = 0 1222 /* ??? Handa: this must do simple = 0
1208 if c is a composite character. */ 1223 if c is a composite character. */
1209 } 1224 }
1210 1225
1211 /* Store this character into the translated pattern. */ 1226 /* Store this character into the translated pattern. */
1212 bcopy (str, pat, charlen); 1227 bcopy (str, pat, charlen);
1213 pat += charlen; 1228 pat += charlen;
1214 base_pat += in_charlen; 1229 base_pat += in_charlen;
1217 } 1232 }
1218 else 1233 else
1219 { 1234 {
1220 while (--len >= 0) 1235 while (--len >= 0)
1221 { 1236 {
1222 int c, translated; 1237 int c, translated, inverse;
1223 1238
1224 /* If we got here and the RE flag is set, it's because we're 1239 /* If we got here and the RE flag is set, it's because we're
1225 dealing with a regexp known to be trivial, so the backslash 1240 dealing with a regexp known to be trivial, so the backslash
1226 just quotes the next character. */ 1241 just quotes the next character. */
1227 if (RE && *base_pat == '\\') 1242 if (RE && *base_pat == '\\')
1228 { 1243 {
1229 len--; 1244 len--;
1230 base_pat++; 1245 base_pat++;
1231 } 1246 }
1232 c = *base_pat++; 1247 c = *base_pat++;
1233 translated = TRANSLATE (trt, c); 1248 TRANSLATE (translated, trt, c);
1249 TRANSLATE (inverse, inverse_trt, c);
1234 1250
1235 /* Did this char actually get translated? 1251 /* Did this char actually get translated?
1236 Would any other char get translated into it? */ 1252 Would any other char get translated into it? */
1237 if (translated != c 1253 if (translated != c || inverse != c)
1238 || TRANSLATE (inverse_trt, c) != c)
1239 { 1254 {
1240 /* Keep track of which character set row 1255 /* Keep track of which character set row
1241 contains the characters that need translation. */ 1256 contains the characters that need translation. */
1242 int charset_base_code = c & ~0xff; 1257 int charset_base_code = c & ~0xff;
1243 if (charset_base == -1) 1258 if (charset_base == -1)
1244 charset_base = charset_base_code; 1259 charset_base = charset_base_code;
1245 else if (charset_base != charset_base_code) 1260 else if (charset_base != charset_base_code)
1246 /* If two different rows appear, needing translation, 1261 /* If two different rows appear, needing translation,
1247 then we cannot use boyer_moore search. */ 1262 then we cannot use boyer_moore search. */
1248 simple = 0; 1263 simple = 0;
1249 } 1264 }
1250 *pat++ = translated; 1265 *pat++ = translated;
1251 } 1266 }
1252 } 1267 }
1253 1268
1254 len_byte = pat - patbuf; 1269 len_byte = pat - patbuf;
1255 len = raw_pattern_size; 1270 len = raw_pattern_size;
1256 pat = base_pat = patbuf; 1271 pat = base_pat = patbuf;
1257 1272
1258 if (simple) 1273 if (simple)
1259 return boyer_moore (n, pat, len, len_byte, trt, inverse_trt, 1274 return boyer_moore (n, pat, len, len_byte, trt, inverse_trt,
1260 pos, pos_byte, lim, lim_byte); 1275 pos, pos_byte, lim, lim_byte,
1276 charset_base);
1261 else 1277 else
1262 return simple_search (n, pat, len, len_byte, trt, 1278 return simple_search (n, pat, len, len_byte, trt,
1263 pos, pos_byte, lim, lim_byte); 1279 pos, pos_byte, lim, lim_byte);
1264 } 1280 }
1265 } 1281 }
1314 buf_ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (this_pos_byte), 1330 buf_ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (this_pos_byte),
1315 ZV_BYTE - this_pos_byte, 1331 ZV_BYTE - this_pos_byte,
1316 buf_charlen); 1332 buf_charlen);
1317 this_pos_byte += buf_charlen; 1333 this_pos_byte += buf_charlen;
1318 this_pos++; 1334 this_pos++;
1319 buf_ch = TRANSLATE (trt, buf_ch); 1335 TRANSLATE (buf_ch, trt, buf_ch);
1320 1336
1321 if (buf_ch != pat_ch) 1337 if (buf_ch != pat_ch)
1322 break; 1338 break;
1323 } 1339 }
1324 1340
1351 { 1367 {
1352 int pat_ch = *p++; 1368 int pat_ch = *p++;
1353 int buf_ch = FETCH_BYTE (this_pos); 1369 int buf_ch = FETCH_BYTE (this_pos);
1354 this_len--; 1370 this_len--;
1355 this_pos++; 1371 this_pos++;
1356 buf_ch = TRANSLATE (trt, buf_ch); 1372 TRANSLATE (buf_ch, trt, buf_ch);
1357 1373
1358 if (buf_ch != pat_ch) 1374 if (buf_ch != pat_ch)
1359 break; 1375 break;
1360 } 1376 }
1361 1377
1399 buf_ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (this_pos_byte), 1415 buf_ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (this_pos_byte),
1400 ZV_BYTE - this_pos_byte, 1416 ZV_BYTE - this_pos_byte,
1401 buf_charlen); 1417 buf_charlen);
1402 this_pos_byte += buf_charlen; 1418 this_pos_byte += buf_charlen;
1403 this_pos++; 1419 this_pos++;
1404 buf_ch = TRANSLATE (trt, buf_ch); 1420 TRANSLATE (buf_ch, trt, buf_ch);
1405 1421
1406 if (buf_ch != pat_ch) 1422 if (buf_ch != pat_ch)
1407 break; 1423 break;
1408 } 1424 }
1409 1425
1436 { 1452 {
1437 int pat_ch = *p++; 1453 int pat_ch = *p++;
1438 int buf_ch = FETCH_BYTE (this_pos); 1454 int buf_ch = FETCH_BYTE (this_pos);
1439 this_len--; 1455 this_len--;
1440 this_pos++; 1456 this_pos++;
1441 buf_ch = TRANSLATE (trt, buf_ch); 1457 TRANSLATE (buf_ch, trt, buf_ch);
1442 1458
1443 if (buf_ch != pat_ch) 1459 if (buf_ch != pat_ch)
1444 break; 1460 break;
1445 } 1461 }
1446 1462
1456 n++; 1472 n++;
1457 } 1473 }
1458 1474
1459 stop: 1475 stop:
1460 if (n == 0) 1476 if (n == 0)
1461 return pos; 1477 {
1478 set_search_regs (multibyte ? pos_byte : pos, len_byte);
1479
1480 return pos;
1481 }
1462 else if (n > 0) 1482 else if (n > 0)
1463 return -n; 1483 return -n;
1464 else 1484 else
1465 return n; 1485 return n;
1466 } 1486 }
1478 1498
1479 If that criterion is not satisfied, do not call this function. */ 1499 If that criterion is not satisfied, do not call this function. */
1480 1500
1481 static int 1501 static int
1482 boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, 1502 boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
1483 pos, pos_byte, lim, lim_byte) 1503 pos, pos_byte, lim, lim_byte, charset_base)
1484 int n; 1504 int n;
1485 unsigned char *base_pat; 1505 unsigned char *base_pat;
1486 int len, len_byte; 1506 int len, len_byte;
1487 Lisp_Object trt; 1507 Lisp_Object trt;
1488 Lisp_Object inverse_trt; 1508 Lisp_Object inverse_trt;
1489 int pos, pos_byte; 1509 int pos, pos_byte;
1490 int lim, lim_byte; 1510 int lim, lim_byte;
1511 int charset_base;
1491 { 1512 {
1492 int direction = ((n > 0) ? 1 : -1); 1513 int direction = ((n > 0) ? 1 : -1);
1493 register int dirlen; 1514 register int dirlen;
1494 int infinity, limit, k, stride_for_teases; 1515 int infinity, limit, k, stride_for_teases;
1495 register int *BM_tab; 1516 register int *BM_tab;
1570 if (i == dirlen) 1591 if (i == dirlen)
1571 i = infinity; 1592 i = infinity;
1572 if (! NILP (trt)) 1593 if (! NILP (trt))
1573 { 1594 {
1574 int ch; 1595 int ch;
1596 int untranslated;
1575 int this_translated = 1; 1597 int this_translated = 1;
1576 1598
1577 if (multibyte 1599 if (multibyte
1578 && (ptr + 1 == pat + len_byte || CHAR_HEAD_P (ptr[1]))) 1600 && (ptr + 1 == pat + len_byte || CHAR_HEAD_P (ptr[1])))
1579 { 1601 {
1580 unsigned char *charstart = ptr; 1602 unsigned char *charstart = ptr;
1581 while (! CHAR_HEAD_P (*charstart)) 1603 while (! CHAR_HEAD_P (*charstart))
1582 charstart--; 1604 charstart--;
1583 if (! CHAR_HEAD_P (*ptr)) 1605 untranslated = STRING_CHAR (charstart, ptr - charstart + 1);
1606 TRANSLATE (ch, trt, untranslated);
1607 if (charset_base == (ch & ~0xff))
1584 { 1608 {
1585 translate_prev_byte = ptr[-1]; 1609 if (! CHAR_HEAD_P (*ptr))
1586 if (! CHAR_HEAD_P (translate_prev_byte)) 1610 {
1587 translate_anteprev_byte = ptr[-2]; 1611 translate_prev_byte = ptr[-1];
1612 if (! CHAR_HEAD_P (translate_prev_byte))
1613 translate_anteprev_byte = ptr[-2];
1614 }
1588 } 1615 }
1589 ch = STRING_CHAR (charstart, ptr - charstart + 1); 1616 else
1590 ch = TRANSLATE (trt, ch); 1617 this_translated = 0;
1591 } 1618 }
1592 else if (!multibyte) 1619 else if (!multibyte)
1593 ch = TRANSLATE (trt, *ptr); 1620 TRANSLATE (ch, trt, *ptr);
1594 else 1621 else
1595 { 1622 {
1596 ch = *ptr; 1623 ch = *ptr;
1597 this_translated = 0; 1624 this_translated = 0;
1598 } 1625 }
1604 /* A translation table is accompanied by its inverse -- see */ 1631 /* A translation table is accompanied by its inverse -- see */
1605 /* comment following downcase_table for details */ 1632 /* comment following downcase_table for details */
1606 if (this_translated) 1633 if (this_translated)
1607 while (1) 1634 while (1)
1608 { 1635 {
1609 ch = TRANSLATE (inverse_trt, ch); 1636 TRANSLATE (ch, inverse_trt, ch);
1610 /* For all the characters that map into K, 1637 /* For all the characters that map into K,
1611 set up simple_translate to map them into K. */ 1638 set up simple_translate to map them into K. */
1612 simple_translate[(unsigned char) ch] = k; 1639 simple_translate[(unsigned char) ch] = k;
1613 if ((unsigned char) ch == k) 1640 if ((unsigned char) ch == k)
1614 break; 1641 break;