Mercurial > emacs
comparison src/regex.c @ 55690:daeeb5ef2d95
Add support for new '\_<' and '\_>' regexp operators, matching the
beginnings and ends of symbols.
* regex.c (enum syntaxcode): Add Ssymbol.
(init_syntax_once): Set the syntax for '_' to Ssymbol, not Sword.
(re_opcode_t): New opcodes `symbeg' and `symend'.
(print_partial_compiled_pattern): Print the new opcodes properly.
(regex_compile): Parse the new operators.
(analyse_first): Skip symbeg and symend (they match only the empty string).
(mutually_exclusive_p): `symend' is mutually exclusive with \s_ and
\sw; `symbeg' is mutually exclusive with \S_ and \Sw.
(re_match_2_internal): Match symbeg and symend.
author | Stefan Monnier <monnier@iro.umontreal.ca> |
---|---|
date | Wed, 19 May 2004 16:37:35 +0000 |
parents | ada02c2b390c |
children | d8ee27fc17e9 |
comparison
equal
deleted
inserted
replaced
55689:f4a937a898f4 | 55690:daeeb5ef2d95 |
---|---|
1 /* Extended regular expression matching and search library, version | 1 /* Extended regular expression matching and search library, version |
2 0.12. (Implements POSIX draft P1003.2/D11.2, except for some of the | 2 0.12. (Implements POSIX draft P1003.2/D11.2, except for some of the |
3 internationalization features.) | 3 internationalization features.) |
4 | 4 |
5 Copyright (C) 1993,94,95,96,97,98,99,2000 Free Software Foundation, Inc. | 5 Copyright (C) 1993,94,95,96,97,98,99,2000,04 Free Software Foundation, Inc. |
6 | 6 |
7 This program is free software; you can redistribute it and/or modify | 7 This program is free software; you can redistribute it and/or modify |
8 it under the terms of the GNU General Public License as published by | 8 it under the terms of the GNU General Public License as published by |
9 the Free Software Foundation; either version 2, or (at your option) | 9 the Free Software Foundation; either version 2, or (at your option) |
10 any later version. | 10 any later version. |
215 # endif | 215 # endif |
216 | 216 |
217 /* Define the syntax stuff for \<, \>, etc. */ | 217 /* Define the syntax stuff for \<, \>, etc. */ |
218 | 218 |
219 /* Sword must be nonzero for the wordchar pattern commands in re_match_2. */ | 219 /* Sword must be nonzero for the wordchar pattern commands in re_match_2. */ |
220 enum syntaxcode { Swhitespace = 0, Sword = 1 }; | 220 enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 }; |
221 | 221 |
222 # ifdef SWITCH_ENUM_BUG | 222 # ifdef SWITCH_ENUM_BUG |
223 # define SWITCH_ENUM_CAST(x) ((int)(x)) | 223 # define SWITCH_ENUM_CAST(x) ((int)(x)) |
224 # else | 224 # else |
225 # define SWITCH_ENUM_CAST(x) (x) | 225 # define SWITCH_ENUM_CAST(x) (x) |
396 | 396 |
397 for (c = 0; c < CHAR_SET_SIZE; ++c) | 397 for (c = 0; c < CHAR_SET_SIZE; ++c) |
398 if (ISALNUM (c)) | 398 if (ISALNUM (c)) |
399 re_syntax_table[c] = Sword; | 399 re_syntax_table[c] = Sword; |
400 | 400 |
401 re_syntax_table['_'] = Sword; | 401 re_syntax_table['_'] = Ssymbol; |
402 | 402 |
403 done = 1; | 403 done = 1; |
404 } | 404 } |
405 | 405 |
406 # endif /* not SYNTAX_TABLE */ | 406 # endif /* not SYNTAX_TABLE */ |
652 wordbeg, /* Succeeds if at word beginning. */ | 652 wordbeg, /* Succeeds if at word beginning. */ |
653 wordend, /* Succeeds if at word end. */ | 653 wordend, /* Succeeds if at word end. */ |
654 | 654 |
655 wordbound, /* Succeeds if at a word boundary. */ | 655 wordbound, /* Succeeds if at a word boundary. */ |
656 notwordbound, /* Succeeds if not at a word boundary. */ | 656 notwordbound, /* Succeeds if not at a word boundary. */ |
657 | |
658 symbeg, /* Succeeds if at symbol beginning. */ | |
659 symend, /* Succeeds if at symbol end. */ | |
657 | 660 |
658 /* Matches any character whose syntax is specified. Followed by | 661 /* Matches any character whose syntax is specified. Followed by |
659 a byte which contains a syntax code, e.g., Sword. */ | 662 a byte which contains a syntax code, e.g., Sword. */ |
660 syntaxspec, | 663 syntaxspec, |
661 | 664 |
1091 fprintf (stderr, "/wordbeg"); | 1094 fprintf (stderr, "/wordbeg"); |
1092 break; | 1095 break; |
1093 | 1096 |
1094 case wordend: | 1097 case wordend: |
1095 fprintf (stderr, "/wordend"); | 1098 fprintf (stderr, "/wordend"); |
1099 | |
1100 case symbeg: | |
1101 printf ("/symbeg"); | |
1102 break; | |
1103 | |
1104 case symend: | |
1105 printf ("/symend"); | |
1106 break; | |
1096 | 1107 |
1097 case syntaxspec: | 1108 case syntaxspec: |
1098 fprintf (stderr, "/syntaxspec"); | 1109 fprintf (stderr, "/syntaxspec"); |
1099 mcnt = *p++; | 1110 mcnt = *p++; |
1100 fprintf (stderr, "/%d", mcnt); | 1111 fprintf (stderr, "/%d", mcnt); |
3396 if (syntax & RE_NO_GNU_OPS) | 3407 if (syntax & RE_NO_GNU_OPS) |
3397 goto normal_char; | 3408 goto normal_char; |
3398 BUF_PUSH (wordend); | 3409 BUF_PUSH (wordend); |
3399 break; | 3410 break; |
3400 | 3411 |
3412 case '_': | |
3413 if (syntax & RE_NO_GNU_OPS) | |
3414 goto normal_char; | |
3415 laststart = b; | |
3416 PATFETCH (c); | |
3417 if (c == '<') | |
3418 BUF_PUSH (symbeg); | |
3419 else if (c == '>') | |
3420 BUF_PUSH (symend); | |
3421 else | |
3422 FREE_STACK_RETURN (REG_BADPAT); | |
3423 break; | |
3424 | |
3401 case 'b': | 3425 case 'b': |
3402 if (syntax & RE_NO_GNU_OPS) | 3426 if (syntax & RE_NO_GNU_OPS) |
3403 goto normal_char; | 3427 goto normal_char; |
3404 BUF_PUSH (wordbound); | 3428 BUF_PUSH (wordbound); |
3405 break; | 3429 break; |
3888 case endbuf: | 3912 case endbuf: |
3889 case wordbound: | 3913 case wordbound: |
3890 case notwordbound: | 3914 case notwordbound: |
3891 case wordbeg: | 3915 case wordbeg: |
3892 case wordend: | 3916 case wordend: |
3917 case symbeg: | |
3918 case symend: | |
3893 continue; | 3919 continue; |
3894 | 3920 |
3895 | 3921 |
3896 case jump: | 3922 case jump: |
3897 EXTRACT_NUMBER_AND_INCR (j, p); | 3923 EXTRACT_NUMBER_AND_INCR (j, p); |
4652 break; | 4678 break; |
4653 } | 4679 } |
4654 break; | 4680 break; |
4655 | 4681 |
4656 case wordend: | 4682 case wordend: |
4683 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword); | |
4684 case symend: | |
4685 return ((re_opcode_t) *p1 == syntaxspec | |
4686 && (p1[1] == Ssymbol || p1[1] == Sword)); | |
4657 case notsyntaxspec: | 4687 case notsyntaxspec: |
4658 return ((re_opcode_t) *p1 == syntaxspec | 4688 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]); |
4659 && p1[1] == (op2 == wordend ? Sword : p2[1])); | |
4660 | 4689 |
4661 case wordbeg: | 4690 case wordbeg: |
4691 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword); | |
4692 case symbeg: | |
4693 return ((re_opcode_t) *p1 == notsyntaxspec | |
4694 && (p1[1] == Ssymbol || p1[1] == Sword)); | |
4662 case syntaxspec: | 4695 case syntaxspec: |
4663 return ((re_opcode_t) *p1 == notsyntaxspec | 4696 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]); |
4664 && p1[1] == (op2 == wordbeg ? Sword : p2[1])); | |
4665 | 4697 |
4666 case wordbound: | 4698 case wordbound: |
4667 return (((re_opcode_t) *p1 == notsyntaxspec | 4699 return (((re_opcode_t) *p1 == notsyntaxspec |
4668 || (re_opcode_t) *p1 == syntaxspec) | 4700 || (re_opcode_t) *p1 == syntaxspec) |
4669 && p1[1] == Sword); | 4701 && p1[1] == Sword); |
5801 goto fail; | 5833 goto fail; |
5802 } | 5834 } |
5803 } | 5835 } |
5804 break; | 5836 break; |
5805 | 5837 |
5838 case symbeg: | |
5839 DEBUG_PRINT1 ("EXECUTING symbeg.\n"); | |
5840 | |
5841 /* We FAIL in one of the following cases: */ | |
5842 | |
5843 /* Case 1: D is at the end of string. */ | |
5844 if (AT_STRINGS_END (d)) | |
5845 goto fail; | |
5846 else | |
5847 { | |
5848 /* C1 is the character before D, S1 is the syntax of C1, C2 | |
5849 is the character at D, and S2 is the syntax of C2. */ | |
5850 re_wchar_t c1, c2; | |
5851 int s1, s2; | |
5852 #ifdef emacs | |
5853 int offset = PTR_TO_OFFSET (d); | |
5854 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); | |
5855 UPDATE_SYNTAX_TABLE (charpos); | |
5856 #endif | |
5857 PREFETCH (); | |
5858 c2 = RE_STRING_CHAR (d, dend - d); | |
5859 s2 = SYNTAX (c2); | |
5860 | |
5861 /* Case 2: S2 is neither Sword nor Ssymbol. */ | |
5862 if (s2 != Sword && s2 != Ssymbol) | |
5863 goto fail; | |
5864 | |
5865 /* Case 3: D is not at the beginning of string ... */ | |
5866 if (!AT_STRINGS_BEG (d)) | |
5867 { | |
5868 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); | |
5869 #ifdef emacs | |
5870 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1); | |
5871 #endif | |
5872 s1 = SYNTAX (c1); | |
5873 | |
5874 /* ... and S1 is Sword or Ssymbol. */ | |
5875 if (s1 == Sword || s1 == Ssymbol) | |
5876 goto fail; | |
5877 } | |
5878 } | |
5879 break; | |
5880 | |
5881 case symend: | |
5882 DEBUG_PRINT1 ("EXECUTING symend.\n"); | |
5883 | |
5884 /* We FAIL in one of the following cases: */ | |
5885 | |
5886 /* Case 1: D is at the beginning of string. */ | |
5887 if (AT_STRINGS_BEG (d)) | |
5888 goto fail; | |
5889 else | |
5890 { | |
5891 /* C1 is the character before D, S1 is the syntax of C1, C2 | |
5892 is the character at D, and S2 is the syntax of C2. */ | |
5893 re_wchar_t c1, c2; | |
5894 int s1, s2; | |
5895 #ifdef emacs | |
5896 int offset = PTR_TO_OFFSET (d) - 1; | |
5897 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); | |
5898 UPDATE_SYNTAX_TABLE (charpos); | |
5899 #endif | |
5900 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); | |
5901 s1 = SYNTAX (c1); | |
5902 | |
5903 /* Case 2: S1 is neither Ssymbol nor Sword. */ | |
5904 if (s1 != Sword && s1 != Ssymbol) | |
5905 goto fail; | |
5906 | |
5907 /* Case 3: D is not at the end of string ... */ | |
5908 if (!AT_STRINGS_END (d)) | |
5909 { | |
5910 PREFETCH_NOLIMIT (); | |
5911 c2 = RE_STRING_CHAR (d, dend - d); | |
5912 #ifdef emacs | |
5913 UPDATE_SYNTAX_TABLE_FORWARD (charpos); | |
5914 #endif | |
5915 s2 = SYNTAX (c2); | |
5916 | |
5917 /* ... and S2 is Sword or Ssymbol. */ | |
5918 if (s2 == Sword || s2 == Ssymbol) | |
5919 goto fail; | |
5920 } | |
5921 } | |
5922 break; | |
5923 | |
5806 case syntaxspec: | 5924 case syntaxspec: |
5807 case notsyntaxspec: | 5925 case notsyntaxspec: |
5808 not = (re_opcode_t) *(p - 1) == notsyntaxspec; | 5926 not = (re_opcode_t) *(p - 1) == notsyntaxspec; |
5809 mcnt = *p++; | 5927 mcnt = *p++; |
5810 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt); | 5928 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt); |