comparison src/regex.c @ 55690:daeeb5ef2d95

Add support for new '\_<' and '\_>' regexp operators, matching the beginnings and ends of symbols. * regex.c (enum syntaxcode): Add Ssymbol. (init_syntax_once): Set the syntax for '_' to Ssymbol, not Sword. (re_opcode_t): New opcodes `symbeg' and `symend'. (print_partial_compiled_pattern): Print the new opcodes properly. (regex_compile): Parse the new operators. (analyse_first): Skip symbeg and symend (they match only the empty string). (mutually_exclusive_p): `symend' is mutually exclusive with \s_ and \sw; `symbeg' is mutually exclusive with \S_ and \Sw. (re_match_2_internal): Match symbeg and symend.
author Stefan Monnier <monnier@iro.umontreal.ca>
date Wed, 19 May 2004 16:37:35 +0000
parents ada02c2b390c
children d8ee27fc17e9
comparison
equal deleted inserted replaced
55689:f4a937a898f4 55690:daeeb5ef2d95
1 /* Extended regular expression matching and search library, version 1 /* Extended regular expression matching and search library, version
2 0.12. (Implements POSIX draft P1003.2/D11.2, except for some of the 2 0.12. (Implements POSIX draft P1003.2/D11.2, except for some of the
3 internationalization features.) 3 internationalization features.)
4 4
5 Copyright (C) 1993,94,95,96,97,98,99,2000 Free Software Foundation, Inc. 5 Copyright (C) 1993,94,95,96,97,98,99,2000,04 Free Software Foundation, Inc.
6 6
7 This program is free software; you can redistribute it and/or modify 7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by 8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option) 9 the Free Software Foundation; either version 2, or (at your option)
10 any later version. 10 any later version.
215 # endif 215 # endif
216 216
217 /* Define the syntax stuff for \<, \>, etc. */ 217 /* Define the syntax stuff for \<, \>, etc. */
218 218
219 /* Sword must be nonzero for the wordchar pattern commands in re_match_2. */ 219 /* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
220 enum syntaxcode { Swhitespace = 0, Sword = 1 }; 220 enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
221 221
222 # ifdef SWITCH_ENUM_BUG 222 # ifdef SWITCH_ENUM_BUG
223 # define SWITCH_ENUM_CAST(x) ((int)(x)) 223 # define SWITCH_ENUM_CAST(x) ((int)(x))
224 # else 224 # else
225 # define SWITCH_ENUM_CAST(x) (x) 225 # define SWITCH_ENUM_CAST(x) (x)
396 396
397 for (c = 0; c < CHAR_SET_SIZE; ++c) 397 for (c = 0; c < CHAR_SET_SIZE; ++c)
398 if (ISALNUM (c)) 398 if (ISALNUM (c))
399 re_syntax_table[c] = Sword; 399 re_syntax_table[c] = Sword;
400 400
401 re_syntax_table['_'] = Sword; 401 re_syntax_table['_'] = Ssymbol;
402 402
403 done = 1; 403 done = 1;
404 } 404 }
405 405
406 # endif /* not SYNTAX_TABLE */ 406 # endif /* not SYNTAX_TABLE */
652 wordbeg, /* Succeeds if at word beginning. */ 652 wordbeg, /* Succeeds if at word beginning. */
653 wordend, /* Succeeds if at word end. */ 653 wordend, /* Succeeds if at word end. */
654 654
655 wordbound, /* Succeeds if at a word boundary. */ 655 wordbound, /* Succeeds if at a word boundary. */
656 notwordbound, /* Succeeds if not at a word boundary. */ 656 notwordbound, /* Succeeds if not at a word boundary. */
657
658 symbeg, /* Succeeds if at symbol beginning. */
659 symend, /* Succeeds if at symbol end. */
657 660
658 /* Matches any character whose syntax is specified. Followed by 661 /* Matches any character whose syntax is specified. Followed by
659 a byte which contains a syntax code, e.g., Sword. */ 662 a byte which contains a syntax code, e.g., Sword. */
660 syntaxspec, 663 syntaxspec,
661 664
1091 fprintf (stderr, "/wordbeg"); 1094 fprintf (stderr, "/wordbeg");
1092 break; 1095 break;
1093 1096
1094 case wordend: 1097 case wordend:
1095 fprintf (stderr, "/wordend"); 1098 fprintf (stderr, "/wordend");
1099
1100 case symbeg:
1101 printf ("/symbeg");
1102 break;
1103
1104 case symend:
1105 printf ("/symend");
1106 break;
1096 1107
1097 case syntaxspec: 1108 case syntaxspec:
1098 fprintf (stderr, "/syntaxspec"); 1109 fprintf (stderr, "/syntaxspec");
1099 mcnt = *p++; 1110 mcnt = *p++;
1100 fprintf (stderr, "/%d", mcnt); 1111 fprintf (stderr, "/%d", mcnt);
3396 if (syntax & RE_NO_GNU_OPS) 3407 if (syntax & RE_NO_GNU_OPS)
3397 goto normal_char; 3408 goto normal_char;
3398 BUF_PUSH (wordend); 3409 BUF_PUSH (wordend);
3399 break; 3410 break;
3400 3411
3412 case '_':
3413 if (syntax & RE_NO_GNU_OPS)
3414 goto normal_char;
3415 laststart = b;
3416 PATFETCH (c);
3417 if (c == '<')
3418 BUF_PUSH (symbeg);
3419 else if (c == '>')
3420 BUF_PUSH (symend);
3421 else
3422 FREE_STACK_RETURN (REG_BADPAT);
3423 break;
3424
3401 case 'b': 3425 case 'b':
3402 if (syntax & RE_NO_GNU_OPS) 3426 if (syntax & RE_NO_GNU_OPS)
3403 goto normal_char; 3427 goto normal_char;
3404 BUF_PUSH (wordbound); 3428 BUF_PUSH (wordbound);
3405 break; 3429 break;
3888 case endbuf: 3912 case endbuf:
3889 case wordbound: 3913 case wordbound:
3890 case notwordbound: 3914 case notwordbound:
3891 case wordbeg: 3915 case wordbeg:
3892 case wordend: 3916 case wordend:
3917 case symbeg:
3918 case symend:
3893 continue; 3919 continue;
3894 3920
3895 3921
3896 case jump: 3922 case jump:
3897 EXTRACT_NUMBER_AND_INCR (j, p); 3923 EXTRACT_NUMBER_AND_INCR (j, p);
4652 break; 4678 break;
4653 } 4679 }
4654 break; 4680 break;
4655 4681
4656 case wordend: 4682 case wordend:
4683 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
4684 case symend:
4685 return ((re_opcode_t) *p1 == syntaxspec
4686 && (p1[1] == Ssymbol || p1[1] == Sword));
4657 case notsyntaxspec: 4687 case notsyntaxspec:
4658 return ((re_opcode_t) *p1 == syntaxspec 4688 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
4659 && p1[1] == (op2 == wordend ? Sword : p2[1]));
4660 4689
4661 case wordbeg: 4690 case wordbeg:
4691 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
4692 case symbeg:
4693 return ((re_opcode_t) *p1 == notsyntaxspec
4694 && (p1[1] == Ssymbol || p1[1] == Sword));
4662 case syntaxspec: 4695 case syntaxspec:
4663 return ((re_opcode_t) *p1 == notsyntaxspec 4696 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
4664 && p1[1] == (op2 == wordbeg ? Sword : p2[1]));
4665 4697
4666 case wordbound: 4698 case wordbound:
4667 return (((re_opcode_t) *p1 == notsyntaxspec 4699 return (((re_opcode_t) *p1 == notsyntaxspec
4668 || (re_opcode_t) *p1 == syntaxspec) 4700 || (re_opcode_t) *p1 == syntaxspec)
4669 && p1[1] == Sword); 4701 && p1[1] == Sword);
5801 goto fail; 5833 goto fail;
5802 } 5834 }
5803 } 5835 }
5804 break; 5836 break;
5805 5837
5838 case symbeg:
5839 DEBUG_PRINT1 ("EXECUTING symbeg.\n");
5840
5841 /* We FAIL in one of the following cases: */
5842
5843 /* Case 1: D is at the end of string. */
5844 if (AT_STRINGS_END (d))
5845 goto fail;
5846 else
5847 {
5848 /* C1 is the character before D, S1 is the syntax of C1, C2
5849 is the character at D, and S2 is the syntax of C2. */
5850 re_wchar_t c1, c2;
5851 int s1, s2;
5852 #ifdef emacs
5853 int offset = PTR_TO_OFFSET (d);
5854 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5855 UPDATE_SYNTAX_TABLE (charpos);
5856 #endif
5857 PREFETCH ();
5858 c2 = RE_STRING_CHAR (d, dend - d);
5859 s2 = SYNTAX (c2);
5860
5861 /* Case 2: S2 is neither Sword nor Ssymbol. */
5862 if (s2 != Sword && s2 != Ssymbol)
5863 goto fail;
5864
5865 /* Case 3: D is not at the beginning of string ... */
5866 if (!AT_STRINGS_BEG (d))
5867 {
5868 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5869 #ifdef emacs
5870 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
5871 #endif
5872 s1 = SYNTAX (c1);
5873
5874 /* ... and S1 is Sword or Ssymbol. */
5875 if (s1 == Sword || s1 == Ssymbol)
5876 goto fail;
5877 }
5878 }
5879 break;
5880
5881 case symend:
5882 DEBUG_PRINT1 ("EXECUTING symend.\n");
5883
5884 /* We FAIL in one of the following cases: */
5885
5886 /* Case 1: D is at the beginning of string. */
5887 if (AT_STRINGS_BEG (d))
5888 goto fail;
5889 else
5890 {
5891 /* C1 is the character before D, S1 is the syntax of C1, C2
5892 is the character at D, and S2 is the syntax of C2. */
5893 re_wchar_t c1, c2;
5894 int s1, s2;
5895 #ifdef emacs
5896 int offset = PTR_TO_OFFSET (d) - 1;
5897 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5898 UPDATE_SYNTAX_TABLE (charpos);
5899 #endif
5900 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5901 s1 = SYNTAX (c1);
5902
5903 /* Case 2: S1 is neither Ssymbol nor Sword. */
5904 if (s1 != Sword && s1 != Ssymbol)
5905 goto fail;
5906
5907 /* Case 3: D is not at the end of string ... */
5908 if (!AT_STRINGS_END (d))
5909 {
5910 PREFETCH_NOLIMIT ();
5911 c2 = RE_STRING_CHAR (d, dend - d);
5912 #ifdef emacs
5913 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
5914 #endif
5915 s2 = SYNTAX (c2);
5916
5917 /* ... and S2 is Sword or Ssymbol. */
5918 if (s2 == Sword || s2 == Ssymbol)
5919 goto fail;
5920 }
5921 }
5922 break;
5923
5806 case syntaxspec: 5924 case syntaxspec:
5807 case notsyntaxspec: 5925 case notsyntaxspec:
5808 not = (re_opcode_t) *(p - 1) == notsyntaxspec; 5926 not = (re_opcode_t) *(p - 1) == notsyntaxspec;
5809 mcnt = *p++; 5927 mcnt = *p++;
5810 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt); 5928 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt);