changeset 1637:e81cb2cc709e

*** empty log message ***
author Karl Berry <karl@gnu.org>
date Sat, 21 Nov 1992 01:51:33 +0000
parents c2bea3f2f8cb
children e49c2e6349e4
files src/regex.c src/regex.h
diffstat 2 files changed, 115 insertions(+), 79 deletions(-) [+]
line wrap: on
line diff
--- a/src/regex.c	Fri Nov 20 19:33:38 1992 +0000
+++ b/src/regex.c	Sat Nov 21 01:51:33 1992 +0000
@@ -47,9 +47,15 @@
    `BSTRING', as far as I know, and neither of them use this code.  */
 #if USG || STDC_HEADERS
 #include <string.h>
+#ifndef bcmp
 #define bcmp(s1, s2, n)	memcmp ((s1), (s2), (n))
+#endif
+#ifndef bcopy
 #define bcopy(s, d, n)	memcpy ((d), (s), (n))
+#endif
+#ifndef bzero
 #define bzero(s, n)	memset ((s), 0, (n))
+#endif
 #else
 #include <strings.h>
 #endif
@@ -135,12 +141,8 @@
    (Per Bothner suggested the basic approach.)  */
 #undef SIGN_EXTEND_CHAR
 #if __STDC__
-#ifndef VMS
 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
-#else /* On VMS, VAXC doesn't recognize `signed' before `char' */
-#define SIGN_EXTEND_CHAR(c) ((char) (c))
-#endif /* VMS */
-#else
+#else  /* not __STDC__ */
 /* As in Harbison and Steele.  */
 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
 #endif
@@ -447,6 +449,7 @@
 #define DEBUG_PRINT1(x) if (debug) printf (x)
 #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
 #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
+#define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) 				\
   if (debug) print_partial_compiled_pattern (s, e)
 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)			\
@@ -760,6 +763,7 @@
 #define DEBUG_PRINT1(x)
 #define DEBUG_PRINT2(x1, x2)
 #define DEBUG_PRINT3(x1, x2, x3)
+#define DEBUG_PRINT4(x1, x2, x3, x4)
 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
 
@@ -1025,9 +1029,9 @@
      `buffer' is the compiled pattern;
      `syntax' is set to SYNTAX;
      `used' is set to the length of the compiled pattern;
-     `fastmap_accurate' is set to zero;
-     `re_nsub' is set to the number of groups in PATTERN;
-     `not_bol' and `not_eol' are set to zero.
+     `fastmap_accurate' is zero;
+     `re_nsub' is the number of subexpressions in PATTERN;
+     `not_bol' and `not_eol' are zero;
    
    The `fastmap' and `newline_anchor' fields are neither
    examined nor set.  */
@@ -1676,10 +1680,10 @@
                           |   v |   v 
                          a | b   | c   
 
-                 If we are at `b,' then fixup_alt_jump right now points to a
-                 three-byte space after `a.'  We'll put in the jump, set
-                 fixup_alt_jump to right after `b,' and leave behind three
-                 bytes which we'll fill in when we get to after `c.'  */
+                 If we are at `b', then fixup_alt_jump right now points to a
+                 three-byte space after `a'.  We'll put in the jump, set
+                 fixup_alt_jump to right after `b', and leave behind three
+                 bytes which we'll fill in when we get to after `c'.  */
 
               if (fixup_alt_jump)
                 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
@@ -2320,6 +2324,7 @@
     int this_reg;							\
     									\
     DEBUG_STATEMENT (failure_id++);					\
+    DEBUG_STATEMENT (nfailure_points_pushed++);				\
     DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id);		\
     DEBUG_PRINT2 ("  Before push, next avail: %d\n", (fail_stack).avail);\
     DEBUG_PRINT2 ("                     size: %d\n", (fail_stack).size);\
@@ -2473,6 +2478,8 @@
       regstart[this_reg] = (const char *) POP_FAILURE_ITEM ();		\
       DEBUG_PRINT2 ("      start: 0x%x\n", regstart[this_reg]);		\
     }									\
+									\
+  DEBUG_STATEMENT (nfailure_points_popped++);				\
 } /* POP_FAILURE_POINT */
 
 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
@@ -2860,15 +2867,9 @@
   else if (endpos > total_size)
     range = total_size - startpos;
 
-  /* Update the fastmap now if not correct already.  */
-  if (fastmap && !bufp->fastmap_accurate)
-    if (re_compile_fastmap (bufp) == -2)
-      return -2;
-  
   /* If the search isn't to be a backwards one, don't waste time in a
-     long search for a pattern that says it is anchored.  */
-  if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf
-      && range > 0)
+     search for a pattern that must be anchored.  */
+  if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
     {
       if (startpos > 0)
 	return -1;
@@ -2876,6 +2877,12 @@
 	range = 1;
     }
 
+  /* Update the fastmap now if not correct already.  */
+  if (fastmap && !bufp->fastmap_accurate)
+    if (re_compile_fastmap (bufp) == -2)
+      return -2;
+  
+  /* Loop through the string, looking for a place to start matching.  */
   for (;;)
     { 
       /* If a fastmap is supplied, skip quickly over characters that
@@ -2913,7 +2920,7 @@
                                  ? string2[startpos - size1] 
                                  : string1[startpos]);
 
-	      if (!fastmap[TRANSLATE (c)])
+	      if (!fastmap[(unsigned char) TRANSLATE (c)])
 		goto advance;
 	    }
 	}
@@ -2987,12 +2994,9 @@
 #define EVER_MATCHED_SOMETHING(R)  ((R).bits.ever_matched_something)
 
 
-/* Call this when have matched something; it sets `matched' flags for the
-   registers corresponding to the group of which we currently are inside.  
-   Also records whether this group ever matched something.  We only care
-   about this information at `stop_memory', and then only about the
-   previous time through the loop (if the group is starred or whatever).
-   So it is ok to clear all the nonactive registers here.  */
+/* Call this when have matched a real character; it sets `matched' flags
+   for the subexpressions which we are currently inside.  Also records
+   that those subexprs have matched.  */
 #define SET_REGS_MATCHED()						\
   do									\
     {									\
@@ -3037,24 +3041,24 @@
 
 /* Test if at very beginning or at very end of the virtual concatenation
    of `string1' and `string2'.  If only one string, it's `string2'.  */
-#define AT_STRINGS_BEG() (d == (size1 ? string1 : string2) || !size2)
-#define AT_STRINGS_END() (d == end2)	
+#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
+#define AT_STRINGS_END(d) ((d) == end2)	
 
 
 /* Test if D points to a character which is word-constituent.  We have
    two special cases to check for: if past the end of string1, look at
    the first character in string2; and if before the beginning of
-   string2, look at the last character in string1.
-   
-   Assumes `string1' exists, so use in conjunction with AT_STRINGS_BEG ().  */
-#define LETTER_P(d)							\
+   string2, look at the last character in string1.  */
+#define WORDCHAR_P(d)							\
   (SYNTAX ((d) == end1 ? *string2					\
-  : (d) == string2 - 1 ? *(end1 - 1) : *(d)) == Sword)
+           : (d) == string2 - 1 ? *(end1 - 1) : *(d))			\
+   == Sword)
 
 /* Test if the character before D and the one at D differ with respect
    to being word-constituent.  */
 #define AT_WORD_BOUNDARY(d)						\
-  (AT_STRINGS_BEG () || AT_STRINGS_END () || LETTER_P (d - 1) != LETTER_P (d))
+  (AT_STRINGS_BEG (d) || AT_STRINGS_END (d)				\
+   || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
 
 
 /* Free everything we malloc.  */
@@ -3161,6 +3165,7 @@
   fail_stack_type fail_stack;
 #ifdef DEBUG
   static unsigned failure_id = 0;
+  unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
 #endif
 
   /* We fill all the registers internally, independent of what we
@@ -3254,8 +3259,7 @@
   else
     {
       /* We must initialize all our variables to NULL, so that
-         `FREE_VARIABLES' doesn't try to free them.  Too bad this isn't
-         Lisp, so we could have a list of variables.  As it is, */
+         `FREE_VARIABLES' doesn't try to free them.  */
       regstart = regend = old_regstart = old_regend = best_regstart
         = best_regend = reg_dummy = NULL;
       reg_info = reg_info_dummy = (register_info_type *) NULL;
@@ -3339,8 +3343,10 @@
 
       if (p == pend)
 	{ /* End of pattern means we might have succeeded.  */
-          DEBUG_PRINT1 ("End of pattern: ");
-	  /* If not end of string, try backtracking.  Otherwise done.  */
+          DEBUG_PRINT1 ("end of pattern ... ");
+          
+	  /* If we haven't matched the entire string, and we want the
+             longest match, try backtracking.  */
           if (d != end_match_2)
 	    {
               DEBUG_PRINT1 ("backtracking.\n");
@@ -3378,6 +3384,8 @@
                      For example, the pattern `x.*y.*z' against the
                      strings `x-' and `y-z-', if the two strings are
                      not consecutive in memory.  */
+                  DEBUG_PRINT1 ("Restoring best registers.\n");
+                  
                   d = match_end;
                   dend = ((d >= string1 && d <= end1)
 		           ? end_match_1 : end_match_2);
@@ -3390,7 +3398,7 @@
                 }
             } /* d != end_match_2 */
 
-          DEBUG_PRINT1 ("\nAccepting match.\n");
+          DEBUG_PRINT1 ("Accepting match.\n");
 
           /* If caller wants register contents data back, do it.  */
           if (regs && !bufp->no_sub)
@@ -3456,7 +3464,10 @@
 	    } /* regs && !bufp->no_sub */
 
           FREE_VARIABLES ();
-          DEBUG_PRINT2 ("%d registers pushed.\n", num_regs_pushed);
+          DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
+                        nfailure_points_pushed, nfailure_points_popped,
+                        nfailure_points_pushed - nfailure_points_popped);
+          DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
 
           mcnt = d - pos - (MATCHING_IN_FIRST_STRING 
 			    ? string1 
@@ -3658,7 +3669,7 @@
           
           /* If just failed to match something this time around with a
              group that's operated on by a repetition operator, try to
-             force exit from the ``loop,'' and restore the register
+             force exit from the ``loop'', and restore the register
              information for this group that we had before trying this
              last match.  */
           if ((!MATCHED_SOMETHING (reg_info[*p])
@@ -3802,7 +3813,7 @@
 	case begline:
           DEBUG_PRINT1 ("EXECUTING begline.\n");
           
-          if (AT_STRINGS_BEG ())
+          if (AT_STRINGS_BEG (d))
             {
               if (!bufp->not_bol) break;
             }
@@ -3818,7 +3829,7 @@
 	case endline:
           DEBUG_PRINT1 ("EXECUTING endline.\n");
 
-          if (AT_STRINGS_END ())
+          if (AT_STRINGS_END (d))
             {
               if (!bufp->not_eol) break;
             }
@@ -3835,7 +3846,7 @@
 	/* Match at the very beginning of the data.  */
         case begbuf:
           DEBUG_PRINT1 ("EXECUTING begbuf.\n");
-          if (AT_STRINGS_BEG ())
+          if (AT_STRINGS_BEG (d))
             break;
           goto fail;
 
@@ -3843,7 +3854,7 @@
 	/* Match at the very end of the data.  */
         case endbuf:
           DEBUG_PRINT1 ("EXECUTING endbuf.\n");
-	  if (AT_STRINGS_END ())
+	  if (AT_STRINGS_END (d))
 	    break;
           goto fail;
 
@@ -3897,7 +3908,7 @@
              the original * applied to a group), save the information
              for that group and all inner ones, so that if we fail back
              to this point, the group's information will be correct.
-             For example, in \(a*\)*\1, we only need the preceding group,
+             For example, in \(a*\)*\1, we need the preceding group,
              and in \(\(a*\)b*\)\2, we need the inner group.  */
 
           /* We can't use `p' to check ahead because we push
@@ -3927,8 +3938,8 @@
           break;
 
 
-        /* A smart repeat ends with a maybe_pop_jump.
-	   We change it either to a pop_failure_jump or a jump.  */
+        /* A smart repeat ends with `maybe_pop_jump'.
+	   We change it to either `pop_failure_jump' or `jump'.  */
         case maybe_pop_jump:
           EXTRACT_NUMBER_AND_INCR (mcnt, p);
           DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
@@ -3956,10 +3967,21 @@
 
             /* If we're at the end of the pattern, we can change.  */
             if (p2 == pend)
-              {
+              { /* But if we're also at the end of the string, we might
+                   as well skip changing anything.  For example, in `a+'
+                   against `a', we'll have already matched the `a', and
+                   I don't see the the point of changing the opcode,
+                   popping the failure point, finding out it fails, and
+                   then going into our endgame.  */
+                if (d == dend)
+                  {
+                    p = pend;
+                    DEBUG_PRINT1 ("  End of pattern & string => done.\n");
+                    continue;
+                  }
+                
   	        p[-3] = (unsigned char) pop_failure_jump;
-                DEBUG_PRINT1
-                  ("  End of pattern: change to `pop_failure_jump'.\n");
+                DEBUG_PRINT1 ("  End of pattern => pop_failure_jump.\n");
               }
 
             else if ((re_opcode_t) *p2 == exactn
@@ -3973,7 +3995,12 @@
                    to the `maybe_finalize_jump' of this case.  Examine what 
                    follows.  */
                 if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
-		  p[-3] = (unsigned char) pop_failure_jump;
+                  {
+  		    p[-3] = (unsigned char) pop_failure_jump;
+                    DEBUG_PRINT3 ("  %c != %c => pop_failure_jump.\n",
+                                  c, p1[5]);
+                  }
+                  
 		else if ((re_opcode_t) p1[3] == charset
 			 || (re_opcode_t) p1[3] == charset_not)
 		  {
@@ -3988,9 +4015,7 @@
 		    if (!not)
                       {
   		        p[-3] = (unsigned char) pop_failure_jump;
-                        DEBUG_PRINT1
-                          ("  No match: change to `pop_failure_jump'.\n");
-
+                        DEBUG_PRINT1 ("  No match => pop_failure_jump.\n");
                       }
 		  }
 	      }
@@ -3999,6 +4024,7 @@
 	  if ((re_opcode_t) p[-1] != pop_failure_jump)
 	    {
 	      p[-1] = (unsigned char) jump;
+              DEBUG_PRINT1 ("  Match => jump.\n");
 	      goto unconditional_jump;
 	    }
         /* Note fall through.  */
@@ -4060,7 +4086,7 @@
 
 
         /* At the end of an alternative, we need to push a dummy failure
-           point in case we are followed by a pop_failure_jump', because
+           point in case we are followed by a `pop_failure_jump', because
            we don't want the failure point for the alternative to be
            popped.  For example, matching `(a|ab)*' against `aab'
            requires that we match the `ab' alternative.  */
@@ -4137,14 +4163,14 @@
 
 	case wordbeg:
           DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
-	  if (LETTER_P (d) && (AT_STRINGS_BEG () || !LETTER_P (d - 1)))
+	  if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
 	    break;
           goto fail;
 
 	case wordend:
           DEBUG_PRINT1 ("EXECUTING wordend.\n");
-	  if (!AT_STRINGS_BEG () && LETTER_P (d - 1)
-              && (!LETTER_P (d) || AT_STRINGS_END ()))
+	  if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
+              && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
 	    break;
           goto fail;
 
@@ -4181,11 +4207,12 @@
 	  goto matchsyntax;
 
         case wordchar:
-          DEBUG_PRINT1 ("EXECUTING wordchar.\n");
+          DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
 	  mcnt = (int) Sword;
         matchsyntax:
 	  PREFETCH ();
-	  if (SYNTAX (*d++) != (enum syntaxcode) mcnt) goto fail;
+	  if (SYNTAX (*d++) != (enum syntaxcode) mcnt)
+            goto fail;
           SET_REGS_MATCHED ();
 	  break;
 
@@ -4195,11 +4222,12 @@
 	  goto matchnotsyntax;
 
         case notwordchar:
-          DEBUG_PRINT1 ("EXECUTING notwordchar.\n");
+          DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
 	  mcnt = (int) Sword;
-        matchnotsyntax: /* We goto here from notsyntaxspec.  */
+        matchnotsyntax:
 	  PREFETCH ();
-	  if (SYNTAX (*d++) == (enum syntaxcode) mcnt) goto fail;
+	  if (SYNTAX (*d++) == (enum syntaxcode) mcnt)
+            goto fail;
 	  SET_REGS_MATCHED ();
           break;
 
@@ -4207,17 +4235,19 @@
 	case wordchar:
           DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
 	  PREFETCH ();
-          if (!LETTER_P (d))
+          if (!WORDCHAR_P (d))
             goto fail;
 	  SET_REGS_MATCHED ();
+          d++;
 	  break;
 	  
 	case notwordchar:
           DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
 	  PREFETCH ();
-	  if (LETTER_P (d))
+	  if (WORDCHAR_P (d))
             goto fail;
           SET_REGS_MATCHED ();
+          d++;
 	  break;
 #endif /* not emacs */
           
@@ -4812,7 +4842,7 @@
 
 
 /* Returns a message corresponding to an error code, ERRCODE, returned
-   from either regcomp or regexec.   */
+   from either regcomp or regexec.   We don't use PREG here.  */
 
 size_t
 regerror (errcode, preg, errbuf, errbuf_size)
--- a/src/regex.h	Fri Nov 20 19:33:38 1992 +0000
+++ b/src/regex.h	Sat Nov 21 01:51:33 1992 +0000
@@ -20,12 +20,15 @@
 #ifndef __REGEXP_LIBRARY_H__
 #define __REGEXP_LIBRARY_H__
 
+/* POSIX says that <sys/types.h> must be included (by the caller) before
+   <regex.h>.  */
+
 #ifdef VMS
-/* POSIX says that size_t should be in stddef.h.  */
+/* VMS doesn't have `size_t' in <sys/types.h>, even though POSIX says it
+   should be there.  */
 #include <stddef.h>
 #endif
 
-/* POSIX says that <sys/types.h> must be included before <regex.h>.  */
 
 /* The following bits are used to determine the regexp syntax we
    recognize.  The set/not-set meanings are chosen so that Emacs syntax
@@ -162,6 +165,9 @@
 #define RE_SYNTAX_POSIX_EGREP						\
   (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES)
 
+/* P1003.2/D11.2, section 4.20.7.1, lines 5078ff.  */
+#define RE_SYNTAX_ED RE_SYNTAX_POSIX_BASIC
+
 #define RE_SYNTAX_SED RE_SYNTAX_POSIX_BASIC
 
 /* Syntax bits common to both basic and extended POSIX regex syntax.  */
@@ -316,12 +322,12 @@
 #define REGS_FIXED 2
   unsigned regs_allocated : 2;
 
-        /* Set to zero when regex_compile compiles a pattern; set to one
-           by re_compile_fastmap when it updates the fastmap, if any.  */
+        /* Set to zero when `regex_compile' compiles a pattern; set to one
+           by `re_compile_fastmap' if it updates the fastmap.  */
   unsigned fastmap_accurate : 1;
 
-        /* If set, regexec reports only success or failure and does not
-           return anything in pmatch.  */
+        /* If set, `re_match_2' does not return information about
+           subexpressions.  */
   unsigned no_sub : 1;
 
         /* If set, a beginning-of-line anchor doesn't match at the
@@ -383,17 +389,17 @@
    unfortunately clutters up the declarations a bit, but I think it's
    worth it.
    
-   We also have to undo `const' if we are not ANSI and if it hasn't
-   previously being taken care of.  */
+   We may also have to undo `const' if we are not ANSI -- but if it has
+   already been defined, as by Autoconf's AC_CONST, don't do anything.  */
 
 #if __STDC__
 #define _RE_ARGS(args) args
-#else
+#else /* not __STDC__ */
 #define _RE_ARGS(args) ()
-#ifndef const
+#if !const && !HAVE_CONST
 #define const
 #endif
-#endif
+#endif /* not __STDC__ */
 
 /* Sets the current default syntax to SYNTAX, and return the old syntax.
    You can also simply assign to the `re_syntax_options' variable.  */