changeset 21348:64590f10c605

(compile_range): Unused function deleted. (regex_compile): Special handling for range \177-\377. (regex_compile): Cast args to TRANSLATE to unsigned char. (re_search_2): Fix forward scan handling multibyte. Recognize that nonascii characters are not in the fastmap. Handle fetching multibyte characters for backward scan. (re_match_2_internal): Handle multibyte and translation in exactn and anychar. (bcmp_translate): Handle multibyte chars for translation. (TRANSLATE): Don't cast to unsigned char. (PATFETCH): Use RE_TRANSLATE to translate.
author Richard M. Stallman <rms@gnu.org>
date Fri, 03 Apr 1998 07:33:13 +0000
parents aca24aa07fb4
children ede1a6e9e122
files src/regex.c
diffstat 1 files changed, 137 insertions(+), 101 deletions(-) [+]
line wrap: on
line diff
--- a/src/regex.c	Fri Apr 03 05:34:10 1998 +0000
+++ b/src/regex.c	Fri Apr 03 07:33:13 1998 +0000
@@ -168,7 +168,7 @@
 
 #define SYNTAX(c) re_syntax_table[c]
 
-/* Dummy macro for non emacs environments.  */
+/* Dummy macros for non-Emacs environments.  */
 #define BASE_LEADING_CODE_P(c) (0)
 #define WORD_BOUNDARY_P(c1, c2) (0)
 #define CHAR_HEAD_P(p) (1)
@@ -1539,7 +1539,7 @@
 #define PATFETCH(c)							\
   do {if (p == pend) return REG_EEND;					\
     c = (unsigned char) *p++;						\
-    if (translate) c = (unsigned char) translate[c];			\
+    if (translate) c = RE_TRANSLATE (translate, c);			\
   } while (0)
 #endif
 
@@ -1560,7 +1560,7 @@
    when we use a character as a subscript we must make it unsigned.  */
 #ifndef TRANSLATE
 #define TRANSLATE(d) \
-  (translate ? (unsigned char) RE_TRANSLATE (translate, (unsigned char) (d)) : (d))
+  (translate ? (unsigned) RE_TRANSLATE (translate, (unsigned) (d)) : (d))
 #endif
 
 
@@ -2107,9 +2107,10 @@
 		   incremented `p', by the way, to be the character after
 		   the `*'.  Do we have to do something analogous here
 		   for null bytes, because of RE_DOT_NOT_NULL?	*/
-		if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
+		if (TRANSLATE ((unsigned char)*(p - 2)) == TRANSLATE ('.')
 		    && zero_times_ok
-		    && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
+		    && p < pend
+		    && TRANSLATE ((unsigned char)*p) == TRANSLATE ('\n')
 		    && !(syntax & RE_DOT_NEWLINE))
 		  { /* We have .*\n.  */
 		    STORE_JUMP (jump, b, laststart);
@@ -2333,7 +2334,18 @@
 			p += len;
 		      }
 
-		    if (!SAME_CHARSET_P (c, c1))
+		    if (SINGLE_BYTE_CHAR_P (c)
+			&& ! SINGLE_BYTE_CHAR_P (c1))
+		      {
+			/* Handle a range such as \177-\377 in multibyte mode.
+			   Split that into two ranges,
+			   the low one ending at 0237, and the high one
+			   starting at ...040.  */
+			int c1_base = (c1 & ~0177) | 040;
+			SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1);
+			c1 = 0237;
+		      }
+		    else if (!SAME_CHARSET_P (c, c1))
 		      FREE_STACK_RETURN (REG_ERANGE);
 		  }
 		else
@@ -2359,8 +2371,8 @@
 			for (this_char = range_start; this_char <= range_end;
 			     this_char++)
 			  SET_LIST_BIT (TRANSLATE (this_char));
+		      }
 		  }
-	      }
 		else
 		  /* ... into range table.  */
 		  SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1);
@@ -2913,8 +2925,8 @@
 	  /* Here, C may have been translated, so C may not be equal to *P1.  */
 	  while (1)
 	    {
-	  BUF_PUSH (c);
-	  (*pending_exact)++;
+	      BUF_PUSH (c);
+	      (*pending_exact)++;
 	      if (++p1 == p)
 		break;
 
@@ -3121,64 +3133,6 @@
 
   return false;
 }
-
-
-/* Read the ending character of a range (in a bracket expression) from the
-   uncompiled pattern *P_PTR (which ends at PEND).  We assume the
-   starting character is in `P[-2]'.  (`P[-1]' is the character `-'.)
-   Then we set the translation of all bits between the starting and
-   ending characters (inclusive) in the compiled pattern B.
-
-   Return an error code.
-
-   We use these short variable names so we can use the same macros as
-   `regex_compile' itself.  */
-
-static reg_errcode_t
-compile_range (p_ptr, pend, translate, syntax, b)
-    const char **p_ptr, *pend;
-    RE_TRANSLATE_TYPE translate;
-    reg_syntax_t syntax;
-    unsigned char *b;
-{
-  unsigned this_char;
-
-  const char *p = *p_ptr;
-  int range_start, range_end;
-
-  if (p == pend)
-    return REG_ERANGE;
-
-  /* Even though the pattern is a signed `char *', we need to fetch
-     with unsigned char *'s; if the high bit of the pattern character
-     is set, the range endpoints will be negative if we fetch using a
-     signed char *.
-
-     We also want to fetch the endpoints without translating them; the
-     appropriate translation is done in the bit-setting loop below.  */
-  /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *.  */
-  range_start = ((const unsigned char *) p)[-2];
-  range_end   = ((const unsigned char *) p)[0];
-
-  /* Have to increment the pointer into the pattern string, so the
-     caller isn't still at the ending character.  */
-  (*p_ptr)++;
-
-  /* If the start is after the end, the range is empty.	 */
-  if (range_start > range_end)
-    return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
-
-  /* Here we see why `this_char' has to be larger than an `unsigned
-     char' -- the range is inclusive, so if `range_end' == 0xff
-     (assuming 8-bit characters), we would otherwise go into an infinite
-     loop, since all characters <= 0xff.  */
-  for (this_char = range_start; this_char <= range_end; this_char++)
-    {
-      SET_LIST_BIT (TRANSLATE (this_char));
-    }
-
-  return REG_NOERROR;
-}
 
 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
    BUFP.  A fastmap records which of the (1 << BYTEWIDTH) possible
@@ -3812,24 +3766,45 @@
 	 the first null string.	 */
       if (fastmap && startpos < total_size && !bufp->can_be_null)
 	{
+	  register const char *d;
+	  register unsigned int buf_ch;
+
+	  d = POS_ADDR_VSTRING (startpos);
+
 	  if (range > 0)	/* Searching forwards.	*/
 	    {
-	      register const char *d;
 	      register int lim = 0;
 	      int irange = range;
 
 	      if (startpos < size1 && startpos + range >= size1)
 		lim = range - (size1 - startpos);
 
-	      d = POS_ADDR_VSTRING (startpos);
-
 	      /* Written out as an if-else to avoid testing `translate'
 		 inside the loop.  */
 	      if (translate)
-		while (range > lim
-		       && !fastmap[(unsigned char)
-				   RE_TRANSLATE (translate, (unsigned char) *d++)])
-		  range--;
+		{
+		  if (multibyte)
+		    while (range > lim)
+		      {
+			int buf_charlen;
+
+			buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim,
+							 buf_charlen);
+
+			buf_ch = RE_TRANSLATE (translate, buf_ch);
+			if (buf_ch >= 0400
+			    || fastmap[buf_ch])
+			  break;
+
+			range -= buf_charlen;
+			d += buf_charlen;
+		      }
+		  else
+		    while (range > lim
+			   && !fastmap[(unsigned char)
+				       RE_TRANSLATE (translate, (unsigned char) *d++)])
+		      range--;
+		}
 	      else
 		while (range > lim && !fastmap[(unsigned char) *d++])
 		  range--;
@@ -3838,11 +3813,16 @@
 	    }
 	  else				/* Searching backwards.	 */
 	    {
-	      register char c = (size1 == 0 || startpos >= size1
-				 ? string2[startpos - size1]
-				 : string1[startpos]);
-
-	      if (!fastmap[(unsigned char) TRANSLATE (c)])
+	      int room = (size1 == 0 || startpos >= size1
+			  ? size2 + size1 - startpos
+			  : size1 - startpos);
+
+	      buf_ch = STRING_CHAR (d, room);
+	      if (translate)
+		buf_ch = RE_TRANSLATE (translate, buf_ch);
+
+	      if (! (buf_ch >= 0400
+		     || fastmap[buf_ch]))
 		goto advance;
 	    }
 	}
@@ -4515,14 +4495,36 @@
 	     testing `translate' inside the loop.  */
 	  if (translate)
 	    {
-	      do
-		{
-		  PREFETCH ();
-		  if ((unsigned char) RE_TRANSLATE (translate, (unsigned char) *d++)
-		      != (unsigned char) *p++)
-		    goto fail;
-		}
-	      while (--mcnt);
+#ifdef emacs
+	      if (multibyte)
+		do
+		  {
+		    int pat_charlen, buf_charlen;
+		    int pat_ch, buf_ch;
+
+		    PREFETCH ();
+		    pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
+		    buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
+
+		    if (RE_TRANSLATE (translate, buf_ch)
+			!= pat_ch)
+		      goto fail;
+
+		    p += pat_charlen;
+		    d += buf_charlen;
+		    mcnt -= pat_charlen;
+		  }
+		while (mcnt > 0);
+	      else
+#endif /* emacs */
+		do
+		  {
+		    PREFETCH ();
+		    if ((unsigned char) RE_TRANSLATE (translate, (unsigned char) *d++)
+			!= (unsigned char) *p++)
+		      goto fail;
+		  }
+		while (--mcnt);
 	    }
 	  else
 	    {
@@ -4539,17 +4541,36 @@
 
 	/* Match any character except possibly a newline or a null.  */
 	case anychar:
-	  DEBUG_PRINT1 ("EXECUTING anychar.\n");
-
-	  PREFETCH ();
-
-	  if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
-	      || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
-	    goto fail;
-
-	  SET_REGS_MATCHED ();
-	  DEBUG_PRINT2 ("  Matched `%d'.\n", *d);
-	  d += multibyte ? MULTIBYTE_FORM_LENGTH (d, dend - d) : 1;
+	  {
+	    int buf_charlen;
+	    int buf_ch;
+
+	    DEBUG_PRINT1 ("EXECUTING anychar.\n");
+
+	    PREFETCH ();
+
+#ifdef emacs
+	    if (multibyte)
+	      buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
+	    else
+#endif /* emacs */
+	      {
+		buf_ch = *d;
+		buf_charlen = 1;
+	      }
+
+	    buf_ch = TRANSLATE (buf_ch);
+
+	    if ((!(bufp->syntax & RE_DOT_NEWLINE)
+		 && buf_ch == '\n')
+		|| ((bufp->syntax & RE_DOT_NOT_NULL)
+		    && buf_ch == '\000'))
+	      goto fail;
+
+	    SET_REGS_MATCHED ();
+	    DEBUG_PRINT2 ("  Matched `%d'.\n", *d);
+	    d += buf_charlen;
+	  }
 	  break;
 
 
@@ -5926,12 +5947,27 @@
      RE_TRANSLATE_TYPE translate;
 {
   register unsigned char *p1 = s1, *p2 = s2;
-  while (len)
+  unsigned char *p1_end = s1 + len;
+  unsigned char *p2_end = s2 + len;
+
+  while (p1 != p1_end && p2 != p2_end)
     {
-      if (RE_TRANSLATE (translate, *p1++) != RE_TRANSLATE (translate, *p2++))
+      int p1_charlen, p2_charlen;
+      int p1_ch, p2_ch;
+
+      p1_ch = STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen);
+      p2_ch = STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen);
+
+      if (RE_TRANSLATE (translate, p1_ch)
+	  != RE_TRANSLATE (translate, p2_ch))
 	return 1;
-      len--;
+
+      p1 += p1_charlen, p2 += p2_charlen;
     }
+
+  if (p1 != p1_end || p2 != p2_end)
+    return 1;
+
   return 0;
 }