changeset 73056:2e2651f3d494

(analyse_first): For eight-bit-control chars, mark both the char's value and its leading byte in the fastmap. (re_search_2): When fast-scanning without translation, be careful to check that we only match the leading byte of a multibyte char.
author Stefan Monnier <monnier@iro.umontreal.ca>
date Fri, 22 Sep 2006 17:30:13 +0000
parents e946daf4b223
children 76991373b070
files src/ChangeLog src/regex.c
diffstat 2 files changed, 54 insertions(+), 9 deletions(-) [+]
line wrap: on
line diff
--- a/src/ChangeLog	Fri Sep 22 17:30:02 2006 +0000
+++ b/src/ChangeLog	Fri Sep 22 17:30:13 2006 +0000
@@ -1,3 +1,14 @@
+2006-09-22  Stefan Monnier  <monnier@iro.umontreal.ca>
+
+	* regex.c (analyse_first): For eight-bit-control chars, mark both the
+	char's value and its leading byte in the fastmap.
+	(re_search_2): When fast-scanning without translation, be careful to
+	check that we only match the leading byte of a multibyte char.
+
+	* charset.h (PREV_CHAR_BOUNDARY): Make it work from within a char's
+	byte sequence.
+	(AT_CHAR_BOUNDARY_P): New macro.
+
 2006-09-22  Kenichi Handa  <handa@m17n.org>
 
 	* fns.c (optimize_sub_char_table): Don't optimize a sub-char-table
@@ -271,7 +282,7 @@
 
 2006-08-27  Martin Rudalics  <rudalics@gmx.at>
 
-	* xdisp.c (mouse_autoselect_window): Removed.
+	* xdisp.c (mouse_autoselect_window): Remove.
 	(Vmouse_autoselect_window): New variable.  DEFVAR_LISP it.
 
 	* dispextern.h (mouse_autoselect_window): Remove extern.
--- a/src/regex.c	Fri Sep 22 17:30:02 2006 +0000
+++ b/src/regex.c	Fri Sep 22 17:30:13 2006 +0000
@@ -3877,11 +3877,13 @@
 	  if (fastmap)
 	    {
 	      int c = RE_STRING_CHAR (p + 1, pend - p);
-
+	      /* When fast-scanning, the fastmap can be indexed either with
+		 a char (smaller than 256) or with the first byte of
+		 a char's byte sequence.  So we have to conservatively add
+		 both to the table.  */
 	      if (SINGLE_BYTE_CHAR_P (c))
 		fastmap[c] = 1;
-	      else
-		fastmap[p[1]] = 1;
+	      fastmap[p[1]] = 1;
 	    }
 	  break;
 
@@ -3899,6 +3901,10 @@
 	     So any that are not listed in the charset
 	     are possible matches, even in multibyte buffers.  */
 	  if (!fastmap) break;
+	  /* We don't need to mark LEADING_CODE_8_BIT_CONTROL specially
+	     because it will automatically be set when needed by virtue of
+	     being larger than the highest char of its charset (0xbf) but
+	     smaller than (1<<BYTEWIDTH).  */
 	  for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
 	       j < (1 << BYTEWIDTH); j++)
 	    fastmap[j] = 1;
@@ -3909,7 +3915,13 @@
 	  for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
 	       j >= 0; j--)
 	    if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
-	      fastmap[j] = 1;
+	      {
+		fastmap[j] = 1;
+#ifdef emacs
+		if (j >= 0x80 && j < 0xa0)
+		  fastmap[LEADING_CODE_8_BIT_CONTROL] = 1;
+#endif
+	      }
 
 	  if ((not && multibyte)
 	      /* Any character set can possibly contain a character
@@ -4352,11 +4364,33 @@
 		    }
 		}
 	      else
-		while (range > lim && !fastmap[*d])
+		do
 		  {
-		    d++;
-		    range--;
-		  }
+		    re_char *d_start = d;
+		    while (range > lim && !fastmap[*d])
+		      {
+			d++;
+			range--;
+		      }
+#ifdef emacs
+		    if (multibyte && range > lim)
+		      {
+			/* Check that we are at the beginning of a char.  */
+			int at_boundary;
+			AT_CHAR_BOUNDARY_P (at_boundary, d, d_start);
+			if (at_boundary)
+			  break;
+			else
+			  { /* We have matched an internal byte of a char
+			       rather than the leading byte, so it's a false
+			       positive: we should keep scanning.  */
+			    d++; range--;
+			  }
+		      }
+		    else
+#endif
+		      break;
+		  } while (1);
 
 	      startpos += irange - range;
 	    }