changeset 90245:7dd79b4dd8a5

(search_buffer): Give up BM search on case-fold-search if one of the target characters has a case-equivalent of a different byte length, even if that target character is ASCII. (simple_search): Fix calculation of byte length of matched text. (boyer_moore): Fix handling of case-equivalent multibyte characters.
author Kenichi Handa <handa@m17n.org>
date Mon, 17 Oct 2005 04:15:51 +0000
parents bf08376ae834
children b45f998a92aa
files src/search.c
diffstat 1 files changed, 63 insertions(+), 42 deletions(-) [+]
line wrap: on
line diff
--- a/src/search.c	Sat Oct 15 02:09:24 2005 +0000
+++ b/src/search.c	Mon Oct 17 04:15:51 2005 +0000
@@ -1141,9 +1141,9 @@
       unsigned char *patbuf;
       int multibyte = !NILP (current_buffer->enable_multibyte_characters);
       unsigned char *base_pat = SDATA (string);
-      /* Set to nozero if we find a non-ASCII char that need
-	 translation.  */
-      int char_base = 0;
+      /* Set to positive if we find a non-ASCII char that need
+	 translation.  Otherwise set to zero later.  */
+      int char_base = -1;
       int boyer_moore_ok = 1;
 
       /* MULTIBYTE says whether the text to be searched is multibyte.
@@ -1234,37 +1234,46 @@
 		    {
 		      /* Check if all equivalents belong to the same
 			 group of characters.  Note that the check of C
-			 itself is done by the last iteration.  Note
-			 also that we don't have to check ASCII
-			 characters because boyer-moore search can
-			 always handle their translation.  */
-		      while (1)
+			 itself is done by the last iteration.  */
+		      int this_char_base = -1;
+
+		      while (boyer_moore_ok)
 			{
-			  if (! ASCII_BYTE_P (inverse))
+			  if (ASCII_BYTE_P (inverse))
 			    {
-			      if (CHAR_BYTE8_P (inverse))
+			      if (this_char_base > 0)
+				boyer_moore_ok = 0;
+			      else
 				{
-				  /* Boyer-moore search can't handle a
-				     translation of an eight-bit
-				     character.  */
-				  boyer_moore_ok = 0;
-				  break;
-				}
-			      else if (char_base == 0)
-				char_base = inverse & ~0x3F;
-			      else if ((inverse & ~0x3F)
-				       != char_base)
-				{
-				  boyer_moore_ok = 0;
-				  break;
+				  this_char_base = 0;
+				  if (char_base < 0)
+				    char_base = this_char_base;
 				}
 			    }
+			  else if (CHAR_BYTE8_P (inverse))
+			    /* Boyer-moore search can't handle a
+			       translation of an eight-bit
+			       character.  */
+			    boyer_moore_ok = 0;
+			  else if (this_char_base < 0)
+			    {
+			      this_char_base = inverse & ~0x3F;
+			      if (char_base < 0)
+				char_base = this_char_base;
+			      else if (char_base > 0
+				       && this_char_base != char_base)
+				boyer_moore_ok = 0;
+			    }
+			  else if ((inverse & ~0x3F) != this_char_base)
+			    boyer_moore_ok = 0;
 			  if (c == inverse)
 			    break;
 			  TRANSLATE (inverse, inverse_trt, inverse);
 			}
 		    }
 		}
+	      if (char_base < 0)
+		char_base = 0;
 
 	      /* Store this character into the translated pattern.  */
 	      bcopy (str, pat, charlen);
@@ -1333,6 +1342,9 @@
 {
   int multibyte = ! NILP (current_buffer->enable_multibyte_characters);
   int forward = n > 0;
+  /* Number of buffer bytes matched.  Note that this may be different
+     from len_byte in a multibyte buffer.  */
+  int match_byte;
 
   if (lim > pos && multibyte)
     while (n > 0)
@@ -1372,8 +1384,9 @@
 
 	    if (this_len == 0)
 	      {
+		match_byte = this_pos_byte - pos_byte;
 		pos += len;
-		pos_byte += len_byte;
+		pos_byte += match_byte;
 		break;
 	      }
 
@@ -1410,6 +1423,7 @@
 
 	    if (this_len == 0)
 	      {
+		match_byte = len;
 		pos += len;
 		break;
 	      }
@@ -1435,6 +1449,7 @@
 	    if (pos - len < lim)
 	      goto stop;
 	    this_pos_byte = CHAR_TO_BYTE (this_pos);
+	    match_byte = pos_byte - this_pos_byte;
 
 	    while (this_len > 0)
 	      {
@@ -1460,7 +1475,7 @@
 	    if (this_len == 0)
 	      {
 		pos -= len;
-		pos_byte -= len_byte;
+		pos_byte -= match_byte;
 		break;
 	      }
 
@@ -1496,6 +1511,7 @@
 
 	    if (this_len == 0)
 	      {
+		match_byte = len;
 		pos -= len;
 		break;
 	      }
@@ -1510,9 +1526,9 @@
   if (n == 0)
     {
       if (forward)
-	set_search_regs ((multibyte ? pos_byte : pos) - len_byte, len_byte);
+	set_search_regs ((multibyte ? pos_byte : pos) - match_byte, match_byte);
       else
-	set_search_regs (multibyte ? pos_byte : pos, len_byte);
+	set_search_regs (multibyte ? pos_byte : pos, match_byte);
 
       return pos;
     }
@@ -1561,7 +1577,7 @@
 
   unsigned char simple_translate[0400];
   /* These are set to the preceding bytes of a byte to be translated
-     if charset_base is nonzero.  As the maximum byte length of a
+     if char_base is nonzero.  As the maximum byte length of a
      multibyte character is 5, we have to check at most four previous
      bytes.  */
   int translate_prev_byte1 = 0;
@@ -1662,22 +1678,31 @@
 	i = infinity;
       if (! NILP (trt))
 	{
-	  /* If the byte currently looking at is a head of a character
-	     to check case-equivalents, set CH to that character.  An
-	     ASCII character and a non-ASCII character matching with
-	     CHAR_BASE are to be checked.  */
+	  /* If the byte currently looking at is the last of a
+	     character to check case-equivalents, set CH to that
+	     character.  An ASCII character and a non-ASCII character
+	     matching with CHAR_BASE are to be checked.  */
 	  int ch = -1;
 
 	  if (ASCII_BYTE_P (*ptr) || ! multibyte)
 	    ch = *ptr;
-	  else if (char_base && CHAR_HEAD_P (*ptr))
+	  else if (char_base
+		   && (pat_end - ptr) == 1 || CHAR_HEAD_P (ptr[1]))
 	    {
-	      ch = STRING_CHAR (ptr, pat_end - ptr);
+	      unsigned char *charstart = ptr - 1;
+
+	      while (! (CHAR_HEAD_P (*charstart)))
+		charstart--;
+	      ch = STRING_CHAR (charstart, ptr - charstart + 1);
 	      if (char_base != (ch & ~0x3F))
 		ch = -1;
 	    }
 
-	  j = *ptr;
+	  if (ch > 0400)
+	    j = (ch & 0x3F) | 0200;
+	  else
+	    j = *ptr;
+
 	  if (i == infinity)
 	    stride_for_teases = BM_tab[j];
 
@@ -1687,17 +1712,13 @@
 	  if (ch >= 0)
 	    {
 	      int starting_ch = ch;
-	      int starting_j;
-
-	      if (ch > 0400)
-		starting_j = (ch & ~0x3F) | 0200;
-	      else
-		starting_j = ch;
+	      int starting_j = j;
+
 	      while (1)
 		{
 		  TRANSLATE (ch, inverse_trt, ch);
 		  if (ch > 0400)
-		    j = (ch & ~0x3F) | 0200;
+		    j = (ch & 0x3F) | 0200;
 		  else
 		    j = ch;