diff src/search.c @ 61189:91ba6c641a60

(looking_at_1): Use current_buffer->case_canon_table, not DOWNCASE_TABLE. (string_match_1): Likewise. (fast_c_string_match_ignore_case): Use Vascii_canon_table, not Vascii_downcase_table. (fast_string_match_ignore_case): Likewise. (search_buffer): Fix checking of boyer-moore usability. (boyer_moore): Calculate translate_prev_byte1/2/3 in advance. No need of tranlating characters in PAT. Fix calculation of simple_translate.
author Kenichi Handa <handa@m17n.org>
date Fri, 01 Apr 2005 01:05:46 +0000
parents 47729f2cb184
children 0015a00ccb5a 7a3341d65a12
line wrap: on
line diff
--- a/src/search.c	Fri Apr 01 01:05:29 2005 +0000
+++ b/src/search.c	Fri Apr 01 01:05:46 2005 +0000
@@ -293,7 +293,7 @@
   CHECK_STRING (string);
   bufp = compile_pattern (string, &search_regs,
 			  (!NILP (current_buffer->case_fold_search)
-			   ? DOWNCASE_TABLE : Qnil),
+			   ? current_buffer->case_canon_table : Qnil),
 			  posix,
 			  !NILP (current_buffer->enable_multibyte_characters));
 
@@ -399,7 +399,7 @@
 
   bufp = compile_pattern (regexp, &search_regs,
 			  (!NILP (current_buffer->case_fold_search)
-			   ? DOWNCASE_TABLE : Qnil),
+			   ? current_buffer->case_canon_table : Qnil),
 			  posix,
 			  STRING_MULTIBYTE (string));
   immediate_quit = 1;
@@ -499,7 +499,7 @@
   regexp = string_make_unibyte (regexp);
   re_match_object = Qt;
   bufp = compile_pattern (regexp, 0,
-			  Vascii_downcase_table, 0,
+			  Vascii_canon_table, 0,
 			  0);
   immediate_quit = 1;
   val = re_search (bufp, string, len, 0, len, 0);
@@ -516,7 +516,7 @@
   int val;
   struct re_pattern_buffer *bufp;
 
-  bufp = compile_pattern (regexp, 0, Vascii_downcase_table,
+  bufp = compile_pattern (regexp, 0, Vascii_canon_table,
 			  0, STRING_MULTIBYTE (string));
   immediate_quit = 1;
   re_match_object = string;
@@ -1175,7 +1175,9 @@
       unsigned char *patbuf;
       int multibyte = !NILP (current_buffer->enable_multibyte_characters);
       unsigned char *base_pat = SDATA (string);
-      int charset_base = -1;
+      /* Set to nozero if we find a non-ASCII char that need
+	 translation.  */
+      int charset_base = 0;
       int boyer_moore_ok = 1;
 
       /* MULTIBYTE says whether the text to be searched is multibyte.
@@ -1221,9 +1223,17 @@
       base_pat = raw_pattern;
       if (multibyte)
 	{
+	  /* Fill patbuf by translated characters in STRING while
+	     checking if we can use boyer-moore search.  If TRT is
+	     non-nil, we can use boyer-moore search only if TRT can be
+	     represented by the byte array of 256 elements.  For that,
+	     all non-ASCII case-equivalents of all case-senstive
+	     characters in STRING must belong to the same charset and
+	     row.  */
+
 	  while (--len >= 0)
 	    {
-	      unsigned char str[MAX_MULTIBYTE_LENGTH];
+	      unsigned char str_base[MAX_MULTIBYTE_LENGTH], *str;
 	      int c, translated, inverse;
 	      int in_charlen, charlen;
 
@@ -1233,50 +1243,62 @@
 	      if (RE && *base_pat == '\\')
 		{
 		  len--;
+		  raw_pattern_size--;
 		  len_byte--;
 		  base_pat++;
 		}
 
 	      c = STRING_CHAR_AND_LENGTH (base_pat, len_byte, in_charlen);
 
-	      /* Translate the character, if requested.  */
-	      TRANSLATE (translated, trt, c);
-	      /* If translation changed the byte-length, go back
-		 to the original character.  */
-	      charlen = CHAR_STRING (translated, str);
-	      if (in_charlen != charlen)
+	      if (NILP (trt))
 		{
-		  translated = c;
-		  charlen = CHAR_STRING (c, str);
+		  str = base_pat;
+		  charlen = in_charlen;
 		}
-
-	      /* If we are searching for something strange,
-		 an invalid multibyte code, don't use boyer-moore.  */
-	      if (! ASCII_BYTE_P (translated)
-		  && (charlen == 1 /* 8bit code */
-		      || charlen != in_charlen /* invalid multibyte code */
-		      ))
-		boyer_moore_ok = 0;
-
-	      TRANSLATE (inverse, inverse_trt, c);
-
-	      /* Did this char actually get translated?
-		 Would any other char get translated into it?  */
-	      if (translated != c || inverse != c)
+	      else
 		{
-		  /* Keep track of which character set row
-		     contains the characters that need translation.  */
-		  int charset_base_code = c & ~CHAR_FIELD3_MASK;
-		  int inverse_charset_base = inverse & ~CHAR_FIELD3_MASK;
-
-		  if (charset_base_code != inverse_charset_base)
-		    boyer_moore_ok = 0;
-		  else if (charset_base == -1)
-		    charset_base = charset_base_code;
-		  else if (charset_base != charset_base_code)
-		    /* If two different rows appear, needing translation,
-		       then we cannot use boyer_moore search.  */
-		    boyer_moore_ok = 0;
+		  /* Translate the character.  */
+		  TRANSLATE (translated, trt, c);
+		  charlen = CHAR_STRING (translated, str_base);
+		  str = str_base;
+
+		  /* Check if C has any other case-equivalents.  */
+		  TRANSLATE (inverse, inverse_trt, c);
+		  /* If so, check if we can use boyer-moore.  */
+		  if (c != inverse && boyer_moore_ok)
+		    {
+		      /* Check if all equivalents belong to the same
+			 charset & row.  Note that the check of C
+			 itself is done by the last iteration.  Note
+			 also that we don't have to check ASCII
+			 characters because boyer-moore search can
+			 always handle their translation.  */
+		      while (1)
+			{
+			  if (! ASCII_BYTE_P (inverse))
+			    {
+			      if (SINGLE_BYTE_CHAR_P (inverse))
+				{
+				  /* Boyer-moore search can't handle a
+				     translation of an eight-bit
+				     character.  */
+				  boyer_moore_ok = 0;
+				  break;
+				}
+			      else if (charset_base == 0)
+				charset_base = inverse & ~CHAR_FIELD3_MASK;
+			      else if ((inverse & ~CHAR_FIELD3_MASK)
+				       != charset_base)
+				{
+				  boyer_moore_ok = 0;
+				  break;
+				}
+			    }
+			  if (c == inverse)
+			    break;
+			  TRANSLATE (inverse, inverse_trt, inverse);
+			}
+		    }
 		}
 
 	      /* Store this character into the translated pattern.  */
@@ -1300,6 +1322,7 @@
 	      if (RE && *base_pat == '\\')
 		{
 		  len--;
+		  raw_pattern_size--;
 		  base_pat++;
 		}
 	      c = *base_pat++;
@@ -1533,16 +1556,18 @@
     return n;
 }
 
-/* Do Boyer-Moore search N times for the string PAT,
+/* Do Boyer-Moore search N times for the string BASE_PAT,
    whose length is LEN/LEN_BYTE,
    from buffer position POS/POS_BYTE until LIM/LIM_BYTE.
    DIRECTION says which direction we search in.
    TRT and INVERSE_TRT are translation tables.
-
-   This kind of search works if all the characters in PAT that have
-   nontrivial translation are the same aside from the last byte.  This
-   makes it possible to translate just the last byte of a character,
-   and do so after just a simple test of the context.
+   Characters in PAT are already translated by TRT.
+
+   This kind of search works if all the characters in BASE_PAT that
+   have nontrivial translation are the same aside from the last byte.
+   This makes it possible to translate just the last byte of a
+   character, and do so after just a simple test of the context.
+   CHARSET_BASE is nonzero iff there is such a non-ASCII character.
 
    If that criterion is not satisfied, do not call this function.  */
 
@@ -1569,8 +1594,13 @@
   int multibyte = ! NILP (current_buffer->enable_multibyte_characters);
 
   unsigned char simple_translate[0400];
-  int translate_prev_byte = 0;
-  int translate_anteprev_byte = 0;
+  /* These are set to the preceding bytes of a byte to be translated
+     if charset_base is nonzero.  As the maximum byte length of a
+     multibyte character is 4, we have to check at most three previous
+     bytes.  */
+  int translate_prev_byte1 = 0;
+  int translate_prev_byte2 = 0;
+  int translate_prev_byte3 = 0;
 
 #ifdef C_ALLOCA
   int BM_tab_space[0400];
@@ -1636,6 +1666,23 @@
   for (i = 0; i < 0400; i++)
     simple_translate[i] = i;
 
+  if (charset_base)
+    {
+      /* Setup translate_prev_byte1/2/3 from CHARSET_BASE.  Only a
+	 byte following them are the target of translation.  */
+      int sample_char = charset_base | 0x20;
+      unsigned char str[MAX_MULTIBYTE_LENGTH];
+      int len = CHAR_STRING (sample_char, str);
+
+      translate_prev_byte1 = str[len - 2];
+      if (len > 2)
+	{
+	  translate_prev_byte2 = str[len - 3];
+	  if (len > 3)
+	    translate_prev_byte3 = str[len - 4];
+	}
+    }
+
   i = 0;
   while (i != infinity)
     {
@@ -1645,57 +1692,37 @@
 	i = infinity;
       if (! NILP (trt))
 	{
-	  int ch;
-	  int untranslated;
-	  int this_translated = 1;
-
-	  if (multibyte
-	      /* Is *PTR the last byte of a character?  */
-	      && (pat_end - ptr == 1 || CHAR_HEAD_P (ptr[1])))
+	  /* If the byte currently looking at is a head of a character
+	     to check case-equivalents, set CH to that character.  An
+	     ASCII character and a non-ASCII character matching with
+	     CHARSET_BASE are to be checked.  */
+	  int ch = -1;
+
+	  if (ASCII_BYTE_P (*ptr) || ! multibyte)
+	    ch = *ptr;
+	  else if (charset_base && CHAR_HEAD_P (*ptr))
 	    {
-	      unsigned char *charstart = ptr;
-	      while (! CHAR_HEAD_P (*charstart))
-		charstart--;
-	      untranslated = STRING_CHAR (charstart, ptr - charstart + 1);
-	      if (charset_base == (untranslated & ~CHAR_FIELD3_MASK))
-		{
-		  TRANSLATE (ch, trt, untranslated);
-		  if (! CHAR_HEAD_P (*ptr))
-		    {
-		      translate_prev_byte = ptr[-1];
-		      if (! CHAR_HEAD_P (translate_prev_byte))
-			translate_anteprev_byte = ptr[-2];
-		    }
-		}
-	      else
-		{
-		  this_translated = 0;
-		  ch = *ptr;
-		}
+	      ch = STRING_CHAR (ptr, pat_end - ptr);
+	      if (charset_base != (ch & ~CHAR_FIELD3_MASK))
+		ch = -1;
 	    }
-	  else if (!multibyte)
-	    TRANSLATE (ch, trt, *ptr);
-	  else
-	    {
-	      ch = *ptr;
-	      this_translated = 0;
-	    }
-
-	  if (ch > 0400)
-	    j = ((unsigned char) ch) | 0200;
-	  else
-	    j = (unsigned char) ch;
-
+
+	  j = *ptr;
 	  if (i == infinity)
 	    stride_for_teases = BM_tab[j];
 
 	  BM_tab[j] = dirlen - i;
 	  /* A translation table is accompanied by its inverse -- see */
 	  /* comment following downcase_table for details */
-	  if (this_translated)
+	  if (ch >= 0)
 	    {
 	      int starting_ch = ch;
-	      int starting_j = j;
+	      int starting_j;
+
+	      if (ch > 0400)
+		starting_j = ((unsigned char) ch) | 0200;
+	      else
+		starting_j = (unsigned char) ch;
 	      while (1)
 		{
 		  TRANSLATE (ch, inverse_trt, ch);
@@ -1821,9 +1848,13 @@
 			  || ((cursor == tail_end_ptr
 			       || CHAR_HEAD_P (cursor[1]))
 			      && (CHAR_HEAD_P (cursor[0])
-				  || (translate_prev_byte == cursor[-1]
-				      && (CHAR_HEAD_P (translate_prev_byte)
-					  || translate_anteprev_byte == cursor[-2])))))
+				  /* Check if this is the last byte of
+				     a translable character.  */
+				  || (translate_prev_byte1 == cursor[-1]
+				      && (CHAR_HEAD_P (translate_prev_byte1)
+					  || (translate_prev_byte2 == cursor[-2]
+					      && (CHAR_HEAD_P (translate_prev_byte2)
+						  || (translate_prev_byte3 == cursor[-3]))))))))
 			ch = simple_translate[*cursor];
 		      else
 			ch = *cursor;
@@ -1901,9 +1932,13 @@
 		      || ((ptr == tail_end_ptr
 			   || CHAR_HEAD_P (ptr[1]))
 			  && (CHAR_HEAD_P (ptr[0])
-			      || (translate_prev_byte == ptr[-1]
-				  && (CHAR_HEAD_P (translate_prev_byte)
-				      || translate_anteprev_byte == ptr[-2])))))
+			      /* Check if this is the last byte of a
+				 translable character.  */
+			      || (translate_prev_byte1 == ptr[-1]
+				  && (CHAR_HEAD_P (translate_prev_byte1)
+				      || (translate_prev_byte2 == ptr[-2]
+					  && (CHAR_HEAD_P (translate_prev_byte2)
+					      || translate_prev_byte3 == ptr[-3])))))))
 		    ch = simple_translate[*ptr];
 		  else
 		    ch = *ptr;