emacs: src/search.c comparison

comparison src/search.c @ 61189:91ba6c641a60

(looking_at_1): Use current_buffer->case_canon_table, not DOWNCASE_TABLE. (string_match_1): Likewise. (fast_c_string_match_ignore_case): Use Vascii_canon_table, not Vascii_downcase_table. (fast_string_match_ignore_case): Likewise. (search_buffer): Fix checking of boyer-moore usability. (boyer_moore): Calculate translate_prev_byte1/2/3 in advance. No need of translating characters in PAT. Fix calculation of simple_translate.

author	Kenichi Handa <handa@m17n.org>
date	Fri, 01 Apr 2005 01:05:46 +0000
parents	47729f2cb184
children	0015a00ccb5a 7a3341d65a12

comparison

equal deleted inserted replaced

-:a96e8bdd2b37
+:91ba6c641a60
 save_search_regs ();
 CHECK_STRING (string);
 bufp = compile_pattern (string, &search_regs,
 			  (!NILP (current_buffer->case_fold_search)
-			   ? DOWNCASE_TABLE : Qnil),
+			   ? current_buffer->case_canon_table : Qnil),
 			  posix,
 			  !NILP (current_buffer->enable_multibyte_characters));
 immediate_quit = 1;
 QUIT;			/* Do a pending quit right away, to avoid paradoxical behavior */
 pos_byte = string_char_to_byte (string, pos);
 }
 bufp = compile_pattern (regexp, &search_regs,
 			  (!NILP (current_buffer->case_fold_search)
-			   ? DOWNCASE_TABLE : Qnil),
+			   ? current_buffer->case_canon_table : Qnil),
 			  posix,
 			  STRING_MULTIBYTE (string));
 immediate_quit = 1;
 re_match_object = string;
 int len = strlen (string);
 regexp = string_make_unibyte (regexp);
 re_match_object = Qt;
 bufp = compile_pattern (regexp, 0,
-			  Vascii_downcase_table, 0,
+			  Vascii_canon_table, 0,
 			  0);
 immediate_quit = 1;
 val = re_search (bufp, string, len, 0, len, 0);
 immediate_quit = 0;
 return val;
 Lisp_Object regexp, string;
 {
 int val;
 struct re_pattern_buffer *bufp;
-bufp = compile_pattern (regexp, 0, Vascii_downcase_table,
+bufp = compile_pattern (regexp, 0, Vascii_canon_table,
 			  0, STRING_MULTIBYTE (string));
 immediate_quit = 1;
 re_match_object = string;
 val = re_search (bufp, (char *) SDATA (string),
 int raw_pattern_size;
 int raw_pattern_size_byte;
 unsigned char *patbuf;
 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
 unsigned char *base_pat = SDATA (string);
-int charset_base = -1;
+/* Set to nonzero if we find a non-ASCII char that needs
+	 translation.  */
+int charset_base = 0;
 int boyer_moore_ok = 1;
 /* MULTIBYTE says whether the text to be searched is multibyte.
 	 We must convert PATTERN to match that, or we will not really
 	 find things right.  */
 patbuf = (unsigned char *) alloca (len_byte);
 pat = patbuf;
 base_pat = raw_pattern;
 if (multibyte)
 	{
+	  /* Fill patbuf by translated characters in STRING while
+	     checking if we can use boyer-moore search.  If TRT is
+	     non-nil, we can use boyer-moore search only if TRT can be
+	     represented by the byte array of 256 elements.  For that,
+	     all non-ASCII case-equivalents of all case-sensitive
+	     characters in STRING must belong to the same charset and
+	     row.  */
 	  while (--len >= 0)
 	    {
-	      unsigned char str[MAX_MULTIBYTE_LENGTH];
+	      unsigned char str_base[MAX_MULTIBYTE_LENGTH], *str;
 	      int c, translated, inverse;
 	      int in_charlen, charlen;
 	      /* If we got here and the RE flag is set, it's because we're
 		 dealing with a regexp known to be trivial, so the backslash
 		 just quotes the next character.  */
 	      if (RE && *base_pat == '\\')
 		{
 		  len--;
+		  raw_pattern_size--;
 		  len_byte--;
 		  base_pat++;
 		}
 	      c = STRING_CHAR_AND_LENGTH (base_pat, len_byte, in_charlen);
-	      /* Translate the character, if requested.  */
+	      if (NILP (trt))
-	      TRANSLATE (translated, trt, c);
-	      /* If translation changed the byte-length, go back
-		 to the original character.  */
-	      charlen = CHAR_STRING (translated, str);
-	      if (in_charlen != charlen)
 		{
-		  translated = c;
+		  str = base_pat;
-		  charlen = CHAR_STRING (c, str);
+		  charlen = in_charlen;
 		}
+	      else
-	      /* If we are searching for something strange,
-		 an invalid multibyte code, don't use boyer-moore.  */
-	      if (! ASCII_BYTE_P (translated)
-		  && (charlen == 1 /* 8bit code */
-		      || charlen != in_charlen /* invalid multibyte code */
-		      ))
-		boyer_moore_ok = 0;
-	      TRANSLATE (inverse, inverse_trt, c);
-	      /* Did this char actually get translated?
-		 Would any other char get translated into it?  */
-	      if (translated != c || inverse != c)
 		{
-		  /* Keep track of which character set row
+		  /* Translate the character.  */
-		     contains the characters that need translation.  */
+		  TRANSLATE (translated, trt, c);
-		  int charset_base_code = c & ~CHAR_FIELD3_MASK;
+		  charlen = CHAR_STRING (translated, str_base);
-		  int inverse_charset_base = inverse & ~CHAR_FIELD3_MASK;
+		  str = str_base;
-		  if (charset_base_code != inverse_charset_base)
+		  /* Check if C has any other case-equivalents.  */
-		    boyer_moore_ok = 0;
+		  TRANSLATE (inverse, inverse_trt, c);
-		  else if (charset_base == -1)
+		  /* If so, check if we can use boyer-moore.  */
-		    charset_base = charset_base_code;
+		  if (c != inverse && boyer_moore_ok)
-		  else if (charset_base != charset_base_code)
+		    {
-		    /* If two different rows appear, needing translation,
+		      /* Check if all equivalents belong to the same
-		       then we cannot use boyer_moore search.  */
+			 charset & row.  Note that the check of C
-		    boyer_moore_ok = 0;
+			 itself is done by the last iteration.  Note
+			 also that we don't have to check ASCII
+			 characters because boyer-moore search can
+			 always handle their translation.  */
+		      while (1)
+			{
+			  if (! ASCII_BYTE_P (inverse))
+			    {
+			      if (SINGLE_BYTE_CHAR_P (inverse))
+				{
+				  /* Boyer-moore search can't handle a
+				     translation of an eight-bit
+				     character.  */
+				  boyer_moore_ok = 0;
+				  break;
+				}
+			      else if (charset_base == 0)
+				charset_base = inverse & ~CHAR_FIELD3_MASK;
+			      else if ((inverse & ~CHAR_FIELD3_MASK)
+				       != charset_base)
+				{
+				  boyer_moore_ok = 0;
+				  break;
+				}
+			    }
+			  if (c == inverse)
+			    break;
+			  TRANSLATE (inverse, inverse_trt, inverse);
+			}
+		    }
 		}
 	      /* Store this character into the translated pattern.  */
 	      bcopy (str, pat, charlen);
 	      pat += charlen;
 		 dealing with a regexp known to be trivial, so the backslash
 		 just quotes the next character.  */
 	      if (RE && *base_pat == '\\')
 		{
 		  len--;
+		  raw_pattern_size--;
 		  base_pat++;
 		}
 	      c = *base_pat++;
 	      TRANSLATE (translated, trt, c);
 	      *pat++ = translated;
 return -n;
 else
 return n;
 }
-/* Do Boyer-Moore search N times for the string PAT,
+/* Do Boyer-Moore search N times for the string BASE_PAT,
 whose length is LEN/LEN_BYTE,
 from buffer position POS/POS_BYTE until LIM/LIM_BYTE.
 DIRECTION says which direction we search in.
 TRT and INVERSE_TRT are translation tables.
+Characters in PAT are already translated by TRT.
-This kind of search works if all the characters in PAT that have
-nontrivial translation are the same aside from the last byte.  This
+This kind of search works if all the characters in BASE_PAT that
-makes it possible to translate just the last byte of a character,
+have nontrivial translation are the same aside from the last byte.
-and do so after just a simple test of the context.
+This makes it possible to translate just the last byte of a
+character, and do so after just a simple test of the context.
+CHARSET_BASE is nonzero iff there is such a non-ASCII character.
 If that criterion is not satisfied, do not call this function.  */
 static int
 boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
 register int i, j;
 unsigned char *pat, *pat_end;
 int multibyte = ! NILP (current_buffer->enable_multibyte_characters);
 unsigned char simple_translate[0400];
-int translate_prev_byte = 0;
+/* These are set to the preceding bytes of a byte to be translated
-int translate_anteprev_byte = 0;
+if charset_base is nonzero.  As the maximum byte length of a
+multibyte character is 4, we have to check at most three previous
+bytes.  */
+int translate_prev_byte1 = 0;
+int translate_prev_byte2 = 0;
+int translate_prev_byte3 = 0;
 #ifdef C_ALLOCA
 int BM_tab_space[0400];
 BM_tab = &BM_tab_space[0];
 #else
 occur in the pattern.  Others don't matter anyway!  */
 bzero (simple_translate, sizeof simple_translate);
 for (i = 0; i < 0400; i++)
 simple_translate[i] = i;
+if (charset_base)
+{
+/* Set up translate_prev_byte1/2/3 from CHARSET_BASE.  Only a
+	 byte following them is the target of translation.  */
+int sample_char = charset_base | 0x20;
+unsigned char str[MAX_MULTIBYTE_LENGTH];
+int len = CHAR_STRING (sample_char, str);
+translate_prev_byte1 = str[len - 2];
+if (len > 2)
+	{
+	  translate_prev_byte2 = str[len - 3];
+	  if (len > 3)
+	    translate_prev_byte3 = str[len - 4];
+	}
+}
 i = 0;
 while (i != infinity)
 {
 unsigned char *ptr = base_pat + i;
 i += direction;
 if (i == dirlen)
 	i = infinity;
 if (! NILP (trt))
 	{
-	  int ch;
+	  /* If the byte currently being looked at is the head of a character
-	  int untranslated;
+	     to check case-equivalents, set CH to that character.  An
-	  int this_translated = 1;
+	     ASCII character and a non-ASCII character matching with
+	     CHARSET_BASE are to be checked.  */
-	  if (multibyte
+	  int ch = -1;
-	      /* Is *PTR the last byte of a character?  */
-	      && (pat_end - ptr == 1 || CHAR_HEAD_P (ptr[1])))
+	  if (ASCII_BYTE_P (*ptr) || ! multibyte)
+	    ch = *ptr;
+	  else if (charset_base && CHAR_HEAD_P (*ptr))
 	    {
-	      unsigned char *charstart = ptr;
+	      ch = STRING_CHAR (ptr, pat_end - ptr);
-	      while (! CHAR_HEAD_P (*charstart))
+	      if (charset_base != (ch & ~CHAR_FIELD3_MASK))
-		charstart--;
+		ch = -1;
-	      untranslated = STRING_CHAR (charstart, ptr - charstart + 1);
-	      if (charset_base == (untranslated & ~CHAR_FIELD3_MASK))
-		{
-		  TRANSLATE (ch, trt, untranslated);
-		  if (! CHAR_HEAD_P (*ptr))
-		    {
-		      translate_prev_byte = ptr[-1];
-		      if (! CHAR_HEAD_P (translate_prev_byte))
-			translate_anteprev_byte = ptr[-2];
-		    }
-		}
-	      else
-		{
-		  this_translated = 0;
-		  ch = *ptr;
-		}
 	    }
-	  else if (!multibyte)
-	    TRANSLATE (ch, trt, *ptr);
+	  j = *ptr;
-	  else
-	    {
-	      ch = *ptr;
-	      this_translated = 0;
-	    }
-	  if (ch > 0400)
-	    j = ((unsigned char) ch) | 0200;
-	  else
-	    j = (unsigned char) ch;
 	  if (i == infinity)
 	    stride_for_teases = BM_tab[j];
 	  BM_tab[j] = dirlen - i;
 	  /* A translation table is accompanied by its inverse -- see */
 	  /* comment following downcase_table for details */
-	  if (this_translated)
+	  if (ch >= 0)
 	    {
 	      int starting_ch = ch;
-	      int starting_j = j;
+	      int starting_j;
+	      if (ch > 0400)
+		starting_j = ((unsigned char) ch) | 0200;
+	      else
+		starting_j = (unsigned char) ch;
 	      while (1)
 		{
 		  TRANSLATE (ch, inverse_trt, ch);
 		  if (ch > 0400)
 		    j = ((unsigned char) ch) | 0200;
 		      /* Translate only the last byte of a character.  */
 		      if (! multibyte
 			  || ((cursor == tail_end_ptr
 			       || CHAR_HEAD_P (cursor[1]))
 			      && (CHAR_HEAD_P (cursor[0])
-				  || (translate_prev_byte == cursor[-1]
+				  /* Check if this is the last byte of
-				      && (CHAR_HEAD_P (translate_prev_byte)
+				     a translatable character.  */
-					  || translate_anteprev_byte == cursor[-2])))))
+				  || (translate_prev_byte1 == cursor[-1]
+				      && (CHAR_HEAD_P (translate_prev_byte1)
+					  || (translate_prev_byte2 == cursor[-2]
+					      && (CHAR_HEAD_P (translate_prev_byte2)
+						  || (translate_prev_byte3 == cursor[-3]))))))))
 			ch = simple_translate[*cursor];
 		      else
 			ch = *cursor;
 		      if (pat[i] != ch)
 			break;
 		  /* Translate only the last byte of a character.  */
 		  if (! multibyte
 		      || ((ptr == tail_end_ptr
 			   || CHAR_HEAD_P (ptr[1]))
 			  && (CHAR_HEAD_P (ptr[0])
-			      || (translate_prev_byte == ptr[-1]
+			      /* Check if this is the last byte of a
-				  && (CHAR_HEAD_P (translate_prev_byte)
+				 translatable character.  */
-				      || translate_anteprev_byte == ptr[-2])))))
+			      || (translate_prev_byte1 == ptr[-1]
+				  && (CHAR_HEAD_P (translate_prev_byte1)
+				      || (translate_prev_byte2 == ptr[-2]
+					  && (CHAR_HEAD_P (translate_prev_byte2)
+					      || translate_prev_byte3 == ptr[-3])))))))
 		    ch = simple_translate[*ptr];
 		  else
 		    ch = *ptr;
 		  if (pat[i] != ch)
 		    break;

Mercurial > emacs

comparison src/search.c @ 61189:91ba6c641a60