changeset 17053:df904355f033

Include category.h and charset.h. (compile_pattern_1): Handle new argument `multibyte'. (compile_pattern): Handle the flag `enable-multibyte-characters'. (Vascii_downcase_table): Declare external. (fast_string_match_ignore_case): New function. (skip_chars): Handle multibyte characters. (trivial_regexp_p): Handle regular expression "\\Cc" and "\\CC" for category.
author Karl Heuer <kwzh@gnu.org>
date Thu, 20 Feb 1997 07:26:24 +0000
parents d0d7b244b1d0
children 84414e5f194d
files src/search.c
diffstat 1 files changed, 153 insertions(+), 31 deletions(-) [+]
line wrap: on
line diff
--- a/src/search.c	Thu Feb 20 07:02:49 1997 +0000
+++ b/src/search.c	Thu Feb 20 07:26:24 1997 +0000
@@ -22,7 +22,9 @@
 #include <config.h>
 #include "lisp.h"
 #include "syntax.h"
+#include "category.h"
 #include "buffer.h"
+#include "charset.h"
 #include "region-cache.h"
 #include "commands.h"
 #include "blockinput.h"
@@ -105,15 +107,19 @@
    If it is 0, we should compile the pattern not to record any
    subexpression bounds.
    POSIX is nonzero if we want full backtracking (POSIX style)
-   for this pattern.  0 means backtrack only enough to get a valid match.  */
+   for this pattern.  0 means backtrack only enough to get a valid match.
+   MULTIBYTE is nonzero if we want to handle multibyte characters in
+   PATTERN.  0 means all multibyte characters are recognized just as
+   sequences of binary data.  */
 
 static void
-compile_pattern_1 (cp, pattern, translate, regp, posix)
+compile_pattern_1 (cp, pattern, translate, regp, posix, multibyte)
      struct regexp_cache *cp;
      Lisp_Object pattern;
      Lisp_Object *translate;
      struct re_registers *regp;
      int posix;
+     int multibyte;
 {
   CONST char *val;
   reg_syntax_t old;
@@ -121,6 +127,7 @@
   cp->regexp = Qnil;
   cp->buf.translate = translate;
   cp->posix = posix;
+  cp->buf.multibyte = multibyte;
   BLOCK_INPUT;
   old = re_set_syntax (RE_SYNTAX_EMACS
 		       | (posix ? 0 : RE_NO_POSIX_BACKTRACKING));
@@ -153,6 +160,9 @@
      int posix;
 {
   struct regexp_cache *cp, **cpp;
+  /* Should we check it here, or add an argument `multibyte' to this
+     function?  */
+  int multibyte = !NILP (current_buffer->enable_multibyte_characters);
 
   for (cpp = &searchbuf_head; ; cpp = &cp->next)
     {
@@ -160,13 +170,14 @@
       if (XSTRING (cp->regexp)->size == XSTRING (pattern)->size
 	  && !NILP (Fstring_equal (cp->regexp, pattern))
 	  && cp->buf.translate == translate
-	  && cp->posix == posix)
+	  && cp->posix == posix
+	  && cp->buf.multibyte == multibyte)
 	break;
 
       /* If we're at the end of the cache, compile into the last cell.  */
       if (cp->next == 0)
 	{
-	  compile_pattern_1 (cp, pattern, translate, regp, posix);
+	  compile_pattern_1 (cp, pattern, translate, regp, posix, multibyte);
 	  break;
 	}
     }
@@ -369,6 +380,29 @@
   immediate_quit = 0;
   return val;
 }
+
+/* Match REGEXP against STRING, searching all of STRING ignoring case,
+   and return the index of the match, or negative on failure.
+   This does not clobber the match data.  */
+
+extern Lisp_Object Vascii_downcase_table;
+
+int
+fast_string_match_ignore_case (regexp, string)
+     Lisp_Object regexp;
+     char *string;
+{
+  int val;
+  struct re_pattern_buffer *bufp;
+  int len = strlen (string);
+
+  bufp = compile_pattern (regexp, 0,
+			  XCHAR_TABLE (Vascii_downcase_table)->contents, 0);
+  immediate_quit = 1;
+  val = re_search (bufp, string, len, 0, len, 0);
+  immediate_quit = 0;
+  return val;
+}
 
 /* max and min.  */
 
@@ -502,8 +536,8 @@
 
         {
           /* The termination address of the dumb loop.  */ 
-          register unsigned char *ceiling_addr = &FETCH_CHAR (ceiling) + 1;
-          register unsigned char *cursor = &FETCH_CHAR (start);
+          register unsigned char *ceiling_addr = POS_ADDR (ceiling) + 1;
+          register unsigned char *cursor = POS_ADDR (start);
           unsigned char *base = cursor;
 
           while (cursor < ceiling_addr)
@@ -566,8 +600,8 @@
 
         {
           /* The termination address of the dumb loop.  */
-          register unsigned char *ceiling_addr = &FETCH_CHAR (ceiling);
-          register unsigned char *cursor = &FETCH_CHAR (start - 1);
+          register unsigned char *ceiling_addr = POS_ADDR (ceiling);
+          register unsigned char *cursor = POS_ADDR (start - 1);
           unsigned char *base = cursor;
 
           while (cursor >= ceiling_addr)
@@ -693,7 +727,18 @@
 {
   register unsigned char *p, *pend;
   register unsigned char c;
+  register int ch;
   unsigned char fastmap[0400];
+  /* If SYNTAXP is 0, STRING may contain multi-byte form of characters
+     of which codes don't fit in FASTMAP.  In that case, we set the
+     first byte of multibyte form (i.e. base leading-code) in FASTMAP
+     and set the actual ranges of characters in CHAR_RANGES.  In the
+     form "X-Y" of STRING, both X and Y must belong to the same
+     character set because a range striding across character sets is
+     meaningless.  */
+  int *char_ranges
+    = (int *) alloca (XSTRING (string)->size * (sizeof (int)) * 2);
+  int n_char_ranges = 0;
   int negate = 0;
   register int i;
 
@@ -724,11 +769,13 @@
 
   /* Find the characters specified and set their elements of fastmap.
      If syntaxp, each character counts as itself.
-     Otherwise, handle backslashes and ranges specially  */
+     Otherwise, handle backslashes and ranges specially.  */
 
   while (p != pend)
     {
-      c = *p++;
+      c = *p;
+      ch = STRING_CHAR (p, pend - p);
+      p += BYTES_BY_CHAR_HEAD (*p);
       if (syntaxp)
 	fastmap[syntax_spec_code[c]] = 1;
       else
@@ -740,25 +787,49 @@
 	    }
 	  if (p != pend && *p == '-')
 	    {
+	      unsigned int ch2;
+
 	      p++;
 	      if (p == pend) break;
-	      while (c <= *p)
+	      if (SINGLE_BYTE_CHAR_P (ch))
+		while (c <= *p)
+		  {
+		    fastmap[c] = 1;
+		    c++;
+		  }
+	      else
 		{
-		  fastmap[c] = 1;
-		  c++;
+		  fastmap[c] = 1; /* C is the base leading-code.  */
+		  ch2 = STRING_CHAR (p, pend - p);
+		  if (ch <= ch2)
+		    char_ranges[n_char_ranges++] = ch,
+		    char_ranges[n_char_ranges++] = ch2;
 		}
-	      p++;
+	      p += BYTES_BY_CHAR_HEAD (*p);
 	    }
 	  else
-	    fastmap[c] = 1;
+	    {
+	      fastmap[c] = 1;
+	      if (!SINGLE_BYTE_CHAR_P (ch))
+		char_ranges[n_char_ranges++] = ch,
+		char_ranges[n_char_ranges++] = ch;
+	    }
 	}
     }
 
-  /* If ^ was the first character, complement the fastmap. */
+  /* If ^ was the first character, complement the fastmap.  In
+     addition, as all multibyte characters have possibility of
+     matching, set all entries for base leading codes, which is
+     harmless even if SYNTAXP is 1.  */
 
   if (negate)
     for (i = 0; i < sizeof fastmap; i++)
-      fastmap[i] ^= 1;
+      {
+	if (!BASE_LEADING_CODE_P (i))
+	  fastmap[i] ^= 1;
+	else
+	  fastmap[i] = 1;
+      }
 
   {
     int start_point = PT;
@@ -771,26 +842,76 @@
 	  {
 	    while (pos < XINT (lim)
 		   && fastmap[(int) SYNTAX (FETCH_CHAR (pos))])
-	      pos++;
+	      INC_POS (pos);
 	  }
 	else
 	  {
-	    while (pos > XINT (lim)
-		   && fastmap[(int) SYNTAX (FETCH_CHAR (pos - 1))])
-	      pos--;
+	    while (pos > XINT (lim))
+	      {
+		DEC_POS (pos);
+		if (!fastmap[(int) SYNTAX (FETCH_CHAR (pos))])
+		  {
+		    INC_POS (pos);
+		    break;
+		  }
+	      }
 	  }
       }
     else
       {
 	if (forwardp)
 	  {
-	    while (pos < XINT (lim) && fastmap[FETCH_CHAR (pos)])
-	      pos++;
+	    while (pos < XINT (lim) && fastmap[(c = FETCH_BYTE (pos))])
+	      {
+		if (!BASE_LEADING_CODE_P (c))
+		  pos++;
+		else if (n_char_ranges)
+		  {
+		    /* We much check CHAR_RANGES for a multibyte
+                       character.  */
+		    ch = FETCH_MULTIBYTE_CHAR (pos);
+		    for (i = 0; i < n_char_ranges; i += 2)
+		      if ((ch >= char_ranges[i] && ch <= char_ranges[i + 1]))
+			break;
+		    if (!(negate ^ (i < n_char_ranges)))
+		      break;
+
+		    INC_POS (pos);
+		  }
+		else
+		  {
+		    if (!negate) break;
+		    INC_POS (pos);
+		  }
+	      }
 	  }
 	else
 	  {
-	    while (pos > XINT (lim) && fastmap[FETCH_CHAR (pos - 1)])
-	      pos--;
+	    while (pos > XINT (lim))
+	      {
+		DEC_POS (pos);
+		if (fastmap[(c = FETCH_BYTE (pos))])
+		  {
+		    if (!BASE_LEADING_CODE_P (c))
+		      ;
+		    else if (n_char_ranges)
+		      {
+			/* We much check CHAR_RANGES for a multibyte
+                           character.  */
+			ch = FETCH_MULTIBYTE_CHAR (pos);
+			for (i = 0; i < n_char_ranges; i += 2)
+			  if (ch >= char_ranges[i] && ch <= char_ranges[i + 1])
+			    break;
+			if (!(negate ^ (i < n_char_ranges)))
+			  break;
+		      }
+		    else
+		      if (!negate)
+			break;
+		  }
+		else
+		  break;
+	      }
 	  }
       }
     SET_PT (pos);
@@ -890,6 +1011,7 @@
 	    case '|': case '(': case ')': case '`': case '\'': case 'b':
 	    case 'B': case '<': case '>': case 'w': case 'W': case 's':
 	    case 'S': case '=':
+	    case 'c': case 'C':	/* for categoryspec and notcategoryspec */
 	    case '1': case '2': case '3': case '4': case '5':
 	    case '6': case '7': case '8': case '9':
 	      return 0;
@@ -1165,8 +1287,8 @@
 		   : max (lim, max (limit, pos - 20000)));
 	  if ((limit - pos) * direction > 20)
 	    {
-	      p_limit = &FETCH_CHAR (limit);
-	      p2 = (cursor = &FETCH_CHAR (pos));
+	      p_limit = POS_ADDR (limit);
+	      p2 = (cursor = POS_ADDR (pos));
 	      /* In this loop, pos + cursor - p2 is the surrogate for pos */
 	      while (1)		/* use one cursor setting as long as i can */
 		{
@@ -1256,7 +1378,7 @@
 		  /* (the reach is at most len + 21, and typically */
 		  /* does not exceed len) */    
 		  while ((limit - pos) * direction >= 0)
-		    pos += BM_tab[FETCH_CHAR(pos)];
+		    pos += BM_tab[FETCH_BYTE (pos)];
 		  /* now run the same tests to distinguish going off the */
 		  /* end, a match or a phony match. */
 		  if ((pos - limit) * direction <= len)
@@ -1269,8 +1391,8 @@
 		    {
 		      pos -= direction;
 		      if (pat[i] != (trt != 0
-				     ? trt[FETCH_CHAR(pos)]
-				     : FETCH_CHAR (pos)))
+				     ? trt[FETCH_BYTE (pos)]
+				     : FETCH_BYTE (pos)))
 			break;
 		    }
 		  /* Above loop has moved POS part or all the way
@@ -1599,7 +1721,7 @@
       for (pos = search_regs.start[sub]; pos < last; pos++)
 	{
 	  if (NILP (string))
-	    c = FETCH_CHAR (pos);
+	    c = FETCH_BYTE (pos);
 	  else
 	    c = XSTRING (string)->data[pos];