changeset 23883:0ca2fa58ca7b

(Qunknown): New variable. (init_charset_once): Intern and staticpro Qunknown. Initialize all elements of Vcharset_symbol_table to Qunknown. (find_charset_in_str): New arg MULTIBYTE. If it is zero, check unibyte characters only. For an invalid composition sequence, set CHARSETS[1] to 1. (Ffind_charset_region): Call find_charset_in_str with an appropriate MULTIBYTE arg. If undefined charsets are found, include `unknown' in the return value. (Ffind_charset_string): Likewise. (Fsplit_char): If CHAR is invalid, return `(unknown CHAR)'. (str_cmpchar_id): Max composite character code should be less than GENERIC_COMPOSITION_CHAR.
author Kenichi Handa <handa@m17n.org>
date Tue, 15 Dec 1998 04:35:38 +0000
parents 16d0232006fb
children 179bcb86f12f
files src/charset.c
diffstat 1 files changed, 86 insertions(+), 27 deletions(-) [+]
line wrap: on
line diff
--- a/src/charset.c	Tue Dec 15 04:35:38 1998 +0000
+++ b/src/charset.c	Tue Dec 15 04:35:38 1998 +0000
@@ -41,6 +41,7 @@
 #endif /* emacs */
 
 Lisp_Object Qcharset, Qascii, Qcomposition;
+Lisp_Object Qunknown;
 
 /* Declaration of special leading-codes.  */
 int leading_code_composition;	/* for composite characters */
@@ -141,7 +142,7 @@
 {
   int charset, c1, c2;
 
-  if (c & ~GLYPH_MASK_CHAR)
+  if (c & ~GLYPH_MASK_CHAR)	/* This includes the case C is negative.  */
     {
       if (c & CHAR_META)
 	/* Move the meta bit to the right place for a string.  */
@@ -735,17 +736,42 @@
 
    If CMPCHARP is nonzero and some composite character is found,
    CHARSETS[128] is also set 1 and the returned number is incremented
-   by 1.  */
+   by 1.
+
+   If MULTIBYTE is zero, do not check multibyte characters; i.e., if
+   any ASCII codes (7-bit) are found, CHARSETS[0] is set to 1, and if
+   any 8-bit codes are found, CHARSETS[1] is set to 1.  */
 
 int
-find_charset_in_str (str, len, charsets, table, cmpcharp)
+find_charset_in_str (str, len, charsets, table, cmpcharp, multibyte)
      unsigned char *str;
      int len, *charsets;
      Lisp_Object table;
      int cmpcharp;
+     int multibyte;
 {
   register int num = 0, c;
 
+  if (! multibyte)
+    {
+      unsigned char *endp = str + len;
+      int maskbits = 0;
+	
+      while (str < endp && maskbits != 3)
+	maskbits |=  (*str++ < 0x80 ? 1 : 2);
+      if (maskbits & 1)
+	{
+	  charsets[0] = 1;
+	  num++;
+	}
+      if (maskbits & 2)
+	{
+	  charsets[1] = 1;
+	  num++;
+	}
+      return num;
+    }
+
   if (! CHAR_TABLE_P (table))
     table = Qnil;
 
@@ -790,7 +816,7 @@
 	      continue;
 	    }
 
-	  charset = CHARSET_ASCII;
+	  charset = 1;		/* This leads to `unknown' charset.  */
 	  bytes = 1;
 	}
       else
@@ -822,23 +848,27 @@
 BEG and END are buffer positions.\n\
 If the region contains any composite character,\n\
 `composition' is included in the returned list.\n\
-Optional arg TABLE if non-nil is a translation table to look up.")
+Optional arg TABLE if non-nil is a translation table to look up.\n\
+\n\
+If the region contains invalid multiybte characters,\n\
+`unknown' is included in the returned list.
+\n\
+If the current buffer is unibyte, the returned list contains\n\
+`ascii' if any 7-bit characters are found,\n\
+and `unknown' if any 8-bit characters are found.")
   (beg, end, table)
      Lisp_Object beg, end, table;
 {
   int charsets[MAX_CHARSET + 1];
   int from, from_byte, to, stop, stop_byte, i;
   Lisp_Object val;
+  int undefined;
+  int multibyte = !NILP (current_buffer->enable_multibyte_characters);
 
   validate_region (&beg, &end);
   from = XFASTINT (beg);
   stop = to = XFASTINT (end);
 
-  if (NILP (current_buffer->enable_multibyte_characters))
-    return (from == to
-	    ? Qnil
-	    : Fcons (Qascii, Qnil));
-
   if (from < GPT && GPT < to)
     {
       stop = GPT;
@@ -853,7 +883,7 @@
   while (1)
     {
       find_charset_in_str (BYTE_POS_ADDR (from_byte), stop_byte - from_byte,
-			   charsets, table, 1);
+			   charsets, table, 1, multibyte);
       if (stop < to)
 	{
 	  from = stop, from_byte = stop_byte;
@@ -864,9 +894,17 @@
     }
 
   val = Qnil;
-  for (i = MAX_CHARSET; i >= 0; i--)
+  undefined = 0;
+  for (i = (multibyte ? MAX_CHARSET : 1); i >= 0; i--)
     if (charsets[i])
-      val = Fcons (CHARSET_SYMBOL (i), val);
+      {
+	if (CHARSET_DEFINED_P (i) || i == CHARSET_COMPOSITION)
+	  val = Fcons (CHARSET_SYMBOL (i), val);
+	else
+	  undefined = 1;
+      }
+  if (undefined)
+    val = Fcons (Qunknown, val);
   return val;
 }
 
@@ -875,28 +913,41 @@
   "Return a list of charsets in STR.\n\
 If the string contains any composite characters,\n\
 `composition' is included in the returned list.\n\
-Optional arg TABLE if non-nil is a translation table to look up.")
+Optional arg TABLE if non-nil is a translation table to look up.\n\
+\n\
+If the region contains invalid multiybte characters,\n\
+`unknown' is included in the returned list.\n\
+\n\
+If STR is unibyte, the returned list contains\n\
+`ascii' if any 7-bit characters are found,\n\
+and `unknown' if any 8-bit characters are found.")
   (str, table)
      Lisp_Object str, table;
 {
   int charsets[MAX_CHARSET + 1];
   int i;
   Lisp_Object val;
+  int undefined;
+  int multibyte;
 
   CHECK_STRING (str, 0);
-
-  if (! STRING_MULTIBYTE (str))
-    return (XSTRING (str)->size == 0
-	    ? Qnil
-	    : Fcons (Qascii, Qnil));
+  multibyte = STRING_MULTIBYTE (str);
 
   bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
   find_charset_in_str (XSTRING (str)->data, STRING_BYTES (XSTRING (str)),
-		       charsets, table, 1);
+		       charsets, table, 1, multibyte);
   val = Qnil;
-  for (i = MAX_CHARSET; i >= 0; i--)
+  undefined = 0;
+  for (i = (multibyte ? MAX_CHARSET : 1); i >= 0; i--)
     if (charsets[i])
-      val = Fcons (CHARSET_SYMBOL (i), val);
+      {
+	if (CHARSET_DEFINED_P (i) || i == CHARSET_COMPOSITION)
+	  val = Fcons (CHARSET_SYMBOL (i), val);
+	else
+	  undefined = 1;
+      }
+  if (undefined)
+    val = Fcons (Qunknown, val);
   return val;
 }
 
@@ -923,14 +974,19 @@
 }
 
 DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
-  "Return list of charset and one or two position-codes of CHAR.")
+  "Return list of charset and one or two position-codes of CHAR.\n\
+If CHAR is invalid as a character code,\n\
+return a list of symbol `unknown' and CHAR.")
   (ch)
      Lisp_Object ch;
 {
   Lisp_Object val;
-  int charset, c1, c2;
+  int c, charset, c1, c2;
 
   CHECK_NUMBER (ch, 0);
+  c = XFASTINT (ch);
+  if (!CHAR_VALID_P (c, 1))
+    return Fcons (Qunknown, Fcons (ch, Qnil));
   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
   return (c2 >= 0
 	  ? Fcons (CHARSET_SYMBOL (charset),
@@ -1153,7 +1209,7 @@
   else if (COMPOSITE_CHAR_P (c))
     {
       int id = COMPOSITE_CHAR_ID (XFASTINT (ch));
-      XSETFASTINT (val, (id < n_cmpchars ? cmpchar_table[id]->width : 0));
+      XSETFASTINT (val, (id < n_cmpchars ? cmpchar_table[id]->width : 1));
     }
   else
     {
@@ -1469,7 +1525,7 @@
       }
 
   /* We have to register the composite character in cmpchar_table.  */
-  if (n_cmpchars > (CHAR_FIELD2_MASK | CHAR_FIELD3_MASK))
+  if (n_cmpchars >= (CHAR_FIELD2_MASK | CHAR_FIELD3_MASK))
     /* No, we have no more room for a new composite character.  */
     return -1;
 
@@ -1846,7 +1902,10 @@
   Fput (Qcharset_table, Qchar_table_extra_slots, make_number (0));
   Vcharset_table = Fmake_char_table (Qcharset_table, Qnil);
 
-  Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET + 1), Qnil);
+  Qunknown = intern ("unknown");
+  staticpro (&Qunknown);
+  Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET + 1),
+					Qunknown);
 
   /* Setup tables.  */
   for (i = 0; i < 2; i++)