emacs: src/lread.c comparison

comparison src/lread.c @ 88383:0b4249d736a0

Include "character.h". (read_multibyte): New arg NBYTES. (read_escape): The meaning of returned *BYTEREP changed. (to_multibyte): Deleted. (read1): Adjusted the handling of char table and string.

author	Kenichi Handa <handa@m17n.org>
date	Fri, 01 Mar 2002 01:44:45 +0000
parents	a6382f0fcb2a
children	ef046df4c6ee

comparison

equal deleted inserted replaced

-:5e3e1d9d514f
+:0b4249d736a0
 #include <sys/file.h>
 #include <errno.h>
 #include "lisp.h"
 #include "intervals.h"
 #include "buffer.h"
+#include "character.h"
 #include "charset.h"
 #include <epaths.h>
 #include "commands.h"
 #include "keyboard.h"
 #include "termhooks.h"
 static int read_buffer_size;
 static char *read_buffer;
 /* Read multibyte form and return it as a character.  C is a first
 byte of multibyte form, and rest of them are read from
-READCHARFUN.  */
+READCHARFUN.  Store the byte length of the form into *NBYTES.  */
 static int
-read_multibyte (c, readcharfun)
+read_multibyte (c, readcharfun, nbytes)
 register int c;
 Lisp_Object readcharfun;
+int *nbytes;
 {
 /* We need the actual character code of this multibyte
 characters.  */
 unsigned char str[MAX_MULTIBYTE_LENGTH];
 int len = 0;
-int bytes;
+int bytes = BYTES_BY_CHAR_HEAD (c);
 str[len++] = c;
-while ((c = READCHAR) >= 0xA0
+while (len < bytes)
-	 && len < MAX_MULTIBYTE_LENGTH)
+{
-str[len++] = c;
+c = READCHAR;
-UNREAD (c);
+if (CHAR_HEAD_P (c))
-if (UNIBYTE_STR_AS_MULTIBYTE_P (str, len, bytes))
+	{
-return STRING_CHAR (str, len);
+	  UNREAD (c);
+	  break;
+	}
+str[len++] = c;
+}
+if (len == bytes && MULTIBYTE_LENGTH_NO_CHECK (str) > 0)
+{
+*nbytes = len;
+return STRING_CHAR (str, len);
+}
 /* The byte sequence is not valid as multibyte.  Unread all bytes
 but the first one, and return the first byte.  */
 while (--len > 0)
 UNREAD (str[len]);
+*nbytes = 1;
 return str[0];
 }
 /* Read a \-escape sequence, assuming we already read the `\'.
 If the escape sequence forces unibyte, store 1 into *BYTEREP.
-If the escape sequence forces multibyte, store 2 into *BYTEREP.
+If the escape sequence forces multibyte and the returned character
+is raw 8-bit char, store 2 into *BYTEREP.
+If the escape sequence forces multibyte and the returned character
+is not raw 8-bit char, store 3 into *BYTEREP.
 Otherwise store 0 into *BYTEREP.  */
 static int
 read_escape (readcharfun, stringp, byterep)
 Lisp_Object readcharfun;
 		UNREAD (c);
 		break;
 	      }
 	  }
-	*byterep = 1;
+	if (c < 0x100)
+	  *byterep = 1;
+	else
+	  *byterep = 3;
 	return i;
 }
 case 'x':
 /* A hex escape, as in ANSI C.  */
 {
 	int i = 0;
+	int count = 0;
 	while (1)
 	  {
 	    c = READCHAR;
 	    if (c >= '0' && c <= '9')
 	      {
 	    else
 	      {
 		UNREAD (c);
 		break;
 	      }
+	    count++;
 	  }
-	*byterep = 2;
+	if (count < 3 && i >= 0x80)
+	  *byterep = 2;
+	else
+	  *byterep = 3;
 	return i;
 }
 default:
-if (BASE_LEADING_CODE_P (c))
+if (EQ (readcharfun, Qget_file_char)
-	c = read_multibyte (c, readcharfun);
+	  && BASE_LEADING_CODE_P (c))
+	{
+	  int nbytes;
+	  c = read_multibyte (c, readcharfun, &nbytes);
+	  if (nbytes > 1)
+	    *byterep = 3;
+	}
 return c;
 }
 }
 return make_number (sign * number);
 }
-/* Convert unibyte text in read_buffer to multibyte.
-Initially, *P is a pointer after the end of the unibyte text, and
-the pointer *END points after the end of read_buffer.
-If read_buffer doesn't have enough room to hold the result
-of the conversion, reallocate it and adjust *P and *END.
-At the end, make *P point after the result of the conversion, and
-return in *NCHARS the number of characters in the converted
-text.  */
-static void
-to_multibyte (p, end, nchars)
-char **p, **end;
-int *nchars;
-{
-int nbytes;
-parse_str_as_multibyte (read_buffer, *p - read_buffer, &nbytes, nchars);
-if (read_buffer_size < 2 * nbytes)
-{
-int offset = *p - read_buffer;
-read_buffer_size = 2 * max (read_buffer_size, nbytes);
-read_buffer = (char *) xrealloc (read_buffer, read_buffer_size);
-*p = read_buffer + offset;
-*end = read_buffer + read_buffer_size;
-}
-if (nbytes != *nchars)
-nbytes = str_as_multibyte (read_buffer, read_buffer_size,
-			       *p - read_buffer, nchars);
-*p = read_buffer + nbytes;
-}
 /* If the next token is ')' or ']' or '.', we store that character
 in *PCH and the return value is not interesting.  Else, we store
 zero in *PCH and we read and return one lisp object.
 FIRST_IN_LIST is nonzero if this is the first element of a list.  */
 	  c = READCHAR;
 	  if (c == '[')
 	    {
 	      Lisp_Object tmp;
 	      tmp = read_vector (readcharfun, 0);
-	      if (XVECTOR (tmp)->size < CHAR_TABLE_STANDARD_SLOTS
+	      if (XVECTOR (tmp)->size != VECSIZE (struct Lisp_Char_Table))
-		  || XVECTOR (tmp)->size > CHAR_TABLE_STANDARD_SLOTS + 10)
 		error ("Invalid size char-table");
 	      XSETCHAR_TABLE (tmp, XCHAR_TABLE (tmp));
-	      XCHAR_TABLE (tmp)->top = Qt;
 	      return tmp;
 	    }
 	  else if (c == '^')
 	    {
 	      c = READCHAR;
 	      if (c == '[')
 		{
 		  Lisp_Object tmp;
+		  int depth, size;
 		  tmp = read_vector (readcharfun, 0);
-		  if (XVECTOR (tmp)->size != SUB_CHAR_TABLE_STANDARD_SLOTS)
+		  if (!INTEGERP (AREF (tmp, 0)))
+		    error ("Invalid depth in char-table");
+		  depth = XINT (AREF (tmp, 0));
+		  if (depth < 1 || depth > 3)
+		    error ("Invalid depth in char-table");
+		  size = XVECTOR (tmp)->size + 2;
+		  if (chartab_size [depth] != size)
 		    error ("Invalid size char-table");
-		  XSETCHAR_TABLE (tmp, XCHAR_TABLE (tmp));
+		  XSETSUB_CHAR_TABLE (tmp, XSUB_CHAR_TABLE (tmp));
-		  XCHAR_TABLE (tmp)->top = Qnil;
 		  return tmp;
 		}
 	      Fsignal (Qinvalid_read_syntax,
 		       Fcons (make_string ("#^^", 3), Qnil));
 	    }
 	if (c < 0)
 	  end_of_file_error ();
 	if (c == '\\')
 	  c = read_escape (readcharfun, 0, &discard);
-	else if (BASE_LEADING_CODE_P (c))
+	else if (EQ (readcharfun, Qget_file_char)
-	  c = read_multibyte (c, readcharfun);
+		 && BASE_LEADING_CODE_P (c))
+	  c = read_multibyte (c, readcharfun, &discard);
 	return make_number (c);
 }
 case '"':
 {
 	char *p = read_buffer;
 	char *end = read_buffer + read_buffer_size;
 	register int c;
-	/* 1 if we saw an escape sequence specifying
+	/* Nonzero if we saw an escape sequence specifying
-	   a multibyte character, or a multibyte character.  */
+	   a multibyte character.  */
 	int force_multibyte = 0;
-	/* 1 if we saw an escape sequence specifying
+	/* Nonzero if we saw an escape sequence specifying
 	   a single-byte character.  */
 	int force_singlebyte = 0;
-	/* 1 if read_buffer contains multibyte text now.  */
-	int is_multibyte = 0;
 	int cancel = 0;
 	int nchars = 0;
 	while ((c = READCHAR) >= 0
 	       && c != '\"')
 		end = read_buffer + read_buffer_size;
 	      }
 	    if (c == '\\')
 	      {
+		int modifiers;
 		int byterep;
 		c = read_escape (readcharfun, 1, &byterep);
 		/* C is -1 if \ newline has just been seen */
 		    if (p == read_buffer)
 		      cancel = 1;
 		    continue;
 		  }
+		modifiers = c & CHAR_MODIFIER_MASK;
+		c = c & ~CHAR_MODIFIER_MASK;
 		if (byterep == 1)
-		  force_singlebyte = 1;
+		  {
-		else if (byterep == 2)
+		    force_singlebyte = 1;
+		    if (c >= 0x80)
+		      /*  Raw 8-bit code */
+		      c = BYTE8_TO_CHAR (c);
+		  }
+		else if (byterep > 1)
+		  {
+		    force_multibyte = 1;
+		    if (byterep == 2)
+		      c = BYTE8_TO_CHAR (c);
+		  }
+		else if (c >= 0x80)
+		  {
+		    force_singlebyte = 1;
+		    c = BYTE8_TO_CHAR (c);
+		  }
+		if (ASCII_CHAR_P (c))
+		  {
+		    /* Allow `\C- ' and `\C-?'.  */
+		    if (modifiers == CHAR_CTL)
+		      {
+			if (c == ' ')
+			  c = 0, modifiers = 0;
+			else if (c == '?')
+			  c = 127, modifiers = 0;
+		      }
+		    if (modifiers & CHAR_SHIFT)
+		      {
+			/* Shift modifier is valid only with [A-Za-z].  */
+			if (c >= 'A' && c <= 'Z')
+			  modifiers &= ~CHAR_SHIFT;
+			else if (c >= 'a' && c <= 'z')
+			  c -= ('a' - 'A'), modifiers &= ~CHAR_SHIFT;
+		      }
+		    if (modifiers & CHAR_META)
+		      {
+			/* Move the meta bit to the right place for a
+			   string.  */
+			modifiers &= ~CHAR_META;
+			c = BYTE8_TO_CHAR (c | 0x80);
+			force_singlebyte = 1;
+		      }
+		  }
+		/* Any modifiers remaining are invalid.  */
+		if (modifiers)
+		  error ("Invalid modifier in string");
+		p += CHAR_STRING (c, (unsigned char *) p);
+	      }
+	    else if (c >= 0x80)
+	      {
+		if (EQ (readcharfun, Qget_file_char))
+		  {
+		    if (BASE_LEADING_CODE_P (c))
+		      {
+			int nbytes;
+			c = read_multibyte (c, readcharfun, &nbytes);
+			if (nbytes > 1)
+			  force_multibyte = 1;
+			else
+			  {
+			    force_singlebyte = 1;
+			    c = BYTE8_TO_CHAR (c);
+			  }
+		      }
+		    else
+		      {
+			force_singlebyte = 1;
+			c = BYTE8_TO_CHAR (c);
+		      }
+		  }
+		else
 		  force_multibyte = 1;
+		p += CHAR_STRING (c, (unsigned char *) p);
 	      }
-	    /* A character that must be multibyte forces multibyte.  */
-	    if (! SINGLE_BYTE_CHAR_P (c & ~CHAR_MODIFIER_MASK))
-	      force_multibyte = 1;
-	    /* If we just discovered the need to be multibyte,
-	       convert the text accumulated thus far.  */
-	    if (force_multibyte && ! is_multibyte)
-	      {
-		is_multibyte = 1;
-		to_multibyte (&p, &end, &nchars);
-	      }
-	    /* Allow `\C- ' and `\C-?'.  */
-	    if (c == (CHAR_CTL | ' '))
-	      c = 0;
-	    else if (c == (CHAR_CTL | '?'))
-	      c = 127;
-	    if (c & CHAR_SHIFT)
-	      {
-		/* Shift modifier is valid only with [A-Za-z].  */
-		if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
-		  c &= ~CHAR_SHIFT;
-		else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
-		  c = (c & ~CHAR_SHIFT) - ('a' - 'A');
-	      }
-	    if (c & CHAR_META)
-	      /* Move the meta bit to the right place for a string.  */
-	      c = (c & ~CHAR_META) | 0x80;
-	    if (c & CHAR_MODIFIER_MASK)
-	      error ("Invalid modifier in string");
-	    if (is_multibyte)
-	      p += CHAR_STRING (c, p);
 	    else
 	      *p++ = c;
 	    nchars++;
 	  }
 	if (c < 0)
 	  end_of_file_error ();
 	/* If purifying, and string starts with \ newline,
 	   return zero instead.  This is for doc strings
 	   that we are really going to find in etc/DOC.nn.nn  */
 	if (!NILP (Vpurify_flag) && NILP (Vdoc_file_name) && cancel)
 	  return make_number (0);
-	if (is_multibyte || force_singlebyte)
+	if (force_multibyte)
+	  /* READ_BUFFER already contains valid multibyte forms.  */
 	  ;
-	else if (load_convert_to_unibyte)
+	else if (force_singlebyte)
 	  {
-	    Lisp_Object string;
+	    nchars = str_as_unibyte (read_buffer, p - read_buffer);
-	    to_multibyte (&p, &end, &nchars);
+	    p = read_buffer + nchars;
-	    if (p - read_buffer != nchars)
-	      {
-		string = make_multibyte_string (read_buffer, nchars,
-						p - read_buffer);
-		return Fstring_make_unibyte (string);
-	      }
-	    /* We can make a unibyte string directly.  */
-	    is_multibyte = 0;
-	  }
-	else if (EQ (readcharfun, Qget_file_char)
-		 || EQ (readcharfun, Qlambda))
-	  {
-	    /* Nowadays, reading directly from a file is used only for
-	       compiled Emacs Lisp files, and those always use the
-	       Emacs internal encoding.  Meanwhile, Qlambda is used
-	       for reading dynamic byte code (compiled with
-	       byte-compile-dynamic = t).  */
-	    to_multibyte (&p, &end, &nchars);
-	    is_multibyte = 1;
 	  }
 	else
-	  /* In all other cases, if we read these bytes as
+	  /* Otherwise, READ_BUFFER contains only ASCII.  */
-	     separate characters, treat them as separate characters now.  */
-	  ;
 	if (read_pure)
 	  return make_pure_string (read_buffer, nchars, p - read_buffer,
-				   is_multibyte);
+				   (force_multibyte
+				    || (p - read_buffer != nchars)));
 	return make_specified_string (read_buffer, nchars, p - read_buffer,
-				      is_multibyte);
+				      (force_multibyte
+				       || (p - read_buffer != nchars)));
 }
 case '.':
 {
 	int next_char = READCHAR;

Mercurial > emacs

comparison src/lread.c @ 88383:0b4249d736a0