changeset 21724:12a826a37249

(read_from_string_index_byte): New variable. (read_from_string_index): Now counts characters. (readchar, unreadchar, Fread_from_string): Changed accordingly. (readchar): Read a multibyte char all at once from a buffer, marker or string. (unreadchar): Unread a multibyte char all at once. (read1): Properly handle non-escaped multibyte chars. They force a string to be multibyte. When reading direct from a file, any multibyte sequence means a multibyte string. Insist on MAX_LENGTH_OF_MULTI_BYTE_FORM bytes when checking for read_buffer full; this way need not check specially for multibyte.
author Richard M. Stallman <rms@gnu.org>
date Thu, 23 Apr 1998 21:22:51 +0000
parents edc95c28d970
children 375190ad5062
files src/lread.c
diffstat 1 files changed, 122 insertions(+), 119 deletions(-) [+]
line wrap: on
line diff
--- a/src/lread.c	Thu Apr 23 21:20:31 1998 +0000
+++ b/src/lread.c	Thu Apr 23 21:22:51 1998 +0000
@@ -131,6 +131,7 @@
 
 /* For use within read-from-string (this reader is non-reentrant!!)  */
 static int read_from_string_index;
+static int read_from_string_index_byte;
 static int read_from_string_limit;
 
 /* Number of bytes left to read in the buffer character
@@ -169,64 +170,59 @@
      Lisp_Object readcharfun;
 {
   Lisp_Object tem;
-  register struct buffer *inbuffer;
   register int c, mpos;
 
   if (BUFFERP (readcharfun))
     {
-      inbuffer = XBUFFER (readcharfun);
-
-      if (readchar_backlog == 0)
+      register struct buffer *inbuffer = XBUFFER (readcharfun);
+
+      int pt_byte = BUF_PT_BYTE (inbuffer);
+      int orig_pt_byte = pt_byte;
+
+      if (pt_byte >= BUF_ZV_BYTE (inbuffer))
+	return -1;
+
+      if (! NILP (inbuffer->enable_multibyte_characters))
 	{
-	  int pt_byte = BUF_PT_BYTE (inbuffer);
-	  int orig_pt_byte = pt_byte;
-
-	  if (pt_byte >= BUF_ZV_BYTE (inbuffer))
-	    return -1;
-
-	  if (! NILP (inbuffer->enable_multibyte_characters))
-	    BUF_INC_POS (inbuffer, pt_byte);
-	  else
-	    pt_byte++;
-	  SET_BUF_PT_BOTH (inbuffer, BUF_PT (inbuffer) + 1, pt_byte);
-	  readchar_backlog = pt_byte - orig_pt_byte;
+	  unsigned char *p = BUF_BYTE_ADDRESS (inbuffer, pt_byte);
+	  BUF_INC_POS (inbuffer, pt_byte);
+	  c = STRING_CHAR (p, pt_byte - orig_pt_byte);
 	}
-
-      /* We get the address of the byte just passed,
-	 which is the last byte of the character.
-	 The other bytes in this character are consecutive with it,
-	 because the gap can't be in the middle of a character.  */
-      return *(BUF_BYTE_ADDRESS (inbuffer, BUF_PT_BYTE (inbuffer) - 1)
-	       - --readchar_backlog);
+      else
+	{
+	  c = BUF_FETCH_BYTE (inbuffer, pt_byte);
+	  pt_byte++;
+	}
+      SET_BUF_PT_BOTH (inbuffer, BUF_PT (inbuffer) + 1, pt_byte);
+
+      return c;
     }
   if (MARKERP (readcharfun))
     {
-      inbuffer = XMARKER (readcharfun)->buffer;
-
-      if (readchar_backlog == 0)
+      register struct buffer *inbuffer = XMARKER (readcharfun)->buffer;
+
+      int bytepos = marker_byte_position (readcharfun);
+      int orig_bytepos = bytepos;
+
+      if (bytepos >= BUF_ZV_BYTE (inbuffer))
+	return -1;
+
+      if (! NILP (inbuffer->enable_multibyte_characters))
 	{
-	  int bytepos = marker_byte_position (readcharfun);
-	  int orig_bytepos = bytepos;
-
-	  if (bytepos >= BUF_ZV_BYTE (inbuffer))
-	    return -1;
-
-	  if (! NILP (inbuffer->enable_multibyte_characters))
-	    INC_POS (bytepos);
-	  else
-	    bytepos++;
-	  XMARKER (readcharfun)->bytepos = bytepos;
-	  XMARKER (readcharfun)->charpos++;
-
-	  readchar_backlog = bytepos - orig_bytepos;
+	  unsigned char *p = BUF_BYTE_ADDRESS (inbuffer, bytepos);
+	  BUF_INC_POS (inbuffer, bytepos);
+	  c = STRING_CHAR (p, bytepos - orig_bytepos);
 	}
-
-      /* We get the address of the byte just passed,
-	 which is the last byte of the character.
-	 The other bytes in this character are consecutive with it,
-	 because the gap can't be in the middle of a character.  */
-      return *(BUF_BYTE_ADDRESS (inbuffer, XMARKER (readcharfun)->bytepos - 1)
-	       - --readchar_backlog);
+      else
+	{
+	  c = BUF_FETCH_BYTE (inbuffer, bytepos);
+	  bytepos++;
+	}
+
+      XMARKER (readcharfun)->bytepos = bytepos;
+      XMARKER (readcharfun)->charpos++;
+
+      return c;
     }
   if (EQ (readcharfun, Qget_file_char))
     {
@@ -244,13 +240,14 @@
 
   if (STRINGP (readcharfun))
     {
-      register int c;
-      /* This used to be return of a conditional expression,
-	 but that truncated -1 to a char on VMS.  */
-      if (read_from_string_index < read_from_string_limit)
+      if (read_from_string_index >= read_from_string_limit)
+	c = -1;
+      else if (STRING_MULTIBYTE (readcharfun))
+	FETCH_STRING_CHAR_ADVANCE (c, readcharfun,
+				   read_from_string_index,
+				   read_from_string_index_byte);
+      else
 	c = XSTRING (readcharfun)->data[read_from_string_index++];
-      else
-	c = -1;
 
       return c;
     }
@@ -276,42 +273,36 @@
     ;
   else if (BUFFERP (readcharfun))
     {
-      if (!SINGLE_BYTE_CHAR_P (c))
-	readchar_backlog++;
+      struct buffer *b = XBUFFER (readcharfun);
+      int bytepos = BUF_PT_BYTE (b);
+
+      BUF_PT (b)--;
+      if (! NILP (b->enable_multibyte_characters))
+	BUF_DEC_POS (b, bytepos);
       else
-	{
-	  struct buffer *b = XBUFFER (readcharfun);
-	  int bytepos = BUF_PT_BYTE (b);
-
-	  BUF_PT (b)--;
-	  if (! NILP (b->enable_multibyte_characters))
-	    BUF_DEC_POS (b, bytepos);
-	  else
-	    bytepos--;
-
-	  BUF_PT_BYTE (b) = bytepos;
-	}
+	bytepos--;
+
+      BUF_PT_BYTE (b) = bytepos;
     }
   else if (MARKERP (readcharfun))
     {
-      if (!SINGLE_BYTE_CHAR_P (c))
-	readchar_backlog++;
+      struct buffer *b = XMARKER (readcharfun)->buffer;
+      int bytepos = XMARKER (readcharfun)->bytepos;
+
+      XMARKER (readcharfun)->charpos--;
+      if (! NILP (b->enable_multibyte_characters))
+	BUF_DEC_POS (b, bytepos);
       else
-	{
-	  struct buffer *b = XMARKER (readcharfun)->buffer;
-	  int bytepos = XMARKER (readcharfun)->bytepos;
-
-	  XMARKER (readcharfun)->charpos--;
-	  if (! NILP (b->enable_multibyte_characters))
-	    BUF_DEC_POS (b, bytepos);
-	  else
-	    bytepos--;
-
-	  XMARKER (readcharfun)->bytepos = bytepos;
-	}
+	bytepos--;
+
+      XMARKER (readcharfun)->bytepos = bytepos;
     }
   else if (STRINGP (readcharfun))
-    read_from_string_index--;
+    {
+      read_from_string_index--;
+      read_from_string_index_byte
+	= string_char_to_byte (readcharfun, read_from_string_index);
+    }
   else if (EQ (readcharfun, Qget_file_char))
     ungetc (c, instream);
   else
@@ -321,7 +312,7 @@
 static Lisp_Object read0 (), read1 (), read_list (), read_vector ();
 static int read_multibyte ();
 
-/* get a character from the tty */
+/* Get a character from the tty.  */
 
 extern Lisp_Object read_char ();
 
@@ -1180,12 +1171,11 @@
   CHECK_STRING (string,0);
 
   if (NILP (end))
-    endval = STRING_BYTES (XSTRING (string));
+    endval = XSTRING (string)->size;
   else
     {
       CHECK_NUMBER (end, 2);
-      endval = string_char_to_byte (string, XINT (end));
-      if (endval < 0 || endval > STRING_BYTES (XSTRING (string)))
+      if (endval < 0 || endval > XSTRING (string)->size)
 	args_out_of_range (string, end);
     }
 
@@ -1194,21 +1184,19 @@
   else
     {
       CHECK_NUMBER (start, 1);
-      startval = string_char_to_byte (string, XINT (start));
       if (startval < 0 || startval > endval)
 	args_out_of_range (string, start);
     }
 
   read_from_string_index = startval;
+  read_from_string_index_byte = string_char_to_byte (string, startval);
   read_from_string_limit = endval;
 
   new_backquote_flag = 0;
   read_objects = Qnil;
 
   tem = read0 (string);
-  endval = string_byte_to_char (string,
-				read_from_string_index);
-  return Fcons (tem, make_number (endval));
+  return Fcons (tem, make_number (read_from_string_index));
 }
 
 /* Use this for recursive reads, in contexts where internal tokens
@@ -1744,49 +1732,45 @@
 	while ((c = READCHAR) >= 0
 	       && c != '\"')
 	  {
-	    if (p == end)
+	    if (end - p < MAX_LENGTH_OF_MULTI_BYTE_FORM)
 	      {
 		char *new = (char *) xrealloc (read_buffer, read_buffer_size *= 2);
 		p += new - read_buffer;
 		read_buffer += new - read_buffer;
 		end = read_buffer + read_buffer_size;
 	      }
+
 	    if (c == '\\')
 	      {
 		c = read_escape (readcharfun, 1);
-		if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_META)))
+
+		/* C is -1 if \ newline has just been seen */
+		if (c == -1)
 		  {
-		    unsigned char workbuf[4];
-		    unsigned char *str = workbuf;
-		    int length;
-
-		    length = non_ascii_char_to_string (c, workbuf, &str);
-		    if (length > 1)
-		      force_multibyte = 1;
-
-		    if (p + length > end)
-		      {
-			char *new = (char *) xrealloc (read_buffer, read_buffer_size *= 2);
-			p += new - read_buffer;
-			read_buffer += new - read_buffer;
-			end = read_buffer + read_buffer_size;
-		      }
- 
-		    bcopy (str, p, length);
-		    p += length;
+		    if (p == read_buffer)
+		      cancel = 1;
 		    continue;
 		  }
+
 		/* If an escape specifies a non-ASCII single-byte character,
 		   this must be a unibyte string.  */
-		else if (! ASCII_BYTE_P (c))
+		if (SINGLE_BYTE_CHAR_P ((c & ~CHAR_META))
+		    && ! ASCII_BYTE_P (c))
 		  force_singlebyte = 1;
 	      }
 
-	    /* c is -1 if \ newline has just been seen */
-	    if (c == -1)
+	    if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_META)))
 	      {
-		if (p == read_buffer)
-		  cancel = 1;
+		unsigned char workbuf[4];
+		unsigned char *str = workbuf;
+		int length;
+
+		length = non_ascii_char_to_string (c, workbuf, &str);
+		if (length > 1)
+		  force_multibyte = 1;
+
+		bcopy (str, p, length);
+		p += length;
 	      }
 	    else
 	      {
@@ -1814,7 +1798,7 @@
 	  return make_number (0);
 
 	if (force_singlebyte && force_multibyte)
-	  error ("Multibyte and single-byte escapes in one string constant");
+	  error ("Multibyte and unibyte characters in one string constant");
 
 	if (force_singlebyte)
 	  nchars = p - read_buffer;
@@ -1831,7 +1815,14 @@
 		return Fstring_make_unibyte (string);
 	      }
 	  }
+	else if (EQ (readcharfun, Qget_file_char))
+	  /* Nowadays, reading directly from a file
+	     is used only for compiled Emacs Lisp files,
+	     and those always use the Emacs internal encoding.  */
+	  nchars = multibyte_chars_in_text (read_buffer, p - read_buffer);
 	else
+	  /* In all other cases, if we read these bytes as
+	     separate characters, treat them as separate characters now.  */
 	  nchars = p - read_buffer;
 
 	if (read_pure)
@@ -1884,7 +1875,7 @@
 		      || c == '[' || c == ']' || c == '#'
 		      ))
 	    {
-	      if (p == end)
+	      if (end - p < MAX_LENGTH_OF_MULTI_BYTE_FORM)
 		{
 		  register char *new = (char *) xrealloc (read_buffer, read_buffer_size *= 2);
 		  p += new - read_buffer;
@@ -1897,7 +1888,19 @@
 		  quoted = 1;
 		}
 
-	      *p++ = c;
+	      if (! SINGLE_BYTE_CHAR_P (c))
+		{
+		  unsigned char workbuf[4];
+		  unsigned char *str = workbuf;
+		  int length;
+
+		  length = non_ascii_char_to_string (c, workbuf, &str);
+
+		  bcopy (str, p, length);
+		  p += length;
+		}
+	      else
+		*p++ = c;
 
 	      c = READCHAR;
 	    }
@@ -2553,7 +2556,7 @@
   Qvariable_documentation = intern ("variable-documentation");
   staticpro (&Qvariable_documentation);
 
-  read_buffer_size = 100;
+  read_buffer_size = 100 + MAX_LENGTH_OF_MULTI_BYTE_FORM;
   read_buffer = (char *) malloc (read_buffer_size);
 }