comparison src/lread.c @ 21724:12a826a37249

(read_from_string_index_byte): New variable. (read_from_string_index): Now counts characters. (readchar, unreadchar, Fread_from_string): Changed accordingly. (readchar): Read a multibyte char all at once from a buffer, marker or string. (unreadchar): Unread a multibyte char all at once. (read1): Properly handle non-escaped multibyte chars. They force a string to be multibyte. When reading direct from a file, any multibyte sequence means a multibyte string. Insist on MAX_LENGTH_OF_MULTI_BYTE_FORM bytes when checking for read_buffer full; this way need not check specially for multibyte.
author Richard M. Stallman <rms@gnu.org>
date Thu, 23 Apr 1998 21:22:51 +0000
parents f42b78e033db
children f654c3b16214
comparison
equal deleted inserted replaced
21723:edc95c28d970 21724:12a826a37249
129 /* When nonzero, read conses in pure space */ 129 /* When nonzero, read conses in pure space */
130 static int read_pure; 130 static int read_pure;
131 131
132 /* For use within read-from-string (this reader is non-reentrant!!) */ 132 /* For use within read-from-string (this reader is non-reentrant!!) */
133 static int read_from_string_index; 133 static int read_from_string_index;
134 static int read_from_string_index_byte;
134 static int read_from_string_limit; 135 static int read_from_string_limit;
135 136
136 /* Number of bytes left to read in the buffer character 137 /* Number of bytes left to read in the buffer character
137 that `readchar' has already advanced over. */ 138 that `readchar' has already advanced over. */
138 static int readchar_backlog; 139 static int readchar_backlog;
167 static int 168 static int
168 readchar (readcharfun) 169 readchar (readcharfun)
169 Lisp_Object readcharfun; 170 Lisp_Object readcharfun;
170 { 171 {
171 Lisp_Object tem; 172 Lisp_Object tem;
172 register struct buffer *inbuffer;
173 register int c, mpos; 173 register int c, mpos;
174 174
175 if (BUFFERP (readcharfun)) 175 if (BUFFERP (readcharfun))
176 { 176 {
177 inbuffer = XBUFFER (readcharfun); 177 register struct buffer *inbuffer = XBUFFER (readcharfun);
178 178
179 if (readchar_backlog == 0) 179 int pt_byte = BUF_PT_BYTE (inbuffer);
180 { 180 int orig_pt_byte = pt_byte;
181 int pt_byte = BUF_PT_BYTE (inbuffer); 181
182 int orig_pt_byte = pt_byte; 182 if (pt_byte >= BUF_ZV_BYTE (inbuffer))
183 183 return -1;
184 if (pt_byte >= BUF_ZV_BYTE (inbuffer)) 184
185 return -1; 185 if (! NILP (inbuffer->enable_multibyte_characters))
186 186 {
187 if (! NILP (inbuffer->enable_multibyte_characters)) 187 unsigned char *p = BUF_BYTE_ADDRESS (inbuffer, pt_byte);
188 BUF_INC_POS (inbuffer, pt_byte); 188 BUF_INC_POS (inbuffer, pt_byte);
189 else 189 c = STRING_CHAR (p, pt_byte - orig_pt_byte);
190 pt_byte++; 190 }
191 SET_BUF_PT_BOTH (inbuffer, BUF_PT (inbuffer) + 1, pt_byte); 191 else
192 readchar_backlog = pt_byte - orig_pt_byte; 192 {
193 } 193 c = BUF_FETCH_BYTE (inbuffer, pt_byte);
194 194 pt_byte++;
195 /* We get the address of the byte just passed, 195 }
196 which is the last byte of the character. 196 SET_BUF_PT_BOTH (inbuffer, BUF_PT (inbuffer) + 1, pt_byte);
197 The other bytes in this character are consecutive with it, 197
198 because the gap can't be in the middle of a character. */ 198 return c;
199 return *(BUF_BYTE_ADDRESS (inbuffer, BUF_PT_BYTE (inbuffer) - 1)
200 - --readchar_backlog);
201 } 199 }
202 if (MARKERP (readcharfun)) 200 if (MARKERP (readcharfun))
203 { 201 {
204 inbuffer = XMARKER (readcharfun)->buffer; 202 register struct buffer *inbuffer = XMARKER (readcharfun)->buffer;
205 203
206 if (readchar_backlog == 0) 204 int bytepos = marker_byte_position (readcharfun);
207 { 205 int orig_bytepos = bytepos;
208 int bytepos = marker_byte_position (readcharfun); 206
209 int orig_bytepos = bytepos; 207 if (bytepos >= BUF_ZV_BYTE (inbuffer))
210 208 return -1;
211 if (bytepos >= BUF_ZV_BYTE (inbuffer)) 209
212 return -1; 210 if (! NILP (inbuffer->enable_multibyte_characters))
213 211 {
214 if (! NILP (inbuffer->enable_multibyte_characters)) 212 unsigned char *p = BUF_BYTE_ADDRESS (inbuffer, bytepos);
215 INC_POS (bytepos); 213 BUF_INC_POS (inbuffer, bytepos);
216 else 214 c = STRING_CHAR (p, bytepos - orig_bytepos);
217 bytepos++; 215 }
218 XMARKER (readcharfun)->bytepos = bytepos; 216 else
219 XMARKER (readcharfun)->charpos++; 217 {
220 218 c = BUF_FETCH_BYTE (inbuffer, bytepos);
221 readchar_backlog = bytepos - orig_bytepos; 219 bytepos++;
222 } 220 }
223 221
224 /* We get the address of the byte just passed, 222 XMARKER (readcharfun)->bytepos = bytepos;
225 which is the last byte of the character. 223 XMARKER (readcharfun)->charpos++;
226 The other bytes in this character are consecutive with it, 224
227 because the gap can't be in the middle of a character. */ 225 return c;
228 return *(BUF_BYTE_ADDRESS (inbuffer, XMARKER (readcharfun)->bytepos - 1)
229 - --readchar_backlog);
230 } 226 }
231 if (EQ (readcharfun, Qget_file_char)) 227 if (EQ (readcharfun, Qget_file_char))
232 { 228 {
233 c = getc (instream); 229 c = getc (instream);
234 #ifdef EINTR 230 #ifdef EINTR
242 return c; 238 return c;
243 } 239 }
244 240
245 if (STRINGP (readcharfun)) 241 if (STRINGP (readcharfun))
246 { 242 {
247 register int c; 243 if (read_from_string_index >= read_from_string_limit)
248 /* This used to be return of a conditional expression, 244 c = -1;
249 but that truncated -1 to a char on VMS. */ 245 else if (STRING_MULTIBYTE (readcharfun))
250 if (read_from_string_index < read_from_string_limit) 246 FETCH_STRING_CHAR_ADVANCE (c, readcharfun,
247 read_from_string_index,
248 read_from_string_index_byte);
249 else
251 c = XSTRING (readcharfun)->data[read_from_string_index++]; 250 c = XSTRING (readcharfun)->data[read_from_string_index++];
252 else
253 c = -1;
254 251
255 return c; 252 return c;
256 } 253 }
257 254
258 tem = call0 (readcharfun); 255 tem = call0 (readcharfun);
274 /* Don't back up the pointer if we're unreading the end-of-input mark, 271 /* Don't back up the pointer if we're unreading the end-of-input mark,
275 since readchar didn't advance it when we read it. */ 272 since readchar didn't advance it when we read it. */
276 ; 273 ;
277 else if (BUFFERP (readcharfun)) 274 else if (BUFFERP (readcharfun))
278 { 275 {
279 if (!SINGLE_BYTE_CHAR_P (c)) 276 struct buffer *b = XBUFFER (readcharfun);
280 readchar_backlog++; 277 int bytepos = BUF_PT_BYTE (b);
278
279 BUF_PT (b)--;
280 if (! NILP (b->enable_multibyte_characters))
281 BUF_DEC_POS (b, bytepos);
281 else 282 else
282 { 283 bytepos--;
283 struct buffer *b = XBUFFER (readcharfun); 284
284 int bytepos = BUF_PT_BYTE (b); 285 BUF_PT_BYTE (b) = bytepos;
285
286 BUF_PT (b)--;
287 if (! NILP (b->enable_multibyte_characters))
288 BUF_DEC_POS (b, bytepos);
289 else
290 bytepos--;
291
292 BUF_PT_BYTE (b) = bytepos;
293 }
294 } 286 }
295 else if (MARKERP (readcharfun)) 287 else if (MARKERP (readcharfun))
296 { 288 {
297 if (!SINGLE_BYTE_CHAR_P (c)) 289 struct buffer *b = XMARKER (readcharfun)->buffer;
298 readchar_backlog++; 290 int bytepos = XMARKER (readcharfun)->bytepos;
291
292 XMARKER (readcharfun)->charpos--;
293 if (! NILP (b->enable_multibyte_characters))
294 BUF_DEC_POS (b, bytepos);
299 else 295 else
300 { 296 bytepos--;
301 struct buffer *b = XMARKER (readcharfun)->buffer; 297
302 int bytepos = XMARKER (readcharfun)->bytepos; 298 XMARKER (readcharfun)->bytepos = bytepos;
303
304 XMARKER (readcharfun)->charpos--;
305 if (! NILP (b->enable_multibyte_characters))
306 BUF_DEC_POS (b, bytepos);
307 else
308 bytepos--;
309
310 XMARKER (readcharfun)->bytepos = bytepos;
311 }
312 } 299 }
313 else if (STRINGP (readcharfun)) 300 else if (STRINGP (readcharfun))
314 read_from_string_index--; 301 {
302 read_from_string_index--;
303 read_from_string_index_byte
304 = string_char_to_byte (readcharfun, read_from_string_index);
305 }
315 else if (EQ (readcharfun, Qget_file_char)) 306 else if (EQ (readcharfun, Qget_file_char))
316 ungetc (c, instream); 307 ungetc (c, instream);
317 else 308 else
318 call1 (readcharfun, make_number (c)); 309 call1 (readcharfun, make_number (c));
319 } 310 }
320 311
321 static Lisp_Object read0 (), read1 (), read_list (), read_vector (); 312 static Lisp_Object read0 (), read1 (), read_list (), read_vector ();
322 static int read_multibyte (); 313 static int read_multibyte ();
323 314
324 /* get a character from the tty */ 315 /* Get a character from the tty. */
325 316
326 extern Lisp_Object read_char (); 317 extern Lisp_Object read_char ();
327 318
328 /* Read input events until we get one that's acceptable for our purposes. 319 /* Read input events until we get one that's acceptable for our purposes.
329 320
1178 Lisp_Object tem; 1169 Lisp_Object tem;
1179 1170
1180 CHECK_STRING (string,0); 1171 CHECK_STRING (string,0);
1181 1172
1182 if (NILP (end)) 1173 if (NILP (end))
1183 endval = STRING_BYTES (XSTRING (string)); 1174 endval = XSTRING (string)->size;
1184 else 1175 else
1185 { 1176 {
1186 CHECK_NUMBER (end, 2); 1177 CHECK_NUMBER (end, 2);
1187 endval = string_char_to_byte (string, XINT (end)); 1178 if (endval < 0 || endval > XSTRING (string)->size)
1188 if (endval < 0 || endval > STRING_BYTES (XSTRING (string)))
1189 args_out_of_range (string, end); 1179 args_out_of_range (string, end);
1190 } 1180 }
1191 1181
1192 if (NILP (start)) 1182 if (NILP (start))
1193 startval = 0; 1183 startval = 0;
1194 else 1184 else
1195 { 1185 {
1196 CHECK_NUMBER (start, 1); 1186 CHECK_NUMBER (start, 1);
1197 startval = string_char_to_byte (string, XINT (start));
1198 if (startval < 0 || startval > endval) 1187 if (startval < 0 || startval > endval)
1199 args_out_of_range (string, start); 1188 args_out_of_range (string, start);
1200 } 1189 }
1201 1190
1202 read_from_string_index = startval; 1191 read_from_string_index = startval;
1192 read_from_string_index_byte = string_char_to_byte (string, startval);
1203 read_from_string_limit = endval; 1193 read_from_string_limit = endval;
1204 1194
1205 new_backquote_flag = 0; 1195 new_backquote_flag = 0;
1206 read_objects = Qnil; 1196 read_objects = Qnil;
1207 1197
1208 tem = read0 (string); 1198 tem = read0 (string);
1209 endval = string_byte_to_char (string, 1199 return Fcons (tem, make_number (read_from_string_index));
1210 read_from_string_index);
1211 return Fcons (tem, make_number (endval));
1212 } 1200 }
1213 1201
1214 /* Use this for recursive reads, in contexts where internal tokens 1202 /* Use this for recursive reads, in contexts where internal tokens
1215 are not allowed. */ 1203 are not allowed. */
1216 1204
1742 int nchars; 1730 int nchars;
1743 1731
1744 while ((c = READCHAR) >= 0 1732 while ((c = READCHAR) >= 0
1745 && c != '\"') 1733 && c != '\"')
1746 { 1734 {
1747 if (p == end) 1735 if (end - p < MAX_LENGTH_OF_MULTI_BYTE_FORM)
1748 { 1736 {
1749 char *new = (char *) xrealloc (read_buffer, read_buffer_size *= 2); 1737 char *new = (char *) xrealloc (read_buffer, read_buffer_size *= 2);
1750 p += new - read_buffer; 1738 p += new - read_buffer;
1751 read_buffer += new - read_buffer; 1739 read_buffer += new - read_buffer;
1752 end = read_buffer + read_buffer_size; 1740 end = read_buffer + read_buffer_size;
1753 } 1741 }
1742
1754 if (c == '\\') 1743 if (c == '\\')
1755 { 1744 {
1756 c = read_escape (readcharfun, 1); 1745 c = read_escape (readcharfun, 1);
1757 if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_META))) 1746
1747 /* C is -1 if \ newline has just been seen */
1748 if (c == -1)
1758 { 1749 {
1759 unsigned char workbuf[4]; 1750 if (p == read_buffer)
1760 unsigned char *str = workbuf; 1751 cancel = 1;
1761 int length;
1762
1763 length = non_ascii_char_to_string (c, workbuf, &str);
1764 if (length > 1)
1765 force_multibyte = 1;
1766
1767 if (p + length > end)
1768 {
1769 char *new = (char *) xrealloc (read_buffer, read_buffer_size *= 2);
1770 p += new - read_buffer;
1771 read_buffer += new - read_buffer;
1772 end = read_buffer + read_buffer_size;
1773 }
1774
1775 bcopy (str, p, length);
1776 p += length;
1777 continue; 1752 continue;
1778 } 1753 }
1754
1779 /* If an escape specifies a non-ASCII single-byte character, 1755 /* If an escape specifies a non-ASCII single-byte character,
1780 this must be a unibyte string. */ 1756 this must be a unibyte string. */
1781 else if (! ASCII_BYTE_P (c)) 1757 if (SINGLE_BYTE_CHAR_P ((c & ~CHAR_META))
1758 && ! ASCII_BYTE_P (c))
1782 force_singlebyte = 1; 1759 force_singlebyte = 1;
1783 } 1760 }
1784 1761
1785 /* c is -1 if \ newline has just been seen */ 1762 if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_META)))
1786 if (c == -1)
1787 { 1763 {
1788 if (p == read_buffer) 1764 unsigned char workbuf[4];
1789 cancel = 1; 1765 unsigned char *str = workbuf;
1766 int length;
1767
1768 length = non_ascii_char_to_string (c, workbuf, &str);
1769 if (length > 1)
1770 force_multibyte = 1;
1771
1772 bcopy (str, p, length);
1773 p += length;
1790 } 1774 }
1791 else 1775 else
1792 { 1776 {
1793 /* Allow `\C- ' and `\C-?'. */ 1777 /* Allow `\C- ' and `\C-?'. */
1794 if (c == (CHAR_CTL | ' ')) 1778 if (c == (CHAR_CTL | ' '))
1812 that we are really going to find in etc/DOC.nn.nn */ 1796 that we are really going to find in etc/DOC.nn.nn */
1813 if (!NILP (Vpurify_flag) && NILP (Vdoc_file_name) && cancel) 1797 if (!NILP (Vpurify_flag) && NILP (Vdoc_file_name) && cancel)
1814 return make_number (0); 1798 return make_number (0);
1815 1799
1816 if (force_singlebyte && force_multibyte) 1800 if (force_singlebyte && force_multibyte)
1817 error ("Multibyte and single-byte escapes in one string constant"); 1801 error ("Multibyte and unibyte characters in one string constant");
1818 1802
1819 if (force_singlebyte) 1803 if (force_singlebyte)
1820 nchars = p - read_buffer; 1804 nchars = p - read_buffer;
1821 else if (force_multibyte) 1805 else if (force_multibyte)
1822 nchars = multibyte_chars_in_text (read_buffer, p - read_buffer); 1806 nchars = multibyte_chars_in_text (read_buffer, p - read_buffer);
1829 string = make_multibyte_string (read_buffer, nchars, 1813 string = make_multibyte_string (read_buffer, nchars,
1830 p - read_buffer); 1814 p - read_buffer);
1831 return Fstring_make_unibyte (string); 1815 return Fstring_make_unibyte (string);
1832 } 1816 }
1833 } 1817 }
1818 else if (EQ (readcharfun, Qget_file_char))
1819 /* Nowadays, reading directly from a file
1820 is used only for compiled Emacs Lisp files,
1821 and those always use the Emacs internal encoding. */
1822 nchars = multibyte_chars_in_text (read_buffer, p - read_buffer);
1834 else 1823 else
1824 /* In all other cases, if we read these bytes as
1825 separate characters, treat them as separate characters now. */
1835 nchars = p - read_buffer; 1826 nchars = p - read_buffer;
1836 1827
1837 if (read_pure) 1828 if (read_pure)
1838 return make_pure_string (read_buffer, nchars, p - read_buffer, 1829 return make_pure_string (read_buffer, nchars, p - read_buffer,
1839 (force_multibyte 1830 (force_multibyte
1882 || c =='.' 1873 || c =='.'
1883 #endif /* not LISP_FLOAT_TYPE */ 1874 #endif /* not LISP_FLOAT_TYPE */
1884 || c == '[' || c == ']' || c == '#' 1875 || c == '[' || c == ']' || c == '#'
1885 )) 1876 ))
1886 { 1877 {
1887 if (p == end) 1878 if (end - p < MAX_LENGTH_OF_MULTI_BYTE_FORM)
1888 { 1879 {
1889 register char *new = (char *) xrealloc (read_buffer, read_buffer_size *= 2); 1880 register char *new = (char *) xrealloc (read_buffer, read_buffer_size *= 2);
1890 p += new - read_buffer; 1881 p += new - read_buffer;
1891 read_buffer += new - read_buffer; 1882 read_buffer += new - read_buffer;
1892 end = read_buffer + read_buffer_size; 1883 end = read_buffer + read_buffer_size;
1895 { 1886 {
1896 c = READCHAR; 1887 c = READCHAR;
1897 quoted = 1; 1888 quoted = 1;
1898 } 1889 }
1899 1890
1900 *p++ = c; 1891 if (! SINGLE_BYTE_CHAR_P (c))
1892 {
1893 unsigned char workbuf[4];
1894 unsigned char *str = workbuf;
1895 int length;
1896
1897 length = non_ascii_char_to_string (c, workbuf, &str);
1898
1899 bcopy (str, p, length);
1900 p += length;
1901 }
1902 else
1903 *p++ = c;
1901 1904
1902 c = READCHAR; 1905 c = READCHAR;
1903 } 1906 }
1904 1907
1905 if (p == end) 1908 if (p == end)
2551 Vpurify_flag = Qt; 2554 Vpurify_flag = Qt;
2552 2555
2553 Qvariable_documentation = intern ("variable-documentation"); 2556 Qvariable_documentation = intern ("variable-documentation");
2554 staticpro (&Qvariable_documentation); 2557 staticpro (&Qvariable_documentation);
2555 2558
2556 read_buffer_size = 100; 2559 read_buffer_size = 100 + MAX_LENGTH_OF_MULTI_BYTE_FORM;
2557 read_buffer = (char *) malloc (read_buffer_size); 2560 read_buffer = (char *) malloc (read_buffer_size);
2558 } 2561 }
2559 2562
2560 void 2563 void
2561 defsubr (sname) 2564 defsubr (sname)