comparison src/lread.c @ 88383:0b4249d736a0

Include "character.h". (read_multibyte): New arg NBYTES. (read_escape): The meaning of returned *BYTEREP changed. (to_multibyte): Deleted. (read1): Adjuted the handling of char table and string.
author Kenichi Handa <handa@m17n.org>
date Fri, 01 Mar 2002 01:44:45 +0000
parents a6382f0fcb2a
children ef046df4c6ee
comparison
equal deleted inserted replaced
88382:5e3e1d9d514f 88383:0b4249d736a0
27 #include <sys/file.h> 27 #include <sys/file.h>
28 #include <errno.h> 28 #include <errno.h>
29 #include "lisp.h" 29 #include "lisp.h"
30 #include "intervals.h" 30 #include "intervals.h"
31 #include "buffer.h" 31 #include "buffer.h"
32 #include "character.h"
32 #include "charset.h" 33 #include "charset.h"
33 #include <epaths.h> 34 #include <epaths.h>
34 #include "commands.h" 35 #include "commands.h"
35 #include "keyboard.h" 36 #include "keyboard.h"
36 #include "termhooks.h" 37 #include "termhooks.h"
1474 static int read_buffer_size; 1475 static int read_buffer_size;
1475 static char *read_buffer; 1476 static char *read_buffer;
1476 1477
1477 /* Read multibyte form and return it as a character. C is a first 1478 /* Read multibyte form and return it as a character. C is a first
1478 byte of multibyte form, and rest of them are read from 1479 byte of multibyte form, and rest of them are read from
1479 READCHARFUN. */ 1480 READCHARFUN. Store the byte length of the form into *NBYTES. */
1480 1481
1481 static int 1482 static int
1482 read_multibyte (c, readcharfun) 1483 read_multibyte (c, readcharfun, nbytes)
1483 register int c; 1484 register int c;
1484 Lisp_Object readcharfun; 1485 Lisp_Object readcharfun;
1486 int *nbytes;
1485 { 1487 {
1486 /* We need the actual character code of this multibyte 1488 /* We need the actual character code of this multibyte
1487 characters. */ 1489 characters. */
1488 unsigned char str[MAX_MULTIBYTE_LENGTH]; 1490 unsigned char str[MAX_MULTIBYTE_LENGTH];
1489 int len = 0; 1491 int len = 0;
1490 int bytes; 1492 int bytes = BYTES_BY_CHAR_HEAD (c);
1491 1493
1492 str[len++] = c; 1494 str[len++] = c;
1493 while ((c = READCHAR) >= 0xA0 1495 while (len < bytes)
1494 && len < MAX_MULTIBYTE_LENGTH) 1496 {
1495 str[len++] = c; 1497 c = READCHAR;
1496 UNREAD (c); 1498 if (CHAR_HEAD_P (c))
1497 if (UNIBYTE_STR_AS_MULTIBYTE_P (str, len, bytes)) 1499 {
1498 return STRING_CHAR (str, len); 1500 UNREAD (c);
1501 break;
1502 }
1503 str[len++] = c;
1504 }
1505
1506 if (len == bytes && MULTIBYTE_LENGTH_NO_CHECK (str) > 0)
1507 {
1508 *nbytes = len;
1509 return STRING_CHAR (str, len);
1510 }
1499 /* The byte sequence is not valid as multibyte. Unread all bytes 1511 /* The byte sequence is not valid as multibyte. Unread all bytes
1500 but the first one, and return the first byte. */ 1512 but the first one, and return the first byte. */
1501 while (--len > 0) 1513 while (--len > 0)
1502 UNREAD (str[len]); 1514 UNREAD (str[len]);
1515 *nbytes = 1;
1503 return str[0]; 1516 return str[0];
1504 } 1517 }
1505 1518
1506 /* Read a \-escape sequence, assuming we already read the `\'. 1519 /* Read a \-escape sequence, assuming we already read the `\'.
1507 If the escape sequence forces unibyte, store 1 into *BYTEREP. 1520 If the escape sequence forces unibyte, store 1 into *BYTEREP.
1508 If the escape sequence forces multibyte, store 2 into *BYTEREP. 1521 If the escape sequence forces multibyte and the returned character
1522 is raw 8-bit char, store 2 into *BYTEREP.
1523 If the escape sequence forces multibyte and the returned character
1524 is not raw 8-bit char, store 3 into *BYTEREP.
1509 Otherwise store 0 into *BYTEREP. */ 1525 Otherwise store 0 into *BYTEREP. */
1510 1526
1511 static int 1527 static int
1512 read_escape (readcharfun, stringp, byterep) 1528 read_escape (readcharfun, stringp, byterep)
1513 Lisp_Object readcharfun; 1529 Lisp_Object readcharfun;
1638 UNREAD (c); 1654 UNREAD (c);
1639 break; 1655 break;
1640 } 1656 }
1641 } 1657 }
1642 1658
1643 *byterep = 1; 1659 if (c < 0x100)
1660 *byterep = 1;
1661 else
1662 *byterep = 3;
1644 return i; 1663 return i;
1645 } 1664 }
1646 1665
1647 case 'x': 1666 case 'x':
1648 /* A hex escape, as in ANSI C. */ 1667 /* A hex escape, as in ANSI C. */
1649 { 1668 {
1650 int i = 0; 1669 int i = 0;
1670 int count = 0;
1651 while (1) 1671 while (1)
1652 { 1672 {
1653 c = READCHAR; 1673 c = READCHAR;
1654 if (c >= '0' && c <= '9') 1674 if (c >= '0' && c <= '9')
1655 { 1675 {
1668 else 1688 else
1669 { 1689 {
1670 UNREAD (c); 1690 UNREAD (c);
1671 break; 1691 break;
1672 } 1692 }
1693 count++;
1673 } 1694 }
1674 1695
1675 *byterep = 2; 1696 if (count < 3 && i >= 0x80)
1697 *byterep = 2;
1698 else
1699 *byterep = 3;
1676 return i; 1700 return i;
1677 } 1701 }
1678 1702
1679 default: 1703 default:
1680 if (BASE_LEADING_CODE_P (c)) 1704 if (EQ (readcharfun, Qget_file_char)
1681 c = read_multibyte (c, readcharfun); 1705 && BASE_LEADING_CODE_P (c))
1706 {
1707 int nbytes;
1708
1709 c = read_multibyte (c, readcharfun, &nbytes);
1710 if (nbytes > 1)
1711 *byterep = 3;
1712 }
1682 return c; 1713 return c;
1683 } 1714 }
1684 } 1715 }
1685 1716
1686 1717
1748 1779
1749 return make_number (sign * number); 1780 return make_number (sign * number);
1750 } 1781 }
1751 1782
1752 1783
1753 /* Convert unibyte text in read_buffer to multibyte.
1754
1755 Initially, *P is a pointer after the end of the unibyte text, and
1756 the pointer *END points after the end of read_buffer.
1757
1758 If read_buffer doesn't have enough room to hold the result
1759 of the conversion, reallocate it and adjust *P and *END.
1760
1761 At the end, make *P point after the result of the conversion, and
1762 return in *NCHARS the number of characters in the converted
1763 text. */
1764
1765 static void
1766 to_multibyte (p, end, nchars)
1767 char **p, **end;
1768 int *nchars;
1769 {
1770 int nbytes;
1771
1772 parse_str_as_multibyte (read_buffer, *p - read_buffer, &nbytes, nchars);
1773 if (read_buffer_size < 2 * nbytes)
1774 {
1775 int offset = *p - read_buffer;
1776 read_buffer_size = 2 * max (read_buffer_size, nbytes);
1777 read_buffer = (char *) xrealloc (read_buffer, read_buffer_size);
1778 *p = read_buffer + offset;
1779 *end = read_buffer + read_buffer_size;
1780 }
1781
1782 if (nbytes != *nchars)
1783 nbytes = str_as_multibyte (read_buffer, read_buffer_size,
1784 *p - read_buffer, nchars);
1785
1786 *p = read_buffer + nbytes;
1787 }
1788
1789
1790 /* If the next token is ')' or ']' or '.', we store that character 1784 /* If the next token is ')' or ']' or '.', we store that character
1791 in *PCH and the return value is not interesting. Else, we store 1785 in *PCH and the return value is not interesting. Else, we store
1792 zero in *PCH and we read and return one lisp object. 1786 zero in *PCH and we read and return one lisp object.
1793 1787
1794 FIRST_IN_LIST is nonzero if this is the first element of a list. */ 1788 FIRST_IN_LIST is nonzero if this is the first element of a list. */
1832 c = READCHAR; 1826 c = READCHAR;
1833 if (c == '[') 1827 if (c == '[')
1834 { 1828 {
1835 Lisp_Object tmp; 1829 Lisp_Object tmp;
1836 tmp = read_vector (readcharfun, 0); 1830 tmp = read_vector (readcharfun, 0);
1837 if (XVECTOR (tmp)->size < CHAR_TABLE_STANDARD_SLOTS 1831 if (XVECTOR (tmp)->size != VECSIZE (struct Lisp_Char_Table))
1838 || XVECTOR (tmp)->size > CHAR_TABLE_STANDARD_SLOTS + 10)
1839 error ("Invalid size char-table"); 1832 error ("Invalid size char-table");
1840 XSETCHAR_TABLE (tmp, XCHAR_TABLE (tmp)); 1833 XSETCHAR_TABLE (tmp, XCHAR_TABLE (tmp));
1841 XCHAR_TABLE (tmp)->top = Qt;
1842 return tmp; 1834 return tmp;
1843 } 1835 }
1844 else if (c == '^') 1836 else if (c == '^')
1845 { 1837 {
1846 c = READCHAR; 1838 c = READCHAR;
1847 if (c == '[') 1839 if (c == '[')
1848 { 1840 {
1849 Lisp_Object tmp; 1841 Lisp_Object tmp;
1842 int depth, size;
1843
1850 tmp = read_vector (readcharfun, 0); 1844 tmp = read_vector (readcharfun, 0);
1851 if (XVECTOR (tmp)->size != SUB_CHAR_TABLE_STANDARD_SLOTS) 1845 if (!INTEGERP (AREF (tmp, 0)))
1846 error ("Invalid depth in char-table");
1847 depth = XINT (AREF (tmp, 0));
1848 if (depth < 1 || depth > 3)
1849 error ("Invalid depth in char-table");
1850 size = XVECTOR (tmp)->size + 2;
1851 if (chartab_size [depth] != size)
1852 error ("Invalid size char-table"); 1852 error ("Invalid size char-table");
1853 XSETCHAR_TABLE (tmp, XCHAR_TABLE (tmp)); 1853 XSETSUB_CHAR_TABLE (tmp, XSUB_CHAR_TABLE (tmp));
1854 XCHAR_TABLE (tmp)->top = Qnil;
1855 return tmp; 1854 return tmp;
1856 } 1855 }
1857 Fsignal (Qinvalid_read_syntax, 1856 Fsignal (Qinvalid_read_syntax,
1858 Fcons (make_string ("#^^", 3), Qnil)); 1857 Fcons (make_string ("#^^", 3), Qnil));
1859 } 1858 }
2132 if (c < 0) 2131 if (c < 0)
2133 end_of_file_error (); 2132 end_of_file_error ();
2134 2133
2135 if (c == '\\') 2134 if (c == '\\')
2136 c = read_escape (readcharfun, 0, &discard); 2135 c = read_escape (readcharfun, 0, &discard);
2137 else if (BASE_LEADING_CODE_P (c)) 2136 else if (EQ (readcharfun, Qget_file_char)
2138 c = read_multibyte (c, readcharfun); 2137 && BASE_LEADING_CODE_P (c))
2138 c = read_multibyte (c, readcharfun, &discard);
2139 2139
2140 return make_number (c); 2140 return make_number (c);
2141 } 2141 }
2142 2142
2143 case '"': 2143 case '"':
2144 { 2144 {
2145 char *p = read_buffer; 2145 char *p = read_buffer;
2146 char *end = read_buffer + read_buffer_size; 2146 char *end = read_buffer + read_buffer_size;
2147 register int c; 2147 register int c;
2148 /* 1 if we saw an escape sequence specifying 2148 /* Nonzero if we saw an escape sequence specifying
2149 a multibyte character, or a multibyte character. */ 2149 a multibyte character. */
2150 int force_multibyte = 0; 2150 int force_multibyte = 0;
2151 /* 1 if we saw an escape sequence specifying 2151 /* Nonzero if we saw an escape sequence specifying
2152 a single-byte character. */ 2152 a single-byte character. */
2153 int force_singlebyte = 0; 2153 int force_singlebyte = 0;
2154 /* 1 if read_buffer contains multibyte text now. */
2155 int is_multibyte = 0;
2156 int cancel = 0; 2154 int cancel = 0;
2157 int nchars = 0; 2155 int nchars = 0;
2158 2156
2159 while ((c = READCHAR) >= 0 2157 while ((c = READCHAR) >= 0
2160 && c != '\"') 2158 && c != '\"')
2168 end = read_buffer + read_buffer_size; 2166 end = read_buffer + read_buffer_size;
2169 } 2167 }
2170 2168
2171 if (c == '\\') 2169 if (c == '\\')
2172 { 2170 {
2171 int modifiers;
2173 int byterep; 2172 int byterep;
2174 2173
2175 c = read_escape (readcharfun, 1, &byterep); 2174 c = read_escape (readcharfun, 1, &byterep);
2176 2175
2177 /* C is -1 if \ newline has just been seen */ 2176 /* C is -1 if \ newline has just been seen */
2180 if (p == read_buffer) 2179 if (p == read_buffer)
2181 cancel = 1; 2180 cancel = 1;
2182 continue; 2181 continue;
2183 } 2182 }
2184 2183
2184 modifiers = c & CHAR_MODIFIER_MASK;
2185 c = c & ~CHAR_MODIFIER_MASK;
2186
2185 if (byterep == 1) 2187 if (byterep == 1)
2186 force_singlebyte = 1; 2188 {
2187 else if (byterep == 2) 2189 force_singlebyte = 1;
2190 if (c >= 0x80)
2191 /* Raw 8-bit code */
2192 c = BYTE8_TO_CHAR (c);
2193 }
2194 else if (byterep > 1)
2195 {
2196 force_multibyte = 1;
2197 if (byterep == 2)
2198 c = BYTE8_TO_CHAR (c);
2199 }
2200 else if (c >= 0x80)
2201 {
2202 force_singlebyte = 1;
2203 c = BYTE8_TO_CHAR (c);
2204 }
2205
2206 if (ASCII_CHAR_P (c))
2207 {
2208 /* Allow `\C- ' and `\C-?'. */
2209 if (modifiers == CHAR_CTL)
2210 {
2211 if (c == ' ')
2212 c = 0, modifiers = 0;
2213 else if (c == '?')
2214 c = 127, modifiers = 0;
2215 }
2216 if (modifiers & CHAR_SHIFT)
2217 {
2218 /* Shift modifier is valid only with [A-Za-z]. */
2219 if (c >= 'A' && c <= 'Z')
2220 modifiers &= ~CHAR_SHIFT;
2221 else if (c >= 'a' && c <= 'z')
2222 c -= ('a' - 'A'), modifiers &= ~CHAR_SHIFT;
2223 }
2224
2225 if (modifiers & CHAR_META)
2226 {
2227 /* Move the meta bit to the right place for a
2228 string. */
2229 modifiers &= ~CHAR_META;
2230 c = BYTE8_TO_CHAR (c | 0x80);
2231 force_singlebyte = 1;
2232 }
2233 }
2234
2235 /* Any modifiers remaining are invalid. */
2236 if (modifiers)
2237 error ("Invalid modifier in string");
2238 p += CHAR_STRING (c, (unsigned char *) p);
2239 }
2240 else if (c >= 0x80)
2241 {
2242 if (EQ (readcharfun, Qget_file_char))
2243 {
2244 if (BASE_LEADING_CODE_P (c))
2245 {
2246 int nbytes;
2247 c = read_multibyte (c, readcharfun, &nbytes);
2248 if (nbytes > 1)
2249 force_multibyte = 1;
2250 else
2251 {
2252 force_singlebyte = 1;
2253 c = BYTE8_TO_CHAR (c);
2254 }
2255 }
2256 else
2257 {
2258 force_singlebyte = 1;
2259 c = BYTE8_TO_CHAR (c);
2260 }
2261 }
2262 else
2188 force_multibyte = 1; 2263 force_multibyte = 1;
2264 p += CHAR_STRING (c, (unsigned char *) p);
2189 } 2265 }
2190
2191 /* A character that must be multibyte forces multibyte. */
2192 if (! SINGLE_BYTE_CHAR_P (c & ~CHAR_MODIFIER_MASK))
2193 force_multibyte = 1;
2194
2195 /* If we just discovered the need to be multibyte,
2196 convert the text accumulated thus far. */
2197 if (force_multibyte && ! is_multibyte)
2198 {
2199 is_multibyte = 1;
2200 to_multibyte (&p, &end, &nchars);
2201 }
2202
2203 /* Allow `\C- ' and `\C-?'. */
2204 if (c == (CHAR_CTL | ' '))
2205 c = 0;
2206 else if (c == (CHAR_CTL | '?'))
2207 c = 127;
2208
2209 if (c & CHAR_SHIFT)
2210 {
2211 /* Shift modifier is valid only with [A-Za-z]. */
2212 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
2213 c &= ~CHAR_SHIFT;
2214 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
2215 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
2216 }
2217
2218 if (c & CHAR_META)
2219 /* Move the meta bit to the right place for a string. */
2220 c = (c & ~CHAR_META) | 0x80;
2221 if (c & CHAR_MODIFIER_MASK)
2222 error ("Invalid modifier in string");
2223
2224 if (is_multibyte)
2225 p += CHAR_STRING (c, p);
2226 else 2266 else
2227 *p++ = c; 2267 *p++ = c;
2228
2229 nchars++; 2268 nchars++;
2230 } 2269 }
2231
2232 if (c < 0) 2270 if (c < 0)
2233 end_of_file_error (); 2271 end_of_file_error ();
2234 2272
2235 /* If purifying, and string starts with \ newline, 2273 /* If purifying, and string starts with \ newline,
2236 return zero instead. This is for doc strings 2274 return zero instead. This is for doc strings
2237 that we are really going to find in etc/DOC.nn.nn */ 2275 that we are really going to find in etc/DOC.nn.nn */
2238 if (!NILP (Vpurify_flag) && NILP (Vdoc_file_name) && cancel) 2276 if (!NILP (Vpurify_flag) && NILP (Vdoc_file_name) && cancel)
2239 return make_number (0); 2277 return make_number (0);
2240 2278
2241 if (is_multibyte || force_singlebyte) 2279 if (force_multibyte)
2280 /* READ_BUFFER already contains valid multibyte forms. */
2242 ; 2281 ;
2243 else if (load_convert_to_unibyte) 2282 else if (force_singlebyte)
2244 { 2283 {
2245 Lisp_Object string; 2284 nchars = str_as_unibyte (read_buffer, p - read_buffer);
2246 to_multibyte (&p, &end, &nchars); 2285 p = read_buffer + nchars;
2247 if (p - read_buffer != nchars)
2248 {
2249 string = make_multibyte_string (read_buffer, nchars,
2250 p - read_buffer);
2251 return Fstring_make_unibyte (string);
2252 }
2253 /* We can make a unibyte string directly. */
2254 is_multibyte = 0;
2255 }
2256 else if (EQ (readcharfun, Qget_file_char)
2257 || EQ (readcharfun, Qlambda))
2258 {
2259 /* Nowadays, reading directly from a file is used only for
2260 compiled Emacs Lisp files, and those always use the
2261 Emacs internal encoding. Meanwhile, Qlambda is used
2262 for reading dynamic byte code (compiled with
2263 byte-compile-dynamic = t). */
2264 to_multibyte (&p, &end, &nchars);
2265 is_multibyte = 1;
2266 } 2286 }
2267 else 2287 else
2268 /* In all other cases, if we read these bytes as 2288 /* Otherwise, READ_BUFFER contains only ASCII. */
2269 separate characters, treat them as separate characters now. */
2270 ;
2271 2289
2272 if (read_pure) 2290 if (read_pure)
2273 return make_pure_string (read_buffer, nchars, p - read_buffer, 2291 return make_pure_string (read_buffer, nchars, p - read_buffer,
2274 is_multibyte); 2292 (force_multibyte
2293 || (p - read_buffer != nchars)));
2275 return make_specified_string (read_buffer, nchars, p - read_buffer, 2294 return make_specified_string (read_buffer, nchars, p - read_buffer,
2276 is_multibyte); 2295 (force_multibyte
2296 || (p - read_buffer != nchars)));
2277 } 2297 }
2278 2298
2279 case '.': 2299 case '.':
2280 { 2300 {
2281 int next_char = READCHAR; 2301 int next_char = READCHAR;