Mercurial > emacs
comparison src/lread.c @ 88383:0b4249d736a0
Include "character.h".
(read_multibyte): New arg NBYTES.
(read_escape): The meaning of returned *BYTEREP changed.
(to_multibyte): Deleted.
(read1): Adjuted the handling of char table and string.
author | Kenichi Handa <handa@m17n.org> |
---|---|
date | Fri, 01 Mar 2002 01:44:45 +0000 |
parents | a6382f0fcb2a |
children | ef046df4c6ee |
comparison
equal
deleted
inserted
replaced
88382:5e3e1d9d514f | 88383:0b4249d736a0 |
---|---|
27 #include <sys/file.h> | 27 #include <sys/file.h> |
28 #include <errno.h> | 28 #include <errno.h> |
29 #include "lisp.h" | 29 #include "lisp.h" |
30 #include "intervals.h" | 30 #include "intervals.h" |
31 #include "buffer.h" | 31 #include "buffer.h" |
32 #include "character.h" | |
32 #include "charset.h" | 33 #include "charset.h" |
33 #include <epaths.h> | 34 #include <epaths.h> |
34 #include "commands.h" | 35 #include "commands.h" |
35 #include "keyboard.h" | 36 #include "keyboard.h" |
36 #include "termhooks.h" | 37 #include "termhooks.h" |
1474 static int read_buffer_size; | 1475 static int read_buffer_size; |
1475 static char *read_buffer; | 1476 static char *read_buffer; |
1476 | 1477 |
1477 /* Read multibyte form and return it as a character. C is a first | 1478 /* Read multibyte form and return it as a character. C is a first |
1478 byte of multibyte form, and rest of them are read from | 1479 byte of multibyte form, and rest of them are read from |
1479 READCHARFUN. */ | 1480 READCHARFUN. Store the byte length of the form into *NBYTES. */ |
1480 | 1481 |
1481 static int | 1482 static int |
1482 read_multibyte (c, readcharfun) | 1483 read_multibyte (c, readcharfun, nbytes) |
1483 register int c; | 1484 register int c; |
1484 Lisp_Object readcharfun; | 1485 Lisp_Object readcharfun; |
1486 int *nbytes; | |
1485 { | 1487 { |
1486 /* We need the actual character code of this multibyte | 1488 /* We need the actual character code of this multibyte |
1487 characters. */ | 1489 characters. */ |
1488 unsigned char str[MAX_MULTIBYTE_LENGTH]; | 1490 unsigned char str[MAX_MULTIBYTE_LENGTH]; |
1489 int len = 0; | 1491 int len = 0; |
1490 int bytes; | 1492 int bytes = BYTES_BY_CHAR_HEAD (c); |
1491 | 1493 |
1492 str[len++] = c; | 1494 str[len++] = c; |
1493 while ((c = READCHAR) >= 0xA0 | 1495 while (len < bytes) |
1494 && len < MAX_MULTIBYTE_LENGTH) | 1496 { |
1495 str[len++] = c; | 1497 c = READCHAR; |
1496 UNREAD (c); | 1498 if (CHAR_HEAD_P (c)) |
1497 if (UNIBYTE_STR_AS_MULTIBYTE_P (str, len, bytes)) | 1499 { |
1498 return STRING_CHAR (str, len); | 1500 UNREAD (c); |
1501 break; | |
1502 } | |
1503 str[len++] = c; | |
1504 } | |
1505 | |
1506 if (len == bytes && MULTIBYTE_LENGTH_NO_CHECK (str) > 0) | |
1507 { | |
1508 *nbytes = len; | |
1509 return STRING_CHAR (str, len); | |
1510 } | |
1499 /* The byte sequence is not valid as multibyte. Unread all bytes | 1511 /* The byte sequence is not valid as multibyte. Unread all bytes |
1500 but the first one, and return the first byte. */ | 1512 but the first one, and return the first byte. */ |
1501 while (--len > 0) | 1513 while (--len > 0) |
1502 UNREAD (str[len]); | 1514 UNREAD (str[len]); |
1515 *nbytes = 1; | |
1503 return str[0]; | 1516 return str[0]; |
1504 } | 1517 } |
1505 | 1518 |
1506 /* Read a \-escape sequence, assuming we already read the `\'. | 1519 /* Read a \-escape sequence, assuming we already read the `\'. |
1507 If the escape sequence forces unibyte, store 1 into *BYTEREP. | 1520 If the escape sequence forces unibyte, store 1 into *BYTEREP. |
1508 If the escape sequence forces multibyte, store 2 into *BYTEREP. | 1521 If the escape sequence forces multibyte and the returned character |
1522 is raw 8-bit char, store 2 into *BYTEREP. | |
1523 If the escape sequence forces multibyte and the returned character | |
1524 is not raw 8-bit char, store 3 into *BYTEREP. | |
1509 Otherwise store 0 into *BYTEREP. */ | 1525 Otherwise store 0 into *BYTEREP. */ |
1510 | 1526 |
1511 static int | 1527 static int |
1512 read_escape (readcharfun, stringp, byterep) | 1528 read_escape (readcharfun, stringp, byterep) |
1513 Lisp_Object readcharfun; | 1529 Lisp_Object readcharfun; |
1638 UNREAD (c); | 1654 UNREAD (c); |
1639 break; | 1655 break; |
1640 } | 1656 } |
1641 } | 1657 } |
1642 | 1658 |
1643 *byterep = 1; | 1659 if (c < 0x100) |
1660 *byterep = 1; | |
1661 else | |
1662 *byterep = 3; | |
1644 return i; | 1663 return i; |
1645 } | 1664 } |
1646 | 1665 |
1647 case 'x': | 1666 case 'x': |
1648 /* A hex escape, as in ANSI C. */ | 1667 /* A hex escape, as in ANSI C. */ |
1649 { | 1668 { |
1650 int i = 0; | 1669 int i = 0; |
1670 int count = 0; | |
1651 while (1) | 1671 while (1) |
1652 { | 1672 { |
1653 c = READCHAR; | 1673 c = READCHAR; |
1654 if (c >= '0' && c <= '9') | 1674 if (c >= '0' && c <= '9') |
1655 { | 1675 { |
1668 else | 1688 else |
1669 { | 1689 { |
1670 UNREAD (c); | 1690 UNREAD (c); |
1671 break; | 1691 break; |
1672 } | 1692 } |
1693 count++; | |
1673 } | 1694 } |
1674 | 1695 |
1675 *byterep = 2; | 1696 if (count < 3 && i >= 0x80) |
1697 *byterep = 2; | |
1698 else | |
1699 *byterep = 3; | |
1676 return i; | 1700 return i; |
1677 } | 1701 } |
1678 | 1702 |
1679 default: | 1703 default: |
1680 if (BASE_LEADING_CODE_P (c)) | 1704 if (EQ (readcharfun, Qget_file_char) |
1681 c = read_multibyte (c, readcharfun); | 1705 && BASE_LEADING_CODE_P (c)) |
1706 { | |
1707 int nbytes; | |
1708 | |
1709 c = read_multibyte (c, readcharfun, &nbytes); | |
1710 if (nbytes > 1) | |
1711 *byterep = 3; | |
1712 } | |
1682 return c; | 1713 return c; |
1683 } | 1714 } |
1684 } | 1715 } |
1685 | 1716 |
1686 | 1717 |
1748 | 1779 |
1749 return make_number (sign * number); | 1780 return make_number (sign * number); |
1750 } | 1781 } |
1751 | 1782 |
1752 | 1783 |
1753 /* Convert unibyte text in read_buffer to multibyte. | |
1754 | |
1755 Initially, *P is a pointer after the end of the unibyte text, and | |
1756 the pointer *END points after the end of read_buffer. | |
1757 | |
1758 If read_buffer doesn't have enough room to hold the result | |
1759 of the conversion, reallocate it and adjust *P and *END. | |
1760 | |
1761 At the end, make *P point after the result of the conversion, and | |
1762 return in *NCHARS the number of characters in the converted | |
1763 text. */ | |
1764 | |
1765 static void | |
1766 to_multibyte (p, end, nchars) | |
1767 char **p, **end; | |
1768 int *nchars; | |
1769 { | |
1770 int nbytes; | |
1771 | |
1772 parse_str_as_multibyte (read_buffer, *p - read_buffer, &nbytes, nchars); | |
1773 if (read_buffer_size < 2 * nbytes) | |
1774 { | |
1775 int offset = *p - read_buffer; | |
1776 read_buffer_size = 2 * max (read_buffer_size, nbytes); | |
1777 read_buffer = (char *) xrealloc (read_buffer, read_buffer_size); | |
1778 *p = read_buffer + offset; | |
1779 *end = read_buffer + read_buffer_size; | |
1780 } | |
1781 | |
1782 if (nbytes != *nchars) | |
1783 nbytes = str_as_multibyte (read_buffer, read_buffer_size, | |
1784 *p - read_buffer, nchars); | |
1785 | |
1786 *p = read_buffer + nbytes; | |
1787 } | |
1788 | |
1789 | |
1790 /* If the next token is ')' or ']' or '.', we store that character | 1784 /* If the next token is ')' or ']' or '.', we store that character |
1791 in *PCH and the return value is not interesting. Else, we store | 1785 in *PCH and the return value is not interesting. Else, we store |
1792 zero in *PCH and we read and return one lisp object. | 1786 zero in *PCH and we read and return one lisp object. |
1793 | 1787 |
1794 FIRST_IN_LIST is nonzero if this is the first element of a list. */ | 1788 FIRST_IN_LIST is nonzero if this is the first element of a list. */ |
1832 c = READCHAR; | 1826 c = READCHAR; |
1833 if (c == '[') | 1827 if (c == '[') |
1834 { | 1828 { |
1835 Lisp_Object tmp; | 1829 Lisp_Object tmp; |
1836 tmp = read_vector (readcharfun, 0); | 1830 tmp = read_vector (readcharfun, 0); |
1837 if (XVECTOR (tmp)->size < CHAR_TABLE_STANDARD_SLOTS | 1831 if (XVECTOR (tmp)->size != VECSIZE (struct Lisp_Char_Table)) |
1838 || XVECTOR (tmp)->size > CHAR_TABLE_STANDARD_SLOTS + 10) | |
1839 error ("Invalid size char-table"); | 1832 error ("Invalid size char-table"); |
1840 XSETCHAR_TABLE (tmp, XCHAR_TABLE (tmp)); | 1833 XSETCHAR_TABLE (tmp, XCHAR_TABLE (tmp)); |
1841 XCHAR_TABLE (tmp)->top = Qt; | |
1842 return tmp; | 1834 return tmp; |
1843 } | 1835 } |
1844 else if (c == '^') | 1836 else if (c == '^') |
1845 { | 1837 { |
1846 c = READCHAR; | 1838 c = READCHAR; |
1847 if (c == '[') | 1839 if (c == '[') |
1848 { | 1840 { |
1849 Lisp_Object tmp; | 1841 Lisp_Object tmp; |
1842 int depth, size; | |
1843 | |
1850 tmp = read_vector (readcharfun, 0); | 1844 tmp = read_vector (readcharfun, 0); |
1851 if (XVECTOR (tmp)->size != SUB_CHAR_TABLE_STANDARD_SLOTS) | 1845 if (!INTEGERP (AREF (tmp, 0))) |
1846 error ("Invalid depth in char-table"); | |
1847 depth = XINT (AREF (tmp, 0)); | |
1848 if (depth < 1 || depth > 3) | |
1849 error ("Invalid depth in char-table"); | |
1850 size = XVECTOR (tmp)->size + 2; | |
1851 if (chartab_size [depth] != size) | |
1852 error ("Invalid size char-table"); | 1852 error ("Invalid size char-table"); |
1853 XSETCHAR_TABLE (tmp, XCHAR_TABLE (tmp)); | 1853 XSETSUB_CHAR_TABLE (tmp, XSUB_CHAR_TABLE (tmp)); |
1854 XCHAR_TABLE (tmp)->top = Qnil; | |
1855 return tmp; | 1854 return tmp; |
1856 } | 1855 } |
1857 Fsignal (Qinvalid_read_syntax, | 1856 Fsignal (Qinvalid_read_syntax, |
1858 Fcons (make_string ("#^^", 3), Qnil)); | 1857 Fcons (make_string ("#^^", 3), Qnil)); |
1859 } | 1858 } |
2132 if (c < 0) | 2131 if (c < 0) |
2133 end_of_file_error (); | 2132 end_of_file_error (); |
2134 | 2133 |
2135 if (c == '\\') | 2134 if (c == '\\') |
2136 c = read_escape (readcharfun, 0, &discard); | 2135 c = read_escape (readcharfun, 0, &discard); |
2137 else if (BASE_LEADING_CODE_P (c)) | 2136 else if (EQ (readcharfun, Qget_file_char) |
2138 c = read_multibyte (c, readcharfun); | 2137 && BASE_LEADING_CODE_P (c)) |
2138 c = read_multibyte (c, readcharfun, &discard); | |
2139 | 2139 |
2140 return make_number (c); | 2140 return make_number (c); |
2141 } | 2141 } |
2142 | 2142 |
2143 case '"': | 2143 case '"': |
2144 { | 2144 { |
2145 char *p = read_buffer; | 2145 char *p = read_buffer; |
2146 char *end = read_buffer + read_buffer_size; | 2146 char *end = read_buffer + read_buffer_size; |
2147 register int c; | 2147 register int c; |
2148 /* 1 if we saw an escape sequence specifying | 2148 /* Nonzero if we saw an escape sequence specifying |
2149 a multibyte character, or a multibyte character. */ | 2149 a multibyte character. */ |
2150 int force_multibyte = 0; | 2150 int force_multibyte = 0; |
2151 /* 1 if we saw an escape sequence specifying | 2151 /* Nonzero if we saw an escape sequence specifying |
2152 a single-byte character. */ | 2152 a single-byte character. */ |
2153 int force_singlebyte = 0; | 2153 int force_singlebyte = 0; |
2154 /* 1 if read_buffer contains multibyte text now. */ | |
2155 int is_multibyte = 0; | |
2156 int cancel = 0; | 2154 int cancel = 0; |
2157 int nchars = 0; | 2155 int nchars = 0; |
2158 | 2156 |
2159 while ((c = READCHAR) >= 0 | 2157 while ((c = READCHAR) >= 0 |
2160 && c != '\"') | 2158 && c != '\"') |
2168 end = read_buffer + read_buffer_size; | 2166 end = read_buffer + read_buffer_size; |
2169 } | 2167 } |
2170 | 2168 |
2171 if (c == '\\') | 2169 if (c == '\\') |
2172 { | 2170 { |
2171 int modifiers; | |
2173 int byterep; | 2172 int byterep; |
2174 | 2173 |
2175 c = read_escape (readcharfun, 1, &byterep); | 2174 c = read_escape (readcharfun, 1, &byterep); |
2176 | 2175 |
2177 /* C is -1 if \ newline has just been seen */ | 2176 /* C is -1 if \ newline has just been seen */ |
2180 if (p == read_buffer) | 2179 if (p == read_buffer) |
2181 cancel = 1; | 2180 cancel = 1; |
2182 continue; | 2181 continue; |
2183 } | 2182 } |
2184 | 2183 |
2184 modifiers = c & CHAR_MODIFIER_MASK; | |
2185 c = c & ~CHAR_MODIFIER_MASK; | |
2186 | |
2185 if (byterep == 1) | 2187 if (byterep == 1) |
2186 force_singlebyte = 1; | 2188 { |
2187 else if (byterep == 2) | 2189 force_singlebyte = 1; |
2190 if (c >= 0x80) | |
2191 /* Raw 8-bit code */ | |
2192 c = BYTE8_TO_CHAR (c); | |
2193 } | |
2194 else if (byterep > 1) | |
2195 { | |
2196 force_multibyte = 1; | |
2197 if (byterep == 2) | |
2198 c = BYTE8_TO_CHAR (c); | |
2199 } | |
2200 else if (c >= 0x80) | |
2201 { | |
2202 force_singlebyte = 1; | |
2203 c = BYTE8_TO_CHAR (c); | |
2204 } | |
2205 | |
2206 if (ASCII_CHAR_P (c)) | |
2207 { | |
2208 /* Allow `\C- ' and `\C-?'. */ | |
2209 if (modifiers == CHAR_CTL) | |
2210 { | |
2211 if (c == ' ') | |
2212 c = 0, modifiers = 0; | |
2213 else if (c == '?') | |
2214 c = 127, modifiers = 0; | |
2215 } | |
2216 if (modifiers & CHAR_SHIFT) | |
2217 { | |
2218 /* Shift modifier is valid only with [A-Za-z]. */ | |
2219 if (c >= 'A' && c <= 'Z') | |
2220 modifiers &= ~CHAR_SHIFT; | |
2221 else if (c >= 'a' && c <= 'z') | |
2222 c -= ('a' - 'A'), modifiers &= ~CHAR_SHIFT; | |
2223 } | |
2224 | |
2225 if (modifiers & CHAR_META) | |
2226 { | |
2227 /* Move the meta bit to the right place for a | |
2228 string. */ | |
2229 modifiers &= ~CHAR_META; | |
2230 c = BYTE8_TO_CHAR (c | 0x80); | |
2231 force_singlebyte = 1; | |
2232 } | |
2233 } | |
2234 | |
2235 /* Any modifiers remaining are invalid. */ | |
2236 if (modifiers) | |
2237 error ("Invalid modifier in string"); | |
2238 p += CHAR_STRING (c, (unsigned char *) p); | |
2239 } | |
2240 else if (c >= 0x80) | |
2241 { | |
2242 if (EQ (readcharfun, Qget_file_char)) | |
2243 { | |
2244 if (BASE_LEADING_CODE_P (c)) | |
2245 { | |
2246 int nbytes; | |
2247 c = read_multibyte (c, readcharfun, &nbytes); | |
2248 if (nbytes > 1) | |
2249 force_multibyte = 1; | |
2250 else | |
2251 { | |
2252 force_singlebyte = 1; | |
2253 c = BYTE8_TO_CHAR (c); | |
2254 } | |
2255 } | |
2256 else | |
2257 { | |
2258 force_singlebyte = 1; | |
2259 c = BYTE8_TO_CHAR (c); | |
2260 } | |
2261 } | |
2262 else | |
2188 force_multibyte = 1; | 2263 force_multibyte = 1; |
2264 p += CHAR_STRING (c, (unsigned char *) p); | |
2189 } | 2265 } |
2190 | |
2191 /* A character that must be multibyte forces multibyte. */ | |
2192 if (! SINGLE_BYTE_CHAR_P (c & ~CHAR_MODIFIER_MASK)) | |
2193 force_multibyte = 1; | |
2194 | |
2195 /* If we just discovered the need to be multibyte, | |
2196 convert the text accumulated thus far. */ | |
2197 if (force_multibyte && ! is_multibyte) | |
2198 { | |
2199 is_multibyte = 1; | |
2200 to_multibyte (&p, &end, &nchars); | |
2201 } | |
2202 | |
2203 /* Allow `\C- ' and `\C-?'. */ | |
2204 if (c == (CHAR_CTL | ' ')) | |
2205 c = 0; | |
2206 else if (c == (CHAR_CTL | '?')) | |
2207 c = 127; | |
2208 | |
2209 if (c & CHAR_SHIFT) | |
2210 { | |
2211 /* Shift modifier is valid only with [A-Za-z]. */ | |
2212 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z') | |
2213 c &= ~CHAR_SHIFT; | |
2214 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z') | |
2215 c = (c & ~CHAR_SHIFT) - ('a' - 'A'); | |
2216 } | |
2217 | |
2218 if (c & CHAR_META) | |
2219 /* Move the meta bit to the right place for a string. */ | |
2220 c = (c & ~CHAR_META) | 0x80; | |
2221 if (c & CHAR_MODIFIER_MASK) | |
2222 error ("Invalid modifier in string"); | |
2223 | |
2224 if (is_multibyte) | |
2225 p += CHAR_STRING (c, p); | |
2226 else | 2266 else |
2227 *p++ = c; | 2267 *p++ = c; |
2228 | |
2229 nchars++; | 2268 nchars++; |
2230 } | 2269 } |
2231 | |
2232 if (c < 0) | 2270 if (c < 0) |
2233 end_of_file_error (); | 2271 end_of_file_error (); |
2234 | 2272 |
2235 /* If purifying, and string starts with \ newline, | 2273 /* If purifying, and string starts with \ newline, |
2236 return zero instead. This is for doc strings | 2274 return zero instead. This is for doc strings |
2237 that we are really going to find in etc/DOC.nn.nn */ | 2275 that we are really going to find in etc/DOC.nn.nn */ |
2238 if (!NILP (Vpurify_flag) && NILP (Vdoc_file_name) && cancel) | 2276 if (!NILP (Vpurify_flag) && NILP (Vdoc_file_name) && cancel) |
2239 return make_number (0); | 2277 return make_number (0); |
2240 | 2278 |
2241 if (is_multibyte || force_singlebyte) | 2279 if (force_multibyte) |
2280 /* READ_BUFFER already contains valid multibyte forms. */ | |
2242 ; | 2281 ; |
2243 else if (load_convert_to_unibyte) | 2282 else if (force_singlebyte) |
2244 { | 2283 { |
2245 Lisp_Object string; | 2284 nchars = str_as_unibyte (read_buffer, p - read_buffer); |
2246 to_multibyte (&p, &end, &nchars); | 2285 p = read_buffer + nchars; |
2247 if (p - read_buffer != nchars) | |
2248 { | |
2249 string = make_multibyte_string (read_buffer, nchars, | |
2250 p - read_buffer); | |
2251 return Fstring_make_unibyte (string); | |
2252 } | |
2253 /* We can make a unibyte string directly. */ | |
2254 is_multibyte = 0; | |
2255 } | |
2256 else if (EQ (readcharfun, Qget_file_char) | |
2257 || EQ (readcharfun, Qlambda)) | |
2258 { | |
2259 /* Nowadays, reading directly from a file is used only for | |
2260 compiled Emacs Lisp files, and those always use the | |
2261 Emacs internal encoding. Meanwhile, Qlambda is used | |
2262 for reading dynamic byte code (compiled with | |
2263 byte-compile-dynamic = t). */ | |
2264 to_multibyte (&p, &end, &nchars); | |
2265 is_multibyte = 1; | |
2266 } | 2286 } |
2267 else | 2287 else |
2268 /* In all other cases, if we read these bytes as | 2288 /* Otherwise, READ_BUFFER contains only ASCII. */ |
2269 separate characters, treat them as separate characters now. */ | |
2270 ; | |
2271 | 2289 |
2272 if (read_pure) | 2290 if (read_pure) |
2273 return make_pure_string (read_buffer, nchars, p - read_buffer, | 2291 return make_pure_string (read_buffer, nchars, p - read_buffer, |
2274 is_multibyte); | 2292 (force_multibyte |
2293 || (p - read_buffer != nchars))); | |
2275 return make_specified_string (read_buffer, nchars, p - read_buffer, | 2294 return make_specified_string (read_buffer, nchars, p - read_buffer, |
2276 is_multibyte); | 2295 (force_multibyte |
2296 || (p - read_buffer != nchars))); | |
2277 } | 2297 } |
2278 | 2298 |
2279 case '.': | 2299 case '.': |
2280 { | 2300 { |
2281 int next_char = READCHAR; | 2301 int next_char = READCHAR; |