comparison src/coding.c @ 89483:2f877ed80fa6

*** empty log message ***
author Kenichi Handa <handa@m17n.org>
date Mon, 08 Sep 2003 12:53:41 +0000
parents 375f2633d815 4896b8834fb6
children 040a08a2a879
comparison
equal deleted inserted replaced
88123:375f2633d815 89483:2f877ed80fa6
1 /* Coding system handler (conversion, detection, and etc). 1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 1995,97,1998,2002,2003 Electrotechnical Laboratory, JAPAN. 2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation. 3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001,2002,2003 Free Software Foundation, Inc. 4 Copyright (C) 2001, 2002 Free Software Foundation, Inc.
5 Copyright (C) 2003
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
5 8
6 This file is part of GNU Emacs. 9 This file is part of GNU Emacs.
7 10
8 GNU Emacs is free software; you can redistribute it and/or modify 11 GNU Emacs is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by 12 it under the terms of the GNU General Public License as published by
22 25
23 /*** TABLE OF CONTENTS *** 26 /*** TABLE OF CONTENTS ***
24 27
25 0. General comments 28 0. General comments
26 1. Preamble 29 1. Preamble
27 2. Emacs' internal format (emacs-mule) handlers 30 2. Emacs' internal format (emacs-utf-8) handlers
28 3. ISO2022 handlers 31 3. UTF-8 handlers
29 4. Shift-JIS and BIG5 handlers 32 4. UTF-16 handlers
30 5. CCL handlers 33 5. Charset-base coding systems handlers
31 6. End-of-line handlers 34 6. emacs-mule (old Emacs' internal format) handlers
32 7. C library functions 35 7. ISO2022 handlers
33 8. Emacs Lisp library functions 36 8. Shift-JIS and BIG5 handlers
34 9. Post-amble 37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
35 41
36 */ 42 */
37 43
38 /*** 0. General comments ***/ 44 /*** 0. General comments ***
39 45
40 46
41 /*** GENERAL NOTE on CODING SYSTEMS *** 47 CODING SYSTEM
42 48
43 A coding system is an encoding mechanism for one or more character 49 A coding system is an object for an encoding mechanism that contains
44 sets. Here's a list of coding systems which Emacs can handle. When 50 information about how to convert byte sequences to character
45 we say "decode", it means converting some other coding system to 51 sequences and vice versa. When we say "decode", it means converting
46 Emacs' internal format (emacs-mule), and when we say "encode", 52 a byte sequence of a specific coding system into a character
47 it means converting the coding system emacs-mule to some other 53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
48 coding system. 56 coding system.
49 57
50 0. Emacs' internal format (emacs-mule) 58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
51 59 C level, a coding system is represented by a vector of attributes
52 Emacs itself holds a multi-lingual character in buffers and strings 60 stored in the hash table Vcoding_system_hash_table. The conversion from
53 in a special format. Details are described in section 2. 61 coding system symbol to attributes vector is done by looking up
54 62 Vcoding_system_hash_table by the symbol.
55 1. ISO2022 63
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
66
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
76
77 o Old Emacs internal format (emacs-mule)
78
79 The coding system adopted by old versions of Emacs (20 and 21).
80
81 o ISO2022-base coding system
56 82
57 The most famous coding system for multiple character sets. X's 83 The most famous coding system for multiple character sets. X's
58 Compound Text, various EUCs (Extended Unix Code), and coding 84 Compound Text, various EUCs (Extended Unix Code), and coding systems
59 systems used in Internet communication such as ISO-2022-JP are 85 used in the Internet communication such as ISO-2022-JP are all
60 all variants of ISO2022. Details are described in section 3. 86 variants of ISO2022.
61 87
62 2. SJIS (or Shift-JIS or MS-Kanji-Code) 88 o SJIS (or Shift-JIS or MS-Kanji-Code)
63 89
64 A coding system to encode character sets: ASCII, JISX0201, and 90 A coding system to encode character sets: ASCII, JISX0201, and
65 JISX0208. Widely used for PC's in Japan. Details are described in 91 JISX0208. Widely used for PC's in Japan. Details are described in
66 section 4. 92 section 8.
67 93
68 3. BIG5 94 o BIG5
69 95
70 A coding system to encode the character sets ASCII and Big5. Widely 96 A coding system to encode character sets: ASCII and Big5. Widely
71 used for Chinese (mainly in Taiwan and Hong Kong). Details are 97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
72 described in section 4. In this file, when we write "BIG5" 98 described in section 8. In this file, when we write "big5" (all
73 (all uppercase), we mean the coding system, and when we write 99 lowercase), we mean the coding system, and when we write "Big5"
74 "Big5" (capitalized), we mean the character set. 100 (capitalized), we mean the character set.
75 101
76 4. Raw text 102 o CCL
77 103
78 A coding system for text containing random 8-bit code. Emacs does 104 If a user wants to decode/encode text encoded in a coding system
79 no code conversion on such text except for end-of-line format. 105 not listed above, he can supply a decoder and an encoder for it in
80 106 CCL (Code Conversion Language) programs. Emacs executes the CCL
81 5. Other 107 program while decoding/encoding.
82 108
83 If a user wants to read/write text encoded in a coding system not 109 o Raw-text
84 listed above, he can supply a decoder and an encoder for it as CCL 110
85 (Code Conversion Language) programs. Emacs executes the CCL program 111 A coding system for text containing raw eight-bit data. Emacs
86 while reading/writing. 112 treats each byte of source text as a character (except for
87 113 end-of-line conversion).
88 Emacs represents a coding system by a Lisp symbol that has a property 114
89 `coding-system'. But, before actually using the coding system, the 115 o No-conversion
90 information about it is set in a structure of type `struct 116
91 coding_system' for rapid processing. See section 6 for more details. 117 Like raw text, but don't do end-of-line conversion.
92 118
93 */ 119
94 120 END-OF-LINE FORMAT
95 /*** GENERAL NOTES on END-OF-LINE FORMAT *** 121
96 122 How text end-of-line is encoded depends on operating system. For
97 How end-of-line of text is encoded depends on the operating system. 123 instance, Unix's format is just one byte of LF (line-feed) code,
98 For instance, Unix's format is just one byte of `line-feed' code,
99 whereas DOS's format is two-byte sequence of `carriage-return' and 124 whereas DOS's format is two-byte sequence of `carriage-return' and
100 `line-feed' codes. MacOS's format is usually one byte of 125 `line-feed' codes. MacOS's format is usually one byte of
101 `carriage-return'. 126 `carriage-return'.
102 127
103 Since text character encoding and end-of-line encoding are 128 Since text character encoding and end-of-line encoding are
104 independent, any coding system described above can have any 129 independent, any coding system described above can take any format
105 end-of-line format. So Emacs has information about end-of-line 130 of end-of-line (except for no-conversion).
106 format in each coding-system. See section 6 for more details. 131
132 STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we set up a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
107 138
108 */ 139 */
109 140
141 /* COMMON MACROS */
142
143
110 /*** GENERAL NOTES on `detect_coding_XXX ()' functions *** 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
111 145
112 These functions check if a text between SRC and SRC_END is encoded 146 These functions check if a byte sequence specified as a source in
113 in the coding system category XXX. Each returns an integer value in 147 CODING conforms to the format of XXX, and update the members of
114 which appropriate flag bits for the category XXX are set. The flag 148 DETECT_INFO.
115 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the 149
116 template for these functions. If MULTIBYTEP is nonzero, 8-bit codes 150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
117 of the range 0x80..0x9F are in multibyte form. */ 151
152 Below is the template of these functions. */
153
118 #if 0 154 #if 0
119 int 155 static int
120 detect_coding_emacs_mule (src, src_end, multibytep) 156 detect_coding_XXX (coding, detect_info)
121 unsigned char *src, *src_end; 157 struct coding_system *coding;
122 int multibytep; 158 struct coding_detection_info *detect_info;
123 { 159 {
124 ... 160 unsigned char *src = coding->source;
161 unsigned char *src_end = coding->source + coding->src_bytes;
162 int multibytep = coding->src_multibyte;
163 int consumed_chars = 0;
164 int found = 0;
165 ...;
166
167 while (1)
168 {
169 /* Get one byte from the source. If the source is exhausted, jump
170 to no_more_source:. */
171 ONE_MORE_BYTE (c);
172
173 if (! __C_conforms_to_XXX___ (c))
174 break;
175 if (! __C_strongly_suggests_XXX__ (c))
176 found = CATEGORY_MASK_XXX;
177 }
178 /* The byte sequence is invalid for XXX. */
179 detect_info->rejected |= CATEGORY_MASK_XXX;
180 return 0;
181
182 no_more_source:
183 /* The source was exhausted successfully. */
184 detect_info->found |= found;
185 return 1;
125 } 186 }
126 #endif 187 #endif
127 188
128 /*** GENERAL NOTES on `decode_coding_XXX ()' functions *** 189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
129 190
130 These functions decode SRC_BYTES length of unibyte text at SOURCE 191 These functions decode a byte sequence specified as a source by
131 encoded in CODING to Emacs' internal format. The resulting 192 CODING. The resulting multibyte text goes to a place pointed to by
132 multibyte text goes to a place pointed to by DESTINATION, the length 193 CODING->charbuf, the length of which should not exceed
133 of which should not exceed DST_BYTES. 194 CODING->charbuf_size.
134 195
135 These functions set the information about original and decoded texts 196 These functions set the information of original and decoded texts in
136 in the members `produced', `produced_char', `consumed', and 197 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
137 `consumed_char' of the structure *CODING. They also set the member 198 They also set CODING->result to one of CODING_RESULT_XXX indicating
138 `result' to one of CODING_FINISH_XXX indicating how the decoding 199 how the decoding is finished.
139 finished. 200
140 201 Below is the template of these functions. */
141 DST_BYTES zero means that the source area and destination area are 202
142 overlapped, which means that we can produce a decoded text until it
143 reaches the head of the not-yet-decoded source text.
144
145 Below is a template for these functions. */
146 #if 0 203 #if 0
147 static void 204 static void
148 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes) 205 decode_coding_XXXX (coding)
149 struct coding_system *coding; 206 struct coding_system *coding;
150 unsigned char *source, *destination; 207 {
151 int src_bytes, dst_bytes; 208 unsigned char *src = coding->source + coding->consumed;
152 { 209 unsigned char *src_end = coding->source + coding->src_bytes;
153 ... 210 /* SRC_BASE remembers the start position in source in each loop.
211 The loop will be exited when there's not enough source code, or
212 when there's no room in CHARBUF for a decoded character. */
213 unsigned char *src_base;
214 /* A buffer to produce decoded characters. */
215 int *charbuf = coding->charbuf;
216 int *charbuf_end = charbuf + coding->charbuf_size;
217 int multibytep = coding->src_multibyte;
218
219 while (1)
220 {
221 src_base = src;
222 if (charbuf >= charbuf_end)
223 /* No more room to produce a decoded character. */
224 break;
225 ONE_MORE_BYTE (c);
226 /* Decode it. */
227 }
228
229 no_more_source:
230 if (src_base < src_end
231 && coding->mode & CODING_MODE_LAST_BLOCK)
232 /* If the source ends by partial bytes to construct a character,
233 treat them as eight-bit raw data. */
234 while (src_base < src_end && charbuf < charbuf_end)
235 *charbuf++ = *src_base++;
236 /* Remember how many bytes and characters we consumed. If the
237 source is multibyte, the bytes and chars are not identical. */
238 coding->consumed = coding->consumed_char = src_base - coding->source;
239 /* Remember how many characters we produced. */
240 coding->charbuf_used = charbuf - coding->charbuf;
154 } 241 }
155 #endif 242 #endif
156 243
157 /*** GENERAL NOTES on `encode_coding_XXX ()' functions *** 244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
158 245
159 These functions encode SRC_BYTES length text at SOURCE from Emacs' 246 These functions encode SRC_BYTES length text at SOURCE of Emacs'
160 internal multibyte format to CODING. The resulting unibyte text 247 internal multibyte format by CODING. The resulting byte sequence
161 goes to a place pointed to by DESTINATION, the length of which 248 goes to a place pointed to by DESTINATION, the length of which
162 should not exceed DST_BYTES. 249 should not exceed DST_BYTES.
163 250
164 These functions set the information about original and encoded texts 251 These functions set the information of original and encoded texts in
165 in the members `produced', `produced_char', `consumed', and 252 the members produced, produced_char, consumed, and consumed_char of
166 `consumed_char' of the structure *CODING. They also set the member 253 the structure *CODING. They also set the member result to one of
167 `result' to one of CODING_FINISH_XXX indicating how the encoding 254 CODING_RESULT_XXX indicating how the encoding finished.
168 finished. 255
169 256 DST_BYTES zero means that source area and destination area are
170 DST_BYTES zero means that the source area and destination area are 257 overlapped, which means that we can produce an encoded text until it
171 overlapped, which means that we can produce encoded text until it 258 reaches the head of the not-yet-encoded source text.
172 reaches at the head of the not-yet-encoded source text. 259
173 260 Below is a template of these functions. */
174 Below is a template for these functions. */
175 #if 0 261 #if 0
176 static void 262 static void
177 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes) 263 encode_coding_XXX (coding)
178 struct coding_system *coding; 264 struct coding_system *coding;
179 unsigned char *source, *destination; 265 {
180 int src_bytes, dst_bytes; 266 int multibytep = coding->dst_multibyte;
181 { 267 int *charbuf = coding->charbuf;
182 ... 268 int *charbuf_end = charbuf + coding->charbuf_used;
269 unsigned char *dst = coding->destination + coding->produced;
270 unsigned char *dst_end = coding->destination + coding->dst_bytes;
271 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
272 int produced_chars = 0;
273
274 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
275 {
276 int c = *charbuf;
277 /* Encode C into DST, and increment DST. */
278 }
279 label_no_more_destination:
280 /* How many chars and bytes we produced. */
281 coding->produced_char += produced_chars;
282 coding->produced = dst - coding->destination;
183 } 283 }
184 #endif 284 #endif
185
186 /*** COMMONLY USED MACROS ***/
187
188 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
189 get one, two, and three bytes from the source text respectively.
190 If there are not enough bytes in the source, they jump to
191 `label_end_of_loop'. The caller should set variables `coding',
192 `src' and `src_end' to appropriate pointer in advance. These
193 macros are called from decoding routines `decode_coding_XXX', thus
194 it is assumed that the source text is unibyte. */
195
196 #define ONE_MORE_BYTE(c1) \
197 do { \
198 if (src >= src_end) \
199 { \
200 coding->result = CODING_FINISH_INSUFFICIENT_SRC; \
201 goto label_end_of_loop; \
202 } \
203 c1 = *src++; \
204 } while (0)
205
206 #define TWO_MORE_BYTES(c1, c2) \
207 do { \
208 if (src + 1 >= src_end) \
209 { \
210 coding->result = CODING_FINISH_INSUFFICIENT_SRC; \
211 goto label_end_of_loop; \
212 } \
213 c1 = *src++; \
214 c2 = *src++; \
215 } while (0)
216
217
218 /* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
219 form if MULTIBYTEP is nonzero. */
220
221 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep) \
222 do { \
223 if (src >= src_end) \
224 { \
225 coding->result = CODING_FINISH_INSUFFICIENT_SRC; \
226 goto label_end_of_loop; \
227 } \
228 c1 = *src++; \
229 if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL) \
230 c1 = *src++ - 0x20; \
231 } while (0)
232
233 /* Set C to the next character at the source text pointed by `src'.
234 If there are not enough characters in the source, jump to
235 `label_end_of_loop'. The caller should set variables `coding'
236 `src', `src_end', and `translation_table' to appropriate pointers
237 in advance. This macro is used in encoding routines
238 `encode_coding_XXX', thus it assumes that the source text is in
239 multibyte form except for 8-bit characters. 8-bit characters are
240 in multibyte form if coding->src_multibyte is nonzero, else they
241 are represented by a single byte. */
242
243 #define ONE_MORE_CHAR(c) \
244 do { \
245 int len = src_end - src; \
246 int bytes; \
247 if (len <= 0) \
248 { \
249 coding->result = CODING_FINISH_INSUFFICIENT_SRC; \
250 goto label_end_of_loop; \
251 } \
252 if (coding->src_multibyte \
253 || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes)) \
254 c = STRING_CHAR_AND_LENGTH (src, len, bytes); \
255 else \
256 c = *src, bytes = 1; \
257 if (!NILP (translation_table)) \
258 c = translate_char (translation_table, c, -1, 0, 0); \
259 src += bytes; \
260 } while (0)
261
262
263 /* Produce a multibyte form of character C to `dst'. Jump to
264 `label_end_of_loop' if there's not enough space at `dst'.
265
266 If we are now in the middle of a composition sequence, the decoded
267 character may be ALTCHAR (for the current composition). In that
268 case, the character goes to coding->cmp_data->data instead of
269 `dst'.
270
271 This macro is used in decoding routines. */
272
273 #define EMIT_CHAR(c) \
274 do { \
275 if (! COMPOSING_P (coding) \
276 || coding->composing == COMPOSITION_RELATIVE \
277 || coding->composing == COMPOSITION_WITH_RULE) \
278 { \
279 int bytes = CHAR_BYTES (c); \
280 if ((dst + bytes) > (dst_bytes ? dst_end : src)) \
281 { \
282 coding->result = CODING_FINISH_INSUFFICIENT_DST; \
283 goto label_end_of_loop; \
284 } \
285 dst += CHAR_STRING (c, dst); \
286 coding->produced_char++; \
287 } \
288 \
289 if (COMPOSING_P (coding) \
290 && coding->composing != COMPOSITION_RELATIVE) \
291 { \
292 CODING_ADD_COMPOSITION_COMPONENT (coding, c); \
293 coding->composition_rule_follows \
294 = coding->composing != COMPOSITION_WITH_ALTCHARS; \
295 } \
296 } while (0)
297
298
299 #define EMIT_ONE_BYTE(c) \
300 do { \
301 if (dst >= (dst_bytes ? dst_end : src)) \
302 { \
303 coding->result = CODING_FINISH_INSUFFICIENT_DST; \
304 goto label_end_of_loop; \
305 } \
306 *dst++ = c; \
307 } while (0)
308
309 #define EMIT_TWO_BYTES(c1, c2) \
310 do { \
311 if (dst + 2 > (dst_bytes ? dst_end : src)) \
312 { \
313 coding->result = CODING_FINISH_INSUFFICIENT_DST; \
314 goto label_end_of_loop; \
315 } \
316 *dst++ = c1, *dst++ = c2; \
317 } while (0)
318
319 #define EMIT_BYTES(from, to) \
320 do { \
321 if (dst + (to - from) > (dst_bytes ? dst_end : src)) \
322 { \
323 coding->result = CODING_FINISH_INSUFFICIENT_DST; \
324 goto label_end_of_loop; \
325 } \
326 while (from < to) \
327 *dst++ = *from++; \
328 } while (0)
329 285
330 286
331 /*** 1. Preamble ***/ 287 /*** 1. Preamble ***/
332 288
333 #ifdef emacs
334 #include <config.h> 289 #include <config.h>
335 #endif
336
337 #include <stdio.h> 290 #include <stdio.h>
338
339 #ifdef emacs
340 291
341 #include "lisp.h" 292 #include "lisp.h"
342 #include "buffer.h" 293 #include "buffer.h"
294 #include "character.h"
343 #include "charset.h" 295 #include "charset.h"
296 #include "ccl.h"
344 #include "composite.h" 297 #include "composite.h"
345 #include "ccl.h"
346 #include "coding.h" 298 #include "coding.h"
347 #include "window.h" 299 #include "window.h"
348 #include "intervals.h" 300
349 301 Lisp_Object Vcoding_system_hash_table;
350 #else /* not emacs */ 302
351 303 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
352 #include "mulelib.h" 304 Lisp_Object Qunix, Qdos;
353 305 extern Lisp_Object Qmac; /* frame.c */
354 #endif /* not emacs */
355
356 Lisp_Object Qcoding_system, Qeol_type;
357 Lisp_Object Qbuffer_file_coding_system; 306 Lisp_Object Qbuffer_file_coding_system;
358 Lisp_Object Qpost_read_conversion, Qpre_write_conversion; 307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
308 Lisp_Object Qdefault_char;
359 Lisp_Object Qno_conversion, Qundecided; 309 Lisp_Object Qno_conversion, Qundecided;
310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
311 Lisp_Object Qbig, Qlittle;
360 Lisp_Object Qcoding_system_history; 312 Lisp_Object Qcoding_system_history;
361 Lisp_Object Qsafe_chars;
362 Lisp_Object Qvalid_codes; 313 Lisp_Object Qvalid_codes;
314 Lisp_Object QCcategory;
363 315
364 extern Lisp_Object Qinsert_file_contents, Qwrite_region; 316 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
365 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument; 317 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
366 Lisp_Object Qstart_process, Qopen_network_stream; 318 Lisp_Object Qstart_process, Qopen_network_stream;
367 Lisp_Object Qtarget_idx; 319 Lisp_Object Qtarget_idx;
368 320
321 int coding_system_require_warning;
322
369 Lisp_Object Vselect_safe_coding_system_function; 323 Lisp_Object Vselect_safe_coding_system_function;
370
371 int coding_system_require_warning;
372 324
373 /* Mnemonic string for each format of end-of-line. */ 325 /* Mnemonic string for each format of end-of-line. */
374 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac; 326 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
375 /* Mnemonic string to indicate format of end-of-line is not yet 327 /* Mnemonic string to indicate format of end-of-line is not yet
376 decided. */ 328 decided. */
377 Lisp_Object eol_mnemonic_undecided; 329 Lisp_Object eol_mnemonic_undecided;
378 330
379 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
380 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
381 int system_eol_type;
382
383 #ifdef emacs 331 #ifdef emacs
384
385 /* Information about which coding system is safe for which chars.
386 The value has the form (GENERIC-LIST . NON-GENERIC-ALIST).
387
388 GENERIC-LIST is a list of generic coding systems which can encode
389 any characters.
390
391 NON-GENERIC-ALIST is an alist of non generic coding systems vs the
392 corresponding char table that contains safe chars. */
393 Lisp_Object Vcoding_system_safe_chars;
394 332
395 Lisp_Object Vcoding_system_list, Vcoding_system_alist; 333 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
396 334
397 Lisp_Object Qcoding_system_p, Qcoding_system_error; 335 Lisp_Object Qcoding_system_p, Qcoding_system_error;
398 336
399 /* Coding system emacs-mule and raw-text are for converting only 337 /* Coding system emacs-mule and raw-text are for converting only
400 end-of-line format. */ 338 end-of-line format. */
401 Lisp_Object Qemacs_mule, Qraw_text; 339 Lisp_Object Qemacs_mule, Qraw_text;
402 340 Lisp_Object Qutf_8_emacs;
403 Lisp_Object Qutf_8;
404 341
405 /* Coding-systems are handed between Emacs Lisp programs and C internal 342 /* Coding-systems are handed between Emacs Lisp programs and C internal
406 routines by the following three variables. */ 343 routines by the following three variables. */
407 /* Coding-system for reading files and receiving data from process. */ 344 /* Coding-system for reading files and receiving data from process. */
408 Lisp_Object Vcoding_system_for_read; 345 Lisp_Object Vcoding_system_for_read;
432 struct coding_system safe_terminal_coding; 369 struct coding_system safe_terminal_coding;
433 370
434 /* Coding system of what is sent from terminal keyboard. */ 371 /* Coding system of what is sent from terminal keyboard. */
435 struct coding_system keyboard_coding; 372 struct coding_system keyboard_coding;
436 373
437 /* Default coding system to be used to write a file. */
438 struct coding_system default_buffer_file_coding;
439
440 Lisp_Object Vfile_coding_system_alist; 374 Lisp_Object Vfile_coding_system_alist;
441 Lisp_Object Vprocess_coding_system_alist; 375 Lisp_Object Vprocess_coding_system_alist;
442 Lisp_Object Vnetwork_coding_system_alist; 376 Lisp_Object Vnetwork_coding_system_alist;
443 377
444 Lisp_Object Vlocale_coding_system; 378 Lisp_Object Vlocale_coding_system;
445 379
446 #endif /* emacs */ 380 #endif /* emacs */
447
448 Lisp_Object Qcoding_category, Qcoding_category_index;
449
450 /* List of symbols `coding-category-xxx' ordered by priority. */
451 Lisp_Object Vcoding_category_list;
452
453 /* Table of coding categories (Lisp symbols). */
454 Lisp_Object Vcoding_category_table;
455
456 /* Table of names of symbol for each coding-category. */
457 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
458 "coding-category-emacs-mule",
459 "coding-category-sjis",
460 "coding-category-iso-7",
461 "coding-category-iso-7-tight",
462 "coding-category-iso-8-1",
463 "coding-category-iso-8-2",
464 "coding-category-iso-7-else",
465 "coding-category-iso-8-else",
466 "coding-category-ccl",
467 "coding-category-big5",
468 "coding-category-utf-8",
469 "coding-category-utf-16-be",
470 "coding-category-utf-16-le",
471 "coding-category-raw-text",
472 "coding-category-binary"
473 };
474
475 /* Table of pointers to coding systems corresponding to each coding
476 categories. */
477 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
478
479 /* Table of coding category masks. Nth element is a mask for a coding
480 category of which priority is Nth. */
481 static
482 int coding_priorities[CODING_CATEGORY_IDX_MAX];
483 381
484 /* Flag to tell if we look up translation table on character code 382 /* Flag to tell if we look up translation table on character code
485 conversion. */ 383 conversion. */
486 Lisp_Object Venable_character_translation; 384 Lisp_Object Venable_character_translation;
487 /* Standard translation table to look up on decoding (reading). */ 385 /* Standard translation table to look up on decoding (reading). */
493 Lisp_Object Qtranslation_table_id; 391 Lisp_Object Qtranslation_table_id;
494 Lisp_Object Qtranslation_table_for_decode; 392 Lisp_Object Qtranslation_table_for_decode;
495 Lisp_Object Qtranslation_table_for_encode; 393 Lisp_Object Qtranslation_table_for_encode;
496 394
497 /* Alist of charsets vs revision number. */ 395 /* Alist of charsets vs revision number. */
498 Lisp_Object Vcharset_revision_alist; 396 static Lisp_Object Vcharset_revision_table;
499 397
500 /* Default coding systems used for process I/O. */ 398 /* Default coding systems used for process I/O. */
501 Lisp_Object Vdefault_process_coding_system; 399 Lisp_Object Vdefault_process_coding_system;
502 400
503 /* Char table for translating Quail and self-inserting input. */ 401 /* Char table for translating Quail and self-inserting input. */
507 pre-write-conversion functions. Usually the value is zero, but it 405 pre-write-conversion functions. Usually the value is zero, but it
508 is set to 1 temporarily while such functions are running. This is 406 is set to 1 temporarily while such functions are running. This is
509 to avoid infinite recursive call. */ 407 to avoid infinite recursive call. */
510 static int inhibit_pre_post_conversion; 408 static int inhibit_pre_post_conversion;
511 409
512 Lisp_Object Qchar_coding_system; 410 /* Two special coding systems. */
513 411 Lisp_Object Vsjis_coding_system;
514 /* Return `safe-chars' property of CODING_SYSTEM (symbol). Don't check 412 Lisp_Object Vbig5_coding_system;
515 its validity. */ 413
516 414
517 Lisp_Object 415 static int detect_coding_utf_8 P_ ((struct coding_system *,
518 coding_safe_chars (coding_system) 416 struct coding_detection_info *info));
519 Lisp_Object coding_system; 417 static void decode_coding_utf_8 P_ ((struct coding_system *));
520 { 418 static int encode_coding_utf_8 P_ ((struct coding_system *));
521 Lisp_Object coding_spec, plist, safe_chars; 419
522 420 static int detect_coding_utf_16 P_ ((struct coding_system *,
523 coding_spec = Fget (coding_system, Qcoding_system); 421 struct coding_detection_info *info));
524 plist = XVECTOR (coding_spec)->contents[3]; 422 static void decode_coding_utf_16 P_ ((struct coding_system *));
525 safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars); 423 static int encode_coding_utf_16 P_ ((struct coding_system *));
526 return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt); 424
527 } 425 static int detect_coding_iso_2022 P_ ((struct coding_system *,
528 426 struct coding_detection_info *info));
529 #define CODING_SAFE_CHAR_P(safe_chars, c) \ 427 static void decode_coding_iso_2022 P_ ((struct coding_system *));
530 (EQ (safe_chars, Qt) || !NILP (CHAR_TABLE_REF (safe_chars, c))) 428 static int encode_coding_iso_2022 P_ ((struct coding_system *));
429
430 static int detect_coding_emacs_mule P_ ((struct coding_system *,
431 struct coding_detection_info *info));
432 static void decode_coding_emacs_mule P_ ((struct coding_system *));
433 static int encode_coding_emacs_mule P_ ((struct coding_system *));
434
435 static int detect_coding_sjis P_ ((struct coding_system *,
436 struct coding_detection_info *info));
437 static void decode_coding_sjis P_ ((struct coding_system *));
438 static int encode_coding_sjis P_ ((struct coding_system *));
439
440 static int detect_coding_big5 P_ ((struct coding_system *,
441 struct coding_detection_info *info));
442 static void decode_coding_big5 P_ ((struct coding_system *));
443 static int encode_coding_big5 P_ ((struct coding_system *));
444
445 static int detect_coding_ccl P_ ((struct coding_system *,
446 struct coding_detection_info *info));
447 static void decode_coding_ccl P_ ((struct coding_system *));
448 static int encode_coding_ccl P_ ((struct coding_system *));
449
450 static void decode_coding_raw_text P_ ((struct coding_system *));
451 static int encode_coding_raw_text P_ ((struct coding_system *));
452
453
454 /* ISO2022 section */
455
/* Accessors for the ISO 2022 part of a coding system.  Every macro
   argument is parenthesized in the expansion so that an arbitrary
   expression may be passed.  */

/* Element REG of the `coding_attr_iso_initial' attribute vector of
   CODING, converted to a C integer.  */
#define CODING_ISO_INITIAL(coding, reg)                         \
  (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),            \
                     coding_attr_iso_initial),                  \
               (reg))))


/* The entry for CHARSET_ID in CODING's `safe_charsets' table, or -1
   if CHARSET_ID is greater than CODING's `max_charset_id'.  */
#define CODING_ISO_REQUEST(coding, charset_id)          \
  (((charset_id) <= (coding)->max_charset_id            \
    ? (coding)->safe_charsets[(charset_id)]             \
    : -1))


/* The `flags' member of CODING's ISO 2022 spec.  */
#define CODING_ISO_FLAGS(coding)        \
  ((coding)->spec.iso_2022.flags)

/* The `current_designation' entry for graphic register REG.  */
#define CODING_ISO_DESIGNATION(coding, reg)     \
  ((coding)->spec.iso_2022.current_designation[(reg)])

/* The `current_invocation' entry for graphic plane PLANE.  */
#define CODING_ISO_INVOCATION(coding, plane)    \
  ((coding)->spec.iso_2022.current_invocation[(plane)])

/* The `single_shifting' member of CODING's ISO 2022 spec.  */
#define CODING_ISO_SINGLE_SHIFTING(coding)      \
  ((coding)->spec.iso_2022.single_shifting)

/* The `bol' member of CODING's ISO 2022 spec.  */
#define CODING_ISO_BOL(coding)  \
  ((coding)->spec.iso_2022.bol)

/* The designation of the register currently invoked to PLANE.  */
#define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
  CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
480
/* ISO 2022 control characters, listed as: name, code, function.  */
#define ISO_CODE_LF     0x0A    /* line-feed */
#define ISO_CODE_CR     0x0D    /* carriage-return */
#define ISO_CODE_SO     0x0E    /* shift-out */
#define ISO_CODE_SI     0x0F    /* shift-in */
#define ISO_CODE_SS2_7  0x19    /* single-shift-2 for 7-bit code */
#define ISO_CODE_ESC    0x1B    /* escape */
#define ISO_CODE_SS2    0x8E    /* single-shift-2 */
#define ISO_CODE_SS3    0x8F    /* single-shift-3 */
#define ISO_CODE_CSI    0x9B    /* control-sequence-introducer */
492
/* Every one-byte code of ISO2022 is classified into exactly one of
   the following classes.  */
enum iso_code_class_type
  {
    /* Control codes in the range 0x00..0x1F and 0x7F, except for the
       following 5 codes.  */
    ISO_control_0,
    /* ISO_CODE_CR (0x0D) */
    ISO_carriage_return,
    /* ISO_CODE_SO (0x0E) */
    ISO_shift_out,
    /* ISO_CODE_SI (0x0F) */
    ISO_shift_in,
    /* ISO_CODE_SS2_7 (0x19) */
    ISO_single_shift_2_7,
    /* ISO_CODE_ESC (0x1B) */
    ISO_escape,
    /* Control codes in the range 0x80..0x9F, except for the
       following 3 codes.  */
    ISO_control_1,
    /* ISO_CODE_SS2 (0x8E) */
    ISO_single_shift_2,
    /* ISO_CODE_SS3 (0x8F) */
    ISO_single_shift_3,
    /* ISO_CODE_CSI (0x9B) */
    ISO_control_sequence_introducer,
    /* Codes of the values 0x20 or 0x7F.  */
    ISO_0x20_or_0x7F,
    /* Graphic codes in the range 0x21..0x7E.  */
    ISO_graphic_plane_0,
    /* Codes of the values 0xA0 or 0xFF.  */
    ISO_0xA0_or_0xFF,
    /* Graphic codes in the range 0xA1..0xFE.  */
    ISO_graphic_plane_1
  };
516
/** Each macro CODING_ISO_FLAG_XXX defines one flag bit of the
    `iso-flags' attribute of an iso2022 coding system.  The comment
    before a flag says what happens when that flag is set.  */

/* Produce the long-form designation sequence (e.g. ESC $ ( A)
   instead of the correct short-form sequence (e.g. ESC $ A).  */
#define CODING_ISO_FLAG_LONG_FORM               0x0001

/* Reset graphic planes and registers to the initial state at
   end-of-line.  */
#define CODING_ISO_FLAG_RESET_AT_EOL            0x0002

/* Reset graphic planes and registers to the initial state before any
   control character.  */
#define CODING_ISO_FLAG_RESET_AT_CNTL           0x0004

/* Encode by 7-bit environment.  */
#define CODING_ISO_FLAG_SEVEN_BITS              0x0008

/* Use the locking-shift function.  */
#define CODING_ISO_FLAG_LOCKING_SHIFT           0x0010

/* Use the single-shift function.  This overrides
   CODING_ISO_FLAG_LOCKING_SHIFT.  */
#define CODING_ISO_FLAG_SINGLE_SHIFT            0x0020

/* Use designation escape sequences.  */
#define CODING_ISO_FLAG_DESIGNATION             0x0040

/* Produce the revision number sequence.  */
#define CODING_ISO_FLAG_REVISION                0x0080

/* Produce ISO6429's direction specifying sequence.  */
#define CODING_ISO_FLAG_DIRECTION               0x0100

/* Assume on output that designation states are reset at beginning of
   line.  */
#define CODING_ISO_FLAG_INIT_AT_BOL             0x0200

/* Place designation sequences at beginning of line on output.  */
#define CODING_ISO_FLAG_DESIGNATE_AT_BOL        0x0400

/* Do not encode unsafe characters on output.  */
#define CODING_ISO_FLAG_SAFE                    0x0800

/* Accept extra latin codes (128..159) as valid codes on input.  */
#define CODING_ISO_FLAG_LATIN_EXTRA             0x1000

/* The remaining flag bits carry no description in this file; consult
   the code that tests them for their exact meaning.  */
#define CODING_ISO_FLAG_COMPOSITION             0x2000

#define CODING_ISO_FLAG_EUC_TW_SHIFT            0x4000

#define CODING_ISO_FLAG_USE_ROMAN               0x8000

#define CODING_ISO_FLAG_USE_OLDJIS              0x10000

#define CODING_ISO_FLAG_FULL_SUPPORT            0x100000

/* The character produced on output when encoding of the original
   character is prohibited by CODING_ISO_FLAG_SAFE.  */
#define CODING_INHIBIT_CHARACTER_SUBSTITUTION   '?'
579
580
/* UTF-16 section */

/* Accessors for the `bom', `endian' and `surrogate' members of
   CODING's UTF-16 spec.  Each expands to an lvalue.  */
#define CODING_UTF_16_BOM(coding)       ((coding)->spec.utf_16.bom)

#define CODING_UTF_16_ENDIAN(coding)    ((coding)->spec.utf_16.endian)

#define CODING_UTF_16_SURROGATE(coding) ((coding)->spec.utf_16.surrogate)


/* CCL section */

/* The `coding_attr_ccl_decoder' attribute of CODING.  */
#define CODING_CCL_DECODER(coding)                                      \
  AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)

/* The `coding_attr_ccl_encoder' attribute of CODING.  */
#define CODING_CCL_ENCODER(coding)                                      \
  AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)

/* The data (SDATA) of the `coding_attr_ccl_valids' attribute of
   CODING.  */
#define CODING_CCL_VALIDS(coding)                                       \
  (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
599
/* Index for each coding category in `coding_categories'.  */

enum coding_category
  {
    coding_category_iso_7,
    coding_category_iso_7_tight,
    coding_category_iso_8_1,
    coding_category_iso_8_2,
    coding_category_iso_7_else,
    coding_category_iso_8_else,
    coding_category_utf_8,
    coding_category_utf_16_auto,
    coding_category_utf_16_be,
    coding_category_utf_16_le,
    coding_category_utf_16_be_nosig,
    coding_category_utf_16_le_nosig,
    coding_category_charset,
    coding_category_sjis,
    coding_category_big5,
    coding_category_ccl,
    coding_category_emacs_mule,
    /* All above are targets of code detection.  */
    coding_category_raw_text,
    coding_category_undecided,
    coding_category_max
  };

/* Flag bits used in detect_coding_XXXX: one bit per coding category,
   at the bit position given by the category's enum value.  */
#define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
#define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
#define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
#define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
#define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
#define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
#define CATEGORY_MASK_UTF_8             (1 << coding_category_utf_8)
#define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
#define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
#define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
#define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
#define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
#define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
#define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
#define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
#define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
#define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
#define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)

/* Composite masks for the ISO 2022 categories.  */
#define CATEGORY_MASK_ISO_7BIT                          \
  (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)

#define CATEGORY_MASK_ISO_8BIT                          \
  (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)

#define CATEGORY_MASK_ISO_ELSE                          \
  (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)

/* ISO_7, ISO_7_TIGHT, ISO_7_ELSE and ISO_8_ELSE.  */
#define CATEGORY_MASK_ISO_ESCAPE                        \
  (CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_ELSE)

#define CATEGORY_MASK_ISO                               \
  (CATEGORY_MASK_ISO_7BIT                               \
   | CATEGORY_MASK_ISO_8BIT                             \
   | CATEGORY_MASK_ISO_ELSE)

/* The four UTF-16 categories with explicit byte order
   (CATEGORY_MASK_UTF_16_AUTO is not included).  */
#define CATEGORY_MASK_UTF_16                            \
  (CATEGORY_MASK_UTF_16_BE                              \
   | CATEGORY_MASK_UTF_16_LE                            \
   | CATEGORY_MASK_UTF_16_BE_NOSIG                      \
   | CATEGORY_MASK_UTF_16_LE_NOSIG)

/* This value is returned if detect_coding_mask () finds nothing other
   than ASCII characters.  It covers every detection target except
   CATEGORY_MASK_UTF_16_AUTO.  */
#define CATEGORY_MASK_ANY                               \
  (CATEGORY_MASK_ISO                                    \
   | CATEGORY_MASK_UTF_8                                \
   | CATEGORY_MASK_UTF_16                               \
   | CATEGORY_MASK_CHARSET                              \
   | CATEGORY_MASK_SJIS                                 \
   | CATEGORY_MASK_BIG5                                 \
   | CATEGORY_MASK_CCL                                  \
   | CATEGORY_MASK_EMACS_MULE)
693
694
695 /* List of symbols `coding-category-xxx' ordered by priority. This
696 variable is exposed to Emacs Lisp. */
697 static Lisp_Object Vcoding_category_list;
698
699 /* Table of coding categories (Lisp symbols). This variable is for
700 internal use only. */
701 static Lisp_Object Vcoding_category_table;
702
703 /* Table of coding-categories ordered by priority. */
704 static enum coding_category coding_priorities[coding_category_max];
705
706 /* Nth element is a coding context for the coding system bound to the
707 Nth coding category. */
708 static struct coding_system coding_categories[coding_category_max];
709
710 /*** Commonly used macros and functions ***/
711
/* Smaller and larger of two values.  Each argument may be evaluated
   twice, so do not pass expressions with side effects.  When the
   arguments compare equal, B is the result.  */
#ifndef min
#define min(a, b) ((b) > (a) ? (a) : (b))
#endif
#ifndef max
#define max(a, b) ((b) < (a) ? (a) : (b))
#endif
718
/* Fetch commonly needed information about CODING into the variables
   ATTRS, EOL_TYPE and CHARSET_LIST:
     ATTRS is set to CODING_ID_ATTRS of CODING's id,
     EOL_TYPE is set to CODING_ID_EOL_TYPE of CODING's id, except that
       a vector value is replaced by Qunix,
     CHARSET_LIST is set to CODING_ATTR_CHARSET_LIST (ATTRS).
   CODING is parenthesized so that any pointer expression may be
   passed; the other three arguments must be assignable variables.  */
#define CODING_GET_INFO(coding, attrs, eol_type, charset_list)  \
  do {                                                          \
    (attrs) = CODING_ID_ATTRS ((coding)->id);                   \
    (eol_type) = CODING_ID_EOL_TYPE ((coding)->id);             \
    if (VECTORP (eol_type))                                     \
      (eol_type) = Qunix;                                       \
    (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);          \
  } while (0)
727
728
/* Get one byte from the source text pointed to by SRC without
   checking for the end of the source, set C to that byte, advance SRC
   and increment CONSUMED_CHARS.

   If MULTIBYTEP is nonzero and the byte has its high bit set, the
   byte must be 0xC0 or 0xC1 (otherwise the error "Undecodable char
   found" is signaled); C is then set to ((BYTE & 1) << 6) | NEXT,
   where NEXT is the following byte, which is consumed too.

   The caller should declare and set these variables appropriately in
   advance:
	src, multibytep, consumed_chars
*/

#define ONE_MORE_BYTE_NO_CHECK(c)                       \
  do {                                                  \
    c = *src++;                                         \
    if (multibytep && (c & 0x80))                       \
      {                                                 \
        if ((c & 0xFE) != 0xC0)                         \
          error ("Undecodable char found");             \
        c = ((c & 1) << 6) | *src++;                    \
      }                                                 \
    consumed_chars++;                                   \
  } while (0)


/* Safely get one byte from the source text pointed to by SRC which
   ends at SRC_END, and set C to that byte, exactly as
   ONE_MORE_BYTE_NO_CHECK does.  If there are no more bytes in the
   source, jump to `no_more_source'; in that case, when some bytes have
   been read since SRC_BASE, coding->result is first set to
   CODING_RESULT_INSUFFICIENT_SRC.

   The caller should declare and set these variables appropriately in
   advance:
	coding, src, src_base, src_end, multibytep, consumed_chars
   and must provide the label `no_more_source'.
*/

#define ONE_MORE_BYTE(c)                                        \
  do {                                                          \
    if (src == src_end)                                         \
      {                                                         \
        if (src_base < src)                                     \
          coding->result = CODING_RESULT_INSUFFICIENT_SRC;      \
        goto no_more_source;                                    \
      }                                                         \
    ONE_MORE_BYTE_NO_CHECK (c);                                 \
  } while (0)
766
767
/* Store the byte C at the place pointed to by DST, advance DST to the
   next free position, and increment PRODUCED_CHARS.  The caller
   should make sure that C is in the range 0..127, and declare and set
   the variables `dst' and `produced_chars' appropriately in advance.  */

#define EMIT_ONE_ASCII_BYTE(c)  \
  do {                          \
    produced_chars += 1;        \
    *dst++ = (c);               \
  } while (0)


/* Like EMIT_ONE_ASCII_BYTE, but store two bytes: C1, then C2.  */

#define EMIT_TWO_ASCII_BYTES(c1, c2)    \
  do {                                  \
    produced_chars += 2;                \
    *dst++ = (c1);                      \
    *dst++ = (c2);                      \
  } while (0)
789
790
/* Store the byte C at the place pointed to by DST, advance DST to the
   next free position, and increment PRODUCED_CHARS.  If MULTIBYTEP is
   nonzero, store C in multibyte form instead: a value of 0x80 or more
   is first converted with BYTE8_TO_CHAR, and the result is written
   with CHAR_STRING_ADVANCE.  The caller should declare and set the
   variables `dst', `multibytep' and `produced_chars' appropriately in
   advance.  */

#define EMIT_ONE_BYTE(c)                                \
  do {                                                  \
    produced_chars++;                                   \
    if (! multibytep)                                   \
      *dst++ = (c);                                     \
    else                                                \
      {                                                 \
        int ch = (c);                                   \
        ch = ch >= 0x80 ? BYTE8_TO_CHAR (ch) : ch;      \
        CHAR_STRING_ADVANCE (ch, dst);                  \
      }                                                 \
  } while (0)


/* Like EMIT_ONE_BYTE, but emit two bytes: C1, then C2.  */

#define EMIT_TWO_BYTES(c1, c2)  \
  do {                          \
    EMIT_ONE_BYTE (c1);         \
    EMIT_ONE_BYTE (c2);         \
  } while (0)


/* Like EMIT_ONE_BYTE, but emit three bytes: C1, C2, then C3.  */

#define EMIT_THREE_BYTES(c1, c2, c3)    \
  do {                                  \
    EMIT_ONE_BYTE (c1);                 \
    EMIT_ONE_BYTE (c2);                 \
    EMIT_ONE_BYTE (c3);                 \
  } while (0)


/* Like EMIT_ONE_BYTE, but emit four bytes: C1, C2, C3, then C4.  */

#define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
  do {                                  \
    EMIT_ONE_BYTE (c1);                 \
    EMIT_ONE_BYTE (c2);                 \
    EMIT_ONE_BYTE (c3);                 \
    EMIT_ONE_BYTE (c4);                 \
  } while (0)
850
851
/* Set C to DECODE_CHAR (CHARSET, CODE).  If that call sets
   `charset_map_loaded', recompute CODING->source with
   coding_set_source and shift the pointers SRC, SRC_BASE and SRC_END
   by the distance the source moved.  CODING is parenthesized so that
   any pointer expression may be passed; SRC, SRC_BASE, SRC_END and C
   must be assignable variables.  */
#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
  do {                                                                  \
    charset_map_loaded = 0;                                             \
    c = DECODE_CHAR (charset, code);                                    \
    if (charset_map_loaded)                                             \
      {                                                                 \
        const unsigned char *orig = (coding)->source;                   \
        EMACS_INT offset;                                               \
                                                                        \
        coding_set_source (coding);                                     \
        offset = (coding)->source - orig;                               \
        src += offset;                                                  \
        src_base += offset;                                             \
        src_end += offset;                                              \
      }                                                                 \
  } while (0)
868
869
/* Make sure the destination has room for BYTES more bytes.  When DST
   plus BYTES reaches DST_END, call alloc_destination asking for BYTES
   plus the number of elements left between CHARBUF and CHARBUF_END,
   then refresh DST and DST_END.  The caller should declare and set
   `coding', `dst', `dst_end', `charbuf' and `charbuf_end'.  */
#define ASSURE_DESTINATION(bytes)                               \
  do {                                                          \
    if (dst + (bytes) >= dst_end)                               \
      {                                                         \
        int extra = charbuf_end - charbuf + (bytes);            \
                                                                \
        dst = alloc_destination (coding, extra, dst);           \
        dst_end = coding->destination + coding->dst_bytes;      \
      }                                                         \
  } while (0)
880
881
882
883 static void
884 coding_set_source (coding)
885 struct coding_system *coding;
886 {
887 if (BUFFERP (coding->src_object))
888 {
889 struct buffer *buf = XBUFFER (coding->src_object);
890
891 if (coding->src_pos < 0)
892 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
893 else
894 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
895 }
896 else if (STRINGP (coding->src_object))
897 {
898 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
899 }
900 else
901 /* Otherwise, the source is C string and is never relocated
902 automatically. Thus we don't have to update anything. */
903 ;
904 }
905
906 static void
907 coding_set_destination (coding)
908 struct coding_system *coding;
909 {
910 if (BUFFERP (coding->dst_object))
911 {
912 if (coding->src_pos < 0)
913 {
914 coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
915 coding->dst_bytes = (GAP_END_ADDR
916 - (coding->src_bytes - coding->consumed)
917 - coding->destination);
918 }
919 else
920 {
921 /* We are sure that coding->dst_pos_byte is before the gap
922 of the buffer. */
923 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
924 + coding->dst_pos_byte - 1);
925 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
926 - coding->destination);
927 }
928 }
929 else
930 /* Otherwise, the destination is C string and is never relocated
931 automatically. Thus we don't have to update anything. */
932 ;
933 }
934
935
936 static void
937 coding_alloc_by_realloc (coding, bytes)
938 struct coding_system *coding;
939 EMACS_INT bytes;
940 {
941 coding->destination = (unsigned char *) xrealloc (coding->destination,
942 coding->dst_bytes + bytes);
943 coding->dst_bytes += bytes;
944 }
945
946 static void
947 coding_alloc_by_making_gap (coding, bytes)
948 struct coding_system *coding;
949 EMACS_INT bytes;
950 {
951 if (BUFFERP (coding->dst_object)
952 && EQ (coding->src_object, coding->dst_object))
953 {
954 EMACS_INT add = coding->src_bytes - coding->consumed;
955
956 GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
957 make_gap (bytes);
958 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
959 }
960 else
961 {
962 Lisp_Object this_buffer;
963
964 this_buffer = Fcurrent_buffer ();
965 set_buffer_internal (XBUFFER (coding->dst_object));
966 make_gap (bytes);
967 set_buffer_internal (XBUFFER (this_buffer));
968 }
969 }
970
971
972 static unsigned char *
973 alloc_destination (coding, nbytes, dst)
974 struct coding_system *coding;
975 int nbytes;
976 unsigned char *dst;
977 {
978 EMACS_INT offset = dst - coding->destination;
979
980 if (BUFFERP (coding->dst_object))
981 coding_alloc_by_making_gap (coding, nbytes);
982 else
983 coding_alloc_by_realloc (coding, nbytes);
984 coding->result = CODING_RESULT_SUCCESS;
985 coding_set_destination (coding);
986 dst = coding->destination + offset;
987 return dst;
988 }
989
/** Macros for annotations.  */

/* Maximum length of annotation data (sum of annotations for
   composition and charset).  */
#define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5)

/* An annotation data is stored in the array coding->charbuf in this
   format:
     [ -LENGTH ANNOTATION_MASK FROM TO ... ]
   LENGTH is the number of elements in the annotation.
   ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
   FROM and TO specify the range of text annotated.  They are relative
   to coding->src_pos (on encoding) or coding->dst_pos (on decoding).

   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
     ... METHOD [ COMPOSITION-COMPONENTS ... ]
   METHOD is one of enum composition_method.
   Optional COMPOSITION-COMPONENTS are characters and composition
   rules.

   In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
   follows.  */

/* Store the four header elements -LEN, MASK, FROM and TO at BUF,
   advance BUF past them, and set coding->annotated to 1.  BUF must be
   a modifiable pointer variable, and the caller must have `coding' in
   scope.  The expansion deliberately has no trailing semicolon, so
   the macro behaves as a single statement even before `else'.  */
#define ADD_ANNOTATION_DATA(buf, len, mask, from, to)   \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (from);                                  \
    *(buf)++ = (to);                                    \
    coding->annotated = 1;                              \
  } while (0)

/* Store a composition annotation of length 5 at BUF: the header for
   the range FROM..TO followed by METHOD.  */
#define ADD_COMPOSITION_DATA(buf, from, to, method)                           \
  do {                                                                        \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
    *(buf)++ = (method);                                                      \
  } while (0)


/* Store a charset annotation of length 5 at BUF: the header for the
   range FROM..TO followed by the charset ID.  */
#define ADD_CHARSET_DATA(buf, from, to, id)                               \
  do {                                                                    \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
    *(buf)++ = (id);                                                      \
  } while (0)
531 1037
532 1038
533 /*** 2. Emacs internal format (emacs-mule) handlers ***/ 1039 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1040
1041
1042
1043
1044 /*** 3. UTF-8 ***/
1045
1046 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1047 Check if a text is encoded in UTF-8. If it is, return 1, else
1048 return 0. */
1049
/* Predicates classifying one octet C of a UTF-8 sequence by its high
   bits.  */

/* 0xxxxxxx: a complete one-octet sequence.  */
#define UTF_8_1_OCTET_P(c)              ((c) < 0x80)
/* 10xxxxxx: a trailing octet of a longer sequence.  */
#define UTF_8_EXTRA_OCTET_P(c)          (((c) & 0xC0) == 0x80)
/* 110xxxxx: the leading octet of a two-octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c)      (((c) & 0xE0) == 0xC0)
/* 1110xxxx: the leading octet of a three-octet sequence.  */
#define UTF_8_3_OCTET_LEADING_P(c)      (((c) & 0xF0) == 0xE0)
/* 11110xxx: the leading octet of a four-octet sequence.  */
#define UTF_8_4_OCTET_LEADING_P(c)      (((c) & 0xF8) == 0xF0)
/* 111110xx: the leading octet of a five-octet sequence.  */
#define UTF_8_5_OCTET_LEADING_P(c)      (((c) & 0xFC) == 0xF8)
1056
1057 static int
1058 detect_coding_utf_8 (coding, detect_info)
1059 struct coding_system *coding;
1060 struct coding_detection_info *detect_info;
1061 {
1062 const unsigned char *src = coding->source, *src_base = src;
1063 const unsigned char *src_end = coding->source + coding->src_bytes;
1064 int multibytep = coding->src_multibyte;
1065 int consumed_chars = 0;
1066 int found = 0;
1067 int incomplete;
1068
1069 detect_info->checked |= CATEGORY_MASK_UTF_8;
1070 /* A coding system of this category is always ASCII compatible. */
1071 src += coding->head_ascii;
1072
1073 while (1)
1074 {
1075 int c, c1, c2, c3, c4;
1076
1077 incomplete = 0;
1078 ONE_MORE_BYTE (c);
1079 if (UTF_8_1_OCTET_P (c))
1080 continue;
1081 incomplete = 1;
1082 ONE_MORE_BYTE (c1);
1083 if (! UTF_8_EXTRA_OCTET_P (c1))
1084 break;
1085 if (UTF_8_2_OCTET_LEADING_P (c))
1086 {
1087 found = CATEGORY_MASK_UTF_8;
1088 continue;
1089 }
1090 ONE_MORE_BYTE (c2);
1091 if (! UTF_8_EXTRA_OCTET_P (c2))
1092 break;
1093 if (UTF_8_3_OCTET_LEADING_P (c))
1094 {
1095 found = CATEGORY_MASK_UTF_8;
1096 continue;
1097 }
1098 ONE_MORE_BYTE (c3);
1099 if (! UTF_8_EXTRA_OCTET_P (c3))
1100 break;
1101 if (UTF_8_4_OCTET_LEADING_P (c))
1102 {
1103 found = CATEGORY_MASK_UTF_8;
1104 continue;
1105 }
1106 ONE_MORE_BYTE (c4);
1107 if (! UTF_8_EXTRA_OCTET_P (c4))
1108 break;
1109 if (UTF_8_5_OCTET_LEADING_P (c))
1110 {
1111 found = CATEGORY_MASK_UTF_8;
1112 continue;
1113 }
1114 break;
1115 }
1116 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1117 return 0;
1118
1119 no_more_source:
1120 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1121 {
1122 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1123 return 0;
1124 }
1125 detect_info->found |= found;
1126 return 1;
1127 }
1128
1129
1130 static void
1131 decode_coding_utf_8 (coding)
1132 struct coding_system *coding;
1133 {
1134 const unsigned char *src = coding->source + coding->consumed;
1135 const unsigned char *src_end = coding->source + coding->src_bytes;
1136 const unsigned char *src_base;
1137 int *charbuf = coding->charbuf;
1138 int *charbuf_end = charbuf + coding->charbuf_size;
1139 int consumed_chars = 0, consumed_chars_base;
1140 int multibytep = coding->src_multibyte;
1141 Lisp_Object attr, eol_type, charset_list;
1142
1143 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1144
1145 while (1)
1146 {
1147 int c, c1, c2, c3, c4, c5;
1148
1149 src_base = src;
1150 consumed_chars_base = consumed_chars;
1151
1152 if (charbuf >= charbuf_end)
1153 break;
1154
1155 ONE_MORE_BYTE (c1);
1156 if (UTF_8_1_OCTET_P(c1))
1157 {
1158 c = c1;
1159 if (c == '\r')
1160 {
1161 if (EQ (eol_type, Qdos))
1162 {
1163 if (src == src_end)
1164 {
1165 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
1166 goto no_more_source;
1167 }
1168 if (*src == '\n')
1169 ONE_MORE_BYTE (c);
1170 }
1171 else if (EQ (eol_type, Qmac))
1172 c = '\n';
1173 }
1174 }
1175 else
1176 {
1177 ONE_MORE_BYTE (c2);
1178 if (! UTF_8_EXTRA_OCTET_P (c2))
1179 goto invalid_code;
1180 if (UTF_8_2_OCTET_LEADING_P (c1))
1181 {
1182 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1183 /* Reject overlong sequences here and below. Encoders
1184 producing them are incorrect, they can be misleading,
1185 and they mess up read/write invariance. */
1186 if (c < 128)
1187 goto invalid_code;
1188 }
1189 else
1190 {
1191 ONE_MORE_BYTE (c3);
1192 if (! UTF_8_EXTRA_OCTET_P (c3))
1193 goto invalid_code;
1194 if (UTF_8_3_OCTET_LEADING_P (c1))
1195 {
1196 c = (((c1 & 0xF) << 12)
1197 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1198 if (c < 0x800
1199 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1200 goto invalid_code;
1201 }
1202 else
1203 {
1204 ONE_MORE_BYTE (c4);
1205 if (! UTF_8_EXTRA_OCTET_P (c4))
1206 goto invalid_code;
1207 if (UTF_8_4_OCTET_LEADING_P (c1))
1208 {
1209 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1210 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1211 if (c < 0x10000)
1212 goto invalid_code;
1213 }
1214 else
1215 {
1216 ONE_MORE_BYTE (c5);
1217 if (! UTF_8_EXTRA_OCTET_P (c5))
1218 goto invalid_code;
1219 if (UTF_8_5_OCTET_LEADING_P (c1))
1220 {
1221 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1222 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1223 | (c5 & 0x3F));
1224 if ((c > MAX_CHAR) || (c < 0x200000))
1225 goto invalid_code;
1226 }
1227 else
1228 goto invalid_code;
1229 }
1230 }
1231 }
1232 }
1233
1234 *charbuf++ = c;
1235 continue;
1236
1237 invalid_code:
1238 src = src_base;
1239 consumed_chars = consumed_chars_base;
1240 ONE_MORE_BYTE (c);
1241 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1242 coding->errors++;
1243 }
1244
1245 no_more_source:
1246 coding->consumed_char += consumed_chars_base;
1247 coding->consumed = src_base - coding->source;
1248 coding->charbuf_used = charbuf - coding->charbuf;
1249 }
1250
1251
1252 static int
1253 encode_coding_utf_8 (coding)
1254 struct coding_system *coding;
1255 {
1256 int multibytep = coding->dst_multibyte;
1257 int *charbuf = coding->charbuf;
1258 int *charbuf_end = charbuf + coding->charbuf_used;
1259 unsigned char *dst = coding->destination + coding->produced;
1260 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1261 int produced_chars = 0;
1262 int c;
1263
1264 if (multibytep)
1265 {
1266 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1267
1268 while (charbuf < charbuf_end)
1269 {
1270 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1271
1272 ASSURE_DESTINATION (safe_room);
1273 c = *charbuf++;
1274 if (CHAR_BYTE8_P (c))
1275 {
1276 c = CHAR_TO_BYTE8 (c);
1277 EMIT_ONE_BYTE (c);
1278 }
1279 else
1280 {
1281 CHAR_STRING_ADVANCE (c, pend);
1282 for (p = str; p < pend; p++)
1283 EMIT_ONE_BYTE (*p);
1284 }
1285 }
1286 }
1287 else
1288 {
1289 int safe_room = MAX_MULTIBYTE_LENGTH;
1290
1291 while (charbuf < charbuf_end)
1292 {
1293 ASSURE_DESTINATION (safe_room);
1294 c = *charbuf++;
1295 dst += CHAR_STRING (c, dst);
1296 produced_chars++;
1297 }
1298 }
1299 coding->result = CODING_RESULT_SUCCESS;
1300 coding->produced_char += produced_chars;
1301 coding->produced = dst - coding->destination;
1302 return 0;
1303 }
1304
1305
1306 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1307 Check if a text is encoded in one of UTF-16 based coding systems.
1308 If it is, return 1, else return 0. */
1309
/* Nonzero if the 16-bit value VAL is in the range 0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(val)    (((val) & 0xFC00) == 0xD800)

/* Nonzero if the 16-bit value VAL is in the range 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val)     (((val) & 0xFC00) == 0xDC00)

/* Nonzero if VAL is 0xFFFE, 0xFFFF, or a low surrogate.  */
#define UTF_16_INVALID_P(val)           \
  ((val) == 0xFFFE                      \
   || (val) == 0xFFFF                   \
   || UTF_16_LOW_SURROGATE_P (val))
1320
1321
1322 static int
1323 detect_coding_utf_16 (coding, detect_info)
1324 struct coding_system *coding;
1325 struct coding_detection_info *detect_info;
1326 {
1327 const unsigned char *src = coding->source, *src_base = src;
1328 const unsigned char *src_end = coding->source + coding->src_bytes;
1329 int multibytep = coding->src_multibyte;
1330 int consumed_chars = 0;
1331 int c1, c2;
1332
1333 detect_info->checked |= CATEGORY_MASK_UTF_16;
1334
1335 if (coding->mode & CODING_MODE_LAST_BLOCK
1336 && (coding->src_bytes & 1))
1337 {
1338 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1339 return 0;
1340 }
1341 ONE_MORE_BYTE (c1);
1342 ONE_MORE_BYTE (c2);
1343
1344 if ((c1 == 0xFF) && (c2 == 0xFE))
1345 {
1346 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1347 | CATEGORY_MASK_UTF_16_AUTO);
1348 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE;
1349 }
1350 else if ((c1 == 0xFE) && (c2 == 0xFF))
1351 {
1352 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1353 | CATEGORY_MASK_UTF_16_AUTO);
1354 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE;
1355 }
1356 no_more_source:
1357 return 1;
1358 }
1359
1360 static void
1361 decode_coding_utf_16 (coding)
1362 struct coding_system *coding;
1363 {
1364 const unsigned char *src = coding->source + coding->consumed;
1365 const unsigned char *src_end = coding->source + coding->src_bytes;
1366 const unsigned char *src_base;
1367 int *charbuf = coding->charbuf;
1368 int *charbuf_end = charbuf + coding->charbuf_size;
1369 int consumed_chars = 0, consumed_chars_base;
1370 int multibytep = coding->src_multibyte;
1371 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1372 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1373 int surrogate = CODING_UTF_16_SURROGATE (coding);
1374 Lisp_Object attr, eol_type, charset_list;
1375
1376 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1377
1378 if (bom == utf_16_with_bom)
1379 {
1380 int c, c1, c2;
1381
1382 src_base = src;
1383 ONE_MORE_BYTE (c1);
1384 ONE_MORE_BYTE (c2);
1385 c = (c1 << 8) | c2;
1386
1387 if (endian == utf_16_big_endian
1388 ? c != 0xFEFF : c != 0xFFFE)
1389 {
1390 /* The first two bytes are not BOM. Treat them as bytes
1391 for a normal character. */
1392 src = src_base;
1393 coding->errors++;
1394 }
1395 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1396 }
1397 else if (bom == utf_16_detect_bom)
1398 {
1399 /* We have already tried to detect BOM and failed in
1400 detect_coding. */
1401 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1402 }
1403
1404 while (1)
1405 {
1406 int c, c1, c2;
1407
1408 src_base = src;
1409 consumed_chars_base = consumed_chars;
1410
1411 if (charbuf + 2 >= charbuf_end)
1412 break;
1413
1414 ONE_MORE_BYTE (c1);
1415 ONE_MORE_BYTE (c2);
1416 c = (endian == utf_16_big_endian
1417 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1418 if (surrogate)
1419 {
1420 if (! UTF_16_LOW_SURROGATE_P (c))
1421 {
1422 if (endian == utf_16_big_endian)
1423 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1424 else
1425 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1426 *charbuf++ = c1;
1427 *charbuf++ = c2;
1428 coding->errors++;
1429 if (UTF_16_HIGH_SURROGATE_P (c))
1430 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1431 else
1432 *charbuf++ = c;
1433 }
1434 else
1435 {
1436 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1437 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1438 *charbuf++ = c;
1439 }
1440 }
1441 else
1442 {
1443 if (UTF_16_HIGH_SURROGATE_P (c))
1444 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1445 else
1446 *charbuf++ = c;
1447 }
1448 }
1449
1450 no_more_source:
1451 coding->consumed_char += consumed_chars_base;
1452 coding->consumed = src_base - coding->source;
1453 coding->charbuf_used = charbuf - coding->charbuf;
1454 }
1455
1456 static int
1457 encode_coding_utf_16 (coding)
1458 struct coding_system *coding;
1459 {
1460 int multibytep = coding->dst_multibyte;
1461 int *charbuf = coding->charbuf;
1462 int *charbuf_end = charbuf + coding->charbuf_used;
1463 unsigned char *dst = coding->destination + coding->produced;
1464 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1465 int safe_room = 8;
1466 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1467 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1468 int produced_chars = 0;
1469 Lisp_Object attrs, eol_type, charset_list;
1470 int c;
1471
1472 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1473
1474 if (bom != utf_16_without_bom)
1475 {
1476 ASSURE_DESTINATION (safe_room);
1477 if (big_endian)
1478 EMIT_TWO_BYTES (0xFE, 0xFF);
1479 else
1480 EMIT_TWO_BYTES (0xFF, 0xFE);
1481 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1482 }
1483
1484 while (charbuf < charbuf_end)
1485 {
1486 ASSURE_DESTINATION (safe_room);
1487 c = *charbuf++;
1488 if (c >= MAX_UNICODE_CHAR)
1489 c = coding->default_char;
1490
1491 if (c < 0x10000)
1492 {
1493 if (big_endian)
1494 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1495 else
1496 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1497 }
1498 else
1499 {
1500 int c1, c2;
1501
1502 c -= 0x10000;
1503 c1 = (c >> 10) + 0xD800;
1504 c2 = (c & 0x3FF) + 0xDC00;
1505 if (big_endian)
1506 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1507 else
1508 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1509 }
1510 }
1511 coding->result = CODING_RESULT_SUCCESS;
1512 coding->produced = dst - coding->destination;
1513 coding->produced_char += produced_chars;
1514 return 0;
1515 }
1516
1517
1518 /*** 6. Old Emacs' internal format (emacs-mule) ***/
534 1519
535 /* Emacs' internal format for representation of multiple character 1520 /* Emacs' internal format for representation of multiple character
536 sets is a kind of multi-byte encoding, i.e. characters are 1521 sets is a kind of multi-byte encoding, i.e. characters are
537 represented by variable-length sequences of one-byte codes. 1522 represented by variable-length sequences of one-byte codes.
538 1523
570 format (i.e. by encoding by the coding system `emacs-mule'). 1555 format (i.e. by encoding by the coding system `emacs-mule').
571 1556
572 In that case, a sequence of one-byte codes has a slightly different 1557 In that case, a sequence of one-byte codes has a slightly different
573 form. 1558 form.
574 1559
575 Firstly, all characters in eight-bit-control are represented by 1560 At first, all characters in eight-bit-control are represented by
576 one-byte sequences which are their 8-bit code. 1561 one-byte sequences which are their 8-bit code.
577 1562
578 Next, character composition data are represented by the byte 1563 Next, character composition data are represented by the byte
579 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ..., 1564 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
580 where, 1565 where,
581 METHOD is 0xF0 plus one of composition method (enum 1566 METHOD is 0xF0 plus one of composition method (enum
582 composition_method), 1567 composition_method),
583 1568
584 BYTES is 0xA0 plus the byte length of these composition data, 1569 BYTES is 0xA0 plus a byte length of this composition data,
585 1570
586 CHARS is 0xA0 plus the number of characters composed by these 1571 CHARS is 0x20 plus a number of characters composed by this
587 data, 1572 data,
588 1573
589 COMPONENTs are characters of multibyte form or composition 1574 COMPONENTs are characters of multibyte form or composition
590 rules encoded by two-byte of ASCII codes. 1575 rules encoded by two-byte of ASCII codes.
591 1576
592 In addition, for backward compatibility, the following formats are 1577 In addition, for backward compatibility, the following formats are
593 also recognized as composition data on decoding. 1578 also recognized as composition data on decoding.
594 1579
601 other: LEADING_CODE+0x20 FOLLOWING-BYTE ..., 1586 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
602 RULE is a one byte code of the range 0xA0..0xF0 that 1587 RULE is a one byte code of the range 0xA0..0xF0 that
603 represents a composition rule. 1588 represents a composition rule.
604 */ 1589 */
605 1590
606 enum emacs_code_class_type emacs_code_class[256]; 1591 char emacs_mule_bytes[256];
1592
/* Decode the single emacs-mule character that starts at SRC.

   The leading byte is looked up in emacs_mule_bytes[] to get the
   length class of the sequence (1 to 4; any other table value calls
   abort).  The charset is taken from emacs_mule_charset[], indexed by
   the leading byte or, for the private leading codes and the 4-byte
   form, by the byte that follows it.  The remaining one or two bytes
   are masked with 0x7F to build the code point, which DECODE_CHAR
   converts to a character.

   On success return the character, store the number of source bytes
   used in *NBYTES and the value of consumed_chars in *NCHARS, and, if
   ID is non-NULL, store the charset id in *ID.  Return -1 for an
   invalid sequence.  Return -2 when control reaches the
   no_more_source label (presumably jumped to by ONE_MORE_BYTE when
   the source runs out; the macro is defined outside this block --
   confirm there).  */
1593 int
1594 emacs_mule_char (coding, src, nbytes, nchars, id)
1595 struct coding_system *coding;
1596 unsigned char *src;
1597 int *nbytes, *nchars, *id;
1598 {
/* src_end and multibytep are never named below; presumably they, like
   src_base and consumed_chars, are used by ONE_MORE_BYTE -- confirm
   against the macro definition.  */
1599 const unsigned char *src_end = coding->source + coding->src_bytes;
1600 const unsigned char *src_base = src;
1601 int multibytep = coding->src_multibyte;
1602 struct charset *charset;
1603 unsigned code;
1604 int c;
1605 int consumed_chars = 0;
1606
1607 ONE_MORE_BYTE (c);
1608 switch (emacs_mule_bytes[c])
1609 {
/* Leading byte selects the charset; one position byte follows.  */
1610 case 2:
1611 if (! (charset = emacs_mule_charset[c]))
1612 goto invalid_code;
1613 ONE_MORE_BYTE (c);
1614 code = c & 0x7F;
1615 break;
1616
1617 case 3:
1618 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1619 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1620 {
/* Private leading code: the next byte selects the charset and one
   position byte follows.  */
1621 ONE_MORE_BYTE (c);
1622 if (! (charset = emacs_mule_charset[c]))
1623 goto invalid_code;
1624 ONE_MORE_BYTE (c);
1625 code = c & 0x7F;
1626 }
1627 else
1628 {
/* Leading byte selects the charset; two position bytes follow and
   are combined high byte first.  */
1629 if (! (charset = emacs_mule_charset[c]))
1630 goto invalid_code;
1631 ONE_MORE_BYTE (c);
1632 code = (c & 0x7F) << 8;
1633 ONE_MORE_BYTE (c);
1634 code |= c & 0x7F;
1635 }
1636 break;
1637
/* The byte after the leading byte selects the charset; two position
   bytes follow.  */
1638 case 4:
1639 ONE_MORE_BYTE (c);
1640 if (! (charset = emacs_mule_charset[c]))
1641 goto invalid_code;
1642 ONE_MORE_BYTE (c);
1643 code = (c & 0x7F) << 8;
1644 ONE_MORE_BYTE (c);
1645 code |= c & 0x7F;
1646 break;
1647
/* Single byte: ASCII if ASCII_BYTE_P says so, otherwise a member of
   the eight-bit charset.  */
1648 case 1:
1649 code = c;
1650 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1651 ? charset_ascii : charset_eight_bit);
1652 break;
1653
1654 default:
1655 abort ();
1656 }
1657 c = DECODE_CHAR (charset, code);
1658 if (c < 0)
1659 goto invalid_code;
1660 *nbytes = src - src_base;
1661 *nchars = consumed_chars;
1662 if (id)
1663 *id = charset->id;
1664 return c;
1665
1666 no_more_source:
1667 return -2;
1668
1669 invalid_code:
1670 return -1;
1671 }
1672
607 1673
608 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". 1674 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
609 Check if a text is encoded in Emacs' internal format. If it is, 1675 Check if a text is encoded in `emacs-mule'. If it is, return 1,
610 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */ 1676 else return 0. */
611 1677
/* New-side behaviour, as visible below: mark CATEGORY_MASK_EMACS_MULE
   as checked in DETECT_INFO, skip the first CODING->head_ascii bytes,
   then scan the rest of the source.  Leaving the scan loop with
   `break' marks the category as rejected and returns 0.  Reaching the
   no_more_source label returns 1 after OR-ing FOUND into
   DETECT_INFO->found, except when a byte sequence was still being
   read (INCOMPLETE) and CODING_MODE_LAST_BLOCK is set; that case is
   rejected as well.  */
612 static int 1678 static int
613 detect_coding_emacs_mule (src, src_end, multibytep) 1679 detect_coding_emacs_mule (coding, detect_info)
614 unsigned char *src, *src_end; 1680 struct coding_system *coding;
615 int multibytep; 1681 struct coding_detection_info *detect_info;
616 { 1682 {
617 unsigned char c; 1683 const unsigned char *src = coding->source, *src_base = src;
618 int composing = 0; 1684 const unsigned char *src_end = coding->source + coding->src_bytes;
619 /* Dummy for ONE_MORE_BYTE. */ 1685 int multibytep = coding->src_multibyte;
620 struct coding_system dummy_coding; 1686 int consumed_chars = 0;
621 struct coding_system *coding = &dummy_coding; 1687 int c;
1688 int found = 0;
1689 int incomplete;
1690
1691 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1692 /* A coding system of this category is always ASCII compatible. */
1693 src += coding->head_ascii;
622 1694
623 while (1) 1695 while (1)
624 { 1696 {
/* INCOMPLETE is 0 only while the first byte of a sequence is being
   fetched, so running out of source there is a clean end.  */
625 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); 1697 incomplete = 0;
626 1698 ONE_MORE_BYTE (c);
627 if (composing) 1699 incomplete = 1;
628 { 1700
629 if (c < 0xA0) 1701 if (c == 0x80)
630 composing = 0; 1702 {
631 else if (c == 0xA0) 1703 /* Perhaps the start of composite character. We simply skip
1704 it because analyzing it is too heavy for detecting. But,
1705 at least, we check that the composite character
1706 consists of more than 4 bytes. */
/* This declaration shadows the function-level src_base.  */
1707 const unsigned char *src_base;
1708
1709 repeat:
1710 src_base = src;
1711 do
632 { 1712 {
633 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); 1713 ONE_MORE_BYTE (c);
634 c &= 0x7F; 1714 }
1715 while (c >= 0xA0);
1716
1717 if (src - src_base <= 4)
1718 break;
1719 found = CATEGORY_MASK_EMACS_MULE;
1720 if (c == 0x80)
1721 goto repeat;
1722 }
1723
1724 if (c < 0x80)
1725 {
/* ESC, SI and SO reject the category.  */
1726 if (c < 0x20
1727 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1728 break;
1729 }
1730 else
1731 {
/* Points at the leading byte just read; shadows the outer src_base.  */
1732 const unsigned char *src_base = src - 1;
1733
1734 do
1735 {
1736 ONE_MORE_BYTE (c);
1737 }
1738 while (c >= 0xA0);
/* NOTE(review): the loop above has already read the byte that ended
   the run (the first byte below 0xA0), so SRC - SRC_BASE here looks
   one larger than the length of the character, and that terminating
   byte is not examined again by the outer loop.  Confirm against the
   definition of ONE_MORE_BYTE before relying on this test.  */
1739 if (src - src_base != emacs_mule_bytes[*src_base])
1740 break;
1741 found = CATEGORY_MASK_EMACS_MULE;
1742 }
1743 }
1744 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1745 return 0;
1746
1747 no_more_source:
1748 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1749 {
1750 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1751 return 0;
1752 }
1753 detect_info->found |= found;
1754 return 1;
1755 }
1756
1757
1758 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1759
1760 /* Decode one character that is a component of an Emacs 20/21 style
1761 composition sequence at SRC by calling emacs_mule_char. On
1762 success, store the character at *BUF, increment BUF, advance SRC
1763 by the bytes used and add the characters used to consumed_chars.
1764 If SRC is at SRC_END, or emacs_mule_char returns -2, execute
1765 `break', so the macro is meant to be used where `break' leaves the
enclosing loop. For any other negative result, jump to the
invalid_code label. The expansion ends in a dangling `else' so
that the semicolon written after the macro call completes the
statement. */
1766
1767 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1768 if (1) \
1769 { \
1770 int c; \
1771 int nbytes, nchars; \
1772 \
1773 if (src == src_end) \
1774 break; \
1775 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1776 if (c < 0) \
1777 { \
1778 if (c == -2) \
1779 break; \
1780 goto invalid_code; \
1781 } \
1782 *buf++ = c; \
1783 src += nbytes; \
1784 consumed_chars += nchars; \
1785 } \
1786 else
1787
1788
1789 /* Decode a composition rule that is a component of an Emacs 20 style
1790 composition sequence at SRC. The rule is one byte; after 0x20 is
1791 subtracted the value must be in the range 0..80. It is split into
1792 GREF (value / 9) and NREF (value % 9), and
COMPOSITION_ENCODE_RULE (GREF, NREF) is stored at *BUF, which is
then incremented. If SRC is at or past SRC_END, or the value is
out of range, jump to the invalid_code label. */
1793
1794 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1795 do { \
1796 int c, gref, nref; \
1797 \
1798 if (src >= src_end) \
1799 goto invalid_code; \
1800 ONE_MORE_BYTE_NO_CHECK (c); \
1801 c -= 0x20; \
1802 if (c < 0 || c >= 81) \
1803 goto invalid_code; \
1804 \
1805 gref = c / 9, nref = c % 9; \
1806 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1807 } while (0)
1808
1809
1810 /* Decode a composition rule that is a component of an Emacs 21 style
1811 composition sequence at SRC. The rule is two bytes, GREF + 0x20
1812 followed by NREF + 0x20; after the subtraction each value must be
1813 in the range 0..80. COMPOSITION_ENCODE_RULE (GREF, NREF) is stored
at *BUF, which is then incremented. If fewer than two bytes remain
before SRC_END, or either value is out of range, jump to the
invalid_code label. */
1814
1815 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1816 do { \
1817 int gref, nref; \
1818 \
1819 if (src + 1>= src_end) \
1820 goto invalid_code; \
1821 ONE_MORE_BYTE_NO_CHECK (gref); \
1822 gref -= 0x20; \
1823 ONE_MORE_BYTE_NO_CHECK (nref); \
1824 nref -= 0x20; \
1825 if (gref < 0 || gref >= 81 \
1826 || nref < 0 || nref >= 81) \
1827 goto invalid_code; \
1828 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1829 } while (0)
1830
1831
/* Decode the header and the components of an Emacs 21 style
   composition sequence.  C holds the METHOD byte (0xF2 + method) that
   the caller has already read.  The next two bytes are BYTES
   (0xA0 + length of the composition data; a length below 3 is
   invalid) and CHARS (0xA0 + number of composed characters).

   A composition annotation covering the character positions FROM..TO
   is written to CHARBUF with ADD_COMPOSITION_DATA.  Unless the method
   is COMPOSITION_RELATIVE, the components are then decoded into
   CHARBUF until consumed_chars reaches consumed_chars_base + BYTES:
   every second component is a rule, except for
   COMPOSITION_WITH_ALTCHARS where all components are characters.
   Finally the first element written for the annotation is decreased
   by the number of components (presumably the annotation length
   field; confirm against ADD_COMPOSITION_DATA).

   Jumps to invalid_code on malformed data.  Uses the caller's SRC,
   CHARBUF, CHAR_OFFSET, CONSUMED_CHARS and CONSUMED_CHARS_BASE.  */
#define DECODE_EMACS_MULE_21_COMPOSITION(c) \
  do { \
    enum composition_method method = c - 0xF2; \
    int *charbuf_base = charbuf; \
    int from, to; \
    int consumed_chars_limit; \
    int nbytes, nchars; \
 \
    ONE_MORE_BYTE (c); \
    nbytes = c - 0xA0; \
    if (nbytes < 3) \
      goto invalid_code; \
    ONE_MORE_BYTE (c); \
    nchars = c - 0xA0; \
    /* FROM and TO are character positions, so they are based on the \
       character count (produced_char) like in the Emacs 20 style \
       macros, not on the byte count (produced).  */ \
    from = coding->produced_char + char_offset; \
    to = from + nchars; \
    ADD_COMPOSITION_DATA (charbuf, from, to, method); \
    consumed_chars_limit = consumed_chars_base + nbytes; \
    if (method != COMPOSITION_RELATIVE) \
      { \
        int i = 0; \
        while (consumed_chars < consumed_chars_limit) \
          { \
            if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
              DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
            else \
              DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
            i++; \
          } \
        /* Only reachable when the character macro left the loop \
           early with `break'.  */ \
        if (consumed_chars < consumed_chars_limit) \
          goto invalid_code; \
        charbuf_base[0] -= i; \
      } \
  } while (0)
1870
1871
/* Decode an Emacs 20 style relative composition.  SRC is rewound to
   SRC_BASE and the leading 0x80 is read again, then up to
   MAX_COMPOSITION_COMPONENTS characters are decoded into a local
   array; decoding stops early when DECODE_EMACS_MULE_COMPOSITION_CHAR
   executes `break'.  Fewer than two characters is treated as invalid
   (jump to invalid_code).  Otherwise a composition annotation for the
   character positions FROM..TO is written to CHARBUF with
   ADD_COMPOSITION_DATA, followed by the decoded characters.  The
   value passed as C is only used as scratch for the re-read byte.  */
1872 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1873 do { \
1874 /* Emacs 20 style format for relative composition. */ \
1875 /* Store multibyte form of characters to be composed. */ \
1876 enum composition_method method = COMPOSITION_RELATIVE; \
1877 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1878 int *buf = components; \
1879 int i, j; \
1880 int from, to; \
1881 \
1882 src = src_base; \
1883 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1884 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1885 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1886 if (i < 2) \
1887 goto invalid_code; \
1888 from = coding->produced_char + char_offset; \
1889 to = from + i; \
1890 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1891 for (j = 0; j < i; j++) \
1892 *charbuf++ = components[j]; \
1893 } while (0)
1894
1895
/* Decode an Emacs 20 style rule-base composition.  One character is
   decoded first, then up to MAX_COMPOSITION_COMPONENTS (rule,
   character) pairs, all into the local array COMPONENTS.  The
   sequence is invalid (jump to invalid_code) when no pair was
   processed or when the number of stored components is even, i.e. a
   rule is not followed by a character.  If CHARBUF does not have room
   for the result, jump to no_more_source.  Otherwise a composition
   annotation for the character positions FROM..TO is written to
   CHARBUF with ADD_COMPOSITION_DATA, followed by the components.

   The value passed as C is not used.  Uses the caller's SRC, CHARBUF,
   CHARBUF_END and CHAR_OFFSET.

   NOTE(review): the first DECODE_EMACS_MULE_COMPOSITION_CHAR is not
   inside a loop of this macro, so its `break' leaves the surrounding
   do-while block and skips everything below; confirm that this is
   intended.  */
#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
  do { \
    enum composition_method method = COMPOSITION_WITH_RULE; \
    /* One leading character plus MAX_COMPOSITION_COMPONENTS pairs \
       of (rule, character) can be stored below.  */ \
    int components[MAX_COMPOSITION_COMPONENTS * 2 + 1]; \
    int *buf = components; \
    int i, j; \
    int from, to; \
 \
    DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
    for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
      { \
        DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
        DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
      } \
    if (i < 1 || (buf - components) % 2 == 0) \
      goto invalid_code; \
    /* Give up for now when the output would NOT fit.  */ \
    if (charbuf + i + (i / 2) + 1 >= charbuf_end) \
      goto no_more_source; \
    from = coding->produced_char + char_offset; \
    to = from + i; \
    /* The annotation belongs in the output buffer, not in the \
       local component array.  */ \
    ADD_COMPOSITION_DATA (charbuf, from, to, method); \
    for (j = 0; j < i; j++) \
      *charbuf++ = components[j]; \
    for (j = 0; j < i; j += 2) \
      *charbuf++ = components[j]; \
  } while (0)
1924
1925
1926 static void
1927 decode_coding_emacs_mule (coding)
1928 struct coding_system *coding;
1929 {
1930 const unsigned char *src = coding->source + coding->consumed;
1931 const unsigned char *src_end = coding->source + coding->src_bytes;
1932 const unsigned char *src_base;
1933 int *charbuf = coding->charbuf;
1934 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
1935 int consumed_chars = 0, consumed_chars_base;
1936 int multibytep = coding->src_multibyte;
1937 Lisp_Object attrs, eol_type, charset_list;
1938 int char_offset = coding->produced_char;
1939 int last_offset = char_offset;
1940 int last_id = charset_ascii;
1941
1942 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1943
1944 while (1)
1945 {
1946 int c;
1947
1948 src_base = src;
1949 consumed_chars_base = consumed_chars;
1950
1951 if (charbuf >= charbuf_end)
1952 break;
1953
1954 ONE_MORE_BYTE (c);
1955
1956 if (c < 0x80)
1957 {
1958 if (c == '\r')
1959 {
1960 if (EQ (eol_type, Qdos))
1961 {
1962 if (src == src_end)
1963 {
1964 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
1965 goto no_more_source;
1966 }
1967 if (*src == '\n')
1968 ONE_MORE_BYTE (c);
1969 }
1970 else if (EQ (eol_type, Qmac))
1971 c = '\n';
1972 }
1973 *charbuf++ = c;
1974 char_offset++;
1975 }
1976 else if (c == 0x80)
1977 {
1978 ONE_MORE_BYTE (c);
1979 if (c - 0xF2 >= COMPOSITION_RELATIVE
1980 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
1981 DECODE_EMACS_MULE_21_COMPOSITION (c);
1982 else if (c < 0xC0)
1983 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
1984 else if (c == 0xFF)
1985 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
1986 else
1987 goto invalid_code;
1988 }
1989 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
1990 {
1991 int nbytes, nchars;
1992 int id;
1993
1994 src = src_base;
1995 consumed_chars = consumed_chars_base;
1996 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
1997 if (c < 0)
1998 {
1999 if (c == -2)
2000 break;
2001 goto invalid_code;
2002 }
2003 if (last_id != id)
2004 {
2005 if (last_id != charset_ascii)
2006 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2007 last_id = id;
2008 last_offset = char_offset;
2009 }
2010 *charbuf++ = c;
2011 src += nbytes;
2012 consumed_chars += nchars;
2013 char_offset++;
2014 }
2015 continue;
2016
2017 invalid_code:
2018 src = src_base;
2019 consumed_chars = consumed_chars_base;
2020 ONE_MORE_BYTE (c);
2021 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2022 char_offset++;
2023 coding->errors++;
2024 }
2025
2026 no_more_source:
2027 if (last_id != charset_ascii)
2028 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2029 coding->consumed_char += consumed_chars_base;
2030 coding->consumed = src_base - coding->source;
2031 coding->charbuf_used = charbuf - coding->charbuf;
2032 }
2033
2034
/* Store in CODES[0] and CODES[1] the leading code(s) for the
   emacs-mule charset id ID:

     ID < 0xA0           CODES = { ID,   0  }
     0xA0 <= ID < 0xE0   CODES = { 0x9A, ID }
     0xE0 <= ID < 0xF0   CODES = { 0x9B, ID }
     0xF0 <= ID < 0xF5   CODES = { 0x9C, ID }
     0xF5 <= ID          CODES = { 0x9D, ID }

   A zero in CODES[1] means that there is only one leading code.  ID
   is evaluated more than once.  The expansion deliberately has no
   trailing semicolon, so the macro can be used as a single statement,
   including in an un-braced if/else.  */
#define EMACS_MULE_LEADING_CODES(id, codes) \
  do { \
    if ((id) < 0xA0) \
      (codes)[0] = (id), (codes)[1] = 0; \
    else if ((id) < 0xE0) \
      (codes)[0] = 0x9A, (codes)[1] = (id); \
    else if ((id) < 0xF0) \
      (codes)[0] = 0x9B, (codes)[1] = (id); \
    else if ((id) < 0xF5) \
      (codes)[0] = 0x9C, (codes)[1] = (id); \
    else \
      (codes)[0] = 0x9D, (codes)[1] = (id); \
  } while (0)
2048
2049
2050 static int
2051 encode_coding_emacs_mule (coding)
2052 struct coding_system *coding;
2053 {
2054 int multibytep = coding->dst_multibyte;
2055 int *charbuf = coding->charbuf;
2056 int *charbuf_end = charbuf + coding->charbuf_used;
2057 unsigned char *dst = coding->destination + coding->produced;
2058 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2059 int safe_room = 8;
2060 int produced_chars = 0;
2061 Lisp_Object attrs, eol_type, charset_list;
2062 int c;
2063 int preferred_charset_id = -1;
2064
2065 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2066
2067 while (charbuf < charbuf_end)
2068 {
2069 ASSURE_DESTINATION (safe_room);
2070 c = *charbuf++;
2071
2072 if (c < 0)
2073 {
2074 /* Handle an annotation. */
2075 switch (*charbuf)
2076 {
2077 case CODING_ANNOTATE_COMPOSITION_MASK:
2078 /* Not yet implemented. */
2079 break;
2080 case CODING_ANNOTATE_CHARSET_MASK:
2081 preferred_charset_id = charbuf[3];
2082 if (preferred_charset_id >= 0
2083 && NILP (Fmemq (make_number (preferred_charset_id),
2084 charset_list)))
2085 preferred_charset_id = -1;
2086 break;
2087 default:
2088 abort ();
2089 }
2090 charbuf += -c - 1;
2091 continue;
2092 }
2093
2094 if (ASCII_CHAR_P (c))
2095 EMIT_ONE_ASCII_BYTE (c);
2096 else if (CHAR_BYTE8_P (c))
2097 {
2098 c = CHAR_TO_BYTE8 (c);
2099 EMIT_ONE_BYTE (c);
2100 }
2101 else
2102 {
2103 struct charset *charset;
2104 unsigned code;
2105 int dimension;
2106 int emacs_mule_id;
2107 unsigned char leading_codes[2];
2108
2109 if (preferred_charset_id >= 0)
2110 {
2111 charset = CHARSET_FROM_ID (preferred_charset_id);
2112 if (! CHAR_CHARSET_P (c, charset))
2113 charset = char_charset (c, charset_list, NULL);
635 } 2114 }
636 else 2115 else
637 c -= 0x20; 2116 charset = char_charset (c, charset_list, &code);
638 } 2117 if (! charset)
639 2118 {
640 if (c < 0x20) 2119 c = coding->default_char;
641 { 2120 if (ASCII_CHAR_P (c))
642 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) 2121 {
643 return 0; 2122 EMIT_ONE_ASCII_BYTE (c);
644 } 2123 continue;
645 else if (c >= 0x80 && c < 0xA0) 2124 }
646 { 2125 charset = char_charset (c, charset_list, &code);
647 if (c == 0x80) 2126 }
648 /* Old leading code for a composite character. */ 2127 dimension = CHARSET_DIMENSION (charset);
649 composing = 1; 2128 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2129 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2130 EMIT_ONE_BYTE (leading_codes[0]);
2131 if (leading_codes[1])
2132 EMIT_ONE_BYTE (leading_codes[1]);
2133 if (dimension == 1)
2134 EMIT_ONE_BYTE (code);
650 else 2135 else
651 { 2136 {
652 unsigned char *src_base = src - 1; 2137 EMIT_ONE_BYTE (code >> 8);
653 int bytes; 2138 EMIT_ONE_BYTE (code & 0xFF);
654
655 if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
656 bytes))
657 return 0;
658 src = src_base + bytes;
659 } 2139 }
660 } 2140 }
661 } 2141 }
662 label_end_of_loop: 2142 coding->result = CODING_RESULT_SUCCESS;
663 return CODING_CATEGORY_MASK_EMACS_MULE; 2143 coding->produced_char += produced_chars;
664 } 2144 coding->produced = dst - coding->destination;
665 2145 return 0;
666
667 /* Record the starting position START and METHOD of one composition. */
668
669 #define CODING_ADD_COMPOSITION_START(coding, start, method) \
670 do { \
671 struct composition_data *cmp_data = coding->cmp_data; \
672 int *data = cmp_data->data + cmp_data->used; \
673 coding->cmp_data_start = cmp_data->used; \
674 data[0] = -1; \
675 data[1] = cmp_data->char_offset + start; \
676 data[3] = (int) method; \
677 cmp_data->used += 4; \
678 } while (0)
679
680 /* Record the ending position END of the current composition. */
681
682 #define CODING_ADD_COMPOSITION_END(coding, end) \
683 do { \
684 struct composition_data *cmp_data = coding->cmp_data; \
685 int *data = cmp_data->data + coding->cmp_data_start; \
686 data[0] = cmp_data->used - coding->cmp_data_start; \
687 data[2] = cmp_data->char_offset + end; \
688 } while (0)
689
690 /* Record one COMPONENT (alternate character or composition rule). */
691
692 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component) \
693 do { \
694 coding->cmp_data->data[coding->cmp_data->used++] = component; \
695 if (coding->cmp_data->used - coding->cmp_data_start \
696 == COMPOSITION_DATA_MAX_BUNCH_LENGTH) \
697 { \
698 CODING_ADD_COMPOSITION_END (coding, coding->produced_char); \
699 coding->composing = COMPOSITION_NO; \
700 } \
701 } while (0)
702
703
704 /* Get one byte from a data pointed by SRC and increment SRC. If SRC
705 is not less than SRC_END, return -1 without incrementing Src. */
706
707 #define SAFE_ONE_MORE_BYTE() (src >= src_end ? -1 : *src++)
708
709
710 /* Decode a character represented as a component of composition
711 sequence of Emacs 20 style at SRC. Set C to that character, store
712 its multibyte form sequence at P, and set P to the end of that
713 sequence. If no valid character is found, set C to -1. */
714
715 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p) \
716 do { \
717 int bytes; \
718 \
719 c = SAFE_ONE_MORE_BYTE (); \
720 if (c < 0) \
721 break; \
722 if (CHAR_HEAD_P (c)) \
723 c = -1; \
724 else if (c == 0xA0) \
725 { \
726 c = SAFE_ONE_MORE_BYTE (); \
727 if (c < 0xA0) \
728 c = -1; \
729 else \
730 { \
731 c -= 0xA0; \
732 *p++ = c; \
733 } \
734 } \
735 else if (BASE_LEADING_CODE_P (c - 0x20)) \
736 { \
737 unsigned char *p0 = p; \
738 \
739 c -= 0x20; \
740 *p++ = c; \
741 bytes = BYTES_BY_CHAR_HEAD (c); \
742 while (--bytes) \
743 { \
744 c = SAFE_ONE_MORE_BYTE (); \
745 if (c < 0) \
746 break; \
747 *p++ = c; \
748 } \
749 if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes) \
750 || (coding->flags /* We are recovering a file. */ \
751 && p0[0] == LEADING_CODE_8_BIT_CONTROL \
752 && ! CHAR_HEAD_P (p0[1]))) \
753 c = STRING_CHAR (p0, bytes); \
754 else \
755 c = -1; \
756 } \
757 else \
758 c = -1; \
759 } while (0)
760
761
762 /* Decode a composition rule represented as a component of composition
763 sequence of Emacs 20 style at SRC. Set C to the rule. If not
764 valid rule is found, set C to -1. */
765
766 #define DECODE_EMACS_MULE_COMPOSITION_RULE(c) \
767 do { \
768 c = SAFE_ONE_MORE_BYTE (); \
769 c -= 0xA0; \
770 if (c < 0 || c >= 81) \
771 c = -1; \
772 else \
773 { \
774 gref = c / 9, nref = c % 9; \
775 c = COMPOSITION_ENCODE_RULE (gref, nref); \
776 } \
777 } while (0)
778
779
780 /* Decode composition sequence encoded by `emacs-mule' at the source
781 pointed by SRC. SRC_END is the end of source. Store information
782 of the composition in CODING->cmp_data.
783
784 For backward compatibility, decode also a composition sequence of
785 Emacs 20 style. In that case, the composition sequence contains
786 characters that should be extracted into a buffer or string. Store
787 those characters at *DESTINATION in multibyte form.
788
789 If we encounter an invalid byte sequence, return 0.
790 If we encounter an insufficient source or destination, or
791 insufficient space in CODING->cmp_data, return 1.
792 Otherwise, return consumed bytes in the source.
793
794 */
795 static INLINE int
796 decode_composition_emacs_mule (coding, src, src_end,
797 destination, dst_end, dst_bytes)
798 struct coding_system *coding;
799 unsigned char *src, *src_end, **destination, *dst_end;
800 int dst_bytes;
801 {
802 unsigned char *dst = *destination;
803 int method, data_len, nchars;
804 unsigned char *src_base = src++;
805 /* Store components of composition. */
806 int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
807 int ncomponent;
808 /* Store multibyte form of characters to be composed. This is for
809 Emacs 20 style composition sequence. */
810 unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
811 unsigned char *bufp = buf;
812 int c, i, gref, nref;
813
814 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
815 >= COMPOSITION_DATA_SIZE)
816 {
817 coding->result = CODING_FINISH_INSUFFICIENT_CMP;
818 return -1;
819 }
820
821 ONE_MORE_BYTE (c);
822 if (c - 0xF0 >= COMPOSITION_RELATIVE
823 && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
824 {
825 int with_rule;
826
827 method = c - 0xF0;
828 with_rule = (method == COMPOSITION_WITH_RULE
829 || method == COMPOSITION_WITH_RULE_ALTCHARS);
830 ONE_MORE_BYTE (c);
831 data_len = c - 0xA0;
832 if (data_len < 4
833 || src_base + data_len > src_end)
834 return 0;
835 ONE_MORE_BYTE (c);
836 nchars = c - 0xA0;
837 if (c < 1)
838 return 0;
839 for (ncomponent = 0; src < src_base + data_len; ncomponent++)
840 {
841 /* If it is longer than this, it can't be valid. */
842 if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
843 return 0;
844
845 if (ncomponent % 2 && with_rule)
846 {
847 ONE_MORE_BYTE (gref);
848 gref -= 32;
849 ONE_MORE_BYTE (nref);
850 nref -= 32;
851 c = COMPOSITION_ENCODE_RULE (gref, nref);
852 }
853 else
854 {
855 int bytes;
856 if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
857 || (coding->flags /* We are recovering a file. */
858 && src[0] == LEADING_CODE_8_BIT_CONTROL
859 && ! CHAR_HEAD_P (src[1])))
860 c = STRING_CHAR (src, bytes);
861 else
862 c = *src, bytes = 1;
863 src += bytes;
864 }
865 component[ncomponent] = c;
866 }
867 }
868 else
869 {
870 /* This may be an old Emacs 20 style format. See the comment at
871 the section 2 of this file. */
872 while (src < src_end && !CHAR_HEAD_P (*src)) src++;
873 if (src == src_end
874 && !(coding->mode & CODING_MODE_LAST_BLOCK))
875 goto label_end_of_loop;
876
877 src_end = src;
878 src = src_base + 1;
879 if (c < 0xC0)
880 {
881 method = COMPOSITION_RELATIVE;
882 for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
883 {
884 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
885 if (c < 0)
886 break;
887 component[ncomponent++] = c;
888 }
889 if (ncomponent < 2)
890 return 0;
891 nchars = ncomponent;
892 }
893 else if (c == 0xFF)
894 {
895 method = COMPOSITION_WITH_RULE;
896 src++;
897 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
898 if (c < 0)
899 return 0;
900 component[0] = c;
901 for (ncomponent = 1;
902 ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
903 {
904 DECODE_EMACS_MULE_COMPOSITION_RULE (c);
905 if (c < 0)
906 break;
907 component[ncomponent++] = c;
908 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
909 if (c < 0)
910 break;
911 component[ncomponent++] = c;
912 }
913 if (ncomponent < 3)
914 return 0;
915 nchars = (ncomponent + 1) / 2;
916 }
917 else
918 return 0;
919 }
920
921 if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
922 {
923 CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
924 for (i = 0; i < ncomponent; i++)
925 CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
926 CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
927 if (buf < bufp)
928 {
929 unsigned char *p = buf;
930 EMIT_BYTES (p, bufp);
931 *destination += bufp - buf;
932 coding->produced_char += nchars;
933 }
934 return (src - src_base);
935 }
936 label_end_of_loop:
937 return -1;
938 }
939
940 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
941
942 static void
943 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
944 struct coding_system *coding;
945 unsigned char *source, *destination;
946 int src_bytes, dst_bytes;
947 {
948 unsigned char *src = source;
949 unsigned char *src_end = source + src_bytes;
950 unsigned char *dst = destination;
951 unsigned char *dst_end = destination + dst_bytes;
952 /* SRC_BASE remembers the start position in source in each loop.
953 The loop will be exited when there's not enough source code, or
954 when there's not enough destination area to produce a
955 character. */
956 unsigned char *src_base;
957
958 coding->produced_char = 0;
959 while ((src_base = src) < src_end)
960 {
961 unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
962 int bytes;
963
964 if (*src == '\r')
965 {
966 int c = *src++;
967
968 if (coding->eol_type == CODING_EOL_CR)
969 c = '\n';
970 else if (coding->eol_type == CODING_EOL_CRLF)
971 {
972 ONE_MORE_BYTE (c);
973 if (c != '\n')
974 {
975 src--;
976 c = '\r';
977 }
978 }
979 *dst++ = c;
980 coding->produced_char++;
981 continue;
982 }
983 else if (*src == '\n')
984 {
985 if ((coding->eol_type == CODING_EOL_CR
986 || coding->eol_type == CODING_EOL_CRLF)
987 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
988 {
989 coding->result = CODING_FINISH_INCONSISTENT_EOL;
990 goto label_end_of_loop;
991 }
992 *dst++ = *src++;
993 coding->produced_char++;
994 continue;
995 }
996 else if (*src == 0x80 && coding->cmp_data)
997 {
998 /* Start of composition data. */
999 int consumed = decode_composition_emacs_mule (coding, src, src_end,
1000 &dst, dst_end,
1001 dst_bytes);
1002 if (consumed < 0)
1003 goto label_end_of_loop;
1004 else if (consumed > 0)
1005 {
1006 src += consumed;
1007 continue;
1008 }
1009 bytes = CHAR_STRING (*src, tmp);
1010 p = tmp;
1011 src++;
1012 }
1013 else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
1014 || (coding->flags /* We are recovering a file. */
1015 && src[0] == LEADING_CODE_8_BIT_CONTROL
1016 && ! CHAR_HEAD_P (src[1])))
1017 {
1018 p = src;
1019 src += bytes;
1020 }
1021 else
1022 {
1023 bytes = CHAR_STRING (*src, tmp);
1024 p = tmp;
1025 src++;
1026 }
1027 if (dst + bytes >= (dst_bytes ? dst_end : src))
1028 {
1029 coding->result = CODING_FINISH_INSUFFICIENT_DST;
1030 break;
1031 }
1032 while (bytes--) *dst++ = *p++;
1033 coding->produced_char++;
1034 }
1035 label_end_of_loop:
1036 coding->consumed = coding->consumed_char = src_base - source;
1037 coding->produced = dst - destination;
1038 }
1039
1040
1041 /* Encode composition data stored at DATA into a special byte sequence
1042 starting by 0x80. Update CODING->cmp_data_start and maybe
1043 CODING->cmp_data for the next call. */
1044
1045 #define ENCODE_COMPOSITION_EMACS_MULE(coding, data) \
1046 do { \
1047 unsigned char buf[1024], *p0 = buf, *p; \
1048 int len = data[0]; \
1049 int i; \
1050 \
1051 buf[0] = 0x80; \
1052 buf[1] = 0xF0 + data[3]; /* METHOD */ \
1053 buf[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */ \
1054 p = buf + 4; \
1055 if (data[3] == COMPOSITION_WITH_RULE \
1056 || data[3] == COMPOSITION_WITH_RULE_ALTCHARS) \
1057 { \
1058 p += CHAR_STRING (data[4], p); \
1059 for (i = 5; i < len; i += 2) \
1060 { \
1061 int gref, nref; \
1062 COMPOSITION_DECODE_RULE (data[i], gref, nref); \
1063 *p++ = 0x20 + gref; \
1064 *p++ = 0x20 + nref; \
1065 p += CHAR_STRING (data[i + 1], p); \
1066 } \
1067 } \
1068 else \
1069 { \
1070 for (i = 4; i < len; i++) \
1071 p += CHAR_STRING (data[i], p); \
1072 } \
1073 buf[2] = 0xA0 + (p - buf); /* COMPONENTS-BYTES */ \
1074 \
1075 if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src)) \
1076 { \
1077 coding->result = CODING_FINISH_INSUFFICIENT_DST; \
1078 goto label_end_of_loop; \
1079 } \
1080 while (p0 < p) \
1081 *dst++ = *p0++; \
1082 coding->cmp_data_start += data[0]; \
1083 if (coding->cmp_data_start == coding->cmp_data->used \
1084 && coding->cmp_data->next) \
1085 { \
1086 coding->cmp_data = coding->cmp_data->next; \
1087 coding->cmp_data_start = 0; \
1088 } \
1089 } while (0)
1090
1091
1092 static void encode_eol P_ ((struct coding_system *, const unsigned char *,
1093 unsigned char *, int, int));
1094
1095 static void
1096 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1097 struct coding_system *coding;
1098 unsigned char *source, *destination;
1099 int src_bytes, dst_bytes;
1100 {
1101 unsigned char *src = source;
1102 unsigned char *src_end = source + src_bytes;
1103 unsigned char *dst = destination;
1104 unsigned char *dst_end = destination + dst_bytes;
1105 unsigned char *src_base;
1106 int c;
1107 int char_offset;
1108 int *data;
1109
1110 Lisp_Object translation_table;
1111
1112 translation_table = Qnil;
1113
1114 /* Optimization for the case that there's no composition. */
1115 if (!coding->cmp_data || coding->cmp_data->used == 0)
1116 {
1117 encode_eol (coding, source, destination, src_bytes, dst_bytes);
1118 return;
1119 }
1120
1121 char_offset = coding->cmp_data->char_offset;
1122 data = coding->cmp_data->data + coding->cmp_data_start;
1123 while (1)
1124 {
1125 src_base = src;
1126
1127 /* If SRC starts a composition, encode the information about the
1128 composition in advance. */
1129 if (coding->cmp_data_start < coding->cmp_data->used
1130 && char_offset + coding->consumed_char == data[1])
1131 {
1132 ENCODE_COMPOSITION_EMACS_MULE (coding, data);
1133 char_offset = coding->cmp_data->char_offset;
1134 data = coding->cmp_data->data + coding->cmp_data_start;
1135 }
1136
1137 ONE_MORE_CHAR (c);
1138 if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1139 || coding->eol_type == CODING_EOL_CR))
1140 {
1141 if (coding->eol_type == CODING_EOL_CRLF)
1142 EMIT_TWO_BYTES ('\r', c);
1143 else
1144 EMIT_ONE_BYTE ('\r');
1145 }
1146 else if (SINGLE_BYTE_CHAR_P (c))
1147 {
1148 if (coding->flags && ! ASCII_BYTE_P (c))
1149 {
1150 /* As we are auto saving, retain the multibyte form for
1151 8-bit chars. */
1152 unsigned char buf[MAX_MULTIBYTE_LENGTH];
1153 int bytes = CHAR_STRING (c, buf);
1154
1155 if (bytes == 1)
1156 EMIT_ONE_BYTE (buf[0]);
1157 else
1158 EMIT_TWO_BYTES (buf[0], buf[1]);
1159 }
1160 else
1161 EMIT_ONE_BYTE (c);
1162 }
1163 else
1164 EMIT_BYTES (src_base, src);
1165 coding->consumed_char++;
1166 }
1167 label_end_of_loop:
1168 coding->consumed = src_base - source;
1169 coding->produced = coding->produced_char = dst - destination;
1170 return;
1171 } 2146 }
1172 2147
1173 2148
1174 /*** 3. ISO2022 handlers ***/ 2149 /*** 7. ISO2022 handlers ***/
1175 2150
1176 /* The following note describes the coding system ISO2022 briefly. 2151 /* The following note describes the coding system ISO2022 briefly.
1177 Since the intention of this note is to help understand the 2152 Since the intention of this note is to help understand the
1178 functions in this file, some parts are NOT ACCURATE or are OVERLY 2153 functions in this file, some parts are NOT ACCURATE or are OVERLY
1179 SIMPLIFIED. For thorough understanding, please refer to the 2154 SIMPLIFIED. For thorough understanding, please refer to the
1299 Emacs accepts them on decoding, and produces them on encoding 2274 Emacs accepts them on decoding, and produces them on encoding
1300 CHARS96 character sets in a coding system which is characterized as 2275 CHARS96 character sets in a coding system which is characterized as
1301 7-bit environment, non-locking-shift, and non-single-shift. 2276 7-bit environment, non-locking-shift, and non-single-shift.
1302 2277
1303 Note (**): If <F> is '@', 'A', or 'B', the intermediate character 2278 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1304 '(' can be omitted. We refer to this as "short-form" hereafter. 2279 '(' must be omitted. We refer to this as "short-form" hereafter.
1305 2280
1306 Now you may notice that there are a lot of ways of encoding the 2281 Now you may notice that there are a lot of ways of encoding the
1307 same multilingual text in ISO2022. Actually, there exist many 2282 same multilingual text in ISO2022. Actually, there exist many
1308 coding systems such as Compound Text (used in X11's inter client 2283 coding systems such as Compound Text (used in X11's inter client
1309 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR 2284 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1329 o ESC '3' -- start relative composition with alternate chars (**) 2304 o ESC '3' -- start relative composition with alternate chars (**)
1330 o ESC '4' -- start rule-base composition with alternate chars (**) 2305 o ESC '4' -- start rule-base composition with alternate chars (**)
1331 Since these are not standard escape sequences of any ISO standard, 2306 Since these are not standard escape sequences of any ISO standard,
1332 the use of them with these meanings is restricted to Emacs only. 2307 the use of them with these meanings is restricted to Emacs only.
1333 2308
1334 (*) This form is used only in Emacs 20.5 and older versions, 2309 (*) This form is used only in Emacs 20.7 and older versions,
1335 but the newer versions can safely decode it. 2310 but newer versions can safely decode it.
1336 (**) This form is used only in Emacs 21.1 and newer versions, 2311 (**) This form is used only in Emacs 21.1 and newer versions,
1337 and the older versions can't decode it. 2312 and older versions can't decode it.
1338 2313
1339 Here's a list of example usages of these composition escape 2314 Here's a list of example usages of these composition escape
1340 sequences (categorized by `enum composition_method'). 2315 sequences (categorized by `enum composition_method').
1341 2316
1342 COMPOSITION_RELATIVE: 2317 COMPOSITION_RELATIVE:
1348 COMPOSITION_WITH_RULE_ALTCHARS: 2323 COMPOSITION_WITH_RULE_ALTCHARS:
1349 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */ 2324 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1350 2325
1351 enum iso_code_class_type iso_code_class[256]; 2326 enum iso_code_class_type iso_code_class[256];
1352 2327
1353 #define CHARSET_OK(idx, charset, c) \ 2328 #define SAFE_CHARSET_P(coding, id) \
1354 (coding_system_table[idx] \ 2329 ((id) <= (coding)->max_charset_id \
1355 && (charset == CHARSET_ASCII \ 2330 && (coding)->safe_charsets[id] >= 0)
1356 || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \ 2331
1357 CODING_SAFE_CHAR_P (safe_chars, c))) \ 2332
1358 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \ 2333 #define SHIFT_OUT_OK(category) \
1359 charset) \ 2334 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
1360 != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)) 2335
1361 2336 static void
1362 #define SHIFT_OUT_OK(idx) \ 2337 setup_iso_safe_charsets (attrs)
1363 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0) 2338 Lisp_Object attrs;
1364 2339 {
1365 #define COMPOSITION_OK(idx) \ 2340 Lisp_Object charset_list, safe_charsets;
1366 (coding_system_table[idx]->composing != COMPOSITION_DISABLED) 2341 Lisp_Object request;
2342 Lisp_Object reg_usage;
2343 Lisp_Object tail;
2344 int reg94, reg96;
2345 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2346 int max_charset_id;
2347
2348 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2349 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2350 && ! EQ (charset_list, Viso_2022_charset_list))
2351 {
2352 CODING_ATTR_CHARSET_LIST (attrs)
2353 = charset_list = Viso_2022_charset_list;
2354 ASET (attrs, coding_attr_safe_charsets, Qnil);
2355 }
2356
2357 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2358 return;
2359
2360 max_charset_id = 0;
2361 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2362 {
2363 int id = XINT (XCAR (tail));
2364 if (max_charset_id < id)
2365 max_charset_id = id;
2366 }
2367
2368 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2369 make_number (255));
2370 request = AREF (attrs, coding_attr_iso_request);
2371 reg_usage = AREF (attrs, coding_attr_iso_usage);
2372 reg94 = XINT (XCAR (reg_usage));
2373 reg96 = XINT (XCDR (reg_usage));
2374
2375 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2376 {
2377 Lisp_Object id;
2378 Lisp_Object reg;
2379 struct charset *charset;
2380
2381 id = XCAR (tail);
2382 charset = CHARSET_FROM_ID (XINT (id));
2383 reg = Fcdr (Fassq (id, request));
2384 if (! NILP (reg))
2385 SSET (safe_charsets, XINT (id), XINT (reg));
2386 else if (charset->iso_chars_96)
2387 {
2388 if (reg96 < 4)
2389 SSET (safe_charsets, XINT (id), reg96);
2390 }
2391 else
2392 {
2393 if (reg94 < 4)
2394 SSET (safe_charsets, XINT (id), reg94);
2395 }
2396 }
2397 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2398 }
2399
1367 2400
1368 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". 2401 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1369 Check if a text is encoded in ISO2022. If it is, return an 2402 Check if a text is encoded in one of ISO-2022 based coding systems.
1370 integer in which appropriate flag bits any of: 2403 If it is, return 1, else return 0. */
1371 CODING_CATEGORY_MASK_ISO_7
1372 CODING_CATEGORY_MASK_ISO_7_TIGHT
1373 CODING_CATEGORY_MASK_ISO_8_1
1374 CODING_CATEGORY_MASK_ISO_8_2
1375 CODING_CATEGORY_MASK_ISO_7_ELSE
1376 CODING_CATEGORY_MASK_ISO_8_ELSE
1377 are set. If a code which should never appear in ISO2022 is found,
1378 returns 0. */
1379 2404
1380 static int 2405 static int
1381 detect_coding_iso2022 (src, src_end, multibytep) 2406 detect_coding_iso_2022 (coding, detect_info)
1382 unsigned char *src, *src_end; 2407 struct coding_system *coding;
1383 int multibytep; 2408 struct coding_detection_info *detect_info;
1384 { 2409 {
1385 int mask = CODING_CATEGORY_MASK_ISO; 2410 const unsigned char *src = coding->source, *src_base = src;
1386 int mask_found = 0; 2411 const unsigned char *src_end = coding->source + coding->src_bytes;
1387 int reg[4], shift_out = 0, single_shifting = 0; 2412 int multibytep = coding->src_multibyte;
1388 int c, c1, charset; 2413 int single_shifting = 0;
1389 /* Dummy for ONE_MORE_BYTE. */ 2414 int id;
1390 struct coding_system dummy_coding; 2415 int c, c1;
1391 struct coding_system *coding = &dummy_coding; 2416 int consumed_chars = 0;
1392 Lisp_Object safe_chars; 2417 int i;
1393 2418 int rejected = 0;
1394 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1; 2419 int found = 0;
1395 while (mask && src < src_end) 2420
1396 { 2421 detect_info->checked |= CATEGORY_MASK_ISO;
1397 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); 2422
1398 retry: 2423 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2424 {
2425 struct coding_system *this = &(coding_categories[i]);
2426 Lisp_Object attrs, val;
2427
2428 attrs = CODING_ID_ATTRS (this->id);
2429 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2430 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2431 setup_iso_safe_charsets (attrs);
2432 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2433 this->max_charset_id = SCHARS (val) - 1;
2434 this->safe_charsets = (char *) SDATA (val);
2435 }
2436
2437 /* A coding system of this category is always ASCII compatible. */
2438 src += coding->head_ascii;
2439
2440 while (rejected != CATEGORY_MASK_ISO)
2441 {
2442 ONE_MORE_BYTE (c);
1399 switch (c) 2443 switch (c)
1400 { 2444 {
1401 case ISO_CODE_ESC: 2445 case ISO_CODE_ESC:
1402 if (inhibit_iso_escape_detection) 2446 if (inhibit_iso_escape_detection)
1403 break; 2447 break;
1404 single_shifting = 0; 2448 single_shifting = 0;
1405 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); 2449 ONE_MORE_BYTE (c);
1406 if (c >= '(' && c <= '/') 2450 if (c >= '(' && c <= '/')
1407 { 2451 {
1408 /* Designation sequence for a charset of dimension 1. */ 2452 /* Designation sequence for a charset of dimension 1. */
1409 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep); 2453 ONE_MORE_BYTE (c1);
1410 if (c1 < ' ' || c1 >= 0x80 2454 if (c1 < ' ' || c1 >= 0x80
1411 || (charset = iso_charset_table[0][c >= ','][c1]) < 0) 2455 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
1412 /* Invalid designation sequence. Just ignore. */ 2456 /* Invalid designation sequence. Just ignore. */
1413 break; 2457 break;
1414 reg[(c - '(') % 4] = charset;
1415 } 2458 }
1416 else if (c == '$') 2459 else if (c == '$')
1417 { 2460 {
1418 /* Designation sequence for a charset of dimension 2. */ 2461 /* Designation sequence for a charset of dimension 2. */
1419 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); 2462 ONE_MORE_BYTE (c);
1420 if (c >= '@' && c <= 'B') 2463 if (c >= '@' && c <= 'B')
1421 /* Designation for JISX0208.1978, GB2312, or JISX0208. */ 2464 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
1422 reg[0] = charset = iso_charset_table[1][0][c]; 2465 id = iso_charset_table[1][0][c];
1423 else if (c >= '(' && c <= '/') 2466 else if (c >= '(' && c <= '/')
1424 { 2467 {
1425 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep); 2468 ONE_MORE_BYTE (c1);
1426 if (c1 < ' ' || c1 >= 0x80 2469 if (c1 < ' ' || c1 >= 0x80
1427 || (charset = iso_charset_table[1][c >= ','][c1]) < 0) 2470 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
1428 /* Invalid designation sequence. Just ignore. */ 2471 /* Invalid designation sequence. Just ignore. */
1429 break; 2472 break;
1430 reg[(c - '(') % 4] = charset;
1431 } 2473 }
1432 else 2474 else
1433 /* Invalid designation sequence. Just ignore. */ 2475 /* Invalid designation sequence. Just ignore it. */
1434 break; 2476 break;
1435 } 2477 }
1436 else if (c == 'N' || c == 'O') 2478 else if (c == 'N' || c == 'O')
1437 { 2479 {
1438 /* ESC <Fe> for SS2 or SS3. */ 2480 /* ESC <Fe> for SS2 or SS3. */
1439 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE; 2481 single_shifting = 1;
2482 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
1440 break; 2483 break;
1441 } 2484 }
1442 else if (c >= '0' && c <= '4') 2485 else if (c >= '0' && c <= '4')
1443 { 2486 {
1444 /* ESC <Fp> for start/end composition. */ 2487 /* ESC <Fp> for start/end composition. */
1445 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7)) 2488 found |= CATEGORY_MASK_ISO;
1446 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1447 else
1448 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1449 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1450 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1451 else
1452 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1453 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1454 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1455 else
1456 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1457 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1458 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1459 else
1460 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1461 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1462 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1463 else
1464 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1465 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1466 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1467 else
1468 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1469 break; 2489 break;
1470 } 2490 }
1471 else 2491 else
1472 /* Invalid escape sequence. Just ignore. */ 2492 {
1473 break; 2493 /* Invalid escape sequence. Just ignore it. */
2494 break;
2495 }
1474 2496
1475 /* We found a valid designation sequence for CHARSET. */ 2497 /* We found a valid designation sequence for CHARSET. */
1476 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT; 2498 rejected |= CATEGORY_MASK_ISO_8BIT;
1477 c = MAKE_CHAR (charset, 0, 0); 2499 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
1478 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c)) 2500 id))
1479 mask_found |= CODING_CATEGORY_MASK_ISO_7; 2501 found |= CATEGORY_MASK_ISO_7;
1480 else 2502 else
1481 mask &= ~CODING_CATEGORY_MASK_ISO_7; 2503 rejected |= CATEGORY_MASK_ISO_7;
1482 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c)) 2504 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
1483 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT; 2505 id))
2506 found |= CATEGORY_MASK_ISO_7_TIGHT;
1484 else 2507 else
1485 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT; 2508 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
1486 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c)) 2509 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
1487 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE; 2510 id))
2511 found |= CATEGORY_MASK_ISO_7_ELSE;
1488 else 2512 else
1489 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE; 2513 rejected |= CATEGORY_MASK_ISO_7_ELSE;
1490 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c)) 2514 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
1491 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE; 2515 id))
2516 found |= CATEGORY_MASK_ISO_8_ELSE;
1492 else 2517 else
1493 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE; 2518 rejected |= CATEGORY_MASK_ISO_8_ELSE;
1494 break; 2519 break;
1495 2520
1496 case ISO_CODE_SO: 2521 case ISO_CODE_SO:
2522 case ISO_CODE_SI:
2523 /* Locking shift out/in. */
1497 if (inhibit_iso_escape_detection) 2524 if (inhibit_iso_escape_detection)
1498 break; 2525 break;
1499 single_shifting = 0; 2526 single_shifting = 0;
1500 if (shift_out == 0 2527 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
1501 && (reg[1] >= 0 2528 found |= CATEGORY_MASK_ISO_ELSE;
1502 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1503 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1504 {
1505 /* Locking shift out. */
1506 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1507 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1508 }
1509 break; 2529 break;
1510 2530
1511 case ISO_CODE_SI: 2531 case ISO_CODE_CSI:
2532 /* Control sequence introducer. */
2533 single_shifting = 0;
2534 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2535 found |= CATEGORY_MASK_ISO_8_ELSE;
2536 goto check_extra_latin;
2537
2538
2539 case ISO_CODE_SS2:
2540 case ISO_CODE_SS3:
2541 /* Single shift. */
1512 if (inhibit_iso_escape_detection) 2542 if (inhibit_iso_escape_detection)
1513 break; 2543 break;
1514 single_shifting = 0; 2544 single_shifting = 1;
1515 if (shift_out == 1) 2545 rejected |= CATEGORY_MASK_ISO_7BIT;
1516 { 2546 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
1517 /* Locking shift in. */ 2547 & CODING_ISO_FLAG_SINGLE_SHIFT)
1518 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT; 2548 found |= CATEGORY_MASK_ISO_8_1;
1519 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT; 2549 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
1520 } 2550 & CODING_ISO_FLAG_SINGLE_SHIFT)
1521 break; 2551 found |= CATEGORY_MASK_ISO_8_2;
1522 2552 goto check_extra_latin;
1523 case ISO_CODE_CSI:
1524 single_shifting = 0;
1525 case ISO_CODE_SS2:
1526 case ISO_CODE_SS3:
1527 {
1528 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1529
1530 if (inhibit_iso_escape_detection)
1531 break;
1532 if (c != ISO_CODE_CSI)
1533 {
1534 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1535 & CODING_FLAG_ISO_SINGLE_SHIFT)
1536 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1537 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1538 & CODING_FLAG_ISO_SINGLE_SHIFT)
1539 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1540 single_shifting = 1;
1541 }
1542 if (VECTORP (Vlatin_extra_code_table)
1543 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1544 {
1545 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1546 & CODING_FLAG_ISO_LATIN_EXTRA)
1547 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1548 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1549 & CODING_FLAG_ISO_LATIN_EXTRA)
1550 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1551 }
1552 mask &= newmask;
1553 mask_found |= newmask;
1554 }
1555 break;
1556 2553
1557 default: 2554 default:
1558 if (c < 0x80) 2555 if (c < 0x80)
1559 { 2556 {
1560 single_shifting = 0; 2557 single_shifting = 0;
1561 break; 2558 break;
1562 } 2559 }
1563 else if (c < 0xA0) 2560 if (c >= 0xA0)
1564 { 2561 {
1565 single_shifting = 0; 2562 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
1566 if (VECTORP (Vlatin_extra_code_table) 2563 found |= CATEGORY_MASK_ISO_8_1;
1567 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1568 {
1569 int newmask = 0;
1570
1571 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1572 & CODING_FLAG_ISO_LATIN_EXTRA)
1573 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1574 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1575 & CODING_FLAG_ISO_LATIN_EXTRA)
1576 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1577 mask &= newmask;
1578 mask_found |= newmask;
1579 }
1580 else
1581 return 0;
1582 }
1583 else
1584 {
1585 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1586 | CODING_CATEGORY_MASK_ISO_7_ELSE);
1587 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1588 /* Check the length of succeeding codes of the range 2564 /* Check the length of succeeding codes of the range
1589 0xA0..0xFF. If the byte length is odd, we exclude 2565 0xA0..0xFF. If the byte length is even, we include
1590 CODING_CATEGORY_MASK_ISO_8_2. We can check this only 2566 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
1591 when we are not single shifting. */ 2567 only when we are not single shifting. */
1592 if (!single_shifting 2568 if (! single_shifting
1593 && mask & CODING_CATEGORY_MASK_ISO_8_2) 2569 && ! (rejected & CATEGORY_MASK_ISO_8_2))
1594 { 2570 {
1595 int i = 1; 2571 int i = 1;
1596
1597 c = -1;
1598 while (src < src_end) 2572 while (src < src_end)
1599 { 2573 {
1600 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); 2574 ONE_MORE_BYTE (c);
1601 if (c < 0xA0) 2575 if (c < 0xA0)
1602 break; 2576 break;
1603 i++; 2577 i++;
1604 } 2578 }
1605 2579
1606 if (i & 1 && src < src_end) 2580 if (i & 1 && src < src_end)
1607 mask &= ~CODING_CATEGORY_MASK_ISO_8_2; 2581 rejected |= CATEGORY_MASK_ISO_8_2;
1608 else 2582 else
1609 mask_found |= CODING_CATEGORY_MASK_ISO_8_2; 2583 found |= CATEGORY_MASK_ISO_8_2;
1610 if (c >= 0)
1611 /* This means that we have read one extra byte. */
1612 goto retry;
1613 } 2584 }
2585 break;
1614 } 2586 }
1615 break; 2587 check_extra_latin:
1616 } 2588 single_shifting = 0;
1617 } 2589 if (! VECTORP (Vlatin_extra_code_table)
1618 label_end_of_loop: 2590 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1619 return (mask & mask_found); 2591 {
1620 } 2592 rejected = CATEGORY_MASK_ISO;
1621 2593 break;
1622 /* Decode a character of which charset is CHARSET, the 1st position 2594 }
1623 code is C1, the 2nd position code is C2, and return the decoded 2595 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
1624 character code. If the variable `translation_table' is non-nil, 2596 & CODING_ISO_FLAG_LATIN_EXTRA)
1625 returned the translated code. */ 2597 found |= CATEGORY_MASK_ISO_8_1;
1626 2598 else
1627 #define DECODE_ISO_CHARACTER(charset, c1, c2) \ 2599 rejected |= CATEGORY_MASK_ISO_8_1;
1628 (NILP (translation_table) \ 2600 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
1629 ? MAKE_CHAR (charset, c1, c2) \ 2601 & CODING_ISO_FLAG_LATIN_EXTRA)
1630 : translate_char (translation_table, -1, charset, c1, c2)) 2602 found |= CATEGORY_MASK_ISO_8_2;
2603 else
2604 rejected |= CATEGORY_MASK_ISO_8_2;
2605 }
2606 }
2607 detect_info->rejected |= CATEGORY_MASK_ISO;
2608 return 0;
2609
2610 no_more_source:
2611 detect_info->rejected |= rejected;
2612 detect_info->found |= (found & ~rejected);
2613 return 1;
2614 }
2615
1631 2616
1632 /* Set designation state into CODING. */ 2617 /* Set designation state into CODING. */
1633 #define DECODE_DESIGNATION(reg, dimension, chars, final_char) \ 2618 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
1634 do { \ 2619 do { \
1635 int charset, c; \ 2620 int id, prev; \
1636 \ 2621 \
1637 if (final_char < '0' || final_char >= 128) \ 2622 if (final < '0' || final >= 128 \
1638 goto label_invalid_code; \ 2623 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
1639 charset = ISO_CHARSET_TABLE (make_number (dimension), \ 2624 || !SAFE_CHARSET_P (coding, id)) \
1640 make_number (chars), \ 2625 { \
1641 make_number (final_char)); \ 2626 CODING_ISO_DESIGNATION (coding, reg) = -2; \
1642 c = MAKE_CHAR (charset, 0, 0); \ 2627 goto invalid_code; \
1643 if (charset >= 0 \ 2628 } \
1644 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \ 2629 prev = CODING_ISO_DESIGNATION (coding, reg); \
1645 || CODING_SAFE_CHAR_P (safe_chars, c))) \ 2630 if (id == charset_jisx0201_roman) \
1646 { \ 2631 { \
1647 if (coding->spec.iso2022.last_invalid_designation_register == 0 \ 2632 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
1648 && reg == 0 \ 2633 id = charset_ascii; \
1649 && charset == CHARSET_ASCII) \ 2634 } \
1650 { \ 2635 else if (id == charset_jisx0208_1978) \
1651 /* We should insert this designation sequence as is so \ 2636 { \
1652 that it is surely written back to a file. */ \ 2637 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
1653 coding->spec.iso2022.last_invalid_designation_register = -1; \ 2638 id = charset_jisx0208; \
1654 goto label_invalid_code; \ 2639 } \
1655 } \ 2640 CODING_ISO_DESIGNATION (coding, reg) = id; \
1656 coding->spec.iso2022.last_invalid_designation_register = -1; \ 2641 /* If there was an invalid designation to REG previously, and this \
1657 if ((coding->mode & CODING_MODE_DIRECTION) \ 2642 designation is ASCII to REG, we should keep this designation \
1658 && CHARSET_REVERSE_CHARSET (charset) >= 0) \ 2643 sequence. */ \
1659 charset = CHARSET_REVERSE_CHARSET (charset); \ 2644 if (prev == -2 && id == charset_ascii) \
1660 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \ 2645 goto invalid_code; \
1661 } \
1662 else \
1663 { \
1664 coding->spec.iso2022.last_invalid_designation_register = reg; \
1665 goto label_invalid_code; \
1666 } \
1667 } while (0) 2646 } while (0)
1668 2647
1669 /* Allocate a memory block for storing information about compositions. 2648
1670 The block is chained to the already allocated blocks. */ 2649 #define MAYBE_FINISH_COMPOSITION() \
1671 2650 do { \
1672 void 2651 int i; \
1673 coding_allocate_composition_data (coding, char_offset) 2652 if (composition_state == COMPOSING_NO) \
1674 struct coding_system *coding; 2653 break; \
1675 int char_offset; 2654 /* It is assured that we have enough room for producing \
1676 { 2655 characters stored in the table `components'. */ \
1677 struct composition_data *cmp_data 2656 if (charbuf + component_idx > charbuf_end) \
1678 = (struct composition_data *) xmalloc (sizeof *cmp_data); 2657 goto no_more_source; \
1679 2658 composition_state = COMPOSING_NO; \
1680 cmp_data->char_offset = char_offset; 2659 if (method == COMPOSITION_RELATIVE \
1681 cmp_data->used = 0; 2660 || method == COMPOSITION_WITH_ALTCHARS) \
1682 cmp_data->prev = coding->cmp_data; 2661 { \
1683 cmp_data->next = NULL; 2662 for (i = 0; i < component_idx; i++) \
1684 if (coding->cmp_data) 2663 *charbuf++ = components[i]; \
1685 coding->cmp_data->next = cmp_data; 2664 char_offset += component_idx; \
1686 coding->cmp_data = cmp_data; 2665 } \
1687 coding->cmp_data_start = 0; 2666 else \
1688 } 2667 { \
2668 for (i = 0; i < component_idx; i += 2) \
2669 *charbuf++ = components[i]; \
2670 char_offset += (component_idx / 2) + 1; \
2671 } \
2672 } while (0)
2673
1689 2674
1690 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4. 2675 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
1691 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1 2676 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
1692 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1 2677 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
1693 ESC 3 : altchar composition : ESC 3 ALT ... ESC 0 CHAR ... ESC 1 2678 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
1694 ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1 2679 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
1695 */ 2680 */
1696 2681
1697 #define DECODE_COMPOSITION_START(c1) \ 2682 #define DECODE_COMPOSITION_START(c1) \
1698 do { \
1699 if (coding->composing == COMPOSITION_DISABLED) \
1700 { \
1701 *dst++ = ISO_CODE_ESC; \
1702 *dst++ = c1 & 0x7f; \
1703 coding->produced_char += 2; \
1704 } \
1705 else if (!COMPOSING_P (coding)) \
1706 { \
1707 /* This is surely the start of a composition. We must be sure \
1708 that coding->cmp_data has enough space to store the \
1709 information about the composition. If not, terminate the \
1710 current decoding loop, allocate one more memory block for \
1711 coding->cmp_data in the caller, then start the decoding \
1712 loop again. We can't allocate memory here directly because \
1713 it may cause buffer/string relocation. */ \
1714 if (!coding->cmp_data \
1715 || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
1716 >= COMPOSITION_DATA_SIZE)) \
1717 { \
1718 coding->result = CODING_FINISH_INSUFFICIENT_CMP; \
1719 goto label_end_of_loop; \
1720 } \
1721 coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE \
1722 : c1 == '2' ? COMPOSITION_WITH_RULE \
1723 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
1724 : COMPOSITION_WITH_RULE_ALTCHARS); \
1725 CODING_ADD_COMPOSITION_START (coding, coding->produced_char, \
1726 coding->composing); \
1727 coding->composition_rule_follows = 0; \
1728 } \
1729 else \
1730 { \
1731 /* We are already handling a composition. If the method is \
1732 the following two, the codes following the current escape \
1733 sequence are actual characters stored in a buffer. */ \
1734 if (coding->composing == COMPOSITION_WITH_ALTCHARS \
1735 || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS) \
1736 { \
1737 coding->composing = COMPOSITION_RELATIVE; \
1738 coding->composition_rule_follows = 0; \
1739 } \
1740 } \
1741 } while (0)
1742
1743 /* Handle composition end sequence ESC 1. */
1744
1745 #define DECODE_COMPOSITION_END(c1) \
1746 do { \ 2683 do { \
1747 if (! COMPOSING_P (coding)) \ 2684 if (c1 == '0' \
2685 && composition_state == COMPOSING_COMPONENT_RULE) \
1748 { \ 2686 { \
1749 *dst++ = ISO_CODE_ESC; \ 2687 component_len = component_idx; \
1750 *dst++ = c1; \ 2688 composition_state = COMPOSING_CHAR; \
1751 coding->produced_char += 2; \
1752 } \ 2689 } \
1753 else \ 2690 else \
1754 { \ 2691 { \
1755 CODING_ADD_COMPOSITION_END (coding, coding->produced_char); \ 2692 const unsigned char *p; \
1756 coding->composing = COMPOSITION_NO; \ 2693 \
2694 MAYBE_FINISH_COMPOSITION (); \
2695 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2696 goto no_more_source; \
2697 for (p = src; p < src_end - 1; p++) \
2698 if (*p == ISO_CODE_ESC && p[1] == '1') \
2699 break; \
2700 if (p == src_end - 1) \
2701 { \
2702 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2703 goto invalid_code; \
2704 goto no_more_source; \
2705 } \
2706 \
2707 /* This is surely the start of a composition. */ \
2708 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2709 : c1 == '2' ? COMPOSITION_WITH_RULE \
2710 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2711 : COMPOSITION_WITH_RULE_ALTCHARS); \
2712 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2713 : COMPOSING_COMPONENT_CHAR); \
2714 component_idx = component_len = 0; \
1757 } \ 2715 } \
1758 } while (0) 2716 } while (0)
2717
2718
2719 /* Handle composition end sequence ESC 1. */
2720
2721 #define DECODE_COMPOSITION_END() \
2722 do { \
2723 int nchars = (component_len > 0 ? component_idx - component_len \
2724 : method == COMPOSITION_RELATIVE ? component_idx \
2725 : (component_idx + 1) / 2); \
2726 int i; \
2727 int *saved_charbuf = charbuf; \
2728 int from = char_offset; \
2729 int to = from + nchars; \
2730 \
2731 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
2732 if (method != COMPOSITION_RELATIVE) \
2733 { \
2734 if (component_len == 0) \
2735 for (i = 0; i < component_idx; i++) \
2736 *charbuf++ = components[i]; \
2737 else \
2738 for (i = 0; i < component_len; i++) \
2739 *charbuf++ = components[i]; \
2740 *saved_charbuf = saved_charbuf - charbuf; \
2741 } \
2742 if (method == COMPOSITION_WITH_RULE) \
2743 for (i = 0; i < component_idx; i += 2, char_offset++) \
2744 *charbuf++ = components[i]; \
2745 else \
2746 for (i = component_len; i < component_idx; i++, char_offset++) \
2747 *charbuf++ = components[i]; \
2748 coding->annotated = 1; \
2749 composition_state = COMPOSING_NO; \
2750 } while (0)
2751
1759 2752
1760 /* Decode a composition rule from the byte C1 (and maybe one more byte 2753 /* Decode a composition rule from the byte C1 (and maybe one more byte
1761 from SRC) and store one encoded composition rule in 2754 from SRC) and store one encoded composition rule in
1762 coding->cmp_data. */ 2755 coding->cmp_data. */
1763 2756
1764 #define DECODE_COMPOSITION_RULE(c1) \ 2757 #define DECODE_COMPOSITION_RULE(c1) \
1765 do { \ 2758 do { \
1766 int rule = 0; \
1767 (c1) -= 32; \ 2759 (c1) -= 32; \
1768 if (c1 < 81) /* old format (before ver.21) */ \ 2760 if (c1 < 81) /* old format (before ver.21) */ \
1769 { \ 2761 { \
1770 int gref = (c1) / 9; \ 2762 int gref = (c1) / 9; \
1771 int nref = (c1) % 9; \ 2763 int nref = (c1) % 9; \
1772 if (gref == 4) gref = 10; \ 2764 if (gref == 4) gref = 10; \
1773 if (nref == 4) nref = 10; \ 2765 if (nref == 4) nref = 10; \
1774 rule = COMPOSITION_ENCODE_RULE (gref, nref); \ 2766 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
1775 } \ 2767 } \
1776 else if (c1 < 93) /* new format (after ver.21) */ \ 2768 else if (c1 < 93) /* new format (after ver.21) */ \
1777 { \ 2769 { \
1778 ONE_MORE_BYTE (c2); \ 2770 ONE_MORE_BYTE (c2); \
1779 rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \ 2771 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
1780 } \ 2772 } \
1781 CODING_ADD_COMPOSITION_COMPONENT (coding, rule); \ 2773 else \
1782 coding->composition_rule_follows = 0; \ 2774 c1 = 0; \
1783 } while (0) 2775 } while (0)
1784 2776
1785 2777
1786 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */ 2778 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1787 2779
1788 static void 2780 static void
1789 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) 2781 decode_coding_iso_2022 (coding)
1790 struct coding_system *coding; 2782 struct coding_system *coding;
1791 unsigned char *source, *destination; 2783 {
1792 int src_bytes, dst_bytes; 2784 const unsigned char *src = coding->source + coding->consumed;
1793 { 2785 const unsigned char *src_end = coding->source + coding->src_bytes;
1794 unsigned char *src = source; 2786 const unsigned char *src_base;
1795 unsigned char *src_end = source + src_bytes; 2787 int *charbuf = coding->charbuf;
1796 unsigned char *dst = destination; 2788 int *charbuf_end
1797 unsigned char *dst_end = destination + dst_bytes; 2789 = charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2790 int consumed_chars = 0, consumed_chars_base;
2791 int multibytep = coding->src_multibyte;
1798 /* Charsets invoked to graphic plane 0 and 1 respectively. */ 2792 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1799 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0); 2793 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
1800 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1); 2794 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
1801 /* SRC_BASE remembers the start position in source in each loop. 2795 struct charset *charset;
1802 The loop will be exited when there's not enough source code 2796 int c;
1803 (within macro ONE_MORE_BYTE), or when there's not enough 2797 /* For handling composition sequence. */
1804 destination area to produce a character (within macro 2798 #define COMPOSING_NO 0
1805 EMIT_CHAR). */ 2799 #define COMPOSING_CHAR 1
1806 unsigned char *src_base; 2800 #define COMPOSING_RULE 2
1807 int c, charset; 2801 #define COMPOSING_COMPONENT_CHAR 3
1808 Lisp_Object translation_table; 2802 #define COMPOSING_COMPONENT_RULE 4
1809 Lisp_Object safe_chars; 2803
1810 2804 int composition_state = COMPOSING_NO;
1811 safe_chars = coding_safe_chars (coding->symbol); 2805 enum composition_method method;
1812 2806 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
1813 if (NILP (Venable_character_translation)) 2807 int component_idx;
1814 translation_table = Qnil; 2808 int component_len;
1815 else 2809 Lisp_Object attrs, eol_type, charset_list;
1816 { 2810 int char_offset = coding->produced_char;
1817 translation_table = coding->translation_table_for_decode; 2811 int last_offset = char_offset;
1818 if (NILP (translation_table)) 2812 int last_id = charset_ascii;
1819 translation_table = Vstandard_translation_table_for_decode; 2813
1820 } 2814 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1821 2815 setup_iso_safe_charsets (attrs);
1822 coding->result = CODING_FINISH_NORMAL;
1823 2816
1824 while (1) 2817 while (1)
1825 { 2818 {
1826 int c1, c2; 2819 int c1, c2;
1827 2820
1828 src_base = src; 2821 src_base = src;
2822 consumed_chars_base = consumed_chars;
2823
2824 if (charbuf >= charbuf_end)
2825 break;
2826
1829 ONE_MORE_BYTE (c1); 2827 ONE_MORE_BYTE (c1);
1830 2828
1831 /* We produce no character or one character. */ 2829 /* We produce at most one character. */
1832 switch (iso_code_class [c1]) 2830 switch (iso_code_class [c1])
1833 { 2831 {
1834 case ISO_0x20_or_0x7F: 2832 case ISO_0x20_or_0x7F:
1835 if (COMPOSING_P (coding) && coding->composition_rule_follows) 2833 if (composition_state != COMPOSING_NO)
1836 { 2834 {
1837 DECODE_COMPOSITION_RULE (c1); 2835 if (composition_state == COMPOSING_RULE
1838 continue; 2836 || composition_state == COMPOSING_COMPONENT_RULE)
1839 }
1840 if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1841 {
1842 /* This is SPACE or DEL. */
1843 charset = CHARSET_ASCII;
1844 break;
1845 }
1846 /* This is a graphic character, we fall down ... */
1847
1848 case ISO_graphic_plane_0:
1849 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1850 {
1851 DECODE_COMPOSITION_RULE (c1);
1852 continue;
1853 }
1854 charset = charset0;
1855 break;
1856
1857 case ISO_0xA0_or_0xFF:
1858 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1859 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1860 goto label_invalid_code;
1861 /* This is a graphic character, we fall down ... */
1862
1863 case ISO_graphic_plane_1:
1864 if (charset1 < 0)
1865 goto label_invalid_code;
1866 charset = charset1;
1867 break;
1868
1869 case ISO_control_0:
1870 if (COMPOSING_P (coding))
1871 DECODE_COMPOSITION_END ('1');
1872
1873 /* All ISO2022 control characters in this class have the
1874 same representation in Emacs internal format. */
1875 if (c1 == '\n'
1876 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1877 && (coding->eol_type == CODING_EOL_CR
1878 || coding->eol_type == CODING_EOL_CRLF))
1879 {
1880 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1881 goto label_end_of_loop;
1882 }
1883 charset = CHARSET_ASCII;
1884 break;
1885
1886 case ISO_control_1:
1887 if (COMPOSING_P (coding))
1888 DECODE_COMPOSITION_END ('1');
1889 goto label_invalid_code;
1890
1891 case ISO_carriage_return:
1892 if (COMPOSING_P (coding))
1893 DECODE_COMPOSITION_END ('1');
1894
1895 if (coding->eol_type == CODING_EOL_CR)
1896 c1 = '\n';
1897 else if (coding->eol_type == CODING_EOL_CRLF)
1898 {
1899 ONE_MORE_BYTE (c1);
1900 if (c1 != ISO_CODE_LF)
1901 { 2837 {
1902 src--; 2838 DECODE_COMPOSITION_RULE (c1);
1903 c1 = '\r'; 2839 components[component_idx++] = c1;
2840 composition_state--;
2841 continue;
1904 } 2842 }
1905 } 2843 }
1906 charset = CHARSET_ASCII; 2844 if (charset_id_0 < 0
2845 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2846 /* This is SPACE or DEL. */
2847 charset = CHARSET_FROM_ID (charset_ascii);
2848 else
2849 charset = CHARSET_FROM_ID (charset_id_0);
1907 break; 2850 break;
1908 2851
2852 case ISO_graphic_plane_0:
2853 if (composition_state != COMPOSING_NO)
2854 {
2855 if (composition_state == COMPOSING_RULE
2856 || composition_state == COMPOSING_COMPONENT_RULE)
2857 {
2858 DECODE_COMPOSITION_RULE (c1);
2859 components[component_idx++] = c1;
2860 composition_state--;
2861 continue;
2862 }
2863 }
2864 charset = CHARSET_FROM_ID (charset_id_0);
2865 break;
2866
2867 case ISO_0xA0_or_0xFF:
2868 if (charset_id_1 < 0
2869 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2870 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2871 goto invalid_code;
2872 /* This is a graphic character, we fall down ... */
2873
2874 case ISO_graphic_plane_1:
2875 if (charset_id_1 < 0)
2876 goto invalid_code;
2877 charset = CHARSET_FROM_ID (charset_id_1);
2878 break;
2879
2880 case ISO_carriage_return:
2881 if (c1 == '\r')
2882 {
2883 if (EQ (eol_type, Qdos))
2884 {
2885 if (src == src_end)
2886 {
2887 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
2888 goto no_more_source;
2889 }
2890 if (*src == '\n')
2891 ONE_MORE_BYTE (c1);
2892 }
2893 else if (EQ (eol_type, Qmac))
2894 c1 = '\n';
2895 }
2896 /* fall through */
2897
2898 case ISO_control_0:
2899 MAYBE_FINISH_COMPOSITION ();
2900 charset = CHARSET_FROM_ID (charset_ascii);
2901 break;
2902
2903 case ISO_control_1:
2904 MAYBE_FINISH_COMPOSITION ();
2905 goto invalid_code;
2906
1909 case ISO_shift_out: 2907 case ISO_shift_out:
1910 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT) 2908 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
1911 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0) 2909 || CODING_ISO_DESIGNATION (coding, 1) < 0)
1912 goto label_invalid_code; 2910 goto invalid_code;
1913 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; 2911 CODING_ISO_INVOCATION (coding, 0) = 1;
1914 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0); 2912 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
1915 continue; 2913 continue;
1916 2914
1917 case ISO_shift_in: 2915 case ISO_shift_in:
1918 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)) 2916 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
1919 goto label_invalid_code; 2917 goto invalid_code;
1920 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; 2918 CODING_ISO_INVOCATION (coding, 0) = 0;
1921 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0); 2919 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
1922 continue; 2920 continue;
1923 2921
1924 case ISO_single_shift_2_7: 2922 case ISO_single_shift_2_7:
1925 case ISO_single_shift_2: 2923 case ISO_single_shift_2:
1926 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)) 2924 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
1927 goto label_invalid_code; 2925 goto invalid_code;
1928 /* SS2 is handled as an escape sequence of ESC 'N' */ 2926 /* SS2 is handled as an escape sequence of ESC 'N' */
1929 c1 = 'N'; 2927 c1 = 'N';
1930 goto label_escape_sequence; 2928 goto label_escape_sequence;
1931 2929
1932 case ISO_single_shift_3: 2930 case ISO_single_shift_3:
1933 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)) 2931 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
1934 goto label_invalid_code; 2932 goto invalid_code;
1935 /* SS3 is handled as an escape sequence of ESC 'O' */ 2933 /* SS3 is handled as an escape sequence of ESC 'O' */
1936 c1 = 'O'; 2934 c1 = 'O';
1937 goto label_escape_sequence; 2935 goto label_escape_sequence;
1938 2936
1939 case ISO_control_sequence_introducer: 2937 case ISO_control_sequence_introducer:
1942 goto label_escape_sequence; 2940 goto label_escape_sequence;
1943 2941
1944 case ISO_escape: 2942 case ISO_escape:
1945 ONE_MORE_BYTE (c1); 2943 ONE_MORE_BYTE (c1);
1946 label_escape_sequence: 2944 label_escape_sequence:
1947 /* Escape sequences handled by Emacs are invocation, 2945 /* Escape sequences handled here are invocation,
1948 designation, direction specification, and character 2946 designation, direction specification, and character
1949 composition specification. */ 2947 composition specification. */
1950 switch (c1) 2948 switch (c1)
1951 { 2949 {
1952 case '&': /* revision of following character set */ 2950 case '&': /* revision of following character set */
1953 ONE_MORE_BYTE (c1); 2951 ONE_MORE_BYTE (c1);
1954 if (!(c1 >= '@' && c1 <= '~')) 2952 if (!(c1 >= '@' && c1 <= '~'))
1955 goto label_invalid_code; 2953 goto invalid_code;
1956 ONE_MORE_BYTE (c1); 2954 ONE_MORE_BYTE (c1);
1957 if (c1 != ISO_CODE_ESC) 2955 if (c1 != ISO_CODE_ESC)
1958 goto label_invalid_code; 2956 goto invalid_code;
1959 ONE_MORE_BYTE (c1); 2957 ONE_MORE_BYTE (c1);
1960 goto label_escape_sequence; 2958 goto label_escape_sequence;
1961 2959
1962 case '$': /* designation of 2-byte character set */ 2960 case '$': /* designation of 2-byte character set */
1963 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION)) 2961 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
1964 goto label_invalid_code; 2962 goto invalid_code;
1965 ONE_MORE_BYTE (c1); 2963 ONE_MORE_BYTE (c1);
1966 if (c1 >= '@' && c1 <= 'B') 2964 if (c1 >= '@' && c1 <= 'B')
1967 { /* designation of JISX0208.1978, GB2312.1980, 2965 { /* designation of JISX0208.1978, GB2312.1980,
1968 or JISX0208.1983 */ 2966 or JISX0208.1983 */
1969 DECODE_DESIGNATION (0, 2, 94, c1); 2967 DECODE_DESIGNATION (0, 2, 0, c1);
1970 } 2968 }
1971 else if (c1 >= 0x28 && c1 <= 0x2B) 2969 else if (c1 >= 0x28 && c1 <= 0x2B)
1972 { /* designation of DIMENSION2_CHARS94 character set */ 2970 { /* designation of DIMENSION2_CHARS94 character set */
1973 ONE_MORE_BYTE (c2); 2971 ONE_MORE_BYTE (c2);
1974 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2); 2972 DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
1975 } 2973 }
1976 else if (c1 >= 0x2C && c1 <= 0x2F) 2974 else if (c1 >= 0x2C && c1 <= 0x2F)
1977 { /* designation of DIMENSION2_CHARS96 character set */ 2975 { /* designation of DIMENSION2_CHARS96 character set */
1978 ONE_MORE_BYTE (c2); 2976 ONE_MORE_BYTE (c2);
1979 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2); 2977 DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
1980 } 2978 }
1981 else 2979 else
1982 goto label_invalid_code; 2980 goto invalid_code;
1983 /* We must update these variables now. */ 2981 /* We must update these variables now. */
1984 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0); 2982 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
1985 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1); 2983 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
1986 continue; 2984 continue;
1987 2985
1988 case 'n': /* invocation of locking-shift-2 */ 2986 case 'n': /* invocation of locking-shift-2 */
1989 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT) 2987 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
1990 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0) 2988 || CODING_ISO_DESIGNATION (coding, 2) < 0)
1991 goto label_invalid_code; 2989 goto invalid_code;
1992 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; 2990 CODING_ISO_INVOCATION (coding, 0) = 2;
1993 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0); 2991 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
1994 continue; 2992 continue;
1995 2993
1996 case 'o': /* invocation of locking-shift-3 */ 2994 case 'o': /* invocation of locking-shift-3 */
1997 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT) 2995 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
1998 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0) 2996 || CODING_ISO_DESIGNATION (coding, 3) < 0)
1999 goto label_invalid_code; 2997 goto invalid_code;
2000 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; 2998 CODING_ISO_INVOCATION (coding, 0) = 3;
2001 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0); 2999 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2002 continue; 3000 continue;
2003 3001
2004 case 'N': /* invocation of single-shift-2 */ 3002 case 'N': /* invocation of single-shift-2 */
2005 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT) 3003 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2006 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0) 3004 || CODING_ISO_DESIGNATION (coding, 2) < 0)
2007 goto label_invalid_code; 3005 goto invalid_code;
2008 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2); 3006 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
2009 ONE_MORE_BYTE (c1); 3007 ONE_MORE_BYTE (c1);
2010 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)) 3008 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2011 goto label_invalid_code; 3009 goto invalid_code;
2012 break; 3010 break;
2013 3011
2014 case 'O': /* invocation of single-shift-3 */ 3012 case 'O': /* invocation of single-shift-3 */
2015 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT) 3013 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2016 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0) 3014 || CODING_ISO_DESIGNATION (coding, 3) < 0)
2017 goto label_invalid_code; 3015 goto invalid_code;
2018 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3); 3016 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
2019 ONE_MORE_BYTE (c1); 3017 ONE_MORE_BYTE (c1);
2020 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)) 3018 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2021 goto label_invalid_code; 3019 goto invalid_code;
2022 break; 3020 break;
2023 3021
2024 case '0': case '2': case '3': case '4': /* start composition */ 3022 case '0': case '2': case '3': case '4': /* start composition */
3023 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3024 goto invalid_code;
2025 DECODE_COMPOSITION_START (c1); 3025 DECODE_COMPOSITION_START (c1);
2026 continue; 3026 continue;
2027 3027
2028 case '1': /* end composition */ 3028 case '1': /* end composition */
2029 DECODE_COMPOSITION_END (c1); 3029 if (composition_state == COMPOSING_NO)
3030 goto invalid_code;
3031 DECODE_COMPOSITION_END ();
2030 continue; 3032 continue;
2031 3033
2032 case '[': /* specification of direction */ 3034 case '[': /* specification of direction */
2033 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION) 3035 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
2034 goto label_invalid_code; 3036 goto invalid_code;
2035 /* For the moment, nested direction is not supported. 3037 /* For the moment, nested direction is not supported.
2036 So, `coding->mode & CODING_MODE_DIRECTION' zero means 3038 So, `coding->mode & CODING_MODE_DIRECTION' zero means
2037 left-to-right, and nonzero means right-to-left. */ 3039 left-to-right, and nonzero means right-to-left. */
2038 ONE_MORE_BYTE (c1); 3040 ONE_MORE_BYTE (c1);
2039 switch (c1) 3041 switch (c1)
2040 { 3042 {
2041 case ']': /* end of the current direction */ 3043 case ']': /* end of the current direction */
2042 coding->mode &= ~CODING_MODE_DIRECTION; 3044 coding->mode &= ~CODING_MODE_DIRECTION;
2045 case '1': /* start of left-to-right direction */ 3047 case '1': /* start of left-to-right direction */
2046 ONE_MORE_BYTE (c1); 3048 ONE_MORE_BYTE (c1);
2047 if (c1 == ']') 3049 if (c1 == ']')
2048 coding->mode &= ~CODING_MODE_DIRECTION; 3050 coding->mode &= ~CODING_MODE_DIRECTION;
2049 else 3051 else
2050 goto label_invalid_code; 3052 goto invalid_code;
2051 break; 3053 break;
2052 3054
2053 case '2': /* start of right-to-left direction */ 3055 case '2': /* start of right-to-left direction */
2054 ONE_MORE_BYTE (c1); 3056 ONE_MORE_BYTE (c1);
2055 if (c1 == ']') 3057 if (c1 == ']')
2056 coding->mode |= CODING_MODE_DIRECTION; 3058 coding->mode |= CODING_MODE_DIRECTION;
2057 else 3059 else
2058 goto label_invalid_code; 3060 goto invalid_code;
2059 break; 3061 break;
2060 3062
2061 default: 3063 default:
2062 goto label_invalid_code; 3064 goto invalid_code;
2063 } 3065 }
2064 continue; 3066 continue;
2065 3067
2066 case '%': 3068 case '%':
2067 if (COMPOSING_P (coding))
2068 DECODE_COMPOSITION_END ('1');
2069 ONE_MORE_BYTE (c1); 3069 ONE_MORE_BYTE (c1);
2070 if (c1 == '/') 3070 if (c1 == '/')
2071 { 3071 {
2072 /* CTEXT extended segment: 3072 /* CTEXT extended segment:
2073 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES-- 3073 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
2074 We keep these bytes as is for the moment. 3074 We keep these bytes as is for the moment.
2075 They may be decoded by post-read-conversion. */ 3075 They may be decoded by post-read-conversion. */
2076 int dim, M, L; 3076 int dim, M, L;
2077 int size, required; 3077 int size;
2078 int produced_chars; 3078
2079
2080 ONE_MORE_BYTE (dim); 3079 ONE_MORE_BYTE (dim);
2081 ONE_MORE_BYTE (M); 3080 ONE_MORE_BYTE (M);
2082 ONE_MORE_BYTE (L); 3081 ONE_MORE_BYTE (L);
2083 size = ((M - 128) * 128) + (L - 128); 3082 size = ((M - 128) * 128) + (L - 128);
2084 required = 8 + size * 2; 3083 if (charbuf + 8 + size > charbuf_end)
2085 if (dst + required > (dst_bytes ? dst_end : src)) 3084 goto break_loop;
2086 goto label_end_of_loop; 3085 *charbuf++ = ISO_CODE_ESC;
2087 *dst++ = ISO_CODE_ESC; 3086 *charbuf++ = '%';
2088 *dst++ = '%'; 3087 *charbuf++ = '/';
2089 *dst++ = '/'; 3088 *charbuf++ = dim;
2090 *dst++ = dim; 3089 *charbuf++ = BYTE8_TO_CHAR (M);
2091 produced_chars = 4; 3090 *charbuf++ = BYTE8_TO_CHAR (L);
2092 dst += CHAR_STRING (M, dst), produced_chars++;
2093 dst += CHAR_STRING (L, dst), produced_chars++;
2094 while (size-- > 0) 3091 while (size-- > 0)
2095 { 3092 {
2096 ONE_MORE_BYTE (c1); 3093 ONE_MORE_BYTE (c1);
2097 dst += CHAR_STRING (c1, dst), produced_chars++; 3094 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
2098 } 3095 }
2099 coding->produced_char += produced_chars;
2100 } 3096 }
2101 else if (c1 == 'G') 3097 else if (c1 == 'G')
2102 { 3098 {
2103 unsigned char *d = dst;
2104 int produced_chars;
2105
2106 /* XFree86 extension for embedding UTF-8 in CTEXT: 3099 /* XFree86 extension for embedding UTF-8 in CTEXT:
2107 ESC % G --UTF-8-BYTES-- ESC % @ 3100 ESC % G --UTF-8-BYTES-- ESC % @
2108 We keep these bytes as is for the moment. 3101 We keep these bytes as is for the moment.
2109 They may be decoded by post-read-conversion. */ 3102 They may be decoded by post-read-conversion. */
2110 if (d + 6 > (dst_bytes ? dst_end : src)) 3103 int *p = charbuf;
2111 goto label_end_of_loop; 3104
2112 *d++ = ISO_CODE_ESC; 3105 if (p + 6 > charbuf_end)
2113 *d++ = '%'; 3106 goto break_loop;
2114 *d++ = 'G'; 3107 *p++ = ISO_CODE_ESC;
2115 produced_chars = 3; 3108 *p++ = '%';
2116 while (d + 1 < (dst_bytes ? dst_end : src)) 3109 *p++ = 'G';
3110 while (p < charbuf_end)
2117 { 3111 {
2118 ONE_MORE_BYTE (c1); 3112 ONE_MORE_BYTE (c1);
2119 if (c1 == ISO_CODE_ESC 3113 if (c1 == ISO_CODE_ESC
2120 && src + 1 < src_end 3114 && src + 1 < src_end
2121 && src[0] == '%' 3115 && src[0] == '%'
2122 && src[1] == '@') 3116 && src[1] == '@')
2123 break; 3117 break;
2124 d += CHAR_STRING (c1, d), produced_chars++; 3118 *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
2125 } 3119 }
2126 if (d + 3 > (dst_bytes ? dst_end : src)) 3120 if (p + 3 > charbuf_end)
2127 goto label_end_of_loop; 3121 goto break_loop;
2128 *d++ = ISO_CODE_ESC; 3122 *p++ = ISO_CODE_ESC;
2129 *d++ = '%'; 3123 *p++ = '%';
2130 *d++ = '@'; 3124 *p++ = '@';
2131 dst = d; 3125 charbuf = p;
2132 coding->produced_char += produced_chars + 3;
2133 } 3126 }
2134 else 3127 else
2135 goto label_invalid_code; 3128 goto invalid_code;
2136 continue; 3129 continue;
3130 break;
2137 3131
2138 default: 3132 default:
2139 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION)) 3133 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
2140 goto label_invalid_code; 3134 goto invalid_code;
2141 if (c1 >= 0x28 && c1 <= 0x2B) 3135 if (c1 >= 0x28 && c1 <= 0x2B)
2142 { /* designation of DIMENSION1_CHARS94 character set */ 3136 { /* designation of DIMENSION1_CHARS94 character set */
2143 ONE_MORE_BYTE (c2); 3137 ONE_MORE_BYTE (c2);
2144 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2); 3138 DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
2145 } 3139 }
2146 else if (c1 >= 0x2C && c1 <= 0x2F) 3140 else if (c1 >= 0x2C && c1 <= 0x2F)
2147 { /* designation of DIMENSION1_CHARS96 character set */ 3141 { /* designation of DIMENSION1_CHARS96 character set */
2148 ONE_MORE_BYTE (c2); 3142 ONE_MORE_BYTE (c2);
2149 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2); 3143 DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
2150 } 3144 }
2151 else 3145 else
2152 goto label_invalid_code; 3146 goto invalid_code;
2153 /* We must update these variables now. */ 3147 /* We must update these variables now. */
2154 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0); 3148 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2155 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1); 3149 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2156 continue; 3150 continue;
2157 } 3151 }
2158 } 3152 }
2159 3153
3154 if (charset->id != charset_ascii
3155 && last_id != charset->id)
3156 {
3157 if (last_id != charset_ascii)
3158 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3159 last_id = charset->id;
3160 last_offset = char_offset;
3161 }
3162
2160 /* Now we know CHARSET and 1st position code C1 of a character. 3163 /* Now we know CHARSET and 1st position code C1 of a character.
2161 Produce a multibyte sequence for that character while getting 3164 Produce a decoded character while getting 2nd position code
2162 2nd position code C2 if necessary. */ 3165 C2 if necessary. */
2163 if (CHARSET_DIMENSION (charset) == 2) 3166 c1 &= 0x7F;
3167 if (CHARSET_DIMENSION (charset) > 1)
2164 { 3168 {
2165 ONE_MORE_BYTE (c2); 3169 ONE_MORE_BYTE (c2);
2166 if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0) 3170 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
2167 /* C2 is not in a valid range. */ 3171 /* C2 is not in a valid range. */
2168 goto label_invalid_code; 3172 goto invalid_code;
2169 } 3173 c1 = (c1 << 8) | (c2 & 0x7F);
2170 c = DECODE_ISO_CHARACTER (charset, c1, c2); 3174 if (CHARSET_DIMENSION (charset) > 2)
2171 EMIT_CHAR (c); 3175 {
3176 ONE_MORE_BYTE (c2);
3177 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3178 /* C2 is not in a valid range. */
3179 goto invalid_code;
3180 c1 = (c1 << 8) | (c2 & 0x7F);
3181 }
3182 }
3183
3184 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3185 if (c < 0)
3186 {
3187 MAYBE_FINISH_COMPOSITION ();
3188 for (; src_base < src; src_base++, char_offset++)
3189 {
3190 if (ASCII_BYTE_P (*src_base))
3191 *charbuf++ = *src_base;
3192 else
3193 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3194 }
3195 }
3196 else if (composition_state == COMPOSING_NO)
3197 {
3198 *charbuf++ = c;
3199 char_offset++;
3200 }
3201 else
3202 {
3203 components[component_idx++] = c;
3204 if (method == COMPOSITION_WITH_RULE
3205 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3206 && composition_state == COMPOSING_COMPONENT_CHAR))
3207 composition_state++;
3208 }
2172 continue; 3209 continue;
2173 3210
2174 label_invalid_code: 3211 invalid_code:
3212 MAYBE_FINISH_COMPOSITION ();
3213 src = src_base;
3214 consumed_chars = consumed_chars_base;
3215 ONE_MORE_BYTE (c);
3216 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3217 char_offset++;
2175 coding->errors++; 3218 coding->errors++;
2176 if (COMPOSING_P (coding)) 3219 continue;
2177 DECODE_COMPOSITION_END ('1'); 3220
2178 src = src_base; 3221 break_loop:
2179 c = *src++; 3222 break;
2180 EMIT_CHAR (c); 3223 }
2181 } 3224
2182 3225 no_more_source:
2183 label_end_of_loop: 3226 if (last_id != charset_ascii)
2184 coding->consumed = coding->consumed_char = src_base - source; 3227 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2185 coding->produced = dst - destination; 3228 coding->consumed_char += consumed_chars_base;
2186 return; 3229 coding->consumed = src_base - coding->source;
3230 coding->charbuf_used = charbuf - coding->charbuf;
2187 } 3231 }
2188 3232
2189 3233
2190 /* ISO2022 encoding stuff. */ 3234 /* ISO2022 encoding stuff. */
2191 3235
2192 /* 3236 /*
2193 It is not enough to say just "ISO2022" on encoding, we have to 3237 It is not enough to say just "ISO2022" on encoding, we have to
2194 specify more details. In Emacs, each ISO2022 coding system 3238 specify more details. In Emacs, each coding system of ISO2022
2195 variant has the following specifications: 3239 variant has the following specifications:
2196 1. Initial designation to G0 through G3. 3240 1. Initial designation to G0 through G3.
2197 2. Allows short-form designation? 3241 2. Allows short-form designation?
2198 3. ASCII should be designated to G0 before control characters? 3242 3. ASCII should be designated to G0 before control characters?
2199 4. ASCII should be designated to G0 at end of line? 3243 4. ASCII should be designated to G0 at end of line?
2200 5. 7-bit environment or 8-bit environment? 3244 5. 7-bit environment or 8-bit environment?
2201 6. Use locking-shift? 3245 6. Use locking-shift?
2202 7. Use Single-shift? 3246 7. Use Single-shift?
2203 And the following two are only for Japanese: 3247 And the following two are only for Japanese:
2204 8. Use ASCII in place of JISX0201-1976-Roman? 3248 8. Use ASCII in place of JISX0201-1976-Roman?
2205 9. Use JISX0208-1983 in place of JISX0208-1978? 3249 9. Use JISX0208-1983 in place of JISX0208-1978?
2206 These specifications are encoded in `coding->flags' as flag bits 3250 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
2207 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more 3251 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
2208 details. 3252 details.
2209 */ 3253 */
2210 3254
2211 /* Produce codes (escape sequence) for designating CHARSET to graphic 3255 /* Produce codes (escape sequence) for designating CHARSET to graphic
2212 register REG at DST, and increment DST. If <final-char> of CHARSET is 3256 register REG at DST, and increment DST. If <final-char> of CHARSET is
2213 '@', 'A', or 'B' and the coding system CODING allows, produce 3257 '@', 'A', or 'B' and the coding system CODING allows, produce
2214 designation sequence of short-form. */ 3258 designation sequence of short-form. */
2215 3259
2216 #define ENCODE_DESIGNATION(charset, reg, coding) \ 3260 #define ENCODE_DESIGNATION(charset, reg, coding) \
2217 do { \ 3261 do { \
2218 unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \ 3262 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
2219 char *intermediate_char_94 = "()*+"; \ 3263 char *intermediate_char_94 = "()*+"; \
2220 char *intermediate_char_96 = ",-./"; \ 3264 char *intermediate_char_96 = ",-./"; \
2221 int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset); \ 3265 int revision = -1; \
2222 \ 3266 int c; \
2223 if (revision < 255) \ 3267 \
3268 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3269 revision = CHARSET_ISO_REVISION (charset); \
3270 \
3271 if (revision >= 0) \
2224 { \ 3272 { \
2225 *dst++ = ISO_CODE_ESC; \ 3273 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
2226 *dst++ = '&'; \ 3274 EMIT_ONE_BYTE ('@' + revision); \
2227 *dst++ = '@' + revision; \
2228 } \ 3275 } \
2229 *dst++ = ISO_CODE_ESC; \ 3276 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
2230 if (CHARSET_DIMENSION (charset) == 1) \ 3277 if (CHARSET_DIMENSION (charset) == 1) \
2231 { \ 3278 { \
2232 if (CHARSET_CHARS (charset) == 94) \ 3279 if (! CHARSET_ISO_CHARS_96 (charset)) \
2233 *dst++ = (unsigned char) (intermediate_char_94[reg]); \ 3280 c = intermediate_char_94[reg]; \
2234 else \ 3281 else \
2235 *dst++ = (unsigned char) (intermediate_char_96[reg]); \ 3282 c = intermediate_char_96[reg]; \
3283 EMIT_ONE_ASCII_BYTE (c); \
2236 } \ 3284 } \
2237 else \ 3285 else \
2238 { \ 3286 { \
2239 *dst++ = '$'; \ 3287 EMIT_ONE_ASCII_BYTE ('$'); \
2240 if (CHARSET_CHARS (charset) == 94) \ 3288 if (! CHARSET_ISO_CHARS_96 (charset)) \
2241 { \ 3289 { \
2242 if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \ 3290 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
2243 || reg != 0 \ 3291 || reg != 0 \
2244 || final_char < '@' || final_char > 'B') \ 3292 || final_char < '@' || final_char > 'B') \
2245 *dst++ = (unsigned char) (intermediate_char_94[reg]); \ 3293 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
2246 } \ 3294 } \
2247 else \ 3295 else \
2248 *dst++ = (unsigned char) (intermediate_char_96[reg]); \ 3296 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
2249 } \ 3297 } \
2250 *dst++ = final_char; \ 3298 EMIT_ONE_ASCII_BYTE (final_char); \
2251 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \ 3299 \
3300 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
2252 } while (0) 3301 } while (0)
3302
2253 3303
2254 /* The following two macros produce codes (control character or escape 3304 /* The following two macros produce codes (control character or escape
2255 sequence) for ISO2022 single-shift functions (single-shift-2 and 3305 sequence) for ISO2022 single-shift functions (single-shift-2 and
2256 single-shift-3). */ 3306 single-shift-3). */
2257 3307
2258 #define ENCODE_SINGLE_SHIFT_2 \ 3308 #define ENCODE_SINGLE_SHIFT_2 \
2259 do { \ 3309 do { \
2260 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \ 3310 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
2261 *dst++ = ISO_CODE_ESC, *dst++ = 'N'; \ 3311 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
2262 else \ 3312 else \
2263 *dst++ = ISO_CODE_SS2; \ 3313 EMIT_ONE_BYTE (ISO_CODE_SS2); \
2264 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \ 3314 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
2265 } while (0) 3315 } while (0)
2266 3316
2267 #define ENCODE_SINGLE_SHIFT_3 \ 3317
2268 do { \ 3318 #define ENCODE_SINGLE_SHIFT_3 \
2269 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \ 3319 do { \
2270 *dst++ = ISO_CODE_ESC, *dst++ = 'O'; \ 3320 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
2271 else \ 3321 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
2272 *dst++ = ISO_CODE_SS3; \ 3322 else \
2273 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \ 3323 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3324 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
2274 } while (0) 3325 } while (0)
3326
2275 3327
2276 /* The following four macros produce codes (control character or 3328 /* The following four macros produce codes (control character or
2277 escape sequence) for ISO2022 locking-shift functions (shift-in, 3329 escape sequence) for ISO2022 locking-shift functions (shift-in,
2278 shift-out, locking-shift-2, and locking-shift-3). */ 3330 shift-out, locking-shift-2, and locking-shift-3). */
2279 3331
2280 #define ENCODE_SHIFT_IN \ 3332 #define ENCODE_SHIFT_IN \
2281 do { \ 3333 do { \
2282 *dst++ = ISO_CODE_SI; \ 3334 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
2283 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \ 3335 CODING_ISO_INVOCATION (coding, 0) = 0; \
2284 } while (0) 3336 } while (0)
2285 3337
2286 #define ENCODE_SHIFT_OUT \ 3338
2287 do { \ 3339 #define ENCODE_SHIFT_OUT \
2288 *dst++ = ISO_CODE_SO; \ 3340 do { \
2289 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \ 3341 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3342 CODING_ISO_INVOCATION (coding, 0) = 1; \
2290 } while (0) 3343 } while (0)
2291 3344
2292 #define ENCODE_LOCKING_SHIFT_2 \ 3345
2293 do { \ 3346 #define ENCODE_LOCKING_SHIFT_2 \
2294 *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \ 3347 do { \
2295 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \ 3348 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3349 CODING_ISO_INVOCATION (coding, 0) = 2; \
2296 } while (0) 3350 } while (0)
2297 3351
/* Emit the locking-shift-3 escape sequence (ESC 'o') and record in
   CODING that graphic plane 0 now holds graphic register 3.  The
   final byte must be 'o'; ESC 'n' is the locking-shift-2 sequence and
   would make a decoder invoke register 2 instead.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
3358
2303 3359
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be a modifiable lvalue: it is replaced by the
   JISX0201-Roman charset when CODING has CODING_ISO_FLAG_USE_ROMAN
   set and CHARSET is ASCII.  The expansion refers to `coding', `dst'
   and `produced_chars' of the enclosing function.  C1 may be any
   integer expression; it is parenthesized at every use.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single-shift code was just emitted; it covers exactly      \
           this one character.  */                                      \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
3402
2336 3403
/* Emit a two-byte (DIMENSION2) character of CHARSET whose
   position-codes are C1 and C2, first emitting whatever designation
   and invocation codes are needed to make CHARSET usable.

   CHARSET must be a modifiable lvalue: when CODING has
   CODING_ISO_FLAG_USE_OLDJIS set, JISX0208 is replaced by
   JISX0208-1978.  The loop repeats until CHARSET is reachable through
   a pending single shift or through graphic plane 0 or 1.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if (id == charset_jisx0208                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* The single shift already emitted applies to this one         \
           character only.  */                                          \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        else                                                            \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        /* Invoked in graphic plane 0: 7-bit form.  */                  \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        /* Invoked in graphic plane 1: high bit set.  */                \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Not invoked anywhere yet: designate and/or invoke CHARSET,     \
         then go around again to emit the character itself.  */         \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
2369 3446
/* Encode character C in CHARSET.  The code point obtained from
   ENCODE_CHAR is emitted as a single position-code when CHARSET has
   dimension 1; otherwise it is split into its high part (code >> 8)
   and low byte (code & 0xFF) and emitted as two position-codes.
   CHARSET must be a modifiable lvalue because the DIMENSION macros
   may replace it.  */

#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    int code = ENCODE_CHAR ((charset),(c));                                \
                                                                           \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      {                                                                    \
        ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                 \
      }                                                                    \
  } while (0)
2399 3457
2400 3458
2401 /* Instead of encoding character C, produce one or two `?'s. */
2402
2403 #define ENCODE_UNSAFE_CHARACTER(c) \
2404 do { \
2405 ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER); \
2406 if (CHARSET_WIDTH (CHAR_CHARSET (c)) > 1) \
2407 ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER); \
2408 } while (0)
2409
2410
2411 /* Produce designation and invocation codes at a place pointed by DST 3459 /* Produce designation and invocation codes at a place pointed by DST
2412 to use CHARSET. The element `spec.iso2022' of *CODING is updated. 3460 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
2413 Return new DST. */ 3461 Return new DST. */
2414 3462
2415 unsigned char * 3463 unsigned char *
2416 encode_invocation_designation (charset, coding, dst) 3464 encode_invocation_designation (charset, coding, dst, p_nchars)
2417 int charset; 3465 struct charset *charset;
2418 struct coding_system *coding; 3466 struct coding_system *coding;
2419 unsigned char *dst; 3467 unsigned char *dst;
2420 { 3468 int *p_nchars;
3469 {
3470 int multibytep = coding->dst_multibyte;
3471 int produced_chars = *p_nchars;
2421 int reg; /* graphic register number */ 3472 int reg; /* graphic register number */
3473 int id = CHARSET_ID (charset);
2422 3474
2423 /* At first, check designations. */ 3475 /* At first, check designations. */
2424 for (reg = 0; reg < 4; reg++) 3476 for (reg = 0; reg < 4; reg++)
2425 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg)) 3477 if (id == CODING_ISO_DESIGNATION (coding, reg))
2426 break; 3478 break;
2427 3479
2428 if (reg >= 4) 3480 if (reg >= 4)
2429 { 3481 {
2430 /* CHARSET is not yet designated to any graphic registers. */ 3482 /* CHARSET is not yet designated to any graphic registers. */
2431 /* At first check the requested designation. */ 3483 /* At first check the requested designation. */
2432 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset); 3484 reg = CODING_ISO_REQUEST (coding, id);
2433 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION) 3485 if (reg < 0)
2434 /* Since CHARSET requests no special designation, designate it 3486 /* Since CHARSET requests no special designation, designate it
2435 to graphic register 0. */ 3487 to graphic register 0. */
2436 reg = 0; 3488 reg = 0;
2437 3489
2438 ENCODE_DESIGNATION (charset, reg, coding); 3490 ENCODE_DESIGNATION (charset, reg, coding);
2439 } 3491 }
2440 3492
2441 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg 3493 if (CODING_ISO_INVOCATION (coding, 0) != reg
2442 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg) 3494 && CODING_ISO_INVOCATION (coding, 1) != reg)
2443 { 3495 {
2444 /* Since the graphic register REG is not invoked to any graphic 3496 /* Since the graphic register REG is not invoked to any graphic
2445 planes, invoke it to graphic plane 0. */ 3497 planes, invoke it to graphic plane 0. */
2446 switch (reg) 3498 switch (reg)
2447 { 3499 {
2452 case 1: /* graphic register 1 */ 3504 case 1: /* graphic register 1 */
2453 ENCODE_SHIFT_OUT; 3505 ENCODE_SHIFT_OUT;
2454 break; 3506 break;
2455 3507
2456 case 2: /* graphic register 2 */ 3508 case 2: /* graphic register 2 */
2457 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT) 3509 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2458 ENCODE_SINGLE_SHIFT_2; 3510 ENCODE_SINGLE_SHIFT_2;
2459 else 3511 else
2460 ENCODE_LOCKING_SHIFT_2; 3512 ENCODE_LOCKING_SHIFT_2;
2461 break; 3513 break;
2462 3514
2463 case 3: /* graphic register 3 */ 3515 case 3: /* graphic register 3 */
2464 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT) 3516 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2465 ENCODE_SINGLE_SHIFT_3; 3517 ENCODE_SINGLE_SHIFT_3;
2466 else 3518 else
2467 ENCODE_LOCKING_SHIFT_3; 3519 ENCODE_LOCKING_SHIFT_3;
2468 break; 3520 break;
2469 } 3521 }
2470 } 3522 }
2471 3523
3524 *p_nchars = produced_chars;
2472 return dst; 3525 return dst;
2473 } 3526 }
2474
2475 /* Produce 2-byte codes for encoded composition rule RULE. */
2476
2477 #define ENCODE_COMPOSITION_RULE(rule) \
2478 do { \
2479 int gref, nref; \
2480 COMPOSITION_DECODE_RULE (rule, gref, nref); \
2481 *dst++ = 32 + 81 + gref; \
2482 *dst++ = 32 + nref; \
2483 } while (0)
2484
2485 /* Produce codes for indicating the start of a composition sequence
2486 (ESC 0, ESC 3, or ESC 4). DATA points to an array of integers
2487 which specify information about the composition. See the comment
2488 in coding.h for the format of DATA. */
2489
2490 #define ENCODE_COMPOSITION_START(coding, data) \
2491 do { \
2492 coding->composing = data[3]; \
2493 *dst++ = ISO_CODE_ESC; \
2494 if (coding->composing == COMPOSITION_RELATIVE) \
2495 *dst++ = '0'; \
2496 else \
2497 { \
2498 *dst++ = (coding->composing == COMPOSITION_WITH_ALTCHARS \
2499 ? '3' : '4'); \
2500 coding->cmp_data_index = coding->cmp_data_start + 4; \
2501 coding->composition_rule_follows = 0; \
2502 } \
2503 } while (0)
2504
2505 /* Produce codes for indicating the end of the current composition. */
2506
2507 #define ENCODE_COMPOSITION_END(coding, data) \
2508 do { \
2509 *dst++ = ISO_CODE_ESC; \
2510 *dst++ = '1'; \
2511 coding->cmp_data_start += data[0]; \
2512 coding->composing = COMPOSITION_NO; \
2513 if (coding->cmp_data_start == coding->cmp_data->used \
2514 && coding->cmp_data->next) \
2515 { \
2516 coding->cmp_data = coding->cmp_data->next; \
2517 coding->cmp_data_start = 0; \
2518 } \
2519 } while (0)
2520
2521 /* Produce composition start sequence ESC 0. Here, this sequence
2522 doesn't mean the start of a new composition but means that we have
2523 just produced components (alternate chars and composition rules) of
2524 the composition and the actual text follows in SRC. */
2525
2526 #define ENCODE_COMPOSITION_FAKE_START(coding) \
2527 do { \
2528 *dst++ = ISO_CODE_ESC; \
2529 *dst++ = '0'; \
2530 coding->composing = COMPOSITION_RELATIVE; \
2531 } while (0)
2532 3527
2533 /* The following three macros produce codes for indicating direction 3528 /* The following three macros produce codes for indicating direction
2534 of text. */ 3529 of text. */
/* Emit a control sequence introducer: the 7-bit form ESC '[' when
   CODING has CODING_ISO_FLAG_SEVEN_BITS set, else the single 8-bit
   CSI byte.  The flag is tested as a bit (`&'), as the other macros
   in this file do; comparing the whole flag word with `==' selected
   the 7-bit form only when no other flag was set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
  } while (0)
2542 3537
/* Emit the control sequence announcing right-to-left text: a control
   sequence introducer followed by '2' ']'.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   is used without an argument list; writing `(dst)' after it would
   leave a stray `(dst)' behind its `while (0)' and fail to compile.  */
#define ENCODE_DIRECTION_R2L()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('2', ']');            \
  } while (0)
3544
3545
/* Emit the control sequence announcing left-to-right text: a control
   sequence introducer followed by '0' ']'.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   is used without an argument list; writing `(dst)' after it would
   leave a stray `(dst)' behind its `while (0)' and fail to compile.  */
#define ENCODE_DIRECTION_L2R()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('0', ']');            \
  } while (0)
3551
2548 3552
/* Bring the graphic planes and registers back to their initial
   state: shift in when graphic plane 0 does not hold register 0, then
   re-designate every register whose initial charset is set (>= 0) and
   differs from what is currently designated there.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    struct charset *charset;                                            \
    int reg = 0;                                                        \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    while (reg < 4)                                                     \
      {                                                                 \
        if (CODING_ISO_INITIAL (coding, reg) >= 0                       \
            && (CODING_ISO_DESIGNATION (coding, reg)                    \
                != CODING_ISO_INITIAL (coding, reg)))                   \
          {                                                             \
            charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
            ENCODE_DESIGNATION (charset, reg, coding);                  \
          }                                                             \
        reg++;                                                          \
      }                                                                 \
  } while (0)
3571
2563 3572
2564 /* Produce designation sequences of charsets in the line started from 3573 /* Produce designation sequences of charsets in the line started from
2565 SRC to a place pointed by DST, and return updated DST. 3574 SRC to a place pointed by DST, and return updated DST.
2566 3575
2567 If the current block ends before any end-of-line, we may fail to 3576 If the current block ends before any end-of-line, we may fail to
2568 find all the necessary designations. */ 3577 find all the necessary designations. */
2569 3578
2570 static unsigned char * 3579 static unsigned char *
2571 encode_designation_at_bol (coding, translation_table, src, src_end, dst) 3580 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
2572 struct coding_system *coding; 3581 struct coding_system *coding;
2573 Lisp_Object translation_table; 3582 int *charbuf, *charbuf_end;
2574 unsigned char *src, *src_end, *dst; 3583 unsigned char *dst;
2575 { 3584 {
2576 int charset, c, found = 0, reg; 3585 struct charset *charset;
2577 /* Table of charsets to be designated to each graphic register. */ 3586 /* Table of charsets to be designated to each graphic register. */
2578 int r[4]; 3587 int r[4];
3588 int c, found = 0, reg;
3589 int produced_chars = 0;
3590 int multibytep = coding->dst_multibyte;
3591 Lisp_Object attrs;
3592 Lisp_Object charset_list;
3593
3594 attrs = CODING_ID_ATTRS (coding->id);
3595 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3596 if (EQ (charset_list, Qiso_2022))
3597 charset_list = Viso_2022_charset_list;
2579 3598
2580 for (reg = 0; reg < 4; reg++) 3599 for (reg = 0; reg < 4; reg++)
2581 r[reg] = -1; 3600 r[reg] = -1;
2582 3601
2583 while (found < 4) 3602 while (found < 4)
2584 { 3603 {
2585 ONE_MORE_CHAR (c); 3604 int id;
3605
3606 c = *charbuf++;
2586 if (c == '\n') 3607 if (c == '\n')
2587 break; 3608 break;
2588 3609 charset = char_charset (c, charset_list, NULL);
2589 charset = CHAR_CHARSET (c); 3610 id = CHARSET_ID (charset);
2590 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset); 3611 reg = CODING_ISO_REQUEST (coding, id);
2591 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0) 3612 if (reg >= 0 && r[reg] < 0)
2592 { 3613 {
2593 found++; 3614 found++;
2594 r[reg] = charset; 3615 r[reg] = id;
2595 } 3616 }
2596 } 3617 }
2597 3618
2598 label_end_of_loop:
2599 if (found) 3619 if (found)
2600 { 3620 {
2601 for (reg = 0; reg < 4; reg++) 3621 for (reg = 0; reg < 4; reg++)
2602 if (r[reg] >= 0 3622 if (r[reg] >= 0
2603 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg]) 3623 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
2604 ENCODE_DESIGNATION (r[reg], reg, coding); 3624 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
2605 } 3625 }
2606 3626
2607 return dst; 3627 return dst;
2608 } 3628 }
2609 3629
2610 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */ 3630 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
2611 3631
2612 static void 3632 static int
2613 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) 3633 encode_coding_iso_2022 (coding)
2614 struct coding_system *coding; 3634 struct coding_system *coding;
2615 unsigned char *source, *destination; 3635 {
2616 int src_bytes, dst_bytes; 3636 int multibytep = coding->dst_multibyte;
2617 { 3637 int *charbuf = coding->charbuf;
2618 unsigned char *src = source; 3638 int *charbuf_end = charbuf + coding->charbuf_used;
2619 unsigned char *src_end = source + src_bytes; 3639 unsigned char *dst = coding->destination + coding->produced;
2620 unsigned char *dst = destination; 3640 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2621 unsigned char *dst_end = destination + dst_bytes; 3641 int safe_room = 16;
2622 /* Since the maximum bytes produced by each loop is 20, we subtract 19 3642 int bol_designation
2623 from DST_END to assure overflow checking is necessary only at the 3643 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
2624 head of loop. */ 3644 && CODING_ISO_BOL (coding));
2625 unsigned char *adjusted_dst_end = dst_end - 19; 3645 int produced_chars = 0;
2626 /* SRC_BASE remembers the start position in source in each loop. 3646 Lisp_Object attrs, eol_type, charset_list;
2627 The loop will be exited when there's not enough source text to 3647 int ascii_compatible;
2628 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2629 there's not enough destination area to produce encoded codes
2630 (within macro EMIT_BYTES). */
2631 unsigned char *src_base;
2632 int c; 3648 int c;
2633 Lisp_Object translation_table; 3649 int preferred_charset_id = -1;
2634 Lisp_Object safe_chars; 3650
2635 3651 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2636 if (coding->flags & CODING_FLAG_ISO_SAFE) 3652 setup_iso_safe_charsets (attrs);
2637 coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR; 3653 /* Charset list may have been changed. */
2638 3654 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
2639 safe_chars = coding_safe_chars (coding->symbol); 3655 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
2640 3656
2641 if (NILP (Venable_character_translation)) 3657 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
2642 translation_table = Qnil; 3658
2643 else 3659 while (charbuf < charbuf_end)
2644 { 3660 {
2645 translation_table = coding->translation_table_for_encode; 3661 ASSURE_DESTINATION (safe_room);
2646 if (NILP (translation_table)) 3662
2647 translation_table = Vstandard_translation_table_for_encode; 3663 if (bol_designation)
2648 } 3664 {
2649 3665 unsigned char *dst_prev = dst;
2650 coding->consumed_char = 0; 3666
2651 coding->errors = 0;
2652 while (1)
2653 {
2654 src_base = src;
2655
2656 if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2657 {
2658 coding->result = CODING_FINISH_INSUFFICIENT_DST;
2659 break;
2660 }
2661
2662 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2663 && CODING_SPEC_ISO_BOL (coding))
2664 {
2665 /* We have to produce designation sequences if any now. */ 3667 /* We have to produce designation sequences if any now. */
2666 dst = encode_designation_at_bol (coding, translation_table, 3668 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
2667 src, src_end, dst); 3669 bol_designation = 0;
2668 CODING_SPEC_ISO_BOL (coding) = 0; 3670 /* We are sure that designation sequences are all ASCII bytes. */
2669 } 3671 produced_chars += dst - dst_prev;
2670 3672 }
2671 /* Check composition start and end. */ 3673
2672 if (coding->composing != COMPOSITION_DISABLED 3674 c = *charbuf++;
2673 && coding->cmp_data_start < coding->cmp_data->used) 3675
2674 { 3676 if (c < 0)
2675 struct composition_data *cmp_data = coding->cmp_data; 3677 {
2676 int *data = cmp_data->data + coding->cmp_data_start; 3678 /* Handle an annotation. */
2677 int this_pos = cmp_data->char_offset + coding->consumed_char; 3679 switch (*charbuf)
2678
2679 if (coding->composing == COMPOSITION_RELATIVE)
2680 { 3680 {
2681 if (this_pos == data[2]) 3681 case CODING_ANNOTATE_COMPOSITION_MASK:
3682 /* Not yet implemented. */
3683 break;
3684 case CODING_ANNOTATE_CHARSET_MASK:
3685 preferred_charset_id = charbuf[3];
3686 if (preferred_charset_id >= 0
3687 && NILP (Fmemq (make_number (preferred_charset_id),
3688 charset_list)))
3689 preferred_charset_id = -1;
3690 break;
3691 default:
3692 abort ();
3693 }
3694 charbuf += -c - 1;
3695 continue;
3696 }
3697
3698 /* Now encode the character C. */
3699 if (c < 0x20 || c == 0x7F)
3700 {
3701 if (c == '\n'
3702 || (c == '\r' && EQ (eol_type, Qmac)))
3703 {
3704 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3705 ENCODE_RESET_PLANE_AND_REGISTER ();
3706 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
2682 { 3707 {
2683 ENCODE_COMPOSITION_END (coding, data); 3708 int i;
2684 cmp_data = coding->cmp_data; 3709
2685 data = cmp_data->data + coding->cmp_data_start; 3710 for (i = 0; i < 4; i++)
3711 CODING_ISO_DESIGNATION (coding, i)
3712 = CODING_ISO_INITIAL (coding, i);
3713 }
3714 bol_designation
3715 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3716 }
3717 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3718 ENCODE_RESET_PLANE_AND_REGISTER ();
3719 EMIT_ONE_ASCII_BYTE (c);
3720 }
3721 else if (ASCII_CHAR_P (c))
3722 {
3723 if (ascii_compatible)
3724 EMIT_ONE_ASCII_BYTE (c);
3725 else
3726 {
3727 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3728 ENCODE_ISO_CHARACTER (charset, c);
3729 }
3730 }
3731 else if (CHAR_BYTE8_P (c))
3732 {
3733 c = CHAR_TO_BYTE8 (c);
3734 EMIT_ONE_BYTE (c);
3735 }
3736 else
3737 {
3738 struct charset *charset;
3739
3740 if (preferred_charset_id >= 0)
3741 {
3742 charset = CHARSET_FROM_ID (preferred_charset_id);
3743 if (! CHAR_CHARSET_P (c, charset))
3744 charset = char_charset (c, charset_list, NULL);
3745 }
3746 else
3747 charset = char_charset (c, charset_list, NULL);
3748 if (!charset)
3749 {
3750 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3751 {
3752 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3753 charset = CHARSET_FROM_ID (charset_ascii);
3754 }
3755 else
3756 {
3757 c = coding->default_char;
3758 charset = char_charset (c, charset_list, NULL);
2686 } 3759 }
2687 } 3760 }
2688 else if (COMPOSING_P (coding)) 3761 ENCODE_ISO_CHARACTER (charset, c);
2689 { 3762 }
2690 /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR */ 3763 }
2691 if (coding->cmp_data_index == coding->cmp_data_start + data[0]) 3764
2692 /* We have consumed components of the composition. 3765 if (coding->mode & CODING_MODE_LAST_BLOCK
2693 What follows in SRC is the composition's base 3766 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
2694 text. */ 3767 {
2695 ENCODE_COMPOSITION_FAKE_START (coding); 3768 ASSURE_DESTINATION (safe_room);
2696 else 3769 ENCODE_RESET_PLANE_AND_REGISTER ();
2697 { 3770 }
2698 int c = cmp_data->data[coding->cmp_data_index++]; 3771 coding->result = CODING_RESULT_SUCCESS;
2699 if (coding->composition_rule_follows) 3772 CODING_ISO_BOL (coding) = bol_designation;
2700 { 3773 coding->produced_char += produced_chars;
2701 ENCODE_COMPOSITION_RULE (c); 3774 coding->produced = dst - coding->destination;
2702 coding->composition_rule_follows = 0; 3775 return 0;
2703 }
2704 else
2705 {
2706 if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2707 && ! CODING_SAFE_CHAR_P (safe_chars, c))
2708 ENCODE_UNSAFE_CHARACTER (c);
2709 else
2710 ENCODE_ISO_CHARACTER (c);
2711 if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2712 coding->composition_rule_follows = 1;
2713 }
2714 continue;
2715 }
2716 }
2717 if (!COMPOSING_P (coding))
2718 {
2719 if (this_pos == data[1])
2720 {
2721 ENCODE_COMPOSITION_START (coding, data);
2722 continue;
2723 }
2724 }
2725 }
2726
2727 ONE_MORE_CHAR (c);
2728
2729 /* Now encode the character C. */
2730 if (c < 0x20 || c == 0x7F)
2731 {
2732 if (c == '\r')
2733 {
2734 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2735 {
2736 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2737 ENCODE_RESET_PLANE_AND_REGISTER;
2738 *dst++ = c;
2739 continue;
2740 }
2741 /* fall down to treat '\r' as '\n' ... */
2742 c = '\n';
2743 }
2744 if (c == '\n')
2745 {
2746 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2747 ENCODE_RESET_PLANE_AND_REGISTER;
2748 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2749 bcopy (coding->spec.iso2022.initial_designation,
2750 coding->spec.iso2022.current_designation,
2751 sizeof coding->spec.iso2022.initial_designation);
2752 if (coding->eol_type == CODING_EOL_LF
2753 || coding->eol_type == CODING_EOL_UNDECIDED)
2754 *dst++ = ISO_CODE_LF;
2755 else if (coding->eol_type == CODING_EOL_CRLF)
2756 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2757 else
2758 *dst++ = ISO_CODE_CR;
2759 CODING_SPEC_ISO_BOL (coding) = 1;
2760 }
2761 else
2762 {
2763 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2764 ENCODE_RESET_PLANE_AND_REGISTER;
2765 *dst++ = c;
2766 }
2767 }
2768 else if (ASCII_BYTE_P (c))
2769 ENCODE_ISO_CHARACTER (c);
2770 else if (SINGLE_BYTE_CHAR_P (c))
2771 {
2772 *dst++ = c;
2773 coding->errors++;
2774 }
2775 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2776 && ! CODING_SAFE_CHAR_P (safe_chars, c))
2777 ENCODE_UNSAFE_CHARACTER (c);
2778 else
2779 ENCODE_ISO_CHARACTER (c);
2780
2781 coding->consumed_char++;
2782 }
2783
2784 label_end_of_loop:
2785 coding->consumed = src_base - source;
2786 coding->produced = coding->produced_char = dst - destination;
2787 } 3776 }
2788 3777
2789 3778
2790 /*** 4. SJIS and BIG5 handlers ***/ 3779 /*** 8,9. SJIS and BIG5 handlers ***/
2791 3780
/* Although SJIS and BIG5 are not ISO coding systems, they are used
   quite widely.  So, for the moment, Emacs supports them in the bare
   C code.  But, in the future, they may be supported only by CCL.  */
2795 3784
/* SJIS is a coding system encoding three character sets: ASCII, right
   half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
   as is.  A character of charset katakana-jisx0201 is encoded by
   "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, with the two position-codes divided and
   shifted so that they fit in the ranges below.
2802 3791
2803 --- CODE RANGE of SJIS --- 3792 --- CODE RANGE of SJIS ---
2804 (character set) (range) 3793 (character set) (range)
2805 ASCII 0x00 .. 0x7F 3794 ASCII 0x00 .. 0x7F
2806 KATAKANA-JISX0201 0xA1 .. 0xDF 3795 KATAKANA-JISX0201 0xA0 .. 0xDF
2807 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF 3796 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
2808 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC 3797 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2809 ------------------------------- 3798 -------------------------------
2810 3799
2811 */ 3800 */
2812 3801
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set and is encoded in two bytes.
2816 3805
2817 --- CODE RANGE of BIG5 --- 3806 --- CODE RANGE of BIG5 ---
2818 (character set) (range) 3807 (character set) (range)
2819 ASCII 0x00 .. 0x7F 3808 ASCII 0x00 .. 0x7F
2820 Big5 (1st byte) 0xA1 .. 0xFE 3809 Big5 (1st byte) 0xA1 .. 0xFE
2821 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE 3810 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2822 -------------------------- 3811 --------------------------
2823 3812
2824 Since the number of characters in Big5 is larger than maximum 3813 */
2825 characters in Emacs' charset (96x96), it can't be handled as one
2826 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2827 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2828 contains frequently used characters and the latter contains less
2829 frequently used characters. */
2830
2831 /* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
2832 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
2833 C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2834 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
2835
2836 /* Number of Big5 characters which have the same code in 1st byte. */
2837 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2838
2839 #define DECODE_BIG5(b1, b2, charset, c1, c2) \
2840 do { \
2841 unsigned int temp \
2842 = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62); \
2843 if (b1 < 0xC9) \
2844 charset = charset_big5_1; \
2845 else \
2846 { \
2847 charset = charset_big5_2; \
2848 temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
2849 } \
2850 c1 = temp / (0xFF - 0xA1) + 0x21; \
2851 c2 = temp % (0xFF - 0xA1) + 0x21; \
2852 } while (0)
2853
2854 #define ENCODE_BIG5(charset, c1, c2, b1, b2) \
2855 do { \
2856 unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21); \
2857 if (charset == charset_big5_2) \
2858 temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \
2859 b1 = temp / BIG5_SAME_ROW + 0xA1; \
2860 b2 = temp % BIG5_SAME_ROW; \
2861 b2 += b2 < 0x3F ? 0x40 : 0x62; \
2862 } while (0)
2863 3814
2864 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". 3815 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2865 Check if a text is encoded in SJIS. If it is, return 3816 Check if a text is encoded in SJIS. If it is, return
2866 CODING_CATEGORY_MASK_SJIS, else return 0. */ 3817 CATEGORY_MASK_SJIS, else return 0. */
2867 3818
2868 static int 3819 static int
2869 detect_coding_sjis (src, src_end, multibytep) 3820 detect_coding_sjis (coding, detect_info)
2870 unsigned char *src, *src_end; 3821 struct coding_system *coding;
2871 int multibytep; 3822 struct coding_detection_info *detect_info;
2872 { 3823 {
3824 const unsigned char *src = coding->source, *src_base = src;
3825 const unsigned char *src_end = coding->source + coding->src_bytes;
3826 int multibytep = coding->src_multibyte;
3827 int consumed_chars = 0;
3828 int found = 0;
2873 int c; 3829 int c;
2874 /* Dummy for ONE_MORE_BYTE. */ 3830 int incomplete;
2875 struct coding_system dummy_coding; 3831
2876 struct coding_system *coding = &dummy_coding; 3832 detect_info->checked |= CATEGORY_MASK_SJIS;
3833 /* A coding system of this category is always ASCII compatible. */
3834 src += coding->head_ascii;
2877 3835
2878 while (1) 3836 while (1)
2879 { 3837 {
2880 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); 3838 incomplete = 0;
3839 ONE_MORE_BYTE (c);
3840 incomplete = 1;
2881 if (c < 0x80) 3841 if (c < 0x80)
2882 continue; 3842 continue;
2883 if (c == 0x80 || c == 0xA0 || c > 0xEF) 3843 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
2884 return 0; 3844 {
2885 if (c <= 0x9F || c >= 0xE0) 3845 ONE_MORE_BYTE (c);
2886 {
2887 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2888 if (c < 0x40 || c == 0x7F || c > 0xFC) 3846 if (c < 0x40 || c == 0x7F || c > 0xFC)
2889 return 0; 3847 break;
2890 } 3848 found = CATEGORY_MASK_SJIS;
2891 } 3849 }
2892 label_end_of_loop: 3850 else if (c >= 0xA0 && c < 0xE0)
2893 return CODING_CATEGORY_MASK_SJIS; 3851 found = CATEGORY_MASK_SJIS;
3852 else
3853 break;
3854 }
3855 detect_info->rejected |= CATEGORY_MASK_SJIS;
3856 return 0;
3857
3858 no_more_source:
3859 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3860 {
3861 detect_info->rejected |= CATEGORY_MASK_SJIS;
3862 return 0;
3863 }
3864 detect_info->found |= found;
3865 return 1;
2894 } 3866 }
2895 3867
2896 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". 3868 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2897 Check if a text is encoded in BIG5. If it is, return 3869 Check if a text is encoded in BIG5. If it is, return
2898 CODING_CATEGORY_MASK_BIG5, else return 0. */ 3870 CATEGORY_MASK_BIG5, else return 0. */
2899 3871
2900 static int 3872 static int
2901 detect_coding_big5 (src, src_end, multibytep) 3873 detect_coding_big5 (coding, detect_info)
2902 unsigned char *src, *src_end; 3874 struct coding_system *coding;
2903 int multibytep; 3875 struct coding_detection_info *detect_info;
2904 { 3876 {
3877 const unsigned char *src = coding->source, *src_base = src;
3878 const unsigned char *src_end = coding->source + coding->src_bytes;
3879 int multibytep = coding->src_multibyte;
3880 int consumed_chars = 0;
3881 int found = 0;
2905 int c; 3882 int c;
2906 /* Dummy for ONE_MORE_BYTE. */ 3883 int incomplete;
2907 struct coding_system dummy_coding; 3884
2908 struct coding_system *coding = &dummy_coding; 3885 detect_info->checked |= CATEGORY_MASK_BIG5;
3886 /* A coding system of this category is always ASCII compatible. */
3887 src += coding->head_ascii;
2909 3888
2910 while (1) 3889 while (1)
2911 { 3890 {
2912 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); 3891 incomplete = 0;
3892 ONE_MORE_BYTE (c);
3893 incomplete = 1;
2913 if (c < 0x80) 3894 if (c < 0x80)
2914 continue; 3895 continue;
2915 if (c < 0xA1 || c > 0xFE) 3896 if (c >= 0xA1)
2916 return 0; 3897 {
2917 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); 3898 ONE_MORE_BYTE (c);
2918 if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE) 3899 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2919 return 0; 3900 return 0;
2920 } 3901 found = CATEGORY_MASK_BIG5;
2921 label_end_of_loop: 3902 }
2922 return CODING_CATEGORY_MASK_BIG5;
2923 }
2924
2925 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2926 Check if a text is encoded in UTF-8. If it is, return
2927 CODING_CATEGORY_MASK_UTF_8, else return 0. */
2928
2929 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
2930 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
2931 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
2932 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
2933 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
2934 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
2935 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2936
2937 static int
2938 detect_coding_utf_8 (src, src_end, multibytep)
2939 unsigned char *src, *src_end;
2940 int multibytep;
2941 {
2942 unsigned char c;
2943 int seq_maybe_bytes;
2944 /* Dummy for ONE_MORE_BYTE. */
2945 struct coding_system dummy_coding;
2946 struct coding_system *coding = &dummy_coding;
2947
2948 while (1)
2949 {
2950 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2951 if (UTF_8_1_OCTET_P (c))
2952 continue;
2953 else if (UTF_8_2_OCTET_LEADING_P (c))
2954 seq_maybe_bytes = 1;
2955 else if (UTF_8_3_OCTET_LEADING_P (c))
2956 seq_maybe_bytes = 2;
2957 else if (UTF_8_4_OCTET_LEADING_P (c))
2958 seq_maybe_bytes = 3;
2959 else if (UTF_8_5_OCTET_LEADING_P (c))
2960 seq_maybe_bytes = 4;
2961 else if (UTF_8_6_OCTET_LEADING_P (c))
2962 seq_maybe_bytes = 5;
2963 else 3903 else
2964 return 0; 3904 break;
2965 3905 }
2966 do 3906 detect_info->rejected |= CATEGORY_MASK_BIG5;
2967 {
2968 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2969 if (!UTF_8_EXTRA_OCTET_P (c))
2970 return 0;
2971 seq_maybe_bytes--;
2972 }
2973 while (seq_maybe_bytes > 0);
2974 }
2975
2976 label_end_of_loop:
2977 return CODING_CATEGORY_MASK_UTF_8;
2978 }
2979
2980 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2981 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or
2982 Little Endian (otherwise). If it is, return
2983 CODING_CATEGORY_MASK_UTF_16_BE or CODING_CATEGORY_MASK_UTF_16_LE,
2984 else return 0. */
2985
2986 #define UTF_16_INVALID_P(val) \
2987 (((val) == 0xFFFE) \
2988 || ((val) == 0xFFFF))
2989
2990 #define UTF_16_HIGH_SURROGATE_P(val) \
2991 (((val) & 0xD800) == 0xD800)
2992
2993 #define UTF_16_LOW_SURROGATE_P(val) \
2994 (((val) & 0xDC00) == 0xDC00)
2995
2996 static int
2997 detect_coding_utf_16 (src, src_end, multibytep)
2998 unsigned char *src, *src_end;
2999 int multibytep;
3000 {
3001 unsigned char c1, c2;
3002 /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE. */
3003 struct coding_system dummy_coding;
3004 struct coding_system *coding = &dummy_coding;
3005
3006 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
3007 ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep);
3008
3009 if ((c1 == 0xFF) && (c2 == 0xFE))
3010 return CODING_CATEGORY_MASK_UTF_16_LE;
3011 else if ((c1 == 0xFE) && (c2 == 0xFF))
3012 return CODING_CATEGORY_MASK_UTF_16_BE;
3013
3014 label_end_of_loop:
3015 return 0; 3907 return 0;
3908
3909 no_more_source:
3910 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3911 {
3912 detect_info->rejected |= CATEGORY_MASK_BIG5;
3913 return 0;
3914 }
3915 detect_info->found |= found;
3916 return 1;
3016 } 3917 }
3017 3918
3018 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". 3919 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3019 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */ 3920 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3020 3921
3021 static void 3922 static void
3022 decode_coding_sjis_big5 (coding, source, destination, 3923 decode_coding_sjis (coding)
3023 src_bytes, dst_bytes, sjis_p)
3024 struct coding_system *coding; 3924 struct coding_system *coding;
3025 unsigned char *source, *destination; 3925 {
3026 int src_bytes, dst_bytes; 3926 const unsigned char *src = coding->source + coding->consumed;
3027 int sjis_p; 3927 const unsigned char *src_end = coding->source + coding->src_bytes;
3028 { 3928 const unsigned char *src_base;
3029 unsigned char *src = source; 3929 int *charbuf = coding->charbuf;
3030 unsigned char *src_end = source + src_bytes; 3930 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
3031 unsigned char *dst = destination; 3931 int consumed_chars = 0, consumed_chars_base;
3032 unsigned char *dst_end = destination + dst_bytes; 3932 int multibytep = coding->src_multibyte;
3033 /* SRC_BASE remembers the start position in source in each loop. 3933 struct charset *charset_roman, *charset_kanji, *charset_kana;
3034 The loop will be exited when there's not enough source code 3934 Lisp_Object attrs, eol_type, charset_list, val;
3035 (within macro ONE_MORE_BYTE), or when there's not enough 3935 int char_offset = coding->produced_char;
3036 destination area to produce a character (within macro 3936 int last_offset = char_offset;
3037 EMIT_CHAR). */ 3937 int last_id = charset_ascii;
3038 unsigned char *src_base; 3938
3039 Lisp_Object translation_table; 3939 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3040 3940
3041 if (NILP (Venable_character_translation)) 3941 val = charset_list;
3042 translation_table = Qnil; 3942 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3043 else 3943 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3044 { 3944 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
3045 translation_table = coding->translation_table_for_decode; 3945
3046 if (NILP (translation_table))
3047 translation_table = Vstandard_translation_table_for_decode;
3048 }
3049
3050 coding->produced_char = 0;
3051 while (1) 3946 while (1)
3052 { 3947 {
3053 int c, charset, c1, c2; 3948 int c, c1;
3054 3949
3055 src_base = src; 3950 src_base = src;
3056 ONE_MORE_BYTE (c1); 3951 consumed_chars_base = consumed_chars;
3057 3952
3058 if (c1 < 0x80) 3953 if (charbuf >= charbuf_end)
3059 { 3954 break;
3060 charset = CHARSET_ASCII; 3955
3061 if (c1 < 0x20) 3956 ONE_MORE_BYTE (c);
3957
3958 if (c == '\r')
3959 {
3960 if (EQ (eol_type, Qdos))
3062 { 3961 {
3063 if (c1 == '\r') 3962 if (src == src_end)
3064 { 3963 {
3065 if (coding->eol_type == CODING_EOL_CRLF) 3964 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
3066 { 3965 goto no_more_source;
3067 ONE_MORE_BYTE (c2);
3068 if (c2 == '\n')
3069 c1 = c2;
3070 else
3071 /* To process C2 again, SRC is subtracted by 1. */
3072 src--;
3073 }
3074 else if (coding->eol_type == CODING_EOL_CR)
3075 c1 = '\n';
3076 } 3966 }
3077 else if (c1 == '\n' 3967 if (*src == '\n')
3078 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL) 3968 ONE_MORE_BYTE (c);
3079 && (coding->eol_type == CODING_EOL_CR
3080 || coding->eol_type == CODING_EOL_CRLF))
3081 {
3082 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3083 goto label_end_of_loop;
3084 }
3085 } 3969 }
3970 else if (EQ (eol_type, Qmac))
3971 c = '\n';
3086 } 3972 }
3087 else 3973 else
3088 { 3974 {
3089 if (sjis_p) 3975 struct charset *charset;
3976
3977 if (c < 0x80)
3978 charset = charset_roman;
3979 else
3090 { 3980 {
3091 if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF) 3981 if (c >= 0xF0)
3092 goto label_invalid_code; 3982 goto invalid_code;
3093 if (c1 <= 0x9F || c1 >= 0xE0) 3983 if (c < 0xA0 || c >= 0xE0)
3094 { 3984 {
3095 /* SJIS -> JISX0208 */ 3985 /* SJIS -> JISX0208 */
3096 ONE_MORE_BYTE (c2); 3986 ONE_MORE_BYTE (c1);
3097 if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC) 3987 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
3098 goto label_invalid_code; 3988 goto invalid_code;
3099 DECODE_SJIS (c1, c2, c1, c2); 3989 c = (c << 8) | c1;
3100 charset = charset_jisx0208; 3990 SJIS_TO_JIS (c);
3991 charset = charset_kanji;
3992 }
3993 else if (c > 0xA0)
3994 {
3995 /* SJIS -> JISX0201-Kana */
3996 c &= 0x7F;
3997 charset = charset_kana;
3101 } 3998 }
3102 else 3999 else
3103 /* SJIS -> JISX0201-Kana */ 4000 goto invalid_code;
3104 charset = charset_katakana_jisx0201;
3105 } 4001 }
4002 if (charset->id != charset_ascii
4003 && last_id != charset->id)
4004 {
4005 if (last_id != charset_ascii)
4006 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4007 last_id = charset->id;
4008 last_offset = char_offset;
4009 }
4010 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4011 }
4012 *charbuf++ = c;
4013 char_offset++;
4014 continue;
4015
4016 invalid_code:
4017 src = src_base;
4018 consumed_chars = consumed_chars_base;
4019 ONE_MORE_BYTE (c);
4020 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4021 char_offset++;
4022 coding->errors++;
4023 }
4024
4025 no_more_source:
4026 if (last_id != charset_ascii)
4027 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4028 coding->consumed_char += consumed_chars_base;
4029 coding->consumed = src_base - coding->source;
4030 coding->charbuf_used = charbuf - coding->charbuf;
4031 }
4032
4033 static void
4034 decode_coding_big5 (coding)
4035 struct coding_system *coding;
4036 {
4037 const unsigned char *src = coding->source + coding->consumed;
4038 const unsigned char *src_end = coding->source + coding->src_bytes;
4039 const unsigned char *src_base;
4040 int *charbuf = coding->charbuf;
4041 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4042 int consumed_chars = 0, consumed_chars_base;
4043 int multibytep = coding->src_multibyte;
4044 struct charset *charset_roman, *charset_big5;
4045 Lisp_Object attrs, eol_type, charset_list, val;
4046 int char_offset = coding->produced_char;
4047 int last_offset = char_offset;
4048 int last_id = charset_ascii;
4049
4050 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4051 val = charset_list;
4052 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4053 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4054
4055 while (1)
4056 {
4057 int c, c1;
4058
4059 src_base = src;
4060 consumed_chars_base = consumed_chars;
4061
4062 if (charbuf >= charbuf_end)
4063 break;
4064
4065 ONE_MORE_BYTE (c);
4066
4067 if (c == '\r')
4068 {
4069 if (EQ (eol_type, Qdos))
4070 {
4071 if (src == src_end)
4072 {
4073 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4074 goto no_more_source;
4075 }
4076 if (*src == '\n')
4077 ONE_MORE_BYTE (c);
4078 }
4079 else if (EQ (eol_type, Qmac))
4080 c = '\n';
4081 }
4082 else
4083 {
4084 struct charset *charset;
4085 if (c < 0x80)
4086 charset = charset_roman;
3106 else 4087 else
3107 { 4088 {
3108 /* BIG5 -> Big5 */ 4089 /* BIG5 -> Big5 */
3109 if (c1 < 0xA0 || c1 > 0xFE) 4090 if (c < 0xA1 || c > 0xFE)
3110 goto label_invalid_code; 4091 goto invalid_code;
3111 ONE_MORE_BYTE (c2); 4092 ONE_MORE_BYTE (c1);
3112 if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE) 4093 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
3113 goto label_invalid_code; 4094 goto invalid_code;
3114 DECODE_BIG5 (c1, c2, charset, c1, c2); 4095 c = c << 8 | c1;
4096 charset = charset_big5;
3115 } 4097 }
3116 } 4098 if (charset->id != charset_ascii
3117 4099 && last_id != charset->id)
3118 c = DECODE_ISO_CHARACTER (charset, c1, c2); 4100 {
3119 EMIT_CHAR (c); 4101 if (last_id != charset_ascii)
4102 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4103 last_id = charset->id;
4104 last_offset = char_offset;
4105 }
4106 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4107 }
4108
4109 *charbuf++ = c;
4110 char_offset++;
3120 continue; 4111 continue;
3121 4112
3122 label_invalid_code: 4113 invalid_code:
4114 src = src_base;
4115 consumed_chars = consumed_chars_base;
4116 ONE_MORE_BYTE (c);
4117 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4118 char_offset++;
3123 coding->errors++; 4119 coding->errors++;
3124 src = src_base; 4120 }
3125 c = *src++; 4121
3126 EMIT_CHAR (c); 4122 no_more_source:
3127 } 4123 if (last_id != charset_ascii)
3128 4124 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3129 label_end_of_loop: 4125 coding->consumed_char += consumed_chars_base;
3130 coding->consumed = coding->consumed_char = src_base - source; 4126 coding->consumed = src_base - coding->source;
3131 coding->produced = dst - destination; 4127 coding->charbuf_used = charbuf - coding->charbuf;
3132 return;
3133 } 4128 }
3134 4129
3135 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". 4130 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3136 This function can encode charsets `ascii', `katakana-jisx0201', 4131 This function can encode charsets `ascii', `katakana-jisx0201',
3137 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We 4132 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
3138 are sure that all these charsets are registered as official charset 4133 are sure that all these charsets are registered as official charset
3139 (i.e. do not have extended leading-codes). Characters of other 4134 (i.e. do not have extended leading-codes). Characters of other
3140 charsets are produced without any encoding. If SJIS_P is 1, encode 4135 charsets are produced without any encoding. If SJIS_P is 1, encode
3141 SJIS text, else encode BIG5 text. */ 4136 SJIS text, else encode BIG5 text. */
3142 4137
3143 static void 4138 static int
3144 encode_coding_sjis_big5 (coding, source, destination, 4139 encode_coding_sjis (coding)
3145 src_bytes, dst_bytes, sjis_p)
3146 struct coding_system *coding; 4140 struct coding_system *coding;
3147 unsigned char *source, *destination; 4141 {
3148 int src_bytes, dst_bytes; 4142 int multibytep = coding->dst_multibyte;
3149 int sjis_p; 4143 int *charbuf = coding->charbuf;
3150 { 4144 int *charbuf_end = charbuf + coding->charbuf_used;
3151 unsigned char *src = source; 4145 unsigned char *dst = coding->destination + coding->produced;
3152 unsigned char *src_end = source + src_bytes; 4146 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3153 unsigned char *dst = destination; 4147 int safe_room = 4;
3154 unsigned char *dst_end = destination + dst_bytes; 4148 int produced_chars = 0;
3155 /* SRC_BASE remembers the start position in source in each loop. 4149 Lisp_Object attrs, eol_type, charset_list, val;
3156 The loop will be exited when there's not enough source text to 4150 int ascii_compatible;
3157 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when 4151 struct charset *charset_roman, *charset_kanji, *charset_kana;
3158 there's not enough destination area to produce encoded codes 4152 int c;
3159 (within macro EMIT_BYTES). */ 4153
3160 unsigned char *src_base; 4154 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3161 Lisp_Object translation_table; 4155 val = charset_list;
3162 4156 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3163 if (NILP (Venable_character_translation)) 4157 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3164 translation_table = Qnil; 4158 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
3165 else 4159
3166 { 4160 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3167 translation_table = coding->translation_table_for_encode; 4161
3168 if (NILP (translation_table)) 4162 while (charbuf < charbuf_end)
3169 translation_table = Vstandard_translation_table_for_encode; 4163 {
3170 } 4164 ASSURE_DESTINATION (safe_room);
3171 4165 c = *charbuf++;
3172 while (1)
3173 {
3174 int c, charset, c1, c2;
3175
3176 src_base = src;
3177 ONE_MORE_CHAR (c);
3178
3179 /* Now encode the character C. */ 4166 /* Now encode the character C. */
3180 if (SINGLE_BYTE_CHAR_P (c)) 4167 if (ASCII_CHAR_P (c) && ascii_compatible)
3181 { 4168 EMIT_ONE_ASCII_BYTE (c);
3182 switch (c) 4169 else if (CHAR_BYTE8_P (c))
4170 {
4171 c = CHAR_TO_BYTE8 (c);
4172 EMIT_ONE_BYTE (c);
4173 }
4174 else
4175 {
4176 unsigned code;
4177 struct charset *charset = char_charset (c, charset_list, &code);
4178
4179 if (!charset)
3183 { 4180 {
3184 case '\r': 4181 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3185 if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3186 { 4182 {
3187 EMIT_ONE_BYTE (c); 4183 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3188 break; 4184 charset = CHARSET_FROM_ID (charset_ascii);
3189 }
3190 c = '\n';
3191 case '\n':
3192 if (coding->eol_type == CODING_EOL_CRLF)
3193 {
3194 EMIT_TWO_BYTES ('\r', c);
3195 break;
3196 }
3197 else if (coding->eol_type == CODING_EOL_CR)
3198 c = '\r';
3199 default:
3200 EMIT_ONE_BYTE (c);
3201 }
3202 }
3203 else
3204 {
3205 SPLIT_CHAR (c, charset, c1, c2);
3206 if (sjis_p)
3207 {
3208 if (charset == charset_jisx0208
3209 || charset == charset_jisx0208_1978)
3210 {
3211 ENCODE_SJIS (c1, c2, c1, c2);
3212 EMIT_TWO_BYTES (c1, c2);
3213 }
3214 else if (charset == charset_katakana_jisx0201)
3215 EMIT_ONE_BYTE (c1 | 0x80);
3216 else if (charset == charset_latin_jisx0201)
3217 EMIT_ONE_BYTE (c1);
3218 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3219 {
3220 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3221 if (CHARSET_WIDTH (charset) > 1)
3222 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3223 } 4185 }
3224 else 4186 else
3225 /* There's no way other than producing the internal 4187 {
3226 codes as is. */ 4188 c = coding->default_char;
3227 EMIT_BYTES (src_base, src); 4189 charset = char_charset (c, charset_list, &code);
4190 }
4191 }
4192 if (code == CHARSET_INVALID_CODE (charset))
4193 abort ();
4194 if (charset == charset_kanji)
4195 {
4196 int c1, c2;
4197 JIS_TO_SJIS (code);
4198 c1 = code >> 8, c2 = code & 0xFF;
4199 EMIT_TWO_BYTES (c1, c2);
4200 }
4201 else if (charset == charset_kana)
4202 EMIT_ONE_BYTE (code | 0x80);
4203 else
4204 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4205 }
4206 }
4207 coding->result = CODING_RESULT_SUCCESS;
4208 coding->produced_char += produced_chars;
4209 coding->produced = dst - coding->destination;
4210 return 0;
4211 }
4212
4213 static int
4214 encode_coding_big5 (coding)
4215 struct coding_system *coding;
4216 {
4217 int multibytep = coding->dst_multibyte;
4218 int *charbuf = coding->charbuf;
4219 int *charbuf_end = charbuf + coding->charbuf_used;
4220 unsigned char *dst = coding->destination + coding->produced;
4221 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4222 int safe_room = 4;
4223 int produced_chars = 0;
4224 Lisp_Object attrs, eol_type, charset_list, val;
4225 int ascii_compatible;
4226 struct charset *charset_roman, *charset_big5;
4227 int c;
4228
4229 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4230 val = charset_list;
4231 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4232 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4233 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4234
4235 while (charbuf < charbuf_end)
4236 {
4237 ASSURE_DESTINATION (safe_room);
4238 c = *charbuf++;
4239 /* Now encode the character C. */
4240 if (ASCII_CHAR_P (c) && ascii_compatible)
4241 EMIT_ONE_ASCII_BYTE (c);
4242 else if (CHAR_BYTE8_P (c))
4243 {
4244 c = CHAR_TO_BYTE8 (c);
4245 EMIT_ONE_BYTE (c);
4246 }
4247 else
4248 {
4249 unsigned code;
4250 struct charset *charset = char_charset (c, charset_list, &code);
4251
4252 if (! charset)
4253 {
4254 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4255 {
4256 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4257 charset = CHARSET_FROM_ID (charset_ascii);
4258 }
4259 else
4260 {
4261 c = coding->default_char;
4262 charset = char_charset (c, charset_list, &code);
4263 }
4264 }
4265 if (code == CHARSET_INVALID_CODE (charset))
4266 abort ();
4267 if (charset == charset_big5)
4268 {
4269 int c1, c2;
4270
4271 c1 = code >> 8, c2 = code & 0xFF;
4272 EMIT_TWO_BYTES (c1, c2);
4273 }
4274 else
4275 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4276 }
4277 }
4278 coding->result = CODING_RESULT_SUCCESS;
4279 coding->produced_char += produced_chars;
4280 coding->produced = dst - coding->destination;
4281 return 0;
4282 }
4283
4284
4285 /*** 10. CCL handlers ***/
4286
4287 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4288 Check if a text is encoded in a coding system of which
4289 encoder/decoder are written in CCL program. If it is, return
4290 CATEGORY_MASK_CCL, else return 0. */
4291
4292 static int
4293 detect_coding_ccl (coding, detect_info)
4294 struct coding_system *coding;
4295 struct coding_detection_info *detect_info;
4296 {
4297 const unsigned char *src = coding->source, *src_base = src;
4298 const unsigned char *src_end = coding->source + coding->src_bytes;
4299 int multibytep = coding->src_multibyte;
4300 int consumed_chars = 0;
4301 int found = 0;
4302 unsigned char *valids = CODING_CCL_VALIDS (coding);
4303 int head_ascii = coding->head_ascii;
4304 Lisp_Object attrs;
4305
4306 detect_info->checked |= CATEGORY_MASK_CCL;
4307
4308 coding = &coding_categories[coding_category_ccl];
4309 attrs = CODING_ID_ATTRS (coding->id);
4310 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4311 src += head_ascii;
4312
4313 while (1)
4314 {
4315 int c;
4316 ONE_MORE_BYTE (c);
4317 if (! valids[c])
4318 break;
4319 if ((valids[c] > 1))
4320 found = CATEGORY_MASK_CCL;
4321 }
4322 detect_info->rejected |= CATEGORY_MASK_CCL;
4323 return 0;
4324
4325 no_more_source:
4326 detect_info->found |= found;
4327 return 1;
4328 }
4329
4330 static void
4331 decode_coding_ccl (coding)
4332 struct coding_system *coding;
4333 {
4334 const unsigned char *src = coding->source + coding->consumed;
4335 const unsigned char *src_end = coding->source + coding->src_bytes;
4336 int *charbuf = coding->charbuf;
4337 int *charbuf_end = charbuf + coding->charbuf_size;
4338 int consumed_chars = 0;
4339 int multibytep = coding->src_multibyte;
4340 struct ccl_program ccl;
4341 int source_charbuf[1024];
4342 int source_byteidx[1024];
4343 Lisp_Object attrs, eol_type, charset_list;
4344
4345 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4346 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4347
4348 while (src < src_end)
4349 {
4350 const unsigned char *p = src;
4351 int *source, *source_end;
4352 int i = 0;
4353
4354 if (multibytep)
4355 while (i < 1024 && p < src_end)
4356 {
4357 source_byteidx[i] = p - src;
4358 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4359 }
4360 else
4361 while (i < 1024 && p < src_end)
4362 source_charbuf[i++] = *p++;
4363
4364 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4365 ccl.last_block = 1;
4366
4367 source = source_charbuf;
4368 source_end = source + i;
4369 while (source < source_end)
4370 {
4371 ccl_driver (&ccl, source, charbuf,
4372 source_end - source, charbuf_end - charbuf,
4373 charset_list);
4374 source += ccl.consumed;
4375 charbuf += ccl.produced;
4376 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4377 break;
4378 }
4379 if (source < source_end)
4380 src += source_byteidx[source - source_charbuf];
4381 else
4382 src = p;
4383 consumed_chars += source - source_charbuf;
4384
4385 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4386 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4387 break;
4388 }
4389
4390 switch (ccl.status)
4391 {
4392 case CCL_STAT_SUSPEND_BY_SRC:
4393 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4394 break;
4395 case CCL_STAT_SUSPEND_BY_DST:
4396 break;
4397 case CCL_STAT_QUIT:
4398 case CCL_STAT_INVALID_CMD:
4399 coding->result = CODING_RESULT_INTERRUPT;
4400 break;
4401 default:
4402 coding->result = CODING_RESULT_SUCCESS;
4403 break;
4404 }
4405 coding->consumed_char += consumed_chars;
4406 coding->consumed = src - coding->source;
4407 coding->charbuf_used = charbuf - coding->charbuf;
4408 }
4409
4410 static int
4411 encode_coding_ccl (coding)
4412 struct coding_system *coding;
4413 {
4414 struct ccl_program ccl;
4415 int multibytep = coding->dst_multibyte;
4416 int *charbuf = coding->charbuf;
4417 int *charbuf_end = charbuf + coding->charbuf_used;
4418 unsigned char *dst = coding->destination + coding->produced;
4419 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4420 unsigned char *adjusted_dst_end = dst_end - 1;
4421 int destination_charbuf[1024];
4422 int i, produced_chars = 0;
4423 Lisp_Object attrs, eol_type, charset_list;
4424
4425 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4426 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4427
4428 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4429 ccl.dst_multibyte = coding->dst_multibyte;
4430
4431 while (charbuf < charbuf_end && dst < adjusted_dst_end)
4432 {
4433 int dst_bytes = dst_end - dst;
4434 if (dst_bytes > 1024)
4435 dst_bytes = 1024;
4436
4437 ccl_driver (&ccl, charbuf, destination_charbuf,
4438 charbuf_end - charbuf, dst_bytes, charset_list);
4439 charbuf += ccl.consumed;
4440 if (multibytep)
4441 for (i = 0; i < ccl.produced; i++)
4442 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4443 else
4444 {
4445 for (i = 0; i < ccl.produced; i++)
4446 *dst++ = destination_charbuf[i] & 0xFF;
4447 produced_chars += ccl.produced;
4448 }
4449 }
4450
4451 switch (ccl.status)
4452 {
4453 case CCL_STAT_SUSPEND_BY_SRC:
4454 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4455 break;
4456 case CCL_STAT_SUSPEND_BY_DST:
4457 coding->result = CODING_RESULT_INSUFFICIENT_DST;
4458 break;
4459 case CCL_STAT_QUIT:
4460 case CCL_STAT_INVALID_CMD:
4461 coding->result = CODING_RESULT_INTERRUPT;
4462 break;
4463 default:
4464 coding->result = CODING_RESULT_SUCCESS;
4465 break;
4466 }
4467
4468 coding->produced_char += produced_chars;
4469 coding->produced = dst - coding->destination;
4470 return 0;
4471 }
4472
4473
4474
4475 /*** 10, 11. no-conversion handlers ***/
4476
4477 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4478
4479 static void
4480 decode_coding_raw_text (coding)
4481 struct coding_system *coding;
4482 {
4483 coding->chars_at_source = 1;
4484 coding->consumed_char = 0;
4485 coding->consumed = 0;
4486 coding->result = CODING_RESULT_SUCCESS;
4487 }
4488
4489 static int
4490 encode_coding_raw_text (coding)
4491 struct coding_system *coding;
4492 {
4493 int multibytep = coding->dst_multibyte;
4494 int *charbuf = coding->charbuf;
4495 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4496 unsigned char *dst = coding->destination + coding->produced;
4497 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4498 int produced_chars = 0;
4499 int c;
4500
4501 if (multibytep)
4502 {
4503 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4504
4505 if (coding->src_multibyte)
4506 while (charbuf < charbuf_end)
4507 {
4508 ASSURE_DESTINATION (safe_room);
4509 c = *charbuf++;
4510 if (ASCII_CHAR_P (c))
4511 EMIT_ONE_ASCII_BYTE (c);
4512 else if (CHAR_BYTE8_P (c))
4513 {
4514 c = CHAR_TO_BYTE8 (c);
4515 EMIT_ONE_BYTE (c);
4516 }
4517 else
4518 {
4519 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4520
4521 CHAR_STRING_ADVANCE (c, p1);
4522 while (p0 < p1)
4523 {
4524 EMIT_ONE_BYTE (*p0);
4525 p0++;
4526 }
4527 }
4528 }
4529 else
4530 while (charbuf < charbuf_end)
4531 {
4532 ASSURE_DESTINATION (safe_room);
4533 c = *charbuf++;
4534 EMIT_ONE_BYTE (c);
4535 }
4536 }
4537 else
4538 {
4539 if (coding->src_multibyte)
4540 {
4541 int safe_room = MAX_MULTIBYTE_LENGTH;
4542
4543 while (charbuf < charbuf_end)
4544 {
4545 ASSURE_DESTINATION (safe_room);
4546 c = *charbuf++;
4547 if (ASCII_CHAR_P (c))
4548 *dst++ = c;
4549 else if (CHAR_BYTE8_P (c))
4550 *dst++ = CHAR_TO_BYTE8 (c);
4551 else
4552 CHAR_STRING_ADVANCE (c, dst);
4553 produced_chars++;
4554 }
4555 }
4556 else
4557 {
4558 ASSURE_DESTINATION (charbuf_end - charbuf);
4559 while (charbuf < charbuf_end && dst < dst_end)
4560 *dst++ = *charbuf++;
4561 produced_chars = dst - (coding->destination + coding->dst_bytes);
4562 }
4563 }
4564 coding->result = CODING_RESULT_SUCCESS;
4565 coding->produced_char += produced_chars;
4566 coding->produced = dst - coding->destination;
4567 return 0;
4568 }
4569
4570 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4571 Check if a text is encoded in a charset-based coding system. If it
4572 is, return 1, else return 0.
   CATEGORY_MASK_CHARSET is always set in DETECT_INFO->checked.  When 0
   is returned it is also set in DETECT_INFO->rejected.  When 1 is
   returned it is set in DETECT_INFO->found only if at least one byte
   >= 0x80 was seen; a pure-ASCII source is neither found nor rejected.  */
4573
4574 static int
4575 detect_coding_charset (coding, detect_info)
4576 struct coding_system *coding;
4577 struct coding_detection_info *detect_info;
4578 {
4579 const unsigned char *src = coding->source, *src_base = src;
4580 const unsigned char *src_end = coding->source + coding->src_bytes;
4581 int multibytep = coding->src_multibyte;
4582 int consumed_chars = 0; /* Not referenced directly below; presumably (like src_base, src_end and multibytep) used inside ONE_MORE_BYTE -- TODO confirm against the macro. */
4583 Lisp_Object attrs, valids;
4584 int found = 0; /* Becomes CATEGORY_MASK_CHARSET once a byte >= 0x80 is accepted. */
4585
4586 detect_info->checked |= CATEGORY_MASK_CHARSET;
4587
4588 coding = &coding_categories[coding_category_charset]; /* From here on CODING is the coding system of the charset category, not the caller's argument. */
4589 attrs = CODING_ID_ATTRS (coding->id);
4590 valids = AREF (attrs, coding_attr_charset_valids); /* Indexed by byte value below; nil means the byte is not acceptable. */
4591
4592 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4593 src += coding->head_ascii; /* NOTE(review): head_ascii is read from the category's coding system, while SRC was taken from the caller's CODING -- confirm this is intended. */
4594
4595 while (1)
4596 {
4597 int c;
4598
4599 ONE_MORE_BYTE (c); /* Presumably jumps to no_more_source when the source is exhausted (macro not visible here); that is the only way the label is reached. */
4600 if (NILP (AREF (valids, c)))
4601 break; /* Byte has no entry in VALIDS: fall through to the rejection below. */
4602 if (c >= 0x80)
4603 found = CATEGORY_MASK_CHARSET;
4604 }
4605 detect_info->rejected |= CATEGORY_MASK_CHARSET;
4606 return 0;
4607
4608 no_more_source:
4609 detect_info->found |= found; /* FOUND is 0 when only bytes below 0x80 were seen, so this may add nothing. */
4610 return 1;
4611 }
4612
4613 static void
4614 decode_coding_charset (coding)
4615 struct coding_system *coding;
4616 {
4617 const unsigned char *src = coding->source + coding->consumed;
4618 const unsigned char *src_end = coding->source + coding->src_bytes;
4619 const unsigned char *src_base;
4620 int *charbuf = coding->charbuf;
4621 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4622 int consumed_chars = 0, consumed_chars_base;
4623 int multibytep = coding->src_multibyte;
4624 Lisp_Object attrs, eol_type, charset_list, valids;
4625 int char_offset = coding->produced_char;
4626 int last_offset = char_offset;
4627 int last_id = charset_ascii;
4628
4629 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4630 valids = AREF (attrs, coding_attr_charset_valids);
4631
4632 while (1)
4633 {
4634 int c;
4635
4636 src_base = src;
4637 consumed_chars_base = consumed_chars;
4638
4639 if (charbuf >= charbuf_end)
4640 break;
4641
4642 ONE_MORE_BYTE (c);
4643 if (c == '\r')
4644 {
4645 /* Here we assume that no charset maps '\r' to something
4646 else. */
4647 if (EQ (eol_type, Qdos))
4648 {
4649 if (src == src_end)
4650 {
4651 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4652 goto no_more_source;
4653 }
4654 if (*src == '\n')
4655 ONE_MORE_BYTE (c);
4656 }
4657 else if (EQ (eol_type, Qmac))
4658 c = '\n';
4659 }
4660 else
4661 {
4662 Lisp_Object val;
4663 struct charset *charset;
4664 int dim;
4665 int len = 1;
4666 unsigned code = c;
4667
4668 val = AREF (valids, c);
4669 if (NILP (val))
4670 goto invalid_code;
4671 if (INTEGERP (val))
4672 {
4673 charset = CHARSET_FROM_ID (XFASTINT (val));
4674 dim = CHARSET_DIMENSION (charset);
4675 while (len < dim)
4676 {
4677 ONE_MORE_BYTE (c);
4678 code = (code << 8) | c;
4679 len++;
4680 }
4681 CODING_DECODE_CHAR (coding, src, src_base, src_end,
4682 charset, code, c);
3228 } 4683 }
3229 else 4684 else
3230 { 4685 {
3231 if (charset == charset_big5_1 || charset == charset_big5_2) 4686 /* VAL is a list of charset IDs. It is assured that the
4687 list is sorted by charset dimensions (smaller one
4688 comes first). */
4689 while (CONSP (val))
3232 { 4690 {
3233 ENCODE_BIG5 (charset, c1, c2, c1, c2); 4691 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
3234 EMIT_TWO_BYTES (c1, c2); 4692 dim = CHARSET_DIMENSION (charset);
3235 } 4693 while (len < dim)
3236 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR) 4694 {
3237 { 4695 ONE_MORE_BYTE (c);
3238 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER); 4696 code = (code << 8) | c;
3239 if (CHARSET_WIDTH (charset) > 1) 4697 len++;
3240 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER); 4698 }
3241 } 4699 CODING_DECODE_CHAR (coding, src, src_base,
3242 else 4700 src_end, charset, code, c);
3243 /* There's no way other than producing the internal 4701 if (c >= 0)
3244 codes as is. */ 4702 break;
3245 EMIT_BYTES (src_base, src); 4703 val = XCDR (val);
3246 }
3247 }
3248 coding->consumed_char++;
3249 }
3250
3251 label_end_of_loop:
3252 coding->consumed = src_base - source;
3253 coding->produced = coding->produced_char = dst - destination;
3254 }
3255
3256
3257 /*** 5. CCL handlers ***/
3258
3259 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3260 Check if a text is encoded in a coding system of which
3261 encoder/decoder are written in CCL program. If it is, return
3262 CODING_CATEGORY_MASK_CCL, else return 0. */
3263
3264 static int
3265 detect_coding_ccl (src, src_end, multibytep)
3266 unsigned char *src, *src_end;
3267 int multibytep;
3268 {
3269 unsigned char *valid;
3270 int c;
3271 /* Dummy for ONE_MORE_BYTE. */
3272 struct coding_system dummy_coding;
3273 struct coding_system *coding = &dummy_coding;
3274
3275 /* No coding system is assigned to coding-category-ccl. */
3276 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3277 return 0;
3278
3279 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3280 while (1)
3281 {
3282 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
3283 if (! valid[c])
3284 return 0;
3285 }
3286 label_end_of_loop:
3287 return CODING_CATEGORY_MASK_CCL;
3288 }
3289
3290
3291 /*** 6. End-of-line handlers ***/
3292
3293 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3294
3295 static void
3296 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3297 struct coding_system *coding;
3298 unsigned char *source, *destination;
3299 int src_bytes, dst_bytes;
3300 {
3301 unsigned char *src = source;
3302 unsigned char *dst = destination;
3303 unsigned char *src_end = src + src_bytes;
3304 unsigned char *dst_end = dst + dst_bytes;
3305 Lisp_Object translation_table;
3306 /* SRC_BASE remembers the start position in source in each loop.
3307 The loop will be exited when there's not enough source code
3308 (within macro ONE_MORE_BYTE), or when there's not enough
3309 destination area to produce a character (within macro
3310 EMIT_CHAR). */
3311 unsigned char *src_base;
3312 int c;
3313
3314 translation_table = Qnil;
3315 switch (coding->eol_type)
3316 {
3317 case CODING_EOL_CRLF:
3318 while (1)
3319 {
3320 src_base = src;
3321 ONE_MORE_BYTE (c);
3322 if (c == '\r')
3323 {
3324 ONE_MORE_BYTE (c);
3325 if (c != '\n')
3326 {
3327 src--;
3328 c = '\r';
3329 } 4704 }
3330 } 4705 }
3331 else if (c == '\n' 4706 if (c < 0)
3332 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)) 4707 goto invalid_code;
4708 if (charset->id != charset_ascii
4709 && last_id != charset->id)
3333 { 4710 {
3334 coding->result = CODING_FINISH_INCONSISTENT_EOL; 4711 if (last_id != charset_ascii)
3335 goto label_end_of_loop; 4712 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4713 last_id = charset->id;
4714 last_offset = char_offset;
3336 } 4715 }
3337 EMIT_CHAR (c); 4716 }
3338 } 4717 *charbuf++ = c;
3339 break; 4718 char_offset++;
3340 4719 continue;
3341 case CODING_EOL_CR: 4720
3342 while (1) 4721 invalid_code:
3343 { 4722 src = src_base;
3344 src_base = src; 4723 consumed_chars = consumed_chars_base;
3345 ONE_MORE_BYTE (c); 4724 ONE_MORE_BYTE (c);
3346 if (c == '\n') 4725 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4726 char_offset++;
4727 coding->errors++;
4728 }
4729
4730 no_more_source:
4731 if (last_id != charset_ascii)
4732 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4733 coding->consumed_char += consumed_chars_base;
4734 coding->consumed = src_base - coding->source;
4735 coding->charbuf_used = charbuf - coding->charbuf;
4736 }
4737
4738 static int
4739 encode_coding_charset (coding)
4740 struct coding_system *coding;
4741 {
4742 int multibytep = coding->dst_multibyte;
4743 int *charbuf = coding->charbuf;
4744 int *charbuf_end = charbuf + coding->charbuf_used;
4745 unsigned char *dst = coding->destination + coding->produced;
4746 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4747 int safe_room = MAX_MULTIBYTE_LENGTH;
4748 int produced_chars = 0;
4749 Lisp_Object attrs, eol_type, charset_list;
4750 int ascii_compatible;
4751 int c;
4752
4753 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4754 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4755
4756 while (charbuf < charbuf_end)
4757 {
4758 struct charset *charset;
4759 unsigned code;
4760
4761 ASSURE_DESTINATION (safe_room);
4762 c = *charbuf++;
4763 if (ascii_compatible && ASCII_CHAR_P (c))
4764 EMIT_ONE_ASCII_BYTE (c);
4765 else if (CHAR_BYTE8_P (c))
4766 {
4767 c = CHAR_TO_BYTE8 (c);
4768 EMIT_ONE_BYTE (c);
4769 }
4770 else
4771 {
4772 charset = char_charset (c, charset_list, &code);
4773 if (charset)
3347 { 4774 {
3348 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL) 4775 if (CHARSET_DIMENSION (charset) == 1)
3349 { 4776 EMIT_ONE_BYTE (code);
3350 coding->result = CODING_FINISH_INCONSISTENT_EOL; 4777 else if (CHARSET_DIMENSION (charset) == 2)
3351 goto label_end_of_loop; 4778 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
3352 } 4779 else if (CHARSET_DIMENSION (charset) == 3)
4780 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4781 else
4782 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4783 (code >> 8) & 0xFF, code & 0xFF);
3353 } 4784 }
3354 else if (c == '\r')
3355 c = '\n';
3356 EMIT_CHAR (c);
3357 }
3358 break;
3359
3360 default: /* no need for EOL handling */
3361 while (1)
3362 {
3363 src_base = src;
3364 ONE_MORE_BYTE (c);
3365 EMIT_CHAR (c);
3366 }
3367 }
3368
3369 label_end_of_loop:
3370 coding->consumed = coding->consumed_char = src_base - source;
3371 coding->produced = dst - destination;
3372 return;
3373 }
3374
3375 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
3376 format of end-of-line according to `coding->eol_type'. It also
3377 convert multibyte form 8-bit characters to unibyte if
3378 CODING->src_multibyte is nonzero. If `coding->mode &
3379 CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3380 also means end-of-line. */
3381
3382 static void
3383 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3384 struct coding_system *coding;
3385 const unsigned char *source;
3386 unsigned char *destination;
3387 int src_bytes, dst_bytes;
3388 {
3389 const unsigned char *src = source;
3390 unsigned char *dst = destination;
3391 const unsigned char *src_end = src + src_bytes;
3392 unsigned char *dst_end = dst + dst_bytes;
3393 Lisp_Object translation_table;
3394 /* SRC_BASE remembers the start position in source in each loop.
3395 The loop will be exited when there's not enough source text to
3396 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3397 there's not enough destination area to produce encoded codes
3398 (within macro EMIT_BYTES). */
3399 const unsigned char *src_base;
3400 unsigned char *tmp;
3401 int c;
3402 int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3403
3404 translation_table = Qnil;
3405 if (coding->src_multibyte
3406 && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3407 {
3408 src_end--;
3409 src_bytes--;
3410 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3411 }
3412
3413 if (coding->eol_type == CODING_EOL_CRLF)
3414 {
3415 while (src < src_end)
3416 {
3417 src_base = src;
3418 c = *src++;
3419 if (c >= 0x20)
3420 EMIT_ONE_BYTE (c);
3421 else if (c == '\n' || (c == '\r' && selective_display))
3422 EMIT_TWO_BYTES ('\r', '\n');
3423 else 4785 else
3424 EMIT_ONE_BYTE (c); 4786 {
3425 } 4787 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3426 src_base = src; 4788 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3427 label_end_of_loop: 4789 else
3428 ; 4790 c = coding->default_char;
3429 } 4791 EMIT_ONE_BYTE (c);
3430 else 4792 }
3431 { 4793 }
3432 if (!dst_bytes || src_bytes <= dst_bytes) 4794 }
3433 { 4795
3434 safe_bcopy (src, dst, src_bytes); 4796 coding->result = CODING_RESULT_SUCCESS;
3435 src_base = src_end; 4797 coding->produced_char += produced_chars;
3436 dst += src_bytes; 4798 coding->produced = dst - coding->destination;
3437 } 4799 return 0;
3438 else
3439 {
3440 if (coding->src_multibyte
3441 && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3442 dst_bytes--;
3443 safe_bcopy (src, dst, dst_bytes);
3444 src_base = src + dst_bytes;
3445 dst = destination + dst_bytes;
3446 coding->result = CODING_FINISH_INSUFFICIENT_DST;
3447 }
3448 if (coding->eol_type == CODING_EOL_CR)
3449 {
3450 for (tmp = destination; tmp < dst; tmp++)
3451 if (*tmp == '\n') *tmp = '\r';
3452 }
3453 else if (selective_display)
3454 {
3455 for (tmp = destination; tmp < dst; tmp++)
3456 if (*tmp == '\r') *tmp = '\n';
3457 }
3458 }
3459 if (coding->src_multibyte)
3460 dst = destination + str_as_unibyte (destination, dst - destination);
3461
3462 coding->consumed = src_base - source;
3463 coding->produced = dst - destination;
3464 coding->produced_char = coding->produced;
3465 } 4800 }
3466 4801
3467 4802
3468 /*** 7. C library functions ***/ 4803 /*** 10. C library functions ***/
3469 4804
3470 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which 4805 /* Setup coding context CODING from information about CODING_SYSTEM.
3471 has a property `coding-system'. The value of this property is a 4806 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
3472 vector of length 5 (called the coding-vector). Among elements of 4807 CODING_SYSTEM is invalid, signal an error. */
3473 this vector, the first (element[0]) and the fifth (element[4]) 4808
3474 carry important information for decoding/encoding. Before 4809 void
3475 decoding/encoding, this information should be set in fields of a
3476 structure of type `coding_system'.
3477
3478 The value of the property `coding-system' can be a symbol of another
3479 subsidiary coding-system. In that case, Emacs gets coding-vector
3480 from that symbol.
3481
3482 `element[0]' contains information to be set in `coding->type'. The
3483 value and its meaning is as follows:
3484
3485 0 -- coding_type_emacs_mule
3486 1 -- coding_type_sjis
3487 2 -- coding_type_iso2022
3488 3 -- coding_type_big5
3489 4 -- coding_type_ccl encoder/decoder written in CCL
3490 nil -- coding_type_no_conversion
3491 t -- coding_type_undecided (automatic conversion on decoding,
3492 no-conversion on encoding)
3493
3494 `element[4]' contains information to be set in `coding->flags' and
3495 `coding->spec'. The meaning varies by `coding->type'.
3496
3497 If `coding->type' is `coding_type_iso2022', element[4] is a vector
3498 of length 32 (of which the first 13 sub-elements are used now).
3499 Meanings of these sub-elements are:
3500
3501 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3502 If the value is an integer of valid charset, the charset is
3503 assumed to be designated to graphic register N initially.
3504
3505 If the value is minus, it is a minus value of charset which
3506 reserves graphic register N, which means that the charset is
3507 not designated initially but should be designated to graphic
3508 register N just before encoding a character in that charset.
3509
3510 If the value is nil, graphic register N is never used on
3511 encoding.
3512
3513 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
3514 Each value takes t or nil. See the section ISO2022 of
3515 `coding.h' for more information.
3516
3517 If `coding->type' is `coding_type_big5', element[4] is t to denote
3518 BIG5-ETen or nil to denote BIG5-HKU.
3519
3520 If `coding->type' takes the other value, element[4] is ignored.
3521
3522 Emacs Lisp's coding systems also carry information about format of
3523 end-of-line in a value of property `eol-type'. If the value is
3524 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3525 means CODING_EOL_CR. If it is not integer, it should be a vector
3526 of subsidiary coding systems of which property `eol-type' has one
3527 of the above values.
3528
3529 */
3530
3531 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
3532 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
3533 is setup so that no conversion is necessary and return -1, else
3534 return 0. */
3535
3536 int
3537 setup_coding_system (coding_system, coding) 4810 setup_coding_system (coding_system, coding)
3538 Lisp_Object coding_system; 4811 Lisp_Object coding_system;
3539 struct coding_system *coding; 4812 struct coding_system *coding;
3540 { 4813 {
3541 Lisp_Object coding_spec, coding_type, eol_type, plist; 4814 Lisp_Object attrs;
4815 Lisp_Object eol_type;
4816 Lisp_Object coding_type;
3542 Lisp_Object val; 4817 Lisp_Object val;
3543 4818
3544 /* At first, zero clear all members. */
3545 bzero (coding, sizeof (struct coding_system));
3546
3547 /* Initialize some fields required for all kinds of coding systems. */
3548 coding->symbol = coding_system;
3549 coding->heading_ascii = -1;
3550 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3551 coding->composing = COMPOSITION_DISABLED;
3552 coding->cmp_data = NULL;
3553
3554 if (NILP (coding_system)) 4819 if (NILP (coding_system))
3555 goto label_invalid_coding_system; 4820 coding_system = Qno_conversion;
3556 4821
3557 coding_spec = Fget (coding_system, Qcoding_system); 4822 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
3558 4823
3559 if (!VECTORP (coding_spec) 4824 attrs = CODING_ID_ATTRS (coding->id);
3560 || XVECTOR (coding_spec)->size != 5 4825 eol_type = CODING_ID_EOL_TYPE (coding->id);
3561 || !CONSP (XVECTOR (coding_spec)->contents[3])) 4826
3562 goto label_invalid_coding_system; 4827 coding->mode = 0;
3563 4828 coding->head_ascii = -1;
3564 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type); 4829 coding->common_flags
4830 = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4831 if (! NILP (CODING_ATTR_POST_READ (attrs)))
4832 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4833 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4834 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4835 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
4836 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4837
4838 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4839 coding->max_charset_id = SCHARS (val) - 1;
4840 coding->safe_charsets = (char *) SDATA (val);
4841 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4842
4843 coding_type = CODING_ATTR_TYPE (attrs);
4844 if (EQ (coding_type, Qundecided))
4845 {
4846 coding->detector = NULL;
4847 coding->decoder = decode_coding_raw_text;
4848 coding->encoder = encode_coding_raw_text;
4849 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4850 }
4851 else if (EQ (coding_type, Qiso_2022))
4852 {
4853 int i;
4854 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4855
4856 /* Invoke graphic register 0 to plane 0. */
4857 CODING_ISO_INVOCATION (coding, 0) = 0;
4858 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4859 CODING_ISO_INVOCATION (coding, 1)
4860 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4861 /* Setup the initial status of designation. */
4862 for (i = 0; i < 4; i++)
4863 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4864 /* Not single shifting initially. */
4865 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4866 /* Beginning of buffer should also be regarded as bol. */
4867 CODING_ISO_BOL (coding) = 1;
4868 coding->detector = detect_coding_iso_2022;
4869 coding->decoder = decode_coding_iso_2022;
4870 coding->encoder = encode_coding_iso_2022;
4871 if (flags & CODING_ISO_FLAG_SAFE)
4872 coding->mode |= CODING_MODE_SAFE_ENCODING;
4873 coding->common_flags
4874 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4875 | CODING_REQUIRE_FLUSHING_MASK);
4876 if (flags & CODING_ISO_FLAG_COMPOSITION)
4877 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4878 if (flags & CODING_ISO_FLAG_DESIGNATION)
4879 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
4880 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4881 {
4882 setup_iso_safe_charsets (attrs);
4883 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4884 coding->max_charset_id = SCHARS (val) - 1;
4885 coding->safe_charsets = (char *) SDATA (val);
4886 }
4887 CODING_ISO_FLAGS (coding) = flags;
4888 }
4889 else if (EQ (coding_type, Qcharset))
4890 {
4891 coding->detector = detect_coding_charset;
4892 coding->decoder = decode_coding_charset;
4893 coding->encoder = encode_coding_charset;
4894 coding->common_flags
4895 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4896 }
4897 else if (EQ (coding_type, Qutf_8))
4898 {
4899 coding->detector = detect_coding_utf_8;
4900 coding->decoder = decode_coding_utf_8;
4901 coding->encoder = encode_coding_utf_8;
4902 coding->common_flags
4903 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4904 }
4905 else if (EQ (coding_type, Qutf_16))
4906 {
4907 val = AREF (attrs, coding_attr_utf_16_bom);
4908 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4909 : EQ (val, Qt) ? utf_16_with_bom
4910 : utf_16_without_bom);
4911 val = AREF (attrs, coding_attr_utf_16_endian);
4912 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
4913 : utf_16_little_endian);
4914 CODING_UTF_16_SURROGATE (coding) = 0;
4915 coding->detector = detect_coding_utf_16;
4916 coding->decoder = decode_coding_utf_16;
4917 coding->encoder = encode_coding_utf_16;
4918 coding->common_flags
4919 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4920 if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
4921 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4922 }
4923 else if (EQ (coding_type, Qccl))
4924 {
4925 coding->detector = detect_coding_ccl;
4926 coding->decoder = decode_coding_ccl;
4927 coding->encoder = encode_coding_ccl;
4928 coding->common_flags
4929 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4930 | CODING_REQUIRE_FLUSHING_MASK);
4931 }
4932 else if (EQ (coding_type, Qemacs_mule))
4933 {
4934 coding->detector = detect_coding_emacs_mule;
4935 coding->decoder = decode_coding_emacs_mule;
4936 coding->encoder = encode_coding_emacs_mule;
4937 coding->common_flags
4938 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4939 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4940 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4941 {
4942 Lisp_Object tail, safe_charsets;
4943 int max_charset_id = 0;
4944
4945 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4946 tail = XCDR (tail))
4947 if (max_charset_id < XFASTINT (XCAR (tail)))
4948 max_charset_id = XFASTINT (XCAR (tail));
4949 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4950 make_number (255));
4951 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4952 tail = XCDR (tail))
4953 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
4954 coding->max_charset_id = max_charset_id;
4955 coding->safe_charsets = (char *) SDATA (safe_charsets);
4956 }
4957 }
4958 else if (EQ (coding_type, Qshift_jis))
4959 {
4960 coding->detector = detect_coding_sjis;
4961 coding->decoder = decode_coding_sjis;
4962 coding->encoder = encode_coding_sjis;
4963 coding->common_flags
4964 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4965 }
4966 else if (EQ (coding_type, Qbig5))
4967 {
4968 coding->detector = detect_coding_big5;
4969 coding->decoder = decode_coding_big5;
4970 coding->encoder = encode_coding_big5;
4971 coding->common_flags
4972 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4973 }
4974 else /* EQ (coding_type, Qraw_text) */
4975 {
4976 coding->detector = NULL;
4977 coding->decoder = decode_coding_raw_text;
4978 coding->encoder = encode_coding_raw_text;
4979 }
4980
4981 return;
4982 }
4983
4984 /* Return raw-text or one of its subsidiaries that has the same
4985 eol_type as CODING-SYSTEM.
   If CODING_SYSTEM is nil, or if its eol_type is a vector, plain
   raw-text is returned.  If CODING_SYSTEM is itself of type raw-text,
   it is returned unchanged.  Otherwise the result is the element of
   raw-text's own eol_type vector selected by CODING_SYSTEM's eol_type:
   index 0 for unix, 1 for dos, 2 for anything else.  */
4986
4987 Lisp_Object
4988 raw_text_coding_system (coding_system)
4989 Lisp_Object coding_system;
4990 {
4991 Lisp_Object spec, attrs;
4992 Lisp_Object eol_type, raw_text_eol_type;
4993
4994 if (NILP (coding_system))
4995 return Qraw_text;
4996 spec = CODING_SYSTEM_SPEC (coding_system);
4997 attrs = AREF (spec, 0); /* Element 0 of the spec holds the attributes. */
4998
4999 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5000 return coding_system; /* Already a raw-text coding system; nothing to map. */
5001
5002 eol_type = AREF (spec, 2); /* Element 2 of the spec holds the eol_type. */
3565 if (VECTORP (eol_type)) 5003 if (VECTORP (eol_type))
3566 { 5004 return Qraw_text; /* Vector eol_type: presumably the end-of-line format is not fixed yet -- TODO confirm. */
3567 coding->eol_type = CODING_EOL_UNDECIDED; 5005 spec = CODING_SYSTEM_SPEC (Qraw_text);
3568 coding->common_flags = CODING_REQUIRE_DETECTION_MASK; 5006 raw_text_eol_type = AREF (spec, 2); /* Indexed as a vector below; presumably raw-text's subsidiaries in unix, dos, mac order -- TODO confirm. */
3569 } 5007 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
3570 else if (XFASTINT (eol_type) == 1) 5008 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
3571 { 5009 : AREF (raw_text_eol_type, 2)); /* Any eol_type other than unix or dos selects index 2. */
3572 coding->eol_type = CODING_EOL_CRLF; 5010 }
3573 coding->common_flags 5011
3574 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK; 5012
3575 } 5013 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
3576 else if (XFASTINT (eol_type) == 2) 5014 does, return one of the subsidiaries that has the same eol-spec as
3577 { 5015 PARENT. Otherwise, return CODING_SYSTEM. */
3578 coding->eol_type = CODING_EOL_CR; 5016
3579 coding->common_flags 5017 Lisp_Object
3580 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK; 5018 coding_inherit_eol_type (coding_system, parent)
3581 } 5019 Lisp_Object coding_system, parent;
3582 else 5020 {
3583 coding->eol_type = CODING_EOL_LF; 5021 Lisp_Object spec, attrs, eol_type;
3584 5022
3585 coding_type = XVECTOR (coding_spec)->contents[0]; 5023 if (NILP (coding_system))
3586 /* Try short cut. */ 5024 coding_system = Qraw_text;
3587 if (SYMBOLP (coding_type)) 5025 spec = CODING_SYSTEM_SPEC (coding_system);
3588 { 5026 attrs = AREF (spec, 0);
3589 if (EQ (coding_type, Qt)) 5027 eol_type = AREF (spec, 2);
3590 { 5028 if (VECTORP (eol_type)
3591 coding->type = coding_type_undecided; 5029 && ! NILP (parent))
3592 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK; 5030 {
3593 } 5031 Lisp_Object parent_spec;
3594 else 5032 Lisp_Object parent_eol_type;
3595 coding->type = coding_type_no_conversion; 5033
3596 /* Initialize this member. Any thing other than 5034 parent_spec
3597 CODING_CATEGORY_IDX_UTF_16_BE and 5035 = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
3598 CODING_CATEGORY_IDX_UTF_16_LE are ok because they have 5036 parent_eol_type = AREF (parent_spec, 2);
3599 special treatment in detect_eol. */ 5037 if (EQ (parent_eol_type, Qunix))
3600 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE; 5038 coding_system = AREF (eol_type, 0);
3601 5039 else if (EQ (parent_eol_type, Qdos))
3602 return 0; 5040 coding_system = AREF (eol_type, 1);
3603 } 5041 else if (EQ (parent_eol_type, Qmac))
3604 5042 coding_system = AREF (eol_type, 2);
3605 /* Get values of coding system properties: 5043 }
3606 `post-read-conversion', `pre-write-conversion', 5044 return coding_system;
3607 `translation-table-for-decode', `translation-table-for-encode'. */
3608 plist = XVECTOR (coding_spec)->contents[3];
3609 /* Pre & post conversion functions should be disabled if
3610 inhibit_eol_conversion is nonzero. This is the case that a code
3611 conversion function is called while those functions are running. */
3612 if (! inhibit_pre_post_conversion)
3613 {
3614 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3615 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3616 }
3617 val = Fplist_get (plist, Qtranslation_table_for_decode);
3618 if (SYMBOLP (val))
3619 val = Fget (val, Qtranslation_table_for_decode);
3620 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3621 val = Fplist_get (plist, Qtranslation_table_for_encode);
3622 if (SYMBOLP (val))
3623 val = Fget (val, Qtranslation_table_for_encode);
3624 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3625 val = Fplist_get (plist, Qcoding_category);
3626 if (!NILP (val))
3627 {
3628 val = Fget (val, Qcoding_category_index);
3629 if (INTEGERP (val))
3630 coding->category_idx = XINT (val);
3631 else
3632 goto label_invalid_coding_system;
3633 }
3634 else
3635 goto label_invalid_coding_system;
3636
3637 /* If the coding system has non-nil `composition' property, enable
3638 composition handling. */
3639 val = Fplist_get (plist, Qcomposition);
3640 if (!NILP (val))
3641 coding->composing = COMPOSITION_NO;
3642
3643 switch (XFASTINT (coding_type))
3644 {
3645 case 0:
3646 coding->type = coding_type_emacs_mule;
3647 coding->common_flags
3648 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3649 if (!NILP (coding->post_read_conversion))
3650 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3651 if (!NILP (coding->pre_write_conversion))
3652 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3653 break;
3654
3655 case 1:
3656 coding->type = coding_type_sjis;
3657 coding->common_flags
3658 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3659 break;
3660
3661 case 2:
3662 coding->type = coding_type_iso2022;
3663 coding->common_flags
3664 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3665 {
3666 Lisp_Object val, temp;
3667 Lisp_Object *flags;
3668 int i, charset, reg_bits = 0;
3669
3670 val = XVECTOR (coding_spec)->contents[4];
3671
3672 if (!VECTORP (val) || XVECTOR (val)->size != 32)
3673 goto label_invalid_coding_system;
3674
3675 flags = XVECTOR (val)->contents;
3676 coding->flags
3677 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3678 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3679 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3680 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3681 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3682 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3683 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3684 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3685 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3686 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3687 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3688 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3689 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3690 );
3691
3692 /* Invoke graphic register 0 to plane 0. */
3693 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3694 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3695 CODING_SPEC_ISO_INVOCATION (coding, 1)
3696 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3697 /* Not single shifting at first. */
3698 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3699 /* Beginning of buffer should also be regarded as bol. */
3700 CODING_SPEC_ISO_BOL (coding) = 1;
3701
3702 for (charset = 0; charset <= MAX_CHARSET; charset++)
3703 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3704 val = Vcharset_revision_alist;
3705 while (CONSP (val))
3706 {
3707 charset = get_charset_id (Fcar_safe (XCAR (val)));
3708 if (charset >= 0
3709 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3710 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3711 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3712 val = XCDR (val);
3713 }
3714
3715 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3716 FLAGS[REG] can be one of below:
3717 integer CHARSET: CHARSET occupies register I,
3718 t: designate nothing to REG initially, but can be used
3719 by any charsets,
3720 list of integer, nil, or t: designate the first
3721 element (if integer) to REG initially, the remaining
3722 elements (if integer) is designated to REG on request,
3723 if an element is t, REG can be used by any charsets,
3724 nil: REG is never used. */
3725 for (charset = 0; charset <= MAX_CHARSET; charset++)
3726 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3727 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3728 for (i = 0; i < 4; i++)
3729 {
3730 if ((INTEGERP (flags[i])
3731 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3732 || (charset = get_charset_id (flags[i])) >= 0)
3733 {
3734 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3735 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3736 }
3737 else if (EQ (flags[i], Qt))
3738 {
3739 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3740 reg_bits |= 1 << i;
3741 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3742 }
3743 else if (CONSP (flags[i]))
3744 {
3745 Lisp_Object tail;
3746 tail = flags[i];
3747
3748 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3749 if ((INTEGERP (XCAR (tail))
3750 && (charset = XINT (XCAR (tail)),
3751 CHARSET_VALID_P (charset)))
3752 || (charset = get_charset_id (XCAR (tail))) >= 0)
3753 {
3754 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3755 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3756 }
3757 else
3758 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3759 tail = XCDR (tail);
3760 while (CONSP (tail))
3761 {
3762 if ((INTEGERP (XCAR (tail))
3763 && (charset = XINT (XCAR (tail)),
3764 CHARSET_VALID_P (charset)))
3765 || (charset = get_charset_id (XCAR (tail))) >= 0)
3766 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3767 = i;
3768 else if (EQ (XCAR (tail), Qt))
3769 reg_bits |= 1 << i;
3770 tail = XCDR (tail);
3771 }
3772 }
3773 else
3774 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3775
3776 CODING_SPEC_ISO_DESIGNATION (coding, i)
3777 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3778 }
3779
3780 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3781 {
3782 /* REG 1 can be used only by locking shift in 7-bit env. */
3783 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3784 reg_bits &= ~2;
3785 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3786 /* Without any shifting, only REG 0 and 1 can be used. */
3787 reg_bits &= 3;
3788 }
3789
3790 if (reg_bits)
3791 for (charset = 0; charset <= MAX_CHARSET; charset++)
3792 {
3793 if (CHARSET_DEFINED_P (charset)
3794 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3795 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3796 {
3797 /* There exist some default graphic registers to be
3798 used by CHARSET. */
3799
3800 /* We had better avoid designating a charset of
3801 CHARS96 to REG 0 as far as possible. */
3802 if (CHARSET_CHARS (charset) == 96)
3803 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3804 = (reg_bits & 2
3805 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3806 else
3807 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3808 = (reg_bits & 1
3809 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3810 }
3811 }
3812 }
3813 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3814 coding->spec.iso2022.last_invalid_designation_register = -1;
3815 break;
3816
3817 case 3:
3818 coding->type = coding_type_big5;
3819 coding->common_flags
3820 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3821 coding->flags
3822 = (NILP (XVECTOR (coding_spec)->contents[4])
3823 ? CODING_FLAG_BIG5_HKU
3824 : CODING_FLAG_BIG5_ETEN);
3825 break;
3826
3827 case 4:
3828 coding->type = coding_type_ccl;
3829 coding->common_flags
3830 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3831 {
3832 val = XVECTOR (coding_spec)->contents[4];
3833 if (! CONSP (val)
3834 || setup_ccl_program (&(coding->spec.ccl.decoder),
3835 XCAR (val)) < 0
3836 || setup_ccl_program (&(coding->spec.ccl.encoder),
3837 XCDR (val)) < 0)
3838 goto label_invalid_coding_system;
3839
3840 bzero (coding->spec.ccl.valid_codes, 256);
3841 val = Fplist_get (plist, Qvalid_codes);
3842 if (CONSP (val))
3843 {
3844 Lisp_Object this;
3845
3846 for (; CONSP (val); val = XCDR (val))
3847 {
3848 this = XCAR (val);
3849 if (INTEGERP (this)
3850 && XINT (this) >= 0 && XINT (this) < 256)
3851 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3852 else if (CONSP (this)
3853 && INTEGERP (XCAR (this))
3854 && INTEGERP (XCDR (this)))
3855 {
3856 int start = XINT (XCAR (this));
3857 int end = XINT (XCDR (this));
3858
3859 if (start >= 0 && start <= end && end < 256)
3860 while (start <= end)
3861 coding->spec.ccl.valid_codes[start++] = 1;
3862 }
3863 }
3864 }
3865 }
3866 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3867 coding->spec.ccl.cr_carryover = 0;
3868 coding->spec.ccl.eight_bit_carryover[0] = 0;
3869 break;
3870
3871 case 5:
3872 coding->type = coding_type_raw_text;
3873 break;
3874
3875 default:
3876 goto label_invalid_coding_system;
3877 }
3878 return 0;
3879
3880 label_invalid_coding_system:
3881 coding->type = coding_type_no_conversion;
3882 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3883 coding->common_flags = 0;
3884 coding->eol_type = CODING_EOL_LF;
3885 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3886 return -1;
3887 }
3888
3889 /* Free memory blocks allocated for storing composition information. */
3890
3891 void
3892 coding_free_composition_data (coding)
3893 struct coding_system *coding;
3894 {
3895 struct composition_data *cmp_data = coding->cmp_data, *next;
3896
3897 if (!cmp_data)
3898 return;
3899 /* Memory blocks are chained. At first, rewind to the first, then,
3900 free blocks one by one. */
3901 while (cmp_data->prev)
3902 cmp_data = cmp_data->prev;
3903 while (cmp_data)
3904 {
3905 next = cmp_data->next;
3906 xfree (cmp_data);
3907 cmp_data = next;
3908 }
3909 coding->cmp_data = NULL;
3910 }
3911
3912 /* Set `char_offset' member of all memory blocks pointed by
3913 coding->cmp_data to POS. */
3914
3915 void
3916 coding_adjust_composition_offset (coding, pos)
3917 struct coding_system *coding;
3918 int pos;
3919 {
3920 struct composition_data *cmp_data;
3921
3922 for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3923 cmp_data->char_offset = pos;
3924 }
3925
3926 /* Setup raw-text or one of its subsidiaries in the structure
3927 coding_system CODING according to the already setup value eol_type
3928 in CODING. CODING should be setup for some coding system in
3929 advance. */
3930
3931 void
3932 setup_raw_text_coding_system (coding)
3933 struct coding_system *coding;
3934 {
3935 if (coding->type != coding_type_raw_text)
3936 {
3937 coding->symbol = Qraw_text;
3938 coding->type = coding_type_raw_text;
3939 if (coding->eol_type != CODING_EOL_UNDECIDED)
3940 {
3941 Lisp_Object subsidiaries;
3942 subsidiaries = Fget (Qraw_text, Qeol_type);
3943
3944 if (VECTORP (subsidiaries)
3945 && XVECTOR (subsidiaries)->size == 3)
3946 coding->symbol
3947 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3948 }
3949 setup_coding_system (coding->symbol, coding);
3950 }
3951 return;
3952 } 5045 }
3953 5046
3954 /* Emacs has a mechanism to automatically detect a coding system if it 5047 /* Emacs has a mechanism to automatically detect a coding system if it
3955 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But, 5048 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3956 it's impossible to distinguish some coding systems accurately 5049 it's impossible to distinguish some coding systems accurately
3999 symbol) `japanese-iso-8bit' by default. 5092 symbol) `japanese-iso-8bit' by default.
4000 5093
4001 o coding-category-iso-7-else 5094 o coding-category-iso-7-else
4002 5095
4003 The category for a coding system which has the same code range 5096 The category for a coding system which has the same code range
4004 as ISO2022 of 7-bit environment but uses locking shift or 5097 as ISO2022 of 7-bit environment but uses locking shift or
4005 single shift functions. Assigned the coding-system (Lisp 5098 single shift functions. Assigned the coding-system (Lisp
4006 symbol) `iso-2022-7bit-lock' by default. 5099 symbol) `iso-2022-7bit-lock' by default.
4007 5100
4008 o coding-category-iso-8-else 5101 o coding-category-iso-8-else
4009 5102
4010 The category for a coding system which has the same code range 5103 The category for a coding system which has the same code range
4011 as ISO2022 of 8-bit environment but uses locking shift or 5104 as ISO2022 of 8-bit environment but uses locking shift or
4012 single shift functions. Assigned the coding-system (Lisp 5105 single shift functions. Assigned the coding-system (Lisp
4013 symbol) `iso-2022-8bit-ss2' by default. 5106 symbol) `iso-2022-8bit-ss2' by default.
4014 5107
4015 o coding-category-big5 5108 o coding-category-big5
4016 5109
4049 The category for a coding system not categorized in any of the 5142 The category for a coding system not categorized in any of the
4050 above. Assigned the coding-system (Lisp symbol) 5143 above. Assigned the coding-system (Lisp symbol)
4051 `no-conversion' by default. 5144 `no-conversion' by default.
4052 5145
4053 Each of them is a Lisp symbol and the value is an actual 5146 Each of them is a Lisp symbol and the value is an actual
4054 `coding-system' (this is also a Lisp symbol) assigned by a user. 5147 `coding-system' (this is also a Lisp symbol) assigned by a user.
4055 What Emacs does actually is to detect a category of coding system. 5148 What Emacs does actually is to detect a category of coding system.
4056 Then, it uses a `coding-system' assigned to it. If Emacs can't 5149 Then, it uses a `coding-system' assigned to it. If Emacs can't
4057 decide a single possible category, it selects a category of the 5150 decide only one possible category, it selects a category of the
4058 highest priority. Priorities of categories are also specified by a 5151 highest priority. Priorities of categories are also specified by a
4059 user in a Lisp variable `coding-category-list'. 5152 user in a Lisp variable `coding-category-list'.
4060 5153
4061 */ 5154 */
4062 5155
4063 static 5156 #define EOL_SEEN_NONE 0
4064 int ascii_skip_code[256]; 5157 #define EOL_SEEN_LF 1
4065 5158 #define EOL_SEEN_CR 2
4066 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded. 5159 #define EOL_SEEN_CRLF 4
4067 If it detects possible coding systems, return an integer in which 5160
4068 appropriate flag bits are set. Flag bits are defined by macros 5161 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4069 CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL, 5162 SOURCE is encoded. If CATEGORY is one of
4070 it should point the table `coding_priorities'. In that case, only 5163 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
4071 the flag bit for a coding system of the highest priority is set in 5164 two-byte, else they are encoded by one-byte.
4072 the returned value. If MULTIBYTEP is nonzero, 8-bit codes of the 5165
4073 range 0x80..0x9F are in multibyte form. 5166 Return one of EOL_SEEN_XXX. */
4074 5167
4075 How many ASCII characters are at the head is returned as *SKIP. */ 5168 #define MAX_EOL_CHECK_COUNT 3
4076 5169
4077 static int 5170 static int
4078 detect_coding_mask (source, src_bytes, priorities, skip, multibytep) 5171 detect_eol (source, src_bytes, category)
4079 unsigned char *source; 5172 unsigned char *source;
4080 int src_bytes, *priorities, *skip; 5173 EMACS_INT src_bytes;
4081 int multibytep; 5174 enum coding_category category;
4082 {
4083 register unsigned char c;
4084 unsigned char *src = source, *src_end = source + src_bytes;
4085 unsigned int mask, utf16_examined_p, iso2022_examined_p;
4086 int i;
4087
4088 /* At first, skip all ASCII characters and control characters except
4089 for three ISO2022 specific control characters. */
4090 ascii_skip_code[ISO_CODE_SO] = 0;
4091 ascii_skip_code[ISO_CODE_SI] = 0;
4092 ascii_skip_code[ISO_CODE_ESC] = 0;
4093
4094 label_loop_detect_coding:
4095 while (src < src_end && ascii_skip_code[*src]) src++;
4096 *skip = src - source;
4097
4098 if (src >= src_end)
4099 /* We found nothing other than ASCII. There's nothing to do. */
4100 return 0;
4101
4102 c = *src;
4103 /* The text seems to be encoded in some multilingual coding system.
4104 Now, try to find in which coding system the text is encoded. */
4105 if (c < 0x80)
4106 {
4107 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
4108 /* C is an ISO2022 specific control code of C0. */
4109 mask = detect_coding_iso2022 (src, src_end, multibytep);
4110 if (mask == 0)
4111 {
4112 /* No valid ISO2022 code follows C. Try again. */
4113 src++;
4114 if (c == ISO_CODE_ESC)
4115 ascii_skip_code[ISO_CODE_ESC] = 1;
4116 else
4117 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
4118 goto label_loop_detect_coding;
4119 }
4120 if (priorities)
4121 {
4122 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4123 {
4124 if (mask & priorities[i])
4125 return priorities[i];
4126 }
4127 return CODING_CATEGORY_MASK_RAW_TEXT;
4128 }
4129 }
4130 else
4131 {
4132 int try;
4133
4134 if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
4135 c = src[1] - 0x20;
4136
4137 if (c < 0xA0)
4138 {
4139 /* C is the first byte of SJIS character code,
4140 or a leading-code of Emacs' internal format (emacs-mule),
4141 or the first byte of UTF-16. */
4142 try = (CODING_CATEGORY_MASK_SJIS
4143 | CODING_CATEGORY_MASK_EMACS_MULE
4144 | CODING_CATEGORY_MASK_UTF_16_BE
4145 | CODING_CATEGORY_MASK_UTF_16_LE);
4146
4147 /* Or, if C is a special latin extra code,
4148 or is an ISO2022 specific control code of C1 (SS2 or SS3),
4149 or is an ISO2022 control-sequence-introducer (CSI),
4150 we should also consider the possibility of ISO2022 codings. */
4151 if ((VECTORP (Vlatin_extra_code_table)
4152 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4153 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4154 || (c == ISO_CODE_CSI
4155 && (src < src_end
4156 && (*src == ']'
4157 || ((*src == '0' || *src == '1' || *src == '2')
4158 && src + 1 < src_end
4159 && src[1] == ']')))))
4160 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4161 | CODING_CATEGORY_MASK_ISO_8BIT);
4162 }
4163 else
4164 /* C is a character of ISO2022 in graphic plane right,
4165 or a SJIS's 1-byte character code (i.e. JISX0201),
4166 or the first byte of BIG5's 2-byte code,
4167 or the first byte of UTF-8/16. */
4168 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4169 | CODING_CATEGORY_MASK_ISO_8BIT
4170 | CODING_CATEGORY_MASK_SJIS
4171 | CODING_CATEGORY_MASK_BIG5
4172 | CODING_CATEGORY_MASK_UTF_8
4173 | CODING_CATEGORY_MASK_UTF_16_BE
4174 | CODING_CATEGORY_MASK_UTF_16_LE);
4175
4176 /* Or, we may have to consider the possibility of CCL. */
4177 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4178 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4179 ->spec.ccl.valid_codes)[c])
4180 try |= CODING_CATEGORY_MASK_CCL;
4181
4182 mask = 0;
4183 utf16_examined_p = iso2022_examined_p = 0;
4184 if (priorities)
4185 {
4186 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4187 {
4188 if (!iso2022_examined_p
4189 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4190 {
4191 mask |= detect_coding_iso2022 (src, src_end, multibytep);
4192 iso2022_examined_p = 1;
4193 }
4194 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4195 mask |= detect_coding_sjis (src, src_end, multibytep);
4196 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4197 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4198 else if (!utf16_examined_p
4199 && (priorities[i] & try &
4200 CODING_CATEGORY_MASK_UTF_16_BE_LE))
4201 {
4202 mask |= detect_coding_utf_16 (src, src_end, multibytep);
4203 utf16_examined_p = 1;
4204 }
4205 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4206 mask |= detect_coding_big5 (src, src_end, multibytep);
4207 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4208 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4209 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4210 mask |= detect_coding_ccl (src, src_end, multibytep);
4211 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4212 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4213 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4214 mask |= CODING_CATEGORY_MASK_BINARY;
4215 if (mask & priorities[i])
4216 return priorities[i];
4217 }
4218 return CODING_CATEGORY_MASK_RAW_TEXT;
4219 }
4220 if (try & CODING_CATEGORY_MASK_ISO)
4221 mask |= detect_coding_iso2022 (src, src_end, multibytep);
4222 if (try & CODING_CATEGORY_MASK_SJIS)
4223 mask |= detect_coding_sjis (src, src_end, multibytep);
4224 if (try & CODING_CATEGORY_MASK_BIG5)
4225 mask |= detect_coding_big5 (src, src_end, multibytep);
4226 if (try & CODING_CATEGORY_MASK_UTF_8)
4227 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4228 if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4229 mask |= detect_coding_utf_16 (src, src_end, multibytep);
4230 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4231 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4232 if (try & CODING_CATEGORY_MASK_CCL)
4233 mask |= detect_coding_ccl (src, src_end, multibytep);
4234 }
4235 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4236 }
4237
4238 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4239 The information of the detected coding system is set in CODING. */
4240
4241 void
4242 detect_coding (coding, src, src_bytes)
4243 struct coding_system *coding;
4244 const unsigned char *src;
4245 int src_bytes;
4246 {
4247 unsigned int idx;
4248 int skip, mask;
4249 Lisp_Object val;
4250
4251 val = Vcoding_category_list;
4252 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4253 coding->src_multibyte);
4254 coding->heading_ascii = skip;
4255
4256 if (!mask) return;
4257
4258 /* We found a single coding system of the highest priority in MASK. */
4259 idx = 0;
4260 while (mask && ! (mask & 1)) mask >>= 1, idx++;
4261 if (! mask)
4262 idx = CODING_CATEGORY_IDX_RAW_TEXT;
4263
4264 val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]);
4265
4266 if (coding->eol_type != CODING_EOL_UNDECIDED)
4267 {
4268 Lisp_Object tmp;
4269
4270 tmp = Fget (val, Qeol_type);
4271 if (VECTORP (tmp))
4272 val = XVECTOR (tmp)->contents[coding->eol_type];
4273 }
4274
4275 /* Setup this new coding system while preserving some slots. */
4276 {
4277 int src_multibyte = coding->src_multibyte;
4278 int dst_multibyte = coding->dst_multibyte;
4279
4280 setup_coding_system (val, coding);
4281 coding->src_multibyte = src_multibyte;
4282 coding->dst_multibyte = dst_multibyte;
4283 coding->heading_ascii = skip;
4284 }
4285 }
4286
4287 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4288 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4289 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
4290
4291 How many non-eol characters are at the head is returned as *SKIP. */
4292
4293 #define MAX_EOL_CHECK_COUNT 3
4294
4295 static int
4296 detect_eol_type (source, src_bytes, skip)
4297 unsigned char *source;
4298 int src_bytes, *skip;
4299 { 5175 {
4300 unsigned char *src = source, *src_end = src + src_bytes; 5176 unsigned char *src = source, *src_end = src + src_bytes;
4301 unsigned char c; 5177 unsigned char c;
4302 int total = 0; /* How many end-of-lines are found so far. */ 5178 int total = 0;
4303 int eol_type = CODING_EOL_UNDECIDED; 5179 int eol_seen = EOL_SEEN_NONE;
4304 int this_eol_type; 5180
4305 5181 if ((1 << category) & CATEGORY_MASK_UTF_16)
4306 *skip = 0; 5182 {
4307 5183 int msb, lsb;
4308 while (src < src_end && total < MAX_EOL_CHECK_COUNT) 5184
4309 { 5185 msb = category == (coding_category_utf_16_le
4310 c = *src++; 5186 | coding_category_utf_16_le_nosig);
4311 if (c == '\n' || c == '\r') 5187 lsb = 1 - msb;
4312 { 5188
4313 if (*skip == 0) 5189 while (src + 1 < src_end)
4314 *skip = src - 1 - source; 5190 {
4315 total++; 5191 c = src[lsb];
4316 if (c == '\n') 5192 if (src[msb] == 0 && (c == '\n' || c == '\r'))
4317 this_eol_type = CODING_EOL_LF; 5193 {
4318 else if (src >= src_end || *src != '\n') 5194 int this_eol;
4319 this_eol_type = CODING_EOL_CR; 5195
5196 if (c == '\n')
5197 this_eol = EOL_SEEN_LF;
5198 else if (src + 3 >= src_end
5199 || src[msb + 2] != 0
5200 || src[lsb + 2] != '\n')
5201 this_eol = EOL_SEEN_CR;
5202 else
5203 this_eol = EOL_SEEN_CRLF;
5204
5205 if (eol_seen == EOL_SEEN_NONE)
5206 /* This is the first end-of-line. */
5207 eol_seen = this_eol;
5208 else if (eol_seen != this_eol)
5209 {
5210 /* The found type is different from what found before. */
5211 eol_seen = EOL_SEEN_LF;
5212 break;
5213 }
5214 if (++total == MAX_EOL_CHECK_COUNT)
5215 break;
5216 }
5217 src += 2;
5218 }
5219 }
5220 else
5221 {
5222 while (src < src_end)
5223 {
5224 c = *src++;
5225 if (c == '\n' || c == '\r')
5226 {
5227 int this_eol;
5228
5229 if (c == '\n')
5230 this_eol = EOL_SEEN_LF;
5231 else if (src >= src_end || *src != '\n')
5232 this_eol = EOL_SEEN_CR;
5233 else
5234 this_eol = EOL_SEEN_CRLF, src++;
5235
5236 if (eol_seen == EOL_SEEN_NONE)
5237 /* This is the first end-of-line. */
5238 eol_seen = this_eol;
5239 else if (eol_seen != this_eol)
5240 {
5241 /* The found type is different from what found before. */
5242 eol_seen = EOL_SEEN_LF;
5243 break;
5244 }
5245 if (++total == MAX_EOL_CHECK_COUNT)
5246 break;
5247 }
5248 }
5249 }
5250 return eol_seen;
5251 }
5252
5253
5254 static void
5255 adjust_coding_eol_type (coding, eol_seen)
5256 struct coding_system *coding;
5257 int eol_seen;
5258 {
5259 Lisp_Object eol_type;
5260
5261 eol_type = CODING_ID_EOL_TYPE (coding->id);
5262 if (eol_seen & EOL_SEEN_LF)
5263 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5264 else if (eol_seen & EOL_SEEN_CRLF)
5265 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5266 else if (eol_seen & EOL_SEEN_CR)
5267 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5268 }
5269
5270 /* Detect how a text specified in CODING is encoded. If a coding
5271 system is detected, update fields of CODING by the detected coding
5272 system. */
5273
5274 void
5275 detect_coding (coding)
5276 struct coding_system *coding;
5277 {
5278 const unsigned char *src, *src_end;
5279 Lisp_Object attrs, coding_type;
5280
5281 coding->consumed = coding->consumed_char = 0;
5282 coding->produced = coding->produced_char = 0;
5283 coding_set_source (coding);
5284
5285 src_end = coding->source + coding->src_bytes;
5286
5287 /* If we have not yet decided the text encoding type, detect it
5288 now. */
5289 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5290 {
5291 int c, i;
5292
5293 for (src = coding->source; src < src_end; src++)
5294 {
5295 c = *src;
5296 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
5297 || c == ISO_CODE_SI
5298 || c == ISO_CODE_SO)))
5299 break;
5300 }
5301 coding->head_ascii = src - (coding->source + coding->consumed);
5302
5303 if (coding->head_ascii < coding->src_bytes)
5304 {
5305 struct coding_detection_info detect_info;
5306 enum coding_category category;
5307 struct coding_system *this;
5308
5309 detect_info.checked = detect_info.found = detect_info.rejected = 0;
5310 for (i = 0; i < coding_category_raw_text; i++)
5311 {
5312 category = coding_priorities[i];
5313 this = coding_categories + category;
5314 if (this->id < 0)
5315 {
5316 /* No coding system of this category is defined. */
5317 detect_info.rejected |= (1 << category);
5318 }
5319 else if (category >= coding_category_raw_text)
5320 continue;
5321 else if (detect_info.checked & (1 << category))
5322 {
5323 if (detect_info.found & (1 << category))
5324 break;
5325 }
5326 else if ((*(this->detector)) (coding, &detect_info)
5327 && detect_info.found & (1 << category))
5328 break;
5329 }
5330 if (i < coding_category_raw_text)
5331 setup_coding_system (CODING_ID_NAME (this->id), coding);
5332 else if (detect_info.rejected == CATEGORY_MASK_ANY)
5333 setup_coding_system (Qraw_text, coding);
5334 else if (detect_info.rejected)
5335 for (i = 0; i < coding_category_raw_text; i++)
5336 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5337 {
5338 this = coding_categories + coding_priorities[i];
5339 setup_coding_system (CODING_ID_NAME (this->id), coding);
5340 break;
5341 }
5342 }
5343 }
5344 else if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qutf_16))
5345 {
5346 Lisp_Object coding_systems;
5347 struct coding_detection_info detect_info;
5348
5349 coding_systems
5350 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5351 detect_info.found = detect_info.rejected = 0;
5352 if (CONSP (coding_systems)
5353 && detect_coding_utf_16 (coding, &detect_info)
5354 && (detect_info.found & (CATEGORY_MASK_UTF_16_LE
5355 | CATEGORY_MASK_UTF_16_BE)))
5356 {
5357 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5358 setup_coding_system (XCAR (coding_systems), coding);
4320 else 5359 else
4321 this_eol_type = CODING_EOL_CRLF, src++; 5360 setup_coding_system (XCDR (coding_systems), coding);
4322 5361 }
4323 if (eol_type == CODING_EOL_UNDECIDED) 5362 }
4324 /* This is the first end-of-line. */ 5363
4325 eol_type = this_eol_type; 5364 attrs = CODING_ID_ATTRS (coding->id);
4326 else if (eol_type != this_eol_type) 5365 coding_type = CODING_ATTR_TYPE (attrs);
5366
5367 /* If we have not yet decided the EOL type, detect it now. But, the
5368 detection is impossible for a CCL based coding system, in which
5369 case, we detct the EOL type after decoding. */
5370 if (VECTORP (CODING_ID_EOL_TYPE (coding->id))
5371 && ! EQ (coding_type, Qccl))
5372 {
5373 int eol_seen = detect_eol (coding->source, coding->src_bytes,
5374 XINT (CODING_ATTR_CATEGORY (attrs)));
5375
5376 if (eol_seen != EOL_SEEN_NONE)
5377 adjust_coding_eol_type (coding, eol_seen);
5378 }
5379 }
5380
5381
5382 static void
5383 decode_eol (coding)
5384 struct coding_system *coding;
5385 {
5386 if (VECTORP (CODING_ID_EOL_TYPE (coding->id)))
5387 {
5388 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5389 unsigned char *pend = p + coding->produced;
5390 int eol_seen = EOL_SEEN_NONE;
5391
5392 for (; p < pend; p++)
5393 {
5394 if (*p == '\n')
5395 eol_seen |= EOL_SEEN_LF;
5396 else if (*p == '\r')
4327 { 5397 {
4328 /* The found type is different from what found before. */ 5398 if (p + 1 < pend && *(p + 1) == '\n')
4329 eol_type = CODING_EOL_INCONSISTENT; 5399 {
4330 break; 5400 eol_seen |= EOL_SEEN_CRLF;
5401 p++;
5402 }
5403 else
5404 eol_seen |= EOL_SEEN_CR;
4331 } 5405 }
4332 } 5406 }
4333 } 5407 if (eol_seen != EOL_SEEN_NONE)
4334 5408 adjust_coding_eol_type (coding, eol_seen);
4335 if (*skip == 0) 5409 }
4336 *skip = src_end - source; 5410
4337 return eol_type; 5411 if (EQ (CODING_ID_EOL_TYPE (coding->id), Qmac))
4338 } 5412 {
4339 5413 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
4340 /* Like detect_eol_type, but detect EOL type in 2-octet 5414 unsigned char *pend = p + coding->produced;
4341 big-endian/little-endian format for coding systems utf-16-be and 5415
4342 utf-16-le. */ 5416 for (; p < pend; p++)
5417 if (*p == '\r')
5418 *p = '\n';
5419 }
5420 else if (EQ (CODING_ID_EOL_TYPE (coding->id), Qdos))
5421 {
5422 unsigned char *p, *pbeg, *pend;
5423 Lisp_Object undo_list;
5424
5425 move_gap_both (coding->dst_pos + coding->produced_char,
5426 coding->dst_pos_byte + coding->produced);
5427 undo_list = current_buffer->undo_list;
5428 current_buffer->undo_list = Qt;
5429 del_range_2 (coding->dst_pos, coding->dst_pos_byte, GPT, GPT_BYTE, 0);
5430 current_buffer->undo_list = undo_list;
5431 pbeg = GPT_ADDR;
5432 pend = pbeg + coding->produced;
5433
5434 for (p = pend - 1; p >= pbeg; p--)
5435 if (*p == '\r')
5436 {
5437 safe_bcopy ((char *) (p + 1), (char *) p, pend - p - 1);
5438 pend--;
5439 }
5440 coding->produced_char -= coding->produced - (pend - pbeg);
5441 coding->produced = pend - pbeg;
5442 insert_from_gap (coding->produced_char, coding->produced);
5443 }
5444 }
5445
5446 static void
5447 translate_chars (coding, table)
5448 struct coding_system *coding;
5449 Lisp_Object table;
5450 {
5451 int *charbuf = coding->charbuf;
5452 int *charbuf_end = charbuf + coding->charbuf_used;
5453 int c;
5454
5455 if (coding->chars_at_source)
5456 return;
5457
5458 while (charbuf < charbuf_end)
5459 {
5460 c = *charbuf;
5461 if (c < 0)
5462 charbuf += c;
5463 else
5464 *charbuf++ = translate_char (table, c);
5465 }
5466 }
4343 5467
4344 static int 5468 static int
4345 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p) 5469 produce_chars (coding)
4346 unsigned char *source; 5470 struct coding_system *coding;
4347 int src_bytes, *skip, big_endian_p; 5471 {
4348 { 5472 unsigned char *dst = coding->destination + coding->produced;
4349 unsigned char *src = source, *src_end = src + src_bytes; 5473 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4350 unsigned int c1, c2; 5474 int produced;
4351 int total = 0; /* How many end-of-lines are found so far. */ 5475 int produced_chars = 0;
4352 int eol_type = CODING_EOL_UNDECIDED; 5476
4353 int this_eol_type; 5477 if (! coding->chars_at_source)
4354 int msb, lsb; 5478 {
4355 5479 /* Characters are in coding->charbuf. */
4356 if (big_endian_p) 5480 int *buf = coding->charbuf;
4357 msb = 0, lsb = 1; 5481 int *buf_end = buf + coding->charbuf_used;
4358 else 5482 unsigned char *adjusted_dst_end;
4359 msb = 1, lsb = 0; 5483
4360 5484 if (BUFFERP (coding->src_object)
4361 *skip = 0; 5485 && EQ (coding->src_object, coding->dst_object))
4362 5486 dst_end = ((unsigned char *) coding->source) + coding->consumed;
4363 while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT) 5487 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
4364 { 5488
4365 c1 = (src[msb] << 8) | (src[lsb]); 5489 while (buf < buf_end)
4366 src += 2; 5490 {
4367 5491 int c = *buf++;
4368 if (c1 == '\n' || c1 == '\r') 5492
4369 { 5493 if (dst >= adjusted_dst_end)
4370 if (*skip == 0)
4371 *skip = src - 2 - source;
4372 total++;
4373 if (c1 == '\n')
4374 { 5494 {
4375 this_eol_type = CODING_EOL_LF; 5495 dst = alloc_destination (coding,
5496 buf_end - buf + MAX_MULTIBYTE_LENGTH,
5497 dst);
5498 dst_end = coding->destination + coding->dst_bytes;
5499 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5500 }
5501 if (c >= 0)
5502 {
5503 if (coding->dst_multibyte
5504 || ! CHAR_BYTE8_P (c))
5505 CHAR_STRING_ADVANCE (c, dst);
5506 else
5507 *dst++ = CHAR_TO_BYTE8 (c);
5508 produced_chars++;
4376 } 5509 }
4377 else 5510 else
5511 /* This is an annotation datum. (-C) is the length of
5512 it. */
5513 buf += -c - 1;
5514 }
5515 }
5516 else
5517 {
5518 const unsigned char *src = coding->source;
5519 const unsigned char *src_end = src + coding->src_bytes;
5520 Lisp_Object eol_type;
5521
5522 eol_type = CODING_ID_EOL_TYPE (coding->id);
5523
5524 if (coding->src_multibyte != coding->dst_multibyte)
5525 {
5526 if (coding->src_multibyte)
4378 { 5527 {
4379 if ((src + 1) >= src_end) 5528 int multibytep = 1;
5529 int consumed_chars;
5530
5531 while (1)
4380 { 5532 {
4381 this_eol_type = CODING_EOL_CR; 5533 const unsigned char *src_base = src;
5534 int c;
5535
5536 ONE_MORE_BYTE (c);
5537 if (c == '\r')
5538 {
5539 if (EQ (eol_type, Qdos))
5540 {
5541 if (src == src_end)
5542 {
5543 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
5544 goto no_more_source;
5545 }
5546 if (*src == '\n')
5547 c = *src++;
5548 }
5549 else if (EQ (eol_type, Qmac))
5550 c = '\n';
5551 }
5552 if (dst == dst_end)
5553 {
5554 coding->consumed = src - coding->source;
5555
5556 if (EQ (coding->src_object, coding->dst_object))
5557 dst_end = (unsigned char *) src;
5558 if (dst == dst_end)
5559 {
5560 dst = alloc_destination (coding, src_end - src + 1,
5561 dst);
5562 dst_end = coding->destination + coding->dst_bytes;
5563 coding_set_source (coding);
5564 src = coding->source + coding->consumed;
5565 src_end = coding->source + coding->src_bytes;
5566 }
5567 }
5568 *dst++ = c;
5569 produced_chars++;
5570 }
5571 no_more_source:
5572 ;
5573 }
5574 else
5575 while (src < src_end)
5576 {
5577 int multibytep = 1;
5578 int c = *src++;
5579
5580 if (c == '\r')
5581 {
5582 if (EQ (eol_type, Qdos))
5583 {
5584 if (src < src_end
5585 && *src == '\n')
5586 c = *src++;
5587 }
5588 else if (EQ (eol_type, Qmac))
5589 c = '\n';
5590 }
5591 if (dst >= dst_end - 1)
5592 {
5593 coding->consumed = src - coding->source;
5594
5595 if (EQ (coding->src_object, coding->dst_object))
5596 dst_end = (unsigned char *) src;
5597 if (dst >= dst_end - 1)
5598 {
5599 dst = alloc_destination (coding, src_end - src + 2,
5600 dst);
5601 dst_end = coding->destination + coding->dst_bytes;
5602 coding_set_source (coding);
5603 src = coding->source + coding->consumed;
5604 src_end = coding->source + coding->src_bytes;
5605 }
5606 }
5607 EMIT_ONE_BYTE (c);
5608 }
5609 }
5610 else
5611 {
5612 if (!EQ (coding->src_object, coding->dst_object))
5613 {
5614 int require = coding->src_bytes - coding->dst_bytes;
5615
5616 if (require > 0)
5617 {
5618 EMACS_INT offset = src - coding->source;
5619
5620 dst = alloc_destination (coding, require, dst);
5621 coding_set_source (coding);
5622 src = coding->source + offset;
5623 src_end = coding->source + coding->src_bytes;
5624 }
5625 }
5626 produced_chars = coding->src_chars;
5627 while (src < src_end)
5628 {
5629 int c = *src++;
5630
5631 if (c == '\r')
5632 {
5633 if (EQ (eol_type, Qdos))
5634 {
5635 if (src < src_end
5636 && *src == '\n')
5637 c = *src++;
5638 produced_chars--;
5639 }
5640 else if (EQ (eol_type, Qmac))
5641 c = '\n';
5642 }
5643 *dst++ = c;
5644 }
5645 }
5646 coding->consumed = coding->src_bytes;
5647 coding->consumed_char = coding->src_chars;
5648 }
5649
5650 produced = dst - (coding->destination + coding->produced);
5651 if (BUFFERP (coding->dst_object))
5652 insert_from_gap (produced_chars, produced);
5653 coding->produced += produced;
5654 coding->produced_char += produced_chars;
5655 return produced_chars;
5656 }
5657
5658 /* Compose text in CODING->object according to the annotation data at
5659 CHARBUF. CHARBUF is an array:
5660 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5661 */
5662
5663 static INLINE void
5664 produce_composition (coding, charbuf)
5665 struct coding_system *coding;
5666 int *charbuf;
5667 {
5668 int len;
5669 EMACS_INT from, to;
5670 enum composition_method method;
5671 Lisp_Object components;
5672
5673 len = -charbuf[0];
5674 from = coding->dst_pos + charbuf[2];
5675 to = coding->dst_pos + charbuf[3];
5676 method = (enum composition_method) (charbuf[4]);
5677
5678 if (method == COMPOSITION_RELATIVE)
5679 components = Qnil;
5680 else
5681 {
5682 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5683 int i;
5684
5685 len -= 5;
5686 charbuf += 5;
5687 for (i = 0; i < len; i++)
5688 args[i] = make_number (charbuf[i]);
5689 components = (method == COMPOSITION_WITH_ALTCHARS
5690 ? Fstring (len, args) : Fvector (len, args));
5691 }
5692 compose_text (from, to, components, Qnil, coding->dst_object);
5693 }
5694
5695
5696 /* Put `charset' property on text in CODING->object according to
5697 the annotation data at CHARBUF. CHARBUF is an array:
5698 [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5699 */
5700
5701 static INLINE void
5702 produce_charset (coding, charbuf)
5703 struct coding_system *coding;
5704 int *charbuf;
5705 {
5706 EMACS_INT from = coding->dst_pos + charbuf[2];
5707 EMACS_INT to = coding->dst_pos + charbuf[3];
5708 struct charset *charset = CHARSET_FROM_ID (charbuf[4]);
5709
5710 Fput_text_property (make_number (from), make_number (to),
5711 Qcharset, CHARSET_NAME (charset),
5712 coding->dst_object);
5713 }
5714
5715
/* Number of `int' elements first requested for the work area.  */
#define CHARBUF_SIZE 0x4000

/* Allocate the character work area of CODING (a pointer to struct
   coding_system) with alloca, storing the address in CODING->charbuf
   and the element count in CODING->charbuf_size.  The request starts
   at CHARBUF_SIZE elements and is halved while alloca yields a null
   pointer and the size is still above 1024 elements.

   If no area could be obtained, CODING->result is set to
   CODING_RESULT_INSUFFICIENT_MEM and the *calling function* returns
   that value, so this macro may only be used directly in the body of
   a function returning int.  Because the memory comes from alloca it
   belongs to the calling function's stack frame.

   CODING is evaluated several times; pass an expression without side
   effects.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size = CHARBUF_SIZE;                                            \
                                                                        \
    (coding)->charbuf = NULL;                                           \
    while (size > 1024)                                                 \
      {                                                                 \
        (coding)->charbuf = (int *) alloca (sizeof (int) * size);       \
        if ((coding)->charbuf)                                          \
          break;                                                        \
        size >>= 1;                                                     \
      }                                                                 \
    if (! (coding)->charbuf)                                            \
      {                                                                 \
        (coding)->result = CODING_RESULT_INSUFFICIENT_MEM;              \
        return (coding)->result;                                        \
      }                                                                 \
    (coding)->charbuf_size = size;                                      \
  } while (0)
5737
5738
5739 static void
5740 produce_annotation (coding)
5741 struct coding_system *coding;
5742 {
5743 int *charbuf = coding->charbuf;
5744 int *charbuf_end = charbuf + coding->charbuf_used;
5745
5746 if (NILP (coding->dst_object))
5747 return;
5748
5749 while (charbuf < charbuf_end)
5750 {
5751 if (*charbuf >= 0)
5752 charbuf++;
5753 else
5754 {
5755 int len = -*charbuf;
5756 switch (charbuf[1])
5757 {
5758 case CODING_ANNOTATE_COMPOSITION_MASK:
5759 produce_composition (coding, charbuf);
5760 break;
5761 case CODING_ANNOTATE_CHARSET_MASK:
5762 produce_charset (coding, charbuf);
5763 break;
5764 default:
5765 abort ();
5766 }
5767 charbuf += len;
5768 }
5769 }
5770 }
5771
5772 /* Decode the data at CODING->src_object into CODING->dst_object.
5773 CODING->src_object is a buffer, a string, or nil.
5774 CODING->dst_object is a buffer.
5775
5776 If CODING->src_object is a buffer, it must be the current buffer.
5777 In this case, if CODING->src_pos is positive, it is a position of
5778 the source text in the buffer, otherwise, the source text is in the
5779 gap area of the buffer, and CODING->src_pos specifies the offset of
5780 the text from GPT (which must be the same as PT). If this is the
5781 same buffer as CODING->dst_object, CODING->src_pos must be
5782 negative.
5783
5784 If CODING->src_object is a string, CODING->src_pos in an index to
5785 that string.
5786
5787 If CODING->src_object is nil, CODING->source must already point to
5788 the non-relocatable memory area. In this case, CODING->src_pos is
5789 an offset from CODING->source.
5790
5791 The decoded data is inserted at the current point of the buffer
5792 CODING->dst_object.
5793 */
5794
5795 static int
5796 decode_coding (coding)
5797 struct coding_system *coding;
5798 {
5799 Lisp_Object attrs;
5800
5801 if (BUFFERP (coding->src_object)
5802 && coding->src_pos > 0
5803 && coding->src_pos < GPT
5804 && coding->src_pos + coding->src_chars > GPT)
5805 move_gap_both (coding->src_pos, coding->src_pos_byte);
5806
5807 if (BUFFERP (coding->dst_object))
5808 {
5809 if (current_buffer != XBUFFER (coding->dst_object))
5810 set_buffer_internal (XBUFFER (coding->dst_object));
5811 if (GPT != PT)
5812 move_gap_both (PT, PT_BYTE);
5813 }
5814
5815 coding->consumed = coding->consumed_char = 0;
5816 coding->produced = coding->produced_char = 0;
5817 coding->chars_at_source = 0;
5818 coding->result = CODING_RESULT_SUCCESS;
5819 coding->errors = 0;
5820
5821 ALLOC_CONVERSION_WORK_AREA (coding);
5822
5823 attrs = CODING_ID_ATTRS (coding->id);
5824
5825 do
5826 {
5827 coding_set_source (coding);
5828 coding->annotated = 0;
5829 (*(coding->decoder)) (coding);
5830 if (!NILP (CODING_ATTR_DECODE_TBL (attrs)))
5831 translate_chars (coding, CODING_ATTR_DECODE_TBL (attrs));
5832 else if (!NILP (Vstandard_translation_table_for_decode))
5833 translate_chars (coding, Vstandard_translation_table_for_decode);
5834 coding_set_destination (coding);
5835 produce_chars (coding);
5836 if (coding->annotated)
5837 produce_annotation (coding);
5838 }
5839 while (coding->consumed < coding->src_bytes
5840 && ! coding->result);
5841
5842 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qccl)
5843 && SYMBOLP (CODING_ID_EOL_TYPE (coding->id))
5844 && ! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5845 decode_eol (coding);
5846
5847 coding->carryover_bytes = 0;
5848 if (coding->consumed < coding->src_bytes)
5849 {
5850 int nbytes = coding->src_bytes - coding->consumed;
5851 const unsigned char *src;
5852
5853 coding_set_source (coding);
5854 coding_set_destination (coding);
5855 src = coding->source + coding->consumed;
5856
5857 if (coding->mode & CODING_MODE_LAST_BLOCK)
5858 {
5859 /* Flush out unprocessed data as binary chars. We are sure
5860 that the number of data is less than the size of
5861 coding->charbuf. */
5862 while (nbytes-- > 0)
5863 {
5864 int c = *src++;
5865
5866 coding->charbuf[coding->charbuf_used++] = (c & 0x80 ? - c : c);
5867 }
5868 produce_chars (coding);
5869 }
5870 else
5871 {
5872 /* Record unprocessed bytes in coding->carryover. We are
5873 sure that the number of data is less than the size of
5874 coding->carryover. */
5875 unsigned char *p = coding->carryover;
5876
5877 coding->carryover_bytes = nbytes;
5878 while (nbytes-- > 0)
5879 *p++ = *src++;
5880 }
5881 coding->consumed = coding->src_bytes;
5882 }
5883
5884 return coding->result;
5885 }
5886
5887
5888 /* Extract an annotation datum from a composition starting at POS and
5889 ending before LIMIT of CODING->src_object (buffer or string), store
5890 the data in BUF, set *STOP to a starting position of the next
5891 composition (if any) or to LIMIT, and return the address of the
5892 next element of BUF.
5893
5894 If such an annotation is not found, set *STOP to a starting
5895 position of a composition after POS (if any) or to LIMIT, and
5896 return BUF. */
5897
5898 static INLINE int *
5899 handle_composition_annotation (pos, limit, coding, buf, stop)
5900 EMACS_INT pos, limit;
5901 struct coding_system *coding;
5902 int *buf;
5903 EMACS_INT *stop;
5904 {
5905 EMACS_INT start, end;
5906 Lisp_Object prop;
5907
5908 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
5909 || end > limit)
5910 *stop = limit;
5911 else if (start > pos)
5912 *stop = start;
5913 else
5914 {
5915 if (start == pos)
5916 {
5917 /* We found a composition. Store the corresponding
5918 annotation data in BUF. */
5919 int *head = buf;
5920 enum composition_method method = COMPOSITION_METHOD (prop);
5921 int nchars = COMPOSITION_LENGTH (prop);
5922
5923 ADD_COMPOSITION_DATA (buf, 0, nchars, method);
5924 if (method != COMPOSITION_RELATIVE)
5925 {
5926 Lisp_Object components;
5927 int len, i, i_byte;
5928
5929 components = COMPOSITION_COMPONENTS (prop);
5930 if (VECTORP (components))
5931 {
5932 len = XVECTOR (components)->size;
5933 for (i = 0; i < len; i++)
5934 *buf++ = XINT (AREF (components, i));
5935 }
5936 else if (STRINGP (components))
5937 {
5938 len = SCHARS (components);
5939 i = i_byte = 0;
5940 while (i < len)
5941 {
5942 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
5943 buf++;
5944 }
5945 }
5946 else if (INTEGERP (components))
5947 {
5948 len = 1;
5949 *buf++ = XINT (components);
5950 }
5951 else if (CONSP (components))
5952 {
5953 for (len = 0; CONSP (components);
5954 len++, components = XCDR (components))
5955 *buf++ = XINT (XCAR (components));
4382 } 5956 }
4383 else 5957 else
4384 { 5958 abort ();
4385 c2 = (src[msb] << 8) | (src[lsb]); 5959 *head -= len;
4386 if (c2 == '\n')
4387 this_eol_type = CODING_EOL_CRLF, src += 2;
4388 else
4389 this_eol_type = CODING_EOL_CR;
4390 }
4391 } 5960 }
4392 5961 }
4393 if (eol_type == CODING_EOL_UNDECIDED) 5962
4394 /* This is the first end-of-line. */ 5963 if (find_composition (end, limit, &start, &end, &prop,
4395 eol_type = this_eol_type; 5964 coding->src_object)
4396 else if (eol_type != this_eol_type) 5965 && end <= limit)
5966 *stop = start;
5967 else
5968 *stop = limit;
5969 }
5970 return buf;
5971 }
5972
5973
5974 /* Extract an annotation datum from a text property `charset' at POS of
5975 CODING->src_object (buffer of string), store the data in BUF, set
5976 *STOP to the position where the value of `charset' property changes
5977 (limiting by LIMIT), and return the address of the next element of
5978 BUF.
5979
5980 If the property value is nil, set *STOP to the position where the
5981 property value is non-nil (limiting by LIMIT), and return BUF. */
5982
5983 static INLINE int *
5984 handle_charset_annotation (pos, limit, coding, buf, stop)
5985 EMACS_INT pos, limit;
5986 struct coding_system *coding;
5987 int *buf;
5988 EMACS_INT *stop;
5989 {
5990 Lisp_Object val, next;
5991 int id;
5992
5993 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
5994 if (! NILP (val) && CHARSETP (val))
5995 id = XINT (CHARSET_SYMBOL_ID (val));
5996 else
5997 id = -1;
5998 ADD_CHARSET_DATA (buf, 0, 0, id);
5999 next = Fnext_single_property_change (make_number (pos), Qcharset,
6000 coding->src_object,
6001 make_number (limit));
6002 *stop = XINT (next);
6003 return buf;
6004 }
6005
6006
6007 static void
6008 consume_chars (coding)
6009 struct coding_system *coding;
6010 {
6011 int *buf = coding->charbuf;
6012 int *buf_end = coding->charbuf + coding->charbuf_size;
6013 const unsigned char *src = coding->source + coding->consumed;
6014 const unsigned char *src_end = coding->source + coding->src_bytes;
6015 EMACS_INT pos = coding->src_pos + coding->consumed_char;
6016 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6017 int multibytep = coding->src_multibyte;
6018 Lisp_Object eol_type;
6019 int c;
6020 EMACS_INT stop, stop_composition, stop_charset;
6021
6022 eol_type = CODING_ID_EOL_TYPE (coding->id);
6023 if (VECTORP (eol_type))
6024 eol_type = Qunix;
6025
6026 /* Note: composition handling is not yet implemented. */
6027 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6028
6029 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6030 stop = stop_composition = pos;
6031 else
6032 stop = stop_composition = end_pos;
6033 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6034 stop = stop_charset = pos;
6035 else
6036 stop_charset = end_pos;
6037
6038 /* Compensate for CRLF and annotation. */
6039 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6040 while (buf < buf_end)
6041 {
6042 if (pos == stop)
6043 {
6044 if (pos == end_pos)
6045 break;
6046 if (pos == stop_composition)
6047 buf = handle_composition_annotation (pos, end_pos, coding,
6048 buf, &stop_composition);
6049 if (pos == stop_charset)
6050 buf = handle_charset_annotation (pos, end_pos, coding,
6051 buf, &stop_charset);
6052 stop = (stop_composition < stop_charset
6053 ? stop_composition : stop_charset);
6054 }
6055
6056 if (! multibytep)
6057 {
6058 EMACS_INT bytes;
6059
6060 if (! CODING_FOR_UNIBYTE (coding)
6061 && (bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6062 c = STRING_CHAR_ADVANCE (src), pos += bytes;
6063 else
6064 c = *src++, pos++;
6065 }
6066 else
6067 c = STRING_CHAR_ADVANCE (src), pos++;
6068 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6069 c = '\n';
6070 if (! EQ (eol_type, Qunix))
6071 {
6072 if (c == '\n')
4397 { 6073 {
4398 /* The found type is different from what found before. */ 6074 if (EQ (eol_type, Qdos))
4399 eol_type = CODING_EOL_INCONSISTENT; 6075 *buf++ = '\r';
4400 break; 6076 else
6077 c = '\r';
4401 } 6078 }
4402 } 6079 }
4403 } 6080 *buf++ = c;
4404 6081 }
4405 if (*skip == 0) 6082
4406 *skip = src_end - source; 6083 coding->consumed = src - coding->source;
4407 return eol_type; 6084 coding->consumed_char = pos - coding->src_pos;
4408 } 6085 coding->charbuf_used = buf - coding->charbuf;
4409 6086 coding->chars_at_source = 0;
4410 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC 6087 }
4411 is encoded. If it detects an appropriate format of end-of-line, it 6088
4412 sets the information in *CODING. */ 6089
6090 /* Encode the text at CODING->src_object into CODING->dst_object.
6091 CODING->src_object is a buffer or a string.
6092 CODING->dst_object is a buffer or nil.
6093
6094 If CODING->src_object is a buffer, it must be the current buffer.
6095 In this case, if CODING->src_pos is positive, it is a position of
6096 the source text in the buffer, otherwise. the source text is in the
6097 gap area of the buffer, and coding->src_pos specifies the offset of
6098 the text from GPT (which must be the same as PT). If this is the
6099 same buffer as CODING->dst_object, CODING->src_pos must be
6100 negative and CODING should not have `pre-write-conversion'.
6101
6102 If CODING->src_object is a string, CODING should not have
6103 `pre-write-conversion'.
6104
6105 If CODING->dst_object is a buffer, the encoded data is inserted at
6106 the current point of that buffer.
6107
6108 If CODING->dst_object is nil, the encoded data is placed at the
6109 memory area specified by CODING->destination. */
6110
6111 static int
6112 encode_coding (coding)
6113 struct coding_system *coding;
6114 {
6115 Lisp_Object attrs;
6116
6117 attrs = CODING_ID_ATTRS (coding->id);
6118
6119 if (BUFFERP (coding->dst_object))
6120 {
6121 set_buffer_internal (XBUFFER (coding->dst_object));
6122 coding->dst_multibyte
6123 = ! NILP (current_buffer->enable_multibyte_characters);
6124 }
6125
6126 coding->consumed = coding->consumed_char = 0;
6127 coding->produced = coding->produced_char = 0;
6128 coding->result = CODING_RESULT_SUCCESS;
6129 coding->errors = 0;
6130
6131 ALLOC_CONVERSION_WORK_AREA (coding);
6132
6133 do {
6134 coding_set_source (coding);
6135 consume_chars (coding);
6136
6137 if (!NILP (CODING_ATTR_ENCODE_TBL (attrs)))
6138 translate_chars (coding, CODING_ATTR_ENCODE_TBL (attrs));
6139 else if (!NILP (Vstandard_translation_table_for_encode))
6140 translate_chars (coding, Vstandard_translation_table_for_encode);
6141
6142 coding_set_destination (coding);
6143 (*(coding->encoder)) (coding);
6144 } while (coding->consumed_char < coding->src_chars);
6145
6146 if (BUFFERP (coding->dst_object))
6147 insert_from_gap (coding->produced_char, coding->produced);
6148
6149 return (coding->result);
6150 }
6151
6152
/* Stack of working buffers used in code conversion.  A nil element
   means that the code conversion of that level is not using a working
   buffer.  */
6156 Lisp_Object Vcode_conversion_work_buf_list;
6157
6158 /* A working buffer used by the top level conversion. */
6159 Lisp_Object Vcode_conversion_reused_work_buf;
6160
6161
6162 /* Return a working buffer that can be freely used by the following
6163 code conversion. MULTIBYTEP specifies the multibyteness of the
6164 buffer. */
6165
6166 Lisp_Object
6167 make_conversion_work_buffer (multibytep, depth)
6168 int multibytep, depth;
6169 {
6170 struct buffer *current = current_buffer;
6171 Lisp_Object buf, name;
6172
6173 if (depth == 0)
6174 {
6175 if (NILP (Vcode_conversion_reused_work_buf))
6176 Vcode_conversion_reused_work_buf
6177 = Fget_buffer_create (build_string (" *code-converting-work<0>*"));
6178 buf = Vcode_conversion_reused_work_buf;
6179 }
6180 else
6181 {
6182 if (depth < 0)
6183 {
6184 name = build_string (" *code-converting-work*");
6185 name = Fgenerate_new_buffer_name (name, Qnil);
6186 }
6187 else
6188 {
6189 char str[128];
6190
6191 sprintf (str, " *code-converting-work*<%d>", depth);
6192 name = build_string (str);
6193 }
6194 buf = Fget_buffer_create (name);
6195 }
6196 set_buffer_internal (XBUFFER (buf));
6197 current_buffer->undo_list = Qt;
6198 Ferase_buffer ();
6199 Fset_buffer_multibyte (multibytep ? Qt : Qnil);
6200 set_buffer_internal (current);
6201 return buf;
6202 }
6203
6204 static Lisp_Object
6205 code_conversion_restore (buffer)
6206 Lisp_Object buffer;
6207 {
6208 Lisp_Object workbuf;
6209
6210 workbuf = XCAR (Vcode_conversion_work_buf_list);
6211 if (! NILP (workbuf)
6212 && ! EQ (workbuf, Vcode_conversion_reused_work_buf)
6213 && ! NILP (Fbuffer_live_p (workbuf)))
6214 Fkill_buffer (workbuf);
6215 Vcode_conversion_work_buf_list = XCDR (Vcode_conversion_work_buf_list);
6216 set_buffer_internal (XBUFFER (buffer));
6217 return Qnil;
6218 }
6219
6220 static Lisp_Object
6221 code_conversion_save (buffer, with_work_buf, multibyte)
6222 Lisp_Object buffer;
6223 int with_work_buf, multibyte;
6224 {
6225 Lisp_Object workbuf;
6226
6227 if (with_work_buf)
6228 {
6229 int depth = XINT (Flength (Vcode_conversion_work_buf_list));
6230
6231 workbuf = make_conversion_work_buffer (multibyte, depth);
6232 }
6233 else
6234 workbuf = Qnil;
6235 Vcode_conversion_work_buf_list
6236 = Fcons (workbuf, Vcode_conversion_work_buf_list);
6237 record_unwind_protect (code_conversion_restore, buffer);
6238 return workbuf;
6239 }
6240
6241 int
6242 decode_coding_gap (coding, chars, bytes)
6243 struct coding_system *coding;
6244 EMACS_INT chars, bytes;
6245 {
6246 int count = specpdl_ptr - specpdl;
6247 Lisp_Object attrs;
6248 Lisp_Object buffer;
6249
6250 buffer = Fcurrent_buffer ();
6251 code_conversion_save (buffer, 0, 0);
6252
6253 coding->src_object = buffer;
6254 coding->src_chars = chars;
6255 coding->src_bytes = bytes;
6256 coding->src_pos = -chars;
6257 coding->src_pos_byte = -bytes;
6258 coding->src_multibyte = chars < bytes;
6259 coding->dst_object = buffer;
6260 coding->dst_pos = PT;
6261 coding->dst_pos_byte = PT_BYTE;
6262 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6263 coding->mode |= CODING_MODE_LAST_BLOCK;
6264
6265 if (CODING_REQUIRE_DETECTION (coding))
6266 detect_coding (coding);
6267
6268 decode_coding (coding);
6269
6270 attrs = CODING_ID_ATTRS (coding->id);
6271 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6272 {
6273 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6274 Lisp_Object val;
6275
6276 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6277 val = call1 (CODING_ATTR_POST_READ (attrs),
6278 make_number (coding->produced_char));
6279 CHECK_NATNUM (val);
6280 coding->produced_char += Z - prev_Z;
6281 coding->produced += Z_BYTE - prev_Z_BYTE;
6282 }
6283
6284 unbind_to (count, Qnil);
6285 return coding->result;
6286 }
6287
6288 int
6289 encode_coding_gap (coding, chars, bytes)
6290 struct coding_system *coding;
6291 EMACS_INT chars, bytes;
6292 {
6293 int count = specpdl_ptr - specpdl;
6294 Lisp_Object buffer;
6295
6296 buffer = Fcurrent_buffer ();
6297 code_conversion_save (buffer, 0, 0);
6298
6299 coding->src_object = buffer;
6300 coding->src_chars = chars;
6301 coding->src_bytes = bytes;
6302 coding->src_pos = -chars;
6303 coding->src_pos_byte = -bytes;
6304 coding->src_multibyte = chars < bytes;
6305 coding->dst_object = coding->src_object;
6306 coding->dst_pos = PT;
6307 coding->dst_pos_byte = PT_BYTE;
6308
6309 encode_coding (coding);
6310
6311 unbind_to (count, Qnil);
6312 return coding->result;
6313 }
6314
6315
6316 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6317 SRC_OBJECT into DST_OBJECT by coding context CODING.
6318
6319 SRC_OBJECT is a buffer, a string, or Qnil.
6320
6321 If it is a buffer, the text is at point of the buffer. FROM and TO
6322 are positions in the buffer.
6323
6324 If it is a string, the text is at the beginning of the string.
6325 FROM and TO are indices to the string.
6326
6327 If it is nil, the text is at coding->source. FROM and TO are
6328 indices to coding->source.
6329
6330 DST_OBJECT is a buffer, Qt, or Qnil.
6331
6332 If it is a buffer, the decoded text is inserted at point of the
6333 buffer. If the buffer is the same as SRC_OBJECT, the source text
6334 is deleted.
6335
6336 If it is Qt, a string is made from the decoded text, and
6337 set in CODING->dst_object.
6338
6339 If it is Qnil, the decoded text is stored at CODING->destination.
6340 The caller must allocate CODING->dst_bytes bytes at
6341 CODING->destination by xmalloc. If the decoded text is longer than
6342 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6343 */
4413 6344
4414 void 6345 void
4415 detect_eol (coding, src, src_bytes) 6346 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6347 dst_object)
4416 struct coding_system *coding; 6348 struct coding_system *coding;
4417 const unsigned char *src; 6349 Lisp_Object src_object;
4418 int src_bytes; 6350 EMACS_INT from, from_byte, to, to_byte;
4419 { 6351 Lisp_Object dst_object;
4420 Lisp_Object val; 6352 {
4421 int skip; 6353 int count = specpdl_ptr - specpdl;
4422 int eol_type; 6354 unsigned char *destination;
4423 6355 EMACS_INT dst_bytes;
4424 switch (coding->category_idx) 6356 EMACS_INT chars = to - from;
4425 { 6357 EMACS_INT bytes = to_byte - from_byte;
4426 case CODING_CATEGORY_IDX_UTF_16_BE: 6358 Lisp_Object attrs;
4427 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1); 6359 Lisp_Object buffer;
4428 break; 6360 int saved_pt = -1, saved_pt_byte;
4429 case CODING_CATEGORY_IDX_UTF_16_LE: 6361
4430 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0); 6362 buffer = Fcurrent_buffer ();
4431 break; 6363
4432 default: 6364 if (NILP (dst_object))
4433 eol_type = detect_eol_type (src, src_bytes, &skip); 6365 {
4434 break; 6366 destination = coding->destination;
4435 } 6367 dst_bytes = coding->dst_bytes;
4436 6368 }
4437 if (coding->heading_ascii > skip) 6369
4438 coding->heading_ascii = skip; 6370 coding->src_object = src_object;
6371 coding->src_chars = chars;
6372 coding->src_bytes = bytes;
6373 coding->src_multibyte = chars < bytes;
6374
6375 if (STRINGP (src_object))
6376 {
6377 coding->src_pos = from;
6378 coding->src_pos_byte = from_byte;
6379 }
6380 else if (BUFFERP (src_object))
6381 {
6382 set_buffer_internal (XBUFFER (src_object));
6383 if (from != GPT)
6384 move_gap_both (from, from_byte);
6385 if (EQ (src_object, dst_object))
6386 {
6387 saved_pt = PT, saved_pt_byte = PT_BYTE;
6388 TEMP_SET_PT_BOTH (from, from_byte);
6389 del_range_both (from, from_byte, to, to_byte, 1);
6390 coding->src_pos = -chars;
6391 coding->src_pos_byte = -bytes;
6392 }
6393 else
6394 {
6395 coding->src_pos = from;
6396 coding->src_pos_byte = from_byte;
6397 }
6398 }
6399
6400 if (CODING_REQUIRE_DETECTION (coding))
6401 detect_coding (coding);
6402 attrs = CODING_ID_ATTRS (coding->id);
6403
6404 if (EQ (dst_object, Qt)
6405 || (! NILP (CODING_ATTR_POST_READ (attrs))
6406 && NILP (dst_object)))
6407 {
6408 coding->dst_object = code_conversion_save (buffer, 1, 1);
6409 coding->dst_pos = BEG;
6410 coding->dst_pos_byte = BEG_BYTE;
6411 coding->dst_multibyte = 1;
6412 }
6413 else if (BUFFERP (dst_object))
6414 {
6415 code_conversion_save (buffer, 0, 0);
6416 coding->dst_object = dst_object;
6417 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6418 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6419 coding->dst_multibyte
6420 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6421 }
4439 else 6422 else
4440 skip = coding->heading_ascii; 6423 {
4441 6424 code_conversion_save (buffer, 0, 0);
4442 if (eol_type == CODING_EOL_UNDECIDED) 6425 coding->dst_object = Qnil;
4443 return; 6426 coding->dst_multibyte = 1;
4444 if (eol_type == CODING_EOL_INCONSISTENT) 6427 }
4445 { 6428
4446 #if 0 6429 decode_coding (coding);
4447 /* This code is suppressed until we find a better way to 6430
4448 distinguish raw text file and binary file. */ 6431 if (BUFFERP (coding->dst_object))
4449 6432 set_buffer_internal (XBUFFER (coding->dst_object));
4450 /* If we have already detected that the coding is raw-text, the 6433
4451 coding should actually be no-conversion. */ 6434 if (! NILP (CODING_ATTR_POST_READ (attrs)))
4452 if (coding->type == coding_type_raw_text) 6435 {
4453 { 6436 struct gcpro gcpro1, gcpro2;
4454 setup_coding_system (Qno_conversion, coding); 6437 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
4455 return; 6438 Lisp_Object val;
4456 } 6439
4457 /* Else, let's decode only text code anyway. */ 6440 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
4458 #endif /* 0 */ 6441 GCPRO2 (coding->src_object, coding->dst_object);
4459 eol_type = CODING_EOL_LF; 6442 val = call1 (CODING_ATTR_POST_READ (attrs),
4460 } 6443 make_number (coding->produced_char));
4461 6444 UNGCPRO;
4462 val = Fget (coding->symbol, Qeol_type); 6445 CHECK_NATNUM (val);
4463 if (VECTORP (val) && XVECTOR (val)->size == 3) 6446 coding->produced_char += Z - prev_Z;
4464 { 6447 coding->produced += Z_BYTE - prev_Z_BYTE;
4465 int src_multibyte = coding->src_multibyte; 6448 }
4466 int dst_multibyte = coding->dst_multibyte; 6449
4467 struct composition_data *cmp_data = coding->cmp_data; 6450 if (EQ (dst_object, Qt))
4468 6451 {
4469 setup_coding_system (XVECTOR (val)->contents[eol_type], coding); 6452 coding->dst_object = Fbuffer_string ();
4470 coding->src_multibyte = src_multibyte; 6453 }
4471 coding->dst_multibyte = dst_multibyte; 6454 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
4472 coding->heading_ascii = skip; 6455 {
4473 coding->cmp_data = cmp_data; 6456 set_buffer_internal (XBUFFER (coding->dst_object));
4474 } 6457 if (dst_bytes < coding->produced)
4475 } 6458 {
4476 6459 destination
4477 #define CONVERSION_BUFFER_EXTRA_ROOM 256 6460 = (unsigned char *) xrealloc (destination, coding->produced);
4478 6461 if (! destination)
4479 #define DECODING_BUFFER_MAG(coding) \ 6462 {
4480 (coding->type == coding_type_iso2022 \ 6463 coding->result = CODING_RESULT_INSUFFICIENT_DST;
4481 ? 3 \ 6464 unbind_to (count, Qnil);
4482 : (coding->type == coding_type_ccl \ 6465 return;
4483 ? coding->spec.ccl.decoder.buf_magnification \ 6466 }
4484 : 2)) 6467 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
4485 6468 move_gap_both (BEGV, BEGV_BYTE);
4486 /* Return maximum size (bytes) of a buffer enough for decoding 6469 bcopy (BEGV_ADDR, destination, coding->produced);
4487 SRC_BYTES of text encoded in CODING. */ 6470 coding->destination = destination;
4488 6471 }
4489 int 6472 }
4490 decoding_buffer_size (coding, src_bytes) 6473
6474 if (saved_pt >= 0)
6475 {
6476 /* This is the case of:
6477 (BUFFERP (src_object) && EQ (src_object, dst_object))
6478 As we have moved PT while replacing the original buffer
6479 contents, we must recover it now. */
6480 set_buffer_internal (XBUFFER (src_object));
6481 if (saved_pt < from)
6482 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6483 else if (saved_pt < from + chars)
6484 TEMP_SET_PT_BOTH (from, from_byte);
6485 else if (! NILP (current_buffer->enable_multibyte_characters))
6486 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6487 saved_pt_byte + (coding->produced - bytes));
6488 else
6489 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6490 saved_pt_byte + (coding->produced - bytes));
6491 }
6492
6493 unbind_to (count, Qnil);
6494 }
6495
6496
6497 void
6498 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6499 dst_object)
4491 struct coding_system *coding; 6500 struct coding_system *coding;
4492 int src_bytes; 6501 Lisp_Object src_object;
4493 { 6502 EMACS_INT from, from_byte, to, to_byte;
4494 return (src_bytes * DECODING_BUFFER_MAG (coding) 6503 Lisp_Object dst_object;
4495 + CONVERSION_BUFFER_EXTRA_ROOM); 6504 {
4496 } 6505 int count = specpdl_ptr - specpdl;
4497 6506 EMACS_INT chars = to - from;
4498 /* Return maximum size (bytes) of a buffer enough for encoding 6507 EMACS_INT bytes = to_byte - from_byte;
4499 SRC_BYTES of text to CODING. */ 6508 Lisp_Object attrs;
4500 6509 Lisp_Object buffer;
4501 int 6510 int saved_pt = -1, saved_pt_byte;
4502 encoding_buffer_size (coding, src_bytes) 6511
4503 struct coding_system *coding; 6512 buffer = Fcurrent_buffer ();
4504 int src_bytes; 6513
4505 { 6514 coding->src_object = src_object;
4506 int magnification; 6515 coding->src_chars = chars;
4507 6516 coding->src_bytes = bytes;
4508 if (coding->type == coding_type_ccl) 6517 coding->src_multibyte = chars < bytes;
4509 { 6518
4510 magnification = coding->spec.ccl.encoder.buf_magnification; 6519 attrs = CODING_ID_ATTRS (coding->id);
4511 if (coding->eol_type == CODING_EOL_CRLF) 6520
4512 magnification *= 2; 6521 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4513 } 6522 {
4514 else if (CODING_REQUIRE_ENCODING (coding)) 6523 coding->src_object = code_conversion_save (buffer, 1,
4515 magnification = 3; 6524 coding->src_multibyte);
6525 set_buffer_internal (XBUFFER (coding->src_object));
6526 if (STRINGP (src_object))
6527 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6528 else if (BUFFERP (src_object))
6529 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6530 else
6531 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6532
6533 if (EQ (src_object, dst_object))
6534 {
6535 set_buffer_internal (XBUFFER (src_object));
6536 saved_pt = PT, saved_pt_byte = PT_BYTE;
6537 del_range_both (from, from_byte, to, to_byte, 1);
6538 set_buffer_internal (XBUFFER (coding->src_object));
6539 }
6540
6541 call2 (CODING_ATTR_PRE_WRITE (attrs),
6542 make_number (BEG), make_number (Z));
6543 coding->src_object = Fcurrent_buffer ();
6544 if (BEG != GPT)
6545 move_gap_both (BEG, BEG_BYTE);
6546 coding->src_chars = Z - BEG;
6547 coding->src_bytes = Z_BYTE - BEG_BYTE;
6548 coding->src_pos = BEG;
6549 coding->src_pos_byte = BEG_BYTE;
6550 coding->src_multibyte = Z < Z_BYTE;
6551 }
6552 else if (STRINGP (src_object))
6553 {
6554 code_conversion_save (buffer, 0, 0);
6555 coding->src_pos = from;
6556 coding->src_pos_byte = from_byte;
6557 }
6558 else if (BUFFERP (src_object))
6559 {
6560 code_conversion_save (buffer, 0, 0);
6561 set_buffer_internal (XBUFFER (src_object));
6562 if (EQ (src_object, dst_object))
6563 {
6564 saved_pt = PT, saved_pt_byte = PT_BYTE;
6565 coding->src_object = del_range_1 (from, to, 1, 1);
6566 coding->src_pos = 0;
6567 coding->src_pos_byte = 0;
6568 }
6569 else
6570 {
6571 if (from < GPT && to >= GPT)
6572 move_gap_both (from, from_byte);
6573 coding->src_pos = from;
6574 coding->src_pos_byte = from_byte;
6575 }
6576 }
4516 else 6577 else
4517 magnification = 1; 6578 code_conversion_save (buffer, 0, 0);
4518 6579
4519 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM); 6580 if (BUFFERP (dst_object))
4520 } 6581 {
4521 6582 coding->dst_object = dst_object;
/* Describes one scratch buffer: filled in by allocate_conversion_buffer,
   grown by extend_conversion_buffer, released by free_conversion_buffer.  */
4522 /* Working buffer for code conversion. */ 6583 if (EQ (src_object, dst_object))
4523 struct conversion_buffer 6584 {
4524 { 6585 coding->dst_pos = from;
4525 int size; /* number of bytes allocated at DATA. */ 6586 coding->dst_pos_byte = from_byte;
4526 int on_stack; /* 1 if allocated by alloca, 0 if allocated on the heap. */ 6587 }
4527 unsigned char *data; 6588 else
4528 }; 6589 {
4529 6590 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
4530 /* Don't use alloca for allocating memory space larger than this, lest 6591 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
4531 we overflow their stack. */ 6592 }
/* NOTE: the expansion is not parenthesized, so MAX_ALLOCA is only safe
   where `16*1024' binds as intended (e.g. `len < MAX_ALLOCA').  */
4532 #define MAX_ALLOCA 16*1024 6593 coding->dst_multibyte
4533 6594 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
/* BUF is used as a struct lvalue (`buf.data'), not as a pointer, and
   LEN is expanded more than once.  Requests smaller than MAX_ALLOCA
   are served by alloca and flagged with on_stack = 1; anything else
   comes from xmalloc with on_stack = 0.  Being a macro, the alloca
   call runs in the function that expands it.  */
4534 /* Allocate LEN bytes of memory for BUF (struct conversion_buffer). */ 6595 }
4535 #define allocate_conversion_buffer(buf, len) \ 6596 else if (EQ (dst_object, Qt))
4536 do { \ 6597 {
4537 if (len < MAX_ALLOCA) \ 6598 coding->dst_object = Qnil;
4538 { \ 6599 coding->dst_bytes = coding->src_chars;
4539 buf.data = (unsigned char *) alloca (len); \ 6600 if (coding->dst_bytes == 0)
4540 buf.on_stack = 1; \ 6601 coding->dst_bytes = 1;
4541 } \ 6602 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
4542 else \ 6603 coding->dst_multibyte = 0;
4543 { \
4544 buf.data = (unsigned char *) xmalloc (len); \
4545 buf.on_stack = 0; \
4546 } \
4547 buf.size = len; \
4548 } while (0)
4549
4550 /* Double the allocated memory for *BUF, keeping its current contents. */
/* Storage obtained from alloca cannot be resized, so in that case the
   old bytes are copied into a fresh xmalloc block and on_stack is
   cleared; heap storage is simply grown with xrealloc.  */
4551 static void
4552 extend_conversion_buffer (buf)
4553 struct conversion_buffer *buf;
4554 {
4555 if (buf->on_stack)
4556 {
4557 unsigned char *save = buf->data;
4558 buf->data = (unsigned char *) xmalloc (buf->size * 2);
4559 bcopy (save, buf->data, buf->size);
4560 buf->on_stack = 0;
4561 } 6604 }
4562 else 6605 else
4563 { 6606 {
4564 buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2); 6607 coding->dst_object = Qnil;
4565 } 6608 coding->dst_multibyte = 0;
4566 buf->size *= 2; 6609 }
4567 } 6610
4568 6611 encode_coding (coding);
4569 /* Free the allocated memory for BUF if it is not on stack. */ 6612
4570 static void 6613 if (EQ (dst_object, Qt))
4571 free_conversion_buffer (buf) 6614 {
4572 struct conversion_buffer *buf; 6615 if (BUFFERP (coding->dst_object))
4573 { 6616 coding->dst_object = Fbuffer_string ();
/* Only heap storage is released here; BUF->data and BUF->size are not
   reset, so BUF must not be used again without re-allocation.  */
4574 if (!buf->on_stack)
4575 xfree (buf->data);
4576 }
4577
/* Run the CCL program of CODING on SRC_BYTES bytes at SOURCE and store
   the output at DESTINATION.  The encoder program
   (coding->spec.ccl.encoder) is used when ENCODEP is nonzero, the
   decoder program otherwise.

   Bytes saved in coding->spec.ccl.eight_bit_carryover by an earlier
   call are copied to the head of DESTINATION before the program runs.
   On return coding->produced, coding->produced_char, coding->consumed
   and coding->result have been updated; the return value is
   coding->result.  */
4578 int
4579 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4580 struct coding_system *coding;
4581 unsigned char *source, *destination;
4582 int src_bytes, dst_bytes, encodep;
4583 {
4584 struct ccl_program *ccl
4585 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4586 unsigned char *dst = destination;
4587
4588 ccl->suppress_error = coding->suppress_error;
4589 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4590 if (encodep)
4591 {
4592 /* On encoding, EOL format is converted within ccl_driver. For
4593 that, set up the proper information in the structure CCL. */
4594 ccl->eol_type = coding->eol_type;
4595 if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4596 ccl->eol_type = CODING_EOL_LF;
4597 ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4598 ccl->eight_bit_control = coding->dst_multibyte;
4599 }
4600 else
4601 ccl->eight_bit_control = 1;
4602 ccl->multibyte = coding->src_multibyte;
4603 if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4604 {
4605 /* Move carryover bytes to DESTINATION. */
4606 unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4607 while (*p)
4608 *dst++ = *p++;
4609 coding->spec.ccl.eight_bit_carryover[0] = 0;
/* A zero DST_BYTES is passed on unchanged rather than reduced.
   NOTE(review): zero presumably means "no explicit limit" to
   ccl_driver -- confirm there.  */
4610 if (dst_bytes)
4611 dst_bytes -= dst - destination;
4612 }
4613
/* DST - DESTINATION accounts for the carryover bytes written above.  */
4614 coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4615 &(coding->consumed))
4616 + dst - destination);
4617
4618 if (encodep)
4619 {
4620 coding->produced_char = coding->produced;
4621 coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4622 }
4623 else if (!ccl->eight_bit_control)
4624 {
4625 /* The produced bytes form a valid multibyte sequence. */
4626 coding->produced_char
4627 = multibyte_chars_in_text (destination, coding->produced);
4628 coding->spec.ccl.eight_bit_carryover[0] = 0;
4629 }
4630 else
4631 {
4632 /* On decoding, the destination should always be multibyte. But
4633 the CCL program might have generated an invalid multibyte
4634 sequence. Here we make such a sequence valid as
4635 multibyte. */
/* NOTE(review): with DST_BYTES zero the capacity is derived from the
   SOURCE and DESTINATION addresses, which presumably assumes both
   point into one buffer -- confirm against the callers.  */
4636 int bytes
4637 = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4638
4639 if ((coding->consumed < src_bytes
4640 || !ccl->last_block)
4641 && coding->produced >= 1
4642 && destination[coding->produced - 1] >= 0x80)
4643 {
4644 /* We should not convert the trailing 8-bit codes to
4645 multibyte form even if they don't form a valid
4646 multibyte sequence. They may form a valid sequence in
4647 the next call. */
4648 int carryover = 0;
4649
/* Look back at most three bytes for a byte in the range 0x80..0x9F;
   everything from that byte to the end is held back.  */
4650 if (destination[coding->produced - 1] < 0xA0)
4651 carryover = 1;
4652 else if (coding->produced >= 2)
4653 {
4654 if (destination[coding->produced - 2] >= 0x80)
4655 {
4656 if (destination[coding->produced - 2] < 0xA0)
4657 carryover = 2;
4658 else if (coding->produced >= 3
4659 && destination[coding->produced - 3] >= 0x80
4660 && destination[coding->produced - 3] < 0xA0)
4661 carryover = 3;
4662 }
4663 }
4664 if (carryover > 0)
4665 {
4666 BCOPY_SHORT (destination + coding->produced - carryover,
4667 coding->spec.ccl.eight_bit_carryover,
4668 carryover);
4669 coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4670 coding->produced -= carryover;
4671 }
4672 }
4673 coding->produced = str_as_multibyte (destination, bytes,
4674 coding->produced,
4675 &(coding->produced_char));
4676 }
4677
/* Translate the CCL interpreter status into a coding result.  */
4678 switch (ccl->status)
4679 {
4680 case CCL_STAT_SUSPEND_BY_SRC:
4681 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4682 break;
4683 case CCL_STAT_SUSPEND_BY_DST:
4684 coding->result = CODING_FINISH_INSUFFICIENT_DST;
4685 break;
4686 case CCL_STAT_QUIT:
4687 case CCL_STAT_INVALID_CMD:
4688 coding->result = CODING_FINISH_INTERRUPT;
4689 break;
4690 default:
4691 coding->result = CODING_FINISH_NORMAL;
4692 break;
4693 }
4694 return coding->result;
4695 }
4696
4697 /* Decode EOL format of the text at PTR of BYTES length destructively
4698 according to CODING->eol_type. This is called after the CCL
4699 program produced a decoded text at PTR. If we do CRLF->LF
4700 conversion, update CODING->produced and CODING->produced_char.

   If CODING->eol_type is still undecided it is detected here from the
   text, and CODING->symbol is switched to the matching subsidiary
   coding system.  When the block ends in CR and is not the last one,
   the CR is held back: CODING->spec.ccl.cr_carryover is set to 1 and
   the produced counts are reduced by one.  An empty region (BYTES ==
   0) is handled without touching memory before PTR.  */
4701
4702 static void
4703 decode_eol_post_ccl (coding, ptr, bytes)
4704 struct coding_system *coding;
4705 unsigned char *ptr;
4706 int bytes;
4707 {
4708 Lisp_Object val, saved_coding_symbol;
4709 unsigned char *pend = ptr + bytes;
4710 int dummy;
4711
4712 /* Remember the current coding system symbol. We set it back when
4713 an inconsistent EOL is found so that `last-coding-system-used' is
4714 set to the coding system that doesn't specify EOL conversion. */
4715 saved_coding_symbol = coding->symbol;
4716
4717 coding->spec.ccl.cr_carryover = 0;
4718 if (coding->eol_type == CODING_EOL_UNDECIDED)
4719 {
4720 /* Here, to avoid the call of setup_coding_system, we directly
4721 call detect_eol_type. */
4722 coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4723 if (coding->eol_type == CODING_EOL_INCONSISTENT)
4724 coding->eol_type = CODING_EOL_LF;
4725 if (coding->eol_type != CODING_EOL_UNDECIDED)
4726 {
4727 val = Fget (coding->symbol, Qeol_type);
4728 if (VECTORP (val) && XVECTOR (val)->size == 3)
4729 coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4730 }
4731 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4732 }
4733
4734 if (coding->eol_type == CODING_EOL_LF
4735 || coding->eol_type == CODING_EOL_UNDECIDED)
4736 {
4737 /* We have nothing to do. */
4738 ptr = pend;
4739 }
4740 else if (coding->eol_type == CODING_EOL_CRLF)
4741 {
4742 unsigned char *pstart = ptr, *p = ptr;
4743
/* PTR < PEND guards the empty-region case, where *(pend - 1) would
   otherwise read the byte preceding PTR and could drive
   CODING->produced below zero.  */
4744 if (ptr < pend
&& ! (coding->mode & CODING_MODE_LAST_BLOCK)
4745 && *(pend - 1) == '\r')
4746 {
4747 /* If the last character is CR, we can't handle it here
4748 because LF will be in the not-yet-decoded source text.
4749 Record that the CR is not yet processed. */
4750 coding->spec.ccl.cr_carryover = 1;
4751 coding->produced--;
4752 coding->produced_char--;
4753 pend--;
4754 }
/* PTR reads the original text, P writes the converted text in place;
   P never runs ahead of PTR.  */
4755 while (ptr < pend)
4756 {
4757 if (*ptr == '\r')
4758 {
4759 if (ptr + 1 < pend && *(ptr + 1) == '\n')
4760 {
4761 *p++ = '\n';
4762 ptr += 2;
4763 }
4764 else
4765 {
4766 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4767 goto undo_eol_conversion;
4768 *p++ = *ptr++;
4769 }
4770 }
4771 else if (*ptr == '\n'
4772 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4773 goto undo_eol_conversion;
4774 else
4775 *p++ = *ptr++;
4776 continue;
4777
4778 undo_eol_conversion:
4779 /* We have encountered an inconsistent EOL format at PTR.
4780 Convert all LFs before PTR back to CRLFs. */
4781 for (p--, ptr--; p >= pstart; p--)
4782 {
4783 if (*p == '\n')
4784 *ptr-- = '\n', *ptr-- = '\r';
4785 else
4786 *ptr-- = *p;
4787 }
4788 /* If carryover is recorded, cancel it because we don't
4789 convert CRLF anymore. */
4790 if (coding->spec.ccl.cr_carryover)
4791 {
4792 coding->spec.ccl.cr_carryover = 0;
4793 coding->produced++;
4794 coding->produced_char++;
4795 pend++;
4796 }
/* Setting both pointers to PEND ends the loop and makes the
   length adjustment below a no-op.  */
4797 p = ptr = pend;
4798 coding->eol_type = CODING_EOL_LF;
4799 coding->symbol = saved_coding_symbol;
4800 }
4801 if (p < pend)
4802 {
4803 /* As each two-byte sequence CRLF was converted to LF, (PEND
4804 - P) is the number of deleted characters. */
4805 coding->produced -= pend - p;
4806 coding->produced_char -= pend - p;
4807 }
4808 }
4809 else /* i.e. coding->eol_type == CODING_EOL_CR */
4810 {
4811 unsigned char *p = ptr;
4812
4813 for (; ptr < pend; ptr++)
4814 {
4815 if (*ptr == '\r')
4816 *ptr = '\n';
4817 else if (*ptr == '\n'
4818 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4819 {
/* Inconsistent EOL: turn the LFs written so far back into CRs
   and fall back to LF (no conversion).  */
4820 for (; p < ptr; p++)
4821 {
4822 if (*p == '\n')
4823 *p = '\r';
4824 }
4825 ptr = pend;
4826 coding->eol_type = CODING_EOL_LF;
4827 coding->symbol = saved_coding_symbol;
4828 }
4829 }
4830 }
4831 }
4832
4833 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
4834 decoding, it may detect coding system and format of end-of-line if
4835 those are not yet decided. The source should be unibyte, the
4836 result is multibyte if CODING->dst_multibyte is nonzero, else
4837 unibyte.

   The counters CODING->produced, produced_char, consumed,
   consumed_char and errors are reset on entry.  The return value is
   CODING->result.  */
4838
4839 int
4840 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4841 struct coding_system *coding;
4842 const unsigned char *source;
4843 unsigned char *destination;
4844 int src_bytes, dst_bytes;
4845 {
/* Number of bytes (0 or 1) placed in DESTINATION ahead of the CCL
   output for a CR carried over from the previous call.  */
4846 int extra = 0;
4847
4848 if (coding->type == coding_type_undecided)
4849 detect_coding (coding, source, src_bytes);
4850
4851 if (coding->eol_type == CODING_EOL_UNDECIDED
4852 && coding->type != coding_type_ccl)
4853 {
4854 detect_eol (coding, source, src_bytes);
4855 /* We had better recover the original eol format if we
4856 encounter an inconsistent eol format while decoding. */
4857 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4858 }
4859
4860 coding->produced = coding->produced_char = 0;
4861 coding->consumed = coding->consumed_char = 0;
4862 coding->errors = 0;
4863 coding->result = CODING_FINISH_NORMAL;
4864
4865 switch (coding->type)
4866 {
4867 case coding_type_sjis:
4868 decode_coding_sjis_big5 (coding, source, destination,
4869 src_bytes, dst_bytes, 1);
4870 break;
4871
4872 case coding_type_iso2022:
4873 decode_coding_iso2022 (coding, source, destination,
4874 src_bytes, dst_bytes);
4875 break;
4876
4877 case coding_type_big5:
4878 decode_coding_sjis_big5 (coding, source, destination,
4879 src_bytes, dst_bytes, 0);
4880 break;
4881
4882 case coding_type_emacs_mule:
4883 decode_coding_emacs_mule (coding, source, destination,
4884 src_bytes, dst_bytes);
4885 break;
4886
4887 case coding_type_ccl:
4888 if (coding->spec.ccl.cr_carryover)
4889 {
4890 /* Put the CR which was not processed by the previous call
4891 of decode_eol_post_ccl in DESTINATION. It will be
4892 decoded together with the following LF by the call to
4893 decode_eol_post_ccl below. */
4894 *destination = '\r';
4895 coding->produced++;
4896 coding->produced_char++;
4897 dst_bytes--;
4898 extra = coding->spec.ccl.cr_carryover;
4899 }
4900 ccl_coding_driver (coding, source, destination + extra,
4901 src_bytes, dst_bytes, 0);
4902 if (coding->eol_type != CODING_EOL_LF)
4903 {
/* ccl_coding_driver assigns the produced counts afresh, so the
   carried-over byte has to be counted again here.  */
4904 coding->produced += extra;
4905 coding->produced_char += extra;
4906 decode_eol_post_ccl (coding, destination, coding->produced);
4907 }
4908 break;
4909
4910 default:
4911 decode_eol (coding, source, destination, src_bytes, dst_bytes);
4912 }
4913
/* Running out of source is not an error on the last block when
   every source byte has been consumed.  */
4914 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4915 && coding->mode & CODING_MODE_LAST_BLOCK
4916 && coding->consumed == src_bytes)
4917 coding->result = CODING_FINISH_NORMAL;
4918
/* The last block ended with source bytes left unconsumed: count one
   error and emit each remaining byte through CHAR_STRING.  */
4919 if (coding->mode & CODING_MODE_LAST_BLOCK
4920 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4921 {
4922 const unsigned char *src = source + coding->consumed;
4923 unsigned char *dst = destination + coding->produced;
4924
4925 src_bytes -= coding->consumed;
4926 coding->errors++;
4927 if (COMPOSING_P (coding))
4928 DECODE_COMPOSITION_END ('1');
4929 while (src_bytes--)
4930 {
4931 int c = *src++;
4932 dst += CHAR_STRING (c, dst);
4933 coding->produced_char++;
4934 }
4935 coding->consumed = coding->consumed_char = src - source;
4936 coding->produced = dst - destination;
4937 coding->result = CODING_FINISH_NORMAL;
4938 }
4939
/* For a unibyte destination the result is converted in place and
   the character count equals the byte count.  */
4940 if (!coding->dst_multibyte)
4941 {
4942 coding->produced = str_as_unibyte (destination, coding->produced);
4943 coding->produced_char = coding->produced;
4944 }
4945
4946 return coding->result;
4947 }
4948
4949 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". The
4950 multibyteness of the source is CODING->src_multibyte, the
4951 multibyteness of the result is always unibyte.

   The counters CODING->produced, produced_char, consumed,
   consumed_char and errors are reset on entry.  The return value is
   CODING->result.  */
4952
4953 int
4954 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4955 struct coding_system *coding;
4956 const unsigned char *source;
4957 unsigned char *destination;
4958 int src_bytes, dst_bytes;
4959 {
4960 coding->produced = coding->produced_char = 0;
4961 coding->consumed = coding->consumed_char = 0;
4962 coding->errors = 0;
4963 coding->result = CODING_FINISH_NORMAL;
4964
/* Dispatch on the coding type; types without a dedicated encoder
   only get end-of-line conversion via encode_eol.  */
4965 switch (coding->type)
4966 {
4967 case coding_type_sjis:
4968 encode_coding_sjis_big5 (coding, source, destination,
4969 src_bytes, dst_bytes, 1);
4970 break;
4971
4972 case coding_type_iso2022:
4973 encode_coding_iso2022 (coding, source, destination,
4974 src_bytes, dst_bytes);
4975 break;
4976
4977 case coding_type_big5:
4978 encode_coding_sjis_big5 (coding, source, destination,
4979 src_bytes, dst_bytes, 0);
4980 break;
4981
4982 case coding_type_emacs_mule:
4983 encode_coding_emacs_mule (coding, source, destination,
4984 src_bytes, dst_bytes);
4985 break;
4986
4987 case coding_type_ccl:
4988 ccl_coding_driver (coding, source, destination,
4989 src_bytes, dst_bytes, 1);
4990 break;
4991
4992 default:
4993 encode_eol (coding, source, destination, src_bytes, dst_bytes);
4994 }
4995
/* The last block stopped for lack of source: finish the output and
   copy whatever source is left without encoding it.  */
4996 if (coding->mode & CODING_MODE_LAST_BLOCK
4997 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4998 {
4999 const unsigned char *src = source + coding->consumed;
5000 unsigned char *dst = destination + coding->produced;
5001
/* NOTE(review): ENCODE_RESET_PLANE_AND_REGISTER is defined elsewhere;
   it presumably writes through the local DST -- confirm.  */
5002 if (coding->type == coding_type_iso2022)
5003 ENCODE_RESET_PLANE_AND_REGISTER;
/* While composing, emit the two bytes ESC '1'.  */
5004 if (COMPOSING_P (coding))
5005 *dst++ = ISO_CODE_ESC, *dst++ = '1';
5006 if (coding->consumed < src_bytes)
5007 {
5008 int len = src_bytes - coding->consumed;
5009
5010 BCOPY_SHORT (src, dst, len);
5011 if (coding->src_multibyte)
5012 len = str_as_unibyte (dst, len);
5013 dst += len;
5014 coding->consumed = src_bytes;
5015 }
5016 coding->produced = coding->produced_char = dst - destination;
5017 coding->result = CODING_FINISH_NORMAL;
5018 }
5019
/* Running out of source is normal once everything was consumed.  */
5020 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
5021 && coding->consumed == src_bytes)
5022 coding->result = CODING_FINISH_NORMAL;
5023
5024 return coding->result;
5025 }
5026
5027 /* Scan text in the region between *BEG and *END (byte positions),
5028 skip characters which we don't have to decode by coding system
5029 CODING at the head and tail, then set *BEG and *END to the region
5030 of the text we actually have to convert. The caller should move
5031 the gap out of the region in advance if the region is from a
5032 buffer.
5033
5034 If STR is not NULL, *BEG and *END are indices into STR.

   *BEG and *END are left untouched when nothing can be skipped.  */
5035
5036 static void
5037 shrink_decoding_region (beg, end, coding, str)
5038 int *beg, *end;
5039 struct coding_system *coding;
5040 unsigned char *str;
5041 {
5042 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
5043 int eol_conversion;
5044 Lisp_Object translation_table;
5045
5046 if (coding->type == coding_type_ccl
5047 || coding->type == coding_type_undecided
5048 || coding->eol_type != CODING_EOL_LF
5049 || !NILP (coding->post_read_conversion)
5050 || coding->composing != COMPOSITION_DISABLED)
5051 {
5052 /* We can't skip any data. */
5053 return;
5054 }
5055 if (coding->type == coding_type_no_conversion
5056 || coding->type == coding_type_raw_text
5057 || coding->type == coding_type_emacs_mule)
5058 {
5059 /* We need no conversion, but don't have to skip any data here.
5060 Decoding routine handles them effectively anyway. */
5061 return;
5062 }
5063
5064 translation_table = coding->translation_table_for_decode;
5065 if (NILP (translation_table) && !NILP (Venable_character_translation))
5066 translation_table = Vstandard_translation_table_for_decode;
5067 if (CHAR_TABLE_P (translation_table))
5068 {
5069 int i;
5070 for (i = 0; i < 128; i++)
5071 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5072 break;
5073 if (i < 128)
5074 /* Some ASCII character should be translated. We give up
5075 shrinking. */
5076 return;
5077 }
5078
5079 if (coding->heading_ascii >= 0)
5080 /* Detection routine has already found how much we can skip at the
5081 head. */
5082 *beg += coding->heading_ascii;
5083
5084 if (str)
5085 {
5086 begp_orig = begp = str + *beg;
5087 endp_orig = endp = str + *end;
5088 }
5089 else
5090 {
5091 begp_orig = begp = BYTE_POS_ADDR (*beg);
5092 endp_orig = endp = begp + *end - *beg;
5093 }
5094
/* Because of the early return above for any eol_type other than
   CODING_EOL_LF, this is always zero here; the eol_conversion
   branches below are kept as they are.  */
5095 eol_conversion = (coding->eol_type == CODING_EOL_CR
5096 || coding->eol_type == CODING_EOL_CRLF);
5097
5098 switch (coding->type)
5099 {
5100 case coding_type_sjis:
5101 case coding_type_big5:
5102 /* We can skip all ASCII characters at the head. */
5103 if (coding->heading_ascii < 0)
5104 {
5105 if (eol_conversion)
5106 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
5107 else
5108 while (begp < endp && *begp < 0x80) begp++;
5109 }
5110 /* We can skip all ASCII characters at the tail except for the
5111 second byte of SJIS or BIG5 code. */
5112 if (eol_conversion)
5113 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
5114 else 6617 else
5115 while (begp < endp && endp[-1] < 0x80) endp--; 6618 {
5116 /* Do not consider LF as ascii if preceded by CR, since that 6619 coding->dst_object
5117 confuses eol decoding. */ 6620 = make_unibyte_string ((char *) coding->destination,
5118 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n') 6621 coding->produced);
5119 endp++; 6622 xfree (coding->destination);
5120 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80) 6623 }
5121 endp++; 6624 }
5122 break; 6625
5123 6626 if (saved_pt >= 0)
5124 case coding_type_iso2022: 6627 {
5125 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII) 6628 /* This is the case of:
5126 /* We can't skip any data. */ 6629 (BUFFERP (src_object) && EQ (src_object, dst_object))
5127 break; 6630 As we have moved PT while replacing the original buffer
5128 if (coding->heading_ascii < 0) 6631 contents, we must recover it now. */
5129 { 6632 set_buffer_internal (XBUFFER (src_object));
5130 /* We can skip all ASCII characters at the head except for a 6633 if (saved_pt < from)
5131 few control codes. */ 6634 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
5132 while (begp < endp && (c = *begp) < 0x80 6635 else if (saved_pt < from + chars)
5133 && c != ISO_CODE_CR && c != ISO_CODE_SO 6636 TEMP_SET_PT_BOTH (from, from_byte);
5134 && c != ISO_CODE_SI && c != ISO_CODE_ESC 6637 else if (! NILP (current_buffer->enable_multibyte_characters))
5135 && (!eol_conversion || c != ISO_CODE_LF)) 6638 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
5136 begp++; 6639 saved_pt_byte + (coding->produced - bytes));
5137 }
5138 switch (coding->category_idx)
5139 {
5140 case CODING_CATEGORY_IDX_ISO_8_1:
5141 case CODING_CATEGORY_IDX_ISO_8_2:
5142 /* We can skip all ASCII characters at the tail. */
5143 if (eol_conversion)
5144 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
5145 else
5146 while (begp < endp && endp[-1] < 0x80) endp--;
5147 /* Do not consider LF as ascii if preceded by CR, since that
5148 confuses eol decoding. */
5149 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5150 endp++;
5151 break;
5152
5153 case CODING_CATEGORY_IDX_ISO_7:
5154 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
5155 {
5156 /* We can skip all characters at the tail except for 8-bit
5157 codes and ESC and the following 2-byte at the tail. */
/* Address just past the last 8-bit byte met while scanning
   backwards, or NULL if none was seen.  */
5158 unsigned char *eight_bit = NULL;
5159
5160 if (eol_conversion)
5161 while (begp < endp
5162 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5163 {
5164 if (!eight_bit && c & 0x80) eight_bit = endp;
5165 endp--;
5166 }
5167 else
5168 while (begp < endp
5169 && (c = endp[-1]) != ISO_CODE_ESC)
5170 {
5171 if (!eight_bit && c & 0x80) eight_bit = endp;
5172 endp--;
5173 }
5174 /* Do not consider LF as ascii if preceded by CR, since that
5175 confuses eol decoding. */
5176 if (begp < endp && endp < endp_orig
5177 && endp[-1] == '\r' && endp[0] == '\n')
5178 endp++;
5179 if (begp < endp && endp[-1] == ISO_CODE_ESC)
5180 {
/* The two bytes after ESC are read through the byte pointer ENDP
   (END is the int * position argument, not text); the bound check
   keeps ENDP[1] inside the region.  */
5181 if (endp + 1 < endp_orig && endp[0] == '(' && endp[1] == 'B')
5182 /* This is an ASCII designation sequence. We can
5183 surely skip the tail. But, if we have
5184 encountered an 8-bit code, skip only the codes
5185 after that. */
5186 endp = eight_bit ? eight_bit : endp + 2;
5187 else
5188 /* Hmmm, we can't skip the tail. */
5189 endp = endp_orig;
5190 }
5191 else if (eight_bit)
5192 endp = eight_bit;
5193 }
5194 }
5195 break;
5196
5197 default:
5198 abort ();
5199 }
/* Report the shrunken region back as offsets from the original
   bounds.  */
5200 *beg += begp - begp_orig;
5201 *end += endp - endp_orig;
5202 return;
5203 }
5204
5205 /* Like shrink_decoding_region but for encoding. */
/* *BEG and *END are byte positions, or indices into STR when STR is
   not NULL.  They are narrowed to exclude leading and trailing bytes
   below 0x80, and left untouched when nothing can be skipped.  */
5206
5207 static void
5208 shrink_encoding_region (beg, end, coding, str)
5209 int *beg, *end;
5210 struct coding_system *coding;
5211 unsigned char *str;
5212 {
5213 unsigned char *begp_orig, *begp, *endp_orig, *endp;
5214 int eol_conversion;
5215 Lisp_Object translation_table;
5216
5217 if (coding->type == coding_type_ccl
5218 || coding->eol_type == CODING_EOL_CRLF
5219 || coding->eol_type == CODING_EOL_CR
5220 || (coding->cmp_data && coding->cmp_data->used > 0))
5221 {
5222 /* We can't skip any data. */
5223 return;
5224 }
5225 if (coding->type == coding_type_no_conversion
5226 || coding->type == coding_type_raw_text
5227 || coding->type == coding_type_emacs_mule
5228 || coding->type == coding_type_undecided)
5229 {
5230 /* We need no conversion, but don't have to skip any data here.
5231 Encoding routine handles them effectively anyway. */
5232 return;
5233 }
5234
5235 translation_table = coding->translation_table_for_encode;
5236 if (NILP (translation_table) && !NILP (Venable_character_translation))
5237 translation_table = Vstandard_translation_table_for_encode;
5238 if (CHAR_TABLE_P (translation_table))
5239 {
5240 int i;
5241 for (i = 0; i < 128; i++)
5242 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5243 break;
5244 if (i < 128)
5245 /* Some ASCII character should be translated. We give up
5246 shrinking. */
5247 return;
5248 }
5249
5250 if (str)
5251 {
5252 begp_orig = begp = str + *beg;
5253 endp_orig = endp = str + *end;
5254 }
5255 else
5256 {
5257 begp_orig = begp = BYTE_POS_ADDR (*beg);
5258 endp_orig = endp = begp + *end - *beg;
5259 }
5260
/* The first early return above already rejected CODING_EOL_CR and
   CODING_EOL_CRLF, so this is always zero at this point.  */
5261 eol_conversion = (coding->eol_type == CODING_EOL_CR
5262 || coding->eol_type == CODING_EOL_CRLF);
5263
5264 /* Here, we don't have to check coding->pre_write_conversion because
5265 the caller is expected to have handled it already. */
5266 switch (coding->type)
5267 {
5268 case coding_type_iso2022:
5269 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5270 /* We can't skip any data. */
5271 break;
5272 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5273 {
/* Skip leading ASCII only up to the start of the last line reached,
   so the region still begins at a beginning of line.  */
5274 unsigned char *bol = begp;
5275 while (begp < endp && *begp < 0x80)
5276 {
5277 begp++;
5278 if (begp[-1] == '\n')
5279 bol = begp;
5280 }
5281 begp = bol;
5282 goto label_skip_tail;
5283 }
5284 /* fall down ... */
5285
5286 case coding_type_sjis:
5287 case coding_type_big5:
5288 /* We can skip all ASCII characters at the head and tail. */
5289 if (eol_conversion)
5290 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
5291 else 6640 else
5292 while (begp < endp && *begp < 0x80) begp++; 6641 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
5293 label_skip_tail: 6642 saved_pt_byte + (coding->produced - bytes));
5294 if (eol_conversion) 6643 }
5295 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--; 6644
5296 else 6645 unbind_to (count, Qnil);
5297 while (begp < endp && *(endp - 1) < 0x80) endp--; 6646 }
5298 break; 6647
5299
5300 default:
5301 abort ();
5302 }
5303
/* Report the shrunken region back as offsets from the original
   bounds.  */
5304 *beg += begp - begp_orig;
5305 *end += endp - endp_orig;
5306 return;
5307 }
5308
5309 /* As shrinking conversion region requires some overhead, we don't try
5310 shrinking if the length of conversion region is less than this
5311 value. */
5312 static int shrink_conversion_region_threshhold = 1024;
5313
/* BEG and END are pointers to the region bounds: they are
   dereferenced for the length test and handed on unchanged to
   shrink_encoding_region (ENCODEP nonzero) or shrink_decoding_region.
   Regions whose length does not exceed the threshold are left
   alone.  */
5314 #define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep) \
5315 do { \
5316 if (*(end) - *(beg) > shrink_conversion_region_threshhold) \
5317 { \
5318 if (encodep) shrink_encoding_region (beg, end, coding, str); \
5319 else shrink_decoding_region (beg, end, coding, str); \
5320 } \
5321 } while (0)
5322
/* Clear inhibit_pre_post_conversion and set Vlast_coding_system_used
   to ARG; always returns Qnil.  NOTE(review): presumably installed as
   an unwind-protect handler by the conversion routines -- the
   registration is not visible here.  */
5323 static Lisp_Object
5324 code_convert_region_unwind (arg)
5325 Lisp_Object arg;
5326 {
5327 inhibit_pre_post_conversion = 0;
5328 Vlast_coding_system_used = arg;
5329 return Qnil;
5330 }
5331
5332 /* Store information about all compositions in the range FROM and TO
5333 of OBJ in memory blocks pointed by CODING->cmp_data. OBJ is a
5334 buffer or a string, defaults to the current buffer.

   Nothing is done when CODING->composing is COMPOSITION_DISABLED.
   Positions are recorded relative to FROM.  */
5335
5336 void
5337 coding_save_composition (coding, from, to, obj)
5338 struct coding_system *coding;
5339 int from, to;
5340 Lisp_Object obj;
5341 {
5342 Lisp_Object prop;
5343 int start, end;
5344
5345 if (coding->composing == COMPOSITION_DISABLED)
5346 return;
5347 if (!coding->cmp_data)
5348 coding_allocate_composition_data (coding, from);
/* Give up if no composition is found or the one found ends past TO.  */
5349 if (!find_composition (from, to, &start, &end, &prop, obj)
5350 || end > to)
5351 return;
/* A composition starting before FROM is skipped; continue with the
   next one, if there is one ending within the range.  */
5352 if (start < from
5353 && (!find_composition (end, to, &start, &end, &prop, obj)
5354 || end > to))
5355 return;
5356 coding->composing = COMPOSITION_NO;
5357 do
5358 {
5359 if (COMPOSITION_VALID_P (start, end, prop))
5360 {
5361 enum composition_method method = COMPOSITION_METHOD (prop);
/* Start a new data block when the current one may not have room
   for another record.  */
5362 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5363 >= COMPOSITION_DATA_SIZE)
5364 coding_allocate_composition_data (coding, from);
5365 /* For relative composition, we remember start and end
5366 positions, for the other compositions, we also remember
5367 components. */
5368 CODING_ADD_COMPOSITION_START (coding, start - from, method);
5369 if (method != COMPOSITION_RELATIVE)
5370 {
5371 /* We must store all components of the composition. */
5372 Lisp_Object val, ch;
5373
5374 val = COMPOSITION_COMPONENTS (prop);
5375 if (CONSP (val))
5376 while (CONSP (val))
5377 {
5378 ch = XCAR (val), val = XCDR (val);
5379 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5380 }
5381 else if (VECTORP (val) || STRINGP (val))
5382 {
5383 int len = (VECTORP (val)
5384 ? XVECTOR (val)->size : SCHARS (val));
5385 int i;
5386 for (i = 0; i < len; i++)
5387 {
5388 ch = (STRINGP (val)
5389 ? Faref (val, make_number (i))
5390 : XVECTOR (val)->contents[i]);
5391 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5392 }
5393 }
5394 else /* INTEGERP (val) */
5395 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5396 }
5397 CODING_ADD_COMPOSITION_END (coding, end - from);
5398 }
5399 start = end;
5400 }
5401 while (start < to
5402 && find_composition (start, to, &start, &end, &prop, obj)
5403 && end <= to);
5404
5405 /* Make coding->cmp_data point to the first memory block. */
5406 while (coding->cmp_data->prev)
5407 coding->cmp_data = coding->cmp_data->prev;
5408 coding->cmp_data_start = 0;
5409 }
5410
5411 /* Reflect the saved information about compositions to OBJ.
5412 CODING->cmp_data points to a memory block for the information. OBJ
5413 is a buffer or a string, defaults to the current buffer.

   Does nothing when CODING->cmp_data is NULL.  All blocks of the
   chain are processed, starting from the first one.  */
5414
5415 void
5416 coding_restore_composition (coding, obj)
5417 struct coding_system *coding;
5418 Lisp_Object obj;
5419 {
5420 struct composition_data *cmp_data = coding->cmp_data;
5421
5422 if (!cmp_data)
5423 return;
5424
/* Rewind to the first block of the chain.  */
5425 while (cmp_data->prev)
5426 cmp_data = cmp_data->prev;
5427
5428 while (cmp_data)
5429 {
5430 int i;
5431
/* Each record begins with its own length in data[0]; a
   non-positive length word ends the scan of this block.  */
5432 for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5433 i += cmp_data->data[i])
5434 {
5435 int *data = cmp_data->data + i;
/* data[3] holds the method, the components follow from data[4].  */
5436 enum composition_method method = (enum composition_method) data[3];
5437 Lisp_Object components;
5438
5439 if (method == COMPOSITION_RELATIVE)
5440 components = Qnil;
5441 else
5442 {
5443 int len = data[0] - 4, j;
5444 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5445
/* With rules, an even number of stored values is reduced by one.  */
5446 if (method == COMPOSITION_WITH_RULE_ALTCHARS
5447 && len % 2 == 0)
5448 len --;
5449 for (j = 0; j < len; j++)
5450 args[j] = make_number (data[4 + j]);
5451 components = (method == COMPOSITION_WITH_ALTCHARS
5452 ? Fstring (len, args) : Fvector (len, args));
5453 }
/* data[1] and data[2] are passed as the first two arguments of
   compose_text (presumably the start and end positions).  */
5454 compose_text (data[1], data[2], components, Qnil, obj);
5455 }
5456 cmp_data = cmp_data->next;
5457 }
5458 }
5459
5460 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5461 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5462 coding system CODING, and return the status code of code conversion
5463 (currently, this value has no meaning).
5464
5465 How many characters (and bytes) are converted to how many
5466 characters (and bytes) are recorded in members of the structure
5467 CODING.
5468
5469 If REPLACE is nonzero, we do various things as if the original text
5470 is deleted and a new text is inserted. See the comments in
5471 replace_range (insdel.c) to know what we are doing.
5472
5473 If REPLACE is zero, it is assumed that the source text is unibyte.
5474 Otherwise, it is assumed that the source text is multibyte. */
5475
5476 int
5477 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5478 int from, from_byte, to, to_byte, encodep, replace;
5479 struct coding_system *coding;
5480 {
5481 int len = to - from, len_byte = to_byte - from_byte;
5482 int nchars_del = 0, nbytes_del = 0;
5483 int require, inserted, inserted_byte;
5484 int head_skip, tail_skip, total_skip = 0;
5485 Lisp_Object saved_coding_symbol;
5486 int first = 1;
5487 unsigned char *src, *dst;
5488 Lisp_Object deletion;
5489 int orig_point = PT, orig_len = len;
5490 int prev_Z;
5491 int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5492
5493 deletion = Qnil;
5494 saved_coding_symbol = coding->symbol;
5495
5496 if (from < PT && PT < to)
5497 {
5498 TEMP_SET_PT_BOTH (from, from_byte);
5499 orig_point = from;
5500 }
5501
5502 if (replace)
5503 {
5504 int saved_from = from;
5505 int saved_inhibit_modification_hooks;
5506
5507 prepare_to_modify_buffer (from, to, &from);
5508 if (saved_from != from)
5509 {
5510 to = from + len;
5511 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5512 len_byte = to_byte - from_byte;
5513 }
5514
5515 /* The code conversion routine can not preserve text properties
5516 for now. So, we must remove all text properties in the
5517 region. Here, we must suppress all modification hooks. */
5518 saved_inhibit_modification_hooks = inhibit_modification_hooks;
5519 inhibit_modification_hooks = 1;
5520 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5521 inhibit_modification_hooks = saved_inhibit_modification_hooks;
5522 }
5523
5524 if (! encodep && CODING_REQUIRE_DETECTION (coding))
5525 {
5526 /* We must detect encoding of text and eol format. */
5527
5528 if (from < GPT && to > GPT)
5529 move_gap_both (from, from_byte);
5530 if (coding->type == coding_type_undecided)
5531 {
5532 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5533 if (coding->type == coding_type_undecided)
5534 {
5535 /* It seems that the text contains only ASCII, but we
5536 should not leave it undecided because the deeper
5537 decoding routine (decode_coding) tries to detect the
5538 encodings again in vain. */
5539 coding->type = coding_type_emacs_mule;
5540 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5541 /* As emacs-mule decoder will handle composition, we
5542 need this setting to allocate coding->cmp_data
5543 later. */
5544 coding->composing = COMPOSITION_NO;
5545 }
5546 }
5547 if (coding->eol_type == CODING_EOL_UNDECIDED
5548 && coding->type != coding_type_ccl)
5549 {
5550 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5551 if (coding->eol_type == CODING_EOL_UNDECIDED)
5552 coding->eol_type = CODING_EOL_LF;
5553 /* We had better recover the original eol format if we
5554 encounter an inconsistent eol format while decoding. */
5555 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5556 }
5557 }
5558
5559 /* Now we convert the text. */
5560
5561 /* For encoding, we must process pre-write-conversion in advance. */
5562 if (! inhibit_pre_post_conversion
5563 && encodep
5564 && SYMBOLP (coding->pre_write_conversion)
5565 && ! NILP (Ffboundp (coding->pre_write_conversion)))
5566 {
5567 /* The function in pre-write-conversion may put a new text in a
5568 new buffer. */
5569 struct buffer *prev = current_buffer;
5570 Lisp_Object new;
5571
5572 record_unwind_protect (code_convert_region_unwind,
5573 Vlast_coding_system_used);
5574 /* We should not call any more pre-write/post-read-conversion
5575 functions while this pre-write-conversion is running. */
5576 inhibit_pre_post_conversion = 1;
5577 call2 (coding->pre_write_conversion,
5578 make_number (from), make_number (to));
5579 inhibit_pre_post_conversion = 0;
5580 /* Discard the unwind protect. */
5581 specpdl_ptr--;
5582
5583 if (current_buffer != prev)
5584 {
5585 len = ZV - BEGV;
5586 new = Fcurrent_buffer ();
5587 set_buffer_internal_1 (prev);
5588 del_range_2 (from, from_byte, to, to_byte, 0);
5589 TEMP_SET_PT_BOTH (from, from_byte);
5590 insert_from_buffer (XBUFFER (new), 1, len, 0);
5591 Fkill_buffer (new);
5592 if (orig_point >= to)
5593 orig_point += len - orig_len;
5594 else if (orig_point > from)
5595 orig_point = from;
5596 orig_len = len;
5597 to = from + len;
5598 from_byte = CHAR_TO_BYTE (from);
5599 to_byte = CHAR_TO_BYTE (to);
5600 len_byte = to_byte - from_byte;
5601 TEMP_SET_PT_BOTH (from, from_byte);
5602 }
5603 }
5604
5605 if (replace)
5606 {
5607 if (! EQ (current_buffer->undo_list, Qt))
5608 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5609 else
5610 {
5611 nchars_del = to - from;
5612 nbytes_del = to_byte - from_byte;
5613 }
5614 }
5615
5616 if (coding->composing != COMPOSITION_DISABLED)
5617 {
5618 if (encodep)
5619 coding_save_composition (coding, from, to, Fcurrent_buffer ());
5620 else
5621 coding_allocate_composition_data (coding, from);
5622 }
5623
5624 /* Try to skip the leading and trailing ASCII characters. */
5625 if (coding->type != coding_type_ccl)
5626 {
5627 int from_byte_orig = from_byte, to_byte_orig = to_byte;
5628
5629 if (from < GPT && GPT < to)
5630 move_gap_both (from, from_byte);
5631 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
5632 if (from_byte == to_byte
5633 && (encodep || NILP (coding->post_read_conversion))
5634 && ! CODING_REQUIRE_FLUSHING (coding))
5635 {
5636 coding->produced = len_byte;
5637 coding->produced_char = len;
5638 if (!replace)
5639 /* We must record and adjust for this new text now. */
5640 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5641 return 0;
5642 }
5643
5644 head_skip = from_byte - from_byte_orig;
5645 tail_skip = to_byte_orig - to_byte;
5646 total_skip = head_skip + tail_skip;
5647 from += head_skip;
5648 to -= tail_skip;
5649 len -= total_skip; len_byte -= total_skip;
5650 }
5651
5652 /* For conversion, we must put the gap before the text in addition to
5653 making the gap larger for efficient decoding. The required gap
5654 size starts from 2000 which is the magic number used in make_gap.
5655 But, after one batch of conversion, it will be incremented if we
5656 find that it is not enough. */
5657 require = 2000;
5658
5659 if (GAP_SIZE < require)
5660 make_gap (require - GAP_SIZE);
5661 move_gap_both (from, from_byte);
5662
5663 inserted = inserted_byte = 0;
5664
5665 GAP_SIZE += len_byte;
5666 ZV -= len;
5667 Z -= len;
5668 ZV_BYTE -= len_byte;
5669 Z_BYTE -= len_byte;
5670
5671 if (GPT - BEG < BEG_UNCHANGED)
5672 BEG_UNCHANGED = GPT - BEG;
5673 if (Z - GPT < END_UNCHANGED)
5674 END_UNCHANGED = Z - GPT;
5675
5676 if (!encodep && coding->src_multibyte)
5677 {
5678 /* Decoding routines expect that the source text is unibyte.
5679 We must convert 8-bit characters of multibyte form to
5680 unibyte. */
5681 int len_byte_orig = len_byte;
5682 len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
5683 if (len_byte < len_byte_orig)
5684 safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5685 len_byte);
5686 coding->src_multibyte = 0;
5687 }
5688
5689 for (;;)
5690 {
5691 int result;
5692
5693 /* The buffer memory is now:
5694 +--------+converted-text+---------+-------original-text-------+---+
5695 |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5696 |<---------------------- GAP ----------------------->| */
5697 src = GAP_END_ADDR - len_byte;
5698 dst = GPT_ADDR + inserted_byte;
5699
5700 if (encodep)
5701 result = encode_coding (coding, src, dst, len_byte, 0);
5702 else
5703 {
5704 if (coding->composing != COMPOSITION_DISABLED)
5705 coding->cmp_data->char_offset = from + inserted;
5706 result = decode_coding (coding, src, dst, len_byte, 0);
5707 }
5708
5709 /* The buffer memory is now:
5710 +--------+-------converted-text----+--+------original-text----+---+
5711 |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5712 |<---------------------- GAP ----------------------->| */
5713
5714 inserted += coding->produced_char;
5715 inserted_byte += coding->produced;
5716 len_byte -= coding->consumed;
5717
5718 if (result == CODING_FINISH_INSUFFICIENT_CMP)
5719 {
5720 coding_allocate_composition_data (coding, from + inserted);
5721 continue;
5722 }
5723
5724 src += coding->consumed;
5725 dst += coding->produced;
5726
5727 if (result == CODING_FINISH_NORMAL)
5728 {
5729 src += len_byte;
5730 break;
5731 }
5732 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5733 {
5734 unsigned char *pend = dst, *p = pend - inserted_byte;
5735 Lisp_Object eol_type;
5736
5737 /* Encode LFs back to the original eol format (CR or CRLF). */
5738 if (coding->eol_type == CODING_EOL_CR)
5739 {
5740 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5741 }
5742 else
5743 {
5744 int count = 0;
5745
5746 while (p < pend) if (*p++ == '\n') count++;
5747 if (src - dst < count)
5748 {
5749 /* We don't have sufficient room for encoding LFs
5750 back to CRLF. We must record converted and
5751 not-yet-converted text back to the buffer
5752 content, enlarge the gap, then record them out of
5753 the buffer contents again. */
5754 int add = len_byte + inserted_byte;
5755
5756 GAP_SIZE -= add;
5757 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5758 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5759 make_gap (count - GAP_SIZE);
5760 GAP_SIZE += add;
5761 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5762 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5763 /* Don't forget to update SRC, DST, and PEND. */
5764 src = GAP_END_ADDR - len_byte;
5765 dst = GPT_ADDR + inserted_byte;
5766 pend = dst;
5767 }
5768 inserted += count;
5769 inserted_byte += count;
5770 coding->produced += count;
5771 p = dst = pend + count;
5772 while (count)
5773 {
5774 *--p = *--pend;
5775 if (*p == '\n') count--, *--p = '\r';
5776 }
5777 }
5778
5779 /* Suppress eol-format conversion in the further conversion. */
5780 coding->eol_type = CODING_EOL_LF;
5781
5782 /* Set the coding system symbol to that for Unix-like EOL. */
5783 eol_type = Fget (saved_coding_symbol, Qeol_type);
5784 if (VECTORP (eol_type)
5785 && XVECTOR (eol_type)->size == 3
5786 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5787 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5788 else
5789 coding->symbol = saved_coding_symbol;
5790
5791 continue;
5792 }
5793 if (len_byte <= 0)
5794 {
5795 if (coding->type != coding_type_ccl
5796 || coding->mode & CODING_MODE_LAST_BLOCK)
5797 break;
5798 coding->mode |= CODING_MODE_LAST_BLOCK;
5799 continue;
5800 }
5801 if (result == CODING_FINISH_INSUFFICIENT_SRC)
5802 {
5803 /* The source text ends in invalid codes. Let's just
5804 make them valid buffer contents, and finish conversion. */
5805 if (multibyte_p)
5806 {
5807 unsigned char *start = dst;
5808
5809 inserted += len_byte;
5810 while (len_byte--)
5811 {
5812 int c = *src++;
5813 dst += CHAR_STRING (c, dst);
5814 }
5815
5816 inserted_byte += dst - start;
5817 }
5818 else
5819 {
5820 inserted += len_byte;
5821 inserted_byte += len_byte;
5822 while (len_byte--)
5823 *dst++ = *src++;
5824 }
5825 break;
5826 }
5827 if (result == CODING_FINISH_INTERRUPT)
5828 {
5829 /* The conversion procedure was interrupted by a user. */
5830 break;
5831 }
5832 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
5833 if (coding->consumed < 1)
5834 {
5835 /* It's quite strange to require more memory without
5836 consuming any bytes. Perhaps CCL program bug. */
5837 break;
5838 }
5839 if (first)
5840 {
5841 /* We have just done the first batch of conversion which was
5842 stopped because of insufficient gap. Let's reconsider the
5843 required gap size (i.e. SRC - DST) now.
5844
5845 We have converted ORIG bytes (== coding->consumed) into
5846 NEW bytes (coding->produced). To convert the remaining
5847 LEN_BYTE bytes, we may need REQUIRE bytes of gap, where:
5848 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5849 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5850 Here, we are sure that NEW >= ORIG. */
5851 float ratio;
5852
5853 if (coding->produced <= coding->consumed)
5854 {
5855 /* This happens because of CCL-based coding system with
5856 eol-type CRLF. */
5857 require = 0;
5858 }
5859 else
5860 {
5861 ratio = (coding->produced - coding->consumed) / coding->consumed;
5862 require = len_byte * ratio;
5863 }
5864 first = 0;
5865 }
5866 if ((src - dst) < (require + 2000))
5867 {
5868 /* See the comment above the previous call of make_gap. */
5869 int add = len_byte + inserted_byte;
5870
5871 GAP_SIZE -= add;
5872 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5873 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5874 make_gap (require + 2000);
5875 GAP_SIZE += add;
5876 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5877 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5878 }
5879 }
5880 if (src - dst > 0) *dst = 0; /* Put an anchor. */
5881
5882 if (encodep && coding->dst_multibyte)
5883 {
5884 /* The output is unibyte. We must convert 8-bit characters to
5885 multibyte form. */
5886 if (inserted_byte * 2 > GAP_SIZE)
5887 {
5888 GAP_SIZE -= inserted_byte;
5889 ZV += inserted_byte; Z += inserted_byte;
5890 ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5891 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5892 make_gap (inserted_byte - GAP_SIZE);
5893 GAP_SIZE += inserted_byte;
5894 ZV -= inserted_byte; Z -= inserted_byte;
5895 ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5896 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5897 }
5898 inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5899 }
5900
5901 /* If we shrank the conversion area, adjust it now. */
5902 if (total_skip > 0)
5903 {
5904 if (tail_skip > 0)
5905 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5906 inserted += total_skip; inserted_byte += total_skip;
5907 GAP_SIZE += total_skip;
5908 GPT -= head_skip; GPT_BYTE -= head_skip;
5909 ZV -= total_skip; ZV_BYTE -= total_skip;
5910 Z -= total_skip; Z_BYTE -= total_skip;
5911 from -= head_skip; from_byte -= head_skip;
5912 to += tail_skip; to_byte += tail_skip;
5913 }
5914
5915 prev_Z = Z;
5916 if (! EQ (current_buffer->undo_list, Qt))
5917 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5918 else
5919 adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5920 inserted, inserted_byte);
5921 inserted = Z - prev_Z;
5922
5923 if (!encodep && coding->cmp_data && coding->cmp_data->used)
5924 coding_restore_composition (coding, Fcurrent_buffer ());
5925 coding_free_composition_data (coding);
5926
5927 if (! inhibit_pre_post_conversion
5928 && ! encodep && ! NILP (coding->post_read_conversion))
5929 {
5930 Lisp_Object val;
5931 Lisp_Object saved_coding_system;
5932
5933 if (from != PT)
5934 TEMP_SET_PT_BOTH (from, from_byte);
5935 prev_Z = Z;
5936 record_unwind_protect (code_convert_region_unwind,
5937 Vlast_coding_system_used);
5938 saved_coding_system = Vlast_coding_system_used;
5939 Vlast_coding_system_used = coding->symbol;
5940 /* We should not call any more pre-write/post-read-conversion
5941 functions while this post-read-conversion is running. */
5942 inhibit_pre_post_conversion = 1;
5943 val = call1 (coding->post_read_conversion, make_number (inserted));
5944 inhibit_pre_post_conversion = 0;
5945 coding->symbol = Vlast_coding_system_used;
5946 Vlast_coding_system_used = saved_coding_system;
5947 /* Discard the unwind protect. */
5948 specpdl_ptr--;
5949 CHECK_NUMBER (val);
5950 inserted += Z - prev_Z;
5951 }
5952
5953 if (orig_point >= from)
5954 {
5955 if (orig_point >= from + orig_len)
5956 orig_point += inserted - orig_len;
5957 else
5958 orig_point = from;
5959 TEMP_SET_PT (orig_point);
5960 }
5961
5962 if (replace)
5963 {
5964 signal_after_change (from, to - from, inserted);
5965 update_compositions (from, from + inserted, CHECK_BORDER);
5966 }
5967
5968 {
5969 coding->consumed = to_byte - from_byte;
5970 coding->consumed_char = to - from;
5971 coding->produced = inserted_byte;
5972 coding->produced_char = inserted;
5973 }
5974
5975 return 0;
5976 }
5977 6648
5978 Lisp_Object 6649 Lisp_Object
5979 run_pre_post_conversion_on_str (str, coding, encodep) 6650 preferred_coding_system ()
5980 Lisp_Object str; 6651 {
5981 struct coding_system *coding; 6652 int id = coding_categories[coding_priorities[0]].id;
5982 int encodep; 6653
5983 { 6654 return CODING_ID_NAME (id);
5984 int count = SPECPDL_INDEX ();
5985 struct gcpro gcpro1, gcpro2;
5986 int multibyte = STRING_MULTIBYTE (str);
5987 Lisp_Object buffer;
5988 struct buffer *buf;
5989 Lisp_Object old_deactivate_mark;
5990
5991 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
5992 record_unwind_protect (code_convert_region_unwind,
5993 Vlast_coding_system_used);
5994 /* It is not crucial to specbind this. */
5995 old_deactivate_mark = Vdeactivate_mark;
5996 GCPRO2 (str, old_deactivate_mark);
5997
5998 buffer = Fget_buffer_create (build_string (" *code-converting-work*"));
5999 buf = XBUFFER (buffer);
6000
6001 delete_all_overlays (buf);
6002 buf->directory = current_buffer->directory;
6003 buf->read_only = Qnil;
6004 buf->filename = Qnil;
6005 buf->undo_list = Qt;
6006 eassert (buf->overlays_before == NULL);
6007 eassert (buf->overlays_after == NULL);
6008
6009 set_buffer_internal (buf);
6010 /* We must insert the contents of STR as is without
6011 unibyte<->multibyte conversion. For that, we adjust the
6012 multibyteness of the working buffer to that of STR. */
6013 Ferase_buffer ();
6014 buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
6015
6016 insert_from_string (str, 0, 0,
6017 SCHARS (str), SBYTES (str), 0);
6018 UNGCPRO;
6019 inhibit_pre_post_conversion = 1;
6020 if (encodep)
6021 call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
6022 else
6023 {
6024 Vlast_coding_system_used = coding->symbol;
6025 TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
6026 call1 (coding->post_read_conversion, make_number (Z - BEG));
6027 coding->symbol = Vlast_coding_system_used;
6028 }
6029 inhibit_pre_post_conversion = 0;
6030 Vdeactivate_mark = old_deactivate_mark;
6031 str = make_buffer_string (BEG, Z, 1);
6032 return unbind_to (count, str);
6033 }
6034
6035 Lisp_Object
6036 decode_coding_string (str, coding, nocopy)
6037 Lisp_Object str;
6038 struct coding_system *coding;
6039 int nocopy;
6040 {
6041 int len;
6042 struct conversion_buffer buf;
6043 int from, to_byte;
6044 Lisp_Object saved_coding_symbol;
6045 int result;
6046 int require_decoding;
6047 int shrinked_bytes = 0;
6048 Lisp_Object newstr;
6049 int consumed, consumed_char, produced, produced_char;
6050
6051 from = 0;
6052 to_byte = SBYTES (str);
6053
6054 saved_coding_symbol = coding->symbol;
6055 coding->src_multibyte = STRING_MULTIBYTE (str);
6056 coding->dst_multibyte = 1;
6057 if (CODING_REQUIRE_DETECTION (coding))
6058 {
6059 /* See the comments in code_convert_region. */
6060 if (coding->type == coding_type_undecided)
6061 {
6062 detect_coding (coding, SDATA (str), to_byte);
6063 if (coding->type == coding_type_undecided)
6064 {
6065 coding->type = coding_type_emacs_mule;
6066 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
6067 /* As emacs-mule decoder will handle composition, we
6068 need this setting to allocate coding->cmp_data
6069 later. */
6070 coding->composing = COMPOSITION_NO;
6071 }
6072 }
6073 if (coding->eol_type == CODING_EOL_UNDECIDED
6074 && coding->type != coding_type_ccl)
6075 {
6076 saved_coding_symbol = coding->symbol;
6077 detect_eol (coding, SDATA (str), to_byte);
6078 if (coding->eol_type == CODING_EOL_UNDECIDED)
6079 coding->eol_type = CODING_EOL_LF;
6080 /* We had better recover the original eol format if we
6081 encounter an inconsistent eol format while decoding. */
6082 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
6083 }
6084 }
6085
6086 if (coding->type == coding_type_no_conversion
6087 || coding->type == coding_type_raw_text)
6088 coding->dst_multibyte = 0;
6089
6090 require_decoding = CODING_REQUIRE_DECODING (coding);
6091
6092 if (STRING_MULTIBYTE (str))
6093 {
6094 /* Decoding routines expect the source text to be unibyte. */
6095 str = Fstring_as_unibyte (str);
6096 to_byte = SBYTES (str);
6097 nocopy = 1;
6098 coding->src_multibyte = 0;
6099 }
6100
6101 /* Try to skip the leading and trailing ASCII characters. */
6102 if (require_decoding && coding->type != coding_type_ccl)
6103 {
6104 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6105 0);
6106 if (from == to_byte)
6107 require_decoding = 0;
6108 shrinked_bytes = from + (SBYTES (str) - to_byte);
6109 }
6110
6111 if (!require_decoding
6112 && !(SYMBOLP (coding->post_read_conversion)
6113 && !NILP (Ffboundp (coding->post_read_conversion))))
6114 {
6115 coding->consumed = SBYTES (str);
6116 coding->consumed_char = SCHARS (str);
6117 if (coding->dst_multibyte)
6118 {
6119 str = Fstring_as_multibyte (str);
6120 nocopy = 1;
6121 }
6122 coding->produced = SBYTES (str);
6123 coding->produced_char = SCHARS (str);
6124 return (nocopy ? str : Fcopy_sequence (str));
6125 }
6126
6127 if (coding->composing != COMPOSITION_DISABLED)
6128 coding_allocate_composition_data (coding, from);
6129 len = decoding_buffer_size (coding, to_byte - from);
6130 allocate_conversion_buffer (buf, len);
6131
6132 consumed = consumed_char = produced = produced_char = 0;
6133 while (1)
6134 {
6135 result = decode_coding (coding, SDATA (str) + from + consumed,
6136 buf.data + produced, to_byte - from - consumed,
6137 buf.size - produced);
6138 consumed += coding->consumed;
6139 consumed_char += coding->consumed_char;
6140 produced += coding->produced;
6141 produced_char += coding->produced_char;
6142 if (result == CODING_FINISH_NORMAL
6143 || (result == CODING_FINISH_INSUFFICIENT_SRC
6144 && coding->consumed == 0))
6145 break;
6146 if (result == CODING_FINISH_INSUFFICIENT_CMP)
6147 coding_allocate_composition_data (coding, from + produced_char);
6148 else if (result == CODING_FINISH_INSUFFICIENT_DST)
6149 extend_conversion_buffer (&buf);
6150 else if (result == CODING_FINISH_INCONSISTENT_EOL)
6151 {
6152 Lisp_Object eol_type;
6153
6154 /* Recover the original EOL format. */
6155 if (coding->eol_type == CODING_EOL_CR)
6156 {
6157 unsigned char *p;
6158 for (p = buf.data; p < buf.data + produced; p++)
6159 if (*p == '\n') *p = '\r';
6160 }
6161 else if (coding->eol_type == CODING_EOL_CRLF)
6162 {
6163 int num_eol = 0;
6164 unsigned char *p0, *p1;
6165 for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
6166 if (*p0 == '\n') num_eol++;
6167 if (produced + num_eol >= buf.size)
6168 extend_conversion_buffer (&buf);
6169 for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
6170 {
6171 *--p1 = *--p0;
6172 if (*p0 == '\n') *--p1 = '\r';
6173 }
6174 produced += num_eol;
6175 produced_char += num_eol;
6176 }
6177 /* Suppress eol-format conversion in the further conversion. */
6178 coding->eol_type = CODING_EOL_LF;
6179
6180 /* Set the coding system symbol to that for Unix-like EOL. */
6181 eol_type = Fget (saved_coding_symbol, Qeol_type);
6182 if (VECTORP (eol_type)
6183 && XVECTOR (eol_type)->size == 3
6184 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6185 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6186 else
6187 coding->symbol = saved_coding_symbol;
6188
6189
6190 }
6191 }
6192
6193 coding->consumed = consumed;
6194 coding->consumed_char = consumed_char;
6195 coding->produced = produced;
6196 coding->produced_char = produced_char;
6197
6198 if (coding->dst_multibyte)
6199 newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6200 produced + shrinked_bytes);
6201 else
6202 newstr = make_uninit_string (produced + shrinked_bytes);
6203 if (from > 0)
6204 STRING_COPYIN (newstr, 0, SDATA (str), from);
6205 STRING_COPYIN (newstr, from, buf.data, produced);
6206 if (shrinked_bytes > from)
6207 STRING_COPYIN (newstr, from + produced,
6208 SDATA (str) + to_byte,
6209 shrinked_bytes - from);
6210 free_conversion_buffer (&buf);
6211
6212 if (coding->cmp_data && coding->cmp_data->used)
6213 coding_restore_composition (coding, newstr);
6214 coding_free_composition_data (coding);
6215
6216 if (SYMBOLP (coding->post_read_conversion)
6217 && !NILP (Ffboundp (coding->post_read_conversion)))
6218 newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6219
6220 return newstr;
6221 }
6222
6223 Lisp_Object
6224 encode_coding_string (str, coding, nocopy)
6225 Lisp_Object str;
6226 struct coding_system *coding;
6227 int nocopy;
6228 {
6229 int len;
6230 struct conversion_buffer buf;
6231 int from, to, to_byte;
6232 int result;
6233 int shrinked_bytes = 0;
6234 Lisp_Object newstr;
6235 int consumed, consumed_char, produced, produced_char;
6236
6237 if (SYMBOLP (coding->pre_write_conversion)
6238 && !NILP (Ffboundp (coding->pre_write_conversion)))
6239 str = run_pre_post_conversion_on_str (str, coding, 1);
6240
6241 from = 0;
6242 to = SCHARS (str);
6243 to_byte = SBYTES (str);
6244
6245 /* Encoding routines determine the multibyteness of the source text
6246 by coding->src_multibyte. */
6247 coding->src_multibyte = STRING_MULTIBYTE (str);
6248 coding->dst_multibyte = 0;
6249 if (! CODING_REQUIRE_ENCODING (coding))
6250 {
6251 coding->consumed = SBYTES (str);
6252 coding->consumed_char = SCHARS (str);
6253 if (STRING_MULTIBYTE (str))
6254 {
6255 str = Fstring_as_unibyte (str);
6256 nocopy = 1;
6257 }
6258 coding->produced = SBYTES (str);
6259 coding->produced_char = SCHARS (str);
6260 return (nocopy ? str : Fcopy_sequence (str));
6261 }
6262
6263 if (coding->composing != COMPOSITION_DISABLED)
6264 coding_save_composition (coding, from, to, str);
6265
6266 /* Try to skip the leading and trailing ASCII characters. */
6267 if (coding->type != coding_type_ccl)
6268 {
6269 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6270 1);
6271 if (from == to_byte)
6272 return (nocopy ? str : Fcopy_sequence (str));
6273 shrinked_bytes = from + (SBYTES (str) - to_byte);
6274 }
6275
6276 len = encoding_buffer_size (coding, to_byte - from);
6277 allocate_conversion_buffer (buf, len);
6278
6279 consumed = consumed_char = produced = produced_char = 0;
6280 while (1)
6281 {
6282 result = encode_coding (coding, SDATA (str) + from + consumed,
6283 buf.data + produced, to_byte - from - consumed,
6284 buf.size - produced);
6285 consumed += coding->consumed;
6286 consumed_char += coding->consumed_char;
6287 produced += coding->produced;
6288 produced_char += coding->produced_char;
6289 if (result == CODING_FINISH_NORMAL
6290 || (result == CODING_FINISH_INSUFFICIENT_SRC
6291 && coding->consumed == 0))
6292 break;
6293 /* Now result should be CODING_FINISH_INSUFFICIENT_DST. */
6294 extend_conversion_buffer (&buf);
6295 }
6296
6297 coding->consumed = consumed;
6298 coding->consumed_char = consumed_char;
6299 coding->produced = produced;
6300 coding->produced_char = produced_char;
6301
6302 newstr = make_uninit_string (produced + shrinked_bytes);
6303 if (from > 0)
6304 STRING_COPYIN (newstr, 0, SDATA (str), from);
6305 STRING_COPYIN (newstr, from, buf.data, produced);
6306 if (shrinked_bytes > from)
6307 STRING_COPYIN (newstr, from + produced,
6308 SDATA (str) + to_byte,
6309 shrinked_bytes - from);
6310
6311 free_conversion_buffer (&buf);
6312 coding_free_composition_data (coding);
6313
6314 return newstr;
6315 } 6655 }
6316 6656
6317 6657
6318 #ifdef emacs 6658 #ifdef emacs
6319 /*** 8. Emacs Lisp library functions ***/ 6659 /*** 8. Emacs Lisp library functions ***/
6320 6660
6321 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0, 6661 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6322 doc: /* Return t if OBJECT is nil or a coding-system. 6662 doc: /* Return t if OBJECT is nil or a coding-system.
6323 See the documentation of `make-coding-system' for information 6663 See the documentation of `define-coding-system' for information
6324 about coding-system objects. */) 6664 about coding-system objects. */)
6325 (obj) 6665 (obj)
6326 Lisp_Object obj; 6666 Lisp_Object obj;
6327 { 6667 {
6328 if (NILP (obj)) 6668 return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
6329 return Qt;
6330 if (!SYMBOLP (obj))
6331 return Qnil;
6332 /* Get coding-spec vector for OBJ. */
6333 obj = Fget (obj, Qcoding_system);
6334 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6335 ? Qt : Qnil);
6336 } 6669 }
6337 6670
6338 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system, 6671 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6339 Sread_non_nil_coding_system, 1, 1, 0, 6672 Sread_non_nil_coding_system, 1, 1, 0,
6340 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */) 6673 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6357 (prompt, default_coding_system) 6690 (prompt, default_coding_system)
6358 Lisp_Object prompt, default_coding_system; 6691 Lisp_Object prompt, default_coding_system;
6359 { 6692 {
6360 Lisp_Object val; 6693 Lisp_Object val;
6361 if (SYMBOLP (default_coding_system)) 6694 if (SYMBOLP (default_coding_system))
6362 default_coding_system = SYMBOL_NAME (default_coding_system); 6695 XSETSTRING (default_coding_system, SYMBOL_NAME (default_coding_system));
6363 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil, 6696 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6364 Qt, Qnil, Qcoding_system_history, 6697 Qt, Qnil, Qcoding_system_history,
6365 default_coding_system, Qnil); 6698 default_coding_system, Qnil);
6366 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil)); 6699 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6367 } 6700 }
6368 6701
6369 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system, 6702 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6370 1, 1, 0, 6703 1, 1, 0,
6371 doc: /* Check validity of CODING-SYSTEM. 6704 doc: /* Check validity of CODING-SYSTEM.
6372 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. 6705 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. */)
6373 It is valid if it is a symbol with a non-nil `coding-system' property. 6706 (coding_system)
6374 The value of property should be a vector of length 5. */)
6375 (coding_system)
6376 Lisp_Object coding_system; 6707 Lisp_Object coding_system;
6377 { 6708 {
6378 CHECK_SYMBOL (coding_system); 6709 CHECK_SYMBOL (coding_system);
6379 if (!NILP (Fcoding_system_p (coding_system))) 6710 if (!NILP (Fcoding_system_p (coding_system)))
6380 return coding_system; 6711 return coding_system;
6381 while (1) 6712 while (1)
6382 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil)); 6713 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6383 } 6714 }
6715
6384 6716
6717 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
6718 HIGHEST is nonzero, return the coding system of the highest
6719 priority among the detected coding systems. Otherwise return a
6720 list of detected coding systems sorted by their priorities. If
6721 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6722 multibyte form but contain only ASCII and eight-bit chars.
6723 Otherwise, the bytes are raw bytes.
6724
6725 CODING-SYSTEM controls the detection as below:
6726
6727 If it is nil, detect both text-format and eol-format. If the
6728 text-format part of CODING-SYSTEM is already specified
6729 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
6730 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6731 detect only text-format. */
6732
6385 Lisp_Object 6733 Lisp_Object
6386 detect_coding_system (src, src_bytes, highest, multibytep) 6734 detect_coding_system (src, src_bytes, highest, multibytep, coding_system)
6387 const unsigned char *src; 6735 const unsigned char *src;
6388 int src_bytes, highest; 6736 int src_bytes, highest;
6389 int multibytep; 6737 int multibytep;
6390 { 6738 Lisp_Object coding_system;
6391 int coding_mask, eol_type; 6739 {
6392 Lisp_Object val, tmp; 6740 const unsigned char *src_end = src + src_bytes;
6393 int dummy; 6741 Lisp_Object attrs, eol_type;
6394 6742 Lisp_Object val;
6395 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep); 6743 struct coding_system coding;
6396 eol_type = detect_eol_type (src, src_bytes, &dummy); 6744 int id;
6397 if (eol_type == CODING_EOL_INCONSISTENT) 6745 struct coding_detection_info detect_info;
6398 eol_type = CODING_EOL_UNDECIDED; 6746
6399 6747 if (NILP (coding_system))
6400 if (!coding_mask) 6748 coding_system = Qundecided;
6401 { 6749 setup_coding_system (coding_system, &coding);
6402 val = Qundecided; 6750 attrs = CODING_ID_ATTRS (coding.id);
6403 if (eol_type != CODING_EOL_UNDECIDED) 6751 eol_type = CODING_ID_EOL_TYPE (coding.id);
6404 { 6752 coding_system = CODING_ATTR_BASE_NAME (attrs);
6405 Lisp_Object val2; 6753
6406 val2 = Fget (Qundecided, Qeol_type); 6754 coding.source = src;
6407 if (VECTORP (val2)) 6755 coding.src_bytes = src_bytes;
6408 val = XVECTOR (val2)->contents[eol_type]; 6756 coding.src_multibyte = multibytep;
6409 } 6757 coding.consumed = 0;
6410 return (highest ? val : Fcons (val, Qnil)); 6758 coding.mode |= CODING_MODE_LAST_BLOCK;
6411 } 6759
6412 6760 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6413 /* At first, gather possible coding systems in VAL. */ 6761
6414 val = Qnil; 6762 /* At first, detect text-format if necessary. */
6415 for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp)) 6763 if (XINT (CODING_ATTR_CATEGORY (attrs)) == coding_category_undecided)
6416 { 6764 {
6417 Lisp_Object category_val, category_index; 6765 enum coding_category category;
6418 6766 struct coding_system *this;
6419 category_index = Fget (XCAR (tmp), Qcoding_category_index); 6767 int c, i;
6420 category_val = Fsymbol_value (XCAR (tmp)); 6768
6421 if (!NILP (category_val) 6769 for (; src < src_end; src++)
6422 && NATNUMP (category_index) 6770 {
6423 && (coding_mask & (1 << XFASTINT (category_index)))) 6771 c = *src;
6424 { 6772 if (c & 0x80
6425 val = Fcons (category_val, val); 6773 || (c < 0x20 && (c == ISO_CODE_ESC
6426 if (highest) 6774 || c == ISO_CODE_SI
6775 || c == ISO_CODE_SO)))
6427 break; 6776 break;
6428 } 6777 }
6429 } 6778 coding.head_ascii = src - coding.source;
6430 if (!highest) 6779
6431 val = Fnreverse (val); 6780 if (src < src_end)
6432 6781 for (i = 0; i < coding_category_raw_text; i++)
6433 /* Then, replace the elements with subsidiary coding systems. */ 6782 {
6434 for (tmp = val; CONSP (tmp); tmp = XCDR (tmp)) 6783 category = coding_priorities[i];
6435 { 6784 this = coding_categories + category;
6436 if (eol_type != CODING_EOL_UNDECIDED 6785
6437 && eol_type != CODING_EOL_INCONSISTENT) 6786 if (this->id < 0)
6438 { 6787 {
6439 Lisp_Object eol; 6788 /* No coding system of this category is defined. */
6440 eol = Fget (XCAR (tmp), Qeol_type); 6789 detect_info.rejected |= (1 << category);
6441 if (VECTORP (eol)) 6790 }
6442 XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]); 6791 else if (category >= coding_category_raw_text)
6443 } 6792 continue;
6444 } 6793 else if (detect_info.checked & (1 << category))
6794 {
6795 if (highest
6796 && (detect_info.found & (1 << category)))
6797 break;
6798 }
6799 else
6800 {
6801 if ((*(this->detector)) (&coding, &detect_info)
6802 && highest
6803 && (detect_info.found & (1 << category)))
6804 break;
6805 }
6806 }
6807
6808
6809 if (detect_info.rejected == CATEGORY_MASK_ANY)
6810 {
6811 detect_info.found = CATEGORY_MASK_RAW_TEXT;
6812 id = coding_categories[coding_category_raw_text].id;
6813 val = Fcons (make_number (id), Qnil);
6814 }
6815 else if (! detect_info.rejected && ! detect_info.found)
6816 {
6817 detect_info.found = CATEGORY_MASK_ANY;
6818 id = coding_categories[coding_category_undecided].id;
6819 val = Fcons (make_number (id), Qnil);
6820 }
6821 else if (highest)
6822 {
6823 if (detect_info.found)
6824 {
6825 detect_info.found = 1 << category;
6826 val = Fcons (make_number (this->id), Qnil);
6827 }
6828 else
6829 for (i = 0; i < coding_category_raw_text; i++)
6830 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6831 {
6832 detect_info.found = 1 << coding_priorities[i];
6833 id = coding_categories[coding_priorities[i]].id;
6834 val = Fcons (make_number (id), Qnil);
6835 break;
6836 }
6837 }
6838 else
6839 {
6840 int mask = detect_info.rejected | detect_info.found;
6841 int found = 0;
6842 val = Qnil;
6843
6844 for (i = coding_category_raw_text - 1; i >= 0; i--)
6845 {
6846 category = coding_priorities[i];
6847 if (! (mask & (1 << category)))
6848 {
6849 found |= 1 << category;
6850 id = coding_categories[category].id;
6851 val = Fcons (make_number (id), val);
6852 }
6853 }
6854 for (i = coding_category_raw_text - 1; i >= 0; i--)
6855 {
6856 category = coding_priorities[i];
6857 if (detect_info.found & (1 << category))
6858 {
6859 id = coding_categories[category].id;
6860 val = Fcons (make_number (id), val);
6861 }
6862 }
6863 detect_info.found |= found;
6864 }
6865 }
6866 else
6867 {
6868 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
6869 val = Fcons (make_number (coding.id), Qnil);
6870 }
6871
6872 /* Then, detect eol-format if necessary. */
6873 {
6874 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
6875 Lisp_Object tail;
6876
6877 if (VECTORP (eol_type))
6878 {
6879 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
6880 normal_eol = detect_eol (coding.source, src_bytes,
6881 coding_category_raw_text);
6882 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
6883 | CATEGORY_MASK_UTF_16_BE_NOSIG))
6884 utf_16_be_eol = detect_eol (coding.source, src_bytes,
6885 coding_category_utf_16_be);
6886 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
6887 | CATEGORY_MASK_UTF_16_LE_NOSIG))
6888 utf_16_le_eol = detect_eol (coding.source, src_bytes,
6889 coding_category_utf_16_le);
6890 }
6891 else
6892 {
6893 if (EQ (eol_type, Qunix))
6894 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
6895 else if (EQ (eol_type, Qdos))
6896 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
6897 else
6898 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
6899 }
6900
6901 for (tail = val; CONSP (tail); tail = XCDR (tail))
6902 {
6903 enum coding_category category;
6904 int this_eol;
6905
6906 id = XINT (XCAR (tail));
6907 attrs = CODING_ID_ATTRS (id);
6908 category = XINT (CODING_ATTR_CATEGORY (attrs));
6909 eol_type = CODING_ID_EOL_TYPE (id);
6910 if (VECTORP (eol_type))
6911 {
6912 if (category == coding_category_utf_16_be
6913 || category == coding_category_utf_16_be_nosig)
6914 this_eol = utf_16_be_eol;
6915 else if (category == coding_category_utf_16_le
6916 || category == coding_category_utf_16_le_nosig)
6917 this_eol = utf_16_le_eol;
6918 else
6919 this_eol = normal_eol;
6920
6921 if (this_eol == EOL_SEEN_LF)
6922 XSETCAR (tail, AREF (eol_type, 0));
6923 else if (this_eol == EOL_SEEN_CRLF)
6924 XSETCAR (tail, AREF (eol_type, 1));
6925 else if (this_eol == EOL_SEEN_CR)
6926 XSETCAR (tail, AREF (eol_type, 2));
6927 else
6928 XSETCAR (tail, CODING_ID_NAME (id));
6929 }
6930 else
6931 XSETCAR (tail, CODING_ID_NAME (id));
6932 }
6933 }
6934
6445 return (highest ? XCAR (val) : val); 6935 return (highest ? XCAR (val) : val);
6446 } 6936 }
6937
6447 6938
6448 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region, 6939 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6449 2, 3, 0, 6940 2, 3, 0,
6450 doc: /* Detect how the byte sequence in the region is encoded. 6941 doc: /* Detect coding system of the text in the region between START and END.
6451 Return a list of possible coding systems used on decoding a byte 6942 Return a list of possible coding systems ordered by priority.
6452 sequence containing the bytes in the region between START and END when
6453 the coding system `undecided' is specified. The list is ordered by
6454 priority decided in the current language environment.
6455 6943
6456 If only ASCII characters are found, it returns a list of single element 6944 If only ASCII characters are found, it returns a list of single element
6457 `undecided' or its subsidiary coding system according to a detected 6945 `undecided' or its subsidiary coding system according to a detected
6458 end-of-line format. 6946 end-of-line format.
6459 6947
6462 (start, end, highest) 6950 (start, end, highest)
6463 Lisp_Object start, end, highest; 6951 Lisp_Object start, end, highest;
6464 { 6952 {
6465 int from, to; 6953 int from, to;
6466 int from_byte, to_byte; 6954 int from_byte, to_byte;
6467 int include_anchor_byte = 0;
6468 6955
6469 CHECK_NUMBER_COERCE_MARKER (start); 6956 CHECK_NUMBER_COERCE_MARKER (start);
6470 CHECK_NUMBER_COERCE_MARKER (end); 6957 CHECK_NUMBER_COERCE_MARKER (end);
6471 6958
6472 validate_region (&start, &end); 6959 validate_region (&start, &end);
6474 from_byte = CHAR_TO_BYTE (from); 6961 from_byte = CHAR_TO_BYTE (from);
6475 to_byte = CHAR_TO_BYTE (to); 6962 to_byte = CHAR_TO_BYTE (to);
6476 6963
6477 if (from < GPT && to >= GPT) 6964 if (from < GPT && to >= GPT)
6478 move_gap_both (to, to_byte); 6965 move_gap_both (to, to_byte);
6479 /* If we an anchor byte `\0' follows the region, we include it in 6966
6480 the detecting source. Then code detectors can handle the tailing
6481 byte sequence more accurately.
6482
6483 Fix me: This is not a perfect solution. It is better that we
6484 add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6485 */
6486 if (to == Z || (to == GPT && GAP_SIZE > 0))
6487 include_anchor_byte = 1;
6488 return detect_coding_system (BYTE_POS_ADDR (from_byte), 6967 return detect_coding_system (BYTE_POS_ADDR (from_byte),
6489 to_byte - from_byte + include_anchor_byte, 6968 to_byte - from_byte,
6490 !NILP (highest), 6969 !NILP (highest),
6491 !NILP (current_buffer 6970 !NILP (current_buffer
6492 ->enable_multibyte_characters)); 6971 ->enable_multibyte_characters),
6972 Qnil);
6493 } 6973 }
6494 6974
6495 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string, 6975 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6496 1, 2, 0, 6976 1, 2, 0,
6497 doc: /* Detect how the byte sequence in STRING is encoded. 6977 doc: /* Detect coding system of the text in STRING.
6498 Return a list of possible coding systems used on decoding a byte 6978 Return a list of possible coding systems ordered by priority.
6499 sequence containing the bytes in STRING when the coding system
6500 `undecided' is specified. The list is ordered by priority decided in
6501 the current language environment.
6502 6979
6503 If only ASCII characters are found, it returns a list of single element 6980 If only ASCII characters are found, it returns a list of single element
6504 `undecided' or its subsidiary coding system according to a detected 6981 `undecided' or its subsidiary coding system according to a detected
6505 end-of-line format. 6982 end-of-line format.
6506 6983
6509 (string, highest) 6986 (string, highest)
6510 Lisp_Object string, highest; 6987 Lisp_Object string, highest;
6511 { 6988 {
6512 CHECK_STRING (string); 6989 CHECK_STRING (string);
6513 6990
6514 return detect_coding_system (SDATA (string), 6991 return detect_coding_system (SDATA (string), SBYTES (string),
6515 /* "+ 1" is to include the anchor byte 6992 !NILP (highest), STRING_MULTIBYTE (string),
6516 `\0'. With this, code detectors can 6993 Qnil);
6517 handle the tailing bytes more 6994 }
6518 accurately. */ 6995
6519 SBYTES (string) + 1, 6996
6520 !NILP (highest), 6997 static INLINE int
6521 STRING_MULTIBYTE (string)); 6998 char_encodable_p (c, attrs)
6522 } 6999 int c;
6523 7000 Lisp_Object attrs;
6524 /* Subroutine for Fsafe_coding_systems_region_internal. 7001 {
6525 7002 Lisp_Object tail;
6526 Return a list of coding systems that safely encode the multibyte 7003 struct charset *charset;
6527 text between P and PEND. SAFE_CODINGS, if non-nil, is an alist of 7004
6528 possible coding systems. If it is nil, it means that we have not 7005 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
6529 yet found any coding systems. 7006 CONSP (tail); tail = XCDR (tail))
6530 7007 {
6531 WORK_TABLE is a copy of the char-table Vchar_coding_system_table. An 7008 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
6532 element of WORK_TABLE is set to t once the element is looked up. 7009 if (CHAR_CHARSET_P (c, charset))
6533 7010 break;
6534 If a non-ASCII single byte char is found, set 7011 }
6535 *single_byte_char_found to 1. */ 7012 return (! NILP (tail));
6536 7013 }
6537 static Lisp_Object 7014
6538 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found) 7015
6539 unsigned char *p, *pend; 7016 /* Return a list of coding systems that safely encode the text between
6540 Lisp_Object safe_codings, work_table; 7017 START and END. If EXCLUDE is non-nil, it is a list of coding
6541 int *single_byte_char_found; 7018 systems not to check. The returned list doesn't contain any such
6542 { 7019 coding systems. In any case, if the text contains only ASCII or is
6543 int c, len; 7020 unibyte, return t. */
6544 Lisp_Object val, ch;
6545 Lisp_Object prev, tail;
6546
6547 while (p < pend)
6548 {
6549 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6550 p += len;
6551 if (ASCII_BYTE_P (c))
6552 /* We can ignore ASCII characters here. */
6553 continue;
6554 if (SINGLE_BYTE_CHAR_P (c))
6555 *single_byte_char_found = 1;
6556 if (NILP (safe_codings))
6557 /* Already all coding systems are excluded. But, we can't
6558 terminate the loop here because non-ASCII single-byte char
6559 must be found. */
6560 continue;
6561 /* Check the safe coding systems for C. */
6562 ch = make_number (c);
6563 val = Faref (work_table, ch);
6564 if (EQ (val, Qt))
6565 /* This element was already checked. Ignore it. */
6566 continue;
6567 /* Remember that we checked this element. */
6568 Faset (work_table, ch, Qt);
6569
6570 for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
6571 {
6572 Lisp_Object elt, translation_table, hash_table, accept_latin_extra;
6573 int encodable;
6574
6575 elt = XCAR (tail);
6576 if (CONSP (XCDR (elt)))
6577 {
6578 /* This entry has this format now:
6579 ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6580 ACCEPT-LATIN-EXTRA ) */
6581 val = XCDR (elt);
6582 encodable = ! NILP (Faref (XCAR (val), ch));
6583 if (! encodable)
6584 {
6585 val = XCDR (val);
6586 translation_table = XCAR (val);
6587 hash_table = XCAR (XCDR (val));
6588 accept_latin_extra = XCAR (XCDR (XCDR (val)));
6589 }
6590 }
6591 else
6592 {
6593 /* This entry has this format now: ( CODING . SAFE-CHARS) */
6594 encodable = ! NILP (Faref (XCDR (elt), ch));
6595 if (! encodable)
6596 {
6597 /* Transform the format to:
6598 ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6599 ACCEPT-LATIN-EXTRA ) */
6600 val = Fget (XCAR (elt), Qcoding_system);
6601 translation_table
6602 = Fplist_get (AREF (val, 3),
6603 Qtranslation_table_for_encode);
6604 if (SYMBOLP (translation_table))
6605 translation_table = Fget (translation_table,
6606 Qtranslation_table);
6607 hash_table
6608 = (CHAR_TABLE_P (translation_table)
6609 ? XCHAR_TABLE (translation_table)->extras[1]
6610 : Qnil);
6611 accept_latin_extra
6612 = ((EQ (AREF (val, 0), make_number (2))
6613 && VECTORP (AREF (val, 4)))
6614 ? AREF (AREF (val, 4), 16)
6615 : Qnil);
6616 XSETCAR (tail, list5 (XCAR (elt), XCDR (elt),
6617 translation_table, hash_table,
6618 accept_latin_extra));
6619 }
6620 }
6621
6622 if (! encodable
6623 && ((CHAR_TABLE_P (translation_table)
6624 && ! NILP (Faref (translation_table, ch)))
6625 || (HASH_TABLE_P (hash_table)
6626 && ! NILP (Fgethash (ch, hash_table, Qnil)))
6627 || (SINGLE_BYTE_CHAR_P (c)
6628 && ! NILP (accept_latin_extra)
6629 && VECTORP (Vlatin_extra_code_table)
6630 && ! NILP (AREF (Vlatin_extra_code_table, c)))))
6631 encodable = 1;
6632 if (encodable)
6633 prev = tail;
6634 else
6635 {
6636 /* Exclude this coding system from SAFE_CODINGS. */
6637 if (EQ (tail, safe_codings))
6638 safe_codings = XCDR (safe_codings);
6639 else
6640 XSETCDR (prev, XCDR (tail));
6641 }
6642 }
6643 }
6644 return safe_codings;
6645 }
6646 7021
6647 DEFUN ("find-coding-systems-region-internal", 7022 DEFUN ("find-coding-systems-region-internal",
6648 Ffind_coding_systems_region_internal, 7023 Ffind_coding_systems_region_internal,
6649 Sfind_coding_systems_region_internal, 2, 2, 0, 7024 Sfind_coding_systems_region_internal, 2, 3, 0,
6650 doc: /* Internal use only. */) 7025 doc: /* Internal use only. */)
6651 (start, end) 7026 (start, end, exclude)
6652 Lisp_Object start, end; 7027 Lisp_Object start, end, exclude;
6653 { 7028 {
6654 Lisp_Object work_table, safe_codings; 7029 Lisp_Object coding_attrs_list, safe_codings;
6655 int non_ascii_p = 0; 7030 EMACS_INT start_byte, end_byte;
6656 int single_byte_char_found = 0; 7031 const unsigned char *p, *pbeg, *pend;
6657 const unsigned char *p1, *p1end, *p2, *p2end, *p; 7032 int c;
7033 Lisp_Object tail, elt;
6658 7034
6659 if (STRINGP (start)) 7035 if (STRINGP (start))
6660 { 7036 {
6661 if (!STRING_MULTIBYTE (start)) 7037 if (!STRING_MULTIBYTE (start)
7038 || SCHARS (start) == SBYTES (start))
6662 return Qt; 7039 return Qt;
6663 p1 = SDATA (start), p1end = p1 + SBYTES (start); 7040 start_byte = 0;
6664 p2 = p2end = p1end; 7041 end_byte = SBYTES (start);
6665 if (SCHARS (start) != SBYTES (start))
6666 non_ascii_p = 1;
6667 } 7042 }
6668 else 7043 else
6669 { 7044 {
6670 int from, to, stop;
6671
6672 CHECK_NUMBER_COERCE_MARKER (start); 7045 CHECK_NUMBER_COERCE_MARKER (start);
6673 CHECK_NUMBER_COERCE_MARKER (end); 7046 CHECK_NUMBER_COERCE_MARKER (end);
6674 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end)) 7047 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6675 args_out_of_range (start, end); 7048 args_out_of_range (start, end);
6676 if (NILP (current_buffer->enable_multibyte_characters)) 7049 if (NILP (current_buffer->enable_multibyte_characters))
6677 return Qt; 7050 return Qt;
6678 from = CHAR_TO_BYTE (XINT (start)); 7051 start_byte = CHAR_TO_BYTE (XINT (start));
6679 to = CHAR_TO_BYTE (XINT (end)); 7052 end_byte = CHAR_TO_BYTE (XINT (end));
6680 stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to; 7053 if (XINT (end) - XINT (start) == end_byte - start_byte)
6681 p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from); 7054 return Qt;
6682 if (stop == to) 7055
6683 p2 = p2end = p1end; 7056 if (XINT (start) < GPT && XINT (end) > GPT)
7057 {
7058 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7059 move_gap_both (XINT (start), start_byte);
7060 else
7061 move_gap_both (XINT (end), end_byte);
7062 }
7063 }
7064
7065 coding_attrs_list = Qnil;
7066 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7067 if (NILP (exclude)
7068 || NILP (Fmemq (XCAR (tail), exclude)))
7069 {
7070 Lisp_Object attrs;
7071
7072 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7073 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7074 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7075 coding_attrs_list = Fcons (attrs, coding_attrs_list);
7076 }
7077
7078 if (STRINGP (start))
7079 p = pbeg = SDATA (start);
7080 else
7081 p = pbeg = BYTE_POS_ADDR (start_byte);
7082 pend = p + (end_byte - start_byte);
7083
7084 while (p < pend && ASCII_BYTE_P (*p)) p++;
7085 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7086
7087 while (p < pend)
7088 {
7089 if (ASCII_BYTE_P (*p))
7090 p++;
6684 else 7091 else
6685 p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop); 7092 {
6686 if (XINT (end) - XINT (start) != to - from) 7093 c = STRING_CHAR_ADVANCE (p);
6687 non_ascii_p = 1; 7094
6688 } 7095 charset_map_loaded = 0;
6689 7096 for (tail = coding_attrs_list; CONSP (tail);)
6690 if (!non_ascii_p) 7097 {
6691 { 7098 elt = XCAR (tail);
6692 /* We are sure that the text contains no multibyte character. 7099 if (NILP (elt))
6693 Check if it contains eight-bit-graphic. */ 7100 tail = XCDR (tail);
6694 p = p1; 7101 else if (char_encodable_p (c, elt))
6695 for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++); 7102 tail = XCDR (tail);
6696 if (p == p1end) 7103 else if (CONSP (XCDR (tail)))
6697 { 7104 {
6698 for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++); 7105 XSETCAR (tail, XCAR (XCDR (tail)));
6699 if (p == p2end) 7106 XSETCDR (tail, XCDR (XCDR (tail)));
6700 return Qt; 7107 }
6701 } 7108 else
6702 } 7109 {
6703 7110 XSETCAR (tail, Qnil);
6704 /* The text contains non-ASCII characters. */ 7111 tail = XCDR (tail);
6705 7112 }
6706 work_table = Fmake_char_table (Qchar_coding_system, Qnil); 7113 }
6707 safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars)); 7114 if (charset_map_loaded)
6708 7115 {
6709 safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table, 7116 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
6710 &single_byte_char_found); 7117
6711 if (p2 < p2end) 7118 if (STRINGP (start))
6712 safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table, 7119 pbeg = SDATA (start);
6713 &single_byte_char_found); 7120 else
6714 if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars))) 7121 pbeg = BYTE_POS_ADDR (start_byte);
6715 safe_codings = Qt; 7122 p = pbeg + p_offset;
6716 else 7123 pend = pbeg + pend_offset;
6717 { 7124 }
6718 /* Turn safe_codings to a list of coding systems... */ 7125 }
6719 Lisp_Object val; 7126 }
6720 7127
6721 if (single_byte_char_found) 7128 safe_codings = Qnil;
6722 /* ... and append these for eight-bit chars. */ 7129 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
6723 val = Fcons (Qraw_text, 7130 if (! NILP (XCAR (tail)))
6724 Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil))); 7131 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
6725 else
6726 /* ... and append generic coding systems. */
6727 val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));
6728
6729 for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
6730 val = Fcons (XCAR (XCAR (safe_codings)), val);
6731 safe_codings = val;
6732 }
6733 7132
6734 return safe_codings; 7133 return safe_codings;
6735 }
6736
6737
6738 /* Search from position POS for such characters that are unencodable
6739 accoding to SAFE_CHARS, and return a list of their positions. P
6740 points where in the memory the character at POS exists. Limit the
6741 search at PEND or when Nth unencodable characters are found.
6742
6743 If SAFE_CHARS is a char table, an element for an unencodable
6744 character is nil.
6745
6746 If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6747
6748 Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
6749 eight-bit-graphic characters are unencodable. */
6750
6751 static Lisp_Object
6752 unencodable_char_position (safe_chars, pos, p, pend, n)
6753 Lisp_Object safe_chars;
6754 int pos;
6755 unsigned char *p, *pend;
6756 int n;
6757 {
6758 Lisp_Object pos_list;
6759
6760 pos_list = Qnil;
6761 while (p < pend)
6762 {
6763 int len;
6764 int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
6765
6766 if (c >= 128
6767 && (CHAR_TABLE_P (safe_chars)
6768 ? NILP (CHAR_TABLE_REF (safe_chars, c))
6769 : (NILP (safe_chars) || c < 256)))
6770 {
6771 pos_list = Fcons (make_number (pos), pos_list);
6772 if (--n <= 0)
6773 break;
6774 }
6775 pos++;
6776 p += len;
6777 }
6778 return Fnreverse (pos_list);
6779 } 7134 }
6780 7135
6781 7136
6782 DEFUN ("unencodable-char-position", Funencodable_char_position, 7137 DEFUN ("unencodable-char-position", Funencodable_char_position,
6783 Sunencodable_char_position, 3, 5, 0, 7138 Sunencodable_char_position, 3, 5, 0,
6795 to the string. */) 7150 to the string. */)
6796 (start, end, coding_system, count, string) 7151 (start, end, coding_system, count, string)
6797 Lisp_Object start, end, coding_system, count, string; 7152 Lisp_Object start, end, coding_system, count, string;
6798 { 7153 {
6799 int n; 7154 int n;
6800 Lisp_Object safe_chars;
6801 struct coding_system coding; 7155 struct coding_system coding;
7156 Lisp_Object attrs, charset_list;
6802 Lisp_Object positions; 7157 Lisp_Object positions;
6803 int from, to; 7158 int from, to;
6804 unsigned char *p, *pend; 7159 const unsigned char *p, *stop, *pend;
7160 int ascii_compatible;
7161
7162 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7163 attrs = CODING_ID_ATTRS (coding.id);
7164 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7165 return Qnil;
7166 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7167 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6805 7168
6806 if (NILP (string)) 7169 if (NILP (string))
6807 { 7170 {
6808 validate_region (&start, &end); 7171 validate_region (&start, &end);
6809 from = XINT (start); 7172 from = XINT (start);
6810 to = XINT (end); 7173 to = XINT (end);
6811 if (NILP (current_buffer->enable_multibyte_characters)) 7174 if (NILP (current_buffer->enable_multibyte_characters)
7175 || (ascii_compatible
7176 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
6812 return Qnil; 7177 return Qnil;
6813 p = CHAR_POS_ADDR (from); 7178 p = CHAR_POS_ADDR (from);
6814 if (to == GPT) 7179 pend = CHAR_POS_ADDR (to);
6815 pend = GPT_ADDR; 7180 if (from < GPT && to >= GPT)
7181 stop = GPT_ADDR;
6816 else 7182 else
6817 pend = CHAR_POS_ADDR (to); 7183 stop = pend;
6818 } 7184 }
6819 else 7185 else
6820 { 7186 {
6821 CHECK_STRING (string); 7187 CHECK_STRING (string);
6822 CHECK_NATNUM (start); 7188 CHECK_NATNUM (start);
6827 || to > SCHARS (string)) 7193 || to > SCHARS (string))
6828 args_out_of_range_3 (string, start, end); 7194 args_out_of_range_3 (string, start, end);
6829 if (! STRING_MULTIBYTE (string)) 7195 if (! STRING_MULTIBYTE (string))
6830 return Qnil; 7196 return Qnil;
6831 p = SDATA (string) + string_char_to_byte (string, from); 7197 p = SDATA (string) + string_char_to_byte (string, from);
6832 pend = SDATA (string) + string_char_to_byte (string, to); 7198 stop = pend = SDATA (string) + string_char_to_byte (string, to);
6833 } 7199 if (ascii_compatible && (to - from) == (pend - p))
6834 7200 return Qnil;
6835 setup_coding_system (Fcheck_coding_system (coding_system), &coding); 7201 }
6836 7202
6837 if (NILP (count)) 7203 if (NILP (count))
6838 n = 1; 7204 n = 1;
6839 else 7205 else
6840 { 7206 {
6841 CHECK_NATNUM (count); 7207 CHECK_NATNUM (count);
6842 n = XINT (count); 7208 n = XINT (count);
6843 } 7209 }
6844 7210
6845 if (coding.type == coding_type_no_conversion 7211 positions = Qnil;
6846 || coding.type == coding_type_raw_text) 7212 while (1)
6847 return Qnil; 7213 {
6848 7214 int c;
6849 if (coding.type == coding_type_undecided) 7215
6850 safe_chars = Qnil; 7216 if (ascii_compatible)
7217 while (p < stop && ASCII_BYTE_P (*p))
7218 p++, from++;
7219 if (p >= stop)
7220 {
7221 if (p >= pend)
7222 break;
7223 stop = pend;
7224 p = GAP_END_ADDR;
7225 }
7226
7227 c = STRING_CHAR_ADVANCE (p);
7228 if (! (ASCII_CHAR_P (c) && ascii_compatible)
7229 && ! char_charset (c, charset_list, NULL))
7230 {
7231 positions = Fcons (make_number (from), positions);
7232 n--;
7233 if (n == 0)
7234 break;
7235 }
7236
7237 from++;
7238 }
7239
7240 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7241 }
7242
7243
7244 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7245 Scheck_coding_systems_region, 3, 3, 0,
7246 doc: /* Check if the region is encodable by coding systems.
7247
7248 START and END are buffer positions specifying the region.
7249 CODING-SYSTEM-LIST is a list of coding systems to check.
7250
7251 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7252 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7253 whole region, POS0, POS1, ... are buffer positions where non-encodable
7254 characters are found.
7255
7256 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7257 value is nil.
7258
7259 START may be a string. In that case, check if the string is
7260 encodable, and the value contains indices to the string instead of
7261 buffer positions. END is ignored. */)
7262 (start, end, coding_system_list)
7263 Lisp_Object start, end, coding_system_list;
7264 {
7265 Lisp_Object list;
7266 EMACS_INT start_byte, end_byte;
7267 int pos;
7268 const unsigned char *p, *pbeg, *pend;
7269 int c;
7270 Lisp_Object tail, elt;
7271
7272 if (STRINGP (start))
7273 {
7274 if (!STRING_MULTIBYTE (start)
7275 && SCHARS (start) != SBYTES (start))
7276 return Qnil;
7277 start_byte = 0;
7278 end_byte = SBYTES (start);
7279 pos = 0;
7280 }
6851 else 7281 else
6852 safe_chars = coding_safe_chars (coding_system); 7282 {
6853 7283 CHECK_NUMBER_COERCE_MARKER (start);
6854 if (STRINGP (string) 7284 CHECK_NUMBER_COERCE_MARKER (end);
6855 || from >= GPT || to <= GPT) 7285 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6856 positions = unencodable_char_position (safe_chars, from, p, pend, n); 7286 args_out_of_range (start, end);
7287 if (NILP (current_buffer->enable_multibyte_characters))
7288 return Qnil;
7289 start_byte = CHAR_TO_BYTE (XINT (start));
7290 end_byte = CHAR_TO_BYTE (XINT (end));
7291 if (XINT (end) - XINT (start) == end_byte - start_byte)
7292 return Qt;
7293
7294 if (XINT (start) < GPT && XINT (end) > GPT)
7295 {
7296 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7297 move_gap_both (XINT (start), start_byte);
7298 else
7299 move_gap_both (XINT (end), end_byte);
7300 }
7301 pos = XINT (start);
7302 }
7303
7304 list = Qnil;
7305 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7306 {
7307 elt = XCAR (tail);
7308 list = Fcons (Fcons (elt, Fcons (AREF (CODING_SYSTEM_SPEC (elt), 0),
7309 Qnil)),
7310 list);
7311 }
7312
7313 if (STRINGP (start))
7314 p = pbeg = SDATA (start);
6857 else 7315 else
6858 { 7316 p = pbeg = BYTE_POS_ADDR (start_byte);
6859 Lisp_Object args[2]; 7317 pend = p + (end_byte - start_byte);
6860 7318
6861 args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n); 7319 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
6862 n -= XINT (Flength (args[0])); 7320 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
6863 if (n <= 0) 7321
6864 positions = args[0]; 7322 while (p < pend)
7323 {
7324 if (ASCII_BYTE_P (*p))
7325 p++;
6865 else 7326 else
6866 { 7327 {
6867 args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR, 7328 c = STRING_CHAR_ADVANCE (p);
6868 pend, n); 7329
6869 positions = Fappend (2, args); 7330 charset_map_loaded = 0;
6870 } 7331 for (tail = list; CONSP (tail); tail = XCDR (tail))
6871 } 7332 {
6872 7333 elt = XCDR (XCAR (tail));
6873 return (NILP (count) ? Fcar (positions) : positions); 7334 if (! char_encodable_p (c, XCAR (elt)))
6874 } 7335 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7336 }
7337 if (charset_map_loaded)
7338 {
7339 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7340
7341 if (STRINGP (start))
7342 pbeg = SDATA (start);
7343 else
7344 pbeg = BYTE_POS_ADDR (start_byte);
7345 p = pbeg + p_offset;
7346 pend = pbeg + pend_offset;
7347 }
7348 }
7349 pos++;
7350 }
7351
7352 tail = list;
7353 list = Qnil;
7354 for (; CONSP (tail); tail = XCDR (tail))
7355 {
7356 elt = XCAR (tail);
7357 if (CONSP (XCDR (XCDR (elt))))
7358 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7359 list);
7360 }
7361
7362 return list;
7363 }
7364
6875 7365
6876 7366
6877 Lisp_Object 7367 Lisp_Object
6878 code_convert_region1 (start, end, coding_system, encodep) 7368 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
6879 Lisp_Object start, end, coding_system; 7369 Lisp_Object start, end, coding_system, dst_object;
6880 int encodep; 7370 int encodep, norecord;
6881 { 7371 {
6882 struct coding_system coding; 7372 struct coding_system coding;
6883 int from, to; 7373 EMACS_INT from, from_byte, to, to_byte;
7374 Lisp_Object src_object;
6884 7375
6885 CHECK_NUMBER_COERCE_MARKER (start); 7376 CHECK_NUMBER_COERCE_MARKER (start);
6886 CHECK_NUMBER_COERCE_MARKER (end); 7377 CHECK_NUMBER_COERCE_MARKER (end);
6887 CHECK_SYMBOL (coding_system); 7378 if (NILP (coding_system))
7379 coding_system = Qno_conversion;
7380 else
7381 CHECK_CODING_SYSTEM (coding_system);
7382 src_object = Fcurrent_buffer ();
7383 if (NILP (dst_object))
7384 dst_object = src_object;
7385 else if (! EQ (dst_object, Qt))
7386 CHECK_BUFFER (dst_object);
6888 7387
6889 validate_region (&start, &end); 7388 validate_region (&start, &end);
6890 from = XFASTINT (start); 7389 from = XFASTINT (start);
7390 from_byte = CHAR_TO_BYTE (from);
6891 to = XFASTINT (end); 7391 to = XFASTINT (end);
6892 7392 to_byte = CHAR_TO_BYTE (to);
6893 if (NILP (coding_system)) 7393
6894 return make_number (to - from); 7394 setup_coding_system (coding_system, &coding);
6895
6896 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6897 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
6898
6899 coding.mode |= CODING_MODE_LAST_BLOCK; 7395 coding.mode |= CODING_MODE_LAST_BLOCK;
6900 coding.src_multibyte = coding.dst_multibyte 7396
6901 = !NILP (current_buffer->enable_multibyte_characters); 7397 if (encodep)
6902 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to), 7398 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
6903 &coding, encodep, 1); 7399 dst_object);
6904 Vlast_coding_system_used = coding.symbol; 7400 else
6905 return make_number (coding.produced_char); 7401 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
6906 } 7402 dst_object);
7403 if (! norecord)
7404 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7405
7406 if (coding.result != CODING_RESULT_SUCCESS)
7407 error ("Code conversion error: %d", coding.result);
7408
7409 return (BUFFERP (dst_object)
7410 ? make_number (coding.produced_char)
7411 : coding.dst_object);
7412 }
7413
6907 7414
6908 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region, 7415 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
6909 3, 3, "r\nzCoding system: ", 7416 3, 4, "r\nzCoding system: ",
6910 doc: /* Decode the current region from the specified coding system. 7417 doc: /* Decode the current region from the specified coding system.
6911 When called from a program, takes three arguments: 7418 When called from a program, takes four arguments:
6912 START, END, and CODING-SYSTEM. START and END are buffer positions. 7419 START, END, CODING-SYSTEM, and DESTINATION.
7420 START and END are buffer positions.
7421
7422 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7423 If nil, the region between START and END is replace by the decoded text.
7424 If buffer, the decoded text is inserted in the buffer.
7425 If t, the decoded text is returned.
7426
6913 This function sets `last-coding-system-used' to the precise coding system 7427 This function sets `last-coding-system-used' to the precise coding system
6914 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is 7428 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6915 not fully specified.) 7429 not fully specified.)
6916 It returns the length of the decoded text. */) 7430 It returns the length of the decoded text. */)
6917 (start, end, coding_system) 7431 (start, end, coding_system, destination)
6918 Lisp_Object start, end, coding_system; 7432 Lisp_Object start, end, coding_system, destination;
6919 { 7433 {
6920 return code_convert_region1 (start, end, coding_system, 0); 7434 return code_convert_region (start, end, coding_system, destination, 0, 0);
6921 } 7435 }
6922 7436
6923 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region, 7437 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
6924 3, 3, "r\nzCoding system: ", 7438 3, 4, "r\nzCoding system: ",
6925 doc: /* Encode the current region into the specified coding system. 7439 doc: /* Encode the current region by specified coding system.
6926 When called from a program, takes three arguments: 7440 When called from a program, takes three arguments:
6927 START, END, and CODING-SYSTEM. START and END are buffer positions. 7441 START, END, and CODING-SYSTEM. START and END are buffer positions.
7442
7443 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7444 If nil, the region between START and END is replace by the encoded text.
7445 If buffer, the encoded text is inserted in the buffer.
7446 If t, the encoded text is returned.
7447
6928 This function sets `last-coding-system-used' to the precise coding system 7448 This function sets `last-coding-system-used' to the precise coding system
6929 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is 7449 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6930 not fully specified.) 7450 not fully specified.)
6931 It returns the length of the encoded text. */) 7451 It returns the length of the encoded text. */)
6932 (start, end, coding_system) 7452 (start, end, coding_system, destination)
6933 Lisp_Object start, end, coding_system; 7453 Lisp_Object start, end, coding_system, destination;
6934 { 7454 {
6935 return code_convert_region1 (start, end, coding_system, 1); 7455 return code_convert_region (start, end, coding_system, destination, 1, 0);
6936 } 7456 }
6937 7457
6938 Lisp_Object 7458 Lisp_Object
6939 code_convert_string1 (string, coding_system, nocopy, encodep) 7459 code_convert_string (string, coding_system, dst_object,
6940 Lisp_Object string, coding_system, nocopy; 7460 encodep, nocopy, norecord)
6941 int encodep; 7461 Lisp_Object string, coding_system, dst_object;
7462 int encodep, nocopy, norecord;
6942 { 7463 {
6943 struct coding_system coding; 7464 struct coding_system coding;
7465 EMACS_INT chars, bytes;
6944 7466
6945 CHECK_STRING (string); 7467 CHECK_STRING (string);
6946 CHECK_SYMBOL (coding_system);
6947
6948 if (NILP (coding_system)) 7468 if (NILP (coding_system))
6949 return (NILP (nocopy) ? Fcopy_sequence (string) : string); 7469 {
6950 7470 if (! norecord)
6951 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0) 7471 Vlast_coding_system_used = Qno_conversion;
6952 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system))); 7472 if (NILP (dst_object))
6953 7473 return (nocopy ? Fcopy_sequence (string) : string);
7474 }
7475
7476 if (NILP (coding_system))
7477 coding_system = Qno_conversion;
7478 else
7479 CHECK_CODING_SYSTEM (coding_system);
7480 if (NILP (dst_object))
7481 dst_object = Qt;
7482 else if (! EQ (dst_object, Qt))
7483 CHECK_BUFFER (dst_object);
7484
7485 setup_coding_system (coding_system, &coding);
6954 coding.mode |= CODING_MODE_LAST_BLOCK; 7486 coding.mode |= CODING_MODE_LAST_BLOCK;
6955 string = (encodep 7487 chars = SCHARS (string);
6956 ? encode_coding_string (string, &coding, !NILP (nocopy)) 7488 bytes = SBYTES (string);
6957 : decode_coding_string (string, &coding, !NILP (nocopy))); 7489 if (encodep)
6958 Vlast_coding_system_used = coding.symbol; 7490 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
6959 7491 else
6960 return string; 7492 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
6961 } 7493 if (! norecord)
6962 7494 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
6963 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string, 7495
6964 2, 3, 0, 7496 if (coding.result != CODING_RESULT_SUCCESS)
6965 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result. 7497 error ("Code conversion error: %d", coding.result);
6966 Optional arg NOCOPY non-nil means it is OK to return STRING itself 7498
6967 if the decoding operation is trivial. 7499 return (BUFFERP (dst_object)
6968 This function sets `last-coding-system-used' to the precise coding system 7500 ? make_number (coding.produced_char)
6969 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is 7501 : coding.dst_object);
6970 not fully specified.) */) 7502 }
6971 (string, coding_system, nocopy) 7503
6972 Lisp_Object string, coding_system, nocopy;
6973 {
6974 return code_convert_string1 (string, coding_system, nocopy, 0);
6975 }
6976
6977 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
6978 2, 3, 0,
6979 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
6980 Optional arg NOCOPY non-nil means it is OK to return STRING itself
6981 if the encoding operation is trivial.
6982 This function sets `last-coding-system-used' to the precise coding system
6983 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6984 not fully specified.) */)
6985 (string, coding_system, nocopy)
6986 Lisp_Object string, coding_system, nocopy;
6987 {
6988 return code_convert_string1 (string, coding_system, nocopy, 1);
6989 }
6990 7504
6991 /* Encode or decode STRING according to CODING_SYSTEM. 7505 /* Encode or decode STRING according to CODING_SYSTEM.
6992 Do not set Vlast_coding_system_used. 7506 Do not set Vlast_coding_system_used.
6993 7507
6994 This function is called only from macros DECODE_FILE and 7508 This function is called only from macros DECODE_FILE and
6997 Lisp_Object 7511 Lisp_Object
6998 code_convert_string_norecord (string, coding_system, encodep) 7512 code_convert_string_norecord (string, coding_system, encodep)
6999 Lisp_Object string, coding_system; 7513 Lisp_Object string, coding_system;
7000 int encodep; 7514 int encodep;
7001 { 7515 {
7002 struct coding_system coding; 7516 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
7003 7517 }
7004 CHECK_STRING (string); 7518
7005 CHECK_SYMBOL (coding_system); 7519
7006 7520 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7007 if (NILP (coding_system)) 7521 2, 4, 0,
7008 return string; 7522 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7009 7523
7010 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0) 7524 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7011 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system))); 7525 if the decoding operation is trivial.
7012 7526
7013 coding.composing = COMPOSITION_DISABLED; 7527 Optional fourth arg BUFFER non-nil meant that the decoded text is
7014 coding.mode |= CODING_MODE_LAST_BLOCK; 7528 inserted in BUFFER instead of returned as a string. In this case,
7015 return (encodep 7529 the return value is BUFFER.
7016 ? encode_coding_string (string, &coding, 1) 7530
7017 : decode_coding_string (string, &coding, 1)); 7531 This function sets `last-coding-system-used' to the precise coding system
7018 } 7532 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7533 not fully specified. */)
7534 (string, coding_system, nocopy, buffer)
7535 Lisp_Object string, coding_system, nocopy, buffer;
7536 {
7537 return code_convert_string (string, coding_system, buffer,
7538 0, ! NILP (nocopy), 0);
7539 }
7540
7541 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7542 2, 4, 0,
7543 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7544
7545 Optional third arg NOCOPY non-nil means it is OK to return STRING
7546 itself if the encoding operation is trivial.
7547
7548 Optional fourth arg BUFFER non-nil meant that the encoded text is
7549 inserted in BUFFER instead of returned as a string. In this case,
7550 the return value is BUFFER.
7551
7552 This function sets `last-coding-system-used' to the precise coding system
7553 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7554 not fully specified.) */)
7555 (string, coding_system, nocopy, buffer)
7556 Lisp_Object string, coding_system, nocopy, buffer;
7557 {
7558 return code_convert_string (string, coding_system, buffer,
7559 1, ! NILP (nocopy), 1);
7560 }
7561
7019 7562
7020 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0, 7563 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7021 doc: /* Decode a Japanese character which has CODE in shift_jis encoding. 7564 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7022 Return the corresponding character. */) 7565 Return the corresponding character. */)
7023 (code) 7566 (code)
7024 Lisp_Object code; 7567 Lisp_Object code;
7025 { 7568 {
7026 unsigned char c1, c2, s1, s2; 7569 Lisp_Object spec, attrs, val;
7027 Lisp_Object val; 7570 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
7028 7571 int c;
7029 CHECK_NUMBER (code); 7572
7030 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF; 7573 CHECK_NATNUM (code);
7031 if (s1 == 0) 7574 c = XFASTINT (code);
7032 { 7575 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7033 if (s2 < 0x80) 7576 attrs = AREF (spec, 0);
7034 XSETFASTINT (val, s2); 7577
7035 else if (s2 >= 0xA0 || s2 <= 0xDF) 7578 if (ASCII_BYTE_P (c)
7036 XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0)); 7579 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7037 else 7580 return code;
7038 error ("Invalid Shift JIS code: %x", XFASTINT (code)); 7581
7582 val = CODING_ATTR_CHARSET_LIST (attrs);
7583 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7584 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7585 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
7586
7587 if (c <= 0x7F)
7588 charset = charset_roman;
7589 else if (c >= 0xA0 && c < 0xDF)
7590 {
7591 charset = charset_kana;
7592 c -= 0x80;
7039 } 7593 }
7040 else 7594 else
7041 { 7595 {
7042 if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF) 7596 int s1 = c >> 8, s2 = c & 0xFF;
7043 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)) 7597
7044 error ("Invalid Shift JIS code: %x", XFASTINT (code)); 7598 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
7045 DECODE_SJIS (s1, s2, c1, c2); 7599 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
7046 XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2)); 7600 error ("Invalid code: %d", code);
7047 } 7601 SJIS_TO_JIS (c);
7048 return val; 7602 charset = charset_kanji;
7049 } 7603 }
7604 c = DECODE_CHAR (charset, c);
7605 if (c < 0)
7606 error ("Invalid code: %d", code);
7607 return make_number (c);
7608 }
7609
7050 7610
7051 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0, 7611 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7052 doc: /* Encode a Japanese character CHAR to shift_jis encoding. 7612 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7053 Return the corresponding code in SJIS. */) 7613 Return the corresponding code in SJIS. */)
7054 (ch) 7614 (ch)
7055 Lisp_Object ch; 7615 Lisp_Object ch;
7056 { 7616 {
7057 int charset, c1, c2, s1, s2; 7617 Lisp_Object spec, attrs, charset_list;
7058 Lisp_Object val; 7618 int c;
7059 7619 struct charset *charset;
7060 CHECK_NUMBER (ch); 7620 unsigned code;
7061 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2); 7621
7062 if (charset == CHARSET_ASCII) 7622 CHECK_CHARACTER (ch);
7063 { 7623 c = XFASTINT (ch);
7064 val = ch; 7624 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7065 } 7625 attrs = AREF (spec, 0);
7066 else if (charset == charset_jisx0208 7626
7067 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F) 7627 if (ASCII_CHAR_P (c)
7068 { 7628 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7069 ENCODE_SJIS (c1, c2, s1, s2); 7629 return ch;
7070 XSETFASTINT (val, (s1 << 8) | s2); 7630
7071 } 7631 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7072 else if (charset == charset_katakana_jisx0201 7632 charset = char_charset (c, charset_list, &code);
7073 && c1 > 0x20 && c2 < 0xE0) 7633 if (code == CHARSET_INVALID_CODE (charset))
7074 { 7634 error ("Can't encode by shift_jis encoding: %d", c);
7075 XSETFASTINT (val, c1 | 0x80); 7635 JIS_TO_SJIS (code);
7076 } 7636
7077 else 7637 return make_number (code);
7078 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
7079 return val;
7080 } 7638 }
7081 7639
7082 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0, 7640 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7083 doc: /* Decode a Big5 character which has CODE in BIG5 coding system. 7641 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7084 Return the corresponding character. */) 7642 Return the corresponding character. */)
7085 (code) 7643 (code)
7086 Lisp_Object code; 7644 Lisp_Object code;
7087 { 7645 {
7088 int charset; 7646 Lisp_Object spec, attrs, val;
7089 unsigned char b1, b2, c1, c2; 7647 struct charset *charset_roman, *charset_big5, *charset;
7090 Lisp_Object val; 7648 int c;
7091 7649
7092 CHECK_NUMBER (code); 7650 CHECK_NATNUM (code);
7093 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF; 7651 c = XFASTINT (code);
7094 if (b1 == 0) 7652 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7095 { 7653 attrs = AREF (spec, 0);
7096 if (b2 >= 0x80) 7654
7097 error ("Invalid BIG5 code: %x", XFASTINT (code)); 7655 if (ASCII_BYTE_P (c)
7098 val = code; 7656 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7099 } 7657 return code;
7658
7659 val = CODING_ATTR_CHARSET_LIST (attrs);
7660 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7661 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
7662
7663 if (c <= 0x7F)
7664 charset = charset_roman;
7100 else 7665 else
7101 { 7666 {
7102 if ((b1 < 0xA1 || b1 > 0xFE) 7667 int b1 = c >> 8, b2 = c & 0x7F;
7103 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)) 7668 if (b1 < 0xA1 || b1 > 0xFE
7104 error ("Invalid BIG5 code: %x", XFASTINT (code)); 7669 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
7105 DECODE_BIG5 (b1, b2, charset, c1, c2); 7670 error ("Invalid code: %d", code);
7106 XSETFASTINT (val, MAKE_CHAR (charset, c1, c2)); 7671 charset = charset_big5;
7107 } 7672 }
7108 return val; 7673 c = DECODE_CHAR (charset, (unsigned )c);
7674 if (c < 0)
7675 error ("Invalid code: %d", code);
7676 return make_number (c);
7109 } 7677 }
7110 7678
7111 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0, 7679 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7112 doc: /* Encode the Big5 character CHAR to BIG5 coding system. 7680 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7113 Return the corresponding character code in Big5. */) 7681 Return the corresponding character code in Big5. */)
7114 (ch) 7682 (ch)
7115 Lisp_Object ch; 7683 Lisp_Object ch;
7116 { 7684 {
7117 int charset, c1, c2, b1, b2; 7685 Lisp_Object spec, attrs, charset_list;
7118 Lisp_Object val; 7686 struct charset *charset;
7119 7687 int c;
7120 CHECK_NUMBER (ch); 7688 unsigned code;
7121 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2); 7689
7122 if (charset == CHARSET_ASCII) 7690 CHECK_CHARACTER (ch);
7123 { 7691 c = XFASTINT (ch);
7124 val = ch; 7692 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7125 } 7693 attrs = AREF (spec, 0);
7126 else if ((charset == charset_big5_1 7694 if (ASCII_CHAR_P (c)
7127 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec)) 7695 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7128 || (charset == charset_big5_2 7696 return ch;
7129 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2)) 7697
7130 { 7698 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7131 ENCODE_BIG5 (charset, c1, c2, b1, b2); 7699 charset = char_charset (c, charset_list, &code);
7132 XSETFASTINT (val, (b1 << 8) | b2); 7700 if (code == CHARSET_INVALID_CODE (charset))
7133 } 7701 error ("Can't encode by Big5 encoding: %d", c);
7134 else 7702
7135 error ("Can't encode to Big5: %d", XFASTINT (ch)); 7703 return make_number (code);
7136 return val; 7704 }
7137 } 7705
7138 7706
7139 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal, 7707 DEFUN ("set-terminal-coding-system-internal",
7708 Fset_terminal_coding_system_internal,
7140 Sset_terminal_coding_system_internal, 1, 1, 0, 7709 Sset_terminal_coding_system_internal, 1, 1, 0,
7141 doc: /* Internal use only. */) 7710 doc: /* Internal use only. */)
7142 (coding_system) 7711 (coding_system)
7143 Lisp_Object coding_system; 7712 Lisp_Object coding_system;
7144 { 7713 {
7145 CHECK_SYMBOL (coding_system); 7714 CHECK_SYMBOL (coding_system);
7146 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding); 7715 setup_coding_system (Fcheck_coding_system (coding_system),
7716 &terminal_coding);
7717
7147 /* We had better not send unsafe characters to terminal. */ 7718 /* We had better not send unsafe characters to terminal. */
7148 terminal_coding.mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR; 7719 terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
7149 /* Character composition should be disabled. */ 7720 /* Characer composition should be disabled. */
7150 terminal_coding.composing = COMPOSITION_DISABLED; 7721 terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7151 /* Error notification should be suppressed. */
7152 terminal_coding.suppress_error = 1;
7153 terminal_coding.src_multibyte = 1; 7722 terminal_coding.src_multibyte = 1;
7154 terminal_coding.dst_multibyte = 0; 7723 terminal_coding.dst_multibyte = 0;
7155 return Qnil; 7724 return Qnil;
7156 } 7725 }
7157 7726
7158 DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal, 7727 DEFUN ("set-safe-terminal-coding-system-internal",
7728 Fset_safe_terminal_coding_system_internal,
7159 Sset_safe_terminal_coding_system_internal, 1, 1, 0, 7729 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7160 doc: /* Internal use only. */) 7730 doc: /* Internal use only. */)
7161 (coding_system) 7731 (coding_system)
7162 Lisp_Object coding_system; 7732 Lisp_Object coding_system;
7163 { 7733 {
7164 CHECK_SYMBOL (coding_system); 7734 CHECK_SYMBOL (coding_system);
7165 setup_coding_system (Fcheck_coding_system (coding_system), 7735 setup_coding_system (Fcheck_coding_system (coding_system),
7166 &safe_terminal_coding); 7736 &safe_terminal_coding);
7167 /* Character composition should be disabled. */ 7737 /* Characer composition should be disabled. */
7168 safe_terminal_coding.composing = COMPOSITION_DISABLED; 7738 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7169 /* Error notification should be suppressed. */
7170 terminal_coding.suppress_error = 1;
7171 safe_terminal_coding.src_multibyte = 1; 7739 safe_terminal_coding.src_multibyte = 1;
7172 safe_terminal_coding.dst_multibyte = 0; 7740 safe_terminal_coding.dst_multibyte = 0;
7173 return Qnil; 7741 return Qnil;
7174 } 7742 }
7175 7743
7176 DEFUN ("terminal-coding-system", Fterminal_coding_system, 7744 DEFUN ("terminal-coding-system",
7177 Sterminal_coding_system, 0, 0, 0, 7745 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
7178 doc: /* Return coding system specified for terminal output. */) 7746 doc: /* Return coding system specified for terminal output. */)
7179 () 7747 ()
7180 { 7748 {
7181 return terminal_coding.symbol; 7749 return CODING_ID_NAME (terminal_coding.id);
7182 } 7750 }
7183 7751
7184 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal, 7752 DEFUN ("set-keyboard-coding-system-internal",
7753 Fset_keyboard_coding_system_internal,
7185 Sset_keyboard_coding_system_internal, 1, 1, 0, 7754 Sset_keyboard_coding_system_internal, 1, 1, 0,
7186 doc: /* Internal use only. */) 7755 doc: /* Internal use only. */)
7187 (coding_system) 7756 (coding_system)
7188 Lisp_Object coding_system; 7757 Lisp_Object coding_system;
7189 { 7758 {
7190 CHECK_SYMBOL (coding_system); 7759 CHECK_SYMBOL (coding_system);
7191 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding); 7760 setup_coding_system (Fcheck_coding_system (coding_system),
7192 /* Character composition should be disabled. */ 7761 &keyboard_coding);
7193 keyboard_coding.composing = COMPOSITION_DISABLED; 7762 /* Characer composition should be disabled. */
7763 keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7194 return Qnil; 7764 return Qnil;
7195 } 7765 }
7196 7766
7197 DEFUN ("keyboard-coding-system", Fkeyboard_coding_system, 7767 DEFUN ("keyboard-coding-system",
7198 Skeyboard_coding_system, 0, 0, 0, 7768 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
7199 doc: /* Return coding system specified for decoding keyboard input. */) 7769 doc: /* Return coding system specified for decoding keyboard input. */)
7200 () 7770 ()
7201 { 7771 {
7202 return keyboard_coding.symbol; 7772 return CODING_ID_NAME (keyboard_coding.id);
7203 } 7773 }
7204 7774
7205 7775
7206 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system, 7776 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7207 Sfind_operation_coding_system, 1, MANY, 0, 7777 Sfind_operation_coding_system, 1, MANY, 0,
7245 if (nargs < 2) 7815 if (nargs < 2)
7246 error ("Too few arguments"); 7816 error ("Too few arguments");
7247 operation = args[0]; 7817 operation = args[0];
7248 if (!SYMBOLP (operation) 7818 if (!SYMBOLP (operation)
7249 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx))) 7819 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7250 error ("Invalid first argument"); 7820 error ("Invalid first arguement");
7251 if (nargs < 1 + XINT (target_idx)) 7821 if (nargs < 1 + XINT (target_idx))
7252 error ("Too few arguments for operation: %s", 7822 error ("Too few arguments for operation: %s",
7253 SDATA (SYMBOL_NAME (operation))); 7823 SDATA (SYMBOL_NAME (operation)));
7254 /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7255 argument to write-region) is string, it must be treated as a
7256 target file name. */
7257 if (EQ (operation, Qwrite_region)
7258 && nargs > 5
7259 && STRINGP (args[5]))
7260 target_idx = make_number (4);
7261 target = args[XINT (target_idx) + 1]; 7824 target = args[XINT (target_idx) + 1];
7262 if (!(STRINGP (target) 7825 if (!(STRINGP (target)
7263 || (EQ (operation, Qopen_network_stream) && INTEGERP (target)))) 7826 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7264 error ("Invalid argument %d", XINT (target_idx) + 1); 7827 error ("Invalid %dth argument", XINT (target_idx) + 1);
7265 7828
7266 chain = ((EQ (operation, Qinsert_file_contents) 7829 chain = ((EQ (operation, Qinsert_file_contents)
7267 || EQ (operation, Qwrite_region)) 7830 || EQ (operation, Qwrite_region))
7268 ? Vfile_coding_system_alist 7831 ? Vfile_coding_system_alist
7269 : (EQ (operation, Qopen_network_stream) 7832 : (EQ (operation, Qopen_network_stream)
7273 return Qnil; 7836 return Qnil;
7274 7837
7275 for (; CONSP (chain); chain = XCDR (chain)) 7838 for (; CONSP (chain); chain = XCDR (chain))
7276 { 7839 {
7277 Lisp_Object elt; 7840 Lisp_Object elt;
7841
7278 elt = XCAR (chain); 7842 elt = XCAR (chain);
7279
7280 if (CONSP (elt) 7843 if (CONSP (elt)
7281 && ((STRINGP (target) 7844 && ((STRINGP (target)
7282 && STRINGP (XCAR (elt)) 7845 && STRINGP (XCAR (elt))
7283 && fast_string_match (XCAR (elt), target) >= 0) 7846 && fast_string_match (XCAR (elt), target) >= 0)
7284 || (INTEGERP (target) && EQ (target, XCAR (elt))))) 7847 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7304 } 7867 }
7305 } 7868 }
7306 return Qnil; 7869 return Qnil;
7307 } 7870 }
7308 7871
7309 DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal, 7872 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
7310 Supdate_coding_systems_internal, 0, 0, 0, 7873 Sset_coding_system_priority, 0, MANY, 0,
7311 doc: /* Update internal database for ISO2022 and CCL based coding systems. 7874 doc: /* Assign higher priority to the coding systems given as arguments.
7312 When values of any coding categories are changed, you must 7875 If multiple coding systems belongs to the same category,
7313 call this function. */) 7876 all but the first one are ignored. */)
7314 () 7877 (nargs, args)
7878 int nargs;
7879 Lisp_Object *args;
7880 {
7881 int i, j;
7882 int changed[coding_category_max];
7883 enum coding_category priorities[coding_category_max];
7884
7885 bzero (changed, sizeof changed);
7886
7887 for (i = j = 0; i < nargs; i++)
7888 {
7889 enum coding_category category;
7890 Lisp_Object spec, attrs;
7891
7892 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
7893 attrs = AREF (spec, 0);
7894 category = XINT (CODING_ATTR_CATEGORY (attrs));
7895 if (changed[category])
7896 /* Ignore this coding system because a coding system of the
7897 same category already had a higher priority. */
7898 continue;
7899 changed[category] = 1;
7900 priorities[j++] = category;
7901 if (coding_categories[category].id >= 0
7902 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
7903 setup_coding_system (args[i], &coding_categories[category]);
7904 Fset (AREF (Vcoding_category_table, category), args[i]);
7905 }
7906
7907 /* Now we have decided top J priorities. Reflect the order of the
7908 original priorities to the remaining priorities. */
7909
7910 for (i = j, j = 0; i < coding_category_max; i++, j++)
7911 {
7912 while (j < coding_category_max
7913 && changed[coding_priorities[j]])
7914 j++;
7915 if (j == coding_category_max)
7916 abort ();
7917 priorities[i] = coding_priorities[j];
7918 }
7919
7920 bcopy (priorities, coding_priorities, sizeof priorities);
7921
7922 /* Update `coding-category-list'. */
7923 Vcoding_category_list = Qnil;
7924 for (i = coding_category_max - 1; i >= 0; i--)
7925 Vcoding_category_list
7926 = Fcons (AREF (Vcoding_category_table, priorities[i]),
7927 Vcoding_category_list);
7928
7929 return Qnil;
7930 }
7931
7932 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
7933 Scoding_system_priority_list, 0, 1, 0,
7934 doc: /* Return a list of coding systems ordered by their priorities.
7935 HIGHESTP non-nil means just return the highest priority one. */)
7936 (highestp)
7937 Lisp_Object highestp;
7315 { 7938 {
7316 int i; 7939 int i;
7317 7940 Lisp_Object val;
7318 for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++) 7941
7319 { 7942 for (i = 0, val = Qnil; i < coding_category_max; i++)
7320 Lisp_Object val; 7943 {
7321 7944 enum coding_category category = coding_priorities[i];
7322 val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]); 7945 int id = coding_categories[category].id;
7323 if (!NILP (val)) 7946 Lisp_Object attrs;
7324 { 7947
7325 if (! coding_system_table[i]) 7948 if (id < 0)
7326 coding_system_table[i] = ((struct coding_system *) 7949 continue;
7327 xmalloc (sizeof (struct coding_system))); 7950 attrs = CODING_ID_ATTRS (id);
7328 setup_coding_system (val, coding_system_table[i]); 7951 if (! NILP (highestp))
7329 } 7952 return CODING_ATTR_BASE_NAME (attrs);
7330 else if (coding_system_table[i]) 7953 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
7331 { 7954 }
7332 xfree (coding_system_table[i]); 7955 return Fnreverse (val);
7333 coding_system_table[i] = NULL; 7956 }
7334 } 7957
7335 } 7958 static char *suffixes[] = { "-unix", "-dos", "-mac" };
7959
7960 static Lisp_Object
7961 make_subsidiaries (base)
7962 Lisp_Object base;
7963 {
7964 Lisp_Object subsidiaries;
7965 int base_name_len = SBYTES (SYMBOL_NAME (base));
7966 char *buf = (char *) alloca (base_name_len + 6);
7967 int i;
7968
7969 bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
7970 subsidiaries = Fmake_vector (make_number (3), Qnil);
7971 for (i = 0; i < 3; i++)
7972 {
7973 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
7974 ASET (subsidiaries, i, intern (buf));
7975 }
7976 return subsidiaries;
7977 }
7978
7979
7980 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7981 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
7982 doc: /* For internal use only.
7983 usage: (define-coding-system-internal ...) */)
7984 (nargs, args)
7985 int nargs;
7986 Lisp_Object *args;
7987 {
7988 Lisp_Object name;
7989 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
7990 Lisp_Object attrs; /* Vector of attributes. */
7991 Lisp_Object eol_type;
7992 Lisp_Object aliases;
7993 Lisp_Object coding_type, charset_list, safe_charsets;
7994 enum coding_category category;
7995 Lisp_Object tail, val;
7996 int max_charset_id = 0;
7997 int i;
7998
7999 if (nargs < coding_arg_max)
8000 goto short_args;
8001
8002 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
8003
8004 name = args[coding_arg_name];
8005 CHECK_SYMBOL (name);
8006 CODING_ATTR_BASE_NAME (attrs) = name;
8007
8008 val = args[coding_arg_mnemonic];
8009 if (! STRINGP (val))
8010 CHECK_CHARACTER (val);
8011 CODING_ATTR_MNEMONIC (attrs) = val;
8012
8013 coding_type = args[coding_arg_coding_type];
8014 CHECK_SYMBOL (coding_type);
8015 CODING_ATTR_TYPE (attrs) = coding_type;
8016
8017 charset_list = args[coding_arg_charset_list];
8018 if (SYMBOLP (charset_list))
8019 {
8020 if (EQ (charset_list, Qiso_2022))
8021 {
8022 if (! EQ (coding_type, Qiso_2022))
8023 error ("Invalid charset-list");
8024 charset_list = Viso_2022_charset_list;
8025 }
8026 else if (EQ (charset_list, Qemacs_mule))
8027 {
8028 if (! EQ (coding_type, Qemacs_mule))
8029 error ("Invalid charset-list");
8030 charset_list = Vemacs_mule_charset_list;
8031 }
8032 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8033 if (max_charset_id < XFASTINT (XCAR (tail)))
8034 max_charset_id = XFASTINT (XCAR (tail));
8035 }
8036 else
8037 {
8038 charset_list = Fcopy_sequence (charset_list);
8039 for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
8040 {
8041 struct charset *charset;
8042
8043 val = Fcar (tail);
8044 CHECK_CHARSET_GET_CHARSET (val, charset);
8045 if (EQ (coding_type, Qiso_2022)
8046 ? CHARSET_ISO_FINAL (charset) < 0
8047 : EQ (coding_type, Qemacs_mule)
8048 ? CHARSET_EMACS_MULE_ID (charset) < 0
8049 : 0)
8050 error ("Can't handle charset `%s'",
8051 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8052
8053 XSETCAR (tail, make_number (charset->id));
8054 if (max_charset_id < charset->id)
8055 max_charset_id = charset->id;
8056 }
8057 }
8058 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8059
8060 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8061 make_number (255));
8062 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8063 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8064 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8065
8066 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8067
8068 val = args[coding_arg_decode_translation_table];
8069 if (! NILP (val))
8070 CHECK_CHAR_TABLE (val);
8071 CODING_ATTR_DECODE_TBL (attrs) = val;
8072
8073 val = args[coding_arg_encode_translation_table];
8074 if (! NILP (val))
8075 CHECK_CHAR_TABLE (val);
8076 CODING_ATTR_ENCODE_TBL (attrs) = val;
8077
8078 val = args[coding_arg_post_read_conversion];
8079 CHECK_SYMBOL (val);
8080 CODING_ATTR_POST_READ (attrs) = val;
8081
8082 val = args[coding_arg_pre_write_conversion];
8083 CHECK_SYMBOL (val);
8084 CODING_ATTR_PRE_WRITE (attrs) = val;
8085
8086 val = args[coding_arg_default_char];
8087 if (NILP (val))
8088 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8089 else
8090 {
8091 CHECK_CHARACTER (val);
8092 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8093 }
8094
8095 val = args[coding_arg_for_unibyte];
8096 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
8097
8098 val = args[coding_arg_plist];
8099 CHECK_LIST (val);
8100 CODING_ATTR_PLIST (attrs) = val;
8101
8102 if (EQ (coding_type, Qcharset))
8103 {
8104 Lisp_Object list;
8105 /* Generate a lisp vector of 256 elements. Each element is nil,
8106 integer, or a list of charset IDs.
8107
8108 If Nth element is nil, the byte code N is invalid in this
8109 coding system.
8110
8111 If Nth element is a number NUM, N is the first byte of a
8112 charset whose ID is NUM.
8113
8114 If Nth element is a list of charset IDs, N is the first byte
8115 of one of them. The list is sorted by dimensions of the
8116 charsets. A charset of smaller dimension comes first.
8117 */
8118 for (list = Qnil, tail = charset_list; CONSP (tail); tail = XCDR (tail))
8119 {
8120 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8121
8122 if (charset->method == CHARSET_METHOD_SUPERSET)
8123 {
8124 val = CHARSET_SUPERSET (charset);
8125 for (; CONSP (val); val = XCDR (val))
8126 list = Fcons (XCAR (XCAR (val)), list);
8127 }
8128 else
8129 list = Fcons (XCAR (tail), list);
8130 }
8131
8132 val = Fmake_vector (make_number (256), Qnil);
8133
8134 for (tail = Fnreverse (list); CONSP (tail); tail = XCDR (tail))
8135 {
8136 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8137 int dim = CHARSET_DIMENSION (charset);
8138 int idx = (dim - 1) * 4;
8139
8140 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8141 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8142
8143 for (i = charset->code_space[idx];
8144 i <= charset->code_space[idx + 1]; i++)
8145 {
8146 Lisp_Object tmp, tmp2;
8147 int dim2;
8148
8149 tmp = AREF (val, i);
8150 if (NILP (tmp))
8151 tmp = XCAR (tail);
8152 else if (NUMBERP (tmp))
8153 {
8154 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8155 if (dim < dim2)
8156 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8157 else
8158 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8159 }
8160 else
8161 {
8162 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8163 {
8164 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8165 if (dim < dim2)
8166 break;
8167 }
8168 if (NILP (tmp2))
8169 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8170 else
8171 {
8172 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8173 XSETCAR (tmp2, XCAR (tail));
8174 }
8175 }
8176 ASET (val, i, tmp);
8177 }
8178 }
8179 ASET (attrs, coding_attr_charset_valids, val);
8180 category = coding_category_charset;
8181 }
8182 else if (EQ (coding_type, Qccl))
8183 {
8184 Lisp_Object valids;
8185
8186 if (nargs < coding_arg_ccl_max)
8187 goto short_args;
8188
8189 val = args[coding_arg_ccl_decoder];
8190 CHECK_CCL_PROGRAM (val);
8191 if (VECTORP (val))
8192 val = Fcopy_sequence (val);
8193 ASET (attrs, coding_attr_ccl_decoder, val);
8194
8195 val = args[coding_arg_ccl_encoder];
8196 CHECK_CCL_PROGRAM (val);
8197 if (VECTORP (val))
8198 val = Fcopy_sequence (val);
8199 ASET (attrs, coding_attr_ccl_encoder, val);
8200
8201 val = args[coding_arg_ccl_valids];
8202 valids = Fmake_string (make_number (256), make_number (0));
8203 for (tail = val; !NILP (tail); tail = Fcdr (tail))
8204 {
8205 int from, to;
8206
8207 val = Fcar (tail);
8208 if (INTEGERP (val))
8209 {
8210 from = to = XINT (val);
8211 if (from < 0 || from > 255)
8212 args_out_of_range_3 (val, make_number (0), make_number (255));
8213 }
8214 else
8215 {
8216 CHECK_CONS (val);
8217 CHECK_NATNUM_CAR (val);
8218 CHECK_NATNUM_CDR (val);
8219 from = XINT (XCAR (val));
8220 if (from > 255)
8221 args_out_of_range_3 (XCAR (val),
8222 make_number (0), make_number (255));
8223 to = XINT (XCDR (val));
8224 if (to < from || to > 255)
8225 args_out_of_range_3 (XCDR (val),
8226 XCAR (val), make_number (255));
8227 }
8228 for (i = from; i <= to; i++)
8229 SSET (valids, i, 1);
8230 }
8231 ASET (attrs, coding_attr_ccl_valids, valids);
8232
8233 category = coding_category_ccl;
8234 }
8235 else if (EQ (coding_type, Qutf_16))
8236 {
8237 Lisp_Object bom, endian;
8238
8239 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8240
8241 if (nargs < coding_arg_utf16_max)
8242 goto short_args;
8243
8244 bom = args[coding_arg_utf16_bom];
8245 if (! NILP (bom) && ! EQ (bom, Qt))
8246 {
8247 CHECK_CONS (bom);
8248 val = XCAR (bom);
8249 CHECK_CODING_SYSTEM (val);
8250 val = XCDR (bom);
8251 CHECK_CODING_SYSTEM (val);
8252 }
8253 ASET (attrs, coding_attr_utf_16_bom, bom);
8254
8255 endian = args[coding_arg_utf16_endian];
8256 CHECK_SYMBOL (endian);
8257 if (NILP (endian))
8258 endian = Qbig;
8259 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8260 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
8261 ASET (attrs, coding_attr_utf_16_endian, endian);
8262
8263 category = (CONSP (bom)
8264 ? coding_category_utf_16_auto
8265 : NILP (bom)
8266 ? (EQ (endian, Qbig)
8267 ? coding_category_utf_16_be_nosig
8268 : coding_category_utf_16_le_nosig)
8269 : (EQ (endian, Qbig)
8270 ? coding_category_utf_16_be
8271 : coding_category_utf_16_le));
8272 }
8273 else if (EQ (coding_type, Qiso_2022))
8274 {
8275 Lisp_Object initial, reg_usage, request, flags;
8276 int i;
8277
8278 if (nargs < coding_arg_iso2022_max)
8279 goto short_args;
8280
8281 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8282 CHECK_VECTOR (initial);
8283 for (i = 0; i < 4; i++)
8284 {
8285 val = Faref (initial, make_number (i));
8286 if (! NILP (val))
8287 {
8288 struct charset *charset;
8289
8290 CHECK_CHARSET_GET_CHARSET (val, charset);
8291 ASET (initial, i, make_number (CHARSET_ID (charset)));
8292 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8293 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8294 }
8295 else
8296 ASET (initial, i, make_number (-1));
8297 }
8298
8299 reg_usage = args[coding_arg_iso2022_reg_usage];
8300 CHECK_CONS (reg_usage);
8301 CHECK_NUMBER_CAR (reg_usage);
8302 CHECK_NUMBER_CDR (reg_usage);
8303
8304 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8305 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8306 {
8307 int id;
8308 Lisp_Object tmp;
8309
8310 val = Fcar (tail);
8311 CHECK_CONS (val);
8312 tmp = XCAR (val);
8313 CHECK_CHARSET_GET_ID (tmp, id);
8314 CHECK_NATNUM_CDR (val);
8315 if (XINT (XCDR (val)) >= 4)
8316 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8317 XSETCAR (val, make_number (id));
8318 }
8319
8320 flags = args[coding_arg_iso2022_flags];
8321 CHECK_NATNUM (flags);
8322 i = XINT (flags);
8323 if (EQ (args[coding_arg_charset_list], Qiso_2022))
8324 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8325
8326 ASET (attrs, coding_attr_iso_initial, initial);
8327 ASET (attrs, coding_attr_iso_usage, reg_usage);
8328 ASET (attrs, coding_attr_iso_request, request);
8329 ASET (attrs, coding_attr_iso_flags, flags);
8330 setup_iso_safe_charsets (attrs);
8331
8332 if (i & CODING_ISO_FLAG_SEVEN_BITS)
8333 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8334 | CODING_ISO_FLAG_SINGLE_SHIFT))
8335 ? coding_category_iso_7_else
8336 : EQ (args[coding_arg_charset_list], Qiso_2022)
8337 ? coding_category_iso_7
8338 : coding_category_iso_7_tight);
8339 else
8340 {
8341 int id = XINT (AREF (initial, 1));
8342
8343 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8344 || EQ (args[coding_arg_charset_list], Qiso_2022)
8345 || id < 0)
8346 ? coding_category_iso_8_else
8347 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8348 ? coding_category_iso_8_1
8349 : coding_category_iso_8_2);
8350 }
8351 if (category != coding_category_iso_8_1
8352 && category != coding_category_iso_8_2)
8353 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8354 }
8355 else if (EQ (coding_type, Qemacs_mule))
8356 {
8357 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8358 ASET (attrs, coding_attr_emacs_mule_full, Qt);
8359 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8360 category = coding_category_emacs_mule;
8361 }
8362 else if (EQ (coding_type, Qshift_jis))
8363 {
8364
8365 struct charset *charset;
8366
8367 if (XINT (Flength (charset_list)) != 3)
8368 error ("There should be just three charsets");
8369
8370 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8371 if (CHARSET_DIMENSION (charset) != 1)
8372 error ("Dimension of charset %s is not one",
8373 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8374 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8375 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8376
8377 charset_list = XCDR (charset_list);
8378 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8379 if (CHARSET_DIMENSION (charset) != 1)
8380 error ("Dimension of charset %s is not one",
8381 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8382
8383 charset_list = XCDR (charset_list);
8384 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8385 if (CHARSET_DIMENSION (charset) != 2)
8386 error ("Dimension of charset %s is not two",
8387 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8388
8389 category = coding_category_sjis;
8390 Vsjis_coding_system = name;
8391 }
8392 else if (EQ (coding_type, Qbig5))
8393 {
8394 struct charset *charset;
8395
8396 if (XINT (Flength (charset_list)) != 2)
8397 error ("There should be just two charsets");
8398
8399 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8400 if (CHARSET_DIMENSION (charset) != 1)
8401 error ("Dimension of charset %s is not one",
8402 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8403 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8404 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8405
8406 charset_list = XCDR (charset_list);
8407 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8408 if (CHARSET_DIMENSION (charset) != 2)
8409 error ("Dimension of charset %s is not two",
8410 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8411
8412 category = coding_category_big5;
8413 Vbig5_coding_system = name;
8414 }
8415 else if (EQ (coding_type, Qraw_text))
8416 {
8417 category = coding_category_raw_text;
8418 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8419 }
8420 else if (EQ (coding_type, Qutf_8))
8421 {
8422 category = coding_category_utf_8;
8423 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8424 }
8425 else if (EQ (coding_type, Qundecided))
8426 category = coding_category_undecided;
8427 else
8428 error ("Invalid coding system type: %s",
8429 SDATA (SYMBOL_NAME (coding_type)));
8430
8431 CODING_ATTR_CATEGORY (attrs) = make_number (category);
8432 CODING_ATTR_PLIST (attrs)
8433 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8434 CODING_ATTR_PLIST (attrs)));
8435
8436 eol_type = args[coding_arg_eol_type];
8437 if (! NILP (eol_type)
8438 && ! EQ (eol_type, Qunix)
8439 && ! EQ (eol_type, Qdos)
8440 && ! EQ (eol_type, Qmac))
8441 error ("Invalid eol-type");
8442
8443 aliases = Fcons (name, Qnil);
8444
8445 if (NILP (eol_type))
8446 {
8447 eol_type = make_subsidiaries (name);
8448 for (i = 0; i < 3; i++)
8449 {
8450 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8451
8452 this_name = AREF (eol_type, i);
8453 this_aliases = Fcons (this_name, Qnil);
8454 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8455 this_spec = Fmake_vector (make_number (3), attrs);
8456 ASET (this_spec, 1, this_aliases);
8457 ASET (this_spec, 2, this_eol_type);
8458 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8459 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8460 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8461 Vcoding_system_alist);
8462 }
8463 }
8464
8465 spec_vec = Fmake_vector (make_number (3), attrs);
8466 ASET (spec_vec, 1, aliases);
8467 ASET (spec_vec, 2, eol_type);
8468
8469 Fputhash (name, spec_vec, Vcoding_system_hash_table);
8470 Vcoding_system_list = Fcons (name, Vcoding_system_list);
8471 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8472 Vcoding_system_alist);
8473
8474 {
8475 int id = coding_categories[category].id;
8476
8477 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8478 setup_coding_system (name, &coding_categories[category]);
8479 }
7336 8480
7337 return Qnil; 8481 return Qnil;
7338 } 8482
7339 8483 short_args:
7340 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal, 8484 return Fsignal (Qwrong_number_of_arguments,
7341 Sset_coding_priority_internal, 0, 0, 0, 8485 Fcons (intern ("define-coding-system-internal"),
7342 doc: /* Update internal database for the current value of `coding-category-list'. 8486 make_number (nargs)));
7343 This function is internal use only. */) 8487 }
7344 () 8488
7345 { 8489 /* Fixme: should this record the alias relationships for
7346 int i = 0, idx; 8490 diagnostics? Should it update coding-system-list? */
7347 Lisp_Object val; 8491 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
7348 8492 Sdefine_coding_system_alias, 2, 2, 0,
7349 val = Vcoding_category_list; 8493 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
7350 8494 (alias, coding_system)
7351 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX) 8495 Lisp_Object alias, coding_system;
7352 { 8496 {
7353 if (! SYMBOLP (XCAR (val))) 8497 Lisp_Object spec, aliases, eol_type;
7354 break; 8498
7355 idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index)); 8499 CHECK_SYMBOL (alias);
7356 if (idx >= CODING_CATEGORY_IDX_MAX) 8500 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7357 break; 8501 aliases = AREF (spec, 1);
7358 coding_priorities[i++] = (1 << idx); 8502 while (!NILP (XCDR (aliases)))
7359 val = XCDR (val); 8503 aliases = XCDR (aliases);
7360 } 8504 XSETCDR (aliases, Fcons (alias, Qnil));
7361 /* If coding-category-list is valid and contains all coding 8505
7362 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not, 8506 eol_type = AREF (spec, 2);
7363 the following code saves Emacs from crashing. */ 8507 if (VECTORP (eol_type))
7364 while (i < CODING_CATEGORY_IDX_MAX) 8508 {
7365 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT; 8509 Lisp_Object subsidiaries;
8510 int i;
8511
8512 subsidiaries = make_subsidiaries (alias);
8513 for (i = 0; i < 3; i++)
8514 Fdefine_coding_system_alias (AREF (subsidiaries, i),
8515 AREF (eol_type, i));
8516
8517 ASET (spec, 2, subsidiaries);
8518 }
8519
8520 Fputhash (alias, spec, Vcoding_system_hash_table);
8521 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
8522 Vcoding_system_alist);
7366 8523
7367 return Qnil; 8524 return Qnil;
7368 } 8525 }
7369 8526
7370 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal, 8527 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
7371 Sdefine_coding_system_internal, 1, 1, 0, 8528 1, 1, 0,
7372 doc: /* Register CODING-SYSTEM as a base coding system. 8529 doc: /* Return the base of CODING-SYSTEM.
7373 This function is internal use only. */) 8530 Any alias or subsidiary coding system is not a base coding system. */)
8531 (coding_system)
8532 Lisp_Object coding_system;
8533 {
8534 Lisp_Object spec, attrs;
8535
8536 if (NILP (coding_system))
8537 return (Qno_conversion);
8538 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8539 attrs = AREF (spec, 0);
8540 return CODING_ATTR_BASE_NAME (attrs);
8541 }
8542
8543 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
8544 1, 1, 0,
8545 doc: "Return the property list of CODING-SYSTEM.")
7374 (coding_system) 8546 (coding_system)
7375 Lisp_Object coding_system; 8547 Lisp_Object coding_system;
7376 { 8548 {
7377 Lisp_Object safe_chars, slot; 8549 Lisp_Object spec, attrs;
7378 8550
7379 if (NILP (Fcheck_coding_system (coding_system))) 8551 if (NILP (coding_system))
7380 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil)); 8552 coding_system = Qno_conversion;
7381 safe_chars = coding_safe_chars (coding_system); 8553 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7382 if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars)) 8554 attrs = AREF (spec, 0);
7383 error ("No valid safe-chars property for %s", 8555 return CODING_ATTR_PLIST (attrs);
7384 SDATA (SYMBOL_NAME (coding_system))); 8556 }
7385 if (EQ (safe_chars, Qt)) 8557
7386 { 8558
7387 if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars)))) 8559 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
7388 XSETCAR (Vcoding_system_safe_chars, 8560 1, 1, 0,
7389 Fcons (coding_system, XCAR (Vcoding_system_safe_chars))); 8561 doc: /* Return the list of aliases of CODING-SYSTEM. */)
7390 } 8562 (coding_system)
7391 else 8563 Lisp_Object coding_system;
7392 { 8564 {
7393 slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars)); 8565 Lisp_Object spec;
7394 if (NILP (slot)) 8566
7395 XSETCDR (Vcoding_system_safe_chars, 8567 if (NILP (coding_system))
7396 nconc2 (XCDR (Vcoding_system_safe_chars), 8568 coding_system = Qno_conversion;
7397 Fcons (Fcons (coding_system, safe_chars), Qnil))); 8569 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7398 else 8570 return AREF (spec, 1);
7399 XSETCDR (slot, safe_chars); 8571 }
7400 } 8572
7401 return Qnil; 8573 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
8574 Scoding_system_eol_type, 1, 1, 0,
8575 doc: /* Return eol-type of CODING-SYSTEM.
8576 An eol-type is integer 0, 1, 2, or a vector of coding systems.
8577
8578 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8579 and CR respectively.
8580
8581 A vector value indicates that a format of end-of-line should be
8582 detected automatically. Nth element of the vector is the subsidiary
8583 coding system whose eol-type is N. */)
8584 (coding_system)
8585 Lisp_Object coding_system;
8586 {
8587 Lisp_Object spec, eol_type;
8588 int n;
8589
8590 if (NILP (coding_system))
8591 coding_system = Qno_conversion;
8592 if (! CODING_SYSTEM_P (coding_system))
8593 return Qnil;
8594 spec = CODING_SYSTEM_SPEC (coding_system);
8595 eol_type = AREF (spec, 2);
8596 if (VECTORP (eol_type))
8597 return Fcopy_sequence (eol_type);
8598 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
8599 return make_number (n);
7402 } 8600 }
7403 8601
7404 #endif /* emacs */ 8602 #endif /* emacs */
7405 8603
7406 8604
7409 void 8607 void
7410 init_coding_once () 8608 init_coding_once ()
7411 { 8609 {
7412 int i; 8610 int i;
7413 8611
7414 /* Emacs' internal format specific initialize routine. */ 8612 for (i = 0; i < coding_category_max; i++)
7415 for (i = 0; i <= 0x20; i++) 8613 {
7416 emacs_code_class[i] = EMACS_control_code; 8614 coding_categories[i].id = -1;
7417 emacs_code_class[0x0A] = EMACS_linefeed_code; 8615 coding_priorities[i] = i;
7418 emacs_code_class[0x0D] = EMACS_carriage_return_code; 8616 }
7419 for (i = 0x21 ; i < 0x7F; i++)
7420 emacs_code_class[i] = EMACS_ascii_code;
7421 emacs_code_class[0x7F] = EMACS_control_code;
7422 for (i = 0x80; i < 0xFF; i++)
7423 emacs_code_class[i] = EMACS_invalid_code;
7424 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7425 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7426 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7427 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7428 8617
7429 /* ISO2022 specific initialize routine. */ 8618 /* ISO2022 specific initialize routine. */
7430 for (i = 0; i < 0x20; i++) 8619 for (i = 0; i < 0x20; i++)
7431 iso_code_class[i] = ISO_control_0; 8620 iso_code_class[i] = ISO_control_0;
7432 for (i = 0x21; i < 0x7F; i++) 8621 for (i = 0x21; i < 0x7F; i++)
7444 iso_code_class[ISO_CODE_ESC] = ISO_escape; 8633 iso_code_class[ISO_CODE_ESC] = ISO_escape;
7445 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2; 8634 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7446 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3; 8635 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7447 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer; 8636 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7448 8637
7449 setup_coding_system (Qnil, &keyboard_coding);
7450 setup_coding_system (Qnil, &terminal_coding);
7451 setup_coding_system (Qnil, &safe_terminal_coding);
7452 setup_coding_system (Qnil, &default_buffer_file_coding);
7453
7454 bzero (coding_system_table, sizeof coding_system_table);
7455
7456 bzero (ascii_skip_code, sizeof ascii_skip_code);
7457 for (i = 0; i < 128; i++)
7458 ascii_skip_code[i] = 1;
7459
7460 #if defined (MSDOS) || defined (WINDOWSNT)
7461 system_eol_type = CODING_EOL_CRLF;
7462 #else
7463 system_eol_type = CODING_EOL_LF;
7464 #endif
7465
7466 inhibit_pre_post_conversion = 0; 8638 inhibit_pre_post_conversion = 0;
8639
8640 for (i = 0; i < 256; i++)
8641 {
8642 emacs_mule_bytes[i] = 1;
8643 }
8644 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
8645 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
8646 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
8647 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
7467 } 8648 }
7468 8649
7469 #ifdef emacs 8650 #ifdef emacs
7470 8651
7471 void 8652 void
7472 syms_of_coding () 8653 syms_of_coding ()
7473 { 8654 {
7474 Qtarget_idx = intern ("target-idx"); 8655 staticpro (&Vcoding_system_hash_table);
7475 staticpro (&Qtarget_idx); 8656 {
7476 8657 Lisp_Object args[2];
7477 Qcoding_system_history = intern ("coding-system-history"); 8658 args[0] = QCtest;
7478 staticpro (&Qcoding_system_history); 8659 args[1] = Qeq;
8660 Vcoding_system_hash_table = Fmake_hash_table (2, args);
8661 }
8662
8663 staticpro (&Vsjis_coding_system);
8664 Vsjis_coding_system = Qnil;
8665
8666 staticpro (&Vbig5_coding_system);
8667 Vbig5_coding_system = Qnil;
8668
8669 staticpro (&Vcode_conversion_work_buf_list);
8670 Vcode_conversion_work_buf_list = Qnil;
8671
8672 staticpro (&Vcode_conversion_reused_work_buf);
8673 Vcode_conversion_reused_work_buf = Qnil;
8674
8675 DEFSYM (Qcharset, "charset");
8676 DEFSYM (Qtarget_idx, "target-idx");
8677 DEFSYM (Qcoding_system_history, "coding-system-history");
7479 Fset (Qcoding_system_history, Qnil); 8678 Fset (Qcoding_system_history, Qnil);
7480 8679
7481 /* Target FILENAME is the first argument. */ 8680 /* Target FILENAME is the first argument. */
7482 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0)); 8681 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7483 /* Target FILENAME is the third argument. */ 8682 /* Target FILENAME is the third argument. */
7484 Fput (Qwrite_region, Qtarget_idx, make_number (2)); 8683 Fput (Qwrite_region, Qtarget_idx, make_number (2));
7485 8684
7486 Qcall_process = intern ("call-process"); 8685 DEFSYM (Qcall_process, "call-process");
7487 staticpro (&Qcall_process);
7488 /* Target PROGRAM is the first argument. */ 8686 /* Target PROGRAM is the first argument. */
7489 Fput (Qcall_process, Qtarget_idx, make_number (0)); 8687 Fput (Qcall_process, Qtarget_idx, make_number (0));
7490 8688
7491 Qcall_process_region = intern ("call-process-region"); 8689 DEFSYM (Qcall_process_region, "call-process-region");
7492 staticpro (&Qcall_process_region);
7493 /* Target PROGRAM is the third argument. */ 8690 /* Target PROGRAM is the third argument. */
7494 Fput (Qcall_process_region, Qtarget_idx, make_number (2)); 8691 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7495 8692
7496 Qstart_process = intern ("start-process"); 8693 DEFSYM (Qstart_process, "start-process");
7497 staticpro (&Qstart_process);
7498 /* Target PROGRAM is the third argument. */ 8694 /* Target PROGRAM is the third argument. */
7499 Fput (Qstart_process, Qtarget_idx, make_number (2)); 8695 Fput (Qstart_process, Qtarget_idx, make_number (2));
7500 8696
7501 Qopen_network_stream = intern ("open-network-stream"); 8697 DEFSYM (Qopen_network_stream, "open-network-stream");
7502 staticpro (&Qopen_network_stream);
7503 /* Target SERVICE is the fourth argument. */ 8698 /* Target SERVICE is the fourth argument. */
7504 Fput (Qopen_network_stream, Qtarget_idx, make_number (3)); 8699 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7505 8700
7506 Qcoding_system = intern ("coding-system"); 8701 DEFSYM (Qcoding_system, "coding-system");
7507 staticpro (&Qcoding_system); 8702 DEFSYM (Qcoding_aliases, "coding-aliases");
7508 8703
7509 Qeol_type = intern ("eol-type"); 8704 DEFSYM (Qeol_type, "eol-type");
7510 staticpro (&Qeol_type); 8705 DEFSYM (Qunix, "unix");
7511 8706 DEFSYM (Qdos, "dos");
7512 Qbuffer_file_coding_system = intern ("buffer-file-coding-system"); 8707
7513 staticpro (&Qbuffer_file_coding_system); 8708 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
7514 8709 DEFSYM (Qpost_read_conversion, "post-read-conversion");
7515 Qpost_read_conversion = intern ("post-read-conversion"); 8710 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
7516 staticpro (&Qpost_read_conversion); 8711 DEFSYM (Qdefault_char, "default-char");
7517 8712 DEFSYM (Qundecided, "undecided");
7518 Qpre_write_conversion = intern ("pre-write-conversion"); 8713 DEFSYM (Qno_conversion, "no-conversion");
7519 staticpro (&Qpre_write_conversion); 8714 DEFSYM (Qraw_text, "raw-text");
7520 8715
7521 Qno_conversion = intern ("no-conversion"); 8716 DEFSYM (Qiso_2022, "iso-2022");
7522 staticpro (&Qno_conversion); 8717
7523 8718 DEFSYM (Qutf_8, "utf-8");
7524 Qundecided = intern ("undecided"); 8719 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
7525 staticpro (&Qundecided); 8720
7526 8721 DEFSYM (Qutf_16, "utf-16");
7527 Qcoding_system_p = intern ("coding-system-p"); 8722 DEFSYM (Qbig, "big");
7528 staticpro (&Qcoding_system_p); 8723 DEFSYM (Qlittle, "little");
7529 8724
7530 Qcoding_system_error = intern ("coding-system-error"); 8725 DEFSYM (Qshift_jis, "shift-jis");
7531 staticpro (&Qcoding_system_error); 8726 DEFSYM (Qbig5, "big5");
7532 8727
8728 DEFSYM (Qcoding_system_p, "coding-system-p");
8729
8730 DEFSYM (Qcoding_system_error, "coding-system-error");
7533 Fput (Qcoding_system_error, Qerror_conditions, 8731 Fput (Qcoding_system_error, Qerror_conditions,
7534 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil))); 8732 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7535 Fput (Qcoding_system_error, Qerror_message, 8733 Fput (Qcoding_system_error, Qerror_message,
7536 build_string ("Invalid coding system")); 8734 build_string ("Invalid coding system"));
7537 8735
7538 Qcoding_category = intern ("coding-category");
7539 staticpro (&Qcoding_category);
7540 Qcoding_category_index = intern ("coding-category-index");
7541 staticpro (&Qcoding_category_index);
7542
7543 Vcoding_category_table
7544 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7545 staticpro (&Vcoding_category_table);
7546 {
7547 int i;
7548 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7549 {
7550 XVECTOR (Vcoding_category_table)->contents[i]
7551 = intern (coding_category_name[i]);
7552 Fput (XVECTOR (Vcoding_category_table)->contents[i],
7553 Qcoding_category_index, make_number (i));
7554 }
7555 }
7556
7557 Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7558 staticpro (&Vcoding_system_safe_chars);
7559
7560 Qtranslation_table = intern ("translation-table");
7561 staticpro (&Qtranslation_table);
7562 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
7563
7564 Qtranslation_table_id = intern ("translation-table-id");
7565 staticpro (&Qtranslation_table_id);
7566
7567 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7568 staticpro (&Qtranslation_table_for_decode);
7569
7570 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7571 staticpro (&Qtranslation_table_for_encode);
7572
7573 Qsafe_chars = intern ("safe-chars");
7574 staticpro (&Qsafe_chars);
7575
7576 Qchar_coding_system = intern ("char-coding-system");
7577 staticpro (&Qchar_coding_system);
7578
7579 /* Intern this now in case it isn't already done. 8736 /* Intern this now in case it isn't already done.
7580 Setting this variable twice is harmless. 8737 Setting this variable twice is harmless.
7581 But don't staticpro it here--that is done in alloc.c. */ 8738 But don't staticpro it here--that is done in alloc.c. */
7582 Qchar_table_extra_slots = intern ("char-table-extra-slots"); 8739 Qchar_table_extra_slots = intern ("char-table-extra-slots");
7583 Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0)); 8740
7584 Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0)); 8741 DEFSYM (Qtranslation_table, "translation-table");
7585 8742 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
7586 Qvalid_codes = intern ("valid-codes"); 8743 DEFSYM (Qtranslation_table_id, "translation-table-id");
7587 staticpro (&Qvalid_codes); 8744 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
7588 8745 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
7589 Qemacs_mule = intern ("emacs-mule"); 8746
7590 staticpro (&Qemacs_mule); 8747 DEFSYM (Qvalid_codes, "valid-codes");
7591 8748
7592 Qraw_text = intern ("raw-text"); 8749 DEFSYM (Qemacs_mule, "emacs-mule");
7593 staticpro (&Qraw_text); 8750
7594 8751 DEFSYM (QCcategory, ":category");
7595 Qutf_8 = intern ("utf-8"); 8752
7596 staticpro (&Qutf_8); 8753 Vcoding_category_table
8754 = Fmake_vector (make_number (coding_category_max), Qnil);
8755 staticpro (&Vcoding_category_table);
8756 /* The following are targets of code detection. */
8757 ASET (Vcoding_category_table, coding_category_iso_7,
8758 intern ("coding-category-iso-7"));
8759 ASET (Vcoding_category_table, coding_category_iso_7_tight,
8760 intern ("coding-category-iso-7-tight"));
8761 ASET (Vcoding_category_table, coding_category_iso_8_1,
8762 intern ("coding-category-iso-8-1"));
8763 ASET (Vcoding_category_table, coding_category_iso_8_2,
8764 intern ("coding-category-iso-8-2"));
8765 ASET (Vcoding_category_table, coding_category_iso_7_else,
8766 intern ("coding-category-iso-7-else"));
8767 ASET (Vcoding_category_table, coding_category_iso_8_else,
8768 intern ("coding-category-iso-8-else"));
8769 ASET (Vcoding_category_table, coding_category_utf_8,
8770 intern ("coding-category-utf-8"));
8771 ASET (Vcoding_category_table, coding_category_utf_16_be,
8772 intern ("coding-category-utf-16-be"));
8773 ASET (Vcoding_category_table, coding_category_utf_16_auto,
8774 intern ("coding-category-utf-16-auto"));
8775 ASET (Vcoding_category_table, coding_category_utf_16_le,
8776 intern ("coding-category-utf-16-le"));
8777 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
8778 intern ("coding-category-utf-16-be-nosig"));
8779 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
8780 intern ("coding-category-utf-16-le-nosig"));
8781 ASET (Vcoding_category_table, coding_category_charset,
8782 intern ("coding-category-charset"));
8783 ASET (Vcoding_category_table, coding_category_sjis,
8784 intern ("coding-category-sjis"));
8785 ASET (Vcoding_category_table, coding_category_big5,
8786 intern ("coding-category-big5"));
8787 ASET (Vcoding_category_table, coding_category_ccl,
8788 intern ("coding-category-ccl"));
8789 ASET (Vcoding_category_table, coding_category_emacs_mule,
8790 intern ("coding-category-emacs-mule"));
8791 /* The following are NOT targets of code detection. */
8792 ASET (Vcoding_category_table, coding_category_raw_text,
8793 intern ("coding-category-raw-text"));
8794 ASET (Vcoding_category_table, coding_category_undecided,
8795 intern ("coding-category-undecided"));
7597 8796
7598 defsubr (&Scoding_system_p); 8797 defsubr (&Scoding_system_p);
7599 defsubr (&Sread_coding_system); 8798 defsubr (&Sread_coding_system);
7600 defsubr (&Sread_non_nil_coding_system); 8799 defsubr (&Sread_non_nil_coding_system);
7601 defsubr (&Scheck_coding_system); 8800 defsubr (&Scheck_coding_system);
7602 defsubr (&Sdetect_coding_region); 8801 defsubr (&Sdetect_coding_region);
7603 defsubr (&Sdetect_coding_string); 8802 defsubr (&Sdetect_coding_string);
7604 defsubr (&Sfind_coding_systems_region_internal); 8803 defsubr (&Sfind_coding_systems_region_internal);
7605 defsubr (&Sunencodable_char_position); 8804 defsubr (&Sunencodable_char_position);
8805 defsubr (&Scheck_coding_systems_region);
7606 defsubr (&Sdecode_coding_region); 8806 defsubr (&Sdecode_coding_region);
7607 defsubr (&Sencode_coding_region); 8807 defsubr (&Sencode_coding_region);
7608 defsubr (&Sdecode_coding_string); 8808 defsubr (&Sdecode_coding_string);
7609 defsubr (&Sencode_coding_string); 8809 defsubr (&Sencode_coding_string);
7610 defsubr (&Sdecode_sjis_char); 8810 defsubr (&Sdecode_sjis_char);
7615 defsubr (&Sset_safe_terminal_coding_system_internal); 8815 defsubr (&Sset_safe_terminal_coding_system_internal);
7616 defsubr (&Sterminal_coding_system); 8816 defsubr (&Sterminal_coding_system);
7617 defsubr (&Sset_keyboard_coding_system_internal); 8817 defsubr (&Sset_keyboard_coding_system_internal);
7618 defsubr (&Skeyboard_coding_system); 8818 defsubr (&Skeyboard_coding_system);
7619 defsubr (&Sfind_operation_coding_system); 8819 defsubr (&Sfind_operation_coding_system);
7620 defsubr (&Supdate_coding_systems_internal); 8820 defsubr (&Sset_coding_system_priority);
7621 defsubr (&Sset_coding_priority_internal);
7622 defsubr (&Sdefine_coding_system_internal); 8821 defsubr (&Sdefine_coding_system_internal);
8822 defsubr (&Sdefine_coding_system_alias);
8823 defsubr (&Scoding_system_base);
8824 defsubr (&Scoding_system_plist);
8825 defsubr (&Scoding_system_aliases);
8826 defsubr (&Scoding_system_eol_type);
8827 defsubr (&Scoding_system_priority_list);
7623 8828
7624 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list, 8829 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7625 doc: /* List of coding systems. 8830 doc: /* List of coding systems.
7626 8831
7627 Do not alter the value of this variable manually. This variable should be 8832 Do not alter the value of this variable manually. This variable should be
7628 updated by the functions `make-coding-system' and 8833 updated by the functions `define-coding-system' and
7629 `define-coding-system-alias'. */); 8834 `define-coding-system-alias'. */);
7630 Vcoding_system_list = Qnil; 8835 Vcoding_system_list = Qnil;
7631 8836
7632 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist, 8837 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7633 doc: /* Alist of coding system names. 8838 doc: /* Alist of coding system names.
7648 system bound to the corresponding coding-category is selected. */); 8853 system bound to the corresponding coding-category is selected. */);
7649 { 8854 {
7650 int i; 8855 int i;
7651 8856
7652 Vcoding_category_list = Qnil; 8857 Vcoding_category_list = Qnil;
7653 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--) 8858 for (i = coding_category_max - 1; i >= 0; i--)
7654 Vcoding_category_list 8859 Vcoding_category_list
7655 = Fcons (XVECTOR (Vcoding_category_table)->contents[i], 8860 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7656 Vcoding_category_list); 8861 Vcoding_category_list);
7657 } 8862 }
7658 8863
7678 For output to files, if the above procedure does not specify a coding system, 8883 For output to files, if the above procedure does not specify a coding system,
7679 the value of `buffer-file-coding-system' is used. */); 8884 the value of `buffer-file-coding-system' is used. */);
7680 Vcoding_system_for_write = Qnil; 8885 Vcoding_system_for_write = Qnil;
7681 8886
7682 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used, 8887 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7683 doc: /* Coding system used in the latest file or process I/O. 8888 doc: /*
7684 Also set by `encode-coding-region', `decode-coding-region', 8889 Coding system used in the latest file or process I/O. */);
7685 `encode-coding-string' and `decode-coding-string'. */);
7686 Vlast_coding_system_used = Qnil; 8890 Vlast_coding_system_used = Qnil;
7687 8891
7688 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion, 8892 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7689 doc: /* *Non-nil means always inhibit code conversion of end-of-line format. 8893 doc: /*
8894 *Non-nil means always inhibit code conversion of end-of-line format.
7690 See info node `Coding Systems' and info node `Text and Binary' concerning 8895 See info node `Coding Systems' and info node `Text and Binary' concerning
7691 such conversion. */); 8896 such conversion. */);
7692 inhibit_eol_conversion = 0; 8897 inhibit_eol_conversion = 0;
7693 8898
7694 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system, 8899 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7695 doc: /* Non-nil means process buffer inherits coding system of process output. 8900 doc: /*
8901 Non-nil means process buffer inherits coding system of process output.
7696 Bind it to t if the process output is to be treated as if it were a file 8902 Bind it to t if the process output is to be treated as if it were a file
7697 read from some filesystem. */); 8903 read from some filesystem. */);
7698 inherit_process_coding_system = 0; 8904 inherit_process_coding_system = 0;
7699 8905
7700 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist, 8906 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7701 doc: /* Alist to decide a coding system to use for a file I/O operation. 8907 doc: /*
8908 Alist to decide a coding system to use for a file I/O operation.
7702 The format is ((PATTERN . VAL) ...), 8909 The format is ((PATTERN . VAL) ...),
7703 where PATTERN is a regular expression matching a file name, 8910 where PATTERN is a regular expression matching a file name,
7704 VAL is a coding system, a cons of coding systems, or a function symbol. 8911 VAL is a coding system, a cons of coding systems, or a function symbol.
7705 If VAL is a coding system, it is used for both decoding and encoding 8912 If VAL is a coding system, it is used for both decoding and encoding
7706 the file contents. 8913 the file contents.
7707 If VAL is a cons of coding systems, the car part is used for decoding, 8914 If VAL is a cons of coding systems, the car part is used for decoding,
7708 and the cdr part is used for encoding. 8915 and the cdr part is used for encoding.
7709 If VAL is a function symbol, the function must return a coding system 8916 If VAL is a function symbol, the function must return a coding system
7710 or a cons of coding systems which are used as above. The function gets 8917 or a cons of coding systems which are used as above. The function gets
7711 the arguments with which `find-operation-coding-system' was called. 8918 the arguments with which `find-operation-coding-system' was called.
7712 8919
7713 See also the function `find-operation-coding-system' 8920 See also the function `find-operation-coding-system'
7714 and the variable `auto-coding-alist'. */); 8921 and the variable `auto-coding-alist'. */);
7715 Vfile_coding_system_alist = Qnil; 8922 Vfile_coding_system_alist = Qnil;
7716 8923
7717 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist, 8924 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
7718 doc: /* Alist to decide a coding system to use for a process I/O operation. 8925 doc: /*
8926 Alist to decide a coding system to use for a process I/O operation.
7719 The format is ((PATTERN . VAL) ...), 8927 The format is ((PATTERN . VAL) ...),
7720 where PATTERN is a regular expression matching a program name, 8928 where PATTERN is a regular expression matching a program name,
7721 VAL is a coding system, a cons of coding systems, or a function symbol. 8929 VAL is a coding system, a cons of coding systems, or a function symbol.
7722 If VAL is a coding system, it is used for both decoding what received 8930 If VAL is a coding system, it is used for both decoding what received
7723 from the program and encoding what sent to the program. 8931 from the program and encoding what sent to the program.
7728 8936
7729 See also the function `find-operation-coding-system'. */); 8937 See also the function `find-operation-coding-system'. */);
7730 Vprocess_coding_system_alist = Qnil; 8938 Vprocess_coding_system_alist = Qnil;
7731 8939
7732 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist, 8940 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
7733 doc: /* Alist to decide a coding system to use for a network I/O operation. 8941 doc: /*
8942 Alist to decide a coding system to use for a network I/O operation.
7734 The format is ((PATTERN . VAL) ...), 8943 The format is ((PATTERN . VAL) ...),
7735 where PATTERN is a regular expression matching a network service name 8944 where PATTERN is a regular expression matching a network service name
7736 or is a port number to connect to, 8945 or is a port number to connect to,
7737 VAL is a coding system, a cons of coding systems, or a function symbol. 8946 VAL is a coding system, a cons of coding systems, or a function symbol.
7738 If VAL is a coding system, it is used for both decoding what received 8947 If VAL is a coding system, it is used for both decoding what received
7750 Also used for decoding keyboard input on X Window system. */); 8959 Also used for decoding keyboard input on X Window system. */);
7751 Vlocale_coding_system = Qnil; 8960 Vlocale_coding_system = Qnil;
7752 8961
7753 /* The eol mnemonics are reset in startup.el system-dependently. */ 8962 /* The eol mnemonics are reset in startup.el system-dependently. */
7754 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix, 8963 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
7755 doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format. */); 8964 doc: /*
8965 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
7756 eol_mnemonic_unix = build_string (":"); 8966 eol_mnemonic_unix = build_string (":");
7757 8967
7758 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos, 8968 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
7759 doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format. */); 8969 doc: /*
8970 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
7760 eol_mnemonic_dos = build_string ("\\"); 8971 eol_mnemonic_dos = build_string ("\\");
7761 8972
7762 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac, 8973 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
7763 doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format. */); 8974 doc: /*
8975 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
7764 eol_mnemonic_mac = build_string ("/"); 8976 eol_mnemonic_mac = build_string ("/");
7765 8977
7766 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided, 8978 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
7767 doc: /* *String displayed in mode line when end-of-line format is not yet determined. */); 8979 doc: /*
8980 *String displayed in mode line when end-of-line format is not yet determined. */);
7768 eol_mnemonic_undecided = build_string (":"); 8981 eol_mnemonic_undecided = build_string (":");
7769 8982
7770 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation, 8983 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
7771 doc: /* *Non-nil enables character translation while encoding and decoding. */); 8984 doc: /*
8985 *Non-nil enables character translation while encoding and decoding. */);
7772 Venable_character_translation = Qt; 8986 Venable_character_translation = Qt;
7773 8987
7774 DEFVAR_LISP ("standard-translation-table-for-decode", 8988 DEFVAR_LISP ("standard-translation-table-for-decode",
7775 &Vstandard_translation_table_for_decode, 8989 &Vstandard_translation_table_for_decode,
7776 doc: /* Table for translating characters while decoding. */); 8990 doc: /* Table for translating characters while decoding. */);
7779 DEFVAR_LISP ("standard-translation-table-for-encode", 8993 DEFVAR_LISP ("standard-translation-table-for-encode",
7780 &Vstandard_translation_table_for_encode, 8994 &Vstandard_translation_table_for_encode,
7781 doc: /* Table for translating characters while encoding. */); 8995 doc: /* Table for translating characters while encoding. */);
7782 Vstandard_translation_table_for_encode = Qnil; 8996 Vstandard_translation_table_for_encode = Qnil;
7783 8997
7784 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist, 8998 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
7785 doc: /* Alist of charsets vs revision numbers. 8999 doc: /* Alist of charsets vs revision numbers.
7786 While encoding, if a charset (car part of an element) is found, 9000 While encoding, if a charset (car part of an element) is found,
7787 designate it with the escape sequence identifying revision (cdr part of the element). */); 9001 designate it with the escape sequence identifying revision (cdr part
7788 Vcharset_revision_alist = Qnil; 9002 of the element). */);
9003 Vcharset_revision_table = Qnil;
7789 9004
7790 DEFVAR_LISP ("default-process-coding-system", 9005 DEFVAR_LISP ("default-process-coding-system",
7791 &Vdefault_process_coding_system, 9006 &Vdefault_process_coding_system,
7792 doc: /* Cons of coding systems used for process I/O by default. 9007 doc: /* Cons of coding systems used for process I/O by default.
7793 The car part is used for decoding a process output, 9008 The car part is used for decoding a process output,
7794 the cdr part is used for encoding a text to be sent to a process. */); 9009 the cdr part is used for encoding a text to be sent to a process. */);
7795 Vdefault_process_coding_system = Qnil; 9010 Vdefault_process_coding_system = Qnil;
7796 9011
7797 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table, 9012 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
7798 doc: /* Table of extra Latin codes in the range 128..159 (inclusive). 9013 doc: /*
9014 Table of extra Latin codes in the range 128..159 (inclusive).
7799 This is a vector of length 256. 9015 This is a vector of length 256.
7800 If Nth element is non-nil, the existence of code N in a file 9016 If Nth element is non-nil, the existence of code N in a file
7801 \(or output of subprocess) doesn't prevent it to be detected as 9017 \(or output of subprocess) doesn't prevent it to be detected as
7802 a coding system of ISO 2022 variant which has a flag 9018 a coding system of ISO 2022 variant which has a flag
7803 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file 9019 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
7805 Only 128th through 159th elements has a meaning. */); 9021 Only 128th through 159th elements has a meaning. */);
7806 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil); 9022 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
7807 9023
7808 DEFVAR_LISP ("select-safe-coding-system-function", 9024 DEFVAR_LISP ("select-safe-coding-system-function",
7809 &Vselect_safe_coding_system_function, 9025 &Vselect_safe_coding_system_function,
7810 doc: /* Function to call to select safe coding system for encoding a text. 9026 doc: /*
9027 Function to call to select safe coding system for encoding a text.
7811 9028
7812 If set, this function is called to force a user to select a proper 9029 If set, this function is called to force a user to select a proper
7813 coding system which can encode the text in the case that a default 9030 coding system which can encode the text in the case that a default
7814 coding system used in each operation can't encode the text. 9031 coding system used in each operation can't encode the text.
7815 9032
7825 coding_system_require_warning = 0; 9042 coding_system_require_warning = 0;
7826 9043
7827 9044
7828 DEFVAR_BOOL ("inhibit-iso-escape-detection", 9045 DEFVAR_BOOL ("inhibit-iso-escape-detection",
7829 &inhibit_iso_escape_detection, 9046 &inhibit_iso_escape_detection,
7830 doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection. 9047 doc: /*
9048 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
7831 9049
7832 By default, on reading a file, Emacs tries to detect how the text is 9050 By default, on reading a file, Emacs tries to detect how the text is
7833 encoded. This code detection is sensitive to escape sequences. If 9051 encoded. This code detection is sensitive to escape sequences. If
7834 the sequence is valid as ISO2022, the code is determined as one of 9052 the sequence is valid as ISO2022, the code is determined as one of
7835 the ISO2022 encodings, and the file is decoded by the corresponding 9053 the ISO2022 encodings, and the file is decoded by the corresponding
7855 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input, 9073 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
7856 doc: /* Char table for translating self-inserting characters. 9074 doc: /* Char table for translating self-inserting characters.
7857 This is applied to the result of input methods, not their input. See also 9075 This is applied to the result of input methods, not their input. See also
7858 `keyboard-translate-table'. */); 9076 `keyboard-translate-table'. */);
7859 Vtranslation_table_for_input = Qnil; 9077 Vtranslation_table_for_input = Qnil;
9078
9079 {
9080 Lisp_Object args[coding_arg_max];
9081 Lisp_Object plist[16];
9082 int i;
9083
9084 for (i = 0; i < coding_arg_max; i++)
9085 args[i] = Qnil;
9086
9087 plist[0] = intern (":name");
9088 plist[1] = args[coding_arg_name] = Qno_conversion;
9089 plist[2] = intern (":mnemonic");
9090 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9091 plist[4] = intern (":coding-type");
9092 plist[5] = args[coding_arg_coding_type] = Qraw_text;
9093 plist[6] = intern (":ascii-compatible-p");
9094 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9095 plist[8] = intern (":default-char");
9096 plist[9] = args[coding_arg_default_char] = make_number (0);
9097 plist[10] = intern (":for-unibyte");
9098 plist[11] = args[coding_arg_for_unibyte] = Qt;
9099 plist[12] = intern (":docstring");
9100 plist[13] = build_string ("Do no conversion.\n\
9101 \n\
9102 When you visit a file with this coding, the file is read into a\n\
9103 unibyte buffer as is, thus each byte of a file is treated as a\n\
9104 character.");
9105 plist[14] = intern (":eol-type");
9106 plist[15] = args[coding_arg_eol_type] = Qunix;
9107 args[coding_arg_plist] = Flist (16, plist);
9108 Fdefine_coding_system_internal (coding_arg_max, args);
9109 }
9110
9111 setup_coding_system (Qno_conversion, &keyboard_coding);
9112 setup_coding_system (Qno_conversion, &terminal_coding);
9113 setup_coding_system (Qno_conversion, &safe_terminal_coding);
9114
9115 {
9116 int i;
9117
9118 for (i = 0; i < coding_category_max; i++)
9119 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9120 }
7860 } 9121 }
7861 9122
7862 char * 9123 char *
7863 emacs_strerror (error_number) 9124 emacs_strerror (error_number)
7864 int error_number; 9125 int error_number;
7878 9139
7879 return str; 9140 return str;
7880 } 9141 }
7881 9142
7882 #endif /* emacs */ 9143 #endif /* emacs */
7883