17052
|
1 /* Coding system handler (conversion, detection, and etc).
|
|
2 Ver.1.0.
|
|
3
|
|
4 Copyright (C) 1995 Free Software Foundation, Inc.
|
|
5 Copyright (C) 1995 Electrotechnical Laboratory, JAPAN.
|
|
6
|
|
7 This program is free software; you can redistribute it and/or modify
|
|
8 it under the terms of the GNU General Public License as published by
|
|
9 the Free Software Foundation; either version 2, or (at your option)
|
|
10 any later version.
|
|
11
|
|
12 This program is distributed in the hope that it will be useful,
|
|
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15 GNU General Public License for more details.
|
|
16
|
|
17 You should have received a copy of the GNU General Public License
|
|
18 along with this program; if not, write to the Free Software
|
|
19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
|
|
20
|
|
21 /*** TABLE OF CONTENTS ***
|
|
22
|
|
23 1. Preamble
|
|
24 2. Emacs' internal format handlers
|
|
25 3. ISO2022 handlers
|
|
26 4. Shift-JIS and BIG5 handlers
|
|
27 5. End-of-line handlers
|
|
28 6. C library functions
|
|
29 7. Emacs Lisp library functions
|
|
30 8. Post-amble
|
|
31
|
|
32 */
|
|
33
|
|
34 /*** GENERAL NOTE on CODING SYSTEM ***
|
|
35
|
|
36 Coding system is an encoding mechanism of one or more character
|
|
37 sets. Here's a list of coding systems which Emacs can handle. When
|
|
38 we say "decode", it means converting some other coding system to
|
|
39 Emacs' internal format, and when we say "encode", it means
|
|
40 converting Emacs' internal format to some other coding system.
|
|
41
|
|
42 0. Emacs' internal format
|
|
43
|
|
44 Emacs itself holds a multi-lingual character in a buffer and a string
|
|
45 in a special format. Details are described in the section 2.
|
|
46
|
|
47 1. ISO2022
|
|
48
|
|
49 The most famous coding system for multiple character sets. X's
|
|
50 Compound Text, various EUCs (Extended Unix Code), and such coding
|
|
51 systems used in Internet communication as ISO-2022-JP are all
|
|
52 variants of ISO2022. Details are described in the section 3.
|
|
53
|
|
54 2. SJIS (or Shift-JIS or MS-Kanji-Code)
|
|
55
|
|
56 A coding system to encode character sets: ASCII, JISX0201, and
|
|
57 JISX0208. Widely used for PC's in Japan. Details are described in
|
|
58 the section 4.
|
|
59
|
|
60 3. BIG5
|
|
61
|
|
62 A coding system to encode character sets: ASCII and Big5. Widely
|
|
63 used by Chinese (mainly in Taiwan and Hong Kong). Details are
|
|
64 described in the section 4. In this file, when written as "BIG5"
|
|
65 (all uppercase), it means the coding system, and when written as
|
|
66 "Big5" (capitalized), it means the character set.
|
|
67
|
|
68 4. Else
|
|
69
|
|
70 If a user wants to read/write a text encoded in a coding system not
|
|
71 listed above, he can supply a decoder and an encoder for it in CCL
|
|
72 (Code Conversion Language) programs. Emacs executes the CCL program
|
|
73 while reading/writing.
|
|
74
|
|
75 Emacs represents a coding-system by a Lisp symbol that has a property
|
|
76 `coding-system'. But, before actually using the coding-system, the
|
|
77 information about it is set in a structure of type `struct
|
|
78 coding_system' for rapid processing. See the section 6 for more
|
|
79 detail.
|
|
80
|
|
81 */
|
|
82
|
|
83 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
|
|
84
|
|
85 How end-of-line of a text is encoded depends on a system. For
|
|
86 instance, Unix's format is just one byte of `line-feed' code,
|
|
87 whereas DOS's format is two bytes sequence of `carriage-return' and
|
|
88 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
|
|
89
|
|
90 Since how characters in a text is encoded and how end-of-line is
|
|
91 encoded is independent, any coding system described above can take
|
|
92 any format of end-of-line. So, Emacs has information of format of
|
|
93 end-of-line in each coding-system. See the section 6 for more
|
|
94 detail.
|
|
95
|
|
96 */
|
|
97
|
|
98 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
|
|
99
|
|
100 These functions check if a text between SRC and SRC_END is encoded
|
|
101 in the coding system category XXX. Each returns an integer value in
|
|
102 which appropriate flag bits for the category XXX is set. The flag
|
|
103 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
|
|
104 template of these functions. */
|
|
105 #if 0
|
|
106 int
|
|
107 detect_coding_internal (src, src_end)
|
|
108 unsigned char *src, *src_end;
|
|
109 {
|
|
110 ...
|
|
111 }
|
|
112 #endif
|
|
113
|
|
114 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
|
|
115
|
|
116 These functions decode SRC_BYTES length text at SOURCE encoded in
|
|
117 CODING to Emacs' internal format. The resulting text goes to a
|
|
118 place pointed by DESTINATION, the length of which should not exceed
|
|
119 DST_BYTES. The bytes actually processed is returned as *CONSUMED.
|
|
120 The return value is the length of the decoded text. Below is a
|
|
121 template of these functions. */
|
|
122 #if 0
|
|
123 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
|
|
124 struct coding_system *coding;
|
|
125 unsigned char *source, *destination;
|
|
126 int src_bytes, dst_bytes;
|
|
127 int *consumed;
|
|
128 {
|
|
129 ...
|
|
130 }
|
|
131 #endif
|
|
132
|
|
133 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
|
|
134
|
|
135 These functions encode SRC_BYTES length text at SOURCE of Emacs
|
|
136 internal format to CODING. The resulting text goes to a place
|
|
137 pointed by DESTINATION, the length of which should not exceed
|
|
138 DST_BYTES. The bytes actually processed is returned as *CONSUMED.
|
|
139 The return value is the length of the encoded text. Below is a
|
|
140 template of these functions. */
|
|
141 #if 0
|
|
142 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
|
|
143 struct coding_system *coding;
|
|
144 unsigned char *source, *destination;
|
|
145 int src_bytes, dst_bytes;
|
|
146 int *consumed;
|
|
147 {
|
|
148 ...
|
|
149 }
|
|
150 #endif
|
|
151
|
|
152 /*** COMMONLY USED MACROS ***/
|
|
153
|
|
/* Safe byte fetchers.  ONE_MORE_BYTE, TWO_MORE_BYTES and
   THREE_MORE_BYTES read one, two or three bytes from the source text
   into their arguments and advance `src' past them.  If fewer bytes
   than requested remain before `src_end', nothing is read and control
   jumps to `label_end_of_loop'.  The caller must set the variables
   `src' and `src_end' in advance and must provide that label.  */

#define ONE_MORE_BYTE(c1)                       \
  do {                                          \
    if (src >= src_end)                         \
      goto label_end_of_loop;                   \
    c1 = *src++;                                \
  } while (0)

#define TWO_MORE_BYTES(c1, c2)                  \
  do {                                          \
    if (src_end - src < 2)                      \
      goto label_end_of_loop;                   \
    c1 = *src++;                                \
    c2 = *src++;                                \
  } while (0)

#define THREE_MORE_BYTES(c1, c2, c3)            \
  do {                                          \
    if (src_end - src < 3)                      \
      goto label_end_of_loop;                   \
    c1 = *src++;                                \
    c2 = *src++;                                \
    c3 = *src++;                                \
  } while (0)
|
|
183
|
|
/* Decoding helpers.  DECODE_CHARACTER_ASCII,
   DECODE_CHARACTER_DIMENSION1 and DECODE_CHARACTER_DIMENSION2 store
   the multi-byte form of one character at the place pointed to by
   `dst' and advance `dst' past it.  The caller must set `dst' to
   point to an appropriate area and `coding' to point to the
   coding-system of the text being decoded in advance.  */

/* Decode one ASCII character C.  While composing, it is stored as
   the two bytes 0xA0 and C | 0x80; otherwise as the single byte C.  */

#define DECODE_CHARACTER_ASCII(c)               \
  do {                                          \
    if (! COMPOSING_P (coding->composing))      \
      *dst++ = (c);                             \
    else                                        \
      {                                         \
        *dst++ = 0xA0;                          \
        *dst++ = (c) | 0x80;                    \
      }                                         \
  } while (0)

/* Decode one DIMENSION1 character whose charset is CHARSET and whose
   position-code is C.  The base leading-code is stored first (with
   0x20 added while composing), then the extended leading-code if it
   is nonzero, then C | 0x80.  */

#define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
  do {                                                                  \
    unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
    *dst++ = (COMPOSING_P (coding->composing)                           \
              ? leading_code + 0x20                                     \
              : leading_code);                                          \
    leading_code = CHARSET_LEADING_CODE_EXT (charset);                  \
    if (leading_code != 0)                                              \
      *dst++ = leading_code;                                            \
    *dst++ = (c) | 0x80;                                                \
  } while (0)

/* Decode one DIMENSION2 character whose charset is CHARSET and whose
   position-codes are C1 and C2: the DIMENSION1 form of C1 followed
   by C2 | 0x80.  */

#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
  do {                                                  \
    DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
    *dst++ = (c2) | 0x80;                               \
  } while (0)
|
|
224
|
|
225
|
|
226 /*** 1. Preamble ***/
|
|
227
|
|
228 #include <stdio.h>
|
|
229
|
|
230 #ifdef emacs
|
|
231
|
|
232 #include <config.h>
|
|
233 #include "lisp.h"
|
|
234 #include "buffer.h"
|
|
235 #include "charset.h"
|
|
236 #include "ccl.h"
|
|
237 #include "coding.h"
|
|
238 #include "window.h"
|
|
239
|
|
240 #else /* not emacs */
|
|
241
|
|
242 #include "mulelib.h"
|
|
243
|
|
244 #endif /* not emacs */
|
|
245
|
|
246 Lisp_Object Qcoding_system, Qeol_type;
|
|
247 Lisp_Object Qbuffer_file_coding_system;
|
|
248 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
|
|
249
|
|
250 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
|
|
251 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
|
|
252 Lisp_Object Qstart_process, Qopen_network_stream;
|
|
253 Lisp_Object Qtarget_idx;
|
|
254
|
|
255 /* Mnemonic character of each format of end-of-line. */
|
|
256 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
|
|
257 /* Mnemonic character to indicate format of end-of-line is not yet
|
|
258 decided. */
|
|
259 int eol_mnemonic_undecided;
|
|
260
|
|
261 #ifdef emacs
|
|
262
|
|
263 Lisp_Object Qcoding_system_vector, Qcoding_system_p, Qcoding_system_error;
|
|
264
|
|
265 /* Coding-systems are handed between Emacs Lisp programs and C internal
|
|
266 routines by the following three variables. */
|
|
267 /* Coding-system for reading files and receiving data from process. */
|
|
268 Lisp_Object Vcoding_system_for_read;
|
|
269 /* Coding-system for writing files and sending data to process. */
|
|
270 Lisp_Object Vcoding_system_for_write;
|
|
271 /* Coding-system actually used in the latest I/O. */
|
|
272 Lisp_Object Vlast_coding_system_used;
|
|
273
|
|
274 /* Coding-system of what terminal accept for displaying. */
|
|
275 struct coding_system terminal_coding;
|
|
276
|
|
277 /* Coding-system of what is sent from terminal keyboard. */
|
|
278 struct coding_system keyboard_coding;
|
|
279
|
|
280 Lisp_Object Vcoding_system_alist;
|
|
281
|
|
282 #endif /* emacs */
|
|
283
|
|
284 Lisp_Object Qcoding_category_index;
|
|
285
|
|
286 /* List of symbols `coding-category-xxx' ordered by priority. */
|
|
287 Lisp_Object Vcoding_category_list;
|
|
288
|
|
289 /* Table of coding-systems currently assigned to each coding-category. */
|
|
290 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
|
|
291
|
|
292 /* Table of names of symbol for each coding-category. */
|
|
293 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
|
|
294 "coding-category-internal",
|
|
295 "coding-category-sjis",
|
|
296 "coding-category-iso-7",
|
|
297 "coding-category-iso-8-1",
|
|
298 "coding-category-iso-8-2",
|
|
299 "coding-category-iso-else",
|
|
300 "coding-category-big5",
|
|
301 "coding-category-binary"
|
|
302 };
|
|
303
|
|
304 /* Alist of charsets vs the alternate charsets. */
|
|
305 Lisp_Object Valternate_charset_table;
|
|
306
|
|
307 /* Alist of charsets vs revision number. */
|
|
308 Lisp_Object Vcharset_revision_alist;
|
|
309
|
|
310
|
|
311 /*** 2. Emacs internal format handlers ***/
|
|
312
|
|
313 /* Emacs' internal format for encoding multiple character sets is a
|
|
314 kind of multi-byte encoding, i.e. encoding a character by a sequence
|
|
315 of one-byte codes of variable length. ASCII characters and control
|
|
316 characters (e.g. `tab', `newline') are represented by one-byte as
|
|
317 is. It takes the range 0x00 through 0x7F. The other characters
|
|
318 are represented by a sequence of `base leading-code', optional
|
|
319 `extended leading-code', and one or two `position-code's. Length
|
|
320 of the sequence is decided by the base leading-code. Leading-code
|
|
321 takes the range 0x80 through 0x9F, whereas extended leading-code
|
|
322 and position-code take the range 0xA0 through 0xFF. See the
|
|
323 document of `charset.h' for more detail about leading-code and
|
|
324 position-code.
|
|
325
|
|
326 There's one exception in this rule. Special leading-code
|
|
327 `leading-code-composition' denotes that the following several
|
|
328 characters should be composed into one character. Leading-codes of
|
|
329 components (except for ASCII) are added 0x20. An ASCII character
|
|
330 component is represented by a 2-byte sequence of `0xA0' and
|
|
331 `ASCII-code + 0x80'. See also the document in `charset.h' for the
|
|
332 detail of composite character. Hence, we can summarize the code
|
|
333 range as follows:
|
|
334
|
|
335 --- CODE RANGE of Emacs' internal format ---
|
|
336 (character set) (range)
|
|
337 ASCII 0x00 .. 0x7F
|
|
338 ELSE (1st byte) 0x80 .. 0x9F
|
|
339 (rest bytes) 0xA0 .. 0xFF
|
|
340 ---------------------------------------------
|
|
341
|
|
342 */
|
|
343
|
|
344 enum emacs_code_class_type emacs_code_class[256];
|
|
345
|
|
/* Consume the byte at *SRC and go on to the next statement only if
   that byte is in the range 0xA0..0xFF.  If no byte is available
   (SRC has reached SRC_END), jump to `label_end_of_switch' instead;
   if the byte is less than 0xA0, make the enclosing function
   return 0.  */
#define CHECK_CODE_RANGE_A0_FF                  \
  do {                                          \
    if (src >= src_end)                         \
      goto label_end_of_switch;                 \
    if (*src++ < 0xA0)                          \
      return 0;                                 \
  } while (0)
|
|
355
|
|
356 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
|
|
357 Check if a text is encoded in Emacs' internal format. If it is,
|
|
358 return CODING_CATEGORY_MASK_INTERNAL, else return 0. */
|
|
359
|
|
360 int
|
|
361 detect_coding_internal (src, src_end)
|
|
362 unsigned char *src, *src_end;
|
|
363 {
|
|
364 unsigned char c;
|
|
365 int composing = 0;
|
|
366
|
|
367 while (src < src_end)
|
|
368 {
|
|
369 c = *src++;
|
|
370
|
|
371 if (composing)
|
|
372 {
|
|
373 if (c < 0xA0)
|
|
374 composing = 0;
|
|
375 else
|
|
376 c -= 0x20;
|
|
377 }
|
|
378
|
|
379 switch (emacs_code_class[c])
|
|
380 {
|
|
381 case EMACS_ascii_code:
|
|
382 case EMACS_linefeed_code:
|
|
383 break;
|
|
384
|
|
385 case EMACS_control_code:
|
|
386 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
|
|
387 return 0;
|
|
388 break;
|
|
389
|
|
390 case EMACS_invalid_code:
|
|
391 return 0;
|
|
392
|
|
393 case EMACS_leading_code_composition: /* c == 0x80 */
|
|
394 if (composing)
|
|
395 CHECK_CODE_RANGE_A0_FF;
|
|
396 else
|
|
397 composing = 1;
|
|
398 break;
|
|
399
|
|
400 case EMACS_leading_code_4:
|
|
401 CHECK_CODE_RANGE_A0_FF;
|
|
402 /* fall down to check it two more times ... */
|
|
403
|
|
404 case EMACS_leading_code_3:
|
|
405 CHECK_CODE_RANGE_A0_FF;
|
|
406 /* fall down to check it one more time ... */
|
|
407
|
|
408 case EMACS_leading_code_2:
|
|
409 CHECK_CODE_RANGE_A0_FF;
|
|
410 break;
|
|
411
|
|
412 default:
|
|
413 label_end_of_switch:
|
|
414 break;
|
|
415 }
|
|
416 }
|
|
417 return CODING_CATEGORY_MASK_INTERNAL;
|
|
418 }
|
|
419
|
|
420
|
|
421 /*** 3. ISO2022 handlers ***/
|
|
422
|
|
423 /* The following note describes the coding system ISO2022 briefly.
|
|
424 Since the intention of this note is to help understanding of the
|
|
425 programs in this file, some parts are NOT ACCURATE or OVERLY
|
|
426 SIMPLIFIED. For the thorough understanding, please refer to the
|
|
427 original document of ISO2022.
|
|
428
|
|
429 ISO2022 provides many mechanisms to encode several character sets
|
|
430 in 7-bit and 8-bit environment. If one chooses a 7-bit environment,
|
|
431 all text is encoded by codes of less than 128. This may make the
|
|
432 encoded text a little bit longer, but the text get more stability
|
|
433 to pass through several gateways (some of them split MSB off).
|
|
434
|
|
435 There are two kinds of character sets: control character set and
|
|
436 graphic character set. The former contains control characters such
|
|
437 as `newline' and `escape' to provide control functions (control
|
|
438 functions are provided also by escape sequence). The latter
|
|
439 contains graphic characters such as 'A' and '-'. Emacs recognizes
|
|
440 two control character sets and many graphic character sets.
|
|
441
|
|
442 Graphic character sets are classified into one of the following
|
|
443 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
|
|
444 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
|
|
445 bytes (DIMENSION) and the number of characters in one dimension
|
|
446 (CHARS) of the set. In addition, each character set is assigned an
|
|
447 identification tag (called "final character" and denoted as <F>
|
|
448 here after) which is unique in each class. <F> of each character
|
|
449 set is decided by ECMA(*) when it is registered in ISO. Code range
|
|
450 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
|
|
451
|
|
452 Note (*): ECMA = European Computer Manufacturers Association
|
|
453
|
|
454 Here are examples of graphic character set [NAME(<F>)]:
|
|
455 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
|
|
456 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
|
|
457 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
|
|
458 o DIMENSION2_CHARS96 -- none for the moment
|
|
459
|
|
460 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
|
|
461 C0 [0x00..0x1F] -- control character plane 0
|
|
462 GL [0x20..0x7F] -- graphic character plane 0
|
|
463 C1 [0x80..0x9F] -- control character plane 1
|
|
464 GR [0xA0..0xFF] -- graphic character plane 1
|
|
465
|
|
466 A control character set is directly designated and invoked to C0 or
|
|
467 C1 by an escape sequence. The most common case is that ISO646's
|
|
468 control character set is designated/invoked to C0 and ISO6429's
|
|
469 control character set is designated/invoked to C1, and usually
|
|
470 these designations/invocations are omitted in a coded text. With
|
|
471 7-bit environment, only C0 can be used, and a control character for
|
|
472 C1 is encoded by an appropriate escape sequence to fit in the
|
|
473 environment. All control characters for C1 are defined the
|
|
474 corresponding escape sequences.
|
|
475
|
|
476 A graphic character set is at first designated to one of four
|
|
477 graphic registers (G0 through G3), then these graphic registers are
|
|
478 invoked to GL or GR. These designations and invocations can be
|
|
479 done independently. The most common case is that G0 is invoked to
|
|
480 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
|
|
481 these invocations and designations are omitted in a coded text.
|
|
482 With 7-bit environment, only GL can be used.
|
|
483
|
|
484 When a graphic character set of CHARS94 is invoked to GL, code 0x20
|
|
485 and 0x7F of GL area work as control characters SPACE and DEL
|
|
486 respectively, and code 0xA0 and 0xFF of GR area should not be used.
|
|
487
|
|
488 There are two ways of invocation: locking-shift and single-shift.
|
|
489 With locking-shift, the invocation lasts until the next different
|
|
490 invocation, whereas with single-shift, the invocation works only
|
|
491 for the following character and doesn't affect locking-shift.
|
|
492 Invocations are done by the following control characters or escape
|
|
493 sequences.
|
|
494
|
|
495 ----------------------------------------------------------------------
|
|
496 function control char escape sequence description
|
|
497 ----------------------------------------------------------------------
|
|
498 SI (shift-in) 0x0F none invoke G0 to GL
|
|
499 SO (shift-out) 0x0E none invoke G1 to GL
|
|
500 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
|
|
501 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
|
|
502 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
|
|
503 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
|
|
504 ----------------------------------------------------------------------
|
|
505 The first four are for locking-shift. Control characters for these
|
|
506 functions are defined by macros ISO_CODE_XXX in `coding.h'.
|
|
507
|
|
508 Designations are done by the following escape sequences.
|
|
509 ----------------------------------------------------------------------
|
|
510 escape sequence description
|
|
511 ----------------------------------------------------------------------
|
|
512 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
|
|
513 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
|
|
514 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
|
|
515 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
|
|
516 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
|
|
517 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
|
|
518 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
|
|
519 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
|
|
520 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
|
|
521 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
|
|
522 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
|
|
523 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
|
|
524 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
|
|
525 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
|
|
526 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
|
|
527 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
|
|
528 ----------------------------------------------------------------------
|
|
529
|
|
530 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
|
|
531 of dimension 1, chars 94, and final character <F>, and etc.
|
|
532
|
|
533 Note (*): Although these designations are not allowed in ISO2022,
|
|
534 Emacs accepts them on decoding, and produces them on encoding
|
|
535 CHARS96 character set in a coding system which is characterized as
|
|
536 7-bit environment, non-locking-shift, and non-single-shift.
|
|
537
|
|
538 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
|
|
539 '(' can be omitted. We call this as "short-form" here after.
|
|
540
|
|
541 Now you may notice that there are a lot of ways for encoding the
|
|
542 same multilingual text in ISO2022. Actually, there exist many
|
|
543 coding systems such as Compound Text (used in X's inter client
|
|
544 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
|
|
545 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
|
|
546 localized platforms), and all of these are variants of ISO2022.
|
|
547
|
|
548 In addition to the above, Emacs handles two more kinds of escape
|
|
549 sequences: ISO6429's direction specification and Emacs' private
|
|
550 sequence for specifying character composition.
|
|
551
|
|
552 ISO6429's direction specification takes the following format:
|
|
553 o CSI ']' -- end of the current direction
|
|
554 o CSI '0' ']' -- end of the current direction
|
|
555 o CSI '1' ']' -- start of left-to-right text
|
|
556 o CSI '2' ']' -- start of right-to-left text
|
|
557 The control character CSI (0x9B: control sequence introducer) is
|
|
558 abbreviated to the escape sequence ESC '[' in 7-bit environment.
|
|
559
|
|
560 Character composition specification takes the following format:
|
|
561 o ESC '0' -- start character composition
|
|
562 o ESC '1' -- end character composition
|
|
563 Since these are not standard escape sequences of any ISO, the use
|
|
564 of them for these meaning is restricted to Emacs only. */
|
|
565
|
|
566 enum iso_code_class_type iso_code_class[256];
|
|
567
|
|
568 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
|
|
569 Check if a text is encoded in ISO2022. If it is, returns an
|
|
570 integer in which appropriate flag bits any of:
|
|
571 CODING_CATEGORY_MASK_ISO_7
|
|
572 CODING_CATEGORY_MASK_ISO_8_1
|
|
573 CODING_CATEGORY_MASK_ISO_8_2
|
|
574 CODING_CATEGORY_MASK_ISO_ELSE
|
|
575 are set. If a code which should never appear in ISO2022 is found,
|
|
576 returns 0. */
|
|
577
|
|
578 int
|
|
579 detect_coding_iso2022 (src, src_end)
|
|
580 unsigned char *src, *src_end;
|
|
581 {
|
|
582 unsigned char graphic_register[4];
|
|
583 unsigned char c, esc_cntl;
|
|
584 int mask = (CODING_CATEGORY_MASK_ISO_7
|
|
585 | CODING_CATEGORY_MASK_ISO_8_1
|
|
586 | CODING_CATEGORY_MASK_ISO_8_2);
|
|
587 /* We may look ahead maximum 3 bytes. */
|
|
588 unsigned char *adjusted_src_end = src_end - 3;
|
|
589 int i;
|
|
590
|
|
591 for (i = 0; i < 4; i++)
|
|
592 graphic_register[i] = CHARSET_ASCII;
|
|
593
|
|
594 while (src < adjusted_src_end)
|
|
595 {
|
|
596 c = *src++;
|
|
597 switch (c)
|
|
598 {
|
|
599 case ISO_CODE_ESC:
|
|
600 if (src >= adjusted_src_end)
|
|
601 break;
|
|
602 c = *src++;
|
|
603 if (c == '$')
|
|
604 {
|
|
605 /* Designation of 2-byte character set. */
|
|
606 if (src >= adjusted_src_end)
|
|
607 break;
|
|
608 c = *src++;
|
|
609 }
|
|
610 if ((c >= ')' && c <= '+') || (c >= '-' && c <= '/'))
|
|
611 /* Designation to graphic register 1, 2, or 3. */
|
|
612 mask &= ~CODING_CATEGORY_MASK_ISO_7;
|
|
613 else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
|
|
614 return CODING_CATEGORY_MASK_ISO_ELSE;
|
|
615 break;
|
|
616
|
|
617 case ISO_CODE_SI:
|
|
618 case ISO_CODE_SO:
|
|
619 return CODING_CATEGORY_MASK_ISO_ELSE;
|
|
620
|
|
621 case ISO_CODE_CSI:
|
|
622 case ISO_CODE_SS2:
|
|
623 case ISO_CODE_SS3:
|
|
624 mask &= ~CODING_CATEGORY_MASK_ISO_7;
|
|
625 break;
|
|
626
|
|
627 default:
|
|
628 if (c < 0x80)
|
|
629 break;
|
|
630 else if (c < 0xA0)
|
|
631 return 0;
|
|
632 else
|
|
633 {
|
|
634 int count = 1;
|
|
635
|
|
636 mask &= ~CODING_CATEGORY_MASK_ISO_7;
|
|
637 while (src < adjusted_src_end && *src >= 0xA0)
|
|
638 count++, src++;
|
|
639 if (count & 1 && src < adjusted_src_end)
|
|
640 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
|
|
641 }
|
|
642 break;
|
|
643 }
|
|
644 }
|
|
645
|
|
646 return mask;
|
|
647 }
|
|
648
|
|
/* Decode a character whose charset is CHARSET and whose 1st
   position code is C1.  If the dimension of CHARSET is 2, the 2nd
   position code is fetched from SRC (via ONE_MORE_BYTE) and set to
   the caller's variable `c2'.  If CHARSET is negative, it means that
   we are decoding ill-formed text, and what we can do is just to
   store C1 as is.  The caller must provide `src', `src_end', `dst',
   `coding', `c2' and the label `label_end_of_loop'.  */

#define DECODE_ISO_CHARACTER(charset, c1)                               \
  do {                                                                  \
    if ((charset) >= 0 && CHARSET_DIMENSION (charset) == 2)             \
      {                                                                 \
        ONE_MORE_BYTE (c2);                                             \
      }                                                                 \
    if (COMPOSING_HEAD_P (coding->composing))                           \
      {                                                                 \
        *dst++ = LEADING_CODE_COMPOSITION;                              \
        if (COMPOSING_WITH_RULE_P (coding->composing))                  \
          {                                                             \
            /* To tell composition rules are embedded.  */              \
            *dst++ = 0xFF;                                              \
          }                                                             \
        coding->composing += 2;                                         \
      }                                                                 \
    if ((charset) >= 0)                                                 \
      {                                                                 \
        if ((charset) == CHARSET_ASCII)                                 \
          {                                                             \
            DECODE_CHARACTER_ASCII (c1);                                \
          }                                                             \
        else if (CHARSET_DIMENSION (charset) == 1)                      \
          {                                                             \
            DECODE_CHARACTER_DIMENSION1 (charset, c1);                  \
          }                                                             \
        else                                                            \
          {                                                             \
            DECODE_CHARACTER_DIMENSION2 (charset, c1, c2);              \
          }                                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        *dst++ = c1;                                                    \
      }                                                                 \
    if (COMPOSING_WITH_RULE_P (coding->composing))                      \
      {                                                                 \
        /* To tell a composition rule follows.  */                      \
        coding->composing = COMPOSING_WITH_RULE_RULE;                   \
      }                                                                 \
  } while (0)
|
|
679
|
|
/* Set designation state into CODING: look up the charset identified
   by DIMENSION, CHARS and FINAL_CHAR, replace it by its alternate
   charset if Valternate_charset_table has an entry for it, and, if
   the resulting charset id is not negative, record it as the
   designation of graphic register REG.  When `coding->direction' is
   1 and the charset has a reverse charset, the reverse charset is
   recorded instead.
   NOTE(review): CHARSET_SYMBOL is applied before the `charset >= 0'
   check -- confirm that it tolerates a negative charset id.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)           \
  do {                                                                  \
    int charset = ISO_CHARSET_TABLE (dimension, chars, final_char);     \
    Lisp_Object temp                                                    \
      = Fassq (CHARSET_SYMBOL (charset), Valternate_charset_table);     \
    if (! NILP (temp))                                                  \
      {                                                                 \
        charset = get_charset_id (XCONS (temp)->cdr);                   \
      }                                                                 \
    if (charset >= 0)                                                   \
      {                                                                 \
        if (coding->direction == 1)                                     \
          {                                                             \
            if (CHARSET_REVERSE_CHARSET (charset) >= 0)                 \
              charset = CHARSET_REVERSE_CHARSET (charset);              \
          }                                                             \
        CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;            \
      }                                                                 \
  } while (0)
|
|
696
|
|
697 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
|
|
698
|
|
699 int
|
|
700 decode_coding_iso2022 (coding, source, destination,
|
|
701 src_bytes, dst_bytes, consumed)
|
|
702 struct coding_system *coding;
|
|
703 unsigned char *source, *destination;
|
|
704 int src_bytes, dst_bytes;
|
|
705 int *consumed;
|
|
706 {
|
|
707 unsigned char *src = source;
|
|
708 unsigned char *src_end = source + src_bytes;
|
|
709 unsigned char *dst = destination;
|
|
710 unsigned char *dst_end = destination + dst_bytes;
|
|
711 /* Since the maximum bytes produced by each loop is 7, we subtract 6
|
|
712 from DST_END to assure that overflow checking is necessary only
|
|
713 at the head of loop. */
|
|
714 unsigned char *adjusted_dst_end = dst_end - 6;
|
|
715 int charset;
|
|
716 /* Charsets invoked to graphic plane 0 and 1 respectively. */
|
|
717 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
|
|
718 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
|
|
719
|
|
720 while (src < src_end && dst < adjusted_dst_end)
|
|
721 {
|
|
722 /* SRC_BASE remembers the start position in source in each loop.
|
|
723 The loop will be exited when there's not enough source text
|
|
724 to analyze long escape sequence or 2-byte code (within macros
|
|
725 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
|
|
726 to SRC_BASE before exiting. */
|
|
727 unsigned char *src_base = src;
|
|
728 unsigned char c1 = *src++, c2, cmprule;
|
|
729
|
|
730 switch (iso_code_class [c1])
|
|
731 {
|
|
732 case ISO_0x20_or_0x7F:
|
|
733 if (!coding->composing
|
|
734 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
|
|
735 {
|
|
736 /* This is SPACE or DEL. */
|
|
737 *dst++ = c1;
|
|
738 break;
|
|
739 }
|
|
740 /* This is a graphic character, we fall down ... */
|
|
741
|
|
742 case ISO_graphic_plane_0:
|
|
743 if (coding->composing == COMPOSING_WITH_RULE_RULE)
|
|
744 {
|
|
745 /* This is a composition rule. */
|
|
746 *dst++ = c1 | 0x80;
|
|
747 coding->composing = COMPOSING_WITH_RULE_TAIL;
|
|
748 }
|
|
749 else
|
|
750 DECODE_ISO_CHARACTER (charset0, c1);
|
|
751 break;
|
|
752
|
|
753 case ISO_0xA0_or_0xFF:
|
|
754 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
|
|
755 {
|
|
756 /* Invalid code. */
|
|
757 *dst++ = c1;
|
|
758 break;
|
|
759 }
|
|
760 /* This is a graphic character, we fall down ... */
|
|
761
|
|
762 case ISO_graphic_plane_1:
|
|
763 DECODE_ISO_CHARACTER (charset1, c1);
|
|
764 break;
|
|
765
|
|
766 case ISO_control_code:
|
|
767 /* All ISO2022 control characters in this class have the
|
|
768 same representation in Emacs internal format. */
|
|
769 *dst++ = c1;
|
|
770 break;
|
|
771
|
|
772 case ISO_carriage_return:
|
|
773 if (coding->eol_type == CODING_EOL_CR)
|
|
774 {
|
|
775 *dst++ = '\n';
|
|
776 }
|
|
777 else if (coding->eol_type == CODING_EOL_CRLF)
|
|
778 {
|
|
779 ONE_MORE_BYTE (c1);
|
|
780 if (c1 == ISO_CODE_LF)
|
|
781 *dst++ = '\n';
|
|
782 else
|
|
783 {
|
|
784 src--;
|
|
785 *dst++ = c1;
|
|
786 }
|
|
787 }
|
|
788 else
|
|
789 {
|
|
790 *dst++ = c1;
|
|
791 }
|
|
792 break;
|
|
793
|
|
794 case ISO_shift_out:
|
|
795 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
|
|
796 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
|
|
797 break;
|
|
798
|
|
799 case ISO_shift_in:
|
|
800 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
|
|
801 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
|
|
802 break;
|
|
803
|
|
804 case ISO_single_shift_2_7:
|
|
805 case ISO_single_shift_2:
|
|
806 /* SS2 is handled as an escape sequence of ESC 'N' */
|
|
807 c1 = 'N';
|
|
808 goto label_escape_sequence;
|
|
809
|
|
810 case ISO_single_shift_3:
|
|
811 /* SS3 is handled as an escape sequence of ESC 'O' */
|
|
812 c1 = 'O';
|
|
813 goto label_escape_sequence;
|
|
814
|
|
815 case ISO_control_sequence_introducer:
|
|
816 /* CSI is handled as an escape sequence of ESC '[' ... */
|
|
817 c1 = '[';
|
|
818 goto label_escape_sequence;
|
|
819
|
|
820 case ISO_escape:
|
|
821 ONE_MORE_BYTE (c1);
|
|
822 label_escape_sequence:
|
|
823 /* Escape sequences handled by Emacs are invocation,
|
|
824 designation, direction specification, and character
|
|
825 composition specification. */
|
|
826 switch (c1)
|
|
827 {
|
|
828 case '&': /* revision of following character set */
|
|
829 ONE_MORE_BYTE (c1);
|
|
830 if (!(c1 >= '@' && c1 <= '~'))
|
|
831 {
|
|
832 goto label_invalid_escape_sequence;
|
|
833 }
|
|
834 ONE_MORE_BYTE (c1);
|
|
835 if (c1 != ISO_CODE_ESC)
|
|
836 {
|
|
837 goto label_invalid_escape_sequence;
|
|
838 }
|
|
839 ONE_MORE_BYTE (c1);
|
|
840 goto label_escape_sequence;
|
|
841
|
|
842 case '$': /* designation of 2-byte character set */
|
|
843 ONE_MORE_BYTE (c1);
|
|
844 if (c1 >= '@' && c1 <= 'B')
|
|
845 { /* designation of JISX0208.1978, GB2312.1980,
|
|
846 or JISX0208.1980 */
|
|
847 DECODE_DESIGNATION (0, 2, 94, c1);
|
|
848 }
|
|
849 else if (c1 >= 0x28 && c1 <= 0x2B)
|
|
850 { /* designation of DIMENSION2_CHARS94 character set */
|
|
851 ONE_MORE_BYTE (c2);
|
|
852 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
|
|
853 }
|
|
854 else if (c1 >= 0x2C && c1 <= 0x2F)
|
|
855 { /* designation of DIMENSION2_CHARS96 character set */
|
|
856 ONE_MORE_BYTE (c2);
|
|
857 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
|
|
858 }
|
|
859 else
|
|
860 {
|
|
861 goto label_invalid_escape_sequence;
|
|
862 }
|
|
863 break;
|
|
864
|
|
865 case 'n': /* invocation of locking-shift-2 */
|
|
866 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
|
|
867 break;
|
|
868
|
|
869 case 'o': /* invocation of locking-shift-3 */
|
|
870 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
|
|
871 break;
|
|
872
|
|
873 case 'N': /* invocation of single-shift-2 */
|
|
874 ONE_MORE_BYTE (c1);
|
|
875 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
|
|
876 DECODE_ISO_CHARACTER (charset, c1);
|
|
877 break;
|
|
878
|
|
879 case 'O': /* invocation of single-shift-3 */
|
|
880 ONE_MORE_BYTE (c1);
|
|
881 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
|
|
882 DECODE_ISO_CHARACTER (charset, c1);
|
|
883 break;
|
|
884
|
|
885 case '0': /* start composing without embedded rules */
|
|
886 coding->composing = COMPOSING_NO_RULE_HEAD;
|
|
887 break;
|
|
888
|
|
889 case '1': /* end composing */
|
|
890 coding->composing = COMPOSING_NO;
|
|
891 break;
|
|
892
|
|
893 case '2': /* start composing with embedded rules */
|
|
894 coding->composing = COMPOSING_WITH_RULE_HEAD;
|
|
895 break;
|
|
896
|
|
897 case '[': /* specification of direction */
|
|
898 /* For the moment, nested direction is not supported.
|
|
899 So, the value of `coding->direction' is 0 or 1: 0
|
|
900 means left-to-right, 1 means right-to-left. */
|
|
901 ONE_MORE_BYTE (c1);
|
|
902 switch (c1)
|
|
903 {
|
|
904 case ']': /* end of the current direction */
|
|
905 coding->direction = 0;
|
|
906
|
|
907 case '0': /* end of the current direction */
|
|
908 case '1': /* start of left-to-right direction */
|
|
909 ONE_MORE_BYTE (c1);
|
|
910 if (c1 == ']')
|
|
911 coding->direction = 0;
|
|
912 else
|
|
913 goto label_invalid_escape_sequence;
|
|
914 break;
|
|
915
|
|
916 case '2': /* start of right-to-left direction */
|
|
917 ONE_MORE_BYTE (c1);
|
|
918 if (c1 == ']')
|
|
919 coding->direction= 1;
|
|
920 else
|
|
921 goto label_invalid_escape_sequence;
|
|
922 break;
|
|
923
|
|
924 default:
|
|
925 goto label_invalid_escape_sequence;
|
|
926 }
|
|
927 break;
|
|
928
|
|
929 default:
|
|
930 if (c1 >= 0x28 && c1 <= 0x2B)
|
|
931 { /* designation of DIMENSION1_CHARS94 character set */
|
|
932 ONE_MORE_BYTE (c2);
|
|
933 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
|
|
934 }
|
|
935 else if (c1 >= 0x2C && c1 <= 0x2F)
|
|
936 { /* designation of DIMENSION1_CHARS96 character set */
|
|
937 ONE_MORE_BYTE (c2);
|
|
938 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
|
|
939 }
|
|
940 else
|
|
941 {
|
|
942 goto label_invalid_escape_sequence;
|
|
943 }
|
|
944 }
|
|
945 /* We must update these variables now. */
|
|
946 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
|
|
947 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
|
|
948 break;
|
|
949
|
|
950 label_invalid_escape_sequence:
|
|
951 {
|
|
952 int length = src - src_base;
|
|
953
|
|
954 bcopy (src_base, dst, length);
|
|
955 dst += length;
|
|
956 }
|
|
957 }
|
|
958 continue;
|
|
959
|
|
960 label_end_of_loop:
|
|
961 coding->carryover_size = src - src_base;
|
|
962 bcopy (src_base, coding->carryover, coding->carryover_size);
|
|
963 src = src_base;
|
|
964 break;
|
|
965 }
|
|
966
|
|
967 /* If this is the last block of the text to be decoded, we had
|
|
968 better just flush out all remaining codes in the text although
|
|
969 they are not valid characters. */
|
|
970 if (coding->last_block)
|
|
971 {
|
|
972 bcopy (src, dst, src_end - src);
|
|
973 dst += (src_end - src);
|
|
974 src = src_end;
|
|
975 }
|
|
976 *consumed = src - source;
|
|
977 return dst - destination;
|
|
978 }
|
|
979
|
|
980 /* ISO2022 encoding stuff. */
|
|
981
|
|
982 /*
|
|
983 It is not enough to say just "ISO2022" on encoding, but we have to
|
|
984 specify more details. In Emacs, each coding-system of ISO2022
|
|
985 variant has the following specifications:
|
|
986 1. Initial designation to G0 thru G3.
|
|
987 2. Allows short-form designation?
|
|
988 3. ASCII should be designated to G0 before control characters?
|
|
989 4. ASCII should be designated to G0 at end of line?
|
|
990 5. 7-bit environment or 8-bit environment?
|
|
991 6. Use locking-shift?
|
|
992 7. Use Single-shift?
|
|
993 And the following two are only for Japanese:
|
|
994 8. Use ASCII in place of JIS0201-1976-Roman?
|
|
995 9. Use JISX0208-1983 in place of JISX0208-1978?
|
|
996 These specifications are encoded in `coding->flags' as flag bits
|
|
997 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
|
|
998 detail.
|
|
999 */
|
|
1000
|
|
/* Emit at DST the escape sequence designating CHARSET to graphic
   register REG, and record the designation in CODING.

   If CHARSET has an entry in `Vcharset_revision_alist', the revision
   sequence ESC '&' <revision + '@'> is emitted first.  The
   designation itself is ESC, '$' for a charset whose dimension is
   not 1, an intermediate character selected by REG, and the
   <final-char> of CHARSET.  The intermediate character is omitted
   (short form) only for a 94-character charset of dimension other
   than 1 designated to register 0 whose <final-char> is '@', 'A', or
   'B', and only when CODING has CODING_FLAG_ISO_SHORT_FORM set.

   `dst' must be in scope at the point of use; it is advanced past
   the bytes written.  The arguments are evaluated more than once
   (presumably harmless, as callers pass plain accessors -- verify
   before passing anything with side effects).  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
    /* Intermediate characters, indexed by graphic register.  */        \
    char *intermediates                                                 \
      = (CHARSET_CHARS (charset) == 94 ? "()*+" : ",-./");              \
    Lisp_Object revision                                                \
      = Fassq (make_number (charset), Vcharset_revision_alist);         \
    if (! NILP (revision))                                              \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = '&';                                                   \
        *dst++ = XINT (XCONS (revision)->cdr) + '@';                    \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    if (CHARSET_DIMENSION (charset) == 1)                               \
      *dst++ = (unsigned char) (intermediates[reg]);                    \
    else                                                                \
      {                                                                 \
        *dst++ = '$';                                                   \
        /* Skip the intermediate character only in short form.  */      \
        if (CHARSET_CHARS (charset) != 94                               \
            || ! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)           \
            || reg != 0                                                 \
            || final_char < '@' || final_char > 'B')                    \
          *dst++ = (unsigned char) (intermediates[reg]);                \
      }                                                                 \
    *dst++ = final_char;                                                \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
|
|
1043
|
|
/* The two macros below emit the code for the ISO2022 single-shift
   functions: single-shift-2 is ESC 'N' when CODING is flagged as a
   7-bit environment and ISO_CODE_SS2 otherwise; single-shift-3 is
   ESC 'O' or ISO_CODE_SS3 likewise.  Both then mark CODING as being
   in single-shifting state.  `coding' and `dst' must be in scope;
   `dst' is advanced past the bytes written.  */

#define ENCODE_SINGLE_SHIFT_2                           \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = 'N';                                   \
      }                                                 \
    else                                                \
      {                                                 \
        *dst++ = ISO_CODE_SS2;                          \
      }                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                           \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = 'O';                                   \
      }                                                 \
    else                                                \
      {                                                 \
        *dst++ = ISO_CODE_SS3;                          \
      }                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
  } while (0)
|
|
1065
|
|
/* The four macros below emit the code for the ISO2022 locking-shift
   functions and record in CODING which graphic register is now
   invoked to graphic plane 0: shift-in (ISO_CODE_SI, register 0),
   shift-out (ISO_CODE_SO, register 1), locking-shift-2 (ESC 'n',
   register 2), and locking-shift-3 (ESC 'o', register 3).  `coding'
   and `dst' must be in scope; `dst' is advanced past the bytes
   written.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    *dst++ = ISO_CODE_SI;                               \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    *dst++ = ISO_CODE_SO;                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
  } while (0)
|
|
1093
|
|
/* Emit the code for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.

   Unless a single-shift is pending, `encode_invocation_designation'
   is called repeatedly until CHARSET is the charset of graphic plane
   0 or 1, so any needed designation/invocation codes precede the
   character.  The byte is then written with the 8th bit cleared for
   plane 0 and set for plane 1.  After a single-shift, the 8th bit is
   cleared in a 7-bit environment and set otherwise, and the
   single-shifting state of CODING is reset.

   `coding' and `dst' must be in scope; `dst' is advanced.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    /* Designate and/or invoke CHARSET until it is usable.  */          \
    while (! CODING_SPEC_ISO_SINGLE_SHIFTING (coding)                   \
           && charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)      \
           && charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))     \
      dst = encode_invocation_designation (charset, coding, dst);       \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          *dst++ = c1 & 0x7F;                                           \
        else                                                            \
          *dst++ = c1 | 0x80;                                           \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
      }                                                                 \
    else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
      *dst++ = c1 & 0x7F;                                               \
    else                                                                \
      *dst++ = c1 | 0x80;                                               \
  } while (0)
|
|
1127
|
|
/* Emit the codes for a DIMENSION2 character whose character set is
   CHARSET and whose position-codes are C1 and C2.

   Unless a single-shift is pending, `encode_invocation_designation'
   is called repeatedly until CHARSET is the charset of graphic plane
   0 or 1, so any needed designation/invocation codes precede the
   character.  Both bytes are then written with the 8th bit cleared
   for plane 0 and set for plane 1.  After a single-shift, the 8th
   bits are cleared in a 7-bit environment and set otherwise, and the
   single-shifting state of CODING is reset.

   `coding' and `dst' must be in scope; `dst' is advanced.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    /* Designate and/or invoke CHARSET until it is usable.  */          \
    while (! CODING_SPEC_ISO_SINGLE_SHIFTING (coding)                   \
           && charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)      \
           && charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))     \
      dst = encode_invocation_designation (charset, coding, dst);       \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = c1 & 0x7F;                                         \
            *dst++ = c2 & 0x7F;                                         \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = c1 | 0x80;                                         \
            *dst++ = c2 | 0x80;                                         \
          }                                                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
      }                                                                 \
    else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        *dst++ = c2 & 0x7F;                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        *dst++ = c2 | 0x80;                                             \
      }                                                                 \
  } while (0)
|
|
1160
|
|
1161 /* Produce designation and invocation codes at a place pointed by DST
|
|
1162 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
|
|
1163 Return new DST. */
|
|
1164
|
|
1165 unsigned char *
|
|
1166 encode_invocation_designation (charset, coding, dst)
|
|
1167 int charset;
|
|
1168 struct coding_system *coding;
|
|
1169 unsigned char *dst;
|
|
1170 {
|
|
1171 int reg; /* graphic register number */
|
|
1172
|
|
1173 /* At first, check designations. */
|
|
1174 for (reg = 0; reg < 4; reg++)
|
|
1175 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
|
|
1176 break;
|
|
1177
|
|
1178 if (reg >= 4)
|
|
1179 {
|
|
1180 /* CHARSET is not yet designated to any graphic registers. */
|
|
1181 /* At first check the requested designation. */
|
|
1182 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
|
|
1183 if (reg < 0)
|
|
1184 /* Since CHARSET requests no special designation, designate to
|
|
1185 graphic register 0. */
|
|
1186 reg = 0;
|
|
1187
|
|
1188 ENCODE_DESIGNATION (charset, reg, coding);
|
|
1189 }
|
|
1190
|
|
1191 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
|
|
1192 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
|
|
1193 {
|
|
1194 /* Since the graphic register REG is not invoked to any graphic
|
|
1195 planes, invoke it to graphic plane 0. */
|
|
1196 switch (reg)
|
|
1197 {
|
|
1198 case 0: /* graphic register 0 */
|
|
1199 ENCODE_SHIFT_IN;
|
|
1200 break;
|
|
1201
|
|
1202 case 1: /* graphic register 1 */
|
|
1203 ENCODE_SHIFT_OUT;
|
|
1204 break;
|
|
1205
|
|
1206 case 2: /* graphic register 2 */
|
|
1207 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
|
|
1208 ENCODE_SINGLE_SHIFT_2;
|
|
1209 else
|
|
1210 ENCODE_LOCKING_SHIFT_2;
|
|
1211 break;
|
|
1212
|
|
1213 case 3: /* graphic register 3 */
|
|
1214 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
|
|
1215 ENCODE_SINGLE_SHIFT_3;
|
|
1216 else
|
|
1217 ENCODE_LOCKING_SHIFT_3;
|
|
1218 break;
|
|
1219 }
|
|
1220 }
|
|
1221 return dst;
|
|
1222 }
|
|
1223
|
|
/* The following three macros produce the escape sequences that mark
   composition: ESC '0' starts composing without embedded rules,
   ESC '2' starts composing with embedded rules, and ESC '1' ends
   composing.  Each expands to a parenthesized comma expression, so
   it is safe as an operand as well as a statement.  `dst' must be in
   scope and is advanced by two bytes.  */
#define ENCODE_COMPOSITION_NO_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END \
  (*dst++ = ISO_CODE_ESC, *dst++ = '1')
|
|
1228
|
|
/* The following three macros produce codes for indicating direction
   of text.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER emits ESC '[' when CODING has
   the CODING_FLAG_ISO_SEVEN_BITS bit set, and ISO_CODE_CSI otherwise.
   The flag is tested as a bit, like every other use of it in this
   file; comparing the whole flag word for equality would miss it
   whenever any other flag is also set.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R emit the introducer
   followed by "2]" and "0]" respectively.  They are statement macros:
   the introducer is a `do ... while (0)' block and therefore cannot
   be an operand of the comma operator.

   `coding' and `dst' must be in scope; `dst' is advanced.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = '[';                                   \
      }                                                 \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

#define ENCODE_DIRECTION_R2L                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '2';                                       \
    *dst++ = ']';                                       \
  } while (0)

#define ENCODE_DIRECTION_L2R                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '0';                                       \
    *dst++ = ']';                                       \
  } while (0)
|
|
1244
|
|
/* Produce the codes that bring the graphic planes and registers of
   CODING back to their initial state.

   A shift-in is emitted if graphic plane 0 does not currently invoke
   register 0.  Then, for each of the four graphic registers: if it
   has an initial designation that differs from the current one, the
   initial charset is designated again; if it has no initial
   designation (a negative value), the current designation is
   forgotten (set to -1), but only when EOL is nonzero.

   `coding' and `dst' must be in scope; `dst' is advanced.  */
#define ENCODE_RESET_PLANE_AND_REGISTER(eol)                            \
  do {                                                                  \
    int reg = 0;                                                        \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                    \
      ENCODE_SHIFT_IN;                                                  \
    while (reg < 4)                                                     \
      {                                                                 \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0)     \
          {                                                             \
            if (CODING_SPEC_ISO_DESIGNATION (coding, reg)               \
                != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))   \
              ENCODE_DESIGNATION                                        \
                (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg),     \
                 reg, coding);                                          \
          }                                                             \
        else if (eol)                                                   \
          CODING_SPEC_ISO_DESIGNATION (coding, reg) = -1;               \
        reg++;                                                          \
      }                                                                 \
  } while (0)
|
|
1264
|
|
1265 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
|
|
1266
|
|
1267 int
|
|
1268 encode_coding_iso2022 (coding, source, destination,
|
|
1269 src_bytes, dst_bytes, consumed)
|
|
1270 struct coding_system *coding;
|
|
1271 unsigned char *source, *destination;
|
|
1272 int src_bytes, dst_bytes;
|
|
1273 int *consumed;
|
|
1274 {
|
|
1275 unsigned char *src = source;
|
|
1276 unsigned char *src_end = source + src_bytes;
|
|
1277 unsigned char *dst = destination;
|
|
1278 unsigned char *dst_end = destination + dst_bytes;
|
|
1279 /* Since the maximum bytes produced by each loop is 6, we subtract 5
|
|
1280 from DST_END to assure overflow checking is necessary only at the
|
|
1281 head of loop. */
|
|
1282 unsigned char *adjusted_dst_end = dst_end - 5;
|
|
1283
|
|
1284 while (src < src_end && dst < adjusted_dst_end)
|
|
1285 {
|
|
1286 /* SRC_BASE remembers the start position in source in each loop.
|
|
1287 The loop will be exited when there's not enough source text
|
|
1288 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
|
|
1289 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
|
|
1290 reset to SRC_BASE before exiting. */
|
|
1291 unsigned char *src_base = src;
|
|
1292 unsigned char c1 = *src++, c2, c3, c4;
|
|
1293 int charset;
|
|
1294
|
|
1295 /* If we are seeing a component of a composite character, we are
|
|
1296 seeing a leading-code specially encoded for composition, or a
|
|
1297 composition rule if composing with rule. We must set C1
|
|
1298 to a normal leading-code or an ASCII code. If we are not at
|
|
1299 a composed character, we must reset the composition state. */
|
|
1300 if (COMPOSING_P (coding->composing))
|
|
1301 {
|
|
1302 if (c1 < 0xA0)
|
|
1303 {
|
|
1304 /* We are not in a composite character any longer. */
|
|
1305 coding->composing = COMPOSING_NO;
|
|
1306 ENCODE_COMPOSITION_END;
|
|
1307 }
|
|
1308 else
|
|
1309 {
|
|
1310 if (coding->composing == COMPOSING_WITH_RULE_RULE)
|
|
1311 {
|
|
1312 *dst++ = c1 & 0x7F;
|
|
1313 coding->composing = COMPOSING_WITH_RULE_HEAD;
|
|
1314 continue;
|
|
1315 }
|
|
1316 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
|
|
1317 coding->composing = COMPOSING_WITH_RULE_RULE;
|
|
1318 if (c1 == 0xA0)
|
|
1319 {
|
|
1320 /* This is an ASCII component. */
|
|
1321 ONE_MORE_BYTE (c1);
|
|
1322 c1 &= 0x7F;
|
|
1323 }
|
|
1324 else
|
|
1325 /* This is a leading-code of non ASCII component. */
|
|
1326 c1 -= 0x20;
|
|
1327 }
|
|
1328 }
|
|
1329
|
|
1330 /* Now encode one character. C1 is a control character, an
|
|
1331 ASCII character, or a leading-code of multi-byte character. */
|
|
1332 switch (emacs_code_class[c1])
|
|
1333 {
|
|
1334 case EMACS_ascii_code:
|
|
1335 ENCODE_ISO_CHARACTER_DIMENSION1 (CHARSET_ASCII, c1);
|
|
1336 break;
|
|
1337
|
|
1338 case EMACS_control_code:
|
|
1339 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
|
|
1340 ENCODE_RESET_PLANE_AND_REGISTER (0);
|
|
1341 *dst++ = c1;
|
|
1342 break;
|
|
1343
|
|
1344 case EMACS_carriage_return_code:
|
|
1345 if (!coding->selective)
|
|
1346 {
|
|
1347 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
|
|
1348 ENCODE_RESET_PLANE_AND_REGISTER (0);
|
|
1349 *dst++ = c1;
|
|
1350 break;
|
|
1351 }
|
|
1352 /* fall down to treat '\r' as '\n' ... */
|
|
1353
|
|
1354 case EMACS_linefeed_code:
|
|
1355 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
|
|
1356 ENCODE_RESET_PLANE_AND_REGISTER (1);
|
|
1357 if (coding->eol_type == CODING_EOL_LF
|
|
1358 || coding->eol_type == CODING_EOL_AUTOMATIC)
|
|
1359 *dst++ = ISO_CODE_LF;
|
|
1360 else if (coding->eol_type == CODING_EOL_CRLF)
|
|
1361 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
|
|
1362 else
|
|
1363 *dst++ = ISO_CODE_CR;
|
|
1364 break;
|
|
1365
|
|
1366 case EMACS_leading_code_2:
|
|
1367 ONE_MORE_BYTE (c2);
|
|
1368 ENCODE_ISO_CHARACTER_DIMENSION1 (c1, c2);
|
|
1369 break;
|
|
1370
|
|
1371 case EMACS_leading_code_3:
|
|
1372 TWO_MORE_BYTES (c2, c3);
|
|
1373 if (c1 < LEADING_CODE_PRIVATE_11)
|
|
1374 ENCODE_ISO_CHARACTER_DIMENSION2 (c1, c2, c3);
|
|
1375 else
|
|
1376 ENCODE_ISO_CHARACTER_DIMENSION1 (c2, c3);
|
|
1377 break;
|
|
1378
|
|
1379 case EMACS_leading_code_4:
|
|
1380 THREE_MORE_BYTES (c2, c3, c4);
|
|
1381 ENCODE_ISO_CHARACTER_DIMENSION2 (c2, c3, c4);
|
|
1382 break;
|
|
1383
|
|
1384 case EMACS_leading_code_composition:
|
|
1385 ONE_MORE_BYTE (c1);
|
|
1386 if (c1 == 0xFF)
|
|
1387 {
|
|
1388 coding->composing = COMPOSING_WITH_RULE_HEAD;
|
|
1389 ENCODE_COMPOSITION_WITH_RULE_START;
|
|
1390 }
|
|
1391 else
|
|
1392 {
|
|
1393 /* Rewind one byte because it is a character code of
|
|
1394 composition elements. */
|
|
1395 src--;
|
|
1396 coding->composing = COMPOSING_NO_RULE_HEAD;
|
|
1397 ENCODE_COMPOSITION_NO_RULE_START;
|
|
1398 }
|
|
1399 break;
|
|
1400
|
|
1401 case EMACS_invalid_code:
|
|
1402 *dst++ = c1;
|
|
1403 break;
|
|
1404 }
|
|
1405 continue;
|
|
1406 label_end_of_loop:
|
|
1407 coding->carryover_size = src - src_base;
|
|
1408 bcopy (src_base, coding->carryover, coding->carryover_size);
|
|
1409 src = src_base;
|
|
1410 break;
|
|
1411 }
|
|
1412
|
|
1413 /* If this is the last block of the text to be encoded, we must
|
|
1414 reset the state of graphic planes and registers to initial one.
|
|
1415 In addition, we had better just flush out all remaining codes in
|
|
1416 the text although they are not valid characters. */
|
|
1417 if (coding->last_block)
|
|
1418 {
|
|
1419 ENCODE_RESET_PLANE_AND_REGISTER (1);
|
|
1420 bcopy(src, dst, src_end - src);
|
|
1421 dst += (src_end - src);
|
|
1422 src = src_end;
|
|
1423 }
|
|
1424 *consumed = src - source;
|
|
1425 return dst - destination;
|
|
1426 }
|
|
1427
|
|
1428
|
|
1429 /*** 4. SJIS and BIG5 handlers ***/
|
|
1430
|
|
1431 /* Although SJIS and BIG5 are not ISO's coding system, They are used
|
|
1432 quite widely. So, for the moment, Emacs supports them in the bare
|
|
1433 C code. But, in the future, they may be supported only by CCL. */
|
|
1434
|
|
1435 /* SJIS is a coding system encoding three character sets: ASCII, right
|
|
1436 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
|
|
1437 as is. A character of charset katakana-jisx0201 is encoded by
|
|
1438 "position-code + 0x80". A character of charset japanese-jisx0208
|
|
1439 is encoded in 2-byte but two position-codes are divided and shifted
|
|
1440 so that it fit in the range below.
|
|
1441
|
|
1442 --- CODE RANGE of SJIS ---
|
|
1443 (character set) (range)
|
|
1444 ASCII 0x00 .. 0x7F
|
|
1445 KATAKANA-JISX0201 0xA0 .. 0xDF
|
|
1446 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
|
|
1447 (2nd byte) 0x40 .. 0xFF
|
|
1448 -------------------------------
|
|
1449
|
|
1450 */
|
|
1451
|
|
1452 /* BIG5 is a coding system encoding two character sets: ASCII and
|
|
1453 Big5. An ASCII character is encoded as is. Big5 is a two-byte
|
|
1454 character set and is encoded in two-byte.
|
|
1455
|
|
1456 --- CODE RANGE of BIG5 ---
|
|
1457 (character set) (range)
|
|
1458 ASCII 0x00 .. 0x7F
|
|
1459 Big5 (1st byte) 0xA1 .. 0xFE
|
|
1460 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
|
|
1461 --------------------------
|
|
1462
|
|
1463 Since the number of characters in Big5 is larger than maximum
|
|
1464 characters in Emacs' charset (96x96), it can't be handled as one
|
|
1465 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
|
|
1466 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
|
|
1467 contains frequently used characters and the latter contains less
|
|
1468 frequently used characters. */
|
|
1469
|
|
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Both macros turn a code into a linear index and back.  A 2nd byte
   below 0x7F is offset by 0x40; any other 2nd byte is offset by 0x62
   (= 0xA1 - 0x3F), which places it right after the 0x3F codes of the
   low range.  A 1st byte of 0xC9 or above belongs to
   `charset_big5_2', whose index restarts from zero at 0xC9.

   Every argument is parenthesized in the expansions, so expressions
   may be passed, but arguments are evaluated more than once and must
   not have side effects.  CHARSET, C1 and C2 of DECODE_BIG5, and B1
   and B2 of ENCODE_BIG5, must be lvalues.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
|
|
1502
|
|
1503 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
|
|
1504 Check if a text is encoded in SJIS. If it is, return
|
|
1505 CODING_CATEGORY_MASK_SJIS, else return 0. */
|
|
1506
|
|
1507 int
|
|
1508 detect_coding_sjis (src, src_end)
|
|
1509 unsigned char *src, *src_end;
|
|
1510 {
|
|
1511 unsigned char c;
|
|
1512
|
|
1513 while (src < src_end)
|
|
1514 {
|
|
1515 c = *src++;
|
|
1516 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
|
|
1517 return 0;
|
|
1518 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
|
|
1519 {
|
|
1520 if (src < src_end && *src++ < 0x40)
|
|
1521 return 0;
|
|
1522 }
|
|
1523 }
|
|
1524 return CODING_CATEGORY_MASK_SJIS;
|
|
1525 }
|
|
1526
|
|
1527 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
|
|
1528 Check if a text is encoded in BIG5. If it is, return
|
|
1529 CODING_CATEGORY_MASK_BIG5, else return 0. */
|
|
1530
|
|
1531 int
|
|
1532 detect_coding_big5 (src, src_end)
|
|
1533 unsigned char *src, *src_end;
|
|
1534 {
|
|
1535 unsigned char c;
|
|
1536
|
|
1537 while (src < src_end)
|
|
1538 {
|
|
1539 c = *src++;
|
|
1540 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
|
|
1541 return 0;
|
|
1542 if (c >= 0xA1)
|
|
1543 {
|
|
1544 if (src >= src_end)
|
|
1545 break;
|
|
1546 c = *src++;
|
|
1547 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
|
|
1548 return 0;
|
|
1549 }
|
|
1550 }
|
|
1551 return CODING_CATEGORY_MASK_BIG5;
|
|
1552 }
|
|
1553
|
|
1554 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
|
|
1555 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
|
|
1556
|
|
1557 int
|
|
1558 decode_coding_sjis_big5 (coding, source, destination,
|
|
1559 src_bytes, dst_bytes, consumed, sjis_p)
|
|
1560 struct coding_system *coding;
|
|
1561 unsigned char *source, *destination;
|
|
1562 int src_bytes, dst_bytes;
|
|
1563 int *consumed;
|
|
1564 int sjis_p;
|
|
1565 {
|
|
1566 unsigned char *src = source;
|
|
1567 unsigned char *src_end = source + src_bytes;
|
|
1568 unsigned char *dst = destination;
|
|
1569 unsigned char *dst_end = destination + dst_bytes;
|
|
1570 /* Since the maximum bytes produced by each loop is 4, we subtract 3
|
|
1571 from DST_END to assure overflow checking is necessary only at the
|
|
1572 head of loop. */
|
|
1573 unsigned char *adjusted_dst_end = dst_end - 3;
|
|
1574
|
|
1575 while (src < src_end && dst < adjusted_dst_end)
|
|
1576 {
|
|
1577 /* SRC_BASE remembers the start position in source in each loop.
|
|
1578 The loop will be exited when there's not enough source text
|
|
1579 to analyze two-byte character (within macro ONE_MORE_BYTE).
|
|
1580 In that case, SRC is reset to SRC_BASE before exiting. */
|
|
1581 unsigned char *src_base = src;
|
|
1582 unsigned char c1 = *src++, c2, c3, c4;
|
|
1583
|
|
1584 if (c1 == '\r')
|
|
1585 {
|
|
1586 if (coding->eol_type == CODING_EOL_CRLF)
|
|
1587 {
|
|
1588 ONE_MORE_BYTE (c2);
|
|
1589 if (c2 == '\n')
|
|
1590 *dst++ = c2;
|
|
1591 else
|
|
1592 /* To process C2 again, SRC is subtracted by 1. */
|
|
1593 *dst++ = c1, src--;
|
|
1594 }
|
|
1595 else
|
|
1596 *dst++ = c1;
|
|
1597 }
|
|
1598 else if (c1 < 0x80)
|
|
1599 *dst++ = c1;
|
|
1600 else if (c1 < 0xA0 || c1 >= 0xE0)
|
|
1601 {
|
|
1602 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
|
|
1603 if (sjis_p)
|
|
1604 {
|
|
1605 ONE_MORE_BYTE (c2);
|
|
1606 DECODE_SJIS (c1, c2, c3, c4);
|
|
1607 DECODE_CHARACTER_DIMENSION2 (charset_jisx0208, c3, c4);
|
|
1608 }
|
|
1609 else if (c1 >= 0xE0 && c1 < 0xFF)
|
|
1610 {
|
|
1611 int charset;
|
|
1612
|
|
1613 ONE_MORE_BYTE (c2);
|
|
1614 DECODE_BIG5 (c1, c2, charset, c3, c4);
|
|
1615 DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
|
|
1616 }
|
|
1617 else /* Invalid code */
|
|
1618 *dst++ = c1;
|
|
1619 }
|
|
1620 else
|
|
1621 {
|
|
1622 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
|
|
1623 if (sjis_p)
|
|
1624 DECODE_CHARACTER_DIMENSION1 (charset_katakana_jisx0201, c1);
|
|
1625 else
|
|
1626 {
|
|
1627 int charset;
|
|
1628
|
|
1629 ONE_MORE_BYTE (c2);
|
|
1630 DECODE_BIG5 (c1, c2, charset, c3, c4);
|
|
1631 DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
|
|
1632 }
|
|
1633 }
|
|
1634 continue;
|
|
1635
|
|
1636 label_end_of_loop:
|
|
1637 coding->carryover_size = src - src_base;
|
|
1638 bcopy (src_base, coding->carryover, coding->carryover_size);
|
|
1639 src = src_base;
|
|
1640 break;
|
|
1641 }
|
|
1642
|
|
1643 *consumed = src - source;
|
|
1644 return dst - destination;
|
|
1645 }
|
|
1646
|
|
1647 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
|
|
1648 This function can encode `charset_ascii', `charset_katakana_jisx0201',
|
|
1649 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
|
|
1650 sure that all these charsets are registered as official charset
|
|
1651 (i.e. do not have extended leading-codes). Characters of other
|
|
1652 charsets are produced without any encoding. If SJIS_P is 1, encode
|
|
1653 SJIS text, else encode BIG5 text. */
|
|
1654
|
|
1655 int
|
|
1656 encode_coding_sjis_big5 (coding, source, destination,
|
|
1657 src_bytes, dst_bytes, consumed, sjis_p)
|
|
1658 struct coding_system *coding;
|
|
1659 unsigned char *source, *destination;
|
|
1660 int src_bytes, dst_bytes;
|
|
1661 int *consumed;
|
|
1662 int sjis_p;
|
|
1663 {
|
|
1664 unsigned char *src = source;
|
|
1665 unsigned char *src_end = source + src_bytes;
|
|
1666 unsigned char *dst = destination;
|
|
1667 unsigned char *dst_end = destination + dst_bytes;
|
|
1668 /* Since the maximum bytes produced by each loop is 2, we subtract 1
|
|
1669 from DST_END to assure overflow checking is necessary only at the
|
|
1670 head of loop. */
|
|
1671 unsigned char *adjusted_dst_end = dst_end - 1;
|
|
1672
|
|
1673 while (src < src_end && dst < adjusted_dst_end)
|
|
1674 {
|
|
1675 /* SRC_BASE remembers the start position in source in each loop.
|
|
1676 The loop will be exited when there's not enough source text
|
|
1677 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
|
|
1678 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
|
|
1679 before exiting. */
|
|
1680 unsigned char *src_base = src;
|
|
1681 unsigned char c1 = *src++, c2, c3, c4;
|
|
1682
|
|
1683 if (coding->composing)
|
|
1684 {
|
|
1685 if (c1 == 0xA0)
|
|
1686 {
|
|
1687 ONE_MORE_BYTE (c1);
|
|
1688 c1 &= 0x7F;
|
|
1689 }
|
|
1690 else if (c1 >= 0xA0)
|
|
1691 c1 -= 0x20;
|
|
1692 else
|
|
1693 coding->composing = 0;
|
|
1694 }
|
|
1695
|
|
1696 switch (emacs_code_class[c1])
|
|
1697 {
|
|
1698 case EMACS_ascii_code:
|
|
1699 case EMACS_control_code:
|
|
1700 *dst++ = c1;
|
|
1701 break;
|
|
1702
|
|
1703 case EMACS_carriage_return_code:
|
|
1704 if (!coding->selective)
|
|
1705 {
|
|
1706 *dst++ = c1;
|
|
1707 break;
|
|
1708 }
|
|
1709 /* fall down to treat '\r' as '\n' ... */
|
|
1710
|
|
1711 case EMACS_linefeed_code:
|
|
1712 if (coding->eol_type == CODING_EOL_LF
|
|
1713 || coding->eol_type == CODING_EOL_AUTOMATIC)
|
|
1714 *dst++ = '\n';
|
|
1715 else if (coding->eol_type == CODING_EOL_CRLF)
|
|
1716 *dst++ = '\r', *dst++ = '\n';
|
|
1717 else
|
|
1718 *dst++ = '\r';
|
|
1719 break;
|
|
1720
|
|
1721 case EMACS_leading_code_2:
|
|
1722 ONE_MORE_BYTE (c2);
|
|
1723 if (sjis_p && c1 == charset_katakana_jisx0201)
|
|
1724 *dst++ = c2;
|
|
1725 else
|
|
1726 *dst++ = c1, *dst++ = c2;
|
|
1727 break;
|
|
1728
|
|
1729 case EMACS_leading_code_3:
|
|
1730 TWO_MORE_BYTES (c2, c3);
|
|
1731 c2 &= 0x7F, c3 &= 0x7F;
|
|
1732 if (sjis_p && c1 == charset_jisx0208)
|
|
1733 {
|
|
1734 unsigned char s1, s2;
|
|
1735
|
|
1736 ENCODE_SJIS (c2, c3, s1, s2);
|
|
1737 *dst++ = s1, *dst++ = s2;
|
|
1738 }
|
|
1739 else if (!sjis_p && (c1 == charset_big5_1 || c1 == charset_big5_2))
|
|
1740 {
|
|
1741 unsigned char b1, b2;
|
|
1742
|
|
1743 ENCODE_BIG5 (c1, c2, c3, b1, b2);
|
|
1744 *dst++ = b1, *dst++ = b2;
|
|
1745 }
|
|
1746 else
|
|
1747 *dst++ = c1, *dst++ = c2, *dst++ = c3;
|
|
1748 break;
|
|
1749
|
|
1750 case EMACS_leading_code_4:
|
|
1751 THREE_MORE_BYTES (c2, c3, c4);
|
|
1752 *dst++ = c1, *dst++ = c2, *dst++ = c3, *dst++ = c4;
|
|
1753 break;
|
|
1754
|
|
1755 case EMACS_leading_code_composition:
|
|
1756 coding->composing = 1;
|
|
1757 break;
|
|
1758
|
|
1759 default: /* i.e. case EMACS_invalid_code: */
|
|
1760 *dst++ = c1;
|
|
1761 }
|
|
1762 continue;
|
|
1763
|
|
1764 label_end_of_loop:
|
|
1765 coding->carryover_size = src - src_base;
|
|
1766 bcopy (src_base, coding->carryover, coding->carryover_size);
|
|
1767 src = src_base;
|
|
1768 break;
|
|
1769 }
|
|
1770
|
|
1771 *consumed = src - source;
|
|
1772 return dst - destination;
|
|
1773 }
|
|
1774
|
|
1775
|
|
1776 /*** 5. End-of-line handlers ***/
|
|
1777
|
|
1778 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
|
|
1779 This function is called only when `coding->eol_type' is
|
|
1780 CODING_EOL_CRLF or CODING_EOL_CR. */
|
|
1781
|
|
1782 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
|
|
1783 struct coding_system *coding;
|
|
1784 unsigned char *source, *destination;
|
|
1785 int src_bytes, dst_bytes;
|
|
1786 int *consumed;
|
|
1787 {
|
|
1788 unsigned char *src = source;
|
|
1789 unsigned char *src_end = source + src_bytes;
|
|
1790 unsigned char *dst = destination;
|
|
1791 unsigned char *dst_end = destination + dst_bytes;
|
|
1792 int produced;
|
|
1793
|
|
1794 switch (coding->eol_type)
|
|
1795 {
|
|
1796 case CODING_EOL_CRLF:
|
|
1797 {
|
|
1798 /* Since the maximum bytes produced by each loop is 2, we
|
|
1799 subtract 1 from DST_END to assure overflow checking is
|
|
1800 necessary only at the head of loop. */
|
|
1801 unsigned char *adjusted_dst_end = dst_end - 1;
|
|
1802
|
|
1803 while (src < src_end && dst < adjusted_dst_end)
|
|
1804 {
|
|
1805 unsigned char *src_base = src;
|
|
1806 unsigned char c = *src++;
|
|
1807 if (c == '\r')
|
|
1808 {
|
|
1809 ONE_MORE_BYTE (c);
|
|
1810 if (c != '\n')
|
|
1811 *dst++ = '\r';
|
|
1812
|
|
1813 }
|
|
1814 else
|
|
1815 *dst++ = c;
|
|
1816 continue;
|
|
1817
|
|
1818 label_end_of_loop:
|
|
1819 coding->carryover_size = src - src_base;
|
|
1820 bcopy (src_base, coding->carryover, coding->carryover_size);
|
|
1821 src = src_base;
|
|
1822 break;
|
|
1823 }
|
|
1824 *consumed = src - source;
|
|
1825 produced = dst - destination;
|
|
1826 break;
|
|
1827 }
|
|
1828
|
|
1829 case CODING_EOL_CR:
|
|
1830 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
|
|
1831 bcopy (source, destination, produced);
|
|
1832 dst_end = destination + produced;
|
|
1833 while (dst < dst_end)
|
|
1834 if (*dst++ == '\r') dst[-1] = '\n';
|
|
1835 *consumed = produced;
|
|
1836 break;
|
|
1837
|
|
1838 default: /* i.e. case: CODING_EOL_LF */
|
|
1839 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
|
|
1840 bcopy (source, destination, produced);
|
|
1841 *consumed = produced;
|
|
1842 break;
|
|
1843 }
|
|
1844
|
|
1845 return produced;
|
|
1846 }
|
|
1847
|
|
1848 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
|
|
1849 format of end-of-line according to `coding->eol_type'. If
|
|
1850 `coding->selective' is 1, code '\r' in source text also means
|
|
1851 end-of-line. */
|
|
1852
|
|
1853 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
|
|
1854 struct coding_system *coding;
|
|
1855 unsigned char *source, *destination;
|
|
1856 int src_bytes, dst_bytes;
|
|
1857 int *consumed;
|
|
1858 {
|
|
1859 unsigned char *src = source;
|
|
1860 unsigned char *dst = destination;
|
|
1861 int produced;
|
|
1862
|
|
1863 if (src_bytes <= 0)
|
|
1864 return 0;
|
|
1865
|
|
1866 switch (coding->eol_type)
|
|
1867 {
|
|
1868 case CODING_EOL_LF:
|
|
1869 case CODING_EOL_AUTOMATIC:
|
|
1870 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
|
|
1871 bcopy (source, destination, produced);
|
|
1872 if (coding->selective)
|
|
1873 {
|
|
1874 int i = produced;
|
|
1875 while (i--)
|
|
1876 if (*dst++ == '\r') dst[-1] = '\n';
|
|
1877 }
|
|
1878 *consumed = produced;
|
|
1879
|
|
1880 case CODING_EOL_CRLF:
|
|
1881 {
|
|
1882 unsigned char c;
|
|
1883 unsigned char *src_end = source + src_bytes;
|
|
1884 unsigned char *dst_end = destination + dst_bytes;
|
|
1885 /* Since the maximum bytes produced by each loop is 2, we
|
|
1886 subtract 1 from DST_END to assure overflow checking is
|
|
1887 necessary only at the head of loop. */
|
|
1888 unsigned char *adjusted_dst_end = dst_end - 1;
|
|
1889
|
|
1890 while (src < src_end && dst < adjusted_dst_end)
|
|
1891 {
|
|
1892 c = *src++;
|
|
1893 if (c == '\n' || (c == '\r' && coding->selective))
|
|
1894 *dst++ = '\r', *dst++ = '\n';
|
|
1895 else
|
|
1896 *dst++ = c;
|
|
1897 }
|
|
1898 produced = dst - destination;
|
|
1899 *consumed = src - source;
|
|
1900 break;
|
|
1901 }
|
|
1902
|
|
1903 default: /* i.e. case CODING_EOL_CR: */
|
|
1904 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
|
|
1905 bcopy (source, destination, produced);
|
|
1906 {
|
|
1907 int i = produced;
|
|
1908 while (i--)
|
|
1909 if (*dst++ == '\n') dst[-1] = '\r';
|
|
1910 }
|
|
1911 *consumed = produced;
|
|
1912 }
|
|
1913
|
|
1914 return produced;
|
|
1915 }
|
|
1916
|
|
1917
|
|
1918 /*** 6. C library functions ***/
|
|
1919
|
|
1920 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
|
|
1921 has a property `coding-system'. The value of this property is a
|
|
1922 vector of length 5 (called as coding-vector). Among elements of
|
|
1923 this vector, the first (element[0]) and the fifth (element[4])
|
|
1924 carry important information for decoding/encoding. Before
|
|
1925 decoding/encoding, this information should be set in fields of a
|
|
1926 structure of type `coding_system'.
|
|
1927
|
|
1928 A value of property `coding-system' can be a symbol of another
|
|
1929 subsidiary coding-system. In that case, Emacs gets coding-vector
|
|
1930 from that symbol.
|
|
1931
|
|
1932 `element[0]' contains information to be set in `coding->type'. The
|
|
1933 value and its meaning is as follows:
|
|
1934
|
|
1935 0 -- coding_type_internal
|
|
1936 1 -- coding_type_sjis
|
|
1937 2 -- coding_type_iso2022
|
|
1938 3 -- coding_type_big5
|
|
1939 4 -- coding_type_ccl
|
|
1940 nil -- coding_type_no_conversion
|
|
1941 t -- coding_type_automatic
|
|
1942
|
|
1943 `element[4]' contains information to be set in `coding->flags' and
|
|
1944 `coding->spec'. The meaning varies by `coding->type'.
|
|
1945
|
|
1946 If `coding->type' is `coding_type_iso2022', element[4] is a vector
|
|
1947 of length 32 (of which the first 13 sub-elements are used now).
|
|
1948 Meanings of these sub-elements are:
|
|
1949
|
|
1950 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
|
|
1951 If the value is an integer of valid charset, the charset is
|
|
1952 assumed to be designated to graphic register N initially.
|
|
1953
|
|
1954 If the value is minus, it is a minus value of charset which
|
|
1955 reserves graphic register N, which means that the charset is
|
|
1956 not designated initially but should be designated to graphic
|
|
1957 register N just before encoding a character in that charset.
|
|
1958
|
|
1959 If the value is nil, graphic register N is never used on
|
|
1960 encoding.
|
|
1961
|
|
1962 sub-element[N] where N is 4 through 12: to be set in `coding->flags'
|
|
1963 Each value takes t or nil. See the section ISO2022 of
|
|
1964 `coding.h' for more information.
|
|
1965
|
|
1966 If `coding->type' is `coding_type_big5', element[4] is t to denote
|
|
1967 BIG5-ETen or nil to denote BIG5-HKU.
|
|
1968
|
|
1969 If `coding->type' takes the other value, element[4] is ignored.
|
|
1970
|
|
1971 Emacs Lisp's coding system also carries information about format of
|
|
1972 end-of-line in a value of property `eol-type'. If the value is
|
|
1973 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
|
|
1974 means CODING_EOL_CR. If it is not integer, it should be a vector
|
|
1975 of subsidiary coding systems of which property `eol-type' has one
|
|
1976 of above values.
|
|
1977
|
|
1978 */
|
|
1979
|
|
1980 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
|
|
1981 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
|
|
1982 is setup so that no conversion is necessary and return -1, else
|
|
1983 return 0. */
|
|
1984
|
|
1985 int
|
|
1986 setup_coding_system (coding_system_symbol, coding)
|
|
1987 Lisp_Object coding_system_symbol;
|
|
1988 struct coding_system *coding;
|
|
1989 {
|
|
1990 Lisp_Object coding_system_vector = Qnil;
|
|
1991 Lisp_Object type, eol_type;
|
|
1992
|
|
1993 /* At first, set several fields default values. */
|
|
1994 coding->require_flushing = 0;
|
|
1995 coding->last_block = 0;
|
|
1996 coding->selective = 0;
|
|
1997 coding->composing = 0;
|
|
1998 coding->direction = 0;
|
|
1999 coding->carryover_size = 0;
|
|
2000 coding->symbol = Qnil;
|
|
2001 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
|
|
2002
|
|
2003 /* Get value of property `coding-system'. If it is a Lisp symbol
|
|
2004 pointing another coding system, fetch its property until we get a
|
|
2005 vector. */
|
|
2006 while (!NILP (coding_system_symbol))
|
|
2007 {
|
|
2008 coding->symbol = coding_system_symbol;
|
|
2009 if (NILP (coding->post_read_conversion))
|
|
2010 coding->post_read_conversion = Fget (coding_system_symbol,
|
|
2011 Qpost_read_conversion);
|
|
2012 if (NILP (coding->pre_write_conversion))
|
|
2013 coding->pre_write_conversion = Fget (coding_system_symbol,
|
|
2014 Qpre_write_conversion);
|
|
2015
|
|
2016 coding_system_vector = Fget (coding_system_symbol, Qcoding_system);
|
|
2017 if (VECTORP (coding_system_vector))
|
|
2018 break;
|
|
2019 coding_system_symbol = coding_system_vector;
|
|
2020 }
|
|
2021 Vlast_coding_system_used = coding->symbol;
|
|
2022
|
|
2023 if (!VECTORP (coding_system_vector)
|
|
2024 || XVECTOR (coding_system_vector)->size != 5)
|
|
2025 goto label_invalid_coding_system;
|
|
2026
|
|
2027 /* Get value of property `eol-type' by searching from the root
|
|
2028 coding-system. */
|
|
2029 coding_system_symbol = coding->symbol;
|
|
2030 eol_type = Qnil;
|
|
2031 while (SYMBOLP (coding_system_symbol) && !NILP (coding_system_symbol))
|
|
2032 {
|
|
2033 eol_type = Fget (coding_system_symbol, Qeol_type);
|
|
2034 if (!NILP (eol_type))
|
|
2035 break;
|
|
2036 coding_system_symbol = Fget (coding_system_symbol, Qcoding_system);
|
|
2037 }
|
|
2038
|
|
2039 if (VECTORP (eol_type))
|
|
2040 coding->eol_type = CODING_EOL_AUTOMATIC;
|
|
2041 else if (XFASTINT (eol_type) == 1)
|
|
2042 coding->eol_type = CODING_EOL_CRLF;
|
|
2043 else if (XFASTINT (eol_type) == 2)
|
|
2044 coding->eol_type = CODING_EOL_CR;
|
|
2045 else
|
|
2046 coding->eol_type = CODING_EOL_LF;
|
|
2047
|
|
2048 type = XVECTOR (coding_system_vector)->contents[0];
|
|
2049 switch (XFASTINT (type))
|
|
2050 {
|
|
2051 case 0:
|
|
2052 coding->type = coding_type_internal;
|
|
2053 break;
|
|
2054
|
|
2055 case 1:
|
|
2056 coding->type = coding_type_sjis;
|
|
2057 break;
|
|
2058
|
|
2059 case 2:
|
|
2060 coding->type = coding_type_iso2022;
|
|
2061 {
|
|
2062 Lisp_Object val = XVECTOR (coding_system_vector)->contents[4];
|
|
2063 Lisp_Object *flags;
|
|
2064 int i, charset, default_reg_bits = 0;
|
|
2065
|
|
2066 if (!VECTORP (val) || XVECTOR (val)->size != 32)
|
|
2067 goto label_invalid_coding_system;
|
|
2068
|
|
2069 flags = XVECTOR (val)->contents;
|
|
2070 coding->flags
|
|
2071 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
|
|
2072 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
|
|
2073 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
|
|
2074 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
|
|
2075 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
|
|
2076 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
|
|
2077 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
|
|
2078 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
|
|
2079 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION));
|
|
2080
|
|
2081 /* Invoke graphic register 0 to plane 0. */
|
|
2082 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
|
|
2083 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
|
|
2084 CODING_SPEC_ISO_INVOCATION (coding, 1)
|
|
2085 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
|
|
2086 /* Not single shifting at first. */
|
|
2087 CODING_SPEC_ISO_SINGLE_SHIFTING(coding) = 0;
|
|
2088
|
|
2089 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
|
|
2090 FLAGS[REG] can be one of below:
|
|
2091 integer CHARSET: CHARSET occupies register I,
|
|
2092 t: designate nothing to REG initially, but can be used
|
|
2093 by any charsets,
|
|
2094 list of integer, nil, or t: designate the first
|
|
2095 element (if integer) to REG initially, the remaining
|
|
2096 elements (if integer) is designated to REG on request,
|
|
2097 if an element is t, REG can be used by any charset,
|
|
2098 nil: REG is never used. */
|
|
2099 for (charset = 0; charset < MAX_CHARSET; charset++)
|
|
2100 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = -1;
|
|
2101 for (i = 0; i < 4; i++)
|
|
2102 {
|
|
2103 if (INTEGERP (flags[i])
|
|
2104 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
|
|
2105 {
|
|
2106 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
|
|
2107 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
|
|
2108 }
|
|
2109 else if (EQ (flags[i], Qt))
|
|
2110 {
|
|
2111 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
|
|
2112 default_reg_bits |= 1 << i;
|
|
2113 }
|
|
2114 else if (CONSP (flags[i]))
|
|
2115 {
|
|
2116 Lisp_Object tail = flags[i];
|
|
2117
|
|
2118 if (INTEGERP (XCONS (tail)->car)
|
|
2119 && (charset = XINT (XCONS (tail)->car),
|
|
2120 CHARSET_VALID_P (charset)))
|
|
2121 {
|
|
2122 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
|
|
2123 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
|
|
2124 }
|
|
2125 else
|
|
2126 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
|
|
2127 tail = XCONS (tail)->cdr;
|
|
2128 while (CONSP (tail))
|
|
2129 {
|
|
2130 if (INTEGERP (XCONS (tail)->car)
|
|
2131 && (charset = XINT (XCONS (tail)->car),
|
|
2132 CHARSET_VALID_P (charset)))
|
|
2133 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
|
|
2134 = i;
|
|
2135 else if (EQ (XCONS (tail)->car, Qt))
|
|
2136 default_reg_bits |= 1 << i;
|
|
2137 tail = XCONS (tail)->cdr;
|
|
2138 }
|
|
2139 }
|
|
2140 else
|
|
2141 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
|
|
2142
|
|
2143 CODING_SPEC_ISO_DESIGNATION (coding, i)
|
|
2144 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
|
|
2145 }
|
|
2146
|
|
2147 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
|
|
2148 {
|
|
2149 /* REG 1 can be used only by locking shift in 7-bit env. */
|
|
2150 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
|
|
2151 default_reg_bits &= ~2;
|
|
2152 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
|
|
2153 /* Without any shifting, only REG 0 and 1 can be used. */
|
|
2154 default_reg_bits &= 3;
|
|
2155 }
|
|
2156
|
|
2157 for (charset = 0; charset < MAX_CHARSET; charset++)
|
|
2158 if (CHARSET_VALID_P (charset)
|
|
2159 && CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) < 0)
|
|
2160 {
|
|
2161 /* We have not yet decided where to designate CHARSET. */
|
|
2162 int reg_bits = default_reg_bits;
|
|
2163
|
|
2164 if (CHARSET_CHARS (charset) == 96)
|
|
2165 /* A charset of CHARS96 can't be designated to REG 0. */
|
|
2166 reg_bits &= ~1;
|
|
2167
|
|
2168 if (reg_bits)
|
|
2169 /* There exist some default graphic register. */
|
|
2170 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
|
|
2171 = (reg_bits & 1
|
|
2172 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
|
|
2173 else
|
|
2174 /* We anyway have to designate CHARSET to somewhere. */
|
|
2175 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
|
|
2176 = (CHARSET_CHARS (charset) == 94
|
|
2177 ? 0
|
|
2178 : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
|
|
2179 || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
|
|
2180 ? 1
|
|
2181 : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
|
|
2182 ? 2 : 0)));
|
|
2183 }
|
|
2184 }
|
|
2185 coding->require_flushing = 1;
|
|
2186 break;
|
|
2187
|
|
2188 case 3:
|
|
2189 coding->type = coding_type_big5;
|
|
2190 coding->flags
|
|
2191 = (NILP (XVECTOR (coding_system_vector)->contents[4])
|
|
2192 ? CODING_FLAG_BIG5_HKU
|
|
2193 : CODING_FLAG_BIG5_ETEN);
|
|
2194 break;
|
|
2195
|
|
2196 case 4:
|
|
2197 coding->type = coding_type_ccl;
|
|
2198 {
|
|
2199 Lisp_Object val = XVECTOR (coding_system_vector)->contents[4];
|
|
2200 if (CONSP (val)
|
|
2201 && VECTORP (XCONS (val)->car)
|
|
2202 && VECTORP (XCONS (val)->cdr))
|
|
2203 {
|
|
2204 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
|
|
2205 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
|
|
2206 }
|
|
2207 else
|
|
2208 goto label_invalid_coding_system;
|
|
2209 }
|
|
2210 coding->require_flushing = 1;
|
|
2211 break;
|
|
2212
|
|
2213 default:
|
|
2214 if (EQ (type, Qt))
|
|
2215 coding->type = coding_type_automatic;
|
|
2216 else
|
|
2217 coding->type = coding_type_no_conversion;
|
|
2218 break;
|
|
2219 }
|
|
2220 return 0;
|
|
2221
|
|
2222 label_invalid_coding_system:
|
|
2223 coding->type = coding_type_no_conversion;
|
|
2224 return -1;
|
|
2225 }
|
|
2226
|
|
2227 /* Emacs has a mechanism to automatically detect a coding system if it
|
|
2228 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
|
|
2229 it's impossible to distinguish some coding systems accurately
|
|
2230 because they use the same range of codes. So, at first, coding
|
|
2231 systems are categorized into 8, those are:
|
|
2232
|
|
2233 o coding-category-internal
|
|
2234
|
|
2235 The category for a coding system which has the same code range
|
|
2236 as Emacs' internal format. Assigned the coding-system (Lisp
|
|
2237 symbol) `coding-system-internal' by default.
|
|
2238
|
|
2239 o coding-category-sjis
|
|
2240
|
|
2241 The category for a coding system which has the same code range
|
|
2242 as SJIS. Assigned the coding-system (Lisp
|
|
2243 symbol) `coding-system-sjis' by default.
|
|
2244
|
|
2245 o coding-category-iso-7
|
|
2246
|
|
2247 The category for a coding system which has the same code range
|
|
2248 as ISO2022 of 7-bit environment. Assigned the coding-system
|
|
2249 (Lisp symbol) `coding-system-junet' by default.
|
|
2250
|
|
2251 o coding-category-iso-8-1
|
|
2252
|
|
2253 The category for a coding system which has the same code range
|
|
2254 as ISO2022 of 8-bit environment and graphic plane 1 used only
|
|
2255 for DIMENSION1 charset. Assigned the coding-system (Lisp
|
|
2256 symbol) `coding-system-ctext' by default.
|
|
2257
|
|
2258 o coding-category-iso-8-2
|
|
2259
|
|
2260 The category for a coding system which has the same code range
|
|
2261 as ISO2022 of 8-bit environment and graphic plane 1 used only
|
|
2262 for DIMENSION2 charset. Assigned the coding-system (Lisp
|
|
2263 symbol) `coding-system-euc-japan' by default.
|
|
2264
|
|
2265 o coding-category-iso-else
|
|
2266
|
|
2267 The category for a coding system which has the same code range
|
|
2268 as ISO2022 but not belongs to any of the above three
|
|
2269 categories. Assigned the coding-system (Lisp symbol)
|
|
2270 `coding-system-iso-2022-ss2-7' by default.
|
|
2271
|
|
2272 o coding-category-big5
|
|
2273
|
|
2274 The category for a coding system which has the same code range
|
|
2275 as BIG5. Assigned the coding-system (Lisp symbol)
|
|
2276 `coding-system-big5' by default.
|
|
2277
|
|
2278 o coding-category-binary
|
|
2279
|
|
2280 The category for a coding system not categorized in any of the
|
|
2281 above. Assigned the coding-system (Lisp symbol)
|
|
2282 `coding-system-noconv' by default.
|
|
2283
|
|
2284 Each of them is a Lisp symbol and the value is an actual
|
|
2285 `coding-system's (this is also a Lisp symbol) assigned by a user.
|
|
2286 What Emacs does actually is to detect a category of coding system.
|
|
2287 Then, it uses a `coding-system' assigned to it. If Emacs can't
|
|
2288 decide only one possible category, it selects a category of the
|
|
2289 highest priority. Priorities of categories are also specified by a
|
|
2290 user in a Lisp variable `coding-category-list'.
|
|
2291
|
|
2292 */
|
|
2293
|
|
2294 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
|
|
2295 If it detects possible coding systems, return an integer in which
|
|
2296 appropriate flag bits are set. Flag bits are defined by macros
|
|
2297 CODING_CATEGORY_MASK_XXX in `coding.h'. */
|
|
2298
|
|
2299 int
|
|
2300 detect_coding_mask (src, src_bytes)
|
|
2301 unsigned char *src;
|
|
2302 int src_bytes;
|
|
2303 {
|
|
2304 register unsigned char c;
|
|
2305 unsigned char *src_end = src + src_bytes;
|
|
2306 int mask;
|
|
2307
|
|
2308 /* At first, skip all ASCII characters and control characters except
|
|
2309 for three ISO2022 specific control characters. */
|
|
2310 while (src < src_end)
|
|
2311 {
|
|
2312 c = *src;
|
|
2313 if (c >= 0x80
|
|
2314 || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
|
|
2315 break;
|
|
2316 src++;
|
|
2317 }
|
|
2318
|
|
2319 if (src >= src_end)
|
|
2320 /* We found nothing other than ASCII. There's nothing to do. */
|
|
2321 return CODING_CATEGORY_MASK_ANY;
|
|
2322
|
|
2323 /* The text seems to be encoded in some multilingual coding system.
|
|
2324 Now, try to find in which coding system the text is encoded. */
|
|
2325 if (c < 0x80)
|
|
2326 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
|
|
2327 /* C is an ISO2022 specific control code of C0. */
|
|
2328 mask = detect_coding_iso2022 (src, src_end);
|
|
2329
|
|
2330 else if (c == ISO_CODE_SS2 || c == ISO_CODE_SS3 || c == ISO_CODE_CSI)
|
|
2331 /* C is an ISO2022 specific control code of C1,
|
|
2332 or the first byte of SJIS's 2-byte character code,
|
|
2333 or a leading code of Emacs. */
|
|
2334 mask = (detect_coding_iso2022 (src, src_end)
|
|
2335 | detect_coding_sjis (src, src_end)
|
|
2336 | detect_coding_internal (src, src_end));
|
|
2337
|
|
2338 else if (c < 0xA0)
|
|
2339 /* C is the first byte of SJIS character code,
|
|
2340 or a leading-code of Emacs. */
|
|
2341 mask = (detect_coding_sjis (src, src_end)
|
|
2342 | detect_coding_internal (src, src_end));
|
|
2343
|
|
2344 else
|
|
2345 /* C is a character of ISO2022 in graphic plane right,
|
|
2346 or a SJIS's 1-byte character code (i.e. JISX0201),
|
|
2347 or the first byte of BIG5's 2-byte code. */
|
|
2348 mask = (detect_coding_iso2022 (src, src_end)
|
|
2349 | detect_coding_sjis (src, src_end)
|
|
2350 | detect_coding_big5 (src, src_end));
|
|
2351
|
|
2352 return mask;
|
|
2353 }
|
|
2354
|
|
2355 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
|
|
2356 The information of the detected coding system is set in CODING. */
|
|
2357
|
|
2358 void
|
|
2359 detect_coding (coding, src, src_bytes)
|
|
2360 struct coding_system *coding;
|
|
2361 unsigned char *src;
|
|
2362 int src_bytes;
|
|
2363 {
|
|
2364 int mask = detect_coding_mask (src, src_bytes);
|
|
2365 int idx;
|
|
2366
|
|
2367 if (mask == CODING_CATEGORY_MASK_ANY)
|
|
2368 /* We found nothing other than ASCII. There's nothing to do. */
|
|
2369 return;
|
|
2370
|
|
2371 if (!mask)
|
|
2372 /* The source text seems to be encoded in unknown coding system.
|
|
2373 Emacs regards the category of such a kind of coding system as
|
|
2374 `coding-category-binary'. We assume that a user has assigned
|
|
2375 an appropriate coding system for a `coding-category-binary'. */
|
|
2376 idx = CODING_CATEGORY_IDX_BINARY;
|
|
2377 else
|
|
2378 {
|
|
2379 /* We found some plausible coding systems. Let's use a coding
|
|
2380 system of the highest priority. */
|
|
2381 Lisp_Object val = Vcoding_category_list;
|
|
2382
|
|
2383 if (CONSP (val))
|
|
2384 while (!NILP (val))
|
|
2385 {
|
|
2386 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
|
|
2387 if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
|
|
2388 break;
|
|
2389 val = XCONS (val)->cdr;
|
|
2390 }
|
|
2391 else
|
|
2392 val = Qnil;
|
|
2393
|
|
2394 if (NILP (val))
|
|
2395 {
|
|
2396 /* For unknown reason, `Vcoding_category_list' contains none
|
|
2397 of found categories. Let's use any of them. */
|
|
2398 for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
|
|
2399 if (mask & (1 << idx))
|
|
2400 break;
|
|
2401 }
|
|
2402 }
|
|
2403 setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
|
|
2404 }
|
|
2405
|
|
2406 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
|
|
2407 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
|
|
2408 CODING_EOL_CR, and CODING_EOL_AUTOMATIC. */
|
|
2409
|
|
2410 int
|
|
2411 detect_eol_type (src, src_bytes)
|
|
2412 unsigned char *src;
|
|
2413 int src_bytes;
|
|
2414 {
|
|
2415 unsigned char *src_end = src + src_bytes;
|
|
2416 unsigned char c;
|
|
2417
|
|
2418 while (src < src_end)
|
|
2419 {
|
|
2420 c = *src++;
|
|
2421 if (c == '\n')
|
|
2422 return CODING_EOL_LF;
|
|
2423 else if (c == '\r')
|
|
2424 {
|
|
2425 if (src < src_end && *src == '\n')
|
|
2426 return CODING_EOL_CRLF;
|
|
2427 else
|
|
2428 return CODING_EOL_CR;
|
|
2429 }
|
|
2430 }
|
|
2431 return CODING_EOL_AUTOMATIC;
|
|
2432 }
|
|
2433
|
|
2434 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
|
|
2435 is encoded. If it detects an appropriate format of end-of-line, it
|
|
2436 sets the information in *CODING. */
|
|
2437
|
|
2438 void
|
|
2439 detect_eol (coding, src, src_bytes)
|
|
2440 struct coding_system *coding;
|
|
2441 unsigned char *src;
|
|
2442 int src_bytes;
|
|
2443 {
|
|
2444 Lisp_Object val;
|
|
2445 int eol_type = detect_eol_type (src, src_bytes);
|
|
2446
|
|
2447 if (eol_type == CODING_EOL_AUTOMATIC)
|
|
2448 /* We found no end-of-line in the source text. */
|
|
2449 return;
|
|
2450
|
|
2451 val = Fget (coding->symbol, Qeol_type);
|
|
2452 if (VECTORP (val) && XVECTOR (val)->size == 3)
|
|
2453 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
|
|
2454 }
|
|
2455
|
|
2456 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
|
|
2457 decoding, it may detect coding system and format of end-of-line if
|
|
2458 those are not yet decided. */
|
|
2459
|
|
2460 int
|
|
2461 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
|
|
2462 struct coding_system *coding;
|
|
2463 unsigned char *source, *destination;
|
|
2464 int src_bytes, dst_bytes;
|
|
2465 int *consumed;
|
|
2466 {
|
|
2467 int produced;
|
|
2468
|
|
2469 if (src_bytes <= 0)
|
|
2470 {
|
|
2471 *consumed = 0;
|
|
2472 return 0;
|
|
2473 }
|
|
2474
|
|
2475 if (coding->type == coding_type_automatic)
|
|
2476 detect_coding (coding, source, src_bytes);
|
|
2477
|
|
2478 if (coding->eol_type == CODING_EOL_AUTOMATIC)
|
|
2479 detect_eol (coding, source, src_bytes);
|
|
2480
|
|
2481 coding->carryover_size = 0;
|
|
2482 switch (coding->type)
|
|
2483 {
|
|
2484 case coding_type_no_conversion:
|
|
2485 label_no_conversion:
|
|
2486 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
|
|
2487 bcopy (source, destination, produced);
|
|
2488 *consumed = produced;
|
|
2489 break;
|
|
2490
|
|
2491 case coding_type_internal:
|
|
2492 case coding_type_automatic:
|
|
2493 if (coding->eol_type == CODING_EOL_LF
|
|
2494 || coding->eol_type == CODING_EOL_AUTOMATIC)
|
|
2495 goto label_no_conversion;
|
|
2496 produced = decode_eol (coding, source, destination,
|
|
2497 src_bytes, dst_bytes, consumed);
|
|
2498 break;
|
|
2499
|
|
2500 case coding_type_sjis:
|
|
2501 produced = decode_coding_sjis_big5 (coding, source, destination,
|
|
2502 src_bytes, dst_bytes, consumed,
|
|
2503 1);
|
|
2504 break;
|
|
2505
|
|
2506 case coding_type_iso2022:
|
|
2507 produced = decode_coding_iso2022 (coding, source, destination,
|
|
2508 src_bytes, dst_bytes, consumed);
|
|
2509 break;
|
|
2510
|
|
2511 case coding_type_big5:
|
|
2512 produced = decode_coding_sjis_big5 (coding, source, destination,
|
|
2513 src_bytes, dst_bytes, consumed,
|
|
2514 0);
|
|
2515 break;
|
|
2516
|
|
2517 case coding_type_ccl:
|
|
2518 produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
|
|
2519 src_bytes, dst_bytes, consumed);
|
|
2520 break;
|
|
2521 }
|
|
2522
|
|
2523 return produced;
|
|
2524 }
|
|
2525
|
|
2526 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
|
|
2527
|
|
2528 int
|
|
2529 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
|
|
2530 struct coding_system *coding;
|
|
2531 unsigned char *source, *destination;
|
|
2532 int src_bytes, dst_bytes;
|
|
2533 int *consumed;
|
|
2534 {
|
|
2535 int produced;
|
|
2536
|
|
2537 coding->carryover_size = 0;
|
|
2538 switch (coding->type)
|
|
2539 {
|
|
2540 case coding_type_no_conversion:
|
|
2541 label_no_conversion:
|
|
2542 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
|
|
2543 if (produced > 0)
|
|
2544 {
|
|
2545 bcopy (source, destination, produced);
|
|
2546 if (coding->selective)
|
|
2547 {
|
|
2548 unsigned char *p = destination, *pend = destination + produced;
|
|
2549 while (p < pend)
|
|
2550 if (*p++ = '\015') p[-1] = '\n';
|
|
2551 }
|
|
2552 }
|
|
2553 *consumed = produced;
|
|
2554 break;
|
|
2555
|
|
2556 case coding_type_internal:
|
|
2557 case coding_type_automatic:
|
|
2558 if (coding->eol_type == CODING_EOL_LF
|
|
2559 || coding->eol_type == CODING_EOL_AUTOMATIC)
|
|
2560 goto label_no_conversion;
|
|
2561 produced = encode_eol (coding, source, destination,
|
|
2562 src_bytes, dst_bytes, consumed);
|
|
2563 break;
|
|
2564
|
|
2565 case coding_type_sjis:
|
|
2566 produced = encode_coding_sjis_big5 (coding, source, destination,
|
|
2567 src_bytes, dst_bytes, consumed,
|
|
2568 1);
|
|
2569 break;
|
|
2570
|
|
2571 case coding_type_iso2022:
|
|
2572 produced = encode_coding_iso2022 (coding, source, destination,
|
|
2573 src_bytes, dst_bytes, consumed);
|
|
2574 break;
|
|
2575
|
|
2576 case coding_type_big5:
|
|
2577 produced = encode_coding_sjis_big5 (coding, source, destination,
|
|
2578 src_bytes, dst_bytes, consumed,
|
|
2579 0);
|
|
2580 break;
|
|
2581
|
|
2582 case coding_type_ccl:
|
|
2583 produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
|
|
2584 src_bytes, dst_bytes, consumed);
|
|
2585 break;
|
|
2586 }
|
|
2587
|
|
2588 return produced;
|
|
2589 }
|
|
2590
|
|
2591 #define CONVERSION_BUFFER_EXTRA_ROOM 256
|
|
2592
|
|
2593 /* Return maximum size (bytes) of a buffer enough for decoding
|
|
2594 SRC_BYTES of text encoded in CODING. */
|
|
2595
|
|
2596 int
|
|
2597 decoding_buffer_size (coding, src_bytes)
|
|
2598 struct coding_system *coding;
|
|
2599 int src_bytes;
|
|
2600 {
|
|
2601 int magnification;
|
|
2602
|
|
2603 if (coding->type == coding_type_iso2022)
|
|
2604 magnification = 3;
|
|
2605 else if (coding->type == coding_type_ccl)
|
|
2606 magnification = coding->spec.ccl.decoder.buf_magnification;
|
|
2607 else
|
|
2608 magnification = 2;
|
|
2609
|
|
2610 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
|
|
2611 }
|
|
2612
|
|
2613 /* Return maximum size (bytes) of a buffer enough for encoding
|
|
2614 SRC_BYTES of text to CODING. */
|
|
2615
|
|
2616 int
|
|
2617 encoding_buffer_size (coding, src_bytes)
|
|
2618 struct coding_system *coding;
|
|
2619 int src_bytes;
|
|
2620 {
|
|
2621 int magnification;
|
|
2622
|
|
2623 if (coding->type == coding_type_ccl)
|
|
2624 magnification = coding->spec.ccl.encoder.buf_magnification;
|
|
2625 else
|
|
2626 magnification = 3;
|
|
2627
|
|
2628 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
|
|
2629 }
|
|
2630
|
|
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The shared work buffer handed out by get_conversion_buffer, and the
   number of bytes currently allocated for it.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.

   The same buffer is returned on every call.  When it is too small it
   is replaced by a fresh, larger allocation; the old contents are NOT
   copied over, so a caller must not expect data from a previous call
   to survive.  Growth is by repeated doubling of the current size.

   The value returned by xmalloc is used unchecked, so this function
   has no NULL-returning path of its own; whatever xmalloc does when
   memory is exhausted applies.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size;

      /* Never start the doubling from zero (or less): that would loop
         forever.  This can only matter when we are called before
         init_coding_once has set up the initial buffer.  */
      if (real_size < MINIMUM_CONVERSION_BUFFER_SIZE)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;

      /* SIZE exceeds the current size, so when starting from the
         current size this doubles at least once.  */
      while (real_size < size)
        real_size *= 2;

      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
|
|
2659
|
|
2660
|
|
2661 #ifdef emacs
|
|
2662 /*** 7. Emacs Lisp library functions ***/
|
|
2663
|
|
2664 DEFUN ("coding-system-vector", Fcoding_system_vector, Scoding_system_vector,
|
|
2665 1, 1, 0,
|
|
2666 "Return coding-vector of CODING-SYSTEM.\n\
|
|
2667 If CODING-SYSTEM is not a valid coding-system, return nil.")
|
|
2668 (obj)
|
|
2669 Lisp_Object obj;
|
|
2670 {
|
|
2671 while (SYMBOLP (obj) && !NILP (obj))
|
|
2672 obj = Fget (obj, Qcoding_system);
|
|
2673 return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
|
|
2674 ? Qnil : obj);
|
|
2675 }
|
|
2676
|
|
2677 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
|
|
2678 "Return t if OBJECT is nil or a coding-system.\n\
|
|
2679 See document of make-coding-system for coding-system object.")
|
|
2680 (obj)
|
|
2681 Lisp_Object obj;
|
|
2682 {
|
|
2683 return ((NILP (obj) || !NILP (Fcoding_system_vector (obj))) ? Qt : Qnil);
|
|
2684 }
|
|
2685
|
|
2686 DEFUN ("read-non-nil-coding-system",
|
|
2687 Fread_non_nil_coding_system, Sread_non_nil_coding_system, 1, 1, 0,
|
|
2688 "Read a coding-system from the minibuffer, prompting with string PROMPT.")
|
|
2689 (prompt)
|
|
2690 Lisp_Object prompt;
|
|
2691 {
|
|
2692 return Fintern (Fcompleting_read (prompt, Vobarray, Qcoding_system_vector,
|
|
2693 Qt, Qnil, Qnil),
|
|
2694 Qnil);
|
|
2695 }
|
|
2696
|
|
2697 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
|
|
2698 "Read a coding-system or nil from the minibuffer, prompting with string PROMPT.")
|
|
2699 (prompt)
|
|
2700 Lisp_Object prompt;
|
|
2701 {
|
|
2702 return Fintern (Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
|
|
2703 Qt, Qnil, Qnil),
|
|
2704 Qnil);
|
|
2705 }
|
|
2706
|
|
2707 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
|
|
2708 1, 1, 0,
|
|
2709 "Check validity of CODING-SYSTEM.\n\
|
|
2710 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
|
|
2711 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
|
|
2712 The value of property should be a vector of length 5.")
|
|
2713 (coding_system)
|
|
2714 Lisp_Object coding_system;
|
|
2715 {
|
|
2716 CHECK_SYMBOL (coding_system, 0);
|
|
2717 if (!NILP (Fcoding_system_p (coding_system)))
|
|
2718 return coding_system;
|
|
2719 while (1)
|
|
2720 Fsignal (Qcoding_system_error, coding_system);
|
|
2721 }
|
|
2722
|
|
2723 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
|
|
2724 2, 2, 0,
|
|
2725 "Detect coding-system of the text in the region between START and END.\n\
|
|
2726 Return a list of possible coding-systems ordered by priority.\n\
|
|
2727 If only ASCII characters are found, it returns `coding-system-automatic'\n\
|
|
2728 or its subsidiary coding-system according to a detected end-of-line format.")
|
|
2729 (b, e)
|
|
2730 Lisp_Object b, e;
|
|
2731 {
|
|
2732 int coding_mask, eol_type;
|
|
2733 Lisp_Object val;
|
|
2734 int beg, end;
|
|
2735
|
|
2736 validate_region (&b, &e);
|
|
2737 beg = XINT (b), end = XINT (e);
|
|
2738 if (beg < GPT && end >= GPT) move_gap (end);
|
|
2739
|
|
2740 coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
|
|
2741 eol_type = detect_eol_type (POS_ADDR (beg), end - beg);
|
|
2742
|
|
2743 if (coding_mask == CODING_CATEGORY_MASK_ANY)
|
|
2744 {
|
|
2745 val = intern ("coding-system-automatic");
|
|
2746 if (eol_type != CODING_EOL_AUTOMATIC)
|
|
2747 {
|
|
2748 Lisp_Object val2 = Fget (val, Qeol_type);
|
|
2749 if (VECTORP (val2))
|
|
2750 val = XVECTOR (val2)->contents[eol_type];
|
|
2751 }
|
|
2752 }
|
|
2753 else
|
|
2754 {
|
|
2755 Lisp_Object val2;
|
|
2756
|
|
2757 /* At first, gather possible coding-systems in VAL in a reverse
|
|
2758 order. */
|
|
2759 val = Qnil;
|
|
2760 for (val2 = Vcoding_category_list;
|
|
2761 !NILP (val2);
|
|
2762 val2 = XCONS (val2)->cdr)
|
|
2763 {
|
|
2764 int idx
|
|
2765 = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
|
|
2766 if (coding_mask & (1 << idx))
|
|
2767 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
|
|
2768 }
|
|
2769
|
|
2770 /* Then, change the order of the list, while getting subsidiary
|
|
2771 coding-systems. */
|
|
2772 val2 = val;
|
|
2773 val = Qnil;
|
|
2774 for (; !NILP (val2); val2 = XCONS (val2)->cdr)
|
|
2775 {
|
|
2776 if (eol_type == CODING_EOL_AUTOMATIC)
|
|
2777 val = Fcons (XCONS (val2)->car, val);
|
|
2778 else
|
|
2779 {
|
|
2780 Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
|
|
2781 if (VECTORP (val3))
|
|
2782 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
|
|
2783 else
|
|
2784 val = Fcons (XCONS (val2)->car, val);
|
|
2785 }
|
|
2786 }
|
|
2787 }
|
|
2788
|
|
2789 return val;
|
|
2790 }
|
|
2791
|
|
2792 /* Scan text in the region between *BEGP and *ENDP, skip characters
|
|
2793 which we never have to encode to (iff ENCODEP is 1) or decode from
|
|
2794 coding system CODING at the head and tail, then set BEGP and ENDP
|
|
2795 to the addresses of start and end of the text we actually convert. */
|
|
2796
|
|
2797 void
|
|
2798 shrink_conversion_area (begp, endp, coding, encodep)
|
|
2799 unsigned char **begp, **endp;
|
|
2800 struct coding_system *coding;
|
|
2801 int encodep;
|
|
2802 {
|
|
2803 register unsigned char *beg_addr = *begp, *end_addr = *endp;
|
|
2804
|
|
2805 if (coding->eol_type != CODING_EOL_LF
|
|
2806 && coding->eol_type != CODING_EOL_AUTOMATIC)
|
|
2807 /* Since we anyway have to convert end-of-line format, it is not
|
|
2808 worth skipping at most 100 bytes or so. */
|
|
2809 return;
|
|
2810
|
|
2811 if (encodep) /* for encoding */
|
|
2812 {
|
|
2813 switch (coding->type)
|
|
2814 {
|
|
2815 case coding_type_no_conversion:
|
|
2816 case coding_type_internal:
|
|
2817 case coding_type_automatic:
|
|
2818 /* We need no conversion. */
|
|
2819 *begp = *endp;
|
|
2820 return;
|
|
2821 case coding_type_ccl:
|
|
2822 /* We can't skip any data. */
|
|
2823 return;
|
|
2824 default:
|
|
2825 /* We can skip all ASCII characters at the head and tail. */
|
|
2826 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
|
|
2827 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
|
|
2828 break;
|
|
2829 }
|
|
2830 }
|
|
2831 else /* for decoding */
|
|
2832 {
|
|
2833 switch (coding->type)
|
|
2834 {
|
|
2835 case coding_type_no_conversion:
|
|
2836 /* We need no conversion. */
|
|
2837 *begp = *endp;
|
|
2838 return;
|
|
2839 case coding_type_internal:
|
|
2840 if (coding->eol_type == CODING_EOL_LF)
|
|
2841 {
|
|
2842 /* We need no conversion. */
|
|
2843 *begp = *endp;
|
|
2844 return;
|
|
2845 }
|
|
2846 /* We can skip all but carriage-return. */
|
|
2847 while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
|
|
2848 while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
|
|
2849 break;
|
|
2850 case coding_type_sjis:
|
|
2851 case coding_type_big5:
|
|
2852 /* We can skip all ASCII characters at the head. */
|
|
2853 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
|
|
2854 /* We can skip all ASCII characters at the tail except for
|
|
2855 the second byte of SJIS or BIG5 code. */
|
|
2856 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
|
|
2857 if (end_addr != *endp)
|
|
2858 end_addr++;
|
|
2859 break;
|
|
2860 case coding_type_ccl:
|
|
2861 /* We can't skip any data. */
|
|
2862 return;
|
|
2863 default: /* i.e. case coding_type_iso2022: */
|
|
2864 {
|
|
2865 unsigned char c;
|
|
2866
|
|
2867 /* We can skip all ASCII characters except for a few
|
|
2868 control codes at the head. */
|
|
2869 while (beg_addr < end_addr && (c = *beg_addr) < 0x80
|
|
2870 && c != ISO_CODE_CR && c != ISO_CODE_SO
|
|
2871 && c != ISO_CODE_SI && c != ISO_CODE_ESC)
|
|
2872 beg_addr++;
|
|
2873 }
|
|
2874 break;
|
|
2875 }
|
|
2876 }
|
|
2877 *begp = beg_addr;
|
|
2878 *endp = end_addr;
|
|
2879 return;
|
|
2880 }
|
|
2881
|
|
2882 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
|
|
2883 text between B and E. B and E are buffer position. */
|
|
2884
|
|
2885 Lisp_Object
|
|
2886 code_convert_region (b, e, coding, encodep)
|
|
2887 Lisp_Object b, e;
|
|
2888 struct coding_system *coding;
|
|
2889 int encodep;
|
|
2890 {
|
|
2891 int beg, end, len, consumed, produced;
|
|
2892 char *buf;
|
|
2893 unsigned char *begp, *endp;
|
|
2894 int pos = PT;
|
|
2895
|
|
2896 validate_region (&b, &e);
|
|
2897 beg = XINT (b), end = XINT (e);
|
|
2898 if (beg < GPT && end >= GPT)
|
|
2899 move_gap (end);
|
|
2900
|
|
2901 if (encodep && !NILP (coding->pre_write_conversion))
|
|
2902 {
|
|
2903 /* We must call a pre-conversion function which may put a new
|
|
2904 text to be converted in a new buffer. */
|
|
2905 struct buffer *old = current_buffer, *new;
|
|
2906
|
|
2907 TEMP_SET_PT (beg);
|
|
2908 call2 (coding->pre_write_conversion, b, e);
|
|
2909 if (old != current_buffer)
|
|
2910 {
|
|
2911 /* Replace the original text by the text just generated. */
|
|
2912 len = ZV - BEGV;
|
|
2913 new = current_buffer;
|
|
2914 set_buffer_internal (old);
|
|
2915 del_range (beg, end);
|
|
2916 insert_from_buffer (new, 1, len, 0);
|
|
2917 end = beg + len;
|
|
2918 }
|
|
2919 }
|
|
2920
|
|
2921 /* We may be able to shrink the conversion region. */
|
|
2922 begp = POS_ADDR (beg); endp = begp + (end - beg);
|
|
2923 shrink_conversion_area (&begp, &endp, coding, encodep);
|
|
2924
|
|
2925 if (begp == endp)
|
|
2926 /* We need no conversion. */
|
|
2927 len = end - beg;
|
|
2928 else
|
|
2929 {
|
|
2930 beg += begp - POS_ADDR (beg);
|
|
2931 end = beg + (endp - begp);
|
|
2932
|
|
2933 if (encodep)
|
|
2934 len = encoding_buffer_size (coding, end - beg);
|
|
2935 else
|
|
2936 len = decoding_buffer_size (coding, end - beg);
|
|
2937 buf = get_conversion_buffer (len);
|
|
2938
|
|
2939 coding->last_block = 1;
|
|
2940 produced = (encodep
|
|
2941 ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
|
|
2942 &consumed)
|
|
2943 : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
|
|
2944 &consumed));
|
|
2945
|
|
2946 len = produced + (beg - XINT (b)) + (XINT (e) - end);
|
|
2947
|
|
2948 TEMP_SET_PT (beg);
|
|
2949 insert (buf, produced);
|
|
2950 del_range (PT, PT + end - beg);
|
|
2951 if (pos >= end)
|
|
2952 pos = PT + (pos - end);
|
|
2953 else if (pos > beg)
|
|
2954 pos = beg;
|
|
2955 TEMP_SET_PT (pos);
|
|
2956 }
|
|
2957
|
|
2958 if (!encodep && !NILP (coding->post_read_conversion))
|
|
2959 {
|
|
2960 /* We must call a post-conversion function which may alter
|
|
2961 the text just converted. */
|
|
2962 Lisp_Object insval;
|
|
2963
|
|
2964 beg = XINT (b);
|
|
2965 TEMP_SET_PT (beg);
|
|
2966 insval = call1 (coding->post_read_conversion, make_number (len));
|
|
2967 CHECK_NUMBER (insval, 0);
|
|
2968 len = XINT (insval);
|
|
2969 }
|
|
2970
|
|
2971 return make_number (len);
|
|
2972 }
|
|
2973
|
|
2974 Lisp_Object
|
|
2975 code_convert_string (str, coding, encodep)
|
|
2976 Lisp_Object str;
|
|
2977 struct coding_system *coding;
|
|
2978 int encodep;
|
|
2979 {
|
|
2980 int len, consumed, produced;
|
|
2981 char *buf;
|
|
2982 unsigned char *begp, *endp;
|
|
2983 int head_skip, tail_skip;
|
|
2984 struct gcpro gcpro1;
|
|
2985
|
|
2986 if (encodep && !NILP (coding->pre_write_conversion)
|
|
2987 || !encodep && !NILP (coding->post_read_conversion))
|
|
2988 {
|
|
2989 /* Since we have to call Lisp functions which assume target text
|
|
2990 is in a buffer, after setting a temporary buffer, call
|
|
2991 code_convert_region. */
|
|
2992 int count = specpdl_ptr - specpdl;
|
|
2993 int len = XSTRING (str)->size;
|
|
2994 Lisp_Object result;
|
|
2995 struct buffer *old = current_buffer;
|
|
2996
|
|
2997 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
|
|
2998 temp_output_buffer_setup (" *code-converting-work*");
|
|
2999 set_buffer_internal (XBUFFER (Vstandard_output));
|
|
3000 insert_from_string (str, 0, len, 0);
|
|
3001 code_convert_region (make_number (BEGV), make_number (ZV),
|
|
3002 coding, encodep);
|
|
3003 result = make_buffer_string (BEGV, ZV, 0);
|
|
3004 set_buffer_internal (old);
|
|
3005 return unbind_to (count, result);
|
|
3006 }
|
|
3007
|
|
3008 /* We may be able to shrink the conversion region. */
|
|
3009 begp = XSTRING (str)->data;
|
|
3010 endp = begp + XSTRING (str)->size;
|
|
3011 shrink_conversion_area (&begp, &endp, coding, encodep);
|
|
3012
|
|
3013 if (begp == endp)
|
|
3014 /* We need no conversion. */
|
|
3015 return str;
|
|
3016
|
|
3017 head_skip = begp - XSTRING (str)->data;
|
|
3018 tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
|
|
3019
|
|
3020 GCPRO1 (str);
|
|
3021
|
|
3022 if (encodep)
|
|
3023 len = encoding_buffer_size (coding, endp - begp);
|
|
3024 else
|
|
3025 len = decoding_buffer_size (coding, endp - begp);
|
|
3026 buf = get_conversion_buffer (len + head_skip + tail_skip);
|
|
3027
|
|
3028 bcopy (XSTRING (str)->data, buf, head_skip);
|
|
3029 coding->last_block = 1;
|
|
3030 produced = (encodep
|
|
3031 ? encode_coding (coding, XSTRING (str)->data + head_skip,
|
|
3032 buf + head_skip, endp - begp, len, &consumed)
|
|
3033 : decode_coding (coding, XSTRING (str)->data + head_skip,
|
|
3034 buf + head_skip, endp - begp, len, &consumed));
|
|
3035 bcopy (XSTRING (str)->data + head_skip + (endp - begp),
|
|
3036 buf + head_skip + produced,
|
|
3037 tail_skip);
|
|
3038
|
|
3039 UNGCPRO;
|
|
3040
|
|
3041 return make_string (buf, head_skip + produced + tail_skip);
|
|
3042 }
|
|
3043
|
|
3044 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
|
|
3045 3, 3, 0,
|
|
3046 "Decode the text between START and END which is encoded in CODING-SYSTEM.\n\
|
|
3047 Return length of decoded text.")
|
|
3048 (b, e, coding_system)
|
|
3049 Lisp_Object b, e, coding_system;
|
|
3050 {
|
|
3051 struct coding_system coding;
|
|
3052
|
|
3053 CHECK_NUMBER_COERCE_MARKER (b, 0);
|
|
3054 CHECK_NUMBER_COERCE_MARKER (e, 1);
|
|
3055 CHECK_SYMBOL (coding_system, 2);
|
|
3056
|
|
3057 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
|
|
3058 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
|
|
3059
|
|
3060 return code_convert_region (b, e, &coding, 0);
|
|
3061 }
|
|
3062
|
|
3063 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
|
|
3064 3, 3, 0,
|
|
3065 "Encode the text between START and END to CODING-SYSTEM.\n\
|
|
3066 Return length of encoded text.")
|
|
3067 (b, e, coding_system)
|
|
3068 Lisp_Object b, e, coding_system;
|
|
3069 {
|
|
3070 struct coding_system coding;
|
|
3071
|
|
3072 CHECK_NUMBER_COERCE_MARKER (b, 0);
|
|
3073 CHECK_NUMBER_COERCE_MARKER (e, 1);
|
|
3074 CHECK_SYMBOL (coding_system, 2);
|
|
3075
|
|
3076 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
|
|
3077 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
|
|
3078
|
|
3079 return code_convert_region (b, e, &coding, 1);
|
|
3080 }
|
|
3081
|
|
3082 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
|
|
3083 2, 2, 0,
|
|
3084 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.")
|
|
3085 (string, coding_system)
|
|
3086 Lisp_Object string, coding_system;
|
|
3087 {
|
|
3088 struct coding_system coding;
|
|
3089
|
|
3090 CHECK_STRING (string, 0);
|
|
3091 CHECK_SYMBOL (coding_system, 1);
|
|
3092
|
|
3093 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
|
|
3094 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
|
|
3095
|
|
3096 return code_convert_string (string, &coding, 0);
|
|
3097 }
|
|
3098
|
|
3099 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
|
|
3100 2, 2, 0,
|
|
3101 "Encode STRING to CODING-SYSTEM, and return the result.")
|
|
3102 (string, coding_system)
|
|
3103 Lisp_Object string, coding_system;
|
|
3104 {
|
|
3105 struct coding_system coding;
|
|
3106
|
|
3107 CHECK_STRING (string, 0);
|
|
3108 CHECK_SYMBOL (coding_system, 1);
|
|
3109
|
|
3110 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
|
|
3111 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
|
|
3112
|
|
3113 return code_convert_string (string, &coding, 1);
|
|
3114 }
|
|
3115
|
|
3116 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
|
|
3117 "Decode a JISX0208 character of SJIS coding-system-sjis.\n\
|
|
3118 CODE is the character code in SJIS.\n\
|
|
3119 Return the corresponding character.")
|
|
3120 (code)
|
|
3121 Lisp_Object code;
|
|
3122 {
|
|
3123 unsigned char c1, c2, s1, s2;
|
|
3124 Lisp_Object val;
|
|
3125
|
|
3126 CHECK_NUMBER (code, 0);
|
|
3127 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
|
|
3128 DECODE_SJIS (s1, s2, c1, c2);
|
|
3129 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
|
|
3130 return val;
|
|
3131 }
|
|
3132
|
|
3133 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
|
|
3134 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
|
|
3135 Return the corresponding character code in SJIS.")
|
|
3136 (ch)
|
|
3137 Lisp_Object ch;
|
|
3138 {
|
|
3139 int charset;
|
|
3140 unsigned char c1, c2, s1, s2;
|
|
3141 Lisp_Object val;
|
|
3142
|
|
3143 CHECK_NUMBER (ch, 0);
|
|
3144 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
|
|
3145 if (charset == charset_jisx0208)
|
|
3146 {
|
|
3147 ENCODE_SJIS (c1, c2, s1, s2);
|
|
3148 XSETFASTINT (val, ((int)s1 << 8) | s2);
|
|
3149 }
|
|
3150 else
|
|
3151 XSETFASTINT (val, 0);
|
|
3152 return val;
|
|
3153 }
|
|
3154
|
|
3155 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
|
|
3156 "Decode a Big5 character CODE of BIG5 coding-system.\n\
|
|
3157 CODE is the character code in BIG5.\n\
|
|
3158 Return the corresponding character.")
|
|
3159 (code)
|
|
3160 Lisp_Object code;
|
|
3161 {
|
|
3162 int charset;
|
|
3163 unsigned char b1, b2, c1, c2;
|
|
3164 Lisp_Object val;
|
|
3165
|
|
3166 CHECK_NUMBER (code, 0);
|
|
3167 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
|
|
3168 DECODE_BIG5 (b1, b2, charset, c1, c2);
|
|
3169 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
|
|
3170 return val;
|
|
3171 }
|
|
3172
|
|
3173 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
|
|
3174 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
|
|
3175 Return the corresponding character code in Big5.")
|
|
3176 (ch)
|
|
3177 Lisp_Object ch;
|
|
3178 {
|
|
3179 int charset;
|
|
3180 unsigned char c1, c2, b1, b2;
|
|
3181 Lisp_Object val;
|
|
3182
|
|
3183 CHECK_NUMBER (ch, 0);
|
|
3184 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
|
|
3185 if (charset == charset_big5_1 || charset == charset_big5_2)
|
|
3186 {
|
|
3187 ENCODE_BIG5 (charset, c1, c2, b1, b2);
|
|
3188 XSETFASTINT (val, ((int)b1 << 8) | b2);
|
|
3189 }
|
|
3190 else
|
|
3191 XSETFASTINT (val, 0);
|
|
3192 return val;
|
|
3193 }
|
|
3194
|
|
3195 DEFUN ("set-terminal-coding-system",
|
|
3196 Fset_terminal_coding_system, Sset_terminal_coding_system, 1, 1,
|
|
3197 "zCoding-system for terminal display: ",
|
|
3198 "Set coding-system of your terminal to CODING-SYSTEM.\n\
|
|
3199 All outputs to terminal are encoded to this coding-system.")
|
|
3200 (coding_system)
|
|
3201 Lisp_Object coding_system;
|
|
3202 {
|
|
3203 CHECK_SYMBOL (coding_system, 0);
|
|
3204 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
|
|
3205 update_mode_lines++;
|
|
3206 if (!NILP (Finteractive_p ()))
|
|
3207 Fredraw_display ();
|
|
3208 return Qnil;
|
|
3209 }
|
|
3210
|
|
3211 DEFUN ("terminal-coding-system",
|
|
3212 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
|
|
3213 "Return coding-system of your terminal.")
|
|
3214 ()
|
|
3215 {
|
|
3216 return terminal_coding.symbol;
|
|
3217 }
|
|
3218
|
|
3219 DEFUN ("set-keyboard-coding-system",
|
|
3220 Fset_keyboard_coding_system, Sset_keyboard_coding_system, 1, 1,
|
|
3221 "zCoding-system for keyboard input: ",
|
|
3222 "Set coding-system of what is sent from terminal keyboard to CODING-SYSTEM.\n\
|
|
3223 All inputs from terminal are decoded from this coding-system.")
|
|
3224 (coding_system)
|
|
3225 Lisp_Object coding_system;
|
|
3226 {
|
|
3227 CHECK_SYMBOL (coding_system, 0);
|
|
3228 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
|
|
3229 return Qnil;
|
|
3230 }
|
|
3231
|
|
3232 DEFUN ("keyboard-coding-system",
|
|
3233 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
|
|
3234 "Return coding-system of what is sent from terminal keyboard.")
|
|
3235 ()
|
|
3236 {
|
|
3237 return keyboard_coding.symbol;
|
|
3238 }
|
|
3239
|
|
3240
|
|
3241 DEFUN ("find-coding-system", Ffind_coding_system, Sfind_coding_system,
|
|
3242 1, MANY, 0,
|
|
3243 "Return a cons of coding systems for I/O primitive OPERATION.\n\
|
|
3244 Remaining arguments are for OPERATION.\n\
|
|
3245 OPERATION is one of the following Emacs I/O primitives:\n\
|
|
3246 For file I/O, insert-file-contents or write-region.\n\
|
|
3247 For process I/O, call-process, call-process-region, or start-process.\n\
|
|
3248 For network I/O, open-network-stream.\n\
|
|
3249 For each OPERATION, TARGET is selected from the arguments as below:\n\
|
|
3250 For file I/O, TARGET is a file name.\n\
|
|
3251 For process I/O, TARGET is a process name.\n\
|
|
3252 For network I/O, TARGET is a service name or a port number\n\
|
|
3253 \n\
|
|
3254 The return value is a cons of coding systems for decoding and encoding\n\
|
|
3255 registered in nested alist `coding-system-alist' (which see) at a slot\n\
|
|
3256 corresponding to OPERATION and TARGET.
|
|
3257 If a function symbol is at the slot, return a result of the function call.\n\
|
|
3258 The function is called with one argument, a list of all the arguments.")
|
|
3259 (nargs, args)
|
|
3260 int nargs;
|
|
3261 Lisp_Object *args;
|
|
3262 {
|
|
3263 Lisp_Object operation, target_idx, target, val;
|
|
3264 register Lisp_Object chain;
|
|
3265
|
|
3266 if (nargs < 2)
|
|
3267 error ("Too few arguments");
|
|
3268 operation = args[0];
|
|
3269 if (!SYMBOLP (operation)
|
|
3270 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
|
|
3271 error ("Invalid first arguement");
|
|
3272 if (nargs < 1 + XINT (target_idx))
|
|
3273 error ("Too few arguments for operation: %s",
|
|
3274 XSYMBOL (operation)->name->data);
|
|
3275 target = args[XINT (target_idx) + 1];
|
|
3276 if (!(STRINGP (target)
|
|
3277 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
|
|
3278 error ("Invalid %dth argument", XINT (target_idx) + 1);
|
|
3279
|
|
3280 chain = Fassq (operation, Vcoding_system_alist);
|
|
3281 if (NILP (chain))
|
|
3282 return Qnil;
|
|
3283
|
|
3284 for (chain = XCONS (chain)->cdr; CONSP (chain); chain = XCONS (chain)->cdr)
|
|
3285 {
|
|
3286 Lisp_Object elt = XCONS (chain)->car;
|
|
3287
|
|
3288 if (CONSP (elt)
|
|
3289 && ((STRINGP (target)
|
|
3290 && STRINGP (XCONS (elt)->car)
|
|
3291 && fast_string_match (XCONS (elt)->car, target) >= 0)
|
|
3292 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
|
|
3293 return (CONSP (val = XCONS (elt)->cdr)
|
|
3294 ? val
|
|
3295 : ((SYMBOLP (val) && Fboundp (val)
|
|
3296 ? call2 (val, Flist (nargs, args))
|
|
3297 : Qnil)));
|
|
3298 }
|
|
3299 return Qnil;
|
|
3300 }
|
|
3301
|
|
3302 #endif /* emacs */
|
|
3303
|
|
3304
|
|
3305 /*** 8. Post-amble ***/
|
|
3306
|
|
3307 init_coding_once ()
|
|
3308 {
|
|
3309 int i;
|
|
3310
|
|
3311 /* Emacs internal format specific initialize routine. */
|
|
3312 for (i = 0; i <= 0x20; i++)
|
|
3313 emacs_code_class[i] = EMACS_control_code;
|
|
3314 emacs_code_class[0x0A] = EMACS_linefeed_code;
|
|
3315 emacs_code_class[0x0D] = EMACS_carriage_return_code;
|
|
3316 for (i = 0x21 ; i < 0x7F; i++)
|
|
3317 emacs_code_class[i] = EMACS_ascii_code;
|
|
3318 emacs_code_class[0x7F] = EMACS_control_code;
|
|
3319 emacs_code_class[0x80] = EMACS_leading_code_composition;
|
|
3320 for (i = 0x81; i < 0xFF; i++)
|
|
3321 emacs_code_class[i] = EMACS_invalid_code;
|
|
3322 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
|
|
3323 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
|
|
3324 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
|
|
3325 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
|
|
3326
|
|
3327 /* ISO2022 specific initialize routine. */
|
|
3328 for (i = 0; i < 0x20; i++)
|
|
3329 iso_code_class[i] = ISO_control_code;
|
|
3330 for (i = 0x21; i < 0x7F; i++)
|
|
3331 iso_code_class[i] = ISO_graphic_plane_0;
|
|
3332 for (i = 0x80; i < 0xA0; i++)
|
|
3333 iso_code_class[i] = ISO_control_code;
|
|
3334 for (i = 0xA1; i < 0xFF; i++)
|
|
3335 iso_code_class[i] = ISO_graphic_plane_1;
|
|
3336 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
|
|
3337 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
|
|
3338 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
|
|
3339 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
|
|
3340 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
|
|
3341 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
|
|
3342 iso_code_class[ISO_CODE_ESC] = ISO_escape;
|
|
3343 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
|
|
3344 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
|
|
3345 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
|
|
3346
|
|
3347 Qcoding_system = intern ("coding-system");
|
|
3348 staticpro (&Qcoding_system);
|
|
3349
|
|
3350 Qeol_type = intern ("eol-type");
|
|
3351 staticpro (&Qeol_type);
|
|
3352
|
|
3353 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
|
|
3354 staticpro (&Qbuffer_file_coding_system);
|
|
3355
|
|
3356 Qpost_read_conversion = intern ("post-read-conversion");
|
|
3357 staticpro (&Qpost_read_conversion);
|
|
3358
|
|
3359 Qpre_write_conversion = intern ("pre-write-conversion");
|
|
3360 staticpro (&Qpre_write_conversion);
|
|
3361
|
|
3362 Qcoding_system_vector = intern ("coding-system-vector");
|
|
3363 staticpro (&Qcoding_system_vector);
|
|
3364
|
|
3365 Qcoding_system_p = intern ("coding-system-p");
|
|
3366 staticpro (&Qcoding_system_p);
|
|
3367
|
|
3368 Qcoding_system_error = intern ("coding-system-error");
|
|
3369 staticpro (&Qcoding_system_error);
|
|
3370
|
|
3371 Fput (Qcoding_system_error, Qerror_conditions,
|
|
3372 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
|
|
3373 Fput (Qcoding_system_error, Qerror_message,
|
|
3374 build_string ("Coding-system error"));
|
|
3375
|
|
3376 Qcoding_category_index = intern ("coding-category-index");
|
|
3377 staticpro (&Qcoding_category_index);
|
|
3378
|
|
3379 {
|
|
3380 int i;
|
|
3381 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
|
|
3382 {
|
|
3383 coding_category_table[i] = intern (coding_category_name[i]);
|
|
3384 staticpro (&coding_category_table[i]);
|
|
3385 Fput (coding_category_table[i], Qcoding_category_index,
|
|
3386 make_number (i));
|
|
3387 }
|
|
3388 }
|
|
3389
|
|
3390 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
|
|
3391 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
|
|
3392
|
|
3393 setup_coding_system (Qnil, &keyboard_coding);
|
|
3394 setup_coding_system (Qnil, &terminal_coding);
|
|
3395 }
|
|
3396
|
|
3397 #ifdef emacs
|
|
3398
|
|
3399 syms_of_coding ()
|
|
3400 {
|
|
3401 Qtarget_idx = intern ("target-idx");
|
|
3402 staticpro (&Qtarget_idx);
|
|
3403
|
|
3404 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
|
|
3405 Fput (Qwrite_region, Qtarget_idx, make_number (2));
|
|
3406
|
|
3407 Qcall_process = intern ("call-process");
|
|
3408 staticpro (&Qcall_process);
|
|
3409 Fput (Qcall_process, Qtarget_idx, make_number (0));
|
|
3410
|
|
3411 Qcall_process_region = intern ("call-process-region");
|
|
3412 staticpro (&Qcall_process_region);
|
|
3413 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
|
|
3414
|
|
3415 Qstart_process = intern ("start-process");
|
|
3416 staticpro (&Qstart_process);
|
|
3417 Fput (Qstart_process, Qtarget_idx, make_number (2));
|
|
3418
|
|
3419 Qopen_network_stream = intern ("open-network-stream");
|
|
3420 staticpro (&Qopen_network_stream);
|
|
3421 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
|
|
3422
|
|
3423 defsubr (&Scoding_system_vector);
|
|
3424 defsubr (&Scoding_system_p);
|
|
3425 defsubr (&Sread_coding_system);
|
|
3426 defsubr (&Sread_non_nil_coding_system);
|
|
3427 defsubr (&Scheck_coding_system);
|
|
3428 defsubr (&Sdetect_coding_region);
|
|
3429 defsubr (&Sdecode_coding_region);
|
|
3430 defsubr (&Sencode_coding_region);
|
|
3431 defsubr (&Sdecode_coding_string);
|
|
3432 defsubr (&Sencode_coding_string);
|
|
3433 defsubr (&Sdecode_sjis_char);
|
|
3434 defsubr (&Sencode_sjis_char);
|
|
3435 defsubr (&Sdecode_big5_char);
|
|
3436 defsubr (&Sencode_big5_char);
|
|
3437 defsubr (&Sset_terminal_coding_system);
|
|
3438 defsubr (&Sterminal_coding_system);
|
|
3439 defsubr (&Sset_keyboard_coding_system);
|
|
3440 defsubr (&Skeyboard_coding_system);
|
|
3441 defsubr (&Sfind_coding_system);
|
|
3442
|
|
3443 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
|
|
3444 "List of coding-categories (symbols) ordered by priority.");
|
|
3445 {
|
|
3446 int i;
|
|
3447
|
|
3448 Vcoding_category_list = Qnil;
|
|
3449 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
|
|
3450 Vcoding_category_list
|
|
3451 = Fcons (coding_category_table[i], Vcoding_category_list);
|
|
3452 }
|
|
3453
|
|
3454 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
|
|
3455 "A variable of internal use only.\n\
|
|
3456 If the value is a coding system, it is used for decoding on read operation.\n\
|
|
3457 If not, an appropriate element in `coding-system-alist' (which see) is used.");
|
|
3458 Vcoding_system_for_read = Qnil;
|
|
3459
|
|
3460 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
|
|
3461 "A variable of internal use only.\n\
|
|
3462 If the value is a coding system, it is used for encoding on write operation.\n\
|
|
3463 If not, an appropriate element in `coding-system-alist' (which see) is used.");
|
|
3464 Vcoding_system_for_write = Qnil;
|
|
3465
|
|
3466 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
|
|
3467 "Coding-system used in the latest file or process I/O.");
|
|
3468 Vlast_coding_system_used = Qnil;
|
|
3469
|
|
3470 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
|
|
3471 "Nested alist to decide a coding system for a specific I/O operation.\n\
|
|
3472 The format is ((OPERATION . ((REGEXP . CODING-SYSTEMS) ...)) ...).\n\
|
|
3473
|
|
3474 OPERATION is one of the following Emacs I/O primitives:\n\
|
|
3475 For file I/O, insert-file-contents and write-region.\n\
|
|
3476 For process I/O, call-process, call-process-region, and start-process.\n\
|
|
3477 For network I/O, open-network-stream.\n\
|
|
3478 In addition, for process I/O, `process-argument' can be specified for\n\
|
|
3479 encoding arguments of the process.\n\
|
|
3480 \n\
|
|
3481 REGEXP is a regular expression matching a target of OPERATION, where\n\
|
|
3482 target is a file name for file I/O operations, a process name for\n\
|
|
3483 process I/O operations, or a service name for network I/O\n\
|
|
3484 operations. REGEXP might be a port number for network I/O operation.\n\
|
|
3485 \n\
|
|
3486 CODING-SYSTEMS is a cons of coding systems to encode and decode\n\
|
|
3487 character code on OPERATION, or a function symbol returning the cons.\n\
|
|
3488 See the documentation of `find-coding-system' for more detail.");
|
|
3489 Vcoding_system_alist = Qnil;
|
|
3490
|
|
3491 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
|
|
3492 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
|
|
3493 eol_mnemonic_unix = '.';
|
|
3494
|
|
3495 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
|
|
3496 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
|
|
3497 eol_mnemonic_dos = ':';
|
|
3498
|
|
3499 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
|
|
3500 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
|
|
3501 eol_mnemonic_mac = '\'';
|
|
3502
|
|
3503 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
|
|
3504 "Mnemonic character indicating end-of-line format is not yet decided.");
|
|
3505 eol_mnemonic_undecided = '-';
|
|
3506
|
|
3507 DEFVAR_LISP ("alternate-charset-table", &Valternate_charset_table,
|
|
3508 "Alist of charsets vs the alternate charsets.\n\
|
|
3509 While decoding, if a charset (car part of an element) is found,\n\
|
|
3510 decode it as the alternate charset (cdr part of the element).");
|
|
3511 Valternate_charset_table = Qnil;
|
|
3512
|
|
3513 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
|
|
3514 "Alist of charsets vs revision numbers.\n\
|
|
3515 While encoding, if a charset (car part of an element) is found,\n\
|
|
3516 designate it with the escape sequence identifying revision (cdr part of the element).");
|
|
3517 Vcharset_revision_alist = Qnil;
|
|
3518 }
|
|
3519
|
|
3520 #endif /* emacs */
|