Mercurial > emacs
changeset 88438:3a34b722dd71
(encode_coding_utf_8): Initialize produced_chars to 0.
(decode_coding_utf_16): Fix converting high and low bytes to
code-point.
(encode_coding_utf_16): Substitute coding->default_char for
non-Unicode characters.
(decode_coding): Don't call record_insert here.
(setup_coding_system): Initialize `surrogate' of
coding->spec.utf_16 to 0.
(EMIT_ONE_BYTE): Fix for multibyte case.
author | Kenichi Handa <handa@m17n.org> |
---|---|
date | Fri, 08 Mar 2002 00:19:39 +0000 |
parents | ffec838ec027 |
children | 7af269fae48f |
files | src/coding.c |
diffstat | 1 files changed, 59 insertions(+), 52 deletions(-) [+] |
line wrap: on
line diff
--- a/src/coding.c Fri Mar 08 00:18:28 2002 +0000 +++ b/src/coding.c Fri Mar 08 00:19:39 2002 +0000 @@ -46,31 +46,23 @@ CODING SYSTEM - Coding system is an encoding mechanism of one or more character - sets. Here's a list of coding system types supported by Emacs. - When we say "decode", it means converting a text encoded by some - coding system into Emacs' internal format (emacs-utf-8), and when we - say "encode", it means converting a text of emacs-utf-8 to some - other coding system. - - Emacs represents a coding system by a Lisp symbol. Each symbol is a - key to the hash table Vcharset_hash_table. This hash table - associates the symbol to the corresponding detailed specifications. - - Before using a coding system for decoding and encoding, we setup a - structure of type `struct coding_system'. This structure keeps - various information about a specific code conversion (e.g. the - location of source and destination data). - - Coding systems are classified into the following types by how to - represent a character in a byte sequence. Here's a brief descrition - about type. - - o Emacs' internal format (emacs-utf-8) - - The extended UTF-8 which allows eight-bit raw bytes mixed with - character codes. Emacs holds characters in buffers and strings by - this format. + Coding system is an object for a encoding mechanism that contains + information about how to convert byte sequence to character + sequences and vice versa. When we say "decode", it means converting + a byte sequence of a specific coding system into a character + sequence that is represented by Emacs' internal coding system + `emacs-utf-8', and when we say "encode", it means converting a + character sequence of emacs-utf-8 to a byte sequence of a specific + coding system. + + In Emacs Lisp, a coding system is represented by a Lisp symbol. In + C level, a coding system is represented by a vector of attributes + stored in the hash table Vcharset_hash_table. The conversion from a + coding system symbol to attributes vector is done by looking up + Vcharset_hash_table by the symbol. + + Coding systems are classified into the following types depending on + the mechanism of encoding. Here's a brief descrition about type. o UTF-8 @@ -137,6 +129,13 @@ independent, any coding system described above can take any format of end-of-line (except for no-conversion). +STRUCT CODING_SYSTEM + + Before using a coding system for code conversion (i.e. decoding and + encoding), we setup a structure of type `struct coding_system'. + This structure keeps various information about a specific code + conversion (e.g. the location of source and destination data). + */ /* COMMON MACROS */ @@ -818,19 +817,27 @@ /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */ -#define EMIT_TWO_BYTES(c1, c2) \ - do { \ - produced_chars += 2; \ - if (multibytep) \ - { \ - CHAR_STRING_ADVANCE ((int) (c1), dst); \ - CHAR_STRING_ADVANCE ((int) (c2), dst); \ - } \ - else \ - { \ - *dst++ = (c1); \ - *dst++ = (c2); \ - } \ +#define EMIT_TWO_BYTES(c1, c2) \ + do { \ + produced_chars += 2; \ + if (multibytep) \ + { \ + int ch; \ + \ + ch = (c1); \ + if (ch >= 0x80) \ + ch = BYTE8_TO_CHAR (ch); \ + CHAR_STRING_ADVANCE (ch, dst); \ + ch = (c2); \ + if (ch >= 0x80) \ + ch = BYTE8_TO_CHAR (ch); \ + CHAR_STRING_ADVANCE (ch, dst); \ + } \ + else \ + { \ + *dst++ = (c1); \ + *dst++ = (c2); \ + } \ } while (0) @@ -889,10 +896,14 @@ coding->source = GAP_END_ADDR + coding->src_pos_byte; else { - if (coding->src_pos < GPT - && coding->src_pos + coding->src_chars >= GPT) - move_gap_both (coding->src_pos, coding->src_pos_byte); - coding->source = BYTE_POS_ADDR (coding->src_pos_byte); + struct buffer *buf = XBUFFER (coding->src_object); + EMACS_INT beg_byte = BUF_BEG_BYTE (buf); + EMACS_INT gpt_byte = BUF_GPT_BYTE (buf); + unsigned char *beg_addr = BUF_BEG_ADDR (buf); + + coding->source = beg_addr + coding->src_pos_byte - 1; + if (coding->src_pos_byte >= gpt_byte) + coding->source += BUF_GAP_SIZE (buf); } } else if (STRINGP (coding->src_object)) @@ -1182,7 +1193,7 @@ int *charbuf_end = charbuf + coding->charbuf_used; unsigned char *dst = coding->destination + coding->produced; unsigned char *dst_end = coding->destination + coding->dst_bytes; - int produced_chars; + int produced_chars = 0; int c; if (multibytep) @@ -1290,7 +1301,7 @@ src_base = src; ONE_MORE_BYTE (c1); ONE_MORE_BYTE (c2); - c = (c1 << 16) | c2; + c = (c1 << 8) | c2; if (bom == utf_16_with_bom) { if (endian == utf_16_big_endian @@ -1333,7 +1344,7 @@ ONE_MORE_BYTE (c1); ONE_MORE_BYTE (c2); c = (endian == utf_16_big_endian - ? ((c1 << 16) | c2) : ((c2 << 16) | c1)); + ? ((c1 << 8) | c2) : ((c2 << 8) | c1)); if (surrogate) { if (! UTF_16_LOW_SURROGATE_P (c)) @@ -1404,8 +1415,8 @@ { ASSURE_DESTINATION (safe_room); c = *charbuf++; - if (c >= 0x110000) - c = 0xFFFF; + if (c >= MAX_UNICODE_CHAR) + c = coding->default_char; if (c < 0x10000) { @@ -4504,6 +4515,7 @@ val = AREF (attrs, coding_attr_utf_16_endian); CODING_UTF_16_ENDIAN (coding) = (NILP (val) ? utf_16_big_endian : utf_16_little_endian); + CODING_UTF_16_SURROGATE (coding) = 0; coding->detector = detect_coding_utf_16; coding->decoder = decode_coding_utf_16; coding->encoder = encode_coding_utf_16; @@ -5458,11 +5470,6 @@ coding->consumed = coding->src_bytes; } - if (BUFFERP (coding->dst_object)) - { - record_insert (coding->dst_pos, coding->produced_char); - } - return coding->result; }