Mercurial > emacs
comparison src/coding.c @ 87676:2aeceff24280
(detect_coding_iso2022): New arg latin_extra_code_state. Allow Latin
extra codes only when *latin_extra_code_state is nonzero.
(detect_coding_mask): If there is a NUL (zero) byte, detect the encoding as
UTF-16 or binary. If there is a Latin extra code, detect the encoding
as ISO-2022 only when no other proper encoding is found.
author | Kenichi Handa <handa@m17n.org> |
---|---|
date | Wed, 09 Jan 2008 06:05:23 +0000 |
parents | 107ccd98fa12 |
children | df9e1c663162 |
comparison
equal
deleted
inserted
replaced
87675:e4a11c2d5016 | 87676:2aeceff24280 |
---|---|
1404 CODING_CATEGORY_MASK_ISO_8_1 | 1404 CODING_CATEGORY_MASK_ISO_8_1 |
1405 CODING_CATEGORY_MASK_ISO_8_2 | 1405 CODING_CATEGORY_MASK_ISO_8_2 |
1406 CODING_CATEGORY_MASK_ISO_7_ELSE | 1406 CODING_CATEGORY_MASK_ISO_7_ELSE |
1407 CODING_CATEGORY_MASK_ISO_8_ELSE | 1407 CODING_CATEGORY_MASK_ISO_8_ELSE |
1408 are set. If a code which should never appear in ISO2022 is found, | 1408 are set. If a code which should never appear in ISO2022 is found, |
1409 returns 0. */ | 1409 returns 0. |
1410 | |
1411 If *latin_extra_code_state is zero and Latin extra codes are found, | |
1412 set *latin_extra_code_state to 1 and return 0. If it is nonzero, | |
1413 accept Latin extra codes. */ | |
1410 | 1414 |
1411 static int | 1415 static int |
1412 detect_coding_iso2022 (src, src_end, multibytep) | 1416 detect_coding_iso2022 (src, src_end, multibytep, latin_extra_code_state) |
1413 unsigned char *src, *src_end; | 1417 unsigned char *src, *src_end; |
1414 int multibytep; | 1418 int multibytep; |
1419 int *latin_extra_code_state; | |
1415 { | 1420 { |
1416 int mask = CODING_CATEGORY_MASK_ISO; | 1421 int mask = CODING_CATEGORY_MASK_ISO; |
1417 int mask_found = 0; | 1422 int mask_found = 0; |
1418 int reg[4], shift_out = 0, single_shifting = 0; | 1423 int reg[4], shift_out = 0, single_shifting = 0; |
1419 int c, c1, charset; | 1424 int c, c1, charset; |
1572 single_shifting = 1; | 1577 single_shifting = 1; |
1573 } | 1578 } |
1574 if (VECTORP (Vlatin_extra_code_table) | 1579 if (VECTORP (Vlatin_extra_code_table) |
1575 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) | 1580 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) |
1576 { | 1581 { |
1582 if (! *latin_extra_code_state) | |
1583 { | |
1584 *latin_extra_code_state = 1; | |
1585 return 0; | |
1586 } | |
1577 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags | 1587 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags |
1578 & CODING_FLAG_ISO_LATIN_EXTRA) | 1588 & CODING_FLAG_ISO_LATIN_EXTRA) |
1579 newmask |= CODING_CATEGORY_MASK_ISO_8_1; | 1589 newmask |= CODING_CATEGORY_MASK_ISO_8_1; |
1580 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags | 1590 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags |
1581 & CODING_FLAG_ISO_LATIN_EXTRA) | 1591 & CODING_FLAG_ISO_LATIN_EXTRA) |
1598 if (VECTORP (Vlatin_extra_code_table) | 1608 if (VECTORP (Vlatin_extra_code_table) |
1599 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) | 1609 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) |
1600 { | 1610 { |
1601 int newmask = 0; | 1611 int newmask = 0; |
1602 | 1612 |
1613 if (! *latin_extra_code_state) | |
1614 { | |
1615 *latin_extra_code_state = 1; | |
1616 return 0; | |
1617 } | |
1603 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags | 1618 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags |
1604 & CODING_FLAG_ISO_LATIN_EXTRA) | 1619 & CODING_FLAG_ISO_LATIN_EXTRA) |
1605 newmask |= CODING_CATEGORY_MASK_ISO_8_1; | 1620 newmask |= CODING_CATEGORY_MASK_ISO_8_1; |
1606 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags | 1621 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags |
1607 & CODING_FLAG_ISO_LATIN_EXTRA) | 1622 & CODING_FLAG_ISO_LATIN_EXTRA) |
4125 { | 4140 { |
4126 register unsigned char c; | 4141 register unsigned char c; |
4127 unsigned char *src = source, *src_end = source + src_bytes; | 4142 unsigned char *src = source, *src_end = source + src_bytes; |
4128 unsigned int mask, utf16_examined_p, iso2022_examined_p; | 4143 unsigned int mask, utf16_examined_p, iso2022_examined_p; |
4129 int i; | 4144 int i; |
4145 int null_byte_found; | |
4146 int latin_extra_code_state = 1; | |
4130 | 4147 |
4131 /* At first, skip all ASCII characters and control characters except | 4148 /* At first, skip all ASCII characters and control characters except |
4132 for three ISO2022 specific control characters. */ | 4149 for three ISO2022 specific control characters. */ |
4133 ascii_skip_code[ISO_CODE_SO] = 0; | 4150 ascii_skip_code[ISO_CODE_SO] = 0; |
4134 ascii_skip_code[ISO_CODE_SI] = 0; | 4151 ascii_skip_code[ISO_CODE_SI] = 0; |
4135 ascii_skip_code[ISO_CODE_ESC] = 0; | 4152 ascii_skip_code[ISO_CODE_ESC] = 0; |
4136 | 4153 |
4137 label_loop_detect_coding: | 4154 label_loop_detect_coding: |
4138 while (src < src_end && ascii_skip_code[*src]) src++; | 4155 null_byte_found = 0; |
4156 while (src < src_end && ascii_skip_code[*src]) | |
4157 null_byte_found |= (! *src++); | |
4158 if (! null_byte_found) | |
4159 { | |
4160 unsigned char *p = src + 1; | |
4161 while (p < src_end) | |
4162 null_byte_found |= (! *p++); | |
4163 } | |
4139 *skip = src - source; | 4164 *skip = src - source; |
4140 | 4165 |
4141 if (src >= src_end) | 4166 if (src >= src_end) |
4142 /* We found nothing other than ASCII. There's nothing to do. */ | 4167 /* We found nothing other than ASCII (and NULL byte). There's |
4168 nothing to do. */ | |
4143 return 0; | 4169 return 0; |
4144 | 4170 |
4145 c = *src; | 4171 c = *src; |
4146 /* The text seems to be encoded in some multilingual coding system. | 4172 /* The text seems to be encoded in some multilingual coding system. |
4147 Now, try to find in which coding system the text is encoded. */ | 4173 Now, try to find in which coding system the text is encoded. */ |
4148 if (c < 0x80) | 4174 if (! null_byte_found && c < 0x80) |
4149 { | 4175 { |
4150 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */ | 4176 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */ |
4151 /* C is an ISO2022 specific control code of C0. */ | 4177 /* C is an ISO2022 specific control code of C0. */ |
4152 mask = detect_coding_iso2022 (src, src_end, multibytep); | 4178 latin_extra_code_state = 1; |
4179 mask = detect_coding_iso2022 (src, src_end, multibytep, | |
4180 &latin_extra_code_state); | |
4153 if (mask == 0) | 4181 if (mask == 0) |
4154 { | 4182 { |
4155 /* No valid ISO2022 code follows C. Try again. */ | 4183 /* No valid ISO2022 code follows C. Try again. */ |
4156 src++; | 4184 src++; |
4157 if (c == ISO_CODE_ESC) | 4185 if (c == ISO_CODE_ESC) |
4175 int try; | 4203 int try; |
4176 | 4204 |
4177 if (multibytep && c == LEADING_CODE_8_BIT_CONTROL) | 4205 if (multibytep && c == LEADING_CODE_8_BIT_CONTROL) |
4178 c = src[1] - 0x20; | 4206 c = src[1] - 0x20; |
4179 | 4207 |
4180 if (c < 0xA0) | 4208 if (null_byte_found) |
4209 { | |
4210 try = (CODING_CATEGORY_MASK_UTF_16_BE | |
4211 | CODING_CATEGORY_MASK_UTF_16_LE); | |
4212 } | |
4213 else if (c < 0xA0) | |
4181 { | 4214 { |
4182 /* C is the first byte of SJIS character code, | 4215 /* C is the first byte of SJIS character code, |
4183 or a leading-code of Emacs' internal format (emacs-mule), | 4216 or a leading-code of Emacs' internal format (emacs-mule), |
4184 or the first byte of UTF-16. */ | 4217 or the first byte of UTF-16. */ |
4185 try = (CODING_CATEGORY_MASK_SJIS | 4218 try = (CODING_CATEGORY_MASK_SJIS |
4186 | CODING_CATEGORY_MASK_EMACS_MULE | 4219 | CODING_CATEGORY_MASK_EMACS_MULE |
4187 | CODING_CATEGORY_MASK_UTF_16_BE | 4220 | CODING_CATEGORY_MASK_UTF_16_BE |
4188 | CODING_CATEGORY_MASK_UTF_16_LE); | 4221 | CODING_CATEGORY_MASK_UTF_16_LE); |
4189 | 4222 |
4190 /* Or, if C is a special latin extra code, | 4223 /* Or, if C is a special latin extra code, |
4191 or is an ISO2022 specific control code of C1 (SS2 or SS3), | 4224 or is an ISO2022 specific control code of C1 (SS2 or SS3), |
4192 or is an ISO2022 control-sequence-introducer (CSI), | 4225 or is an ISO2022 control-sequence-introducer (CSI), |
4193 we should also consider the possibility of ISO2022 codings. */ | 4226 we should also consider the possibility of ISO2022 codings. */ |
4194 if ((VECTORP (Vlatin_extra_code_table) | 4227 if ((latin_extra_code_state |
4228 && VECTORP (Vlatin_extra_code_table) | |
4195 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) | 4229 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) |
4196 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3) | 4230 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3) |
4197 || (c == ISO_CODE_CSI | 4231 || (c == ISO_CODE_CSI |
4198 && (src < src_end | 4232 && (src < src_end |
4199 && (*src == ']' | 4233 && (*src == ']' |
4200 || ((*src == '0' || *src == '1' || *src == '2') | 4234 || ((*src == '0' || *src == '1' || *src == '2') |
4201 && src + 1 < src_end | 4235 && src + 1 < src_end |
4202 && src[1] == ']'))))) | 4236 && src[1] == ']'))))) |
4203 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE | 4237 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE |
4204 | CODING_CATEGORY_MASK_ISO_8BIT); | 4238 | CODING_CATEGORY_MASK_ISO_8BIT); |
4205 } | 4239 } |
4206 else | 4240 else |
4207 /* C is a character of ISO2022 in graphic plane right, | 4241 /* C is a character of ISO2022 in graphic plane right, |
4208 or a SJIS's 1-byte character code (i.e. JISX0201), | 4242 or a SJIS's 1-byte character code (i.e. JISX0201), |
4209 or the first byte of BIG5's 2-byte code, | 4243 or the first byte of BIG5's 2-byte code, |
4210 or the first byte of UTF-8/16. */ | 4244 or the first byte of UTF-8/16. */ |
4211 try = (CODING_CATEGORY_MASK_ISO_8_ELSE | 4245 try = (CODING_CATEGORY_MASK_ISO_8_ELSE |
4212 | CODING_CATEGORY_MASK_ISO_8BIT | 4246 | CODING_CATEGORY_MASK_ISO_8BIT |
4213 | CODING_CATEGORY_MASK_SJIS | 4247 | CODING_CATEGORY_MASK_SJIS |
4214 | CODING_CATEGORY_MASK_BIG5 | 4248 | CODING_CATEGORY_MASK_BIG5 |
4215 | CODING_CATEGORY_MASK_UTF_8 | 4249 | CODING_CATEGORY_MASK_UTF_8 |
4216 | CODING_CATEGORY_MASK_UTF_16_BE | 4250 | CODING_CATEGORY_MASK_UTF_16_BE |
4217 | CODING_CATEGORY_MASK_UTF_16_LE); | 4251 | CODING_CATEGORY_MASK_UTF_16_LE); |
4218 | 4252 |
4219 /* Or, we may have to consider the possibility of CCL. */ | 4253 /* Or, we may have to consider the possibility of CCL. */ |
4220 if (coding_system_table[CODING_CATEGORY_IDX_CCL] | 4254 if (! null_byte_found |
4255 && coding_system_table[CODING_CATEGORY_IDX_CCL] | |
4221 && (coding_system_table[CODING_CATEGORY_IDX_CCL] | 4256 && (coding_system_table[CODING_CATEGORY_IDX_CCL] |
4222 ->spec.ccl.valid_codes)[c]) | 4257 ->spec.ccl.valid_codes)[c]) |
4223 try |= CODING_CATEGORY_MASK_CCL; | 4258 try |= CODING_CATEGORY_MASK_CCL; |
4224 | 4259 |
4225 mask = 0; | 4260 mask = 0; |
4226 utf16_examined_p = iso2022_examined_p = 0; | |
4227 if (priorities) | 4261 if (priorities) |
4228 { | 4262 { |
4263 /* At first try detection with Latin extra codes not-allowed. | |
4264 If no proper coding system is found because of Latin extra | |
4265 codes, try detection with Latin extra codes allowed. */ | |
4266 latin_extra_code_state = 0; | |
4267 label_retry: | |
4268 utf16_examined_p = iso2022_examined_p = 0; | |
4229 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++) | 4269 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++) |
4230 { | 4270 { |
4231 if (!iso2022_examined_p | 4271 if (!iso2022_examined_p |
4232 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO)) | 4272 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO)) |
4233 { | 4273 { |
4234 mask |= detect_coding_iso2022 (src, src_end, multibytep); | 4274 mask |= detect_coding_iso2022 (src, src_end, multibytep, |
4275 &latin_extra_code_state); | |
4235 iso2022_examined_p = 1; | 4276 iso2022_examined_p = 1; |
4236 } | 4277 } |
4237 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS) | 4278 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS) |
4238 mask |= detect_coding_sjis (src, src_end, multibytep); | 4279 mask |= detect_coding_sjis (src, src_end, multibytep); |
4239 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8) | 4280 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8) |
4250 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE) | 4291 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE) |
4251 mask |= detect_coding_emacs_mule (src, src_end, multibytep); | 4292 mask |= detect_coding_emacs_mule (src, src_end, multibytep); |
4252 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL) | 4293 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL) |
4253 mask |= detect_coding_ccl (src, src_end, multibytep); | 4294 mask |= detect_coding_ccl (src, src_end, multibytep); |
4254 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT) | 4295 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT) |
4255 mask |= CODING_CATEGORY_MASK_RAW_TEXT; | 4296 { |
4297 if (latin_extra_code_state == 1) | |
4298 { | |
4299 /* Detection of ISO-2022 based coding system | |
4300 failed because of Latin extra codes. Before | |
4301 falling back to raw-text, try again with | |
4302 Latin extra codes allowed. */ | |
4303 latin_extra_code_state = 2; | |
4304 try = (mask | CODING_CATEGORY_MASK_ISO_8_ELSE | |
4305 | CODING_CATEGORY_MASK_ISO_8BIT); | |
4306 goto label_retry; | |
4307 } | |
4308 mask |= CODING_CATEGORY_MASK_RAW_TEXT; | |
4309 } | |
4256 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY) | 4310 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY) |
4257 mask |= CODING_CATEGORY_MASK_BINARY; | 4311 { |
4312 if (latin_extra_code_state == 1) | |
4313 { | |
4314 /* See the above comment. */ | |
4315 latin_extra_code_state = 2; | |
4316 try = (mask | CODING_CATEGORY_MASK_ISO_8_ELSE | |
4317 | CODING_CATEGORY_MASK_ISO_8BIT); | |
4318 goto label_retry; | |
4319 } | |
4320 mask |= CODING_CATEGORY_MASK_BINARY; | |
4321 } | |
4258 if (mask & priorities[i]) | 4322 if (mask & priorities[i]) |
4259 return priorities[i]; | 4323 return priorities[i]; |
4260 } | 4324 } |
4261 return CODING_CATEGORY_MASK_RAW_TEXT; | 4325 return CODING_CATEGORY_MASK_RAW_TEXT; |
4262 } | 4326 } |
4263 if (try & CODING_CATEGORY_MASK_ISO) | 4327 if (try & CODING_CATEGORY_MASK_ISO) |
4264 mask |= detect_coding_iso2022 (src, src_end, multibytep); | 4328 mask |= detect_coding_iso2022 (src, src_end, multibytep, |
4329 &latin_extra_code_state); | |
4265 if (try & CODING_CATEGORY_MASK_SJIS) | 4330 if (try & CODING_CATEGORY_MASK_SJIS) |
4266 mask |= detect_coding_sjis (src, src_end, multibytep); | 4331 mask |= detect_coding_sjis (src, src_end, multibytep); |
4267 if (try & CODING_CATEGORY_MASK_BIG5) | 4332 if (try & CODING_CATEGORY_MASK_BIG5) |
4268 mask |= detect_coding_big5 (src, src_end, multibytep); | 4333 mask |= detect_coding_big5 (src, src_end, multibytep); |
4269 if (try & CODING_CATEGORY_MASK_UTF_8) | 4334 if (try & CODING_CATEGORY_MASK_UTF_8) |