comparison src/coding.c @ 87676:2aeceff24280

(detect_coding_iso2022): New arg latin_extra_code_state. Allow Latin extra codes only when *latin_extra_code_state is nonzero. (detect_coding_mask): If there is a NULL byte, detect the encoding as UTF-16 or binary. If there is a Latin extra code, detect the encoding as ISO-2022 only when no other proper encoding is found.
author Kenichi Handa <handa@m17n.org>
date Wed, 09 Jan 2008 06:05:23 +0000
parents 107ccd98fa12
children df9e1c663162
comparison
equal deleted inserted replaced
87675:e4a11c2d5016 87676:2aeceff24280
1404 CODING_CATEGORY_MASK_ISO_8_1 1404 CODING_CATEGORY_MASK_ISO_8_1
1405 CODING_CATEGORY_MASK_ISO_8_2 1405 CODING_CATEGORY_MASK_ISO_8_2
1406 CODING_CATEGORY_MASK_ISO_7_ELSE 1406 CODING_CATEGORY_MASK_ISO_7_ELSE
1407 CODING_CATEGORY_MASK_ISO_8_ELSE 1407 CODING_CATEGORY_MASK_ISO_8_ELSE
1408 are set. If a code which should never appear in ISO2022 is found, 1408 are set. If a code which should never appear in ISO2022 is found,
1409 returns 0. */ 1409 returns 0.
1410
1411 If *latin_extra_code_state is zero and Latin extra codes are found,
1412 set *latin_extra_code_state to 1 and return 0. If it is nonzero,
1413 accept Latin extra codes. */
1410 1414
1411 static int 1415 static int
1412 detect_coding_iso2022 (src, src_end, multibytep) 1416 detect_coding_iso2022 (src, src_end, multibytep, latin_extra_code_state)
1413 unsigned char *src, *src_end; 1417 unsigned char *src, *src_end;
1414 int multibytep; 1418 int multibytep;
1419 int *latin_extra_code_state;
1415 { 1420 {
1416 int mask = CODING_CATEGORY_MASK_ISO; 1421 int mask = CODING_CATEGORY_MASK_ISO;
1417 int mask_found = 0; 1422 int mask_found = 0;
1418 int reg[4], shift_out = 0, single_shifting = 0; 1423 int reg[4], shift_out = 0, single_shifting = 0;
1419 int c, c1, charset; 1424 int c, c1, charset;
1572 single_shifting = 1; 1577 single_shifting = 1;
1573 } 1578 }
1574 if (VECTORP (Vlatin_extra_code_table) 1579 if (VECTORP (Vlatin_extra_code_table)
1575 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) 1580 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1576 { 1581 {
1582 if (! *latin_extra_code_state)
1583 {
1584 *latin_extra_code_state = 1;
1585 return 0;
1586 }
1577 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags 1587 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1578 & CODING_FLAG_ISO_LATIN_EXTRA) 1588 & CODING_FLAG_ISO_LATIN_EXTRA)
1579 newmask |= CODING_CATEGORY_MASK_ISO_8_1; 1589 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1580 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags 1590 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1581 & CODING_FLAG_ISO_LATIN_EXTRA) 1591 & CODING_FLAG_ISO_LATIN_EXTRA)
1598 if (VECTORP (Vlatin_extra_code_table) 1608 if (VECTORP (Vlatin_extra_code_table)
1599 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) 1609 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1600 { 1610 {
1601 int newmask = 0; 1611 int newmask = 0;
1602 1612
1613 if (! *latin_extra_code_state)
1614 {
1615 *latin_extra_code_state = 1;
1616 return 0;
1617 }
1603 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags 1618 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1604 & CODING_FLAG_ISO_LATIN_EXTRA) 1619 & CODING_FLAG_ISO_LATIN_EXTRA)
1605 newmask |= CODING_CATEGORY_MASK_ISO_8_1; 1620 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1606 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags 1621 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1607 & CODING_FLAG_ISO_LATIN_EXTRA) 1622 & CODING_FLAG_ISO_LATIN_EXTRA)
4125 { 4140 {
4126 register unsigned char c; 4141 register unsigned char c;
4127 unsigned char *src = source, *src_end = source + src_bytes; 4142 unsigned char *src = source, *src_end = source + src_bytes;
4128 unsigned int mask, utf16_examined_p, iso2022_examined_p; 4143 unsigned int mask, utf16_examined_p, iso2022_examined_p;
4129 int i; 4144 int i;
4145 int null_byte_found;
4146 int latin_extra_code_state = 1;
4130 4147
4131 /* At first, skip all ASCII characters and control characters except 4148 /* At first, skip all ASCII characters and control characters except
4132 for three ISO2022 specific control characters. */ 4149 for three ISO2022 specific control characters. */
4133 ascii_skip_code[ISO_CODE_SO] = 0; 4150 ascii_skip_code[ISO_CODE_SO] = 0;
4134 ascii_skip_code[ISO_CODE_SI] = 0; 4151 ascii_skip_code[ISO_CODE_SI] = 0;
4135 ascii_skip_code[ISO_CODE_ESC] = 0; 4152 ascii_skip_code[ISO_CODE_ESC] = 0;
4136 4153
4137 label_loop_detect_coding: 4154 label_loop_detect_coding:
4138 while (src < src_end && ascii_skip_code[*src]) src++; 4155 null_byte_found = 0;
4156 while (src < src_end && ascii_skip_code[*src])
4157 null_byte_found |= (! *src++);
4158 if (! null_byte_found)
4159 {
4160 unsigned char *p = src + 1;
4161 while (p < src_end)
4162 null_byte_found |= (! *p++);
4163 }
4139 *skip = src - source; 4164 *skip = src - source;
4140 4165
4141 if (src >= src_end) 4166 if (src >= src_end)
4142 /* We found nothing other than ASCII. There's nothing to do. */ 4167 /* We found nothing other than ASCII (and NULL byte). There's
4168 nothing to do. */
4143 return 0; 4169 return 0;
4144 4170
4145 c = *src; 4171 c = *src;
4146 /* The text seems to be encoded in some multilingual coding system. 4172 /* The text seems to be encoded in some multilingual coding system.
4147 Now, try to find in which coding system the text is encoded. */ 4173 Now, try to find in which coding system the text is encoded. */
4148 if (c < 0x80) 4174 if (! null_byte_found && c < 0x80)
4149 { 4175 {
4150 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */ 4176 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
4151 /* C is an ISO2022 specific control code of C0. */ 4177 /* C is an ISO2022 specific control code of C0. */
4152 mask = detect_coding_iso2022 (src, src_end, multibytep); 4178 latin_extra_code_state = 1;
4179 mask = detect_coding_iso2022 (src, src_end, multibytep,
4180 &latin_extra_code_state);
4153 if (mask == 0) 4181 if (mask == 0)
4154 { 4182 {
4155 /* No valid ISO2022 code follows C. Try again. */ 4183 /* No valid ISO2022 code follows C. Try again. */
4156 src++; 4184 src++;
4157 if (c == ISO_CODE_ESC) 4185 if (c == ISO_CODE_ESC)
4175 int try; 4203 int try;
4176 4204
4177 if (multibytep && c == LEADING_CODE_8_BIT_CONTROL) 4205 if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
4178 c = src[1] - 0x20; 4206 c = src[1] - 0x20;
4179 4207
4180 if (c < 0xA0) 4208 if (null_byte_found)
4209 {
4210 try = (CODING_CATEGORY_MASK_UTF_16_BE
4211 | CODING_CATEGORY_MASK_UTF_16_LE);
4212 }
4213 else if (c < 0xA0)
4181 { 4214 {
4182 /* C is the first byte of SJIS character code, 4215 /* C is the first byte of SJIS character code,
4183 or a leading-code of Emacs' internal format (emacs-mule), 4216 or a leading-code of Emacs' internal format (emacs-mule),
4184 or the first byte of UTF-16. */ 4217 or the first byte of UTF-16. */
4185 try = (CODING_CATEGORY_MASK_SJIS 4218 try = (CODING_CATEGORY_MASK_SJIS
4186 | CODING_CATEGORY_MASK_EMACS_MULE 4219 | CODING_CATEGORY_MASK_EMACS_MULE
4187 | CODING_CATEGORY_MASK_UTF_16_BE 4220 | CODING_CATEGORY_MASK_UTF_16_BE
4188 | CODING_CATEGORY_MASK_UTF_16_LE); 4221 | CODING_CATEGORY_MASK_UTF_16_LE);
4189 4222
4190 /* Or, if C is a special latin extra code, 4223 /* Or, if C is a special latin extra code,
4191 or is an ISO2022 specific control code of C1 (SS2 or SS3), 4224 or is an ISO2022 specific control code of C1 (SS2 or SS3),
4192 or is an ISO2022 control-sequence-introducer (CSI), 4225 or is an ISO2022 control-sequence-introducer (CSI),
4193 we should also consider the possibility of ISO2022 codings. */ 4226 we should also consider the possibility of ISO2022 codings. */
4194 if ((VECTORP (Vlatin_extra_code_table) 4227 if ((latin_extra_code_state
4228 && VECTORP (Vlatin_extra_code_table)
4195 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) 4229 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4196 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3) 4230 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4197 || (c == ISO_CODE_CSI 4231 || (c == ISO_CODE_CSI
4198 && (src < src_end 4232 && (src < src_end
4199 && (*src == ']' 4233 && (*src == ']'
4200 || ((*src == '0' || *src == '1' || *src == '2') 4234 || ((*src == '0' || *src == '1' || *src == '2')
4201 && src + 1 < src_end 4235 && src + 1 < src_end
4202 && src[1] == ']'))))) 4236 && src[1] == ']')))))
4203 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE 4237 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4204 | CODING_CATEGORY_MASK_ISO_8BIT); 4238 | CODING_CATEGORY_MASK_ISO_8BIT);
4205 } 4239 }
4206 else 4240 else
4207 /* C is a character of ISO2022 in graphic plane right, 4241 /* C is a character of ISO2022 in graphic plane right,
4208 or a SJIS's 1-byte character code (i.e. JISX0201), 4242 or a SJIS's 1-byte character code (i.e. JISX0201),
4209 or the first byte of BIG5's 2-byte code, 4243 or the first byte of BIG5's 2-byte code,
4210 or the first byte of UTF-8/16. */ 4244 or the first byte of UTF-8/16. */
4211 try = (CODING_CATEGORY_MASK_ISO_8_ELSE 4245 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4212 | CODING_CATEGORY_MASK_ISO_8BIT 4246 | CODING_CATEGORY_MASK_ISO_8BIT
4213 | CODING_CATEGORY_MASK_SJIS 4247 | CODING_CATEGORY_MASK_SJIS
4214 | CODING_CATEGORY_MASK_BIG5 4248 | CODING_CATEGORY_MASK_BIG5
4215 | CODING_CATEGORY_MASK_UTF_8 4249 | CODING_CATEGORY_MASK_UTF_8
4216 | CODING_CATEGORY_MASK_UTF_16_BE 4250 | CODING_CATEGORY_MASK_UTF_16_BE
4217 | CODING_CATEGORY_MASK_UTF_16_LE); 4251 | CODING_CATEGORY_MASK_UTF_16_LE);
4218 4252
4219 /* Or, we may have to consider the possibility of CCL. */ 4253 /* Or, we may have to consider the possibility of CCL. */
4220 if (coding_system_table[CODING_CATEGORY_IDX_CCL] 4254 if (! null_byte_found
4255 && coding_system_table[CODING_CATEGORY_IDX_CCL]
4221 && (coding_system_table[CODING_CATEGORY_IDX_CCL] 4256 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4222 ->spec.ccl.valid_codes)[c]) 4257 ->spec.ccl.valid_codes)[c])
4223 try |= CODING_CATEGORY_MASK_CCL; 4258 try |= CODING_CATEGORY_MASK_CCL;
4224 4259
4225 mask = 0; 4260 mask = 0;
4226 utf16_examined_p = iso2022_examined_p = 0;
4227 if (priorities) 4261 if (priorities)
4228 { 4262 {
4263 /* At first try detection with Latin extra codes not-allowed.
4264 If no proper coding system is found because of Latin extra
4265 codes, try detection with Latin extra codes allowed. */
4266 latin_extra_code_state = 0;
4267 label_retry:
4268 utf16_examined_p = iso2022_examined_p = 0;
4229 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++) 4269 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4230 { 4270 {
4231 if (!iso2022_examined_p 4271 if (!iso2022_examined_p
4232 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO)) 4272 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4233 { 4273 {
4234 mask |= detect_coding_iso2022 (src, src_end, multibytep); 4274 mask |= detect_coding_iso2022 (src, src_end, multibytep,
4275 &latin_extra_code_state);
4235 iso2022_examined_p = 1; 4276 iso2022_examined_p = 1;
4236 } 4277 }
4237 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS) 4278 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4238 mask |= detect_coding_sjis (src, src_end, multibytep); 4279 mask |= detect_coding_sjis (src, src_end, multibytep);
4239 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8) 4280 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4250 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE) 4291 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4251 mask |= detect_coding_emacs_mule (src, src_end, multibytep); 4292 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4252 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL) 4293 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4253 mask |= detect_coding_ccl (src, src_end, multibytep); 4294 mask |= detect_coding_ccl (src, src_end, multibytep);
4254 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT) 4295 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4255 mask |= CODING_CATEGORY_MASK_RAW_TEXT; 4296 {
4297 if (latin_extra_code_state == 1)
4298 {
4299 /* Detection of ISO-2022 based coding system
4300 failed because of Latin extra codes. Before
4301 falling back to raw-text, try again with
4302 Latin extra codes allowed. */
4303 latin_extra_code_state = 2;
4304 try = (mask | CODING_CATEGORY_MASK_ISO_8_ELSE
4305 | CODING_CATEGORY_MASK_ISO_8BIT);
4306 goto label_retry;
4307 }
4308 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4309 }
4256 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY) 4310 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4257 mask |= CODING_CATEGORY_MASK_BINARY; 4311 {
4312 if (latin_extra_code_state == 1)
4313 {
4314 /* See the above comment. */
4315 latin_extra_code_state = 2;
4316 try = (mask | CODING_CATEGORY_MASK_ISO_8_ELSE
4317 | CODING_CATEGORY_MASK_ISO_8BIT);
4318 goto label_retry;
4319 }
4320 mask |= CODING_CATEGORY_MASK_BINARY;
4321 }
4258 if (mask & priorities[i]) 4322 if (mask & priorities[i])
4259 return priorities[i]; 4323 return priorities[i];
4260 } 4324 }
4261 return CODING_CATEGORY_MASK_RAW_TEXT; 4325 return CODING_CATEGORY_MASK_RAW_TEXT;
4262 } 4326 }
4263 if (try & CODING_CATEGORY_MASK_ISO) 4327 if (try & CODING_CATEGORY_MASK_ISO)
4264 mask |= detect_coding_iso2022 (src, src_end, multibytep); 4328 mask |= detect_coding_iso2022 (src, src_end, multibytep,
4329 &latin_extra_code_state);
4265 if (try & CODING_CATEGORY_MASK_SJIS) 4330 if (try & CODING_CATEGORY_MASK_SJIS)
4266 mask |= detect_coding_sjis (src, src_end, multibytep); 4331 mask |= detect_coding_sjis (src, src_end, multibytep);
4267 if (try & CODING_CATEGORY_MASK_BIG5) 4332 if (try & CODING_CATEGORY_MASK_BIG5)
4268 mask |= detect_coding_big5 (src, src_end, multibytep); 4333 mask |= detect_coding_big5 (src, src_end, multibytep);
4269 if (try & CODING_CATEGORY_MASK_UTF_8) 4334 if (try & CODING_CATEGORY_MASK_UTF_8)