Mercurial > emacs
comparison src/coding.c @ 72395:94e4795b333d
(ONE_MORE_BYTE_CHECK_MULTIBYTE): New arg RET. If SRC
is exhausted, return with RET.
(detect_coding_emacs_mule, detect_coding_iso2022)
(detect_coding_sjis, detect_coding_big5, detect_coding_utf_8)
(detect_coding_utf_16, detect_coding_ccl): Adjusted for the above
change.
author | Kenichi Handa <handa@m17n.org> |
---|---|
date | Tue, 15 Aug 2006 02:41:29 +0000 |
parents | af796bc81ff0 |
children | 6493d4697ad2 694bbb62a75d |
comparison
equal
deleted
inserted
replaced
72394:bec9a701aee6 | 72395:94e4795b333d |
---|---|
217 c2 = *src++; \ | 217 c2 = *src++; \ |
218 } while (0) | 218 } while (0) |
219 | 219 |
220 | 220 |
221 /* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte | 221 /* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte |
222 form if MULTIBYTEP is nonzero. */ | 222 form if MULTIBYTEP is nonzero. In addition, if SRC is not less |
223 | 223 than SRC_END, return with RET. */ |
224 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep) \ | 224 |
| 225 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep, ret) \ |
225 do { \ | 226 do { \ |
226 if (src >= src_end) \ | 227 if (src >= src_end) \ |
227 { \ | 228 { \ |
228 coding->result = CODING_FINISH_INSUFFICIENT_SRC; \ | 229 coding->result = CODING_FINISH_INSUFFICIENT_SRC; \ |
229 goto label_end_of_loop; \ | 230 return ret; \ |
230 } \ | 231 } \ |
231 c1 = *src++; \ | 232 c1 = *src++; \ |
232 if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL) \ | 233 if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL) \ |
233 c1 = *src++ - 0x20; \ | 234 c1 = *src++ - 0x20; \ |
234 } while (0) | 235 } while (0) |
630 struct coding_system dummy_coding; | 631 struct coding_system dummy_coding; |
631 struct coding_system *coding = &dummy_coding; | 632 struct coding_system *coding = &dummy_coding; |
632 | 633 |
633 while (1) | 634 while (1) |
634 { | 635 { |
635 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 636 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, |
636 | 637 CODING_CATEGORY_MASK_EMACS_MULE); |
637 if (composing) | 638 if (composing) |
638 { | 639 { |
639 if (c < 0xA0) | 640 if (c < 0xA0) |
640 composing = 0; | 641 composing = 0; |
641 else if (c == 0xA0) | 642 else if (c == 0xA0) |
642 { | 643 { |
643 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 644 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0); |
644 c &= 0x7F; | 645 c &= 0x7F; |
645 } | 646 } |
646 else | 647 else |
647 c -= 0x20; | 648 c -= 0x20; |
648 } | 649 } |
667 return 0; | 668 return 0; |
668 src = src_base + bytes; | 669 src = src_base + bytes; |
669 } | 670 } |
670 } | 671 } |
671 } | 672 } |
672 label_end_of_loop: | |
673 return CODING_CATEGORY_MASK_EMACS_MULE; | |
674 } | 673 } |
675 | 674 |
676 | 675 |
677 /* Record the starting position START and METHOD of one composition. */ | 676 /* Record the starting position START and METHOD of one composition. */ |
678 | 677 |
1423 struct coding_system dummy_coding; | 1422 struct coding_system dummy_coding; |
1424 struct coding_system *coding = &dummy_coding; | 1423 struct coding_system *coding = &dummy_coding; |
1425 Lisp_Object safe_chars; | 1424 Lisp_Object safe_chars; |
1426 | 1425 |
1427 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1; | 1426 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1; |
1428 while (mask && src < src_end) | 1427 while (mask) |
1429 { | 1428 { |
1430 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 1429 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found); |
1431 retry: | 1430 retry: |
1432 switch (c) | 1431 switch (c) |
1433 { | 1432 { |
1434 case ISO_CODE_ESC: | 1433 case ISO_CODE_ESC: |
1435 if (inhibit_iso_escape_detection) | 1434 if (inhibit_iso_escape_detection) |
1436 break; | 1435 break; |
1437 single_shifting = 0; | 1436 single_shifting = 0; |
1438 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 1437 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found); |
1439 if (c >= '(' && c <= '/') | 1438 if (c >= '(' && c <= '/') |
1440 { | 1439 { |
1441 /* Designation sequence for a charset of dimension 1. */ | 1440 /* Designation sequence for a charset of dimension 1. */ |
1442 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep); | 1441 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep, mask & mask_found); |
1443 if (c1 < ' ' || c1 >= 0x80 | 1442 if (c1 < ' ' || c1 >= 0x80 |
1444 || (charset = iso_charset_table[0][c >= ','][c1]) < 0) | 1443 || (charset = iso_charset_table[0][c >= ','][c1]) < 0) |
1445 /* Invalid designation sequence. Just ignore. */ | 1444 /* Invalid designation sequence. Just ignore. */ |
1446 break; | 1445 break; |
1447 reg[(c - '(') % 4] = charset; | 1446 reg[(c - '(') % 4] = charset; |
1448 } | 1447 } |
1449 else if (c == '$') | 1448 else if (c == '$') |
1450 { | 1449 { |
1451 /* Designation sequence for a charset of dimension 2. */ | 1450 /* Designation sequence for a charset of dimension 2. */ |
1452 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 1451 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found); |
1453 if (c >= '@' && c <= 'B') | 1452 if (c >= '@' && c <= 'B') |
1454 /* Designation for JISX0208.1978, GB2312, or JISX0208. */ | 1453 /* Designation for JISX0208.1978, GB2312, or JISX0208. */ |
1455 reg[0] = charset = iso_charset_table[1][0][c]; | 1454 reg[0] = charset = iso_charset_table[1][0][c]; |
1456 else if (c >= '(' && c <= '/') | 1455 else if (c >= '(' && c <= '/') |
1457 { | 1456 { |
1458 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep); | 1457 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep, |
| 1458 mask & mask_found); |
1459 if (c1 < ' ' || c1 >= 0x80 | 1459 if (c1 < ' ' || c1 >= 0x80 |
1460 || (charset = iso_charset_table[1][c >= ','][c1]) < 0) | 1460 || (charset = iso_charset_table[1][c >= ','][c1]) < 0) |
1461 /* Invalid designation sequence. Just ignore. */ | 1461 /* Invalid designation sequence. Just ignore. */ |
1462 break; | 1462 break; |
1463 reg[(c - '(') % 4] = charset; | 1463 reg[(c - '(') % 4] = charset; |
1628 int i = 1; | 1628 int i = 1; |
1629 | 1629 |
1630 c = -1; | 1630 c = -1; |
1631 while (src < src_end) | 1631 while (src < src_end) |
1632 { | 1632 { |
1633 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 1633 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, |
| 1634 mask & mask_found); |
1634 if (c < 0xA0) | 1635 if (c < 0xA0) |
1635 break; | 1636 break; |
1636 i++; | 1637 i++; |
1637 } | 1638 } |
1638 | 1639 |
1646 } | 1647 } |
1647 } | 1648 } |
1648 break; | 1649 break; |
1649 } | 1650 } |
1650 } | 1651 } |
1651 label_end_of_loop: | |
1652 return (mask & mask_found); | 1652 return (mask & mask_found); |
1653 } | 1653 } |
1654 | 1654 |
1655 /* Decode a character of which charset is CHARSET, the 1st position | 1655 /* Decode a character of which charset is CHARSET, the 1st position |
1656 code is C1, the 2nd position code is C2, and return the decoded | 1656 code is C1, the 2nd position code is C2, and return the decoded |
2917 struct coding_system dummy_coding; | 2917 struct coding_system dummy_coding; |
2918 struct coding_system *coding = &dummy_coding; | 2918 struct coding_system *coding = &dummy_coding; |
2919 | 2919 |
2920 while (1) | 2920 while (1) |
2921 { | 2921 { |
2922 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 2922 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_SJIS); |
2923 if (c < 0x80) | 2923 if (c < 0x80) |
2924 continue; | 2924 continue; |
2925 if (c == 0x80 || c == 0xA0 || c > 0xEF) | 2925 if (c == 0x80 || c == 0xA0 || c > 0xEF) |
2926 return 0; | 2926 return 0; |
2927 if (c <= 0x9F || c >= 0xE0) | 2927 if (c <= 0x9F || c >= 0xE0) |
2928 { | 2928 { |
2929 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 2929 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0); |
2930 if (c < 0x40 || c == 0x7F || c > 0xFC) | 2930 if (c < 0x40 || c == 0x7F || c > 0xFC) |
2931 return 0; | 2931 return 0; |
2932 } | 2932 } |
2933 } | 2933 } |
2934 label_end_of_loop: | |
2935 return CODING_CATEGORY_MASK_SJIS; | |
2936 } | 2934 } |
2937 | 2935 |
2938 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". | 2936 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". |
2939 Check if a text is encoded in BIG5. If it is, return | 2937 Check if a text is encoded in BIG5. If it is, return |
2940 CODING_CATEGORY_MASK_BIG5, else return 0. */ | 2938 CODING_CATEGORY_MASK_BIG5, else return 0. */ |
2949 struct coding_system dummy_coding; | 2947 struct coding_system dummy_coding; |
2950 struct coding_system *coding = &dummy_coding; | 2948 struct coding_system *coding = &dummy_coding; |
2951 | 2949 |
2952 while (1) | 2950 while (1) |
2953 { | 2951 { |
2954 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 2952 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_BIG5); |
2955 if (c < 0x80) | 2953 if (c < 0x80) |
2956 continue; | 2954 continue; |
2957 if (c < 0xA1 || c > 0xFE) | 2955 if (c < 0xA1 || c > 0xFE) |
2958 return 0; | 2956 return 0; |
2959 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 2957 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0); |
2960 if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE) | 2958 if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE) |
2961 return 0; | 2959 return 0; |
2962 } | 2960 } |
2963 label_end_of_loop: | |
2964 return CODING_CATEGORY_MASK_BIG5; | |
2965 } | 2961 } |
2966 | 2962 |
2967 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". | 2963 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". |
2968 Check if a text is encoded in UTF-8. If it is, return | 2964 Check if a text is encoded in UTF-8. If it is, return |
2969 CODING_CATEGORY_MASK_UTF_8, else return 0. */ | 2965 CODING_CATEGORY_MASK_UTF_8, else return 0. */ |
2987 struct coding_system dummy_coding; | 2983 struct coding_system dummy_coding; |
2988 struct coding_system *coding = &dummy_coding; | 2984 struct coding_system *coding = &dummy_coding; |
2989 | 2985 |
2990 while (1) | 2986 while (1) |
2991 { | 2987 { |
2992 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 2988 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_UTF_8); |
2993 if (UTF_8_1_OCTET_P (c)) | 2989 if (UTF_8_1_OCTET_P (c)) |
2994 continue; | 2990 continue; |
2995 else if (UTF_8_2_OCTET_LEADING_P (c)) | 2991 else if (UTF_8_2_OCTET_LEADING_P (c)) |
2996 seq_maybe_bytes = 1; | 2992 seq_maybe_bytes = 1; |
2997 else if (UTF_8_3_OCTET_LEADING_P (c)) | 2993 else if (UTF_8_3_OCTET_LEADING_P (c)) |
3005 else | 3001 else |
3006 return 0; | 3002 return 0; |
3007 | 3003 |
3008 do | 3004 do |
3009 { | 3005 { |
3010 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 3006 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0); |
3011 if (!UTF_8_EXTRA_OCTET_P (c)) | 3007 if (!UTF_8_EXTRA_OCTET_P (c)) |
3012 return 0; | 3008 return 0; |
3013 seq_maybe_bytes--; | 3009 seq_maybe_bytes--; |
3014 } | 3010 } |
3015 while (seq_maybe_bytes > 0); | 3011 while (seq_maybe_bytes > 0); |
3016 } | 3012 } |
3017 | |
3018 label_end_of_loop: | |
3019 return CODING_CATEGORY_MASK_UTF_8; | |
3020 } | 3013 } |
3021 | 3014 |
3022 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". | 3015 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". |
3023 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or | 3016 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or |
3024 Little Endian (otherwise). If it is, return | 3017 Little Endian (otherwise). If it is, return |
3043 unsigned char c1, c2; | 3036 unsigned char c1, c2; |
3044 /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE. */ | 3037 /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE. */ |
3045 struct coding_system dummy_coding; | 3038 struct coding_system dummy_coding; |
3046 struct coding_system *coding = &dummy_coding; | 3039 struct coding_system *coding = &dummy_coding; |
3047 | 3040 |
3048 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep); | 3041 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep, 0); |
3049 ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep); | 3042 ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep, 0); |
3050 | 3043 |
3051 if ((c1 == 0xFF) && (c2 == 0xFE)) | 3044 if ((c1 == 0xFF) && (c2 == 0xFE)) |
3052 return CODING_CATEGORY_MASK_UTF_16_LE; | 3045 return CODING_CATEGORY_MASK_UTF_16_LE; |
3053 else if ((c1 == 0xFE) && (c2 == 0xFF)) | 3046 else if ((c1 == 0xFE) && (c2 == 0xFF)) |
3054 return CODING_CATEGORY_MASK_UTF_16_BE; | 3047 return CODING_CATEGORY_MASK_UTF_16_BE; |
3055 | |
3056 label_end_of_loop: | |
3057 return 0; | 3048 return 0; |
3058 } | 3049 } |
3059 | 3050 |
3060 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". | 3051 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". |
3061 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */ | 3052 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */ |
3320 return 0; | 3311 return 0; |
3321 | 3312 |
3322 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes; | 3313 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes; |
3323 while (1) | 3314 while (1) |
3324 { | 3315 { |
3325 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 3316 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_CCL); |
3326 if (! valid[c]) | 3317 if (! valid[c]) |
3327 return 0; | 3318 return 0; |
3328 } | 3319 } |
3329 label_end_of_loop: | |
3330 return CODING_CATEGORY_MASK_CCL; | |
3331 } | 3320 } |
3332 | 3321 |
3333 | 3322 |
3334 /*** 6. End-of-line handlers ***/ | 3323 /*** 6. End-of-line handlers ***/ |
3335 | 3324 |