comparison src/coding.c @ 72395:94e4795b333d

(ONE_MORE_BYTE_CHECK_MULTIBYTE): New arg RET. If SRC is exhausted, return with RET. (detect_coding_emacs_mule, detect_coding_iso2022) (detect_coding_sjis, detect_coding_big5, detect_coding_utf_8) (detect_coding_utf_16, detect_coding_ccl): Adjusted for the above change.
author Kenichi Handa <handa@m17n.org>
date Tue, 15 Aug 2006 02:41:29 +0000
parents af796bc81ff0
children 6493d4697ad2 694bbb62a75d
comparison
equal deleted inserted replaced
72394:bec9a701aee6 72395:94e4795b333d
217 c2 = *src++; \ 217 c2 = *src++; \
218 } while (0) 218 } while (0)
219 219
220 220
221 /* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte 221 /* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
222 form if MULTIBYTEP is nonzero. */ 222 form if MULTIBYTEP is nonzero. In addition, if SRC is not less
223 223 than SRC_END, return with RET. */
224 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep) \ 224
225 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep, ret) \
225 do { \ 226 do { \
226 if (src >= src_end) \ 227 if (src >= src_end) \
227 { \ 228 { \
228 coding->result = CODING_FINISH_INSUFFICIENT_SRC; \ 229 coding->result = CODING_FINISH_INSUFFICIENT_SRC; \
229 goto label_end_of_loop; \ 230 return ret; \
230 } \ 231 } \
231 c1 = *src++; \ 232 c1 = *src++; \
232 if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL) \ 233 if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL) \
233 c1 = *src++ - 0x20; \ 234 c1 = *src++ - 0x20; \
234 } while (0) 235 } while (0)
630 struct coding_system dummy_coding; 631 struct coding_system dummy_coding;
631 struct coding_system *coding = &dummy_coding; 632 struct coding_system *coding = &dummy_coding;
632 633
633 while (1) 634 while (1)
634 { 635 {
635 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); 636 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep,
636 637 CODING_CATEGORY_MASK_EMACS_MULE);
637 if (composing) 638 if (composing)
638 { 639 {
639 if (c < 0xA0) 640 if (c < 0xA0)
640 composing = 0; 641 composing = 0;
641 else if (c == 0xA0) 642 else if (c == 0xA0)
642 { 643 {
643 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); 644 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
644 c &= 0x7F; 645 c &= 0x7F;
645 } 646 }
646 else 647 else
647 c -= 0x20; 648 c -= 0x20;
648 } 649 }
667 return 0; 668 return 0;
668 src = src_base + bytes; 669 src = src_base + bytes;
669 } 670 }
670 } 671 }
671 } 672 }
672 label_end_of_loop:
673 return CODING_CATEGORY_MASK_EMACS_MULE;
674 } 673 }
675 674
676 675
677 /* Record the starting position START and METHOD of one composition. */ 676 /* Record the starting position START and METHOD of one composition. */
678 677
1423 struct coding_system dummy_coding; 1422 struct coding_system dummy_coding;
1424 struct coding_system *coding = &dummy_coding; 1423 struct coding_system *coding = &dummy_coding;
1425 Lisp_Object safe_chars; 1424 Lisp_Object safe_chars;
1426 1425
1427 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1; 1426 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1428 while (mask && src < src_end) 1427 while (mask)
1429 { 1428 {
1430 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); 1429 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1431 retry: 1430 retry:
1432 switch (c) 1431 switch (c)
1433 { 1432 {
1434 case ISO_CODE_ESC: 1433 case ISO_CODE_ESC:
1435 if (inhibit_iso_escape_detection) 1434 if (inhibit_iso_escape_detection)
1436 break; 1435 break;
1437 single_shifting = 0; 1436 single_shifting = 0;
1438 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); 1437 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1439 if (c >= '(' && c <= '/') 1438 if (c >= '(' && c <= '/')
1440 { 1439 {
1441 /* Designation sequence for a charset of dimension 1. */ 1440 /* Designation sequence for a charset of dimension 1. */
1442 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep); 1441 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep, mask & mask_found);
1443 if (c1 < ' ' || c1 >= 0x80 1442 if (c1 < ' ' || c1 >= 0x80
1444 || (charset = iso_charset_table[0][c >= ','][c1]) < 0) 1443 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1445 /* Invalid designation sequence. Just ignore. */ 1444 /* Invalid designation sequence. Just ignore. */
1446 break; 1445 break;
1447 reg[(c - '(') % 4] = charset; 1446 reg[(c - '(') % 4] = charset;
1448 } 1447 }
1449 else if (c == '$') 1448 else if (c == '$')
1450 { 1449 {
1451 /* Designation sequence for a charset of dimension 2. */ 1450 /* Designation sequence for a charset of dimension 2. */
1452 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); 1451 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1453 if (c >= '@' && c <= 'B') 1452 if (c >= '@' && c <= 'B')
1454 /* Designation for JISX0208.1978, GB2312, or JISX0208. */ 1453 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
1455 reg[0] = charset = iso_charset_table[1][0][c]; 1454 reg[0] = charset = iso_charset_table[1][0][c];
1456 else if (c >= '(' && c <= '/') 1455 else if (c >= '(' && c <= '/')
1457 { 1456 {
1458 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep); 1457 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep,
1458 mask & mask_found);
1459 if (c1 < ' ' || c1 >= 0x80 1459 if (c1 < ' ' || c1 >= 0x80
1460 || (charset = iso_charset_table[1][c >= ','][c1]) < 0) 1460 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1461 /* Invalid designation sequence. Just ignore. */ 1461 /* Invalid designation sequence. Just ignore. */
1462 break; 1462 break;
1463 reg[(c - '(') % 4] = charset; 1463 reg[(c - '(') % 4] = charset;
1628 int i = 1; 1628 int i = 1;
1629 1629
1630 c = -1; 1630 c = -1;
1631 while (src < src_end) 1631 while (src < src_end)
1632 { 1632 {
1633 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); 1633 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep,
1634 mask & mask_found);
1634 if (c < 0xA0) 1635 if (c < 0xA0)
1635 break; 1636 break;
1636 i++; 1637 i++;
1637 } 1638 }
1638 1639
1646 } 1647 }
1647 } 1648 }
1648 break; 1649 break;
1649 } 1650 }
1650 } 1651 }
1651 label_end_of_loop:
1652 return (mask & mask_found); 1652 return (mask & mask_found);
1653 } 1653 }
1654 1654
1655 /* Decode a character of which charset is CHARSET, the 1st position 1655 /* Decode a character of which charset is CHARSET, the 1st position
1656 code is C1, the 2nd position code is C2, and return the decoded 1656 code is C1, the 2nd position code is C2, and return the decoded
2917 struct coding_system dummy_coding; 2917 struct coding_system dummy_coding;
2918 struct coding_system *coding = &dummy_coding; 2918 struct coding_system *coding = &dummy_coding;
2919 2919
2920 while (1) 2920 while (1)
2921 { 2921 {
2922 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); 2922 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_SJIS);
2923 if (c < 0x80) 2923 if (c < 0x80)
2924 continue; 2924 continue;
2925 if (c == 0x80 || c == 0xA0 || c > 0xEF) 2925 if (c == 0x80 || c == 0xA0 || c > 0xEF)
2926 return 0; 2926 return 0;
2927 if (c <= 0x9F || c >= 0xE0) 2927 if (c <= 0x9F || c >= 0xE0)
2928 { 2928 {
2929 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); 2929 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
2930 if (c < 0x40 || c == 0x7F || c > 0xFC) 2930 if (c < 0x40 || c == 0x7F || c > 0xFC)
2931 return 0; 2931 return 0;
2932 } 2932 }
2933 } 2933 }
2934 label_end_of_loop:
2935 return CODING_CATEGORY_MASK_SJIS;
2936 } 2934 }
2937 2935
2938 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". 2936 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2939 Check if a text is encoded in BIG5. If it is, return 2937 Check if a text is encoded in BIG5. If it is, return
2940 CODING_CATEGORY_MASK_BIG5, else return 0. */ 2938 CODING_CATEGORY_MASK_BIG5, else return 0. */
2949 struct coding_system dummy_coding; 2947 struct coding_system dummy_coding;
2950 struct coding_system *coding = &dummy_coding; 2948 struct coding_system *coding = &dummy_coding;
2951 2949
2952 while (1) 2950 while (1)
2953 { 2951 {
2954 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); 2952 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_BIG5);
2955 if (c < 0x80) 2953 if (c < 0x80)
2956 continue; 2954 continue;
2957 if (c < 0xA1 || c > 0xFE) 2955 if (c < 0xA1 || c > 0xFE)
2958 return 0; 2956 return 0;
2959 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); 2957 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
2960 if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE) 2958 if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2961 return 0; 2959 return 0;
2962 } 2960 }
2963 label_end_of_loop:
2964 return CODING_CATEGORY_MASK_BIG5;
2965 } 2961 }
2966 2962
2967 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". 2963 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2968 Check if a text is encoded in UTF-8. If it is, return 2964 Check if a text is encoded in UTF-8. If it is, return
2969 CODING_CATEGORY_MASK_UTF_8, else return 0. */ 2965 CODING_CATEGORY_MASK_UTF_8, else return 0. */
2987 struct coding_system dummy_coding; 2983 struct coding_system dummy_coding;
2988 struct coding_system *coding = &dummy_coding; 2984 struct coding_system *coding = &dummy_coding;
2989 2985
2990 while (1) 2986 while (1)
2991 { 2987 {
2992 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); 2988 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_UTF_8);
2993 if (UTF_8_1_OCTET_P (c)) 2989 if (UTF_8_1_OCTET_P (c))
2994 continue; 2990 continue;
2995 else if (UTF_8_2_OCTET_LEADING_P (c)) 2991 else if (UTF_8_2_OCTET_LEADING_P (c))
2996 seq_maybe_bytes = 1; 2992 seq_maybe_bytes = 1;
2997 else if (UTF_8_3_OCTET_LEADING_P (c)) 2993 else if (UTF_8_3_OCTET_LEADING_P (c))
3005 else 3001 else
3006 return 0; 3002 return 0;
3007 3003
3008 do 3004 do
3009 { 3005 {
3010 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); 3006 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
3011 if (!UTF_8_EXTRA_OCTET_P (c)) 3007 if (!UTF_8_EXTRA_OCTET_P (c))
3012 return 0; 3008 return 0;
3013 seq_maybe_bytes--; 3009 seq_maybe_bytes--;
3014 } 3010 }
3015 while (seq_maybe_bytes > 0); 3011 while (seq_maybe_bytes > 0);
3016 } 3012 }
3017
3018 label_end_of_loop:
3019 return CODING_CATEGORY_MASK_UTF_8;
3020 } 3013 }
3021 3014
3022 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". 3015 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3023 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or 3016 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or
3024 Little Endian (otherwise). If it is, return 3017 Little Endian (otherwise). If it is, return
3043 unsigned char c1, c2; 3036 unsigned char c1, c2;
3044 /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE. */ 3037 /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE. */
3045 struct coding_system dummy_coding; 3038 struct coding_system dummy_coding;
3046 struct coding_system *coding = &dummy_coding; 3039 struct coding_system *coding = &dummy_coding;
3047 3040
3048 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep); 3041 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep, 0);
3049 ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep); 3042 ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep, 0);
3050 3043
3051 if ((c1 == 0xFF) && (c2 == 0xFE)) 3044 if ((c1 == 0xFF) && (c2 == 0xFE))
3052 return CODING_CATEGORY_MASK_UTF_16_LE; 3045 return CODING_CATEGORY_MASK_UTF_16_LE;
3053 else if ((c1 == 0xFE) && (c2 == 0xFF)) 3046 else if ((c1 == 0xFE) && (c2 == 0xFF))
3054 return CODING_CATEGORY_MASK_UTF_16_BE; 3047 return CODING_CATEGORY_MASK_UTF_16_BE;
3055
3056 label_end_of_loop:
3057 return 0; 3048 return 0;
3058 } 3049 }
3059 3050
3060 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". 3051 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3061 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */ 3052 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */
3320 return 0; 3311 return 0;
3321 3312
3322 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes; 3313 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3323 while (1) 3314 while (1)
3324 { 3315 {
3325 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); 3316 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_CCL);
3326 if (! valid[c]) 3317 if (! valid[c])
3327 return 0; 3318 return 0;
3328 } 3319 }
3329 label_end_of_loop:
3330 return CODING_CATEGORY_MASK_CCL;
3331 } 3320 }
3332 3321
3333 3322
3334 /*** 6. End-of-line handlers ***/ 3323 /*** 6. End-of-line handlers ***/
3335 3324