comparison src/coding.c @ 19612:783efd6c7c1e

(Qno_conversion, Qundecided): New variables. (syms_of_coding): Initialize and staticpro them. (coding_category_name): Include "coding-category-raw-text". (setup_coding_system): Handle coding_type_raw_text. (detect_coding_mask): Include CODING_CATEGORY_MASK_RAW_TEXT in the return value instead of CODING_CATEGORY_MASK_BINARY. (detect_coding): Do not check the case that `mask' is 0, which never happens now. (detect_eol_type): If EOL format is inconsistent, return CODING_EOL_INCONSISTENT. (detect_eol): If EOL format of raw-text file is inconsistent, detect it as no-conversion. (decode_coding): Handle coding_type_raw_text. (encode_coding): Likewise. (Fdetect_coding_region): Adjusted for the above changes. (shrink_conversion_area): Handle coding_type_raw_text.
author Kenichi Handa <handa@m17n.org>
date Thu, 28 Aug 1997 10:51:12 +0000
parents 38c46419910a
children 666288d82ae1
comparison
equal deleted inserted replaced
19611:79c3bdba351b 19612:783efd6c7c1e
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are 65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
66 described in section 4. In this file, when we write "BIG5" 66 described in section 4. In this file, when we write "BIG5"
67 (all uppercase), we mean the coding system, and when we write 67 (all uppercase), we mean the coding system, and when we write
68 "Big5" (capitalized), we mean the character set. 68 "Big5" (capitalized), we mean the character set.
69 69
70 4. Other 70 4. Raw text
71
72 A coding system for a text containing random 8-bit code. Emacs
73 does no code conversion on such a text except for end-of-line
74 format.
75
76 5. Other
71 77
72 If a user wants to read/write a text encoded in a coding system not 78 If a user wants to read/write a text encoded in a coding system not
73 listed above, he can supply a decoder and an encoder for it in CCL 79 listed above, he can supply a decoder and an encoder for it in CCL
74 (Code Conversion Language) programs. Emacs executes the CCL program 80 (Code Conversion Language) programs. Emacs executes the CCL program
75 while reading/writing. 81 while reading/writing.
244 #endif /* not emacs */ 250 #endif /* not emacs */
245 251
246 Lisp_Object Qcoding_system, Qeol_type; 252 Lisp_Object Qcoding_system, Qeol_type;
247 Lisp_Object Qbuffer_file_coding_system; 253 Lisp_Object Qbuffer_file_coding_system;
248 Lisp_Object Qpost_read_conversion, Qpre_write_conversion; 254 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
255 Lisp_Object Qno_conversion, Qundecided;
249 256
250 extern Lisp_Object Qinsert_file_contents, Qwrite_region; 257 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
251 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument; 258 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
252 Lisp_Object Qstart_process, Qopen_network_stream; 259 Lisp_Object Qstart_process, Qopen_network_stream;
253 Lisp_Object Qtarget_idx; 260 Lisp_Object Qtarget_idx;
317 "coding-category-iso-8-1", 324 "coding-category-iso-8-1",
318 "coding-category-iso-8-2", 325 "coding-category-iso-8-2",
319 "coding-category-iso-7-else", 326 "coding-category-iso-7-else",
320 "coding-category-iso-8-else", 327 "coding-category-iso-8-else",
321 "coding-category-big5", 328 "coding-category-big5",
329 "coding-category-raw-text",
322 "coding-category-binary" 330 "coding-category-binary"
323 }; 331 };
324 332
325 /* Flag to tell if we look up unification table on character code 333 /* Flag to tell if we look up unification table on character code
326 conversion. */ 334 conversion. */
2544 goto label_invalid_coding_system; 2552 goto label_invalid_coding_system;
2545 } 2553 }
2546 coding->require_flushing = 1; 2554 coding->require_flushing = 1;
2547 break; 2555 break;
2548 2556
2557 case 5:
2558 coding->type = coding_type_raw_text;
2559 break;
2560
2549 default: 2561 default:
2550 if (EQ (type, Qt)) 2562 if (EQ (type, Qt))
2551 coding->type = coding_type_undecided; 2563 coding->type = coding_type_undecided;
2552 else 2564 else
2553 coding->type = coding_type_no_conversion; 2565 coding->type = coding_type_no_conversion;
2685 else if (c < 0xA0) 2697 else if (c < 0xA0)
2686 { 2698 {
2687 /* If C is a special latin extra code, 2699 /* If C is a special latin extra code,
2688 or is an ISO2022 specific control code of C1 (SS2 or SS3), 2700 or is an ISO2022 specific control code of C1 (SS2 or SS3),
2689 or is an ISO2022 control-sequence-introducer (CSI), 2701 or is an ISO2022 control-sequence-introducer (CSI),
2690 we should also consider the possibility of someof ISO2022 codings. */ 2702 we should also consider the possibility of ISO2022 codings. */
2691 if ((VECTORP (Vlatin_extra_code_table) 2703 if ((VECTORP (Vlatin_extra_code_table)
2692 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) 2704 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2693 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3) 2705 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
2694 || (c == ISO_CODE_CSI 2706 || (c == ISO_CODE_CSI
2695 && (src < src_end 2707 && (src < src_end
2698 && src[1] == ']' 2710 && src[1] == ']'
2699 && (*src == '0' || *src == '1' || *src == '2')))))) 2711 && (*src == '0' || *src == '1' || *src == '2'))))))
2700 mask = (detect_coding_iso2022 (src, src_end) 2712 mask = (detect_coding_iso2022 (src, src_end)
2701 | detect_coding_sjis (src, src_end) 2713 | detect_coding_sjis (src, src_end)
2702 | detect_coding_emacs_mule (src, src_end) 2714 | detect_coding_emacs_mule (src, src_end)
2703 | CODING_CATEGORY_MASK_BINARY); 2715 | CODING_CATEGORY_MASK_RAW_TEXT);
2704 2716
2705 else 2717 else
2706 /* C is the first byte of SJIS character code, or a 2718 /* C is the first byte of SJIS character code,
2707 leading-code of Emacs. */ 2719 or a leading-code of Emacs' internal format (emacs-mule). */
2708 mask = (detect_coding_sjis (src, src_end) 2720 mask = (detect_coding_sjis (src, src_end)
2709 | detect_coding_emacs_mule (src, src_end) 2721 | detect_coding_emacs_mule (src, src_end)
2710 | CODING_CATEGORY_MASK_BINARY); 2722 | CODING_CATEGORY_MASK_RAW_TEXT);
2711 } 2723 }
2712 else 2724 else
2713 /* C is a character of ISO2022 in graphic plane right, 2725 /* C is a character of ISO2022 in graphic plane right,
2714 or a SJIS's 1-byte character code (i.e. JISX0201), 2726 or a SJIS's 1-byte character code (i.e. JISX0201),
2715 or the first byte of BIG5's 2-byte code. */ 2727 or the first byte of BIG5's 2-byte code. */
2716 mask = (detect_coding_iso2022 (src, src_end) 2728 mask = (detect_coding_iso2022 (src, src_end)
2717 | detect_coding_sjis (src, src_end) 2729 | detect_coding_sjis (src, src_end)
2718 | detect_coding_big5 (src, src_end) 2730 | detect_coding_big5 (src, src_end)
2719 | CODING_CATEGORY_MASK_BINARY); 2731 | CODING_CATEGORY_MASK_RAW_TEXT);
2720 2732
2721 return mask; 2733 return mask;
2722 } 2734 }
2723 2735
2724 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded. 2736 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2730 unsigned char *src; 2742 unsigned char *src;
2731 int src_bytes; 2743 int src_bytes;
2732 { 2744 {
2733 int mask = detect_coding_mask (src, src_bytes); 2745 int mask = detect_coding_mask (src, src_bytes);
2734 int idx; 2746 int idx;
2747 Lisp_Object val = Vcoding_category_list;
2735 2748
2736 if (mask == CODING_CATEGORY_MASK_ANY) 2749 if (mask == CODING_CATEGORY_MASK_ANY)
2737 /* We found nothing other than ASCII. There's nothing to do. */ 2750 /* We found nothing other than ASCII. There's nothing to do. */
2738 return; 2751 return;
2739 2752
2740 if (!mask) 2753 /* We found some plausible coding systems. Let's use a coding
2741 /* The source text seems to be encoded in unknown coding system. 2754 system of the highest priority. */
2742 Emacs regards the category of such a kind of coding system as 2755
2743 `coding-category-binary'. We assume that a user has assigned 2756 if (CONSP (val))
2744 an appropriate coding system for a `coding-category-binary'. */ 2757 while (!NILP (val))
2745 idx = CODING_CATEGORY_IDX_BINARY; 2758 {
2759 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2760 if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2761 break;
2762 val = XCONS (val)->cdr;
2763 }
2746 else 2764 else
2765 val = Qnil;
2766
2767 if (NILP (val))
2747 { 2768 {
2748 /* We found some plausible coding systems. Let's use a coding 2769 /* For unknown reason, `Vcoding_category_list' contains none of
2749 system of the highest priority. */ 2770 found categories. Let's use any of them. */
2750 Lisp_Object val = Vcoding_category_list; 2771 for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2751 2772 if (mask & (1 << idx))
2752 if (CONSP (val)) 2773 break;
2753 while (!NILP (val))
2754 {
2755 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2756 if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2757 break;
2758 val = XCONS (val)->cdr;
2759 }
2760 else
2761 val = Qnil;
2762
2763 if (NILP (val))
2764 {
2765 /* For unknown reason, `Vcoding_category_list' contains none
2766 of found categories. Let's use any of them. */
2767 for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2768 if (mask & (1 << idx))
2769 break;
2770 }
2771 } 2774 }
2772 setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding); 2775 setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2773 } 2776 }
2774 2777
2775 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC 2778 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2805 if (eol_type == CODING_EOL_UNDECIDED) 2808 if (eol_type == CODING_EOL_UNDECIDED)
2806 /* This is the first end-of-line. */ 2809 /* This is the first end-of-line. */
2807 eol_type = this_eol_type; 2810 eol_type = this_eol_type;
2808 else if (eol_type != this_eol_type) 2811 else if (eol_type != this_eol_type)
2809 /* The found type is different from what found before. 2812 /* The found type is different from what found before.
2810 We had better not decode end-of-line. 2813 Let's notify the caller about this inconsistency.
2811 return CODING_EOL_LF; 2814 return CODING_EOL_INCONSISTENT;
2812 } 2815 }
2813 } 2816 }
2814 2817
2815 return eol_type; 2818 return eol_type;
2816 } 2819 }
2829 int eol_type = detect_eol_type (src, src_bytes); 2832 int eol_type = detect_eol_type (src, src_bytes);
2830 2833
2831 if (eol_type == CODING_EOL_UNDECIDED) 2834 if (eol_type == CODING_EOL_UNDECIDED)
2832 /* We found no end-of-line in the source text. */ 2835 /* We found no end-of-line in the source text. */
2833 return; 2836 return;
2837
2838 if (eol_type == CODING_EOL_INCONSISTENT)
2839 {
2840 #if 0
2841 /* This code is suppressed until we find a better way to
2842 distinguish raw-text and binary. */
2843
2844 /* If we have already detected that the coding is raw-text, the
2845 coding should actually be no-conversion. */
2846 if (coding->type == coding_type_raw_text)
2847 {
2848 setup_coding_system (Qno_conversion, coding);
2849 return;
2850 }
2851 /* Else, let's decode only text code anyway. */
2852 #endif /* 0 */
2853 eol_type == CODING_EOL_LF;
2854 }
2834 2855
2835 coding_system = coding->symbol; 2856 coding_system = coding->symbol;
2836 while (!NILP (coding_system) 2857 while (!NILP (coding_system)
2837 && NILP (val = Fget (coding_system, Qeol_type))) 2858 && NILP (val = Fget (coding_system, Qeol_type)))
2838 coding_system = Fget (coding_system, Qcoding_system); 2859 coding_system = Fget (coding_system, Qcoding_system);
2875 *consumed = produced; 2896 *consumed = produced;
2876 break; 2897 break;
2877 2898
2878 case coding_type_emacs_mule: 2899 case coding_type_emacs_mule:
2879 case coding_type_undecided: 2900 case coding_type_undecided:
2901 case coding_type_raw_text:
2880 if (coding->eol_type == CODING_EOL_LF 2902 if (coding->eol_type == CODING_EOL_LF
2881 || coding->eol_type == CODING_EOL_UNDECIDED) 2903 || coding->eol_type == CODING_EOL_UNDECIDED)
2882 goto label_no_conversion; 2904 goto label_no_conversion;
2883 produced = decode_eol (coding, source, destination, 2905 produced = decode_eol (coding, source, destination,
2884 src_bytes, dst_bytes, consumed); 2906 src_bytes, dst_bytes, consumed);
2939 *consumed = produced; 2961 *consumed = produced;
2940 break; 2962 break;
2941 2963
2942 case coding_type_emacs_mule: 2964 case coding_type_emacs_mule:
2943 case coding_type_undecided: 2965 case coding_type_undecided:
2966 case coding_type_raw_text:
2944 if (coding->eol_type == CODING_EOL_LF 2967 if (coding->eol_type == CODING_EOL_LF
2945 || coding->eol_type == CODING_EOL_UNDECIDED) 2968 || coding->eol_type == CODING_EOL_UNDECIDED)
2946 goto label_no_conversion; 2969 goto label_no_conversion;
2947 produced = encode_eol (coding, source, destination, 2970 produced = encode_eol (coding, source, destination,
2948 src_bytes, dst_bytes, consumed); 2971 src_bytes, dst_bytes, consumed);
3131 coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg); 3154 coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
3132 eol_type = detect_eol_type (POS_ADDR (beg), end - beg); 3155 eol_type = detect_eol_type (POS_ADDR (beg), end - beg);
3133 3156
3134 if (coding_mask == CODING_CATEGORY_MASK_ANY) 3157 if (coding_mask == CODING_CATEGORY_MASK_ANY)
3135 { 3158 {
3136 val = intern ("undecided"); 3159 val = Qundecided;
3137 if (eol_type != CODING_EOL_UNDECIDED) 3160 if (eol_type != CODING_EOL_UNDECIDED
3161 && eol_type != CODING_EOL_INCONSISTENT)
3138 { 3162 {
3139 Lisp_Object val2 = Fget (val, Qeol_type); 3163 Lisp_Object val2 = Fget (Qundecided, Qeol_type);
3140 if (VECTORP (val2)) 3164 if (VECTORP (val2))
3141 val = XVECTOR (val2)->contents[eol_type]; 3165 val = XVECTOR (val2)->contents[eol_type];
3142 } 3166 }
3143 } 3167 }
3144 else 3168 else
3153 val2 = XCONS (val2)->cdr) 3177 val2 = XCONS (val2)->cdr)
3154 { 3178 {
3155 int idx 3179 int idx
3156 = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index)); 3180 = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
3157 if (coding_mask & (1 << idx)) 3181 if (coding_mask & (1 << idx))
3158 val = Fcons (Fsymbol_value (XCONS (val2)->car), val); 3182 {
3183 #if 0
3184 /* This code is suppressed until we find a better way to
3185 distinguish raw-text and binary. */
3186
3187 if (idx == CODING_CATEGORY_IDX_RAW_TEXT
3188 && eol_type == CODING_EOL_INCONSISTENT)
3189 val = Fcons (Qno_conversion, val);
3190 else
3191 #endif /* 0 */
3192 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
3193 }
3159 } 3194 }
3160 3195
3161 /* Then, change the order of the list, while getting subsidiary 3196 /* Then, change the order of the list, while getting subsidiary
3162 coding-systems. */ 3197 coding-systems. */
3163 val2 = val; 3198 val2 = val;
3164 val = Qnil; 3199 val = Qnil;
3200 if (eol_type == CODING_EOL_INCONSISTENT)
3201 eol_type == CODING_EOL_UNDECIDED;
3165 for (; !NILP (val2); val2 = XCONS (val2)->cdr) 3202 for (; !NILP (val2); val2 = XCONS (val2)->cdr)
3166 { 3203 {
3167 if (eol_type == CODING_EOL_UNDECIDED) 3204 if (eol_type == CODING_EOL_UNDECIDED)
3168 val = Fcons (XCONS (val2)->car, val); 3205 val = Fcons (XCONS (val2)->car, val);
3169 else 3206 else
3204 switch (coding->type) 3241 switch (coding->type)
3205 { 3242 {
3206 case coding_type_no_conversion: 3243 case coding_type_no_conversion:
3207 case coding_type_emacs_mule: 3244 case coding_type_emacs_mule:
3208 case coding_type_undecided: 3245 case coding_type_undecided:
3246 case coding_type_raw_text:
3209 /* We need no conversion. */ 3247 /* We need no conversion. */
3210 *begp = *endp; 3248 *begp = *endp;
3211 return; 3249 return;
3212 case coding_type_ccl: 3250 case coding_type_ccl:
3213 /* We can't skip any data. */ 3251 /* We can't skip any data. */
3241 case coding_type_no_conversion: 3279 case coding_type_no_conversion:
3242 /* We need no conversion. */ 3280 /* We need no conversion. */
3243 *begp = *endp; 3281 *begp = *endp;
3244 return; 3282 return;
3245 case coding_type_emacs_mule: 3283 case coding_type_emacs_mule:
3284 case coding_type_raw_text:
3246 if (coding->eol_type == CODING_EOL_LF) 3285 if (coding->eol_type == CODING_EOL_LF)
3247 { 3286 {
3248 /* We need no conversion. */ 3287 /* We need no conversion. */
3249 *begp = *endp; 3288 *begp = *endp;
3250 return; 3289 return;
3855 staticpro (&Qpost_read_conversion); 3894 staticpro (&Qpost_read_conversion);
3856 3895
3857 Qpre_write_conversion = intern ("pre-write-conversion"); 3896 Qpre_write_conversion = intern ("pre-write-conversion");
3858 staticpro (&Qpre_write_conversion); 3897 staticpro (&Qpre_write_conversion);
3859 3898
3899 Qno_conversion = intern ("no-conversion");
3900 staticpro (&Qno_conversion);
3901
3902 Qundecided = intern ("undecided");
3903 staticpro (&Qundecided);
3904
3860 Qcoding_system_spec = intern ("coding-system-spec"); 3905 Qcoding_system_spec = intern ("coding-system-spec");
3861 staticpro (&Qcoding_system_spec); 3906 staticpro (&Qcoding_system_spec);
3862 3907
3863 Qcoding_system_p = intern ("coding-system-p"); 3908 Qcoding_system_p = intern ("coding-system-p");
3864 staticpro (&Qcoding_system_p); 3909 staticpro (&Qcoding_system_p);