comparison src/coding.c @ 95533:831c8ee4d884

(detect_coding): Fix handling of coding->head_ascii. Be sure to call setup_coding_system when a proper coding system is found. (detect_coding_system): Fix handling of coding->head_ascii.
author Kenichi Handa <handa@m17n.org>
date Wed, 04 Jun 2008 07:52:46 +0000
parents c99f0a16c077
children 862c7386145c
comparison
equal deleted inserted replaced
95532:b657ba21e4d3 95533:831c8ee4d884
5780 coding->consumed = coding->consumed_char = 0; 5780 coding->consumed = coding->consumed_char = 0;
5781 coding->produced = coding->produced_char = 0; 5781 coding->produced = coding->produced_char = 0;
5782 coding_set_source (coding); 5782 coding_set_source (coding);
5783 5783
5784 src_end = coding->source + coding->src_bytes; 5784 src_end = coding->source + coding->src_bytes;
5785 coding->head_ascii = 0;
5785 5786
5786 /* If we have not yet decided the text encoding type, detect it 5787 /* If we have not yet decided the text encoding type, detect it
5787 now. */ 5788 now. */
5788 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided)) 5789 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5789 { 5790 {
5790 int c, i; 5791 int c, i;
5791 struct coding_detection_info detect_info; 5792 struct coding_detection_info detect_info;
5792 int null_byte_found = 0, eight_bit_found = 0; 5793 int null_byte_found = 0, eight_bit_found = 0;
5793 5794
5794 detect_info.checked = detect_info.found = detect_info.rejected = 0; 5795 detect_info.checked = detect_info.found = detect_info.rejected = 0;
5795 coding->head_ascii = -1;
5796 for (src = coding->source; src < src_end; src++) 5796 for (src = coding->source; src < src_end; src++)
5797 { 5797 {
5798 c = *src; 5798 c = *src;
5799 if (c & 0x80) 5799 if (c & 0x80)
5800 { 5800 {
5801 eight_bit_found = 1; 5801 eight_bit_found = 1;
5802 if (coding->head_ascii < 0)
5803 coding->head_ascii = src - coding->source;
5804 if (null_byte_found) 5802 if (null_byte_found)
5805 break; 5803 break;
5806 } 5804 }
5807 else if (c < 0x20) 5805 else if (c < 0x20)
5808 { 5806 {
5809 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) 5807 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
5810 && ! inhibit_iso_escape_detection 5808 && ! inhibit_iso_escape_detection
5811 && ! detect_info.checked) 5809 && ! detect_info.checked)
5812 { 5810 {
5813 if (coding->head_ascii < 0)
5814 coding->head_ascii = src - coding->source;
5815 if (detect_coding_iso_2022 (coding, &detect_info)) 5811 if (detect_coding_iso_2022 (coding, &detect_info))
5816 { 5812 {
5817 /* We have scanned the whole data. */ 5813 /* We have scanned the whole data. */
5818 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE)) 5814 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
5819 /* We didn't find an 8-bit code. We may have 5815 {
5820 found a null-byte, but it's very rare that 5816 /* We didn't find an 8-bit code. We may
5821 a binary file conforms to ISO-2022. */ 5817 have found a null-byte, but it's very
5822 src = src_end; 5818 rare that a binary file conforms to
5819 ISO-2022. */
5820 src = src_end;
5821 coding->head_ascii = src - coding->source;
5822 }
5823 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
5823 break; 5824 break;
5824 } 5825 }
5825 } 5826 }
5826 else if (! c) 5827 else if (! c)
5827 { 5828 {
5828 null_byte_found = 1; 5829 null_byte_found = 1;
5829 if (eight_bit_found) 5830 if (eight_bit_found)
5830 break; 5831 break;
5831 } 5832 }
5833 coding->head_ascii++;
5832 } 5834 }
5833 } 5835 else
5834 if (coding->head_ascii < 0) 5836 coding->head_ascii++;
5835 coding->head_ascii = src - coding->source; 5837 }
5836 5838
5837 if (null_byte_found || eight_bit_found 5839 if (null_byte_found || eight_bit_found
5838 || coding->head_ascii < coding->src_bytes 5840 || coding->head_ascii < coding->src_bytes
5839 || detect_info.found) 5841 || detect_info.found)
5840 { 5842 {
5884 category = coding_category_utf_16_be; 5886 category = coding_category_utf_16_be;
5885 } 5887 }
5886 break; 5888 break;
5887 } 5889 }
5888 } 5890 }
5889
5890 if (i < coding_category_raw_text)
5891 setup_coding_system (CODING_ID_NAME (this->id), coding);
5892 else if (null_byte_found)
5893 setup_coding_system (Qno_conversion, coding);
5894 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
5895 == CATEGORY_MASK_ANY)
5896 setup_coding_system (Qraw_text, coding);
5897 else if (detect_info.rejected)
5898 for (i = 0; i < coding_category_raw_text; i++)
5899 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5900 {
5901 this = coding_categories + coding_priorities[i];
5902 setup_coding_system (CODING_ID_NAME (this->id), coding);
5903 break;
5904 }
5905 } 5891 }
5892
5893 if (i < coding_category_raw_text)
5894 setup_coding_system (CODING_ID_NAME (this->id), coding);
5895 else if (null_byte_found)
5896 setup_coding_system (Qno_conversion, coding);
5897 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
5898 == CATEGORY_MASK_ANY)
5899 setup_coding_system (Qraw_text, coding);
5900 else if (detect_info.rejected)
5901 for (i = 0; i < coding_category_raw_text; i++)
5902 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5903 {
5904 this = coding_categories + coding_priorities[i];
5905 setup_coding_system (CODING_ID_NAME (this->id), coding);
5906 break;
5907 }
5906 } 5908 }
5907 } 5909 }
5908 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id))) 5910 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5909 == coding_category_utf_8_auto) 5911 == coding_category_utf_8_auto)
5910 { 5912 {
7653 coding.src_chars = src_chars; 7655 coding.src_chars = src_chars;
7654 coding.src_bytes = src_bytes; 7656 coding.src_bytes = src_bytes;
7655 coding.src_multibyte = multibytep; 7657 coding.src_multibyte = multibytep;
7656 coding.consumed = 0; 7658 coding.consumed = 0;
7657 coding.mode |= CODING_MODE_LAST_BLOCK; 7659 coding.mode |= CODING_MODE_LAST_BLOCK;
7660 coding.head_ascii = 0;
7658 7661
7659 detect_info.checked = detect_info.found = detect_info.rejected = 0; 7662 detect_info.checked = detect_info.found = detect_info.rejected = 0;
7660 7663
7661 /* At first, detect text-format if necessary. */ 7664 /* At first, detect text-format if necessary. */
7662 base_category = XINT (CODING_ATTR_CATEGORY (attrs)); 7665 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
7664 { 7667 {
7665 enum coding_category category; 7668 enum coding_category category;
7666 struct coding_system *this; 7669 struct coding_system *this;
7667 int c, i; 7670 int c, i;
7668 7671
7669 coding.head_ascii = -1;
7670 /* Skip all ASCII bytes except for a few ISO2022 controls. */ 7672 /* Skip all ASCII bytes except for a few ISO2022 controls. */
7671 for (; src < src_end; src++) 7673 for (; src < src_end; src++)
7672 { 7674 {
7673 c = *src; 7675 c = *src;
7674 if (c & 0x80) 7676 if (c & 0x80)
7675 { 7677 {
7676 eight_bit_found = 1; 7678 eight_bit_found = 1;
7677 if (coding.head_ascii < 0)
7678 coding.head_ascii = src - coding.source;
7679 if (null_byte_found) 7679 if (null_byte_found)
7680 break; 7680 break;
7681 } 7681 }
7682 if (c < 0x20) 7682 else if (c < 0x20)
7683 { 7683 {
7684 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) 7684 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
7685 && ! inhibit_iso_escape_detection 7685 && ! inhibit_iso_escape_detection
7686 && ! detect_info.checked) 7686 && ! detect_info.checked)
7687 { 7687 {
7688 if (coding.head_ascii < 0)
7689 coding.head_ascii = src - coding.source;
7690 if (detect_coding_iso_2022 (&coding, &detect_info)) 7688 if (detect_coding_iso_2022 (&coding, &detect_info))
7691 { 7689 {
7692 /* We have scanned the whole data. */ 7690 /* We have scanned the whole data. */
7693 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE)) 7691 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
7694 /* We didn't find an 8-bit code. We may have 7692 {
7695 found a null-byte, but it's very rare that 7693 /* We didn't find an 8-bit code. We may
7696 a binary file conforms to ISO-2022. */ 7694 have found a null-byte, but it's very
7697 src = src_end; 7695 rare that a binary file conforms to
7696 ISO-2022. */
7697 src = src_end;
7698 coding.head_ascii = src - coding.source;
7699 }
7700 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
7698 break; 7701 break;
7699 } 7702 }
7700 } 7703 }
7701 else if (! c) 7704 else if (! c)
7702 { 7705 {
7703 null_byte_found = 1; 7706 null_byte_found = 1;
7704 if (eight_bit_found) 7707 if (eight_bit_found)
7705 break; 7708 break;
7706 } 7709 }
7710 coding.head_ascii++;
7707 } 7711 }
7708 } 7712 else
7709 if (coding.head_ascii < 0) 7713 coding.head_ascii++;
7710 coding.head_ascii = src - coding.source; 7714 }
7711 7715
7712 if (null_byte_found || eight_bit_found 7716 if (null_byte_found || eight_bit_found
7713 || coding.head_ascii < coding.src_bytes 7717 || coding.head_ascii < coding.src_bytes
7714 || detect_info.found) 7718 || detect_info.found)
7715 { 7719 {