comparison src/coding.c @ 93595:ac4d127a841a

(CATEGORY_MASK_ANY): Add CATEGORY_MASK_UTF_16_AUTO. (CATEGORY_MASK_UTF_16): Likewise. (detect_coding_utf_16): Add heuristics to reject utf-16 for a binary file. (detect_coding): Add null-byte detection for a binary file. (detect_coding_system): Likewise.
author Kenichi Handa <handa@m17n.org>
date Thu, 03 Apr 2008 12:30:02 +0000
parents 06e93ffa2e9f
children 62d97ebb13a9
comparison
equal deleted inserted replaced
93594:46b8fe649bbc 93595:ac4d127a841a
623 | CATEGORY_MASK_ISO_8_1 \ 623 | CATEGORY_MASK_ISO_8_1 \
624 | CATEGORY_MASK_ISO_8_2 \ 624 | CATEGORY_MASK_ISO_8_2 \
625 | CATEGORY_MASK_ISO_7_ELSE \ 625 | CATEGORY_MASK_ISO_7_ELSE \
626 | CATEGORY_MASK_ISO_8_ELSE \ 626 | CATEGORY_MASK_ISO_8_ELSE \
627 | CATEGORY_MASK_UTF_8 \ 627 | CATEGORY_MASK_UTF_8 \
628 | CATEGORY_MASK_UTF_16_AUTO \
628 | CATEGORY_MASK_UTF_16_BE \ 629 | CATEGORY_MASK_UTF_16_BE \
629 | CATEGORY_MASK_UTF_16_LE \ 630 | CATEGORY_MASK_UTF_16_LE \
630 | CATEGORY_MASK_UTF_16_BE_NOSIG \ 631 | CATEGORY_MASK_UTF_16_BE_NOSIG \
631 | CATEGORY_MASK_UTF_16_LE_NOSIG \ 632 | CATEGORY_MASK_UTF_16_LE_NOSIG \
632 | CATEGORY_MASK_CHARSET \ 633 | CATEGORY_MASK_CHARSET \
655 ( CATEGORY_MASK_ISO_7BIT \ 656 ( CATEGORY_MASK_ISO_7BIT \
656 | CATEGORY_MASK_ISO_8BIT \ 657 | CATEGORY_MASK_ISO_8BIT \
657 | CATEGORY_MASK_ISO_ELSE) 658 | CATEGORY_MASK_ISO_ELSE)
658 659
659 #define CATEGORY_MASK_UTF_16 \ 660 #define CATEGORY_MASK_UTF_16 \
660 (CATEGORY_MASK_UTF_16_BE \ 661 (CATEGORY_MASK_UTF_16_AUTO \
662 | CATEGORY_MASK_UTF_16_BE \
661 | CATEGORY_MASK_UTF_16_LE \ 663 | CATEGORY_MASK_UTF_16_LE \
662 | CATEGORY_MASK_UTF_16_BE_NOSIG \ 664 | CATEGORY_MASK_UTF_16_BE_NOSIG \
663 | CATEGORY_MASK_UTF_16_LE_NOSIG) 665 | CATEGORY_MASK_UTF_16_LE_NOSIG)
664 666
665 667
1511 | CATEGORY_MASK_UTF_16_AUTO); 1513 | CATEGORY_MASK_UTF_16_AUTO);
1512 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE 1514 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1513 | CATEGORY_MASK_UTF_16_BE_NOSIG 1515 | CATEGORY_MASK_UTF_16_BE_NOSIG
1514 | CATEGORY_MASK_UTF_16_LE_NOSIG); 1516 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1515 } 1517 }
1516 else if (c1 >= 0 && c2 >= 0) 1518 else
1517 { 1519 {
1520 /* We check the dispersion of the E-th and O-th bytes, where E is even
1521 and O is odd. If both dispersions are high, we assume binary data. */
1522 unsigned char e[256], o[256];
1523 unsigned e_num = 1, o_num = 1;
1524
1525 memset (e, 0, 256);
1526 memset (o, 0, 256);
1527 e[c1] = 1;
1528 o[c2] = 1;
1529
1518 detect_info->rejected 1530 detect_info->rejected
1519 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE); 1531 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1520 } 1532
1533 while (1)
1534 {
1535 ONE_MORE_BYTE (c1);
1536 ONE_MORE_BYTE (c2);
1537 if (! e[c1])
1538 {
1539 e[c1] = 1;
1540 e_num++;
1541 if (e_num >= 128)
1542 break;
1543 }
1544 if (! o[c2])
1545 {
1546 o[c1] = 1;
1547 o_num++;
1548 if (o_num >= 128)
1549 break;
1550 }
1551 }
1552 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1553 return 0;
1554 }
1555
1521 no_more_source: 1556 no_more_source:
1522 return 1; 1557 return 1;
1523 } 1558 }
1524 1559
1525 static void 1560 static void
5675 now. */ 5710 now. */
5676 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided)) 5711 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5677 { 5712 {
5678 int c, i; 5713 int c, i;
5679 struct coding_detection_info detect_info; 5714 struct coding_detection_info detect_info;
5715 int null_byte_found = 0, eight_bit_found = 0;
5680 5716
5681 detect_info.checked = detect_info.found = detect_info.rejected = 0; 5717 detect_info.checked = detect_info.found = detect_info.rejected = 0;
5682 for (i = 0, src = coding->source; src < src_end; i++, src++) 5718 coding->head_ascii = -1;
5719 for (src = coding->source; src < src_end; src++)
5683 { 5720 {
5684 c = *src; 5721 c = *src;
5685 if (c & 0x80) 5722 if (c & 0x80)
5686 break;
5687 if (c < 0x20
5688 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
5689 && ! inhibit_iso_escape_detection
5690 && ! detect_info.checked)
5691 { 5723 {
5692 coding->head_ascii = src - (coding->source + coding->consumed); 5724 eight_bit_found = 1;
5693 if (detect_coding_iso_2022 (coding, &detect_info)) 5725 if (coding->head_ascii < 0)
5726 coding->head_ascii = src - coding->source;
5727 if (null_byte_found)
5728 break;
5729 }
5730 else if (c < 0x20)
5731 {
5732 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
5733 && ! inhibit_iso_escape_detection
5734 && ! detect_info.checked)
5694 { 5735 {
5695 /* We have scanned the whole data. */ 5736 if (coding->head_ascii < 0)
5696 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE)) 5737 coding->head_ascii = src - coding->source;
5697 /* We didn't find an 8-bit code. */ 5738 if (detect_coding_iso_2022 (coding, &detect_info))
5698 src = src_end; 5739 {
5699 break; 5740 /* We have scanned the whole data. */
5741 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
5742 /* We didn't find an 8-bit code. We may have
5743 found a null-byte, but it's very rare that
5744 a binary file confirm to ISO-2022. */
5745 src = src_end;
5746 break;
5747 }
5748 }
5749 else if (! c)
5750 {
5751 null_byte_found = 1;
5752 if (eight_bit_found)
5753 break;
5700 } 5754 }
5701 } 5755 }
5702 } 5756 }
5703 coding->head_ascii = src - (coding->source + coding->consumed); 5757 if (coding->head_ascii < 0)
5704 5758 coding->head_ascii = src - coding->source;
5705 if (coding->head_ascii < coding->src_bytes 5759
5760 if (null_byte_found || eight_bit_found
5761 || coding->head_ascii < coding->src_bytes
5706 || detect_info.found) 5762 || detect_info.found)
5707 { 5763 {
5708 enum coding_category category; 5764 enum coding_category category;
5709 struct coding_system *this; 5765 struct coding_system *this;
5710 5766
5716 this = coding_categories + category; 5772 this = coding_categories + category;
5717 if (detect_info.found & (1 << category)) 5773 if (detect_info.found & (1 << category))
5718 break; 5774 break;
5719 } 5775 }
5720 else 5776 else
5721 for (i = 0; i < coding_category_raw_text; i++) 5777 {
5722 { 5778 if (null_byte_found)
5723 category = coding_priorities[i]; 5779 {
5724 this = coding_categories + category; 5780 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
5725 if (this->id < 0) 5781 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
5726 { 5782 }
5727 /* No coding system of this category is defined. */ 5783 for (i = 0; i < coding_category_raw_text; i++)
5728 detect_info.rejected |= (1 << category); 5784 {
5729 } 5785 category = coding_priorities[i];
5730 else if (category >= coding_category_raw_text) 5786 this = coding_categories + category;
5731 continue; 5787 if (this->id < 0)
5732 else if (detect_info.checked & (1 << category)) 5788 {
5733 { 5789 /* No coding system of this category is defined. */
5734 if (detect_info.found & (1 << category)) 5790 detect_info.rejected |= (1 << category);
5791 }
5792 else if (category >= coding_category_raw_text)
5793 continue;
5794 else if (detect_info.checked & (1 << category))
5795 {
5796 if (detect_info.found & (1 << category))
5797 break;
5798 }
5799 else if ((*(this->detector)) (coding, &detect_info)
5800 && detect_info.found & (1 << category))
5801 {
5802 if (category == coding_category_utf_16_auto)
5803 {
5804 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5805 category = coding_category_utf_16_le;
5806 else
5807 category = coding_category_utf_16_be;
5808 }
5735 break; 5809 break;
5736 } 5810 }
5737 else if ((*(this->detector)) (coding, &detect_info) 5811 }
5738 && detect_info.found & (1 << category))
5739 {
5740 if (category == coding_category_utf_16_auto)
5741 {
5742 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5743 category = coding_category_utf_16_le;
5744 else
5745 category = coding_category_utf_16_be;
5746 }
5747 break;
5748 }
5749 }
5750 5812
5751 if (i < coding_category_raw_text) 5813 if (i < coding_category_raw_text)
5752 setup_coding_system (CODING_ID_NAME (this->id), coding); 5814 setup_coding_system (CODING_ID_NAME (this->id), coding);
5753 else if (detect_info.rejected == CATEGORY_MASK_ANY) 5815 else if (null_byte_found)
5754 setup_coding_system (Qraw_text, coding); 5816 setup_coding_system (Qno_conversion, coding);
5755 else if (detect_info.rejected) 5817 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
5756 for (i = 0; i < coding_category_raw_text; i++) 5818 == CATEGORY_MASK_ANY)
5757 if (! (detect_info.rejected & (1 << coding_priorities[i]))) 5819 setup_coding_system (Qraw_text, coding);
5758 { 5820 else if (detect_info.rejected)
5759 this = coding_categories + coding_priorities[i]; 5821 for (i = 0; i < coding_category_raw_text; i++)
5760 setup_coding_system (CODING_ID_NAME (this->id), coding); 5822 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5761 break; 5823 {
5762 } 5824 this = coding_categories + coding_priorities[i];
5825 setup_coding_system (CODING_ID_NAME (this->id), coding);
5826 break;
5827 }
5828 }
5763 } 5829 }
5764 } 5830 }
5765 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id))) 5831 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5766 == coding_category_utf_16_auto) 5832 == coding_category_utf_16_auto)
5767 { 5833 {
7470 Lisp_Object val; 7536 Lisp_Object val;
7471 struct coding_system coding; 7537 struct coding_system coding;
7472 int id; 7538 int id;
7473 struct coding_detection_info detect_info; 7539 struct coding_detection_info detect_info;
7474 enum coding_category base_category; 7540 enum coding_category base_category;
7541 int null_byte_found = 0, eight_bit_found = 0;
7475 7542
7476 if (NILP (coding_system)) 7543 if (NILP (coding_system))
7477 coding_system = Qundecided; 7544 coding_system = Qundecided;
7478 setup_coding_system (coding_system, &coding); 7545 setup_coding_system (coding_system, &coding);
7479 attrs = CODING_ID_ATTRS (coding.id); 7546 attrs = CODING_ID_ATTRS (coding.id);
7495 { 7562 {
7496 enum coding_category category; 7563 enum coding_category category;
7497 struct coding_system *this; 7564 struct coding_system *this;
7498 int c, i; 7565 int c, i;
7499 7566
7567 coding.head_ascii = -1;
7500 /* Skip all ASCII bytes except for a few ISO2022 controls. */ 7568 /* Skip all ASCII bytes except for a few ISO2022 controls. */
7501 for (i = 0; src < src_end; i++, src++) 7569 for (; src < src_end; src++)
7502 { 7570 {
7503 c = *src; 7571 c = *src;
7504 if (c & 0x80) 7572 if (c & 0x80)
7505 break;
7506 if (c < 0x20
7507 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
7508 && ! inhibit_iso_escape_detection)
7509 { 7573 {
7510 coding.head_ascii = src - coding.source; 7574 eight_bit_found = 1;
7511 if (detect_coding_iso_2022 (&coding, &detect_info)) 7575 if (coding.head_ascii < 0)
7576 coding.head_ascii = src - coding.source;
7577 if (null_byte_found)
7578 break;
7579 }
7580 if (c < 0x20)
7581 {
7582 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
7583 && ! inhibit_iso_escape_detection
7584 && ! detect_info.checked)
7512 { 7585 {
7513 /* We have scanned the whole data. */ 7586 if (coding.head_ascii < 0)
7514 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE)) 7587 coding.head_ascii = src - coding.source;
7515 /* We didn't find an 8-bit code. */ 7588 if (detect_coding_iso_2022 (&coding, &detect_info))
7516 src = src_end; 7589 {
7517 break; 7590 /* We have scanned the whole data. */
7591 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
7592 /* We didn't find an 8-bit code. We may have
7593 found a null-byte, but it's very rare that
7594 a binary file confirm to ISO-2022. */
7595 src = src_end;
7596 break;
7597 }
7598 }
7599 else if (! c)
7600 {
7601 null_byte_found = 1;
7602 if (eight_bit_found)
7603 break;
7518 } 7604 }
7519 } 7605 }
7520 } 7606 }
7521 coding.head_ascii = src - coding.source; 7607 if (coding.head_ascii < 0)
7522 7608 coding.head_ascii = src - coding.source;
7523 if (src < src_end 7609
7610 if (null_byte_found || eight_bit_found
7611 || coding.head_ascii < coding.src_bytes
7524 || detect_info.found) 7612 || detect_info.found)
7525 { 7613 {
7526 if (src == src_end) 7614 if (coding.head_ascii == coding.src_bytes)
7527 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */ 7615 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
7528 for (i = 0; i < coding_category_raw_text; i++) 7616 for (i = 0; i < coding_category_raw_text; i++)
7529 { 7617 {
7530 category = coding_priorities[i]; 7618 category = coding_priorities[i];
7531 this = coding_categories + category; 7619 this = coding_categories + category;
7532 if (detect_info.found & (1 << category)) 7620 if (detect_info.found & (1 << category))
7533 break; 7621 break;
7534 } 7622 }
7535 else 7623 else
7536 for (i = 0; i < coding_category_raw_text; i++) 7624 {
7537 { 7625 if (null_byte_found)
7538 category = coding_priorities[i]; 7626 {
7539 this = coding_categories + category; 7627 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
7540 7628 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
7541 if (this->id < 0) 7629 }
7542 { 7630 for (i = 0; i < coding_category_raw_text; i++)
7543 /* No coding system of this category is defined. */ 7631 {
7544 detect_info.rejected |= (1 << category); 7632 category = coding_priorities[i];
7545 } 7633 this = coding_categories + category;
7546 else if (category >= coding_category_raw_text) 7634
7547 continue; 7635 if (this->id < 0)
7548 else if (detect_info.checked & (1 << category)) 7636 {
7549 { 7637 /* No coding system of this category is defined. */
7550 if (highest 7638 detect_info.rejected |= (1 << category);
7551 && (detect_info.found & (1 << category))) 7639 }
7640 else if (category >= coding_category_raw_text)
7641 continue;
7642 else if (detect_info.checked & (1 << category))
7643 {
7644 if (highest
7645 && (detect_info.found & (1 << category)))
7646 break;
7647 }
7648 else if ((*(this->detector)) (&coding, &detect_info)
7649 && highest
7650 && (detect_info.found & (1 << category)))
7651 {
7652 if (category == coding_category_utf_16_auto)
7653 {
7654 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7655 category = coding_category_utf_16_le;
7656 else
7657 category = coding_category_utf_16_be;
7658 }
7552 break; 7659 break;
7553 } 7660 }
7554 else 7661 }
7555 { 7662 }
7556 if ((*(this->detector)) (&coding, &detect_info) 7663 }
7557 && highest 7664
7558 && (detect_info.found & (1 << category))) 7665 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY)
7559 {
7560 if (category == coding_category_utf_16_auto)
7561 {
7562 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7563 category = coding_category_utf_16_le;
7564 else
7565 category = coding_category_utf_16_be;
7566 }
7567 break;
7568 }
7569 }
7570 }
7571 }
7572
7573 if (detect_info.rejected == CATEGORY_MASK_ANY)
7574 { 7666 {
7575 detect_info.found = CATEGORY_MASK_RAW_TEXT; 7667 detect_info.found = CATEGORY_MASK_RAW_TEXT;
7576 id = coding_categories[coding_category_raw_text].id; 7668 id = coding_categories[coding_category_raw_text].id;
7577 val = Fcons (make_number (id), Qnil); 7669 val = Fcons (make_number (id), Qnil);
7578 } 7670 }
7657 Lisp_Object tail; 7749 Lisp_Object tail;
7658 7750
7659 if (VECTORP (eol_type)) 7751 if (VECTORP (eol_type))
7660 { 7752 {
7661 if (detect_info.found & ~CATEGORY_MASK_UTF_16) 7753 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
7662 normal_eol = detect_eol (coding.source, src_bytes, 7754 {
7663 coding_category_raw_text); 7755 if (null_byte_found)
7756 normal_eol = EOL_SEEN_LF;
7757 else
7758 normal_eol = detect_eol (coding.source, src_bytes,
7759 coding_category_raw_text);
7760 }
7664 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE 7761 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7665 | CATEGORY_MASK_UTF_16_BE_NOSIG)) 7762 | CATEGORY_MASK_UTF_16_BE_NOSIG))
7666 utf_16_be_eol = detect_eol (coding.source, src_bytes, 7763 utf_16_be_eol = detect_eol (coding.source, src_bytes,
7667 coding_category_utf_16_be); 7764 coding_category_utf_16_be);
7668 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE 7765 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE