Mercurial > emacs
comparison src/coding.c @ 93595:ac4d127a841a
(CATEGORY_MASK_ANY): Add CATEGORY_MASK_UTF_16_AUTO.
(CATEGORY_MASK_UTF_16): Likewise.
(detect_coding_utf_16): Add heuristics to reject utf-16 for a
binary file.
(detect_coding): Add null-byte detection for a binary file.
(detect_coding_system): Likewise.
author | Kenichi Handa <handa@m17n.org> |
---|---|
date | Thu, 03 Apr 2008 12:30:02 +0000 |
parents | 06e93ffa2e9f |
children | 62d97ebb13a9 |
comparison
equal
deleted
inserted
replaced
93594:46b8fe649bbc | 93595:ac4d127a841a |
---|---|
623 | CATEGORY_MASK_ISO_8_1 \ | 623 | CATEGORY_MASK_ISO_8_1 \ |
624 | CATEGORY_MASK_ISO_8_2 \ | 624 | CATEGORY_MASK_ISO_8_2 \ |
625 | CATEGORY_MASK_ISO_7_ELSE \ | 625 | CATEGORY_MASK_ISO_7_ELSE \ |
626 | CATEGORY_MASK_ISO_8_ELSE \ | 626 | CATEGORY_MASK_ISO_8_ELSE \ |
627 | CATEGORY_MASK_UTF_8 \ | 627 | CATEGORY_MASK_UTF_8 \ |
628 | CATEGORY_MASK_UTF_16_AUTO \ | |
628 | CATEGORY_MASK_UTF_16_BE \ | 629 | CATEGORY_MASK_UTF_16_BE \ |
629 | CATEGORY_MASK_UTF_16_LE \ | 630 | CATEGORY_MASK_UTF_16_LE \ |
630 | CATEGORY_MASK_UTF_16_BE_NOSIG \ | 631 | CATEGORY_MASK_UTF_16_BE_NOSIG \ |
631 | CATEGORY_MASK_UTF_16_LE_NOSIG \ | 632 | CATEGORY_MASK_UTF_16_LE_NOSIG \ |
632 | CATEGORY_MASK_CHARSET \ | 633 | CATEGORY_MASK_CHARSET \ |
655 ( CATEGORY_MASK_ISO_7BIT \ | 656 ( CATEGORY_MASK_ISO_7BIT \ |
656 | CATEGORY_MASK_ISO_8BIT \ | 657 | CATEGORY_MASK_ISO_8BIT \ |
657 | CATEGORY_MASK_ISO_ELSE) | 658 | CATEGORY_MASK_ISO_ELSE) |
658 | 659 |
659 #define CATEGORY_MASK_UTF_16 \ | 660 #define CATEGORY_MASK_UTF_16 \ |
660 (CATEGORY_MASK_UTF_16_BE \ | 661 (CATEGORY_MASK_UTF_16_AUTO \ |
662 | CATEGORY_MASK_UTF_16_BE \ | |
661 | CATEGORY_MASK_UTF_16_LE \ | 663 | CATEGORY_MASK_UTF_16_LE \ |
662 | CATEGORY_MASK_UTF_16_BE_NOSIG \ | 664 | CATEGORY_MASK_UTF_16_BE_NOSIG \ |
663 | CATEGORY_MASK_UTF_16_LE_NOSIG) | 665 | CATEGORY_MASK_UTF_16_LE_NOSIG) |
664 | 666 |
665 | 667 |
1511 | CATEGORY_MASK_UTF_16_AUTO); | 1513 | CATEGORY_MASK_UTF_16_AUTO); |
1512 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE | 1514 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE |
1513 | CATEGORY_MASK_UTF_16_BE_NOSIG | 1515 | CATEGORY_MASK_UTF_16_BE_NOSIG |
1514 | CATEGORY_MASK_UTF_16_LE_NOSIG); | 1516 | CATEGORY_MASK_UTF_16_LE_NOSIG); |
1515 } | 1517 } |
1516 else if (c1 >= 0 && c2 >= 0) | 1518 else |
1517 { | 1519 { |
1520 /* We check the dispersion of Eth and Oth bytes where E is even and | |
1521 O is odd. If both are high, we assume binary data.*/ | |
1522 unsigned char e[256], o[256]; | |
1523 unsigned e_num = 1, o_num = 1; | |
1524 | |
1525 memset (e, 0, 256); | |
1526 memset (o, 0, 256); | |
1527 e[c1] = 1; | |
1528 o[c2] = 1; | |
1529 | |
1518 detect_info->rejected | 1530 detect_info->rejected |
1519 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE); | 1531 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE); |
1520 } | 1532 |
1533 while (1) | |
1534 { | |
1535 ONE_MORE_BYTE (c1); | |
1536 ONE_MORE_BYTE (c2); | |
1537 if (! e[c1]) | |
1538 { | |
1539 e[c1] = 1; | |
1540 e_num++; | |
1541 if (e_num >= 128) | |
1542 break; | |
1543 } | |
1544 if (! o[c2]) | |
1545 { | |
1546 o[c1] = 1; | |
1547 o_num++; | |
1548 if (o_num >= 128) | |
1549 break; | |
1550 } | |
1551 } | |
1552 detect_info->rejected |= CATEGORY_MASK_UTF_16; | |
1553 return 0; | |
1554 } | |
1555 | |
1521 no_more_source: | 1556 no_more_source: |
1522 return 1; | 1557 return 1; |
1523 } | 1558 } |
1524 | 1559 |
1525 static void | 1560 static void |
5675 now. */ | 5710 now. */ |
5676 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided)) | 5711 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided)) |
5677 { | 5712 { |
5678 int c, i; | 5713 int c, i; |
5679 struct coding_detection_info detect_info; | 5714 struct coding_detection_info detect_info; |
5715 int null_byte_found = 0, eight_bit_found = 0; | |
5680 | 5716 |
5681 detect_info.checked = detect_info.found = detect_info.rejected = 0; | 5717 detect_info.checked = detect_info.found = detect_info.rejected = 0; |
5682 for (i = 0, src = coding->source; src < src_end; i++, src++) | 5718 coding->head_ascii = -1; |
5719 for (src = coding->source; src < src_end; src++) | |
5683 { | 5720 { |
5684 c = *src; | 5721 c = *src; |
5685 if (c & 0x80) | 5722 if (c & 0x80) |
5686 break; | |
5687 if (c < 0x20 | |
5688 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) | |
5689 && ! inhibit_iso_escape_detection | |
5690 && ! detect_info.checked) | |
5691 { | 5723 { |
5692 coding->head_ascii = src - (coding->source + coding->consumed); | 5724 eight_bit_found = 1; |
5693 if (detect_coding_iso_2022 (coding, &detect_info)) | 5725 if (coding->head_ascii < 0) |
5726 coding->head_ascii = src - coding->source; | |
5727 if (null_byte_found) | |
5728 break; | |
5729 } | |
5730 else if (c < 0x20) | |
5731 { | |
5732 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) | |
5733 && ! inhibit_iso_escape_detection | |
5734 && ! detect_info.checked) | |
5694 { | 5735 { |
5695 /* We have scanned the whole data. */ | 5736 if (coding->head_ascii < 0) |
5696 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE)) | 5737 coding->head_ascii = src - coding->source; |
5697 /* We didn't find an 8-bit code. */ | 5738 if (detect_coding_iso_2022 (coding, &detect_info)) |
5698 src = src_end; | 5739 { |
5699 break; | 5740 /* We have scanned the whole data. */ |
5741 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE)) | |
5742 /* We didn't find an 8-bit code. We may have | |
5743 found a null-byte, but it's very rare that | |
5744 a binary file conforms to ISO-2022. */ | |
5745 src = src_end; | |
5746 break; | |
5747 } | |
5748 } | |
5749 else if (! c) | |
5750 { | |
5751 null_byte_found = 1; | |
5752 if (eight_bit_found) | |
5753 break; | |
5700 } | 5754 } |
5701 } | 5755 } |
5702 } | 5756 } |
5703 coding->head_ascii = src - (coding->source + coding->consumed); | 5757 if (coding->head_ascii < 0) |
5704 | 5758 coding->head_ascii = src - coding->source; |
5705 if (coding->head_ascii < coding->src_bytes | 5759 |
5760 if (null_byte_found || eight_bit_found | |
5761 || coding->head_ascii < coding->src_bytes | |
5706 || detect_info.found) | 5762 || detect_info.found) |
5707 { | 5763 { |
5708 enum coding_category category; | 5764 enum coding_category category; |
5709 struct coding_system *this; | 5765 struct coding_system *this; |
5710 | 5766 |
5716 this = coding_categories + category; | 5772 this = coding_categories + category; |
5717 if (detect_info.found & (1 << category)) | 5773 if (detect_info.found & (1 << category)) |
5718 break; | 5774 break; |
5719 } | 5775 } |
5720 else | 5776 else |
5721 for (i = 0; i < coding_category_raw_text; i++) | 5777 { |
5722 { | 5778 if (null_byte_found) |
5723 category = coding_priorities[i]; | 5779 { |
5724 this = coding_categories + category; | 5780 detect_info.checked |= ~CATEGORY_MASK_UTF_16; |
5725 if (this->id < 0) | 5781 detect_info.rejected |= ~CATEGORY_MASK_UTF_16; |
5726 { | 5782 } |
5727 /* No coding system of this category is defined. */ | 5783 for (i = 0; i < coding_category_raw_text; i++) |
5728 detect_info.rejected |= (1 << category); | 5784 { |
5729 } | 5785 category = coding_priorities[i]; |
5730 else if (category >= coding_category_raw_text) | 5786 this = coding_categories + category; |
5731 continue; | 5787 if (this->id < 0) |
5732 else if (detect_info.checked & (1 << category)) | 5788 { |
5733 { | 5789 /* No coding system of this category is defined. */ |
5734 if (detect_info.found & (1 << category)) | 5790 detect_info.rejected |= (1 << category); |
5791 } | |
5792 else if (category >= coding_category_raw_text) | |
5793 continue; | |
5794 else if (detect_info.checked & (1 << category)) | |
5795 { | |
5796 if (detect_info.found & (1 << category)) | |
5797 break; | |
5798 } | |
5799 else if ((*(this->detector)) (coding, &detect_info) | |
5800 && detect_info.found & (1 << category)) | |
5801 { | |
5802 if (category == coding_category_utf_16_auto) | |
5803 { | |
5804 if (detect_info.found & CATEGORY_MASK_UTF_16_LE) | |
5805 category = coding_category_utf_16_le; | |
5806 else | |
5807 category = coding_category_utf_16_be; | |
5808 } | |
5735 break; | 5809 break; |
5736 } | 5810 } |
5737 else if ((*(this->detector)) (coding, &detect_info) | 5811 } |
5738 && detect_info.found & (1 << category)) | |
5739 { | |
5740 if (category == coding_category_utf_16_auto) | |
5741 { | |
5742 if (detect_info.found & CATEGORY_MASK_UTF_16_LE) | |
5743 category = coding_category_utf_16_le; | |
5744 else | |
5745 category = coding_category_utf_16_be; | |
5746 } | |
5747 break; | |
5748 } | |
5749 } | |
5750 | 5812 |
5751 if (i < coding_category_raw_text) | 5813 if (i < coding_category_raw_text) |
5752 setup_coding_system (CODING_ID_NAME (this->id), coding); | 5814 setup_coding_system (CODING_ID_NAME (this->id), coding); |
5753 else if (detect_info.rejected == CATEGORY_MASK_ANY) | 5815 else if (null_byte_found) |
5754 setup_coding_system (Qraw_text, coding); | 5816 setup_coding_system (Qno_conversion, coding); |
5755 else if (detect_info.rejected) | 5817 else if ((detect_info.rejected & CATEGORY_MASK_ANY) |
5756 for (i = 0; i < coding_category_raw_text; i++) | 5818 == CATEGORY_MASK_ANY) |
5757 if (! (detect_info.rejected & (1 << coding_priorities[i]))) | 5819 setup_coding_system (Qraw_text, coding); |
5758 { | 5820 else if (detect_info.rejected) |
5759 this = coding_categories + coding_priorities[i]; | 5821 for (i = 0; i < coding_category_raw_text; i++) |
5760 setup_coding_system (CODING_ID_NAME (this->id), coding); | 5822 if (! (detect_info.rejected & (1 << coding_priorities[i]))) |
5761 break; | 5823 { |
5762 } | 5824 this = coding_categories + coding_priorities[i]; |
5825 setup_coding_system (CODING_ID_NAME (this->id), coding); | |
5826 break; | |
5827 } | |
5828 } | |
5763 } | 5829 } |
5764 } | 5830 } |
5765 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id))) | 5831 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id))) |
5766 == coding_category_utf_16_auto) | 5832 == coding_category_utf_16_auto) |
5767 { | 5833 { |
7470 Lisp_Object val; | 7536 Lisp_Object val; |
7471 struct coding_system coding; | 7537 struct coding_system coding; |
7472 int id; | 7538 int id; |
7473 struct coding_detection_info detect_info; | 7539 struct coding_detection_info detect_info; |
7474 enum coding_category base_category; | 7540 enum coding_category base_category; |
7541 int null_byte_found = 0, eight_bit_found = 0; | |
7475 | 7542 |
7476 if (NILP (coding_system)) | 7543 if (NILP (coding_system)) |
7477 coding_system = Qundecided; | 7544 coding_system = Qundecided; |
7478 setup_coding_system (coding_system, &coding); | 7545 setup_coding_system (coding_system, &coding); |
7479 attrs = CODING_ID_ATTRS (coding.id); | 7546 attrs = CODING_ID_ATTRS (coding.id); |
7495 { | 7562 { |
7496 enum coding_category category; | 7563 enum coding_category category; |
7497 struct coding_system *this; | 7564 struct coding_system *this; |
7498 int c, i; | 7565 int c, i; |
7499 | 7566 |
7567 coding.head_ascii = -1; | |
7500 /* Skip all ASCII bytes except for a few ISO2022 controls. */ | 7568 /* Skip all ASCII bytes except for a few ISO2022 controls. */ |
7501 for (i = 0; src < src_end; i++, src++) | 7569 for (; src < src_end; src++) |
7502 { | 7570 { |
7503 c = *src; | 7571 c = *src; |
7504 if (c & 0x80) | 7572 if (c & 0x80) |
7505 break; | |
7506 if (c < 0x20 | |
7507 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) | |
7508 && ! inhibit_iso_escape_detection) | |
7509 { | 7573 { |
7510 coding.head_ascii = src - coding.source; | 7574 eight_bit_found = 1; |
7511 if (detect_coding_iso_2022 (&coding, &detect_info)) | 7575 if (coding.head_ascii < 0) |
7576 coding.head_ascii = src - coding.source; | |
7577 if (null_byte_found) | |
7578 break; | |
7579 } | |
7580 if (c < 0x20) | |
7581 { | |
7582 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) | |
7583 && ! inhibit_iso_escape_detection | |
7584 && ! detect_info.checked) | |
7512 { | 7585 { |
7513 /* We have scanned the whole data. */ | 7586 if (coding.head_ascii < 0) |
7514 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE)) | 7587 coding.head_ascii = src - coding.source; |
7515 /* We didn't find an 8-bit code. */ | 7588 if (detect_coding_iso_2022 (&coding, &detect_info)) |
7516 src = src_end; | 7589 { |
7517 break; | 7590 /* We have scanned the whole data. */ |
7591 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE)) | |
7592 /* We didn't find an 8-bit code. We may have | |
7593 found a null-byte, but it's very rare that | |
7594 a binary file conforms to ISO-2022. */ | |
7595 src = src_end; | |
7596 break; | |
7597 } | |
7598 } | |
7599 else if (! c) | |
7600 { | |
7601 null_byte_found = 1; | |
7602 if (eight_bit_found) | |
7603 break; | |
7518 } | 7604 } |
7519 } | 7605 } |
7520 } | 7606 } |
7521 coding.head_ascii = src - coding.source; | 7607 if (coding.head_ascii < 0) |
7522 | 7608 coding.head_ascii = src - coding.source; |
7523 if (src < src_end | 7609 |
7610 if (null_byte_found || eight_bit_found | |
7611 || coding.head_ascii < coding.src_bytes | |
7524 || detect_info.found) | 7612 || detect_info.found) |
7525 { | 7613 { |
7526 if (src == src_end) | 7614 if (coding.head_ascii == coding.src_bytes) |
7527 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */ | 7615 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */ |
7528 for (i = 0; i < coding_category_raw_text; i++) | 7616 for (i = 0; i < coding_category_raw_text; i++) |
7529 { | 7617 { |
7530 category = coding_priorities[i]; | 7618 category = coding_priorities[i]; |
7531 this = coding_categories + category; | 7619 this = coding_categories + category; |
7532 if (detect_info.found & (1 << category)) | 7620 if (detect_info.found & (1 << category)) |
7533 break; | 7621 break; |
7534 } | 7622 } |
7535 else | 7623 else |
7536 for (i = 0; i < coding_category_raw_text; i++) | 7624 { |
7537 { | 7625 if (null_byte_found) |
7538 category = coding_priorities[i]; | 7626 { |
7539 this = coding_categories + category; | 7627 detect_info.checked |= ~CATEGORY_MASK_UTF_16; |
7540 | 7628 detect_info.rejected |= ~CATEGORY_MASK_UTF_16; |
7541 if (this->id < 0) | 7629 } |
7542 { | 7630 for (i = 0; i < coding_category_raw_text; i++) |
7543 /* No coding system of this category is defined. */ | 7631 { |
7544 detect_info.rejected |= (1 << category); | 7632 category = coding_priorities[i]; |
7545 } | 7633 this = coding_categories + category; |
7546 else if (category >= coding_category_raw_text) | 7634 |
7547 continue; | 7635 if (this->id < 0) |
7548 else if (detect_info.checked & (1 << category)) | 7636 { |
7549 { | 7637 /* No coding system of this category is defined. */ |
7550 if (highest | 7638 detect_info.rejected |= (1 << category); |
7551 && (detect_info.found & (1 << category))) | 7639 } |
7640 else if (category >= coding_category_raw_text) | |
7641 continue; | |
7642 else if (detect_info.checked & (1 << category)) | |
7643 { | |
7644 if (highest | |
7645 && (detect_info.found & (1 << category))) | |
7646 break; | |
7647 } | |
7648 else if ((*(this->detector)) (&coding, &detect_info) | |
7649 && highest | |
7650 && (detect_info.found & (1 << category))) | |
7651 { | |
7652 if (category == coding_category_utf_16_auto) | |
7653 { | |
7654 if (detect_info.found & CATEGORY_MASK_UTF_16_LE) | |
7655 category = coding_category_utf_16_le; | |
7656 else | |
7657 category = coding_category_utf_16_be; | |
7658 } | |
7552 break; | 7659 break; |
7553 } | 7660 } |
7554 else | 7661 } |
7555 { | 7662 } |
7556 if ((*(this->detector)) (&coding, &detect_info) | 7663 } |
7557 && highest | 7664 |
7558 && (detect_info.found & (1 << category))) | 7665 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY) |
7559 { | |
7560 if (category == coding_category_utf_16_auto) | |
7561 { | |
7562 if (detect_info.found & CATEGORY_MASK_UTF_16_LE) | |
7563 category = coding_category_utf_16_le; | |
7564 else | |
7565 category = coding_category_utf_16_be; | |
7566 } | |
7567 break; | |
7568 } | |
7569 } | |
7570 } | |
7571 } | |
7572 | |
7573 if (detect_info.rejected == CATEGORY_MASK_ANY) | |
7574 { | 7666 { |
7575 detect_info.found = CATEGORY_MASK_RAW_TEXT; | 7667 detect_info.found = CATEGORY_MASK_RAW_TEXT; |
7576 id = coding_categories[coding_category_raw_text].id; | 7668 id = coding_categories[coding_category_raw_text].id; |
7577 val = Fcons (make_number (id), Qnil); | 7669 val = Fcons (make_number (id), Qnil); |
7578 } | 7670 } |
7657 Lisp_Object tail; | 7749 Lisp_Object tail; |
7658 | 7750 |
7659 if (VECTORP (eol_type)) | 7751 if (VECTORP (eol_type)) |
7660 { | 7752 { |
7661 if (detect_info.found & ~CATEGORY_MASK_UTF_16) | 7753 if (detect_info.found & ~CATEGORY_MASK_UTF_16) |
7662 normal_eol = detect_eol (coding.source, src_bytes, | 7754 { |
7663 coding_category_raw_text); | 7755 if (null_byte_found) |
7756 normal_eol = EOL_SEEN_LF; | |
7757 else | |
7758 normal_eol = detect_eol (coding.source, src_bytes, | |
7759 coding_category_raw_text); | |
7760 } | |
7664 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE | 7761 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE |
7665 | CATEGORY_MASK_UTF_16_BE_NOSIG)) | 7762 | CATEGORY_MASK_UTF_16_BE_NOSIG)) |
7666 utf_16_be_eol = detect_eol (coding.source, src_bytes, | 7763 utf_16_be_eol = detect_eol (coding.source, src_bytes, |
7667 coding_category_utf_16_be); | 7764 coding_category_utf_16_be); |
7668 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE | 7765 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE |