diff src/coding.c @ 95397:c99f0a16c077

(CODING_UTF_8_BOM): New macro. (enum coding_category): Delete coding_category_utf_8, add coding_category_utf_8_auto, coding_category_utf_8_nosig, and coding_category_utf_8_sig. (CATEGORY_MASK_UTF_8): Delete the single-bit definition. (CATEGORY_MASK_UTF_8_AUTO, CATEGORY_MASK_UTF_8_NOSIG) (CATEGORY_MASK_UTF_8_SIG): New macros. (CATEGORY_MASK_ANY): Delete CATEGORY_MASK_UTF_8, add CATEGORY_MASK_UTF_8_AUTO, CATEGORY_MASK_UTF_8_NOSIG, and CATEGORY_MASK_UTF_8_SIG. (CATEGORY_MASK_UTF_8): Redefine as the union of CATEGORY_MASK_UTF_8_AUTO, CATEGORY_MASK_UTF_8_NOSIG, and CATEGORY_MASK_UTF_8_SIG. (UTF_BOM, UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3): New macros. (detect_coding_utf_8): Check BOM. (decode_coding_utf_8, encode_coding_utf_8): Handle BOM. (decode_coding_utf_16): Adjust for the renaming of enum utf_16_bom_type to enum utf_bom_type. (encode_coding_utf_16): Likewise. (setup_coding_system): Likewise. Set CODING_UTF_8_BOM (coding). (detect_coding, detect_coding_system): Handle utf-8-auto. (Fdefine_coding_system_internal): Handle `bom' property for utf-8. (syms_of_coding): Fix setting up of Vcoding_category_table.
author Kenichi Handa <handa@m17n.org>
date Thu, 29 May 2008 22:58:15 +0000
parents 90b4a0dc21cc
children 831c8ee4d884
line wrap: on
line diff
--- a/src/coding.c	Thu May 29 22:58:00 2008 +0000
+++ b/src/coding.c	Thu May 29 22:58:15 2008 +0000
@@ -546,6 +546,9 @@
    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 
+/* UTF-8 section */
+#define CODING_UTF_8_BOM(coding)	\
+  ((coding)->spec.utf_8_bom)
 
 /* UTF-16 section */
 #define CODING_UTF_16_BOM(coding)	\
@@ -576,7 +579,9 @@
     coding_category_iso_8_2,
     coding_category_iso_7_else,
     coding_category_iso_8_else,
-    coding_category_utf_8,
+    coding_category_utf_8_auto,
+    coding_category_utf_8_nosig,
+    coding_category_utf_8_sig,
     coding_category_utf_16_auto,
     coding_category_utf_16_be,
     coding_category_utf_16_le,
@@ -600,7 +605,9 @@
 #define CATEGORY_MASK_ISO_8_2		(1 << coding_category_iso_8_2)
 #define CATEGORY_MASK_ISO_7_ELSE	(1 << coding_category_iso_7_else)
 #define CATEGORY_MASK_ISO_8_ELSE	(1 << coding_category_iso_8_else)
-#define CATEGORY_MASK_UTF_8		(1 << coding_category_utf_8)
+#define CATEGORY_MASK_UTF_8_AUTO	(1 << coding_category_utf_8_auto)
+#define CATEGORY_MASK_UTF_8_NOSIG	(1 << coding_category_utf_8_nosig)
+#define CATEGORY_MASK_UTF_8_SIG		(1 << coding_category_utf_8_sig)
 #define CATEGORY_MASK_UTF_16_AUTO	(1 << coding_category_utf_16_auto)
 #define CATEGORY_MASK_UTF_16_BE		(1 << coding_category_utf_16_be)
 #define CATEGORY_MASK_UTF_16_LE		(1 << coding_category_utf_16_le)
@@ -622,7 +629,9 @@
    | CATEGORY_MASK_ISO_8_2		\
    | CATEGORY_MASK_ISO_7_ELSE		\
    | CATEGORY_MASK_ISO_8_ELSE		\
-   | CATEGORY_MASK_UTF_8		\
+   | CATEGORY_MASK_UTF_8_AUTO		\
+   | CATEGORY_MASK_UTF_8_NOSIG		\
+   | CATEGORY_MASK_UTF_8_SIG		\
    | CATEGORY_MASK_UTF_16_AUTO		\
    | CATEGORY_MASK_UTF_16_BE		\
    | CATEGORY_MASK_UTF_16_LE		\
@@ -662,6 +671,10 @@
    | CATEGORY_MASK_UTF_16_BE_NOSIG	\
    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 
+#define CATEGORY_MASK_UTF_8	\
+  (CATEGORY_MASK_UTF_8_AUTO	\
+   | CATEGORY_MASK_UTF_8_NOSIG	\
+   | CATEGORY_MASK_UTF_8_SIG)
 
 /* List of symbols `coding-category-xxx' ordered by priority.  This
    variable is exposed to Emacs Lisp.  */
@@ -1214,6 +1227,11 @@
 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
 
+#define UTF_BOM 0xFEFF
+#define UTF_8_BOM_1 0xEF
+#define UTF_8_BOM_2 0xBB
+#define UTF_8_BOM_3 0xBF
+
 static int
 detect_coding_utf_8 (coding, detect_info)
      struct coding_system *coding;
@@ -1223,6 +1241,7 @@
   const unsigned char *src_end = coding->source + coding->src_bytes;
   int multibytep = coding->src_multibyte;
   int consumed_chars = 0;
+  int bom_found = 0;
   int found = 0;
 
   detect_info->checked |= CATEGORY_MASK_UTF_8;
@@ -1242,7 +1261,7 @@
 	break;
       if (UTF_8_2_OCTET_LEADING_P (c))
 	{
-	  found = CATEGORY_MASK_UTF_8;
+	  found = 1;
 	  continue;
 	}
       ONE_MORE_BYTE (c2);
@@ -1250,7 +1269,10 @@
 	break;
       if (UTF_8_3_OCTET_LEADING_P (c))
 	{
-	  found = CATEGORY_MASK_UTF_8;
+	  found = 1;
+	  if (src_base == coding->source
+	      && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
+	    bom_found = 1;
 	  continue;
 	}
       ONE_MORE_BYTE (c3);
@@ -1258,7 +1280,7 @@
 	break;
       if (UTF_8_4_OCTET_LEADING_P (c))
 	{
-	  found = CATEGORY_MASK_UTF_8;
+	  found = 1;
 	  continue;
 	}
       ONE_MORE_BYTE (c4);
@@ -1266,7 +1288,7 @@
 	break;
       if (UTF_8_5_OCTET_LEADING_P (c))
 	{
-	  found = CATEGORY_MASK_UTF_8;
+	  found = 1;
 	  continue;
 	}
       break;
@@ -1280,7 +1302,16 @@
       detect_info->rejected |= CATEGORY_MASK_UTF_8;
       return 0;
     }
-  detect_info->found |= found;
+  if (bom_found)
+    {
+      /* The first character 0xFEFF doesn't necessarily mean a BOM.  */
+      detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
+    }
+  else
+    {
+      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
+      detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
+    }
   return 1;
 }
 
@@ -1296,12 +1327,46 @@
   int *charbuf_end = coding->charbuf + coding->charbuf_size;
   int consumed_chars = 0, consumed_chars_base;
   int multibytep = coding->src_multibyte;
+  enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
   Lisp_Object attr, charset_list;
   int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
   int byte_after_cr = -1;
 
   CODING_GET_INFO (coding, attr, charset_list);
 
+  if (bom != utf_without_bom)
+    {
+      int c1, c2, c3;
+
+      src_base = src;
+      ONE_MORE_BYTE (c1);
+      if (! UTF_8_3_OCTET_LEADING_P (c1))
+	src = src_base;
+      else
+	{
+	  ONE_MORE_BYTE (c2);      
+	  if (! UTF_8_EXTRA_OCTET_P (c2))
+	    src = src_base;
+	  else
+	    {
+	      ONE_MORE_BYTE (c3);      
+	      if (! UTF_8_EXTRA_OCTET_P (c3))
+		src = src_base;
+	      else
+		{
+		  if ((c1 != UTF_8_BOM_1)
+		      || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
+		    src = src_base;
+		  else
+		    CODING_UTF_8_BOM (coding) = utf_without_bom;
+		}
+	    }
+	}
+    }
+  CODING_UTF_8_BOM (coding) = utf_without_bom;
+
+
+
   while (1)
     {
       int c, c1, c2, c3, c4, c5;
@@ -1415,6 +1480,13 @@
   int produced_chars = 0;
   int c;
 
+  if (CODING_UTF_8_BOM (coding) == utf_with_bom)
+    {
+      ASSURE_DESTINATION (3);
+      EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
+      CODING_UTF_8_BOM (coding) = utf_without_bom;
+    }
+
   if (multibytep)
     {
       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
@@ -1566,7 +1638,7 @@
   int *charbuf_end = coding->charbuf + coding->charbuf_size;
   int consumed_chars = 0, consumed_chars_base;
   int multibytep = coding->src_multibyte;
-  enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
+  enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
   int surrogate = CODING_UTF_16_SURROGATE (coding);
   Lisp_Object attr, charset_list;
@@ -1575,7 +1647,7 @@
 
   CODING_GET_INFO (coding, attr, charset_list);
 
-  if (bom == utf_16_with_bom)
+  if (bom == utf_with_bom)
     {
       int c, c1, c2;
 
@@ -1592,13 +1664,13 @@
 	  src = src_base;
 	  coding->errors++;
 	}
-      CODING_UTF_16_BOM (coding) = utf_16_without_bom;
-    }
-  else if (bom == utf_16_detect_bom)
+      CODING_UTF_16_BOM (coding) = utf_without_bom;
+    }
+  else if (bom == utf_detect_bom)
     {
       /* We have already tried to detect BOM and failed in
 	 detect_coding.  */
-      CODING_UTF_16_BOM (coding) = utf_16_without_bom;
+      CODING_UTF_16_BOM (coding) = utf_without_bom;
     }
 
   while (1)
@@ -1688,7 +1760,7 @@
   unsigned char *dst = coding->destination + coding->produced;
   unsigned char *dst_end = coding->destination + coding->dst_bytes;
   int safe_room = 8;
-  enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
+  enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
   int produced_chars = 0;
   Lisp_Object attrs, charset_list;
@@ -1696,14 +1768,14 @@
 
   CODING_GET_INFO (coding, attrs, charset_list);
 
-  if (bom != utf_16_without_bom)
+  if (bom != utf_without_bom)
     {
       ASSURE_DESTINATION (safe_room);
       if (big_endian)
 	EMIT_TWO_BYTES (0xFE, 0xFF);
       else
 	EMIT_TWO_BYTES (0xFF, 0xFE);
-      CODING_UTF_16_BOM (coding) = utf_16_without_bom;
+      CODING_UTF_16_BOM (coding) = utf_without_bom;
     }
 
   while (charbuf < charbuf_end)
@@ -5272,18 +5344,24 @@
     }
   else if (EQ (coding_type, Qutf_8))
     {
+      val = AREF (attrs, coding_attr_utf_bom);
+      CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
+				   : EQ (val, Qt) ? utf_with_bom
+				   : utf_without_bom);
       coding->detector = detect_coding_utf_8;
       coding->decoder = decode_coding_utf_8;
       coding->encoder = encode_coding_utf_8;
       coding->common_flags
 	|= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
+      if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
+	coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
     }
   else if (EQ (coding_type, Qutf_16))
     {
-      val = AREF (attrs, coding_attr_utf_16_bom);
-      CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
-				    : EQ (val, Qt) ? utf_16_with_bom
-				    : utf_16_without_bom);
+      val = AREF (attrs, coding_attr_utf_bom);
+      CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
+				    : EQ (val, Qt) ? utf_with_bom
+				    : utf_without_bom);
       val = AREF (attrs, coding_attr_utf_16_endian);
       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
 				       : utf_16_little_endian);
@@ -5293,7 +5371,7 @@
       coding->encoder = encode_coding_utf_16;
       coding->common_flags
 	|= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
-      if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
+      if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
 	coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
     }
   else if (EQ (coding_type, Qccl))
@@ -5828,14 +5906,34 @@
 	}
     }
   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
+	   == coding_category_utf_8_auto)
+    {
+      Lisp_Object coding_systems;
+      struct coding_detection_info detect_info;
+
+      coding_systems
+	= AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
+      detect_info.found = detect_info.rejected = 0;
+      coding->head_ascii = 0;
+      if (CONSP (coding_systems)
+	  && detect_coding_utf_8 (coding, &detect_info))
+	{
+	  if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
+	    setup_coding_system (XCAR (coding_systems), coding);
+	  else
+	    setup_coding_system (XCDR (coding_systems), coding);
+	}
+    }
+  else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
 	   == coding_category_utf_16_auto)
     {
       Lisp_Object coding_systems;
       struct coding_detection_info detect_info;
 
       coding_systems
-	= AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
+	= AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
       detect_info.found = detect_info.rejected = 0;
+      coding->head_ascii = 0;
       if (CONSP (coding_systems)
 	  && detect_coding_utf_16 (coding, &detect_info))
 	{
@@ -7724,6 +7822,19 @@
 	  detect_info.found |= found;
 	}
     }
+  else if (base_category == coding_category_utf_8_auto)
+    {
+      if (detect_coding_utf_8 (&coding, &detect_info))
+	{
+	  struct coding_system *this;
+
+	  if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
+	    this = coding_categories + coding_category_utf_8_sig;
+	  else
+	    this = coding_categories + coding_category_utf_8_nosig;
+	  val = Fcons (make_number (this->id), Qnil);
+	}
+    }
   else if (base_category == coding_category_utf_16_auto)
     {
       if (detect_coding_utf_16 (&coding, &detect_info))
@@ -9154,7 +9265,7 @@
 	  val = XCDR (bom);
 	  CHECK_CODING_SYSTEM (val);
 	}
-      ASET (attrs, coding_attr_utf_16_bom, bom);
+      ASET (attrs, coding_attr_utf_bom, bom);
 
       endian = args[coding_arg_utf16_endian];
       CHECK_SYMBOL (endian);
@@ -9333,8 +9444,27 @@
     }
   else if (EQ (coding_type, Qutf_8))
     {
-      category = coding_category_utf_8;
+      Lisp_Object bom;
+
       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
+
+      if (nargs < coding_arg_utf8_max)
+	goto short_args;
+
+      bom = args[coding_arg_utf8_bom];
+      if (! NILP (bom) && ! EQ (bom, Qt))
+	{
+	  CHECK_CONS (bom);
+	  val = XCAR (bom);
+	  CHECK_CODING_SYSTEM (val);
+	  val = XCDR (bom);
+	  CHECK_CODING_SYSTEM (val);
+	}
+      ASET (attrs, coding_attr_utf_bom, bom);
+
+      category = (CONSP (bom) ? coding_category_utf_8_auto
+		  : NILP (bom) ? coding_category_utf_8_nosig
+		  : coding_category_utf_8_sig);
     }
   else if (EQ (coding_type, Qundecided))
     category = coding_category_undecided;
@@ -9755,8 +9885,12 @@
 	intern ("coding-category-iso-7-else"));
   ASET (Vcoding_category_table, coding_category_iso_8_else,
 	intern ("coding-category-iso-8-else"));
-  ASET (Vcoding_category_table, coding_category_utf_8,
+  ASET (Vcoding_category_table, coding_category_utf_8_auto,
+	intern ("coding-category-utf-8-auto"));
+  ASET (Vcoding_category_table, coding_category_utf_8_nosig,
 	intern ("coding-category-utf-8"));
+  ASET (Vcoding_category_table, coding_category_utf_8_sig,
+	intern ("coding-category-utf-8-sig"));
   ASET (Vcoding_category_table, coding_category_utf_16_be,
 	intern ("coding-category-utf-16-be"));
   ASET (Vcoding_category_table, coding_category_utf_16_auto,