changeset 79877:ef3503538f91

(detect_coding_iso2022): New arg latin_extra_code_state. Allow Latin extra codes only when *latin_extra_code_state is nonzero. (detect_coding_mask): If there is a NULL byte, detect the encoding as UTF-16 or binary. If Latin extra codes exist, detect the encoding as ISO-2022 only when there's no other proper encoding is found.
author Kenichi Handa <handa@m17n.org>
date Sat, 19 Jan 2008 05:55:36 +0000
parents 174ad23bc1f2
children 6a0f9ee0177d
files src/coding.c
diffstat 1 files changed, 93 insertions(+), 24 deletions(-) [+]
line wrap: on
line diff
--- a/src/coding.c	Sat Jan 19 04:36:40 2008 +0000
+++ b/src/coding.c	Sat Jan 19 05:55:36 2008 +0000
@@ -1410,12 +1410,17 @@
 	CODING_CATEGORY_MASK_ISO_7_ELSE
 	CODING_CATEGORY_MASK_ISO_8_ELSE
    are set.  If a code which should never appear in ISO2022 is found,
-   returns 0.  */
+   returns 0.
+
+   If *latin_extra_code_state is zero and Latin extra codes are found,
+   set *latin_extra_code_state to 1 and return 0.  If it is nonzero,
+   accept Latin extra codes.  */
 
 static int
-detect_coding_iso2022 (src, src_end, multibytep)
+detect_coding_iso2022 (src, src_end, multibytep, latin_extra_code_state)
      unsigned char *src, *src_end;
      int multibytep;
+     int *latin_extra_code_state;
 {
   int mask = CODING_CATEGORY_MASK_ISO;
   int mask_found = 0;
@@ -1578,6 +1583,11 @@
 	    if (VECTORP (Vlatin_extra_code_table)
 		&& !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 	      {
+		if (! *latin_extra_code_state)
+		  {
+		    *latin_extra_code_state = 1;
+		    return 0;
+		  }
 		if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 		    & CODING_FLAG_ISO_LATIN_EXTRA)
 		  newmask |= CODING_CATEGORY_MASK_ISO_8_1;
@@ -1604,6 +1614,11 @@
 		{
 		  int newmask = 0;
 
+		  if (! *latin_extra_code_state)
+		    {
+		      *latin_extra_code_state = 1;
+		      return 0;
+		    }
 		  if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 		      & CODING_FLAG_ISO_LATIN_EXTRA)
 		    newmask |= CODING_CATEGORY_MASK_ISO_8_1;
@@ -4131,6 +4146,8 @@
   unsigned char *src = source, *src_end = source + src_bytes;
   unsigned int mask, utf16_examined_p, iso2022_examined_p;
   int i;
+  int null_byte_found;
+  int latin_extra_code_state = 1;
 
   /* At first, skip all ASCII characters and control characters except
      for three ISO2022 specific control characters.  */
@@ -4139,21 +4156,36 @@
   ascii_skip_code[ISO_CODE_ESC] = 0;
 
  label_loop_detect_coding:
-  while (src < src_end && ascii_skip_code[*src]) src++;
+  null_byte_found = 0;
+  /* We stop this loop before the last byte because it may be a NULL
+     anchor byte.  */
+  while (src < src_end - 1 && ascii_skip_code[*src])
+    null_byte_found |= (! *src++);
+  if (ascii_skip_code[*src])
+    src++;
+  else if (! null_byte_found)
+    {
+      unsigned char *p = src + 1;
+      while (p < src_end - 1)
+	null_byte_found |= (! *p++);
+    }
   *skip = src - source;
 
   if (src >= src_end)
-    /* We found nothing other than ASCII.  There's nothing to do.  */
+    /* We found nothing other than ASCII (and NULL byte).  There's
+       nothing to do.  */
     return 0;
 
   c = *src;
   /* The text seems to be encoded in some multilingual coding system.
      Now, try to find in which coding system the text is encoded.  */
-  if (c < 0x80)
+  if (! null_byte_found && c < 0x80)
     {
       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
       /* C is an ISO2022 specific control code of C0.  */
-      mask = detect_coding_iso2022 (src, src_end, multibytep);
+      latin_extra_code_state = 1;
+      mask = detect_coding_iso2022 (src, src_end, multibytep,
+				    &latin_extra_code_state);
       if (mask == 0)
 	{
 	  /* No valid ISO2022 code follows C.  Try again.  */
@@ -4181,21 +4213,27 @@
       if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
 	c = src[1] - 0x20;
 
-      if (c < 0xA0)
+      if (null_byte_found)
+	{
+	  try = (CODING_CATEGORY_MASK_UTF_16_BE
+		 | CODING_CATEGORY_MASK_UTF_16_LE);
+	}
+      else if (c < 0xA0)
 	{
 	  /* C is the first byte of SJIS character code,
 	     or a leading-code of Emacs' internal format (emacs-mule),
 	     or the first byte of UTF-16.  */
 	  try = (CODING_CATEGORY_MASK_SJIS
-		  | CODING_CATEGORY_MASK_EMACS_MULE
-		  | CODING_CATEGORY_MASK_UTF_16_BE
-		  | CODING_CATEGORY_MASK_UTF_16_LE);
+		 | CODING_CATEGORY_MASK_EMACS_MULE
+		 | CODING_CATEGORY_MASK_UTF_16_BE
+		 | CODING_CATEGORY_MASK_UTF_16_LE);
 
 	  /* Or, if C is a special latin extra code,
 	     or is an ISO2022 specific control code of C1 (SS2 or SS3),
 	     or is an ISO2022 control-sequence-introducer (CSI),
 	     we should also consider the possibility of ISO2022 codings.  */
-	  if ((VECTORP (Vlatin_extra_code_table)
+	  if ((latin_extra_code_state
+	       && VECTORP (Vlatin_extra_code_table)
 	       && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 	      || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
 	      || (c == ISO_CODE_CSI
@@ -4205,7 +4243,7 @@
 			      && src + 1 < src_end
 			      && src[1] == ']')))))
 	    try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
-		     | CODING_CATEGORY_MASK_ISO_8BIT);
+		    | CODING_CATEGORY_MASK_ISO_8BIT);
 	}
       else
 	/* C is a character of ISO2022 in graphic plane right,
@@ -4213,29 +4251,36 @@
 	   or the first byte of BIG5's 2-byte code,
 	   or the first byte of UTF-8/16.  */
 	try = (CODING_CATEGORY_MASK_ISO_8_ELSE
-		| CODING_CATEGORY_MASK_ISO_8BIT
-		| CODING_CATEGORY_MASK_SJIS
-		| CODING_CATEGORY_MASK_BIG5
-	        | CODING_CATEGORY_MASK_UTF_8
-	        | CODING_CATEGORY_MASK_UTF_16_BE
-	        | CODING_CATEGORY_MASK_UTF_16_LE);
+	       | CODING_CATEGORY_MASK_ISO_8BIT
+	       | CODING_CATEGORY_MASK_SJIS
+	       | CODING_CATEGORY_MASK_BIG5
+	       | CODING_CATEGORY_MASK_UTF_8
+	       | CODING_CATEGORY_MASK_UTF_16_BE
+	       | CODING_CATEGORY_MASK_UTF_16_LE);
 
       /* Or, we may have to consider the possibility of CCL.  */
-      if (coding_system_table[CODING_CATEGORY_IDX_CCL]
+      if (! null_byte_found
+	  && coding_system_table[CODING_CATEGORY_IDX_CCL]
 	  && (coding_system_table[CODING_CATEGORY_IDX_CCL]
 	      ->spec.ccl.valid_codes)[c])
 	try |= CODING_CATEGORY_MASK_CCL;
 
       mask = 0;
-      utf16_examined_p = iso2022_examined_p = 0;
       if (priorities)
 	{
+	  /* At first try detection with Latin extra codes not-allowed.
+	     If no proper coding system is found because of Latin extra
+	     codes, try detection with Latin extra codes allowed.  */
+	  latin_extra_code_state = 0;
+	label_retry:
+	  utf16_examined_p = iso2022_examined_p = 0;
 	  for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
 	    {
 	      if (!iso2022_examined_p
 		  && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
 		{
-		  mask |= detect_coding_iso2022 (src, src_end, multibytep);
+		  mask |= detect_coding_iso2022 (src, src_end, multibytep,
+						 &latin_extra_code_state);
 		  iso2022_examined_p = 1;
 		}
 	      else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
@@ -4256,16 +4301,40 @@
 	      else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
 		mask |= detect_coding_ccl (src, src_end, multibytep);
 	      else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
-		mask |= CODING_CATEGORY_MASK_RAW_TEXT;
+		{
+		  if (latin_extra_code_state == 1)
+		    {
+		      /* Detection of ISO-2022 based coding system
+			 failed because of Latin extra codes.  Before
+			 falling back to raw-text, try again with
+			 Latin extra codes allowed.  */
+		      latin_extra_code_state = 2;
+		      try = (mask | CODING_CATEGORY_MASK_ISO_8_ELSE
+			     | CODING_CATEGORY_MASK_ISO_8BIT);
+		      goto label_retry;
+		    }
+		  mask |= CODING_CATEGORY_MASK_RAW_TEXT;
+		}
 	      else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
-		mask |= CODING_CATEGORY_MASK_BINARY;
+		{
+		  if (latin_extra_code_state == 1)
+		    {
+		      /* See the above comment.  */
+		      latin_extra_code_state = 2;
+		      try = (mask | CODING_CATEGORY_MASK_ISO_8_ELSE
+			     | CODING_CATEGORY_MASK_ISO_8BIT);
+		      goto label_retry;
+		    }
+		  mask |= CODING_CATEGORY_MASK_BINARY;
+		}
 	      if (mask & priorities[i])
 		return priorities[i];
 	    }
 	  return CODING_CATEGORY_MASK_RAW_TEXT;
 	}
       if (try & CODING_CATEGORY_MASK_ISO)
-	mask |= detect_coding_iso2022 (src, src_end, multibytep);
+	mask |= detect_coding_iso2022 (src, src_end, multibytep,
+				       &latin_extra_code_state);
       if (try & CODING_CATEGORY_MASK_SJIS)
 	mask |= detect_coding_sjis (src, src_end, multibytep);
       if (try & CODING_CATEGORY_MASK_BIG5)