changeset 101172:674e67257137

(TWO_MORE_BYTES): New macro. (detect_coding_utf_16): Use TWO_MORE_BYTES instead of ONE_MORE_BYTE.
author Kenichi Handa <handa@m17n.org>
date Wed, 14 Jan 2009 12:17:52 +0000
parents a94440e70b7c
children 825f62fa0199
files src/coding.c
diffstat 1 files changed, 50 insertions(+), 4 deletions(-) [+]
line wrap: on
line diff
--- a/src/coding.c	Wed Jan 14 12:08:49 2009 +0000
+++ b/src/coding.c	Wed Jan 14 12:17:52 2009 +0000
@@ -743,6 +743,47 @@
     consumed_chars++;					\
   } while (0)
 
+/* Safely get two bytes from the source text pointed to by SRC, which
+   ends at SRC_END, and set C1 and C2 to them.  If no byte is left for
+   C1, jump to `no_more_source'; if none is left for C2, set C2 to -1.
+   If multibytep is nonzero, a byte with the 0x80 bit set is decoded:
+   a 0xC0 or 0xC1 lead is combined with the byte after it into one
+   value, any other such byte yields -1 (for C1, C2 is set to -1 too).
+   NOTE(review): that following byte is read without a SRC_END check.
+   The caller should declare and set these variables in advance:
+	src, src_end, multibytep
+   It is intended that this macro is used in detect_coding_utf_16.  */
+
+#define TWO_MORE_BYTES(c1, c2)			\
+  do {						\
+    if (src == src_end)				\
+      goto no_more_source;			\
+    c1 = *src++;				\
+    if (multibytep && (c1 & 0x80))		\
+      {						\
+	if ((c1 & 0xFE) == 0xC0) /* lead byte is 0xC0 or 0xC1 */ \
+	  c1 = ((c1 & 1) << 6) | *src++; /* no src_end check */ \
+	else					\
+	  {					\
+	    c1 = c2 = -1; /* C2 is not read in this case */ \
+	    break;				\
+	  }					\
+      }						\
+    if (src == src_end)				\
+      c2 = -1; /* only C1 was available */	\
+    else					\
+      {						\
+	c2 = *src++;				\
+	if (multibytep && (c2 & 0x80))		\
+	  {					\
+	    if ((c2 & 0xFE) == 0xC0) /* lead byte is 0xC0 or 0xC1 */ \
+	      c2 = ((c2 & 1) << 6) | *src++; /* no src_end check */ \
+	    else				\
+	      c2 = -1;				\
+	  }					\
+      }						\
+  } while (0)
+
 
 #define ONE_MORE_BYTE_NO_CHECK(c)			\
   do {							\
@@ -1575,8 +1616,7 @@
       return 0;
     }
 
-  ONE_MORE_BYTE (c1);
-  ONE_MORE_BYTE (c2);
+  TWO_MORE_BYTES (c1, c2);
   if ((c1 == 0xFF) && (c2 == 0xFE))
     {
       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
@@ -1593,6 +1633,11 @@
 				| CATEGORY_MASK_UTF_16_BE_NOSIG
 				| CATEGORY_MASK_UTF_16_LE_NOSIG);
     }
+  else if (c1 < 0 || c2 < 0)
+    {
+      detect_info->rejected |= CATEGORY_MASK_UTF_16;
+      return 0;
+    }
   else
     {
       /* We check the dispersion of Eth and Oth bytes where E is even and
@@ -1610,8 +1655,9 @@
 
       while (1)
 	{
-	  ONE_MORE_BYTE (c1);
-	  ONE_MORE_BYTE (c2);
+	  TWO_MORE_BYTES (c1, c2);
+	  if (c1 < 0 || c2 < 0)
+	    break;
 	  if (! e[c1])
 	    {
 	      e[c1] = 1;