emacs: src/coding.c comparison

comparison src/coding.c @ 20931:068eb408c911

(decode_coding_iso2022): Update coding->fake_multibyte. (ENCODE_SINGLE_SHIFT_2, ENCODE_SINGLE_SHIFT_3, encode_coding_iso2022, decode_coding_sjis_big5, encode_coding_sjis_big5, decode_eol, encode_eol, decode_coding, encode_coding): Likewise. (shrink_decoding_region, shrink_encoding_region): Do not skip non-ASCII codes in any case. Fix a bug in getting the starting address from BEG. (code_convert_region): Sync character positions correctly by paying attention to coding->fake_multibyte. (code_convert_string): Set the number of characters and bytes just processed in members of CODING. (code_convert_string): Adjusted for the change of code_convert_region. (code_convert_region1): Likewise.

author	Kenichi Handa <handa@m17n.org>
date	Fri, 20 Feb 1998 01:40:47 +0000
parents	0fa2183c587d
children	e4dd62e5d921

comparison

equal deleted inserted replaced

-:1331679fe704
+:068eb408c911
 if (!NILP (Venable_character_unification) && NILP (unification_table))
 unification_table = Vstandard_character_unification_table_for_decode;
 coding->produced_char = 0;
+coding->fake_multibyte = 0;
 while (src < src_end && (dst_bytes
 			   ? (dst < adjusted_dst_end)
 			   : (dst < src - 6)))
 {
 /* SRC_BASE remembers the start position in source in each loop.
 	  break;
 	case ISO_0xA0_or_0xFF:
 	  if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
 	      || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
-	    {
+	    goto label_invalid_code;
-	      /* Invalid code.  */
-	      *dst++ = c1;
-	      coding->produced_char++;
-	      break;
-	    }
 	  /* This is a graphic character, we fall down ... */
 	case ISO_graphic_plane_1:
 	  if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
-	    {
+	    goto label_invalid_code;
-	      /* Invalid code.  */
-	      *dst++ = c1;
-	      coding->produced_char++;
-	    }
 	  else
 	    DECODE_ISO_CHARACTER (charset1, c1);
 	  break;
 	case ISO_control_code:
 	  charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 	  charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
 	  break;
 	label_invalid_code:
-	  coding->produced_char += src - src_base;
 	  while (src_base < src)
 	    *dst++ = *src_base++;
+	  coding->fake_multibyte = 1;
 	}
 continue;
 label_end_of_loop:
 result = CODING_FINISH_INSUFFICIENT_SRC;
 label_end_of_loop_2:
 src = src_base;
 break;
 }
-if (result == CODING_FINISH_NORMAL
+if (src < src_end)
-&& src < src_end)
+{
-result = CODING_FINISH_INSUFFICIENT_DST;
+if (result == CODING_FINISH_NORMAL)
+	result = CODING_FINISH_INSUFFICIENT_DST;
-/* If this is the last block of the text to be decoded, we had
+else if (result != CODING_FINISH_INCONSISTENT_EOL
-better just flush out all remaining codes in the text although
+	       && coding->mode & CODING_MODE_LAST_BLOCK)
-they are not valid characters.  */
+	{
-if (coding->mode & CODING_MODE_LAST_BLOCK)
+	  /* This is the last block of the text to be decoded.  We had
-{
+	     better just flush out all remaining codes in the text
-bcopy (src, dst, src_end - src);
+	     although they are not valid characters.  */
-dst += (src_end - src);
+	  src_bytes = src_end - src;
-src = src_end;
+	  if (dst_bytes && (dst_end - dst < src_bytes))
-}
+	    src_bytes = dst_end - dst;
+	  bcopy (src, dst, src_bytes);
+	  dst += src_bytes;
+	  src += src_bytes;
+	  coding->fake_multibyte = 1;
+	}
+}
 coding->consumed = coding->consumed_char = src - source;
 coding->produced = dst - destination;
 return result;
 }
 #define ENCODE_SINGLE_SHIFT_2				\
 do {							\
 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)	\
 *dst++ = ISO_CODE_ESC, *dst++ = 'N';		\
 else						\
-*dst++ = ISO_CODE_SS2;				\
+{							\
+	*dst++ = ISO_CODE_SS2;				\
+	coding->fake_multibyte = 1;			\
+}							\
 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;	\
 } while (0)
-#define ENCODE_SINGLE_SHIFT_3			   	\
+#define ENCODE_SINGLE_SHIFT_3				\
-do {						   	\
+do {							\
 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)	\
-*dst++ = ISO_CODE_ESC, *dst++ = 'O';	   	\
+*dst++ = ISO_CODE_ESC, *dst++ = 'O';		\
-else					   	\
+else						\
-*dst++ = ISO_CODE_SS3;			   	\
+{							\
+	*dst++ = ISO_CODE_SS3;				\
+	coding->fake_multibyte = 1;			\
+}							\
 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;	\
 } while (0)
 /* The following four macros produce codes (control character or
 escape sequence) for ISO2022 locking-shift functions (shift-in,
 if (!NILP (Venable_character_unification) && NILP (unification_table))
 unification_table = Vstandard_character_unification_table_for_encode;
 coding->consumed_char = 0;
+coding->fake_multibyte = 0;
 while (src < src_end && (dst_bytes
 			   ? (dst < adjusted_dst_end)
 			   : (dst < src - 19)))
 {
 /* SRC_BASE remembers the start position in source in each loop.
 result = CODING_FINISH_INSUFFICIENT_SRC;
 src = src_base;
 break;
 }
-if (result == CODING_FINISH_NORMAL
+if (src < src_end)
-&& src < src_end)
+{
-result = CODING_FINISH_INSUFFICIENT_DST;
+if (result == CODING_FINISH_NORMAL)
+	result = CODING_FINISH_INSUFFICIENT_DST;
-/* If this is the last block of the text to be encoded, we must
+else
-reset graphic planes and registers to the initial state, and
+	/* If this is the last block of the text to be encoded, we
-flush out the carryover if any.  */
+	   must reset graphic planes and registers to the initial
-if (coding->mode & CODING_MODE_LAST_BLOCK)
+	   state, and flush out the carryover if any.  */
-ENCODE_RESET_PLANE_AND_REGISTER;
+	if (coding->mode & CODING_MODE_LAST_BLOCK)
+	  ENCODE_RESET_PLANE_AND_REGISTER;
+}
 coding->consumed = src - source;
 coding->produced = coding->produced_char = dst - destination;
 return result;
 }
 else if (CHARSET_DIMENSION (charset_alt) == 1)			  \
 {									  \
 	if (sjis_p && charset_alt == charset_katakana_jisx0201)		  \
 	  *dst++ = c1;							  \
 	else								  \
-	  *dst++ = charset_alt, *dst++ = c1;				  \
+	  {								  \
+	    *dst++ = charset_alt, *dst++ = c1;				  \
+	    coding->fake_multibyte = 1;					  \
+	  }								  \
 }									  \
 else								  \
 {									  \
 	c1 &= 0x7F, c2 &= 0x7F;						  \
 	if (sjis_p && charset_alt == charset_jisx0208)			  \
 	  {								  \
 	    unsigned char s1, s2;					  \
-									  \
+	    								  \
 	    ENCODE_SJIS (c1, c2, s1, s2);				  \
 	    *dst++ = s1, *dst++ = s2;					  \
+	    coding->fake_multibyte = 1;					  \
 	  }								  \
 	else if (!sjis_p						  \
 		 && (charset_alt == charset_big5_1			  \
 		     || charset_alt == charset_big5_2))			  \
 	  {								  \
 	    unsigned char b1, b2;					  \
-									  \
+	    								  \
 	    ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);			  \
 	    *dst++ = b1, *dst++ = b2;					  \
 	  }								  \
 	else								  \
-	  *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;		  \
+	  {								  \
+	    *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;		  \
+	    coding->fake_multibyte = 1;					  \
+	  }								  \
 }									  \
 coding->consumed_char++;						  \
 } while (0);
 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 if (!NILP (Venable_character_unification) && NILP (unification_table))
 unification_table = Vstandard_character_unification_table_for_decode;
 coding->produced_char = 0;
+coding->fake_multibyte = 0;
 while (src < src_end && (dst_bytes
 			   ? (dst < adjusted_dst_end)
 			   : (dst < src - 3)))
 {
 /* SRC_BASE remembers the start position in source in each loop.
 	    *dst++ = c1;
 	  coding->produced_char++;
 	}
 else if (c1 < 0x80)
 	DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
-else if (c1 < 0xA0 || c1 >= 0xE0)
+else if (c1 < 0xA0)
 	{
-	  /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
+	  /* SJIS -> JISX0208 */
 	  if (sjis_p)
 	    {
 	      ONE_MORE_BYTE (c2);
-	      DECODE_SJIS (c1, c2, c3, c4);
+	      if (c2 >= 0x40)
-	      DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
+		{
+		  DECODE_SJIS (c1, c2, c3, c4);
+		  DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
+		}
+	      else
+		goto label_invalid_code_2;
 	    }
-	  else if (c1 >= 0xE0 && c1 < 0xFF)
+	  else
-	    {
+	    goto label_invalid_code_1;
-	      int charset;
+	}
+else if (c1 < 0xE0)
-	      ONE_MORE_BYTE (c2);
-	      DECODE_BIG5 (c1, c2, charset, c3, c4);
-	      DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
-	    }
-	  else			/* Invalid code */
-	    {
-	      *dst++ = c1;
-	      coding->produced_char++;
-	    }
-	}
-else
 	{
 	  /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
 	  if (sjis_p)
 	    DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1,
 					/* dummy */ c2);
 	  else
 	    {
 	      int charset;
 	      ONE_MORE_BYTE (c2);
-	      DECODE_BIG5 (c1, c2, charset, c3, c4);
+	      if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE))
-	      DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
+		{
+		  DECODE_BIG5 (c1, c2, charset, c3, c4);
+		  DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
+		}
+	      else
+		goto label_invalid_code_2;
 	    }
 	}
+else			/* C1 >= 0xE0 */
+	{
+	  /* SJIS -> JISX0208, BIG5 -> Big5 */
+	  if (sjis_p)
+	    {
+	      ONE_MORE_BYTE (c2);
+	      if (c2 >= 0x40)
+		{
+		  DECODE_SJIS (c1, c2, c3, c4);
+		  DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
+		}
+	      else
+		goto label_invalid_code_2;
+	    }
+	  else
+	    {
+	      int charset;
+	      ONE_MORE_BYTE (c2);
+	      if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE))
+		{
+		  DECODE_BIG5 (c1, c2, charset, c3, c4);
+		  DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
+		}
+	      else
+		goto label_invalid_code_2;
+	    }
+	}
+continue;
+label_invalid_code_1:
+*dst++ = c1;
+coding->produced_char++;
+coding->fake_multibyte = 1;
+continue;
+label_invalid_code_2:
+*dst++ = c1; *dst++= c2;
+coding->produced_char += 2;
+coding->fake_multibyte = 1;
 continue;
 label_end_of_loop:
 result = CODING_FINISH_INSUFFICIENT_SRC;
 label_end_of_loop_2:
 src = src_base;
 break;
 }
-if (result == CODING_FINISH_NORMAL
+if (src < src_end)
-&& src < src_end)
+{
-result = CODING_FINISH_INSUFFICIENT_DST;
+if (result == CODING_FINISH_NORMAL)
+	result = CODING_FINISH_INSUFFICIENT_DST;
+else if (result != CODING_FINISH_INCONSISTENT_EOL
+	       && coding->mode & CODING_MODE_LAST_BLOCK)
+	{
+	  src_bytes = src_end - src;
+	  if (dst_bytes && (dst_end - dst < src_bytes))
+	    src_bytes = dst_end - dst;
+	  bcopy (dst, src, src_bytes);
+	  src += src_bytes;
+	  dst += src_bytes;
+	  coding->fake_multibyte = 1;
+	}
+}
 coding->consumed = coding->consumed_char = src - source;
 coding->produced = dst - destination;
 return result;
 }
 if (!NILP (Venable_character_unification) && NILP (unification_table))
 unification_table = Vstandard_character_unification_table_for_encode;
 coding->consumed_char = 0;
+coding->fake_multibyte = 0;
 while (src < src_end && (dst_bytes
 			   ? (dst < adjusted_dst_end)
 			   : (dst < src - 1)))
 {
 /* SRC_BASE remembers the start position in source in each loop.
 {
 unsigned char *src = source;
 unsigned char *src_end = source + src_bytes;
 unsigned char *dst = destination;
 unsigned char *dst_end = destination + dst_bytes;
+unsigned char c;
 int result = CODING_FINISH_NORMAL;
+coding->fake_multibyte = 0;
 if (src_bytes <= 0)
 return result;
 switch (coding->eol_type)
 	while (src < src_end && (dst_bytes
 				 ? (dst < adjusted_dst_end)
 				 : (dst < src - 1)))
 	  {
 	    unsigned char *src_base = src;
-	    unsigned char c = *src++;
+	    c = *src++;
 	    if (c == '\r')
 	      {
 		ONE_MORE_BYTE (c);
 		if (c != '\n')
 		  {
 		      {
 			result = CODING_FINISH_INCONSISTENT_EOL;
 			goto label_end_of_loop_2;
 		      }
 		    *dst++ = '\r';
+		    if (BASE_LEADING_CODE_P (c))
+		      coding->fake_multibyte = 1;
 		  }
 		*dst++ = c;
 	      }
 	    else if (c == '\n'
 		     && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
 	      {
 		result = CODING_FINISH_INCONSISTENT_EOL;
 		goto label_end_of_loop_2;
 	      }
 	    else
-	      *dst++ = c;
+	      {
+		*dst++ = c;
+		if (BASE_LEADING_CODE_P (c))
+		  coding->fake_multibyte = 1;
+	      }
 	    continue;
 	  label_end_of_loop:
 	    result = CODING_FINISH_INSUFFICIENT_SRC;
 	  label_end_of_loop_2:
 break;
 case CODING_EOL_CR:
 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
 	{
-	  while (src < src_end) if (*src++ == '\n') break;
+	  while (src < src_end)
+	    {
+	      if ((c = *src++) == '\n')
+		break;
+	      if (BASE_LEADING_CODE_P (c))
+		coding->fake_multibyte = 1;
+	    }
 	  if (*--src == '\n')
 	    {
 	      src_bytes = src - source;
 	      result = CODING_FINISH_INCONSISTENT_EOL;
 	    }
 	bcopy (source, destination, src_bytes);
 else
 	safe_bcopy (source, destination, src_bytes);
 src += src_bytes;
 dst += dst_bytes;
+coding->fake_multibyte = 1;
 break;
 }
 coding->consumed = coding->consumed_char = src - source;
 coding->produced = coding->produced_char = dst - destination;
 int src_bytes, dst_bytes;
 {
 unsigned char *src = source;
 unsigned char *dst = destination;
 int result = CODING_FINISH_NORMAL;
+coding->fake_multibyte = 0;
 if (coding->eol_type == CODING_EOL_CRLF)
 {
 unsigned char c;
 unsigned char *src_end = source + src_bytes;
 	  c = *src++;
 	  if (c == '\n'
 	      || (c == '\r' && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)))
 	    *dst++ = '\r', *dst++ = '\n';
 	  else
-	    *dst++ = c;
+	    {
+	      *dst++ = c;
+	      if (BASE_LEADING_CODE_P (c))
+		coding->fake_multibyte = 1;
+	    }
 	}
 if (src < src_end)
 	result = CODING_FINISH_INSUFFICIENT_DST;
 }
 else
 {
+unsigned char c;
 if (dst_bytes && src_bytes > dst_bytes)
 	{
 	  src_bytes = dst_bytes;
 	  result = CODING_FINISH_INSUFFICIENT_DST;
 	}
 if (dst_bytes)
 	bcopy (source, destination, src_bytes);
 else
-	safe_bcopy (source, destination, src_bytes);
+	{
+	  safe_bcopy (source, destination, src_bytes);
+	  dst_bytes = src_bytes;
+	}
 if (coding->eol_type == CODING_EOL_CRLF)
 	{
 	  while (src_bytes--)
-	    if (*dst++ == '\n') dst[-1] = '\r';
+	    {
-	}
+	      if ((c = *dst++) == '\n')
-else if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
+		dst[-1] = '\r';
-	{
+	      else if (BASE_LEADING_CODE_P (c))
-	  while (src_bytes--)
+		  coding->fake_multibyte = 1;
-	    if (*dst++ == '\r') dst[-1] = '\n';
+	    }
 	}
-src += src_bytes;
+else
-dst += src_bytes;
+	{
+	  if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
+	    {
+	      while (src_bytes--)
+		if (*dst++ == '\r') dst[-1] = '\n';
+	    }
+	  coding->fake_multibyte = 1;
+	}
+src = source + dst_bytes;
+dst = destination + dst_bytes;
 }
 coding->consumed = coding->consumed_char = src - source;
 coding->produced = coding->produced_char = dst - destination;
 return result;
 if (src_bytes <= 0)
 {
 coding->produced = coding->produced_char = 0;
 coding->consumed = coding->consumed_char = 0;
+coding->fake_multibyte = 0;
 return CODING_FINISH_NORMAL;
 }
 if (coding->type == coding_type_undecided)
 detect_coding (coding, source, src_bytes);
 	}
 if (dst_bytes)
 	bcopy (source, destination, coding->produced);
 else
 	safe_bcopy (source, destination, coding->produced);
+coding->fake_multibyte = 1;
 coding->consumed
 	= coding->consumed_char = coding->produced_char = coding->produced;
 break;
 }
 if (src_bytes <= 0)
 {
 coding->produced = coding->produced_char = 0;
 coding->consumed = coding->consumed_char = 0;
+coding->fake_multibyte = 0;
 return CODING_FINISH_NORMAL;
 }
 switch (coding->type)
 {
 	{
 	  unsigned char *p = destination, *pend = p + coding->produced;
 	  while (p < pend)
 	    if (*p++ == '\015') p[-1] = '\n';
 	}
+coding->fake_multibyte = 1;
 coding->consumed
 	= coding->consumed_char = coding->produced_char = coding->produced;
 break;
 }
 return result;
 }
-/* Scan text in the region between *BEG and *END, skip characters
+/* Scan text in the region between *BEG and *END (byte positions),
-which we don't have to decode by coding system CODING at the head
+skip characters which we don't have to decode by coding system
-and tail, then set *BEG and *END to the region of the text we
+CODING at the head and tail, then set *BEG and *END to the region
-actually have to convert.
+of the text we actually have to convert.  The caller should move
+the gap out of the region in advance.
 If STR is not NULL, *BEG and *END are indices into STR.  */
 static void
 shrink_decoding_region (beg, end, coding, str)
 int *beg, *end;
 struct coding_system *coding;
 unsigned char *str;
 {
-unsigned char *begp_orig, *begp, *endp_orig, *endp;
+unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
 int eol_conversion;
 if (coding->type == coding_type_ccl
 || coding->type == coding_type_undecided
 || !NILP (coding->post_read_conversion))
 /* We can't skip any data.  */
 return;
 }
 else if (coding->type == coding_type_no_conversion)
 {
-/* We need no conversion.  */
+/* We need no conversion, but don't have to skip any data here.
-*beg = *end;
+Decoding routine handles them effectively anyway.  */
 return;
 }
 if (coding->heading_ascii >= 0)
 /* Detection routine has already found how much we can skip at the
 begp_orig = begp = str + *beg;
 endp_orig = endp = str + *end;
 }
 else
 {
-move_gap (*beg);
+begp_orig = begp = BYTE_POS_ADDR (*beg);
-begp_orig = begp = GAP_END_ADDR;
 endp_orig = endp = begp + *end - *beg;
 }
 eol_conversion = (coding->eol_type != CODING_EOL_LF);
 case coding_type_emacs_mule:
 case coding_type_raw_text:
 if (eol_conversion)
 	{
 	  if (coding->heading_ascii < 0)
-	    while (begp < endp && *begp != '\r') begp++;
+	    while (begp < endp && *begp != '\r' && *begp < 0x80) begp++;
-	  while (begp < endp && *(endp - 1) != '\r') endp--;
+	  while (begp < endp && *(endp - 1) != '\r' && *(endp - 1) < 0x80)
+	    endp--;
 	}
 else
 	begp = endp;
 break;
 break;
 default:		/* i.e. case coding_type_iso2022: */
 if (coding->heading_ascii < 0)
 	{
-	  unsigned char c;
 	  /* We can skip all ASCII characters at the head except for a
 	     few control codes.  */
 	  while (begp < endp && (c = *begp) < 0x80
 		 && c != ISO_CODE_CR && c != ISO_CODE_SO
 		 && c != ISO_CODE_SI && c != ISO_CODE_ESC
 	{
 	case CODING_CATEGORY_IDX_ISO_8_1:
 	case CODING_CATEGORY_IDX_ISO_8_2:
 	  /* We can skip all ASCII characters at the tail.  */
 	  if (eol_conversion)
-	    while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
+	    while (begp < endp && (c = endp[-1]) < 0x80 && c != '\n') endp--;
 	  else
 	    while (begp < endp && endp[-1] < 0x80) endp--;
 	  break;
 	case CODING_CATEGORY_IDX_ISO_7:
 	case CODING_CATEGORY_IDX_ISO_7_TIGHT:
 	  /* We can skip all characters at the tail except for ESC and
 the following 2-byte at the tail.  */
 	  if (eol_conversion)
-	    while (begp < endp && endp[-1] != ISO_CODE_ESC && endp[-1] != '\n')
+	    while (begp < endp
+		   && (c = endp[-1]) < 0x80 && c != ISO_CODE_ESC && c != '\n')
 	      endp--;
 	  else
-	    while (begp < endp && endp[-1] != ISO_CODE_ESC)
+	    while (begp < endp
+		   && (c = endp[-1]) < 0x80 && c != ISO_CODE_ESC)
 	      endp--;
 	  if (begp < endp && endp[-1] == ISO_CODE_ESC)
 	    {
 	      if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
 		/* This is an ASCII designation sequence.  We can
 begp_orig = begp = str + *beg;
 endp_orig = endp = str + *end;
 }
 else
 {
-move_gap (*beg);
+begp_orig = begp = BYTE_POS_ADDR (*beg);
-begp_orig = begp = GAP_END_ADDR;
 endp_orig = endp = begp + *end - *beg;
 }
 eol_conversion = (coding->eol_type == CODING_EOL_CR
 		    || coding->eol_type == CODING_EOL_CRLF);
 *end += endp - endp_orig;
 return;
 }
 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
-text from FROM to TO by coding system CODING, and return number of
+text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
-characters in the resulting text.
+coding system CODING, and return the status code of code conversion
+(currently, this value has no meaning).
+The numbers of characters (and bytes) consumed and produced by the
+conversion are recorded in members of the structure CODING.
 If ADJUST is nonzero, we do various things as if the original text
 is deleted and a new text is inserted.  See the comments in
 replace_range (insdel.c) to know what we are doing.
 ADJUST nonzero also means that post-read-conversion or
 pre-write-conversion functions (if any) should be processed.  */
 int
-code_convert_region (from, to, coding, encodep, adjust)
+code_convert_region (from, from_byte, to, to_byte, coding, encodep, adjust)
-int from, to, encodep, adjust;
+int from, from_byte, to, to_byte, encodep, adjust;
 struct coding_system *coding;
 {
-int len = to - from, require, inserted, inserted_byte;
+int len = to - from, len_byte = to_byte - from_byte;
-int from_byte, to_byte, len_byte;
+int require, inserted, inserted_byte;
 int from_byte_orig, to_byte_orig;
 Lisp_Object saved_coding_symbol = Qnil;
+int multibyte = !NILP (current_buffer->enable_multibyte_characters);
+int first = 1;
+int fake_multibyte = 0;
+unsigned char *src, *dst;
 if (adjust)
 {
+int saved_from = from;
 prepare_to_modify_buffer (from, to, &from);
-to = from + len;
+if (saved_from != from)
-}
+	{
-from_byte = CHAR_TO_BYTE (from); to_byte = CHAR_TO_BYTE (to);
+	  to = from + len;
-len_byte = to_byte - from_byte;
+	  if (multibyte)
+	    from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
+	  else
+	    from_byte = from, to_byte = to;
+	  len_byte = to_byte - from_byte;
+	}
+}
 if (! encodep && CODING_REQUIRE_DETECTION (coding))
 {
 /* We must detect encoding of text and eol.  Even if detection
 routines can't decide the encoding, we should not let them
 if (from < GPT && to > GPT)
 	move_gap_both (from, from_byte);
 if (coding->type == coding_type_undecided)
 	{
-	  detect_coding (coding, BYTE_POS_ADDR (from), len);
+	  detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
 	  if (coding->type == coding_type_undecided)
 	    coding->type = coding_type_emacs_mule;
 	}
 if (coding->eol_type == CODING_EOL_UNDECIDED)
 	{
 	     encounter an inconsistent eol format while decoding.  */
 	  coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
 	}
 }
+coding->consumed_char = len, coding->consumed = len_byte;
 if (encodep
 ? ! CODING_REQUIRE_ENCODING (coding)
 : ! CODING_REQUIRE_DECODING (coding))
-return len;
+{
+coding->produced = len_byte;
+if (multibyte)
+	{
+	  if (GPT < from || GPT > to)
+	    move_gap_both (from, from_byte);
+	  coding->produced_char
+	    = multibyte_chars_in_text (BYTE_POS_ADDR (from_byte), len_byte);
+	  if (coding->produced_char != len)
+	    {
+	      int diff = coding->produced_char - len;
+	      if (adjust)
+		adjust_before_replace (from, from_byte, to, to_byte);
+	      ZV += diff; Z += diff; GPT += diff;
+	      if (adjust)
+		adjust_after_replace (from, from_byte, to, to_byte,
+				      diff, 0);
+	    }
+	}
+else
+	coding->produced_char = len_byte;
+return 0;
+}
 /* Now we convert the text.  */
 /* For encoding, we must process pre-write-conversion in advance.  */
 if (encodep
 if (current_buffer != prev)
 	{
 	  len = ZV - BEGV;
 	  new = current_buffer;
 	  set_buffer_internal_1 (prev);
-	  del_range (from, to);
+	  del_range_2 (from, to, from_byte, to_byte);
 	  insert_from_buffer (new, BEG, len, 0);
 	  to = from + len;
-	  to_byte = CHAR_TO_BYTE (to);
+	  to_byte = multibyte ? CHAR_TO_BYTE (to) : to;
 	  len_byte = to_byte - from_byte;
 	}
 }
 /* Try to skip the heading and tailing ASCIIs.  */
 from_byte_orig = from_byte; to_byte_orig = to_byte;
+if (from < GPT && GPT < to)
+move_gap (from);
 if (encodep)
 shrink_encoding_region (&from_byte, &to_byte, coding, NULL);
 else
 shrink_decoding_region (&from_byte, &to_byte, coding, NULL);
 if (from_byte == to_byte)
-return len;
+{
+coding->produced = len_byte;
+coding->produced_char = multibyte ? len : len_byte;
+return 0;
+}
 /* Here, the excluded region by shrinking contains only ASCIIs.  */
 from += (from_byte - from_byte_orig);
 to += (to_byte - to_byte_orig);
 len = to - from;
 len_byte = to_byte - from_byte;
-/* For converion, we must put the gap before the text to be decoded
+/* For conversion, we must put the gap before the text in addition to
-in addition to make the gap larger for efficient decoding.  The
+making the gap larger for efficient decoding.  The required gap
-required gap size starts from 2000 which is the magic number used
+size starts from 2000 which is the magic number used in make_gap.
-in make_gap.  But, after one batch of conversion, it will be
+But, after one batch of conversion, it will be incremented if we
-incremented if we find that it is not enough .  */
+find that it is not enough.  */
 require = 2000;
 if (GAP_SIZE  < require)
 make_gap (require - GAP_SIZE);
 move_gap_both (from, from_byte);
 beg_unchanged = GPT - BEG;
 if (Z - GPT < end_unchanged)
 end_unchanged = Z - GPT;
 inserted = inserted_byte = 0;
+src = GAP_END_ADDR, dst = GPT_ADDR;
+GAP_SIZE += len_byte;
+ZV -= len;
+Z -= len;
+ZV_BYTE -= len_byte;
+Z_BYTE -= len_byte;
 for (;;)
 {
-int result, diff_char, diff_byte;
+int result;
 /* The buffer memory is changed from:
-	 +--------+converted-text+------------+-----original-text-----+---+
+	 +--------+converted-text+---------+-------original-text------+---+
-	 |<-from->|<--inserted-->|<-GAP_SIZE->|<---------len--------->|---|  */
+	 |<-from->|<--inserted-->|---------|<-----------len---------->|---|
+		  |<------------------- GAP_SIZE -------------------->|  */
 if (encodep)
-	result = encode_coding (coding, GAP_END_ADDR, GPT_ADDR, len_byte, 0);
+	result = encode_coding (coding, src, dst, len_byte, 0);
 else
-	result = decode_coding (coding, GAP_END_ADDR, GPT_ADDR, len_byte, 0);
+	result = decode_coding (coding, src, dst, len_byte, 0);
 /* to:
 	 +--------+-------converted-text--------+--+---original-text--+---+
-	 |<-from->|<----(inserted+produced)---->|--|<-(len-consumed)->|---|  */
+	 |<-from->|<--inserted-->|<--produced-->|--|<-(len-consumed)->|---|
+		  |<------------------- GAP_SIZE -------------------->|  */
-diff_char = coding->produced_char - coding->consumed_char;
+if (coding->fake_multibyte)
-diff_byte = coding->produced - coding->consumed;
+	fake_multibyte = 1;
-GAP_SIZE -= diff_byte;
+if (!encodep && !multibyte)
-ZV += diff_char; ZV_BYTE += diff_byte;
+	coding->produced_char = coding->produced;
-Z += diff_char; Z_BYTE += diff_byte;
-GPT += coding->produced_char; GPT_BYTE += coding->produced;
 inserted += coding->produced_char;
 inserted_byte += coding->produced;
-len -= coding->consumed_char;
 len_byte -= coding->consumed;
+src += coding->consumed;
+dst += inserted_byte;
 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
 	{
-	  unsigned char *p = GPT_ADDR - inserted_byte, *pend = GPT_ADDR;
+	  unsigned char *pend = dst, *p = pend - inserted_byte;
 	  /* Encode LFs back to the original eol format (CR or CRLF).  */
 	  if (coding->eol_type == CODING_EOL_CR)
 	    {
 	      while (p < pend) if (*p++ == '\n') p[-1] = '\r';
 	    }
 	  else
 	    {
-	      unsigned char *p2 = p;
 	      int count = 0;
-	      while (p2 < pend) if (*p2++ == '\n') count++;
+	      while (p < pend) if (*p++ == '\n') count++;
-	      if (GAP_SIZE < count)
+	      if (src - dst < count)
-		make_gap (count - GAP_SIZE);
-	      p2 = GPT_ADDR + count;
-	      while (p < pend)
 		{
-		  *--p2 = *--pend;
+		  /* We don't have sufficient room for putting LFs
-		  if (*pend == '\n') *--p2 = '\r';
+		     back to CRLF.  We must record converted and
+		     not-yet-converted text back to the buffer
+		     content, enlarge the gap, then record them out of
+		     the buffer contents again.  */
+		  int add = len_byte + inserted_byte;
+		  GAP_SIZE -= add;
+		  ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
+		  GPT += inserted_byte; GPT_BYTE += inserted_byte;
+		  make_gap (count - GAP_SIZE);
+		  GAP_SIZE += add;
+		  ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
+		  GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
+		  /* Don't forget to update SRC, DST, and PEND.  */
+		  src = GAP_END_ADDR - len_byte;
+		  dst = GPT_ADDR + inserted_byte;
+		  pend = dst;
 		}
-	      GPT += count; GAP_SIZE -= count; ZV += count; Z += count;
-	      ZV_BYTE += count; Z_BYTE += count;
-	      coding->produced += count;
-	      coding->produced_char += count;
 	      inserted += count;
 	      inserted_byte += count;
+	      coding->produced += count;
+	      p = dst = pend + count;
+	      while (count)
+		{
+		  *--p = *--pend;
+		  if (*p == '\n') count--, *--p = '\r';
+		}
 	    }
 	  /* Suppress eol-format conversion in the further conversion.  */
 	  coding->eol_type = CODING_EOL_LF;
 	  /* Restore the original symbol.  */
 	  coding->symbol = saved_coding_symbol;
+	  continue;
 	}
 if (len_byte <= 0)
 	break;
 if (result == CODING_FINISH_INSUFFICIENT_SRC)
 	{
 	  /* The source text ends in invalid codes.  Let's just
 	     make them valid buffer contents, and finish conversion.  */
-	  inserted += len;
+	  inserted += len_byte;
 	  inserted_byte += len_byte;
+	  while (len_byte--)
+	    *src++ = *dst++;
+	  fake_multibyte = 1;
 	  break;
 	}
-if (inserted == coding->produced_char)
+if (first)
-	/* We have just done the first batch of conversion.  Let's
+	{
-	   reconsider the required gap size now.
+	  /* We have just done the first batch of conversion which was
+	     stopped because of insufficient gap.  Let's reconsider the
-	   We have converted CONSUMED bytes into PRODUCED bytes.  To
+	     required gap size (i.e. SRC - DST) now.
-	   convert the remaining LEN bytes, we may need REQUIRE bytes
-	   of gap, where:
+	     We have converted ORIG bytes (== coding->consumed) into
-	       REQUIRE + LEN = (LEN * PRODUCED / CONSUMED)
+	     NEW bytes (coding->produced).  To convert the remaining
-	       REQUIRE = LEN * (PRODUCED - CONSUMED) / CONSUMED
+	     LEN bytes, we may need REQUIRE bytes of gap, where:
-		       = LEN * DIFF / CONSUMED
+		REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
-	   Here, we are sure that DIFF is positive.  */
+		REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
-	require = len_byte * diff_byte / coding->consumed;
+	     Here, we are sure that NEW >= ORIG.  */
-if (GAP_SIZE  < require)
+	  require = (len_byte * (coding->produced - coding->consumed)
-	make_gap (require - GAP_SIZE);
+		     / coding->consumed);
-}
+	  first = 0;
-if (GAP_SIZE > 0) *GPT_ADDR = 0; /* Put an anchor.  */
+	}
+if ((src - dst) < (require + 2000))
+	{
+	  /* See the comment above the previous call of make_gap.  */
+	  int add = len_byte + inserted_byte;
+	  GAP_SIZE -= add;
+	  ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
+	  GPT += inserted_byte; GPT_BYTE += inserted_byte;
+	  make_gap (require + 2000);
+	  GAP_SIZE += add;
+	  ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
+	  GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
+	  /* Don't forget to update SRC, DST.  */
+	  src = GAP_END_ADDR - len_byte;
+	  dst = GPT_ADDR + inserted_byte;
+	}
+}
+if (src - dst > 0) *dst = 0; /* Put an anchor.  */
+if (multibyte && (fake_multibyte || !encodep && (to - from) != (to_byte - from_byte)))
+inserted = multibyte_chars_in_text (GPT_ADDR, inserted_byte);
+/* Update various buffer positions for the new text.  */
+GAP_SIZE -= inserted_byte;
+ZV += inserted; Z+= inserted;
+ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
+GPT += inserted; GPT_BYTE += inserted_byte;
 if (adjust)
 {
 adjust_after_replace (from, from_byte, to, to_byte,
 			    inserted, inserted_byte);
 	      inserted = XFASTINT (val);
 	    }
 	  if (pos >= from + orig_inserted)
 	    temp_set_point (current_buffer, pos + (inserted - orig_inserted));
 	}
-}
+signal_after_change (from, to - from, inserted);
+}
-return ((from_byte - from_byte_orig) + inserted + (to_byte_orig - to_byte));
+{
+int skip = (to_byte_orig - to_byte) + (from_byte - from_byte_orig);
+coding->consumed = to_byte_orig - from_byte_orig;
+coding->consumed_char = skip + (to - from);
+coding->produced = skip + inserted_byte;
+coding->produced_char = skip + inserted;
+}
+return 0;
 }
 Lisp_Object
 code_convert_string (str, coding, encodep, nocopy)
 Lisp_Object str;
 unibyte<->multibyte conversion.  */
 	  current_buffer->enable_multibyte_characters = Qnil;
 	  insert_from_string (str, 0, 0, to_byte, to_byte, 0);
 	  current_buffer->enable_multibyte_characters = Qt;
 	}
-code_convert_region (BEGV, ZV, coding, encodep, 1);
+code_convert_region (BEGV, BEGV_BYTE, ZV, ZV_BYTE, coding, encodep, 1);
 if (encodep)
 	/* We must return the buffer contents as unibyte string.  */
 	current_buffer->enable_multibyte_characters = Qnil;
 str = make_buffer_string (BEGV, ZV, 0);
 set_buffer_internal (prev);
 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
 coding.mode |= CODING_MODE_LAST_BLOCK;
-len = code_convert_region (from, to, &coding, encodep, 1);
+code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
-return make_number (len);
+		       &coding, encodep, 1);
+return make_number (coding.produced_char);
 }
 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
 3, 3, "r\nzCoding system: ",
 "Decode the current region by specified coding system.\n\

Mercurial > emacs

comparison src/coding.c @ 20931:068eb408c911