changeset 50085:575609f03daa

(ccl-decode-mule-utf-8): Don't lose bytes when handling an invalid byte sequence.
author Kenichi Handa <handa@m17n.org>
date Wed, 12 Mar 2003 00:45:49 +0000
parents e41f8dbc86aa
children ae6e6c38d49d
files lisp/international/utf-8.el
diffstat 1 files changed, 41 insertions(+), 8 deletions(-) [+]
line wrap: on
line diff
--- a/lisp/international/utf-8.el	Tue Mar 11 22:17:21 2003 +0000
+++ b/lisp/international/utf-8.el	Wed Mar 12 00:45:49 2003 +0000
@@ -308,18 +308,20 @@
     ((r5 = ,(charset-id 'eight-bit-control))
      (r6 = ,(charset-id 'eight-bit-graphic))
      (loop
+      (r0 = -1)
       (read r0)
 
       ;; 1byte encoding, i.e., ascii
       (if (r0 < #x80)
-	  (write r0)
+	  ((write r0))
 	(if (r0 < #xc0)		    ; continuation byte (invalid here)
-	    (if (r0 < #xa0)
-		(write-multibyte-character r5 r0)
-	      (write-multibyte-character r6 r0))
+	    ((if (r0 < #xa0)
+		 (write-multibyte-character r5 r0)
+	       (write-multibyte-character r6 r0)))
 	  ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
 	  (if (r0 < #xe0)
-	      ((read r1)
+	      ((r1 = -1)
+	       (read r1)
 
 	       (if ((r1 & #b11000000) != #b10000000)
 		   ;; Invalid 2-byte sequence
@@ -373,7 +375,9 @@
 	    ;; 3byte encoding
 	    ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
 	    (if (r0 < #xf0)
-		((read r1 r2)
+		((r1 = -1)
+		 (r2 = -1)
+		 (read r1 r2)
 
 		 ;; This is set to 1 if the encoding is invalid.
 		 (r4 = 0)
@@ -478,7 +482,10 @@
 		  ;; 4byte encoding
 		  ;; keep those bytes as eight-bit-{control|graphic}
 		  ;; Fixme: allow lookup in utf-subst-table-for-decode.
-		  ((read r1 r2 r3)
+		  ((r1 = -1)
+		   (r2 = -1)
+		   (r3 = -1)
+		   (read r1 r2 r3)
 		   ;; r0 > #xf0, thus eight-bit-graphic
 		   (write-multibyte-character r6 r0)
 		   (if (r1 < #xa0)
@@ -512,7 +519,33 @@
 			       (write-multibyte-character r6 r1)))))))
 		;; else invalid byte >= #xfe
 		(write-multibyte-character r6 r0))))))
-      (repeat))))
+      (repeat)))
+
+    ;; At EOF...
+    (if (r0 >= 0)
+	((if (r0 < #x80)
+	     (write r0)
+	   (if (r0 < #xa0)
+	       (write-multibyte-character r5 r0)
+	     ((write-multibyte-character r6 r0))))
+	 (if (r1 >= 0)
+	     ((if (r1 < #x80)
+		  (write r1)
+		(if (r1 < #xa0)
+		    (write-multibyte-character r5 r1)
+		  ((write-multibyte-character r6 r1))))
+	      (if (r2 >= 0)
+		  ((if (r2 < #x80)
+		       (write r2)
+		     (if (r2 < #xa0)
+			 (write-multibyte-character r5 r2)
+		       ((write-multibyte-character r6 r2))))
+		   (if (r3 >= 0)
+		       (if (r3 < #x80)
+			   (write r3)
+			 (if (r3 < #xa0)
+			     (write-multibyte-character r5 r3)
+			   ((write-multibyte-character r6 r3))))))))))))
 
   "CCL program to decode UTF-8.
 Basic decoding is done into the charsets ascii, latin-iso8859-1 and