emacs: lisp/international/utf-8.el comparison

comparison lisp/international/utf-8.el @ 56037:81dbb510a1db

(utf-translate-cjk-charsets): New variable. (utf-translate-cjk-unicode-range): New variable. (utf-translate-cjk-load-tables): New function. (utf-lookup-subst-table-for-decode): New function. (utf-lookup-subst-table-for-encode): New function. (utf-translate-cjk-mode): Init-value changed to t. Don't load tables here. Update safe-charsets of utf-* coding systems. (ccl-mule-utf-untrans): New CCL. (ccl-decode-mule-utf-8): Call ccl-mule-utf-untrans. Use `repeat' at end of each branch. (ccl-mule-utf-8-encode-untrans): New CCL. (ccl-encode-mule-utf-8): Call ccl-mule-utf-8-encode-untrans. (ccl-untranslated-to-ucs): Handle 2-byte encoding. Set r1 to the length of encoding. Don't return r0. (utf-8-compose): New arg hash-table. Handle 2-byte encoding. (utf-8-post-read-conversion): Narrow to region properly. If utf-translate-cjk-mode is on, load tables if necessary. Call utf-8-compose with hash-table arg if necessary. Call XXX-compose-region instead of XXX-post-read-convesion. (utf-8-pre-write-conversion): New function. (mule-utf-8): Include CJK charsets in safe-charsets if utf-translate-cjk-mode is on. Add pre-write-conversion.

author	Kenichi Handa <handa@m17n.org>
date	Sat, 12 Jun 2004 02:10:37 +0000
parents	6e677a935fe9
children	4575a565f45d

comparison

equal deleted inserted replaced

-:ff6f1f61fea4
+:81dbb510a1db
 	 (set-default s v))
 :version "21.4"
 :type 'boolean
 :group 'mule)
-(define-minor-mode utf-translate-cjk-mode
-"Whether the UTF based coding systems should decode/encode CJK characters.
+(defconst utf-translate-cjk-charsets '(chinese-gb2312
-Enabling this loads tables which allow the coding systems mule-utf-8,
+				       chinese-big5-1 chinese-big5-2
-mule-utf-16le and mule-utf-16be to encode characters in the charsets
+				       japanese-jisx0208 japanese-jisx0212
-`korean-ksc5601', `chinese-gb2312', `chinese-big5-1',
+				       korean-ksc5601)
-`chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to
+"List of charsets supported by `utf-translate-cjk-mode'.")
-decode the corresponding unicodes into such characters.
+(defconst utf-translate-cjk-unicode-range
-Where the charsets overlap, the one preferred for decoding is chosen
+'((#x2e80 . #xd7a3)
-according to the language environment in effect when this option is
+(#xff00 . #xffef))
-turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for
+"List of Unicode code ranges supported by `utf-translate-cjk-mode'.")
-Chinese-Big5 and jisx for other environments.
+;; Return non-nil if CODE-POINT is in `utf-translate-cjk-unicode-range'.
-The tables are large (over 40000 entries), so this option is not the
+(defsubst utf-translate-cjk-substitutable-p (code-point)
-default.  Also, installing them may be rather slow."
+(let ((tail utf-translate-cjk-unicode-range)
-:init-value nil
+	elt)
-:version "21.4"
+(while tail
-:type 'boolean
+(setq elt (car tail) tail (cdr tail))
-:set-after '(current-language-environment)
+(if (and (>= code-point (car elt)) (<= code-point (cdr elt)))
-:group 'mule
+	  (setq tail nil)
-:global t
+	(setq elt nil)))
-(if utf-translate-cjk-mode
+elt))
-;; Fixme: Allow the use of the CJK charsets to be
-;; customized by reordering and possible omission.
+(defvar utf-translate-cjk-lang-env nil
-(progn
+"Language environment in which tables for `utf-translate-cjk-mode' is loaded.
+The value nil means that the tables are not yet loaded.")
+(defun utf-translate-cjk-load-tables ()
+"Load tables for `utf-translate-cjk-mode'."
+;; Fixme: Allow the use of the CJK charsets to be
+;; customized by reordering and possible omission.
+(let ((redefined (< (hash-table-size ucs-mule-cjk-to-unicode) 43000)))
+(if redefined
 	;; Redefine them with realistic initial sizes and a
 	;; smallish rehash size to avoid wasting significant
 	;; space after they're built.
 	(setq ucs-mule-cjk-to-unicode
 	      (make-hash-table :test 'eq :size 43000 :rehash-size 1000)
 	      ucs-unicode-to-mule-cjk
-	      (make-hash-table :test 'eq :size 21500 :rehash-size 1000))
+	      (make-hash-table :test 'eq :size 21500 :rehash-size 1000)))
-	;; Load the files explicitly, to avoid having to keep
-	;; around the large tables they contain (as well as the
+;; Load the files explicitly, to avoid having to keep
-	;; ones which get built).
+;; around the large tables they contain (as well as the
-	(cond
+;; ones which get built).
-	 ((string= "Korean" current-language-environment)
+(cond ((string= "Korean" current-language-environment)
-	  (load "subst-jis")
+	   (load "subst-jis")
-	  (load "subst-big5")
+	   (load "subst-big5")
-	  (load "subst-gb2312")
+	   (load "subst-gb2312")
-	  (load "subst-ksc"))
+	   (load "subst-ksc"))
-	 ((string= "Chinese-BIG5" current-language-environment)
+	  ((string= "Chinese-BIG5" current-language-environment)
-	  (load "subst-jis")
+	   (load "subst-jis")
-	  (load "subst-ksc")
+	   (load "subst-ksc")
-	  (load "subst-gb2312")
+	   (load "subst-gb2312")
-	  (load "subst-big5"))
+	   (load "subst-big5"))
-	 ((string= "Chinese-GB" current-language-environment)
+	  ((string= "Chinese-GB" current-language-environment)
-	  (load "subst-jis")
+	   (load "subst-jis")
-	  (load "subst-ksc")
+	   (load "subst-ksc")
-	  (load "subst-big5")
+	   (load "subst-big5")
-	  (load "subst-gb2312"))
+	   (load "subst-gb2312"))
-	 (t
+	  (t
-	  (load "subst-ksc")
+	   (load "subst-ksc")
-	  (load "subst-gb2312")
+	   (load "subst-gb2312")
-	  (load "subst-big5")
+	   (load "subst-big5")
-	  (load "subst-jis")))	  ; jis covers as much as big5, gb2312
+	   (load "subst-jis")))	  ; jis covers as much as big5, gb2312
+(when redefined
+(define-translation-hash-table 'utf-subst-table-for-decode
+	ucs-unicode-to-mule-cjk)
+(define-translation-hash-table 'utf-subst-table-for-encode
+	ucs-mule-cjk-to-unicode)
+(set-char-table-extra-slot (get 'utf-translation-table-for-encode
+				      'translation-table)
+				 1 ucs-mule-cjk-to-unicode))
+(setq utf-translate-cjk-lang-env current-language-environment)))
+(defun utf-lookup-subst-table-for-decode (code-point)
+(if (and utf-translate-cjk-mode
+	   (not utf-translate-cjk-lang-env)
+	   (utf-translate-cjk-substitutable-p code-point))
+(utf-translate-cjk-load-tables))
+(gethash code-point
+	   (get 'utf-subst-table-for-decode 'translation-hash-table)))
+(defun utf-lookup-subst-table-for-encode (char)
+(if (and utf-translate-cjk-mode
+	   (not utf-translate-cjk-lang-env)
+	   (memq (char-charset char) utf-translate-cjk-charsets))
+(utf-translate-cjk-load-tables))
+(gethash char
+	   (get 'utf-subst-table-for-encode 'translation-hash-table)))
+(define-minor-mode utf-translate-cjk-mode
+"Whether the UTF based coding systems should decode/encode CJK characters.
+Enabling this allows the coding systems mule-utf-8,
+mule-utf-16le and mule-utf-16be to encode characters in the charsets
+`korean-ksc5601', `chinese-gb2312', `chinese-big5-1',
+`chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to
+decode the corresponding unicodes into such characters.
+Where the charsets overlap, the one preferred for decoding is chosen
+according to the language environment in effect when this option is
+turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for
+Chinese-Big5 and jisx for other environments.
+This option is on by default.  If you are not interested in CJK
+characters and want to avoid some overhead on encoding/decoding
+by the above coding systems, you can customize this option to nil."
+:init-value t
+:version "21.4"
+:type 'boolean
+:group 'mule
+:global t
+(if utf-translate-cjk-mode
+(progn
 	(define-translation-hash-table 'utf-subst-table-for-decode
 	  ucs-unicode-to-mule-cjk)
 	(define-translation-hash-table 'utf-subst-table-for-encode
 	  ucs-mule-cjk-to-unicode)
 	(set-char-table-extra-slot (get 'utf-translation-table-for-encode
 (make-hash-table :test 'eq))
 (define-translation-hash-table 'utf-subst-table-for-encode
 (make-hash-table :test 'eq))
 (set-char-table-extra-slot (get 'utf-translation-table-for-encode
 				    'translation-table)
-			       1 nil)))
+			       1 nil))
+;; Update safe-chars of mule-utf-* coding systems.
+(dolist (elt (coding-system-list t))
+(if (string-match "^mule-utf" (symbol-name elt))
+	(let ((safe-charsets (coding-system-get elt 'safe-charsets))
+	      (safe-chars (coding-system-get elt 'safe-chars))
+	      (need-update nil))
+	  (dolist (charset utf-translate-cjk-charsets)
+	    (unless (eq utf-translate-cjk-mode (memq charset safe-charsets))
+	      (setq safe-charsets
+		    (if utf-translate-cjk-mode
+			(cons charset safe-charsets)
+		      (delq charset safe-charsets))
+		    need-update t)
+	      (aset safe-chars (make-char charset) utf-translate-cjk-mode)))
+	  (when need-update
+	    (coding-system-put elt 'safe-charsets safe-charsets)
+	    (define-coding-system-internal elt))))))
+(define-ccl-program ccl-mule-utf-untrans
+;; R0 is an untranslatable Unicode code-point (U+3500..U+DFFF or
+;; U+10000..U+10FFFF) or an invaid byte (#x00..#xFF).  Write
+;; eight-bit-control/graphic sequence (2 to 4 chars) representing
+;; UTF-8 sequence of r0.  Registers r4, r5, r6 are modified.
+;;
+;; This is a subrountine because we assume that this is called very
+;; rarely (so we don't have to worry about the overhead of the
+;; call).
+`(0
+((r5 = ,(charset-id 'eight-bit-control))
+(r6 = ,(charset-id 'eight-bit-graphic))
+(if (r0 < #x100)
+	 ((r4 = ((r0 >> 6) | #xC0))
+	  (write-multibyte-character r6 r4))
+((if (r0 < #x10000)
+	    ((r4 = ((r0 >> 12) | #xE0))
+	     (write-multibyte-character r6 r4))
+	  ((r4 = ((r0 >> 18) | #xF0))
+	   (write-multibyte-character r6 r4)
+	   (r4 = (((r0 >> 12) & #x3F) | #x80))
+	   (if (r4 < #xA0)
+	       (write-multibyte-character r5 r4)
+	     (write-multibyte-character r6 r4))))
+	(r4 = (((r0 >> 6) & #x3F) | #x80))
+	(if (r4 < #xA0)
+	    (write-multibyte-character r5 r4)
+	  (write-multibyte-character r6 r4))))
+(r4 = ((r0 & #x3F) | #x80))
+(if (r4 < #xA0)
+	 (write-multibyte-character r5 r4)
+(write-multibyte-character r6 r4)))))
 (define-ccl-program ccl-decode-mule-utf-8
 ;;
 ;;        charset         | bytes in utf-8 | bytes in emacs
 ;; -----------------------+----------------+---------------
 ;; -----------------------+----------------+---------------
 ;; mule-unicode-0100-24ff |       3        |       4
 ;;        (>= 8000)       |                |
 ;; mule-unicode-2500-33ff |       3        |       4
 ;; mule-unicode-e000-ffff |       3        |       4
+;; -----------------------+----------------+---------------
+;;      invalid byte      |       1        |       2
 ;;
 ;; Thus magnification factor is two.
 ;;
 `(2
-((r5 = ,(charset-id 'eight-bit-control))
+((r0 = -1)
-(r6 = ,(charset-id 'eight-bit-graphic))
 (loop
+(if (r0 < 0)
+	  (read r0))
+(if (r0 < #x80)
+	  ;; 1-byte encoding, i.e., ascii
+	  ((write r0)
+	   (r0 = -1)
+	   (repeat)))
+(if (r0 < #xc0)		    ; continuation byte (invalid here)
+	  ((call ccl-mule-utf-untrans)
+	   (r0 = -1)
+	   (repeat)))
+;; Read the 2nd byte.
+(r1 = -1)
+(read r1)
+(if ((r1 & #b11000000) != #b10000000) ; Invalid 2nd byte
+	  ((call ccl-mule-utf-untrans)
+	   ;; Handle it in the next loop.
+	   (r0 = r1)
+	   (repeat)))
+(if (r0 < #xe0)
+	  ;; 2-byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
+	  ((r2 = ((r0 & #x1F) << 6))
+	   (r2 |= (r1 & #x3F))
+	   ;; Now r2 holds scalar value
+	   (if (r2 < 128)	; `overlong sequence'
+	       ((call ccl-mule-utf-untrans)
+		(r0 = r1)
+		(call ccl-mule-utf-untrans)
+		(r0 = -1)
+		(repeat)))
+	   (r1 = r2)
+	   (if (r1 < 160)
+	       ;; eight-bit-control
+	       (r0 = ,(charset-id 'eight-bit-control))
+	     (if (r1 < 256)
+		 ;; latin-iso8859-1
+		 ((r0 = ,(charset-id 'latin-iso8859-1))
+		  (r1 -= 128))
+	       ;; mule-unicode-0100-24ff (< 0800)
+	       ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
+		(r1 -= #x0100)
+		(r2 = (((r1 / 96) + 32) << 7))
+		(r1 %= 96)
+		(r1 += (r2 + 32))
+		(translate-character
+		 utf-translation-table-for-decode r0 r1))))
+	   (write-multibyte-character r0 r1)
+	   (r0 = -1)
+	   (repeat)))
+;; Read the 3rd bytes.
+(r2 = -1)
+(read r2)
+(if ((r2 & #b11000000) != #b10000000) ; Invalid 3rd byte
+	  ((call ccl-mule-utf-untrans)
+	   (r0 = r1)
+	   (call ccl-mule-utf-untrans)
+	   ;; Handle it in the next loop.
+	   (r0 = r2)
+	   (repeat)))
+(if (r0 < #xF0)
+	  ;; 3byte encoding
+	  ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
+	  ((r3 = ((r0 & #xF) << 12))
+	   (r3 |= ((r1 & #x3F) << 6))
+	   (r3 |= (r2 & #x3F))
+	   (if (r3 < #x800)		; `overlong sequence'
+	       ((call ccl-mule-utf-untrans)
+		(r0 = r1)
+		(call ccl-mule-utf-untrans)
+		(r0 = r2)
+		(call ccl-mule-utf-untrans)
+		(r0 = -1)
+		(repeat)))
+	   (if (r3 < #x2500)
+	       ;; mule-unicode-0100-24ff (>= 0800)
+	       ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
+		(r3 -= #x0100)
+		(r3 //= 96)
+		(r1 = (r7 + 32))
+		(r1 += ((r3 + 32) << 7))
+		(translate-character
+		 utf-translation-table-for-decode r0 r1)
+		(write-multibyte-character r0 r1)
+		(r0 = -1)
+		(repeat)))
+	   (if (r3 < #x3400)
+	       ;; mule-unicode-2500-33ff
+	       ((r0 = r3)		; don't zap r3
+		(lookup-integer utf-subst-table-for-decode r0 r1)
+		(if (r7 == 0)
+		    ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
+		     (r3 -= #x2500)
+		     (r3 //= 96)
+		     (r1 = (r7 + 32))
+		     (r1 += ((r3 + 32) << 7))))
+		(write-multibyte-character r0 r1)
+		(r0 = -1)
+		(repeat)))
+	   (if (r3 < #xE000)
+	       ;; Try to convert to CJK chars, else
+	       ;; keep them as eight-bit-{control|graphic}.
+	       ((r0 = r3)
+		(lookup-integer utf-subst-table-for-decode r3 r1)
+		(if r7
+		    ;; got a translation
+		    (write-multibyte-character r3 r1)
+		  (call ccl-mule-utf-untrans))
+		(r0 = -1)
+		(repeat)))
+	   ;; mule-unicode-e000-ffff
+	   ;; Fixme: fffe and ffff are invalid.
+	   (r0 = r3)		; don't zap r3
+	   (lookup-integer utf-subst-table-for-decode r0 r1)
+	   (if (r7 == 0)
+	       ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
+		(r3 -= #xe000)
+		(r3 //= 96)
+		(r1 = (r7 + 32))
+		(r1 += ((r3 + 32) << 7))))
+	   (write-multibyte-character r0 r1)
+	   (r0 = -1)
+	   (repeat)))
+;; Read the 4th bytes.
+(r3 = -1)
+(read r3)
+(if ((r3 & #b11000000) != #b10000000) ; Invalid 4th byte
+	  ((call ccl-mule-utf-untrans)
+	   (r0 = r1)
+	   (call ccl-mule-utf-untrans)
+	   ;; Handle it in the next loop.
+	   (r0 = r3)
+	   (repeat)))
+(if (r3 < #xF8)
+	  ;; 4-byte encoding:
+	  ;; wwwzzzzzzyyyyyyxxxxxx = 11110www 10zzzzzz 10yyyyyy 10xxxxxx
+	  ;; keep those bytes as eight-bit-{control|graphic}
+	  ;; Fixme: allow lookup in utf-subst-table-for-decode.
+	  ((r4 = ((r0 & #x7) << 18))
+	   (r4 |= ((r1 & #x3F) << 12))
+	   (r4 |= ((r2 & #x3F) << 6))
+	   (r4 |= (r3 & #x3F))
+	   (if (r4 < #x10000)		; `overlong sequence'
+	       ((call ccl-mule-utf-untrans)
+		(r0 = r1)
+		(call ccl-mule-utf-untrans)
+		(r0 = r2)
+		(call ccl-mule-utf-untrans)
+		(r0 = r3)
+		(call ccl-mule-utf-untrans))
+	     ((r0 = r4)
+	      (call ccl-mule-utf-untrans)))
+	   (r0 = -1)
+	   (repeat)))
+;; Unsupported sequence.
+(call ccl-mule-utf-untrans)
+(r0 = r1)
+(call ccl-mule-utf-untrans)
+(r0 = r2)
+(call ccl-mule-utf-untrans)
+(r0 = r3)
+(call ccl-mule-utf-untrans)
 (r0 = -1)
-(read r0)
-;; 1byte encoding, i.e., ascii
-(if (r0 < #x80)
-	  ((write r0))
-	(if (r0 < #xc0)		    ; continuation byte (invalid here)
-	    ((if (r0 < #xa0)
-		 (write-multibyte-character r5 r0)
-	       (write-multibyte-character r6 r0)))
-	  ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
-	  (if (r0 < #xe0)
-	      ((r1 = -1)
-	       (read r1)
-	       (if ((r1 & #b11000000) != #b10000000)
-		   ;; Invalid 2-byte sequence
-		   ((if (r0 < #xa0)
-			(write-multibyte-character r5 r0)
-		      (write-multibyte-character r6 r0))
-		    (if (r1 < #x80)
-			(write r1)
-		      (if (r1 < #xa0)
-			  (write-multibyte-character r5 r1)
-			(write-multibyte-character r6 r1))))
-		 ((r3 = r0)	   ; save in case of overlong sequence
-		  (r2 = r1)
-		  (r0 &= #x1f)
-		  (r0 <<= 6)
-		  (r1 &= #x3f)
-		  (r1 += r0)
-		  ;; Now r1 holds scalar value
-		  (if (r1 < 128)	; `overlong sequence'
-		      ((if (r3 < #xa0)
-			   (write-multibyte-character r5 r3)
-			 (write-multibyte-character r6 r3))
-		       (if (r2 < #x80)
-			   (write r2)
-			 (if (r2 < #xa0)
-			     (write-multibyte-character r5 r2)
-			   (write-multibyte-character r6 r2))))
-		    ;; eight-bit-control
-		    (if (r1 < 160)
-			((write-multibyte-character r5 r1))
-		      ;; latin-iso8859-1
-		      (if (r1 < 256)
-			  ((r0 = ,(charset-id 'latin-iso8859-1))
-			   (r1 -= 128)
-			   (write-multibyte-character r0 r1))
-			;; mule-unicode-0100-24ff (< 0800)
-			((r0 = ,(charset-id 'mule-unicode-0100-24ff))
-			 (r1 -= #x0100)
-			 (r2 = (((r1 / 96) + 32) << 7))
-			 (r1 %= 96)
-			 (r1 += (r2 + 32))
-			 (translate-character
-			  utf-translation-table-for-decode r0 r1)
-			 (write-multibyte-character r0 r1))))))))
-	    ;; 3byte encoding
-	    ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
-	    (if (r0 < #xf0)
-		((r1 = -1)
-		 (r2 = -1)
-		 (read r1 r2)
-		 ;; This is set to 1 if the encoding is invalid.
-		 (r4 = 0)
-		 (r3 = (r1 & #b11000000))
-		 (r3 |= ((r2 >> 2) & #b00110000))
-		 (if (r3 != #b10100000)
-		     (r4 = 1)
-		   ((r3 = ((r0 & #x0f) << 12))
-		    (r3 += ((r1 & #x3f) << 6))
-		    (r3 += (r2 & #x3f))
-		    (if (r3 < #x0800)
-			(r4 = 1))))
-		 (if (r4 != 0)
-		     ;; Invalid 3-byte sequence
-		     ((if (r0 < #xa0)
-			  (write-multibyte-character r5 r0)
-			(write-multibyte-character r6 r0))
-		      (if (r1 < #x80)
-			  (write r1)
-			(if (r1 < #xa0)
-			    (write-multibyte-character r5 r1)
-			  (write-multibyte-character r6 r1)))
-		      (if (r2 < #x80)
-			  (write r2)
-			(if (r2 < #xa0)
-			    (write-multibyte-character r5 r2)
-			  (write-multibyte-character r6 r2))))
-		   ;; mule-unicode-0100-24ff (>= 0800)
-		   ((if (r3 < #x2500)
-			((r0 = ,(charset-id 'mule-unicode-0100-24ff))
-			 (r3 -= #x0100)
-			 (r3 //= 96)
-			 (r1 = (r7 + 32))
-			 (r1 += ((r3 + 32) << 7))
-			 (translate-character
-			  utf-translation-table-for-decode r0 r1)
-			 (write-multibyte-character r0 r1))
-		      ;; mule-unicode-2500-33ff
-		      (if (r3 < #x3400)
-			  ((r4 = r3)	; don't zap r3
-			   (lookup-integer utf-subst-table-for-decode r4 r5)
-			   (if r7
-			       ;; got a translation
-			       ((write-multibyte-character r4 r5)
-				;; Zapped through register starvation.
-				(r5 = ,(charset-id 'eight-bit-control)))
-			     ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
-			      (r3 -= #x2500)
-			      (r3 //= 96)
-			      (r1 = (r7 + 32))
-			      (r1 += ((r3 + 32) << 7))
-			      (write-multibyte-character r0 r1))))
-			;; U+3400 .. U+D7FF
-			;; Try to convert to CJK chars, else keep
-			;; them as eight-bit-{control|graphic}.
-			(if (r3 < #xd800)
-			    ((r4 = r3)	; don't zap r3
-			     (lookup-integer utf-subst-table-for-decode r4 r5)
-			     (if r7
-				 ;; got a translation
-				 ((write-multibyte-character r4 r5)
-				  ;; Zapped through register starvation.
-				  (r5 = ,(charset-id 'eight-bit-control)))
-			       ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
-			       ((r3 = r6)
-				(write-multibyte-character r3 r0)
-				(if (r1 < #xa0)
-				    (r3 = r5))
-				(write-multibyte-character r3 r1)
-				(if (r2 < #xa0)
-				    (r3 = r5)
-				  (r3 = r6))
-				(write-multibyte-character r3 r2))))
-			  ;; Surrogates, U+D800 .. U+DFFF
-			  (if (r3 < #xe000)
-			      ((r3 = r6)
-			       (write-multibyte-character r3 r0) ; eight-bit-graphic
-			       (if (r1 < #xa0)
-				   (r3 = r5))
-			       (write-multibyte-character r3 r1)
-			       (if (r2 < #xa0)
-				   (r3 = r5)
-				 (r3 = r6))
-			       (write-multibyte-character r3 r2))
-			    ;; mule-unicode-e000-ffff
-			    ;; Fixme: fffe and ffff are invalid.
-			    ((r4 = r3)	; don't zap r3
-			     (lookup-integer utf-subst-table-for-decode r4 r5)
-			     (if r7
-				 ;; got a translation
-				 ((write-multibyte-character r4 r5)
-				  ;; Zapped through register starvation.
-				  (r5 = ,(charset-id 'eight-bit-control)))
-			       ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
-				(r3 -= #xe000)
-				(r3 //= 96)
-				(r1 = (r7 + 32))
-				(r1 += ((r3 + 32) << 7))
-				(write-multibyte-character r0 r1)))))))))))
-	      (if (r0 < #xfe)
-		  ;; 4byte encoding
-		  ;; keep those bytes as eight-bit-{control|graphic}
-		  ;; Fixme: allow lookup in utf-subst-table-for-decode.
-		  ((r1 = -1)
-		   (r2 = -1)
-		   (r3 = -1)
-		   (read r1 r2 r3)
-		   ;; r0 > #xf0, thus eight-bit-graphic
-		   (write-multibyte-character r6 r0)
-		   (if (r1 < #xa0)
-		       (if (r1 < #x80)	; invalid byte
-			   (write r1)
-			 (write-multibyte-character r5 r1))
-		     (write-multibyte-character r6 r1))
-		   (if (r2 < #xa0)
-		       (if (r2 < #x80)	; invalid byte
-			   (write r2)
-			 (write-multibyte-character r5 r2))
-		     (write-multibyte-character r6 r2))
-		   (if (r3 < #xa0)
-		       (if (r3 < #x80)	; invalid byte
-			   (write r3)
-			 (write-multibyte-character r5 r3))
-		     (write-multibyte-character r6 r3))
-		   (if (r0 >= #xf8)	; 5- or 6-byte encoding
-		       ((r0 = -1)
-			(read r0)
-			(if (r0 < #xa0)
-			    (if (r0 < #x80) ; invalid byte
-				(write r0)
-			      (write-multibyte-character r5 r0))
-			  (write-multibyte-character r6 r0))
-			(if (r0 >= #xfc) ; 6-byte
-			    ((r0 = -1)
-			     (read r0)
-			     (if (r0 < #xa0)
-				 (if (r0 < #x80) ; invalid byte
-				     (write r0)
-				   (write-multibyte-character r5 r0))
-			       (write-multibyte-character r6 r0)))))))
-		;; else invalid byte >= #xfe
-		(write-multibyte-character r6 r0))))))
 (repeat)))
 ;; At EOF...
 (if (r0 >= 0)
-	((if (r0 < #x80)
+	;; r0 >= #x80
-	     (write r0)
+	((call ccl-mule-utf-untrans)
-	   (if (r0 < #xa0)
-	       (write-multibyte-character r5 r0)
-	     ((write-multibyte-character r6 r0))))
 	 (if (r1 >= 0)
-	     ((if (r1 < #x80)
+	     ((r0 = r1)
-		  (write r1)
+	      (call ccl-mule-utf-untrans)
-		(if (r1 < #xa0)
-		    (write-multibyte-character r5 r1)
-		  ((write-multibyte-character r6 r1))))
 	      (if (r2 >= 0)
-		  ((if (r2 < #x80)
+		  ((r0 = r2)
-		       (write r2)
+		   (call ccl-mule-utf-untrans)
-		     (if (r2 < #xa0)
-			 (write-multibyte-character r5 r2)
-		       ((write-multibyte-character r6 r2))))
 		   (if (r3 >= 0)
-		       (if (r3 < #x80)
+		       ((r0 = r3)
-			   (write r3)
+			(call ccl-mule-utf-untrans))))))))))
-			 (if (r3 < #xa0)
-			     (write-multibyte-character r5 r3)
-			   ((write-multibyte-character r6 r3))))))))))))
 "CCL program to decode UTF-8.
 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
 mule-unicode-*, but see also `utf-fragmentation-table' and
 `ucs-mule-cjk-to-unicode'.
 Encodings of un-representable Unicode characters are decoded asis into
 eight-bit-control and eight-bit-graphic characters.")
+(define-ccl-program ccl-mule-utf-8-encode-untrans
+;; UTF-8 decoder generates an UTF-8 sequence represented by a
+;; sequence eight-bit-control/graphic chars for an untranslatable
+;; character and an invalid byte.
+;;
+;; This CCL parses that sequence (the first byte is already in r1),
+;; writes out the original bytes of that sequence, and sets r5 to
+;; -1.
+;;
+;; If the eight-bit-control/graphic sequence is shorter than what r1
+;; suggests, it sets r5 and r6 to the last character read that
+;; should be handled by the next loop of a caller.
+;;
+;; Note: For UTF-8 validation, we only check if a character is
+;; eight-bit-control/graphic or not.  It may result in incorrect
+;; handling of random binary data, but such a data can't be encoded
+;; by UTF-8 anyway.  At least, UTF-8 decoders doesn't generate such
+;; a sequence even if a source contains invalid byte-sequence.
+`(0
+(;; Read the 2nd byte.
+(read-multibyte-character r5 r6)
+(r0 = (r5 != ,(charset-id 'eight-bit-control)))
+(if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
+	 ((write r1)			; invalid UTF-8
+	  (r1 = -1)
+	  (end)))
+(if (r1 <= #xC3)
+	 ;; 2-byte sequence for an originally invalid byte.
+	 ((r6 &= #x3F)
+	  (r6 |= ((r1 & #x1F) << 6))
+	  (write r6)
+	  (r5 = -1)
+	  (end)))
+(write r1 r6)
+(r2 = r1)
+(r1 = -1)
+;; Read the 3rd byte.
+(read-multibyte-character r5 r6)
+(r0 = (r5 != ,(charset-id 'eight-bit-control)))
+(if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
+	 (end))				; invalid UTF-8
+(write r6)
+(if (r2 < #xF0)
+	 ;; 3-byte sequence for an untranslated character.
+	 ((r5 = -1)
+	  (end)))
+;; Read the 4th byte.
+(read-multibyte-character r5 r6)
+(r0 = (r5 != ,(charset-id 'eight-bit-control)))
+(if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
+	 (end))			; invalid UTF-8
+;; 4-byte sequence for an untranslated character.
+(write r6)
+(r5 = -1)
+(end))
+;; At EOF...
+((r5 = -1)
+(if (r1 >= 0)
+	 (write r1)))))
 (define-ccl-program ccl-encode-mule-utf-8
 `(1
 ((r5 = -1)
 (loop
 (if (r5 < 0)
-	  ((r1 = -1)
+	  (read-multibyte-character r0 r1)
-	   (read-multibyte-character r0 r1)
+	;; Pre-read character is in r5 (charset-ID) and r6 (code-point).
-	   (translate-character utf-translation-table-for-encode r0 r1))
+	((r0 = r5)
-	(;; We have already done read-multibyte-character.
-	 (r0 = r5)
 	 (r1 = r6)
 	 (r5 = -1)))
+(translate-character utf-translation-table-for-encode r0 r1)
 (if (r0 == ,(charset-id 'ascii))
-	  (write r1)
+	  (write-repeat r1))
-	(if (r0 == ,(charset-id 'latin-iso8859-1))
+(if (r0 == ,(charset-id 'latin-iso8859-1))
-	    ;; r1          scalar                  utf-8
+	  ;; r1          scalar                  utf-8
-	    ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
+	  ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
-	    ;; 20    0000 0000 1010 0000    1100 0010 1010 0000
+	  ;; 20    0000 0000 1010 0000    1100 0010 1010 0000
-	    ;; 7f    0000 0000 1111 1111    1100 0011 1011 1111
+	  ;; 7f    0000 0000 1111 1111    1100 0011 1011 1111
-	    ((r0 = (((r1 & #x40) >> 6) | #xc2))
+	  ((r0 = (((r1 & #x40) >> 6) | #xc2))
-	     (r1 &= #x3f)
+	   (r1 &= #x3f)
+	   (r1 |= #x80)
+	   (write r0)
+	   (write-repeat r1)))
+(if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
+	  ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
+	   ;; #x3f80 == (0011 1111 1000 0000)b
+	   (r1 &= #x7f)
+	   (r1 += (r0 + 224))		; 240 == -32 + #x0100
+	   ;; now r1 holds scalar value
+	   (if (r1 < #x0800)
+	       ;; 2byte encoding
+	       ((write ((r1 >> 6) | #xC0))
+		(r1 &= #x3F)
+		(r1 |= #x80)
+		(write-repeat r1))
+	     ;; 3byte encoding
+	     ((write ((r1 >> 12) | #xE0))
+	      (write  (((r1 & #x0FC0) >> 6) | #x80))
+	      (r1 &= #x3F)
+	      (r1 |= #x80)
+	      (write-repeat r1)))))
+(if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
+	  ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
+	   (r1 &= #x7f)
+	   (r1 += (r0 + 9440))		; 9440 == -32 + #x2500
+	   ;; now r1 holds scalar value
+	   (write ((r1 >> 12) | #xE0))
+	   (write  (((r1 & #x0FC0) >> 6) | #x80))
+	   (r1 &= #x3F)
+	   (r1 |= #x80)
+	   (write-repeat r1)))
+(if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
+	  ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
+	   (r1 &= #x7f)
+	   (r1 += (r0 + 57312))		; 57312 == -32 + #xe000
+	   ;; now r1 holds scalar value
+	   (write ((r1 >> 12) | #xE0))
+	   (write  (((r1 & #x0FC0) >> 6) | #x80))
+	   (r1 &= #x3F)
+	   (r1 |= #x80)
+	   (write-repeat r1)))
+(if (r0 == ,(charset-id 'eight-bit-control))
+	  ;; r1          scalar                  utf-8
+	  ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
+	  ;; 80    0000 0000 1000 0000    1100 0010 1000 0000
+	  ;; 9f    0000 0000 1001 1111    1100 0010 1001 1111
+	  ((write #xC2)
+	   (write-repeat r1)))
+(if (r0 == ,(charset-id 'eight-bit-graphic))
+	  ;; r1          scalar                  utf-8
+	  ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
+	  ;; a0    0000 0000 1010 0000    1100 0010 1010 0000
+	  ;; ff    0000 0000 1111 1111    1101 1111 1011 1111
+	  ((r0 = (r1 >= #xC0))
+	   (r0 &= (r1 <= #xC3))
+	   (r4 = (r1 >= #xE1))
+	   (r4 &= (r1 <= #xF7))
+	   (r0 |= r4)
+	   (if r0
+	       ((call ccl-mule-utf-8-encode-untrans)
+		(repeat))
+	     (write-repeat r1))))
+(lookup-character utf-subst-table-for-encode r0 r1)
+(if r7		; lookup succeeded
+	  (if (r0 < #x800)
+	      ;; 2byte encoding
+	      ((write ((r0 >> 6) | #xC0))
+	       (r1 &= #x3F)
+	       (r1 |= #x80)
+	       (write-repeat r1))
+	    ;; 3byte encoding
+	    ((write ((r0 >> 12) | #xE0))
+	     (write  (((r0 & #x0FC0) >> 6) | #x80))
+	     (r1 &= #x3F)
 	     (r1 |= #x80)
-	     (write r0 r1))
+	     (write-repeat r1))))
-	  (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
+;; Unsupported character.
-	      ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
+;; Output U+FFFD, which is `ef bf bd' in UTF-8.
-	       ;; #x3f80 == (0011 1111 1000 0000)b
+(write #xef)
-	       (r1 &= #x7f)
+(write #xbf)
-	       (r1 += (r0 + 224))	; 240 == -32 + #x0100
+(write-repeat #xbd))))
-	       ;; now r1 holds scalar value
-	       (if (r1 < #x0800)
-		   ;; 2byte encoding
-		   ((r0 = (((r1 & #x07c0) >> 6) | #xc0))
-		    ;; #x07c0 == (0000 0111 1100 0000)b
-		    (r1 &= #x3f)
-		    (r1 |= #x80)
-		    (write r0 r1))
-		 ;; 3byte encoding
-		 ((r0 = (((r1 & #xf000) >> 12) | #xe0))
-		  (r2 = ((r1 & #x3f) | #x80))
-		  (r1 &= #x0fc0)
-		  (r1 >>= 6)
-		  (r1 |= #x80)
-		  (write r0 r1 r2))))
-	    (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
-		((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
-		 (r1 &= #x7f)
-		 (r1 += (r0 + 9440))	; 9440 == -32 + #x2500
-		 (r0 = (((r1 & #xf000) >> 12) | #xe0))
-		 (r2 = ((r1 & #x3f) | #x80))
-		 (r1 &= #x0fc0)
-		 (r1 >>= 6)
-		 (r1 |= #x80)
-		 (write r0 r1 r2))
-	      (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
-		  ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
-		   (r1 &= #x7f)
-		   (r1 += (r0 + 57312))	; 57312 == -32 + #xe000
-		   (r0 = (((r1 & #xf000) >> 12) | #xe0))
-		   (r2 = ((r1 & #x3f) | #x80))
-		   (r1 &= #x0fc0)
-		   (r1 >>= 6)
-		   (r1 |= #x80)
-		   (write r0 r1 r2))
-		(if (r0 == ,(charset-id 'eight-bit-control))
-		    ;; r1          scalar                  utf-8
-		    ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
-		    ;; 80    0000 0000 1000 0000    1100 0010 1000 0000
-		    ;; 9f    0000 0000 1001 1111    1100 0010 1001 1111
-		    ((write #xc2)
-		     (write r1))
-		  (if (r0 == ,(charset-id 'eight-bit-graphic))
-		      ;; r1          scalar                  utf-8
-		      ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
-		      ;; a0    0000 0000 1010 0000    1100 0010 1010 0000
-		      ;; ff    0000 0000 1111 1111    1101 1111 1011 1111
-		      ((write r1)
-		       (r1 = -1)
-		       (read-multibyte-character r0 r1)
-		       (if (r0 != ,(charset-id 'eight-bit-graphic))
-			   (if (r0 != ,(charset-id 'eight-bit-control))
-			       ((r5 = r0)
-				(r6 = r1))))
-		       (if (r5 < 0)
-			   ((read-multibyte-character r0 r2)
-			    (if (r0 != ,(charset-id 'eight-bit-graphic))
-				(if (r0 != ,(charset-id 'eight-bit-control))
-				    ((r5 = r0)
-				     (r6 = r2))))
-			    (if (r5 < 0)
-				(write r1 r2)
-			      (if (r1 < #xa0)
-				  (write r1)
-				((write #xc2)
-				 (write r1)))))))
-		    ((lookup-character utf-subst-table-for-encode r0 r1)
-		     (if r7		; lookup succeeded
-			 ((r1 = (((r0 & #xf000) >> 12) | #xe0))
-			  (r2 = ((r0 & #x3f) | #x80))
-			  (r0 &= #x0fc0)
-			  (r0 >>= 6)
-			  (r0 |= #x80)
-			  (write r1 r0 r2))
-		       ;; Unsupported character.
-		       ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
-		       ((write #xef)
-			(write #xbf)
-			(write #xbd)))))))))))
-(repeat)))
-(if (r1 >= #xa0)
-	(write r1)
-(if (r1 >= #x80)
-	  ((write #xc2)
-	   (write r1)))))
 "CCL program to encode into UTF-8.")
 (define-ccl-program ccl-untranslated-to-ucs
 `(0
-(if (r0 < #xf0)			; 3-byte encoding, as above
+(if (r1 == 0)
-	((r4 = 0)
+	nil
-	 (r3 = (r1 & #b11000000))
+(if (r0 <= #xC3)			; 2-byte encoding
-	 (r3 |= ((r2 >> 2) & #b00110000))
+	  ((r0 = ((r0 & #x3) << 6))
-	 (if (r3 != #b10100000)
+	   (r0 |= (r1 & #x3F))
-	     (r4 = 1)
+	   (r1 = 2))
-	   ((r3 = ((r0 & #x0f) << 12))
+	(if (r2 == 0)
-	    (r3 += ((r1 & #x3f) << 6))
+	    (r1 = 0)
-	    (r3 += (r2 & #x3f))
+	  (if (r0 < #xF0)		; 3-byte encoding, as above
-	    (if (r3 < #x0800)
+	      ((r0 = ((r0 & #xF) << 12))
-		(r4 = 1))))
+	       (r0 |= ((r1 & #x3F) << 6))
-	 (if (r4 != 0)
+	       (r0 |= (r1 & #x3F))
-	     (r0 = 0)
+	       (r1 = 3))
-	   (r0 = r3)))
+	    (if (r3 == 0)
-(if (r0 < #xf8)			; 4-byte (Mule-UCS recipe)
+		(r1 = 0)
-	  ((r4 = (r1 >> 6))
+	      ((r0 = ((r0 & #x7) << 18))
-	   (if (r4 != #b10)
+	       (r0 |= ((r1 & #x3F) << 12))
-	       (r0 = 0)
+	       (r0 |= ((r2 & #x3F) << 6))
-	     ((r4 = (r2 >> 6))
+	       (r0 |= (r3 & #x3F))
-	      (if (r4 != #b10)
+	       (r1 = 4))))))))
-		  (r0 = 0)
+"Decode 2-, 3-, or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0.
-		((r4 = (r3 >> 6))
+Set r1 to the byte length.  r0 == 0 for invalid sequence.")
-		 (if (r4 != #b10)
-		     (r0 = 0)
-		   ((r1 = ((r1  & #x3F) << 12))
-		    (r2 = ((r2  & #x3F) << 6))
-		    (r3 &= #x3F)
-		    (r0 = (((((r0 & #x07) << 18) | r1) | r2) | r3)))))))))
-	(r0 = 0))))
-"Decode 3- or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0.
-r0 == 0 for invalid sequence.")
 (defvar utf-8-ccl-regs (make-vector 8 0))
 (defsubst utf-8-untranslated-to-ucs ()
 "Return the UCS code for an untranslated sequence of raw bytes t point.
 Only for 3- or 4-byte sequences."
 (aset utf-8-ccl-regs 0 (or (char-after) 0))
 (aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0))
 (aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0))
 (aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0))
-(ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs)
+(ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs))
-(aref utf-8-ccl-regs 0))
 (defun utf-8-help-echo (window object position)
 (format "Untranslated Unicode U+%04X"
 	  (get-char-property position 'untranslated-utf-8 object)))
-;; We compose the untranslatable sequences into a single character.
+;; We compose the untranslatable sequences into a single character,
+;; and move point to the next character.
 ;; This is infelicitous for editing, because there's currently no
 ;; mechanism for treating compositions as atomic, but is OK for
 ;; display.  They are composed to U+FFFD with help-echo which
 ;; indicates the unicodes they represent.  This function GCs too much.
-(defsubst utf-8-compose ()
-"Put a suitable composition on an untranslatable sequence.
+;; If utf-translate-cjk-mode is non-nil, this function is called with
-Return the sequence's length."
+;; HASH-TABLE which translates CJK characters into some of CJK
-(let* ((u (utf-8-untranslated-to-ucs))
+;; charsets.
-	 (l (unless (zerop u)
-	      (if (>= u #x10000)
+(defsubst utf-8-compose (hash-table)
-		       4
+"Put a suitable composition on an untranslatable sequence at point.
-		     3))))
+If HASH-TABLE is non-nil, try to translate CJK characters by it at first.
-(when l
+Move point to the end of the sequence."
-(put-text-property (point) (min (point-max) (+ l (point)))
+(utf-8-untranslated-to-ucs)
-			 'untranslated-utf-8 u)
+(let ((l (aref utf-8-ccl-regs 1))
-(put-text-property (point) (min (point-max) (+ l (point)))
+	ch)
-			 'help-echo 'utf-8-help-echo)
+(if (> l 0)
-(compose-region (point) (+ l (point)) ?$,3u=(B)
+	(if (and hash-table
-l)))
+		 (setq ch (gethash (aref utf-8-ccl-regs 0)  hash-table)))
+	    (progn
+	      (insert ch)
+	      (delete-region (point) (min (point-max) (+ l (point)))))
+	  (setq ch (aref utf-8-ccl-regs 0))
+	  (put-text-property (point) (min (point-max) (+ l (point)))
+			     'untranslated-utf-8 ch)
+	  (put-text-property (point) (min (point-max) (+ l (point)))
+			     'help-echo 'utf-8-help-echo)
+	  (if (= l 2)
+	      (put-text-property (point) (min (point-max) (+ l (point)))
+				 'display (format "\\%03o" ch))
+	    (compose-region (point) (+ l (point)) ?$,3u=(B))
+	  (forward-char l))
+(forward-char 1))))
 (defcustom utf-8-compose-scripts nil
 "*Non-nil means compose various scripts on decoding utf-8 text."
 :group 'mule
 :version "21.4"
 :type 'boolean)
 (defun utf-8-post-read-conversion (length)
 "Compose untranslated utf-8 sequences into single characters.
+If `utf-translate-cjk-mode' is non-nil, tries to translate CJK characters.
 Also compose particular scripts if `utf-8-compose-scripts' is non-nil."
 (save-excursion
-;; Can't do eval-when-compile to insert a multibyte constant
+(save-restriction
-;; version of the string in the loop, since it's always loaded as
+(narrow-to-region (point) (+ (point) length))
-;; unibyte from a byte-compiled file.
+;; Can't do eval-when-compile to insert a multibyte constant
-(let ((range (string-as-multibyte "^\xe1-\xf7")))
+;; version of the string in the loop, since it's always loaded as
-(while (and (skip-chars-forward range)
+;; unibyte from a byte-compiled file.
-		  (not (eobp)))
+(let ((range (string-as-multibyte "^\xc0-\xc3\xe1-\xf7"))
-	(forward-char (utf-8-compose)))))
+	    hash-table ch)
-;; Fixme: Takahashi-san implies it may not work this easily.  I
+	(when utf-translate-cjk-mode
-;; asked why but didn't get a reply. -- fx
+	  (if (not utf-translate-cjk-lang-env)
-(when (and utf-8-compose-scripts (> length 1))
+	      ;; Check these characters:
-;; These currently have definitions which cover the relevant
+	      ;;   "U+2e80-U+33ff", "U+ff00-U+ffef"
-;; unicodes.  We could avoid loading thai-util &c by checking
+	      ;; We may have to translate them to CJK charsets.
-;; whether the region contains any characters with the appropriate
+	      (let ((range2 "$,29@(B-$,2G$,3r`(B-$,3u/(B"))
-;; categories.  There aren't yet Unicode-based rules for Tibetan.
+		(skip-chars-forward (concat range range2))
-(save-excursion (setq length (diacritic-post-read-conversion length)))
+		(unless (eobp)
-(save-excursion (setq length (thai-post-read-conversion length)))
+		  (utf-translate-cjk-load-tables)
-(save-excursion (setq length (lao-post-read-conversion length)))
+		  (setq range (concat range range2)))
-(save-excursion (setq length (devanagari-post-read-conversion length)))
+	  (setq hash-table (get 'utf-subst-table-for-decode
-(save-excursion (setq length (malayalam-post-read-conversion length)))
+				'translation-hash-table)))))
-(save-excursion (setq length (tamil-post-read-conversion length))))
+	(while (and (skip-chars-forward range)
-length)
+		    (not (eobp)))
+	  (setq ch (following-char))
-;; ucs-tables is preloaded
+	  (if (< ch 256)
-;; (defun utf-8-pre-write-conversion (beg end)
+	      (utf-8-compose hash-table)
-;;   "Semi-dummy pre-write function effectively to autoload ucs-tables."
+	    (if (and hash-table
-;;   ;; Ensure translation-table is loaded.
+		     (setq ch (gethash (encode-char ch 'ucs) hash-table)))
-;;   (require 'ucs-tables)
+		(progn
-;;   ;; Don't do this again.
+		  (insert ch)
-;;   (coding-system-put 'mule-utf-8 'pre-write-conversion nil)
+		  (delete-char 1))
-;;   nil)
+	      (forward-char 1)))))
+(when (and utf-8-compose-scripts (> length 1))
+	;; These currently have definitions which cover the relevant
+	;; unicodes.  We could avoid loading thai-util &c by checking
+	;; whether the region contains any characters with the appropriate
+	;; categories.  There aren't yet Unicode-based rules for Tibetan.
+	(diacritic-compose-region (point-max) (point-min))
+	(thai-compose-region (point-max) (point-min))
+	(lao-compose-region (point-max) (point-min))
+	(devanagari-compose-region (point-max) (point-min))
+	(malayalam-compose-region (point-max) (point-min))
+	(tamil-compose-region (point-max) (point-min)))
+(- (point-max) (point-min)))))
+(defun utf-8-pre-write-conversion (beg end)
+"Prepare for `utf-translate-cjk-mode' to encode text between BEG and END.
+This is used as a post-read-conversion of utf-8 coding system."
+(if (and utf-translate-cjk-mode
+	   (not utf-translate-cjk-lang-env)
+	   (save-excursion
+	     (goto-char beg)
+	     (re-search-forward "\\cc\\|\\cj\\|\\ch" end t)))
+(utf-translate-cjk-load-tables))
+nil)
 (make-coding-system
 'mule-utf-8 4 ?u
 "UTF-8 encoding for Emacs-supported Unicode characters.
 It supports Unicode characters of these ranges:
 On encoding (e.g. writing a file), Emacs characters not belonging to
 any of the character sets listed above are encoded into the UTF-8 byte
 sequence representing U+FFFD (REPLACEMENT CHARACTER)."
 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
-'((safe-charsets
+`((safe-charsets
 ascii
 eight-bit-control
 eight-bit-graphic
 latin-iso8859-1
 mule-unicode-0100-24ff
 mule-unicode-2500-33ff
-mule-unicode-e000-ffff)
+mule-unicode-e000-ffff
+,@(if utf-translate-cjk-mode
+	  utf-translate-cjk-charsets))
 (mime-charset . utf-8)
 (coding-category . coding-category-utf-8)
 (valid-codes (0 . 255))
-;;    (pre-write-conversion . utf-8-pre-write-conversion)
+(pre-write-conversion . utf-8-pre-write-conversion)
 (post-read-conversion . utf-8-post-read-conversion)
 (translation-table-for-encode . utf-translation-table-for-encode)
 (dependency unify-8859-on-encoding-mode
 	       unify-8859-on-decoding-mode
 	       utf-fragment-on-decoding

Mercurial > emacs

comparison lisp/international/utf-8.el @ 56037:81dbb510a1db