changeset 53240:ee5206ee4439

(ctext-non-standard-encodings-alist): Change the format. (ctext-non-standard-encodings): New variable. (ctext-post-read-conversion): Fully re-written. (ctext-non-standard-designations-alist): Delete it. (ctext-non-standard-encodings-table): New function. (ctext-pre-write-conversion): Fully re-written.
author Kenichi Handa <handa@m17n.org>
date Wed, 03 Dec 2003 08:24:42 +0000
parents 82690620d562
children b5acb6a7d79b
files lisp/international/mule.el
diffstat 1 files changed, 125 insertions(+), 115 deletions(-) [+]
line wrap: on
line diff
--- a/lisp/international/mule.el	Wed Dec 03 08:23:53 2003 +0000
+++ b/lisp/international/mule.el	Wed Dec 03 08:24:42 2003 +0000
@@ -1330,12 +1330,42 @@
 ;;; X selections
 
 (defvar ctext-non-standard-encodings-alist
-  '(("ISO8859-15" . iso-8859-15)
-    ("ISO8859-14" . iso-8859-14)
-    ("KOI8-R" . koi8-r)
-    ("BIG5-0" . big5))
-  "Alist of non-standard encoding names vs Emacs coding systems.
-This alist is used to decode an extened segment of a compound text.")
+  '(("big5-0" big5 2 (chinese-big5-1 chinese-big5-2))
+    ("ISO8859-14" iso-8859-14 1 latin-iso8859-14)
+    ("ISO8859-15" iso-8859-15 1 latin-iso8859-15))
+  "Alist of non-standard encoding names vs the corresponding usages in CTEXT.
+
+It controls how extended segments of a compound text are handled
+by the coding system `compound-text-with-extensions'.
+
+Each element has the form (ENCODING-NAME CODING-SYSTEM N-OCTET CHARSET).
+
+ENCODING-NAME is an encoding name of an \"extended segments\".
+
+CODING-SYSTEM is the coding-system to encode (or decode) the
+characters into (or from) the extended segment.
+
+N-OCTET is the number of octets (bytes) that encodes a character
+in the segment.  It can be 0 (meaning the number of octets per
+character is variable), 1, 2, 3, or 4.
+
+CHARSET is a charater set containing characters that are encoded
+in the segment.  It can be a list of character sets.  It can also
+be a char-table, in which case characters that have non-nil value
+in the char-table are the target.
+
+On decoding CTEXT, all encoding names listed here are recognized.
+
+On encoding CTEXT, encoding names in the variable
+`ctext-non-standard-encodings' (which see) and in the information
+listed for the current language environment under the key
+`ctext-non-standard-encodings' are used.")
+
+(defvar ctext-non-standard-encodings
+  '("big5-0")
+  "List of non-standard encoding names used in extended segments of CTEXT.
+Each element must be one of the names listed in the variable
+`ctext-non-standard-encodings-alist' (which see).")
 
 (defvar ctext-non-standard-encodings-regexp
   (string-to-multibyte
@@ -1347,13 +1377,9 @@
     "\\(\e%G[^\e]*\e%@\\)")))
 
 ;; Functions to support "Non-Standard Character Set Encodings" defined
-;; by the COMPOUND-TEXT spec.
-;; We support that by decoding the whole data by `ctext' which just
-;; pertains byte sequences belonging to ``extended segment'', then
-;; decoding those byte sequences one by one in Lisp.
-;; This function also supports "The UTF-8 encoding" described in the
-;; section 7 of the documentation fo COMPOUND-TEXT distributed with
-;; XFree86.
+;; by the COMPOUND-TEXT spec.  They also support "The UTF-8 encoding"
+;; described in the section 7 of the documentation of COMPOUND-TEXT
+;; distributed with XFree86.
 
 (defun ctext-post-read-conversion (len)
   "Decode LEN characters encoded as Compound Text with Extended Segments."
@@ -1365,7 +1391,6 @@
 	    pos bytes)
 	(or in-workbuf
 	    (narrow-to-region (point) (+ (point) len)))
-	(decode-coding-region (point-min) (point-max) 'ctext)
 	(if in-workbuf
 	    (set-buffer-multibyte t))
 	(while (re-search-forward ctext-non-standard-encodings-regexp
@@ -1376,11 +1401,14 @@
 	      (let* ((M (char-after (+ pos 4)))
 		     (L (char-after (+ pos 5)))
 		     (encoding (match-string 2))
-		     (coding (or (cdr (assoc-ignore-case 
-				       encoding
-				       ctext-non-standard-encodings-alist))
-				 (coding-system-p
-				  (intern (downcase encoding))))))
+		     (encoding-info (assoc-ignore-case 
+				     encoding
+				     ctext-non-standard-encodings-alist))
+		     (coding (if encoding-info
+				 (nth 1 encoding-info)
+			       (setq encoding (intern (downcase encoding)))
+			       (and (coding-system-p encoding)
+				    encoding))))
 		(setq bytes (- (+ (* (- M 128) 128) (- L 128))
 			       (- (point) (+ pos 6))))
 		(when coding
@@ -1388,66 +1416,39 @@
 		  (forward-char bytes)
 		  (decode-coding-region (- (point) bytes) (point) coding)))
 	    ;; ESC % G --UTF-8-BYTES-- ESC % @
-	    (setq bytes (- (point) pos))
-	    (decode-coding-region (- (point) bytes) (point) 'utf-8))))
+	    (delete-char -3)
+	    (delete-region pos (+ pos 3))
+	    (decode-coding-region pos (point) 'utf-8))))
       (goto-char (point-min))
       (- (point-max) (point)))))
 
-;; From X registry 2001/06/01
-;; 20. NON-STANDARD CHARACTER SET ENCODINGS
-
-;; See Section 6 of the Compound Text standard.
-
-;; Name						Reference
-;; ----						---------
-;; "DEC.CNS11643.1986-2"				[53]
-;; 	CNS11643 2-plane using the recommended
-;; 	internal representation scheme
-;; "DEC.DTSCS.1990-2"				[54]
-;; 	DEC Taiwan Supplemental Character Set
-;; "fujitsu.u90x03"				[87]
-;; "ILA"						[62]
-;; 	registry prefix
-;; "IPSYS"						[59]
-;; 	registry prefix
-;; "omron_UDC"					[45]
-;;         omron User Defined Charset
-;; "omron_UDC_ja"					[45]
-;;         omron User Defined Charset for Japanese
-;; "omron_UDC_zh"					[45]
-;;         omron User Defined Charset for Chinese(Main land)
-;; "omron_UDC_tw"					[45]
-;;         omron User Defined Charset for Chinese(Taiwan)
+;; Return a char table of extended segment usage for each character.
+;; Each value of the char table is nil, one of the elements of
+;; `ctext-non-standard-encodings-alist', or the symbol `utf-8'.
 
-;; If you add charsets here, be sure to modify the regexp used by
-;; ctext-pre-write-conversion to look up non-standard charsets.
-(defvar ctext-non-standard-designations-alist
-  '(("$(0" . (big5 "big5-0" 2))
-    ("$(1" . (big5 "big5-0" 2))
-    ;; The following are actually standard; generating extended
-    ;; segments for them is wrong and screws e.g. Latin-9 users.
-    ;; 8859-{10,13,16} aren't Emacs charsets anyhow.  -- fx
-;;     ("-V"  . (t "iso8859-10" 1))
-;;     ("-Y"  . (t "iso8859-13" 1))
-;;     ("-_"  . (t "iso8859-14" 1))
-;;     ("-b"  . (t "iso8859-15" 1))
-;;     ("-f"  . (t "iso8859-16" 1))
-    )
-  "Alist of ctext control sequences that introduce character sets which
-are not in the list of approved encodings, and the corresponding
-coding system, identifier string, and number of octets per encoded
-character.
-
-Each element has the form (CTLSEQ . (ENCODING CHARSET NOCTETS)).  CTLSEQ
-is the control sequence (sans the leading ESC) that introduces the character
-set in the text encoded by compound-text.  ENCODING is a coding system
-symbol; if it is t, it means that the ctext coding system already encodes
-the text correctly, and only the leading control sequence needs to be altered.
-If ENCODING is a coding system, we need to re-encode the text with that
-coding system.  CHARSET is the name of the charset we need to put into
-the leading control sequence.  NOCTETS is the number of octets (bytes) that
-encode each character in this charset.  NOCTETS can be 0 (meaning the number
-of octets per character is variable), 1, 2, 3, or 4.")
+(defun ctext-non-standard-encodings-table ()
+  (let ((table (make-char-table 'translation-table)))
+    (aset table (make-char 'mule-unicode-0100-24ff) 'utf-8)
+    (aset table (make-char 'mule-unicode-2500-33ff) 'utf-8)
+    (aset table (make-char 'mule-unicode-e000-ffff) 'utf-8)
+    (dolist (encoding (reverse
+		       (append
+			(get-language-info current-language-environment
+					   'ctext-non-standard-encodings)
+			ctext-non-standard-encodings)))
+      (let* ((slot (assoc encoding ctext-non-standard-encodings-alist))
+	     (charset (nth 3 slot)))
+	(if charset
+	    (cond ((charsetp charset)
+		   (aset table (make-char charset) slot))
+		  ((listp charset)
+		   (dolist (elt charset)
+		     (aset table (make-char elt) slot)))
+		  ((char-table-p charset)
+		   (map-char-table #'(lambda (k v) 
+				   (if (and v (> k 128)) (aset table k slot)))
+				   charset))))))
+    table))
 
 (defun ctext-pre-write-conversion (from to)
   "Encode characters between FROM and TO as Compound Text w/Extended Segments.
@@ -1470,47 +1471,56 @@
 	     (insert-buffer-substring buf from to))))
 
     ;; Now we can encode the whole buffer.
-    (let ((case-fold-search nil)
+    (let ((encoding-table (ctext-non-standard-encodings-table))
 	  last-coding-system-used
-	  pos posend desig encode-info encoding chset noctets textlen)
-      (goto-char (point-min))
-      ;; At first encode the whole buffer.
-      (encode-coding-region (point-min) (point-max) 'ctext-no-compositions)
-      ;; Then replace ISO-2022 charset designations with extended
-      ;; segments, for those charsets that are not part of the
-      ;; official X registry.  The regexp below finds the leading
-      ;; sequences for big5.
-      (while (re-search-forward "\e\\(\$([01]\\)" nil 'move)
-	(setq pos (match-beginning 0)
-	      posend (point)
-	      desig (match-string 1)
-	      encode-info (cdr (assoc desig
-				      ctext-non-standard-designations-alist))
-	      encoding (car encode-info)
-	      chset (cadr encode-info)
-	      noctets (car (cddr encode-info)))
-	(skip-chars-forward "^\e")
-	(cond
-	 ((eq encoding t)  ; only the leading sequence needs to be changed
-	  (setq textlen (+ (- (point) posend) (length chset) 1))
-	  ;; Generate the control sequence for an extended segment.
-	  (replace-match (format "\e%%/%d%c%c%s"
-				 noctets
-				 (+ (/ textlen 128) 128)
-				 (+ (% textlen 128) 128)
-				 chset)
-			 t t))
-	 ((coding-system-p encoding) ; need to recode the entire segment...
-	  (decode-coding-region pos (point) 'ctext-no-compositions)
-	  (encode-coding-region pos (point) encoding)
-	  (setq textlen (+ (- (point) pos) (length chset) 1))
-	  (save-excursion
-	    (goto-char pos)
-	    (insert (format "\e%%/%d%c%c%s"
-			    noctets
-			    (+ (/ textlen 128) 128)
-			    (+ (% textlen 128) 128)
-			    chset))))))
+	  last-pos last-encoding-info
+	  encoding-info end-pos)
+      (goto-char (setq last-pos (point-min)))
+      (setq end-pos (point-marker))
+      (while (re-search-forward "[^\000-\177]+" nil t)
+	;; Found a sequence of non-ASCII characters.
+	(setq last-pos (match-beginning 0)
+	      last-encoding-info (aref encoding-table (char-after last-pos)))
+	(set-marker end-pos (match-end 0))
+	(goto-char (1+ last-pos))
+	(catch 'tag
+	  (while t
+	    (setq encoding-info
+		  (if (< (point) end-pos)
+		      (aref encoding-table (following-char))))
+	    (unless (eq last-encoding-info encoding-info)
+	      (cond ((consp last-encoding-info)
+		     ;; Encode the previous range using an extended
+		     ;; segment.
+		     (let ((encoding-name (car last-encoding-info))
+			   (coding-system (nth 1 last-encoding-info))
+			   (noctets (nth 2 last-encoding-info))
+			   len)
+		       (encode-coding-region last-pos (point) coding-system)
+		       (setq len (+ (length encoding-name) 1
+				    (- (point) last-pos)))
+		       (save-excursion
+			 (goto-char last-pos)
+			 (insert (string-to-multibyte 
+				  (format "\e%%/%d%c%c%s"
+					  noctets
+					  (+ (/ len 128) 128)
+					  (+ (% len 128) 128)
+					  encoding-name))))))
+		    ((eq last-encoding-info 'utf-8)
+		     ;; Encode the previous range using UTF-8 encoding
+		     ;; extention.
+		     (encode-coding-region last-pos (point) 'mule-utf-8)
+		     (save-excursion
+		       (goto-char last-pos)
+		       (insert "\e%G"))
+		     (insert "\e%@")))
+	      (setq last-pos (point)
+		    last-encoding-info encoding-info))
+	    (if (< (point) end-pos)
+		(forward-char 1)
+	      (throw 'tag nil)))))
+      (set-marker end-pos nil)
       (goto-char (point-min))))
   ;; Must return nil, as build_annotations_2 expects that.
   nil)