emacs: lisp/international/mule.el comparison

comparison lisp/international/mule.el @ 53240:ee5206ee4439

(ctext-non-standard-encodings-alist): Change the format. (ctext-non-standard-encodings): New variable. (ctext-post-read-conversion): Fully re-written. (ctext-non-standard-designations-alist): Delete it. (ctext-non-standard-encodings-table): New function. (ctext-pre-write-conversion): Fully re-written.

author	Kenichi Handa <handa@m17n.org>
date	Wed, 03 Dec 2003 08:24:42 +0000
parents	810931aa5f2d
children	5c66f1de4907

comparison

equal deleted inserted replaced

-:82690620d562
+:ee5206ee4439
 (set-coding-priority-internal)))
 ;;; X selections
 (defvar ctext-non-standard-encodings-alist
-'(("ISO8859-15" . iso-8859-15)
+'(("big5-0" big5 2 (chinese-big5-1 chinese-big5-2))
-("ISO8859-14" . iso-8859-14)
+("ISO8859-14" iso-8859-14 1 latin-iso8859-14)
-("KOI8-R" . koi8-r)
+("ISO8859-15" iso-8859-15 1 latin-iso8859-15))
-("BIG5-0" . big5))
+"Alist of non-standard encoding names vs the corresponding usages in CTEXT.
-"Alist of non-standard encoding names vs Emacs coding systems.
-This alist is used to decode an extened segment of a compound text.")
+It controls how extended segments of a compound text are handled
+by the coding system `compound-text-with-extensions'.
+Each element has the form (ENCODING-NAME CODING-SYSTEM N-OCTET CHARSET).
+ENCODING-NAME is the encoding name of an \"extended segment\".
+CODING-SYSTEM is the coding-system to encode (or decode) the
+characters into (or from) the extended segment.
+N-OCTET is the number of octets (bytes) that encodes a character
+in the segment.  It can be 0 (meaning the number of octets per
+character is variable), 1, 2, 3, or 4.
+CHARSET is a character set containing characters that are encoded
+in the segment.  It can be a list of character sets.  It can also
+be a char-table, in which case characters that have non-nil value
+in the char-table are the target.
+On decoding CTEXT, all encoding names listed here are recognized.
+On encoding CTEXT, encoding names in the variable
+`ctext-non-standard-encodings' (which see) and in the information
+listed for the current language environment under the key
+`ctext-non-standard-encodings' are used.")
+(defvar ctext-non-standard-encodings
+'("big5-0")
+"List of non-standard encoding names used in extended segments of CTEXT.
+Each element must be one of the names listed in the variable
+`ctext-non-standard-encodings-alist' (which see).")
 (defvar ctext-non-standard-encodings-regexp
 (string-to-multibyte
 (concat
 ;; For non-standard encodings.
 "\\|"
 ;; For UTF-8 encoding.
 "\\(\e%G[^\e]*\e%@\\)")))
 ;; Functions to support "Non-Standard Character Set Encodings" defined
-;; by the COMPOUND-TEXT spec.
+;; by the COMPOUND-TEXT spec.  They also support "The UTF-8 encoding"
-;; We support that by decoding the whole data by `ctext' which just
+;; described in the section 7 of the documentation of COMPOUND-TEXT
-;; pertains byte sequences belonging to ``extended segment'', then
+;; distributed with XFree86.
-;; decoding those byte sequences one by one in Lisp.
-;; This function also supports "The UTF-8 encoding" described in the
-;; section 7 of the documentation fo COMPOUND-TEXT distributed with
-;; XFree86.
 (defun ctext-post-read-conversion (len)
 "Decode LEN characters encoded as Compound Text with Extended Segments."
 (save-match-data
 (save-restriction
 	    (in-workbuf (string= (buffer-name) " *code-converting-work*"))
 	    last-coding-system-used
 	    pos bytes)
 	(or in-workbuf
 	    (narrow-to-region (point) (+ (point) len)))
-	(decode-coding-region (point-min) (point-max) 'ctext)
 	(if in-workbuf
 	    (set-buffer-multibyte t))
 	(while (re-search-forward ctext-non-standard-encodings-regexp
 				  nil 'move)
 	  (setq pos (match-beginning 0))
 	  (if (match-beginning 1)
 	      ;; ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
 	      (let* ((M (char-after (+ pos 4)))
 		     (L (char-after (+ pos 5)))
 		     (encoding (match-string 2))
-		     (coding (or (cdr (assoc-ignore-case
+		     (encoding-info (assoc-ignore-case
-				       encoding
+				     encoding
-				       ctext-non-standard-encodings-alist))
+				     ctext-non-standard-encodings-alist))
-				 (coding-system-p
+		     (coding (if encoding-info
-				  (intern (downcase encoding))))))
+				 (nth 1 encoding-info)
+			       (setq encoding (intern (downcase encoding)))
+			       (and (coding-system-p encoding)
+				    encoding))))
 		(setq bytes (- (+ (* (- M 128) 128) (- L 128))
 			       (- (point) (+ pos 6))))
 		(when coding
 		  (delete-region pos (point))
 		  (forward-char bytes)
 		  (decode-coding-region (- (point) bytes) (point) coding)))
 	    ;; ESC % G --UTF-8-BYTES-- ESC % @
-	    (setq bytes (- (point) pos))
+	    (delete-char -3)
-	    (decode-coding-region (- (point) bytes) (point) 'utf-8))))
+	    (delete-region pos (+ pos 3))
+	    (decode-coding-region pos (point) 'utf-8))))
 (goto-char (point-min))
 (- (point-max) (point)))))
-;; From X registry 2001/06/01
+;; Return a char table of extended segment usage for each character.
-;; 20. NON-STANDARD CHARACTER SET ENCODINGS
+;; Each value of the char table is nil, one of the elements of
+;; `ctext-non-standard-encodings-alist', or the symbol `utf-8'.
-;; See Section 6 of the Compound Text standard.
+(defun ctext-non-standard-encodings-table ()
-;; Name						Reference
+(let ((table (make-char-table 'translation-table)))
-;; ----						---------
+(aset table (make-char 'mule-unicode-0100-24ff) 'utf-8)
-;; "DEC.CNS11643.1986-2"				[53]
+(aset table (make-char 'mule-unicode-2500-33ff) 'utf-8)
-;; 	CNS11643 2-plane using the recommended
+(aset table (make-char 'mule-unicode-e000-ffff) 'utf-8)
-;; 	internal representation scheme
+(dolist (encoding (reverse
-;; "DEC.DTSCS.1990-2"				[54]
+		       (append
-;; 	DEC Taiwan Supplemental Character Set
+			(get-language-info current-language-environment
-;; "fujitsu.u90x03"				[87]
+					   'ctext-non-standard-encodings)
-;; "ILA"						[62]
+			ctext-non-standard-encodings)))
-;; 	registry prefix
+(let* ((slot (assoc encoding ctext-non-standard-encodings-alist))
-;; "IPSYS"						[59]
+	     (charset (nth 3 slot)))
-;; 	registry prefix
+	(if charset
-;; "omron_UDC"					[45]
+	    (cond ((charsetp charset)
-;;         omron User Defined Charset
+		   (aset table (make-char charset) slot))
-;; "omron_UDC_ja"					[45]
+		  ((listp charset)
-;;         omron User Defined Charset for Japanese
+		   (dolist (elt charset)
-;; "omron_UDC_zh"					[45]
+		     (aset table (make-char elt) slot)))
-;;         omron User Defined Charset for Chinese(Main land)
+		  ((char-table-p charset)
-;; "omron_UDC_tw"					[45]
+		   (map-char-table #'(lambda (k v)
-;;         omron User Defined Charset for Chinese(Taiwan)
+				   (if (and v (> k 128)) (aset table k slot)))
+				   charset))))))
-;; If you add charsets here, be sure to modify the regexp used by
+table))
-;; ctext-pre-write-conversion to look up non-standard charsets.
-(defvar ctext-non-standard-designations-alist
-'(("$(0" . (big5 "big5-0" 2))
-("$(1" . (big5 "big5-0" 2))
-;; The following are actually standard; generating extended
-;; segments for them is wrong and screws e.g. Latin-9 users.
-;; 8859-{10,13,16} aren't Emacs charsets anyhow.  -- fx
-;;     ("-V"  . (t "iso8859-10" 1))
-;;     ("-Y"  . (t "iso8859-13" 1))
-;;     ("-_"  . (t "iso8859-14" 1))
-;;     ("-b"  . (t "iso8859-15" 1))
-;;     ("-f"  . (t "iso8859-16" 1))
-)
-"Alist of ctext control sequences that introduce character sets which
-are not in the list of approved encodings, and the corresponding
-coding system, identifier string, and number of octets per encoded
-character.
-Each element has the form (CTLSEQ . (ENCODING CHARSET NOCTETS)).  CTLSEQ
-is the control sequence (sans the leading ESC) that introduces the character
-set in the text encoded by compound-text.  ENCODING is a coding system
-symbol; if it is t, it means that the ctext coding system already encodes
-the text correctly, and only the leading control sequence needs to be altered.
-If ENCODING is a coding system, we need to re-encode the text with that
-coding system.  CHARSET is the name of the charset we need to put into
-the leading control sequence.  NOCTETS is the number of octets (bytes) that
-encode each character in this charset.  NOCTETS can be 0 (meaning the number
-of octets per character is variable), 1, 2, 3, or 4.")
 (defun ctext-pre-write-conversion (from to)
 "Encode characters between FROM and TO as Compound Text w/Extended Segments.
 If FROM is a string, or if the current buffer is not the one set up for us
 	     (set-buffer (generate-new-buffer " *temp"))
 	     (set-buffer-multibyte multibyte)
 	     (insert-buffer-substring buf from to))))
 ;; Now we can encode the whole buffer.
-(let ((case-fold-search nil)
+(let ((encoding-table (ctext-non-standard-encodings-table))
 	  last-coding-system-used
-	  pos posend desig encode-info encoding chset noctets textlen)
+	  last-pos last-encoding-info
-(goto-char (point-min))
+	  encoding-info end-pos)
-;; At first encode the whole buffer.
+(goto-char (setq last-pos (point-min)))
-(encode-coding-region (point-min) (point-max) 'ctext-no-compositions)
+(setq end-pos (point-marker))
-;; Then replace ISO-2022 charset designations with extended
+(while (re-search-forward "[^\000-\177]+" nil t)
-;; segments, for those charsets that are not part of the
+	;; Found a sequence of non-ASCII characters.
-;; official X registry.  The regexp below finds the leading
+	(setq last-pos (match-beginning 0)
-;; sequences for big5.
+	      last-encoding-info (aref encoding-table (char-after last-pos)))
-(while (re-search-forward "\e\\(\$([01]\\)" nil 'move)
+	(set-marker end-pos (match-end 0))
-	(setq pos (match-beginning 0)
+	(goto-char (1+ last-pos))
-	      posend (point)
+	(catch 'tag
-	      desig (match-string 1)
+	  (while t
-	      encode-info (cdr (assoc desig
+	    (setq encoding-info
-				      ctext-non-standard-designations-alist))
+		  (if (< (point) end-pos)
-	      encoding (car encode-info)
+		      (aref encoding-table (following-char))))
-	      chset (cadr encode-info)
+	    (unless (eq last-encoding-info encoding-info)
-	      noctets (car (cddr encode-info)))
+	      (cond ((consp last-encoding-info)
-	(skip-chars-forward "^\e")
+		     ;; Encode the previous range using an extended
-	(cond
+		     ;; segment.
-	 ((eq encoding t)  ; only the leading sequence needs to be changed
+		     (let ((encoding-name (car last-encoding-info))
-	  (setq textlen (+ (- (point) posend) (length chset) 1))
+			   (coding-system (nth 1 last-encoding-info))
-	  ;; Generate the control sequence for an extended segment.
+			   (noctets (nth 2 last-encoding-info))
-	  (replace-match (format "\e%%/%d%c%c%s"
+			   len)
-				 noctets
+		       (encode-coding-region last-pos (point) coding-system)
-				 (+ (/ textlen 128) 128)
+		       (setq len (+ (length encoding-name) 1
-				 (+ (% textlen 128) 128)
+				    (- (point) last-pos)))
-				 chset)
+		       (save-excursion
-			 t t))
+			 (goto-char last-pos)
-	 ((coding-system-p encoding) ; need to recode the entire segment...
+			 (insert (string-to-multibyte
-	  (decode-coding-region pos (point) 'ctext-no-compositions)
+				  (format "\e%%/%d%c%c%s"
-	  (encode-coding-region pos (point) encoding)
+					  noctets
-	  (setq textlen (+ (- (point) pos) (length chset) 1))
+					  (+ (/ len 128) 128)
-	  (save-excursion
+					  (+ (% len 128) 128)
-	    (goto-char pos)
+					  encoding-name))))))
-	    (insert (format "\e%%/%d%c%c%s"
+		    ((eq last-encoding-info 'utf-8)
-			    noctets
+		     ;; Encode the previous range using UTF-8 encoding
-			    (+ (/ textlen 128) 128)
+		     ;; extension.
-			    (+ (% textlen 128) 128)
+		     (encode-coding-region last-pos (point) 'mule-utf-8)
-			    chset))))))
+		     (save-excursion
+		       (goto-char last-pos)
+		       (insert "\e%G"))
+		     (insert "\e%@")))
+	      (setq last-pos (point)
+		    last-encoding-info encoding-info))
+	    (if (< (point) end-pos)
+		(forward-char 1)
+	      (throw 'tag nil)))))
+(set-marker end-pos nil)
 (goto-char (point-min))))
 ;; Must return nil, as build_annotations_2 expects that.
 nil)
 ;;; FILE I/O

Mercurial > emacs

comparison lisp/international/mule.el @ 53240:ee5206ee4439