emacs: lisp/international/ucs-tables.el comparison

comparison lisp/international/ucs-tables.el @ 46506:a7f933a7b003

Optimize tables. Deal with some non-8859 charsets. (ucs-mule-to-mule-unicode): New. (ucs-unify-8859): Use utf-8-fragment-on-decoding, set up Quail translation. (ucs-fragment-8859): Modified consistent with ucs-unify-8859. (unify-8859-on-encoding-mode): Doc mod. Fix custom version. (unify-8859-on-decoding-mode): Doc mod. Change code. Fix custom version. Add custom dependencies. (ucs-insert): Check for null from decode-char. (translation-table-for-input, ucs-quail-activate) (ucs-minibuffer-setup, ccl-encode-unicode-font) (ucs-tables-unload-hook): New.

author	Dave Love <fx@gnu.org>
date	Wed, 17 Jul 2002 19:21:41 +0000
parents	aee6ae77d23b
children	6262acd6c7ac

comparison

equal deleted inserted replaced

-:005d282a48ed
+:a7f933a7b003
 ;; Boston, MA 02111-1307, USA.
 ;;; Commentary:
 ;; This file provides tables mapping between Unicode numbers and
-;; emacs-mule characters from the iso8859 charsets (and others).  It
+;; emacs-mule characters from the iso-8859 charsets (and others).  It
 ;; also provides some auxiliary functions.
 ;; These tables are used to construct other mappings between the Mule
 ;; iso8859 charsets and the emacs-unicode charsets and a table that
-;; unifies iso8859 characters using a single charset as far as
+;; unifies iso-8859 characters using a single charset as far as
 ;; possible.  These tables are used by latin1-disp.el to display some
 ;; Unicode characters without a Unicode font and by utf-8.el to unify
 ;; Latin-N as far as possible on encoding.
 ;; More drastically, they can be used to unify 8859 into Latin-1 plus
 ;; which are supposed to contain distinct 8859 charsets.  Also, it can
 ;; make reading and writing of emacs-mule and iso-2022-based encodings
 ;; not idempotent.
 ;; Global minor modes are provided to unify on encoding and decoding.
+;; These could be extended to non-iso-8859 charsets.  However 8859 is
-;; The translation table `ucs-mule-to-mule-unicode' is populated.
+;; all that users normally care about unifying although, for instance,
-;; This is used by the `mule-utf-8' coding system to encode extra
+;; Greek occurs in as many as nine Emacs charsets.
-;; characters.
+;; The translation table `ucs-mule-to-mule-unicode' is populated,
+;; which could be used for more general unification on decoding.  This
+;; is used by the `mule-utf-8' coding system to encode extra
+;; characters, and also by the coding systems set up by code-pages.el.
+;; The decoding tables here take account of
+;; `utf-8-fragment-on-decoding' which may specify decoding Greek and
+;; Cyrillic into 8859 charsets.
+;; Unification also puts a `translation-table-for-input' property on
+;; relevant coding coding systems and arranges for the
+;; `translation-table-for-input' variable to be set either globally or
+;; locally.  This is used by Quail input methods to translate input
+;; characters appropriately for the buffer's coding system (if
+;; possible).  Unification on decoding sets it globally to translate
+;; to Unicode.  Unification on encoding uses hooks to set it up
+;; locally to buffers.  Thus in the latter case, typing `"a' into a
+;; Latin-1 buffer using the `latin-2-prefix' method translates the
+;; generated latin-iso8859-2 `,Bd(B' into latin-iso8859-1 `,Ad(B'.
+;; NB, this code depends on the default value of
+;; `enable-character-translation'.  (Making it nil would anyway lead
+;; to inconsistent behaviour between CCL-based coding systems which
+;; use explicit translation tables and the rest.)
 ;; Command `ucs-insert' is convenient for inserting a given Unicode.
 ;; (See also the `ucs' input method.)
+;; A replacement CCL program is provided which allows characters in
+;; the `ucs-mule-to-mule-unicode' table to be displayed with an
+;; iso-10646-encoded font.  E.g. to use a `Unicode' font for Cyrillic:
+;;
+;;   (set-fontset-font "fontset-startup"
+;;                     (cons (make-char 'cyrillic-iso8859-5 160)
+;;                           (make-char 'cyrillic-iso8859-5 255))
+;;                     '(nil . "ISO10646-1"))
 ;;; Code:
 ;;; Define tables, to be populated later.
 	     l)
 	 (while (< i 256)
 	   (push (cons (make-char 'latin-iso8859-1 (- i 128)) i)
 		 l)
 	   (setq i (1+ i)))
-	 (nreverse l)))
+	 (nreverse l))))
-;;       (case-table (standard-case-table))
-;;       (syntax-table (standard-syntax-table))
-)
 ;; Convert the lists to the basic char tables.
 (dolist (n (list 15 14 9 8 7 5 4 3 2 1))
 (let ((alist (symbol-value (intern (format "ucs-8859-%d-alist" n)))))
 (dolist (pair alist)
 	      (mu (decode-char 'ucs (cdr pair))))
 	  (aset ucs-mule-8859-to-ucs-table mule uc)
 	  ;; 	  (aset ucs-ucs-to-mule-8859-table uc mule)
 	  ;; 	  (aset ucs-mule-unicode-to-mule-8859 mu mule)
 	  (aset ucs-mule-8859-to-mule-unicode mule mu)
-	  (aset ucs-mule-to-mule-unicode mule mu)))
+	  (aset ucs-mule-to-mule-unicode mule mu)))))
-;; I think this is actually done OK in characters.el.
+;; The table optimizing here and elsewhere probably isn't very
-;; Probably things like accents shouldn't have word syntax, but the
+;; useful, but seems good practice.
-;; Latin-N syntax tables currently aren't consistent for such
+(optimize-char-table ucs-mule-to-mule-unicode)
-;; characters anyhow.
+(optimize-char-table ucs-mule-8859-to-mule-unicode)
-;;      ;; Make the mule-unicode characters inherit syntax and case info
-;;      ;; if they don't already have it.
-;;      (dolist (pair alist)
-;; 	(let ((mule (car pair))
-;; 	      (uc (cdr pair))
-;; 	      (mu (decode-char 'ucs (cdr pair))))
-;; 	  (let ((syntax (aref syntax-table mule)))
-;; 	    (if (eq mule (downcase mule))
-;; 		(if (eq mule (upcase mule)) ; non-letter or uncased letter
-;; 		    (progn
-;; 		      (if (= 4 (car syntax)) ; left delim
-;; 			  (progn
-;; 			    (aset syntax-table
-;; 				  mu
-;; 				  (cons 4 (aref ucs-mule-8859-to-mule-unicode
-;; 						(cdr syntax))))
-;; 			    (aset syntax-table
-;; 				  (aref ucs-mule-8859-to-mule-unicode
-;; 					(cdr syntax))
-;; 				  (cons 5 mu)))
-;; 			(aset syntax-table mu syntax))
-;; 		      (aset case-table mu mu)))
-;; 	      ;; Upper case letter
-;; 	      (let ((lower (aref ucs-mule-8859-to-mule-unicode
-;; 				 (aref case-table mule))))
-;; 		(aset case-table mu lower)
-;; 		(aset case-table lower lower)
-;; 		(modify-syntax-entry lower "w   " syntax-table)
-;; 		(modify-syntax-entry mu "w   " syntax-table))))))
-))
 ;; Derive tables that can be used as per-coding-system
 ;; `translation-table-for-encode's.
 (dolist (n (list 15 14 9 8 7 5 4 3 2 1))
 (let* ((alist (symbol-value (intern (format "ucs-8859-%d-alist" n))))
 	   (encode-translator (set (intern (format "ucs-8859-%d-encode-table"
 ;; unicode as some character in this set.
 (map-char-table (lambda (k v)
 			(if (and (setq elt (rassq v alist))
 				 (not (assq k alist)))
 			    (aset encode-translator k (car elt))))
-		      ucs-mule-8859-to-ucs-table))))
+		      ucs-mule-8859-to-ucs-table)
+(optimize-char-table encode-translator))))
 ;; Register for use in CCL.
 (define-translation-table 'ucs-mule-8859-to-mule-unicode
 ucs-mule-8859-to-mule-unicode)
+(define-translation-table 'ucs-mule-to-mule-unicode
-;; Fixme: Make this reversible, which means frobbing
+ucs-mule-to-mule-unicode)
-;; `char-coding-system-table' directly to remove what we added -- see
-;; codepages.el.  Also make it a user option.
 (defun ucs-unify-8859 (&optional encode-only)
 "Set up translation tables for unifying characters from ISO 8859.
 On decoding, non-ASCII characters are mapped into the `iso-latin-1'
 and `mule-unicode-0100-24ff' charsets.  On encoding, these are mapped
 With prefix arg, do unification on encoding only, i.e. don't unify
 everything on input operations."
 (interactive "P")
 (unless encode-only
 ;; Unify 8859 on decoding.  (Non-CCL coding systems only.)
-(unify-8859-on-decoding-mode 1))
+(if utf-8-fragment-on-decoding
+	(map-char-table
+	 (lambda (k v)
+	   (if v (aset ucs-mule-to-mule-unicode v nil)))
+	 utf-8-translation-table-for-decode)
+;; Reset in case it was changed.
+(map-char-table
+(lambda (k v)
+	 (if v (aset ucs-mule-to-mule-unicode v k)))
+utf-8-translation-table-for-decode))
+(set-char-table-parent standard-translation-table-for-decode
+			   ucs-mule-8859-to-mule-unicode)
+;; Translate Quail input globally.
+(setq-default translation-table-for-input ucs-mule-to-mule-unicode)
+;; In case these are set up, but we should use the global
+;; translation table.
+(remove-hook 'quail-activate-hook 'ucs-quail-activate)
+(remove-hook 'minibuffer-setup-hook 'ucs-minibuffer-setup))
 ;; Adjust the 8859 coding systems to fragment the unified characters
 ;; on encoding.
 (dolist (n '(1 2 3 4 5 7 8 9 14 15))
 (let* ((coding-system
 	    (coding-system-base (intern (format "iso-8859-%d" n))))
 ;; used after they've been registered, but we might as well
 ;; record them.  Setting the parent here is a convenience.
 (set-char-table-parent safe table)
 ;; Update the table of what encodes to what.
 (register-char-codings coding-system table)
-(coding-system-put coding-system 'translation-table-for-encode table)))
+(coding-system-put coding-system 'translation-table-for-encode table)
+(coding-system-put coding-system 'translation-table-for-input table)))
-;;; The following works for the bundled coding systems, but it's
+;; Arrange local translation tables for Quail input.
-;;; better to use the Unicode-based ones and make it irrelevant.
+(add-hook 'quail-activate-hook 'ucs-quail-activate)
+(add-hook 'minibuffer-setup-hook 'ucs-minibuffer-setup))
-;;;   ;; Update the Cyrillic special cases.
-;;;   ;; `translation-table-for-encode' doesn't work for CCL coding
-;;;   ;; systems, and `standard-translation-table-for-decode' isn't
-;;;   ;; applied.
-;;;   (let ((table (get 'cyrillic-koi8-r-encode-table 'translation-table)))
-;;;     (map-char-table
-;;;      (lambda (k v)
-;;;        (aset table
-;;; 	     (or (aref ucs-8859-5-encode-table k)
-;;; 		 k)
-;;; 	     v))
-;;;      table)
-;;;     (register-char-codings 'cyrillic-koi8 table))
-;;;   (let ((table (get 'cyrillic-koi8-r-nonascii-translation-table
-;;; 		    'translation-table)))
-;;;     (map-char-table
-;;;      (lambda (k v)
-;;;        (if v (aset table k (or (aref ucs-mule-8859-to-mule-unicode v)
-;;; 			       v))))
-;;;      table))
-;;;   ;; Redefine this, since the orginal only translated 8859-5.
-;;;   (define-ccl-program ccl-encode-koi8
-;;;     `(1
-;;;       ((loop
-;;; 	(read-multibyte-character r0 r1)
-;;; 	(translate-character cyrillic-koi8-r-encode-table r0 r1)
-;;; 	(write-repeat r1))))
-;;;     "CCL program to encode KOI8.")
-;;;   (let ((table (get 'cyrillic-alternativnyj-encode-table 'translation-table)))
-;;;     (map-char-table
-;;;      (lambda (k v)
-;;;        (aset table
-;;; 	     (or (aref ucs-8859-5-encode-table k)
-;;; 		 k)
-;;; 	     v))
-;;;      table)
-;;;     (register-char-codings 'cyrillic-alternativnyj table))
-;;;   (let ((table (get 'cyrillic-alternativnyj-nonascii-translation-table
-;;; 		    'translation-table)))
-;;;     (map-char-table
-;;;      (lambda (k v)
-;;;        (if v (aset table
-;;; 		   k
-;;; 		   (or (aref ucs-mule-8859-to-mule-unicode v)
-;;; 		       v))))
-;;;      table))
-)
 (defun ucs-fragment-8859 (&optional encode-only)
 "Undo the unification done by `ucs-unify-8859'.
 With prefix arg, undo unification on encoding only, i.e. don't undo
 unification on input operations."
 (interactive "P")
 ;; Maybe fix decoding.
 (unless encode-only
 ;; Unify 8859 on decoding.  (Non-CCL coding systems only.)
-(unify-8859-on-decoding-mode -1))
+(set-char-table-parent standard-translation-table-for-decode nil)
+(setq-default translation-table-for-input nil))
 ;; Fix encoding.  For each charset, remove the entries in
 ;; `char-coding-system-table' added to its safe-chars table (as its
 ;; parent).
 (dolist (n '(1 2 3 4 5 7 8 9 14 15))
 (let* ((coding-system
 	     (let ((codings (aref char-coding-system-table key)))
 	       (aset char-coding-system-table key
 		     (delq coding-system codings)))))
 (char-table-parent safe))
 (set-char-table-parent safe nil)
-(coding-system-put coding-system 'translation-table-for-encode nil))))
+(coding-system-put coding-system 'translation-table-for-encode nil)
+(coding-system-put coding-system 'translation-table-for-input nil)))
+(optimize-char-table char-coding-system-table)
+(remove-hook 'quail-activate-hook 'ucs-quail-activate)
+(remove-hook 'minibuffer-setup-hook 'ucs-minibuffer-setup))
 (define-minor-mode unify-8859-on-encoding-mode
 "Set up translation tables for unifying ISO 8859 characters on encoding.
 The ISO 8859 characters sets overlap, e.g. 8859-1 (Latin-1) and
 unified representation) in a buffer saved as Latin-9 will be encoded
 directly to a byte value 233.  By default, in contrast, you would be
 prompted for a general coding system to use for saving the file, which
 can cope with separate Latin-1 and Latin-9 representations of e-acute.
+Also sets hooks that arrange `translation-table-for-input' to be set
+up locally when Quail input methods are activated.  This will often
+allow input generated by Quail input methods to conform with what the
+buffer's file coding system can encode.  Thus you could use a Latin-2
+input method to search for e-acute in a Latin-1 buffer.
 See also command `unify-8859-on-decoding-mode'."
 :group 'mule
 :global t
-:version "21.3"				; who knows...?
 :init-value nil
 (if unify-8859-on-encoding-mode
 (ucs-unify-8859 t)
 (ucs-fragment-8859 t)))
+(custom-add-version 'unify-8859-on-encoding-mode "21.4")
 (define-minor-mode unify-8859-on-decoding-mode
-"Set up translation table for unifying ISO 8859 characters on decoding.
+"Set up translation tables for unifying ISO 8859 characters on decoding.
-On decoding -- i.e. input operations -- non-ASCII characters from the
+On decoding, i.e. input operations, non-ASCII characters from the
 built-in ISO 8859 charsets are unified by mapping them into the
 `iso-latin-1' and `mule-unicode-0100-24ff' charsets.
-This sets the parent of `standard-translation-table-for-decode'.
 Also sets `translation-table-for-input' globally, so that Quail input
 methods produce unified characters.
-See also command `unify-8859-on-encoding-mode'."
+See also command `unify-8859-on-encoding-mode' and the user option
+`utf-8-fragment-on-decoding'."
 :group 'mule
 :global t
-:version "21.3"				; who knows...?
 :init-value nil
-(let ((table (if unify-8859-on-decoding-mode ucs-mule-8859-to-mule-unicode)))
+(if unify-8859-on-decoding-mode
-(set-char-table-parent standard-translation-table-for-decode table)
+(ucs-unify-8859)
-(setq-default translation-table-for-input table)))
+(ucs-fragment-8859)))
+(custom-add-dependencies 'unify-8859-on-decoding-mode
+			 '(utf-8-fragment-on-decoding))
+(custom-add-version 'unify-8859-on-decoding-mode "21.4")
 (defun ucs-insert (arg)
 "Insert the Emacs character representation of the given Unicode.
 Interactively, prompts for a hex string giving the code."
 (interactive "sUnicode (hex): ")
-(insert (or (decode-char 'ucs (if (integerp arg)
+(let ((c (decode-char 'ucs (if (integerp arg)
-				    arg
+				 arg
-				  (string-to-number arg 16)))
+			       (string-to-number arg 16)))))
-	      (error "Unknown Unicode character"))))
+(if c
+	(insert c)
+(error "Character can't be decoded to UCS"))))
 ;;; Dealing with non-8859 character sets.
 ;; We only set up translation on encoding to utf-8.  Also translation
 ;; tables ucs-CS-encode-table are constructed for some coding systems
 		      (make-translation-table)))))
 	(dolist (pair (symbol-value cs))
 	  (aset ucs-mule-to-mule-unicode (car pair) (cdr pair))
 	  (if encode-translator
 	      (aset encode-translator (cdr pair) (car pair))))
+	(if encode-translator
+	    (optimize-char-table encode-translator))
 	(if (charsetp cs)
 	    (push cs safe-charsets)
 	  (setq safe-charsets
 		(append (delq 'ascii (coding-system-get cs 'safe-charsets))
-			safe-charsets)))))
+			safe-charsets)))
+	(cond ((eq cs 'vietnamese-viscii)
+	       (coding-system-put 'vietnamese-viscii
+				  'translation-table-for-input
+				  encode-translator)
+	       (coding-system-put 'vietnamese-viqr
+				  'translation-table-for-input
+				  encode-translator))
+	      ((memq cs '(lao thai-tis620 tibetan-iso-8bit))
+	       (coding-system-put cs 'translation-table-for-input cs)))))
+(optimize-char-table ucs-mule-to-mule-unicode)
 (dolist (c safe-charsets)
 (aset table (make-char c) t))
 (coding-system-put 'mule-utf-8 'safe-charsets
 		       (append (coding-system-get 'mule-utf-8 'safe-charsets)
 			       safe-charsets))
 (register-char-codings 'mule-utf-8 table)))
+(defvar translation-table-for-input (make-translation-table))
+;; Arrange to set up the translation table for Quail.  This probably
+;; isn't foolproof.
+(defun ucs-quail-activate ()
+"Set up an appropriate `translation-table-for-input' for current buffer.
+Intended to be added to `quail-activate-hook'."
+(let ((cs (coding-system-base buffer-file-coding-system)))
+(if (eq cs 'undecided)
+	(setq cs (coding-system-base default-buffer-file-coding-system)))
+(if (coding-system-get cs 'translation-table-for-input)
+	(set (make-variable-buffer-local 'translation-table-for-input)
+	     (coding-system-get cs 'translation-table-for-input)))))
+;; The minibuffer needs to acquire a `buffer-file-coding-system' for
+;; the above to work in it.
+(defun ucs-minibuffer-setup ()
+"Set up an appropriate `buffer-file-coding-system' for current buffer.
+Does so by inheriting it from the cadr of the current buffer list.
+Intended to be added to `minibuffer-setup-hook'."
+(set (make-local-variable 'buffer-file-coding-system)
+(with-current-buffer (cadr (buffer-list))
+	 buffer-file-coding-system)))
+;; Modified to allow display of arbitrary characters with an
+;; iso-10646-encoded (`Unicode') font.
+(define-ccl-program ccl-encode-unicode-font
+`(0
+((if (r0 == ,(charset-id 'ascii))
+	 ((r2 = r1)
+	  (r1 = 0))
+(
+	;; Look for a translation for non-ASCII chars.  For a 2D
+	;; charset, produce a single code for the translation.
+	;; Official 2D sets are in the charset id range [#x90,#x99],
+	;; private ones in the range [#xf0,#xfe] (with #xff not used).
+	;; Fixme: Is there a better way to do this?
+	(r3 = (r0 >= #x90))
+	(r3 &= (r0 <= #x99))
+	(r3 |= (r0 >= #xf0))
+	(if r3				; 2D input
+	    (r1 = ((r1 << 7) | r2)))
+	(translate-character ucs-mule-to-mule-unicode r0 r1)
+	(r3 = (r0 >= #x90))
+	(r3 &= (r0 <= #x99))
+	(r3 |= (r0 >= #xf0))
+	(if r3 				; 2D translation
+	    ((r2 = (r1 & 127))
+	     (r1 = (r1 >> 7))))
+	(if (r0 == ,(charset-id 'latin-iso8859-1))
+	    ((r2 = (r1 + 128))
+	     (r1 = 0))
+	  (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
+	      ((r1 *= 96)
+	       (r1 += r2)
+	       (r1 += ,(- #x100 (* 32 96) 32))
+	       (r1 >8= 0)
+	       (r2 = r7))
+	    (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
+		((r1 *= 96)
+		 (r1 += r2)
+		 (r1 += ,(- #x2500 (* 32 96) 32))
+		 (r1 >8= 0)
+		 (r2 = r7))
+	      (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
+		  ((r1 *= 96)
+		   (r1 += r2)
+		   (r1 += ,(- #xe000 (* 32 96) 32))
+		   (r1 >8= 0)
+		   (r2 = r7))))))))))
+"Encode characters for display with iso10646 font.
+Translate through table `ucs-mule-to-mule-unicode' initially.")
+(defalias 'ucs-tables-unload-hook 'ucs-fragment-8859)
 (provide 'ucs-tables)
 ;;; ucs-tables.el ends here

Mercurial > emacs

comparison lisp/international/ucs-tables.el @ 46506:a7f933a7b003