diff lisp/international/characters.el @ 88612:ec8e29bbca37

Various simplifications and additions.
author Dave Love <fx@gnu.org>
date Thu, 23 May 2002 18:16:52 +0000
parents 98ec5d812ccc
children 3a7ca837e9b9
line wrap: on
line diff
--- a/lisp/international/characters.el	Thu May 23 18:15:02 2002 +0000
+++ b/lisp/international/characters.el	Thu May 23 18:16:52 2002 +0000
@@ -2,7 +2,7 @@
 
 ;; Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
 ;; Licensed to the Free Software Foundation.
-;; Copyright (C) 2001 Free Software Foundation, Inc.
+;; Copyright (C) 2001, 2002 Free Software Foundation, Inc.
 ;; Copyright (C) 2001, 2002
 ;;   National Institute of Advanced Industrial Science and Technology (AIST)
 ;;   Registration Number H13PRO009
@@ -113,9 +113,7 @@
 		  arabic-2-column)))
   (while charsets
 ;;     (modify-syntax-entry (make-char (car charsets)) "w")
-    (map-charset-chars
-     #'(lambda (char ignore) (modify-category-entry char ?b))
-     (car charsets))
+    (map-charset-chars #'modify-category-entry (car charsets) ?b)
     (setq charsets (cdr charsets))))
 (modify-category-entry '(#x600 . #x6ff) ?b)
 (modify-category-entry '(#xfb50 . #xfdff) ?b)
@@ -142,6 +140,8 @@
 (modify-syntax-entry ?\〗 ")〖")
 (modify-syntax-entry ?\】 ")【")
 
+;; Fixme: should any Chinese stuff be re-instated?
+
 ;; (modify-category-entry (make-char 'chinese-gb2312) ?c)
 ;; (modify-category-entry (make-char 'chinese-gb2312) ?\|)
 ;; (modify-category-entry (make-char 'chinese-gb2312 35) ?A)
@@ -191,10 +191,7 @@
 
 ;; Cyrillic character set (ISO-8859-5)
 
-(modify-syntax-entry (decode-char 'iso-8859-5 160) " ")
-(modify-syntax-entry ?­ ".")
 (modify-syntax-entry ?№ ".")
-(modify-syntax-entry ?§ ".")
 (let ((tbl (standard-case-table)))
   (set-case-syntax-pair ?Ё ?ё tbl)
   (set-case-syntax-pair ?Ђ ?ђ tbl)
@@ -285,23 +282,16 @@
 
 ;; Ethiopic character set
 
-;; (modify-category-entry (make-char 'ethiopic) ?e)
-;; (modify-syntax-entry (make-char 'ethiopic) "w")
 (modify-category-entry '(#x1200 . #x137b) ?e)
-(let ((chars '(? ? ? ? ? ? ? ? ? ? ? ? ? ?
-	       ;; Unicode equivalents of the above:
-	       ?፡ ?። ?፣ ?፤ ?፥ ?፦ ?፧ ?፨ ?ﷰ ?ﷻ ?﷼ ?﷽ ?﷾ ?﷿)))
+(let ((chars '(? ? ? ? ? ? ? ? ? ? ? ? ? ?)))
   (while chars
     (modify-syntax-entry (car chars) ".")
     (setq chars (cdr chars))))
+(map-charset-chars #'modify-category-entry 'ethiopic ?e)
 
 ;; Greek character set (ISO-8859-7)
 
-;; (modify-category-entry (make-char 'greek-iso8859-7) ?g)
-(let ((c #x370))
-  (while (<= c #x3ff)
-    (modify-category-entry (decode-char 'ucs c) ?g)
-    (setq c (1+ c))))
+(modify-category-entry '(#x370 . #x3ff) ?g)
 
 ;; (let ((c 182))
 ;;   (while (< c 255)
@@ -364,19 +354,15 @@
 
 ;; Hebrew character set (ISO-8859-8)
 
-;; (modify-category-entry (make-char 'hebrew-iso8859-8) ?w)
-(let ((c #x591))
-  (while (<= c #x5f4)
-    (modify-category-entry (decode-char 'ucs c) ?w)
-    (setq c (1+ c))))
+(modify-category-entry '(#x590 . #x5f4) ?w)
 
 ;; (modify-syntax-entry (make-char 'hebrew-iso8859-8 208) ".") ; PASEQ
 ;; (modify-syntax-entry (make-char 'hebrew-iso8859-8 211) ".") ; SOF PASUQ
-(modify-syntax-entry (decode-char 'ucs #x5be) ".") ; MAQAF
-(modify-syntax-entry (decode-char 'ucs #x5c0) ".") ; PASEQ
-(modify-syntax-entry (decode-char 'ucs #x5c3) ".") ; SOF PASUQ
-(modify-syntax-entry (decode-char 'ucs #x5f3) ".") ; GERESH
-(modify-syntax-entry (decode-char 'ucs #x5f4) ".") ; GERSHAYIM
+(modify-syntax-entry #x5be ".") ; MAQAF
+(modify-syntax-entry #x5c0 ".") ; PASEQ
+(modify-syntax-entry #x5c3 ".") ; SOF PASUQ
+(modify-syntax-entry #x5f3 ".") ; GERESH
+(modify-syntax-entry #x5f4 ".") ; GERSHAYIM
 
 ;; (let ((c 224))
 ;;   (while (< c 251)
@@ -390,10 +376,9 @@
 ;; (modify-category-entry (make-char 'indian-2-column) ?I)
 ;; (modify-category-entry (make-char 'indian-glyph) ?I)
 ;; Unicode Devanagari block
-(let ((c #x901))
-  (while (<= c #x970)
-    (modify-category-entry (decode-char 'ucs c) ?i)
-    (setq c (1+ c))))
+(modify-category-entry '(#x901 . #x970) ?i)
+(map-charset-chars #'modify-category-entry 'indian-is13194 ?i)
+(map-charset-chars #'modify-category-entry 'indian-2-column ?i)
 
 ;;; Commented out since the categories appear not to be used anywhere
 ;;; and word syntax is the default.
@@ -468,27 +453,27 @@
 ;; Unicode equivalents of JISX0201-kana
 (let ((c #xff61))
   (while (<= c #xff9f)
-    (modify-category-entry (decode-char 'ucs c) ?k)
-    (modify-category-entry (decode-char 'ucs c) ?j)
-    (modify-category-entry (decode-char 'ucs c) ?\|)
+    (modify-category-entry c ?k)
+    (modify-category-entry c ?j)
+    (modify-category-entry c ?\|)
     (setq c (1+ c))))
 
 ;; Katakana block
 (let ((c #x30a0))
   (while (<= c #x30ff)
     ;; ?K is double width, ?k isn't specified
-    (modify-category-entry (decode-char 'ucs c) ?K)
+    (modify-category-entry c ?K)
     ;;(modify-category-entry (decode-char 'ucs c) ?j)
-    (modify-category-entry (decode-char 'ucs c) ?\|) 
+    (modify-category-entry c ?\|) 
     (setq c (1+ c))))
 
 ;; Hiragana block
 (let ((c #x3040))
   (while (<= c #x309f)
     ;; ?H is actually defined to be double width
-    (modify-category-entry (decode-char 'ucs c) ?H)
+    (modify-category-entry c ?H)
     ;;(modify-category-entry (decode-char 'ucs c) ?j)
-    (modify-category-entry (decode-char 'ucs c) ?\|) 
+    (modify-category-entry c ?\|) 
     (setq c (1+ c))))
 
 ;; JISX0208
@@ -498,8 +483,7 @@
 			   (decode-char 'japanese-jisx0208 #x287E)) "_")
 (let ((chars '(? ? ? ? ? ? ? ? ? ? ? ?)))
   (dolist (elt chars)
-    (modify-syntax-entry (car chars) "w")
-    (setq chars (cdr chars))))
+    (modify-syntax-entry (car chars) "w")))
 (modify-syntax-entry ?\ "(")
 (modify-syntax-entry ?\ "(")
 (modify-syntax-entry ?\ "(")
@@ -546,9 +530,7 @@
 
 ;; JISX0201-Kana
 ;; (modify-syntax-entry (make-char 'katakana-jisx0201) "w")
-(let ((chars '(?。 ?、 ?・
-	       ;; Unicode:
-	       ?。 ?、 ?・)))
+(let ((chars '(?。 ?、 ?・)))
   (while chars
     (modify-syntax-entry (car chars) ".")
     (setq chars (cdr chars))))
@@ -558,6 +540,8 @@
 
 ;; Korean character set (KSC5601)
 
+;; Fixme: re-instate these
+
 ;; (modify-syntax-entry (make-char 'korean-ksc5601) "w")
 ;; (modify-syntax-entry (make-char 'korean-ksc5601 33) "_")
 ;; (modify-syntax-entry (make-char 'korean-ksc5601 34) "_")
@@ -573,41 +557,17 @@
 ;; (modify-category-entry (make-char 'korean-ksc5601 43) ?K)
 ;; (modify-category-entry (make-char 'korean-ksc5601 44) ?Y)
 
-;; Latin character set (latin-1,2,3,4,5,8,9)
+;; Latin
 
-;; (modify-category-entry (make-char 'latin-iso8859-1) ?l)
-;; (modify-category-entry (make-char 'latin-iso8859-2) ?l)
-;; (modify-category-entry (make-char 'latin-iso8859-3) ?l)
-;; (modify-category-entry (make-char 'latin-iso8859-4) ?l)
-;; (modify-category-entry (make-char 'latin-iso8859-9) ?l)
-;; (modify-category-entry (make-char 'latin-iso8859-14) ?l)
-;; (modify-category-entry (make-char 'latin-iso8859-15) ?l)
-
-;; (modify-category-entry (make-char 'latin-iso8859-1 160) ?\ )
-;; (modify-category-entry (make-char 'latin-iso8859-2 160) ?\ )
-;; (modify-category-entry (make-char 'latin-iso8859-3 160) ?\ )
-;; (modify-category-entry (make-char 'latin-iso8859-4 160) ?\ )
-;; (modify-category-entry (make-char 'latin-iso8859-9 160) ?\ )
-;; (modify-category-entry (make-char 'latin-iso8859-14 160) ?\ )
-;; (modify-category-entry (make-char 'latin-iso8859-15 160) ?\ )
+(modify-category-entry '(#x80 . #x024F) ?l)
 
 ;; Lao character set
 
-;; (modify-category-entry (make-char 'lao) ?o)
-(dotimes (i (1+ (- #xeff #xe80)))
-  (modify-category-entry (decode-char 'ucs (+ i #xe80)) ?o))
+(modify-category-entry '(#xe80 . #xeff) ?o)
+(map-charset-chars #'modify-category-entry 'lao ?o)
 
-(let ((deflist	'(;; chars	syntax	category
-		  ("ກ-ຮ"	"w"	?0) ; consonant
-		  ("ະາຳຽເ-ໄ"	"w"	?1) ; vowel base
-		  ("ັິ-ືົໍ"	"w"	?2) ; vowel upper
-		  ("ຸູ"		"w"	?3) ; vowel lower
-		  ("່-໌"	"w"	?4) ; tone mark 
-		  ("ຼ"		"w"	?9) ; semivowel lower
-		  ("໐-໙"	"w"	?6) ; digit
-		  ("ຯໆ"		"_"	?5) ; symbol
-		  ;; Unicode equivalents
-		  ("ກ-ຮ"	"w"	?0) ; consonant
+;; Fixme: check this.  Lao characters in HELLO seem to have all the categories
+(let ((deflist	'(("ກ-ຮ"	"w"	?0) ; consonant
 		  ("ະາຳຽເ-ໄ"	"w"	?1) ; vowel base
 		  ("ັິ-ືົໍ"	"w"	?2) ; vowel upper
 		  ("ຸູ"	"w"	?3) ; vowel lower
@@ -640,9 +600,8 @@
 
 ;; Thai character set (TIS620)
 
-;; (modify-category-entry (make-char 'thai-tis620) ?t)
-(dotimes (i (1+ (- #xe7f #xe00)))
-  (modify-category-entry (decode-char 'ucs (+ i #xe00)) ?t))
+(modify-category-entry '(#xe00 . #xe7f) ?t)
+(map-charset-chars #'modify-category-entry 'thai-tis620 ?t)
 
 (let ((deflist	'(;; chars	syntax	category
 		  ("ก-รลว-ฮ"	"w"	?0) ; consonant
@@ -677,10 +636,9 @@
 
 ;; Tibetan character set
 
-;; (modify-category-entry (make-char 'tibetan) ?q)
-;; (modify-category-entry (make-char 'tibetan-1-column) ?q)
-(dotimes (i (1+ (- #xfff #xf00)))
-  (modify-category-entry (decode-char 'ucs (+ i #xf00)) ?q))
+(modify-category-entry '(#xf00 . #xfff) ?q)
+(map-charset-chars #'modify-category-entry 'tibetan ?q)
+(map-charset-chars #'modify-category-entry 'tibetan-1-column ?q)
 
 (let ((deflist	'(;; chars             syntax category
 		  ("-"        	"w"	?0) ; consonant
@@ -697,18 +655,6 @@
 		  (""            "."     ?>) ;
 		  ("-"      "."     ?<) ; prohibition
 		  ("----" "." ?q) ; others
-
-		  ;; Unicode version (not complete)
-		  ("ཀ-ཀྵཪ"        	"w"	?0) ; consonant
-		  ("ྐ-ྐྵྺྻྼ"       "w"     ?0) ;
-		  ("ིེཻོཽྀ"       "w"	?2) ; upper vowel
-		  ("ཾྂྃ྆྇ྈྉྊྋ" "w"	?2) ; upper modifier
-		  ("༙཰྄ཱུ༵༷"       "w"	?3) ; lowel vowel/modifier
-		  ("༠-༩༪-༳"	        "w"	?6) ; digit
-		  ("་།-༒༔ཿ"        "."     ?|) ; line-break char
-		  ("༈་།-༒༔ཿ༽༴"  "."     ?>) ; prohibition
-		  ("ༀ-༊༼࿁࿂྅"      "."     ?<) ; prohibition
-		  ("༓༕-༘༚-༟༶༸-༻༾༿྾྿-࿏" "." ?q) ; others
 		  ))
       elm chars len syntax category to ch i)
   (while deflist
@@ -734,15 +680,13 @@
 
 ;; Vietnamese character set
 
-;; (let ((lower (make-char 'vietnamese-viscii-lower))
-;;      (upper (make-char 'vietnamese-viscii-upper)))
-;;   (modify-syntax-entry lower "w")
-;;   (modify-syntax-entry upper "w")
-;;  (modify-category-entry lower ?v)
-;;  (modify-category-entry upper ?v)
-;;  (modify-category-entry lower ?l)	; To make a word with
-;;  (modify-category-entry upper ?l)	; latin characters.
-;;  )
+;; To make a word with Latin characters
+(map-charset-chars #'modify-category-entry 'vietnamese-viscii-lower ?l)
+(map-charset-chars #'modify-category-entry 'vietnamese-viscii-lower ?v)
+
+(map-charset-chars #'modify-category-entry 'vietnamese-viscii-upper ?l)
+(map-charset-chars #'modify-category-entry 'vietnamese-viscii-upper ?v)
+;; Fixme Unicode versions of Vietnamese categeory.
 
 (let ((tbl (standard-case-table))
       (i 32))
@@ -766,17 +710,14 @@
   ;; Latin Extended-A, Latin Extended-B
   (setq c #x0100)
   (while (<= c #x0233)
-    (modify-category-entry (decode-char 'ucs c) ?l)
     (and (or (<= c #x012e)
 	     (and (>= c #x014a) (<= c #x0177)))
 	 (zerop (% c 2))
-	 (set-case-syntax-pair
-	  (decode-char 'ucs c) (decode-char 'ucs (1+ c)) tbl))
+	 (set-case-syntax-pair c (1+ c) tbl))
     (and (>= c #x013a)
 	 (<= c #x0148)
 	 (zerop (% c 2))
-	 (set-case-syntax-pair
-	  (decode-char 'ucs (1- c)) (decode-char 'ucs c) tbl))
+	 (set-case-syntax-pair (1- c) c tbl))
     (setq c (1+ c)))
   (set-case-syntax-pair ?IJ ?ij tbl)
   (set-case-syntax-pair ?Ĵ ?ĵ tbl)
@@ -883,28 +824,25 @@
   (set-case-syntax-pair ?Ȳ ?ȳ tbl)
 
   ;; Latin Extended Additional
+  (modify-category-entry '(#x1e00 . #x1ef9) ?l)
   (setq c #x1e00)
   (while (<= c #x1ef9)
-    (modify-category-entry (decode-char 'ucs c) ?l)
     (and (zerop (% c 2))
 	 (or (<= c #x1e94) (>= c #x1ea0))
-	 (set-case-syntax-pair
-	  (decode-char 'ucs c) (decode-char 'ucs (1+ c)) tbl))
+	 (set-case-syntax-pair c (1+ c) tbl))
     (setq c (1+ c)))
 
   ;; Greek
+  (modify-category-entry '(#x0370 . #x03ff) ?g)
   (setq c #x0370)
   (while (<= c #x03ff)
-    (modify-category-entry (decode-char 'ucs c) ?g)
     (if (or (and (>= c #x0391) (<= c #x03a1))
 	    (and (>= c #x03a3) (<= c #x03ab)))
-	(set-case-syntax-pair
-	 (decode-char 'ucs c) (decode-char 'ucs (+ c 32)) tbl))
+	(set-case-syntax-pair c (+ c 32) tbl))
     (and (>= c #x03da)
 	 (<= c #x03ee)
 	 (zerop (% c 2))
-	 (set-case-syntax-pair
-	  (decode-char 'ucs c) (decode-char 'ucs (1+ c)) tbl))
+	 (set-case-syntax-pair c (1+ c) tbl))
     (setq c (1+ c)))
   (set-case-syntax-pair ?Ά ?ά tbl)
   (set-case-syntax-pair ?Έ ?έ tbl)
@@ -917,20 +855,18 @@
   ;; Armenian
   (setq c #x531)
   (while (<= c #x556)
-    (set-case-syntax-pair (decode-char 'ucs c)
-			  (decode-char 'ucs (+ c #x30)) tbl)
+    (set-case-syntax-pair c (+ c #x30) tbl)
     (setq c (1+ c)))
 
   ;; Greek Extended
+  (modify-category-entry '(#x1f00 . #x1fff) ?g)
   (setq c #x1f00)
   (while (<= c #x1fff)
-    (modify-category-entry (decode-char 'ucs c) ?g)
     (and (<= (logand c #x000f) 7)
 	 (<= c #x1fa7)
 	 (not (memq c '(#x1f50 #x1f52 #x1f54 #x1f56)))
 	 (/= (logand c #x00f0) 7)
-	 (set-case-syntax-pair
-	  (decode-char 'ucs (+ c 8)) (decode-char 'ucs c) tbl))
+	 (set-case-syntax-pair (+ c 8) c tbl))
     (setq c (1+ c)))
   (set-case-syntax-pair ?Ᾰ ?ᾰ tbl)
   (set-case-syntax-pair ?Ᾱ ?ᾱ tbl)
@@ -958,23 +894,20 @@
   (set-case-syntax-pair ?ῼ ?ῳ tbl)
 
   ;; cyrillic
+  (modify-category-entry '(#x0400 . #x04FF) ?y)
   (setq c #x0400)
   (while (<= c #x04ff)
-    (modify-category-entry (decode-char 'ucs c) ?y)
     (and (>= c #x0400)
 	 (<= c #x040f)
-	 (set-case-syntax-pair
-	  (decode-char 'ucs c) (decode-char 'ucs (+ c 80)) tbl))
+	 (set-case-syntax-pair c (+ c 80) tbl))
     (and (>= c #x0410)
 	 (<= c #x042f)
-	 (set-case-syntax-pair
-	  (decode-char 'ucs c) (decode-char 'ucs (+ c 32)) tbl))
+	 (set-case-syntax-pair c (+ c 32) tbl))
     (and (zerop (% c 2))
 	 (or (and (>= c #x0460) (<= c #x0480))
 	     (and (>= c #x048c) (<= c #x04be))
 	     (and (>= c #x04d0) (<= c #x04f4)))
-	 (set-case-syntax-pair
-	  (decode-char 'ucs c) (decode-char 'ucs (1+ c)) tbl))	 
+	 (set-case-syntax-pair c (1+ c) tbl))	 
     (setq c (1+ c)))
   (set-case-syntax-pair ?Ӂ ?ӂ tbl)
   (set-case-syntax-pair ?Ӄ ?ӄ tbl)
@@ -995,45 +928,35 @@
   ;; Roman numerals
   (setq c #x2160)
   (while (<= c #x216f)
-    (set-case-syntax-pair (decode-char 'ucs c)
-			  (decode-char 'ucs (+ c #x10)) tbl)
+    (set-case-syntax-pair c (+ c #x10) tbl)
     (setq c (1+ c)))
 
   ;; Circled Latin
   (setq c #x24b6)
   (while (<= c #x24cf)
-    (set-case-syntax-pair (decode-char 'ucs c)
-			  (decode-char 'ucs (+ c 26)) tbl)
-    (modify-category-entry (decode-char 'ucs c) ?l)
-    (modify-category-entry (decode-char 'ucs (+ c 26)) ?l)
+    (set-case-syntax-pair c (+ c 26) tbl)
+    (modify-category-entry c ?l)
+    (modify-category-entry (+ c 26) ?l)
     (setq c (1+ c)))
 
   ;; Fullwidth Latin
   (setq c #xff21)
   (while (<= c #xff3a)
-    (set-case-syntax-pair (decode-char 'ucs c)
-			  (decode-char 'ucs (+ c #x20)) tbl)
-    (modify-category-entry (decode-char 'ucs c) ?l)
-    (modify-category-entry (decode-char 'ucs (+ c #x20)) ?l)
+    (set-case-syntax-pair c (+ c #x20) tbl)
+    (modify-category-entry c ?l)
+    (modify-category-entry (+ c #x20) ?l)
     (setq c (1+ c)))
 
   ;; Ohm, Kelvin, Angstrom
-  (set-case-syntax-pair ?Ω ?ω tbl)
+;;;  (set-case-syntax-pair ?Ω ?ω tbl)
 ;;;  These mess up the case conversion of k and å.
 ;;;  (set-case-syntax-pair ?K ?k tbl)
 ;;;  (set-case-syntax-pair ?Å ?å tbl)
 
   ;; Combining diacritics
-  (setq c #x300)
-  (while (<= c #x362)
-    (modify-category-entry (decode-char 'ucs c) ?^)
-    (setq c (1+ c)))
-
+  (modify-category-entry '(#x300 . #x362) ?^)
   ;; Combining marks
-  (setq c #x20d0)
-  (while (<= c #x20e3)
-    (modify-category-entry (decode-char 'ucs c) ?^)
-    (setq c (1+ c)))
+  (modify-category-entry '(#x20d0 . #x20e3) ?^)
 
   ;; Fixme: syntax for symbols &c
   )
@@ -1059,6 +982,7 @@
 ;; For each character set, put the information of the most proper
 ;; coding system to encode it by `preferred-coding-system' property.
 
+;; Fixme: should this be junked?
 (let ((l '((latin-iso8859-1	. iso-latin-1)
 	   (latin-iso8859-2	. iso-latin-2)
 	   (latin-iso8859-3	. iso-latin-3)
@@ -1131,8 +1055,7 @@
 	   (#xFFE0 . #xFFEF))))
   (dolist (elt l)
     (set-char-table-range char-width-table
-			  (cons (decode-char 'ucs (car elt))
-				(decode-char 'ucs (cdr elt)))
+			  (cons (car elt) (cdr elt))
 			  2)))
 (map-charset-chars
  #'(lambda (range ignore) (set-char-table-range char-width-table range 2))