changeset 88767:b861d4bbb70f

(find-multibyte-characters): Doc fix. (find-multibyte-characters): Don't test for charset `unknown'. (locale-language-names): Change or add: be, bs, cy, mk, ru.koi8, ru, sr_YU, tg, wa, zh.gbk. (locale-language-names): Change sp to Cyrillic. (locale-charset-language-names): Match @euro after utf-8.
author Dave Love <fx@gnu.org>
date Fri, 14 Jun 2002 23:23:24 +0000
parents 4eeed541231e
children f91d6a22179b
files lisp/international/mule-cmds.el
diffstat 1 files changed, 20 insertions(+), 25 deletions(-) [+]
line wrap: on
line diff
--- a/lisp/international/mule-cmds.el	Fri Jun 14 23:23:08 2002 +0000
+++ b/lisp/international/mule-cmds.el	Fri Jun 14 23:23:24 2002 +0000
@@ -461,6 +461,7 @@
 	   (append codings
 		   (char-table-extra-slot char-coding-system-table 0))))))
 
+;; Fixme: is this doing the right thing now, at least with eight-bit?
 (defun find-multibyte-characters (from to &optional maxcount excludes)
   "Find multibyte characters in the region specified by FROM and TO.
 If FROM is a string, find multibyte characters in the string.
@@ -471,9 +472,7 @@
   COUNT is a number of characters,
   CHARs are found characters of the character set.
 Optional 3rd arg MAXCOUNT limits how many CHARs are put in the above list.
-Optional 4th arg EXCLUDE is a list of character sets to be ignored.
-
-For invalid characters, CHARs are actually strings."
+Optional 4th arg EXCLUDES is a list of character sets to be ignored."
   (let ((chars nil)
 	charset char)
     (if (stringp from)
@@ -481,10 +480,7 @@
 	  (while (setq idx (string-match "[^\000-\177]" from idx))
 	    (setq char (aref from idx)
 		  charset (char-charset char))
-	    (if (eq charset 'unknown)
-		(setq char (match-string 0)))
-	    (if (or (memq charset '(unknown
-				    eight-bit-control eight-bit-graphic))
+	    (if (or (memq charset '(eight-bit-control eight-bit-graphic))
 		    (not (or (eq excludes t) (memq charset excludes))))
 		(let ((slot (assq charset chars)))
 		  (if slot
@@ -500,9 +496,7 @@
 	(while (re-search-forward "[^\000-\177]" to t)
 	  (setq char (preceding-char)
 		charset (char-charset char))
-	  (if (eq charset 'unknown)
-	      (setq char (match-string 0)))
-	  (if (or (memq charset '(unknown eight-bit-control eight-bit-graphic))
+	  (if (or (memq charset '(eight-bit-control eight-bit-graphic))
 		  (not (or (eq excludes t) (memq charset excludes))))
 	      (let ((slot (assq charset chars)))
 		(if slot
@@ -1641,17 +1635,18 @@
     ; ay Aymara
     ; az Azerbaijani
     ; ba Bashkir
-    ("be" . "Belarussian") ; Belarussian [Byelorussian]
+    ("be" . "Belarusian") ; Belarusian [Byelorussian until early 1990s]
     ("bg" . "Bulgarian") ; Bulgarian
     ; bh Bihari
     ; bi Bislama
     ; bn Bengali, Bangla
     ("bo" . "Tibetan")
     ("br" . "Latin-1") ; Breton
+    ("bs" . "Latin-2") ; Bosnian
     ("ca" . "Latin-1") ; Catalan
     ; co Corsican
     ("cs" . "Czech")
-    ("cy" . "Latin-8") ; Welsh
+    ("cy" . "Welsh") ; Welsh
     ("da" . "Latin-1") ; Danish
     ("de" . "German")
     ; dz Bhutani
@@ -1662,7 +1657,7 @@
     ("es" . "Spanish")
     ("et" . "Latin-4") ; Estonian
     ("eu" . "Latin-1") ; Basque
-    ; fa Persian
+    ; fa Persian  glibc uses utf-8
     ("fi" . "Latin-1") ; Finnish
     ; fj Fiji
     ("fo" . "Latin-1") ; Faroese
@@ -1673,7 +1668,7 @@
     ("gl" . "Latin-1") ; Galician
     ; gn Guarani
     ; gu Gujarati
-    ("gv" . "Latin-8") ; Manx Gaelic
+    ("gv" . "Latin-8") ; Manx Gaelic  glibc uses 8859-1
     ; ha Hausa
     ("he" . "Hebrew")
     ("hi" . "Devanagari") ; Hindi  glibc uses utf-8
@@ -1707,7 +1702,7 @@
     ("lv" . "Latvian") ; Latvian, Lettish
     ; mg Malagasy
     ("mi" . "Latin-7") ; Maori
-    ("mk" . "Latin-5") ; Macedonian
+    ("mk" . "Cyrillic-ISO") ; Macedonian
     ; ml Malayalam
     ; mn Mongolian
     ; mo Moldavian
@@ -1730,8 +1725,8 @@
     ("rm" . "Latin-1") ; Rhaeto-Romanic
     ; rn Kirundi
     ("ro" . "Romanian")
-    ("ru.*[_.]koi8" . "Cyrillic-KOI8") ; Russian
-    ("ru" . "Latin-5") ; Russian
+    ("ru.*[_.]koi8\\(?:-r\\)?\\'" . "Cyrillic-KOI8") ; Russian
+    ("ru" . "Cyrillic-ISO") ; Russian
     ; rw Kinyarwanda
     ("sa" . "Devanagari") ; Sanskrit
     ; sd Sindhi
@@ -1746,6 +1741,7 @@
     ; so Somali
     ("sq" . "Latin-1") ; Albanian
     ("sr" . "Latin-2") ; Serbian (Latin alphabet)
+    ("sr.*@cyrillic" . "Cyrillic-ISO")	; per glibc
     ; ss Siswati
     ; st Sesotho
     ; su Sundanese
@@ -1753,7 +1749,7 @@
     ("sw" . "Latin-1") ; Swahili
     ; ta Tamil  glibc uses utf-8
     ; te Telugu  glibc uses utf-8
-    ("tg" . "Cyrillic-KOI8-T") ; Tajik
+    ("tg" . "Tajik")
     ("th" . "Thai")
     ; ti Tigrinya
     ; tk Turkmen
@@ -1770,6 +1766,7 @@
     ("uz" . "Latin-1") ; Uzbek
     ("vi" . "Vietnamese") ;  glibc uses utf-8
     ; vo Volapuk
+    ("wa" . "Latin-1") ; Walloon
     ; wo Wolof
     ; xh Xhosa
     ("yi" . "Windows-1255") ; Yiddish
@@ -1778,13 +1775,11 @@
 
     ; glibc:
     ; zh_CN.GB18030/GB18030 \
-    ; zh_CN.GBK/GBK \
     ; zh_HK/BIG5-HKSCS \
-    ; zh_TW/BIG5 \
-    ; zh_TW.EUC-TW/EUC-TW \
 
     ("zh.*[._]big5" . "Chinese-BIG5")
-    ("zh.*[._]gbk" . nil) ; Solaris 2.7; has gbk-0 as well as GB 2312.1980-0
+    ("zh.*[._].gbk" . "Chinese-GBK")
+    ;; glibc has zh_TW.EUC-TW, with zh_TW defaulting to Big5
     ("zh_tw" . "Chinese-CNS")
     ("zh" . "Chinese-GB")
     ; zu Zulu
@@ -1801,7 +1796,7 @@
     ("cz" . "Czech") ; e.g. Solaris 2.6
     ("ee" . "Latin-4") ; Estonian, e.g. X11R6.4
     ("iw" . "Hebrew") ; e.g. X11R6.4
-    ("sp" . "Latin-5") ; Serbian (Cyrillic alphabet), e.g. X11R6.4
+    ("sp" . "Cyrillic-ISO") ; Serbian (Cyrillic alphabet), e.g. X11R6.4
     ("su" . "Latin-1") ; Finnish, e.g. Solaris 2.6
     ("jp" . "Japanese") ; e.g. MS Windows
     ("chs" . "Chinese-GB") ; MS Windows Chinese Simplified
@@ -1821,8 +1816,8 @@
      (".*8859[-_]?9\\>" . "Latin-5")
      (".*8859[-_]?14\\>" . "Latin-8")
      (".*8859[-_]?15\\>" . "Latin-9")
-     (".*@euro\\>" . "Latin-9")
-     (".*utf\\(-?8\\)\\>" . "UTF-8")))
+     (".*utf\\(-?8\\)\\>" . "UTF-8")
+     (".*@euro\\>" . "Latin-9"))) ; utf-8@euro exists, so put this last
   "List of pairs of locale regexps and charset language names.
 The first element whose locale regexp matches the start of a downcased locale
 specifies the language name whose charsets corresponds to that locale.