comparison lisp/international/mule.el @ 53240:ee5206ee4439

(ctext-non-standard-encodings-alist): Change the format. (ctext-non-standard-encodings): New variable. (ctext-post-read-conversion): Fully re-written. (ctext-non-standard-designations-alist): Delete it. (ctext-non-standard-encodings-table): New function. (ctext-pre-write-conversion): Fully re-written.
author Kenichi Handa <handa@m17n.org>
date Wed, 03 Dec 2003 08:24:42 +0000
parents 810931aa5f2d
children 5c66f1de4907
comparison
equal deleted inserted replaced
53239:82690620d562 53240:ee5206ee4439
1328 (set-coding-priority-internal))) 1328 (set-coding-priority-internal)))
1329 1329
1330 ;;; X selections 1330 ;;; X selections
1331 1331
1332 (defvar ctext-non-standard-encodings-alist 1332 (defvar ctext-non-standard-encodings-alist
1333 '(("ISO8859-15" . iso-8859-15) 1333 '(("big5-0" big5 2 (chinese-big5-1 chinese-big5-2))
1334 ("ISO8859-14" . iso-8859-14) 1334 ("ISO8859-14" iso-8859-14 1 latin-iso8859-14)
1335 ("KOI8-R" . koi8-r) 1335 ("ISO8859-15" iso-8859-15 1 latin-iso8859-15))
1336 ("BIG5-0" . big5)) 1336 "Alist of non-standard encoding names vs the corresponding usages in CTEXT.
1337 "Alist of non-standard encoding names vs Emacs coding systems. 1337
1338 This alist is used to decode an extened segment of a compound text.") 1338 It controls how extended segments of a compound text are handled
1339 by the coding system `compound-text-with-extensions'.
1340
1341 Each element has the form (ENCODING-NAME CODING-SYSTEM N-OCTET CHARSET).
1342
1343 ENCODING-NAME is an encoding name of an \"extended segments\".
1344
1345 CODING-SYSTEM is the coding-system to encode (or decode) the
1346 characters into (or from) the extended segment.
1347
1348 N-OCTET is the number of octets (bytes) that encodes a character
1349 in the segment. It can be 0 (meaning the number of octets per
1350 character is variable), 1, 2, 3, or 4.
1351
1352 CHARSET is a charater set containing characters that are encoded
1353 in the segment. It can be a list of character sets. It can also
1354 be a char-table, in which case characters that have non-nil value
1355 in the char-table are the target.
1356
1357 On decoding CTEXT, all encoding names listed here are recognized.
1358
1359 On encoding CTEXT, encoding names in the variable
1360 `ctext-non-standard-encodings' (which see) and in the information
1361 listed for the current language environment under the key
1362 `ctext-non-standard-encodings' are used.")
1363
1364 (defvar ctext-non-standard-encodings
1365 '("big5-0")
1366 "List of non-standard encoding names used in extended segments of CTEXT.
1367 Each element must be one of the names listed in the variable
1368 `ctext-non-standard-encodings-alist' (which see).")
1339 1369
1340 (defvar ctext-non-standard-encodings-regexp 1370 (defvar ctext-non-standard-encodings-regexp
1341 (string-to-multibyte 1371 (string-to-multibyte
1342 (concat 1372 (concat
1343 ;; For non-standard encodings. 1373 ;; For non-standard encodings.
1345 "\\|" 1375 "\\|"
1346 ;; For UTF-8 encoding. 1376 ;; For UTF-8 encoding.
1347 "\\(\e%G[^\e]*\e%@\\)"))) 1377 "\\(\e%G[^\e]*\e%@\\)")))
1348 1378
1349 ;; Functions to support "Non-Standard Character Set Encodings" defined 1379 ;; Functions to support "Non-Standard Character Set Encodings" defined
1350 ;; by the COMPOUND-TEXT spec. 1380 ;; by the COMPOUND-TEXT spec. They also support "The UTF-8 encoding"
1351 ;; We support that by decoding the whole data by `ctext' which just 1381 ;; described in the section 7 of the documentation of COMPOUND-TEXT
1352 ;; pertains byte sequences belonging to ``extended segment'', then 1382 ;; distributed with XFree86.
1353 ;; decoding those byte sequences one by one in Lisp.
1354 ;; This function also supports "The UTF-8 encoding" described in the
1355 ;; section 7 of the documentation fo COMPOUND-TEXT distributed with
1356 ;; XFree86.
1357 1383
1358 (defun ctext-post-read-conversion (len) 1384 (defun ctext-post-read-conversion (len)
1359 "Decode LEN characters encoded as Compound Text with Extended Segments." 1385 "Decode LEN characters encoded as Compound Text with Extended Segments."
1360 (save-match-data 1386 (save-match-data
1361 (save-restriction 1387 (save-restriction
1363 (in-workbuf (string= (buffer-name) " *code-converting-work*")) 1389 (in-workbuf (string= (buffer-name) " *code-converting-work*"))
1364 last-coding-system-used 1390 last-coding-system-used
1365 pos bytes) 1391 pos bytes)
1366 (or in-workbuf 1392 (or in-workbuf
1367 (narrow-to-region (point) (+ (point) len))) 1393 (narrow-to-region (point) (+ (point) len)))
1368 (decode-coding-region (point-min) (point-max) 'ctext)
1369 (if in-workbuf 1394 (if in-workbuf
1370 (set-buffer-multibyte t)) 1395 (set-buffer-multibyte t))
1371 (while (re-search-forward ctext-non-standard-encodings-regexp 1396 (while (re-search-forward ctext-non-standard-encodings-regexp
1372 nil 'move) 1397 nil 'move)
1373 (setq pos (match-beginning 0)) 1398 (setq pos (match-beginning 0))
1374 (if (match-beginning 1) 1399 (if (match-beginning 1)
1375 ;; ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES-- 1400 ;; ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
1376 (let* ((M (char-after (+ pos 4))) 1401 (let* ((M (char-after (+ pos 4)))
1377 (L (char-after (+ pos 5))) 1402 (L (char-after (+ pos 5)))
1378 (encoding (match-string 2)) 1403 (encoding (match-string 2))
1379 (coding (or (cdr (assoc-ignore-case 1404 (encoding-info (assoc-ignore-case
1380 encoding 1405 encoding
1381 ctext-non-standard-encodings-alist)) 1406 ctext-non-standard-encodings-alist))
1382 (coding-system-p 1407 (coding (if encoding-info
1383 (intern (downcase encoding)))))) 1408 (nth 1 encoding-info)
1409 (setq encoding (intern (downcase encoding)))
1410 (and (coding-system-p encoding)
1411 encoding))))
1384 (setq bytes (- (+ (* (- M 128) 128) (- L 128)) 1412 (setq bytes (- (+ (* (- M 128) 128) (- L 128))
1385 (- (point) (+ pos 6)))) 1413 (- (point) (+ pos 6))))
1386 (when coding 1414 (when coding
1387 (delete-region pos (point)) 1415 (delete-region pos (point))
1388 (forward-char bytes) 1416 (forward-char bytes)
1389 (decode-coding-region (- (point) bytes) (point) coding))) 1417 (decode-coding-region (- (point) bytes) (point) coding)))
1390 ;; ESC % G --UTF-8-BYTES-- ESC % @ 1418 ;; ESC % G --UTF-8-BYTES-- ESC % @
1391 (setq bytes (- (point) pos)) 1419 (delete-char -3)
1392 (decode-coding-region (- (point) bytes) (point) 'utf-8)))) 1420 (delete-region pos (+ pos 3))
1421 (decode-coding-region pos (point) 'utf-8))))
1393 (goto-char (point-min)) 1422 (goto-char (point-min))
1394 (- (point-max) (point))))) 1423 (- (point-max) (point)))))
1395 1424
1396 ;; From X registry 2001/06/01 1425 ;; Return a char table of extended segment usage for each character.
1397 ;; 20. NON-STANDARD CHARACTER SET ENCODINGS 1426 ;; Each value of the char table is nil, one of the elements of
1398 1427 ;; `ctext-non-standard-encodings-alist', or the symbol `utf-8'.
1399 ;; See Section 6 of the Compound Text standard. 1428
1400 1429 (defun ctext-non-standard-encodings-table ()
1401 ;; Name Reference 1430 (let ((table (make-char-table 'translation-table)))
1402 ;; ---- --------- 1431 (aset table (make-char 'mule-unicode-0100-24ff) 'utf-8)
1403 ;; "DEC.CNS11643.1986-2" [53] 1432 (aset table (make-char 'mule-unicode-2500-33ff) 'utf-8)
1404 ;; CNS11643 2-plane using the recommended 1433 (aset table (make-char 'mule-unicode-e000-ffff) 'utf-8)
1405 ;; internal representation scheme 1434 (dolist (encoding (reverse
1406 ;; "DEC.DTSCS.1990-2" [54] 1435 (append
1407 ;; DEC Taiwan Supplemental Character Set 1436 (get-language-info current-language-environment
1408 ;; "fujitsu.u90x03" [87] 1437 'ctext-non-standard-encodings)
1409 ;; "ILA" [62] 1438 ctext-non-standard-encodings)))
1410 ;; registry prefix 1439 (let* ((slot (assoc encoding ctext-non-standard-encodings-alist))
1411 ;; "IPSYS" [59] 1440 (charset (nth 3 slot)))
1412 ;; registry prefix 1441 (if charset
1413 ;; "omron_UDC" [45] 1442 (cond ((charsetp charset)
1414 ;; omron User Defined Charset 1443 (aset table (make-char charset) slot))
1415 ;; "omron_UDC_ja" [45] 1444 ((listp charset)
1416 ;; omron User Defined Charset for Japanese 1445 (dolist (elt charset)
1417 ;; "omron_UDC_zh" [45] 1446 (aset table (make-char elt) slot)))
1418 ;; omron User Defined Charset for Chinese(Main land) 1447 ((char-table-p charset)
1419 ;; "omron_UDC_tw" [45] 1448 (map-char-table #'(lambda (k v)
1420 ;; omron User Defined Charset for Chinese(Taiwan) 1449 (if (and v (> k 128)) (aset table k slot)))
1421 1450 charset))))))
1422 ;; If you add charsets here, be sure to modify the regexp used by 1451 table))
1423 ;; ctext-pre-write-conversion to look up non-standard charsets.
1424 (defvar ctext-non-standard-designations-alist
1425 '(("$(0" . (big5 "big5-0" 2))
1426 ("$(1" . (big5 "big5-0" 2))
1427 ;; The following are actually standard; generating extended
1428 ;; segments for them is wrong and screws e.g. Latin-9 users.
1429 ;; 8859-{10,13,16} aren't Emacs charsets anyhow. -- fx
1430 ;; ("-V" . (t "iso8859-10" 1))
1431 ;; ("-Y" . (t "iso8859-13" 1))
1432 ;; ("-_" . (t "iso8859-14" 1))
1433 ;; ("-b" . (t "iso8859-15" 1))
1434 ;; ("-f" . (t "iso8859-16" 1))
1435 )
1436 "Alist of ctext control sequences that introduce character sets which
1437 are not in the list of approved encodings, and the corresponding
1438 coding system, identifier string, and number of octets per encoded
1439 character.
1440
1441 Each element has the form (CTLSEQ . (ENCODING CHARSET NOCTETS)). CTLSEQ
1442 is the control sequence (sans the leading ESC) that introduces the character
1443 set in the text encoded by compound-text. ENCODING is a coding system
1444 symbol; if it is t, it means that the ctext coding system already encodes
1445 the text correctly, and only the leading control sequence needs to be altered.
1446 If ENCODING is a coding system, we need to re-encode the text with that
1447 coding system. CHARSET is the name of the charset we need to put into
1448 the leading control sequence. NOCTETS is the number of octets (bytes) that
1449 encode each character in this charset. NOCTETS can be 0 (meaning the number
1450 of octets per character is variable), 1, 2, 3, or 4.")
1451 1452
1452 (defun ctext-pre-write-conversion (from to) 1453 (defun ctext-pre-write-conversion (from to)
1453 "Encode characters between FROM and TO as Compound Text w/Extended Segments. 1454 "Encode characters between FROM and TO as Compound Text w/Extended Segments.
1454 1455
1455 If FROM is a string, or if the current buffer is not the one set up for us 1456 If FROM is a string, or if the current buffer is not the one set up for us
1468 (set-buffer (generate-new-buffer " *temp")) 1469 (set-buffer (generate-new-buffer " *temp"))
1469 (set-buffer-multibyte multibyte) 1470 (set-buffer-multibyte multibyte)
1470 (insert-buffer-substring buf from to)))) 1471 (insert-buffer-substring buf from to))))
1471 1472
1472 ;; Now we can encode the whole buffer. 1473 ;; Now we can encode the whole buffer.
1473 (let ((case-fold-search nil) 1474 (let ((encoding-table (ctext-non-standard-encodings-table))
1474 last-coding-system-used 1475 last-coding-system-used
1475 pos posend desig encode-info encoding chset noctets textlen) 1476 last-pos last-encoding-info
1476 (goto-char (point-min)) 1477 encoding-info end-pos)
1477 ;; At first encode the whole buffer. 1478 (goto-char (setq last-pos (point-min)))
1478 (encode-coding-region (point-min) (point-max) 'ctext-no-compositions) 1479 (setq end-pos (point-marker))
1479 ;; Then replace ISO-2022 charset designations with extended 1480 (while (re-search-forward "[^\000-\177]+" nil t)
1480 ;; segments, for those charsets that are not part of the 1481 ;; Found a sequence of non-ASCII characters.
1481 ;; official X registry. The regexp below finds the leading 1482 (setq last-pos (match-beginning 0)
1482 ;; sequences for big5. 1483 last-encoding-info (aref encoding-table (char-after last-pos)))
1483 (while (re-search-forward "\e\\(\$([01]\\)" nil 'move) 1484 (set-marker end-pos (match-end 0))
1484 (setq pos (match-beginning 0) 1485 (goto-char (1+ last-pos))
1485 posend (point) 1486 (catch 'tag
1486 desig (match-string 1) 1487 (while t
1487 encode-info (cdr (assoc desig 1488 (setq encoding-info
1488 ctext-non-standard-designations-alist)) 1489 (if (< (point) end-pos)
1489 encoding (car encode-info) 1490 (aref encoding-table (following-char))))
1490 chset (cadr encode-info) 1491 (unless (eq last-encoding-info encoding-info)
1491 noctets (car (cddr encode-info))) 1492 (cond ((consp last-encoding-info)
1492 (skip-chars-forward "^\e") 1493 ;; Encode the previous range using an extended
1493 (cond 1494 ;; segment.
1494 ((eq encoding t) ; only the leading sequence needs to be changed 1495 (let ((encoding-name (car last-encoding-info))
1495 (setq textlen (+ (- (point) posend) (length chset) 1)) 1496 (coding-system (nth 1 last-encoding-info))
1496 ;; Generate the control sequence for an extended segment. 1497 (noctets (nth 2 last-encoding-info))
1497 (replace-match (format "\e%%/%d%c%c%s" 1498 len)
1498 noctets 1499 (encode-coding-region last-pos (point) coding-system)
1499 (+ (/ textlen 128) 128) 1500 (setq len (+ (length encoding-name) 1
1500 (+ (% textlen 128) 128) 1501 (- (point) last-pos)))
1501 chset) 1502 (save-excursion
1502 t t)) 1503 (goto-char last-pos)
1503 ((coding-system-p encoding) ; need to recode the entire segment... 1504 (insert (string-to-multibyte
1504 (decode-coding-region pos (point) 'ctext-no-compositions) 1505 (format "\e%%/%d%c%c%s"
1505 (encode-coding-region pos (point) encoding) 1506 noctets
1506 (setq textlen (+ (- (point) pos) (length chset) 1)) 1507 (+ (/ len 128) 128)
1507 (save-excursion 1508 (+ (% len 128) 128)
1508 (goto-char pos) 1509 encoding-name))))))
1509 (insert (format "\e%%/%d%c%c%s" 1510 ((eq last-encoding-info 'utf-8)
1510 noctets 1511 ;; Encode the previous range using UTF-8 encoding
1511 (+ (/ textlen 128) 128) 1512 ;; extention.
1512 (+ (% textlen 128) 128) 1513 (encode-coding-region last-pos (point) 'mule-utf-8)
1513 chset)))))) 1514 (save-excursion
1515 (goto-char last-pos)
1516 (insert "\e%G"))
1517 (insert "\e%@")))
1518 (setq last-pos (point)
1519 last-encoding-info encoding-info))
1520 (if (< (point) end-pos)
1521 (forward-char 1)
1522 (throw 'tag nil)))))
1523 (set-marker end-pos nil)
1514 (goto-char (point-min)))) 1524 (goto-char (point-min))))
1515 ;; Must return nil, as build_annotations_2 expects that. 1525 ;; Must return nil, as build_annotations_2 expects that.
1516 nil) 1526 nil)
1517 1527
1518 ;;; FILE I/O 1528 ;;; FILE I/O