comparison lisp/international/mule.el @ 90324:b3b869baa1c3

(ctext-non-standard-encodings-alist): Add an entry for gbk-0. Set charset `big5' in the entry for "big5-0". (ctext-post-read-conversion): Use multibyt-char-to-unibyte to read a row 8-bit. (ctext-non-standard-encodings): Initialize to nil. (ctext-non-standard-encodings-table): Return a list instead of char-table. (ctext-pre-write-conversion): Adjusted for the above change.
author Kenichi Handa <handa@m17n.org>
date Mon, 27 Feb 2006 01:24:24 +0000
parents 0f622530c46c
children 6e94ff6be848
comparison
equal deleted inserted replaced
90323:9ef82b3ff1a7 90324:b3b869baa1c3
1341 (make-obsolete 'set-coding-priority 'set-coding-system-priority "23.1") 1341 (make-obsolete 'set-coding-priority 'set-coding-system-priority "23.1")
1342 1342
1343 ;;; X selections 1343 ;;; X selections
1344 1344
1345 (defvar ctext-non-standard-encodings-alist 1345 (defvar ctext-non-standard-encodings-alist
1346 '(("big5-0" big5 2 (chinese-big5-1 chinese-big5-2)) 1346 '(("big5-0" big5 2 big5)
1347 ("ISO8859-14" iso-8859-14 1 latin-iso8859-14) 1347 ("ISO8859-14" iso-8859-14 1 latin-iso8859-14)
1348 ("ISO8859-15" iso-8859-15 1 latin-iso8859-15)) 1348 ("ISO8859-15" iso-8859-15 1 latin-iso8859-15)
1349 ("gbk-0" gbk 2 chinese-gbk))
1349 "Alist of non-standard encoding names vs the corresponding usages in CTEXT. 1350 "Alist of non-standard encoding names vs the corresponding usages in CTEXT.
1350 1351
1351 It controls how extended segments of a compound text are handled 1352 It controls how extended segments of a compound text are handled
1352 by the coding system `compound-text-with-extensions'. 1353 by the coding system `compound-text-with-extensions'.
1353 1354
1361 N-OCTET is the number of octets (bytes) that encodes a character 1362 N-OCTET is the number of octets (bytes) that encodes a character
1362 in the segment. It can be 0 (meaning the number of octets per 1363 in the segment. It can be 0 (meaning the number of octets per
1363 character is variable), 1, 2, 3, or 4. 1364 character is variable), 1, 2, 3, or 4.
1364 1365
1365 CHARSET is a charater set containing characters that are encoded 1366 CHARSET is a charater set containing characters that are encoded
1366 in the segment. It can be a list of character sets. It can also 1367 in the segment. It can be a list of character sets.
1367 be a char-table, in which case characters that have non-nil value
1368 in the char-table are the target.
1369 1368
1370 On decoding CTEXT, all encoding names listed here are recognized. 1369 On decoding CTEXT, all encoding names listed here are recognized.
1371 1370
1372 On encoding CTEXT, encoding names in the variable 1371 On encoding CTEXT, encoding names in the variable
1373 `ctext-non-standard-encodings' (which see) and in the information 1372 `ctext-non-standard-encodings' (which see) and in the information
1374 listed for the current language environment under the key 1373 listed for the current language environment under the key
1375 `ctext-non-standard-encodings' are used.") 1374 `ctext-non-standard-encodings' are used.")
1376 1375
1377 (defvar ctext-non-standard-encodings 1376 (defvar ctext-non-standard-encodings nil
1378 '("big5-0")
1379 "List of non-standard encoding names used in extended segments of CTEXT. 1377 "List of non-standard encoding names used in extended segments of CTEXT.
1380 Each element must be one of the names listed in the variable 1378 Each element must be one of the names listed in the variable
1381 `ctext-non-standard-encodings-alist' (which see).") 1379 `ctext-non-standard-encodings-alist' (which see).")
1382 1380
1383 (defvar ctext-non-standard-encodings-regexp 1381 (defvar ctext-non-standard-encodings-regexp
1410 (while (re-search-forward ctext-non-standard-encodings-regexp 1408 (while (re-search-forward ctext-non-standard-encodings-regexp
1411 nil 'move) 1409 nil 'move)
1412 (setq pos (match-beginning 0)) 1410 (setq pos (match-beginning 0))
1413 (if (match-beginning 1) 1411 (if (match-beginning 1)
1414 ;; ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES-- 1412 ;; ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
1415 (let* ((M (char-after (+ pos 4))) 1413 (let* ((M (multibyte-char-to-unibyte (char-after (+ pos 4))))
1416 (L (char-after (+ pos 5))) 1414 (L (multibyte-char-to-unibyte (char-after (+ pos 5))))
1417 (encoding (match-string 2)) 1415 (encoding (match-string 2))
1418 (encoding-info (assoc-string 1416 (encoding-info (assoc-string
1419 encoding 1417 encoding
1420 ctext-non-standard-encodings-alist t)) 1418 ctext-non-standard-encodings-alist t))
1421 (coding (if encoding-info 1419 (coding (if encoding-info
1434 (delete-region pos (+ pos 3)) 1432 (delete-region pos (+ pos 3))
1435 (decode-coding-region pos (point) 'utf-8)))) 1433 (decode-coding-region pos (point) 'utf-8))))
1436 (goto-char (point-min)) 1434 (goto-char (point-min))
1437 (- (point-max) (point))))) 1435 (- (point-max) (point)))))
1438 1436
1439 ;; Return a char table of extended segment usage for each character. 1437 ;; Return an alist of CHARSET vs CTEXT-USAGE-INFO generated from
1440 ;; Each value of the char table is nil, one of the elements of 1438 ;; `ctext-non-standard-encodings' and a list specified by the key
1441 ;; `ctext-non-standard-encodings-alist', or the symbol `utf-8'. 1439 ;; `ctext-non-standard-encodings' for the currrent language
1440 ;; environment. CTEXT-USAGE-INFO is one of the element of
1441 ;; `ctext-non-standard-encodings-alist' or nil. In the former case, a
1442 ;; character in CHARSET is encoded using extended segment. In the
1443 ;; latter case, a character in CHARSET is encoded using normal ISO2022
1444 ;; designation sequence. If a character is not in any of CHARSETs, it
1445 ;; is encoded using UTF-8 encoding extention.
1442 1446
1443 (defun ctext-non-standard-encodings-table () 1447 (defun ctext-non-standard-encodings-table ()
1444 (let ((table (make-char-table 'translation-table))) 1448 (let (table)
1445 (aset table (make-char 'mule-unicode-0100-24ff) 'utf-8) 1449 ;; Setup charsets specified in `ctext-non-standard-encodings' and
1446 (aset table (make-char 'mule-unicode-2500-33ff) 'utf-8) 1450 ;; by the key `ctext-non-standard-encodings' for the current
1447 (aset table (make-char 'mule-unicode-e000-ffff) 'utf-8) 1451 ;; language environment.
1448 (dolist (encoding (reverse 1452 (dolist (encoding (append
1449 (append 1453 ctext-non-standard-encodings
1450 (get-language-info current-language-environment 1454 (get-language-info current-language-environment
1451 'ctext-non-standard-encodings) 1455 'ctext-non-standard-encodings)))
1452 ctext-non-standard-encodings)))
1453 (let* ((slot (assoc encoding ctext-non-standard-encodings-alist)) 1456 (let* ((slot (assoc encoding ctext-non-standard-encodings-alist))
1454 (charset (nth 3 slot))) 1457 (charset (nth 3 slot)))
1455 (if charset 1458 (if (charsetp charset)
1456 (cond ((charsetp charset) 1459 (push (cons charset slot) table)
1457 (aset table (make-char charset) slot)) 1460 (dolist (cs charset)
1458 ((listp charset) 1461 (push (cons cs slot) table)))))
1459 (dolist (elt charset) 1462
1460 (aset table (make-char elt) slot))) 1463 ;; Next prepend charsets for ISO2022 designation sequence.
1461 ((char-table-p charset) 1464 (dolist (charset charset-list)
1462 (map-char-table #'(lambda (k v) 1465 (let ((final (plist-get (charset-plist charset) :iso-final-char)))
1463 (if (and v (> k 128)) (aset table k slot))) 1466 (if (and (integerp final)
1464 charset)))))) 1467 (>= final #x40) (<= final #x7e)
1468 ;; Exclude ascii and chinese-cns11643-X.
1469 (not (eq charset 'ascii))
1470 (not (string-match "cns11643" (symbol-name charset))))
1471 (push (cons charset nil) table))))
1465 table)) 1472 table))
1466 1473
1467 (defun ctext-pre-write-conversion (from to) 1474 (defun ctext-pre-write-conversion (from to)
1468 "Encode characters between FROM and TO as Compound Text w/Extended Segments. 1475 "Encode characters between FROM and TO as Compound Text w/Extended Segments.
1469 1476
1479 1486
1480 ;; Now we can encode the whole buffer. 1487 ;; Now we can encode the whole buffer.
1481 (let ((encoding-table (ctext-non-standard-encodings-table)) 1488 (let ((encoding-table (ctext-non-standard-encodings-table))
1482 last-coding-system-used 1489 last-coding-system-used
1483 last-pos last-encoding-info 1490 last-pos last-encoding-info
1484 encoding-info end-pos) 1491 encoding-info end-pos ch)
1485 (goto-char (setq last-pos (point-min))) 1492 (goto-char (setq last-pos (point-min)))
1486 (setq end-pos (point-marker)) 1493 (setq end-pos (point-marker))
1487 (while (re-search-forward "[^\000-\177]+" nil t) 1494 (while (re-search-forward "[^\000-\177]+" nil t)
1488 ;; Found a sequence of non-ASCII characters. 1495 ;; Found a sequence of non-ASCII characters.
1489 (setq last-pos (match-beginning 0) 1496 (setq last-pos (match-beginning 0)
1490 last-encoding-info (aref encoding-table (char-after last-pos))) 1497 ch (char-after last-pos)
1498 last-encoding-info (catch 'tag
1499 (dolist (elt encoding-table)
1500 (if (encode-char ch (car elt))
1501 (throw 'tag (cdr elt))))
1502 'utf-8))
1491 (set-marker end-pos (match-end 0)) 1503 (set-marker end-pos (match-end 0))
1492 (goto-char (1+ last-pos)) 1504 (goto-char (1+ last-pos))
1493 (catch 'tag 1505 (catch 'tag
1494 (while t 1506 (while t
1495 (setq encoding-info 1507 (setq encoding-info
1496 (if (< (point) end-pos) 1508 (if (< (point) end-pos)
1497 (aref encoding-table (following-char)))) 1509 (catch 'tag
1510 (setq ch (following-char))
1511 (dolist (elt encoding-table)
1512 (if (encode-char ch (car elt))
1513 (throw 'tag (cdr elt))))
1514 'utf-8)))
1498 (unless (eq last-encoding-info encoding-info) 1515 (unless (eq last-encoding-info encoding-info)
1499 (cond ((consp last-encoding-info) 1516 (cond ((consp last-encoding-info)
1500 ;; Encode the previous range using an extended 1517 ;; Encode the previous range using an extended
1501 ;; segment. 1518 ;; segment.
1502 (let ((encoding-name (car last-encoding-info)) 1519 (let ((encoding-name (car last-encoding-info))
1506 (encode-coding-region last-pos (point) coding-system) 1523 (encode-coding-region last-pos (point) coding-system)
1507 (setq len (+ (length encoding-name) 1 1524 (setq len (+ (length encoding-name) 1
1508 (- (point) last-pos))) 1525 (- (point) last-pos)))
1509 (save-excursion 1526 (save-excursion
1510 (goto-char last-pos) 1527 (goto-char last-pos)
1511 (insert (string-to-multibyte 1528 (insert (format "\e%%/%d" noctets))
1512 (format "\e%%/%d%c%c%s\002" 1529 (insert-byte (+ (/ len 128) 128) 1)
1513 noctets 1530 (insert-byte (+ (% len 128) 128) 1)
1514 (+ (/ len 128) 128) 1531 (insert encoding-name))))
1515 (+ (% len 128) 128)
1516 encoding-name))))))
1517 ((eq last-encoding-info 'utf-8) 1532 ((eq last-encoding-info 'utf-8)
1518 ;; Encode the previous range using UTF-8 encoding 1533 ;; Encode the previous range using UTF-8 encoding
1519 ;; extention. 1534 ;; extention.
1520 (encode-coding-region last-pos (point) 'mule-utf-8) 1535 (encode-coding-region last-pos (point) 'mule-utf-8)
1521 (save-excursion 1536 (save-excursion