Mercurial > emacs
comparison lisp/international/mule.el @ 90324:b3b869baa1c3
(ctext-non-standard-encodings-alist): Add an entry for gbk-0. Set
charset `big5' in the entry for "big5-0".
(ctext-post-read-conversion): Use multibyt-char-to-unibyte to read
a row 8-bit.
(ctext-non-standard-encodings): Initialize to nil.
(ctext-non-standard-encodings-table): Return a list instead of
char-table.
(ctext-pre-write-conversion): Adjusted for the above change.
author | Kenichi Handa <handa@m17n.org> |
---|---|
date | Mon, 27 Feb 2006 01:24:24 +0000 |
parents | 0f622530c46c |
children | 6e94ff6be848 |
comparison
equal
deleted
inserted
replaced
90323:9ef82b3ff1a7 | 90324:b3b869baa1c3 |
---|---|
1341 (make-obsolete 'set-coding-priority 'set-coding-system-priority "23.1") | 1341 (make-obsolete 'set-coding-priority 'set-coding-system-priority "23.1") |
1342 | 1342 |
1343 ;;; X selections | 1343 ;;; X selections |
1344 | 1344 |
1345 (defvar ctext-non-standard-encodings-alist | 1345 (defvar ctext-non-standard-encodings-alist |
1346 '(("big5-0" big5 2 (chinese-big5-1 chinese-big5-2)) | 1346 '(("big5-0" big5 2 big5) |
1347 ("ISO8859-14" iso-8859-14 1 latin-iso8859-14) | 1347 ("ISO8859-14" iso-8859-14 1 latin-iso8859-14) |
1348 ("ISO8859-15" iso-8859-15 1 latin-iso8859-15)) | 1348 ("ISO8859-15" iso-8859-15 1 latin-iso8859-15) |
1349 ("gbk-0" gbk 2 chinese-gbk)) | |
1349 "Alist of non-standard encoding names vs the corresponding usages in CTEXT. | 1350 "Alist of non-standard encoding names vs the corresponding usages in CTEXT. |
1350 | 1351 |
1351 It controls how extended segments of a compound text are handled | 1352 It controls how extended segments of a compound text are handled |
1352 by the coding system `compound-text-with-extensions'. | 1353 by the coding system `compound-text-with-extensions'. |
1353 | 1354 |
1361 N-OCTET is the number of octets (bytes) that encodes a character | 1362 N-OCTET is the number of octets (bytes) that encodes a character |
1362 in the segment. It can be 0 (meaning the number of octets per | 1363 in the segment. It can be 0 (meaning the number of octets per |
1363 character is variable), 1, 2, 3, or 4. | 1364 character is variable), 1, 2, 3, or 4. |
1364 | 1365 |
1365 CHARSET is a charater set containing characters that are encoded | 1366 CHARSET is a charater set containing characters that are encoded |
1366 in the segment. It can be a list of character sets. It can also | 1367 in the segment. It can be a list of character sets. |
1367 be a char-table, in which case characters that have non-nil value | |
1368 in the char-table are the target. | |
1369 | 1368 |
1370 On decoding CTEXT, all encoding names listed here are recognized. | 1369 On decoding CTEXT, all encoding names listed here are recognized. |
1371 | 1370 |
1372 On encoding CTEXT, encoding names in the variable | 1371 On encoding CTEXT, encoding names in the variable |
1373 `ctext-non-standard-encodings' (which see) and in the information | 1372 `ctext-non-standard-encodings' (which see) and in the information |
1374 listed for the current language environment under the key | 1373 listed for the current language environment under the key |
1375 `ctext-non-standard-encodings' are used.") | 1374 `ctext-non-standard-encodings' are used.") |
1376 | 1375 |
1377 (defvar ctext-non-standard-encodings | 1376 (defvar ctext-non-standard-encodings nil |
1378 '("big5-0") | |
1379 "List of non-standard encoding names used in extended segments of CTEXT. | 1377 "List of non-standard encoding names used in extended segments of CTEXT. |
1380 Each element must be one of the names listed in the variable | 1378 Each element must be one of the names listed in the variable |
1381 `ctext-non-standard-encodings-alist' (which see).") | 1379 `ctext-non-standard-encodings-alist' (which see).") |
1382 | 1380 |
1383 (defvar ctext-non-standard-encodings-regexp | 1381 (defvar ctext-non-standard-encodings-regexp |
1410 (while (re-search-forward ctext-non-standard-encodings-regexp | 1408 (while (re-search-forward ctext-non-standard-encodings-regexp |
1411 nil 'move) | 1409 nil 'move) |
1412 (setq pos (match-beginning 0)) | 1410 (setq pos (match-beginning 0)) |
1413 (if (match-beginning 1) | 1411 (if (match-beginning 1) |
1414 ;; ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES-- | 1412 ;; ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES-- |
1415 (let* ((M (char-after (+ pos 4))) | 1413 (let* ((M (multibyte-char-to-unibyte (char-after (+ pos 4)))) |
1416 (L (char-after (+ pos 5))) | 1414 (L (multibyte-char-to-unibyte (char-after (+ pos 5)))) |
1417 (encoding (match-string 2)) | 1415 (encoding (match-string 2)) |
1418 (encoding-info (assoc-string | 1416 (encoding-info (assoc-string |
1419 encoding | 1417 encoding |
1420 ctext-non-standard-encodings-alist t)) | 1418 ctext-non-standard-encodings-alist t)) |
1421 (coding (if encoding-info | 1419 (coding (if encoding-info |
1434 (delete-region pos (+ pos 3)) | 1432 (delete-region pos (+ pos 3)) |
1435 (decode-coding-region pos (point) 'utf-8)))) | 1433 (decode-coding-region pos (point) 'utf-8)))) |
1436 (goto-char (point-min)) | 1434 (goto-char (point-min)) |
1437 (- (point-max) (point))))) | 1435 (- (point-max) (point))))) |
1438 | 1436 |
1439 ;; Return a char table of extended segment usage for each character. | 1437 ;; Return an alist of CHARSET vs CTEXT-USAGE-INFO generated from |
1440 ;; Each value of the char table is nil, one of the elements of | 1438 ;; `ctext-non-standard-encodings' and a list specified by the key |
1441 ;; `ctext-non-standard-encodings-alist', or the symbol `utf-8'. | 1439 ;; `ctext-non-standard-encodings' for the currrent language |
1440 ;; environment. CTEXT-USAGE-INFO is one of the element of | |
1441 ;; `ctext-non-standard-encodings-alist' or nil. In the former case, a | |
1442 ;; character in CHARSET is encoded using extended segment. In the | |
1443 ;; latter case, a character in CHARSET is encoded using normal ISO2022 | |
1444 ;; designation sequence. If a character is not in any of CHARSETs, it | |
1445 ;; is encoded using UTF-8 encoding extention. | |
1442 | 1446 |
1443 (defun ctext-non-standard-encodings-table () | 1447 (defun ctext-non-standard-encodings-table () |
1444 (let ((table (make-char-table 'translation-table))) | 1448 (let (table) |
1445 (aset table (make-char 'mule-unicode-0100-24ff) 'utf-8) | 1449 ;; Setup charsets specified in `ctext-non-standard-encodings' and |
1446 (aset table (make-char 'mule-unicode-2500-33ff) 'utf-8) | 1450 ;; by the key `ctext-non-standard-encodings' for the current |
1447 (aset table (make-char 'mule-unicode-e000-ffff) 'utf-8) | 1451 ;; language environment. |
1448 (dolist (encoding (reverse | 1452 (dolist (encoding (append |
1449 (append | 1453 ctext-non-standard-encodings |
1450 (get-language-info current-language-environment | 1454 (get-language-info current-language-environment |
1451 'ctext-non-standard-encodings) | 1455 'ctext-non-standard-encodings))) |
1452 ctext-non-standard-encodings))) | |
1453 (let* ((slot (assoc encoding ctext-non-standard-encodings-alist)) | 1456 (let* ((slot (assoc encoding ctext-non-standard-encodings-alist)) |
1454 (charset (nth 3 slot))) | 1457 (charset (nth 3 slot))) |
1455 (if charset | 1458 (if (charsetp charset) |
1456 (cond ((charsetp charset) | 1459 (push (cons charset slot) table) |
1457 (aset table (make-char charset) slot)) | 1460 (dolist (cs charset) |
1458 ((listp charset) | 1461 (push (cons cs slot) table))))) |
1459 (dolist (elt charset) | 1462 |
1460 (aset table (make-char elt) slot))) | 1463 ;; Next prepend charsets for ISO2022 designation sequence. |
1461 ((char-table-p charset) | 1464 (dolist (charset charset-list) |
1462 (map-char-table #'(lambda (k v) | 1465 (let ((final (plist-get (charset-plist charset) :iso-final-char))) |
1463 (if (and v (> k 128)) (aset table k slot))) | 1466 (if (and (integerp final) |
1464 charset)))))) | 1467 (>= final #x40) (<= final #x7e) |
1468 ;; Exclude ascii and chinese-cns11643-X. | |
1469 (not (eq charset 'ascii)) | |
1470 (not (string-match "cns11643" (symbol-name charset)))) | |
1471 (push (cons charset nil) table)))) | |
1465 table)) | 1472 table)) |
1466 | 1473 |
1467 (defun ctext-pre-write-conversion (from to) | 1474 (defun ctext-pre-write-conversion (from to) |
1468 "Encode characters between FROM and TO as Compound Text w/Extended Segments. | 1475 "Encode characters between FROM and TO as Compound Text w/Extended Segments. |
1469 | 1476 |
1479 | 1486 |
1480 ;; Now we can encode the whole buffer. | 1487 ;; Now we can encode the whole buffer. |
1481 (let ((encoding-table (ctext-non-standard-encodings-table)) | 1488 (let ((encoding-table (ctext-non-standard-encodings-table)) |
1482 last-coding-system-used | 1489 last-coding-system-used |
1483 last-pos last-encoding-info | 1490 last-pos last-encoding-info |
1484 encoding-info end-pos) | 1491 encoding-info end-pos ch) |
1485 (goto-char (setq last-pos (point-min))) | 1492 (goto-char (setq last-pos (point-min))) |
1486 (setq end-pos (point-marker)) | 1493 (setq end-pos (point-marker)) |
1487 (while (re-search-forward "[^\000-\177]+" nil t) | 1494 (while (re-search-forward "[^\000-\177]+" nil t) |
1488 ;; Found a sequence of non-ASCII characters. | 1495 ;; Found a sequence of non-ASCII characters. |
1489 (setq last-pos (match-beginning 0) | 1496 (setq last-pos (match-beginning 0) |
1490 last-encoding-info (aref encoding-table (char-after last-pos))) | 1497 ch (char-after last-pos) |
1498 last-encoding-info (catch 'tag | |
1499 (dolist (elt encoding-table) | |
1500 (if (encode-char ch (car elt)) | |
1501 (throw 'tag (cdr elt)))) | |
1502 'utf-8)) | |
1491 (set-marker end-pos (match-end 0)) | 1503 (set-marker end-pos (match-end 0)) |
1492 (goto-char (1+ last-pos)) | 1504 (goto-char (1+ last-pos)) |
1493 (catch 'tag | 1505 (catch 'tag |
1494 (while t | 1506 (while t |
1495 (setq encoding-info | 1507 (setq encoding-info |
1496 (if (< (point) end-pos) | 1508 (if (< (point) end-pos) |
1497 (aref encoding-table (following-char)))) | 1509 (catch 'tag |
1510 (setq ch (following-char)) | |
1511 (dolist (elt encoding-table) | |
1512 (if (encode-char ch (car elt)) | |
1513 (throw 'tag (cdr elt)))) | |
1514 'utf-8))) | |
1498 (unless (eq last-encoding-info encoding-info) | 1515 (unless (eq last-encoding-info encoding-info) |
1499 (cond ((consp last-encoding-info) | 1516 (cond ((consp last-encoding-info) |
1500 ;; Encode the previous range using an extended | 1517 ;; Encode the previous range using an extended |
1501 ;; segment. | 1518 ;; segment. |
1502 (let ((encoding-name (car last-encoding-info)) | 1519 (let ((encoding-name (car last-encoding-info)) |
1506 (encode-coding-region last-pos (point) coding-system) | 1523 (encode-coding-region last-pos (point) coding-system) |
1507 (setq len (+ (length encoding-name) 1 | 1524 (setq len (+ (length encoding-name) 1 |
1508 (- (point) last-pos))) | 1525 (- (point) last-pos))) |
1509 (save-excursion | 1526 (save-excursion |
1510 (goto-char last-pos) | 1527 (goto-char last-pos) |
1511 (insert (string-to-multibyte | 1528 (insert (format "\e%%/%d" noctets)) |
1512 (format "\e%%/%d%c%c%s\002" | 1529 (insert-byte (+ (/ len 128) 128) 1) |
1513 noctets | 1530 (insert-byte (+ (% len 128) 128) 1) |
1514 (+ (/ len 128) 128) | 1531 (insert encoding-name)))) |
1515 (+ (% len 128) 128) | |
1516 encoding-name)))))) | |
1517 ((eq last-encoding-info 'utf-8) | 1532 ((eq last-encoding-info 'utf-8) |
1518 ;; Encode the previous range using UTF-8 encoding | 1533 ;; Encode the previous range using UTF-8 encoding |
1519 ;; extention. | 1534 ;; extention. |
1520 (encode-coding-region last-pos (point) 'mule-utf-8) | 1535 (encode-coding-region last-pos (point) 'mule-utf-8) |
1521 (save-excursion | 1536 (save-excursion |