Mercurial > emacs
comparison lisp/international/mule.el @ 53240:ee5206ee4439
(ctext-non-standard-encodings-alist): Change the format.
(ctext-non-standard-encodings): New variable.
(ctext-post-read-conversion): Fully re-written.
(ctext-non-standard-designations-alist): Delete it.
(ctext-non-standard-encodings-table): New function.
(ctext-pre-write-conversion): Fully re-written.
author | Kenichi Handa <handa@m17n.org> |
---|---|
date | Wed, 03 Dec 2003 08:24:42 +0000 |
parents | 810931aa5f2d |
children | 5c66f1de4907 |
comparison
equal
deleted
inserted
replaced
53239:82690620d562 | 53240:ee5206ee4439 |
---|---|
1328 (set-coding-priority-internal))) | 1328 (set-coding-priority-internal))) |
1329 | 1329 |
1330 ;;; X selections | 1330 ;;; X selections |
1331 | 1331 |
1332 (defvar ctext-non-standard-encodings-alist | 1332 (defvar ctext-non-standard-encodings-alist |
1333 '(("ISO8859-15" . iso-8859-15) | 1333 '(("big5-0" big5 2 (chinese-big5-1 chinese-big5-2)) |
1334 ("ISO8859-14" . iso-8859-14) | 1334 ("ISO8859-14" iso-8859-14 1 latin-iso8859-14) |
1335 ("KOI8-R" . koi8-r) | 1335 ("ISO8859-15" iso-8859-15 1 latin-iso8859-15)) |
1336 ("BIG5-0" . big5)) | 1336 "Alist of non-standard encoding names vs the corresponding usages in CTEXT. |
1337 "Alist of non-standard encoding names vs Emacs coding systems. | 1337 |
1338 This alist is used to decode an extened segment of a compound text.") | 1338 It controls how extended segments of a compound text are handled |
1339 by the coding system `compound-text-with-extensions'. | |
1340 | |
1341 Each element has the form (ENCODING-NAME CODING-SYSTEM N-OCTET CHARSET). | |
1342 | |
1343 ENCODING-NAME is an encoding name of an \"extended segments\". | |
1344 | |
1345 CODING-SYSTEM is the coding-system to encode (or decode) the | |
1346 characters into (or from) the extended segment. | |
1347 | |
1348 N-OCTET is the number of octets (bytes) that encodes a character | |
1349 in the segment. It can be 0 (meaning the number of octets per | |
1350 character is variable), 1, 2, 3, or 4. | |
1351 | |
1352 CHARSET is a charater set containing characters that are encoded | |
1353 in the segment. It can be a list of character sets. It can also | |
1354 be a char-table, in which case characters that have non-nil value | |
1355 in the char-table are the target. | |
1356 | |
1357 On decoding CTEXT, all encoding names listed here are recognized. | |
1358 | |
1359 On encoding CTEXT, encoding names in the variable | |
1360 `ctext-non-standard-encodings' (which see) and in the information | |
1361 listed for the current language environment under the key | |
1362 `ctext-non-standard-encodings' are used.") | |
1363 | |
1364 (defvar ctext-non-standard-encodings | |
1365 '("big5-0") | |
1366 "List of non-standard encoding names used in extended segments of CTEXT. | |
1367 Each element must be one of the names listed in the variable | |
1368 `ctext-non-standard-encodings-alist' (which see).") | |
1339 | 1369 |
1340 (defvar ctext-non-standard-encodings-regexp | 1370 (defvar ctext-non-standard-encodings-regexp |
1341 (string-to-multibyte | 1371 (string-to-multibyte |
1342 (concat | 1372 (concat |
1343 ;; For non-standard encodings. | 1373 ;; For non-standard encodings. |
1345 "\\|" | 1375 "\\|" |
1346 ;; For UTF-8 encoding. | 1376 ;; For UTF-8 encoding. |
1347 "\\(\e%G[^\e]*\e%@\\)"))) | 1377 "\\(\e%G[^\e]*\e%@\\)"))) |
1348 | 1378 |
1349 ;; Functions to support "Non-Standard Character Set Encodings" defined | 1379 ;; Functions to support "Non-Standard Character Set Encodings" defined |
1350 ;; by the COMPOUND-TEXT spec. | 1380 ;; by the COMPOUND-TEXT spec. They also support "The UTF-8 encoding" |
1351 ;; We support that by decoding the whole data by `ctext' which just | 1381 ;; described in the section 7 of the documentation of COMPOUND-TEXT |
1352 ;; pertains byte sequences belonging to ``extended segment'', then | 1382 ;; distributed with XFree86. |
1353 ;; decoding those byte sequences one by one in Lisp. | |
1354 ;; This function also supports "The UTF-8 encoding" described in the | |
1355 ;; section 7 of the documentation fo COMPOUND-TEXT distributed with | |
1356 ;; XFree86. | |
1357 | 1383 |
1358 (defun ctext-post-read-conversion (len) | 1384 (defun ctext-post-read-conversion (len) |
1359 "Decode LEN characters encoded as Compound Text with Extended Segments." | 1385 "Decode LEN characters encoded as Compound Text with Extended Segments." |
1360 (save-match-data | 1386 (save-match-data |
1361 (save-restriction | 1387 (save-restriction |
1363 (in-workbuf (string= (buffer-name) " *code-converting-work*")) | 1389 (in-workbuf (string= (buffer-name) " *code-converting-work*")) |
1364 last-coding-system-used | 1390 last-coding-system-used |
1365 pos bytes) | 1391 pos bytes) |
1366 (or in-workbuf | 1392 (or in-workbuf |
1367 (narrow-to-region (point) (+ (point) len))) | 1393 (narrow-to-region (point) (+ (point) len))) |
1368 (decode-coding-region (point-min) (point-max) 'ctext) | |
1369 (if in-workbuf | 1394 (if in-workbuf |
1370 (set-buffer-multibyte t)) | 1395 (set-buffer-multibyte t)) |
1371 (while (re-search-forward ctext-non-standard-encodings-regexp | 1396 (while (re-search-forward ctext-non-standard-encodings-regexp |
1372 nil 'move) | 1397 nil 'move) |
1373 (setq pos (match-beginning 0)) | 1398 (setq pos (match-beginning 0)) |
1374 (if (match-beginning 1) | 1399 (if (match-beginning 1) |
1375 ;; ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES-- | 1400 ;; ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES-- |
1376 (let* ((M (char-after (+ pos 4))) | 1401 (let* ((M (char-after (+ pos 4))) |
1377 (L (char-after (+ pos 5))) | 1402 (L (char-after (+ pos 5))) |
1378 (encoding (match-string 2)) | 1403 (encoding (match-string 2)) |
1379 (coding (or (cdr (assoc-ignore-case | 1404 (encoding-info (assoc-ignore-case |
1380 encoding | 1405 encoding |
1381 ctext-non-standard-encodings-alist)) | 1406 ctext-non-standard-encodings-alist)) |
1382 (coding-system-p | 1407 (coding (if encoding-info |
1383 (intern (downcase encoding)))))) | 1408 (nth 1 encoding-info) |
1409 (setq encoding (intern (downcase encoding))) | |
1410 (and (coding-system-p encoding) | |
1411 encoding)))) | |
1384 (setq bytes (- (+ (* (- M 128) 128) (- L 128)) | 1412 (setq bytes (- (+ (* (- M 128) 128) (- L 128)) |
1385 (- (point) (+ pos 6)))) | 1413 (- (point) (+ pos 6)))) |
1386 (when coding | 1414 (when coding |
1387 (delete-region pos (point)) | 1415 (delete-region pos (point)) |
1388 (forward-char bytes) | 1416 (forward-char bytes) |
1389 (decode-coding-region (- (point) bytes) (point) coding))) | 1417 (decode-coding-region (- (point) bytes) (point) coding))) |
1390 ;; ESC % G --UTF-8-BYTES-- ESC % @ | 1418 ;; ESC % G --UTF-8-BYTES-- ESC % @ |
1391 (setq bytes (- (point) pos)) | 1419 (delete-char -3) |
1392 (decode-coding-region (- (point) bytes) (point) 'utf-8)))) | 1420 (delete-region pos (+ pos 3)) |
1421 (decode-coding-region pos (point) 'utf-8)))) | |
1393 (goto-char (point-min)) | 1422 (goto-char (point-min)) |
1394 (- (point-max) (point))))) | 1423 (- (point-max) (point))))) |
1395 | 1424 |
1396 ;; From X registry 2001/06/01 | 1425 ;; Return a char table of extended segment usage for each character. |
1397 ;; 20. NON-STANDARD CHARACTER SET ENCODINGS | 1426 ;; Each value of the char table is nil, one of the elements of |
1398 | 1427 ;; `ctext-non-standard-encodings-alist', or the symbol `utf-8'. |
1399 ;; See Section 6 of the Compound Text standard. | 1428 |
1400 | 1429 (defun ctext-non-standard-encodings-table () |
1401 ;; Name Reference | 1430 (let ((table (make-char-table 'translation-table))) |
1402 ;; ---- --------- | 1431 (aset table (make-char 'mule-unicode-0100-24ff) 'utf-8) |
1403 ;; "DEC.CNS11643.1986-2" [53] | 1432 (aset table (make-char 'mule-unicode-2500-33ff) 'utf-8) |
1404 ;; CNS11643 2-plane using the recommended | 1433 (aset table (make-char 'mule-unicode-e000-ffff) 'utf-8) |
1405 ;; internal representation scheme | 1434 (dolist (encoding (reverse |
1406 ;; "DEC.DTSCS.1990-2" [54] | 1435 (append |
1407 ;; DEC Taiwan Supplemental Character Set | 1436 (get-language-info current-language-environment |
1408 ;; "fujitsu.u90x03" [87] | 1437 'ctext-non-standard-encodings) |
1409 ;; "ILA" [62] | 1438 ctext-non-standard-encodings))) |
1410 ;; registry prefix | 1439 (let* ((slot (assoc encoding ctext-non-standard-encodings-alist)) |
1411 ;; "IPSYS" [59] | 1440 (charset (nth 3 slot))) |
1412 ;; registry prefix | 1441 (if charset |
1413 ;; "omron_UDC" [45] | 1442 (cond ((charsetp charset) |
1414 ;; omron User Defined Charset | 1443 (aset table (make-char charset) slot)) |
1415 ;; "omron_UDC_ja" [45] | 1444 ((listp charset) |
1416 ;; omron User Defined Charset for Japanese | 1445 (dolist (elt charset) |
1417 ;; "omron_UDC_zh" [45] | 1446 (aset table (make-char elt) slot))) |
1418 ;; omron User Defined Charset for Chinese(Main land) | 1447 ((char-table-p charset) |
1419 ;; "omron_UDC_tw" [45] | 1448 (map-char-table #'(lambda (k v) |
1420 ;; omron User Defined Charset for Chinese(Taiwan) | 1449 (if (and v (> k 128)) (aset table k slot))) |
1421 | 1450 charset)))))) |
1422 ;; If you add charsets here, be sure to modify the regexp used by | 1451 table)) |
1423 ;; ctext-pre-write-conversion to look up non-standard charsets. | |
1424 (defvar ctext-non-standard-designations-alist | |
1425 '(("$(0" . (big5 "big5-0" 2)) | |
1426 ("$(1" . (big5 "big5-0" 2)) | |
1427 ;; The following are actually standard; generating extended | |
1428 ;; segments for them is wrong and screws e.g. Latin-9 users. | |
1429 ;; 8859-{10,13,16} aren't Emacs charsets anyhow. -- fx | |
1430 ;; ("-V" . (t "iso8859-10" 1)) | |
1431 ;; ("-Y" . (t "iso8859-13" 1)) | |
1432 ;; ("-_" . (t "iso8859-14" 1)) | |
1433 ;; ("-b" . (t "iso8859-15" 1)) | |
1434 ;; ("-f" . (t "iso8859-16" 1)) | |
1435 ) | |
1436 "Alist of ctext control sequences that introduce character sets which | |
1437 are not in the list of approved encodings, and the corresponding | |
1438 coding system, identifier string, and number of octets per encoded | |
1439 character. | |
1440 | |
1441 Each element has the form (CTLSEQ . (ENCODING CHARSET NOCTETS)). CTLSEQ | |
1442 is the control sequence (sans the leading ESC) that introduces the character | |
1443 set in the text encoded by compound-text. ENCODING is a coding system | |
1444 symbol; if it is t, it means that the ctext coding system already encodes | |
1445 the text correctly, and only the leading control sequence needs to be altered. | |
1446 If ENCODING is a coding system, we need to re-encode the text with that | |
1447 coding system. CHARSET is the name of the charset we need to put into | |
1448 the leading control sequence. NOCTETS is the number of octets (bytes) that | |
1449 encode each character in this charset. NOCTETS can be 0 (meaning the number | |
1450 of octets per character is variable), 1, 2, 3, or 4.") | |
1451 | 1452 |
1452 (defun ctext-pre-write-conversion (from to) | 1453 (defun ctext-pre-write-conversion (from to) |
1453 "Encode characters between FROM and TO as Compound Text w/Extended Segments. | 1454 "Encode characters between FROM and TO as Compound Text w/Extended Segments. |
1454 | 1455 |
1455 If FROM is a string, or if the current buffer is not the one set up for us | 1456 If FROM is a string, or if the current buffer is not the one set up for us |
1468 (set-buffer (generate-new-buffer " *temp")) | 1469 (set-buffer (generate-new-buffer " *temp")) |
1469 (set-buffer-multibyte multibyte) | 1470 (set-buffer-multibyte multibyte) |
1470 (insert-buffer-substring buf from to)))) | 1471 (insert-buffer-substring buf from to)))) |
1471 | 1472 |
1472 ;; Now we can encode the whole buffer. | 1473 ;; Now we can encode the whole buffer. |
1473 (let ((case-fold-search nil) | 1474 (let ((encoding-table (ctext-non-standard-encodings-table)) |
1474 last-coding-system-used | 1475 last-coding-system-used |
1475 pos posend desig encode-info encoding chset noctets textlen) | 1476 last-pos last-encoding-info |
1476 (goto-char (point-min)) | 1477 encoding-info end-pos) |
1477 ;; At first encode the whole buffer. | 1478 (goto-char (setq last-pos (point-min))) |
1478 (encode-coding-region (point-min) (point-max) 'ctext-no-compositions) | 1479 (setq end-pos (point-marker)) |
1479 ;; Then replace ISO-2022 charset designations with extended | 1480 (while (re-search-forward "[^\000-\177]+" nil t) |
1480 ;; segments, for those charsets that are not part of the | 1481 ;; Found a sequence of non-ASCII characters. |
1481 ;; official X registry. The regexp below finds the leading | 1482 (setq last-pos (match-beginning 0) |
1482 ;; sequences for big5. | 1483 last-encoding-info (aref encoding-table (char-after last-pos))) |
1483 (while (re-search-forward "\e\\(\$([01]\\)" nil 'move) | 1484 (set-marker end-pos (match-end 0)) |
1484 (setq pos (match-beginning 0) | 1485 (goto-char (1+ last-pos)) |
1485 posend (point) | 1486 (catch 'tag |
1486 desig (match-string 1) | 1487 (while t |
1487 encode-info (cdr (assoc desig | 1488 (setq encoding-info |
1488 ctext-non-standard-designations-alist)) | 1489 (if (< (point) end-pos) |
1489 encoding (car encode-info) | 1490 (aref encoding-table (following-char)))) |
1490 chset (cadr encode-info) | 1491 (unless (eq last-encoding-info encoding-info) |
1491 noctets (car (cddr encode-info))) | 1492 (cond ((consp last-encoding-info) |
1492 (skip-chars-forward "^\e") | 1493 ;; Encode the previous range using an extended |
1493 (cond | 1494 ;; segment. |
1494 ((eq encoding t) ; only the leading sequence needs to be changed | 1495 (let ((encoding-name (car last-encoding-info)) |
1495 (setq textlen (+ (- (point) posend) (length chset) 1)) | 1496 (coding-system (nth 1 last-encoding-info)) |
1496 ;; Generate the control sequence for an extended segment. | 1497 (noctets (nth 2 last-encoding-info)) |
1497 (replace-match (format "\e%%/%d%c%c%s" | 1498 len) |
1498 noctets | 1499 (encode-coding-region last-pos (point) coding-system) |
1499 (+ (/ textlen 128) 128) | 1500 (setq len (+ (length encoding-name) 1 |
1500 (+ (% textlen 128) 128) | 1501 (- (point) last-pos))) |
1501 chset) | 1502 (save-excursion |
1502 t t)) | 1503 (goto-char last-pos) |
1503 ((coding-system-p encoding) ; need to recode the entire segment... | 1504 (insert (string-to-multibyte |
1504 (decode-coding-region pos (point) 'ctext-no-compositions) | 1505 (format "\e%%/%d%c%c%s" |
1505 (encode-coding-region pos (point) encoding) | 1506 noctets |
1506 (setq textlen (+ (- (point) pos) (length chset) 1)) | 1507 (+ (/ len 128) 128) |
1507 (save-excursion | 1508 (+ (% len 128) 128) |
1508 (goto-char pos) | 1509 encoding-name)))))) |
1509 (insert (format "\e%%/%d%c%c%s" | 1510 ((eq last-encoding-info 'utf-8) |
1510 noctets | 1511 ;; Encode the previous range using UTF-8 encoding |
1511 (+ (/ textlen 128) 128) | 1512 ;; extention. |
1512 (+ (% textlen 128) 128) | 1513 (encode-coding-region last-pos (point) 'mule-utf-8) |
1513 chset)))))) | 1514 (save-excursion |
1515 (goto-char last-pos) | |
1516 (insert "\e%G")) | |
1517 (insert "\e%@"))) | |
1518 (setq last-pos (point) | |
1519 last-encoding-info encoding-info)) | |
1520 (if (< (point) end-pos) | |
1521 (forward-char 1) | |
1522 (throw 'tag nil))))) | |
1523 (set-marker end-pos nil) | |
1514 (goto-char (point-min)))) | 1524 (goto-char (point-min)))) |
1515 ;; Must return nil, as build_annotations_2 expects that. | 1525 ;; Must return nil, as build_annotations_2 expects that. |
1516 nil) | 1526 nil) |
1517 | 1527 |
1518 ;;; FILE I/O | 1528 ;;; FILE I/O |