Mercurial > emacs
comparison lisp/international/mule.el @ 51313:36fa2cf14d0c
(ctext-non-standard-encodings-alist):
Renamed from non-standard-icccm-encodings-alist.
(ctext-non-standard-encodings-regexp): New variable
(ctext-post-read-conversion): Full rewrite.
(ctext-non-standard-designations-alist): Renamed from
non-standard-designations-alist.
(ctext-pre-write-conversion): Full rewrite.
author | Kenichi Handa <handa@m17n.org> |
---|---|
date | Thu, 29 May 2003 01:28:24 +0000 |
parents | e5124f8f4dc9 |
children | 3a4379245dd8 |
comparison
equal
deleted
inserted
replaced
51312:b2f981020fdd | 51313:36fa2cf14d0c |
---|---|
1314 (setq coding-category-list (append arg current-list)) | 1314 (setq coding-category-list (append arg current-list)) |
1315 (set-coding-priority-internal))) | 1315 (set-coding-priority-internal))) |
1316 | 1316 |
1317 ;;; X selections | 1317 ;;; X selections |
1318 | 1318 |
1319 (defvar non-standard-icccm-encodings-alist | 1319 (defvar ctext-non-standard-encodings-alist |
1320 '(("ISO8859-15" . latin-iso8859-15) | 1320 '(("ISO8859-15" . latin-iso8859-15) |
1321 ("ISO8859-14" . latin-iso8859-14) | 1321 ("ISO8859-14" . latin-iso8859-14) |
1322 ("KOI8-R" . koi8-r) | 1322 ("KOI8-R" . koi8-r) |
1323 ("BIG5-0" . big5)) | 1323 ("BIG5-0" . big5)) |
1324 "Alist of font charset names defined by XLFD. | 1324 "Alist of non-standard encoding names vs Emacs coding systems. |
1325 The cdr of each element is the corresponding Emacs charset or coding system.") | 1325 This alist is used to decode an extened segment of a compound text.") |
1326 | |
1327 (defvar ctext-non-standard-encodings-regexp | |
1328 (string-to-multibyte | |
1329 (concat | |
1330 ;; For non-standard encodings. | |
1331 "\\(\e%/[0-4][\200-\377][\200-\377]\\([^\002]+\\)\002\\)" | |
1332 "\\|" | |
1333 ;; For UTF-8 encoding. | |
1334 "\\(\e%G[^\e]*\e%@\\)"))) | |
1326 | 1335 |
1327 ;; Functions to support "Non-Standard Character Set Encodings" defined | 1336 ;; Functions to support "Non-Standard Character Set Encodings" defined |
1328 ;; by the COMPOUND-TEXT spec. | 1337 ;; by the COMPOUND-TEXT spec. |
1329 ;; We support that by converting the leading sequence of the | 1338 ;; We support that by decoding the whole data by `ctext' which just |
1330 ;; ``extended segment'' to the corresponding ISO-2022 sequences (if | 1339 ;; pertains byte sequences belonging to ``extended segment'', then |
1331 ;; the leading sequence names an Emacs charset), or decode the segment | 1340 ;; decoding those byte sequences one by one in Lisp. |
1332 ;; (if it names a coding system). Encoding does the reverse. | |
1333 ;; This function also supports "The UTF-8 encoding" described in the | 1341 ;; This function also supports "The UTF-8 encoding" described in the |
1334 ;; section 7 of the documentation fo COMPOUND-TEXT distributed with | 1342 ;; section 7 of the documentation fo COMPOUND-TEXT distributed with |
1335 ;; XFree86. | 1343 ;; XFree86. |
1336 | 1344 |
1337 (defun ctext-post-read-conversion (len) | 1345 (defun ctext-post-read-conversion (len) |
1338 "Decode LEN characters encoded as Compound Text with Extended Segments." | 1346 "Decode LEN characters encoded as Compound Text with Extended Segments." |
1339 (buffer-disable-undo) ; minimize consing due to insertions and deletions | |
1340 (narrow-to-region (point) (+ (point) len)) | |
1341 (save-match-data | 1347 (save-match-data |
1342 (let ((pt (point-marker)) | 1348 (save-restriction |
1343 (oldpt (point-marker)) | 1349 (let ((case-fold-search nil) |
1344 (newpt (make-marker)) | 1350 (in-workbuf (string= (buffer-name) " *code-converting-work*")) |
1345 (modified-p (buffer-modified-p)) | 1351 last-coding-system-used |
1346 (case-fold-search nil) | 1352 pos bytes) |
1347 ;; We need multibyte conversion of "TO" type because the | 1353 (or in-workbuf |
1348 ;; buffer may be multibyte, and, in that case, the pattern | 1354 (narrow-to-region (point) (+ (point) len))) |
1349 ;; must contain eight-bit-control/graphic characters. | 1355 (decode-coding-region (point-min) (point-max) 'ctext) |
1350 (pattern (string-to-multibyte "\\(\e\\)%/[0-4]\\([\200-\377][\200-\377]\\)\\([^\002]+\\)\002\\|\e%G[^\e]+\e%@")) | 1356 (if in-workbuf |
1351 last-coding-system-used | 1357 (set-buffer-multibyte t)) |
1352 encoding textlen chset) | 1358 (while (re-search-forward ctext-non-standard-encodings-regexp |
1353 (while (re-search-forward pattern nil 'move) | 1359 nil 'move) |
1354 (set-marker newpt (point)) | 1360 (setq pos (match-beginning 0)) |
1355 (set-marker pt (match-beginning 0)) | 1361 (if (match-beginning 1) |
1356 (if (= (preceding-char) ?@) | 1362 ;; ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES-- |
1357 ;; We found embedded utf-8 sequence. | 1363 (let* ((M (char-after (+ pos 4))) |
1358 (progn | 1364 (L (char-after (+ pos 5))) |
1359 (delete-char -3) ; delete ESC % @ at the tail | 1365 (encoding (match-string 2)) |
1360 (goto-char pt) | 1366 (coding (or (cdr (assoc-ignore-case |
1361 (delete-char 3) ; delete ESC % G at the head | 1367 encoding |
1362 (if (> pt oldpt) | 1368 ctext-non-standard-encodings-alist)) |
1363 (decode-coding-region oldpt pt 'ctext-no-compositions)) | 1369 (coding-system-p |
1364 (decode-coding-region pt newpt 'mule-utf-8) | 1370 (intern (downcase encoding)))))) |
1365 (goto-char newpt) | 1371 (setq bytes (- (+ (* (- M 128) 128) (- L 128)) |
1366 (set-marker oldpt newpt)) | 1372 (- (point) (+ pos 6)))) |
1367 (setq encoding (match-string 3)) | 1373 (when coding |
1368 (setq textlen (- (+ (* (- (aref (match-string 2) 0) 128) 128) | 1374 (delete-region pos (point)) |
1369 (- (aref (match-string 2) 1) 128)) | 1375 (forward-char bytes) |
1370 (1+ (length encoding)))) | 1376 (decode-coding-region (- (point) bytes) (point) coding))) |
1371 (setq | 1377 ;; ESC % G --UTF-8-BYTES-- ESC % @ |
1372 chset (cdr (assoc-ignore-case encoding | 1378 (setq bytes (- (point) pos)) |
1373 non-standard-icccm-encodings-alist))) | 1379 (decode-coding-region (- (point) bytes) (point) 'utf-8)))) |
1374 (cond ((null chset) | 1380 (goto-char (point-min)) |
1375 ;; This charset is not supported--leave this extended | 1381 (- (point-max) (point))))) |
1376 ;; segment unaltered and skip over it. | |
1377 (goto-char (+ (point) textlen))) | |
1378 ((charsetp chset) | |
1379 ;; If it's a charset, replace the leading escape sequence | |
1380 ;; with a standard ISO-2022 sequence. We will decode all | |
1381 ;; such segments later, in one go, when we exit the loop | |
1382 ;; or find an extended segment that names a coding | |
1383 ;; system, not a charset. | |
1384 (replace-match | |
1385 (concat "\\1" | |
1386 (if (= 0 (charset-iso-graphic-plane chset)) | |
1387 ;; GL charsets | |
1388 (if (= 1 (charset-dimension chset)) "(" "$(") | |
1389 ;; GR charsets | |
1390 (if (= 96 (charset-chars chset)) | |
1391 "-" | |
1392 (if (= 1 (charset-dimension chset)) ")" "$)"))) | |
1393 (string (charset-iso-final-char chset))) | |
1394 t) | |
1395 (goto-char (+ (point) textlen))) | |
1396 ((coding-system-p chset) | |
1397 ;; If it's a coding system, we need to decode the segment | |
1398 ;; right away. But first, decode what we've skipped | |
1399 ;; across until now. | |
1400 (when (> pt oldpt) | |
1401 (decode-coding-region oldpt pt 'ctext-no-compositions)) | |
1402 (delete-region pt newpt) | |
1403 (set-marker newpt (+ newpt textlen)) | |
1404 (decode-coding-region pt newpt chset) | |
1405 (goto-char newpt) | |
1406 (set-marker oldpt newpt))))) | |
1407 ;; Decode what's left. | |
1408 (when (> (point) oldpt) | |
1409 (decode-coding-region oldpt (point) 'ctext-no-compositions)) | |
1410 ;; This buffer started as unibyte, because the string we get from | |
1411 ;; the X selection is a unibyte string. We must now make it | |
1412 ;; multibyte, so that the decoded text is inserted as multibyte | |
1413 ;; into its buffer. | |
1414 (set-buffer-multibyte t) | |
1415 (set-buffer-modified-p modified-p) | |
1416 (- (point-max) (point-min))))) | |
1417 | 1382 |
1418 ;; If you add charsets here, be sure to modify the regexp used by | 1383 ;; If you add charsets here, be sure to modify the regexp used by |
1419 ;; ctext-pre-write-conversion to look up non-standard charsets. | 1384 ;; ctext-pre-write-conversion to look up non-standard charsets. |
1420 (defvar non-standard-designations-alist | 1385 (defvar ctext-non-standard-designations-alist |
1421 '(("$(0" . (big5 "big5-0" 2)) | 1386 '(("$(0" . (big5 "big5-0" 2)) |
1422 ("$(1" . (big5 "big5-0" 2)) | 1387 ("$(1" . (big5 "big5-0" 2)) |
1423 ;; The following are actually standard; generating extended | 1388 ;; The following are actually standard; generating extended |
1424 ;; segments for them is wrong and screws e.g. Latin-9 users. | 1389 ;; segments for them is wrong and screws e.g. Latin-9 users. |
1425 ;; 8859-{10,13,16} aren't Emacs charsets anyhow. -- fx | 1390 ;; 8859-{10,13,16} aren't Emacs charsets anyhow. -- fx |
1447 | 1412 |
1448 (defun ctext-pre-write-conversion (from to) | 1413 (defun ctext-pre-write-conversion (from to) |
1449 "Encode characters between FROM and TO as Compound Text w/Extended Segments. | 1414 "Encode characters between FROM and TO as Compound Text w/Extended Segments. |
1450 | 1415 |
1451 If FROM is a string, or if the current buffer is not the one set up for us | 1416 If FROM is a string, or if the current buffer is not the one set up for us |
1452 by run_pre_post_conversion_on_str, generate a new temp buffer, insert the | 1417 by encode-coding-string, generate a new temp buffer, insert the |
1453 text, and convert it in the temporary buffer. Otherwise, convert in-place." | 1418 text, and convert it in the temporary buffer. Otherwise, convert in-place." |
1454 (cond ((and (string= (buffer-name) " *code-converting-work*") | |
1455 (not (stringp from))) | |
1456 ; Minimize consing due to subsequent insertions and deletions. | |
1457 (buffer-disable-undo) | |
1458 (narrow-to-region from to)) | |
1459 (t | |
1460 (let ((buf (current-buffer))) | |
1461 (set-buffer (generate-new-buffer " *temp")) | |
1462 (buffer-disable-undo) | |
1463 (if (stringp from) | |
1464 (insert from) | |
1465 (insert-buffer-substring buf from to)) | |
1466 (setq from (point-min) to (point-max))))) | |
1467 (encode-coding-region from to 'ctext-no-compositions) | |
1468 ;; Replace ISO-2022 charset designations with extended segments, for | |
1469 ;; those charsets that are not part of the official X registry. | |
1470 (save-match-data | 1419 (save-match-data |
1471 (goto-char (point-min)) | 1420 ;; Setup a working buffer if necessary. |
1472 (let ((newpt (make-marker)) | 1421 (cond ((stringp from) |
1473 (case-fold-search nil) | 1422 (let ((buf (current-buffer))) |
1474 pt desig encode-info encoding chset noctets textlen) | 1423 (set-buffer (generate-new-buffer " *temp")) |
1475 (set-buffer-multibyte nil) | 1424 (set-buffer-multibyte (multibyte-string-p from)) |
1476 ;; The regexp below finds the leading sequences for big5. | 1425 (insert from))) |
1426 ((not (string= (buffer-name) " *code-converting-work*")) | |
1427 (let ((buf (current-buffer)) | |
1428 (multibyte enable-multibyte-characters)) | |
1429 (set-buffer (generate-new-buffer " *temp")) | |
1430 (set-buffer-multibyte multibyte) | |
1431 (insert-buffer-substring buf from to)))) | |
1432 | |
1433 ;; Now we can encode the whole buffer. | |
1434 (let ((case-fold-search nil) | |
1435 last-coding-system-used | |
1436 pos posend desig encode-info encoding chset noctets textlen) | |
1437 (goto-char (point-min)) | |
1438 ;; At first encode the whole buffer. | |
1439 (encode-coding-region (point-min) (point-max) 'ctext-no-compositions) | |
1440 ;; Then replace ISO-2022 charset designations with extended | |
1441 ;; segments, for those charsets that are not part of the | |
1442 ;; official X registry. The regexp below finds the leading | |
1443 ;; sequences for big5. | |
1477 (while (re-search-forward "\e\\(\$([01]\\)" nil 'move) | 1444 (while (re-search-forward "\e\\(\$([01]\\)" nil 'move) |
1478 (setq desig (match-string 1) | 1445 (setq pos (match-beginning 0) |
1479 pt (point-marker) | 1446 posend (point) |
1480 encode-info (cdr (assoc desig non-standard-designations-alist)) | 1447 desig (match-string 1) |
1448 encode-info (cdr (assoc desig | |
1449 ctext-non-standard-designations-alist)) | |
1481 encoding (car encode-info) | 1450 encoding (car encode-info) |
1482 chset (cadr encode-info) | 1451 chset (cadr encode-info) |
1483 noctets (car (cddr encode-info))) | 1452 noctets (car (cddr encode-info))) |
1484 (skip-chars-forward "^\e") | 1453 (skip-chars-forward "^\e") |
1485 (set-marker newpt (point)) | |
1486 (cond | 1454 (cond |
1487 ((eq encoding t) ; only the leading sequence needs to be changed | 1455 ((eq encoding t) ; only the leading sequence needs to be changed |
1488 (setq textlen (+ (- newpt pt) (length chset) 1)) | 1456 (setq textlen (+ (- (point) posend) (length chset) 1)) |
1489 ;; Generate the ICCCM control sequence for an extended segment. | 1457 ;; Generate the control sequence for an extended segment. |
1490 (replace-match (format "\e%%/%d%c%c%s" | 1458 (replace-match (format "\e%%/%d%c%c%s" |
1491 noctets | 1459 noctets |
1492 (+ (/ textlen 128) 128) | 1460 (+ (/ textlen 128) 128) |
1493 (+ (% textlen 128) 128) | 1461 (+ (% textlen 128) 128) |
1494 chset) | 1462 chset) |
1495 t t)) | 1463 t t)) |
1496 ((coding-system-p encoding) ; need to recode the entire segment... | 1464 ((coding-system-p encoding) ; need to recode the entire segment... |
1497 (set-marker pt (match-beginning 0)) | 1465 (decode-coding-region pos (point) 'ctext-no-compositions) |
1498 (decode-coding-region pt newpt 'ctext-no-compositions) | 1466 (encode-coding-region pos (point) encoding) |
1499 (set-buffer-multibyte t) | |
1500 (encode-coding-region pt newpt encoding) | |
1501 (set-buffer-multibyte nil) | 1467 (set-buffer-multibyte nil) |
1502 (setq textlen (+ (- newpt pt) (length chset) 1)) | 1468 (setq textlen (+ (- (point) pos) (length chset) 1)) |
1503 (goto-char pt) | 1469 (save-excursion |
1504 (insert (format "\e%%/%d%c%c%s" | 1470 (goto-char pos) |
1505 noctets | 1471 (insert (format "\e%%/%d%c%c%s" |
1506 (+ (/ textlen 128) 128) | 1472 noctets |
1507 (+ (% textlen 128) 128) | 1473 (+ (/ textlen 128) 128) |
1508 chset)))) | 1474 (+ (% textlen 128) 128) |
1509 (goto-char newpt)))) | 1475 chset)))))) |
1510 (set-buffer-multibyte t) | 1476 (goto-char (point-min)))) |
1511 ;; Must return nil, as build_annotations_2 expects that. | 1477 ;; Must return nil, as build_annotations_2 expects that. |
1512 nil) | 1478 nil) |
1513 | 1479 |
1514 ;;; FILE I/O | 1480 ;;; FILE I/O |
1515 | 1481 |