comparison lisp/international/mule.el @ 51313:36fa2cf14d0c

(ctext-non-standard-encodings-alist): Renamed from non-standard-icccm-encodings-alist. (ctext-non-standard-encodings-regexp): New variable (ctext-post-read-conversion): Full rewrite. (ctext-non-standard-designations-alist): Renamed from non-standard-designations-alist. (ctext-pre-write-conversion): Full rewrite.
author Kenichi Handa <handa@m17n.org>
date Thu, 29 May 2003 01:28:24 +0000
parents e5124f8f4dc9
children 3a4379245dd8
comparison
equal deleted inserted replaced
51312:b2f981020fdd 51313:36fa2cf14d0c
1314 (setq coding-category-list (append arg current-list)) 1314 (setq coding-category-list (append arg current-list))
1315 (set-coding-priority-internal))) 1315 (set-coding-priority-internal)))
1316 1316
1317 ;;; X selections 1317 ;;; X selections
1318 1318
1319 (defvar non-standard-icccm-encodings-alist 1319 (defvar ctext-non-standard-encodings-alist
1320 '(("ISO8859-15" . latin-iso8859-15) 1320 '(("ISO8859-15" . latin-iso8859-15)
1321 ("ISO8859-14" . latin-iso8859-14) 1321 ("ISO8859-14" . latin-iso8859-14)
1322 ("KOI8-R" . koi8-r) 1322 ("KOI8-R" . koi8-r)
1323 ("BIG5-0" . big5)) 1323 ("BIG5-0" . big5))
1324 "Alist of font charset names defined by XLFD. 1324 "Alist of non-standard encoding names vs Emacs coding systems.
1325 The cdr of each element is the corresponding Emacs charset or coding system.") 1325 This alist is used to decode an extened segment of a compound text.")
1326
1327 (defvar ctext-non-standard-encodings-regexp
1328 (string-to-multibyte
1329 (concat
1330 ;; For non-standard encodings.
1331 "\\(\e%/[0-4][\200-\377][\200-\377]\\([^\002]+\\)\002\\)"
1332 "\\|"
1333 ;; For UTF-8 encoding.
1334 "\\(\e%G[^\e]*\e%@\\)")))
1326 1335
1327 ;; Functions to support "Non-Standard Character Set Encodings" defined 1336 ;; Functions to support "Non-Standard Character Set Encodings" defined
1328 ;; by the COMPOUND-TEXT spec. 1337 ;; by the COMPOUND-TEXT spec.
1329 ;; We support that by converting the leading sequence of the 1338 ;; We support that by decoding the whole data by `ctext' which just
1330 ;; ``extended segment'' to the corresponding ISO-2022 sequences (if 1339 ;; pertains byte sequences belonging to ``extended segment'', then
1331 ;; the leading sequence names an Emacs charset), or decode the segment 1340 ;; decoding those byte sequences one by one in Lisp.
1332 ;; (if it names a coding system). Encoding does the reverse.
1333 ;; This function also supports "The UTF-8 encoding" described in the 1341 ;; This function also supports "The UTF-8 encoding" described in the
1334 ;; section 7 of the documentation fo COMPOUND-TEXT distributed with 1342 ;; section 7 of the documentation fo COMPOUND-TEXT distributed with
1335 ;; XFree86. 1343 ;; XFree86.
1336 1344
1337 (defun ctext-post-read-conversion (len) 1345 (defun ctext-post-read-conversion (len)
1338 "Decode LEN characters encoded as Compound Text with Extended Segments." 1346 "Decode LEN characters encoded as Compound Text with Extended Segments."
1339 (buffer-disable-undo) ; minimize consing due to insertions and deletions
1340 (narrow-to-region (point) (+ (point) len))
1341 (save-match-data 1347 (save-match-data
1342 (let ((pt (point-marker)) 1348 (save-restriction
1343 (oldpt (point-marker)) 1349 (let ((case-fold-search nil)
1344 (newpt (make-marker)) 1350 (in-workbuf (string= (buffer-name) " *code-converting-work*"))
1345 (modified-p (buffer-modified-p)) 1351 last-coding-system-used
1346 (case-fold-search nil) 1352 pos bytes)
1347 ;; We need multibyte conversion of "TO" type because the 1353 (or in-workbuf
1348 ;; buffer may be multibyte, and, in that case, the pattern 1354 (narrow-to-region (point) (+ (point) len)))
1349 ;; must contain eight-bit-control/graphic characters. 1355 (decode-coding-region (point-min) (point-max) 'ctext)
1350 (pattern (string-to-multibyte "\\(\e\\)%/[0-4]\\([\200-\377][\200-\377]\\)\\([^\002]+\\)\002\\|\e%G[^\e]+\e%@")) 1356 (if in-workbuf
1351 last-coding-system-used 1357 (set-buffer-multibyte t))
1352 encoding textlen chset) 1358 (while (re-search-forward ctext-non-standard-encodings-regexp
1353 (while (re-search-forward pattern nil 'move) 1359 nil 'move)
1354 (set-marker newpt (point)) 1360 (setq pos (match-beginning 0))
1355 (set-marker pt (match-beginning 0)) 1361 (if (match-beginning 1)
1356 (if (= (preceding-char) ?@) 1362 ;; ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
1357 ;; We found embedded utf-8 sequence. 1363 (let* ((M (char-after (+ pos 4)))
1358 (progn 1364 (L (char-after (+ pos 5)))
1359 (delete-char -3) ; delete ESC % @ at the tail 1365 (encoding (match-string 2))
1360 (goto-char pt) 1366 (coding (or (cdr (assoc-ignore-case
1361 (delete-char 3) ; delete ESC % G at the head 1367 encoding
1362 (if (> pt oldpt) 1368 ctext-non-standard-encodings-alist))
1363 (decode-coding-region oldpt pt 'ctext-no-compositions)) 1369 (coding-system-p
1364 (decode-coding-region pt newpt 'mule-utf-8) 1370 (intern (downcase encoding))))))
1365 (goto-char newpt) 1371 (setq bytes (- (+ (* (- M 128) 128) (- L 128))
1366 (set-marker oldpt newpt)) 1372 (- (point) (+ pos 6))))
1367 (setq encoding (match-string 3)) 1373 (when coding
1368 (setq textlen (- (+ (* (- (aref (match-string 2) 0) 128) 128) 1374 (delete-region pos (point))
1369 (- (aref (match-string 2) 1) 128)) 1375 (forward-char bytes)
1370 (1+ (length encoding)))) 1376 (decode-coding-region (- (point) bytes) (point) coding)))
1371 (setq 1377 ;; ESC % G --UTF-8-BYTES-- ESC % @
1372 chset (cdr (assoc-ignore-case encoding 1378 (setq bytes (- (point) pos))
1373 non-standard-icccm-encodings-alist))) 1379 (decode-coding-region (- (point) bytes) (point) 'utf-8))))
1374 (cond ((null chset) 1380 (goto-char (point-min))
1375 ;; This charset is not supported--leave this extended 1381 (- (point-max) (point)))))
1376 ;; segment unaltered and skip over it.
1377 (goto-char (+ (point) textlen)))
1378 ((charsetp chset)
1379 ;; If it's a charset, replace the leading escape sequence
1380 ;; with a standard ISO-2022 sequence. We will decode all
1381 ;; such segments later, in one go, when we exit the loop
1382 ;; or find an extended segment that names a coding
1383 ;; system, not a charset.
1384 (replace-match
1385 (concat "\\1"
1386 (if (= 0 (charset-iso-graphic-plane chset))
1387 ;; GL charsets
1388 (if (= 1 (charset-dimension chset)) "(" "$(")
1389 ;; GR charsets
1390 (if (= 96 (charset-chars chset))
1391 "-"
1392 (if (= 1 (charset-dimension chset)) ")" "$)")))
1393 (string (charset-iso-final-char chset)))
1394 t)
1395 (goto-char (+ (point) textlen)))
1396 ((coding-system-p chset)
1397 ;; If it's a coding system, we need to decode the segment
1398 ;; right away. But first, decode what we've skipped
1399 ;; across until now.
1400 (when (> pt oldpt)
1401 (decode-coding-region oldpt pt 'ctext-no-compositions))
1402 (delete-region pt newpt)
1403 (set-marker newpt (+ newpt textlen))
1404 (decode-coding-region pt newpt chset)
1405 (goto-char newpt)
1406 (set-marker oldpt newpt)))))
1407 ;; Decode what's left.
1408 (when (> (point) oldpt)
1409 (decode-coding-region oldpt (point) 'ctext-no-compositions))
1410 ;; This buffer started as unibyte, because the string we get from
1411 ;; the X selection is a unibyte string. We must now make it
1412 ;; multibyte, so that the decoded text is inserted as multibyte
1413 ;; into its buffer.
1414 (set-buffer-multibyte t)
1415 (set-buffer-modified-p modified-p)
1416 (- (point-max) (point-min)))))
1417 1382
1418 ;; If you add charsets here, be sure to modify the regexp used by 1383 ;; If you add charsets here, be sure to modify the regexp used by
1419 ;; ctext-pre-write-conversion to look up non-standard charsets. 1384 ;; ctext-pre-write-conversion to look up non-standard charsets.
1420 (defvar non-standard-designations-alist 1385 (defvar ctext-non-standard-designations-alist
1421 '(("$(0" . (big5 "big5-0" 2)) 1386 '(("$(0" . (big5 "big5-0" 2))
1422 ("$(1" . (big5 "big5-0" 2)) 1387 ("$(1" . (big5 "big5-0" 2))
1423 ;; The following are actually standard; generating extended 1388 ;; The following are actually standard; generating extended
1424 ;; segments for them is wrong and screws e.g. Latin-9 users. 1389 ;; segments for them is wrong and screws e.g. Latin-9 users.
1425 ;; 8859-{10,13,16} aren't Emacs charsets anyhow. -- fx 1390 ;; 8859-{10,13,16} aren't Emacs charsets anyhow. -- fx
1447 1412
1448 (defun ctext-pre-write-conversion (from to) 1413 (defun ctext-pre-write-conversion (from to)
1449 "Encode characters between FROM and TO as Compound Text w/Extended Segments. 1414 "Encode characters between FROM and TO as Compound Text w/Extended Segments.
1450 1415
1451 If FROM is a string, or if the current buffer is not the one set up for us 1416 If FROM is a string, or if the current buffer is not the one set up for us
1452 by run_pre_post_conversion_on_str, generate a new temp buffer, insert the 1417 by encode-coding-string, generate a new temp buffer, insert the
1453 text, and convert it in the temporary buffer. Otherwise, convert in-place." 1418 text, and convert it in the temporary buffer. Otherwise, convert in-place."
1454 (cond ((and (string= (buffer-name) " *code-converting-work*")
1455 (not (stringp from)))
1456 ; Minimize consing due to subsequent insertions and deletions.
1457 (buffer-disable-undo)
1458 (narrow-to-region from to))
1459 (t
1460 (let ((buf (current-buffer)))
1461 (set-buffer (generate-new-buffer " *temp"))
1462 (buffer-disable-undo)
1463 (if (stringp from)
1464 (insert from)
1465 (insert-buffer-substring buf from to))
1466 (setq from (point-min) to (point-max)))))
1467 (encode-coding-region from to 'ctext-no-compositions)
1468 ;; Replace ISO-2022 charset designations with extended segments, for
1469 ;; those charsets that are not part of the official X registry.
1470 (save-match-data 1419 (save-match-data
1471 (goto-char (point-min)) 1420 ;; Setup a working buffer if necessary.
1472 (let ((newpt (make-marker)) 1421 (cond ((stringp from)
1473 (case-fold-search nil) 1422 (let ((buf (current-buffer)))
1474 pt desig encode-info encoding chset noctets textlen) 1423 (set-buffer (generate-new-buffer " *temp"))
1475 (set-buffer-multibyte nil) 1424 (set-buffer-multibyte (multibyte-string-p from))
1476 ;; The regexp below finds the leading sequences for big5. 1425 (insert from)))
1426 ((not (string= (buffer-name) " *code-converting-work*"))
1427 (let ((buf (current-buffer))
1428 (multibyte enable-multibyte-characters))
1429 (set-buffer (generate-new-buffer " *temp"))
1430 (set-buffer-multibyte multibyte)
1431 (insert-buffer-substring buf from to))))
1432
1433 ;; Now we can encode the whole buffer.
1434 (let ((case-fold-search nil)
1435 last-coding-system-used
1436 pos posend desig encode-info encoding chset noctets textlen)
1437 (goto-char (point-min))
1438 ;; At first encode the whole buffer.
1439 (encode-coding-region (point-min) (point-max) 'ctext-no-compositions)
1440 ;; Then replace ISO-2022 charset designations with extended
1441 ;; segments, for those charsets that are not part of the
1442 ;; official X registry. The regexp below finds the leading
1443 ;; sequences for big5.
1477 (while (re-search-forward "\e\\(\$([01]\\)" nil 'move) 1444 (while (re-search-forward "\e\\(\$([01]\\)" nil 'move)
1478 (setq desig (match-string 1) 1445 (setq pos (match-beginning 0)
1479 pt (point-marker) 1446 posend (point)
1480 encode-info (cdr (assoc desig non-standard-designations-alist)) 1447 desig (match-string 1)
1448 encode-info (cdr (assoc desig
1449 ctext-non-standard-designations-alist))
1481 encoding (car encode-info) 1450 encoding (car encode-info)
1482 chset (cadr encode-info) 1451 chset (cadr encode-info)
1483 noctets (car (cddr encode-info))) 1452 noctets (car (cddr encode-info)))
1484 (skip-chars-forward "^\e") 1453 (skip-chars-forward "^\e")
1485 (set-marker newpt (point))
1486 (cond 1454 (cond
1487 ((eq encoding t) ; only the leading sequence needs to be changed 1455 ((eq encoding t) ; only the leading sequence needs to be changed
1488 (setq textlen (+ (- newpt pt) (length chset) 1)) 1456 (setq textlen (+ (- (point) posend) (length chset) 1))
1489 ;; Generate the ICCCM control sequence for an extended segment. 1457 ;; Generate the control sequence for an extended segment.
1490 (replace-match (format "\e%%/%d%c%c%s" 1458 (replace-match (format "\e%%/%d%c%c%s"
1491 noctets 1459 noctets
1492 (+ (/ textlen 128) 128) 1460 (+ (/ textlen 128) 128)
1493 (+ (% textlen 128) 128) 1461 (+ (% textlen 128) 128)
1494 chset) 1462 chset)
1495 t t)) 1463 t t))
1496 ((coding-system-p encoding) ; need to recode the entire segment... 1464 ((coding-system-p encoding) ; need to recode the entire segment...
1497 (set-marker pt (match-beginning 0)) 1465 (decode-coding-region pos (point) 'ctext-no-compositions)
1498 (decode-coding-region pt newpt 'ctext-no-compositions) 1466 (encode-coding-region pos (point) encoding)
1499 (set-buffer-multibyte t)
1500 (encode-coding-region pt newpt encoding)
1501 (set-buffer-multibyte nil) 1467 (set-buffer-multibyte nil)
1502 (setq textlen (+ (- newpt pt) (length chset) 1)) 1468 (setq textlen (+ (- (point) pos) (length chset) 1))
1503 (goto-char pt) 1469 (save-excursion
1504 (insert (format "\e%%/%d%c%c%s" 1470 (goto-char pos)
1505 noctets 1471 (insert (format "\e%%/%d%c%c%s"
1506 (+ (/ textlen 128) 128) 1472 noctets
1507 (+ (% textlen 128) 128) 1473 (+ (/ textlen 128) 128)
1508 chset)))) 1474 (+ (% textlen 128) 128)
1509 (goto-char newpt)))) 1475 chset))))))
1510 (set-buffer-multibyte t) 1476 (goto-char (point-min))))
1511 ;; Must return nil, as build_annotations_2 expects that. 1477 ;; Must return nil, as build_annotations_2 expects that.
1512 nil) 1478 nil)
1513 1479
1514 ;;; FILE I/O 1480 ;;; FILE I/O
1515 1481