Mercurial > emacs
comparison lisp/international/utf-8.el @ 41873:16ee1ffbef65
(ucs-mule-to-mule-unicode): New
translation table.
(ccl-encode-mule-utf-8): Use it.
(utf-8-untranslated-to-ucs, utf-8-help-echo, utf-8-compose)
(utf-8-post-read-conversion, utf-8-pre-write-conversion): New
function.
(utf-8-subst-table): New variable.
(utf-8-compose-scripts): New option.
(mule-utf-8): Update safe-charsets, pre-write and post-read
conversion.
author | Dave Love <fx@gnu.org> |
---|---|
date | Fri, 07 Dec 2001 14:26:02 +0000 |
parents | b174db545cfd |
children | dc9ed9d094c3 |
comparison
equal
deleted
inserted
replaced
41872:2a4f1d831675 | 41873:16ee1ffbef65 |
---|---|
1 ;;; utf-8.el --- limited UTF-8 decoding/encoding support | 1 ;;; utf-8.el --- limited UTF-8 decoding/encoding support -*- coding: iso-2022-7bit-*- |
2 | 2 |
3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. | 3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. |
4 ;; Licensed to the Free Software Foundation. | 4 ;; Licensed to the Free Software Foundation. |
5 ;; Copyright (C) 2001 Free Software Foundation, Inc. | 5 ;; Copyright (C) 2001 Free Software Foundation, Inc. |
6 | 6 |
24 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, | 24 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
25 ;; Boston, MA 02111-1307, USA. | 25 ;; Boston, MA 02111-1307, USA. |
26 | 26 |
27 ;;; Commentary: | 27 ;;; Commentary: |
28 | 28 |
29 ;; The coding-system `mule-utf-8' supports encoding/decoding of the | 29 ;; The coding-system `mule-utf-8' basically supports encoding/decoding |
30 ;; following character sets to and from UTF-8: | 30 ;; of the following character sets to and from UTF-8: |
31 ;; | 31 ;; |
32 ;; ascii | 32 ;; ascii |
33 ;; eight-bit-control | 33 ;; eight-bit-control |
34 ;; latin-iso8859-1 | 34 ;; latin-iso8859-1 |
35 ;; mule-unicode-0100-24ff | 35 ;; mule-unicode-0100-24ff |
36 ;; mule-unicode-2500-33ff | 36 ;; mule-unicode-2500-33ff |
37 ;; mule-unicode-e000-ffff | 37 ;; mule-unicode-e000-ffff |
38 ;; | 38 ;; |
39 ;; Characters of other character sets cannot be encoded with | |
40 ;; mule-utf-8. Note that the mule-unicode charsets currently lack | |
41 ;; case and syntax information, so things like `downcase' will only | |
42 ;; work for characters from ASCII and Latin-1. | |
43 ;; | |
44 ;; On decoding, Unicode characters that do not fit into the above | 39 ;; On decoding, Unicode characters that do not fit into the above |
45 ;; character sets are handled as `eight-bit-control' or | 40 ;; character sets are handled as `eight-bit-control' or |
46 ;; `eight-bit-graphic' characters to retain the information about the | 41 ;; `eight-bit-graphic' characters to retain the information about the |
47 ;; original byte sequence. | 42 ;; original byte sequence. |
43 ;; | |
44 ;; Characters from other character sets can be encoded with | |
45 ;; mule-utf-8 by populating the table `ucs-mule-to-mule-unicode' and | |
46 ;; registering the translation with `register-char-codings'. | |
48 | 47 |
49 ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is: | 48 ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is: |
50 | 49 |
51 ;; scalar | utf-8 | 50 ;; scalar | utf-8 |
52 ;; value | 1st byte | 2nd byte | 3rd byte | 51 ;; value | 1st byte | 2nd byte | 3rd byte |
55 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx | | 54 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx | |
56 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx | 55 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx |
57 | 56 |
58 ;;; Code: | 57 ;;; Code: |
59 | 58 |
59 (defvar ucs-mule-to-mule-unicode (make-translation-table) | |
60 "Translation table for encoding to `mule-utf-8'.") | |
61 ;; Could have been done by ucs-tables loaded before. | |
62 (unless (get 'ucs-mule-to-mule-unicode 'translation-table) | |
63 (define-translation-table 'ucs-mule-to-mule-unicode ucs-mule-to-mule-unicode)) | |
60 (define-ccl-program ccl-decode-mule-utf-8 | 64 (define-ccl-program ccl-decode-mule-utf-8 |
61 ;; | 65 ;; |
62 ;; charset | bytes in utf-8 | bytes in emacs | 66 ;; charset | bytes in utf-8 | bytes in emacs |
63 ;; -----------------------+----------------+--------------- | 67 ;; -----------------------+----------------+--------------- |
64 ;; ascii | 1 | 1 | 68 ;; ascii | 1 | 1 |
65 ;; -----------------------+----------------+--------------- | 69 ;; -----------------------+----------------+--------------- |
66 ;; eight-bit-control | 2 | 2 | 70 ;; eight-bit-control | 2 | 2 |
71 ;; eight-bit-graphic | 2 | 1 | |
67 ;; latin-iso8859-1 | 2 | 2 | 72 ;; latin-iso8859-1 | 2 | 2 |
68 ;; -----------------------+----------------+--------------- | 73 ;; -----------------------+----------------+--------------- |
69 ;; mule-unicode-0100-24ff | 2 | 4 | 74 ;; mule-unicode-0100-24ff | 2 | 4 |
70 ;; (< 0800) | | | 75 ;; (< 0800) | | |
71 ;; -----------------------+----------------+--------------- | 76 ;; -----------------------+----------------+--------------- |
226 `(1 | 231 `(1 |
227 ((r5 = -1) | 232 ((r5 = -1) |
228 (loop | 233 (loop |
229 (if (r5 < 0) | 234 (if (r5 < 0) |
230 ((r1 = -1) | 235 ((r1 = -1) |
231 (read-multibyte-character r0 r1)) | 236 (read-multibyte-character r0 r1) |
237 (translate-character ucs-mule-to-mule-unicode r0 r1)) | |
232 (;; We have already done read-multibyte-character. | 238 (;; We have already done read-multibyte-character. |
233 (r0 = r5) | 239 (r0 = r5) |
234 (r1 = r6) | 240 (r1 = r6) |
235 (r5 = -1))) | 241 (r5 = -1))) |
236 | 242 |
338 "CCL program to encode into UTF-8. | 344 "CCL program to encode into UTF-8. |
339 Only characters from the charsets ascii, eight-bit-control, | 345 Only characters from the charsets ascii, eight-bit-control, |
340 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized. | 346 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized. |
341 Others are encoded as U+FFFD.") | 347 Others are encoded as U+FFFD.") |
342 | 348 |
349 ;; Dummy definition so that the CCL can be checked correctly; the | |
350 ;; actual data are loaded on demand. | |
351 (unless (boundp 'ucs-mule-8859-to-mule-unicode) ; don't zap it | |
352 (define-translation-table 'ucs-mule-8859-to-mule-unicode)) | |
353 | |
354 (defsubst utf-8-untranslated-to-ucs () | |
355 (let ((b1 (char-after)) | |
356 (b2 (char-after (1+ (point)))) | |
357 (b3 (char-after (+ 2 (point)))) | |
358 (b4 (char-after (+ 4 (point))))) | |
359 (if (and b1 b2 b3) | |
360 (cond ((< b1 ?\xf0) | |
361 (setq b2 (lsh (logand b2 ?\x3f) 6)) | |
362 (setq b3 (logand b3 ?\x3f)) | |
363 (logior b3 (logior b2 (lsh (logand b1 ?\x0f) 12)))) | |
364 (b4 | |
365 (setq b2 (lsh (logand b2 ?\x3f) 12)) | |
366 (setq b3 (lsh (logand b3 ?\x3f) 6)) | |
367 (setq b4 (logand b4 ?\x3f)) | |
368 (logior b4 (logior b3 (logior b2 (lsh (logand b1 ?\x07) | |
369 18))))))))) | |
370 | |
371 (defun utf-8-help-echo (window object position) | |
372 (format "Untranslated Unicode U+%04X" | |
373 (get-char-property position 'untranslated-utf-8 object))) | |
374 | |
375 (defvar utf-8-subst-table nil | |
376 "If non-nil, a hash table mapping `untranslatable utf-8' to Emacs characters.") | |
377 | |
378 ;; We compose the untranslatable sequences into a single character. | |
379 ;; This is infelicitous for editing, because there's currently no | |
380 ;; mechanism for treating compositions as atomic, but is OK for | |
381 ;; display. We try to compose an appropriate character from a hash | |
382 ;; table of CJK characters to display correctly. Otherwise we use | |
383 ;; U+FFFD. What we really should have is hash table lookup from CCL | |
384 ;; so that we could do this properly. This function GCs too much. | |
385 (defsubst utf-8-compose () | |
386 "Put a suitable composition on an untranslatable sequence. | |
387 Return the sequence's length." | |
388 (let* ((u (utf-8-untranslated-to-ucs)) | |
389 (l (and u (if (>= u ?\x10000) | |
390 4 | |
391 3))) | |
392 (subst (and utf-8-subst-table (gethash u utf-8-subst-table)))) | |
393 (when u | |
394 (put-text-property (point) (min (point-max) (+ l (point))) | |
395 'untranslated-utf-8 u) | |
396 (unless subst | |
397 (put-text-property (point) (min (point-max) (+ l (point))) | |
398 'help-echo 'utf-8-help-echo) | |
399 (setq subst ?$,3u=(B)) | |
400 (compose-region (point) (+ l (point)) subst) | |
401 l))) | |
402 | |
403 (defcustom utf-8-compose-scripts nil | |
404 "*Non-nil means compose various scipts on decoding utf-8 text." | |
405 :group 'mule | |
406 :type 'boolean) ; omitted in Emacs 21.1 | |
407 | |
408 (defun utf-8-post-read-conversion (length) | |
409 "Compose untranslated utf-8 sequences into single characters. | |
410 Also compose particular scripts if `utf-8-compose-scripts' is non-nil." | |
411 (save-excursion | |
412 ;; Can't do eval-when-compile to insert a multibyte constant | |
413 ;; version of the string in the loop, since it's always loaded as | |
414 ;; unibyte from a byte-compiled file. | |
415 (let ((range (string-as-multibyte "^\341-\377"))) | |
416 (while (and (skip-chars-forward | |
417 range) | |
418 (not (eobp))) | |
419 (forward-char (utf-8-compose))))) | |
420 ;; Fixme: Takahashi-san implies it may not work this easily -- needs | |
421 ;; checking with him. | |
422 (when (and utf-8-compose-scripts (> length 1)) | |
423 ;; These currently have definitions which cover the relevant | |
424 ;; Unicodes. We could avoid loading thai-util &c by checking | |
425 ;; whether the region contains any characters with the appropriate | |
426 ;; categories. There aren't yet Unicode-based rules for Tibetan. | |
427 (save-excursion (setq length (diacritic-post-read-conversion length))) | |
428 (save-excursion (setq length (thai-post-read-conversion length))) | |
429 (save-excursion (setq length (lao-post-read-conversion length))) | |
430 (save-excursion (setq length (devanagari-post-read-conversion length)))) | |
431 length) | |
432 | |
433 (defun utf-8-pre-write-conversion (beg end) | |
434 "Semi-dummy pre-write function effectively to autoload ucs-tables." | |
435 ;; Ensure translation table is loaded. | |
436 (require 'ucs-tables) | |
437 ;; Don't do this again. | |
438 (coding-system-put 'mule-utf-8 'pre-write-conversion nil) | |
439 nil) | |
440 | |
343 (make-coding-system | 441 (make-coding-system |
344 'mule-utf-8 4 ?u | 442 'mule-utf-8 4 ?u |
345 "UTF-8 encoding for Emacs-supported Unicode characters. | 443 "UTF-8 encoding for Emacs-supported Unicode characters. |
346 The supported Emacs character sets are: | 444 The supported Emacs character sets are the following, plus others |
347 ascii | 445 which may be included in the translation table |
348 eight-bit-control | 446 `ucs-mule-to-mule-unicode': |
349 eight-bit-graphic | 447 ascii |
350 latin-iso8859-1 | 448 eight-bit-control |
351 mule-unicode-0100-24ff | 449 eight-bit-graphic |
352 mule-unicode-2500-33ff | 450 latin-iso8859-1 |
353 mule-unicode-e000-ffff | 451 latin-iso8859-2 |
452 latin-iso8859-3 | |
453 latin-iso8859-4 | |
454 cyrillic-iso8859-5 | |
455 greek-iso8859-7 | |
456 hebrew-iso8859-8 | |
457 latin-iso8859-9 | |
458 latin-iso8859-14 | |
459 latin-iso8859-15 | |
460 mule-unicode-0100-24ff | |
461 mule-unicode-2500-33ff | |
462 mule-unicode-e000-ffff | |
354 | 463 |
355 Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF | 464 Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF |
356 are decoded into sequences of eight-bit-control and eight-bit-graphic | 465 are decoded into sequences of eight-bit-control and eight-bit-graphic |
357 characters to preserve their byte sequences. Emacs characters out of | 466 characters to preserve their byte sequences and composed to display as |
358 these ranges are encoded into U+FFFD. | 467 a single character. Emacs characters that can't be encoded to these |
359 | 468 ranges are encoded as U+FFFD." |
360 Note that, currently, characters in the mule-unicode charsets have no | |
361 syntax and case information. Thus, for instance, upper- and | |
362 lower-casing commands won't work with them." | |
363 | 469 |
364 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) | 470 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) |
365 '((safe-charsets | 471 '((safe-charsets |
366 ascii | 472 ascii |
367 eight-bit-control | 473 eight-bit-control |
368 eight-bit-graphic | 474 eight-bit-graphic |
369 latin-iso8859-1 | 475 latin-iso8859-1 |
476 latin-iso8859-15 | |
477 latin-iso8859-14 | |
478 latin-iso8859-9 | |
479 hebrew-iso8859-8 | |
480 greek-iso8859-7 | |
481 cyrillic-iso8859-5 | |
482 latin-iso8859-4 | |
483 latin-iso8859-3 | |
484 latin-iso8859-2 | |
485 vietnamese-viscii-lower | |
486 vietnamese-viscii-upper | |
487 thai-tis620 | |
488 ipa | |
489 ethiopic | |
490 indian-is13194 | |
491 katakana-jisx0201 | |
492 chinese-sisheng | |
493 lao | |
370 mule-unicode-0100-24ff | 494 mule-unicode-0100-24ff |
371 mule-unicode-2500-33ff | 495 mule-unicode-2500-33ff |
372 mule-unicode-e000-ffff) | 496 mule-unicode-e000-ffff) |
373 (mime-charset . utf-8) | 497 (mime-charset . utf-8) |
374 (coding-category . coding-category-utf-8) | 498 (coding-category . coding-category-utf-8) |
375 (valid-codes (0 . 255)))) | 499 (valid-codes (0 . 255)) |
500 (pre-write-conversion . utf-8-pre-write-conversion) | |
501 (post-read-conversion . utf-8-post-read-conversion))) | |
376 | 502 |
377 (define-coding-system-alias 'utf-8 'mule-utf-8) | 503 (define-coding-system-alias 'utf-8 'mule-utf-8) |
378 | 504 |
505 ;; I think this needs special private charsets defined for the | |
506 ;; untranslated sequences, if it's going to work well. | |
507 | |
508 ;;; (defun utf-8-compose-function (pos to pattern &optional string) | |
509 ;;; (let* ((prop (get-char-property pos 'composition string)) | |
510 ;;; (l (and prop (- (cadr prop) (car prop))))) | |
511 ;;; (cond ((and l (> l (- to pos))) | |
512 ;;; (delete-region pos to)) | |
513 ;;; ((and (> (char-after pos) 224) | |
514 ;;; (< (char-after pos) 256) | |
515 ;;; (save-restriction | |
516 ;;; (narrow-to-region pos to) | |
517 ;;; (utf-8-compose))) | |
518 ;;; t)))) | |
519 | |
520 ;;; (dotimes (i 96) | |
521 ;;; (aset composition-function-table | |
522 ;;; (+ 128 i) | |
523 ;;; `((,(string-as-multibyte "[\200-\237\240-\377]") | |
524 ;;; . utf-8-compose-function)))) | |
525 | |
379 ;;; utf-8.el ends here | 526 ;;; utf-8.el ends here |