comparison lisp/international/utf-8.el @ 41873:16ee1ffbef65

(ucs-mule-to-mule-unicode): New translation table. (ccl-encode-mule-utf-8): Use it. (utf-8-untranslated-to-ucs, utf-8-help-echo, utf-8-compose) (utf-8-post-read-conversion, utf-8-pre-write-conversion): New function. (utf-8-subst-table): New variable. (utf-8-compose-scripts): New option. (mule-utf-8): Update safe-charsets, pre-write and post-read conversion.
author Dave Love <fx@gnu.org>
date Fri, 07 Dec 2001 14:26:02 +0000
parents b174db545cfd
children dc9ed9d094c3
comparison
equal deleted inserted replaced
41872:2a4f1d831675 41873:16ee1ffbef65
1 ;;; utf-8.el --- limited UTF-8 decoding/encoding support 1 ;;; utf-8.el --- limited UTF-8 decoding/encoding support -*- coding: iso-2022-7bit-*-
2 2
3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. 3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
4 ;; Licensed to the Free Software Foundation. 4 ;; Licensed to the Free Software Foundation.
5 ;; Copyright (C) 2001 Free Software Foundation, Inc. 5 ;; Copyright (C) 2001 Free Software Foundation, Inc.
6 6
24 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, 24 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
25 ;; Boston, MA 02111-1307, USA. 25 ;; Boston, MA 02111-1307, USA.
26 26
27 ;;; Commentary: 27 ;;; Commentary:
28 28
29 ;; The coding-system `mule-utf-8' supports encoding/decoding of the 29 ;; The coding-system `mule-utf-8' basically supports encoding/decoding
30 ;; following character sets to and from UTF-8: 30 ;; of the following character sets to and from UTF-8:
31 ;; 31 ;;
32 ;; ascii 32 ;; ascii
33 ;; eight-bit-control 33 ;; eight-bit-control
34 ;; latin-iso8859-1 34 ;; latin-iso8859-1
35 ;; mule-unicode-0100-24ff 35 ;; mule-unicode-0100-24ff
36 ;; mule-unicode-2500-33ff 36 ;; mule-unicode-2500-33ff
37 ;; mule-unicode-e000-ffff 37 ;; mule-unicode-e000-ffff
38 ;; 38 ;;
39 ;; Characters of other character sets cannot be encoded with
40 ;; mule-utf-8. Note that the mule-unicode charsets currently lack
41 ;; case and syntax information, so things like `downcase' will only
42 ;; work for characters from ASCII and Latin-1.
43 ;;
44 ;; On decoding, Unicode characters that do not fit into the above 39 ;; On decoding, Unicode characters that do not fit into the above
45 ;; character sets are handled as `eight-bit-control' or 40 ;; character sets are handled as `eight-bit-control' or
46 ;; `eight-bit-graphic' characters to retain the information about the 41 ;; `eight-bit-graphic' characters to retain the information about the
47 ;; original byte sequence. 42 ;; original byte sequence.
43 ;;
44 ;; Characters from other character sets can be encoded with
45 ;; mule-utf-8 by populating the table `ucs-mule-to-mule-unicode' and
46 ;; registering the translation with `register-char-codings'.
48 47
49 ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is: 48 ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is:
50 49
51 ;; scalar | utf-8 50 ;; scalar | utf-8
52 ;; value | 1st byte | 2nd byte | 3rd byte 51 ;; value | 1st byte | 2nd byte | 3rd byte
55 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx | 54 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx |
56 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx 55 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx
57 56
58 ;;; Code: 57 ;;; Code:
59 58
59 (defvar ucs-mule-to-mule-unicode (make-translation-table)
60 "Translation table for encoding to `mule-utf-8'.")
61 ;; Could have been done by ucs-tables loaded before.
62 (unless (get 'ucs-mule-to-mule-unicode 'translation-table)
63 (define-translation-table 'ucs-mule-to-mule-unicode ucs-mule-to-mule-unicode))
60 (define-ccl-program ccl-decode-mule-utf-8 64 (define-ccl-program ccl-decode-mule-utf-8
61 ;; 65 ;;
62 ;; charset | bytes in utf-8 | bytes in emacs 66 ;; charset | bytes in utf-8 | bytes in emacs
63 ;; -----------------------+----------------+--------------- 67 ;; -----------------------+----------------+---------------
64 ;; ascii | 1 | 1 68 ;; ascii | 1 | 1
65 ;; -----------------------+----------------+--------------- 69 ;; -----------------------+----------------+---------------
66 ;; eight-bit-control | 2 | 2 70 ;; eight-bit-control | 2 | 2
71 ;; eight-bit-graphic | 2 | 1
67 ;; latin-iso8859-1 | 2 | 2 72 ;; latin-iso8859-1 | 2 | 2
68 ;; -----------------------+----------------+--------------- 73 ;; -----------------------+----------------+---------------
69 ;; mule-unicode-0100-24ff | 2 | 4 74 ;; mule-unicode-0100-24ff | 2 | 4
70 ;; (< 0800) | | 75 ;; (< 0800) | |
71 ;; -----------------------+----------------+--------------- 76 ;; -----------------------+----------------+---------------
226 `(1 231 `(1
227 ((r5 = -1) 232 ((r5 = -1)
228 (loop 233 (loop
229 (if (r5 < 0) 234 (if (r5 < 0)
230 ((r1 = -1) 235 ((r1 = -1)
231 (read-multibyte-character r0 r1)) 236 (read-multibyte-character r0 r1)
237 (translate-character ucs-mule-to-mule-unicode r0 r1))
232 (;; We have already done read-multibyte-character. 238 (;; We have already done read-multibyte-character.
233 (r0 = r5) 239 (r0 = r5)
234 (r1 = r6) 240 (r1 = r6)
235 (r5 = -1))) 241 (r5 = -1)))
236 242
338 "CCL program to encode into UTF-8. 344 "CCL program to encode into UTF-8.
339 Only characters from the charsets ascii, eight-bit-control, 345 Only characters from the charsets ascii, eight-bit-control,
340 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized. 346 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized.
341 Others are encoded as U+FFFD.") 347 Others are encoded as U+FFFD.")
342 348
349 ;; Dummy definition so that the CCL can be checked correctly; the
350 ;; actual data are loaded on demand.
351 (unless (boundp 'ucs-mule-8859-to-mule-unicode) ; don't zap it
352 (define-translation-table 'ucs-mule-8859-to-mule-unicode))
353
354 (defsubst utf-8-untranslated-to-ucs ()
355 (let ((b1 (char-after))
356 (b2 (char-after (1+ (point))))
357 (b3 (char-after (+ 2 (point))))
358 (b4 (char-after (+ 4 (point)))))
359 (if (and b1 b2 b3)
360 (cond ((< b1 ?\xf0)
361 (setq b2 (lsh (logand b2 ?\x3f) 6))
362 (setq b3 (logand b3 ?\x3f))
363 (logior b3 (logior b2 (lsh (logand b1 ?\x0f) 12))))
364 (b4
365 (setq b2 (lsh (logand b2 ?\x3f) 12))
366 (setq b3 (lsh (logand b3 ?\x3f) 6))
367 (setq b4 (logand b4 ?\x3f))
368 (logior b4 (logior b3 (logior b2 (lsh (logand b1 ?\x07)
369 18)))))))))
370
371 (defun utf-8-help-echo (window object position)
372 (format "Untranslated Unicode U+%04X"
373 (get-char-property position 'untranslated-utf-8 object)))
374
375 (defvar utf-8-subst-table nil
376 "If non-nil, a hash table mapping `untranslatable utf-8' to Emacs characters.")
377
378 ;; We compose the untranslatable sequences into a single character.
379 ;; This is infelicitous for editing, because there's currently no
380 ;; mechanism for treating compositions as atomic, but is OK for
381 ;; display. We try to compose an appropriate character from a hash
382 ;; table of CJK characters to display correctly. Otherwise we use
383 ;; U+FFFD. What we really should have is hash table lookup from CCL
384 ;; so that we could do this properly. This function GCs too much.
385 (defsubst utf-8-compose ()
386 "Put a suitable composition on an untranslatable sequence.
387 Return the sequence's length."
388 (let* ((u (utf-8-untranslated-to-ucs))
389 (l (and u (if (>= u ?\x10000)
390 4
391 3)))
392 (subst (and utf-8-subst-table (gethash u utf-8-subst-table))))
393 (when u
394 (put-text-property (point) (min (point-max) (+ l (point)))
395 'untranslated-utf-8 u)
396 (unless subst
397 (put-text-property (point) (min (point-max) (+ l (point)))
398 'help-echo 'utf-8-help-echo)
399 (setq subst ?$,3u=(B))
400 (compose-region (point) (+ l (point)) subst)
401 l)))
402
403 (defcustom utf-8-compose-scripts nil
404 "*Non-nil means compose various scipts on decoding utf-8 text."
405 :group 'mule
406 :type 'boolean) ; omitted in Emacs 21.1
407
408 (defun utf-8-post-read-conversion (length)
409 "Compose untranslated utf-8 sequences into single characters.
410 Also compose particular scripts if `utf-8-compose-scripts' is non-nil."
411 (save-excursion
412 ;; Can't do eval-when-compile to insert a multibyte constant
413 ;; version of the string in the loop, since it's always loaded as
414 ;; unibyte from a byte-compiled file.
415 (let ((range (string-as-multibyte "^\341-\377")))
416 (while (and (skip-chars-forward
417 range)
418 (not (eobp)))
419 (forward-char (utf-8-compose)))))
420 ;; Fixme: Takahashi-san implies it may not work this easily -- needs
421 ;; checking with him.
422 (when (and utf-8-compose-scripts (> length 1))
423 ;; These currently have definitions which cover the relevant
424 ;; Unicodes. We could avoid loading thai-util &c by checking
425 ;; whether the region contains any characters with the appropriate
426 ;; categories. There aren't yet Unicode-based rules for Tibetan.
427 (save-excursion (setq length (diacritic-post-read-conversion length)))
428 (save-excursion (setq length (thai-post-read-conversion length)))
429 (save-excursion (setq length (lao-post-read-conversion length)))
430 (save-excursion (setq length (devanagari-post-read-conversion length))))
431 length)
432
433 (defun utf-8-pre-write-conversion (beg end)
434 "Semi-dummy pre-write function effectively to autoload ucs-tables."
435 ;; Ensure translation table is loaded.
436 (require 'ucs-tables)
437 ;; Don't do this again.
438 (coding-system-put 'mule-utf-8 'pre-write-conversion nil)
439 nil)
440
343 (make-coding-system 441 (make-coding-system
344 'mule-utf-8 4 ?u 442 'mule-utf-8 4 ?u
345 "UTF-8 encoding for Emacs-supported Unicode characters. 443 "UTF-8 encoding for Emacs-supported Unicode characters.
346 The supported Emacs character sets are: 444 The supported Emacs character sets are the following, plus others
347 ascii 445 which may be included in the translation table
348 eight-bit-control 446 `ucs-mule-to-mule-unicode':
349 eight-bit-graphic 447 ascii
350 latin-iso8859-1 448 eight-bit-control
351 mule-unicode-0100-24ff 449 eight-bit-graphic
352 mule-unicode-2500-33ff 450 latin-iso8859-1
353 mule-unicode-e000-ffff 451 latin-iso8859-2
452 latin-iso8859-3
453 latin-iso8859-4
454 cyrillic-iso8859-5
455 greek-iso8859-7
456 hebrew-iso8859-8
457 latin-iso8859-9
458 latin-iso8859-14
459 latin-iso8859-15
460 mule-unicode-0100-24ff
461 mule-unicode-2500-33ff
462 mule-unicode-e000-ffff
354 463
355 Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF 464 Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF
356 are decoded into sequences of eight-bit-control and eight-bit-graphic 465 are decoded into sequences of eight-bit-control and eight-bit-graphic
357 characters to preserve their byte sequences. Emacs characters out of 466 characters to preserve their byte sequences and composed to display as
358 these ranges are encoded into U+FFFD. 467 a single character. Emacs characters that can't be encoded to these
359 468 ranges are encoded as U+FFFD."
360 Note that, currently, characters in the mule-unicode charsets have no
361 syntax and case information. Thus, for instance, upper- and
362 lower-casing commands won't work with them."
363 469
364 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) 470 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
365 '((safe-charsets 471 '((safe-charsets
366 ascii 472 ascii
367 eight-bit-control 473 eight-bit-control
368 eight-bit-graphic 474 eight-bit-graphic
369 latin-iso8859-1 475 latin-iso8859-1
476 latin-iso8859-15
477 latin-iso8859-14
478 latin-iso8859-9
479 hebrew-iso8859-8
480 greek-iso8859-7
481 cyrillic-iso8859-5
482 latin-iso8859-4
483 latin-iso8859-3
484 latin-iso8859-2
485 vietnamese-viscii-lower
486 vietnamese-viscii-upper
487 thai-tis620
488 ipa
489 ethiopic
490 indian-is13194
491 katakana-jisx0201
492 chinese-sisheng
493 lao
370 mule-unicode-0100-24ff 494 mule-unicode-0100-24ff
371 mule-unicode-2500-33ff 495 mule-unicode-2500-33ff
372 mule-unicode-e000-ffff) 496 mule-unicode-e000-ffff)
373 (mime-charset . utf-8) 497 (mime-charset . utf-8)
374 (coding-category . coding-category-utf-8) 498 (coding-category . coding-category-utf-8)
375 (valid-codes (0 . 255)))) 499 (valid-codes (0 . 255))
500 (pre-write-conversion . utf-8-pre-write-conversion)
501 (post-read-conversion . utf-8-post-read-conversion)))
376 502
377 (define-coding-system-alias 'utf-8 'mule-utf-8) 503 (define-coding-system-alias 'utf-8 'mule-utf-8)
378 504
505 ;; I think this needs special private charsets defined for the
506 ;; untranslated sequences, if it's going to work well.
507
508 ;;; (defun utf-8-compose-function (pos to pattern &optional string)
509 ;;; (let* ((prop (get-char-property pos 'composition string))
510 ;;; (l (and prop (- (cadr prop) (car prop)))))
511 ;;; (cond ((and l (> l (- to pos)))
512 ;;; (delete-region pos to))
513 ;;; ((and (> (char-after pos) 224)
514 ;;; (< (char-after pos) 256)
515 ;;; (save-restriction
516 ;;; (narrow-to-region pos to)
517 ;;; (utf-8-compose)))
518 ;;; t))))
519
520 ;;; (dotimes (i 96)
521 ;;; (aset composition-function-table
522 ;;; (+ 128 i)
523 ;;; `((,(string-as-multibyte "[\200-\237\240-\377]")
524 ;;; . utf-8-compose-function))))
525
379 ;;; utf-8.el ends here 526 ;;; utf-8.el ends here