annotate lisp/international/utf-8.el @ 45625:28f0b229040c

Initial revision
author Thien-Thi Nguyen <ttn@gnuvola.org>
date Mon, 03 Jun 2002 03:15:34 +0000
parents c3c4e09c3eab
children 395e5c46761b
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
44411
c3c4e09c3eab Fix typo.
Pavel Janík <Pavel@Janik.cz>
parents: 41961
diff changeset
1 ;;; utf-8.el --- limited UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*-
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
2
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
4 ;; Licensed to the Free Software Foundation.
37934
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
5 ;; Copyright (C) 2001 Free Software Foundation, Inc.
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
6
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
7 ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org>
36243
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
8 ;; Keywords: multilingual, Unicode, UTF-8, i18n
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
9
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
10 ;; This file is part of GNU Emacs.
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
11
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
12 ;; GNU Emacs is free software; you can redistribute it and/or modify
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
13 ;; it under the terms of the GNU General Public License as published by
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
14 ;; the Free Software Foundation; either version 2, or (at your option)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
15 ;; any later version.
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
16
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
17 ;; GNU Emacs is distributed in the hope that it will be useful,
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
18 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
19 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
20 ;; GNU General Public License for more details.
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
21
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
22 ;; You should have received a copy of the GNU General Public License
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
23 ;; along with GNU Emacs; see the file COPYING. If not, write to the
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
24 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
25 ;; Boston, MA 02111-1307, USA.
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
26
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
27 ;;; Commentary:
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
28
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
29 ;; The coding-system `mule-utf-8' basically supports encoding/decoding
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
30 ;; of the following character sets to and from UTF-8:
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
31 ;;
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
32 ;; ascii
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
33 ;; eight-bit-control
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
34 ;; latin-iso8859-1
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
35 ;; mule-unicode-0100-24ff
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
36 ;; mule-unicode-2500-33ff
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
37 ;; mule-unicode-e000-ffff
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
38 ;;
36243
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
39 ;; On decoding, Unicode characters that do not fit into the above
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
40 ;; character sets are handled as `eight-bit-control' or
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
41 ;; `eight-bit-graphic' characters to retain the information about the
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
42 ;; original byte sequence.
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
43 ;;
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
44 ;; Characters from other character sets can be encoded with
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
45 ;; mule-utf-8 by populating the table `ucs-mule-to-mule-unicode' and
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
46 ;; registering the translation with `register-char-codings'.
36243
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
47
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
48 ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is:
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
49
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
50 ;; scalar | utf-8
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
51 ;; value | 1st byte | 2nd byte | 3rd byte
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
52 ;; --------------------+-----------+-----------+----------
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
53 ;; 0000 0000 0xxx xxxx | 0xxx xxxx | |
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
54 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx |
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
55 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
56
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
57 ;;; Code:
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
58
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
59 (defvar ucs-mule-to-mule-unicode (make-translation-table)
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
60 "Translation table for encoding to `mule-utf-8'.")
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
61 ;; Could have been done by ucs-tables loaded before.
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
62 (unless (get 'ucs-mule-to-mule-unicode 'translation-table)
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
63 (define-translation-table 'ucs-mule-to-mule-unicode ucs-mule-to-mule-unicode))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
64 (define-ccl-program ccl-decode-mule-utf-8
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
65 ;;
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
66 ;; charset | bytes in utf-8 | bytes in emacs
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
67 ;; -----------------------+----------------+---------------
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
68 ;; ascii | 1 | 1
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
69 ;; -----------------------+----------------+---------------
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
70 ;; eight-bit-control | 2 | 2
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
71 ;; eight-bit-graphic | 2 | 1
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
72 ;; latin-iso8859-1 | 2 | 2
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
73 ;; -----------------------+----------------+---------------
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
74 ;; mule-unicode-0100-24ff | 2 | 4
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
75 ;; (< 0800) | |
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
76 ;; -----------------------+----------------+---------------
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
77 ;; mule-unicode-0100-24ff | 3 | 4
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
78 ;; (>= 0800) | |
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
79 ;; mule-unicode-2500-33ff | 3 | 4
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
80 ;; mule-unicode-e000-ffff | 3 | 4
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
81 ;;
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
82 ;; Thus magnification factor is two.
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
83 ;;
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
84 `(2
37934
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
85 ((r5 = ,(charset-id 'eight-bit-control))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
86 (r6 = ,(charset-id 'eight-bit-graphic))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
87 (loop
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
88 (read r0)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
89
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
90 ;; 1byte encoding, i.e., ascii
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
91 (if (r0 < #x80)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
92 (write r0)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
93
37934
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
94 ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
95 (if (r0 < #xe0)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
96 ((read r1)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
97
37934
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
98 (if ((r1 & #b11000000) != #b10000000)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
99 ;; Invalid 2-byte sequence
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
100 ((if (r0 < #xa0)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
101 (write-multibyte-character r5 r0)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
102 (write-multibyte-character r6 r0))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
103 (if (r1 < #x80)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
104 (write r1)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
105 (if (r1 < #xa0)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
106 (write-multibyte-character r5 r1)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
107 (write-multibyte-character r6 r1))))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
108
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
109 ((r0 &= #x1f)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
110 (r0 <<= 6)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
111 (r1 &= #x3f)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
112 (r1 += r0)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
113 ;; Now r1 holds scalar value
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
114
37934
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
115 ;; eight-bit-control
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
116 (if (r1 < 160)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
117 ((write-multibyte-character r5 r1))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
118
37934
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
119 ;; latin-iso8859-1
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
120 (if (r1 < 256)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
121 ((r0 = ,(charset-id 'latin-iso8859-1))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
122 (r1 -= 128)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
123 (write-multibyte-character r0 r1))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
124
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
125 ;; mule-unicode-0100-24ff (< 0800)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
126 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
127 (r1 -= #x0100)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
128 (r2 = (((r1 / 96) + 32) << 7))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
129 (r1 %= 96)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
130 (r1 += (r2 + 32))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
131 (write-multibyte-character r0 r1)))))))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
132
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
133 ;; 3byte encoding
37934
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
134 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
135 (if (r0 < #xf0)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
136 ((read r1 r2)
37934
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
137
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
138 ;; This is set to 1 if the encoding is invalid.
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
139 (r4 = 0)
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
140
37934
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
141 (r3 = (r1 & #b11000000))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
142 (r3 |= ((r2 >> 2) & #b00110000))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
143 (if (r3 != #b10100000)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
144 (r4 = 1)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
145 ((r3 = ((r0 & #x0f) << 12))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
146 (r3 += ((r1 & #x3f) << 6))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
147 (r3 += (r2 & #x3f))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
148 (if (r3 < #x0800)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
149 (r4 = 1))))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
150
37934
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
151 (if (r4 != 0)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
152 ;; Invalid 3-byte sequence
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
153 ((if (r0 < #xa0)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
154 (write-multibyte-character r5 r0)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
155 (write-multibyte-character r6 r0))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
156 (if (r1 < #x80)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
157 (write r1)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
158 (if (r1 < #xa0)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
159 (write-multibyte-character r5 r1)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
160 (write-multibyte-character r6 r1)))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
161 (if (r2 < #x80)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
162 (write r2)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
163 (if (r2 < #xa0)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
164 (write-multibyte-character r5 r2)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
165 (write-multibyte-character r6 r2))))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
166
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
167 ;; mule-unicode-0100-24ff (>= 0800)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
168 ((if (r3 < #x2500)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
169 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
170 (r3 -= #x0100)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
171 (r3 //= 96)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
172 (r1 = (r7 + 32))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
173 (r1 += ((r3 + 32) << 7))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
174 (write-multibyte-character r0 r1))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
175
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
176 ;; mule-unicode-2500-33ff
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
177 (if (r3 < #x3400)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
178 ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
179 (r3 -= #x2500)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
180 (r3 //= 96)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
181 (r1 = (r7 + 32))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
182 (r1 += ((r3 + 32) << 7))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
183 (write-multibyte-character r0 r1))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
184
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
185 ;; U+3400 .. U+DFFF
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
186 ;; keep those bytes as eight-bit-{control|graphic}
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
187 (if (r3 < #xe000)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
188 ( ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
189 (r3 = r6)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
190 (write-multibyte-character r3 r0)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
191 (if (r1 < #xa0)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
192 (r3 = r5))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
193 (write-multibyte-character r3 r1)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
194 (if (r2 < #xa0)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
195 (r3 = r5)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
196 (r3 = r6))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
197 (write-multibyte-character r3 r2))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
198
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
199 ;; mule-unicode-e000-ffff
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
200 ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
201 (r3 -= #xe000)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
202 (r3 //= 96)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
203 (r1 = (r7 + 32))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
204 (r1 += ((r3 + 32) << 7))
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
205 (write-multibyte-character r0 r1))))))))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
206
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
207 ;; 4byte encoding
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
208 ;; keep those bytes as eight-bit-{control|graphic}
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
209 ((read r1 r2 r3)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
210 ;; r0 >= #xf0, thus eight-bit-graphic
37934
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
211 (write-multibyte-character r6 r0)
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
212 (if (r1 < #xa0)
37934
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
213 (write-multibyte-character r5 r1)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
214 (write-multibyte-character r6 r1))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
215 (if (r2 < #xa0)
37934
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
216 (write-multibyte-character r5 r2)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
217 (write-multibyte-character r6 r2))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
218 (if (r3 < #xa0)
37934
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
219 (write-multibyte-character r5 r3)
88389fa9b713 (ccl-decode-mule-utf-8): Handle
Gerd Moellmann <gerd@gnu.org>
parents: 37097
diff changeset
220 (write-multibyte-character r6 r3))))))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
221
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
222 (repeat))))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
223
36243
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
224 "CCL program to decode UTF-8.
36465
f968e313e8ad Doc fixes.
Dave Love <fx@gnu.org>
parents: 36423
diff changeset
225 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
f968e313e8ad Doc fixes.
Dave Love <fx@gnu.org>
parents: 36423
diff changeset
226 mule-unicode-*. Encodings of un-representable Unicode characters are
f968e313e8ad Doc fixes.
Dave Love <fx@gnu.org>
parents: 36423
diff changeset
227 decoded as is into eight-bit-control and eight-bit-graphic
f968e313e8ad Doc fixes.
Dave Love <fx@gnu.org>
parents: 36423
diff changeset
228 characters.")
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
229
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
230 (define-ccl-program ccl-encode-mule-utf-8
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
231 `(1
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
232 ((r5 = -1)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
233 (loop
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
234 (if (r5 < 0)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
235 ((r1 = -1)
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
236 (read-multibyte-character r0 r1)
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
237 (translate-character ucs-mule-to-mule-unicode r0 r1))
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
238 (;; We have already done read-multibyte-character.
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
239 (r0 = r5)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
240 (r1 = r6)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
241 (r5 = -1)))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
242
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
243 (if (r0 == ,(charset-id 'ascii))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
244 (write r1)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
245
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
246 (if (r0 == ,(charset-id 'latin-iso8859-1))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
247 ;; r1 scalar utf-8
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
248 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
249 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
250 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
251 ((r0 = (((r1 & #x40) >> 6) | #xc2))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
252 (r1 &= #x3f)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
253 (r1 |= #x80)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
254 (write r0 r1))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
255
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
256 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
257 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
258 ;; #x3f80 == (0011 1111 1000 0000)b
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
259 (r1 &= #x7f)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
260 (r1 += (r0 + 224)) ; 224 == -32 + #x0100
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
261 ;; now r1 holds scalar value
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
262 (if (r1 < #x0800)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
263 ;; 2byte encoding
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
264 ((r0 = (((r1 & #x07c0) >> 6) | #xc0))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
265 ;; #x07c0 == (0000 0111 1100 0000)b
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
266 (r1 &= #x3f)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
267 (r1 |= #x80)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
268 (write r0 r1))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
269 ;; 3byte encoding
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
270 ((r0 = (((r1 & #xf000) >> 12) | #xe0))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
271 (r2 = ((r1 & #x3f) | #x80))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
272 (r1 &= #x0fc0)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
273 (r1 >>= 6)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
274 (r1 |= #x80)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
275 (write r0 r1 r2))))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
276
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
277 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
278 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
279 (r1 &= #x7f)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
280 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
281 (r0 = (((r1 & #xf000) >> 12) | #xe0))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
282 (r2 = ((r1 & #x3f) | #x80))
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
283 (r1 &= #x0fc0)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
284 (r1 >>= 6)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
285 (r1 |= #x80)
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
286 (write r0 r1 r2))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
287
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
288 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
289 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
290 (r1 &= #x7f)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
291 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
292 (r0 = (((r1 & #xf000) >> 12) | #xe0))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
293 (r2 = ((r1 & #x3f) | #x80))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
294 (r1 &= #x0fc0)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
295 (r1 >>= 6)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
296 (r1 |= #x80)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
297 (write r0 r1 r2))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
298
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
299 (if (r0 == ,(charset-id 'eight-bit-control))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
300 ;; r1 scalar utf-8
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
301 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
302 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
303 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
304 ((write #xc2)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
305 (write r1))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
306
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
307 (if (r0 == ,(charset-id 'eight-bit-graphic))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
308 ;; r1 scalar utf-8
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
309 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
310 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
311 ;; ff 0000 0000 1111 1111 1100 0011 1011 1111
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
312 ((write r1)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
313 (r1 = -1)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
314 (read-multibyte-character r0 r1)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
315 (if (r0 != ,(charset-id 'eight-bit-graphic))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
316 (if (r0 != ,(charset-id 'eight-bit-control))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
317 ((r5 = r0)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
318 (r6 = r1))))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
319 (if (r5 < 0)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
320 ((read-multibyte-character r0 r2)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
321 (if (r0 != ,(charset-id 'eight-bit-graphic))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
322 (if (r0 != ,(charset-id 'eight-bit-control))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
323 ((r5 = r0)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
324 (r6 = r2))))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
325 (if (r5 < 0)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
326 (write r1 r2)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
327 (if (r1 < #xa0)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
328 (write r1)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
329 ((write #xc2)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
330 (write r1)))))))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
331
37097
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
332 ;; Unsupported character.
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
333 ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
334 ((write #xef)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
335 (write #xbf)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
336 (write #xbd)))))))))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
337 (repeat)))
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
338 (if (r1 >= #xa0)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
339 (write r1)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
340 (if (r1 >= #x80)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
341 ((write #xc2)
b095952a8678 (ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents: 36522
diff changeset
342 (write r1)))))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
343
36243
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
344 "CCL program to encode into UTF-8.
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
345 Only characters from the charsets ascii, eight-bit-control,
36465
f968e313e8ad Doc fixes.
Dave Love <fx@gnu.org>
parents: 36423
diff changeset
346 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized.
f968e313e8ad Doc fixes.
Dave Love <fx@gnu.org>
parents: 36423
diff changeset
347 Others are encoded as U+FFFD.")
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
348
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
;; Placeholder translation table so that the CCL can be checked
;; correctly; the real data are loaded on demand.  If the variable is
;; already bound, leave everything alone so loaded data are not zapped.
(or (boundp 'ucs-mule-8859-to-mule-unicode)
    (define-translation-table 'ucs-mule-8859-to-mule-unicode))
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
353
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
(defsubst utf-8-untranslated-to-ucs ()
  "Return the UCS code of the untranslated UTF-8 sequence at point.
The bytes are read as the characters at point and at the positions
following it.  Only 3- and 4-byte sequences are handled: if the lead
byte is below #xF0 the 3-byte form is decoded, otherwise the 4-byte
form.  Return nil if fewer than three characters are available, or if
a 4-byte sequence lacks its fourth byte."
  (let ((b1 (char-after))
        (b2 (char-after (1+ (point))))
        (b3 (char-after (+ 2 (point))))
        ;; The sequence starts at point, so its fourth byte is at
        ;; offset 3 (offset 4 would skip a position).
        (b4 (char-after (+ 3 (point)))))
    (if (and b1 b2 b3)
        (cond ((< b1 ?\xf0)
               ;; 1110xxxx 10yyyyyy 10zzzzzz
               (logior (lsh (logand b1 ?\x0f) 12)
                       (lsh (logand b2 ?\x3f) 6)
                       (logand b3 ?\x3f)))
              (b4
               ;; 11110www 10xxxxxx 10yyyyyy 10zzzzzz
               (logior (lsh (logand b1 ?\x07) 18)
                       (lsh (logand b2 ?\x3f) 12)
                       (lsh (logand b3 ?\x3f) 6)
                       (logand b4 ?\x3f)))))))
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
370
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
(defun utf-8-help-echo (window object position)
  "Return help-echo text for the untranslated sequence at POSITION in OBJECT.
The text reports the value of the `untranslated-utf-8' property found
there as a hexadecimal Unicode code point.  WINDOW is not used."
  (let ((code (get-char-property position 'untranslated-utf-8 object)))
    (format "Untranslated Unicode U+%04X" code)))
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
374
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
(defvar utf-8-subst-table nil
  "If non-nil, a hash table mapping `untranslatable utf-8' to Emacs characters.")

;; We compose the untranslatable sequences into a single character.
;; This is infelicitous for editing, because there's currently no
;; mechanism for treating compositions as atomic, but is OK for
;; display.  We try to compose an appropriate character from a hash
;; table of CJK characters to display correctly.  Otherwise we use
;; U+FFFD.  What we really should have is hash table lookup from CCL
;; so that we could do this properly.  This function GCs too much.
(defsubst utf-8-compose ()
  "Put a suitable composition on an untranslatable sequence.
The sequence is the one at point, as decoded by
`utf-8-untranslated-to-ucs'.  Its UCS code is recorded in the
`untranslated-utf-8' text property.  The sequence is composed to
display as the character found for that code in `utf-8-subst-table',
or, failing that, as U+FFFD with a `help-echo' property naming
`utf-8-help-echo'.
Return the sequence's length, or nil if no sequence could be decoded
at point."
  (let ((u (utf-8-untranslated-to-ucs)))
    (when u
      (let* ((l (if (>= u #x10000) 4 3))
             ;; Compute the clamped end once so that both text
             ;; properties and the composition cover the same span.
             (end (min (point-max) (+ l (point))))
             (subst (and utf-8-subst-table
                         (gethash u utf-8-subst-table))))
        (put-text-property (point) end 'untranslated-utf-8 u)
        (unless subst
          (put-text-property (point) end 'help-echo 'utf-8-help-echo)
          ;; U+FFFD REPLACEMENT CHARACTER.  Spelled via `decode-char'
          ;; rather than as an ISO-2022-7bit character literal, which
          ;; does not survive plain-text transport of this file.
          (setq subst (decode-char 'ucs #xfffd)))
        (compose-region (point) end subst)
        l))))
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
402
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
(defcustom utf-8-compose-scripts nil
  "*Non-nil means compose various scripts on decoding utf-8 text."
  :group 'mule
  :type 'boolean)	; omitted in Emacs 21.1
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
407
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
(defun utf-8-post-read-conversion (length)
  "Compose untranslated utf-8 sequences into single characters.
Also compose particular scripts if `utf-8-compose-scripts' is non-nil.
LENGTH is the number of characters of decoded text; the text is taken
to start at point (the usual `post-read-conversion' convention).
Return the length of the text after conversion."
  (save-excursion
    ;; Confine the scan to the LENGTH characters just decoded, so that
    ;; any text following them in the buffer is left untouched.
    (save-restriction
      (narrow-to-region (point) (min (point-max) (+ (point) length)))
      ;; Can't do eval-when-compile to insert a multibyte constant
      ;; version of the string in the loop, since it's always loaded as
      ;; unibyte from a byte-compiled file.
      (let ((range (string-as-multibyte "^\341-\377")))
        (while (and (skip-chars-forward range)
                    (not (eobp)))
          ;; `utf-8-compose' returns nil when nothing could be decoded
          ;; at point; `forward-char' then steps over one character.
          (forward-char (utf-8-compose))))))
  ;; Fixme: Takahashi-san implies it may not work this easily -- needs
  ;; checking with him.
  (when (and utf-8-compose-scripts (> length 1))
    ;; These currently have definitions which cover the relevant
    ;; Unicodes.  We could avoid loading thai-util &c by checking
    ;; whether the region contains any characters with the appropriate
    ;; categories.  There aren't yet Unicode-based rules for Tibetan.
    (dolist (convert '(diacritic-post-read-conversion
                       thai-post-read-conversion
                       lao-post-read-conversion
                       devanagari-post-read-conversion))
      ;; Each converter gets the length returned by the previous one.
      (save-excursion (setq length (funcall convert length)))))
  length)
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
432
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
(defun utf-8-pre-write-conversion (beg end)
  "Semi-dummy pre-write function whose only job is to load `ucs-tables'.
BEG and END are ignored and no text is converted.  After requiring the
library, clear the `pre-write-conversion' property of the `mule-utf-8'
coding system so that this is not done again.  Always return nil."
  (prog1 nil
    ;; Make sure the translation table is loaded.
    (require 'ucs-tables)
    ;; Once is enough.
    (coding-system-put 'mule-utf-8 'pre-write-conversion nil)))
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
440
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
441 (make-coding-system
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
442 'mule-utf-8 4 ?u
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
443 "UTF-8 encoding for Emacs-supported Unicode characters.
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
444 The supported Emacs character sets are the following, plus others
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
445 which may be included in the translation table
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
446 `ucs-mule-to-mule-unicode':
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
447 ascii
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
448 eight-bit-control
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
449 eight-bit-graphic
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
450 latin-iso8859-1
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
451 latin-iso8859-2
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
452 latin-iso8859-3
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
453 latin-iso8859-4
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
454 cyrillic-iso8859-5
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
455 greek-iso8859-7
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
456 hebrew-iso8859-8
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
457 latin-iso8859-9
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
458 latin-iso8859-14
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
459 latin-iso8859-15
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
460 mule-unicode-0100-24ff
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
461 mule-unicode-2500-33ff
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
462 mule-unicode-e000-ffff
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
463
36243
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
464 Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF
a05ae5420f85 Doc and commentary fixes.
Dave Love <fx@gnu.org>
parents: 35542
diff changeset
465 are decoded into sequences of eight-bit-control and eight-bit-graphic
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
466 characters to preserve their byte sequences and composed to display as
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
467 a single character. Emacs characters that can't be encoded to these
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
468 ranges are encoded as U+FFFD."
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
469
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
470 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
471 '((safe-charsets
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
472 ascii
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
473 eight-bit-control
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
474 eight-bit-graphic
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
475 latin-iso8859-1
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
476 latin-iso8859-15
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
477 latin-iso8859-14
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
478 latin-iso8859-9
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
479 hebrew-iso8859-8
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
480 greek-iso8859-7
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
481 cyrillic-iso8859-5
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
482 latin-iso8859-4
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
483 latin-iso8859-3
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
484 latin-iso8859-2
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
485 vietnamese-viscii-lower
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
486 vietnamese-viscii-upper
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
487 thai-tis620
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
488 ipa
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
489 ethiopic
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
490 indian-is13194
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
491 katakana-jisx0201
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
492 chinese-sisheng
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
493 lao
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
494 mule-unicode-0100-24ff
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
495 mule-unicode-2500-33ff
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
496 mule-unicode-e000-ffff)
36371
f6bb3ed752b4 (mule-utf-8): Set correct value for valid-codes property.
Kenichi Handa <handa@m17n.org>
parents: 36243
diff changeset
497 (mime-charset . utf-8)
36423
aa776838b660 (mule-utf-8): Set coding-category property to coding-category-utf-8.
Kenichi Handa <handa@m17n.org>
parents: 36371
diff changeset
498 (coding-category . coding-category-utf-8)
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
499 (valid-codes (0 . 255))
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
500 (pre-write-conversion . utf-8-pre-write-conversion)
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
501 (post-read-conversion . utf-8-post-read-conversion)))
35542
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
502
e4a75e66ee46 new file
Kenichi Handa <handa@m17n.org>
parents:
diff changeset
503 (define-coding-system-alias 'utf-8 'mule-utf-8) ; let the conventional name `utf-8' refer to `mule-utf-8'
38436
b174db545cfd Some fixes to follow coding conventions.
Pavel Janík <Pavel@Janik.cz>
parents: 37934
diff changeset
504
41873
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
505 ;; I think this needs special private charsets defined for the
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
506 ;; untranslated sequences, if it's going to work well.
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
507
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
508 ;;; (defun utf-8-compose-function (pos to pattern &optional string)
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
509 ;;; (let* ((prop (get-char-property pos 'composition string))
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
510 ;;; (l (and prop (- (cadr prop) (car prop)))))
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
511 ;;; (cond ((and l (> l (- to pos)))
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
512 ;;; (delete-region pos to))
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
513 ;;; ((and (> (char-after pos) 224)
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
514 ;;; (< (char-after pos) 256)
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
515 ;;; (save-restriction
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
516 ;;; (narrow-to-region pos to)
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
517 ;;; (utf-8-compose)))
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
518 ;;; t))))
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
519
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
520 ;;; (dotimes (i 96)
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
521 ;;; (aset composition-function-table
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
522 ;;; (+ 128 i)
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
523 ;;; `((,(string-as-multibyte "[\200-\237\240-\377]")
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
524 ;;; . utf-8-compose-function))))
16ee1ffbef65 (ucs-mule-to-mule-unicode): New
Dave Love <fx@gnu.org>
parents: 38436
diff changeset
525
38436
b174db545cfd Some fixes to follow coding conventions.
Pavel Janík <Pavel@Janik.cz>
parents: 37934
diff changeset
526 ;;; utf-8.el ends here