17052
|
1 /* Header for multilingual character handler.
|
|
2 Ver.1.0
|
|
3
|
|
4 Copyright (C) 1995 Free Software Foundation, Inc.
|
|
5 Copyright (C) 1995 Electrotechnical Laboratory, JAPAN.
|
|
6
|
|
7 This program is free software; you can redistribute it and/or modify
|
|
8 it under the terms of the GNU General Public License as published by
|
|
9 the Free Software Foundation; either version 2, or (at your option)
|
|
10 any later version.
|
|
11
|
|
12 This program is distributed in the hope that it will be useful,
|
|
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15 GNU General Public License for more details.
|
|
16
|
|
17 You should have received a copy of the GNU General Public License
|
|
18 along with this program; if not, write to the Free Software
|
|
19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
|
|
20
|
|
21 #ifndef _CHARSET_H
|
|
22 #define _CHARSET_H
|
|
23
|
|
24 /*** GENERAL NOTE on CHARACTER SET (CHARSET) ***
|
|
25
|
|
26 A character set ("charset" hereafter) is a meaningful collection
|
|
27 (i.e. language, culture, functionality, etc) of characters. Emacs
|
|
28 handles multiple charsets at once. Each charset corresponds to one
|
|
29 of ISO charsets (except for a special charset for composition
|
|
30 characters). Emacs identifies a charset by a unique identification
|
|
31 number, whereas ISO identifies a charset by a triplet of DIMENSION,
|
|
32 CHARS and FINAL-CHAR. So, hereafter, just saying "charset" means an
|
|
33 identification number (integer value).
|
|
34
|
|
35 The value range of charset is 0x00, 0x80..0xFE. There are four
|
|
36 kinds of charset depending on DIMENSION (1 or 2) and CHARS (94 or
|
|
37 96). For instance, a charset of DIMENSION2_CHARS94 contains 94x94
|
|
38 characters.
|
|
39
|
|
40 Within Emacs Lisp, a charset is treated as a symbol which has a
|
|
41 property `charset'. The property value is a vector containing
|
|
42 various information about the charset. For readability of C codes,
|
|
43 we use the following convention on C variable names:
|
|
44 charset_symbol: Emacs Lisp symbol of a charset
|
|
45 charset_id: Emacs Lisp integer of an identification number of a charset
|
|
46 charset: C integer of an identification number of a charset
|
|
47
|
|
48 Each charset (except for ASCII) is assigned a base leading-code
|
|
49 (range 0x80..0x9D). In addition, a charset of 0xA0 or greater
|
|
50 (whose base leading-code is 0x9A..0x9D) is assigned an extended
|
|
51 leading-code (range 0xA0..0xFE). In this case, each base
|
|
52 leading-code specifies the allowable range of extended leading-code as
|
|
53 shown in the table below. A leading-code is used to represent a
|
|
54 character in Emacs' buffer and string.
|
|
55
|
|
56 We call a charset which has extended leading-code as "private
|
|
57 charset" because those are mainly for a charset which is not
|
|
58 registered by ISO. On the contrary, we call a charset which does
|
|
59 not have extended leading-code as "official charset".
|
|
60
|
|
61 ---------------------------------------------------------------------------
|
|
62 charset dimension base leading-code extended leading-code
|
|
63 ---------------------------------------------------------------------------
|
|
64 0x00 official dim1 -- none -- -- none --
|
|
65 (ASCII)
|
|
66 0x01..0x7F --never used--
|
|
67 0x80 COMPOSITION same as charset -- none --
|
|
68 0x81..0x8F official dim1 same as charset -- none --
|
|
69 0x90..0x99 official dim2 same as charset -- none --
|
|
70 0x9A..0x9F --never used--
|
|
71 0xA0..0xDF private dim1 0x9A same as charset
|
|
72 of 1-column width
|
|
73 0xE0..0xEF private dim1 0x9B same as charset
|
|
74 of 2-column width
|
|
75 0xF0..0xF4 private dim2 0x9C same as charset
|
|
76 of 1-column width
|
|
77 0xF5..0xFE private dim2 0x9D same as charset
|
|
78 of 2-column width
|
|
79 0xFF --never used--
|
|
80 ---------------------------------------------------------------------------
|
|
81
|
|
82 In the table, "COMPOSITION" means a charset for a composite
|
|
83 character which is a character composed from several (up to 16)
|
|
84 non-composite characters (components). Although a composite
|
|
85 character can contain components of many charsets, a composite
|
|
86 character itself belongs to the charset CHARSET-COMPOSITION. See
|
|
87 the document "GENERAL NOTE on COMPOSITE CHARACTER" below for more
|
|
88 detail.
|
|
89
|
|
90 */
|
|
91
|
|
/* Definition of special leading-codes.  */

/* Base leading-codes.  */

/* Special leading-code followed by components of a composite character.  */
#define LEADING_CODE_COMPOSITION 0x80

/* Leading-codes followed by an extended leading-code.  */
#define LEADING_CODE_PRIVATE_11 0x9A /* for private DIMENSION1 of 1-column */
#define LEADING_CODE_PRIVATE_12 0x9B /* for private DIMENSION1 of 2-column */
#define LEADING_CODE_PRIVATE_21 0x9C /* for private DIMENSION2 of 1-column */
#define LEADING_CODE_PRIVATE_22 0x9D /* for private DIMENSION2 of 2-column */

/* Extended leading-codes.  */

/* Start of each range of extended leading-codes.  */
#define LEADING_CODE_EXT_11 0xA0 /* follows LEADING_CODE_PRIVATE_11 */
#define LEADING_CODE_EXT_12 0xE0 /* follows LEADING_CODE_PRIVATE_12 */
#define LEADING_CODE_EXT_21 0xF0 /* follows LEADING_CODE_PRIVATE_21 */
#define LEADING_CODE_EXT_22 0xF5 /* follows LEADING_CODE_PRIVATE_22 */

/* Maximum value of extended leading-codes.  */
#define LEADING_CODE_EXT_MAX 0xFE
|
|
110
|
|
/* Definition of minimum/maximum charset of each DIMENSION.  The
   private minimums are the first extended leading-codes of the
   corresponding ranges.  */
#define MIN_CHARSET_OFFICIAL_DIMENSION1 0x81
#define MAX_CHARSET_OFFICIAL_DIMENSION1 0x8F
#define MIN_CHARSET_OFFICIAL_DIMENSION2 0x90
#define MAX_CHARSET_OFFICIAL_DIMENSION2 0x99
#define MIN_CHARSET_PRIVATE_DIMENSION1 LEADING_CODE_EXT_11
#define MIN_CHARSET_PRIVATE_DIMENSION2 LEADING_CODE_EXT_21

/* Definition of special charsets.  */
#define CHARSET_ASCII 0
#define CHARSET_COMPOSITION 0x80
|
|
122
|
|
/* Charset identification numbers of frequently used charsets.  These
   are only declared here.  */
extern int charset_ascii;		/* ASCII */
extern int charset_composition;		/* for a composite character */
extern int charset_latin_iso8859_1;	/* ISO8859-1 (Latin-1) */
extern int charset_jisx0208_1978;	/* JISX0208.1978 (Japanese Kanji old set) */
extern int charset_jisx0208;		/* JISX0208.1983 (Japanese Kanji) */
extern int charset_katakana_jisx0201;	/* JISX0201.Kana (Japanese Katakana) */
extern int charset_latin_jisx0201;	/* JISX0201.Roman (Japanese Roman) */
extern int charset_big5_1;		/* Big5 Level 1 (Chinese Traditional) */
extern int charset_big5_2;		/* Big5 Level 2 (Chinese Traditional) */
|
|
132
|
|
/* Check if STR points to the head of a multi-byte form, i.e. *STR is
   an ASCII character or a base leading-code.  The byte is read as
   unsigned char and compared against 0xA0, so the result does not
   depend on the signedness of plain char.  */
#define CHAR_HEAD_P(str) ((unsigned char) *(str) < 0xA0)
|
|
136
|
|
137 /*** GENERAL NOTE on CHARACTER REPRESENTATION ***
|
|
138
|
|
139 At first, the term "character" or "char" is used for a multilingual
|
|
140 character (of course, including ASCII character), not for a byte in
|
|
141 computer memory. We use the term "code" or "byte" for the latter
|
|
142 case.
|
|
143
|
|
144 A character is identified by charset and one or two POSITION-CODEs.
|
|
145 POSITION-CODE is the position of the character in the charset. A
|
|
146 character of DIMENSION1 charset has one POSITION-CODE: POSITION-CODE-1.
|
|
147 A character of DIMENSION2 charset has two POSITION-CODEs:
|
|
148 POSITION-CODE-1 and POSITION-CODE-2. The code range of
|
|
149 POSITION-CODE is 0x20..0x7F.
|
|
150
|
|
151 Emacs has two kinds of representation of a character: multi-byte
|
|
152 form (for buffer and string) and single-word form (for character
|
|
153 object in Emacs Lisp). The latter is called "character code"
|
|
154 hereafter. Both representations encode the information of charset and
|
|
155 POSITION-CODE but in a different way (for instance, MSB of
|
|
156 POSITION-CODE is set in multi-byte form).
|
|
157
|
|
158 For details of multi-byte form, see the section "2. Emacs internal
|
|
159 format handlers" of `coding.c'.
|
|
160
|
|
161 Emacs uses 19 bits for a character code. The bits are divided into
|
|
162 3 fields: FIELD1(5bits):FIELD2(7bits):FIELD3(7bits).
|
|
163
|
|
164 A character code of DIMENSION1 character uses FIELD2 to hold charset
|
|
165 and FIELD3 to hold POSITION-CODE-1. A character code of DIMENSION2
|
|
166 character uses FIELD1 to hold charset, FIELD2 and FIELD3 to hold
|
|
167 POSITION-CODE-1 and POSITION-CODE-2 respectively.
|
|
168
|
|
169 More precisely...
|
|
170
|
|
171 FIELD2 of DIMENSION1 character (except for ASCII) is "charset - 0x70".
|
|
172 This is to make all character codes except for ASCII greater than
|
|
173 256 (ASCII's FIELD2 is 0). So, the range of FIELD2 of DIMENSION1
|
|
174 character is 0 or 0x11..0x7F.
|
|
175
|
|
176 FIELD1 of DIMENSION2 character is "charset - 0x8F" for official
|
|
177 charset and "charset - 0xE0" for private charset. So, the range of
|
|
178 FIELD1 of DIMENSION2 character is 0x01..0x1E.
|
|
179
|
|
180 -----------------------------------------------------------------------
|
|
181 charset FIELD1 (5-bit) FIELD2 (7-bit) FIELD3 (7-bit)
|
|
182 -----------------------------------------------------------------------
|
|
183 ASCII 0 0 POSITION-CODE-1
|
|
184 DIMENSION1 0 charset - 0x70 POSITION-CODE-1
|
|
185 DIMENSION2(o) charset - 0x8F POSITION-CODE-1 POSITION-CODE-2
|
|
186 DIMENSION2(p) charset - 0xE0 POSITION-CODE-1 POSITION-CODE-2
|
|
187 -----------------------------------------------------------------------
|
|
188 "(o)": official, "(p)": private
|
|
189 -----------------------------------------------------------------------
|
|
190
|
|
191 */
|
|
192
|
|
193 /*** GENERAL NOTE on COMPOSITE CHARACTER ***
|
|
194
|
|
195 A composite character is a character composed from several (up to
|
|
196 16) non-composite characters (components). Although each component
|
|
197 can belong to any charset, a composite character itself belongs to
|
|
198 the charset `charset-composition' and is assigned a special
|
|
199 leading-code `LEADING_CODE_COMPOSITION' for multi-byte form. See
|
|
200 the document "2. Emacs internal format handlers" in `coding.c' for
|
|
201 more detail about multi-byte form.
|
|
202
|
|
203 A character code of composite character has special format. In the
|
|
204 above document, FIELD1 of a composite character is 0x1F. Each
|
|
205 composite character is assigned a sequential number CMPCHAR-ID.
|
|
206 FIELD2 and FIELD3 are combined to make 14bits field for holding
|
|
207 CMPCHAR-ID, which means that Emacs can handle at most 2^14 (= 16384)
|
|
208 composite characters at once.
|
|
209
|
|
210 -----------------------------------------------------------------------
|
|
211 charset FIELD1 (5-bit) FIELD2&3 (14-bit)
|
|
212 -----------------------------------------------------------------------
|
|
213 CHARSET-COMPOSITION 0x1F CMPCHAR-ID
|
|
214 -----------------------------------------------------------------------
|
|
215
|
|
216 Emacs assigns CMPCHAR-ID to a composite character only when it
|
|
217 requires the character code of the composite character (e.g. while
|
|
218 displaying the composite character).
|
|
219
|
|
220 */
|
|
221
|
|
/* Masks of each field of character code: FIELD1 is the 5 bits
   starting at bit 14, FIELD2 the 7 bits starting at bit 7, and FIELD3
   the lowest 7 bits.  */
#define CHAR_FIELD1_MASK (0x1F << 14)
#define CHAR_FIELD2_MASK (0x7F << 7)
#define CHAR_FIELD3_MASK 0x7F

/* Macros to access each field of character C.  Each returns the
   field value shifted down to bit 0.  */
#define CHAR_FIELD1(c) (((c) & CHAR_FIELD1_MASK) >> 14)
#define CHAR_FIELD2(c) (((c) & CHAR_FIELD2_MASK) >> 7)
#define CHAR_FIELD3(c) ((c) & CHAR_FIELD3_MASK)
|
|
231
|
|
/* Minimum character code of character of each DIMENSION.  A
   DIMENSION1 charset is stored as `charset - 0x70' in FIELD2 (bit 7
   and up); a DIMENSION2 charset is stored in FIELD1 (bit 14 and up)
   as `charset - 0x8F' when official and `charset - 0xE0' when
   private.  Composite characters use 0x1F in FIELD1.  */
#define MIN_CHAR_OFFICIAL_DIMENSION1 \
  ((MIN_CHARSET_OFFICIAL_DIMENSION1 - 0x70) << 7)
#define MIN_CHAR_PRIVATE_DIMENSION1 \
  ((MIN_CHARSET_PRIVATE_DIMENSION1 - 0x70) << 7)
#define MIN_CHAR_OFFICIAL_DIMENSION2 \
  ((MIN_CHARSET_OFFICIAL_DIMENSION2 - 0x8F) << 14)
#define MIN_CHAR_PRIVATE_DIMENSION2 \
  ((MIN_CHARSET_PRIVATE_DIMENSION2 - 0xE0) << 14)
#define MIN_CHAR_COMPOSITION \
  (0x1F << 14)
|
|
243
|
|
/* 1 if C is a single-byte character, i.e. its character code is
   below 0x100, else 0.  CHAR_CHARSET reports CHARSET_ASCII for every
   such character.  */
#define SINGLE_BYTE_CHAR_P(c) ((c) < 0x100)

/* 1 if C is a composite character, i.e. its character code is at
   least MIN_CHAR_COMPOSITION, else 0.  */
#define COMPOSITE_CHAR_P(c) ((c) >= MIN_CHAR_COMPOSITION)
|
|
248
|
|
249 /* A char-table containing information of each character set.
|
|
250
|
|
251 Unlike ordinary char-tables, this doesn't contain any nested table.
|
|
252 Only the top level elements are used. Each element is a vector of
|
|
253 the following information:
|
|
254 CHARSET-ID, BYTES, DIMENSION, CHARS, WIDTH, DIRECTION,
|
|
255 LEADING-CODE-BASE, LEADING-CODE-EXT,
|
|
256 ISO-FINAL-CHAR, ISO-GRAPHIC-PLANE,
|
|
257 REVERSE-CHARSET, SHORT-NAME, LONG-NAME, DESCRIPTION,
|
|
258 PLIST.
|
|
259
|
|
260 CHARSET-ID (integer) is the identification number of the charset.
|
|
261
|
|
262 BYTE (integer) is the length of multi-byte form of a character in
|
|
263 the charset: one of 1, 2, 3, and 4.
|
|
264
|
|
265 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.
|
|
266
|
|
267 CHARS (integer) is the number of characters in a dimension: 94 or 96.
|
|
268
|
|
269 WIDTH (integer) is the number of columns a character in the charset
|
|
270 occupies on the screen: one of 0, 1, and 2.
|
|
271
|
|
272 DIRECTION (integer) is the rendering direction of characters in the
|
|
273 charset when rendering. If 0, render from right to left, else
|
|
274 render from left to right.
|
|
275
|
|
276 LEADING-CODE-BASE (integer) is the base leading-code for the
|
|
277 charset.
|
|
278
|
|
279 LEADING-CODE-EXT (integer) is the extended leading-code for the
|
|
280 charset. All charsets of less than 0xA0 has the value 0.
|
|
281
|
|
282 ISO-FINAL-CHAR (character) is the final character of the
|
|
283 corresponding ISO 2022 charset.
|
|
284
|
|
285 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked
|
|
286 while encoding to variants of ISO 2022 coding system, one of the
|
|
287 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).
|
|
288
|
|
289 REVERSE-CHARSET (integer) is the charset which differs only in
|
|
290 LEFT-TO-RIGHT value from the charset. If there's no such a
|
|
291 charset, the value is -1.
|
|
292
|
|
293 SHORT-NAME (string) is the short name to refer to the charset.
|
|
294
|
|
295 LONG-NAME (string) is the long name to refer to the charset.
|
|
296
|
|
297 DESCRIPTION (string) is the description string of the charset.
|
|
298
|
|
299 PLIST (property list) may contain any type of information a user
|
|
300 want to put and get by functions `put-charset-property' and
|
|
301 `get-charset-property' respectively. */
|
|
302 extern Lisp_Object Vcharset_table;
|
|
303
|
|
/* Macros to access various information of CHARSET in Vcharset_table.
   We provide these macros for efficiency.  No range check of CHARSET.  */

/* Return the entry for CHARSET in Vcharset_table.  CHARSET is used
   directly as an index into the table's `contents' array.  */
#define CHARSET_TABLE_ENTRY(charset) \
  XCHAR_TABLE (Vcharset_table)->contents[charset]

/* Return information INFO-IDX of CHARSET, i.e. element INFO_IDX of
   the vector stored as CHARSET's entry.  INFO_IDX is one of the
   CHARSET_*_IDX constants.  */
#define CHARSET_TABLE_INFO(charset, info_idx) \
  XVECTOR (CHARSET_TABLE_ENTRY (charset))->contents[info_idx]
|
|
314
|
|
/* Indices into the vector stored for each charset in Vcharset_table,
   in the order the pieces of information are listed for that table.  */
#define CHARSET_ID_IDX			(0)
#define CHARSET_BYTES_IDX		(1)
#define CHARSET_DIMENSION_IDX		(2)
#define CHARSET_CHARS_IDX		(3)
#define CHARSET_WIDTH_IDX		(4)
#define CHARSET_DIRECTION_IDX		(5)
#define CHARSET_LEADING_CODE_BASE_IDX	(6)
#define CHARSET_LEADING_CODE_EXT_IDX	(7)
#define CHARSET_ISO_FINAL_CHAR_IDX	(8)
#define CHARSET_ISO_GRAPHIC_PLANE_IDX	(9)
#define CHARSET_REVERSE_CHARSET_IDX	(10)
#define CHARSET_SHORT_NAME_IDX		(11)
#define CHARSET_LONG_NAME_IDX		(12)
#define CHARSET_DESCRIPTION_IDX		(13)
#define CHARSET_PLIST_IDX		(14)
/* Size of a vector of each entry of Vcharset_table.  */
#define CHARSET_MAX_IDX			(15)
|
|
332
|
|
/* And several more macros to be used frequently.  Each one fetches
   one integer field of CHARSET's entry through CHARSET_TABLE_INFO.
   CHARSET_REVERSE_CHARSET is the only one that uses XINT instead of
   XFASTINT (NOTE(review): presumably because that field may hold -1;
   confirm against the definitions of XINT and XFASTINT).  */
#define CHARSET_BYTES(charset) \
  XFASTINT (CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX))
#define CHARSET_DIMENSION(charset) \
  XFASTINT (CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX))
#define CHARSET_CHARS(charset) \
  XFASTINT (CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX))
#define CHARSET_WIDTH(charset) \
  XFASTINT (CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX))
#define CHARSET_DIRECTION(charset) \
  XFASTINT (CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX))
#define CHARSET_LEADING_CODE_BASE(charset) \
  XFASTINT (CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX))
#define CHARSET_LEADING_CODE_EXT(charset) \
  XFASTINT (CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX))
#define CHARSET_ISO_FINAL_CHAR(charset) \
  XFASTINT (CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX))
#define CHARSET_ISO_GRAPHIC_PLANE(charset) \
  XFASTINT (CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX))
#define CHARSET_REVERSE_CHARSET(charset) \
  XINT (CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX))
|
|
354
|
|
/* Macros to specify direction of a charset.  */
#define CHARSET_DIRECTION_LEFT_TO_RIGHT 0
#define CHARSET_DIRECTION_RIGHT_TO_LEFT 1
|
|
358
|
|
359 /* A vector of charset symbol indexed by charset-id. This is used
|
|
360 only for returning charset symbol from C functions. */
|
|
361 extern Lisp_Object Vcharset_symbol_table;
|
|
362
|
|
363 /* Return symbol of CHARSET. */
|
|
364 #define CHARSET_SYMBOL(charset) \
|
|
365 XVECTOR (Vcharset_symbol_table)->contents[charset]
|
|
366
|
|
/* 1 if CHARSET is valid, else 0.  Valid values are 0 (ASCII),
   0x80..MAX_CHARSET_OFFICIAL_DIMENSION2 (composition and official
   charsets) and MIN_CHARSET_PRIVATE_DIMENSION1..MAX_CHARSET - 1
   (private charsets).  CHARSET is evaluated more than once.  */
#define CHARSET_VALID_P(charset) \
  ((charset) == 0 \
   || ((charset) >= 0x80 && (charset) <= MAX_CHARSET_OFFICIAL_DIMENSION2) \
   || ((charset) >= MIN_CHARSET_PRIVATE_DIMENSION1 && (charset) < MAX_CHARSET))
|
|
372
|
|
/* 1 if CHARSET is already defined, else 0.  CHARSET counts as defined
   when it lies in 0..MAX_CHARSET - 1 and its entry in Vcharset_table
   is non-nil.  The range test comes first so that the table is never
   indexed with an out-of-range value.  CHARSET is evaluated more than
   once.  */
#define CHARSET_DEFINED_P(charset) \
  (((charset) >= 0) && ((charset) < MAX_CHARSET) \
   && !NILP (CHARSET_TABLE_ENTRY (charset)))
|
|
377
|
|
/* Since the information CHARSET-BYTES and CHARSET-WIDTH of
   Vcharset_table can be retrieved from just the first byte of
   multi-byte form (an ASCII code or a base leading-code), we provide
   here tables to be used by macros BYTES_BY_CHAR_HEAD and
   WIDTH_BY_CHAR_HEAD for faster information retrieval.  */
extern int bytes_by_char_head[256];
extern int width_by_char_head[256];

/* CHAR_HEAD is used directly as the index, so it must be in 0..255;
   callers holding a plain char should cast it to unsigned char.  */
#define BYTES_BY_CHAR_HEAD(char_head) bytes_by_char_head[char_head]
#define WIDTH_BY_CHAR_HEAD(char_head) width_by_char_head[char_head]
|
|
388
|
|
/* Charset of the character C.  The ranges are tested in increasing
   order of character code: single-byte, DIMENSION1 (charset is
   FIELD2 + 0x70), official DIMENSION2 (FIELD1 + 0x8F), private
   DIMENSION2 (FIELD1 + 0xE0), and finally composite characters.
   C is evaluated more than once.  */
#define CHAR_CHARSET(c) \
  (SINGLE_BYTE_CHAR_P (c) \
   ? CHARSET_ASCII \
   : ((c) < MIN_CHAR_OFFICIAL_DIMENSION2 \
      ? CHAR_FIELD2 (c) + 0x70 \
      : ((c) < MIN_CHAR_PRIVATE_DIMENSION2 \
         ? CHAR_FIELD1 (c) + 0x8F \
         : ((c) < MIN_CHAR_COMPOSITION \
            ? CHAR_FIELD1 (c) + 0xE0 \
            : CHARSET_COMPOSITION))))
|
|
400
|
|
/* Return charset at the place pointed by P.

   A byte below 0x80 is ASCII.  LEADING_CODE_COMPOSITION gives
   CHARSET_COMPOSITION.  Any other byte below LEADING_CODE_PRIVATE_11
   is itself the charset.  A byte in LEADING_CODE_PRIVATE_11 ..
   LEADING_CODE_PRIVATE_22 is followed by the extended leading-code,
   which is the charset.  Anything else yields -1.

   Every byte is read as unsigned char, so the result is the same
   whether P is a `char *' (possibly signed) or an `unsigned char *'.
   P is evaluated more than once.  */
#define CHARSET_AT(p) \
  ((unsigned char) *(p) < 0x80 \
   ? CHARSET_ASCII \
   : ((unsigned char) *(p) == LEADING_CODE_COMPOSITION \
      ? CHARSET_COMPOSITION \
      : ((unsigned char) *(p) < LEADING_CODE_PRIVATE_11 \
         ? (int) (unsigned char) *(p) \
         : ((unsigned char) *(p) <= LEADING_CODE_PRIVATE_22 \
            ? (int) (unsigned char) *((p) + 1) \
            : -1))))
|
|
412
|
|
/* Same as `CHARSET_AT ()' but perhaps runs faster because of an
   additional argument C which is the code (byte) at P.  C is expected
   to be in 0..255.  P is dereferenced only to fetch the extended
   leading-code at P + 1, which is read as unsigned char so that a
   signed `char *' does not produce a negative charset.  C is
   evaluated more than once.  */
#define FIRST_CHARSET_AT(p, c) \
  ((c) < 0x80 \
   ? CHARSET_ASCII \
   : ((c) == LEADING_CODE_COMPOSITION \
      ? CHARSET_COMPOSITION \
      : ((c) < LEADING_CODE_PRIVATE_11 \
         ? (int) (c) \
         : ((c) <= LEADING_CODE_PRIVATE_22 \
            ? (int) (unsigned char) *((p) + 1) \
            : -1))))
|
|
425
|
|
/* Check if two characters C1 and C2 belong to the same charset.
   Always return 0 if C1 is a composite character.

   Single-byte characters match only other single-byte characters.
   Otherwise the charset field is compared: FIELD2 when C1 is below
   MIN_CHAR_OFFICIAL_DIMENSION2, FIELD1 when it is not.  Both
   arguments are fully parenthesized, so arbitrary expressions may be
   passed; each may be evaluated more than once.  */
#define SAME_CHARSET_P(c1, c2) \
  ((c1) < MIN_CHAR_COMPOSITION \
   && (SINGLE_BYTE_CHAR_P (c1) \
       ? SINGLE_BYTE_CHAR_P (c2) \
       : ((c1) < MIN_CHAR_OFFICIAL_DIMENSION2 \
          ? ((c1) & CHAR_FIELD2_MASK) == ((c2) & CHAR_FIELD2_MASK) \
          : ((c1) & CHAR_FIELD1_MASK) == ((c2) & CHAR_FIELD1_MASK))))
|
|
435
|
|
/* Return a non-ASCII character of which charset is CHARSET and
   position-codes are C1 and C2.  DIMENSION1 character ignores C2.
   For CHARSET_COMPOSITION, C1 and C2 are taken as the upper and lower
   7 bits of the CMPCHAR-ID.  C1 and C2 are not masked here; callers
   must pass 7-bit values.  The arguments may be evaluated more than
   once.  */
#define MAKE_NON_ASCII_CHAR(charset, c1, c2) \
  ((charset) == CHARSET_COMPOSITION \
   ? MAKE_COMPOSITE_CHAR (((c1) << 7) + (c2)) \
   : (CHARSET_DIMENSION (charset) == 1 \
      ? (((charset) - 0x70) << 7) | (c1) \
      : ((charset) < MIN_CHARSET_PRIVATE_DIMENSION2 \
         ? (((charset) - 0x8F) << 14) | ((c1) << 7) | (c2) \
         : (((charset) - 0xE0) << 14) | ((c1) << 7) | (c2))))
|
|
446
|
|
/* Return a composite character of which CMPCHAR-ID is ID.  */
#define MAKE_COMPOSITE_CHAR(id) (MIN_CHAR_COMPOSITION + (id))

/* Return CMPCHAR-ID of a composite character C.  This is the inverse
   of MAKE_COMPOSITE_CHAR.  */
#define COMPOSITE_CHAR_ID(c) ((c) - MIN_CHAR_COMPOSITION)
|
|
452
|
|
/* Return a character of which charset is CHARSET and position-codes
   are C1 and C2.  DIMENSION1 character ignores C2.  For CHARSET_ASCII
   the result is C1 unchanged; otherwise C1 and C2 are masked to 7
   bits before being handed to MAKE_NON_ASCII_CHAR.  */
#define MAKE_CHAR(charset, c1, c2) \
  ((charset) == CHARSET_ASCII \
   ? (c1) \
   : MAKE_NON_ASCII_CHAR ((charset), (c1) & 0x7F, (c2) & 0x7F))
|
|
459
|
|
/* The charset of non-ASCII character C is set to CHARSET, and the
   position-codes of C are set to C1 and C2.  C2 of DIMENSION1 character
   is 0.  For a composite character, CHARSET becomes
   CHARSET_COMPOSITION and C1/C2 receive FIELD2/FIELD3 of C.
   CHARSET, C1 and C2 must be lvalues; C is evaluated more than
   once.  */
#define SPLIT_NON_ASCII_CHAR(c, charset, c1, c2) \
  ((c) < MIN_CHAR_OFFICIAL_DIMENSION2 \
   ? ((charset) = CHAR_FIELD2 (c) + 0x70, \
      (c1) = CHAR_FIELD3 (c), \
      (c2) = 0) \
   : ((charset) = ((c) < MIN_CHAR_COMPOSITION \
                   ? (CHAR_FIELD1 (c) \
                      + ((c) < MIN_CHAR_PRIVATE_DIMENSION2 ? 0x8F : 0xE0)) \
                   : CHARSET_COMPOSITION), \
      (c1) = CHAR_FIELD2 (c), \
      (c2) = CHAR_FIELD3 (c)))
|
|
474
|
|
/* The charset of character C is set to CHARSET, and the
   position-codes of C are set to C1 and C2.  C2 of DIMENSION1 character
   is 0.  A single-byte character yields CHARSET_ASCII with C1 = C;
   everything else is delegated to SPLIT_NON_ASCII_CHAR.  CHARSET, C1
   and C2 must be lvalues.  */
#define SPLIT_CHAR(c, charset, c1, c2) \
  (SINGLE_BYTE_CHAR_P (c) \
   ? ((charset) = CHARSET_ASCII, (c1) = (c), (c2) = 0) \
   : SPLIT_NON_ASCII_CHAR (c, charset, c1, c2))
|
|
482
|
|
/* The charset of the character at STR is set to CHARSET, and the
   position-codes are set to C1 and C2.  C2 of DIMENSION1 character is 0.
   If the character is a composite character, the upper 7-bit and
   lower 7-bit of CMPCHAR-ID are set in C1 and C2 respectively.  No
   range checking.

   The fallback branch is taken when the head byte announces fewer
   than 2 bytes, when it announces more bytes than LEN, or when
   split_non_ascii_string returns a negative value.  In that case the
   first byte (read as unsigned char) is stored in C1, C2 is set to 0
   and CHARSET is set to CHARSET_ASCII.  The value of the whole
   expression is CHARSET.  CHARSET, C1 and C2 must be lvalues.  */
#define SPLIT_STRING(str, len, charset, c1, c2) \
  ((BYTES_BY_CHAR_HEAD ((unsigned char) *(str)) < 2 \
    || BYTES_BY_CHAR_HEAD ((unsigned char) *(str)) > (len) \
    || split_non_ascii_string (str, len, &(charset), &(c1), &(c2), 0) < 0) \
   ? ((c1) = (unsigned char) *(str), (c2) = 0, (charset) = CHARSET_ASCII) \
   : (charset))
|
|
494
|
|
/* Exclusive upper bound of charset values: CHARSET_VALID_P and
   CHARSET_DEFINED_P both test `charset < MAX_CHARSET'.  */
#define MAX_CHARSET 0xFF
|
|
496
|
|
/* Mapping table from ISO2022's charset (specified by DIMENSION,
   CHARS, and FINAL_CHAR) to Emacs' charset.  Should be accessed by
   macro ISO_CHARSET_TABLE (DIMENSION, CHARS, FINAL_CHAR).  */
extern int iso_charset_table[2][2][128];

/* DIMENSION, CHARS and FINAL_CHAR are each converted with XINT.
   DIMENSION - 1 selects the first index, `CHARS > 94' (0 or 1) the
   second, and FINAL_CHAR the third.  No range checking.  */
#define ISO_CHARSET_TABLE(dimension, chars, final_char) \
  iso_charset_table[XINT (dimension) - 1][XINT (chars) > 94][XINT (final_char)]
|
|
504
|
|
/* 1 if byte C starts a multi-byte form longer than one byte according
   to BYTES_BY_CHAR_HEAD, else 0.  C is converted to unsigned char
   before the lookup.  */
#define BASE_LEADING_CODE_P(c) (BYTES_BY_CHAR_HEAD ((unsigned char) (c)) > 1)
|
|
506
|
|
/* The following two macros CHAR_STRING and STRING_CHAR are the main
   entry points to convert between Emacs' two types of character
   representations: multi-byte form and single-word form (character
   code).  */

/* Set STR a pointer to the multi-byte form of the character C.  If C
   is not a composite character, the multi-byte form is set in WORKBUF
   and STR points WORKBUF.  The caller should allocate at least 4-byte
   area at WORKBUF in advance.  Returns the length of the multi-byte
   form.

   A single-byte character is stored directly in WORKBUF[0] and the
   length is 1; anything else is handed to non_ascii_char_to_string.
   STR must be an lvalue.  */

#define CHAR_STRING(c, workbuf, str) \
  (SINGLE_BYTE_CHAR_P (c) \
   ? (*((str) = (workbuf)) = (unsigned char) (c), 1) \
   : non_ascii_char_to_string (c, workbuf, &(str)))
|
|
522
|
|
/* Return a character code of the character of which multi-byte form
   is at STR and the length is LEN.  If STR doesn't contain valid
   multi-byte form, only the first byte in STR is returned.

   The first byte (as unsigned char) is also returned directly when
   BYTES_BY_CHAR_HEAD says the form is one byte long or longer than
   LEN; otherwise string_to_non_ascii_char does the conversion.  */

#define STRING_CHAR(str, len) \
  ((BYTES_BY_CHAR_HEAD ((unsigned char) *(str)) == 1 \
    || BYTES_BY_CHAR_HEAD ((unsigned char) *(str)) > (len)) \
   ? (unsigned char) *(str) \
   : string_to_non_ascii_char (str, len, 0))
|
|
532
|
|
/* This is like STRING_CHAR but the third arg ACTUAL_LEN is set to
   the length of the multi-byte form.  Just to know the length, use
   MULTIBYTE_FORM_LENGTH.

   When the first byte is returned as is (one-byte form, or a form
   longer than LEN), ACTUAL_LEN is set to 1.  Otherwise the address of
   ACTUAL_LEN is passed to string_to_non_ascii_char.  ACTUAL_LEN must
   be an lvalue.  */

#define STRING_CHAR_AND_LENGTH(str, len, actual_len) \
  ((BYTES_BY_CHAR_HEAD ((unsigned char) *(str)) == 1 \
    || BYTES_BY_CHAR_HEAD ((unsigned char) *(str)) > (len)) \
   ? ((actual_len) = 1, (unsigned char) *(str)) \
   : string_to_non_ascii_char (str, len, &(actual_len)))
|
|
542
|
|
/* Return the length of the multi-byte form at string STR of length LEN.
   The result is 1 when BYTES_BY_CHAR_HEAD says the form is one byte
   long or longer than LEN; otherwise multibyte_form_length is
   consulted.  */

#define MULTIBYTE_FORM_LENGTH(str, len) \
  ((BYTES_BY_CHAR_HEAD (*(unsigned char *)(str)) == 1 \
    || BYTES_BY_CHAR_HEAD (*(unsigned char *)(str)) > (len)) \
   ? 1 \
   : multibyte_form_length (str, len))
|
|
550
|
|
/* Set C a (possibly multibyte) character at P.  P points into a
   string which is the virtual concatenation of STR1 (which ends at
   END1) and STR2 (which ends at END2).

   When P is END1 the character is read from the start of STR2.  The
   length given to STRING_CHAR runs up to END1 if P lies within
   [STR1, END1), and up to END2 otherwise.  C must be an lvalue.  */

#define GET_CHAR_AFTER_2(c, p, str1, end1, str2, end2) \
  do { \
    const char *dtemp = (p) == (end1) ? (str2) : (p); \
    const char *dlimit = ((p) >= (str1) && (p) < (end1)) ? (end1) : (end2); \
    (c) = STRING_CHAR (dtemp, dlimit - dtemp); \
  } while (0)
|
|
561
|
|
/* Set C a (possibly multibyte) character before P.  P points into a
   string which is the virtual concatenation of STR1 (which ends at
   END1) and STR2 (which ends at END2).

   There must be a character before P, i.e. P must not be the start of
   the concatenation.  When P is the start of STR2, the previous
   character is the last one of STR1, so the backward scan starts from
   END1 (the mirror image of how GET_CHAR_AFTER_2 treats P == END1).
   The scan skips bytes >= 0xA0 and never moves below the start of the
   string being scanned.  C must be an lvalue.  */

#define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
  do { \
    const char *dend = (p) == (str2) ? (end1) : (p); \
    const char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
    const char *dtemp = dend; \
    while (dtemp > dlimit && (unsigned char) *--dtemp >= 0xA0) \
      ; \
    (c) = STRING_CHAR (dtemp, dend - dtemp); \
  } while (0)
|
|
573
|
|
#ifdef emacs

/* Increase the buffer point POS of the current buffer to the next
   character boundary.  This macro relies on the fact that *GPT_ADDR
   and *Z_ADDR are always accessible and the values are '\0'.  No
   range checking of POS.

   POS is advanced by one and then further while the byte at the new
   position is not a character head (see CHAR_HEAD_P).  POS must be an
   lvalue and must not be a variable named `p', which is the name of
   the macro's own local.  */
#define INC_POS(pos) \
  do { \
    unsigned char *p = POS_ADDR (pos) + 1; \
    (pos)++; \
    while (!CHAR_HEAD_P (p)) p++, (pos)++; \
  } while (0)

/* Decrease the buffer point POS of the current buffer to the previous
   character boundary.  No range checking of POS.

   POS is decremented once, then the byte address for it is computed
   on the proper side of the gap (before the gap when POS < GPT,
   otherwise offset by GAP_SIZE).  POS keeps decreasing while that
   byte is not a character head and the address is above the start of
   its side (BEG_ADDR or GAP_END_ADDR).  POS must be an lvalue and must
   not be a variable named `p' or `p_min'.  */
#define DEC_POS(pos) \
  do { \
    unsigned char *p, *p_min; \
    if (--(pos) < GPT) \
      p = BEG_ADDR + (pos) - 1, p_min = BEG_ADDR; \
    else \
      p = BEG_ADDR + GAP_SIZE + (pos) - 1, p_min = GAP_END_ADDR; \
    while (p > p_min && !CHAR_HEAD_P (p)) p--, (pos)--; \
  } while (0)

#endif /* emacs */
|
|
600
|
|
/* Maximum number of components in one composite character.  */
#define MAX_COMPONENT_COUNT 16
|
|
603
|
|
604 /* Structure to hold information of a composite character. */
|
|
605 struct cmpchar_info {
|
|
606 /* Byte length of the composite character. */
|
|
607 int len;
|
|
608
|
|
609 /* Multi-byte form of the composite character. */
|
|
610 unsigned char *data;
|
|
611
|
|
612 /* Length of glyph codes. */
|
|
613 int glyph_len;
|
|
614
|
|
615 /* Width of the overall glyph of the composite character. */
|
|
616 int width;
|
|
617
|
|
618 /* Pointer to an array of glyph codes of the composite character.
|
|
619 This actually contains only character code, no face. */
|
|
620 GLYPH *glyph;
|
|
621
|
|
622 /* Pointer to an array of composition rules. The value has the form:
|
|
623 (0xA0 + ((GLOBAL-REF-POINT << 2) | NEW-REF-POINT))
|
|
624 where each XXX-REF-POINT is 0..8. */
|
|
625 unsigned char *cmp_rule;
|
|
626
|
|
627 /* Pointer to an array of x-axis offset of left edge of glyphs
|
|
628 relative to the left of of glyph[0] except for the first element
|
|
629 which is the absolute offset from the left edge of overall glyph.
|
|
630 The actual pixel offset should be calculated by multiplying each
|
|
631 frame's one column width by this value:
|
|
632 (i.e. FONT_WIDTH (f->output_data.x->font) * col_offset[N]). */
|
|
633 float *col_offset;
|
|
634
|
|
635 /* Work slot used by `dumpglyphs' (xterm.c). */
|
|
636 int face_work;
|
|
637 };
|
|
638
|
|
/* Table of pointers to the structure `cmpchar_info' indexed by
   CMPCHAR-ID.  */
extern struct cmpchar_info **cmpchar_table;

/* Number of the current composite characters.  */
extern int n_cmpchars;
|
|
644
|
|
/* This is the maximum length of multi-byte form, computed as 6 bytes
   for each of the MAX_COMPONENT_COUNT components.  */
#define MAX_LENGTH_OF_MULTI_BYTE_FORM (MAX_COMPONENT_COUNT * 6)
|
|
647
|
|
648 #endif /* _CHARSET_H */
|
|
649
|