90086
|
1 ;; unidata-gen.el -- Create files containing character property data.
|
|
2 ;; Copyright (C) 2005
|
|
3 ;; National Institute of Advanced Industrial Science and Technology (AIST)
|
|
4 ;; Registration Number H13PRO009
|
|
5
|
|
6 ;; This file is part of GNU Emacs.
|
|
7
|
|
8 ;; GNU Emacs is free software; you can redistribute it and/or modify
|
|
9 ;; it under the terms of the GNU General Public License as published by
|
|
10 ;; the Free Software Foundation; either version 2, or (at your option)
|
|
11 ;; any later version.
|
|
12
|
|
13 ;; GNU Emacs is distributed in the hope that it will be useful,
|
|
14 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
15 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
16 ;; GNU General Public License for more details.
|
|
17
|
|
18 ;; You should have received a copy of the GNU General Public License
|
|
19 ;; along with GNU Emacs; see the file COPYING. If not, write to the
|
|
20 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
|
21 ;; Boston, MA 02111-1307, USA.
|
|
22
|
|
23 ;;; Commentary:
|
|
24
|
|
25 ;; FILES TO BE GENERATED
|
|
26 ;;
|
|
27 ;; The entry function `unidata-gen-files' generated these filese in
|
|
28 ;; the current directory.
|
|
29 ;;
|
|
30 ;; charprop.el
|
|
31 ;; It contains a series of forms of this format:
|
|
32 ;; (char-code-property-register PROP FILE)
|
|
33 ;; where PROP is a symbol representing a character property
|
|
34 ;; (name, geneirc-category, etc), and FILE is a name of one of
|
|
35 ;; the following files.
|
|
36 ;;
|
|
37 ;; uni-name.el, uni-cat.el, uni-comb.el, uni-bidi.el
|
|
38 ;; It contains a single form of this format:
|
|
39 ;; (char-code-property-register PROP CHAR-TABLE)
|
|
40 ;; where PROP is the same as above, and CHAR-TABLE is a
|
|
41 ;; char-table containing property values in a comporessed format.
|
|
42 ;;
|
|
43 ;; When they are installed in .../lisp/international/, the file
|
|
44 ;; "charprop.el" is preloaded in loadup.el. The other files are
|
|
45 ;; automatically loaded when the functions `get-char-code-property'
|
|
46 ;; and `put-char-code-property' are called.
|
|
47 ;;
|
|
48 ;; FORMAT OF A CHAR TABLE
|
|
49 ;;
|
|
50 ;; We want to make a file size containing a char-table small. We
|
|
51 ;; also want to load the file and get a property value fast. We
|
|
52 ;; also want to reduce the used memory after loading it. So,
|
|
53 ;; instead of naively storing a property value for each character in
|
|
54 ;; a char-table (and write it out into a file), we store compressed
|
|
55 ;; data in a char-table as below.
|
|
56 ;;
|
|
57 ;; If succeeding 128*N characters have the same property value, we
|
|
58 ;; store that value for them. Otherwise, comporess values for
|
|
59 ;; succeeding 128 characters into a single string and store it as a
|
|
60 ;; value for those characters. The way of compression depends on a
|
|
61 ;; property. See the section "SIMPLE TABLE", "RUN-LENGTH TABLE",
|
|
62 ;; and "WORD-LIST TABLE".
|
|
63
|
|
64 ;; The char table has four extra slots:
|
|
65 ;; 1st: property symbol
|
|
66 ;; 2nd: function to call to get a property value
|
|
67 ;; 3nd: function to call to put a property value
|
|
68 ;; 4th: function to call to get a description of a property value
|
|
69 ;; 5th: data referred by the above functions
|
|
70 ;;
|
|
71 ;; The actual
|
|
72 ;; For more detail, see the comments in the section "SIMPLE TABLE"
|
|
73 ;; and "NAME TABLE".
|
|
74
|
|
75 ;; The name of the file UnicodeData.txt.
|
|
76 (defconst unidata-text-file
|
|
77 (expand-file-name "admin/unidata/UnicodeData.txt" source-directory))
|
|
78
|
|
79 ;; List of elements of this form:
|
|
80 ;; (CHAR-or-RANGE PROP1 PROP2 ... PROPn)
|
|
81 ;; CHAR-or-RANGE: a character code or a cons of character codes
|
|
82 ;; PROPn: string representing the nth property value
|
|
83
|
|
84 (defvar unidata-list
|
|
85 (let* ((table (list nil))
|
|
86 (tail table)
|
|
87 (block-names '(("^<CJK Ideograph" . CJK\ IDEOGRAPH)
|
|
88 ("^<Hangul Syllable" . HANGUL\ SYLLABLE)
|
|
89 ("^<.*Surrogate" . nil)
|
|
90 ("^<.*Private Use" . PRIVATE\ USE)))
|
|
91 val char name)
|
|
92 (or (file-readable-p unidata-text-file)
|
|
93 (error "File not readable: %s" unidata-text-file))
|
|
94 (with-temp-buffer
|
|
95 (call-process "sed" unidata-text-file t nil
|
|
96 "-e" "s/\\([^;]*\\);\\(.*\\)/(#x\\1 \\\"\\2\\\")/"
|
|
97 "-e" "s/;/\\\" \\\"/g")
|
|
98 (goto-char (point-min))
|
|
99 (condition-case nil
|
|
100 (while t
|
|
101 (setq val (read (current-buffer))
|
|
102 char (car val)
|
|
103 name (cadr val))
|
|
104
|
|
105 ;; Check this kind of block.
|
|
106 ;; 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
|
|
107 ;; 9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
|
|
108 (if (and (= (aref name 0) ?<)
|
|
109 (string-match ", First>$" name))
|
|
110 (let ((first char)
|
|
111 (l block-names)
|
|
112 block-name)
|
|
113 (setq val (read (current-buffer))
|
|
114 char (car val)
|
|
115 block-name (cadr val)
|
|
116 name nil)
|
|
117 (while l
|
|
118 (if (string-match (caar l) block-name)
|
|
119 (setq name (cdar l) l nil)
|
|
120 (setq l (cdr l))))
|
|
121 (if (not name)
|
|
122 ;; As this is a surrogate pair range, ignore it.
|
|
123 (setq val nil)
|
|
124 (setcar val (cons first char))
|
|
125 (setcar (cdr val) name))))
|
|
126
|
|
127 (when val
|
|
128 (setcdr tail (list val))
|
|
129 (setq tail (cdr tail))))
|
|
130 (error nil)))
|
|
131 (cdr table)))
|
|
132
|
|
133 ;; Alist of this form:
|
|
134 ;; (PROP INDEX GENERATOR FILENAME)
|
|
135 ;; PROP: character property
|
|
136 ;; INDEX: index to each element of unidata-list for PROP
|
|
137 ;; GENERATOR: function to generate a char-table
|
|
138 ;; FILENAME: filename to store the char-table
|
|
139 ;; DESCRIBER: function to call to get a description string of property value
|
|
140
|
|
141 (defconst unidata-prop-alist
|
|
142 '((name
|
|
143 1 unidata-gen-table-name "uni-name.el"
|
|
144 "Unicode character name.
|
|
145 Property value is a string.")
|
|
146 (general-category
|
|
147 2 unidata-gen-table-symbol "uni-category.el"
|
|
148 "Unicode general category.
|
|
149 Property value is one of the following symbols:
|
|
150 Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
|
|
151 Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn"
|
|
152 unidata-describe-general-category)
|
|
153 (canonical-combining-class
|
|
154 3 unidata-gen-table-integer "uni-combining.el"
|
|
155 "Unicode canonical combining class.
|
|
156 Property value is an integer."
|
|
157 unidata-describe-canonical-combining-class)
|
|
158 (bidi-class
|
|
159 4 unidata-gen-table-symbol "uni-bidi.el"
|
|
160 "Unicode bidi class.
|
|
161 Property value is one of the following symbols:
|
|
162 L, LRE, LRO, R, AL, RLE, RLO, PDF, EN, ES, ET,
|
|
163 AN, CS, NSM, BN, B, S, WS, ON"
|
|
164 unidata-describe-bidi-class)
|
|
165 (decomposition
|
|
166 5 unidata-gen-table-decomposition "uni-decomposition.el"
|
|
167 "Unicode decomposition mapping.
|
|
168 Property value is a list of characters. The first element may be
|
|
169 one of these symbols representing compatiblity formatting tag:
|
|
170 <font>, <noBreak>, <initial>, <medial>, <final>, <isolated>, <circle>,
|
|
171 <super>, <sub>, <vertical>, <wide>, <narrow>, <small>, <square>, <fraction>,
|
|
172 <compat>"
|
|
173 unidata-describe-decomposition)
|
|
174 (decimal-digit-value
|
|
175 6 unidata-gen-table-integer "uni-decimal.el"
|
|
176 "Unicode numeric value (decimal digit).
|
|
177 Property value is an integer.")
|
|
178 (digit-value
|
|
179 7 unidata-gen-table-integer "uni-digit.el"
|
|
180 "Unicode numeric value (digit).
|
|
181 Property value is an integer.")
|
|
182 (numeric-value
|
|
183 8 unidata-gen-table-symbol "uni-numeric.el"
|
|
184 "Unicode numeric value (numeric).
|
|
185 Property value is an symbol.")
|
|
186 (mirrored
|
|
187 9 unidata-gen-table-symbol "uni-mirrored.el"
|
|
188 "Unicode bidi mirrored flag.
|
|
189 Property value is a symbol `Y' or `N'.")
|
|
190 (old-name
|
|
191 10 unidata-gen-table-name "uni-old-name.el"
|
|
192 "Unicode old names as published in Unicode 1.0.
|
|
193 Property value is a string.")
|
|
194 (iso-10646-comment
|
|
195 11 unidata-gen-table-name "uni-comment.el"
|
|
196 "Unicode ISO 10646 comment.
|
|
197 Property value is a string.")
|
|
198 (uppercase
|
|
199 12 unidata-gen-table-character "uni-uppercase.el"
|
|
200 "Unicode simple uppercase mapping.
|
|
201 Property value is a character."
|
|
202 string)
|
|
203 (lowercase
|
|
204 13 unidata-gen-table-character "uni-lowercase.el"
|
|
205 "Unicode simple lowercase mapping.
|
|
206 Property value is a character."
|
|
207 string)
|
|
208 (titlecase
|
|
209 14 unidata-gen-table-character "uni-titlecase.el"
|
|
210 "Unicode simple titlecase mapping.
|
|
211 Property value is a character."
|
|
212 string)))
|
|
213
|
|
214 ;; Functions to access the above data.
|
|
215 (defsubst unidata-prop-index (prop) (nth 1 (assq prop unidata-prop-alist)))
|
|
216 (defsubst unidata-prop-generator (prop) (nth 2 (assq prop unidata-prop-alist)))
|
|
217 (defsubst unidata-prop-file (prop) (nth 3 (assq prop unidata-prop-alist)))
|
|
218 (defsubst unidata-prop-docstring (prop) (nth 4 (assq prop unidata-prop-alist)))
|
|
219 (defsubst unidata-prop-describer (prop) (nth 5 (assq prop unidata-prop-alist)))
|
|
220
|
|
221
|
|
222 ;; SIMPLE TABLE
|
|
223 ;;
|
|
224 ;; If the type of character property value is character, and the
|
|
225 ;; values of succeeding character codes are usually different, we use
|
|
226 ;; a char-table described here to store such values.
|
|
227 ;;
|
|
228 ;; If succeeding 128 characters has no property, a char-table has the
|
|
229 ;; symbol t is for them. Otherwise a char-table has a string of the
|
|
230 ;; following format for them.
|
|
231 ;;
|
|
232 ;; The first character of the string is FIRST-INDEX.
|
|
233 ;; The Nth (N > 0) character of the string is a property value of the
|
|
234 ;; character (BLOCk-HEAD + FIRST-INDEX + N - 1), where BLOCK-HEAD is
|
|
235 ;; the first of the characters in the block.
|
|
236 ;;
|
|
237 ;; The 4th extra slot of a char-table is nil.
|
|
238
|
|
239 (defun unidata-get-character (char val table)
|
|
240 (cond
|
|
241 ((characterp val)
|
|
242 val)
|
|
243
|
|
244 ((stringp val)
|
|
245 (let* ((len (length val))
|
|
246 (block-head (lsh (lsh char -7) 7))
|
|
247 (vec (make-vector 128 nil))
|
|
248 (first-index (aref val 0)))
|
|
249 (dotimes (i (1- len))
|
|
250 (let ((elt (aref val (1+ i))))
|
|
251 (if (> elt 0)
|
|
252 (aset vec (+ first-index i) elt))))
|
|
253 (dotimes (i 128)
|
|
254 (aset table (+ block-head i) (aref vec i)))
|
|
255 (aref vec (- char block-head))))))
|
|
256
|
|
257 (defun unidata-put-character (char val table)
|
|
258 (or (characterp val)
|
|
259 (not val)
|
|
260 (error "Not an character nor nil: %S" val))
|
|
261 (let ((current-val (aref table char)))
|
|
262 (unless (eq current-val val)
|
|
263 (if (stringp current-val)
|
|
264 (funcall (char-table-extra-slot table 1) char current-val table))
|
|
265 (aset table char val))))
|
|
266
|
|
267 (defun unidata-gen-table-character (prop)
|
|
268 (let ((table (make-char-table 'char-code-property-table))
|
|
269 (prop-idx (unidata-prop-index prop))
|
|
270 (vec (make-vector 128 0))
|
|
271 (tail unidata-list)
|
|
272 elt range val idx slot)
|
|
273 (set-char-table-range table (cons 0 (max-char)) t)
|
|
274 (while tail
|
|
275 (setq elt (car tail) tail (cdr tail))
|
|
276 (setq range (car elt)
|
|
277 val (nth prop-idx elt))
|
|
278 (if (= (length val) 0)
|
|
279 (setq val nil)
|
|
280 (setq val (string-to-number val 16)))
|
|
281 (if (consp range)
|
|
282 (if val
|
|
283 (set-char-table-range table range val))
|
|
284 (let* ((start (lsh (lsh range -7) 7))
|
|
285 (limit (+ start 127))
|
|
286 first-index last-index)
|
|
287 (fillarray vec 0)
|
|
288 (if val
|
|
289 (aset vec (setq last-index (setq first-index (- range start)))
|
|
290 val))
|
|
291 (while (and (setq elt (car tail) range (car elt))
|
|
292 (integerp range)
|
|
293 (<= range limit))
|
|
294 (setq val (nth prop-idx elt))
|
|
295 (when (> (length val) 0)
|
|
296 (aset vec (setq last-index (- range start))
|
|
297 (string-to-number val 16))
|
|
298 (or first-index
|
|
299 (setq first-index last-index)))
|
|
300 (setq tail (cdr tail)))
|
|
301 (when first-index
|
|
302 (let ((str (string first-index))
|
|
303 c)
|
|
304 (while (<= first-index last-index)
|
|
305 (setq str (format "%s%c" str (or (aref vec first-index) 0))
|
|
306 first-index (1+ first-index)))
|
|
307 (set-char-table-range table (cons start limit) str))))))
|
|
308
|
|
309 (set-char-table-extra-slot table 0 prop)
|
|
310 (byte-compile 'unidata-get-character)
|
|
311 (byte-compile 'unidata-put-character)
|
|
312 (set-char-table-extra-slot table 1 (symbol-function 'unidata-get-character))
|
|
313 (set-char-table-extra-slot table 2 (symbol-function 'unidata-put-character))
|
|
314
|
|
315 table))
|
|
316
|
|
317
|
|
318
|
|
319 ;; RUN-LENGTH TABLE
|
|
320 ;;
|
|
321 ;; If the type of character property value is symbol, integer,
|
|
322 ;; boolean, or character, we use a char-table described here to store
|
|
323 ;; the values.
|
|
324 ;;
|
|
325 ;; The 4th extra slot is a vector of property values (VAL-TABLE), and
|
|
326 ;; values for succeeding 128 characters are encoded into this
|
|
327 ;; character sequence:
|
|
328 ;; ( VAL-CODE RUN-LENGTH ? ) +
|
|
329 ;; where:
|
|
330 ;; VAL-CODE (0..127):
|
|
331 ;; (VAL-CODE - 1) is an index into VAL-TABLE.
|
|
332 ;; The value 0 means no-value.
|
|
333 ;; RUN-LENGTH (130..255):
|
|
334 ;; (RUN-LENGTH - 128) specifies how many characters have the same
|
|
335 ;; value. If omitted, it means 1.
|
|
336
|
|
337
|
|
338 ;; Return a symbol-type character property value of CHAR. VAL is the
|
|
339 ;; current value of (aref TABLE CHAR).
|
|
340
|
|
341 (defun unidata-get-symbol (char val table)
|
|
342 (let ((val-table (char-table-extra-slot table 4)))
|
|
343 (cond ((symbolp val)
|
|
344 val)
|
|
345 ((stringp val)
|
|
346 (let ((first-char (lsh (lsh char -7) 7))
|
|
347 (str val)
|
|
348 (len (length val))
|
|
349 (idx 0)
|
|
350 this-val count)
|
|
351 (set-char-table-range table (cons first-char (+ first-char 127))
|
|
352 nil)
|
|
353 (while (< idx len)
|
|
354 (setq val (aref str idx) idx (1+ idx)
|
|
355 count (if (< idx len) (aref str idx) 1))
|
|
356 (setq val (and (> val 0) (aref val-table (1- val)))
|
|
357 count (if (< count 128)
|
|
358 1
|
|
359 (prog1 (- count 128) (setq idx (1+ idx)))))
|
|
360 (dotimes (i count)
|
|
361 (if val
|
|
362 (aset table first-char val))
|
|
363 (if (= first-char char)
|
|
364 (setq this-val val))
|
|
365 (setq first-char (1+ first-char))))
|
|
366 this-val))
|
|
367 ((> val 0)
|
|
368 (aref val-table (1- val))))))
|
|
369
|
|
370 ;; Return a integer-type character property value of CHAR. VAL is the
|
|
371 ;; current value of (aref TABLE CHAR).
|
|
372
|
|
373 (defun unidata-get-integer (char val table)
|
|
374 (let ((val-table (char-table-extra-slot table 4)))
|
|
375 (cond ((integerp val)
|
|
376 val)
|
|
377 ((stringp val)
|
|
378 (let ((first-char (lsh (lsh char -7) 7))
|
|
379 (str val)
|
|
380 (len (length val))
|
|
381 (idx 0)
|
|
382 this-val count)
|
|
383 (while (< idx len)
|
|
384 (setq val (aref str idx) idx (1+ idx)
|
|
385 count (if (< idx len) (aref str idx) 1))
|
|
386 (setq val (and (> val 0) (aref val-table (1- val)))
|
|
387 count (if (< count 128)
|
|
388 1
|
|
389 (prog1 (- count 128) (setq idx (1+ idx)))))
|
|
390 (dotimes (i count)
|
|
391 (aset table first-char val)
|
|
392 (if (= first-char char)
|
|
393 (setq this-val val))
|
|
394 (setq first-char (1+ first-char))))
|
|
395 this-val)))))
|
|
396
|
|
397 ;; Store VAL (symbol) as a character property value of CHAR in TABLE.
|
|
398
|
|
399 (defun unidata-put-symbol (char val table)
|
|
400 (or (symbolp val)
|
|
401 (error "Not a symbol: %S" val))
|
|
402 (let ((current-val (aref table char)))
|
|
403 (unless (eq current-val val)
|
|
404 (if (stringp current-val)
|
|
405 (funcall (char-table-extra-slot table 1) char current-val table))
|
|
406 (aset table char val))))
|
|
407
|
|
408 ;; Store VAL (integer) as a character property value of CHAR in TABLE.
|
|
409
|
|
410 (defun unidata-put-integer (char val table)
|
|
411 (or (integerp val)
|
|
412 (not val)
|
|
413 (error "Not an integer nor nil: %S" val))
|
|
414 (let ((current-val (aref table char)))
|
|
415 (unless (eq current-val val)
|
|
416 (if (stringp current-val)
|
|
417 (funcall (char-table-extra-slot table 1) char current-val table))
|
|
418 (aset table char val))))
|
|
419
|
|
420 ;; Encode the character property value VAL into an integer value by
|
|
421 ;; VAL-LIST. By side effect, VAL-LIST is modified.
|
|
422 ;; VAL-LIST has this form:
|
|
423 ;; (t (VAL1 . VAL-CODE1) (VAL2 . VAL-CODE2) ...)
|
|
424 ;; If VAL is one of VALn, just return VAL-CODEn. Otherwise,
|
|
425 ;; VAL-LIST is modified to this:
|
|
426 ;; (t (VAL . (1+ VAL-CODE1)) (VAL1 . VAL-CODE1) (VAL2 . VAL-CODE2) ...)
|
|
427
|
|
428 (defun unidata-encode-val (val-list val)
|
|
429 (let ((slot (assq val val-list))
|
|
430 val-code)
|
|
431 (if slot
|
|
432 (cdr slot)
|
|
433 (setq val-code (if (cdr val-list) (1+ (cdr (nth 1 val-list))) 1))
|
|
434 (setcdr val-list (cons (cons val val-code) (cdr val-list)))
|
|
435 val-code)))
|
|
436
|
|
437 ;; Generate a char-table for the character property PROP.
|
|
438
|
|
439 (defun unidata-gen-table (prop val-func default-value)
|
|
440 (let ((table (make-char-table 'char-code-property-table))
|
|
441 (prop-idx (unidata-prop-index prop))
|
|
442 (val-list (list t))
|
|
443 (vec (make-vector 128 0))
|
|
444 tail elt range val val-code idx slot)
|
|
445 (set-char-table-range table (cons 0 (max-char)) default-value)
|
|
446 (setq tail unidata-list)
|
|
447 (while tail
|
|
448 (setq elt (car tail) tail (cdr tail))
|
|
449 (setq range (car elt)
|
|
450 val (funcall val-func (nth prop-idx elt)))
|
|
451 (setq val-code (if val (unidata-encode-val val-list val)))
|
|
452 (if (consp range)
|
|
453 (if val-code
|
|
454 (set-char-table-range table range val))
|
|
455 (let* ((start (lsh (lsh range -7) 7))
|
|
456 (limit (+ start 127))
|
|
457 str count new-val)
|
|
458 (fillarray vec 0)
|
|
459 (if val-code
|
|
460 (aset vec (- range start) val-code))
|
|
461 (while (and (setq elt (car tail) range (car elt))
|
|
462 (integerp range)
|
|
463 (<= range limit))
|
|
464 (setq new-val (funcall val-func (nth prop-idx elt)))
|
|
465 (if (not (eq val new-val))
|
|
466 (setq val new-val
|
|
467 val-code (if val (unidata-encode-val val-list val))))
|
|
468 (if val-code
|
|
469 (aset vec (- range start) val-code))
|
|
470 (setq tail (cdr tail)))
|
|
471 (setq str "" val-code -1 count 0)
|
|
472 (mapc #'(lambda (x)
|
|
473 (if (= val-code x)
|
|
474 (setq count (1+ count))
|
|
475 (if (> count 2)
|
|
476 (setq str (concat str (string val-code
|
|
477 (+ count 128))))
|
|
478 (if (= count 2)
|
|
479 (setq str (concat str (string val-code val-code)))
|
|
480 (if (= count 1)
|
|
481 (setq str (concat str (string val-code))))))
|
|
482 (setq val-code x count 1)))
|
|
483 vec)
|
|
484 (if (= count 128)
|
|
485 (if val
|
|
486 (set-char-table-range table (cons start limit) val))
|
|
487 (if (= val-code 0)
|
|
488 (set-char-table-range table (cons start limit) str)
|
|
489 (if (> count 2)
|
|
490 (setq str (concat str (string val-code (+ count 128))))
|
|
491 (if (= count 2)
|
|
492 (setq str (concat str (string val-code val-code)))
|
|
493 (setq str (concat str (string val-code)))))
|
|
494 (set-char-table-range table (cons start limit) str))))))
|
|
495
|
|
496 (setq val-list (nreverse (cdr val-list)))
|
|
497 (set-char-table-extra-slot table 0 prop)
|
|
498 (set-char-table-extra-slot table 4 (vconcat (mapcar 'car val-list)))
|
|
499 table))
|
|
500
|
|
501 (defun unidata-gen-table-symbol (prop)
|
|
502 (let ((table (unidata-gen-table prop
|
|
503 #'(lambda (x) (and (> (length x) 0)
|
|
504 (intern x)))
|
|
505 0)))
|
|
506 (byte-compile 'unidata-get-symbol)
|
|
507 (byte-compile 'unidata-put-symbol)
|
|
508 (set-char-table-extra-slot table 1 (symbol-function 'unidata-get-symbol))
|
|
509 (set-char-table-extra-slot table 2 (symbol-function 'unidata-put-symbol))
|
|
510 table))
|
|
511
|
|
512 (defun unidata-gen-table-integer (prop)
|
|
513 (let ((table (unidata-gen-table prop
|
|
514 #'(lambda (x) (and (> (length x) 0)
|
|
515 (string-to-number x)))
|
|
516 t)))
|
|
517 (byte-compile 'unidata-get-integer)
|
|
518 (byte-compile 'unidata-put-integer)
|
|
519 (set-char-table-extra-slot table 1 (symbol-function 'unidata-get-integer))
|
|
520 (set-char-table-extra-slot table 2 (symbol-function 'unidata-put-integer))
|
|
521 table))
|
|
522
|
|
523
|
|
524 ;; WORD-LIST TABLE
|
|
525
|
|
526 ;; If the table is for `name' property, each character in the string
|
|
527 ;; is one of these:
|
|
528 ;; DIFF-HEAD-CODE (0, 1, or 2):
|
|
529 ;; specifies how to decode the following characters.
|
|
530 ;; WORD-CODE (3..#x7FF excluding '-', '0'..'9', 'A'..'Z'):
|
|
531 ;; specifies an index number into WORD-TABLE (see below)
|
|
532 ;; Otherwise (' ', '-', '0'..'9', 'A'..'Z'):
|
|
533 ;; specifies a literal word.
|
|
534 ;;
|
|
535 ;; The 4th slots is a vector:
|
|
536 ;; [ WORD-TABLE BLOCK-NAME HANGUL-JAMO-TABLE ]
|
|
537 ;; WORD-TABLE is a vector of word symbols.
|
|
538 ;; BLOCK-NAME is a vector of name symbols for a block of characters.
|
|
539 ;; HANGUL-JAMO-TABLE is `unidata-name-jamo-name-table'.
|
|
540
|
|
541 ;; Return the difference of symbol list L1 and L2 in this form:
|
|
542 ;; (DIFF-HEAD SYM1 SYM2 ...)
|
|
543 ;; DIFF-HEAD is ((SAME-HEAD-LENGTH * 16) + SAME-TAIL-LENGTH).
|
|
544 ;; Ex: If L1 is (a b c d e f) and L2 is (a g h e f), this function
|
|
545 ;; returns ((+ (* 1 16) 2) g h).
|
|
546 ;; It means that we can get L2 from L1 by prepending the first element
|
|
547 ;; of L1 and appending the last 2 elements of L1 to the list (g h).
|
|
548 ;; If L1 and L2 don't have common elements at the head and tail,
|
|
549 ;; set DIFF-HEAD to -1 and SYM1 ... to the elements of L2.
|
|
550
|
|
551 (defun unidata-word-list-diff (l1 l2)
|
|
552 (let ((beg 0)
|
|
553 (end 0)
|
|
554 (len1 (length l1))
|
|
555 (len2 (length l2))
|
|
556 result)
|
|
557 (when (< len1 16)
|
|
558 (while (and l1 (eq (car l1) (car l2)))
|
|
559 (setq beg (1+ beg)
|
|
560 l1 (cdr l1) len1 (1- len1) l2 (cdr l2) len2 (1- len2)))
|
|
561 (while (and (< end len1) (< end len2)
|
|
562 (eq (nth (- len1 end 1) l1) (nth (- len2 end 1) l2)))
|
|
563 (setq end (1+ end))))
|
|
564 (if (= (+ beg end) 0)
|
|
565 (setq result (list -1))
|
|
566 (setq result (list (+ (* beg 16) (+ beg (- len1 end))))))
|
|
567 (while (< end len2)
|
|
568 (setcdr result (cons (nth (- len2 end 1) l2) (cdr result)))
|
|
569 (setq end (1+ end)))
|
|
570 result))
|
|
571
|
|
572 ;; Return a compressed form of the vector VEC. Each element of VEC is
|
|
573 ;; a list of symbols of which names can be concatenated to form a
|
|
574 ;; character name. This function changes those elements into
|
|
575 ;; compressed forms by utilizing the fact that diff of consecutive
|
|
576 ;; elements is usually small.
|
|
577
|
|
578 (defun unidata-word-list-compress (vec)
|
|
579 (let (last-elt last-idx diff-head tail elt val)
|
|
580 (dotimes (i 128)
|
|
581 (setq elt (aref vec i))
|
|
582 (when elt
|
|
583 (if (null last-elt)
|
|
584 (setq diff-head -1
|
|
585 val (cons 0 elt))
|
|
586 (setq val (unidata-word-list-diff last-elt elt))
|
|
587 (if (= (car val) -1)
|
|
588 (setq diff-head -1
|
|
589 val (cons 0 (cdr val)))
|
|
590 (if (eq diff-head (car val))
|
|
591 (setq val (cons 2 (cdr val)))
|
|
592 (setq diff-head (car val))
|
|
593 (if (>= diff-head 0)
|
|
594 (setq val (cons 1 val))))))
|
|
595 (aset vec i val)
|
|
596 (setq last-idx i last-elt elt)))
|
|
597 (if (not last-idx)
|
|
598 (setq vec nil)
|
|
599 (if (< last-idx 127)
|
|
600 (let ((shorter (make-vector (1+ last-idx) nil)))
|
|
601 (dotimes (i (1+ last-idx))
|
|
602 (aset shorter i (aref vec i)))
|
|
603 (setq vec shorter))))
|
|
604 vec))
|
|
605
|
|
606 ;; Encode the word index IDX into a characters code that can be
|
|
607 ;; embedded in a string.
|
|
608
|
|
609 (defsubst unidata-encode-word (idx)
|
|
610 ;; Exclude 0, 1, 2.
|
|
611 (+ idx 3))
|
|
612
|
|
613 ;; Decode the character code CODE (that is embedded in a string) into
|
|
614 ;; the corresponding word name by looking up WORD-TABLE.
|
|
615
|
|
616 (defsubst unidata-decode-word (code word-table)
|
|
617 (setq code (- code 3))
|
|
618 (if (< code (length word-table))
|
|
619 (aref word-table code)))
|
|
620
|
|
621 ;; Table of short transliterated name symbols of Hangul Jamo divided
|
|
622 ;; into Choseong, Jungseong, and Jongseong.
|
|
623
|
|
624 (defconst unidata-name-jamo-name-table
|
|
625 [[G GG N D DD R M B BB S SS nil J JJ C K T P H]
|
|
626 [A AE YA YAE EO E YEO YE O WA WAE OE YO U WEO WE WI YU EU YI I]
|
|
627 [G GG GS N NJ NH D L LG LM LB LS LT LP LH M B BS S SS NG J C K T P H]])
|
|
628
|
|
629 ;; Return a name of CHAR. VAL is the current value of (aref TABLE
|
|
630 ;; CHAR).
|
|
631
|
|
632 (defun unidata-get-name (char val table)
|
|
633 (cond
|
|
634 ((stringp val)
|
|
635 (if (> (aref val 0) 0)
|
|
636 val
|
|
637 (let* ((first-char (lsh (lsh char -7) 7))
|
|
638 (word-table (aref (char-table-extra-slot table 4) 0))
|
|
639 (i 1)
|
|
640 (len (length val))
|
|
641 (vec (make-vector 128 nil))
|
|
642 (idx 0)
|
|
643 (case-fold-search nil)
|
|
644 c word-list tail-list last-list word diff-head)
|
|
645 (while (< i len)
|
|
646 (setq c (aref val i))
|
|
647 (if (< c 3)
|
|
648 (progn
|
|
649 (if (or word-list tail-list)
|
|
650 (aset vec idx
|
|
651 (setq last-list (nconc word-list tail-list))))
|
|
652 (setq i (1+ i) idx (1+ idx)
|
|
653 word-list nil tail-list nil)
|
|
654 (if (> c 0)
|
|
655 (let ((l last-list))
|
|
656 (if (= c 1)
|
|
657 (setq diff-head
|
|
658 (prog1 (aref val i) (setq i (1+ i)))))
|
|
659 (setq tail-list (nthcdr (% diff-head 16) last-list))
|
|
660 (dotimes (i (/ diff-head 16))
|
|
661 (setq word-list (nconc word-list (list (car l)))
|
|
662 l (cdr l))))))
|
|
663 (setq word-list
|
|
664 (nconc word-list
|
|
665 (list (symbol-name
|
|
666 (unidata-decode-word c word-table))))
|
|
667 i (1+ i))))
|
|
668 (if (or word-list tail-list)
|
|
669 (aset vec idx (nconc word-list tail-list)))
|
|
670 (setq val nil)
|
|
671 (dotimes (i 128)
|
|
672 (setq c (+ first-char i))
|
|
673 (let ((name (aref vec i)))
|
|
674 (if name
|
|
675 (let ((tail (cdr (setq name (copy-sequence name))))
|
|
676 elt)
|
|
677 (while tail
|
|
678 (setq elt (car tail))
|
|
679 (or (string= elt "-")
|
|
680 (progn
|
|
681 (setcdr tail (cons elt (cdr tail)))
|
|
682 (setcar tail " ")))
|
|
683 (setq tail (cddr tail)))
|
|
684 (setq name (apply 'concat name))))
|
|
685 (aset table c name)
|
|
686 (if (= c char)
|
|
687 (setq val name))))
|
|
688 val)))
|
|
689
|
|
690 ((and (integerp val) (> val 0))
|
|
691 (let* ((symbol-table (aref (char-table-extra-slot table 4) 1))
|
|
692 (sym (aref symbol-table (1- val))))
|
|
693 (cond ((eq sym 'HANGUL\ SYLLABLE)
|
|
694 (let ((jamo-name-table (aref (char-table-extra-slot table 4) 2)))
|
|
695 ;; SIndex = S - SBase
|
|
696 (setq char (- char #xAC00))
|
|
697 (let ( ;; LIndex = SIndex / NCount
|
|
698 (L (/ char 588))
|
|
699 ;; VIndex = (SIndex % NCount) * TCount
|
|
700 (V (/ (% char 588) 28))
|
|
701 ;; TIndex = SIndex % TCount
|
|
702 (T (% char 28)))
|
|
703 (format "HANGUL SYLLABLE %s%s%s"
|
|
704 (aref (aref jamo-name-table 0) L)
|
|
705 (aref (aref jamo-name-table 1) V)
|
|
706 (if (= T 0) ""
|
|
707 (aref (aref jamo-name-table 2) (1- T)))))))
|
|
708 ((eq sym 'CJK\ COMPATIBILITY\ IDEOGRAPH)
|
|
709 (format "%s-%04X" sym char))
|
|
710 ((eq sym 'VARIATION\ SELECTOR)
|
|
711 (format "%s-%d" sym (+ (- char #xe0100) 17))))))))
|
|
712
|
|
713 ;; Store VAL as the name of CHAR in TABLE.
|
|
714
|
|
715 (defun unidata-put-name (char val table)
|
|
716 (let ((current-val (aref table char)))
|
|
717 (if (and (stringp current-val) (= (aref current-val 0) 0))
|
|
718 (funcall (char-table-extra-slot table 1) char current-val table))
|
|
719 (aset table char val)))
|
|
720
|
|
721 (defun unidata-get-decomposition (char val table)
|
|
722 (cond
|
|
723 ((consp val)
|
|
724 val)
|
|
725
|
|
726 ((stringp val)
|
|
727 (if (> (aref val 0) 0)
|
|
728 val
|
|
729 (let* ((first-char (lsh (lsh char -7) 7))
|
|
730 (word-table (char-table-extra-slot table 4))
|
|
731 (i 1)
|
|
732 (len (length val))
|
|
733 (vec (make-vector 128 nil))
|
|
734 (idx 0)
|
|
735 (case-fold-search nil)
|
|
736 c word-list tail-list last-list word diff-head)
|
|
737 (while (< i len)
|
|
738 (setq c (aref val i))
|
|
739 (if (< c 3)
|
|
740 (progn
|
|
741 (if (or word-list tail-list)
|
|
742 (aset vec idx
|
|
743 (setq last-list (nconc word-list tail-list))))
|
|
744 (setq i (1+ i) idx (1+ idx)
|
|
745 word-list nil tail-list nil)
|
|
746 (if (> c 0)
|
|
747 (let ((l last-list))
|
|
748 (if (= c 1)
|
|
749 (setq diff-head
|
|
750 (prog1 (aref val i) (setq i (1+ i)))))
|
|
751 (setq tail-list (nthcdr (% diff-head 16) last-list))
|
|
752 (dotimes (i (/ diff-head 16))
|
|
753 (setq word-list (nconc word-list (list (car l)))
|
|
754 l (cdr l))))))
|
|
755 (setq word-list
|
|
756 (nconc word-list
|
|
757 (list (or (unidata-decode-word c word-table) c)))
|
|
758 i (1+ i))))
|
|
759 (if (or word-list tail-list)
|
|
760 (aset vec idx (nconc word-list tail-list)))
|
|
761 (dotimes (i 128)
|
|
762 (aset table (+ first-char i) (aref vec i)))
|
|
763 (aref vec (- char first-char)))))))
|
|
764
|
|
765 ;; Store VAL as the name of CHAR in TABLE.
|
|
766
|
|
767 (defun unidata-put-decomposition (char val table)
|
|
768 (let ((current-val (aref table char)))
|
|
769 (if (and (stringp current-val) (= (aref current-val 0) 0))
|
|
770 (funcall (char-table-extra-slot table 1) char current-val table))
|
|
771 (aset table char val)))
|
|
772
|
|
773 ;; UnicodeData.txt contains these lines:
|
|
774 ;; 0000;<control>;Cc;0;BN;;;;;N;NULL;;;;
|
|
775 ;; ...
|
|
776 ;; 0020;SPACE;Zs;0;WS;;;;;N;;;;;
|
|
777 ;; ...
|
|
778 ;; The following command yields a file of about 96K bytes.
|
|
779 ;; % gawk -F ';' '{print $1,$2;}' < UnicodeData.txt | gzip > temp.gz
|
|
780 ;; With the following function, we can get a file of almost the same
|
|
781 ;; the size.
|
|
782
|
|
783 ;; Generate a char-table for character names.
|
|
784
|
|
785 (defun unidata-gen-table-word-list (prop val-func)
|
|
786 (let ((table (make-char-table 'char-code-property-table))
|
|
787 (prop-idx (unidata-prop-index prop))
|
|
788 (word-list (list nil))
|
|
789 word-table
|
|
790 block-list block-word-table block-end
|
|
791 tail elt range val idx slot)
|
|
792 (set-char-table-range table (cons 0 (max-char)) 0)
|
|
793 (setq tail unidata-list)
|
|
794 (setq block-end -1)
|
|
795 (while tail
|
|
796 (setq elt (car tail) tail (cdr tail))
|
|
797 (setq range (car elt)
|
|
798 val (funcall val-func (nth prop-idx elt)))
|
|
799 ;; Treat the sequence of "CJK COMPATIBILITY IDEOGRAPH-XXXX" and
|
|
800 ;; "VARIATION SELECTOR-XXX" as a block.
|
|
801 (if (and (consp val) (eq prop 'name)
|
|
802 (or (and (eq (car val) 'CJK)
|
|
803 (eq (nth 1 val) 'COMPATIBILITY))
|
|
804 (and (>= range #xe0100)
|
|
805 (eq (car val) 'VARIATION)
|
|
806 (eq (nth 1 val) 'SELECTOR))))
|
|
807 (let ((first (car val))
|
|
808 (second (nth 1 val))
|
|
809 (start range))
|
|
810 (while (and (setq elt (car tail) range (car elt)
|
|
811 val (funcall val-func (nth prop-idx elt)))
|
|
812 (consp val)
|
|
813 (eq first (car val))
|
|
814 (eq second (nth 1 val)))
|
|
815 (setq block-end range
|
|
816 tail (cdr tail)))
|
|
817 (setq range (cons start block-end)
|
|
818 val (if (eq first 'CJK) 'CJK\ COMPATIBILITY\ IDEOGRAPH
|
|
819 'VARIATION\ SELECTOR))))
|
|
820
|
|
821 (if (consp range)
|
|
822 (if val
|
|
823 (let ((slot (assq val block-list)))
|
|
824 (setq range (cons (car range) (cdr range)))
|
|
825 (setq block-end (cdr range))
|
|
826 (if slot
|
|
827 (nconc slot (list range))
|
|
828 (push (list val range) block-list))))
|
|
829 (let* ((start (lsh (lsh range -7) 7))
|
|
830 (limit (+ start 127))
|
|
831 (first tail)
|
|
832 (vec (make-vector 128 nil))
|
|
833 c name len)
|
|
834 (if (<= start block-end)
|
|
835 ;; START overlap with the previous block.
|
|
836 (aset table range (nth prop-idx elt))
|
|
837 (if val
|
|
838 (aset vec (- range start) val))
|
|
839 (while (and (setq elt (car tail) range (car elt))
|
|
840 (integerp range)
|
|
841 (<= range limit))
|
|
842 (setq val (funcall val-func (nth prop-idx elt)))
|
|
843 (if val
|
|
844 (aset vec (- range start) val))
|
|
845 (setq tail (cdr tail)))
|
|
846 (setq vec (unidata-word-list-compress vec))
|
|
847 (when vec
|
|
848 (dotimes (i (length vec))
|
|
849 (dolist (elt (aref vec i))
|
|
850 (if (symbolp elt)
|
|
851 (let ((slot (assq elt word-list)))
|
|
852 (if slot
|
|
853 (setcdr slot (1+ (cdr slot)))
|
|
854 (setcdr word-list
|
|
855 (cons (cons elt 1) (cdr word-list))))))))
|
|
856 (set-char-table-range table (cons start limit) vec))))))
|
|
857 (setq word-list (sort (cdr word-list)
|
|
858 #'(lambda (x y) (> (cdr x) (cdr y)))))
|
|
859 (setq tail word-list idx 0)
|
|
860 (while tail
|
|
861 (setcdr (car tail) (unidata-encode-word idx))
|
|
862 (setq idx (1+ idx) tail (cdr tail)))
|
|
863 (setq word-table (make-vector (length word-list) nil))
|
|
864 (setq idx 0)
|
|
865 (dolist (elt word-list)
|
|
866 (aset word-table idx (car elt))
|
|
867 (setq idx (1+ idx)))
|
|
868
|
|
869 (if (and (eq prop 'decomposition)
|
|
870 (> idx 32))
|
|
871 (error "Too many symobls in decomposition data"))
|
|
872
|
|
873 (dotimes (i (/ #x110000 128))
|
|
874 (let* ((idx (* i 128))
|
|
875 (vec (aref table idx)))
|
|
876 (when (vectorp vec)
|
|
877 (dotimes (i (length vec))
|
|
878 (let ((tail (aref vec i))
|
|
879 elt code)
|
|
880 (if (not tail)
|
|
881 (aset vec i "\0")
|
|
882 (while tail
|
|
883 (setq elt (car tail)
|
|
884 code (if (integerp elt) elt
|
|
885 (cdr (assq elt word-list))))
|
|
886 (setcar tail (string code))
|
|
887 (setq tail (cdr tail)))
|
|
888 (aset vec i (mapconcat 'identity (aref vec i) "")))))
|
|
889 (set-char-table-range
|
|
890 table (cons idx (+ idx 127))
|
|
891 (mapconcat 'identity vec "")))))
|
|
892
|
|
893 (setq block-word-table (make-vector (length block-list) nil))
|
|
894 (setq idx 0)
|
|
895 (dolist (elt block-list)
|
|
896 (dolist (e (cdr elt))
|
|
897 (set-char-table-range table e (1+ idx)))
|
|
898 (aset block-word-table idx (car elt))
|
|
899 (setq idx (1+ idx)))
|
|
900
|
|
901 (set-char-table-extra-slot table 0 prop)
|
|
902 (set-char-table-extra-slot table 4 (cons word-table block-word-table))
|
|
903 table))
|
|
904
|
|
905 (defun unidata-split-name (str)
|
|
906 (if (symbolp str)
|
|
907 str
|
|
908 (let ((len (length str))
|
|
909 (l nil)
|
|
910 (idx 0)
|
|
911 c)
|
|
912 (if (= len 0)
|
|
913 nil
|
|
914 (dotimes (i len)
|
|
915 (setq c (aref str i))
|
|
916 (if (= c 32)
|
|
917 (setq l (cons (intern (substring str idx i)) l)
|
|
918 idx (1+ i))
|
|
919 (if (and (= c ?-) (< idx i)
|
|
920 (< (1+ i) len) (/= (aref str (1+ i)) 32))
|
|
921 (setq l (cons '- (cons (intern (substring str idx i)) l))
|
|
922 idx (1+ i)))))
|
|
923 (nreverse (cons (intern (substring str idx)) l))))))
|
|
924
|
|
925 (defun unidata-gen-table-name (prop)
|
|
926 (let* ((table (unidata-gen-table-word-list prop 'unidata-split-name))
|
|
927 (word-tables (char-table-extra-slot table 4)))
|
|
928 (byte-compile 'unidata-get-name)
|
|
929 (byte-compile 'unidata-put-name)
|
|
930 (set-char-table-extra-slot table 1 (symbol-function 'unidata-get-name))
|
|
931 (set-char-table-extra-slot table 2 (symbol-function 'unidata-put-name))
|
|
932
|
|
933 (if (eq prop 'name)
|
|
934 (set-char-table-extra-slot table 4
|
|
935 (vector (car word-tables)
|
|
936 (cdr word-tables)
|
|
937 unidata-name-jamo-name-table))
|
|
938 (set-char-table-extra-slot table 4
|
|
939 (vector (car word-tables))))
|
|
940 table))
|
|
941
|
|
942 (defun unidata-split-decomposition (str)
|
|
943 (if (symbolp str)
|
|
944 str
|
|
945 (let ((len (length str))
|
|
946 (l nil)
|
|
947 (idx 0)
|
|
948 c)
|
|
949 (if (= len 0)
|
|
950 nil
|
|
951 (dotimes (i len)
|
|
952 (setq c (aref str i))
|
|
953 (if (= c 32)
|
|
954 (setq l (if (= (aref str idx) ?<)
|
|
955 (cons (intern (substring str idx i)) l)
|
|
956 (cons (string-to-int (substring str idx i) 16) l))
|
|
957 idx (1+ i))))
|
|
958 (if (= (aref str idx) ?<)
|
|
959 (setq l (cons (intern (substring str idx len)) l))
|
|
960 (setq l (cons (string-to-int (substring str idx len) 16) l)))
|
|
961 (nreverse l)))))
|
|
962
|
|
963
|
|
964 (defun unidata-gen-table-decomposition (prop)
|
|
965 (let* ((table (unidata-gen-table-word-list prop 'unidata-split-decomposition))
|
|
966 (word-tables (char-table-extra-slot table 4)))
|
|
967 (byte-compile 'unidata-get-decomposition)
|
|
968 (byte-compile 'unidata-put-decomposition)
|
|
969 (set-char-table-extra-slot table 1
|
|
970 (symbol-function 'unidata-get-decomposition))
|
|
971 (set-char-table-extra-slot table 2
|
|
972 (symbol-function 'unidata-put-decomposition))
|
|
973 (set-char-table-extra-slot table 4 (car word-tables))
|
|
974 table))
|
|
975
|
|
976
|
|
977
|
|
978 (defun unidata-describe-general-category (val)
|
|
979 (cdr (assq val
|
|
980 '((Lu . "Letter, Uppercase")
|
|
981 (Ll . "Letter, Lowercase")
|
|
982 (Lt . "Letter, Titlecase")
|
|
983 (Lm . "Letter, Modifier")
|
|
984 (Lo . "Letter, Other")
|
|
985 (Mn . "Mark, Nonspacing")
|
|
986 (Mc . "Mark, Spacing Combining")
|
|
987 (Me . "Mark, Enclosing")
|
|
988 (Nd . "Number, Decimal Digit")
|
|
989 (Nl . "Number, Letter")
|
|
990 (No . "Number, Other")
|
|
991 (Pc . "Punctuation, Connector")
|
|
992 (Pd . "Punctuation, Dash")
|
|
993 (Ps . "Punctuation, Open")
|
|
994 (Pe . "Punctuation, Close")
|
|
995 (Pi . "Punctuation, Initial quote")
|
|
996 (Pf . "Punctuation, Final quote")
|
|
997 (Po . "Punctuation, Other")
|
|
998 (Sm . "Symbol, Math")
|
|
999 (Sc . "Symbol, Currency")
|
|
1000 (Sk . "Symbol, Modifier")
|
|
1001 (So . "Symbol, Other")
|
|
1002 (Zs . "Separator, Space")
|
|
1003 (Zl . "Separator, Line")
|
|
1004 (Zp . "Separator, Paragraph")
|
|
1005 (Cc . "Other, Control")
|
|
1006 (Cf . "Other, Format")
|
|
1007 (Cs . "Other, Surrogate")
|
|
1008 (Co . "Other, Private Use")
|
|
1009 (Cn . "Other, Not Assigned")))))
|
|
1010
|
|
1011 (defun unidata-describe-canonical-combining-class (val)
|
|
1012 (cdr (assq val
|
|
1013 '((0 . "Spacing, split, enclosing, reordrant, and Tibetan subjoined")
|
|
1014 (1 . "Overlays and interior")
|
|
1015 (7 . "Nuktas")
|
|
1016 (8 . "Hiragana/Katakana voicing marks")
|
|
1017 (9 . "Viramas")
|
|
1018 (10 . "Start of fixed position classes")
|
|
1019 (199 . "End of fixed position classes")
|
|
1020 (200 . "Below left attached")
|
|
1021 (202 . "Below attached")
|
|
1022 (204 . "Below right attached")
|
|
1023 (208 . "Left attached (reordrant around single base character)")
|
|
1024 (210 . "Right attached")
|
|
1025 (212 . "Above left attached")
|
|
1026 (214 . "Above attached")
|
|
1027 (216 . "Above right attached")
|
|
1028 (218 . "Below left")
|
|
1029 (220 . "Below")
|
|
1030 (222 . "Below right")
|
|
1031 (224 . "Left (reordrant around single base character)")
|
|
1032 (226 . "Right")
|
|
1033 (228 . "Above left")
|
|
1034 (230 . "Above")
|
|
1035 (232 . "Above right")
|
|
1036 (233 . "Double below")
|
|
1037 (234 . "Double above")
|
|
1038 (240 . "Below (iota subscript)")))))
|
|
1039
|
|
1040 (defun unidata-describe-bidi-class (val)
|
|
1041 (cdr (assq val
|
|
1042 '((L . "Left-to-Right")
|
|
1043 (LRE . "Left-to-Right Embedding")
|
|
1044 (LRO . "Left-to-Right Override")
|
|
1045 (R . "Right-to-Left")
|
|
1046 (AL . "Right-to-Left Arabic")
|
|
1047 (RLE . "Right-to-Left Embedding")
|
|
1048 (RLO . "Right-to-Left Override")
|
|
1049 (PDF . "Pop Directional Format")
|
|
1050 (EN . "European Number")
|
|
1051 (ES . "European Number Separator")
|
|
1052 (ET . "European Number Terminator")
|
|
1053 (AN . "Arabic Number")
|
|
1054 (CS . "Common Number Separator")
|
|
1055 (NSM . "Non-Spacing Mark")
|
|
1056 (BN . "Boundary Neutral")
|
|
1057 (B . "Paragraph Separator")
|
|
1058 (S . "Segment Separator")
|
|
1059 (WS . "Whitespace")
|
|
1060 (ON . "Other Neutrals")))))
|
|
1061
|
|
1062 (defun unidata-describe-decomposition (val)
|
|
1063 (mapconcat #'(lambda (x) (if (symbolp x) (symbol-name x) (string ?' x ?')))
|
|
1064 val " "))
|
|
1065
|
|
1066 ;; Verify if we can retrieve correct values from the generated
|
|
1067 ;; char-tables.
|
|
1068
|
|
1069 (defun unidata-check ()
|
|
1070 (dolist (elt unidata-prop-alist)
|
|
1071 (let* ((prop (car elt))
|
|
1072 (index (unidata-prop-index prop))
|
|
1073 (generator (unidata-prop-generator prop))
|
|
1074 (table (progn
|
|
1075 (message "Generating %S table..." prop)
|
|
1076 (funcall generator prop)))
|
|
1077 (decoder (char-table-extra-slot table 1))
|
|
1078 (check #x400))
|
|
1079 (dolist (e unidata-list)
|
|
1080 (let ((char (car e))
|
|
1081 (val1 (nth index e))
|
|
1082 val2)
|
|
1083 (if (and (stringp val1) (= (length val1) 0))
|
|
1084 (setq val1 nil))
|
|
1085 (unless (consp char)
|
|
1086 (setq val2 (funcall decoder char (aref table char) table))
|
|
1087 (if val1
|
|
1088 (cond ((eq generator 'unidata-gen-table-symbol)
|
|
1089 (setq val1 (intern val1)))
|
|
1090 ((eq generator 'unidata-gen-table-integer)
|
|
1091 (setq val1 (string-to-int val1)))
|
|
1092 ((eq generator 'unidata-gen-table-character)
|
|
1093 (setq val1 (string-to-int val1 16)))
|
|
1094 ((eq generator 'unidata-gen-table-decomposition)
|
|
1095 (setq val1 (unidata-split-decomposition val1)))))
|
|
1096 (when (>= char check)
|
|
1097 (message "%S %04X" prop check)
|
|
1098 (setq check (+ check #x400)))
|
|
1099 (or (equal val1 val2)
|
|
1100 (insert (format "> %04X %S\n< %04X %S\n"
|
|
1101 char val1 char val2)))
|
|
1102 (sit-for 0)))))))
|
|
1103
|
|
1104 ;; The entry function. It generates files described in the header
|
|
1105 ;; comment of this file.
|
|
1106
|
|
1107 (defun unidata-gen-files ()
|
|
1108 (interactive)
|
|
1109 (let ((coding-system-for-write 'utf-8)
|
|
1110 (charprop-file "charprop.el"))
|
|
1111 (with-temp-file charprop-file
|
|
1112 (insert ";; Automatically generated by unidata-gen.el.\n")
|
|
1113 (dolist (elt unidata-prop-alist)
|
|
1114 (let* ((prop (car elt))
|
|
1115 (generator (unidata-prop-generator prop))
|
|
1116 (file (unidata-prop-file prop))
|
|
1117 (docstring (unidata-prop-docstring prop))
|
|
1118 (describer (unidata-prop-describer prop))
|
|
1119 table)
|
|
1120 ;; Filename in this comment line is extracted by sed in
|
|
1121 ;; Makefile.
|
|
1122 (insert (format ";; FILE: %s\n" file))
|
|
1123 (insert (format "(define-char-code-property '%S %S\n %S)\n"
|
|
1124 prop file docstring))
|
|
1125 (with-temp-file file
|
|
1126 (message "Generating %s..." file)
|
|
1127 (setq table (funcall generator prop))
|
|
1128 (when describer
|
|
1129 (unless (subrp (symbol-function describer))
|
|
1130 (byte-compile describer)
|
|
1131 (setq describer (symbol-function describer)))
|
|
1132 (set-char-table-extra-slot table 3 describer))
|
|
1133 (insert ";; Automatically generated from UnicodeData.txt.\n"
|
|
1134 (format "(define-char-code-property '%S %S %S)\n"
|
|
1135 prop table docstring)
|
|
1136 ;; \040 below is to avoid error on reading this file.
|
|
1137 ";; Local\040Variables:\n"
|
|
1138 ";; coding: utf-8\n"
|
|
1139 ";; no-byte-compile: t\n"
|
|
1140 ";; End:\n\n"
|
|
1141 (format ";; %s ends here\n" file)))))
|
|
1142 (message "Writing %s..." charprop-file)
|
|
1143 ;; \040 below is to avoid error on reading this file.
|
|
1144 (insert ";; Local\040Variables:\n"
|
|
1145 ";; coding: utf-8\n"
|
|
1146 ";; no-byte-compile: t\n"
|
|
1147 ";; End:\n\n"
|
|
1148 (format ";; %s ends here\n" charprop-file)))))
|
|
1149
|
|
1150 ;;; unidata-gen.el ends here
|