Mercurial > emacs
annotate admin/unidata/unidata-gen.el @ 95267:4ec6f3d46f27
(regexp-opt): Always return a properly-grouped regexp.
author | Stefan Monnier <monnier@iro.umontreal.ca> |
---|---|
date | Fri, 23 May 2008 17:19:22 +0000 |
parents | aeac1d771ae4 |
children | eae5b6774936 |
rev | line source |
---|---|
90086 | 1 ;; unidata-gen.el -- Create files containing character property data. |
91421
948b9fc771f9
Update copyright years and GPL version.
Glenn Morris <rgm@gnu.org>
parents:
91405
diff
changeset
|
2 ;; Copyright (C) 2005, 2006, 2007, 2008 |
90086 | 3 ;; National Institute of Advanced Industrial Science and Technology (AIST) |
4 ;; Registration Number H13PRO009 | |
5 | |
6 ;; This file is part of GNU Emacs. | |
7 | |
94829
aeac1d771ae4
Switch to recommended form of GPLv3 permissions notice.
Glenn Morris <rgm@gnu.org>
parents:
94071
diff
changeset
|
8 ;; GNU Emacs is free software: you can redistribute it and/or modify |
90086 | 9 ;; it under the terms of the GNU General Public License as published by |
94829
aeac1d771ae4
Switch to recommended form of GPLv3 permissions notice.
Glenn Morris <rgm@gnu.org>
parents:
94071
diff
changeset
|
10 ;; the Free Software Foundation, either version 3 of the License, or |
aeac1d771ae4
Switch to recommended form of GPLv3 permissions notice.
Glenn Morris <rgm@gnu.org>
parents:
94071
diff
changeset
|
11 ;; (at your option) any later version. |
90086 | 12 |
13 ;; GNU Emacs is distributed in the hope that it will be useful, | |
14 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 ;; GNU General Public License for more details. | |
17 | |
18 ;; You should have received a copy of the GNU General Public License | |
94829
aeac1d771ae4
Switch to recommended form of GPLv3 permissions notice.
Glenn Morris <rgm@gnu.org>
parents:
94071
diff
changeset
|
19 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. |
90086 | 20 |
21 ;;; Commentary: | |
22 | |
90162
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
23 ;; SPECIAL NOTICE |
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
24 ;; |
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
25 ;; This file must be byte-compilable/loadable by `temacs' and also |
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
26 ;; the entry function `unidata-gen-files' must be runnable by |
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
27 ;; `temacs'. |
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
28 |
90086 | 29 ;; FILES TO BE GENERATED |
30 ;; | |
90162
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
31 ;; The entry function `unidata-gen-files' generates these files in |
90086 | 32 ;; the current directory. |
33 ;; | |
34 ;; charprop.el | |
35 ;; It contains a series of forms of this format: | |
36 ;; (char-code-property-register PROP FILE) | |
37 ;; where PROP is a symbol representing a character property | |
90162
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
38 ;; (name, generic-category, etc), and FILE is a name of one of |
90086 | 39 ;; the following files. |
40 ;; | |
94071
03be13c38b12
(unidata-prop-alist): Fix typo in description of `numeric-value'.
Juanma Barranquero <lekktu@gmail.com>
parents:
91421
diff
changeset
|
41 ;; uni-name.el, uni-category.el, uni-combining.el, uni-bidi.el, |
03be13c38b12
(unidata-prop-alist): Fix typo in description of `numeric-value'.
Juanma Barranquero <lekktu@gmail.com>
parents:
91421
diff
changeset
|
42 ;; uni-decomposition.el, uni-decimal.el, uni-digit.el, uni-numeric.el, |
03be13c38b12
(unidata-prop-alist): Fix typo in description of `numeric-value'.
Juanma Barranquero <lekktu@gmail.com>
parents:
91421
diff
changeset
|
43 ;; uni-mirrored.el, uni-old-name.el, uni-comment.el, uni-uppercase.el, |
03be13c38b12
(unidata-prop-alist): Fix typo in description of `numeric-value'.
Juanma Barranquero <lekktu@gmail.com>
parents:
91421
diff
changeset
|
44 ;; uni-lowercase.el, uni-titlecase.el |
03be13c38b12
(unidata-prop-alist): Fix typo in description of `numeric-value'.
Juanma Barranquero <lekktu@gmail.com>
parents:
91421
diff
changeset
|
45 ;; They each contain a single form of this format: |
90086 | 46 ;; (char-code-property-register PROP CHAR-TABLE) |
47 ;; where PROP is the same as above, and CHAR-TABLE is a | |
90162
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
48 ;; char-table containing property values in a compressed format. |
90086 | 49 ;; |
50 ;; When they are installed in .../lisp/international/, the file | |
51 ;; "charprop.el" is preloaded in loadup.el. The other files are | |
52 ;; automatically loaded when the functions `get-char-code-property' | |
53 ;; and `put-char-code-property' are called. | |
54 ;; | |
55 ;; FORMAT OF A CHAR TABLE | |
56 ;; | |
57 ;; We want to make a file size containing a char-table small. We | |
58 ;; also want to load the file and get a property value fast. We | |
59 ;; also want to reduce the used memory after loading it. So, | |
60 ;; instead of naively storing a property value for each character in | |
61 ;; a char-table (and write it out into a file), we store compressed | |
62 ;; data in a char-table as below. | |
63 ;; | |
64 ;; If succeeding 128*N characters have the same property value, we | |
90162
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
65 ;; store that value for them. Otherwise, compress values for |
90086 | 66 ;; succeeding 128 characters into a single string and store it as a |
67 ;; value for those characters. The way of compression depends on a | |
68 ;; property. See the section "SIMPLE TABLE", "RUN-LENGTH TABLE", | |
69 ;; and "WORD-LIST TABLE". | |
70 | |
71 ;; The char table has four extra slots: | |
72 ;; 1st: property symbol | |
73 ;; 2nd: function to call to get a property value | |
74 ;; 3nd: function to call to put a property value | |
75 ;; 4th: function to call to get a description of a property value | |
76 ;; 5th: data referred by the above functions | |
77 | |
78 ;; List of elements of this form: | |
79 ;; (CHAR-or-RANGE PROP1 PROP2 ... PROPn) | |
80 ;; CHAR-or-RANGE: a character code or a cons of character codes | |
81 ;; PROPn: string representing the nth property value | |
82 | |
90174
f3973ae57d8b
(unidata-text-file): Delete it.
Kenichi Handa <handa@m17n.org>
parents:
90162
diff
changeset
|
83 (defvar unidata-list nil) |
f3973ae57d8b
(unidata-text-file): Delete it.
Kenichi Handa <handa@m17n.org>
parents:
90162
diff
changeset
|
84 |
f3973ae57d8b
(unidata-text-file): Delete it.
Kenichi Handa <handa@m17n.org>
parents:
90162
diff
changeset
|
85 (defun unidata-setup-list (unidata-text-file) |
90086 | 86 (let* ((table (list nil)) |
87 (tail table) | |
88 (block-names '(("^<CJK Ideograph" . CJK\ IDEOGRAPH) | |
89 ("^<Hangul Syllable" . HANGUL\ SYLLABLE) | |
90 ("^<.*Surrogate" . nil) | |
91 ("^<.*Private Use" . PRIVATE\ USE))) | |
92 val char name) | |
93 (or (file-readable-p unidata-text-file) | |
94 (error "File not readable: %s" unidata-text-file)) | |
95 (with-temp-buffer | |
90162
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
96 (insert-file-contents unidata-text-file) |
90086 | 97 (goto-char (point-min)) |
98 (condition-case nil | |
99 (while t | |
100 (setq val (read (current-buffer)) | |
101 char (car val) | |
102 name (cadr val)) | |
103 | |
104 ;; Check this kind of block. | |
105 ;; 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;; | |
106 ;; 9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; | |
107 (if (and (= (aref name 0) ?<) | |
108 (string-match ", First>$" name)) | |
109 (let ((first char) | |
110 (l block-names) | |
111 block-name) | |
112 (setq val (read (current-buffer)) | |
113 char (car val) | |
114 block-name (cadr val) | |
115 name nil) | |
116 (while l | |
117 (if (string-match (caar l) block-name) | |
118 (setq name (cdar l) l nil) | |
119 (setq l (cdr l)))) | |
120 (if (not name) | |
121 ;; As this is a surrogate pair range, ignore it. | |
122 (setq val nil) | |
123 (setcar val (cons first char)) | |
124 (setcar (cdr val) name)))) | |
125 | |
126 (when val | |
127 (setcdr tail (list val)) | |
128 (setq tail (cdr tail)))) | |
129 (error nil))) | |
90174
f3973ae57d8b
(unidata-text-file): Delete it.
Kenichi Handa <handa@m17n.org>
parents:
90162
diff
changeset
|
130 (setq unidata-list (cdr table)))) |
90086 | 131 |
132 ;; Alist of this form: | |
133 ;; (PROP INDEX GENERATOR FILENAME) | |
134 ;; PROP: character property | |
135 ;; INDEX: index to each element of unidata-list for PROP | |
136 ;; GENERATOR: function to generate a char-table | |
137 ;; FILENAME: filename to store the char-table | |
138 ;; DESCRIBER: function to call to get a description string of property value | |
139 | |
140 (defconst unidata-prop-alist | |
141 '((name | |
142 1 unidata-gen-table-name "uni-name.el" | |
143 "Unicode character name. | |
144 Property value is a string.") | |
145 (general-category | |
146 2 unidata-gen-table-symbol "uni-category.el" | |
147 "Unicode general category. | |
148 Property value is one of the following symbols: | |
149 Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po, | |
150 Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn" | |
151 unidata-describe-general-category) | |
152 (canonical-combining-class | |
153 3 unidata-gen-table-integer "uni-combining.el" | |
154 "Unicode canonical combining class. | |
155 Property value is an integer." | |
156 unidata-describe-canonical-combining-class) | |
157 (bidi-class | |
158 4 unidata-gen-table-symbol "uni-bidi.el" | |
159 "Unicode bidi class. | |
160 Property value is one of the following symbols: | |
161 L, LRE, LRO, R, AL, RLE, RLO, PDF, EN, ES, ET, | |
162 AN, CS, NSM, BN, B, S, WS, ON" | |
163 unidata-describe-bidi-class) | |
164 (decomposition | |
165 5 unidata-gen-table-decomposition "uni-decomposition.el" | |
166 "Unicode decomposition mapping. | |
167 Property value is a list of characters. The first element may be | |
90162
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
168 one of these symbols representing compatibility formatting tag: |
90086 | 169 <font>, <noBreak>, <initial>, <medial>, <final>, <isolated>, <circle>, |
170 <super>, <sub>, <vertical>, <wide>, <narrow>, <small>, <square>, <fraction>, | |
171 <compat>" | |
172 unidata-describe-decomposition) | |
173 (decimal-digit-value | |
174 6 unidata-gen-table-integer "uni-decimal.el" | |
175 "Unicode numeric value (decimal digit). | |
176 Property value is an integer.") | |
177 (digit-value | |
178 7 unidata-gen-table-integer "uni-digit.el" | |
179 "Unicode numeric value (digit). | |
180 Property value is an integer.") | |
181 (numeric-value | |
182 8 unidata-gen-table-symbol "uni-numeric.el" | |
183 "Unicode numeric value (numeric). | |
94071
03be13c38b12
(unidata-prop-alist): Fix typo in description of `numeric-value'.
Juanma Barranquero <lekktu@gmail.com>
parents:
91421
diff
changeset
|
184 Property value is a symbol.") |
90086 | 185 (mirrored |
186 9 unidata-gen-table-symbol "uni-mirrored.el" | |
187 "Unicode bidi mirrored flag. | |
188 Property value is a symbol `Y' or `N'.") | |
189 (old-name | |
190 10 unidata-gen-table-name "uni-old-name.el" | |
191 "Unicode old names as published in Unicode 1.0. | |
192 Property value is a string.") | |
193 (iso-10646-comment | |
194 11 unidata-gen-table-name "uni-comment.el" | |
195 "Unicode ISO 10646 comment. | |
196 Property value is a string.") | |
197 (uppercase | |
198 12 unidata-gen-table-character "uni-uppercase.el" | |
199 "Unicode simple uppercase mapping. | |
200 Property value is a character." | |
201 string) | |
202 (lowercase | |
203 13 unidata-gen-table-character "uni-lowercase.el" | |
204 "Unicode simple lowercase mapping. | |
205 Property value is a character." | |
206 string) | |
207 (titlecase | |
208 14 unidata-gen-table-character "uni-titlecase.el" | |
209 "Unicode simple titlecase mapping. | |
210 Property value is a character." | |
211 string))) | |
212 | |
213 ;; Functions to access the above data. | |
214 (defsubst unidata-prop-index (prop) (nth 1 (assq prop unidata-prop-alist))) | |
215 (defsubst unidata-prop-generator (prop) (nth 2 (assq prop unidata-prop-alist))) | |
216 (defsubst unidata-prop-file (prop) (nth 3 (assq prop unidata-prop-alist))) | |
217 (defsubst unidata-prop-docstring (prop) (nth 4 (assq prop unidata-prop-alist))) | |
218 (defsubst unidata-prop-describer (prop) (nth 5 (assq prop unidata-prop-alist))) | |
219 | |
220 | |
221 ;; SIMPLE TABLE | |
222 ;; | |
223 ;; If the type of character property value is character, and the | |
224 ;; values of succeeding character codes are usually different, we use | |
225 ;; a char-table described here to store such values. | |
226 ;; | |
227 ;; If succeeding 128 characters has no property, a char-table has the | |
228 ;; symbol t is for them. Otherwise a char-table has a string of the | |
229 ;; following format for them. | |
230 ;; | |
231 ;; The first character of the string is FIRST-INDEX. | |
232 ;; The Nth (N > 0) character of the string is a property value of the | |
90162
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
233 ;; character (BLOCK-HEAD + FIRST-INDEX + N - 1), where BLOCK-HEAD is |
90086 | 234 ;; the first of the characters in the block. |
235 ;; | |
236 ;; The 4th extra slot of a char-table is nil. | |
237 | |
238 (defun unidata-get-character (char val table) | |
239 (cond | |
240 ((characterp val) | |
241 val) | |
242 | |
243 ((stringp val) | |
244 (let* ((len (length val)) | |
245 (block-head (lsh (lsh char -7) 7)) | |
246 (vec (make-vector 128 nil)) | |
247 (first-index (aref val 0))) | |
248 (dotimes (i (1- len)) | |
249 (let ((elt (aref val (1+ i)))) | |
250 (if (> elt 0) | |
251 (aset vec (+ first-index i) elt)))) | |
252 (dotimes (i 128) | |
253 (aset table (+ block-head i) (aref vec i))) | |
254 (aref vec (- char block-head)))))) | |
255 | |
256 (defun unidata-put-character (char val table) | |
257 (or (characterp val) | |
258 (not val) | |
94071
03be13c38b12
(unidata-prop-alist): Fix typo in description of `numeric-value'.
Juanma Barranquero <lekktu@gmail.com>
parents:
91421
diff
changeset
|
259 (error "Not a character nor nil: %S" val)) |
90086 | 260 (let ((current-val (aref table char))) |
261 (unless (eq current-val val) | |
262 (if (stringp current-val) | |
263 (funcall (char-table-extra-slot table 1) char current-val table)) | |
264 (aset table char val)))) | |
265 | |
266 (defun unidata-gen-table-character (prop) | |
267 (let ((table (make-char-table 'char-code-property-table)) | |
268 (prop-idx (unidata-prop-index prop)) | |
269 (vec (make-vector 128 0)) | |
270 (tail unidata-list) | |
271 elt range val idx slot) | |
272 (set-char-table-range table (cons 0 (max-char)) t) | |
273 (while tail | |
274 (setq elt (car tail) tail (cdr tail)) | |
275 (setq range (car elt) | |
276 val (nth prop-idx elt)) | |
277 (if (= (length val) 0) | |
278 (setq val nil) | |
279 (setq val (string-to-number val 16))) | |
280 (if (consp range) | |
281 (if val | |
282 (set-char-table-range table range val)) | |
283 (let* ((start (lsh (lsh range -7) 7)) | |
284 (limit (+ start 127)) | |
285 first-index last-index) | |
286 (fillarray vec 0) | |
287 (if val | |
288 (aset vec (setq last-index (setq first-index (- range start))) | |
289 val)) | |
290 (while (and (setq elt (car tail) range (car elt)) | |
291 (integerp range) | |
292 (<= range limit)) | |
293 (setq val (nth prop-idx elt)) | |
294 (when (> (length val) 0) | |
295 (aset vec (setq last-index (- range start)) | |
296 (string-to-number val 16)) | |
297 (or first-index | |
298 (setq first-index last-index))) | |
299 (setq tail (cdr tail))) | |
300 (when first-index | |
301 (let ((str (string first-index)) | |
302 c) | |
303 (while (<= first-index last-index) | |
304 (setq str (format "%s%c" str (or (aref vec first-index) 0)) | |
305 first-index (1+ first-index))) | |
306 (set-char-table-range table (cons start limit) str)))))) | |
307 | |
308 (set-char-table-extra-slot table 0 prop) | |
309 (byte-compile 'unidata-get-character) | |
310 (byte-compile 'unidata-put-character) | |
311 (set-char-table-extra-slot table 1 (symbol-function 'unidata-get-character)) | |
312 (set-char-table-extra-slot table 2 (symbol-function 'unidata-put-character)) | |
313 | |
314 table)) | |
315 | |
316 | |
317 | |
318 ;; RUN-LENGTH TABLE | |
319 ;; | |
320 ;; If the type of character property value is symbol, integer, | |
321 ;; boolean, or character, we use a char-table described here to store | |
322 ;; the values. | |
323 ;; | |
324 ;; The 4th extra slot is a vector of property values (VAL-TABLE), and | |
325 ;; values for succeeding 128 characters are encoded into this | |
326 ;; character sequence: | |
327 ;; ( VAL-CODE RUN-LENGTH ? ) + | |
328 ;; where: | |
329 ;; VAL-CODE (0..127): | |
330 ;; (VAL-CODE - 1) is an index into VAL-TABLE. | |
331 ;; The value 0 means no-value. | |
332 ;; RUN-LENGTH (130..255): | |
333 ;; (RUN-LENGTH - 128) specifies how many characters have the same | |
334 ;; value. If omitted, it means 1. | |
335 | |
336 | |
337 ;; Return a symbol-type character property value of CHAR. VAL is the | |
338 ;; current value of (aref TABLE CHAR). | |
339 | |
340 (defun unidata-get-symbol (char val table) | |
341 (let ((val-table (char-table-extra-slot table 4))) | |
342 (cond ((symbolp val) | |
343 val) | |
344 ((stringp val) | |
345 (let ((first-char (lsh (lsh char -7) 7)) | |
346 (str val) | |
347 (len (length val)) | |
348 (idx 0) | |
349 this-val count) | |
350 (set-char-table-range table (cons first-char (+ first-char 127)) | |
351 nil) | |
352 (while (< idx len) | |
353 (setq val (aref str idx) idx (1+ idx) | |
354 count (if (< idx len) (aref str idx) 1)) | |
355 (setq val (and (> val 0) (aref val-table (1- val))) | |
356 count (if (< count 128) | |
357 1 | |
358 (prog1 (- count 128) (setq idx (1+ idx))))) | |
359 (dotimes (i count) | |
360 (if val | |
361 (aset table first-char val)) | |
362 (if (= first-char char) | |
363 (setq this-val val)) | |
364 (setq first-char (1+ first-char)))) | |
365 this-val)) | |
366 ((> val 0) | |
367 (aref val-table (1- val)))))) | |
94071
03be13c38b12
(unidata-prop-alist): Fix typo in description of `numeric-value'.
Juanma Barranquero <lekktu@gmail.com>
parents:
91421
diff
changeset
|
368 |
90086 | 369 ;; Return a integer-type character property value of CHAR. VAL is the |
370 ;; current value of (aref TABLE CHAR). | |
371 | |
372 (defun unidata-get-integer (char val table) | |
373 (let ((val-table (char-table-extra-slot table 4))) | |
374 (cond ((integerp val) | |
375 val) | |
376 ((stringp val) | |
377 (let ((first-char (lsh (lsh char -7) 7)) | |
378 (str val) | |
379 (len (length val)) | |
380 (idx 0) | |
381 this-val count) | |
382 (while (< idx len) | |
383 (setq val (aref str idx) idx (1+ idx) | |
384 count (if (< idx len) (aref str idx) 1)) | |
385 (setq val (and (> val 0) (aref val-table (1- val))) | |
386 count (if (< count 128) | |
387 1 | |
388 (prog1 (- count 128) (setq idx (1+ idx))))) | |
389 (dotimes (i count) | |
390 (aset table first-char val) | |
391 (if (= first-char char) | |
392 (setq this-val val)) | |
393 (setq first-char (1+ first-char)))) | |
394 this-val))))) | |
395 | |
396 ;; Store VAL (symbol) as a character property value of CHAR in TABLE. | |
397 | |
398 (defun unidata-put-symbol (char val table) | |
399 (or (symbolp val) | |
400 (error "Not a symbol: %S" val)) | |
401 (let ((current-val (aref table char))) | |
402 (unless (eq current-val val) | |
403 (if (stringp current-val) | |
404 (funcall (char-table-extra-slot table 1) char current-val table)) | |
405 (aset table char val)))) | |
406 | |
407 ;; Store VAL (integer) as a character property value of CHAR in TABLE. | |
408 | |
409 (defun unidata-put-integer (char val table) | |
410 (or (integerp val) | |
411 (not val) | |
412 (error "Not an integer nor nil: %S" val)) | |
413 (let ((current-val (aref table char))) | |
414 (unless (eq current-val val) | |
415 (if (stringp current-val) | |
416 (funcall (char-table-extra-slot table 1) char current-val table)) | |
417 (aset table char val)))) | |
418 | |
419 ;; Encode the character property value VAL into an integer value by | |
420 ;; VAL-LIST. By side effect, VAL-LIST is modified. | |
421 ;; VAL-LIST has this form: | |
422 ;; (t (VAL1 . VAL-CODE1) (VAL2 . VAL-CODE2) ...) | |
423 ;; If VAL is one of VALn, just return VAL-CODEn. Otherwise, | |
424 ;; VAL-LIST is modified to this: | |
425 ;; (t (VAL . (1+ VAL-CODE1)) (VAL1 . VAL-CODE1) (VAL2 . VAL-CODE2) ...) | |
426 | |
427 (defun unidata-encode-val (val-list val) | |
428 (let ((slot (assq val val-list)) | |
429 val-code) | |
430 (if slot | |
431 (cdr slot) | |
432 (setq val-code (if (cdr val-list) (1+ (cdr (nth 1 val-list))) 1)) | |
433 (setcdr val-list (cons (cons val val-code) (cdr val-list))) | |
434 val-code))) | |
435 | |
436 ;; Generate a char-table for the character property PROP. | |
437 | |
438 (defun unidata-gen-table (prop val-func default-value) | |
439 (let ((table (make-char-table 'char-code-property-table)) | |
440 (prop-idx (unidata-prop-index prop)) | |
441 (val-list (list t)) | |
442 (vec (make-vector 128 0)) | |
443 tail elt range val val-code idx slot) | |
444 (set-char-table-range table (cons 0 (max-char)) default-value) | |
445 (setq tail unidata-list) | |
446 (while tail | |
447 (setq elt (car tail) tail (cdr tail)) | |
448 (setq range (car elt) | |
449 val (funcall val-func (nth prop-idx elt))) | |
450 (setq val-code (if val (unidata-encode-val val-list val))) | |
451 (if (consp range) | |
452 (if val-code | |
453 (set-char-table-range table range val)) | |
454 (let* ((start (lsh (lsh range -7) 7)) | |
455 (limit (+ start 127)) | |
456 str count new-val) | |
457 (fillarray vec 0) | |
458 (if val-code | |
459 (aset vec (- range start) val-code)) | |
460 (while (and (setq elt (car tail) range (car elt)) | |
461 (integerp range) | |
462 (<= range limit)) | |
463 (setq new-val (funcall val-func (nth prop-idx elt))) | |
464 (if (not (eq val new-val)) | |
465 (setq val new-val | |
466 val-code (if val (unidata-encode-val val-list val)))) | |
467 (if val-code | |
468 (aset vec (- range start) val-code)) | |
469 (setq tail (cdr tail))) | |
470 (setq str "" val-code -1 count 0) | |
471 (mapc #'(lambda (x) | |
472 (if (= val-code x) | |
473 (setq count (1+ count)) | |
474 (if (> count 2) | |
475 (setq str (concat str (string val-code | |
476 (+ count 128)))) | |
477 (if (= count 2) | |
478 (setq str (concat str (string val-code val-code))) | |
479 (if (= count 1) | |
480 (setq str (concat str (string val-code)))))) | |
481 (setq val-code x count 1))) | |
482 vec) | |
483 (if (= count 128) | |
484 (if val | |
485 (set-char-table-range table (cons start limit) val)) | |
486 (if (= val-code 0) | |
487 (set-char-table-range table (cons start limit) str) | |
488 (if (> count 2) | |
489 (setq str (concat str (string val-code (+ count 128)))) | |
490 (if (= count 2) | |
491 (setq str (concat str (string val-code val-code))) | |
492 (setq str (concat str (string val-code))))) | |
493 (set-char-table-range table (cons start limit) str)))))) | |
494 | |
495 (setq val-list (nreverse (cdr val-list))) | |
94071
03be13c38b12
(unidata-prop-alist): Fix typo in description of `numeric-value'.
Juanma Barranquero <lekktu@gmail.com>
parents:
91421
diff
changeset
|
496 (set-char-table-extra-slot table 0 prop) |
90086 | 497 (set-char-table-extra-slot table 4 (vconcat (mapcar 'car val-list))) |
498 table)) | |
499 | |
500 (defun unidata-gen-table-symbol (prop) | |
501 (let ((table (unidata-gen-table prop | |
502 #'(lambda (x) (and (> (length x) 0) | |
503 (intern x))) | |
504 0))) | |
505 (byte-compile 'unidata-get-symbol) | |
506 (byte-compile 'unidata-put-symbol) | |
507 (set-char-table-extra-slot table 1 (symbol-function 'unidata-get-symbol)) | |
508 (set-char-table-extra-slot table 2 (symbol-function 'unidata-put-symbol)) | |
509 table)) | |
510 | |
511 (defun unidata-gen-table-integer (prop) | |
512 (let ((table (unidata-gen-table prop | |
513 #'(lambda (x) (and (> (length x) 0) | |
514 (string-to-number x))) | |
515 t))) | |
516 (byte-compile 'unidata-get-integer) | |
517 (byte-compile 'unidata-put-integer) | |
518 (set-char-table-extra-slot table 1 (symbol-function 'unidata-get-integer)) | |
519 (set-char-table-extra-slot table 2 (symbol-function 'unidata-put-integer)) | |
520 table)) | |
521 | |
522 | |
523 ;; WORD-LIST TABLE | |
524 | |
525 ;; If the table is for `name' property, each character in the string | |
526 ;; is one of these: | |
527 ;; DIFF-HEAD-CODE (0, 1, or 2): | |
528 ;; specifies how to decode the following characters. | |
529 ;; WORD-CODE (3..#x7FF excluding '-', '0'..'9', 'A'..'Z'): | |
530 ;; specifies an index number into WORD-TABLE (see below) | |
531 ;; Otherwise (' ', '-', '0'..'9', 'A'..'Z'): | |
532 ;; specifies a literal word. | |
533 ;; | |
534 ;; The 4th slots is a vector: | |
535 ;; [ WORD-TABLE BLOCK-NAME HANGUL-JAMO-TABLE ] | |
536 ;; WORD-TABLE is a vector of word symbols. | |
537 ;; BLOCK-NAME is a vector of name symbols for a block of characters. | |
538 ;; HANGUL-JAMO-TABLE is `unidata-name-jamo-name-table'. | |
539 | |
540 ;; Return the difference of symbol list L1 and L2 in this form: | |
541 ;; (DIFF-HEAD SYM1 SYM2 ...) | |
542 ;; DIFF-HEAD is ((SAME-HEAD-LENGTH * 16) + SAME-TAIL-LENGTH). | |
543 ;; Ex: If L1 is (a b c d e f) and L2 is (a g h e f), this function | |
544 ;; returns ((+ (* 1 16) 2) g h). | |
545 ;; It means that we can get L2 from L1 by prepending the first element | |
546 ;; of L1 and appending the last 2 elements of L1 to the list (g h). | |
547 ;; If L1 and L2 don't have common elements at the head and tail, | |
548 ;; set DIFF-HEAD to -1 and SYM1 ... to the elements of L2. | |
549 | |
550 (defun unidata-word-list-diff (l1 l2) | |
551 (let ((beg 0) | |
552 (end 0) | |
553 (len1 (length l1)) | |
554 (len2 (length l2)) | |
555 result) | |
556 (when (< len1 16) | |
557 (while (and l1 (eq (car l1) (car l2))) | |
558 (setq beg (1+ beg) | |
559 l1 (cdr l1) len1 (1- len1) l2 (cdr l2) len2 (1- len2))) | |
94071
03be13c38b12
(unidata-prop-alist): Fix typo in description of `numeric-value'.
Juanma Barranquero <lekktu@gmail.com>
parents:
91421
diff
changeset
|
560 (while (and (< end len1) (< end len2) |
90086 | 561 (eq (nth (- len1 end 1) l1) (nth (- len2 end 1) l2))) |
562 (setq end (1+ end)))) | |
563 (if (= (+ beg end) 0) | |
564 (setq result (list -1)) | |
565 (setq result (list (+ (* beg 16) (+ beg (- len1 end)))))) | |
566 (while (< end len2) | |
567 (setcdr result (cons (nth (- len2 end 1) l2) (cdr result))) | |
568 (setq end (1+ end))) | |
569 result)) | |
570 | |
571 ;; Return a compressed form of the vector VEC. Each element of VEC is | |
572 ;; a list of symbols of which names can be concatenated to form a | |
573 ;; character name. This function changes those elements into | |
574 ;; compressed forms by utilizing the fact that diff of consecutive | |
575 ;; elements is usually small. | |
576 | |
577 (defun unidata-word-list-compress (vec) | |
578 (let (last-elt last-idx diff-head tail elt val) | |
579 (dotimes (i 128) | |
580 (setq elt (aref vec i)) | |
581 (when elt | |
582 (if (null last-elt) | |
583 (setq diff-head -1 | |
584 val (cons 0 elt)) | |
585 (setq val (unidata-word-list-diff last-elt elt)) | |
586 (if (= (car val) -1) | |
587 (setq diff-head -1 | |
588 val (cons 0 (cdr val))) | |
589 (if (eq diff-head (car val)) | |
590 (setq val (cons 2 (cdr val))) | |
591 (setq diff-head (car val)) | |
592 (if (>= diff-head 0) | |
593 (setq val (cons 1 val)))))) | |
594 (aset vec i val) | |
595 (setq last-idx i last-elt elt))) | |
596 (if (not last-idx) | |
597 (setq vec nil) | |
598 (if (< last-idx 127) | |
599 (let ((shorter (make-vector (1+ last-idx) nil))) | |
600 (dotimes (i (1+ last-idx)) | |
601 (aset shorter i (aref vec i))) | |
602 (setq vec shorter)))) | |
603 vec)) | |
604 | |
605 ;; Encode the word index IDX into a characters code that can be | |
606 ;; embedded in a string. | |
607 | |
608 (defsubst unidata-encode-word (idx) | |
609 ;; Exclude 0, 1, 2. | |
610 (+ idx 3)) | |
611 | |
612 ;; Decode the character code CODE (that is embedded in a string) into | |
613 ;; the corresponding word name by looking up WORD-TABLE. | |
614 | |
615 (defsubst unidata-decode-word (code word-table) | |
616 (setq code (- code 3)) | |
617 (if (< code (length word-table)) | |
618 (aref word-table code))) | |
619 | |
620 ;; Table of short transliterated name symbols of Hangul Jamo divided | |
621 ;; into Choseong, Jungseong, and Jongseong. | |
622 | |
623 (defconst unidata-name-jamo-name-table | |
624 [[G GG N D DD R M B BB S SS nil J JJ C K T P H] | |
625 [A AE YA YAE EO E YEO YE O WA WAE OE YO U WEO WE WI YU EU YI I] | |
626 [G GG GS N NJ NH D L LG LM LB LS LT LP LH M B BS S SS NG J C K T P H]]) | |
627 | |
628 ;; Return a name of CHAR. VAL is the current value of (aref TABLE | |
629 ;; CHAR). | |
630 | |
631 (defun unidata-get-name (char val table) | |
94071
03be13c38b12
(unidata-prop-alist): Fix typo in description of `numeric-value'.
Juanma Barranquero <lekktu@gmail.com>
parents:
91421
diff
changeset
|
632 (cond |
90086 | 633 ((stringp val) |
634 (if (> (aref val 0) 0) | |
635 val | |
636 (let* ((first-char (lsh (lsh char -7) 7)) | |
637 (word-table (aref (char-table-extra-slot table 4) 0)) | |
638 (i 1) | |
639 (len (length val)) | |
640 (vec (make-vector 128 nil)) | |
641 (idx 0) | |
642 (case-fold-search nil) | |
643 c word-list tail-list last-list word diff-head) | |
644 (while (< i len) | |
645 (setq c (aref val i)) | |
646 (if (< c 3) | |
647 (progn | |
648 (if (or word-list tail-list) | |
649 (aset vec idx | |
650 (setq last-list (nconc word-list tail-list)))) | |
651 (setq i (1+ i) idx (1+ idx) | |
652 word-list nil tail-list nil) | |
653 (if (> c 0) | |
654 (let ((l last-list)) | |
655 (if (= c 1) | |
656 (setq diff-head | |
657 (prog1 (aref val i) (setq i (1+ i))))) | |
658 (setq tail-list (nthcdr (% diff-head 16) last-list)) | |
659 (dotimes (i (/ diff-head 16)) | |
660 (setq word-list (nconc word-list (list (car l))) | |
661 l (cdr l)))))) | |
662 (setq word-list | |
94071
03be13c38b12
(unidata-prop-alist): Fix typo in description of `numeric-value'.
Juanma Barranquero <lekktu@gmail.com>
parents:
91421
diff
changeset
|
663 (nconc word-list |
90086 | 664 (list (symbol-name |
665 (unidata-decode-word c word-table)))) | |
666 i (1+ i)))) | |
667 (if (or word-list tail-list) | |
668 (aset vec idx (nconc word-list tail-list))) | |
669 (setq val nil) | |
670 (dotimes (i 128) | |
671 (setq c (+ first-char i)) | |
672 (let ((name (aref vec i))) | |
673 (if name | |
674 (let ((tail (cdr (setq name (copy-sequence name)))) | |
675 elt) | |
676 (while tail | |
677 (setq elt (car tail)) | |
678 (or (string= elt "-") | |
679 (progn | |
680 (setcdr tail (cons elt (cdr tail))) | |
681 (setcar tail " "))) | |
682 (setq tail (cddr tail))) | |
683 (setq name (apply 'concat name)))) | |
684 (aset table c name) | |
685 (if (= c char) | |
686 (setq val name)))) | |
687 val))) | |
688 | |
689 ((and (integerp val) (> val 0)) | |
690 (let* ((symbol-table (aref (char-table-extra-slot table 4) 1)) | |
691 (sym (aref symbol-table (1- val)))) | |
692 (cond ((eq sym 'HANGUL\ SYLLABLE) | |
693 (let ((jamo-name-table (aref (char-table-extra-slot table 4) 2))) | |
694 ;; SIndex = S - SBase | |
695 (setq char (- char #xAC00)) | |
696 (let ( ;; LIndex = SIndex / NCount | |
697 (L (/ char 588)) | |
698 ;; VIndex = (SIndex % NCount) * TCount | |
699 (V (/ (% char 588) 28)) | |
700 ;; TIndex = SIndex % TCount | |
701 (T (% char 28))) | |
94071
03be13c38b12
(unidata-prop-alist): Fix typo in description of `numeric-value'.
Juanma Barranquero <lekktu@gmail.com>
parents:
91421
diff
changeset
|
702 (format "HANGUL SYLLABLE %s%s%s" |
90112
eb1b00df002b
(unidata-get-name): Handle U+110B.
Kenichi Handa <handa@m17n.org>
parents:
90109
diff
changeset
|
703 ;; U+110B is nil in this table. |
eb1b00df002b
(unidata-get-name): Handle U+110B.
Kenichi Handa <handa@m17n.org>
parents:
90109
diff
changeset
|
704 (or (aref (aref jamo-name-table 0) L) "") |
90086 | 705 (aref (aref jamo-name-table 1) V) |
706 (if (= T 0) "" | |
707 (aref (aref jamo-name-table 2) (1- T))))))) | |
90109
790c49ba39db
(unidata-get-name): Handle "CJK IDEOGRAPH".
Kenichi Handa <handa@m17n.org>
parents:
90097
diff
changeset
|
708 ((eq sym 'CJK\ IDEOGRAPH) |
790c49ba39db
(unidata-get-name): Handle "CJK IDEOGRAPH".
Kenichi Handa <handa@m17n.org>
parents:
90097
diff
changeset
|
709 (format "%s-%04X" sym char)) |
90086 | 710 ((eq sym 'CJK\ COMPATIBILITY\ IDEOGRAPH) |
711 (format "%s-%04X" sym char)) | |
712 ((eq sym 'VARIATION\ SELECTOR) | |
713 (format "%s-%d" sym (+ (- char #xe0100) 17)))))))) | |
714 | |
715 ;; Store VAL as the name of CHAR in TABLE. | |
716 | |
717 (defun unidata-put-name (char val table) | |
718 (let ((current-val (aref table char))) | |
719 (if (and (stringp current-val) (= (aref current-val 0) 0)) | |
720 (funcall (char-table-extra-slot table 1) char current-val table)) | |
721 (aset table char val))) | |
722 | |
723 (defun unidata-get-decomposition (char val table) | |
724 (cond | |
725 ((consp val) | |
726 val) | |
727 | |
728 ((stringp val) | |
729 (if (> (aref val 0) 0) | |
730 val | |
731 (let* ((first-char (lsh (lsh char -7) 7)) | |
732 (word-table (char-table-extra-slot table 4)) | |
733 (i 1) | |
734 (len (length val)) | |
735 (vec (make-vector 128 nil)) | |
736 (idx 0) | |
737 (case-fold-search nil) | |
738 c word-list tail-list last-list word diff-head) | |
739 (while (< i len) | |
740 (setq c (aref val i)) | |
741 (if (< c 3) | |
742 (progn | |
743 (if (or word-list tail-list) | |
744 (aset vec idx | |
745 (setq last-list (nconc word-list tail-list)))) | |
746 (setq i (1+ i) idx (1+ idx) | |
747 word-list nil tail-list nil) | |
748 (if (> c 0) | |
749 (let ((l last-list)) | |
750 (if (= c 1) | |
751 (setq diff-head | |
752 (prog1 (aref val i) (setq i (1+ i))))) | |
753 (setq tail-list (nthcdr (% diff-head 16) last-list)) | |
754 (dotimes (i (/ diff-head 16)) | |
755 (setq word-list (nconc word-list (list (car l))) | |
756 l (cdr l)))))) | |
757 (setq word-list | |
94071
03be13c38b12
(unidata-prop-alist): Fix typo in description of `numeric-value'.
Juanma Barranquero <lekktu@gmail.com>
parents:
91421
diff
changeset
|
758 (nconc word-list |
90086 | 759 (list (or (unidata-decode-word c word-table) c))) |
760 i (1+ i)))) | |
761 (if (or word-list tail-list) | |
762 (aset vec idx (nconc word-list tail-list))) | |
763 (dotimes (i 128) | |
764 (aset table (+ first-char i) (aref vec i))) | |
90162
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
765 (aref vec (- char first-char))))) |
90086 | 766 |
90162
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
767 ;; Hangul syllable |
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
768 ((and (eq val 0) (>= char #xAC00) (<= char #xD7A3)) |
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
769 ;; SIndex = S (char) - SBase (#xAC00) |
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
770 (setq char (- char #xAC00)) |
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
771 (let (;; L = LBase + SIndex / NCount |
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
772 (L (+ #x1100 (/ char 588))) |
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
773 ;; V = VBase + (SIndex % NCount) * TCount |
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
774 (V (+ #x1161 (/ (% char 588) 28))) |
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
775 ;; T = TBase + SIndex % TCount |
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
776 (T (+ #x11A7 (% char 28)))) |
90176
dfbe37c15be3
(unidata-get-decomposition): For Hangul
Kenichi Handa <handa@m17n.org>
parents:
90174
diff
changeset
|
777 (if (= T #x11A7) |
dfbe37c15be3
(unidata-get-decomposition): For Hangul
Kenichi Handa <handa@m17n.org>
parents:
90174
diff
changeset
|
778 (list L V) |
dfbe37c15be3
(unidata-get-decomposition): For Hangul
Kenichi Handa <handa@m17n.org>
parents:
90174
diff
changeset
|
779 (list L V T)))) |
90162
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
780 |
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
781 )) |
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
782 |
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
783 ;; Store VAL as the decomposition information of CHAR in TABLE. |
90086 | 784 |
785 (defun unidata-put-decomposition (char val table) | |
786 (let ((current-val (aref table char))) | |
787 (if (and (stringp current-val) (= (aref current-val 0) 0)) | |
788 (funcall (char-table-extra-slot table 1) char current-val table)) | |
789 (aset table char val))) | |
790 | |
791 ;; UnicodeData.txt contains these lines: | |
792 ;; 0000;<control>;Cc;0;BN;;;;;N;NULL;;;; | |
793 ;; ... | |
794 ;; 0020;SPACE;Zs;0;WS;;;;;N;;;;; | |
795 ;; ... | |
796 ;; The following command yields a file of about 96K bytes. | |
797 ;; % gawk -F ';' '{print $1,$2;}' < UnicodeData.txt | gzip > temp.gz | |
798 ;; With the following function, we can get a file of almost the same | |
799 ;; the size. | |
800 | |
801 ;; Generate a char-table for character names. | |
802 | |
803 (defun unidata-gen-table-word-list (prop val-func) | |
804 (let ((table (make-char-table 'char-code-property-table)) | |
805 (prop-idx (unidata-prop-index prop)) | |
806 (word-list (list nil)) | |
807 word-table | |
808 block-list block-word-table block-end | |
809 tail elt range val idx slot) | |
810 (set-char-table-range table (cons 0 (max-char)) 0) | |
811 (setq tail unidata-list) | |
812 (setq block-end -1) | |
813 (while tail | |
814 (setq elt (car tail) tail (cdr tail)) | |
815 (setq range (car elt) | |
816 val (funcall val-func (nth prop-idx elt))) | |
817 ;; Treat the sequence of "CJK COMPATIBILITY IDEOGRAPH-XXXX" and | |
818 ;; "VARIATION SELECTOR-XXX" as a block. | |
819 (if (and (consp val) (eq prop 'name) | |
820 (or (and (eq (car val) 'CJK) | |
821 (eq (nth 1 val) 'COMPATIBILITY)) | |
822 (and (>= range #xe0100) | |
823 (eq (car val) 'VARIATION) | |
824 (eq (nth 1 val) 'SELECTOR)))) | |
825 (let ((first (car val)) | |
826 (second (nth 1 val)) | |
827 (start range)) | |
828 (while (and (setq elt (car tail) range (car elt) | |
829 val (funcall val-func (nth prop-idx elt))) | |
830 (consp val) | |
831 (eq first (car val)) | |
832 (eq second (nth 1 val))) | |
833 (setq block-end range | |
834 tail (cdr tail))) | |
835 (setq range (cons start block-end) | |
836 val (if (eq first 'CJK) 'CJK\ COMPATIBILITY\ IDEOGRAPH | |
837 'VARIATION\ SELECTOR)))) | |
838 | |
839 (if (consp range) | |
840 (if val | |
841 (let ((slot (assq val block-list))) | |
842 (setq range (cons (car range) (cdr range))) | |
843 (setq block-end (cdr range)) | |
844 (if slot | |
845 (nconc slot (list range)) | |
846 (push (list val range) block-list)))) | |
847 (let* ((start (lsh (lsh range -7) 7)) | |
848 (limit (+ start 127)) | |
849 (first tail) | |
850 (vec (make-vector 128 nil)) | |
851 c name len) | |
852 (if (<= start block-end) | |
853 ;; START overlap with the previous block. | |
854 (aset table range (nth prop-idx elt)) | |
855 (if val | |
856 (aset vec (- range start) val)) | |
857 (while (and (setq elt (car tail) range (car elt)) | |
858 (integerp range) | |
859 (<= range limit)) | |
860 (setq val (funcall val-func (nth prop-idx elt))) | |
861 (if val | |
862 (aset vec (- range start) val)) | |
863 (setq tail (cdr tail))) | |
864 (setq vec (unidata-word-list-compress vec)) | |
865 (when vec | |
866 (dotimes (i (length vec)) | |
867 (dolist (elt (aref vec i)) | |
868 (if (symbolp elt) | |
869 (let ((slot (assq elt word-list))) | |
870 (if slot | |
871 (setcdr slot (1+ (cdr slot))) | |
872 (setcdr word-list | |
873 (cons (cons elt 1) (cdr word-list)))))))) | |
874 (set-char-table-range table (cons start limit) vec)))))) | |
875 (setq word-list (sort (cdr word-list) | |
876 #'(lambda (x y) (> (cdr x) (cdr y))))) | |
877 (setq tail word-list idx 0) | |
878 (while tail | |
879 (setcdr (car tail) (unidata-encode-word idx)) | |
880 (setq idx (1+ idx) tail (cdr tail))) | |
881 (setq word-table (make-vector (length word-list) nil)) | |
882 (setq idx 0) | |
883 (dolist (elt word-list) | |
884 (aset word-table idx (car elt)) | |
885 (setq idx (1+ idx))) | |
886 | |
887 (if (and (eq prop 'decomposition) | |
888 (> idx 32)) | |
90162
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
889 (error "Too many symbols in decomposition data")) |
90086 | 890 |
891 (dotimes (i (/ #x110000 128)) | |
892 (let* ((idx (* i 128)) | |
893 (vec (aref table idx))) | |
894 (when (vectorp vec) | |
895 (dotimes (i (length vec)) | |
896 (let ((tail (aref vec i)) | |
897 elt code) | |
898 (if (not tail) | |
899 (aset vec i "\0") | |
900 (while tail | |
901 (setq elt (car tail) | |
902 code (if (integerp elt) elt | |
903 (cdr (assq elt word-list)))) | |
904 (setcar tail (string code)) | |
905 (setq tail (cdr tail))) | |
906 (aset vec i (mapconcat 'identity (aref vec i) ""))))) | |
907 (set-char-table-range | |
908 table (cons idx (+ idx 127)) | |
909 (mapconcat 'identity vec ""))))) | |
910 | |
911 (setq block-word-table (make-vector (length block-list) nil)) | |
912 (setq idx 0) | |
913 (dolist (elt block-list) | |
914 (dolist (e (cdr elt)) | |
915 (set-char-table-range table e (1+ idx))) | |
916 (aset block-word-table idx (car elt)) | |
917 (setq idx (1+ idx))) | |
918 | |
919 (set-char-table-extra-slot table 0 prop) | |
920 (set-char-table-extra-slot table 4 (cons word-table block-word-table)) | |
921 table)) | |
922 | |
923 (defun unidata-split-name (str) | |
924 (if (symbolp str) | |
925 str | |
926 (let ((len (length str)) | |
927 (l nil) | |
928 (idx 0) | |
929 c) | |
930 (if (= len 0) | |
931 nil | |
932 (dotimes (i len) | |
933 (setq c (aref str i)) | |
934 (if (= c 32) | |
935 (setq l (cons (intern (substring str idx i)) l) | |
936 idx (1+ i)) | |
94071
03be13c38b12
(unidata-prop-alist): Fix typo in description of `numeric-value'.
Juanma Barranquero <lekktu@gmail.com>
parents:
91421
diff
changeset
|
937 (if (and (= c ?-) (< idx i) |
90086 | 938 (< (1+ i) len) (/= (aref str (1+ i)) 32)) |
939 (setq l (cons '- (cons (intern (substring str idx i)) l)) | |
940 idx (1+ i))))) | |
941 (nreverse (cons (intern (substring str idx)) l)))))) | |
942 | |
943 (defun unidata-gen-table-name (prop) | |
944 (let* ((table (unidata-gen-table-word-list prop 'unidata-split-name)) | |
945 (word-tables (char-table-extra-slot table 4))) | |
946 (byte-compile 'unidata-get-name) | |
947 (byte-compile 'unidata-put-name) | |
948 (set-char-table-extra-slot table 1 (symbol-function 'unidata-get-name)) | |
949 (set-char-table-extra-slot table 2 (symbol-function 'unidata-put-name)) | |
950 | |
951 (if (eq prop 'name) | |
952 (set-char-table-extra-slot table 4 | |
953 (vector (car word-tables) | |
954 (cdr word-tables) | |
955 unidata-name-jamo-name-table)) | |
956 (set-char-table-extra-slot table 4 | |
957 (vector (car word-tables)))) | |
958 table)) | |
959 | |
960 (defun unidata-split-decomposition (str) | |
961 (if (symbolp str) | |
962 str | |
963 (let ((len (length str)) | |
964 (l nil) | |
965 (idx 0) | |
966 c) | |
967 (if (= len 0) | |
968 nil | |
969 (dotimes (i len) | |
970 (setq c (aref str i)) | |
971 (if (= c 32) | |
972 (setq l (if (= (aref str idx) ?<) | |
973 (cons (intern (substring str idx i)) l) | |
90162
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
974 (cons (string-to-number (substring str idx i) 16) l)) |
90086 | 975 idx (1+ i)))) |
976 (if (= (aref str idx) ?<) | |
977 (setq l (cons (intern (substring str idx len)) l)) | |
90162
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
978 (setq l (cons (string-to-number (substring str idx len) 16) l))) |
90086 | 979 (nreverse l))))) |
980 | |
981 | |
982 (defun unidata-gen-table-decomposition (prop) | |
983 (let* ((table (unidata-gen-table-word-list prop 'unidata-split-decomposition)) | |
984 (word-tables (char-table-extra-slot table 4))) | |
985 (byte-compile 'unidata-get-decomposition) | |
986 (byte-compile 'unidata-put-decomposition) | |
987 (set-char-table-extra-slot table 1 | |
988 (symbol-function 'unidata-get-decomposition)) | |
94071
03be13c38b12
(unidata-prop-alist): Fix typo in description of `numeric-value'.
Juanma Barranquero <lekktu@gmail.com>
parents:
91421
diff
changeset
|
989 (set-char-table-extra-slot table 2 |
90086 | 990 (symbol-function 'unidata-put-decomposition)) |
991 (set-char-table-extra-slot table 4 (car word-tables)) | |
992 table)) | |
993 | |
994 | |
995 | |
996 (defun unidata-describe-general-category (val) | |
997 (cdr (assq val | |
998 '((Lu . "Letter, Uppercase") | |
999 (Ll . "Letter, Lowercase") | |
1000 (Lt . "Letter, Titlecase") | |
1001 (Lm . "Letter, Modifier") | |
1002 (Lo . "Letter, Other") | |
1003 (Mn . "Mark, Nonspacing") | |
1004 (Mc . "Mark, Spacing Combining") | |
1005 (Me . "Mark, Enclosing") | |
1006 (Nd . "Number, Decimal Digit") | |
1007 (Nl . "Number, Letter") | |
1008 (No . "Number, Other") | |
1009 (Pc . "Punctuation, Connector") | |
1010 (Pd . "Punctuation, Dash") | |
1011 (Ps . "Punctuation, Open") | |
1012 (Pe . "Punctuation, Close") | |
1013 (Pi . "Punctuation, Initial quote") | |
1014 (Pf . "Punctuation, Final quote") | |
1015 (Po . "Punctuation, Other") | |
1016 (Sm . "Symbol, Math") | |
1017 (Sc . "Symbol, Currency") | |
1018 (Sk . "Symbol, Modifier") | |
1019 (So . "Symbol, Other") | |
1020 (Zs . "Separator, Space") | |
1021 (Zl . "Separator, Line") | |
1022 (Zp . "Separator, Paragraph") | |
1023 (Cc . "Other, Control") | |
1024 (Cf . "Other, Format") | |
1025 (Cs . "Other, Surrogate") | |
1026 (Co . "Other, Private Use") | |
1027 (Cn . "Other, Not Assigned"))))) | |
1028 | |
1029 (defun unidata-describe-canonical-combining-class (val) | |
1030 (cdr (assq val | |
1031 '((0 . "Spacing, split, enclosing, reordrant, and Tibetan subjoined") | |
1032 (1 . "Overlays and interior") | |
1033 (7 . "Nuktas") | |
1034 (8 . "Hiragana/Katakana voicing marks") | |
1035 (9 . "Viramas") | |
1036 (10 . "Start of fixed position classes") | |
1037 (199 . "End of fixed position classes") | |
1038 (200 . "Below left attached") | |
1039 (202 . "Below attached") | |
1040 (204 . "Below right attached") | |
1041 (208 . "Left attached (reordrant around single base character)") | |
1042 (210 . "Right attached") | |
1043 (212 . "Above left attached") | |
1044 (214 . "Above attached") | |
1045 (216 . "Above right attached") | |
1046 (218 . "Below left") | |
1047 (220 . "Below") | |
1048 (222 . "Below right") | |
1049 (224 . "Left (reordrant around single base character)") | |
1050 (226 . "Right") | |
1051 (228 . "Above left") | |
1052 (230 . "Above") | |
1053 (232 . "Above right") | |
1054 (233 . "Double below") | |
1055 (234 . "Double above") | |
1056 (240 . "Below (iota subscript)"))))) | |
1057 | |
1058 (defun unidata-describe-bidi-class (val) | |
1059 (cdr (assq val | |
1060 '((L . "Left-to-Right") | |
1061 (LRE . "Left-to-Right Embedding") | |
1062 (LRO . "Left-to-Right Override") | |
1063 (R . "Right-to-Left") | |
1064 (AL . "Right-to-Left Arabic") | |
1065 (RLE . "Right-to-Left Embedding") | |
1066 (RLO . "Right-to-Left Override") | |
1067 (PDF . "Pop Directional Format") | |
1068 (EN . "European Number") | |
1069 (ES . "European Number Separator") | |
1070 (ET . "European Number Terminator") | |
1071 (AN . "Arabic Number") | |
1072 (CS . "Common Number Separator") | |
1073 (NSM . "Non-Spacing Mark") | |
1074 (BN . "Boundary Neutral") | |
1075 (B . "Paragraph Separator") | |
1076 (S . "Segment Separator") | |
1077 (WS . "Whitespace") | |
1078 (ON . "Other Neutrals"))))) | |
1079 | |
1080 (defun unidata-describe-decomposition (val) | |
1081 (mapconcat #'(lambda (x) (if (symbolp x) (symbol-name x) (string ?' x ?'))) | |
1082 val " ")) | |
1083 | |
1084 ;; Verify if we can retrieve correct values from the generated | |
1085 ;; char-tables. | |
1086 | |
1087 (defun unidata-check () | |
1088 (dolist (elt unidata-prop-alist) | |
1089 (let* ((prop (car elt)) | |
1090 (index (unidata-prop-index prop)) | |
1091 (generator (unidata-prop-generator prop)) | |
94071
03be13c38b12
(unidata-prop-alist): Fix typo in description of `numeric-value'.
Juanma Barranquero <lekktu@gmail.com>
parents:
91421
diff
changeset
|
1092 (table (progn |
90086 | 1093 (message "Generating %S table..." prop) |
1094 (funcall generator prop))) | |
1095 (decoder (char-table-extra-slot table 1)) | |
1096 (check #x400)) | |
1097 (dolist (e unidata-list) | |
1098 (let ((char (car e)) | |
1099 (val1 (nth index e)) | |
1100 val2) | |
1101 (if (and (stringp val1) (= (length val1) 0)) | |
1102 (setq val1 nil)) | |
1103 (unless (consp char) | |
1104 (setq val2 (funcall decoder char (aref table char) table)) | |
1105 (if val1 | |
1106 (cond ((eq generator 'unidata-gen-table-symbol) | |
1107 (setq val1 (intern val1))) | |
1108 ((eq generator 'unidata-gen-table-integer) | |
90162
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
1109 (setq val1 (string-to-number val1))) |
90086 | 1110 ((eq generator 'unidata-gen-table-character) |
90162
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
1111 (setq val1 (string-to-number val1 16))) |
90086 | 1112 ((eq generator 'unidata-gen-table-decomposition) |
1113 (setq val1 (unidata-split-decomposition val1))))) | |
1114 (when (>= char check) | |
1115 (message "%S %04X" prop check) | |
1116 (setq check (+ check #x400))) | |
1117 (or (equal val1 val2) | |
94071
03be13c38b12
(unidata-prop-alist): Fix typo in description of `numeric-value'.
Juanma Barranquero <lekktu@gmail.com>
parents:
91421
diff
changeset
|
1118 (insert (format "> %04X %S\n< %04X %S\n" |
90086 | 1119 char val1 char val2))) |
1120 (sit-for 0))))))) | |
1121 | |
1122 ;; The entry function. It generates files described in the header | |
1123 ;; comment of this file. | |
1124 | |
90174
f3973ae57d8b
(unidata-text-file): Delete it.
Kenichi Handa <handa@m17n.org>
parents:
90162
diff
changeset
|
1125 (defun unidata-gen-files (&optional unidata-text-file) |
f3973ae57d8b
(unidata-text-file): Delete it.
Kenichi Handa <handa@m17n.org>
parents:
90162
diff
changeset
|
1126 (or unidata-text-file |
f3973ae57d8b
(unidata-text-file): Delete it.
Kenichi Handa <handa@m17n.org>
parents:
90162
diff
changeset
|
1127 (setq unidata-text-file (car command-line-args-left) |
f3973ae57d8b
(unidata-text-file): Delete it.
Kenichi Handa <handa@m17n.org>
parents:
90162
diff
changeset
|
1128 command-line-args-left (cdr command-line-args-left))) |
f3973ae57d8b
(unidata-text-file): Delete it.
Kenichi Handa <handa@m17n.org>
parents:
90162
diff
changeset
|
1129 (unidata-setup-list unidata-text-file) |
91405
30275283f62d
(unidata-gen-files): Force unix line ends.
Jason Rumney <jasonr@gnu.org>
parents:
90176
diff
changeset
|
1130 (let ((coding-system-for-write 'utf-8-unix) |
90086 | 1131 (charprop-file "charprop.el")) |
1132 (with-temp-file charprop-file | |
1133 (insert ";; Automatically generated by unidata-gen.el.\n") | |
1134 (dolist (elt unidata-prop-alist) | |
1135 (let* ((prop (car elt)) | |
1136 (generator (unidata-prop-generator prop)) | |
1137 (file (unidata-prop-file prop)) | |
1138 (docstring (unidata-prop-docstring prop)) | |
1139 (describer (unidata-prop-describer prop)) | |
1140 table) | |
1141 ;; Filename in this comment line is extracted by sed in | |
1142 ;; Makefile. | |
1143 (insert (format ";; FILE: %s\n" file)) | |
1144 (insert (format "(define-char-code-property '%S %S\n %S)\n" | |
1145 prop file docstring)) | |
1146 (with-temp-file file | |
1147 (message "Generating %s..." file) | |
1148 (setq table (funcall generator prop)) | |
1149 (when describer | |
1150 (unless (subrp (symbol-function describer)) | |
1151 (byte-compile describer) | |
1152 (setq describer (symbol-function describer))) | |
1153 (set-char-table-extra-slot table 3 describer)) | |
1154 (insert ";; Automatically generated from UnicodeData.txt.\n" | |
1155 (format "(define-char-code-property '%S %S %S)\n" | |
1156 prop table docstring) | |
90162
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
1157 ";; Local Variables:\n" |
90086 | 1158 ";; coding: utf-8\n" |
1159 ";; no-byte-compile: t\n" | |
1160 ";; End:\n\n" | |
1161 (format ";; %s ends here\n" file))))) | |
1162 (message "Writing %s..." charprop-file) | |
90162
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
1163 (insert ";; Local Variables:\n" |
90086 | 1164 ";; coding: utf-8\n" |
1165 ";; no-byte-compile: t\n" | |
1166 ";; End:\n\n" | |
1167 (format ";; %s ends here\n" charprop-file))))) | |
1168 | |
90162
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
1169 |
88ca83396d15
Typo fixed in comments. Change
Kenichi Handa <handa@m17n.org>
parents:
90112
diff
changeset
|
1170 |
90097
380964b11586
Changes from arch/CVS synchronization
Miles Bader <miles@gnu.org>
parents:
90086
diff
changeset
|
1171 ;; arch-tag: 961c862e-b821-447e-9b8a-bfbab9c2d525 |
90086 | 1172 ;;; unidata-gen.el ends here |