Mercurial > audlegacy
comparison src/libguess/guess.scm @ 2313:3149d4b1a9a9 trunk
[svn] - objective-make autodepend fixes
- move all sourcecode into src/ and adjust Makefiles accordingly
author | nenolod |
---|---|
date | Fri, 12 Jan 2007 11:43:40 -0800 |
parents | |
children | b474ecb5bde4 |
comparison
equal
deleted
inserted
replaced
2312:e1a5a66fb9cc | 2313:3149d4b1a9a9 |
---|---|
1 ;;; | |
2 ;;; This code is derivative of guess.c of Gauche-0.8.7. | |
3 ;;; The following is the original copyright notice. | |
4 ;;; | |
5 | |
6 ;;; | |
7 ;;; Auxiliary script to generate japanese code guessing table | |
8 ;;; | |
9 ;;; Copyright (c) 2000-2003 Shiro Kawai, All rights reserved. | |
10 ;;; | |
11 ;;; Redistribution and use in source and binary forms, with or without | |
12 ;;; modification, are permitted provided that the following conditions | |
13 ;;; are met: | |
14 ;;; | |
15 ;;; 1. Redistributions of source code must retain the above copyright | |
16 ;;; notice, this list of conditions and the following disclaimer. | |
17 ;;; | |
18 ;;; 2. Redistributions in binary form must reproduce the above copyright | |
19 ;;; notice, this list of conditions and the following disclaimer in the | |
20 ;;; documentation and/or other materials provided with the distribution. | |
21 ;;; | |
22 ;;; 3. Neither the name of the authors nor the names of its contributors | |
23 ;;; may be used to endorse or promote products derived from this | |
24 ;;; software without specific prior written permission. | |
25 ;;; | |
26 ;;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
27 ;;; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
28 ;;; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
29 ;;; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
30 ;;; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
31 ;;; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED | |
32 ;;; TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
33 ;;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | |
34 ;;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | |
35 ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
36 ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
37 ;;; | |
38 ;;; $Id: guess.scm,v 1.3 2003/07/05 03:29:10 shirok Exp $ | |
39 ;;; | |
40 | |
41 (use srfi-1) | |
42 (use gauche.sequence) | |
43 | |
44 ;; This is a simple state machine compiler. | |
45 ;; | |
46 ;; <state-machine> : (define-dfa <name> <state> ...) | |
47 ;; <state> : (<name> (<input-set> <next-state> <score>) ...) | |
48 ;; <name> : symbol | |
49 ;; <next-state> : symbol | |
50 ;; <score> : real | |
51 ;; <input-set> : (<byte-or-range> ...) | |
52 ;; <byte-or-range> : <byte> | (<byte> <byte>) | |
53 ;; <byte> : integer between 0 and #xff | ASCII char | |
54 ;; | |
55 ;; When evaluated, the DFA generates a state transition table in | |
56 ;; C source format. | |
57 | |
58 (define-class <dfa> () | |
59 ((name :init-keyword :name :accessor name-of) | |
60 (states :init-keyword :states :accessor states-of) | |
61 (instances :allocation :class :init-value '()))) | |
62 | |
63 (define-class <state> () | |
64 ((name :init-keyword :name :accessor name-of) | |
65 (index :init-keyword :index :accessor index-of) | |
66 (arcs :init-keyword :arcs :accessor arcs-of :init-value '()))) | |
67 | |
68 (define-class <arc> () | |
69 ((from-state :init-keyword :from-state :accessor from-state-of) | |
70 (to-state :init-keyword :to-state :accessor to-state-of) | |
71 (ranges :init-keyword :ranges :accessor ranges-of) | |
72 (index :init-keyword :index :accessor index-of) | |
73 (score :init-keyword :score :accessor score-of))) | |
74 | |
75 ;; Create DFA | |
76 | |
77 (define-syntax define-dfa | |
78 (syntax-rules () | |
79 ((_ name . states) | |
80 (define name (make <dfa> | |
81 :name 'name | |
82 :states (resolve-states 'states)))))) | |
83 | |
84 (define-method initialize ((self <dfa>) initargs) | |
85 (next-method) | |
86 (slot-push! self 'instances self)) | |
87 | |
88 (define (all-dfas) (reverse (class-slot-ref <dfa> 'instances))) | |
89 | |
90 (define (resolve-states state-defs) | |
91 (let ((states (map (lambda (d i) (make <state> :name (car d) :index i)) | |
92 state-defs | |
93 (iota (length state-defs))))) | |
94 (fold (lambda (s d i) | |
95 (let1 num-arcs (length (cdr d)) | |
96 (set! (arcs-of s) | |
97 (map (lambda (arc aindex) | |
98 (make <arc> | |
99 :from-state s | |
100 :to-state (or (find (lambda (e) | |
101 (eq? (name-of e) (cadr arc))) | |
102 states) | |
103 (error "no such state" (cadr arc))) | |
104 :ranges (car arc) | |
105 :index aindex | |
106 :score (caddr arc))) | |
107 (cdr d) | |
108 (iota num-arcs i))) | |
109 (+ i num-arcs))) | |
110 0 | |
111 states state-defs) | |
112 states)) | |
113 | |
114 ;; Emit state table | |
115 (define (emit-dfa-table dfa) | |
116 (format #t "static signed char guess_~a_st[][256] = {\n" (name-of dfa)) | |
117 (for-each emit-state-table (states-of dfa)) | |
118 (print "};\n") | |
119 (format #t "static guess_arc guess_~a_ar[] = {\n" (name-of dfa)) | |
120 (for-each emit-arc-table | |
121 (append-map arcs-of (states-of dfa))) | |
122 (print "};\n") | |
123 ) | |
124 | |
125 (define (emit-state-table state) | |
126 (define (b2i byte) ;byte->integer | |
127 (if (char? byte) (char->integer byte) byte)) | |
128 (let1 arc-vec (make-vector 256 -1) | |
129 (dolist (br (arcs-of state)) | |
130 (dolist (range (ranges-of br)) | |
131 (if (pair? range) | |
132 (vector-fill! arc-vec (index-of br) | |
133 (b2i (car range)) (+ (b2i (cadr range)) 1)) | |
134 (set! (ref arc-vec (b2i range)) (index-of br))))) | |
135 (format #t " { /* state ~a */" (name-of state)) | |
136 (dotimes (i 256) | |
137 (when (zero? (modulo i 16)) (newline)) | |
138 (format #t " ~2d," (ref arc-vec i))) | |
139 (print "\n },") | |
140 )) | |
141 | |
142 (define (emit-arc-table arc) | |
143 (format #t " { ~2d, ~5s }, /* ~a -> ~a */\n" | |
144 (index-of (to-state-of arc)) | |
145 (score-of arc) | |
146 (name-of (from-state-of arc)) | |
147 (name-of (to-state-of arc)))) | |
148 ;; | |
149 ;; main | |
150 ;; | |
151 | |
152 (define (main args) | |
153 (unless (= (length args) 2) | |
154 (error "usage: ~a <outout-file.c>" (car args))) | |
155 (with-output-to-file (cadr args) | |
156 (lambda () | |
157 (print "/* State transition table for character code guessing */") | |
158 (print "/* This file is automatically generated by guess.scm */") | |
159 (newline) | |
160 (for-each emit-dfa-table (all-dfas)))) | |
161 0) | |
162 | |
163 ;;;============================================================ | |
164 ;;; DFA definitions | |
165 ;;; | |
166 | |
167 ;;; | |
168 ;;; EUC-JP | |
169 ;;; | |
170 | |
171 (define-dfa eucj | |
172 ;; first byte | |
173 (init | |
174 (((#x00 #x7f)) init 1.0) ; ASCII range | |
175 ((#x8e) jis0201_kana 0.8) ; JISX 0201 kana | |
176 ((#x8f) jis0213_2 0.95) ; JISX 0213 plane 2 | |
177 (((#xa1 #xfe)) jis0213_1 1.0) ; JISX 0213 plane 1 | |
178 ) | |
179 ;; jis x 0201 kana | |
180 (jis0201_kana | |
181 (((#xa1 #xdf)) init 1.0) | |
182 ) | |
183 ;; jis x 0208 and jis x 0213 plane 1 | |
184 (jis0213_1 | |
185 (((#xa1 #xfe)) init 1.0)) | |
186 ;; jis x 0213 plane 2 | |
187 (jis0213_2 | |
188 (((#xa1 #xfe)) init 1.0)) | |
189 ) | |
190 | |
191 ;;; | |
192 ;;; Shift_JIS | |
193 ;;; | |
194 | |
195 (define-dfa sjis | |
196 ;; first byte | |
197 (init | |
198 (((#x00 #x7f)) init 1.0) ;ascii | |
199 (((#x81 #x9f) (#xe1 #xef)) jis0213 1.0) ;jisx0213 plane 1 | |
200 (((#xa1 #xdf)) init 0.8) ;jisx0201 kana | |
201 (((#xf0 #xfc)) jis0213 0.95) ;jisx0213 plane 2 | |
202 (((#xfd #xff)) init 0.8)) ;vendor extension | |
203 (jis0213 | |
204 (((#x40 #x7e) (#x80 #xfc)) init 1.0)) | |
205 ) | |
206 | |
207 ;;; | |
208 ;;; UTF-8 | |
209 ;;; | |
210 | |
211 (define-dfa utf8 | |
212 (init | |
213 (((#x00 #x7f)) init 1.0) | |
214 (((#xc2 #xdf)) 1byte_more 1.0) | |
215 (((#xe0 #xef)) 2byte_more 1.0) | |
216 (((#xf0 #xf7)) 3byte_more 1.0) | |
217 (((#xf8 #xfb)) 4byte_more 1.0) | |
218 (((#xfc #xfd)) 5byte_more 1.0)) | |
219 (1byte_more | |
220 (((#x80 #xbf)) init 1.0)) | |
221 (2byte_more | |
222 (((#x80 #xbf)) 1byte_more 1.0)) | |
223 (3byte_more | |
224 (((#x80 #xbf)) 2byte_more 1.0)) | |
225 (4byte_more | |
226 (((#x80 #xbf)) 3byte_more 1.0)) | |
227 (5byte_more | |
228 (((#x80 #xbf)) 4byte_more 1.0)) | |
229 ) | |
230 | |
231 ;;; | |
232 ;;; UCS-2LE | |
233 ;;; | |
234 | |
235 (define-dfa ucs2le | |
236 (init | |
237 ((#xff) le 1.0) | |
238 (((#x00 #x7f)) ascii 1.0) | |
239 (((#x00 #xff)) multi 1.0)) | |
240 (le | |
241 ((#xfe) init 1.0)) | |
242 (ascii | |
243 ((#x00) init 1.0)) | |
244 (multi | |
245 (((#x00 #xff)) init 1.0))) | |
246 | |
247 ;;; | |
248 ;;; UCS-2BE | |
249 ;;; | |
250 (define-dfa ucs2be | |
251 (init | |
252 ((#xfe) be 1.0) | |
253 ((#x00) ascii 1.0) | |
254 (((#x00 #xff)) multi 1.0)) | |
255 (be | |
256 ((#xff) init 1.0)) | |
257 (ascii | |
258 (((#x00 #x7f)) init 1.0)) | |
259 (multi | |
260 (((#x00 #xff)) init 1.0))) | |
261 | |
262 | |
263 ;;; | |
264 ;;; JIS (ISO2022JP) | |
265 ;;; | |
266 | |
267 ;; NB: for now, we just check the sequence of <ESC> $ or <ESC> '('. | |
268 '(define-dfa jis | |
269 (init | |
270 ((#x1b) esc 1.0) | |
271 (((#x00 #x1a) (#x1c #x1f)) init 1.0) ;C0 | |
272 (((#x20 #x7f)) init 1.0) ;ASCII | |
273 (((#xa1 #xdf)) init 0.7) ;JIS8bit kana | |
274 ) | |
275 (esc | |
276 ((#x0d #x0a) init 0.9) ;cancel | |
277 ((#\( ) esc-paren 1.0) | |
278 ((#\$ ) esc-$ 1.0) | |
279 ((#\& ) esc-& 1.0) | |
280 ) | |
281 (esc-paren | |
282 ((#\B #\J #\H) init 1.0) | |
283 ((#\I) jis0201kana 0.8) | |
284 ) | |
285 (esc-$ | |
286 ((#\@ #\B) kanji 1.0) | |
287 ((#\( ) esc-$-paren 1.0) | |
288 ) | |
289 (esc-$-paren | |
290 ((#\D #\O #\P) kanji 1.0)) | |
291 (esc-& | |
292 ((#\@ ) init 1.0)) | |
293 (jis0201kana | |
294 ((#x1b) esc 1.0) | |
295 (((#x20 #x5f)) jis0201kana 1.0)) | |
296 (kanji | |
297 ((#x1b) esc 1.0) | |
298 (((#x21 #x7e)) kanji-2 1.0)) | |
299 (kanji-2 | |
300 (((#x21 #x7e)) kanji 1.0)) | |
301 ) | |
302 | |
303 ;;; | |
304 ;;; Big5 | |
305 ;;; | |
306 | |
307 (define-dfa big5 | |
308 ;; first byte | |
309 (init | |
310 (((#x00 #x7f)) init 1.0) ;ascii | |
311 (((#xa1 #xfe)) 2byte 1.0) ;big5-2byte | |
312 ) | |
313 (2byte | |
314 (((#x40 #x7e) (#xa1 #xfe)) init 1.0)) | |
315 ) | |
316 | |
317 ;;; | |
318 ;;; GB2312 (EUC-CN?) | |
319 ;;; | |
320 | |
321 (define-dfa gb2312 | |
322 ;; first byte | |
323 (init | |
324 (((#x00 #x7f)) init 1.0) ;ascii | |
325 (((#xa1 #xfe)) 2byte 1.0) ;gb2312 2byte | |
326 ) | |
327 (2byte | |
328 (((#xa1 #xfe)) init 1.0)) | |
329 ) | |
330 | |
331 ;;; | |
332 ;;; GB18030 | |
333 ;;; | |
334 | |
335 (define-dfa gb18030 | |
336 ;; first byte | |
337 (init | |
338 (((#x00 #x80)) init 1.0) ;ascii | |
339 (((#x81 #xfe)) 2byte 1.0) ;gb18030 2byte | |
340 (((#x81 #xfe)) 4byte2 1.0) ;gb18030 2byte | |
341 ) | |
342 (2byte | |
343 (((#x40 #x7e) (#x80 #xfe)) init 1.0)) | |
344 (4byte2 | |
345 (((#x30 #x39)) 4byte3 1.0)) | |
346 (4byte3 | |
347 (((#x81 #xfe)) 4byte4 1.0)) | |
348 (4byte4 | |
349 (((#x30 #x39)) init 1.0)) | |
350 ) | |
351 | |
352 ;;; | |
353 ;;; EUC-KR | |
354 ;;; | |
355 | |
356 (define-dfa euck | |
357 ;; first byte | |
358 (init | |
359 (((#x00 #x7f)) init 1.0) ; ASCII range | |
360 (((#xa1 #xfe)) ks1001 1.0) ; KSX 1001 | |
361 ) | |
362 ;; ks x 1001 | |
363 (ks1001 | |
364 (((#xa1 #xfe)) init 1.0)) | |
365 ) | |
366 | |
367 ;;; | |
368 ;;; Johab | |
369 ;;; | |
370 | |
371 (define-dfa johab | |
372 ;; first byte | |
373 (init | |
374 (((#x00 #x7f)) init 1.0) ; ASCII range | |
375 (((#x84 #xd3)) jamo51 1.0) ; jamo51 | |
376 (((#xd8 #xde) (#xe0 #xf9)) jamo42 0.95) ; jamo42 | |
377 ) | |
378 ;; second byte | |
379 (jamo51 | |
380 (((#x41 #x7e) (#x81 #xfe)) init 1.0)) | |
381 (jamo42 | |
382 (((#x31 #x7e) (#x91 #xfe)) init 1.0)) | |
383 ) | |
384 |