comparison src/libguess/guess.scm @ 2313:3149d4b1a9a9 trunk

[svn] - objective-make autodepend fixes - move all sourcecode into src/ and adjust Makefiles accordingly
author nenolod
date Fri, 12 Jan 2007 11:43:40 -0800
parents
children b474ecb5bde4
comparison
equal deleted inserted replaced
2312:e1a5a66fb9cc 2313:3149d4b1a9a9
1 ;;;
2 ;;; This code is derivative of guess.c of Gauche-0.8.7.
3 ;;; The following is the original copyright notice.
4 ;;;
5
6 ;;;
7 ;;; Auxiliary script to generate japanese code guessing table
8 ;;;
9 ;;; Copyright (c) 2000-2003 Shiro Kawai, All rights reserved.
10 ;;;
11 ;;; Redistribution and use in source and binary forms, with or without
12 ;;; modification, are permitted provided that the following conditions
13 ;;; are met:
14 ;;;
15 ;;; 1. Redistributions of source code must retain the above copyright
16 ;;; notice, this list of conditions and the following disclaimer.
17 ;;;
18 ;;; 2. Redistributions in binary form must reproduce the above copyright
19 ;;; notice, this list of conditions and the following disclaimer in the
20 ;;; documentation and/or other materials provided with the distribution.
21 ;;;
22 ;;; 3. Neither the name of the authors nor the names of its contributors
23 ;;; may be used to endorse or promote products derived from this
24 ;;; software without specific prior written permission.
25 ;;;
26 ;;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27 ;;; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
28 ;;; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
29 ;;; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
30 ;;; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
31 ;;; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
32 ;;; TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 ;;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 ;;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 ;;;
38 ;;; $Id: guess.scm,v 1.3 2003/07/05 03:29:10 shirok Exp $
39 ;;;
40
41 (use srfi-1)
42 (use gauche.sequence)
43
44 ;; This is a simple state machine compiler.
45 ;;
46 ;; <state-machine> : (define-dfa <name> <state> ...)
47 ;; <state> : (<name> (<input-set> <next-state> <score>) ...)
48 ;; <name> : symbol
49 ;; <next-state> : symbol
50 ;; <score> : real
51 ;; <input-set> : (<byte-or-range> ...)
52 ;; <byte-or-range> : <byte> | (<byte> <byte>)
53 ;; <byte> : integer between 0 and #xff | ASCII char
54 ;;
55 ;; When evaluated, the DFA generates a state transition table in
56 ;; C source format.
57
58 (define-class <dfa> ()
59 ((name :init-keyword :name :accessor name-of)
60 (states :init-keyword :states :accessor states-of)
61 (instances :allocation :class :init-value '())))
62
63 (define-class <state> ()
64 ((name :init-keyword :name :accessor name-of)
65 (index :init-keyword :index :accessor index-of)
66 (arcs :init-keyword :arcs :accessor arcs-of :init-value '())))
67
68 (define-class <arc> ()
69 ((from-state :init-keyword :from-state :accessor from-state-of)
70 (to-state :init-keyword :to-state :accessor to-state-of)
71 (ranges :init-keyword :ranges :accessor ranges-of)
72 (index :init-keyword :index :accessor index-of)
73 (score :init-keyword :score :accessor score-of)))
74
75 ;; Create DFA
76
77 (define-syntax define-dfa
78 (syntax-rules ()
79 ((_ name . states)
80 (define name (make <dfa>
81 :name 'name
82 :states (resolve-states 'states))))))
83
84 (define-method initialize ((self <dfa>) initargs)
85 (next-method)
86 (slot-push! self 'instances self))
87
88 (define (all-dfas) (reverse (class-slot-ref <dfa> 'instances)))
89
90 (define (resolve-states state-defs)
91 (let ((states (map (lambda (d i) (make <state> :name (car d) :index i))
92 state-defs
93 (iota (length state-defs)))))
94 (fold (lambda (s d i)
95 (let1 num-arcs (length (cdr d))
96 (set! (arcs-of s)
97 (map (lambda (arc aindex)
98 (make <arc>
99 :from-state s
100 :to-state (or (find (lambda (e)
101 (eq? (name-of e) (cadr arc)))
102 states)
103 (error "no such state" (cadr arc)))
104 :ranges (car arc)
105 :index aindex
106 :score (caddr arc)))
107 (cdr d)
108 (iota num-arcs i)))
109 (+ i num-arcs)))
110 0
111 states state-defs)
112 states))
113
114 ;; Emit state table
115 (define (emit-dfa-table dfa)
116 (format #t "static signed char guess_~a_st[][256] = {\n" (name-of dfa))
117 (for-each emit-state-table (states-of dfa))
118 (print "};\n")
119 (format #t "static guess_arc guess_~a_ar[] = {\n" (name-of dfa))
120 (for-each emit-arc-table
121 (append-map arcs-of (states-of dfa)))
122 (print "};\n")
123 )
124
125 (define (emit-state-table state)
126 (define (b2i byte) ;byte->integer
127 (if (char? byte) (char->integer byte) byte))
128 (let1 arc-vec (make-vector 256 -1)
129 (dolist (br (arcs-of state))
130 (dolist (range (ranges-of br))
131 (if (pair? range)
132 (vector-fill! arc-vec (index-of br)
133 (b2i (car range)) (+ (b2i (cadr range)) 1))
134 (set! (ref arc-vec (b2i range)) (index-of br)))))
135 (format #t " { /* state ~a */" (name-of state))
136 (dotimes (i 256)
137 (when (zero? (modulo i 16)) (newline))
138 (format #t " ~2d," (ref arc-vec i)))
139 (print "\n },")
140 ))
141
142 (define (emit-arc-table arc)
143 (format #t " { ~2d, ~5s }, /* ~a -> ~a */\n"
144 (index-of (to-state-of arc))
145 (score-of arc)
146 (name-of (from-state-of arc))
147 (name-of (to-state-of arc))))
148 ;;
149 ;; main
150 ;;
151
152 (define (main args)
153 (unless (= (length args) 2)
154 (error "usage: ~a <outout-file.c>" (car args)))
155 (with-output-to-file (cadr args)
156 (lambda ()
157 (print "/* State transition table for character code guessing */")
158 (print "/* This file is automatically generated by guess.scm */")
159 (newline)
160 (for-each emit-dfa-table (all-dfas))))
161 0)
162
163 ;;;============================================================
164 ;;; DFA definitions
165 ;;;
166
167 ;;;
168 ;;; EUC-JP
169 ;;;
170
171 (define-dfa eucj
172 ;; first byte
173 (init
174 (((#x00 #x7f)) init 1.0) ; ASCII range
175 ((#x8e) jis0201_kana 0.8) ; JISX 0201 kana
176 ((#x8f) jis0213_2 0.95) ; JISX 0213 plane 2
177 (((#xa1 #xfe)) jis0213_1 1.0) ; JISX 0213 plane 1
178 )
179 ;; jis x 0201 kana
180 (jis0201_kana
181 (((#xa1 #xdf)) init 1.0)
182 )
183 ;; jis x 0208 and jis x 0213 plane 1
184 (jis0213_1
185 (((#xa1 #xfe)) init 1.0))
186 ;; jis x 0213 plane 2
187 (jis0213_2
188 (((#xa1 #xfe)) init 1.0))
189 )
190
191 ;;;
192 ;;; Shift_JIS
193 ;;;
194
195 (define-dfa sjis
196 ;; first byte
197 (init
198 (((#x00 #x7f)) init 1.0) ;ascii
199 (((#x81 #x9f) (#xe1 #xef)) jis0213 1.0) ;jisx0213 plane 1
200 (((#xa1 #xdf)) init 0.8) ;jisx0201 kana
201 (((#xf0 #xfc)) jis0213 0.95) ;jisx0213 plane 2
202 (((#xfd #xff)) init 0.8)) ;vendor extension
203 (jis0213
204 (((#x40 #x7e) (#x80 #xfc)) init 1.0))
205 )
206
207 ;;;
208 ;;; UTF-8
209 ;;;
210
211 (define-dfa utf8
212 (init
213 (((#x00 #x7f)) init 1.0)
214 (((#xc2 #xdf)) 1byte_more 1.0)
215 (((#xe0 #xef)) 2byte_more 1.0)
216 (((#xf0 #xf7)) 3byte_more 1.0)
217 (((#xf8 #xfb)) 4byte_more 1.0)
218 (((#xfc #xfd)) 5byte_more 1.0))
219 (1byte_more
220 (((#x80 #xbf)) init 1.0))
221 (2byte_more
222 (((#x80 #xbf)) 1byte_more 1.0))
223 (3byte_more
224 (((#x80 #xbf)) 2byte_more 1.0))
225 (4byte_more
226 (((#x80 #xbf)) 3byte_more 1.0))
227 (5byte_more
228 (((#x80 #xbf)) 4byte_more 1.0))
229 )
230
231 ;;;
232 ;;; UCS-2LE
233 ;;;
234
235 (define-dfa ucs2le
236 (init
237 ((#xff) le 1.0)
238 (((#x00 #x7f)) ascii 1.0)
239 (((#x00 #xff)) multi 1.0))
240 (le
241 ((#xfe) init 1.0))
242 (ascii
243 ((#x00) init 1.0))
244 (multi
245 (((#x00 #xff)) init 1.0)))
246
247 ;;;
248 ;;; UCS-2BE
249 ;;;
250 (define-dfa ucs2be
251 (init
252 ((#xfe) be 1.0)
253 ((#x00) ascii 1.0)
254 (((#x00 #xff)) multi 1.0))
255 (be
256 ((#xff) init 1.0))
257 (ascii
258 (((#x00 #x7f)) init 1.0))
259 (multi
260 (((#x00 #xff)) init 1.0)))
261
262
263 ;;;
264 ;;; JIS (ISO2022JP)
265 ;;;
266
267 ;; NB: for now, we just check the sequence of <ESC> $ or <ESC> '('.
268 '(define-dfa jis
269 (init
270 ((#x1b) esc 1.0)
271 (((#x00 #x1a) (#x1c #x1f)) init 1.0) ;C0
272 (((#x20 #x7f)) init 1.0) ;ASCII
273 (((#xa1 #xdf)) init 0.7) ;JIS8bit kana
274 )
275 (esc
276 ((#x0d #x0a) init 0.9) ;cancel
277 ((#\( ) esc-paren 1.0)
278 ((#\$ ) esc-$ 1.0)
279 ((#\& ) esc-& 1.0)
280 )
281 (esc-paren
282 ((#\B #\J #\H) init 1.0)
283 ((#\I) jis0201kana 0.8)
284 )
285 (esc-$
286 ((#\@ #\B) kanji 1.0)
287 ((#\( ) esc-$-paren 1.0)
288 )
289 (esc-$-paren
290 ((#\D #\O #\P) kanji 1.0))
291 (esc-&
292 ((#\@ ) init 1.0))
293 (jis0201kana
294 ((#x1b) esc 1.0)
295 (((#x20 #x5f)) jis0201kana 1.0))
296 (kanji
297 ((#x1b) esc 1.0)
298 (((#x21 #x7e)) kanji-2 1.0))
299 (kanji-2
300 (((#x21 #x7e)) kanji 1.0))
301 )
302
303 ;;;
304 ;;; Big5
305 ;;;
306
307 (define-dfa big5
308 ;; first byte
309 (init
310 (((#x00 #x7f)) init 1.0) ;ascii
311 (((#xa1 #xfe)) 2byte 1.0) ;big5-2byte
312 )
313 (2byte
314 (((#x40 #x7e) (#xa1 #xfe)) init 1.0))
315 )
316
317 ;;;
318 ;;; GB2312 (EUC-CN?)
319 ;;;
320
321 (define-dfa gb2312
322 ;; first byte
323 (init
324 (((#x00 #x7f)) init 1.0) ;ascii
325 (((#xa1 #xfe)) 2byte 1.0) ;gb2312 2byte
326 )
327 (2byte
328 (((#xa1 #xfe)) init 1.0))
329 )
330
331 ;;;
332 ;;; GB18030
333 ;;;
334
335 (define-dfa gb18030
336 ;; first byte
337 (init
338 (((#x00 #x80)) init 1.0) ;ascii
339 (((#x81 #xfe)) 2byte 1.0) ;gb18030 2byte
340 (((#x81 #xfe)) 4byte2 1.0) ;gb18030 2byte
341 )
342 (2byte
343 (((#x40 #x7e) (#x80 #xfe)) init 1.0))
344 (4byte2
345 (((#x30 #x39)) 4byte3 1.0))
346 (4byte3
347 (((#x81 #xfe)) 4byte4 1.0))
348 (4byte4
349 (((#x30 #x39)) init 1.0))
350 )
351
352 ;;;
353 ;;; EUC-KR
354 ;;;
355
356 (define-dfa euck
357 ;; first byte
358 (init
359 (((#x00 #x7f)) init 1.0) ; ASCII range
360 (((#xa1 #xfe)) ks1001 1.0) ; KSX 1001
361 )
362 ;; ks x 1001
363 (ks1001
364 (((#xa1 #xfe)) init 1.0))
365 )
366
367 ;;;
368 ;;; Johab
369 ;;;
370
371 (define-dfa johab
372 ;; first byte
373 (init
374 (((#x00 #x7f)) init 1.0) ; ASCII range
375 (((#x84 #xd3)) jamo51 1.0) ; jamo51
376 (((#xd8 #xde) (#xe0 #xf9)) jamo42 0.95) ; jamo42
377 )
378 ;; second byte
379 (jamo51
380 (((#x41 #x7e) (#x81 #xfe)) init 1.0))
381 (jamo42
382 (((#x31 #x7e) (#x91 #xfe)) init 1.0))
383 )
384