comparison guess.scm @ 0:d9b6ff839eab

initial import
author Yoshiki Yazawa <yaz@cc.rim.or.jp>
date Fri, 30 Nov 2007 19:34:51 +0900
parents
children 754a4550c64e
comparison
equal deleted inserted replaced
-1:000000000000 0:d9b6ff839eab
1 ;;;
2 ;;; This code is derivative of guess.c of Gauche-0.8.7.
3 ;;; The following is the original copyright notice.
4 ;;;
5
6 ;;;
7 ;;; Auxiliary script to generate japanese code guessing table
8 ;;;
9 ;;; Copyright (c) 2000-2003 Shiro Kawai, All rights reserved.
10 ;;;
11 ;;; Redistribution and use in source and binary forms, with or without
12 ;;; modification, are permitted provided that the following conditions
13 ;;; are met:
14 ;;;
15 ;;; 1. Redistributions of source code must retain the above copyright
16 ;;; notice, this list of conditions and the following disclaimer.
17 ;;;
18 ;;; 2. Redistributions in binary form must reproduce the above copyright
19 ;;; notice, this list of conditions and the following disclaimer in the
20 ;;; documentation and/or other materials provided with the distribution.
21 ;;;
22 ;;; 3. Neither the name of the authors nor the names of its contributors
23 ;;; may be used to endorse or promote products derived from this
24 ;;; software without specific prior written permission.
25 ;;;
26 ;;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27 ;;; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
28 ;;; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
29 ;;; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
30 ;;; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
31 ;;; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
32 ;;; TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 ;;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 ;;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 ;;;
38 ;;; $Id: guess.scm,v 1.3 2003/07/05 03:29:10 shirok Exp $
39 ;;;
40
41 (use srfi-1)
42 (use gauche.sequence)
43
44 ;; This is a simple state machine compiler.
45 ;;
46 ;; <state-machine> : (define-dfa <name> <state> ...)
47 ;; <state> : (<name> (<input-set> <next-state> <score>) ...)
48 ;; <name> : symbol
49 ;; <next-state> : symbol
50 ;; <score> : real
51 ;; <input-set> : (<byte-or-range> ...)
52 ;; <byte-or-range> : <byte> | (<byte> <byte>)
53 ;; <byte> : integer between 0 and #xff | ASCII char
54 ;;
55 ;; When evaluated, the DFA generates a state transition table in
56 ;; C source format.
57
;; <dfa> : one named state machine.
;;   name      - value supplied with :name (define-dfa passes the DFA's symbol)
;;   states    - value supplied with :states (define-dfa passes the list
;;               returned by resolve-states)
;;   instances - class-allocated slot, initially the empty list; it is shared
;;               by every <dfa> instance rather than stored per instance.
(define-class <dfa> ()
  ((name      :accessor name-of   :init-keyword :name)
   (states    :accessor states-of :init-keyword :states)
   (instances :init-value '()     :allocation :class)))
62
;; <state> : one state of a DFA.
;;   name  - value supplied with :name
;;   index - value supplied with :index
;;   arcs  - value supplied with :arcs; defaults to the empty list
(define-class <state> ()
  ((name  :accessor name-of  :init-keyword :name)
   (index :accessor index-of :init-keyword :index)
   (arcs  :accessor arcs-of  :init-value '() :init-keyword :arcs)))
67
;; <arc> : one transition of a DFA.
;;   from-state - value supplied with :from-state
;;   to-state   - value supplied with :to-state
;;   ranges     - value supplied with :ranges
;;   index      - value supplied with :index
;;   score      - value supplied with :score
(define-class <arc> ()
  ((from-state :accessor from-state-of :init-keyword :from-state)
   (to-state   :accessor to-state-of   :init-keyword :to-state)
   (ranges     :accessor ranges-of     :init-keyword :ranges)
   (index      :accessor index-of      :init-keyword :index)
   (score      :accessor score-of      :init-keyword :score)))
74
75 ;; Create DFA
76
;; (define-dfa <name> <state> ...)
;; Binds <name> to a new <dfa> whose name is the symbol <name> and whose
;; states are obtained by passing the quoted list of <state> forms to
;; resolve-states.
(define-syntax define-dfa
  (syntax-rules ()
    ((_ dfa-name state-def ...)
     (define dfa-name
       (make <dfa>
         :name   'dfa-name
         :states (resolve-states '(state-def ...)))))))
83
;; After the default initialization, prepend the new DFA to the
;; class-allocated `instances' slot so that it can be enumerated later.
(define-method initialize ((self <dfa>) initargs)
  (next-method)
  (slot-set! self 'instances
             (cons self (slot-ref self 'instances))))
87
;; Returns the contents of <dfa>'s class slot `instances', reversed.
;; initialize pushes each new DFA onto the front of that list, so the
;; reversed list is in creation order.
(define (all-dfas)
  (let1 newest-first (class-slot-ref <dfa> 'instances)
    (reverse newest-first)))
89
;; Turns a list of state definitions
;;   (<name> (<input-set> <next-state> <score>) ...)
;; into a list of <state> objects, in the same order.
;;
;; States are indexed 0, 1, 2, ... by their position in STATE-DEFS.
;; Arcs are indexed with a single counter that runs across all states:
;; the first state's arcs get 0 .. k-1, the next state's arcs continue
;; from k, and so on.  An arc that names an undefined next state
;; signals an error.
(define (resolve-states state-defs)
  ;; Pass 1: create every state up front so that arcs may refer to
  ;; states that are defined later in the list.
  (define states
    (map (lambda (def position)
           (make <state> :name (car def) :index position))
         state-defs
         (iota (length state-defs))))

  ;; Find the state object called NAME, or signal an error.
  (define (lookup-state name)
    (or (find (lambda (candidate) (eq? (name-of candidate) name)) states)
        (error "no such state" name)))

  ;; Pass 2: walk states and definitions in parallel, threading the
  ;; next free arc index through the loop.
  (let loop ((remaining-states states)
             (remaining-defs   state-defs)
             (next-arc-index   0))
    (unless (or (null? remaining-states) (null? remaining-defs))
      (let* ((state    (car remaining-states))
             (arc-defs (cdr (car remaining-defs)))
             (count    (length arc-defs)))
        (set! (arcs-of state)
              (map (lambda (arc-def arc-index)
                     (make <arc>
                       :from-state state
                       :to-state   (lookup-state (cadr arc-def))
                       :ranges     (car arc-def)
                       :index      arc-index
                       :score      (caddr arc-def)))
                   arc-defs
                   (iota count next-arc-index)))
        (loop (cdr remaining-states)
              (cdr remaining-defs)
              (+ next-arc-index count)))))
  states)
113
114 ;; Emit state table
;; Writes the two C tables for DFA to the current output port:
;;   guess_<name>_st : one 256-entry row per state (see emit-state-table)
;;   guess_<name>_ar : one entry per arc, state by state (see emit-arc-table)
(define (emit-dfa-table dfa)
  (let ((tag        (name-of dfa))
        (all-states (states-of dfa)))
    ;; state table
    (format #t "static signed char guess_~a_st[][256] = {\n" tag)
    (for-each emit-state-table all-states)
    (print "};\n")
    ;; arc table: arcs are emitted in state order, then in arc order
    (format #t "static guess_arc guess_~a_ar[] = {\n" tag)
    (for-each (lambda (st) (for-each emit-arc-table (arcs-of st)))
              all-states)
    (print "};\n")))
124
;; Writes one row of the state table for STATE: 256 entries, one per
;; input byte, each holding the index of the arc taken on that byte or
;; -1 when no arc of STATE covers it.  Arcs are applied in order, so if
;; two arcs cover the same byte the later one wins.
(define (emit-state-table state)
  ;; A <byte> may be written as an integer or as a character.
  (define (byte->code b)
    (if (char? b) (char->integer b) b))

  (let ((row (make-vector 256 -1)))
    (for-each
     (lambda (arc)
       (let ((arc-id (index-of arc)))
         (for-each
          (lambda (spec)
            (if (pair? spec)
                ;; (lo hi): inclusive range, hence the +1 on the end bound
                (vector-fill! row arc-id
                              (byte->code (car spec))
                              (+ 1 (byte->code (cadr spec))))
                ;; single byte
                (vector-set! row (byte->code spec) arc-id)))
          (ranges-of arc))))
     (arcs-of state))
    (format #t " { /* state ~a */" (name-of state))
    ;; 16 entries per output line
    (let loop ((i 0))
      (when (< i 256)
        (when (= 0 (modulo i 16)) (newline))
        (format #t " ~2d," (vector-ref row i))
        (loop (+ i 1))))
    (print "\n },")))
141
;; Writes one entry of the arc table for ARC: the index of the
;; destination state and the arc's score, followed by a C comment
;; naming the source and destination states.
(define (emit-arc-table arc)
  (let* ((source (from-state-of arc))
         (target (to-state-of arc)))
    (format #t " { ~2d, ~5s }, /* ~a -> ~a */\n"
            (index-of target)
            (score-of arc)
            (name-of source)
            (name-of target))))
148 ;;
149 ;; main
150 ;;
151
;; Script entry point.
;;   args : (<program-name> <output-file.c>)
;; Writes the state and arc tables of every DFA defined in this file to
;; the named output file and returns 0.  Signals an error carrying a
;; usage message when the argument count is wrong.
(define (main args)
  (unless (= (length args) 2)
    ;; errorf (not error) is required here: error does not interpret
    ;; format directives, so "~a" would be shown literally.
    (errorf "usage: ~a <output-file.c>" (car args)))
  (with-output-to-file (cadr args)
    (lambda ()
      (print "/* State transition table for character code guessing */")
      (print "/* This file is automatically generated by guess.scm */")
      (newline)
      (for-each emit-dfa-table (all-dfas))))
  0)
162
163 ;;;============================================================
164 ;;; DFA definitions
165 ;;;
166
167 ;;;
168 ;;; EUC-JP
169 ;;;
170
;; EUC-JP recognizer.
;; A character is one of:
;;   - a single byte 0x00-0x7f
;;   - 0x8e followed by one byte 0xa1-0xdf             (JIS X 0201 kana)
;;   - 0x8f followed by two bytes, each 0xa1-0xfe      (JIS X 0213 plane 2)
;;   - two bytes, each 0xa1-0xfe                       (JIS X 0208 / 0213 plane 1)
(define-dfa eucj
  ;; first byte
  (init
   (((#x00 #x7f)) init         1.0)   ; ASCII range
   ((#x8e)        jis0201_kana 0.8)   ; JISX 0201 kana
   ((#x8f)        jis0213_2    0.95)  ; JISX 0213 plane 2
   (((#xa1 #xfe)) jis0213_1    1.0)   ; JISX 0213 plane 1
   )
  ;; jis x 0201 kana
  (jis0201_kana
   (((#xa1 #xdf)) init         1.0)
   )
  ;; jis x 0208 and jis x 0213 plane 1: last byte of the character
  (jis0213_1
   (((#xa1 #xfe)) init         1.0))
  ;; jis x 0213 plane 2: this is the first of the two bytes that follow
  ;; 0x8f, so one more byte must still be consumed.  Going straight back
  ;; to init here would leave the DFA one byte out of step and make it
  ;; reject a valid 3-byte character followed by ASCII.
  (jis0213_2
   (((#xa1 #xfe)) jis0213_1    1.0))
  )
190
191 ;;;
192 ;;; Shift_JIS
193 ;;;
194
;; Shift_JIS recognizer.
;; A character is either a single byte (ASCII, JIS X 0201 kana, or one
;; of the bytes 0xfd-0xff) or a lead byte followed by one trail byte
;; 0x40-0x7e / 0x80-0xfc.
(define-dfa sjis
  ;; first byte
  (init
   (((#x00 #x7f)) init              1.0)   ;ascii
   ;; Double-byte lead bytes are 0x81-0x9f and 0xe0-0xef.  The range
   ;; used to start at 0xe1, which left 0xe0 without any arc and made
   ;; the DFA reject valid text containing a 0xe0-led character.
   (((#x81 #x9f) (#xe0 #xef)) jis0213 1.0) ;jisx0213 plane 1
   (((#xa1 #xdf)) init              0.8)   ;jisx0201 kana
   (((#xf0 #xfc)) jis0213           0.95)  ;jisx0213 plane 2
   (((#xfd #xff)) init              0.8))  ;vendor extension
  ;; second byte
  (jis0213
   (((#x40 #x7e) (#x80 #xfc)) init  1.0))
  )
206
207 ;;;
208 ;;; UTF-8
209 ;;;
210
;; UTF-8 recognizer.
;; The lead byte selects how many continuation bytes (0x80-0xbf) must
;; follow; each N-byte_more state consumes one continuation byte and
;; moves to the state expecting one fewer.  Lead bytes up to 0xfd
;; (sequences of up to six bytes) are accepted.
(define-dfa utf8
  ;; lead byte
  (init
   (((#x00 #x7f)) init       1.0)     ; single byte
   (((#xc2 #xdf)) 1byte_more 1.0)     ; 1 continuation byte follows
   (((#xe0 #xef)) 2byte_more 1.0)     ; 2 continuation bytes follow
   (((#xf0 #xf7)) 3byte_more 1.0)     ; 3 continuation bytes follow
   (((#xf8 #xfb)) 4byte_more 1.0)     ; 4 continuation bytes follow
   (((#xfc #xfd)) 5byte_more 1.0))    ; 5 continuation bytes follow
  ;; continuation bytes
  (1byte_more
   (((#x80 #xbf)) init       1.0))
  (2byte_more
   (((#x80 #xbf)) 1byte_more 1.0))
  (3byte_more
   (((#x80 #xbf)) 2byte_more 1.0))
  (4byte_more
   (((#x80 #xbf)) 3byte_more 1.0))
  (5byte_more
   (((#x80 #xbf)) 4byte_more 1.0))
  )
230
231 ;;;
232 ;;; UCS-2LE
233 ;;;
234 ; (define-dfa ucs2le
235 ; (init
236 ; ((#xff) le 1.0)
237 ; (((#x00 #x7f)) ascii 1.0)
238 ; (((#x00 #xff)) multi 1.0))
239 ; (le
240 ; ((#xfe) init 1.0))
241 ; (ascii
242 ; ((#x00) init 1.0))
243 ; (multi
244 ; (((#x00 #xff)) init 1.0)))
245
246 ;;;
247 ;;; UCS-2BE
248 ;;;
249 ; (define-dfa ucs2be
250 ; (init
251 ; ((#xfe) be 1.0)
252 ; ((#x00) ascii 1.0)
253 ; (((#x00 #xff)) multi 1.0))
254 ; (be
255 ; ((#xff) init 1.0))
256 ; (ascii
257 ; (((#x00 #x7f)) init 1.0))
258 ; (multi
259 ; (((#x00 #xff)) init 1.0)))
260
261
262 ;;;
263 ;;; JIS (ISO2022JP)
264 ;;;
265
266 ;; NB: for now, we just check the sequence of <ESC> $ or <ESC> '('.
;; ISO-2022-JP recognizer -- currently disabled.
;; The leading quote makes the whole form a literal datum, so no `jis'
;; DFA is created and nothing is emitted for it.
'(define-dfa jis
   ;; ground state: single-byte text
   (init
    ((#x1b)                    esc  1.0)   ;start of an escape sequence
    (((#x00 #x1a) (#x1c #x1f)) init 1.0)   ;C0
    (((#x20 #x7f))             init 1.0)   ;ASCII
    (((#xa1 #xdf))             init 0.7)   ;JIS8bit kana
    )
   ;; byte after <ESC>
   (esc
    ((#x0d #x0a) init      0.9)            ;cancel
    ((#\( )      esc-paren 1.0)
    ((#\$ )      esc-$     1.0)
    ((#\& )      esc-&     1.0)
    )
   ;; after <ESC> (
   (esc-paren
    ((#\B #\J #\H) init        1.0)
    ((#\I)         jis0201kana 0.8)
    )
   ;; after <ESC> $
   (esc-$
    ((#\@ #\B) kanji       1.0)
    ((#\( )    esc-$-paren 1.0)
    )
   ;; after <ESC> $ (
   (esc-$-paren
    ((#\D #\O #\P) kanji 1.0))
   ;; after <ESC> &
   (esc-&
    ((#\@ ) init 1.0))
   ;; single-byte kana mode
   (jis0201kana
    ((#x1b)        esc         1.0)
    (((#x20 #x5f)) jis0201kana 1.0))
   ;; double-byte mode: first byte, or <ESC>
   (kanji
    ((#x1b)        esc     1.0)
    (((#x21 #x7e)) kanji-2 1.0))
   ;; double-byte mode: second byte
   (kanji-2
    (((#x21 #x7e)) kanji 1.0))
   )
301
302 ;;;
303 ;;; Big5
304 ;;;
305
;; Big5 recognizer: a single byte 0x00-0x7f, or a lead byte 0xa1-0xfe
;; followed by a trail byte 0x40-0x7e / 0xa1-0xfe.
(define-dfa big5
  ;; first byte
  (init
   (((#x00 #x7f)) init  1.0)               ;ascii
   (((#xa1 #xfe)) 2byte 1.0)               ;big5-2byte
   )
  ;; second byte
  (2byte
   (((#x40 #x7e) (#xa1 #xfe)) init 1.0))
  )
315
316 ;;;
317 ;;; GB2312 (EUC-CN?)
318 ;;;
319
;; GB2312 recognizer: a single byte 0x00-0x7f, or two bytes each in
;; 0xa1-0xfe.
(define-dfa gb2312
  ;; first byte
  (init
   (((#x00 #x7f)) init  1.0)               ;ascii
   (((#xa1 #xfe)) 2byte 1.0)               ;gb2312 2byte
   )
  ;; second byte
  (2byte
   (((#xa1 #xfe)) init 1.0))
  )
329
330 ;;;
331 ;;; GB18030
332 ;;;
333
;; GB18030 recognizer.
;; A character is one of:
;;   - a single byte 0x00-0x80
;;   - a lead byte 0x81-0xfe followed by a trail byte 0x40-0x7e / 0x80-0xfe
;;   - four bytes: 0x81-0xfe, 0x30-0x39, 0x81-0xfe, 0x30-0x39
;;
;; The 2-byte and 4-byte forms share the same lead-byte range, so init
;; must not carry one arc for each: the state table holds a single arc
;; per input byte and the later arc would silently replace the earlier
;; one, making every 2-byte sequence unrecognizable.  The decision is
;; taken on the second byte instead, where the two forms use disjoint
;; ranges (0x30-0x39 versus 0x40-0x7e / 0x80-0xfe).
(define-dfa gb18030
  ;; first byte
  (init
   (((#x00 #x80)) init  1.0)               ;single byte (ascii, and 0x80)
   (((#x81 #xfe)) 2byte 1.0)               ;lead byte of a 2- or 4-byte sequence
   )
  ;; second byte
  (2byte
   (((#x40 #x7e) (#x80 #xfe)) init   1.0)  ;gb18030 2byte: done
   (((#x30 #x39))             4byte3 1.0)  ;gb18030 4byte: two more bytes follow
   )
  ;; third byte of a 4-byte sequence
  (4byte3
   (((#x81 #xfe)) 4byte4 1.0))
  ;; fourth byte of a 4-byte sequence
  (4byte4
   (((#x30 #x39)) init 1.0))
  )
350
351 ;;;
352 ;;; EUC-KR
353 ;;;
354
;; EUC-KR recognizer: a single byte 0x00-0x7f, or two bytes each in
;; 0xa1-0xfe.
(define-dfa euck
  ;; first byte
  (init
   (((#x00 #x7f)) init   1.0)              ; ASCII range
   (((#xa1 #xfe)) ks1001 1.0)              ; KSX 1001
   )
  ;; ks x 1001: second byte
  (ks1001
   (((#xa1 #xfe)) init 1.0))
  )
365
366 ;;;
367 ;;; Johab
368 ;;;
369
;; Johab recognizer: a single byte 0x00-0x7f, or a lead byte followed
;; by one trail byte.  The lead byte selects which trail-byte ranges
;; are accepted (states jamo51 and jamo42).
(define-dfa johab
  ;; first byte
  (init
   (((#x00 #x7f))             init   1.0)  ; ASCII range
   (((#x84 #xd3))             jamo51 1.0)  ; jamo51
   (((#xd8 #xde) (#xe0 #xf9)) jamo42 0.95) ; jamo42
   )
  ;; second byte
  (jamo51
   (((#x41 #x7e) (#x81 #xfe)) init 1.0))
  (jamo42
   (((#x31 #x7e) (#x91 #xfe)) init 1.0))
  )
383