2313
|
1 ;;;
|
|
2 ;;; This code is derivative of guess.c of Gauche-0.8.7.
|
|
3 ;;; The following is the original copyright notice.
|
|
4 ;;;
|
|
5
|
|
6 ;;;
|
|
7 ;;; Auxiliary script to generate japanese code guessing table
|
|
8 ;;;
|
|
9 ;;; Copyright (c) 2000-2003 Shiro Kawai, All rights reserved.
|
|
10 ;;;
|
|
11 ;;; Redistribution and use in source and binary forms, with or without
|
|
12 ;;; modification, are permitted provided that the following conditions
|
|
13 ;;; are met:
|
|
14 ;;;
|
|
15 ;;; 1. Redistributions of source code must retain the above copyright
|
|
16 ;;; notice, this list of conditions and the following disclaimer.
|
|
17 ;;;
|
|
18 ;;; 2. Redistributions in binary form must reproduce the above copyright
|
|
19 ;;; notice, this list of conditions and the following disclaimer in the
|
|
20 ;;; documentation and/or other materials provided with the distribution.
|
|
21 ;;;
|
|
22 ;;; 3. Neither the name of the authors nor the names of its contributors
|
|
23 ;;; may be used to endorse or promote products derived from this
|
|
24 ;;; software without specific prior written permission.
|
|
25 ;;;
|
|
26 ;;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
27 ;;; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
28 ;;; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
29 ;;; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
30 ;;; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
31 ;;; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
|
|
32 ;;; TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
33 ;;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
|
34 ;;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
35 ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
36 ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
37 ;;;
|
|
38 ;;; $Id: guess.scm,v 1.3 2003/07/05 03:29:10 shirok Exp $
|
|
39 ;;;
|
|
40
|
|
41 (use srfi-1)
|
|
42 (use gauche.sequence)
|
|
43
|
|
44 ;; This is a simple state machine compiler.
|
|
45 ;;
|
|
46 ;; <state-machine> : (define-dfa <name> <state> ...)
|
|
47 ;; <state> : (<name> (<input-set> <next-state> <score>) ...)
|
|
48 ;; <name> : symbol
|
|
49 ;; <next-state> : symbol
|
|
50 ;; <score> : real
|
|
51 ;; <input-set> : (<byte-or-range> ...)
|
|
52 ;; <byte-or-range> : <byte> | (<byte> <byte>)
|
|
53 ;; <byte> : integer between 0 and #xff | ASCII char
|
|
54 ;;
|
|
55 ;; When evaluated, the DFA generates a state transition table in
|
|
56 ;; C source format.
|
|
57
|
|
;; <dfa> -- one state machine.
;;   name      : value given with :name, read through `name-of'.
;;   states    : value given with :states, read through `states-of'.
;;   instances : class-allocated slot (a single value shared by every
;;               <dfa> object); it starts out as the empty list.
(define-class <dfa> ()
  ((name      :accessor name-of   :init-keyword :name)
   (states    :accessor states-of :init-keyword :states)
   (instances :init-value '()     :allocation :class)))
|
|
62
|
|
;; <state> -- one state of a DFA.
;;   name  : value given with :name, read through `name-of'.
;;   index : value given with :index, read through `index-of'.
;;   arcs  : value given with :arcs, read through `arcs-of'; defaults
;;           to the empty list when :arcs is not supplied.
(define-class <state> ()
  ((name  :accessor name-of  :init-keyword :name)
   (index :accessor index-of :init-keyword :index)
   (arcs  :accessor arcs-of  :init-value '() :init-keyword :arcs)))
|
|
67
|
|
;; <arc> -- one transition of a DFA.  Every slot is filled from the
;; init keyword of the same name and read through the `...-of' accessor.
;;   from-state : state the arc leaves.
;;   to-state   : state the arc enters.
;;   ranges     : the <input-set> of the arc definition.
;;   index      : arc number.
;;   score      : the <score> of the arc definition.
(define-class <arc> ()
  ((from-state :accessor from-state-of :init-keyword :from-state)
   (to-state   :accessor to-state-of   :init-keyword :to-state)
   (ranges     :accessor ranges-of     :init-keyword :ranges)
   (index      :accessor index-of      :init-keyword :index)
   (score      :accessor score-of      :init-keyword :score)))
|
|
74
|
|
75 ;; Create DFA
|
|
76
|
|
;; (define-dfa <name> <state> ...)
;;
;; Binds <name> at toplevel to a new <dfa>.  The DFA's name is the
;; symbol <name> itself; its states are built by `resolve-states' from
;; the unevaluated list of <state> forms.
(define-syntax define-dfa
  (syntax-rules ()
    ((_ dfa-name . state-defs)
     (define dfa-name
       (make <dfa>
         :name   'dfa-name
         :states (resolve-states 'state-defs))))))
|
|
83
|
|
;; After the default initialization, put the new DFA at the front of
;; the class-allocated `instances' list, so that list holds every
;; <dfa> created so far, newest first.
(define-method initialize ((dfa <dfa>) initargs)
  (next-method)
  (slot-set! dfa 'instances
             (cons dfa (slot-ref dfa 'instances))))
|
|
87
|
|
;; Return a fresh list of the objects stored in the `instances' class
;; slot of <dfa>, in the reverse of the stored order.
(define (all-dfas)
  (let1 stored (class-slot-ref <dfa> 'instances)
    (reverse stored)))
|
|
89
|
|
;; Turn a list of state definitions
;;   (<name> (<input-set> <next-state> <score>) ...)
;; into a list of <state> objects, in the same order.
;;
;; States are numbered 0, 1, 2 ... in definition order.  Arcs are
;; numbered consecutively across the whole list: the arcs of the first
;; state start at 0, the arcs of the next state continue from there.
;; Signals an error if an arc names a <next-state> that is not defined.
(define (resolve-states state-defs)
  ;; Pass 1: create every state first so arcs can refer to any of them.
  (let1 states (let loop ((defs state-defs) (n 0) (acc '()))
                 (if (null? defs)
                   (reverse acc)
                   (loop (cdr defs)
                         (+ n 1)
                         (cons (make <state> :name (caar defs) :index n)
                               acc))))
    ;; Look a state up by name, or signal an error.
    (define (state-named name)
      (or (find (lambda (st) (eq? (name-of st) name)) states)
          (error "no such state" name)))
    ;; Build the <arc> objects leaving STATE; the first one gets
    ;; FIRST-INDEX, the following ones count up from it.
    (define (arcs-from state arc-defs first-index)
      (map (lambda (arc-def arc-index)
             (make <arc>
               :from-state state
               :to-state (state-named (cadr arc-def))
               :ranges (car arc-def)
               :index arc-index
               :score (caddr arc-def)))
           arc-defs
           (iota (length arc-defs) first-index)))
    ;; Pass 2: attach arcs, threading the running arc index through.
    (let loop ((sts states) (defs state-defs) (next-index 0))
      (unless (null? sts)
        (let1 arc-defs (cdar defs)
          (set! (arcs-of (car sts))
                (arcs-from (car sts) arc-defs next-index))
          (loop (cdr sts) (cdr defs) (+ next-index (length arc-defs))))))
    states))
|
|
113
|
|
114 ;; Emit state table
|
|
;; Write the two C arrays of DFA to the current output port:
;;   guess_<name>_st -- one 256-entry row per state (emit-state-table)
;;   guess_<name>_ar -- one entry per arc, state by state (emit-arc-table)
(define (emit-dfa-table dfa)
  (let ((dfa-name (name-of dfa))
        (states   (states-of dfa)))
    (format #t "static signed char guess_~a_st[][256] = {\n" dfa-name)
    (for-each emit-state-table states)
    (print "};\n")
    (format #t "static guess_arc guess_~a_ar[] = {\n" dfa-name)
    ;; Arcs are written in state order, and in arc order within a state.
    (for-each (lambda (state)
                (for-each emit-arc-table (arcs-of state)))
              states)
    (print "};\n")))
|
|
124
|
|
;; Write one row of the state table for STATE to the current output
;; port: 256 entries, one per input byte, each holding the index of the
;; arc taken on that byte, or -1 when the state has no arc for it.
;;
;; An <input-set> element is either a single byte or a pair
;; (<first> <last>) naming an inclusive range; bytes may be integers or
;; characters.
;;
;; A state must not have two different arcs for the same byte.  When a
;; definition does, the arc listed later still wins (the emitted table
;; is what it always was), but a warning is written to the current
;; error port, once per offending range, so the shadowed arc does not
;; go unnoticed.  A range whose first byte is greater than its last
;; byte is reported as an error.
(define (emit-state-table state)
  (define (b2i byte)                    ;byte->integer
    (if (char? byte) (char->integer byte) byte))
  (let1 arc-vec (make-vector 256 -1)
    ;; Give the bytes LO..HI (inclusive) to ARC.
    (define (claim! arc lo hi)
      (when (> lo hi)
        (error "reversed byte range in state" (name-of state) (list lo hi)))
      (let1 idx (index-of arc)
        (let loop ((b lo) (warned? #f))
          (when (<= b hi)
            (let1 prev (vector-ref arc-vec b)
              (vector-set! arc-vec b idx)
              (if (and (not warned?) (>= prev 0) (not (= prev idx)))
                (begin
                  (format (current-error-port)
                          "warning: state ~a: arc ~a overrides arc ~a from byte #x~a\n"
                          (name-of state) idx prev (number->string b 16))
                  (loop (+ b 1) #t))
                (loop (+ b 1) warned?)))))))
    (dolist (arc (arcs-of state))
      (dolist (range (ranges-of arc))
        (if (pair? range)
          (claim! arc (b2i (car range)) (b2i (cadr range)))
          (let1 b (b2i range)
            (claim! arc b b)))))
    (format #t " { /* state ~a */" (name-of state))
    (dotimes (i 256)
      ;; start a new output line every 16 entries
      (when (zero? (modulo i 16)) (newline))
      (format #t " ~2d," (ref arc-vec i)))
    (print "\n },")
    ))
|
|
141
|
|
;; Write one entry of the arc table for ARC to the current output port:
;; the index of the destination state and the arc's score, followed by
;; a C comment naming the source and destination states.
(define (emit-arc-table arc)
  (let ((source (from-state-of arc))
        (target (to-state-of arc)))
    (format #t " { ~2d, ~5s }, /* ~a -> ~a */\n"
            (index-of target)
            (score-of arc)
            (name-of source)
            (name-of target))))
|
|
148 ;;
|
|
149 ;; main
|
|
150 ;;
|
|
151
|
|
;; Script entry point.  ARGS is the command line: the script name
;; followed by exactly one argument, the path of the C file to write.
;; Writes the tables of every defined DFA to that file and returns 0.
;; Signals an error carrying a usage message when the argument count
;; is wrong.
(define (main args)
  (unless (= (length args) 2)
    ;; errorf, not error: only errorf interpolates the ~a directive.
    (errorf "usage: ~a <output-file.c>" (car args)))
  (with-output-to-file (cadr args)
    (lambda ()
      (print "/* State transition table for character code guessing */")
      (print "/* This file is automatically generated by guess.scm */")
      (newline)
      (for-each emit-dfa-table (all-dfas))))
  0)
|
|
162
|
|
163 ;;;============================================================
|
|
164 ;;; DFA definitions
|
|
165 ;;;
|
|
166
|
|
167 ;;;
|
|
168 ;;; EUC-JP
|
|
169 ;;;
|
|
170
|
|
(define-dfa eucj
  ;; first byte
  (init
   (((#x00 #x7f)) init         1.0)   ; ASCII range
   ((#x8e)        jis0201_kana 0.8)   ; JISX 0201 kana
   ((#x8f)        jis0213_2    0.95)  ; JISX 0213 plane 2
   (((#xa1 #xfe)) jis0213_1    1.0)   ; JISX 0213 plane 1
   )
  ;; jis x 0201 kana: one byte follows #x8e
  (jis0201_kana
   (((#xa1 #xdf)) init         1.0)
   )
  ;; jis x 0208 and jis x 0213 plane 1: second (last) byte
  (jis0213_1
   (((#xa1 #xfe)) init         1.0))
  ;; jis x 0213 plane 2: #x8f is followed by TWO bytes in #xa1-#xfe.
  ;; This state consumes the first of them and hands the second one to
  ;; jis0213_1.  Going straight back to init here would leave the
  ;; machine one byte out of phase after every three-byte character.
  (jis0213_2
   (((#xa1 #xfe)) jis0213_1    1.0))
  )
|
|
190
|
|
191 ;;;
|
|
192 ;;; Shift_JIS
|
|
193 ;;;
|
|
194
|
|
(define-dfa sjis
  ;; first byte
  (init
   (((#x00 #x7f)) init 1.0)                     ;ascii
   ;; Lead bytes are #x81-#x9f and #xe0-#xef; #xe0 must be included,
   ;; otherwise characters whose lead byte is #xe0 are rejected.
   (((#x81 #x9f) (#xe0 #xef)) jis0213 1.0)      ;jisx0213 plane 1
   (((#xa1 #xdf)) init 0.8)                     ;jisx0201 kana
   (((#xf0 #xfc)) jis0213 0.95)                 ;jisx0213 plane 2
   (((#xfd #xff)) init 0.8))                    ;vendor extension
  ;; second byte of a two-byte character
  (jis0213
   (((#x40 #x7e) (#x80 #xfc)) init 1.0))
  )
|
|
206
|
|
207 ;;;
|
|
208 ;;; UTF-8
|
|
209 ;;;
|
|
210
|
|
;; Each Nbyte_more state expects N more bytes in #x80-#xbf: it consumes
;; one of them and moves to the state expecting one fewer, ending in
;; init.
;; NOTE(review): lead bytes #xf8-#xfd start five- and six-byte forms;
;; confirm these should still be accepted.
(define-dfa utf8
  ;; first byte: selects how many more bytes must follow
  (init
   (((#x00 #x7f)) init       1.0)
   (((#xc2 #xdf)) 1byte_more 1.0)
   (((#xe0 #xef)) 2byte_more 1.0)
   (((#xf0 #xf7)) 3byte_more 1.0)
   (((#xf8 #xfb)) 4byte_more 1.0)
   (((#xfc #xfd)) 5byte_more 1.0))
  (1byte_more
   (((#x80 #xbf)) init       1.0))
  (2byte_more
   (((#x80 #xbf)) 1byte_more 1.0))
  (3byte_more
   (((#x80 #xbf)) 2byte_more 1.0))
  (4byte_more
   (((#x80 #xbf)) 3byte_more 1.0))
  (5byte_more
   (((#x80 #xbf)) 4byte_more 1.0))
  )
|
|
230
|
|
231 ;;;
|
|
232 ;;; UCS-2LE
|
|
233 ;;;
|
2559
|
234 ; (define-dfa ucs2le
|
|
235 ; (init
|
|
236 ; ((#xff) le 1.0)
|
|
237 ; (((#x00 #x7f)) ascii 1.0)
|
|
238 ; (((#x00 #xff)) multi 1.0))
|
|
239 ; (le
|
|
240 ; ((#xfe) init 1.0))
|
|
241 ; (ascii
|
|
242 ; ((#x00) init 1.0))
|
|
243 ; (multi
|
|
244 ; (((#x00 #xff)) init 1.0)))
|
2313
|
245
|
|
246 ;;;
|
|
247 ;;; UCS-2BE
|
|
248 ;;;
|
2559
|
249 ; (define-dfa ucs2be
|
|
250 ; (init
|
|
251 ; ((#xfe) be 1.0)
|
|
252 ; ((#x00) ascii 1.0)
|
|
253 ; (((#x00 #xff)) multi 1.0))
|
|
254 ; (be
|
|
255 ; ((#xff) init 1.0))
|
|
256 ; (ascii
|
|
257 ; (((#x00 #x7f)) init 1.0))
|
|
258 ; (multi
|
|
259 ; (((#x00 #xff)) init 1.0)))
|
2313
|
260
|
|
261
|
|
262 ;;;
|
|
263 ;;; JIS (ISO2022JP)
|
|
264 ;;;
|
|
265
|
|
266 ;; NB: for now, we just check the sequence of <ESC> $ or <ESC> '('.
|
|
;; The leading quote turns this whole form into a literal list, so it
;; is read but never expanded: no `jis' DFA is defined and no table is
;; emitted for it.
'(define-dfa jis
   (init
    ((#x1b) esc 1.0)                        ;<ESC> starts an escape sequence
    (((#x00 #x1a) (#x1c #x1f)) init 1.0)    ;C0
    (((#x20 #x7f)) init 1.0)                ;ASCII
    (((#xa1 #xdf)) init 0.7)                ;JIS8bit kana
    )
   ;; byte after <ESC>
   (esc
    ((#x0d #x0a) init 0.9)                  ;cancel
    ((#\( ) esc-paren 1.0)
    ((#\$ ) esc-$ 1.0)
    ((#\& ) esc-& 1.0)
    )
   ;; byte after <ESC> (
   (esc-paren
    ((#\B #\J #\H) init 1.0)
    ((#\I) jis0201kana 0.8)
    )
   ;; byte after <ESC> $
   (esc-$
    ((#\@ #\B) kanji 1.0)
    ((#\( ) esc-$-paren 1.0)
    )
   ;; byte after <ESC> $ (
   (esc-$-paren
    ((#\D #\O #\P) kanji 1.0))
   ;; byte after <ESC> &
   (esc-&
    ((#\@ ) init 1.0))
   ;; stays in this state until the next <ESC>
   (jis0201kana
    ((#x1b) esc 1.0)
    (((#x20 #x5f)) jis0201kana 1.0))
   ;; kanji and kanji-2 alternate, one state per byte of a two-byte
   ;; pair; <ESC> is accepted only in kanji, i.e. between pairs
   (kanji
    ((#x1b) esc 1.0)
    (((#x21 #x7e)) kanji-2 1.0))
   (kanji-2
    (((#x21 #x7e)) kanji 1.0))
   )
|
|
301
|
|
302 ;;;
|
|
303 ;;; Big5
|
|
304 ;;;
|
|
305
|
|
(define-dfa big5
  ;; first byte
  (init
   (((#x00 #x7f)) init  1.0)    ;ascii
   (((#xa1 #xfe)) 2byte 1.0)    ;big5-2byte
   )
  ;; second byte of a two-byte character
  (2byte
   (((#x40 #x7e) (#xa1 #xfe)) init 1.0))
  )
|
|
315
|
|
316 ;;;
|
|
317 ;;; GB2312 (EUC-CN?)
|
|
318 ;;;
|
|
319
|
|
(define-dfa gb2312
  ;; first byte
  (init
   (((#x00 #x7f)) init  1.0)    ;ascii
   (((#xa1 #xfe)) 2byte 1.0)    ;gb2312 2byte
   )
  ;; second byte of a two-byte character
  (2byte
   (((#xa1 #xfe)) init 1.0))
  )
|
|
329
|
|
330 ;;;
|
|
331 ;;; GB18030
|
|
332 ;;;
|
|
333
|
|
;; A lead byte in #x81-#xfe starts either a two-byte or a four-byte
;; sequence, and only the second byte tells which.  A state can hold
;; just one arc per input byte, so the choice is made in a single
;; state reached from init, not by two init arcs over the same range
;; (the later of which would simply replace the earlier one).
(define-dfa gb18030
  ;; first byte
  (init
   (((#x00 #x80)) init  1.0)    ;ascii; #x80 is also taken as one byte
   (((#x81 #xfe)) 2byte 1.0)    ;gb18030 lead byte (2- or 4-byte form)
   )
  ;; second byte
  (2byte
   (((#x40 #x7e) (#x80 #xfe)) init   1.0)   ;end of a 2-byte sequence
   (((#x30 #x39))             4byte3 1.0))  ;2nd byte of a 4-byte sequence
  ;; third byte of a 4-byte sequence
  (4byte3
   (((#x81 #xfe)) 4byte4 1.0))
  ;; fourth byte of a 4-byte sequence
  (4byte4
   (((#x30 #x39)) init 1.0))
  )
|
|
350
|
|
351 ;;;
|
|
352 ;;; EUC-KR
|
|
353 ;;;
|
|
354
|
|
(define-dfa euck
  ;; first byte
  (init
   (((#x00 #x7f)) init   1.0)   ; ASCII range
   (((#xa1 #xfe)) ks1001 1.0)   ; KSX 1001
   )
  ;; ks x 1001: second byte of a two-byte character
  (ks1001
   (((#xa1 #xfe)) init 1.0))
  )
|
|
365
|
|
366 ;;;
|
|
367 ;;; Johab
|
|
368 ;;;
|
|
369
|
|
(define-dfa johab
  ;; first byte
  (init
   (((#x00 #x7f)) init 1.0)                   ; ASCII range
   (((#x84 #xd3)) jamo51 1.0)                 ; jamo51
   (((#xd8 #xde) (#xe0 #xf9)) jamo42 0.95)    ; jamo42
   )
  ;; second byte; the accepted ranges depend on the first byte's group
  (jamo51
   (((#x41 #x7e) (#x81 #xfe)) init 1.0))
  (jamo42
   (((#x31 #x7e) (#x91 #xfe)) init 1.0))
  )
|
|
383
|