comparison guess.scm @ 2:754a4550c64e

- added arabic, greek, hebrew and turkish DFAs
- new UCS-2LE/BE DFAs
- arabic_impl.c now uses the arabic DFAs
- common DFA macros have been moved to dfa.h
- minor cleanups
author Yoshiki Yazawa <yaz@cc.rim.or.jp>
date Wed, 11 Jun 2008 00:11:30 +0900
parents d9b6ff839eab
children 70e2c306231e
comparison
equal deleted inserted replaced
1:04f2be1c8464 2:754a4550c64e
3 ;;; The following is the original copyright notice. 3 ;;; The following is the original copyright notice.
4 ;;; 4 ;;;
5 5
6 ;;; 6 ;;;
7 ;;; Auxiliary script to generate japanese code guessing table 7 ;;; Auxiliary script to generate japanese code guessing table
8 ;;; 8 ;;;
9 ;;; Copyright (c) 2000-2003 Shiro Kawai, All rights reserved. 9 ;;; Copyright (c) 2000-2003 Shiro Kawai, All rights reserved.
10 ;;; 10 ;;;
11 ;;; Redistribution and use in source and binary forms, with or without 11 ;;; Redistribution and use in source and binary forms, with or without
12 ;;; modification, are permitted provided that the following conditions 12 ;;; modification, are permitted provided that the following conditions
13 ;;; are met: 13 ;;; are met:
14 ;;; 14 ;;;
15 ;;; 1. Redistributions of source code must retain the above copyright 15 ;;; 1. Redistributions of source code must retain the above copyright
16 ;;; notice, this list of conditions and the following disclaimer. 16 ;;; notice, this list of conditions and the following disclaimer.
17 ;;; 17 ;;;
18 ;;; 2. Redistributions in binary form must reproduce the above copyright 18 ;;; 2. Redistributions in binary form must reproduce the above copyright
19 ;;; notice, this list of conditions and the following disclaimer in the 19 ;;; notice, this list of conditions and the following disclaimer in the
20 ;;; documentation and/or other materials provided with the distribution. 20 ;;; documentation and/or other materials provided with the distribution.
21 ;;; 21 ;;;
22 ;;; 3. Neither the name of the authors nor the names of its contributors 22 ;;; 3. Neither the name of the authors nor the names of its contributors
23 ;;; may be used to endorse or promote products derived from this 23 ;;; may be used to endorse or promote products derived from this
24 ;;; software without specific prior written permission. 24 ;;; software without specific prior written permission.
25 ;;; 25 ;;;
26 ;;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 26 ;;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27 ;;; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 27 ;;; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
28 ;;; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 28 ;;; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
29 ;;; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 29 ;;; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
30 ;;; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 30 ;;; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
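31 ;;; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 31 ;;; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
32 ;;; TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 32 ;;; TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR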
33 ;;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 33 ;;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 ;;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 34 ;;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 35 ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 36 ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 ;;; 37 ;;;
38 ;;; $Id: guess.scm,v 1.3 2003/07/05 03:29:10 shirok Exp $ 38 ;;; $Id: guess.scm,v 1.3 2003/07/05 03:29:10 shirok Exp $
39 ;;; 39 ;;;
40 40
41 (use srfi-1) 41 (use srfi-1)
42 (use gauche.sequence) 42 (use gauche.sequence)
172 ;; first byte 172 ;; first byte
173 (init 173 (init
174 (((#x00 #x7f)) init 1.0) ; ASCII range 174 (((#x00 #x7f)) init 1.0) ; ASCII range
175 ((#x8e) jis0201_kana 0.8) ; JISX 0201 kana 175 ((#x8e) jis0201_kana 0.8) ; JISX 0201 kana
176 ((#x8f) jis0213_2 0.95) ; JISX 0213 plane 2 176 ((#x8f) jis0213_2 0.95) ; JISX 0213 plane 2
177 (((#xa1 #xfe)) jis0213_1 1.0) ; JISX 0213 plane 1 177 (((#xa1 #xfe)) jis0213_1 1.0)) ; JISX 0213 plane 1
178 )
179 ;; jis x 0201 kana 178 ;; jis x 0201 kana
180 (jis0201_kana 179 (jis0201_kana
181 (((#xa1 #xdf)) init 1.0) 180 (((#xa1 #xdf)) init 1.0))
182 )
183 ;; jis x 0208 and jis x 0213 plane 1 181 ;; jis x 0208 and jis x 0213 plane 1
184 (jis0213_1 182 (jis0213_1
185 (((#xa1 #xfe)) init 1.0)) 183 (((#xa1 #xfe)) init 1.0))
186 ;; jis x 0213 plane 2 184 ;; jis x 0213 plane 2
187 (jis0213_2 185 (jis0213_2
188 (((#xa1 #xfe)) init 1.0)) 186 (((#xa1 #xfe)) init 1.0)))
189 )
190 187
191 ;;; 188 ;;;
192 ;;; Shift_JIS 189 ;;; Shift_JIS
193 ;;; 190 ;;;
194 191
199 (((#x81 #x9f) (#xe1 #xef)) jis0213 1.0) ;jisx0213 plane 1 196 (((#x81 #x9f) (#xe1 #xef)) jis0213 1.0) ;jisx0213 plane 1
200 (((#xa1 #xdf)) init 0.8) ;jisx0201 kana 197 (((#xa1 #xdf)) init 0.8) ;jisx0201 kana
201 (((#xf0 #xfc)) jis0213 0.95) ;jisx0213 plane 2 198 (((#xf0 #xfc)) jis0213 0.95) ;jisx0213 plane 2
202 (((#xfd #xff)) init 0.8)) ;vendor extension 199 (((#xfd #xff)) init 0.8)) ;vendor extension
203 (jis0213 200 (jis0213
204 (((#x40 #x7e) (#x80 #xfc)) init 1.0)) 201 (((#x40 #x7e) (#x80 #xfc)) init 1.0)))
205 )
206 202
207 ;;; 203 ;;;
208 ;;; UTF-8 204 ;;; UTF-8
209 ;;; 205 ;;;
210 206
223 (3byte_more 219 (3byte_more
224 (((#x80 #xbf)) 2byte_more 1.0)) 220 (((#x80 #xbf)) 2byte_more 1.0))
225 (4byte_more 221 (4byte_more
226 (((#x80 #xbf)) 3byte_more 1.0)) 222 (((#x80 #xbf)) 3byte_more 1.0))
227 (5byte_more 223 (5byte_more
228 (((#x80 #xbf)) 4byte_more 1.0)) 224 (((#x80 #xbf)) 4byte_more 1.0)))
229 )
230 225
231 ;;; 226 ;;;
232 ;;; UCS-2LE 227 ;;; UCS-2LE
233 ;;; 228 ;;;
234 ; (define-dfa ucs2le 229 (define-dfa ucs2le
235 ; (init 230 (init
236 ; ((#xff) le 1.0) 231 ((#xfe) bom-be 1.0)
237 ; (((#x00 #x7f)) ascii 1.0) 232 ((#xff) bom-le 1.0)
238 ; (((#x00 #xff)) multi 1.0)) 233 (((#x00 #xfd)) byte2 1.0))
239 ; (le 234 (bom-le
240 ; ((#xfe) init 1.0)) 235 (((#x00 #xff)) init 1.0))
241 ; (ascii 236 (bom-be
242 ; ((#x00) init 1.0)) 237 (((#x00 #xfe)) init 1.0)) ;; no transition for #xff: a big-endian BOM (0xfe 0xff) makes this DFA die.
243 ; (multi 238 (byte2
244 ; (((#x00 #xff)) init 1.0))) 239 (((#x00 #xff)) init 1.0)))
245 240
246 ;;; 241 ;;;
247 ;;; UCS-2BE 242 ;;; UCS-2BE
248 ;;; 243 ;;;
249 ; (define-dfa ucs2be 244 (define-dfa ucs2be
250 ; (init 245 (init
251 ; ((#xfe) be 1.0) 246 ((#xfe) bom-be 1.0)
252 ; ((#x00) ascii 1.0) 247 ((#xff) bom-le 1.0)
253 ; (((#x00 #xff)) multi 1.0)) 248 (((#x00 #xfd)) byte2 1.0))
254 ; (be 249 (bom-le
255 ; ((#xff) init 1.0)) 250 (((#x00 #xfd)) init 1.0)
256 ; (ascii 251 ((#xff) init 1.0)) ;; no transition for #xfe: a little-endian BOM (0xff 0xfe) makes this DFA die.
257 ; (((#x00 #x7f)) init 1.0)) 252 (bom-be
258 ; (multi 253 (((#x00 #xff)) init 1.0))
259 ; (((#x00 #xff)) init 1.0))) 254 (byte2
260 255 (((#x00 #xff)) init 1.0)))
261 256
262 ;;; 257 ;;;
263 ;;; JIS (ISO2022JP) 258 ;;; UTF-16
259 ;;;
260 (define-dfa utf16
261 (init
262 ((#xfe) bom-be 1.0)
263 ((#xff) bom-le 1.0))
264 (init1
265 (((#x00 #xff)) byte2 1.0))
266 (bom-be
267 ((#xff) init1 1.0))
268 (bom-le
269 ((#xfe) init1 1.0))
270 (byte2
271 (((#x00 #xff)) init1 1.0)))
272
273 ;;;
274 ;;; ISO2022JP (JIS)
264 ;;; 275 ;;;
265 276
266 ;; NB: for now, we just check the sequence of <ESC> $ or <ESC> '('. 277 ;; NB: for now, we just check the sequence of <ESC> $ or <ESC> '('.
267 '(define-dfa jis 278 '(define-dfa jis
268 (init 279 (init
269 ((#x1b) esc 1.0) 280 ((#x1b) esc 1.0)
270 (((#x00 #x1a) (#x1c #x1f)) init 1.0) ;C0 281 (((#x00 #x1a) (#x1c #x1f)) init 1.0) ;C0
271 (((#x20 #x7f)) init 1.0) ;ASCII 282 (((#x20 #x7f)) init 1.0) ;ASCII
272 (((#xa1 #xdf)) init 0.7) ;JIS8bit kana 283 (((#xa1 #xdf)) init 0.7)) ;JIS8bit kana
273 )
274 (esc 284 (esc
275 ((#x0d #x0a) init 0.9) ;cancel 285 ((#x0d #x0a) init 0.9) ;cancel
276 ((#\( ) esc-paren 1.0) 286 ((#\( ) esc-paren 1.0)
277 ((#\$ ) esc-$ 1.0) 287 ((#\$ ) esc-$ 1.0)
278 ((#\& ) esc-& 1.0) 288 ((#\& ) esc-& 1.0))
279 )
280 (esc-paren 289 (esc-paren
281 ((#\B #\J #\H) init 1.0) 290 ((#\B #\J #\H) init 1.0)
282 ((#\I) jis0201kana 0.8) 291 ((#\I) jis0201kana 0.8))
283 )
284 (esc-$ 292 (esc-$
285 ((#\@ #\B) kanji 1.0) 293 ((#\@ #\B) kanji 1.0)
286 ((#\( ) esc-$-paren 1.0) 294 ((#\( ) esc-$-paren 1.0))
287 )
288 (esc-$-paren 295 (esc-$-paren
289 ((#\D #\O #\P) kanji 1.0)) 296 ((#\D #\O #\P) kanji 1.0))
290 (esc-& 297 (esc-&
291 ((#\@ ) init 1.0)) 298 ((#\@ ) init 1.0))
292 (jis0201kana 299 (jis0201kana
294 (((#x20 #x5f)) jis0201kana 1.0)) 301 (((#x20 #x5f)) jis0201kana 1.0))
295 (kanji 302 (kanji
296 ((#x1b) esc 1.0) 303 ((#x1b) esc 1.0)
297 (((#x21 #x7e)) kanji-2 1.0)) 304 (((#x21 #x7e)) kanji-2 1.0))
298 (kanji-2 305 (kanji-2
299 (((#x21 #x7e)) kanji 1.0)) 306 (((#x21 #x7e)) kanji 1.0)) )
300 )
301 307
302 ;;; 308 ;;;
303 ;;; Big5 309 ;;; Big5
304 ;;; 310 ;;;
305 311
306 (define-dfa big5 312 (define-dfa big5
307 ;; first byte 313 ;; first byte
308 (init 314 (init
309 (((#x00 #x7f)) init 1.0) ;ascii 315 (((#x00 #x7f)) init 1.0) ;ascii
310 (((#xa1 #xfe)) 2byte 1.0) ;big5-2byte 316 (((#xa1 #xfe)) 2byte 1.0)) ;big5-2byte
311 )
312 (2byte 317 (2byte
313 (((#x40 #x7e) (#xa1 #xfe)) init 1.0)) 318 (((#x40 #x7e) (#xa1 #xfe)) init 1.0)))
314 )
315 319
316 ;;; 320 ;;;
317 ;;; GB2312 (EUC-CN?) 321 ;;; GB2312 (EUC-CN?)
318 ;;; 322 ;;;
319 323
320 (define-dfa gb2312 324 (define-dfa gb2312
321 ;; first byte 325 ;; first byte
322 (init 326 (init
323 (((#x00 #x7f)) init 1.0) ;ascii 327 (((#x00 #x7f)) init 1.0) ;ascii
324 (((#xa1 #xfe)) 2byte 1.0) ;gb2312 2byte 328 (((#xa1 #xfe)) 2byte 1.0)) ;gb2312 2byte
325 )
326 (2byte 329 (2byte
327 (((#xa1 #xfe)) init 1.0)) 330 (((#xa1 #xfe)) init 1.0)))
328 )
329 331
330 ;;; 332 ;;;
331 ;;; GB18030 333 ;;; GB18030
332 ;;; 334 ;;;
333 335
334 (define-dfa gb18030 336 (define-dfa gb18030
335 ;; first byte 337 ;; first byte
336 (init 338 (init
337 (((#x00 #x80)) init 1.0) ;ascii 339 (((#x00 #x80)) init 1.0) ;ascii
338 (((#x81 #xfe)) 2byte 1.0) ;gb18030 2byte 340 (((#x81 #xfe)) 2byte 1.0) ;gb18030 2byte
339 (((#x81 #xfe)) 4byte2 1.0) ;gb18030 4byte 341 (((#x81 #xfe)) 4byte2 1.0)) ;gb18030 4byte
340 )
341 (2byte 342 (2byte
342 (((#x40 #x7e) (#x80 #xfe)) init 1.0)) 343 (((#x40 #x7e) (#x80 #xfe)) init 1.0))
343 (4byte2 344 (4byte2
344 (((#x30 #x39)) 4byte3 1.0)) 345 (((#x30 #x39)) 4byte3 1.0))
345 (4byte3 346 (4byte3
346 (((#x81 #xfe)) 4byte4 1.0)) 347 (((#x81 #xfe)) 4byte4 1.0))
347 (4byte4 348 (4byte4
348 (((#x30 #x39)) init 1.0)) 349 (((#x30 #x39)) init 1.0)) )
349 )
350 350
351 ;;; 351 ;;;
352 ;;; EUC-KR 352 ;;; EUC-KR
353 ;;; 353 ;;;
354 354
355 (define-dfa euck 355 (define-dfa euck
356 ;; first byte 356 ;; first byte
357 (init 357 (init
358 (((#x00 #x7f)) init 1.0) ; ASCII range 358 (((#x00 #x7f)) init 1.0) ; ASCII range
359 (((#xa1 #xfe)) ks1001 1.0) ; KSX 1001 359 (((#xa1 #xfe)) ks1001 1.0)) ; KSX 1001
360 )
361 ;; ks x 1001 360 ;; ks x 1001
362 (ks1001 361 (ks1001
363 (((#xa1 #xfe)) init 1.0)) 362 (((#xa1 #xfe)) init 1.0)))
364 )
365 363
366 ;;; 364 ;;;
367 ;;; Johab 365 ;;; Johab
368 ;;; 366 ;;;
369 367
370 (define-dfa johab 368 (define-dfa johab
371 ;; first byte 369 ;; first byte
372 (init 370 (init
373 (((#x00 #x7f)) init 1.0) ; ASCII range 371 (((#x00 #x7f)) init 1.0) ; ASCII range
374 (((#x84 #xd3)) jamo51 1.0) ; jamo51 372 (((#x84 #xd3)) jamo51 1.0) ; jamo51
375 (((#xd8 #xde) (#xe0 #xf9)) jamo42 0.95) ; jamo42 373 (((#xd8 #xde) (#xe0 #xf9)) jamo42 0.95)) ; jamo42
376 )
377 ;; second byte 374 ;; second byte
378 (jamo51 375 (jamo51
379 (((#x41 #x7e) (#x81 #xfe)) init 1.0)) 376 (((#x41 #x7e) (#x81 #xfe)) init 1.0))
380 (jamo42 377 (jamo42
381 (((#x31 #x7e) (#x91 #xfe)) init 1.0)) 378 (((#x31 #x7e) (#x91 #xfe)) init 1.0)))
382 ) 379
383 380
381
382
383
384
385 ;;;
386 ;;; arabic
387 ;;;
388
389 (define-dfa iso8859_6
390 (init
391 (((#x00 #x7f)) init 1.0) ;ascii
392 ((#xa0) init 1.0)
393 ((#xa4) init 1.0)
394 ((#xac) init 1.0)
395 ((#xad) init 1.0) ;SHY xxx
396 ((#xbb) init 1.0)
397 ((#xbf) init 1.0)
398 (((#xc1 #xda)) init 1.0)
399 (((#xe0 #xf2)) init 1.0)))
400
401 (define-dfa cp1256
402 (init
403 (((#x00 #x7f)) init 1.0) ;ascii
404 (((#x80 #xff)) init 1.0))) ;high bit
405
406
407 ;;;
408 ;;; greek
409 ;;;
410
411 (define-dfa iso8859_7
412 (init
413 (((#x00 #x7f)) init 1.0) ;ascii
414 (((#xa0 #xad)) init 1.0)
415 (((#xaf #xd1)) init 1.0)
416 (((#xd3 #xfe)) init 1.0)))
417
418 (define-dfa cp1253
419 (init
420 (((#x00 #x7f)) init 1.0) ;ascii
421 ((#x80) init 1.0)
422 (((#x82 #x87)) init 1.0)
423 ((#x89) init 1.0)
424 ((#x8b) init 1.0)
425 (((#x91 #x97)) init 1.0)
426 ((#x99) init 1.0)
427 ((#x9b) init 1.0)
428 (((#xa0 #xa9)) init 1.0)
429 (((#xab #xd1)) init 1.0)
430 (((#xd3 #xfe)) init 1.0)))
431
432 ;;;
433 ;;; hebrew
434 ;;;
435
436 (define-dfa iso8859_8
437 (init
438 (((#x00 #x7f)) init 1.0) ;ascii
439 ((#xa0) init 1.0)
440 (((#xa2 #xbe)) init 1.0)
441 (((#xdf #xfa)) init 1.0)
442 (((#xfd #xfe)) init 1.0)))
443
444 (define-dfa cp1255
445 (init
446 (((#x00 #x7f)) init 1.0) ;ascii
447 ((#x80) init 1.0)
448 (((#x82 #x89)) init 1.0)
449 ((#x8b) init 1.0)
450 (((#x91 #x99)) init 1.0)
451 ((#x9b) init 1.0)
452 (((#xa0 #xc9)) init 1.0)
453 (((#xcb #xd8)) init 1.0)
454 (((#xe0 #xfa)) init 1.0)
455 (((#xfd #xfe)) init 1.0)))
456
457 ;;;
458 ;;; turkish
459 ;;;
460
461 (define-dfa iso8859_9
462 (init
463 (((#x00 #x7f)) init 1.0) ;ascii
464 (((#xa0 #xff)) init 1.0)))
465
466 (define-dfa cp1254
467 (init
468 (((#x00 #x7f)) init 1.0) ;ascii
469 ((#x80) init 1.0)
470 (((#x82 #x8c)) init 1.0)
471 (((#x91 #x9c)) init 1.0)
472 (((#x9f #xff)) init 1.0)))
473