Mercurial > libguess
comparison guess.scm @ 2:754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
- new UCS-2LE/BE DFAs
- now arabic_impl.c uses arabic DFAs
- dfa common macros have been moved to dfa.h
- minor cleanups
author | Yoshiki Yazawa <yaz@cc.rim.or.jp> |
---|---|
date | Wed, 11 Jun 2008 00:11:30 +0900 |
parents | d9b6ff839eab |
children | 70e2c306231e |
comparison
equal
deleted
inserted
replaced
1:04f2be1c8464 | 2:754a4550c64e |
---|---|
3 ;;; The following is the original copyright notice. | 3 ;;; The following is the original copyright notice. |
4 ;;; | 4 ;;; |
5 | 5 |
6 ;;; | 6 ;;; |
7 ;;; Auxiliary script to generate japanese code guessing table | 7 ;;; Auxiliary script to generate japanese code guessing table |
8 ;;; | 8 ;;; |
9 ;;; Copyright (c) 2000-2003 Shiro Kawai, All rights reserved. | 9 ;;; Copyright (c) 2000-2003 Shiro Kawai, All rights reserved. |
10 ;;; | 10 ;;; |
11 ;;; Redistribution and use in source and binary forms, with or without | 11 ;;; Redistribution and use in source and binary forms, with or without |
12 ;;; modification, are permitted provided that the following conditions | 12 ;;; modification, are permitted provided that the following conditions |
13 ;;; are met: | 13 ;;; are met: |
14 ;;; | 14 ;;; |
15 ;;; 1. Redistributions of source code must retain the above copyright | 15 ;;; 1. Redistributions of source code must retain the above copyright |
16 ;;; notice, this list of conditions and the following disclaimer. | 16 ;;; notice, this list of conditions and the following disclaimer. |
17 ;;; | 17 ;;; |
18 ;;; 2. Redistributions in binary form must reproduce the above copyright | 18 ;;; 2. Redistributions in binary form must reproduce the above copyright |
19 ;;; notice, this list of conditions and the following disclaimer in the | 19 ;;; notice, this list of conditions and the following disclaimer in the |
20 ;;; documentation and/or other materials provided with the distribution. | 20 ;;; documentation and/or other materials provided with the distribution. |
21 ;;; | 21 ;;; |
22 ;;; 3. Neither the name of the authors nor the names of its contributors | 22 ;;; 3. Neither the name of the authors nor the names of its contributors |
23 ;;; may be used to endorse or promote products derived from this | 23 ;;; may be used to endorse or promote products derived from this |
24 ;;; software without specific prior written permission. | 24 ;;; software without specific prior written permission. |
25 ;;; | 25 ;;; |
26 ;;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | 26 ;;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
27 ;;; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | 27 ;;; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
28 ;;; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | 28 ;;; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
29 ;;; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | 29 ;;; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
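30 ;;; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | 30 ;;; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
31 ;;; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED | 31 ;;; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED |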
32 ;;; TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | 32 ;;; TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
33 ;;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | 33 ;;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
34 ;;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | 34 ;;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
35 ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | 35 ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
36 ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 36 ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
37 ;;; | 37 ;;; |
38 ;;; $Id: guess.scm,v 1.3 2003/07/05 03:29:10 shirok Exp $ | 38 ;;; $Id: guess.scm,v 1.3 2003/07/05 03:29:10 shirok Exp $ |
39 ;;; | 39 ;;; |
40 | 40 |
41 (use srfi-1) | 41 (use srfi-1) |
42 (use gauche.sequence) | 42 (use gauche.sequence) |
172 ;; first byte | 172 ;; first byte |
173 (init | 173 (init |
174 (((#x00 #x7f)) init 1.0) ; ASCII range | 174 (((#x00 #x7f)) init 1.0) ; ASCII range |
175 ((#x8e) jis0201_kana 0.8) ; JISX 0201 kana | 175 ((#x8e) jis0201_kana 0.8) ; JISX 0201 kana |
176 ((#x8f) jis0213_2 0.95) ; JISX 0213 plane 2 | 176 ((#x8f) jis0213_2 0.95) ; JISX 0213 plane 2 |
177 (((#xa1 #xfe)) jis0213_1 1.0) ; JISX 0213 plane 1 | 177 (((#xa1 #xfe)) jis0213_1 1.0)) ; JISX 0213 plane 1 |
178 ) | |
179 ;; jis x 0201 kana | 178 ;; jis x 0201 kana |
180 (jis0201_kana | 179 (jis0201_kana |
181 (((#xa1 #xdf)) init 1.0) | 180 (((#xa1 #xdf)) init 1.0)) |
182 ) | |
183 ;; jis x 0208 and jis x 0213 plane 1 | 181 ;; jis x 0208 and jis x 0213 plane 1 |
184 (jis0213_1 | 182 (jis0213_1 |
185 (((#xa1 #xfe)) init 1.0)) | 183 (((#xa1 #xfe)) init 1.0)) |
186 ;; jis x 0213 plane 2 | 184 ;; jis x 0213 plane 2 |
187 (jis0213_2 | 185 (jis0213_2 |
188 (((#xa1 #xfe)) init 1.0)) | 186 (((#xa1 #xfe)) init 1.0))) |
189 ) | |
190 | 187 |
191 ;;; | 188 ;;; |
192 ;;; Shift_JIS | 189 ;;; Shift_JIS |
193 ;;; | 190 ;;; |
194 | 191 |
199 (((#x81 #x9f) (#xe1 #xef)) jis0213 1.0) ;jisx0213 plane 1 | 196 (((#x81 #x9f) (#xe1 #xef)) jis0213 1.0) ;jisx0213 plane 1 |
200 (((#xa1 #xdf)) init 0.8) ;jisx0201 kana | 197 (((#xa1 #xdf)) init 0.8) ;jisx0201 kana |
201 (((#xf0 #xfc)) jis0213 0.95) ;jisx0213 plane 2 | 198 (((#xf0 #xfc)) jis0213 0.95) ;jisx0213 plane 2 |
202 (((#xfd #xff)) init 0.8)) ;vendor extension | 199 (((#xfd #xff)) init 0.8)) ;vendor extension |
203 (jis0213 | 200 (jis0213 |
204 (((#x40 #x7e) (#x80 #xfc)) init 1.0)) | 201 (((#x40 #x7e) (#x80 #xfc)) init 1.0))) |
205 ) | |
206 | 202 |
207 ;;; | 203 ;;; |
208 ;;; UTF-8 | 204 ;;; UTF-8 |
209 ;;; | 205 ;;; |
210 | 206 |
223 (3byte_more | 219 (3byte_more |
224 (((#x80 #xbf)) 2byte_more 1.0)) | 220 (((#x80 #xbf)) 2byte_more 1.0)) |
225 (4byte_more | 221 (4byte_more |
226 (((#x80 #xbf)) 3byte_more 1.0)) | 222 (((#x80 #xbf)) 3byte_more 1.0)) |
227 (5byte_more | 223 (5byte_more |
228 (((#x80 #xbf)) 4byte_more 1.0)) | 224 (((#x80 #xbf)) 4byte_more 1.0))) |
229 ) | |
230 | 225 |
231 ;;; | 226 ;;; |
232 ;;; UCS-2LE | 227 ;;; UCS-2LE |
233 ;;; | 228 ;;; |
234 ; (define-dfa ucs2le | 229 (define-dfa ucs2le |
235 ; (init | 230 (init |
236 ; ((#xff) le 1.0) | 231 ((#xfe) bom-be 1.0) |
237 ; (((#x00 #x7f)) ascii 1.0) | 232 ((#xff) bom-le 1.0) |
238 ; (((#x00 #xff)) multi 1.0)) | 233 (((#x00 #xfd)) byte2 1.0)) |
239 ; (le | 234 (bom-le |
240 ; ((#xfe) init 1.0)) | 235 (((#x00 #xff)) init 1.0)) |
241 ; (ascii | 236 (bom-be |
242 ; ((#x00) init 1.0)) | 237 (((#x00 #xfe)) init 1.0)) ;; no #xff transition: a big-endian BOM (0xfe 0xff) kills this DFA. |
243 ; (multi | 238 (byte2 |
244 ; (((#x00 #xff)) init 1.0))) | 239 (((#x00 #xff)) init 1.0))) |
245 | 240 |
246 ;;; | 241 ;;; |
247 ;;; UCS-2BE | 242 ;;; UCS-2BE |
248 ;;; | 243 ;;; |
249 ; (define-dfa ucs2be | 244 (define-dfa ucs2be |
250 ; (init | 245 (init |
251 ; ((#xfe) be 1.0) | 246 ((#xfe) bom-be 1.0) |
252 ; ((#x00) ascii 1.0) | 247 ((#xff) bom-le 1.0) |
253 ; (((#x00 #xff)) multi 1.0)) | 248 (((#x00 #xfd)) byte2 1.0)) |
254 ; (be | 249 (bom-le |
255 ; ((#xff) init 1.0)) | 250 (((#x00 #xfd)) init 1.0) |
256 ; (ascii | 251 ((#xff) init 1.0)) ;; if le (0xfffe), die. |
257 ; (((#x00 #x7f)) init 1.0)) | 252 (bom-be |
258 ; (multi | 253 (((#x00 #xff)) init 1.0)) |
259 ; (((#x00 #xff)) init 1.0))) | 254 (byte2 |
260 | 255 (((#x00 #xff)) init 1.0))) |
261 | 256 |
262 ;;; | 257 ;;; |
263 ;;; JIS (ISO2022JP) | 258 ;;; UTF-16 |
259 ;;; | |
260 (define-dfa utf16 | |
261 (init | |
262 ((#xfe) bom-be 1.0) | |
263 ((#xff) bom-le 1.0)) | |
264 (init1 | |
265 (((#x00 #xff)) byte2 1.0)) | |
266 (bom-be | |
267 ((#xff) init1 1.0)) | |
268 (bom-le | |
269 ((#xfe) init1 1.0)) | |
270 (byte2 | |
271 (((#x00 #xff)) init1 1.0))) | |
272 | |
273 ;;; | |
274 ;;; ISO2022JP (JIS) | |
264 ;;; | 275 ;;; |
265 | 276 |
266 ;; NB: for now, we just check the sequence of <ESC> $ or <ESC> '('. | 277 ;; NB: for now, we just check the sequence of <ESC> $ or <ESC> '('. |
267 '(define-dfa jis | 278 '(define-dfa jis |
268 (init | 279 (init |
269 ((#x1b) esc 1.0) | 280 ((#x1b) esc 1.0) |
270 (((#x00 #x1a) (#x1c #x1f)) init 1.0) ;C0 | 281 (((#x00 #x1a) (#x1c #x1f)) init 1.0) ;C0 |
271 (((#x20 #x7f)) init 1.0) ;ASCII | 282 (((#x20 #x7f)) init 1.0) ;ASCII |
272 (((#xa1 #xdf)) init 0.7) ;JIS8bit kana | 283 (((#xa1 #xdf)) init 0.7)) ;JIS8bit kana |
273 ) | |
274 (esc | 284 (esc |
275 ((#x0d #x0a) init 0.9) ;cancel | 285 ((#x0d #x0a) init 0.9) ;cancel |
276 ((#\( ) esc-paren 1.0) | 286 ((#\( ) esc-paren 1.0) |
277 ((#\$ ) esc-$ 1.0) | 287 ((#\$ ) esc-$ 1.0) |
278 ((#\& ) esc-& 1.0) | 288 ((#\& ) esc-& 1.0)) |
279 ) | |
280 (esc-paren | 289 (esc-paren |
281 ((#\B #\J #\H) init 1.0) | 290 ((#\B #\J #\H) init 1.0) |
282 ((#\I) jis0201kana 0.8) | 291 ((#\I) jis0201kana 0.8)) |
283 ) | |
284 (esc-$ | 292 (esc-$ |
285 ((#\@ #\B) kanji 1.0) | 293 ((#\@ #\B) kanji 1.0) |
286 ((#\( ) esc-$-paren 1.0) | 294 ((#\( ) esc-$-paren 1.0)) |
287 ) | |
288 (esc-$-paren | 295 (esc-$-paren |
289 ((#\D #\O #\P) kanji 1.0)) | 296 ((#\D #\O #\P) kanji 1.0)) |
290 (esc-& | 297 (esc-& |
291 ((#\@ ) init 1.0)) | 298 ((#\@ ) init 1.0)) |
292 (jis0201kana | 299 (jis0201kana |
294 (((#x20 #x5f)) jis0201kana 1.0)) | 301 (((#x20 #x5f)) jis0201kana 1.0)) |
295 (kanji | 302 (kanji |
296 ((#x1b) esc 1.0) | 303 ((#x1b) esc 1.0) |
297 (((#x21 #x7e)) kanji-2 1.0)) | 304 (((#x21 #x7e)) kanji-2 1.0)) |
298 (kanji-2 | 305 (kanji-2 |
299 (((#x21 #x7e)) kanji 1.0)) | 306 (((#x21 #x7e)) kanji 1.0)) ) |
300 ) | |
301 | 307 |
302 ;;; | 308 ;;; |
303 ;;; Big5 | 309 ;;; Big5 |
304 ;;; | 310 ;;; |
305 | 311 |
306 (define-dfa big5 | 312 (define-dfa big5 |
307 ;; first byte | 313 ;; first byte |
308 (init | 314 (init |
309 (((#x00 #x7f)) init 1.0) ;ascii | 315 (((#x00 #x7f)) init 1.0) ;ascii |
310 (((#xa1 #xfe)) 2byte 1.0) ;big5-2byte | 316 (((#xa1 #xfe)) 2byte 1.0)) ;big5-2byte |
311 ) | |
312 (2byte | 317 (2byte |
313 (((#x40 #x7e) (#xa1 #xfe)) init 1.0)) | 318 (((#x40 #x7e) (#xa1 #xfe)) init 1.0))) |
314 ) | |
315 | 319 |
316 ;;; | 320 ;;; |
317 ;;; GB2312 (EUC-CN?) | 321 ;;; GB2312 (EUC-CN?) |
318 ;;; | 322 ;;; |
319 | 323 |
320 (define-dfa gb2312 | 324 (define-dfa gb2312 |
321 ;; first byte | 325 ;; first byte |
322 (init | 326 (init |
323 (((#x00 #x7f)) init 1.0) ;ascii | 327 (((#x00 #x7f)) init 1.0) ;ascii |
324 (((#xa1 #xfe)) 2byte 1.0) ;gb2312 2byte | 328 (((#xa1 #xfe)) 2byte 1.0)) ;gb2312 2byte |
325 ) | |
326 (2byte | 329 (2byte |
327 (((#xa1 #xfe)) init 1.0)) | 330 (((#xa1 #xfe)) init 1.0))) |
328 ) | |
329 | 331 |
330 ;;; | 332 ;;; |
331 ;;; GB18030 | 333 ;;; GB18030 |
332 ;;; | 334 ;;; |
333 | 335 |
334 (define-dfa gb18030 | 336 (define-dfa gb18030 |
335 ;; first byte | 337 ;; first byte |
336 (init | 338 (init |
337 (((#x00 #x80)) init 1.0) ;ascii | 339 (((#x00 #x80)) init 1.0) ;ascii |
338 (((#x81 #xfe)) 2byte 1.0) ;gb18030 2byte | 340 (((#x81 #xfe)) 2byte 1.0) ;gb18030 2byte |
339 (((#x81 #xfe)) 4byte2 1.0) ;gb18030 4byte | 341 (((#x81 #xfe)) 4byte2 1.0)) ;gb18030 4byte |
340 ) | |
341 (2byte | 342 (2byte |
342 (((#x40 #x7e) (#x80 #xfe)) init 1.0)) | 343 (((#x40 #x7e) (#x80 #xfe)) init 1.0)) |
343 (4byte2 | 344 (4byte2 |
344 (((#x30 #x39)) 4byte3 1.0)) | 345 (((#x30 #x39)) 4byte3 1.0)) |
345 (4byte3 | 346 (4byte3 |
346 (((#x81 #xfe)) 4byte4 1.0)) | 347 (((#x81 #xfe)) 4byte4 1.0)) |
347 (4byte4 | 348 (4byte4 |
348 (((#x30 #x39)) init 1.0)) | 349 (((#x30 #x39)) init 1.0)) ) |
349 ) | |
350 | 350 |
351 ;;; | 351 ;;; |
352 ;;; EUC-KR | 352 ;;; EUC-KR |
353 ;;; | 353 ;;; |
354 | 354 |
355 (define-dfa euck | 355 (define-dfa euck |
356 ;; first byte | 356 ;; first byte |
357 (init | 357 (init |
358 (((#x00 #x7f)) init 1.0) ; ASCII range | 358 (((#x00 #x7f)) init 1.0) ; ASCII range |
359 (((#xa1 #xfe)) ks1001 1.0) ; KSX 1001 | 359 (((#xa1 #xfe)) ks1001 1.0)) ; KSX 1001 |
360 ) | |
361 ;; ks x 1001 | 360 ;; ks x 1001 |
362 (ks1001 | 361 (ks1001 |
363 (((#xa1 #xfe)) init 1.0)) | 362 (((#xa1 #xfe)) init 1.0))) |
364 ) | |
365 | 363 |
366 ;;; | 364 ;;; |
367 ;;; Johab | 365 ;;; Johab |
368 ;;; | 366 ;;; |
369 | 367 |
370 (define-dfa johab | 368 (define-dfa johab |
371 ;; first byte | 369 ;; first byte |
372 (init | 370 (init |
373 (((#x00 #x7f)) init 1.0) ; ASCII range | 371 (((#x00 #x7f)) init 1.0) ; ASCII range |
374 (((#x84 #xd3)) jamo51 1.0) ; jamo51 | 372 (((#x84 #xd3)) jamo51 1.0) ; jamo51 |
375 (((#xd8 #xde) (#xe0 #xf9)) jamo42 0.95) ; jamo42 | 373 (((#xd8 #xde) (#xe0 #xf9)) jamo42 0.95)) ; jamo42 |
376 ) | |
377 ;; second byte | 374 ;; second byte |
378 (jamo51 | 375 (jamo51 |
379 (((#x41 #x7e) (#x81 #xfe)) init 1.0)) | 376 (((#x41 #x7e) (#x81 #xfe)) init 1.0)) |
380 (jamo42 | 377 (jamo42 |
381 (((#x31 #x7e) (#x91 #xfe)) init 1.0)) | 378 (((#x31 #x7e) (#x91 #xfe)) init 1.0))) |
382 ) | 379 |
383 | 380 |
381 | |
382 | |
383 | |
384 | |
385 ;;; | |
386 ;;; arabic | |
387 ;;; | |
388 | |
389 (define-dfa iso8859_6 | |
390 (init | |
391 (((#x00 #x7f)) init 1.0) ;ascii | |
392 ((#xa0) init 1.0) | |
393 ((#xa4) init 1.0) | |
394 ((#xac) init 1.0) | |
395 ((#xad) init 1.0) ;SHY xxx | |
396 ((#xbb) init 1.0) | |
397 ((#xbf) init 1.0) | |
398 (((#xc1 #xda)) init 1.0) | |
399 (((#xe0 #xf2)) init 1.0))) | |
400 | |
401 (define-dfa cp1256 | |
402 (init | |
403 (((#x00 #x7f)) init 1.0) ;ascii | |
404 (((#x80 #xff)) init 1.0))) ;high bit | |
405 | |
406 | |
407 ;;; | |
408 ;;; greek | |
409 ;;; | |
410 | |
411 (define-dfa iso8859_7 | |
412 (init | |
413 (((#x00 #x7f)) init 1.0) ;ascii | |
414 (((#xa0 #xad)) init 1.0) | |
415 (((#xaf #xd1)) init 1.0) | |
416 (((#xd3 #xfe)) init 1.0))) | |
417 | |
418 (define-dfa cp1253 | |
419 (init | |
420 (((#x00 #x7f)) init 1.0) ;ascii | |
421 ((#x80) init 1.0) | |
422 (((#x82 #x87)) init 1.0) | |
423 ((#x89) init 1.0) | |
424 ((#x8b) init 1.0) | |
425 (((#x91 #x97)) init 1.0) | |
426 ((#x99) init 1.0) | |
427 ((#x9b) init 1.0) | |
428 (((#xa0 #xa9)) init 1.0) | |
429 (((#xab #xd1)) init 1.0) | |
430 (((#xd3 #xfe)) init 1.0))) | |
431 | |
432 ;;; | |
433 ;;; hebrew | |
434 ;;; | |
435 | |
436 (define-dfa iso8859_8 | |
437 (init | |
438 (((#x00 #x7f)) init 1.0) ;ascii | |
439 ((#xa0) init 1.0) | |
440 (((#xa2 #xbe)) init 1.0) | |
441 (((#xdf #xfa)) init 1.0) | |
442 (((#xfd #xfe)) init 1.0))) | |
443 | |
444 (define-dfa cp1255 | |
445 (init | |
446 (((#x00 #x7f)) init 1.0) ;ascii | |
447 ((#x80) init 1.0) | |
448 (((#x82 #x89)) init 1.0) | |
449 ((#x8b) init 1.0) | |
450 (((#x91 #x99)) init 1.0) | |
451 ((#x9b) init 1.0) | |
452 (((#xa0 #xc9)) init 1.0) | |
453 (((#xcb #xd8)) init 1.0) | |
454 (((#xe0 #xfa)) init 1.0) | |
455 (((#xfd #xfe)) init 1.0))) | |
456 | |
457 ;;; | |
458 ;;; turkish | |
459 ;;; | |
460 | |
461 (define-dfa iso8859_9 | |
462 (init | |
463 (((#x00 #x7f)) init 1.0) ;ascii | |
464 (((#xa0 #xff)) init 1.0))) | |
465 | |
466 (define-dfa cp1254 | |
467 (init | |
468 (((#x00 #x7f)) init 1.0) ;ascii | |
469 ((#x80) init 1.0) | |
470 (((#x82 #x8c)) init 1.0) | |
471 (((#x91 #x9c)) init 1.0) | |
472 (((#x9f #xff)) init 1.0))) | |
473 |