comparison PubdicPlus/pod.c @ 0:bbc77ca4def5

initial import
author Yoshiki Yazawa <yaz@cc.rim.or.jp>
date Thu, 13 Dec 2007 04:30:14 +0900
parents
children 466fe6732d8d
comparison
equal deleted inserted replaced
-1:000000000000 0:bbc77ca4def5
1 /* Copyright 1994 Pubdic Project.
2 *
3 * Permission to use, copy, modify, distribute and sell this software
4 * and its documentation for any purpose is hereby granted without
5 * fee, provided that the above copyright notice appear in all copies
6 * and that both that copyright notice and this permission notice
7 * appear in supporting documentation, and that the name of Pubdic
8 * Project not be used in advertising or publicity pertaining to
9 * distribution of the software without specific, written prior
10 * permission. Pubdic Project makes no representations about the
11 * suitability of this software for any purpose. It is provided "as
12 * is" without express or implied warranty.
13 *
14 * PUBDIC PROJECT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
15 * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN
16 * NO EVENT SHALL PUBDIC PROJECT BE LIABLE FOR ANY SPECIAL, INDIRECT OR
17 * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
18 * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
19 * OTHER TORTUOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
20 * PERFORMANCE OF THIS SOFTWARE.
21 */
22
/* RCS identification string; left out when the source is run through lint. */
#if !defined (lint)
static char rcsid[] = "$Id: pod.c,v 1.7 2005/12/10 18:50:43 aonoto Exp $";
#endif /* !lint */
26
27 #ifdef HAVE_CONFIG_H
28 # include <config.h>
29 #endif
30
31 #include <stdio.h>
32 #if STDC_HEADERS
33 # include <stdlib.h>
34 # include <stddef.h>
35 # include <string.h>
36 #else
37 # if HAVE_MALLOC_H
38 # include <malloc.h>
39 # endif
40 # if HAVE_STRINGS_H
41 # include <strings.h>
42 # endif
43 #endif /* STDC_HEADERS */
44
/*
 * Wide character type used throughout this file.  POD_WCHAR is defined
 * right here, so the private `unsigned short' representation (and the
 * private conversion routines below) is what normally gets compiled; the
 * other branch maps Wchar onto the system wchar_t.
 */
#define POD_WCHAR
#if defined (POD_WCHAR)
typedef unsigned short Wchar;
#else /* !POD_WCHAR */
# include <locale.h>
# include <widec.h>
# define Wchar wchar_t
#endif /* POD_WCHAR */

/* Provide bzero() in terms of memset() on systems that only have the latter. */
#if !(HAVE_BZERO) && (HAVE_MEMSET)
# define bzero(a, c) memset(a, 0, c)
#endif
57
/* Base name of argv[0], used as the prefix of diagnostics. */
static char *program;

/* Run-time switches; most are set from the command line in parseargs(). */
static int compare;			/* set by main() when a second dictionary is read */
static int ignore_hinshi_to_compare;	/* -h (also implied by -x) */
static int sort_by_frequency;		/* -p */
static int merge_sj3;			/* -x */
static int merge_kind;			/* -m */
static int wnn_type_output;		/* -w (also set by -v, cleared by -i) */
static int canna_type_output;		/* -i (cleared by -w) */
static int sj3_type_output;		/* -v (cleared by -w) */
static int list_kinds;			/* -l */
static int copy_frequency;		/* -f */
static int extract_kana = 0;		/* -j */
static long specific_kind;		/* bits accumulated from -k arguments */

/* Input streams for the first and the second dictionary. */
static FILE *in1;
static FILE *in2;

/* File names and strings taken from option arguments. */
static char *common_out;		/* -c */
static char *old_out;			/* -o */
static char *new_out;			/* -n */
static char *hinshi_table;		/* -s or -r */
static char *bunrui;			/* -b */
static char *description_table;		/* -d */
static int selhinshi = 0;		/* -1, -2 or -3: which hinshi field to keep */

/* hinshi_direction */
#define INORDER 0
#define REVERSE 1

static int hinshi_direction = INORDER;	/* see above; -r selects REVERSE */
74
/* Size of the line buffers used when reading and writing text. */
#define READBUFSIZE 128

/*
 * Number of buckets in the dictionary and part-of-speech hash tables.
 * Both are powers of two so that (size - 1) can be used as an index mask.
 */
#define DICBUFSIZE (1 << 14)
#define DICBUFINDEXMASK (DICBUFSIZE - 1)
#define HINSHIBUFSIZE (1 << 14)
#define HINSHIBUFINDEXMASK (HINSHIBUFSIZE - 1)

/* status of intern() */
#define FOUND 0
#define CREATE 1
84
85 /* 品詞を表す構造体 */
86
87 struct hinshipack
88 {
89 int nhinshis;
90 Wchar *hinshi;
91 unsigned flags; /* see below */
92 struct hinshipack *next;
93 };
94
95 /* values of (struct hinshipack.)flags */
96 #define REPLACED 1
97
98 /* 終止形を追加するためのルールファイルの内部表現(だと思う) */
99
100 struct descpack
101 {
102 Wchar *hinshi, *tandesc, *yomdesc;
103 struct descpack *next;
104 };
105
106 /* エントリの種別を表す構造体その他 */
107
108 struct kindpack
109 {
110 Wchar *kind;
111 long kindbit;
112 };
113
114 /* 辞書を表す構造体 */
115
116 struct dicpack
117 {
118 Wchar *yomi, *tango;
119 struct hinshipack *hinshi;
120 int hindo;
121 long kind;
122 Wchar *extdata;
123 unsigned flags; /* SEE BELOW */
124 struct dicpack *next;
125 };
126
127 /* values of (struct dicpack.)flags */
128 #define COMMON 001
129 #define NEW 002
130
131 #if defined (__STDC__) || defined (_AIX) || (defined (__mips) && defined (_SYSTYPE_SVR4)) || defined(_WIN32)
132 /* Prototype for C89 (or later) */
133 #ifdef POD_WCHAR
134 size_t Mbstowcs (Wchar *d, char *ss, int n);
135 size_t Wcstombs (char *d, Wchar *s, int n);
136 int Wscmp (register Wchar *s1, register Wchar *s2);
137 Wchar *Wscpy (Wchar *d, register Wchar *s);
138 int Wslen (Wchar *s);
139 int Watoi (Wchar *s);
140 static void Fputws (Wchar *s, FILE *f);
141 Wchar *Fgetws (Wchar *buf, int siz, FILE *f);
142 #endif /* POD_WCHAR */
143
144 static int all_kana (Wchar *s);
145 static Wchar *findslash (Wchar *s);
146 static Wchar *extstr (Wchar *p, Wchar **pp, int *key_return);
147 static void malloc_failed (void);
148 static struct hinshipack *internhinshi (Wchar *str, int flag);
149 static void replace_hinshi (void);
150 static void select_hinshi (int n);
151 static void freedesc (struct descpack *p);
152 static struct descpack *interndesc (Wchar *hin, Wchar *tan, Wchar *yom);
153 static struct descpack *searchdesc (Wchar *hin);
154 static void store_description (void);
155 static long internkind (Wchar *s);
156 static void listkinds (void);
157 static int kindcompar (struct kindpack *k1, struct kindpack *k2);
158 static void sortkind (void);
159 static struct dicpack *intern (int key, Wchar *yomi, Wchar *kouho, Wchar *hinshi, int hindo, long kind, int *stat, long flags);
160 static void storepd (FILE *file);
161 static void comparepd (FILE *file);
162 static void canna_output (FILE *cf, struct dicpack *p, Wchar *h, int n);
163 static void entry_out (FILE *cf, struct dicpack *p, Wchar *h, int n, Wchar *ex);
164 static void printentry (FILE *cf, struct dicpack *p);
165 static void showentry (struct dicpack **pd, int n);
166 static int diccompar (struct dicpack **p1, struct dicpack **p2);
167 static int dichindocompar (struct dicpack **p1, struct dicpack **p2);
168 void shrinkargs (char **argv, int n, int count);
169 static void parseargs (int argc, char *argv[]);
170 #endif
171
172 #ifndef POD_WCHAR
173 # define Mbstowcs mbstowcs
174 # define Wcstombs wcstombs
175 # define Wscmp wscmp
176 # define Wscpy wscpy
177 # define Wslen wslen
178 # define Fgetws fgetws
179 # define Fputws fputws
180 #else
181 # define SS2 0x8e
182 # define SS3 0x8f
183 # define MSB 0x80
184 # define MSK 0x7f
185
186 # define WCG0 0x0000
187 # define WCG1 0x8080
188 # define WCG2 0x0080
189 # define WCG3 0x8000
190 # define WCMSK 0x8080
191
192 size_t
193 Mbstowcs (d, ss, n)
194 Wchar *d;
195 char *ss;
196 int n;
197 {
198 register Wchar *p = d;
199 register int ch;
200 register unsigned char *s = (unsigned char *) ss;
201
202 while ((ch = *s++) && (p - d < n))
203 {
204 if (ch & MSB)
205 {
206 if (ch == SS2)
207 { /* kana */
208 *p++ = (Wchar) * s++;
209 }
210 else if (ch == SS3)
211 {
212 *p++ = (Wchar) ((*s << 8) | (*(s + 1) & MSK));
213 s += 2;
214 }
215 else
216 {
217 *p++ = (Wchar) ((ch << 8) | (*s++ & 0xff));
218 }
219 }
220 else
221 {
222 *p++ = (Wchar) ch;
223 }
224 }
225 *p = (Wchar) 0;
226 return p - d;
227 }
228
229 size_t
230 Wcstombs (d, s, n)
231 char *d;
232 Wchar *s;
233 int n;
234 {
235 register char *p = d;
236 register Wchar ch;
237
238 while ((ch = *s++) && (p - d + 2 < n))
239 {
240 switch (ch & WCMSK)
241 {
242 case WCG0:
243 *p++ = (char) ch;
244 break;
245
246 case WCG1:
247 *p++ = (char) ((ch >> 8) & 0xff);
248 *p++ = (char) (ch & 0xff);
249 break;
250
251 case WCG2:
252 *p++ = SS2;
253 *p++ = (char) ch;
254 break;
255
256 case WCG3:
257 *p++ = SS3;
258 *p++ = (char) ((ch >> 8) & 0xff);
259 *p++ = (char) ((ch & 0xff) | MSB);
260 break;
261 }
262 }
263 *p = '\0';
264 return p - d;
265 }
266
267 int
268 Wscmp (s1, s2)
269 register Wchar *s1, *s2;
270 {
271 register int res;
272
273 /* 以下のコードはいささかトリッキーなので、説明を加えておこう。
274 以下ではこのコメント内にあるようなことをしたいわけである。
275
276 while (*s1 && *s2 && && *s1 == *s2) {
277 s1++; s2++;
278 }
279 return *s1 - *s2;
280
281 すなわち、s1 も s2 も EOS ('\0') を指していなくて、しかも値が
282 異なる間はそれぞれのポインタを進める。いずれかが EOS になるか、
283 値が違ってきた場合には、*s1 - *s2 を返す。
284 */
285
286 while (!(res = *s1 - *s2++) && *s1++)
287 ;
288 return res;
289 }
290
291 Wchar *
292 Wscpy (d, s)
293 Wchar *d;
294 register Wchar *s;
295 {
296 register Wchar *p = d, ch;
297
298 while (ch = *s++)
299 {
300 *p++ = ch;
301 }
302 *p = (Wchar) 0;
303 return d;
304 }
305
306 int
307 Wslen (s)
308 Wchar *s;
309 {
310 register Wchar *p = s;
311
312 while (*p)
313 p++;
314 return p - s;
315 }
316
317 int
318 Watoi (s)
319 Wchar *s;
320 {
321 register int res = 0;
322 register Wchar ch;
323
324 while ((ch = *s++) && ((Wchar) '0' <= ch) && (ch <= (Wchar) '9'))
325 {
326 res *= 10;
327 res += ch - (Wchar) '0';
328 }
329 return res;
330 }
331
332 static void
333 Fputws (s, f)
334 Wchar *s;
335 FILE *f;
336 {
337 char buf[READBUFSIZE];
338
339 if (Wcstombs (buf, s, READBUFSIZE))
340 {
341 (void) fputs (buf, f);
342 }
343 }
344
345 Wchar *
346 Fgetws (buf, siz, f)
347 Wchar *buf;
348 int siz;
349 FILE *f;
350 {
351 char mbuf[READBUFSIZE], *p;
352
353 p = fgets (mbuf, READBUFSIZE, f);
354 if (p)
355 {
356 if (Mbstowcs (buf, mbuf, siz))
357 {
358 return buf;
359 }
360 }
361 return (Wchar *) 0;
362 }
363 #endif
364
365 /* s が全てカタカナから構成されているかどうかを返す関数 */
366
367 static int
368 all_kana (s)
369 Wchar *s;
370 {
371 static Wchar xa = 0, xke, aa, *p;
372
373 if (!xa)
374 {
375 Mbstowcs (&xa, "\045\041", 1);
376 Mbstowcs (&xke, "\045\166", 1);
377 Mbstowcs (&aa, "\041\074", 1);
378 }
379
380 for (p = s; *p; p++)
381 {
382 if (!(*p == aa || (xa <= *p && *p <= xke)))
383 {
384 return 0;
385 }
386 }
387
388 return 1;
389 }
390
391 /* スラッシュを探す */
392
393 static Wchar *
394 findslash (s)
395 Wchar *s;
396 {
397 while (*s)
398 {
399 if (*s == (Wchar) '/')
400 {
401 return s;
402 }
403 s++;
404 }
405 return (Wchar *) 0;
406 }
407
408 /* トークンを一個取り出す */
409
410 static Wchar *
411 extstr (p, pp, key_return)
412 Wchar *p, **pp;
413 int *key_return;
414 {
415 Wchar *res;
416 int key = 0;
417
418 while (*p == (Wchar) ' ' || *p == (Wchar) '\t')
419 p++;
420 res = p;
421 while (*p && *p != (Wchar) ' ' && *p != (Wchar) '\t' && *p != (Wchar) '\n')
422 {
423 key += (int) *p++;
424 }
425 *p++ = (Wchar) '\0';
426 if (pp)
427 *pp = p;
428 if (key_return)
429 *key_return = key;
430 return res;
431 }
432
433 static struct hinshipack *partsofspeech[HINSHIBUFSIZE];
434
435 static void
436 malloc_failed ()
437 {
438 (void) fprintf (stderr, "%s: malloc failed.\n", program);
439 }
440
441 /* 品詞名を品詞名テーブルに登録する */
442
443 static struct hinshipack *
444 internhinshi (str, flag)
445 Wchar *str;
446 int flag;
447 {
448 struct hinshipack *p, **pp;
449 Wchar *s;
450 int key = 0;
451
452 for (s = str; *s; s++)
453 key += (int) *s;
454 key = ((unsigned) key & HINSHIBUFINDEXMASK);
455 for (pp = partsofspeech + key; p = *pp; pp = &(p->next))
456 {
457 if (!Wscmp (p->hinshi, str))
458 {
459 return p;
460 }
461 }
462 if (flag)
463 {
464 p = (struct hinshipack *) malloc (sizeof (struct hinshipack));
465 if (p)
466 {
467 *pp = p;
468 (void) bzero (p, sizeof (struct hinshipack));
469 p->hinshi = (Wchar *) malloc ((Wslen (str) + 1) * sizeof (Wchar));
470 if (p->hinshi)
471 {
472 (void) Wscpy (p->hinshi, str);
473 p->nhinshis = 1;
474 return p;
475 }
476 free (p);
477 }
478 malloc_failed ();
479 }
480 return (struct hinshipack *) 0;
481 }
482
483 /* 品詞名を置き換える */
484
485 static void
486 replace_hinshi ()
487 {
488 FILE *f;
489 Wchar readbuf[READBUFSIZE], *to, *from, *s;
490 struct hinshipack *hinshientry, *p;
491 int i, err = 0;
492
493 f = fopen (hinshi_table, "r");
494 if (!f)
495 {
496 (void) fprintf (stderr, "%s: can not open the table file of parts of speech \"%s\".\n", program, hinshi_table);
497 exit (1);
498 }
499 while (s = Fgetws (readbuf, READBUFSIZE, f))
500 {
501 from = extstr (s, &s, 0);
502 to = extstr (s, &s, 0);
503 if (hinshi_direction == REVERSE)
504 {
505 Wchar *xx = from;
506 from = to;
507 to = xx;
508 }
509
510 hinshientry = internhinshi (from, 0);
511 if (hinshientry)
512 {
513 Wchar *xx;
514
515 xx = (Wchar *) malloc ((Wslen (to) + 1) * sizeof (Wchar));
516 if (xx)
517 {
518 Wchar *cp;
519 int n = 1;
520
521 (void) Wscpy (xx, to);
522 free (hinshientry->hinshi);
523 hinshientry->hinshi = xx;
524 for (cp = xx; *cp; cp++)
525 {
526 if (*cp == (Wchar) '/')
527 {
528 *cp = (Wchar) 0;
529 n++;
530 }
531 }
532 hinshientry->nhinshis = n;
533 hinshientry->flags |= REPLACED;
534 }
535 else
536 {
537 malloc_failed ();
538 }
539 }
540 }
541 (void) fclose (f);
542
543 for (i = 0; i < HINSHIBUFSIZE; i++)
544 {
545 for (p = partsofspeech[i]; p; p = p->next)
546 {
547 if (!(p->flags & REPLACED))
548 {
549 (void) fprintf (stderr, "%s: The replacement for \"", program);
550 Fputws (p->hinshi, stderr);
551 (void) fprintf (stderr, "\" is not mentioned in the table.\n");
552 err = 1;
553 }
554 }
555 }
556 if (err)
557 {
558 exit (1);
559 }
560 }
561
562 static void
563 select_hinshi (n)
564 int n;
565 {
566 Wchar *s, *t, *xx;
567 struct hinshipack *p;
568 int i;
569
570 if (!n)
571 return;
572
573 for (i = 0; i < HINSHIBUFSIZE; i++)
574 {
575 for (p = partsofspeech[i]; p; p = p->next)
576 {
577 switch (n)
578 {
579 case 1:
580 s = findslash (p->hinshi);
581 if (s)
582 {
583 *s = (Wchar) 0;
584 }
585 break;
586
587 case 2:
588 s = findslash (p->hinshi);
589 if (s)
590 {
591 s++;
592 t = findslash (s);
593 if (t)
594 {
595 xx = (Wchar *) malloc ((t - s + 1) * sizeof (Wchar));
596 if (xx)
597 {
598 *t = (Wchar) 0;
599 Wscpy (xx, s);
600 t = p->hinshi;
601 p->hinshi = xx;
602 (void) free ((char *) t);
603 }
604 }
605 }
606 break;
607
608 case 3:
609 s = findslash (p->hinshi);
610 if (s)
611 {
612 t = findslash (s + 1);
613 if (t)
614 {
615 t++;
616 xx = (Wchar *) malloc ((Wslen (t) + 1) * sizeof (Wchar));
617 if (xx)
618 {
619 Wscpy (xx, t);
620 t = p->hinshi;
621 p->hinshi = xx;
622 (void) free ((char *) t);
623 }
624 }
625 }
626 break;
627
628 default:
629 break;
630 }
631 }
632 }
633 }
634
635 static void
636 freedesc (p)
637 struct descpack *p;
638 {
639 free (p->hinshi);
640 free (p->tandesc);
641 free (p->yomdesc);
642 free (p);
643 }
644
645 static struct descpack *description[HINSHIBUFSIZE];
646
647 /* ルールの登録 */
648
649 static struct descpack *
650 interndesc (hin, tan, yom)
651 Wchar *hin, *tan, *yom;
652 {
653 struct descpack *p, **pp, *next = (struct descpack *) 0;
654 Wchar *s;
655 int key = 0;
656
657 for (s = hin; *s; s++)
658 key += (int) *s;
659 key = ((unsigned) key & HINSHIBUFINDEXMASK);
660 for (pp = description + key; p = *pp; pp = &(p->next))
661 {
662 if (!Wscmp (p->hinshi, hin))
663 {
664 if (!Wscmp (p->tandesc, tan) && !Wscmp (p->yomdesc, yom))
665 {
666 return p;
667 }
668 else
669 {
670 *pp = next = p->next;
671 freedesc (p);
672 break;
673 }
674 }
675 }
676 p = (struct descpack *) malloc (sizeof (struct descpack));
677 if (p)
678 {
679 *pp = p;
680 (void) bzero (p, sizeof (struct descpack));
681 p->next = next;
682 p->hinshi = (Wchar *) malloc ((Wslen (hin) + 1) * sizeof (Wchar));
683 if (p->hinshi)
684 {
685 (void) Wscpy (p->hinshi, hin);
686 p->tandesc = (Wchar *) malloc ((Wslen (tan) + 1) * sizeof (Wchar));
687 if (p->tandesc)
688 {
689 (void) Wscpy (p->tandesc, tan);
690 p->yomdesc = (Wchar *) malloc ((Wslen (yom) + 1) * sizeof (Wchar));
691 if (p->yomdesc)
692 {
693 (void) Wscpy (p->yomdesc, yom);
694 return p;
695 }
696 free (p->tandesc);
697 }
698 free (p->hinshi);
699 }
700 free (p);
701 }
702 malloc_failed ();
703 return (struct descpack *) 0;
704 }
705
706 /* ルールの探索 */
707
708 static struct descpack *
709 searchdesc (hin)
710 Wchar *hin;
711 {
712 struct descpack *p, **pp;
713 Wchar *s;
714 int key = 0;
715
716 for (s = hin; *s; s++)
717 key += (int) *s;
718 key = ((unsigned) key & HINSHIBUFINDEXMASK);
719 for (pp = description + key; p = *pp; pp = &(p->next))
720 {
721 if (!Wscmp (p->hinshi, hin))
722 {
723 return p;
724 }
725 }
726 return (struct descpack *) 0;
727 }
728
729 static void
730 store_description ()
731 {
732 FILE *f;
733 Wchar readbuf[READBUFSIZE], *hin, *tan, *yom, *s;
734
735 if (!description_table)
736 {
737 return;
738 }
739
740 f = fopen (description_table, "r");
741 if (!f)
742 {
743 (void) fprintf (stderr, "%s: can not open the table file of parts of speech \"%s\".\n", program, description_table);
744 exit (1);
745 }
746 while (s = Fgetws (readbuf, READBUFSIZE, f))
747 {
748 Wchar nl[1];
749
750 nl[0] = (Wchar) 0;
751 hin = tan = yom = nl;
752 hin = extstr (s, &s, 0);
753 if (*hin)
754 {
755 tan = extstr (s, &s, 0);
756 if (*tan)
757 {
758 yom = extstr (s, &s, 0);
759 }
760 }
761
762 interndesc (hin, tan, yom);
763 }
764 (void) fclose (f);
765 }
766
767 struct kindpack kinds[sizeof (long) * 8];
768 static int nkinds;
769
770 #define KIHONBIT 1L
771
772 /* 種別の登録 */
773
774 static long
775 internkind (s)
776 Wchar *s;
777 {
778 int i;
779 Wchar *p;
780
781 p = findslash (s);
782 if (p)
783 {
784 long res;
785
786 *p = (Wchar) '\0';
787 res = internkind (s);
788 res |= internkind (p + 1);
789 return res;
790 }
791 else
792 {
793 for (i = 0; i < nkinds; i++)
794 {
795 if (!Wscmp (s, kinds[i].kind))
796 {
797 return kinds[i].kindbit;
798 }
799 }
800 if (nkinds < (sizeof (long) * 8) && (kinds[nkinds].kind = (Wchar *) malloc ((Wslen (s) + 1) * sizeof (Wchar))))
801 {
802 (void) Wscpy (kinds[nkinds].kind, s);
803 kinds[nkinds].kindbit = 1 << nkinds;
804 return kinds[nkinds++].kindbit;
805 }
806 return 0;
807 }
808 }
809
810 /* 種別の一覧の出力 */
811
812 static void
813 listkinds ()
814 {
815 int i;
816
817 for (i = 0; i < nkinds; i++)
818 {
819 Fputws (kinds[i].kind, stdout);
820 putchar ('\n');
821 }
822 }
823
824 static int
825 kindcompar (k1, k2)
826 struct kindpack *k1, *k2;
827 {
828 return Wscmp (k1->kind, k2->kind);
829 }
830
831 static void
832 sortkind ()
833 {
834 qsort (kinds, nkinds, sizeof (struct kindpack), kindcompar);
835 }
836
837 static struct dicpack *dic[DICBUFSIZE], **pdic;
838 static int ndicentries = 0;
839
840 /*
841
842 intern -- 辞書エントリの検索/登録
843
844 第6引数の stat としてヌルでないアドレスが指定された場合には、同じエントリ
845 が登録されていない場合には登録を行う。アドレスがヌルの場合には登録しない。
846
847 flags によっていろいろと指定をする。(以下を見てね)。
848
849 hinshi に 0 を渡してはいけない。kind は 0 を渡しても可だが、-m の時じゃない
850 マッチはしないので注意。
851
852 */
853
854 /* flags */
855 #define IGNORE_HINSHI 1L
856 #define IGNORE_KIND 2L
857
858 static struct dicpack *
859 intern (key, yomi, kouho, hinshi, hindo, kind, stat, flags)
860 int key, hindo, *stat;
861 Wchar *yomi, *kouho, *hinshi;
862 long kind, flags;
863 {
864 struct dicpack *p, **pp;
865 struct descpack *dp;
866 Wchar nl[1], *yomdesc = nl, *tandesc = nl;
867 Wchar *yom = (Wchar *) 0, *tan = (Wchar *) 0, *dhinshi, *dh;
868
869 nl[0] = (Wchar) '\0';
870
871 if (description_table)
872 {
873 dhinshi = dh = hinshi; /* かんなの品詞を探す */
874 while (*dh)
875 {
876 if (*dh++ == (Wchar) '/')
877 {
878 dhinshi = dh;
879 }
880 }
881 dp = searchdesc (dhinshi);
882 if (dp)
883 {
884 yomdesc = dp->yomdesc;
885 tandesc = dp->tandesc;
886 if (Wslen (yomdesc))
887 {
888 Wchar *t;
889 t = (Wchar *) malloc ((Wslen (yomi) + Wslen (yomdesc) + 1) * sizeof (Wchar));
890 if (t)
891 {
892 Wscpy (t, yomi);
893 yom = yomi = t;
894 Wscpy (yomi + Wslen (yomi), yomdesc);
895 }
896 }
897 if (Wslen (tandesc))
898 {
899 Wchar *t;
900 t = (Wchar *) malloc ((Wslen (kouho) + Wslen (tandesc) + 1) * sizeof (Wchar));
901 if (t)
902 {
903 Wscpy (t, kouho);
904 tan = kouho = t;
905 Wscpy (kouho + Wslen (kouho), tandesc);
906 }
907 }
908 }
909 else
910 {
911 char foo[64];
912
913 fprintf (stderr, "no description rule for ");
914 Wcstombs (foo, dhinshi, 64);
915 fprintf (stderr, "%s.\n", foo);
916 }
917 }
918
919 key = ((unsigned) key & DICBUFINDEXMASK);
920 for (pp = dic + key; p = *pp; pp = &(p->next))
921 {
922 if (!Wscmp (p->yomi, yomi) && !Wscmp (p->tango, kouho) && ((flags & IGNORE_HINSHI) || !Wscmp (p->hinshi->hinshi, hinshi)) && ((flags & IGNORE_KIND) || ((p->kind & kind) == kind)))
923 {
924 /* match */
925 if (stat)
926 *stat = FOUND;
927 if (yom)
928 free (yom);
929 if (tan)
930 free (tan);
931 return p;
932 }
933 }
934 if (stat)
935 {
936 p = (struct dicpack *) malloc (sizeof (struct dicpack));
937 if (p)
938 {
939 *pp = p;
940 (void) bzero (p, sizeof (struct dicpack));
941 p->yomi = (Wchar *) malloc ((Wslen (yomi) + 1) * sizeof (Wchar));
942 if (p->yomi)
943 {
944 (void) Wscpy (p->yomi, yomi);
945 p->tango = (Wchar *) malloc ((Wslen (kouho) + 1) * sizeof (Wchar));
946 if (p->tango)
947 {
948 (void) Wscpy (p->tango, kouho);
949 p->hinshi = internhinshi (hinshi, 1);
950 if (p->hinshi)
951 {
952 p->hindo = hindo;
953 *stat = CREATE;
954 ndicentries++;
955 p->kind = kind;
956 p->extdata = (Wchar *) 0;
957 if (yom)
958 free (yom);
959 if (tan)
960 free (tan);
961 return p;
962 }
963 free (p->tango);
964 }
965 free (p->yomi);
966 }
967 free (p);
968 }
969 malloc_failed ();
970 }
971 if (yom)
972 free (yom);
973 if (tan)
974 free (tan);
975 return (struct dicpack *) 0;
976 }
977
978 /* 登録されているエントリに対して fn を実行する */
979
980 static void
981 for_all_interned (fn)
982 void (*fn) ();
983 {
984 int i;
985 struct dicpack *p;
986
987 for (i = 0; i < DICBUFSIZE; i++)
988 {
989 for (p = dic[i]; p; p = p->next)
990 {
991 (*fn) (p);
992 }
993 }
994 }
995
996 static void
997 storepd (file)
998 FILE *file;
999 {
1000 Wchar readbuf[READBUFSIZE], *p, *yomi, *hinshi, *kouho, *hindo, *kind;
1001 int nhindo, key, tkey, stat;
1002 long kindbit;
1003 struct dicpack *dicentry;
1004
1005 while (p = Fgetws (readbuf, READBUFSIZE, file))
1006 {
1007 key = 0;
1008 yomi = extstr (p, &p, &tkey);
1009 key += tkey;
1010 kouho = extstr (p, &p, &tkey);
1011 key += tkey;
1012 hinshi = extstr (p, &p, 0);
1013 hindo = extstr (p, &p, 0);
1014 nhindo = Watoi (hindo);
1015
1016 kind = extstr (p, 0, 0);
1017 if (*kind)
1018 {
1019 kindbit = internkind (kind);
1020 }
1021 else
1022 {
1023 kindbit = KIHONBIT;
1024 }
1025
1026 dicentry = intern (key, yomi, kouho, hinshi, nhindo, kindbit, &stat, IGNORE_KIND);
1027 if (dicentry)
1028 {
1029 dicentry->kind |= kindbit;
1030 }
1031 }
1032 }
1033
1034 static void
1035 comparepd (file)
1036 FILE *file;
1037 {
1038 Wchar readbuf[READBUFSIZE], *p, *yomi, *hinshi, *kouho, *hindo, *kind;
1039 int nhindo, key, tkey, stat, *statp = &stat;
1040 struct dicpack *dicentry;
1041 long kindbit, flags = 0L;
1042
1043 while (p = Fgetws (readbuf, READBUFSIZE, file))
1044 {
1045 key = 0;
1046 yomi = extstr (p, &p, &tkey);
1047 key += tkey;
1048 kouho = extstr (p, &p, &tkey);
1049 key += tkey;
1050 hinshi = extstr (p, &p, 0);
1051 if (ignore_hinshi_to_compare)
1052 {
1053 flags |= IGNORE_HINSHI;
1054 }
1055 hindo = extstr (p, &p, 0);
1056 nhindo = Watoi (hindo);
1057
1058 kind = extstr (p, 0, 0);
1059 if (*kind)
1060 {
1061 kindbit = internkind (kind);
1062 }
1063 else
1064 {
1065 kindbit = KIHONBIT;
1066 }
1067 if (merge_kind || merge_sj3)
1068 {
1069 flags |= IGNORE_KIND;
1070 }
1071 if (copy_frequency)
1072 {
1073 statp = (int *) 0;
1074 }
1075
1076 dicentry = intern (key, yomi, kouho, hinshi, nhindo, kindbit, statp, flags);
1077
1078 if (dicentry)
1079 {
1080 if (copy_frequency)
1081 {
1082 dicentry->hindo = nhindo;
1083 dicentry->flags &= ~COMMON;
1084 }
1085 else if (ignore_hinshi_to_compare && stat == FOUND)
1086 {
1087 /* この場合、同じキーのチェーンが返る */
1088 struct dicpack *pd;
1089
1090 for (pd = dicentry; pd; pd = pd->next)
1091 {
1092 if (!Wscmp (pd->yomi, yomi) && !Wscmp (pd->tango, kouho))
1093 {
1094 pd->flags |= COMMON;
1095 if (!merge_sj3)
1096 {
1097 pd->kind |= kindbit;
1098 }
1099
1100 if (merge_sj3)
1101 {
1102 int len = 0;
1103 Wchar *dat;
1104
1105 if (pd->extdata)
1106 {
1107 len = Wslen (pd->extdata);
1108 }
1109 dat = (Wchar *) malloc ((Wslen (hinshi) + 1 + len) * sizeof (Wchar));
1110 if (dat)
1111 {
1112 if (len)
1113 {
1114 (void) Wscpy (dat, pd->extdata);
1115 (void) free ((char *) pd->extdata);
1116 }
1117 (void) Wscpy (dat + len, hinshi);
1118 pd->extdata = dat;
1119 }
1120 }
1121 }
1122 }
1123 }
1124 else
1125 {
1126 dicentry->kind |= kindbit;
1127 if (stat == FOUND)
1128 {
1129 dicentry->flags |= COMMON;
1130 }
1131 else
1132 { /* CREATE */
1133 dicentry->flags |= NEW;
1134 }
1135 }
1136 }
1137 }
1138 }
1139
1140 static void
1141 canna_output (cf, p, h, n)
1142 FILE *cf;
1143 struct dicpack *p;
1144 Wchar *h;
1145 int n;
1146 {
1147 for (; n-- > 0; h += Wslen (h) + 1)
1148 {
1149 Fputws (p->yomi, cf);
1150 (void) putc (' ', cf);
1151 Fputws (h, cf);
1152 if (p->hindo)
1153 {
1154 (void) fprintf (cf, "*%d", p->hindo);
1155 }
1156 (void) putc (' ', cf);
1157 Fputws (p->tango, cf);
1158 (void) putc ('\n', cf);
1159 }
1160 }
1161
1162 static void
1163 entry_out (cf, p, h, n, ex)
1164 FILE *cf;
1165 struct dicpack *p;
1166 Wchar *h;
1167 int n;
1168 Wchar *ex;
1169 {
1170 int i, f = 1;
1171 long b;
1172
1173 for (; n-- > 0; h += Wslen (h) + 1)
1174 {
1175 Fputws (p->yomi, cf);
1176 (void) putc (' ', cf);
1177 Fputws (p->tango, cf);
1178 (void) putc (' ', cf);
1179 if (merge_sj3 && ex)
1180 {
1181 Fputws (ex, cf);
1182 (void) putc ('/', cf);
1183 }
1184 Fputws (h, cf);
1185 if (!sj3_type_output)
1186 {
1187 (void) fprintf (cf, " %d", p->hindo);
1188 }
1189
1190 if (!wnn_type_output)
1191 {
1192 if (bunrui)
1193 {
1194 (void) printf (" %s", bunrui);
1195 }
1196 else
1197 {
1198 if (specific_kind)
1199 {
1200 b = (specific_kind & p->kind);
1201 }
1202 else
1203 {
1204 b = p->kind;
1205 }
1206 if (b != KIHONBIT)
1207 { /* 基本だけだったら何も書かない */
1208 for (i = 0; i < nkinds; i++)
1209 {
1210 if (b & kinds[i].kindbit)
1211 {
1212 if (f)
1213 {
1214 (void) putc (' ', cf);
1215 f = 0;
1216 }
1217 else
1218 {
1219 (void) putc ('/', cf);
1220 }
1221 Fputws (kinds[i].kind, cf);
1222 }
1223 }
1224 }
1225 }
1226 }
1227 (void) putc ('\n', cf);
1228 }
1229 }
1230
1231 /* p で表されるエントリをファイル cf に出力する */
1232
1233 static void
1234 printentry (cf, p)
1235 FILE *cf;
1236 struct dicpack *p;
1237 {
1238 if (specific_kind && !(p->kind & specific_kind))
1239 {
1240 return;
1241 }
1242
1243 if (extract_kana && !all_kana (p->tango))
1244 {
1245 return;
1246 }
1247
1248 if (selhinshi && !p->hinshi->hinshi[0])
1249 {
1250 return;
1251 }
1252
1253 if (canna_type_output)
1254 {
1255 canna_output (cf, p, p->hinshi->hinshi, p->hinshi->nhinshis);
1256 }
1257 else
1258 {
1259 entry_out (cf, p, p->hinshi->hinshi, p->hinshi->nhinshis, p->extdata);
1260 }
1261 }
1262
1263 static void
1264 showdeleted (p)
1265 struct dicpack *p;
1266 {
1267 if (!(p->flags & COMMON))
1268 {
1269 (void) printf ("- ");
1270 printentry (stdout, p);
1271 }
1272 }
1273
1274 static void
1275 showentry (pd, n)
1276 struct dicpack **pd;
1277 int n;
1278 {
1279 FILE *cf = (FILE *) 0, *of = (FILE *) 0, *nf = (FILE *) 0;
1280 struct dicpack *p;
1281 int i;
1282
1283 if (common_out)
1284 {
1285 if (common_out[0] != '-' || common_out[1])
1286 {
1287 cf = fopen (common_out, "w");
1288 if (!cf)
1289 {
1290 (void) fprintf (stderr, "%s: can not open file \"%s\".\n", program, common_out);
1291 exit (1);
1292 }
1293 }
1294 else
1295 {
1296 cf = stdout;
1297 }
1298 }
1299 if (old_out)
1300 {
1301 if (old_out[0] != '-' || old_out[1])
1302 {
1303 of = fopen (old_out, "w");
1304 if (!of)
1305 {
1306 (void) fprintf (stderr, "%s: can not open file \"%s\".\n", program, old_out);
1307 exit (1);
1308 }
1309 }
1310 else
1311 {
1312 of = stdout;
1313 }
1314 }
1315 if (new_out)
1316 {
1317 if (new_out[0] != '-' || new_out[1])
1318 {
1319 nf = fopen (new_out, "w");
1320 if (!nf)
1321 {
1322 (void) fprintf (stderr, "%s: can not open file \"%s\".\n", program, new_out);
1323 exit (1);
1324 }
1325 }
1326 else
1327 {
1328 nf = stdout;
1329 }
1330 }
1331
1332 for (i = 0; i < n; i++)
1333 {
1334 p = pd[i];
1335 if (compare)
1336 {
1337 if (p->flags & COMMON)
1338 {
1339 if (cf)
1340 {
1341 printentry (cf, p);
1342 }
1343 }
1344 else if (p->flags & NEW)
1345 {
1346 if (nf)
1347 {
1348 printentry (nf, p);
1349 }
1350 }
1351 else
1352 {
1353 if (of)
1354 {
1355 printentry (of, p);
1356 }
1357 }
1358 }
1359 else
1360 { /* just print the normalized dictionary */
1361 printentry (stdout, p);
1362 }
1363 }
1364 }
1365
1366 static int
1367 diccompar (p1, p2)
1368 struct dicpack **p1, **p2;
1369 {
1370 int n;
1371 if (n = Wscmp ((*p1)->yomi, (*p2)->yomi))
1372 {
1373 return n;
1374 }
1375 else if (n = Wscmp ((*p1)->tango, (*p2)->tango))
1376 {
1377 return n;
1378 }
1379 else if (n = Wscmp ((*p1)->hinshi->hinshi, (*p2)->hinshi->hinshi))
1380 {
1381 return n;
1382 }
1383 else
1384 { /* impossible */
1385 return 0;
1386 }
1387 }
1388
1389 static int
1390 dichindocompar (p1, p2)
1391 struct dicpack **p1, **p2;
1392 {
1393 int n;
1394 if (n = Wscmp ((*p1)->yomi, (*p2)->yomi))
1395 {
1396 return n;
1397 }
1398 else if (n = ((*p2)->hindo - (*p1)->hindo))
1399 {
1400 return n;
1401 }
1402 else if (n = Wscmp ((*p1)->tango, (*p2)->tango))
1403 {
1404 return n;
1405 }
1406 else if (n = Wscmp ((*p1)->hinshi->hinshi, (*p2)->hinshi->hinshi))
1407 {
1408 return n;
1409 }
1410 else
1411 { /* impossible */
1412 return 0;
1413 }
1414 }
1415
/*
 * shrinkargs -- remove the first N elements of the COUNT-element vector
 * ARGV by moving the remaining elements to the front.  The elements
 * behind the moved ones are left as they are.
 */
void
shrinkargs (argv, n, count)
     char **argv;
     int n, count;
{
  int src;

  for (src = n; src < count; src++)
    {
      argv[src - n] = argv[src];
    }
}
1428
1429 static void
1430 parseargs (argc, argv)
1431 int argc;
1432 char *argv[];
1433 {
1434 int i;
1435
1436 for (program = argv[0] + strlen (argv[0]); argv[0] < program; program--)
1437 {
1438 if (program[0] == '/')
1439 {
1440 program++;
1441 break;
1442 }
1443 }
1444
1445 for (i = 1; i < argc;)
1446 {
1447 if (argv[i][0] == '-' && argv[i][2] == '\0')
1448 {
1449 switch (argv[i][1])
1450 {
1451 case '1':
1452 case '2':
1453 case '3':
1454 selhinshi = argv[i][1] - '0';
1455 shrinkargs (argv + i, 1, argc - i);
1456 argc -= 1;
1457 break;
1458
1459 case 'b':
1460 bunrui = argv[i + 1];
1461 shrinkargs (argv + i, 2, argc - i);
1462 argc -= 2;
1463 break;
1464
1465 case 'c':
1466 common_out = argv[i + 1];
1467 shrinkargs (argv + i, 2, argc - i);
1468 argc -= 2;
1469 break;
1470
1471 case 'd':
1472 description_table = argv[i + 1];
1473 shrinkargs (argv + i, 2, argc - i);
1474 argc -= 2;
1475 break;
1476
1477 case 'f':
1478 copy_frequency = 1;
1479 shrinkargs (argv + i, 1, argc - i);
1480 argc -= 1;
1481 break;
1482
1483 case 'h':
1484 ignore_hinshi_to_compare = 1;
1485 shrinkargs (argv + i, 1, argc - i);
1486 argc -= 1;
1487 break;
1488
1489 case 'i':
1490 canna_type_output = 1;
1491 wnn_type_output = 0;
1492 shrinkargs (argv + i, 1, argc - i);
1493 argc -= 1;
1494 break;
1495
1496 case 'j':
1497 extract_kana = 1;
1498 shrinkargs (argv + i, 1, argc - i);
1499 argc -= 1;
1500 break;
1501
1502 case 'k':
1503 {
1504 Wchar buf[READBUFSIZE];
1505
1506 (void) Mbstowcs (buf, argv[i + 1], READBUFSIZE);
1507 specific_kind |= internkind (buf);
1508 }
1509 shrinkargs (argv + i, 2, argc - i);
1510 argc -= 2;
1511 break;
1512
1513 case 'l':
1514 list_kinds = 1;
1515 shrinkargs (argv + i, 1, argc - i);
1516 argc -= 1;
1517 break;
1518
1519 case 'm':
1520 merge_kind = 1;
1521 shrinkargs (argv + i, 1, argc - 1);
1522 argc -= 1;
1523 break;
1524
1525 case 'n':
1526 new_out = argv[i + 1];
1527 shrinkargs (argv + i, 2, argc - i);
1528 argc -= 2;
1529 break;
1530
1531 case 'o':
1532 old_out = argv[i + 1];
1533 shrinkargs (argv + i, 2, argc - i);
1534 argc -= 2;
1535 break;
1536
1537 case 'p':
1538 sort_by_frequency = 1;
1539 shrinkargs (argv + i, 1, argc - i);
1540 argc -= 1;
1541 break;
1542
1543 case 'r':
1544 hinshi_table = argv[i + 1];
1545 shrinkargs (argv + i, 2, argc - i);
1546 argc -= 2;
1547 hinshi_direction = REVERSE;
1548 break;
1549
1550 case 's':
1551 hinshi_table = argv[i + 1];
1552 shrinkargs (argv + i, 2, argc - i);
1553 argc -= 2;
1554 break;
1555
1556 case 'v':
1557 sj3_type_output = 1;
1558 wnn_type_output = 1; /* Wnn 形式と似ているので立てる */
1559 shrinkargs (argv + i, 1, argc - i);
1560 argc -= 1;
1561 break;
1562
1563 case 'w':
1564 canna_type_output = 0;
1565 sj3_type_output = 0;
1566 wnn_type_output = 1;
1567 shrinkargs (argv + i, 1, argc - i);
1568 argc -= 1;
1569 break;
1570
1571 case 'x':
1572 merge_sj3 = 1;
1573 ignore_hinshi_to_compare = 1;
1574 shrinkargs (argv + i, 1, argc - i);
1575 argc -= 1;
1576 break;
1577
1578 default:
1579 i++;
1580 break;
1581 }
1582 }
1583 else
1584 {
1585 i++;
1586 }
1587 }
1588
1589 if (argc < 2)
1590 {
1591 (void) fprintf (stderr, "Usage: %s dic1 [dic2] [-c filecommon] ...\n", program);
1592 exit (1);
1593 }
1594
1595 if (argv[1][0] != '-' || argv[1][1])
1596 {
1597 in1 = fopen (argv[1], "r");
1598 if (!in1)
1599 {
1600 (void) fprintf (stderr, "%s: can not open file \"%s\".\n", program, argv[1]);
1601 exit (1);
1602 }
1603 }
1604 if (argc == 3)
1605 {
1606 if (argv[2][0] != '-' || argv[2][1])
1607 {
1608 in2 = fopen (argv[2], "r");
1609 if (!in2)
1610 {
1611 (void) fprintf (stderr, "%s: can not open file \"%s\".\n", program, argv[2]);
1612 exit (1);
1613 }
1614 }
1615 }
1616 else
1617 {
1618 in2 = (FILE *) 0;
1619 }
1620 if (description_table)
1621 {
1622 store_description ();
1623 }
1624 }
1625
1626 static Wchar kihonh[] = {
1627 (Wchar) 'k', (Wchar) 'i', (Wchar) 'h', (Wchar) 'o', (Wchar) 'n', (Wchar) 0,
1628 };
1629
1630 int
1631 main (argc, argv)
1632 int argc;
1633 char *argv[];
1634 {
1635 #ifndef POD_WCHAR
1636 setlocale (LC_ALL, "");
1637 #endif
1638
1639 in1 = in2 = stdin;
1640 (void) internkind (kihonh); /* 基本辞書用。1L として登録 */
1641 parseargs (argc, argv);
1642 storepd (in1);
1643 (void) fclose (in1);
1644
1645 if (in2)
1646 {
1647 compare = 1;
1648 comparepd (in2);
1649 (void) fclose (in2);
1650 }
1651
1652 if (list_kinds)
1653 {
1654 listkinds ();
1655 exit (0);
1656 }
1657
1658 if (selhinshi)
1659 {
1660 select_hinshi (selhinshi);
1661 }
1662 else if (hinshi_table)
1663 {
1664 replace_hinshi ();
1665 }
1666
1667 pdic = (struct dicpack **) malloc (ndicentries * sizeof (struct dicpack *));
1668 if (pdic)
1669 {
1670 int i, j;
1671 struct dicpack *p;
1672
1673 for (i = 0, j = 0; i < DICBUFSIZE; i++)
1674 {
1675 for (p = dic[i]; p; p = p->next)
1676 {
1677 pdic[j++] = p;
1678 }
1679 }
1680 if (sort_by_frequency)
1681 {
1682 qsort (pdic, ndicentries, sizeof (struct dicpack *), dichindocompar);
1683 }
1684 else
1685 {
1686 qsort (pdic, ndicentries, sizeof (struct dicpack *), diccompar);
1687 }
1688 sortkind ();
1689 showentry (pdic, ndicentries);
1690 }
1691 else
1692 {
1693 malloc_failed ();
1694 }
1695 exit (0);
1696 }