Edinburgh Speech Tools 2.4-release
 
Loading...
Searching...
No Matches
scfg_train_main.cc
1/*************************************************************************/
2/* */
3/* Centre for Speech Technology Research */
4/* University of Edinburgh, UK */
5/* Copyright (c) 1996,1997 */
6/* All Rights Reserved. */
7/* */
8/* Permission is hereby granted, free of charge, to use and distribute */
9/* this software and its documentation without restriction, including */
10/* without limitation the rights to use, copy, modify, merge, publish, */
11/* distribute, sublicense, and/or sell copies of this work, and to */
12/* permit persons to whom this work is furnished to do so, subject to */
13/* the following conditions: */
14/* 1. The code must retain the above copyright notice, this list of */
15/* conditions and the following disclaimer. */
16/* 2. Any modifications must be clearly marked as such. */
17/* 3. Original authors' names are not deleted. */
18/* 4. The authors' names are not used to endorse or promote products */
19/* derived from this software without specific prior written */
20/* permission. */
21/* */
22/* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25/* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30/* THIS SOFTWARE. */
31/* */
32/*************************************************************************/
33/* Author : Alan W Black */
34/* Date : October 1997 */
35/*-----------------------------------------------------------------------*/
36/* Train a stochastic context free grammar with respect to a given */
37/* corpus. */
38/* */
39/* Only the inside/outside algorithm (with bracketing) is supported */
40/* */
41/* */
42/*=======================================================================*/
43#include <cstdlib>
44#include <cstdio>
45#include <iostream>
46#include <fstream>
47#include <cstring>
48#include "EST_cmd_line.h"
49#include "EST_SCFG.h"
50#include "siod.h"
51
52static EST_String outfile = "-";
53
54
55static int scfg_train_main(int argc, char **argv);
56
57/** @name <command>scfg_train</command> <emphasis>Train the parameters of a stochastic context free grammar</emphasis>
58 @id scfg-make-manual
59 * @toc
60 */
61
62//@{
63
64
65/**@name Synopsis
66 */
67//@{
68
69//@synopsis
70
71/**
72
73scfg_train takes a stochastic context free grammar (SCFG) and trains
74the probabilities with respect to a given bracket corpus using the
75inside-outside algorithm. This is basically an implementation
76of Pereira and Schabes 1992.
77
78Note that using this program properly may require months of CPU time.
79
80 */
81
82//@}
83
84/**@name OPTIONS
85 */
86//@{
87
88
89//@options
90
91//@}
92
93
94int main(int argc, char **argv)
95{
96
97 scfg_train_main(argc,argv);
98
99 exit(0);
100 return 0;
101}
102
103static int scfg_train_main(int argc, char **argv)
104{
105 // Top level function generates a probabilistic grammar
106 EST_Option al;
107 EST_StrList files;
108 int spread;
109
110 parse_command_line
111 (argc, argv,
112 EST_String("[options\n")+
113 "Summary: Train a stochastic context free grammar from a (bracketed) corpus\n"+
114 "-grammar <ifile> Grammar file, one rule per line.\n"+
115 "-corpus <ifile> Corpus file, one bracketed sentence per line.\n"+
116 "-method <string> {inout}\n"+
117 " Method for training: inout.\n"+
118 "-passes <int> {50}\n"+
119 " Number of training passes.\n"+
120 "-startpass <int> {0}\n"+
121 " Starting at pass N.\n"+
122 "-spread <int> Spread training data over N passes.\n"+
123 "-checkpoint <int> Save grammar every N passes\n"+
124 "-heap <int> {210000}\n"+
125 " Set size of Lisp heap, needed for large corpora\n"+
126 "-o <ofile> Output file for trained grammar.\n",
127 files, al);
128
129 if (al.present("-o"))
130 outfile = al.val("-o");
131 else
132 outfile = "-";
133
134 siod_init(al.ival("-heap"));
135
136 EST_SCFG_traintest grammar;
137
138 if (al.present("-grammar"))
139 {
140 grammar.load(al.val("-grammar"));
141 }
142 else
143 {
144 cerr << "scfg_train: no grammar specified" << endl;
145 exit(1);
146 }
147
148 if (al.present("-corpus"))
149 {
150 grammar.load_corpus(al.val("-corpus"));
151 }
152 else
153 {
154 cerr << "scfg_train: no corpus specified" << endl;
155 exit(1);
156 }
157
158 if (al.present("-spread"))
159 spread = al.ival("-spread");
160 else
161 spread = 0;
162
163 if (al.val("-method") == "inout")
164 {
165 int checkpoint = -1;
166 if (al.present("-checkpoint"))
167 checkpoint = al.ival("-checkpoint");
168
169 grammar.train_inout(al.ival("-passes"),
170 al.ival("-startpass"),
171 checkpoint,spread,outfile);
172 }
173 else
174 {
175 cerr << "scfg_train: unknown training method \"" <<
176 al.val("-method") << "\"" << endl;
177 exit(1);
178 }
179
180 if (grammar.save(outfile) != write_ok)
181 {
182 cerr << "scfg_train: failed to write grammar to \"" <<
183 outfile << "\"" << endl;
184 exit(1);
185 }
186
187 return 0;
188}
int ival(const EST_String &rkey, int m=1) const
Definition EST_Option.cc:76
void train_inout(int passes, int startpass, int checkpoint, int spread, const EST_String &outfile)
void load_corpus(const EST_String &filename)
EST_read_status load(const EST_String &filename)
Load grammar from named file.
Definition EST_SCFG.cc:193
EST_write_status save(const EST_String &filename)
Save current grammar to named file.
Definition EST_SCFG.cc:204
const V & val(const K &rkey, bool m=0) const
return value according to key (const)
Definition EST_TKVL.cc:145
const int present(const K &rkey) const
Returns true if key is present.
Definition EST_TKVL.cc:222