Edinburgh Speech Tools 2.4-release
 
Loading...
Searching...
No Matches
scfg_make_main.cc
1/*************************************************************************/
2/* */
3/* Centre for Speech Technology Research */
4/* University of Edinburgh, UK */
5/* Copyright (c) 1996,1997 */
6/* All Rights Reserved. */
7/* */
8/* Permission is hereby granted, free of charge, to use and distribute */
9/* this software and its documentation without restriction, including */
10/* without limitation the rights to use, copy, modify, merge, publish, */
11/* distribute, sublicense, and/or sell copies of this work, and to */
12/* permit persons to whom this work is furnished to do so, subject to */
13/* the following conditions: */
14/* 1. The code must retain the above copyright notice, this list of */
15/* conditions and the following disclaimer. */
16/* 2. Any modifications must be clearly marked as such. */
17/* 3. Original authors' names are not deleted. */
18/* 4. The authors' names are not used to endorse or promote products */
19/* derived from this software without specific prior written */
20/* permission. */
21/* */
22/* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25/* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30/* THIS SOFTWARE. */
31/* */
32/*************************************************************************/
33/* Author : Alan W Black */
34/* Date : October 1997 */
35/*-----------------------------------------------------------------------*/
36/* Build a stochastic context free grammar with N non-terminals and */
37/* M terminals specified as lists or numbers */
38/* Probabilities are either even or random on rules and specified as */
39/* probs or -log prob */
40/* */
41/*=======================================================================*/
42#include <cstdlib>
43#include <cstdio>
44#include <iostream>
45#include <fstream>
46#include <cstring>
47#include "EST.h"
48#include "EST_SCFG.h"
49#include "siod.h"
50
// File to write the grammar to; "-" selects stdout (see scfg_make_main).
51EST_String outfile = "-";
// Score domain for the rules: "nlogp" makes assign_probs convert each
// probability to a negative log probability, "prob" leaves them untouched.
52EST_String domain = "nlogp";
// How initial rule scores are distributed: "equal" or "random".  Note that
// generate_probs reads this global directly rather than taking a parameter.
53EST_String values = "equal";
54
55static int scfg_make_main(int argc, char **argv);
56
57static void load_symbols(EST_StrList &syms,const EST_String &filename);
58static void make_symbols(EST_StrList &syms,int n,const EST_String &prefix);
59static LISP assign_probs(LISP rules, const EST_String &domain,
60 const EST_String &values);
61static LISP make_all_rules(const EST_StrList &NonTerminals,
62 const EST_StrList &Terminals);
63static void generate_probs(double *probs,int num);
64
65/** @name <command>scfg_make</command> <emphasis>Make the rules for a stochastic context free grammar</emphasis>
66 @id scfg-make-manual
67 * @toc
68 */
69
70//@{
71
72
73/**@name Synopsis
74 */
75//@{
76
77//@synopsis
78
79/**
80
81Builds a stochastic context free grammar from a vocabulary of non-terminal
82and terminal symbols. An exhaustive set of all possible binary rules
83are generated with random (or equal) probabilities (or negative log
84probabilities). This program is designed for making grammars that
85can be trained using scfg_train.
86
87 */
88
89//@}
90
91/**@name OPTIONS
92 */
93//@{
94
95//@options
96
97//@}
98
99
100int main(int argc, char **argv)
101{
102
103 scfg_make_main(argc,argv);
104
105 exit(0);
106 return 0;
107}
108
109static int scfg_make_main(int argc, char **argv)
110{
111 // Top level function generates a probabilistic grammar
112 EST_Option al;
113 EST_StrList files;
114 EST_StrList NonTerminals, Terminals;
115 LISP rules,r;
116 FILE *fd;
117
118 parse_command_line
119 (argc, argv,
120 EST_String("[options]\n")+
121 "Summary: Build a stochastic context free grammar\n"+
122 "-nonterms <string> Number of nonterminals or file containing them\n"+
123 "-terms <string> Number of terminals or file containing them\n"+
124 "-domain <string> {nlogp}\n"+
125 " Values to be nlogp (negative log probabilities)\n"+
126 " or prob (probabilities)\n"+
127 "-values <string> {equal}\n"+
128 " General initial scores on rules as equal or\n"
129 " random\n"+
130 "-heap <int> {500000}\n"+
131 " Set size of Lisp heap, only needed for large grammars\n"+
132 "-o <ofile> File to save grammar (default stdout)\n",
133 files, al);
134
135 if (al.present("-o"))
136 outfile = al.val("-o");
137 else
138 outfile = "-";
139
140 if (al.present("-domain"))
141 {
142 if (al.val("-domain") == "nlogp")
143 domain = "nlogp";
144 else if (al.val("-domain") == "prob")
145 domain = "prob";
146 else
147 {
148 cerr << "scfg_make: domain must be nlogp or prob" << endl;
149 exit(1);
150 }
151 }
152
153 if (al.present("-values"))
154 {
155 if (al.val("-values") == "equal")
156 values = "equal";
157 else if (al.val("-values") == "random")
158 values = "random";
159 else
160 {
161 cerr << "scfg_make: values must be equal or random" << endl;
162 exit(1);
163 }
164 }
165
166 if (al.present("-nonterms"))
167 {
168 if (al.val("-nonterms").matches(RXint))
169 make_symbols(NonTerminals,al.ival("-nonterms"),"NT");
170 else
171 load_symbols(NonTerminals,al.val("-nonterms"));
172 }
173 else
174 {
175 cerr << "scfg_make: no nonterminals specified" << endl;
176 exit(1);
177 }
178
179 if (al.present("-terms"))
180 {
181 if (al.val("-terms").matches(RXint))
182 make_symbols(Terminals,al.ival("-terms"),"T");
183 else
184 load_symbols(Terminals,al.val("-terms"));
185 }
186 else
187 {
188 cerr << "scfg_make: no terminals specified" << endl;
189 exit(1);
190 }
191
192 siod_init(al.ival("-heap"));
193
194 rules = make_all_rules(NonTerminals,Terminals);
195 rules = assign_probs(rules,domain,values);
196
197 if (outfile == "-")
198 fd = stdout;
199 else
200 {
201 if ((fd=fopen(outfile,"w")) == NULL)
202 {
203 cerr << "scfg_make: failed to open file \"" << outfile <<
204 "\" for writing" << endl;
205 exit(1);
206 }
207 }
208
209 for (r=rules; r != NIL; r=cdr(r))
210 pprint_to_fd(fd,car(r));
211
212 if (fd != stdout)
213 fclose(fd);
214
215
216 return 0;
217}
218
219static LISP make_all_rules(const EST_StrList &NonTerminals,
220 const EST_StrList &Terminals)
221{
222 // Build all possibly rules (CNF)
223 // NT -> NT NT and NT -> T
224 EST_Litem *p,*q,*r;
225 LISP rules = NIL;
226
227 for (p=NonTerminals.head(); p != 0; p=p->next())
228 {
229 int num_rules_nt = (NonTerminals.length()*NonTerminals.length())+
230 Terminals.length();
231 double *probs = new double[num_rules_nt];
232 generate_probs(probs,num_rules_nt);
233 int i=0;
234 for (q=NonTerminals.head(); q != 0; q=q->next())
235 for (r=NonTerminals.head(); r != 0; r=r->next(),i++)
236 rules = cons(cons(flocons(probs[i]),
237 cons(rintern(NonTerminals(p)),
238 cons(rintern(NonTerminals(q)),
239 cons(rintern(NonTerminals(r)),NIL)))),
240 rules);
241 for (q=Terminals.head(); q != 0; q=q->next(),i++)
242 rules = cons(cons(flocons(probs[i]),
243 cons(rintern(NonTerminals(p)),
244 cons(rintern(Terminals(q)),NIL))),
245 rules);
246 delete [] probs;
247 }
248
249 return reverse(rules);
250}
251
252static void generate_probs(double *probs,int num)
253{
254 // Generate probabilities
255 int i;
256
257 if (values == "equal")
258 {
259 double defp = 1.0/(float)num;
260 for (i=0; i < num; i++)
261 probs[i] = defp;
262 }
263 else if (values == "random")
264 {
265 // This isn't random but is somewhat arbitrary
266 double sum = 0;
267 for (i=0; i < num; i++)
268 {
269 probs[i] = (double)abs(rand())/(double)0x7fff;
270 sum += probs[i];
271 }
272 for (i=0; i < num; i++)
273 {
274 probs[i] /= sum;
275 }
276 }
277 else
278 {
279 cerr << "scfg_make: unknown value for probability distribution"
280 << endl;
281 exit(1);
282 }
283}
284
285static LISP assign_probs(LISP rules, const EST_String &domain,
286 const EST_String &values)
287{
288 // Modify probs (don't know how to do random probs yet)
289 LISP r;
290 (void)values;
291
292 if (domain == "nlogp")
293 for (r=rules; r != NIL; r = cdr(r))
294 {
295 if (get_c_float(car(car(r))) == 0)
296 CAR(car(r)) = flocons(40);
297 else
298 CAR(car(r)) = flocons(-log(get_c_float(car(car(r)))));
299 }
300
301 return rules;
302}
303
304static void make_symbols(EST_StrList &syms,int n,const EST_String &prefix)
305{
306 // Generate n symbols with given prefix
307 int i;
308 int magnitude,t;
309
310 for (magnitude=0,t=n; t > 0; t=t/10)
311 magnitude++;
312
313 char *name = walloc(char,prefix.length()+magnitude+1);
314 char *skel = walloc(char,prefix.length()+5);
315 sprintf(skel,"%s%%%02dd",(const char *)prefix,magnitude);
316
317 for (i=0; i < n; i++)
318 {
319 sprintf(name,skel,i);
320 syms.append(name);
321 }
322
323 wfree(name);
324 wfree(skel);
325
326}
327
328
329static void load_symbols(EST_StrList &syms,const EST_String &filename)
330{
331 // Load symbol list for file
332
333 load_StrList(filename,syms);
334
335}
int ival(const EST_String &rkey, int m=1) const
Definition EST_Option.cc:76
int length(void) const
Length of string ({not} length of underlying chunk)
Definition EST_String.h:241
int matches(const char *e, int pos=0) const
Exactly match this string?
const V & val(const K &rkey, bool m=0) const
return value according to key (const)
Definition EST_TKVL.cc:145
const int present(const K &rkey) const
Returns true if key is present.
Definition EST_TKVL.cc:222
void append(const T &item)
add item onto end of list
Definition EST_TList.h:191