Edinburgh Speech Tools 2.4-release
 
Loading...
Searching...
No Matches
wfst_build_main.cc
1/*************************************************************************/
2/* */
3/* Centre for Speech Technology Research */
4/* University of Edinburgh, UK */
5/* Copyright (c) 1996,1997 */
6/* All Rights Reserved. */
7/* */
8/* Permission is hereby granted, free of charge, to use and distribute */
9/* this software and its documentation without restriction, including */
10/* without limitation the rights to use, copy, modify, merge, publish, */
11/* distribute, sublicense, and/or sell copies of this work, and to */
12/* permit persons to whom this work is furnished to do so, subject to */
13/* the following conditions: */
14/* 1. The code must retain the above copyright notice, this list of */
15/* conditions and the following disclaimer. */
16/* 2. Any modifications must be clearly marked as such. */
17/* 3. Original authors' names are not deleted. */
18/* 4. The authors' names are not used to endorse or promote products */
19/* derived from this software without specific prior written */
20/* permission. */
21/* */
22/* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25/* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30/* THIS SOFTWARE. */
31/* */
32/*************************************************************************/
33/* Author : Alan W Black */
34/* Date : November 1997 */
35/*-----------------------------------------------------------------------*/
36/* Build a WFST from some base: */
37/* 1 a set of context dependent rewrite rules using the */
38/* algorithms from "An Efficient Compiler for Weighted Rewrite */
39/* Rules", by Mehryar Mohri and Richard Sproat ACL 1996 */
40/* and information from the techniques in Ritchie et al. 1992 */
41/* 2 A regular grammar (but can be written as a CFG as long as it */
42/* contains no centre embedding) */
43/* 3 A regular expression */
44/* 4 lts rules (but that doesn't work yet) */
45/* */
46/* or apply some operator on existing wfst(s): compose, concatenate, */
47/* difference, union, intersect */
48/* */
49/* Also allow determinizing and minimization as required */
50/* */
51/*=======================================================================*/
52#include <cstdlib>
53#include <cstdio>
54#include <iostream>
55#include <fstream>
56#include <cstring>
57#include "EST.h"
58#include "EST_WFST.h"
59
60static int wfst_build_main(int argc, char **argv);
61
62
63
64/** @name <command>wfst_build</command> <emphasis>Build a weighted finite-state transducer</emphasis>
65 @id wfst-build-manual
66 * @toc
67 */
68
69//@{
70
71
72/**@name Synopsis
73 */
74//@{
75
76//@synopsis
77
78/**
79
80Build and/or process weighted finite state transducers (WFSTs) from
81various input formats. This program accepts descriptions
82in the following formats and converts them to WFSTs:
83<itemizedlist>
84<listitem><para>regular expressions</para></listitem>
85<listitem><para>regular grammars</para></listitem>
86<listitem><para>Koskenniemi/Kay/Kaplan context restriction rules</para></listitem>
87</itemizedlist>
88In addition various operations can be performed on two WFSTs
89<itemizedlist>
90<listitem><para>compose: form new WFST feeding the output of the first WFST into
91the second WFST.</para></listitem>
92<listitem><para>union: form new WFST accepting the language of either WFST
93</para></listitem>
94<listitem><para>intersect: form new WFST accepting only the language common
95to both WFSTs
96</para></listitem>
97<listitem><para>concat: form new WFST accepting the language from the
98concatenation of all strings in the first WFST to all strings in the
99second.
100</para></listitem>
101</itemizedlist>
102The newly formed WFSTs can be optionally determinized and minimized.
103
104The option asis allows a single WFST to be loaded and determinized
105and/or minimized
106
107 */
108
109//@}
110
111/**@name OPTIONS
112 */
113//@{
114
115//@options
116
117//@}
118int main(int argc, char **argv)
119{
120
121 wfst_build_main(argc,argv);
122
123 exit(0);
124 return 0;
125}
126
127static int wfst_build_main(int argc, char **argv)
128{
129 // Top level function generates a WFST from rules
130 EST_Option al;
131 EST_StrList files;
132 EST_String outfile;
133
134 parse_command_line
135 (argc, argv,
136 EST_String("[option] [rulefile0] [rulefile1] ...\n")+
137 "Summary: Build a weighted finite state transducer from rules/wfsts\n"+
138 "-type <string> {kk} Input rule type: kk, lts, rg, tl, compose, regex\n"+
139 " union, intersect, concat, asis\n"+
140 "-determinize Determinize WFST before saving it\n"+
141 "-detmin Determinize and minimize WFST before saving it\n"+
142 "-o <ofile> Output file for saved WFST (default stdout)\n"+
143 "-otype <string> {ascii}\n"+
144 " Output type, ascii or binary\n"+
145 "-heap <int> {210000}\n"+
146 " Set size of Lisp heap, needed for large rulesets\n"+
147 "-q Quiet mode, no summary generated\n",
148 files, al);
149
150 if (al.present("-o"))
151 outfile = al.val("-o");
152 else
153 outfile = "-";
154
155 siod_init(al.ival("-heap"));
156
157 LISP ruleset;
158 LISP inalpha, outalpha;
159 EST_WFST *wfst = new EST_WFST;
160 gc_protect(&ruleset);
161
162 if (al.val("-type") == "kk")
163 {
164 ruleset = car(vload(files(files.head()),1));
165 kkcompile(ruleset,*wfst);
166 }
167 else if (al.val("-type") == "lts")
168 {
169 ruleset = car(vload(files(files.head()),1));
170 ltscompile(ruleset,*wfst);
171 }
172 else if (al.val("-type") == "rg")
173 {
174 ruleset = car(vload(files(files.head()),1));
175 rgcompile(ruleset,*wfst);
176 }
177 else if (al.val("-type") == "tl")
178 {
179 ruleset = car(vload(files(files.head()),1));
180 tlcompile(ruleset,*wfst);
181 }
182 else if (al.val("-type") == "asis")
183 {
184 if (wfst->load(files.nth(0)) != format_ok) exit(-1);
185 }
186 else if (al.val("-type") == "compose")
187 {
188 EST_WFST a,b;
189
190 if (files.length() != 2)
191 EST_error("compose requires two WFSTs to combine");
192
193 if (a.load(files.nth(0)) != format_ok) exit(-1);
194 if (b.load(files.nth(1)) != format_ok) exit(-1);
195
196 wfst->compose(a,b);
197 }
198 else if (al.val("-type") == "union")
199 {
200 EST_WFST a,b;
201
202 if (files.length() != 2)
203 EST_error("union requires two WFSTs to combine");
204
205 if (a.load(files.nth(0)) != format_ok) exit(-1);
206 if (b.load(files.nth(1)) != format_ok) exit(-1);
207
208 wfst->uunion(a,b);
209 }
210 else if (al.val("-type") == "intersect")
211 {
212 EST_WFST a,b;
213
214 if (files.length() != 2)
215 EST_error("intersect requires two WFSTs to combine");
216 if (a.load(files.nth(0)) != format_ok) exit(-1);
217 if (b.load(files.nth(1)) != format_ok) exit(-1);
218
219 wfst->intersection(a,b);
220 }
221 else if (al.val("-type") == "concat")
222 {
223 EST_WFST a,b;
224
225 if (files.length() != 2)
226 EST_error("concat requires two WFSTs to combine");
227 if (a.load(files.nth(0)) != format_ok) exit(-1);
228 if (b.load(files.nth(1)) != format_ok) exit(-1);
229
230 wfst->concat(a,b);
231 }
232 else if (al.val("-type") == "difference")
233 {
234 EST_WFST a,b;
235
236 if (files.length() != 2)
237 EST_error("difference requires two WFSTs to combine");
238 if (a.load(files.nth(0)) != format_ok) exit(-1);
239 if (b.load(files.nth(1)) != format_ok) exit(-1);
240
241 wfst->difference(a,b);
242 }
243 else if (al.val("-type") == "regex")
244 {
245 ruleset = car(vload(files(files.head()),1));
246 inalpha = siod_nth(0,ruleset);
247 outalpha = siod_nth(1,ruleset);
248 wfst->build_from_regex(inalpha,outalpha,car(cdr(cdr(ruleset))));
249 }
250 else
251 {
252 cerr << "wfst_build: unknown rule type \"" << al.val("-type")
253 << "\"" << endl;
254 exit(-1);
255 }
256
257 if (al.present("-determinize"))
258 {
259 EST_WFST *dwfst = new EST_WFST;
260 dwfst->determinize(*wfst);
261 if (!al.present("-q"))
262 {
263 cout << "wfst_build summary: " << endl;
264 cout << " non-deterministic wfst: " <<
265 wfst->summary() << endl;
266 cout << " deterministic wfst: " <<
267 dwfst->summary() << endl;
268 }
269 delete wfst;
270 wfst = dwfst;
271 }
272 else if (al.present("-detmin"))
273 {
274 if (!al.present("-q"))
275 {
276 cout << "wfst_build summary: " << endl;
277 cout << " non-deterministic wfst: " <<
278 wfst->summary() << endl;
279 }
280 EST_WFST *dwfst = new EST_WFST;
281 dwfst->determinize(*wfst);
282 delete wfst;
283 if (!al.present("-q"))
284 cout << " deterministic wfst: " <<
285 dwfst->summary() << endl;
286 EST_WFST *mwfst = new EST_WFST;
287 mwfst->minimize(*dwfst);
288 if (!al.present("-q"))
289 cout << " minimized wfst: " <<
290 mwfst->summary() << endl;
291 delete dwfst;
292 wfst = mwfst;
293 }
294 else
295 {
296 if (!al.present("-q"))
297 cout << "wfst_build: " << wfst->summary() << endl;
298 }
299
300 wfst->save(outfile,al.val("-otype"));
301 delete wfst;
302 gc_unprotect(&ruleset);
303
304 return 0;
305}
306
int ival(const EST_String &rkey, int m=1) const
Definition EST_Option.cc:76
const V & val(const K &rkey, bool m=0) const
return value according to key (const)
Definition EST_TKVL.cc:145
const int present(const K &rkey) const
Returns true if key is present.
Definition EST_TKVL.cc:222
T & nth(int n)
return the Nth value
Definition EST_TList.h:139
void difference(const EST_WFST &a, const EST_WFST &b)
Definition wfst_ops.cc:898
void uunion(EST_TList< EST_WFST > &wl)
void compose(const EST_WFST &a, const EST_WFST &b)
Definition wfst_ops.cc:812
EST_write_status save(const EST_String &filename, const EST_String type="ascii")
?
Definition EST_WFST.cc:349
void minimize(const EST_WFST &a)
Build minimized form of a.
Definition wfst_ops.cc:484
void concat(const EST_WFST &a, const EST_WFST &b)
Definition wfst_ops.cc:776
void intersection(EST_TList< EST_WFST > &wl)
Definition wfst_ops.cc:356
void determinize(const EST_WFST &a)
Build determinized form of a.
Definition wfst_ops.cc:164
EST_read_status load(const EST_String &filename)
?
Definition EST_WFST.cc:508