Edinburgh Speech Tools 2.4-release
 
Loading...
Searching...
No Matches
EST_SCFG_Chart.cc
1/*************************************************************************/
2/* */
3/* Centre for Speech Technology Research */
4/* University of Edinburgh, UK */
5/* Copyright (c) 1997 */
6/* All Rights Reserved. */
7/* */
8/* Permission is hereby granted, free of charge, to use and distribute */
9/* this software and its documentation without restriction, including */
10/* without limitation the rights to use, copy, modify, merge, publish, */
11/* distribute, sublicense, and/or sell copies of this work, and to */
12/* permit persons to whom this work is furnished to do so, subject to */
13/* the following conditions: */
14/* 1. The code must retain the above copyright notice, this list of */
15/* conditions and the following disclaimer. */
16/* 2. Any modifications must be clearly marked as such. */
17/* 3. Original authors' names are not deleted. */
18/* 4. The authors' names are not used to endorse or promote products */
19/* derived from this software without specific prior written */
20/* permission. */
21/* */
22/* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25/* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30/* THIS SOFTWARE. */
31/* */
32/*************************************************************************/
33/* Author : Alan W Black */
34/* Date : June 1997 */
35/*-----------------------------------------------------------------------*/
36/* */
37/* A SCFG chart parser */
38/* */
39/*=======================================================================*/
40#include <cstdlib>
41#include "siod.h"
42#include "EST_math.h"
43#include "EST_SCFG.h"
44#include "EST_SCFG_Chart.h"
45
46EST_SCFG_Chart_Edge::EST_SCFG_Chart_Edge()
47{
48}
49
50EST_SCFG_Chart_Edge::EST_SCFG_Chart_Edge(double prob,
51 int d1, int d2,
52 int pos)
53{
54 p_d1 = d1;
55 p_d2 = d2;
56 p_pos = pos;
57 p_prob = prob;
58}
59
60EST_SCFG_Chart_Edge::~EST_SCFG_Chart_Edge()
61{
62}
63
64LISP EST_SCFG_Chart::print_edge(int start, int end, int p,
66{
67 // Return a lisp representation of the edge
68
69 if (e->prob() == 0)
70 {
71 return NIL; // failed
72 }
73 else if (start+1 == end)
74 {
75 // unary rule, preterminal
76 LISP r = cons(rintern(grammar->nonterminal(p)),
77 cons(flocons(e->prob()),
78 cons(flocons(start),
79 cons(flocons(end),
80 cons(rintern(grammar->terminal(e->d1())),
81 NIL)))));
82 return r;
83 }
84 else
85 {
86 //name prob start end daughters
87 EST_SCFG_Chart_Edge *d1, *d2;
88
89 d1 = edges[start][e->pos()][e->d1()];
90 d2 = edges[e->pos()][end][e->d2()];
91
92 LISP daughters =
93 cons(print_edge(start,e->pos(),e->d1(),d1),
94 cons(print_edge(e->pos(),end,e->d2(),d2),
95 NIL));
96 LISP r = cons(rintern(grammar->nonterminal(p)),
97 cons(flocons(e->prob()),
98 cons(flocons(start),
99 cons(flocons(end),
100 daughters))));
101 return r;
102 }
103}
104
105EST_SCFG_Chart::EST_SCFG_Chart()
106{
107 n_vertices = 0;
108 edges = 0;
109 wfst = 0;
110 emptyedge = 0;
111 grammar_local = TRUE;
112 grammar = new EST_SCFG;
113}
114
115EST_SCFG_Chart::~EST_SCFG_Chart()
116{
117 // delete all the vertices
118
119 delete_edge_table();
120 if (grammar_local)
121 delete grammar;
122}
123
125{
126 if (grammar_local)
127 delete grammar;
128 grammar_local = FALSE;
129 grammar = &imported_grammar;
130}
131
133{
134 grammar->set_rules(r);
135}
136
138{
139 // Set up well formed substring table from feature name in each
140 // stream item in s
141 setup_wfst(s->head(),0,name);
142}
143
145{
146 // Set up well formed substring table from feature name in each
147 // stream item in s
148 EST_Item *p;
149 int n;
150
151 delete_edge_table();
152 for (n_vertices=1,p=s; p != e; p=inext(p))
153 n_vertices++;
154 setup_edge_table();
155
156 for (n=0,p=s; p != e; p=inext(p),n++)
157 {
158 int term = grammar->terminal(p->f(name).string());
159 if (term == -1)
160 {
161 cerr << "SCFG_Chart: unknown terminal symbol \"" <<
162 p->f(name).string() << "\"" << endl;
163 term = 0; // to avoid crashing
164 }
165 wfst[n] = new EST_SCFG_Chart_Edge(1.0,term,0,-1);
166 }
167}
168
169void EST_SCFG_Chart::delete_edge_table()
170{
171 int i,j,k;
172
173 if (wfst == 0) return;
174
175 for (i=0; i < n_vertices; i++)
176 {
177 delete wfst[i];
178 for (j=0; j < n_vertices; j++)
179 {
180 for (k=0; k < grammar->num_nonterminals(); k++)
181 if (edges[i][j][k] != emptyedge)
182 delete edges[i][j][k];
183 delete [] edges[i][j];
184 }
185 delete [] edges[i];
186 }
187 delete [] wfst;
188 delete [] edges;
189 delete emptyedge;
190
191 wfst = 0;
192 edges = 0;
193
194}
195
196void EST_SCFG_Chart::setup_edge_table()
197{
198 int i,j,k;
199 int nt = grammar->num_nonterminals();
200
201 wfst = new EST_SCFG_Chart_Edge*[n_vertices];
202 edges = new EST_SCFG_Chart_Edge ***[n_vertices];
203 emptyedge = new EST_SCFG_Chart_Edge(0,0,0,0);
204
205 for (i=0; i < n_vertices; i++)
206 {
207 wfst[i] = 0;
208 edges[i] = new EST_SCFG_Chart_Edge **[n_vertices];
209 for (j=0; j < n_vertices; j++)
210 {
211 edges[i][j] = new EST_SCFG_Chart_Edge *[nt];
212 for (k=0; k < nt; k++)
213 edges[i][j][k] = 0;
214 }
215 }
216}
217
218double EST_SCFG_Chart::find_best_tree_cal(int start,int end,int p)
219{
220 // Find the best parse for non/terminal p between start and end
221 int best_j = -1;
222 int best_q = -1, best_r = -1;
223 double best_prob = 0;
224
225 if (end-1 == start)
226 {
227 best_prob = grammar->prob_U(p,wfst[start]->d1());
228 if (best_prob > 0)
229 edges[start][end][p] =
230 new EST_SCFG_Chart_Edge(best_prob*wfst[start]->prob(),
231 wfst[start]->d1(),0,-1);
232 else
233 edges[start][end][p] = emptyedge;
234 return best_prob;
235 }
236 else
237 {
238 // for all rules expanding p find total and best prob
239 double s=0,t_prob,left,right;
240 int j,q,r;
241 int nt = grammar->num_nonterminals();
242
243 for (q=0; q < nt; q++)
244 for (r=0; r < nt; r++)
245 {
246 double pBpqr = grammar->prob_B(p,q,r);
247 if (pBpqr > 0)
248 {
249 for (j=start+1; j < end; j++)
250 {
251 left=find_best_tree(start,j,q);
252 if (left > 0)
253 {
254 right = find_best_tree(j,end,r);
255 t_prob = pBpqr * left * right;
256 if (t_prob > best_prob)
257 {
258 best_prob = t_prob;
259 best_q = q;
260 best_r = r;
261 best_j = j;
262 }
263 s += t_prob;
264 }
265 }
266 }
267 }
268
269 if (best_prob > 0)
270 edges[start][end][p] =
271 new EST_SCFG_Chart_Edge(s,best_q,best_r,best_j);
272 else
273 edges[start][end][p] = emptyedge;
274 return s;
275 }
276}
277
279{
280 // do the parsing, find best parse only
281 if (n_vertices - 1 > 0)
282 find_best_tree(0,n_vertices-1,grammar->distinguished_symbol());
283
284}
285
287{
288 // Extract the parse from the edge table
290
291 top = edges[0][n_vertices-1][grammar->distinguished_symbol()];
292
293 if (top == 0)
294 return NIL; // no parse
295 else
296 return print_edge(0,n_vertices-1,grammar->distinguished_symbol(),top);
297}
298
300 EST_Relation *word,
301 int force)
302{
303 // Build a tree stream in Syn linking the leafs of Syn to those
304 // in word, force guarantees a parse is necessary (though probably
305 // a random one)
306
307 extract_parse(syn,word->head(),0,force);
308}
309
311 EST_Item *s, EST_Item *e, int force)
312{
313 // Build a tree stream in Syn linking the leafs of Syn to those
314 // in word
315 EST_Item *p;
316 int num_words;
317
318 for (num_words=0,p=s; p != e; p=inext(p))
319 num_words++;
320
321 if (num_words != (n_vertices-1))
322 {
323 cerr << "SCFG_Chart: extract_parse, number of items in link stream " <<
324 " different from those in parse tree" << endl;
325 return;
326 }
327
329 EST_Item *w = s;
330
331 top = edges[0][n_vertices-1][grammar->distinguished_symbol()];
332
333 if (top == 0)
334 return; // failed to parse so no parse tree
335 else
336 {
337 EST_Item *ss = syn->append();
338
339 extract_edge(0,n_vertices-1,grammar->distinguished_symbol(),top,
340 ss,&w);
341
342 if ((force) && (!daughter1(ss))) // no parse found but *need* one
343 extract_forced_parse(0,n_vertices-1,ss,w);
344 return;
345 }
346}
347
348void EST_SCFG_Chart::extract_forced_parse(int start, int end,
349 EST_Item *s, EST_Item *w)
350{
351 // Extract a parse even though one wasn't found (often happens
352 // with single word or dual word sentences.
353
354 if (start+1 == end)
355 {
356 s->append_daughter(w);
357 s->set_name(grammar->nonterminal(grammar->distinguished_symbol()));
358 s->set("prob",0.0); // maybe should be epsilon
359 }
360 else
361 {
362 extract_forced_parse(start,start+1,s->append_daughter(),w);
363 EST_Item *st = s->append_daughter();
364 st->set_name(grammar->nonterminal(grammar->distinguished_symbol()));
365 st->set("prob",0.0); // maybe should be epsilon
366 EST_Item *nw = inext(w);
367 extract_forced_parse(start+1,end,st,nw);
368 }
369}
370
371void EST_SCFG_Chart::extract_edge(int start, int end, int p,
373 EST_Item *s,
374 EST_Item **word)
375{
376 // Build the node for this edge, and all of its daughters
377
378 if (e->prob() == 0)
379 {
380 return; // failed
381 }
382 else if (start+1 == end)
383 {
384 // unary rule, preterminal
385 s->append_daughter((*word));
386 s->set_name(grammar->nonterminal(p));
387 s->set("prob",(float)e->prob());
388 *word = inext(*word); // increment along "word" stream
389 return;
390 }
391 else
392 {
393 //name prob start end daughters
394 EST_SCFG_Chart_Edge *d1, *d2;
395
396 d1 = edges[start][e->pos()][e->d1()];
397 d2 = edges[e->pos()][end][e->d2()];
398
399 // Inserts the new nodes in the tree (and creates new si nodes)
400 s->append_daughter();
401 s->append_daughter();
402
403 extract_edge(start,e->pos(),e->d1(),d1,daughter1(s),word);
404 extract_edge(e->pos(),end,e->d2(),d2,daughter2(s),word);
405
406 s->set_name(grammar->nonterminal(p));
407 s->set("prob",(float)e->prob());
408
409 return;
410 }
411}
412
413void EST_SCFG_chart_load_relation(EST_Relation &s,LISP sent)
414{
415 // Set up well formed substring table form lisp list
416 // Setup a relation and call the standard method of set up
417 LISP w,f;
418
419 for (w=sent; w != NIL; w=cdr(w))
420 {
421 EST_Item *word = s.append();
422
423 if (consp(car(w)))
424 { // a word with other feature info
425 word->set_name(get_c_string(car(car(w))));
426 if (consp(car(cdr(car(w)))))
427 for (f=car(cdr(car(w))); f != NIL; f=cdr(f))
428 {
429 if (FLONUMP(car(cdr(car(f)))))
430 word->set(get_c_string(car(car(f))),
431 get_c_float(car(cdr(car(f)))));
432 else
433 word->set(get_c_string(car(car(f))),
434 get_c_string(car(cdr(car(f)))));
435 }
436 else // we assume its a POS value, cause they didn't say
437 word->set("name",get_c_string(car(cdr(car(w)))));
438 }
439 else // for easy we set the pos field to the be the name
440 word->set("name",get_c_string(car(w)));
441 }
442}
443
444void scfg_parse(EST_Relation *Word, const EST_String &name,
445 EST_Relation *Syntax, EST_SCFG &grammar)
446{
447 // Parse feature name in Word to build Syntax relation
448 // The relations names above are *not* the names of the relations
449 // just named to reflect there conceptual usage
450 EST_SCFG_Chart chart;
451
452 chart.set_grammar_rules(grammar);
453 chart.setup_wfst(Word,name);
454 chart.parse();
455 chart.extract_parse(Syntax,Word,TRUE);
456
457 return;
458}
459
460LISP scfg_parse(LISP string, LISP grammar)
461{
462 // Parse and return full parse
463 EST_SCFG_Chart chart;
464 EST_Relation words;
465 LISP parse;
466
467 chart.set_grammar_rules(grammar);
468
469 EST_SCFG_chart_load_relation(words,string);
470 chart.setup_wfst(&words,"name");
471 chart.parse();
472 parse = chart.find_parse();
473
474 return parse;
475}
476
477LISP scfg_parse(LISP string, EST_SCFG &grammar)
478{
479 // Parse and return full parse
480 EST_SCFG_Chart chart;
481 EST_Relation words;
482 LISP parse;
483
484 chart.set_grammar_rules(grammar);
485
486 EST_SCFG_chart_load_relation(words,string);
487 chart.setup_wfst(&words,"name");
488 chart.parse();
489 parse = chart.find_parse();
490
491 return parse;
492}
493
494LISP scfg_bracketing_only(LISP parse)
495{
496 if (consp(siod_nth(4,parse)))
497 {
498 LISP d,ds;
499
500 for (d=cdr(cdr(cdr(cdr(parse)))),ds=NIL; d != NIL; d=cdr(d))
501 ds = cons(scfg_bracketing_only(car(d)),ds);
502 return reverse(ds);
503 }
504 else
505 return siod_nth(4,parse);
506
507}
508
510{
511 // Compare bracketing of best parse to bracketing on original
512 // For each sentence parse it (unbracketed) and then
513 // find the percentage of valid brackets in parsed version that
514 // are valid in the original one.
515 int c;
516 LISP parse;
517 EST_SuffStats cb;
518 int failed = 0;
519 int fully_contained=0;
520
521 for (c=0; c < corpus.length(); c++)
522 {
523 LISP flat = siod_flatten(corpus.a_no_check(c).string());
524
525 parse = scfg_parse(flat,*this);
526 if (parse == NIL)
527 {
528 failed++;
529 continue;
530 }
531
532 EST_bracketed_string parsed(scfg_bracketing_only(parse));
533 EST_SuffStats vs;
534
535 count_bracket_crossing(corpus.a_no_check(c),parsed,vs);
536
537 if (vs.mean() == 1)
538 fully_contained++;
539 cb += vs.mean();
540 }
541
542 cout << "cross bracketing " << cb.mean()*100 << " (" << failed <<
543 " failed " << (float)(100.0*fully_contained)/corpus.length() <<
544 "% fully consistent from " << corpus.length()
545 << " sentences)" << endl;
546
547}
548
549void count_bracket_crossing(const EST_bracketed_string &ref,
550 const EST_bracketed_string &test,
551 EST_SuffStats &vs)
552{
553 int i,j;
554
555 if (ref.length() != test.length())
556 {
557 EST_error("bracket_crossing: sentences of different lengths");
558 }
559
560 for (i=0; i < ref.length(); i++)
561 for (j=i+1; j <= ref.length(); j++)
562 if (test.valid(i,j) == 1)
563 {
564 if (ref.valid(i,j) == 0)
565 vs += 0;
566 else
567 vs += 1;
568 }
569}
void set(const EST_String &name, int ival)
Definition EST_Item.h:179
EST_Item * head() const
int pos(void)
Postion, 0 1 or 2, where 0 is empty, 1 is incomplete 2 is complete.
double prob(void)
Edge probability.
int d1()
(Non)terminal of daughter 1
int d2()
(Non)terminal of daughter 2
void extract_parse(EST_Relation *syn, EST_Relation *word, int force=0)
Extract parse tree and add it to syn linking leafs to word.
void setup_wfst(EST_Relation *s, const EST_String &name="name")
void parse()
Parses the loaded WFST with the loaded grammar.
LISP find_parse()
Return the parse in full LISP form.
void set_grammar_rules(LISP r)
Initialize from LISP rules set.
EST_String nonterminal(int p) const
Convert nonterminal index to string form.
Definition EST_SCFG.h:214
int num_nonterminals() const
Number of nonterminals.
Definition EST_SCFG.h:222
double prob_B(int p, int q, int r) const
The rule probability of given binary rule.
Definition EST_SCFG.h:226
EST_String terminal(int m) const
Convert terminal index to string form.
Definition EST_SCFG.h:216
void set_rules(LISP rules)
Set (or reset) rules from external source after construction.
Definition EST_SCFG.cc:120
double prob_U(int p, int m) const
The rule probability of given unary rule.
Definition EST_SCFG.h:228
double mean(void) const
mean of currently cummulated values
INLINE int length() const
number of items in vector.
INLINE const T & a_no_check(int n) const
read-only const access operator: without bounds checking
const EST_String & string(void) const
Definition EST_Val.h:150
int valid(int i, int k) const
If a bracketing from i to k is valid in string.
Definition EST_SCFG.h:85