Edinburgh Speech Tools 2.4-release
 
Loading...
Searching...
No Matches
ngrammar_aux.cc
1/*************************************************************************/
2/* */
3/* Centre for Speech Technology Research */
4/* University of Edinburgh, UK */
5/* Copyright (c) 1996 */
6/* All Rights Reserved. */
7/* */
8/* Permission is hereby granted, free of charge, to use and distribute */
9/* this software and its documentation without restriction, including */
10/* without limitation the rights to use, copy, modify, merge, publish, */
11/* distribute, sublicense, and/or sell copies of this work, and to */
12/* permit persons to whom this work is furnished to do so, subject to */
13/* the following conditions: */
14/* 1. The code must retain the above copyright notice, this list of */
15/* conditions and the following disclaimer. */
16/* 2. Any modifications must be clearly marked as such. */
17/* 3. Original authors' names are not deleted. */
18/* 4. The authors' names are not used to endorse or promote products */
19/* derived from this software without specific prior written */
20/* permission. */
21/* */
22/* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25/* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30/* THIS SOFTWARE. */
31/* */
32/*************************************************************************/
33/* Author : Simon King & Alan W Black */
34/* Date : February 1997 */
35/*-----------------------------------------------------------------------*/
36/* */
37/* Auxiliary functions for EST_Ngram class */
38/* */
39/*=======================================================================*/
40#include <iostream>
41#include <cstring>
42#include "EST_String.h"
43#include "EST_Ngrammar.h"
44
45static bool
46ExponentialFit(EST_DVector &N, double &a, double &b, int first, int last)
47{
48 // fit the function N(r) = e^a * r^b where a,b are constants
49 // to the points N(first)...N(last) inclusive
50 // i.e. a straight line fit in the (natural) log domain
51 // minimising the mean squared error (in log domain), is :
52
53 // limitation : r must be an integer (it is the index of v)
54
55
56 // default
57 if (last == -1)
58 last = N.n()-1;
59
60 if (first < 0)
61 {
62 cerr << "ExponentialFit : first must be >= 0" << endl;
63 return false;
64 }
65
66 if (last >= N.n()-1)
67 {
68 cerr << "ExponentialFit : last must be < N.n()-1 = " << N.n()-1 << endl;
69 }
70
71 if (first == last)
72 {
73 a=log(N(first));
74 b=0;
75 return true;
76 }
77
78 double ElnNr=0.0,ElnNrlnr=0.0,
79 Elnr=0.0,Elnr2=0.0,
80 R=0.0;
81
82 for(int r=first;r<=last;r++)
83 {
84 R += 1;
85 if ( N(r) > 0)
86 {
87 ElnNr += log( N(r) );
88 ElnNrlnr += log( N(r) ) * log( (double)r );
89 }
90 Elnr += log( (double)r );
91 Elnr2 += log( (double)r ) * log( (double)r );
92 }
93
94 // and the answer is :
95 b = ( (ElnNr*Elnr/R) - ElnNrlnr ) / ( (Elnr*Elnr/R) - Elnr2);
96 a = (ElnNr - (b*Elnr) ) / R;
97
98 return true;
99}
100
101static bool
102smooth_ExponentialFit(EST_DVector &N, int first, int last)
103{
104 double a=0.0,b=0.0;
105
106 if (!ExponentialFit(N,a,b,first,last))
107 {
108 cerr << "smooth_ExponentialFit : ExponentialFit failed !" << endl;
109 return false;
110 }
111
112 for(int r=first;r<=last;r++)
113 N[r] = exp(a)* pow((double)r, b);
114
115 return true;
116}
117
118void make_f_of_f(EST_BackoffNgrammarState *s,void *params)
119{
120 EST_Litem *k;
121 double freq;
122 EST_String name;
123
124 EST_DVector *ff = (EST_DVector*)params;
125 int max = ff->n();
126 for (k=s->pdf_const().item_start();
127 !s->pdf_const().item_end(k);
128 k = s->pdf_const().item_next(k))
129 {
130 s->pdf_const().item_freq(k,name,freq);
131 if(freq+0.5 < max)
132 (*ff)[(int)(freq+0.5)] += 1;
133
134 }
135
136
137}
138
139void get_max_f(EST_BackoffNgrammarState *s,void *params)
140{
141 EST_Litem *k;
142 double freq;
143 EST_String name;
144
145 double *max = (double*)params;
146 for (k=s->pdf_const().item_start();
147 !s->pdf_const().item_end(k);
148 k = s->pdf_const().item_next(k))
149 {
150 s->pdf_const().item_freq(k,name,freq);
151 if(freq > *max)
152 *max = freq;
153
154 }
155
156
157}
158
159void map_f_of_f(EST_BackoffNgrammarState *s,void *params)
160{
161 EST_Litem *k;
162 double freq;
163 EST_String name;
164
165 //cerr << "map_f_of_f : visited " << *s << endl;
166
167 EST_DVector *map = (EST_DVector*)params;
168 int max = map->n();
169 for (k=s->pdf_const().item_start();
170 !s->pdf_const().item_end(k);
171 k = s->pdf_const().item_next(k))
172 {
173 s->pdf_const().item_freq(k,name,freq);
174 if (freq+0.5 < max)
175 {
176 double nfreq = (*map)((int)(freq+0.5));
177 s->pdf().set_frequency(name,nfreq);
178
179 }
180
181 }
182
183}
184
185void zero_small_f(EST_BackoffNgrammarState *s,void *params)
186{
187 EST_Litem *k;
188 double freq;
189 EST_String name;
190
191 double *min = (double*)params;
192 for (k=s->pdf_const().item_start();
193 !s->pdf_const().item_end(k);
194 k = s->pdf_const().item_next(k))
195 {
196 s->pdf_const().item_freq(k,name,freq);
197 if (freq < *min)
198 {
199 //cerr << "zeroing " << freq << " < " << *min << endl;
200 s->pdf().override_frequency(k,0.0);
201 }
202 }
203}
204
205void frequency_of_frequencies(EST_DVector &ff, EST_Ngrammar &n,int this_order)
206{
207 int i,size;
208 EST_Litem *k;
209 double max=0.0;
210
211 // if ff has zero size, do complete frequency of frequencies
212 // otherwise just fill in frequencies from 1 to ff.n()-1
213
214 bool complete = (bool)(ff.n() == 0);
215
216 switch(n.representation())
217 {
218
219 case EST_Ngrammar::sparse:
220 case EST_Ngrammar::dense:
221 {
222 size = n.num_states();
223 if (complete)
224 {
225 // find highest frequency in EST_Ngram
226 for(i=0;i<size;i++)
227 if (n.p_states[i].pdf().samples() > max)
228 max = n.p_states[i].pdf().samples();
229
230 ff.resize((int)(max+1.5));
231 ff.fill(0.0);
232 }
233
234 // Sum the frequencies
235 for(i=0;i<size;i++)
236 {
237 for (k=n.p_states[i].pdf().item_start();
238 !n.p_states[i].pdf().item_end(k);
239 k = n.p_states[i].pdf().item_next(k))
240 {
241 EST_String name;
242 double freq;
243 n.p_states[i].pdf().item_freq(k,name,freq);
244 ff[(int)(freq+0.5)] += 1;
245 }
246 }
247
248 if (complete)
249 {
250 // fill in ff(0) - the number of unseen ngrams
251
252 double total=0;
253 for (i=1;i<ff.n();i++)
254 total += ff(i);
255
256 ff[0] = pow(float(n.get_vocab_length()),float(n.order())) - total;
257 }
258 }
259 break;
260
261
262 case EST_Ngrammar::backoff:
263 {
264 if (complete)
265 {
266 n.backoff_traverse(n.backoff_representation,
267 &get_max_f,(void*)(&max),
268 this_order-1);
269 ff.resize((int)(max+1.5));
270 }
271
272 // backoff grammar has grammars of orders
273 // order, order-1, ..., 1
274 // and we treat each order independently
275
276 for (i=0;i<ff.n();i++)
277 ff[i] = 0;
278
279 n.backoff_traverse(n.backoff_representation,
280 &make_f_of_f,(void*)(&ff),
281 this_order-1);
282
283 if (complete)
284 {
285 // fill in ff(0) - the number of unseen ngrams
286 double total=0;
287 for (i=1;i<ff.n();i++)
288 total += ff(i);
289 ff[0] = pow(float(n.get_vocab_length()),float(this_order)) - total;
290
291
292
293 }
294 }
295 break;
296
297 default:
298 cerr << "unknown representation for EST_Ngrammar" << endl;
299 break;
300 }
301
302}
303
304void map_frequencies(EST_Ngrammar &n, const EST_DVector &map, const int this_order)
305{
306 int i;
307 EST_Litem *k;
308
309 switch(n.representation())
310 {
311
312 case EST_Ngrammar::sparse:
313 case EST_Ngrammar::dense:
314 {
315 int size = n.p_num_states;
316
317 for(i=0;i<size;i++)
318 {
319 for (k=n.p_states[i].pdf().item_start();
320 !n.p_states[i].pdf().item_end(k);
321 k = n.p_states[i].pdf().item_next(k))
322 {
323 EST_String name;
324 double freq,nfreq;
325 n.p_states[i].pdf().item_freq(k,name,freq);
326 nfreq = map((int)(freq+0.5));
327 n.p_states[i].pdf().set_frequency(name,nfreq);
328
329
330 }
331 }
332 }
333 break;
334
335 case EST_Ngrammar::backoff:
336 {
337
338 // backoff grammar has grammars of orders
339 // order, order-1, ..., 1
340 // and we treat each order independently
341 n.backoff_traverse(n.backoff_representation,
342 &map_f_of_f,(void*)(&map),
343 this_order-1);
344
345 }
346 break;
347
348 default:
349 cerr << "unknown representation for EST_Ngrammar" << endl;
350 break;
351
352 }
353}
354
355static void
356adjusted_frequencies_BasicGoodTuring(EST_DVector &M,
357 const EST_DVector &N,
358 int maxcount)
359{
360 // produce a map of frequencies, given a (smoothed) distribution
361 // only map up to a frequency of maxcount; beyond that
362 // map(r) == r
363
364 if (maxcount > N.n()-2)
365 {
366 maxcount = N.n()-2;
367 cerr << "adjusted_frequencies_BasicGoodTuring :";
368 cerr << " maxcount is too big, reducing it to " << maxcount << endl;
369 }
370
371 M.resize(N.n());
372 int r;
373 for(r=0; r<=maxcount;r++)
374 {
375 // don't like this bit, but what can you do ?
376 if( (N(r+1) == 0) || (N(r) == 0) )
377 M[r] = r;
378 else
379 M[r] = (r + 1) * N(r+1) / N(r);
380 }
381 // and do not map higher counts
382 for(;r<N.n();r++)
383 M[r]=r;
384
385 return;
386
387}
388
389static void
390smoothed_frequency_distribution_ExponentialFit(EST_DVector &N,int maxcount)
391{
392 if (maxcount > N.n()-2)
393 {
394 maxcount = N.n()-2;
395 cerr << "smoothed_frequency_distribution_ExponentialFit :"
396 << " maxcount too big, reducing it to " << maxcount << endl;
397 }
398 // the idea to fit this particular fn was by Steve Isard
399 // we ignore r=0 in doing the fit
400
401 if (!smooth_ExponentialFit(N,1,maxcount+1))
402 cerr << "smooth_ExponentialFit failed !" << endl;
403
404 return;
405}
406
407bool
408Good_Turing_smooth(EST_Ngrammar &ngrammar, int maxcount, int mincount)
409{
410 // works for any N
411 // since it just remaps frequencies
412
413 // frequencies above maxcount are not changed
414 // for discounting -- see discount routine !
415
416
417 if (ngrammar.entry_type() != EST_Ngrammar::frequencies)
418 {
419 cerr << "EST_Ngram: cannot Good-Turing smooth ngram:" <<
420 " entries are not frequencies" << endl;
421 return false;
422 }
423
424 switch(ngrammar.representation())
425 {
426
427 case EST_Ngrammar::sparse:
428 case EST_Ngrammar::dense:
429 {
430 EST_DVector freqs,mapped_freqs;
431 // grammar is of a single order - simple
432 // Find frequency distribution
433 frequency_of_frequencies(freqs,ngrammar,0);
434 // smoothing should be optional - to do
435 smoothed_frequency_distribution_ExponentialFit(freqs,maxcount-1);
436 // Build map of frequencies
437 adjusted_frequencies_BasicGoodTuring(mapped_freqs,freqs,maxcount);
438 // Map all frequencies in grammar to Good Turing Smoothed values
439 map_frequencies(ngrammar,mapped_freqs,0);
440
441 }
442 break;
443
444 case EST_Ngrammar::backoff:
445{
446
447 cerr << "Smoothing of backed of grammars is not available!" << endl;
448 return false;
449
450 (void)mincount;
451
452 /*
453 // need to smooth for each order independently
454 int i,o;
455 double threshold;
456
457 //for (o=1;o<=ngrammar.order();o++)
458 for(o=ngrammar.order();o<=ngrammar.order();o++)
459 {
460
461 EST_DVector freqs,mapped_freqs;
462
463 frequency_of_frequencies(freqs,ngrammar,o);
464
465 cerr << "FREQ : " << freqs << endl;
466
467 if(freqs.n() < 2)
468 {
469 // i.e. only unseen things - zero them all
470 threshold = 2; // 1 ?
471 if(o>1)
472 ngrammar.backoff_traverse(ngrammar.backoff_representation,
473 &zero_small_f,
474 (void*)(&threshold),
475 o-1);
476 continue;
477 }
478
479 int max = maxcount;
480 if(max > freqs.n() - 2)
481 max = freqs.n() - 2;
482
483 if(max > 2)
484 // need at least 3 points
485 // smoothing should be optional - to do
486 smoothed_frequency_distribution_ExponentialFit(freqs,max);
487
488 cerr << "SMOOTHED : " << freqs << endl;
489
490
491 adjusted_frequencies_BasicGoodTuring(mapped_freqs,freqs,max);
492
493 // initial map of frequencies modifies total counts
494 // so pdfs are correct
495
496 map_frequencies(ngrammar,mapped_freqs,o);
497
498
499 cerr << "Map for " << o << " : ";
500 for(i=0;i<max;i++)
501 cerr << mapped_freqs(i) << " ";
502 cerr << endl;
503
504 cerr << "MAP : " << mapped_freqs << endl;
505
506 // now go and zero the small frequencies
507 // but not for unigram
508
509
510 if(mincount < mapped_freqs.n())
511 threshold = mapped_freqs(mincount);
512 else
513 threshold = mapped_freqs(mapped_freqs.n()-1) + 1; // i.e. zero everything
514
515 //cerr << "min = " << threshold << endl;
516
517 if(o>1)
518 ngrammar.backoff_traverse(ngrammar.backoff_representation,
519 &zero_small_f,
520 (void*)(&threshold),
521 o-1);
522 */
523
524}
525break;
526
527default:
528cerr << "unknown representation for EST_Ngrammar" << endl;
529break;
530
531}
532
533return true;
534
535}
536
537
538void
539Good_Turing_discount(EST_Ngrammar &ngrammar, const int maxcount,
540 const double default_discount)
541{
542
543 if(ngrammar.representation() != EST_Ngrammar::backoff)
544 {
545 cerr << "Good_Turing_discount is not appropriate for non backoff grammar !"
546 << endl;
547 return;
548 }
549
550
551
552 // method (for each grammar order):
553 // produce Good-Turing smoothed frequency map
554 // compute difference between unsmoothed map (0,1,2,3...)
555 // and GT map - this is the discount
556 // we do not subtract the discount here since we need to
557 // preserve the raw counts
558
559 // discounting is applied up to frequency 'maxcount'
560 // beyond which we do not discount
561
562 // to do : zero low frequencies ???? (prune the tree)
563 /*
564 ngrammar.backoff_traverse(ngrammar.backoff_representation,
565 &zero_small_f,
566 (void*)(&threshold),
567 o-1);
568 */
569
570 // need to treat for each order independently
571 int i,o;
572
573 // no discounting for order 1 - YES !?!?!?
574 for (o=1;o<=ngrammar.order();o++)
575 {
576
577 EST_DVector freqs,mapped_freqs;
578 frequency_of_frequencies(freqs,ngrammar,o);
579
580 int max = maxcount;
581 if(max > freqs.n() - 2)
582 max = freqs.n() - 2;
583
584 if(max > 2)
585 {
586 // need at least 3 points
587 // smoothing should be optional - to do
588
589 // April 97
590 // to overcome problems with N(x)=0, shift data points, fit, and shift back
591
592 for(i=0;i<=max+1;i++)
593 freqs[i] += 1;
594
595 smoothed_frequency_distribution_ExponentialFit(freqs,max);
596
597 for(i=0;i<=max+1;i++)
598 {
599 freqs[i] -= 1;
600 // possibly unnecesary check
601 if ( freqs(i) < 0 )
602 freqs[i] = 0;
603 }
604 }
605
606 adjusted_frequencies_BasicGoodTuring(mapped_freqs,freqs,max);
607
608 // and put it in the discount array
609 ngrammar.backoff_discount[o-1].resize(freqs.n());
610 for(i=(int)ngrammar.backoff_threshold;i<=max;i++)
611 {
612 ngrammar.backoff_discount[o-1][i] = (double)i - mapped_freqs(i);
613
614 // sanity check
615 if( ngrammar.backoff_discount[o-1][i] < 0)
616 {
617 ngrammar.backoff_discount[o-1][i] = 0;
618 }
619 }
620
621 for(;i<freqs.n();i++)
622 ngrammar.backoff_discount[o-1][i] = default_discount;
623
624 }
625}
626
627VAL_REGISTER_CLASS(ngrammar,EST_Ngrammar)
628
EST_Litem * item_next(EST_Litem *idx) const
Used for iterating through members of the distribution.
void item_freq(EST_Litem *idx, EST_String &s, double &freq) const
During iteration returns name and frequency given index
EST_Litem * item_start() const
Used for iterating through members of the distribution.
double samples(void) const
Total number of example found.
void override_frequency(const EST_String &s, double c)
Sets the frequency of named item, without modifying {\tt num_samples}.
void set_frequency(const EST_String &s, double c)
int item_end(EST_Litem *idx) const
Used for iterating through members of the distribution.
void resize(int n, int set=1)
resize vector
INLINE int n() const
number of items in vector.
void fill(const T &v)
Fill entire array will value <parameter>v</parameter>.