48#include "EST_String.h"
49#include "EST_Ngrammar.h"
51#include "EST_cutils.h"
77 int this_num,this_order;
79 if (ts.
open(filename) == -1)
80 return misc_read_error;
83 while ((!ts.
eof()) && !ts.
get().string().contains(
"\\data\\"));
108 this_order=atoi(s.
before(
"="));
109 this_num=atoi(s.
after(
"="));
114 nums[this_order] = this_num;
116 if(this_order > order)
131 if(!n.init(order,EST_Ngrammar::backoff,vocab))
132 return misc_read_error;
135 for(i=1;i<=order;i++)
141 EST_String tmp =
"\\" + itoString(i) +
"-grams:";
152 cerr <<
"Unexpected end of grammar file whilst looking for '"
153 << tmp <<
"'" << endl;
154 return misc_read_error;
161 for(j=0;j<nums(i);j++)
164 for (k=0; ((k<i) && !ts.
eof()); k++)
165 window[k] = ts.
get().string();
169 cerr <<
"Unexpected end of file whilst reading " << i
170 <<
"-grams !" << endl;
171 return misc_read_error;
176 cerr <<
"ooooooooops" << endl;
179 occur = atof(ts.
get().string());
180 n.accumulate(window,occur);
185 weight = atof(ts.
get().string());
186 n.set_backoff_weight(window,weight);
191 cerr <<
"EST_Ngrammar:load_ngram_arpa expect end of line at filepos "
194 return misc_read_error;
203 if (ts.
get().string() ==
"\\end\\")
210 cerr <<
"Missing \\end\\ !" << endl;
213 return misc_read_error;
224 if (ts.
open(filename) == -1)
225 return misc_read_error;
227 if (ts.
peek().string() !=
"Ngram_2")
234 order = atoi(ts.
get().string());
240 vocab.append(ts.
get().string());
245 if(!n.init(order,EST_Ngrammar::dense,vocab,pred_vocab))
247 cerr <<
"Something may be wrong with the vocab lists in '"
248 << filename <<
"'" << endl;
249 return misc_read_error;
256 for (i=0; i < order; i++)
257 window[i] = ts.
get().string();
258 if (ts.
get().string() !=
":")
260 cerr <<
"EST_Ngrammar:load_ngram_cstr_ascii missing colon at filepos "
262 return misc_read_error;
264 occur = atof(ts.
get().string());
265 n.accumulate(window,occur);
268 cerr <<
"EST_Ngrammar:load_ngram_cstr_ascii expect end of line at filepos "
270 return misc_read_error;
286 double approx_num_samples = 0.0;
287 long freq_data_start, freq_data_end;
292 if ((ifd=fopen(filename,
"rb")) == NULL)
293 return misc_read_error;
294 fread(&magic,
sizeof(
int),1,ifd);
296 if (SWAPINT(magic) == EST_NGRAMBIN_MAGIC)
298 else if (magic != EST_NGRAMBIN_MAGIC)
300 if (ts.
open(ifd, FALSE) == -1)
301 return misc_read_error;
306 if (ts.
peek().string() !=
"mBin_2")
314 order = atoi(ts.
get().string());
315 if (ts.
get() !=
"\n")
319 return misc_read_error;
324 while ((ts.
peek() !=
"\n") && (!ts.
eof()))
325 vocab.append(ts.
get().string());
327 while ((ts.
peek() !=
"\n") && (!ts.
eof()))
334 if(!n.init(order,EST_Ngrammar::dense,vocab,pred_vocab))
338 return misc_read_error;
343 freq_data_start = ftell(ifd);
344 fseek(ifd,0,SEEK_END);
345 freq_data_end = ftell(ifd);
346 num_entries = (freq_data_end-freq_data_start)/
sizeof(
double);
347 double *dd =
new double[num_entries];
350 fseek(ifd,freq_data_start,SEEK_SET);
352 if (fread(dd,
sizeof(
double),num_entries,ifd) != (
unsigned)num_entries)
354 cerr <<
"EST_Ngrammar::load_ngram_cstr_bin format does not have expected number of entries" << endl;
357 return misc_read_error;
360 swap_bytes_double(dd,num_entries);
362 for(j=i=0;i<n.num_states();i++)
364 if (j >= num_entries)
366 cerr <<
"EST_Ngrammar::load_ngram_cstr_bin unexpected end of frequency data" << endl;
369 return misc_read_error;
372 (!n.p_states[i].pdf().
item_end(k)) && (j < num_entries) ;
377 approx_num_samples += dd[j];
382 if (j+1 >= num_entries)
384 else if (dd[j+1] < -1)
386 else if (dd[j+1] == -1)
394 n.p_num_samples = (int)approx_num_samples;
407save_ngram_htk_ascii_sub(
const EST_String &word, ostream *ost,
414 this_ngram[0] = word;
416 this_pdf = n.prob_dist(this_ngram);
422 double floor_prob_total = floor * (n.pred_vocab->
length()-1);
424 if (word == n.p_sentence_end_marker)
427 *ost <<
" 0*" << n.pred_vocab->
length()-1 <<
" " << 1 << endl;
431 if(floor_prob_total > 1)
433 cerr <<
"ERROR : floor is impossibly large, scaling it !" << endl;
434 floor = 1.0 / (double)(n.pred_vocab->
length()-1);
435 floor_prob_total = 1;
444 if(name != n.p_sentence_start_marker)
452 *ost << word <<
" 0 ";
456 *ost << 1.0 / (double)(n.pred_vocab->
length()-1) <<
"*";
457 *ost << n.pred_vocab->
length()-1 <<
" " << endl;
469 if ( (name == n.p_sentence_start_marker) ||
470 (name == n.p_sentence_end_marker) ||
471 (name == OOV_MARKER) )
479 *ost <<
"*" << lcount <<
" ";
488 double base_prob = freq / total_freq;
491 *ost << floor + ( base_prob * (1-floor_prob_total) );
505 if(!n.closed_vocab())
510 *ost << 0 <<
" ERROR !!!!!!!! ";
516 freq = this_pdf.frequency(n.p_sentence_end_marker);
521 *ost <<
"*" << lcount <<
" " << endl;
527 *ost <<
"*" << lcount <<
" ";
533 double base_prob = freq / total_freq;
536 *ost << floor + ( base_prob * (1-floor_prob_total) ) << endl;
540 *ost << floor << endl;
548save_ngram_htk_ascii(
const EST_String filename,
557 cerr <<
"Can only save bigrams in htk_ascii format" << endl;
558 return misc_write_error;
563 cerr <<
"Negative floor probability does not make sense !" << endl;
564 return misc_write_error;
570 ost =
new ofstream(filename);
575 if(floor * (n.pred_vocab->
length()-1) > 1)
577 floor = 1.0 / (double)(n.pred_vocab->
length()-1);
578 cerr <<
"ERROR : floor is impossibly large, scaling it to ";
579 cerr << floor << endl;
584 if(n.p_sentence_start_marker ==
"")
586 cerr <<
"Can't save in HTK format as no sentence start/end tags"
587 <<
" were given !" << endl;
588 return misc_write_error;
592 save_ngram_htk_ascii_sub(n.p_sentence_start_marker,ost,n,floor);
595 for(i=0;i<n.vocab->
length();i++)
597 if ( (n.vocab->
name(i) != n.p_sentence_start_marker) &&
598 (n.vocab->
name(i) != n.p_sentence_end_marker) &&
599 (n.vocab->
name(i) != OOV_MARKER) )
600 save_ngram_htk_ascii_sub(n.vocab->
name(i),ost,n,floor);
603 if(!n.closed_vocab())
604 save_ngram_htk_ascii_sub(OOV_MARKER,ost,n,floor);
606 save_ngram_htk_ascii_sub(n.p_sentence_end_marker,ost,n,floor);
625 if(n->ngram_exists(ngram))
626 *((
double*)count) += 1;
635 if(n->ngram_exists(ngram))
637 *((ostream*)(ost)) << safe_log10(n->probability(ngram)) <<
" ";
638 for(i=0;i<ngram.
n();i++)
639 *((ostream*)(ost)) << ngram(i) <<
" ";
641 if ((n->representation() == EST_Ngrammar::backoff) &&
642 (n->order() > ngram.
n()) )
643 *((ostream*)(ost)) << safe_log10(n->get_backoff_weight(ngram));
647 *((ostream*)(ost)) << endl;
663 ost =
new ofstream(filename);
672 *ost <<
"\\data\\" << endl;
674 double *count =
new double;
676 if (n.representation() == EST_Ngrammar::backoff)
678 for(o=1;o<=n.order();o++)
687 n.iterate(ngram,&count_ngram_arpa_sub,(
void*)count);
688 *ost <<
"ngram " << o <<
"=" << *count << endl;
691 for(o=1;o<=n.order();o++)
694 *ost <<
"\\" << o <<
"-grams:" << endl;
698 n.iterate(ngram,&save_ngram_arpa_sub,(
void*)ost);
705 for(i=0;i<n.order();i++)
708 n.iterate(ngram,&count_ngram_arpa_sub,(
void*)count);
709 *ost <<
"ngram " << n.order() <<
"=" << *count << endl;
712 *ost <<
"\\" << n.order() <<
"-grams:" << endl;
714 for(i=0;i<n.order();i++)
716 n.iterate(ngram,&save_ngram_arpa_sub,ost);
720 *ost <<
"\\end\\" << endl;
730 const bool trace,
double floor)
741 ost =
new ofstream(filename);
746 *ost <<
"Ngram_2 " << n.order() << endl;
747 for (i=0; i < n.vocab->
length(); i++)
748 *ost << n.vocab->
name(i) <<
" ";
750 for (i=0; i < n.pred_vocab->
length(); i++)
751 *ost << n.pred_vocab->
name(i) <<
" ";
754 if (n.representation() == EST_Ngrammar::dense)
755 n.print_freqs(*ost,floor);
756 else if (n.representation() == EST_Ngrammar::backoff)
758 int total_ngrams = (int)pow(
float(n.get_vocab_length()),
float(n.order()-1));
760 for(i=0;i<total_ngrams;i++)
764 this_pdf = n.prob_dist(this_ngram);
774 for (
int jj=0; jj < this_ngram.
n(); jj++)
775 *ost << this_ngram(jj) <<
" ";
776 *ost << name <<
" : " << freq << endl;
794 if ((ost = fopen(filename,
"wb")) == NULL)
796 cerr <<
"Ngrammar save: unable to open \"" << filename <<
797 "\" for writing" << endl;
801 fprintf(ost,
"EST_File fst\n");
802 fprintf(ost,
"DataType ascii\n");
803 fprintf(ost,
"in \"(");
804 for (i=0; i < n.vocab->
length(); i++)
805 fprintf(ost,
" %s\n",(
const char *)n.vocab->
name(i));
806 fprintf(ost,
" )\"\n");
807 fprintf(ost,
"out \"(");
808 for (i=0; i < n.vocab->
length(); i++)
809 fprintf(ost,
" %s\n",(
const char *)n.vocab->
name(i));
810 fprintf(ost,
" )\"\n");
811 fprintf(ost,
"NumStates %d\n",n.num_states());
812 fprintf(ost,
"EST_Header_End\n");
814 for (i=0; i<n.num_states(); i++)
816 fprintf(ost,
"((%d nonfinal %d)\n",i,i);
827 const bool trace,
double floor)
830 if (n.representation() == EST_Ngrammar::sparse)
831 return misc_write_error;
838 int magic = EST_NGRAMBIN_MAGIC;
842 if ((ofd=stdout) == NULL)
843 return misc_write_error;
847 if ((ofd=fopen(filename,
"wb")) == NULL)
848 return misc_write_error;
851 fwrite(&magic,
sizeof(
int),1,ofd);
852 fprintf(ofd,
"mBin_2 %d\n",n.order());
853 for (i=0; i < n.vocab->
length(); i++)
854 fprintf(ofd,
"%s ",(
const char *)n.vocab->
name(i));
856 for (i=0; i < n.pred_vocab->
length(); i++)
857 fprintf(ofd,
"%s ",(
const char *)n.pred_vocab->
name(i));
865 cerr <<
"Saving ..." << endl;
867 if (n.representation() == EST_Ngrammar::dense)
869 for(i=0;i<n.num_states();i++)
873 cerr <<
"\r" << i*100/n.num_states() <<
"%";
881 n.p_states[i].pdf().
item_freq(k,name,freq);
889 fwrite(&count,
sizeof(
double),1,ofd);
890 fwrite(&freq,
sizeof(
double),1,ofd);
897 fwrite(&count,
sizeof(
double),1,ofd);
899 else if (n.representation() == EST_Ngrammar::backoff)
906 int total_ngrams = (int)pow(
float(n.get_vocab_length()),
float(n.order()-1));
908 for(i=0;i<total_ngrams;i++)
912 cerr <<
"\r" << i*100/total_ngrams <<
"%";
916 this_pdf = n.prob_dist(this_ngram);
931 fwrite(&count,
sizeof(
double),1,ofd);
932 fwrite(&freq,
sizeof(
double),1,ofd);
943 cerr <<
"\r \r" << endl;
EST_Litem * item_next(EST_Litem *idx) const
Used for iterating through members of the distribution.
void item_freq(EST_Litem *idx, EST_String &s, double &freq) const
During iteration returns name and frequency given index
EST_Litem * item_start() const
Used for iterating through members of the distribution.
void cumulate(const EST_String &s, double count=1)
Add this observation, may specify number of occurrences.
void set_frequency(const EST_String &s, double c)
int item_end(EST_Litem *idx) const
Used for iterating through members of the distribution.
const EST_String & name(const int n) const
The name given the index.
const int length(void) const
The number of members in the discrete.
EST_String before(int pos, int len=0) const
Part before position.
int contains(const char *s, int pos=-1) const
Does it contain this substring?
EST_String after(int pos, int len=1) const
Part after pos+len.
void append(const T &item)
add item onto end of list
INLINE int n() const
number of items in vector.
int filepos(void) const
current file position in \Ref{EST_TokenStream}
void set_SingleCharSymbols(const EST_String &sc)
set which characters are to be treated as single character symbols
EST_Token get_upto_eoln(void)
get up to {\tt s} in end of line as a single token.
EST_Token & peek(void)
peek at next token
void close(void)
Close stream.
int open(const EST_String &filename)
open a \Ref{EST_TokenStream} for a file.
void set_WhiteSpaceChars(const EST_String &ws)
set which characters are to be treated as whitespace
EST_TokenStream & get(EST_Token &t)
get next token in stream
int filepos(void) const
file position in original \Ref{EST_TokenStream}.