47#include <sphinxbase/ckd_alloc.h>
48#include <sphinxbase/listelem_alloc.h>
49#include <sphinxbase/err.h>
56#define __CHAN_DUMP__ 0
58#define chan_v_eval(chan) hmm_dump_vit_eval(&(chan)->hmm, stderr)
60#define chan_v_eval(chan) hmm_vit_eval(&(chan)->hmm)
73 ngs->n_expand_words = 0;
74 n_words = ps_search_n_words(ngs);
75 bitvec_clear_all(ngs->expand_word_flag, ps_search_n_words(ngs));
76 for (i = 0; i < n_words; ++i) {
77 if (!ngram_model_set_known_wid(ngs->
lmset,
78 dict_basewid(ps_search_dict(ngs),i)))
81 ngs->expand_word_list[ngs->n_expand_words] = i;
82 bitvec_set(ngs->expand_word_flag, i);
83 ngs->n_expand_words++;
85 E_INFO(
"Utterance vocabulary contains %d words\n", ngs->n_expand_words);
86 ngs->expand_word_list[ngs->n_expand_words] = -1;
93 dict_t *dict = ps_search_dict(ngs);
94 int n_words = ps_search_n_words(ngs);
100 for (w = 0; w < n_words; w++) {
101 if (dict_is_single_phone(dict, w))
108 for (w = 0; w < n_words; w++) {
109 if (!dict_is_single_phone(dict, w))
116 bin_mdef_pid2ssid(ps_search_acmod(ngs)->mdef,
118 bin_mdef_pid2tmatid(ps_search_acmod(ngs)->mdef,
131 int n_words = ps_search_n_words(ngs);
133 for (i = w = 0; w < n_words; ++w) {
134 if (!dict_is_single_phone(ps_search_dict(ngs), w))
149 n_words = ps_search_n_words(ngs);
151 ngs->expand_word_flag = bitvec_alloc(n_words);
152 ngs->expand_word_list = ckd_calloc(n_words + 1,
sizeof(*ngs->expand_word_list));
154 ngs->min_ef_width = cmd_ln_int32_r(ps_search_config(ngs),
"-fwdflatefwid");
155 ngs->max_sf_win = cmd_ln_int32_r(ps_search_config(ngs),
"-fwdflatsfwin");
156 E_INFO(
"fwdflat: min_ef_width = %d, max_sf_win = %d\n",
157 ngs->min_ef_width, ngs->max_sf_win);
162 ngram_fwdflat_expand_all(ngs);
164 ngram_fwdflat_allocate_1ph(ngs);
171 double n_speech = (double)ngs->n_tot_frame
172 / cmd_ln_int32_r(ps_search_config(ngs),
"-frate");
174 E_INFO(
"TOTAL fwdflat %.2f CPU %.3f xRT\n",
175 ngs->fwdflat_perf.t_tot_cpu,
176 ngs->fwdflat_perf.t_tot_cpu / n_speech);
177 E_INFO(
"TOTAL fwdflat %.2f wall %.3f xRT\n",
178 ngs->fwdflat_perf.t_tot_elapsed,
179 ngs->fwdflat_perf.t_tot_elapsed / n_speech);
183 ngram_fwdflat_free_1ph(ngs);
186 bitvec_free(ngs->expand_word_flag);
187 ckd_free(ngs->expand_word_list);
198 ckd_free(ngs->expand_word_list);
199 bitvec_free(ngs->expand_word_flag);
200 n_words = ps_search_n_words(ngs);
202 ngs->expand_word_flag = bitvec_alloc(n_words);
203 ngs->expand_word_list = ckd_calloc(n_words + 1,
sizeof(*ngs->expand_word_list));
208 ngram_fwdflat_free_1ph(ngs);
214 ngram_fwdflat_expand_all(ngs);
216 ngram_fwdflat_allocate_1ph(ngs);
229 int32 i, f, sf, ef, wid, nwd;
241 for (i = 0, bp = ngs->bp_table; i < ngs->bpidx; i++, bp++) {
242 sf = (bp->
bp < 0) ? 0 : ngs->bp_table[bp->bp].frame + 1;
248 if (!ngram_model_set_known_wid(ngs->
lmset,
249 dict_basewid(ps_search_dict(ngs), wid)))
263 node->fef = node->lef = ef;
271 for (f = 0; f < ngs->
n_frame; f++) {
273 for (node = ngs->
frm_wordlist[f]; node; node = nextnode) {
274 nextnode = node->
next;
276 if ((node->lef - node->fef < ngs->min_ef_width) ||
278 ((node->wid == ps_search_finish_wid(ngs)) && (node->lef < ngs->
n_frame - 1))) {
282 prevnode->
next = nextnode;
292 bitvec_clear_all(ngs->
word_active, ps_search_n_words(ngs));
293 for (f = 0; f < ngs->
n_frame; f++) {
294 for (node = ngs->
frm_wordlist[f]; node; node = node->next) {
302 E_INFO(
"Utterance vocabulary contains %d words\n", nwd);
317 dict = ps_search_dict(ngs);
318 d2p = ps_search_dict2pid(ngs);
325 if (dict_is_single_phone(dict, wid))
334 rhmm->
ci2phone = dict_second_phone(dict, wid);
335 rhmm->
ciphone = dict_first_phone(dict, wid);
338 bin_mdef_pid2ssid(ps_search_acmod(ngs)->mdef, rhmm->
ciphone),
339 bin_mdef_pid2tmatid(ps_search_acmod(ngs)->mdef, rhmm->
ciphone));
343 for (p = 1; p < dict_pronlen(dict, wid) - 1; p++) {
346 hmm->info.
rc_id = (p == dict_pronlen(dict, wid) - 1) ? 0 : -1;
350 bin_mdef_pid2tmatid(ps_search_acmod(ngs)->mdef, hmm->
ciphone));
379 ptmr_reset(&ngs->fwdflat_perf);
380 ptmr_start(&ngs->fwdflat_perf);
381 build_fwdflat_wordlist(ngs);
382 build_fwdflat_chan(ngs);
387 for (i = 0; i < ps_search_n_words(ngs); i++)
388 ngs->word_lat_idx[i] = NO_BP;
395 hmm_clear(&rhmm->
hmm);
400 hmm_enter(&rhmm->
hmm, 0, NO_BP, 0);
405 ngs->renormalized = FALSE;
407 for (i = 0; i < ps_search_n_words(ngs); i++)
408 ngs->last_ltrans[i].sf = -1;
413 ngs->
st.n_fwdflat_chan = 0;
414 ngs->
st.n_fwdflat_words = 0;
415 ngs->
st.n_fwdflat_word_transition = 0;
416 ngs->
st.n_senone_active_utt = 0;
432 for (i = 0; i < nw; i++) {
435 if (hmm_frame(&rhmm->
hmm) == frame_idx) {
439 for (hmm = rhmm->
next; hmm; hmm = hmm->
next) {
440 if (hmm_frame(&hmm->
hmm) == frame_idx) {
450 int32 i, w, nw, bestscore;
459 ngs->
st.n_fwdflat_words += nw;
462 for (i = 0; i < nw; i++) {
465 if (hmm_frame(&rhmm->
hmm) == frame_idx) {
466 int32 score = chan_v_eval(rhmm);
467 if ((score
BETTER_THAN bestscore) && (w != ps_search_finish_wid(ngs)))
469 ngs->
st.n_fwdflat_chan++;
472 for (hmm = rhmm->
next; hmm; hmm = hmm->
next) {
473 if (hmm_frame(&hmm->
hmm) == frame_idx) {
474 int32 score = chan_v_eval(hmm);
477 ngs->
st.n_fwdflat_chan++;
488 int32 i, nw, cf, nf, w, pip, newscore, thresh, wordthresh;
497 bitvec_clear_all(ngs->
word_active, ps_search_n_words(ngs));
500 wordthresh = ngs->
best_score + ngs->fwdflatwbeam;
502 E_DEBUG(3,(
"frame %d thresh %d wordthresh %d\n", frame_idx, thresh, wordthresh));
505 for (i = 0; i < nw; i++) {
509 if (hmm_frame(&rhmm->
hmm) == cf
511 hmm_frame(&rhmm->
hmm) = nf;
515 newscore = hmm_out_score(&rhmm->
hmm);
517 assert(!dict_is_single_phone(ps_search_dict(ngs), w));
523 if (hmm->info.
rc_id >= 0) {
524 for (; hmm; hmm = hmm->
next) {
525 if ((hmm_frame(&hmm->
hmm) < cf)
527 hmm_enter(&hmm->
hmm, newscore,
528 hmm_out_history(&rhmm->
hmm), nf);
534 if ((hmm_frame(&hmm->
hmm) < cf)
536 hmm_enter(&hmm->
hmm, newscore,
537 hmm_out_history(&rhmm->
hmm), nf);
543 assert(dict_is_single_phone(ps_search_dict(ngs), w));
551 hmm_out_history(&rhmm->
hmm), 0);
557 for (hmm = rhmm->
next; hmm; hmm = hmm->
next) {
558 if (hmm_frame(&hmm->
hmm) >= cf) {
561 hmm_frame(&hmm->
hmm) = nf;
564 newscore = hmm_out_score(&hmm->
hmm);
566 if (hmm->info.
rc_id < 0) {
571 if (nexthmm->info.
rc_id >= 0) {
572 for (; nexthmm; nexthmm = nexthmm->
next) {
573 if ((hmm_frame(&nexthmm->
hmm) < cf)
575 hmm_in_score(&nexthmm->
hmm))) {
576 hmm_enter(&nexthmm->
hmm,
578 hmm_out_history(&hmm->
hmm),
585 if ((hmm_frame(&nexthmm->
hmm) < cf)
587 hmm_in_score(&nexthmm->
hmm))) {
588 hmm_enter(&nexthmm->
hmm, newscore,
589 hmm_out_history(&hmm->
hmm), nf);
598 hmm_out_history(&hmm->
hmm),
604 else if (hmm_frame(&hmm->
hmm) != nf) {
605 hmm_clear_scores(&hmm->
hmm);
619 ngs->
st.n_fwdflat_word_transition += ngs->n_expand_words;
630 bitvec_clear_all(ngs->expand_word_flag, ps_search_n_words(ngs));
631 ngs->n_expand_words = 0;
633 for (f = sf; f < ef; f++) {
635 if (!bitvec_is_set(ngs->expand_word_flag, node->
wid)) {
636 ngs->expand_word_list[ngs->n_expand_words++] = node->
wid;
637 bitvec_set(ngs->expand_word_flag, node->
wid);
641 ngs->expand_word_list[ngs->n_expand_words] = -1;
642 ngs->
st.n_fwdflat_word_transition += ngs->n_expand_words;
648 int32 cf, nf, b, thresh, pip, i, nw, w, newscore;
649 int32 best_silrc_score = 0, best_silrc_bp = 0;
655 dict_t *dict = ps_search_dict(ngs);
663 lwf = ngs->fwdflat_fwdtree_lw_ratio;
667 get_expand_wordlist(ngs, cf, ngs->max_sf_win);
670 for (b = ngs->bp_table_idx[cf]; b < ngs->bpidx; b++) {
674 bp = ngs->bp_table + b;
675 ngs->word_lat_idx[bp->
wid] = NO_BP;
677 if (bp->
wid == ps_search_finish_wid(ngs))
683 rcss = ngs->bscore_stack + bp->
s_idx;
690 for (i = 0; ngs->expand_word_list[i] >= 0; i++) {
693 w = ngs->expand_word_list[i];
698 newscore = rcss[rssid->
cimap[dict_first_phone(dict, w)]];
700 newscore = bp->
score;
705 * (ngram_tg_score(ngs->
lmset,
706 dict_basewid(dict, w),
715 if ((hmm_frame(&rhmm->
hmm) < cf)
717 hmm_enter(&rhmm->
hmm, newscore, b, nf);
720 hmm_mpx_ssid(&rhmm->
hmm, 0) =
722 dict_last_phone(dict, bp->
wid));
723 assert(IS_S3SSID(hmm_mpx_ssid(&rhmm->
hmm, 0)));
724 E_DEBUG(6,(
"ssid %d(%d,%d) = %d\n",
726 hmm_mpx_ssid(&rhmm->
hmm, 0)));
734 silscore = rcss[rssid->
cimap[ps_search_acmod(ngs)->mdef->sil]];
736 silscore = bp->
score;
738 best_silrc_score = silscore;
744 newscore = best_silrc_score + ngs->silpen + pip;
746 w = ps_search_silence_wid(ngs);
748 if ((hmm_frame(&rhmm->
hmm) < cf)
750 hmm_enter(&rhmm->
hmm, newscore,
756 newscore = best_silrc_score + ngs->fillpen + pip;
758 for (w = dict_filler_start(dict); w <= dict_filler_end(dict); w++) {
759 if (w == ps_search_silence_wid(ngs))
766 if ((hmm_frame(&rhmm->
hmm) < cf)
768 hmm_enter(&rhmm->
hmm, newscore,
778 for (i = 0; i < nw; i++) {
781 if (hmm_frame(&rhmm->
hmm) == cf) {
782 hmm_clear_scores(&rhmm->
hmm);
788fwdflat_renormalize_scores(
ngram_search_t *ngs,
int frame_idx, int32 norm)
792 int32 i, nw, cf, w, *awl;
799 for (i = 0; i < nw; i++) {
802 if (hmm_frame(&rhmm->
hmm) == cf) {
803 hmm_normalize(&rhmm->
hmm, norm);
805 for (hmm = rhmm->
next; hmm; hmm = hmm->
next) {
806 if (hmm_frame(&hmm->
hmm) == cf) {
807 hmm_normalize(&hmm->
hmm, norm);
812 ngs->renormalized = TRUE;
823 if (!ps_search_acmod(ngs)->compallsen)
824 compute_fwdflat_sen_active(ngs, frame_idx);
827 senscr =
acmod_score(ps_search_acmod(ngs), &frame_idx);
828 ngs->
st.n_senone_active_utt += ps_search_acmod(ngs)->n_senone_active;
839 E_INFO(
"Renormalizing Scores at frame %d, best score %d\n",
841 fwdflat_renormalize_scores(ngs, frame_idx, ngs->
best_score);
848 fwdflat_eval_chan(ngs, frame_idx);
850 fwdflat_prune_chan(ngs, frame_idx);
852 fwdflat_word_transition(ngs, frame_idx);
859 if (bitvec_is_set(ngs->
word_active, wid) && wid < ps_search_start_wid(ngs)) {
865 for (i = ps_search_start_wid(ngs); i < ps_search_n_words(ngs); i++) {
891 for (f = 0; f < ngs->
n_frame; f++) {
911 if (dict_is_single_phone(ps_search_dict(ngs),wid))
933 destroy_fwdflat_chan(ngs);
934 destroy_fwdflat_wordlist(ngs);
935 bitvec_clear_all(ngs->
word_active, ps_search_n_words(ngs));
938 cf = ps_search_acmod(ngs)->output_frame;
942 ptmr_stop(&ngs->fwdflat_perf);
945 double n_speech = (double)(cf + 1)
946 / cmd_ln_int32_r(ps_search_config(ngs),
"-frate");
947 E_INFO(
"%8d words recognized (%d/fr)\n",
948 ngs->bpidx, (ngs->bpidx + (cf >> 1)) / (cf + 1));
949 E_INFO(
"%8d senones evaluated (%d/fr)\n", ngs->
st.n_senone_active_utt,
950 (ngs->
st.n_senone_active_utt + (cf >> 1)) / (cf + 1));
951 E_INFO(
"%8d channels searched (%d/fr)\n",
952 ngs->
st.n_fwdflat_chan, ngs->
st.n_fwdflat_chan / (cf + 1));
953 E_INFO(
"%8d words searched (%d/fr)\n",
954 ngs->
st.n_fwdflat_words, ngs->
st.n_fwdflat_words / (cf + 1));
955 E_INFO(
"%8d word transitions (%d/fr)\n",
956 ngs->
st.n_fwdflat_word_transition,
957 ngs->
st.n_fwdflat_word_transition / (cf + 1));
958 E_INFO(
"fwdflat %.2f CPU %.3f xRT\n",
959 ngs->fwdflat_perf.t_cpu,
960 ngs->fwdflat_perf.t_cpu / n_speech);
961 E_INFO(
"fwdflat %.2f wall %.3f xRT\n",
962 ngs->fwdflat_perf.t_elapsed,
963 ngs->fwdflat_perf.t_elapsed / n_speech);
void acmod_activate_hmm(acmod_t *acmod, hmm_t *hmm)
Activate senones associated with an HMM.
int16 const * acmod_score(acmod_t *acmod, int *inout_frame_idx)
Score one frame of data.
void acmod_clear_active(acmod_t *acmod)
Clear set of active senones.
s3ssid_t dict2pid_internal(dict2pid_t *d2p, int32 wid, int pos)
Return the senone sequence ID for the given word position.
#define dict2pid_rssid(d, ci, lc)
Access macros; not designed for arbitrary use.
#define dict_size(d)
Packaged macro access to dictionary members.
#define dict_pron(d, w, p)
The CI phones of the word w at position p.
#define BETTER_THAN
Is one score better than another?
#define hmm_context_set_senscore(ctx, senscr)
Change the senone score array for a context.
#define WORST_SCORE
Large "bad" score.
#define WORSE_THAN
Is one score worse than another?
#define SENSCR_SHIFT
Shift count for senone scores.
void ngram_search_free_all_rc(ngram_search_t *ngs, int32 w)
Free last phone channels for all possible right contexts for word w.
void ngram_search_alloc_all_rc(ngram_search_t *ngs, int32 w)
Allocate last phone channels for all possible right contexts for word w.
int ngram_search_mark_bptable(ngram_search_t *ngs, int frame_idx)
Record the current frame's index in the backpointer table.
void ngram_search_save_bp(ngram_search_t *ngs, int frame_idx, int32 w, int32 score, int32 path, int32 rc)
Enter a word in the backpointer table.
N-Gram based multi-pass search ("FBS")
void ngram_fwdflat_start(ngram_search_t *ngs)
Start fwdflat decoding for an utterance.
void ngram_fwdflat_deinit(ngram_search_t *ngs)
Release memory associated with fwdflat decoding.
int ngram_fwdflat_reinit(ngram_search_t *ngs)
Rebuild search structures for updated language models.
void ngram_fwdflat_finish(ngram_search_t *ngs)
Finish fwdflat decoding for an utterance.
void ngram_fwdflat_init(ngram_search_t *ngs)
Initialize N-Gram search for fwdflat decoding.
int ngram_fwdflat_search(ngram_search_t *ngs, int frame_idx)
Search one frame forward in an utterance.
Word graph search implementation.
Back pointer table (forward pass lattice; actually a tree)
int16 last2_phone
next-to-last phone of this word
int32 prev_real_wid
wid of second-last real word
int32 real_wid
wid of this or latest predecessor real word
int32 score
Score (best among all right contexts)
int16 last_phone
last phone of this word
int32 s_idx
Start of BScoreStack for various right contexts.
frame_idx_t frame
start or end frame
Lexical tree node data type.
struct chan_s * next
first descendant of this channel; or, in the case of the last phone of a word, the next alternative right context channel
int32 ciphone
ciphone for this node
hmm_t hmm
Basic HMM structure.
int32 rc_id
right-context id for last phone of words
Building composite triphone (as well as word internal triphones) with the dictionary.
a structure for a dictionary.
N-Gram search module structure.
int32 * single_phone_wid
list of single-phone word ids
int32 best_score
Best Viterbi path score.
root_chan_t * rhmm_1ph
Root HMMs for single-phone words.
listelem_alloc_t * latnode_alloc
For latnode_t.
int32 n_frame_alloc
Number of frames allocated in bp_table_idx and friends.
int32 ** active_word_list
Array of active multi-phone words for current and next frame.
int32 n_frame
Number of frames actually present.
ngram_search_stats_t st
Various statistics for profiling.
listelem_alloc_t * root_chan_alloc
For root_chan_t.
int32 n_active_word[2]
Number of entries in active_word_list.
ngram_model_t * lmset
Set of language models.
int32 * fwdflat_wordlist
List of active word IDs for utterance.
chan_t ** word_chan
Channels associated with a given word (only used for right contexts, single-phone words in fwdtree search, and word HMMs in fwdflat search).
int32 n_1ph_words
Number of single-phone words in dict (total)
ps_latnode_t ** frm_wordlist
List of active words in each frame.
listelem_alloc_t * chan_alloc
For chan_t.
hmm_context_t * hmmctx
HMM context.
bitvec_t * word_active
array of active flags for all words.
struct ps_latnode_s * next
Next node in DAG (no ordering implied)
int32 wid
Dictionary word id.
Lexical tree node data type for the first phone (root) of each dynamic HMM tree structure.
int16 ci2phone
second ciphone of this node; one root HMM for each unique right context
hmm_t hmm
Basic HMM structure.
int16 ciphone
first ciphone of this node; all words rooted at this node begin with this ciphone
chan_t * next
first descendant of this channel
cross word triphone model structure
s3cipid_t * cimap
Index into ssid[] above for each ci phone.