131class EST_BackoffNgrammarState {
137 double backoff_weight;
// add_child overloads: each takes the discrete `d` and a word sequence and
// returns a pointer to an EST_BackoffNgrammarState.
// NOTE(review): presumably the returned state is the child created/found for
// `words` — confirm against the out-of-line definition.
// Overload taking the words as strings.
141 EST_BackoffNgrammarState* add_child(
const EST_Discrete *d,
142 const EST_StrVector &words);
// Overload taking the words as integer indices.
143 EST_BackoffNgrammarState* add_child(
const EST_Discrete *d,
144 const EST_IVector &words);
146 EST_BackoffNgrammarState()
148 EST_BackoffNgrammarState(
const EST_Discrete *d,
int level)
149 {clear();init(d,level);};
151 {clear();init(pdf,level);};
// Copy-style constructors and destructor (defined out of line).
// Construct from a reference to another state.
152 EST_BackoffNgrammarState(
const EST_BackoffNgrammarState &s);
// Construct from a const pointer to a const state.
153 EST_BackoffNgrammarState(
const EST_BackoffNgrammarState *
const s);
// Destructor.
// NOTE(review): presumably releases the child states — confirm in the implementation.
154 ~EST_BackoffNgrammarState();
// accumulate overloads: add `count` (default 1) for the given word sequence.
// Returns bool; NOTE(review): presumably true on success — confirm.
// Words given as strings.
163 bool accumulate(
const EST_StrVector &words,
164 const double count=1);
// Words given as integer indices.
165 bool accumulate(
const EST_IVector &words,
166 const double count=1);
171 {
return p_pdf.probability(w);}
173 {
return p_pdf.frequency(w);}
174 const EST_String &most_probable(
double *prob = NULL)
const
175 {
return p_pdf.most_probable(prob);}
177 const int level()
const {
return p_level;}
179 EST_BackoffNgrammarState* get_child(
const EST_String &word)
const
181 return (EST_BackoffNgrammarState*)children.lookup(word);
183 EST_BackoffNgrammarState* get_child(
const int word)
const
185 return (EST_BackoffNgrammarState*)children.lookup(p_pdf.get_discrete()->name(word));
188 void remove_child(EST_BackoffNgrammarState *child,
// Look up a state for the word sequence `words`; returns a const pointer to a
// const state.
// NOTE(review): presumably NULL when no such state exists — confirm.
194 const EST_BackoffNgrammarState *
const get_state(
const EST_StrVector &words)
const;
// Test whether the n-gram `words` exists, given `threshold`.
// NOTE(review): the exact comparison against `threshold` is defined out of line.
196 bool ngram_exists(
const EST_StrVector &words,
197 const double threshold)
const;
198 const double get_backoff_weight()
const {
return backoff_weight; }
// Backoff weight for the context given by `words` (defined out of line).
199 const double get_backoff_weight(
const EST_StrVector &words)
const;
// Set the backoff weight for the context `words` to `w`.
// Returns bool; NOTE(review): presumably false when the context is not found — confirm.
200 bool set_backoff_weight(
const EST_StrVector &words,
const double w);
// Write frequencies to `os` for the given `order`; `followers` defaults to "".
// NOTE(review): `followers` looks like an accumulated word suffix used while
// recursing — confirm in the implementation.
203 void print_freqs(ostream &os,
const int order,
EST_String followers=
"");
// Stream output operator for a state, granted friend access.
205friend ostream& operator<<(ostream& s,
const EST_BackoffNgrammarState &a);
// Storage layouts a grammar can use.
214 enum representation_t {sparse, dense, backoff};
// Kind of value held in the entries.
219 enum entry_t {frequencies, log_frequencies};
// NOTE(review): presumably the count of sentences seen during training — confirm.
228 double p_number_of_sentences;
// Current representation; returned by representation().
235 representation_t p_representation;
// Current entry type; returned by entry_type().
236 entry_t p_entry_type;
// Per-representation initialisers (defined out of line).
241 bool init_sparse_representation();
244 bool init_dense_representation();
// Parameters for the backoff representation.
// NOTE(review): exact meaning is set by the implementation — confirm.
252 double backoff_threshold;
255 double backoff_unigram_floor_freq;
// Discount for an n-gram of the given `order` and frequency `freq`.
263 const double get_backoff_discount(
const int order,
const double freq)
const;
// Initialiser for the backoff representation.
265 bool init_backoff_representation();
// NOTE(review): name suggests it re-creates unigram states in the backoff
// tree — confirm in the implementation.
267 void backoff_restore_unigram_states();
// Map a word-index vector to a dense state index; `index` defaults to 0.
270 int find_dense_state_index(
const EST_IVector &words,
int index=0)
const;
// Inverse direction: build the word vector for index `i`.
// Returns a reference; NOTE(review): lifetime of the referenced vector is not
// visible here — confirm before holding on to it.
273 const EST_StrVector &make_ngram_from_index(
const int i)
const;
// Initialise the vocabulary from a single word list...
278 bool init_vocab(
const EST_StrList &wordlist);
// ...or from separate word and predictee lists.
279 bool init_vocab(
const EST_StrList &word_list,
280 const EST_StrList &pred_list);
// Check `wordlist` against the vocabulary.
// NOTE(review): the matching criterion is defined out of line.
283 bool check_vocab(
const EST_StrList &wordlist);
287 const EST_String &lastword(
const EST_StrVector &words)
const
288 {
return words(p_order-1); }
289 const int lastword(
const EST_IVector &words)
const
290 {
return words(p_order-1); }
// Conversions between the sparse and dense representations (defined out of line).
294 bool sparse_to_dense();
295 bool dense_to_sparse();
// NOTE(review): name indicates frequency entries are converted to
// probabilities — confirm in the implementation.
300 void freqs_to_probs();
314 void iterate(EST_StrVector &words,
315 void (*function)(EST_Ngrammar *n,
316 EST_StrVector &words,
321 void const_iterate(EST_StrVector &words,
322 void (*function)(
const EST_Ngrammar *
const n,
323 EST_StrVector &words,
// Internal initialiser taking the order `o` and representation `r`.
// NOTE(review): presumably shared by the public init() overloads — confirm.
327 bool p_init(
int o, representation_t r);
331 bool oov_preprocess(
const EST_String &filename,
343 const double backoff_reverse_probability_sub(
const EST_StrVector &words,
// Probability of `words` under the backoff representation; `trace` defaults to false.
345 const double backoff_probability(
const EST_StrVector &words,
346 const bool trace=
false)
const;
// Reverse probability of `words` under the backoff representation.
347 const double backoff_reverse_probability(
const EST_StrVector &words)
const;
// Most probable word under the backoff representation; `prob` is optional
// (defaults to NULL).
348 const EST_String & backoff_most_probable(
const EST_StrVector &words,
349 double *prob = NULL)
const;
363 void *params,
const int level);
366 EST_Ngrammar() {default_values();}
368 EST_Ngrammar(
int o, representation_t r,
369 const EST_StrList &wordlist)
371 default_values(); init(o,r,wordlist);
375 EST_Ngrammar(
int o, representation_t r,
376 const EST_StrList &wordlist,
377 const EST_StrList &predlist)
379 default_values(); init(o,r,wordlist,predlist);
382 EST_Ngrammar(
int o, representation_t r,
EST_Discrete &v)
384 default_values(); init(o,r,v);
// Reset members to their defaults; the constructors call this first.
388 void default_values();
// Initialise with order `o`, representation `r` and a single word list.
390 bool init(
int o, representation_t r,
391 const EST_StrList &wordlist);
// As above, with a separate predictee list.
392 bool init(
int o, representation_t r,
393 const EST_StrList &wordlist,
394 const EST_StrList &predlist);
396 bool init(
int o, representation_t r,
400 int num_states(
void)
const {
return p_num_states;}
401 double samples(
void)
const {
return p_num_samples;}
402 int order()
const {
return p_order; }
403 int get_vocab_length()
const {
return vocab?vocab->length():0; }
// Map the word `s` to its integer vocabulary index (defined out of line).
// NOTE(review): the value returned for an unknown word is not visible here — confirm.
405 int get_vocab_word(
const EST_String &s)
const;
406 int get_pred_vocab_length()
const {
return pred_vocab->length(); }
407 EST_String get_pred_vocab_word(
int i)
const {
return pred_vocab->name(i); }
408 int get_pred_vocab_word(
const EST_String &s)
const
409 {
return pred_vocab->name(s); }
410 int closed_vocab()
const {
return !allow_oov; }
411 entry_t entry_type()
const {
return p_entry_type;}
412 representation_t representation()
const
413 {
return p_representation;}
416 bool build(
const EST_StrList &filenames,
417 const EST_String &prev = SENTENCE_START_MARKER,
418 const EST_String &prev_prev = SENTENCE_END_MARKER,
422 const int mincount=1,
423 const int maxcount=10);
// Add `count` (default 1) for the n-gram `words`, given as strings.
426 void accumulate(
const EST_StrVector &words,
427 const double count=1);
// Same, with the words given as integer indices.
429 void accumulate(
const EST_IVector &words,
430 const double count=1);
// NOTE(review): name indicates an adjustment for HTK compatibility; the
// details are defined out of line — confirm.
434 void make_htk_compatible();
// Load a grammar from `filename`.
437 EST_read_status load(
const EST_String &filename);
// Load a grammar from `filename`, supplying a word list.
438 EST_read_status load(
const EST_String &filename,
const EST_StrList &wordlist);
439 EST_write_status save(
const EST_String &filename,
441 const bool trace=
false,
// Word -> index in the word list; `report` defaults to true.
// NOTE(review): `report` presumably controls whether a lookup failure is
// reported — confirm.
444 int wordlist_index(
const EST_String &word,
const bool report=
true)
const;
// Index -> word in the word list.
445 const EST_String &wordlist_index(
int i)
const;
// Word -> index in the predictee list.
446 int predlist_index(
const EST_String &word)
const;
// Index -> word in the predictee list.
447 const EST_String &predlist_index(
int i)
const;
// Change the entry type / representation; both return bool.
// NOTE(review): presumably false when the change cannot be made — confirm.
450 bool set_entry_type(entry_t new_type);
451 bool set_representation(representation_t new_representation);
// Probability of the n-gram `words`; `force` and `trace` both default to false.
// NOTE(review): the effect of `force` is defined out of line — confirm.
456 double probability(
const EST_StrVector &words,
bool force=
false,
457 const bool trace=
false)
const;
// Frequency of the n-gram `words`; same optional flags as probability().
458 double frequency(
const EST_StrVector &words,
bool force=
false,
459 const bool trace=
false)
const;
// Core predict overload for string contexts: takes `prob` and `state`
// pointers. The shorter predict() overloads forward to this one, passing the
// addresses of local variables.
461 const EST_String &predict(
const EST_StrVector &words,
462 double *prob,
int *state)
const;
463 const EST_String &predict(
const EST_StrVector &words)
const
464 {
double p;
int state;
return predict(words,&p,&state); }
465 const EST_String &predict(
const EST_StrVector &words,
double *prob)
const
466 {
int state;
return predict(words,prob,&state); }
// Core predict overload for integer-index contexts: takes `prob` and `state`
// pointers. The shorter EST_IVector overloads forward to this one.
468 const EST_String &predict(
const EST_IVector &words,
double *prob,
int *state)
const;
469 const EST_String &predict(
const EST_IVector &words)
const
470 {
double p;
int state;
return predict(words,&p,&state); }
471 const EST_String &predict(
const EST_IVector &words,
double *prob)
const
472 {
int state;
return predict(words,prob,&state); }
// Integer state id for a context given as strings...
474 int find_state_id(
const EST_StrVector &words)
const;
// ...or as integer indices.
475 int find_state_id(
const EST_IVector &words)
const;
// State id taken from `state` and `word`.
// NOTE(review): presumably the id reached after seeing `word` in `state` — confirm.
476 int find_next_state_id(
int state,
int word)
const;
// Reverse probability of `words`; `force` defaults to false.
// NOTE(review): the precise definition of "reverse" is set by the implementation.
484 double reverse_probability(
const EST_StrVector &words,
485 bool force=
false)
const;
// Same, with the words given as integer indices.
486 double reverse_probability(
const EST_IVector &words,
487 bool force=
false)
const;
502 void fill_window_start(EST_IVector &window,
506 void fill_window_start(EST_StrVector &window,
// Test whether the n-gram `words` exists...
513 bool ngram_exists(
const EST_StrVector &words)
const;
// ...optionally with an explicit `threshold`.
514 bool ngram_exists(
const EST_StrVector &words,
const double threshold)
const;
// Read / write the backoff weight associated with `words`.
515 const double get_backoff_weight(
const EST_StrVector &words)
const;
516 bool set_backoff_weight(
const EST_StrVector &words,
const double w);
// Write frequencies to `os`; `floor` defaults to 0.0.
// NOTE(review): presumably entries below `floor` are handled specially — confirm.
518 void print_freqs(ostream &os,
double floor=0.0);
// Stream output operator, granted friend access.
522 friend ostream& operator<<(ostream& s, EST_Ngrammar &n);
523 friend EST_read_status load_ngram_htk_ascii(
const EST_String filename,
525 friend EST_read_status load_ngram_htk_binary(
const EST_String filename,
527 friend EST_read_status load_ngram_arpa(
const EST_String filename,
529 const EST_StrList &vocab);
530 friend EST_read_status load_ngram_cstr_ascii(
const EST_String filename,
532 friend EST_read_status load_ngram_cstr_bin(
const EST_String filename,
535 friend EST_write_status save_ngram_htk_ascii_sub(
const EST_String &word,
539 friend EST_write_status save_ngram_htk_ascii(
const EST_String filename,
545 friend EST_write_status save_ngram_cstr_ascii(
const EST_String filename,
549 friend EST_write_status save_ngram_cstr_bin(
const EST_String filename,
553 friend EST_write_status save_ngram_arpa(
const EST_String filename,
555 friend EST_write_status save_ngram_arpa_sub(ostream *ost,
557 const EST_StrVector &words);
558 friend EST_write_status save_ngram_wfst(
const EST_String filename,
// Free functions granted friend access to this class; all are defined elsewhere.
// Takes an output-style vector `ff`, the grammar and an order.
// NOTE(review): presumably fills `ff` with counts of counts — confirm.
564friend void frequency_of_frequencies(
EST_DVector &ff, EST_Ngrammar &n,
int this_order);
// Takes the grammar, a const vector `map` and an order.
565friend void map_frequencies(EST_Ngrammar &n,
const EST_DVector &map,
const int this_order);
// Good-Turing routines, parameterised by a max (and min) count.
566friend bool Good_Turing_smooth(EST_Ngrammar &n,
int maxcount,
int mincount);
567friend void Good_Turing_discount(EST_Ngrammar &ngrammar,
const int maxcount,
568 const double default_discount);
// Routines operating on `backoff_ngrams` (passed by pointer) together with `ngram`.
570friend void fs_build_backoff_ngrams(EST_Ngrammar *backoff_ngrams,
571 EST_Ngrammar &ngram);
572friend int fs_backoff_smooth(EST_Ngrammar *backoff_ngrams,
573 EST_Ngrammar &ngram,
int smooth_thresh);
// Compute backoff weights; `mincount` defaults to 1 and `maxcount` to 10.
577 bool compute_backoff_weights(
const int mincount=1,
578 const int maxcount=10);
// Merge grammar `n` into this one using `weight`.
// NOTE(review): how `weight` is applied is defined out of line — confirm.
581 bool merge(EST_Ngrammar &n,
float weight);
// EST_BackoffNgrammar is granted access to this class's internals.
583friend class EST_BackoffNgrammar;