45#include "EST_String.h"
49#include "EST_multistats.h"
54const EST_String PredictionSuffixTree_oov(
"_OOV_");
57void slide(EST_IVector &i,
const int l);
58void slide(EST_StrVector &i,
const int l);
60EST_PredictionSuffixTree_tree_node::~EST_PredictionSuffixTree_tree_node()
65EST_PredictionSuffixTree_tree_node::print_freqs(ostream &os)
75 for (i = pd.item_start();
79 pd.item_freq(i,s,freq);
80 os << get_path() <<
" " << s <<
" : " << freq << endl;
85 EST_Features::Entries t;
86 for (t.
begin(nodes); t; t++)
87 pstnode(t->v)->print_freqs(os);
92EST_PredictionSuffixTree_tree_node::print_probs(ostream &os)
101 os << get_path() <<
" :";
102 for (EST_Litem *i = pd.item_start(); !pd.item_end(i) ; i=pd.item_next(i))
104 pd.item_prob(i,s,prob);
105 os <<
" " << s <<
" " << prob;
111 EST_Features::Entries t;
112 for (t.
begin(nodes); t; t++)
113 pstnode(t->v)->print_probs(os);
118EST_PredictionSuffixTree_tree_node::most_probable(
double *prob)
const
121 return pd.most_probable(prob);
124EST_PredictionSuffixTree::EST_PredictionSuffixTree(
void)
133EST_PredictionSuffixTree::init(
const int order)
137 nodes =
new EST_PredictionSuffixTree_tree_node;
138 nodes->set_level(order-1);
139 pd =
new EST_DiscreteProbDistribution;
142EST_PredictionSuffixTree::~EST_PredictionSuffixTree()
149EST_PredictionSuffixTree::clear(
void)
158 const EST_StrVector &words,
159 const int index)
const
163 if (words.
n() == index+1){
164 return node->prob_dist();
167 EST_PredictionSuffixTree_tree_node *next;
168 EST_PredictionSuffixTree_tree_node *n=0;
169 next = pstnode(node->nodes.
f(words(index),est_val(n)));
174 return PSTnullProbDistribution;
176 return p_prob_dist(next,words,index+1);
182EST_PredictionSuffixTree::accumulate(
const EST_StrVector &words,
194 if (words.
n()+index < p_order)
195 cerr <<
"EST_PredictionSuffixTree: accumulating window is too small"
199 pd->cumulate(words(p_order-1+index),count);
200 p_accumulate(nodes,words,count,index);
206 const EST_StrVector &words,
211 if (words.
n() == index+1)
213 if (node->prob_dist().
samples() == 0)
214 node->set_state(num_states++);
215 node->cumulate(words(index),count);
219 EST_PredictionSuffixTree_tree_node *next;
220 EST_PredictionSuffixTree_tree_node *n = 0;
221 next = pstnode(node->nodes.
f(words(index),est_val(n)));
224 next =
new EST_PredictionSuffixTree_tree_node;
225 if (node->get_path() ==
"")
226 next->set_path(words(index));
229 next->set_path(node->get_path()+
" "+ words(index));
231 next->set_level(node->get_level()-1);
232 node->nodes.
set_val(words(index),est_val(next));
234 p_accumulate(next,words,count,index+1);
239EST_PredictionSuffixTree::rev_prob(
const EST_StrVector &words)
const
242 const EST_DiscreteProbDistribution &pg = prob_dist(words);
243 double d1 = pg.frequency(words(order()-1));
244 double d2 = pd->frequency(words(order()-1));
250EST_PredictionSuffixTree::rev_prob(
const EST_StrVector &words,
254 return (
double)pg.frequency(words(order()-1)) /
255 pd->frequency(words(order()-1));
259EST_String &EST_PredictionSuffixTree::predict(
const EST_StrVector &words)
const
263 return ppredict(nodes,words,&p,&state);
267EST_String &EST_PredictionSuffixTree::predict(
const EST_StrVector &words,
double *p)
const
270 return ppredict(nodes,words,p,&state);
274EST_String &EST_PredictionSuffixTree::predict(
const EST_StrVector &words,
double *p,
int *state)
const
276 return ppredict(nodes,words,p,state);
281 const EST_StrVector &words,
282 double *p,
int *state,
283 const int index)
const
286 if (words.
n() == index+1)
288 *state = node->get_state();
289 return node->most_probable(p);
293 EST_PredictionSuffixTree_tree_node *next;
294 EST_PredictionSuffixTree_tree_node *n=0;
295 next = pstnode(node->nodes.
f(words(index),est_val(n)));
300 return PredictionSuffixTree_oov;
303 return ppredict(next,words,p,state,index+1);
308EST_PredictionSuffixTree::print_freqs(ostream &os)
312 os <<
"EST_PredictionSuffixTree order=" << p_order << endl;
313 nodes->print_freqs(os);
318EST_PredictionSuffixTree::print_probs(ostream &os)
322 os <<
"EST_PredictionSuffixTree " << p_order << endl;
323 nodes->print_probs(os);
328EST_PredictionSuffixTree::save(
const EST_String filename,
const EST_PredictionSuffixTree::EST_filetype type)
337 ofstream os(filename);
344EST_PredictionSuffixTree::load(
const EST_String filename)
351 if (ts.
open(filename) != 0)
353 cerr <<
"EST_PredictionSuffixTree: failed to open \"" << filename <<
"\" for reading\n";
358 if (ts.
get() !=
"EST_PredictionSuffixTree")
360 cerr <<
"EST_PredictionSuffixTree: file \"" << filename <<
"\" not an EST_PredictionSuffixTree\n";
364 order = atoi(ts.
get().string());
365 if ((order < 1) || (order > 10))
367 cerr <<
"EST_PredictionSuffixTree: file \"" << filename <<
"\" has bad order\n";
372 EST_StrVector window(order);
373 for (i=0; i<p_order; i++)
379 window[p_order-1] = ts.
get().string();
382 cerr <<
"EST_PredictionSuffixTree: file \"" << filename <<
"\" missed parsed line ";
383 cerr << ts.
linenum() <<
" near EST_PredictionSuffixTree\n";
384 for (i=0; i < order; i++)
385 cout <<
" " << window(i);
389 freq = atoi(ts.
get().string());
390 accumulate(window,freq);
397EST_PredictionSuffixTree::build(
const EST_String filename,
406 ts.
open(stdin, FALSE);
407 else if (ts.
open(filename) == -1)
411 EST_StrVector window(p_order);
412 for (i=0; i<p_order-1; i++)
413 window[i] = prev_prev;
414 window[p_order-1] = prev;
416 accumulate(window,1);
423 window[p_order-1] = ts.
get().string();
424 accumulate(window,1);
430 window[p_order-1] = last;
431 accumulate(window,1);
436EST_PredictionSuffixTree::build(
const EST_StrList &input)
447 EST_StrVector window(p_order);
448 for (i=0; i<p_order; i++)
452 for(i_ptr=input.head();i_ptr!=NULL;i_ptr=i_ptr->next())
455 window[p_order-1] = input(i_ptr);
456 accumulate(window,1);
462EST_PredictionSuffixTree::test(
const EST_String filename)
467 EST_StrStr_KVL pairs;
473 ts.
open(stdin, FALSE);
474 else if (ts.
open(filename) == -1)
479 EST_Features::Entries p;
480 for (p.
begin(nodes->nodes); p; p++)
484 EST_StrVector window(p_order);
486 for (i=0; i<p_order; i++)
489 int num_tsamples = 0;
494 window[p_order-1] = ts.
get().string();
495 const EST_DiscreteProbDistribution &pdist = prob_dist(window);
499 pairs.
add_item(window(p_order-1),predict(window),1);
502 const EST_FMatrix &m = confusion(pairs,lex);
503 print_confusion(m,pairs,lex);
504 cout <<
"Mean entropy (?) is " << e/num_tsamples << endl;
double samples(void) const
Total number of example found.
double entropy(void) const
void set_val(const EST_String &name, const EST_Val &sval)
const EST_Val & f(const EST_String &path) const
void begin(const Container &over)
Set the iterator ready to run over this container.
int add_item(const K &rkey, const V &rval, int no_search=0)
add key-val pair to list
void append(const T &item)
add item onto end of list
INLINE int n() const
number of items in vector.
void set_SingleCharSymbols(const EST_String &sc)
set which characters are to be treated as single character symbols
int linenum(void) const
returns line number of \Ref{EST_TokenStream}
int open(const EST_String &filename)
open a \Ref{EST_TokenStream} for a file.
EST_TokenStream & get(EST_Token &t)
get next token in stream