#include "savantio.h"
#include "yenta-savant.h"

/* Larger of two values.  This is a function-like macro, so each argument
   may be evaluated twice: do not pass expressions with side effects. */
#define max(a, b) ((a) > (b) ? (a) : (b))

/* Scores a query vector against the in-memory database (defined below). */
float *find_matches(DenseDocVec *);
/* File-local helpers used by find_matches(). */
static int update_matches(float *, WV_Tree *, float, float, DB_UINT *);
static WV_Tree *fetch_wordvec(unsigned int *);

/* Bias table that update_matches() hands to sim_contrib() for both of its
   bias arguments.
   NOTE(review): presumably one entry per field type, with only the first
   field weighted -- confirm against sim_contrib(). */
static int normal_biases[] =
{
  1, 0, 0, 0, 0, 0, 0
};

static WV_Tree *fetch_wordvec(unsigned int *wordcode)
/* In-memory counterpart of readwv.c->fetch_wordvec(): looks the word up
   in the tree rooted at Global_Tree instead of going to disk.
   Returns the node whose wordcode compares equal to `wordcode`, or NULL
   when the word is not in the tree. */
{
  WV_Tree *node;

  for (node = Global_Tree; node != NULL; )
    {
      int order = wordcode_cmp(node->wordcode, wordcode);

      if (order == 0)
        return node;

      /* A negative result sends the search right, a positive one left. */
      node = (order < 0) ? node->right : node->left;
    }

  return NULL; /* fell off the tree: not present at all */
}

static void accumulate_tree(WV_Tree *tree)
/* Walks the subtree rooted at `tree` and, for every entry on every node's
   wvlist, adds freq * freq * log(Num_DocVecs / num_docs) to the BODY_FIELD
   slot of that document in DocVec_Mags.  Nodes are visited in pre-order
   (node, left subtree, right subtree). */
{
  WV_List *entry;
  float contrib;

  /* The right subtree is handled by advancing `tree` rather than by a
     second recursive call; the visiting order is unchanged. */
  for (; tree != NULL; tree = tree->right)
    {
      for (entry = tree->wvlist; entry != NULL; entry = entry->next)
        {
          contrib = WV_DVFREQ(entry->docweight)*WV_DVFREQ(entry->docweight)*
            log(((float)Num_DocVecs) / tree->num_docs);
#if 0
          printf("%d %d %d %f\n", WV_DVFREQ(entry->docweight), Num_DocVecs, tree->num_docs, contrib);
#endif
          DocVec_Mags[WV_DVNUM(entry->docweight) * NUM_FIELD_TYPES +
                      (int) BODY_FIELD] += contrib;
        }

      accumulate_tree(tree->left);
    }
}

static void create_magnitudes()
{
  /* Take the database in memory and figure out the magnitudes of the
   * documents. Assumes that there is no set of magnitudes yet */
  int i;
#if 0
  printf("Regenerating magnitudes.\n");
#endif
  DocVec_Mags = (DB_FLOAT *)malloc(sizeof(DB_FLOAT *) * DVM_max * 
				   NUM_FIELD_TYPES);
  for (i = 0; i < DVM_rng * NUM_FIELD_TYPES; i++)
    DocVec_Mags[i] = 0.0;
  accumulate_tree(Global_Tree);

  for (i = 0; i < DVM_rng * NUM_FIELD_TYPES; i += NUM_FIELD_TYPES)
    {
#if 0
      printf("Mag[%d]^2: %f\n", i, DocVec_Mags[i]);
#endif
      DocVec_Mags[i] = sqrt(DocVec_Mags[i]);
    }
}

float *find_matches(DenseDocVec *ddv_query)
/* Scores every document against the query vector `ddv_query`.
 *
 * Builds the document magnitude table on first use, then for each query
 * word found in the in-memory tree adds that word's contribution to the
 * per-document scores via update_matches().  Words that occur in every
 * document (num_docs == Num_DocVecs) are not scored.  The accumulated
 * scores are divided by the query magnitude when that magnitude is
 * positive.
 *
 * Returns a malloc'd array of DocVecs->length floats indexed by document
 * number; this function never frees it.  Returns NULL if the magnitude
 * table or the result array cannot be allocated.
 */
{
  int i;
  WV_Tree *wordvec = NULL;
  float *all_sims, query_weight, query_mag = 0.0, query_max = 0.0;

  if (!DocVec_Mags)
    create_magnitudes();
  if (!DocVec_Mags)
    return NULL; /* update_matches() would dereference the table */
#if 0
  for (i = 0; i < DVM_rng * NUM_FIELD_TYPES; i += NUM_FIELD_TYPES)
    {
      printf("Mag[%d]: %f\n", i, DocVec_Mags[i]);
    }
#endif

  /* initialize stuff */
  all_sims = malloc(DocVecs->length * sizeof *all_sims);
  if (!all_sims)
    return NULL;
  for (i = 0; i < DocVecs->length; i++)
    all_sims[i] = 0.0;

  /* Heaviest word weight in the query; forwarded to update_matches(). */
  for (i = 0; i < ddv_query->num_entries; i++)
    query_max = max(query_max, (float)ddv_query->weights[i]);

  /* Uses L2 (Euclidean) metric: query_mag collects the sum of
     weight^2 * log(Num_DocVecs / num_docs) over the known query words. */
  for (i = 0; i < ddv_query->num_entries; i++) {
    wordvec = fetch_wordvec(&(ddv_query->wordcodes[WORD_ENCODE_WIDTH*i]));
    if (wordvec == NULL)
      continue; /* word not in the database */

    query_mag += (float)ddv_query->weights[i] *
      (float)ddv_query->weights[i] *
      log(((float)Num_DocVecs)/wordvec->num_docs);
    query_weight = (float)ddv_query->weights[i];
    if (Num_DocVecs != wordvec->num_docs)
      update_matches(all_sims, wordvec, query_weight, query_max,
                     &(ddv_query->wordcodes[WORD_ENCODE_WIDTH*i]));
  }

#if 0
  printf("(Query mag)^2: %f\n", query_mag);
#endif

  query_mag = (float)sqrt((double)query_mag);
#if 0
  printf("Query mag: %f\n", query_mag);
#endif
  /* Leave the raw sums alone when the magnitude is zero (or NaN). */
  if (query_mag > 0)
    for (i = 0; i < DocVecs->length; i++)
      all_sims[i] = all_sims[i] / query_mag;

  return(all_sims);
}

/* update_matches is called once for each unique word in the query.
 *
 * For every entry on vector->wvlist it asks sim_contrib() for this word's
 * contribution to that document and adds it to all_sims[], indexed by the
 * document number taken from the entry's docweight.  Documents whose
 * first DocVec_Mags slot is 0.0 are skipped.
 *
 * Always returns 0.
 */
static int update_matches(float *all_sims,
                          WV_Tree *vector,    /* documents containing this word */
                          float query_weight, /* weight of this word in the query */
                          float query_max,    /* the weight of the heaviest word */
                          DB_UINT *wordcode)
{
  int vecnum;
  float sim;
  WV_List *trav;

  (void)query_max; /* accepted for the caller's sake; not used here */

  for (trav = vector->wvlist; trav; trav = trav->next) {
    vecnum = WV_DVNUM(trav->docweight);

    /* Skip over any removed documents */
    if (DocVec_Mags[vecnum * NUM_FIELD_TYPES] == 0.0)
      continue;

    /* The same wordcode and the same bias table are passed for both
       sides of the comparison. */
    sim = sim_contrib(trav->docweight,
                      vector->num_docs,
                      wordcode, wordcode,
                      query_weight,
                      normal_biases,
                      normal_biases,
                      query_weight * 100,
                      1, 1 /* Not used */);
#if 0
    printf("Sim: %f (%x, %d, %p, --, %f, --, --, %f, %d, %d)\n", sim,
           trav->docweight,
           vector->num_docs,
           wordcode,
           query_weight,
           query_weight,
           1, 1 /* Not used */);
#endif
    all_sims[vecnum] += sim;
  }
  return(0);
}
