/* WIDE AREA INFORMATION SERVER SOFTWARE
   No guarantees or restrictions.  See the readme file for the full standard
   disclaimer.    
   Brewster@think.com
*/


/* implements the search part of irext.h 
   (search_word and finished_search_word)
   -brewster

Split from irsearch.c

   5/31/91 Added scale_scores.  Fixed document_score_array to long.
   7/8/91 Removed scale_scores, handled in search_word with doc_id > 0.

   - Jonny G
 */

#include "cdialect.h"
#include "irfiles.h"
#include "irsearch.h"
#include "irext.h"
#include <string.h>

/*============================*
 *===  Setting Parameters  ===*
 *============================*/

/* Number of entries of best_hits_array that sort_best_hits fills and
   that finished_search_word asks make_best_hits_array to provide.
   Changed through set_query_parameter; starts out as 0. */
long max_hit_retrieved = 0;

/* Sets the query parameter selected by mask from the values found in
   parameters.
   Only SET_MAX_RETRIEVED_MASK is recognised: it copies
   parameters->max_hit_retrieved into the global max_hit_retrieved.
   Any other mask is silently ignored.
   Returns 0 on success (or for an ignored mask), -1 if parameters is
   NULL when a value has to be read from it. */
long set_query_parameter (mask, parameters)
     long mask;
     query_parameter_type * parameters;
{
  switch (mask)
    {
    case SET_MAX_RETRIEVED_MASK:
      if(parameters == NULL)
        return(-1);
      max_hit_retrieved = parameters->max_hit_retrieved;
      break;
    default:
      break;
    }
  /* the function is declared long, so it must not fall off the end */
  return(0);
}

/*==============================*
 *===  Document Score Array  ===*
 *==============================*/

/* Score accumulator indexed by document id: search_word adds word
   weights into it and sort_best_hits reads it. */
long *document_score_array = NULL;
/* number of long entries currently allocated in document_score_array */
long document_score_array_len = 0;

/* make_document_score_array ensures that document_score_array holds at
   least `length` entries.  When the current allocation is already big
   enough nothing happens.  Otherwise the old array (if any) is released
   and a new one is allocated; the previous contents are not copied. */
static void make_document_score_array _AP((long length));
static void make_document_score_array(length)
long length;
{
  if(length > document_score_array_len){
    size_t bytes_needed = (size_t)(length * sizeof(long));

    /* drop the undersized array before replacing it */
    if(document_score_array != NULL){
      s_free(document_score_array);
    }
    document_score_array = (long*)s_malloc(bytes_needed);
    document_score_array_len = length;
  }
}

/* Releases document_score_array and marks it as empty.
   The pointer is reset together with the length: once the length is 0,
   the next make_document_score_array call takes its "free the old one"
   path, and that test must not find a stale pointer. */
static void destroy_document_score_array _AP((void));
static void destroy_document_score_array()
{
  if(document_score_array != NULL){
    s_free(document_score_array);
  }
  document_score_array = NULL;
  document_score_array_len = 0;
}
    
void clear_document_score_array()
/* Zeroes every entry of document_score_array (a side effect on the
   global).  Does nothing while the array has not been allocated, so
   memset is never handed a NULL pointer. */
{ 
  if(document_score_array == NULL || document_score_array_len <= 0)
    return;
  memset(document_score_array, 0, 
         (size_t)document_score_array_len * sizeof(long));
}

/* for debugging purposes */
void print_document_score_array(start,stop)
unsigned long start;
unsigned long stop;
/* assumes start >= 0, stop < db->doc_table_allocated_entries */
{
	long i;
	for(i = start; i <= stop; i++){
		printf("entry number %d: %d \n", 
		       i, (unsigned char)document_score_array[i]);
	}
}



/*=========================*
 *===  Best Hits Array  ===*
 *=========================*/

/* Table of hits filled in by sort_best_hits and read by best_hit. */
hit *best_hits_array = NULL;
/* number of hit entries currently allocated in best_hits_array */
long best_hits_array_len = 0;
/* index of the entry best_hit hands out next; reset to 0 by search_word
   and finished_best_hit */
long current_best_hit = 0;

/* make_best_hits_array ensures that best_hits_array holds at least
   `length` entries.  When the current allocation is already big enough
   nothing happens.  Otherwise the old array (if any) is released and a
   new one is allocated; the previous contents are not copied. */
static void make_best_hits_array _AP((long length));
static void make_best_hits_array(length)
long length;
{
  if(length > best_hits_array_len){
    size_t bytes_needed = (size_t)(length * sizeof(hit));

    /* drop the undersized array before replacing it */
    if(best_hits_array != NULL){
      s_free(best_hits_array);
    }
    best_hits_array = (hit*)s_malloc(bytes_needed);
    best_hits_array_len = length;
  }
}

/* Releases best_hits_array and marks it as empty.
   The pointer is reset together with the length: once the length is 0,
   the next make_best_hits_array call takes its "free the old one" path,
   and that test must not find a stale pointer. */
static void destroy_best_hits_array _AP((void));
static void destroy_best_hits_array()
{
  if(best_hits_array != NULL){
    s_free(best_hits_array);
  }
  best_hits_array = NULL;
  best_hits_array_len = 0;
}
    
void clear_best_hits_array()
/* Zeroes every entry of best_hits_array (a side effect on the global).
   Does nothing while the array has not been allocated, so memset is
   never handed a NULL pointer. */
{ 
  if(best_hits_array == NULL || best_hits_array_len <= 0)
    return;
  memset((char*)best_hits_array, 0,
         (size_t)best_hits_array_len * sizeof(hit));
}

/* for debugging purposes */
void print_best_hits()
{
  long i;
  for( i = 0; i < best_hits_array_len; i++){
    if (best_hits_array[i].weight != 0)
      { printf("Best hit %ld: weight %ld, doc_id %ld, headline %s, filename %s, lines %ld\n", 
	       i, best_hits_array[i].weight, 
	       best_hits_array[i].document_id,
	       best_hits_array[i].headline,
	       best_hits_array[i].filename,
	       best_hits_array[i].number_of_lines);
      }
  }
}

void sort_best_hits(db)
     database * db;
{
  /* returns nothing.
   * side effects best_hits and document_score_array
   */

  long i, doc;
  long worst_weight_to_make_it = 0;
  document_table_entry doc_entry;
  long best_hit_number = 0;

  /* snuff the scores */
  for(i = 0; i < max_hit_retrieved; i++){
    best_hits_array[i].weight = 0;
  }

  /* loop over the doc, and keep the doc_id and weight in best hit table */
  for(doc = 0; doc < db->doc_table_allocated_entries; doc++){
    long weight = document_score_array[doc];
    if(worst_weight_to_make_it < weight){
      /* merge it into the best_hits array. start at the bottom */
      for(i = (max_hit_retrieved - 1); i >= 0; i--){
	if(weight > best_hits_array[i].weight 
	   /* && (check_document_id(doc, db) == true) too slow.*/
	   ){
	  /* move this entry down */	
	  if((i + 1) < max_hit_retrieved){
	    best_hits_array[i+1].weight = best_hits_array[i].weight;
	    best_hits_array[i+1].document_id = best_hits_array[i].document_id;
	  }
	  best_hits_array[i].document_id = doc;
  	  best_hits_array[i].weight = weight;
	}
	else
	  break;
      }      
    }
  }
  
  for(i = 0; i < max_hit_retrieved; i++){
    if(best_hits_array[i].weight <= 0)  /* if we are out of good stuff, return */
      return;
    /* fill in the rest of the hit */
    if (read_document_table_entry(&doc_entry,
				  best_hits_array[i].document_id,
				  db) 
	== true){
      best_hits_array[best_hit_number].weight = best_hits_array[i].weight;
      best_hits_array[best_hit_number].document_id = best_hits_array[i].document_id;
      best_hits_array[best_hit_number].start_character = doc_entry.start_character;
      best_hits_array[best_hit_number].end_character = doc_entry.end_character;
      best_hits_array[best_hit_number].document_length = doc_entry.document_length;
      best_hits_array[best_hit_number].number_of_lines = doc_entry.number_of_lines;
      read_filename_table_entry(doc_entry.filename_id, 
				best_hits_array[best_hit_number].filename,
				best_hits_array[best_hit_number].type,
				NULL,
				db),
      strncpy(best_hits_array[best_hit_number].headline, 
	      read_headline_table_entry(doc_entry.headline_id,db),
	      MAX_FILE_NAME_LEN);
      best_hit_number++;
    } 
    beFriendly();
  }
  for(i = best_hit_number; i < max_hit_retrieved; i++){
    best_hits_array[best_hit_number].weight = 0;
  }
  /* print_best_hits(s);  for debugging */
}


/* returns the next best hit.
   Stores the document id and weight of best_hits_array[current_best_hit]
   in *doc_id and *score and advances current_best_hit.
   Returns 0 when a hit was stored, 1 when there are no hits left (end of
   the table, table not allocated, or an entry with weight 0). */
long best_hit(doc_id, score)
     long *doc_id;
     long *score;
{
  /* >= and not >: index best_hits_array_len is one past the last entry */
  if(best_hits_array == NULL || current_best_hit >= best_hits_array_len)
    return(1);
  if(best_hits_array[current_best_hit].weight == 0)
    return(1);
  *doc_id = best_hits_array[current_best_hit].document_id;
  *score  = best_hits_array[current_best_hit].weight;
  current_best_hit++;
  return(0);
}

long finished_best_hit()
{ /* Ends a round of best_hit calls: rewinds the hit cursor and wipes
     both the best hits table and the document scores.  Always returns 0.
     On a small machine we might want to destroy_document_score_array
     here instead of just clearing it. */
  current_best_hit = 0;
  clear_best_hits_array();
  clear_document_score_array();
  return(0);
}

/*=============================*	
 *===  Searching for words  ===*
 *=============================*/

/* Adds the weights stored in the inverted file for one word to
 * document_score_array.
 *
 * dictionary_value is the position of the word's first index block in
 * db->index_stream.  Blocks are followed through their "next block"
 * field until a not-full block has been read or the next position is 0.
 * For every entry of a block the stored weight is added to
 * document_score_array[document id].
 *
 * Side effects: resets current_best_hit to 0, grows document_score_array
 * to db->doc_table_allocated_entries if it is smaller, and moves the
 * file position of db->index_stream.  word, char_pos, line_pos and weight
 * are not used here.
 *
 * Returns 0 if successful or if the word is not present (negative
 * dictionary_value); returns -1 on a seek/read failure, an invalid block
 * flag or a document id outside document_score_array.
 */
long search_word(word,char_pos, line_pos, weight, doc_id, dictionary_value,
                 db)
     char *word; /* the word to be searched for */
     long char_pos;             /* the position of the start of the word */
     long line_pos;             /* is this needed? not for signature system */
     long weight;               /* how important the word looks syntactically,
                                   such as is it bold */
     long doc_id;               /* current document, seed words is 0,
                                   then it increments into the relevant 
                                   document */
     long dictionary_value;     /* this is from the disk dictionary,
                                   a signature system would use weight,
                                   inverted file systems would put
                                   position information */
     database *db;
{
  long not_full_flag = INDEX_BLOCK_FULL_FLAG; /*start out full so it will go on looking */
  long count, index_block_size;
  long internal_document_id, internal_weight, number_of_valid_entries;
  long index_file_block_number = dictionary_value;
  FILE *stream = NULL;

  current_best_hit = 0;  /* so that the best hits will start from 0 */

  /* check the document_score_array */
  if(document_score_array_len < db->doc_table_allocated_entries)
    make_document_score_array(db->doc_table_allocated_entries);

  if(index_file_block_number < 0)
    return(0);                  /* word not present */

  /* NOTE(review): the original code had an `else if(0 == ...)` branch
     meant to return -1 when the dictionary lookup itself failed (value
     0), but it sat behind a `>= 0` test and could never run.  The
     effective behaviour is kept: a value of 0 skips the loop below and
     returns 0.  Confirm with the callers before turning it into -1. */

  stream = db->index_stream;

  while((not_full_flag != INDEX_BLOCK_NOT_FULL_FLAG) &&
        (index_file_block_number != 0)){
    /* read the index block */
    if (0 != fseek(stream, (long)index_file_block_number,
                   SEEK_SET))
      {
        waislog(WLOG_HIGH, WLOG_ERROR,
                "fseek failed into the inverted file to position %ld",
                (long)index_file_block_number);
        return(-1);
      }

    not_full_flag = read_bytes(INDEX_BLOCK_FLAG_SIZE, stream);
    index_file_block_number = read_bytes(NEXT_INDEX_BLOCK_SIZE, stream);
    index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE, stream);
    if(EOF == index_block_size)
      {
        waislog(WLOG_HIGH, WLOG_ERROR,
                "reading from the index file failed");
        return(-1);
      }

    if(not_full_flag == INDEX_BLOCK_NOT_FULL_FLAG){
      /* not full: the "next block" field holds the number of valid bytes */
      number_of_valid_entries = index_file_block_number;
    }
    else if(not_full_flag == INDEX_BLOCK_FULL_FLAG){
      /* full: everything behind the header is valid */
      number_of_valid_entries = index_block_size - INDEX_BLOCK_HEADER_SIZE;
    }
    else{                       /* bad news, file is corrupted. */
      waislog(WLOG_HIGH, WLOG_ERROR,
              "Expected the flag in the inverted file to be valid.  it is %ld",
              not_full_flag);
      return(-1);
    }
    /* printf("number of valid bytes: %ld\n", number_of_valid_entries); */

    /* add the array to the document_score_array */
    for(count = 0; count < number_of_valid_entries;
        count = count + INDEX_ELEMENT_SIZE){
      internal_document_id = read_bytes(DOCUMENT_ID_SIZE, stream);
      (void)read_bytes(WORD_POSITION_SIZE, stream);
      (void)read_bytes(CHARACTER_POSITION_SIZE, stream);
      internal_weight = read_bytes(WEIGHT_SIZE,stream);
      /* printf("entry %ld, Doc_id: %ld, weight %ld \n",
              count, internal_document_id, internal_weight); */
      if(EOF == internal_weight)
        {
          waislog(WLOG_HIGH, WLOG_ERROR,
                  "reading from the doc-id table failed");
          return(-1);
        }
      /* the id comes straight from the file: refuse to write outside
         the score array if the index is damaged */
      if(internal_document_id < 0 ||
         internal_document_id >= document_score_array_len)
        {
          waislog(WLOG_HIGH, WLOG_ERROR,
                  "document id %ld in the inverted file is out of range",
                  internal_document_id);
          return(-1);
        }
      if(doc_id > 0) /* we are doing a relevant document */
        /* NOTE(review): dividing by 0.1 multiplies the weight by ten;
           confirm this is the intended scaling for relevant documents. */
        internal_weight /= 0.1;

      document_score_array[internal_document_id] += internal_weight;
    }
  }
  return(0);
}

/* Called once all words have been searched: now collect the best hits.
   Ensures the score array covers db->doc_table_allocated_entries
   documents and the hit table has max_hit_retrieved entries, then lets
   sort_best_hits fill the hit table.  Always returns 0. */
long finished_search_word(db)
     database *db;
{
  /* the score array may never have been sized for this database */
  if(db->doc_table_allocated_entries > document_score_array_len){
    make_document_score_array(db->doc_table_allocated_entries);
  }

  make_best_hits_array(max_hit_retrieved);
  sort_best_hits(db);

  return(0);
}







