#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <stdio.h>
#include <stdlib.h>

#include <savant.h>
#include <savutil.h>
#include <savantio.h>

static void 
merge_wordvec(int argc, char *argv[])
{
  /*  args = inpath1, inpath2, ..., inpathn, outpath */
  FILE **IN_WORDVEC_FILE, *OUT_WORDVEC_FILE, *OUT_WVOFF_FILE;
  FILE *IN_DVMAG_FILE;
  size_t bytesread = 0;
  int i, j, k, alldone = 0, startedyet = 0;
  DB_INT numwords = 0;
  DB_UINT minwc[WORD_ENCODE_WIDTH], *wc, weight;
  int *numweights;
  DB_INT total_numweights = 0;
  DB_INT offset=0;
  int *numdocs, total_numdocs=0, local_numdocs = 0;
  
  /* Format for wordvec file:
     -  (int)      (width*uns int) (int)           (uns int) (uns int),     (uns int)
     -  NUM_WORDS, WORDCODE-1,     NUM_WEIGHTS=N1, WEIGHT-1, WEIGHT-2, ..., WEIGHT-N1,
     -             WORDCODE-2,     NUM_WEIGHTS=N2, WEIGHT-1, WEIGHT-2, ..., WEIGHT-N2,
     -             etc.
     
     Format for WVOFF file:
     -  (width*uns int)   (long)
     -  WORDCODE-1,       OFFSET-1,
     -  WORDCODE-2,       OFFSET-2,
     -  etc.
     */
  /* open the input files */
  OUT_WORDVEC_FILE = open_or_die(argv[argc-1], WORDVEC_FNAME, "w");
  OUT_WVOFF_FILE = open_or_die(argv[argc-1], WVOFF_FNAME, "w");
  IN_WORDVEC_FILE = malloc(argc*sizeof(FILE *));
  wc = malloc(WORD_ENCODE_WIDTH*argc*sizeof(unsigned int));    /* pending wordcode for each input file */
  numweights = malloc(argc*sizeof(int));     /* pending numweights for each input file */
  numdocs = malloc(argc*sizeof(int));        /* number docs in each input file */
  
  /* set minwc to 0 */
  for (i=0;i<WORD_ENCODE_WIDTH;i++) {
    minwc[i] = 0;
  }
  
  /* figure out how many docs are in each subdir (needed for later) */
  for (i=1; i < (argc-1); i++) {
    /* open the dvmag file, to get the number of docs out of it */
    IN_DVMAG_FILE = open_or_die(argv[i], DVMAG_FNAME, "r");
    numdocs[i] = total_numdocs;
    fread_big(&(local_numdocs), sizeof(DB_INT), 1, IN_DVMAG_FILE);
    total_numdocs += local_numdocs;
    fclose(IN_DVMAG_FILE);
  }
  
  for (i=1; i < (argc-1); i++) {
    /* open the new dbase */
    IN_WORDVEC_FILE[i] = open_or_die(argv[i], WORDVEC_FNAME, "r");
    bytesread = fread_big(&(numwords), sizeof(DB_INT), 1, IN_WORDVEC_FILE[i]);    
    bytesread = fread_big(&(wc[WORD_ENCODE_WIDTH*i]), sizeof(DB_INT), 
			  WORD_ENCODE_WIDTH, IN_WORDVEC_FILE[i]);    
    bytesread = fread_big(&(numweights[i]), sizeof(DB_INT), 1, IN_WORDVEC_FILE[i]);    
  }
  
  numwords = 0;
  /* this'll get overwriten later: */
  fwrite_big(&numwords, sizeof(DB_INT), 1, OUT_WORDVEC_FILE);    
  
  while(!alldone) {
    alldone = 1;     /* we can always hope... */
    startedyet = 0;
    
    /* Find out which wordcode is the minimum, and count the number of wieghts */
    for (i=1; i < (argc-1); i++) {
      if (!feof(IN_WORDVEC_FILE[i])) {
	alldone = 0;   /* not everyone's at eof yet */
	if (!startedyet || (wordcode_cmp(&wc[i*WORD_ENCODE_WIDTH], minwc) == -1)) {     
	  /* new word */
	  startedyet = 1;
	  total_numweights = numweights[i];
	  for(k=0; k < WORD_ENCODE_WIDTH; k++) {    /* copy it in */
	    minwc[k] = wc[i*WORD_ENCODE_WIDTH+k];
	  }
	}
	else if (!wordcode_cmp(&wc[i*WORD_ENCODE_WIDTH], minwc)) {     
	  /* got the same word over here, add into the total */
	  total_numweights += numweights[i];
	}
      }
    }
    
    /* Write out another word */
    if (startedyet) {
      numwords++;
      offset = ftell(OUT_WORDVEC_FILE);
      fwrite_big(&minwc, sizeof(DB_INT), WORD_ENCODE_WIDTH, OUT_WVOFF_FILE);
      fwrite_big(&offset, sizeof(DB_INT), 1, OUT_WVOFF_FILE);
      
      fwrite_big(&minwc, sizeof(DB_INT), WORD_ENCODE_WIDTH, OUT_WORDVEC_FILE);
      fwrite_big(&total_numweights, sizeof(DB_INT), 1, OUT_WORDVEC_FILE);
      
      for (i=1; i < (argc-1); i++) {
	if (!feof(IN_WORDVEC_FILE[i])) {
	  if (!wordcode_cmp(&wc[i*WORD_ENCODE_WIDTH], minwc)) {  /* got a hit */
	    for (j=0, bytesread = 1; (j < numweights[i]) && bytesread; j++) {
	      bytesread = fread_big(&weight, sizeof(DB_INT), 1, IN_WORDVEC_FILE[i]);
	      
	      /* Weights are packed with the low-order WEIGHT_WIDTH bits as the weight, and 
		 the rest of the bits as the docnum.  We need to add the numdocs offset to 
		 the docnum. */
	      weight += numdocs[i] << WEIGHT_WIDTH;
	      fwrite_big(&weight, sizeof(DB_INT), bytesread, OUT_WORDVEC_FILE);
	    }
	    if (!feof(IN_WORDVEC_FILE[i])) {
	      bytesread = fread_big(&(wc[i*WORD_ENCODE_WIDTH]), sizeof(DB_INT),
				    WORD_ENCODE_WIDTH, IN_WORDVEC_FILE[i]);    
	      bytesread = fread_big(&(numweights[i]), sizeof(DB_INT), 1, 
				    IN_WORDVEC_FILE[i]);    
	    }
	    else {
	      wc[i] = 0;
	      numweights[i] = 0;
	    }
	  }
	}
      }
    }
  }
  free(wc);
  free(numdocs);
  free(numweights);

  for (i=1; i < (argc-1); i++) {
    fclose(IN_WORDVEC_FILE[i]);
  }
  free(IN_WORDVEC_FILE);

  /* Now, prepend the file with the total number of words */
  rewind(OUT_WORDVEC_FILE);
  fwrite_big(&numwords, sizeof(DB_INT), 1, OUT_WORDVEC_FILE);
  fclose(OUT_WORDVEC_FILE);
  fclose(OUT_WVOFF_FILE);
}

static void 
merge_date(int argc, char *argv[])
{
  
  /*  args = inpath1, inpath2, ..., inpathn, outpath */
  FILE **IN_DATE_FILE, *OUT_DATE_FILE;
  FILE *IN_DVMAG_FILE;
  size_t bytesread = 0;
  int i, k, alldone = 0, startedyet = 0, db_with_min = 0;
  DB_INT numwords = 0;
  DB_UINT minwc[WORD_ENCODE_WIDTH], *wc, weight;
  int *numdocs, total_numdocs=0, local_numdocs = 0;
  
  /* Format for DATE file:
     (int)      (width*uns int) (uns int)
     NUM_DATES, WORDCODE-1,     WEIGHT-1,
     WORDCODE-2,     WEIGHT-2,
     etc.
     
     */
  /* open the input files */
  OUT_DATE_FILE = open_or_die(argv[argc-1], DATE_FNAME, "w");
  IN_DATE_FILE = malloc(argc*sizeof(FILE *));
  wc = malloc(WORD_ENCODE_WIDTH*argc*sizeof(unsigned int));    /* pending wordcode for each input file */
  numdocs = malloc(argc*sizeof(int));        /* number docs in each input file */
  
  /* set minwc to 0 */
  for (i=0;i<WORD_ENCODE_WIDTH;i++) {
    minwc[i] = 0;
  }
  
  /* figure out how many docs are in each subdir (needed for later) */
  for (i=1; i < (argc-1); i++) {
    /* open the dvmag file, to get the number of docs out of it */
    IN_DVMAG_FILE = open_or_die(argv[i], DVMAG_FNAME, "r");
    numdocs[i] = total_numdocs;
    fread_big(&(local_numdocs), sizeof(DB_INT), 1, IN_DVMAG_FILE);
    total_numdocs += local_numdocs;
    fclose(IN_DVMAG_FILE);
  }
  
  for (i=1; i < (argc-1); i++) {
    /* open the new dbase */
    IN_DATE_FILE[i] = open_or_die(argv[i], DATE_FNAME, "r");
    bytesread = fread_big(&(numwords), sizeof(DB_INT), 1, IN_DATE_FILE[i]);    
    bytesread = fread_big(&(wc[WORD_ENCODE_WIDTH*i]), sizeof(DB_UINT), 
			  WORD_ENCODE_WIDTH, IN_DATE_FILE[i]);    
  }
  
  numwords = 0;
  fwrite_big(&numwords, sizeof(DB_INT), 1, OUT_DATE_FILE);    /* this'll get overwriten later */
  
  while(!alldone) {
    alldone = 1;     /* we can always hope... */
    startedyet = 0;
    
    /* Find out which wordcode is the minimum, and count the number of wieghts */
    for (i=1; i < (argc-1); i++) {
      if (!feof(IN_DATE_FILE[i])) {
	alldone = 0;   /* not everyone's at eof yet */
	if (!startedyet || (wordcode_cmp(&wc[i*WORD_ENCODE_WIDTH], minwc) <= 0)) {     /* new word, no dupes */
	  startedyet = 1;
	  for(k=0; k < WORD_ENCODE_WIDTH; k++) {    /* copy it in */
	    minwc[k] = wc[i*WORD_ENCODE_WIDTH+k];
	  }
	  db_with_min = i;   /* This is the index of the subdir with the current min */
	}
      }
    }
    
    if (startedyet) {
      numwords++;
      fwrite_big(&minwc, sizeof(DB_INT), WORD_ENCODE_WIDTH, OUT_DATE_FILE);
      /* printf("minwc = %x %x\n", minwc[0], minwc[1]);*/
      bytesread = fread_big(&weight, sizeof(DB_UINT), 1, 
			    IN_DATE_FILE[db_with_min]);
      
      /* Weights are packed with the low-order WEIGHT_WIDTH bits as
	 the weight, and the rest of the bits as the docnum.  We need
	 to add the numdocs offset to the docnum. */
      weight += numdocs[db_with_min] << WEIGHT_WIDTH;
      fwrite_big(&weight, sizeof(DB_INT), bytesread, OUT_DATE_FILE);
      
      if (!feof(IN_DATE_FILE[db_with_min])) {
	bytesread = fread_big(&(wc[db_with_min*WORD_ENCODE_WIDTH]), 
			      sizeof(DB_UINT), WORD_ENCODE_WIDTH, 
			      IN_DATE_FILE[db_with_min]);    
      }
    }
  }   /* while !alldone */
  for (i=1; i < (argc-1); i++) {
    fclose(IN_DATE_FILE[i]);
  }
  free(IN_DATE_FILE);

  /* Now, prepend the file with the total number of words */
  rewind(OUT_DATE_FILE);
  fwrite_big(&numwords, sizeof(DB_INT), 1, OUT_DATE_FILE);
  fclose(OUT_DATE_FILE);

  free(wc);
  free(numdocs);
}

static void 
merge_dvmag(int argc, char *argv[])
{
  /*  args = inpath1, inpath2, ..., inpathn, outpath */
  /* Must be called AFTER merge_wordvec and merge_date!! */
  
  FILE *IN_DVMAG_FILE, *OUT_DVMAG_FILE, *WV_FILE, *DATE_FILE;
  size_t bufread_big = 0;
  int i, j;
  int docnum, docweight;
  DB_INT num_docs = 0, total_num_docs = 0, num_words = 0, num_weights = 0;
  DB_FLOAT *mag;
  DB_UINT wc[WORD_ENCODE_WIDTH], *weights;
  enum Field_Types type;
  
  OUT_DVMAG_FILE = open_or_die(argv[argc-1], DVMAG_FNAME, "w");
  
  /* get the num of docvecs */
  for (i=1; i < (argc-1); i++) {
    /* open the new dbase */
    IN_DVMAG_FILE = open_or_die(argv[i], DVMAG_FNAME, "r");
    bufread_big = fread_big(&num_docs, sizeof(DB_INT), 1, IN_DVMAG_FILE);
    total_num_docs += num_docs;
    fclose(IN_DVMAG_FILE);
  }
  
  fwrite_big(&total_num_docs, sizeof(DB_INT), 1, OUT_DVMAG_FILE);
  
  mag = (DB_FLOAT *)malloc(total_num_docs*NUM_FIELD_TYPES*sizeof(DB_FLOAT));
  for(i=0; i<total_num_docs*NUM_FIELD_TYPES; i++) {
    mag[i] = 0.0;
  }
  weights = (DB_UINT *)malloc(total_num_docs * sizeof(DB_UINT));
  
  WV_FILE = open_or_die(argv[argc-1], WORDVEC_FNAME, "r");
  fread_big(&num_words, sizeof(DB_INT), 1, WV_FILE);
  for(i=0; i<num_words; i++) {
    fread_big(wc, sizeof(DB_UINT), WORD_ENCODE_WIDTH, WV_FILE);
    type = word_type(wc[0]);
    fread_big(&num_weights, sizeof(DB_UINT), 1, WV_FILE);
    fread_big(weights, sizeof(DB_UINT), num_weights, WV_FILE);
    for(j=0; j<num_weights; j++) {
      docnum = WV_DVNUM(weights[j]);
      docweight = WV_DVFREQ(weights[j]);
      mag[docnum*NUM_FIELD_TYPES+(int)type] += 
	(DB_FLOAT)pow(docweight*log((float)total_num_docs/(float)num_weights), 2);
    }
  }
  fclose(WV_FILE);
  
  DATE_FILE = open_or_die(argv[argc-1], DATE_FNAME, "r");
  fread_big(&num_words, sizeof(DB_INT), 1, DATE_FILE);
  for(i=0; i<num_words; i++) {
    fread_big(wc, sizeof(DB_UINT), WORD_ENCODE_WIDTH, DATE_FILE);
    type = word_type(wc[0]);
    fread_big(weights, sizeof(DB_UINT), 1, DATE_FILE);
    docnum = WV_DVNUM(weights[0]);
    docweight = WV_DVFREQ(weights[0]);
    mag[docnum*NUM_FIELD_TYPES+(int)type] += 
      (DB_FLOAT)(docweight*docweight * log((float)total_num_docs));
  }
  fclose(DATE_FILE);
  
  for(i=0; i<total_num_docs*NUM_FIELD_TYPES; i++) {
    mag[i] = (DB_FLOAT)sqrt(mag[i]);
  }
  fwrite_big(mag, sizeof(DB_FLOAT), total_num_docs*NUM_FIELD_TYPES, OUT_DVMAG_FILE);
  free(mag);
  fclose(OUT_DVMAG_FILE);
  free(weights);
}  

/*
 * merge_wmap -- build the output WMAP2 file by appending the WMAP2 file of
 * every input database, in argument order.
 *
 * argv[1] .. argv[argc-2] name the input databases and argv[argc-1] names
 * the output database.  Data is moved in blocks of 1024 DB_INT values; a
 * block that comes back short marks the end of the current input.
 */
static void
merge_wmap(int argc, char *argv[]) 
{
  DB_INT chunk[1024];
  FILE *dst, *src;
  int arg, got;

  dst = open_or_die(argv[argc-1], WMAP2_FNAME, "w");

  arg = 1;
  while (arg < argc-1) {
    src = open_or_die(argv[arg], WMAP2_FNAME, "r");
    for (;;) {
      got = fread_big(chunk, sizeof(DB_INT), 1024, src);
      fwrite_big(chunk, sizeof(DB_INT), got, dst);
      if (got != 1024) {
        break;          /* short block: this input is finished */
      }
    }
    fclose(src);
    arg++;
  }

  fclose(dst);
}

void
merge_databases(int argc,
		char *argv[])
{
  
  merge_wordvec(argc, argv);
  merge_date(argc, argv);
  merge_dvmag(argc, argv);
  merge_wmap(argc, argv);
}

