/**************************************************************************
 *
 * mg_perf_hash_build.cpp -- Program to build a perfect hash function
 * Copyright (C) 1999  Rodger McNab
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id: mg_perf_hash_build.cpp,v 1.2 2000/02/15 22:45:22 kjm18 Exp $
 *
 **************************************************************************/

#include "sysfuncs.h"
#include "memlib.h"
#include "messages.h"
#include "local_strings.h"
#include "perf_hash.h"
#include "netorder.h"

#include "mg_files.h"
#include "invf.h"
#include "locallib.h"
#include "words.h"
#include "mg.h"

/*
   $Log: mg_perf_hash_build.cpp,v $
   Revision 1.2  2000/02/15 22:45:22  kjm18
   added feature to retrieve doc nums at a different level than the level
   queried at. eg query at Document level, but retrieve section level docnums
   bug in mg_perf_hash_build.cpp fixed

   Revision 1.1  2000/01/14 02:26:19  sjboddie
   Rodgers new C++ mg

   Revision 1.2  1999/10/17 23:43:27  cs025
   Changes to eradicate Xmalloc

   Revision 1.1  1999/10/11 02:58:01  cs025
   Base install of MG-PP

   Revision 1.1  1999/08/10 21:18:13  sjboddie
   renamed mg-1.3d directory mg

   Revision 1.2  1998/11/25 07:55:47  rjmcnab

   Modified mg so that you can specify the stemmer you want
   to use via a command line option. You specify it to
   mg_passes during the build process. The number of the
   stemmer that you used is stored within the inverted
   dictionary header and the stemmed dictionary header so
   the correct stemmer is used in later stages of building
   and querying.

   Revision 1.1  1998/11/17 09:35:15  rjmcnab
   *** empty log message ***

   * Revision 1.3  1994/10/20  03:56:58  tes
   * I have rewritten the boolean query optimiser and abstracted out the
   * components of the boolean query.
   *
   * Revision 1.2  1994/09/20  04:41:53  tes
   * For version 1.1
   *
 */



// Size in bytes of each string-pool chunk requested from Xmalloc.
// Parenthesized so the macro expands safely inside larger expressions
// (e.g. `x / POOL_SIZE`).
#define POOL_SIZE (1024*1024)


// Build and write the perfect hash function for an inverted-file dictionary.
//
// Opens the dictionary through open_file (filename, INVF_DICT_SUFFIX, ...),
// reads its header, then reads idh.word_dict_size word entries starting at
// idh.word_dict_start.  Each word is copied into a memory pool as a
// length-prefixed string (one length byte followed by the word's bytes) and
// a pointer to it is recorded in `starts`.  The array of pointers is handed
// to gen_hash_func and the result is written to the file created through
// create_file (filename, INVF_DICT_HASH_SUFFIX, ...).
//
//   filename - name passed on to open_file / create_file
//   r        - passed unchanged to gen_hash_func
//
// Every failure detected here is reported through FatalError with code 1.
// The pool chunks and the `starts` array are not freed by this function.
static void ProcessFiles (char *filename, int r) {
  FILE *dictFile, *hashFile;
  unsigned long i;
  invf_dict_header idh;
  perf_hash_data *phd;
  u_char *pool;
  unsigned long pool_left;
  u_char **starts;

  // read in the dictionary
  dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
                        MAGIC_STEM_BUILD, MG_ABORT);
  if (dictFile==NULL) {
    FatalError(1, "unable to open file");
  }
  idh.Read (dictFile);

  // go to the start of the word dictionary
  if (fseek (dictFile, idh.word_dict_start, SEEK_SET) != 0)
    FatalError (1, "unable to seek to the word dictionary");

  // first pool chunk for the length-prefixed word strings
  if (!(pool = (u_char *) Xmalloc (POOL_SIZE)))
    FatalError (1, "Out of memory");
  pool_left = POOL_SIZE;

  // one pointer per dictionary word, each pointing into a pool chunk
  if (!(starts = (u_char **) Xmalloc (sizeof (u_char *) * idh.word_dict_size)))
    FatalError (1, "Out of memory");

  word_dict_el wordEl;
  wordEl.SetNumLevels (idh.num_levels);
  for (i = 0; i < idh.word_dict_size; i++) {
    // read the next word and associated information
    wordEl.Read (dictFile, idh.num_levels);

    // bytes needed in the pool: one length byte plus the word itself
    unsigned long l = wordEl.el.size() + 1;
    if (pool_left < l) {
      // Current chunk is exhausted: start a new one.  The old chunk is
      // still referenced through earlier entries of `starts`; only its
      // unused tail is abandoned.
      if (!(pool = (u_char *) Xmalloc (POOL_SIZE)))
        FatalError (1, "Out of memory");
      pool_left = POOL_SIZE;
    }
    starts[i] = pool;

    // NOTE(review): the length is stored in a single byte, so this assumes
    // a word never exceeds 255 bytes -- confirm against the dictionary format.
    *pool++ = wordEl.el.size();
    bcopy ((char *) wordEl.el.begin(), (char *) pool, wordEl.el.size());
    pool += wordEl.el.size();
    pool_left -= l;
  }
  fclose (dictFile);

  // create perfect hash file
  hashFile = create_file (filename, INVF_DICT_HASH_SUFFIX, "wb",
                          MAGIC_HASH, MG_ABORT);
  if (hashFile == NULL)
    FatalError (1, "unable to create file");
  if (!(phd = gen_hash_func (idh.word_dict_size, starts, r)))
    FatalError (1, "Unable to generate hash function");
  if (write_perf_hash_data (hashFile, phd) == -1)
    FatalError (1, "Unable to write hash function");
  // fclose flushes buffered output, so a failure here means the hash
  // data did not reach the file
  if (fclose (hashFile) != 0)
    FatalError (1, "Unable to write hash function");
}



// Program entry point: parse the command line and build the hash file.
//
// Options:
//   -f name   input file name handed to ProcessFiles (default: empty string)
//   -d dir    passed to set_basepath
//   -r n      integer (parsed with atoi) handed to ProcessFiles; the usage
//             text calls it the random seed (default: -1)
//   -h        print the usage line to stderr and exit with status 1
// An unrecognised option is treated like -h.
//
// Returns 0 once ProcessFiles has returned.
int main (int argc, char **argv) {
  int r = -1;
  // Writable empty string: ProcessFiles takes a non-const char *, and
  // binding a string literal to char * is not valid in modern C++.
  char emptyName[] = "";
  char *filename = emptyName;
  int ch;
  msg_prefix = argv[0];
  opterr = 0;   // getopt prints no diagnostics of its own; '?' is handled below

  while ((ch = getopt (argc, argv, "f:d:r:h")) != -1) {
    switch (ch) {
    case 'f':		// input file
      filename = optarg;
      break;
    case 'd':
      set_basepath (optarg);
      break;
    case 'r':
      r = atoi (optarg);
      break;
    case 'h':
    case '?':
      fprintf (stderr, "usage: %s [-f input_file] "
	       "[-d data directory] [-r random seed] [-h]\n", argv[0]);
      exit (1);
    }
  }

  ProcessFiles (filename, r);
  return 0;
}
