/**************************************************************************
 *
 * mg_stem_idx.cpp -- stem index builder
 * Copyright (C) 1999  Rodger McNab
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 **************************************************************************/


#include "sysfuncs.h"
#include "messages.h"

#include "mg_files.h"
#include "invf.h"
#include "UCArray.h"
#include "words.h"

#include "stemmer.h"


#if defined(GSDL_USE_OBJECTSPACE)
#  include <ospace\std\map>
#  include <ospace\std\vector>
#elif defined(GSDL_USE_STL_H)
#  include <map.h>
#  include <vector.h>
#else
#  include <map>
#  include <vector>
#endif


/*
   $Log: mg_stem_idx.cpp,v $
   Revision 1.3  2000/01/18 03:53:24  rjmcnab
   Fixed a couple of bugs and made building silent if needed.

   Revision 1.2  2000/01/14 02:45:51  sjboddie
   fixed compiler warning

   Revision 1.1  2000/01/14 02:26:20  sjboddie
   Rodgers new C++ mg

 */


// list of word numbers; CreateStemDict fills these with the positions
// of words in the order they are read from the dictionary
typedef vector<unsigned long> WordNumList;
// maps a stem to the numbers of all the words that stem to it, with
// the stems kept in the order defined by DictLTUCArray
typedef map<UCArray, WordNumList, DictLTUCArray> StemMapDict;


void CreateStemDict (char *filename,
		     StemMapDict &stemDict,
		     int stemMethod,
		     int stemmerNum) {
  stemDict.erase (stemDict.begin(), stemDict.end());

  // open the dictionary
  FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
			      MAGIC_STEM_BUILD, MG_ABORT);
  invf_dict_header idh;
  idh.Read (dictFile);

  fseek (dictFile, idh.word_dict_start, SEEK_SET);

  unsigned long wordNum;
  u_char mgWord[MAXSTEMLEN + 1];
  word_dict_el wordEl;
  UCArray stemEl;
  wordEl.SetNumLevels (idh.num_levels);
  for (wordNum=0; wordNum<idh.word_dict_size; wordNum++) {
    // read in the next word
    wordEl.Read (dictFile, idh.num_levels);

    // convert the word to an "mg word"
    mgWord[0] = wordEl.el.size();
    bcopy ((char *)wordEl.el.begin(), (char *)&mgWord[1], wordEl.el.size());

    // stem the word
    stemmer (stemMethod, stemmerNum, mgWord);

    // convert the result back to a UCArray
    stemEl.erase (stemEl.begin(), stemEl.end());
    stemEl.insert (stemEl.end(), &mgWord[1], &mgWord[1] + mgWord[0]);

//      cout << "\"" << stemEl << "\" -> \"" << wordEl.el << "\"\n";
    
    // add this word number to the list of word numbers for this word
    stemDict[stemEl].push_back (wordNum);
  }

  fclose (dictFile);
}


// WriteStemDict -- write the in-memory stem dictionary out as a
// blocked stem index file.
//
//   filename         base name handed to create_file; the suffix and
//                    magic number are chosen from stemMethod
//   stemDict         the stems to write, in map order
//   stemMethod       1, 2 or 3; anything else is a fatal error
//   stemmerNum       recorded in the file header
//   entriesPerBlock  number of stems per block; must not be zero
//
// File layout produced: whatever create_file writes, the header, the
// stem blocks, then the block index.  The header is written twice:
// once as a place-holder and again once the block statistics are known.
void WriteStemDict (char *filename,
		    StemMapDict &stemDict,
		    int stemMethod,
		    int stemmerNum,
		    unsigned long entriesPerBlock) {
  // a block size of zero would make the block boundary test below
  // divide by zero (FatalError is relied on not to return)
  if (entriesPerBlock == 0)
    FatalError (1, "Entries per block must be greater than zero");

  // Create appropriate stem index file
  FILE *stemDictFile = NULL;
  if (stemMethod == 1) {
    stemDictFile = create_file (filename, INVF_DICT_BLOCKED_1_SUFFIX,
				"wb", MAGIC_STEM_1, MG_ABORT);
  } else if (stemMethod == 2) {
    stemDictFile = create_file (filename, INVF_DICT_BLOCKED_2_SUFFIX,
				"wb", MAGIC_STEM_2, MG_ABORT);
  } else if (stemMethod == 3) {
    stemDictFile = create_file (filename, INVF_DICT_BLOCKED_3_SUFFIX,
				"wb", MAGIC_STEM_3, MG_ABORT);
  } else {
    FatalError (1, "Unknown stem method %d", stemMethod);
  }

  stem_idx_header sih;
  sih.lookback = 0;
  sih.dict_size = stemDict.size();
  sih.entries_per_block = entriesPerBlock;
  sih.max_block_size = 0;
  
  sih.stemmer_num = stemmerNum;
  sih.stem_method = stemMethod;

  // remember where the header goes so that it can be rewritten in
  // place later, rather than assuming how many bytes precede it
  long headerPos = ftell (stemDictFile);

  // write out a place-holder version of the header
  sih.Write (stemDictFile);

  sih.blocks_start = ftell (stemDictFile);

  block_idx stemIdx;
  unsigned long stemNum = 0;
  stem_block_dict_el stemEl;
  UCArray lastEl; // previous stem written, handed to stemEl.Write

  StemMapDict::const_iterator here = stemDict.begin();
  StemMapDict::const_iterator end = stemDict.end();
  while (here != end) {
    // remember this stem (and position) if this is the start
    // of a new block
    if (stemNum % entriesPerBlock == 0) {
      block_idx_info elIdx;
      elIdx.el = (*here).first;
      elIdx.block_ptr = ftell (stemDictFile);

      // the previous block is now complete; see if it is the
      // longest so far
      if (stemIdx.size() > 0) {
	unsigned long blockLen = elIdx.block_ptr -
	  (*(stemIdx.end()-1)).block_ptr;
	if (blockLen > sih.max_block_size) sih.max_block_size = blockLen;
      }
      
      stemIdx.push_back (elIdx);
      lastEl.erase (lastEl.begin(), lastEl.end()); // output full word
    }

    // copy the information for this stem
    stemEl.el = (*here).first;
    stemEl.equivWords = (*here).second;

    // write out the stem
    stemEl.Write (stemDictFile, &lastEl);
    
    ++here; ++stemNum;
  }

  // the loop only measures a block when the next one starts, so the
  // final block still has to be measured here
  unsigned long blocksEnd = ftell (stemDictFile);
  if (stemIdx.size() > 0) {
    unsigned long lastBlockLen = blocksEnd - (*(stemIdx.end()-1)).block_ptr;
    if (lastBlockLen > sih.max_block_size) sih.max_block_size = lastBlockLen;
  }

  // write out the element indexes
  sih.num_blocks = stemIdx.size();
  sih.block_idx_start = blocksEnd;
  WriteBlockIdx (stemDictFile, stemIdx);
  
  // write out the stem dictionary header over the place-holder
  fseek (stemDictFile, headerPos, SEEK_SET);
  sih.Write (stemDictFile);


  // close open files
  fclose (stemDictFile);
  
  // print out information; the casts make the arguments match the
  // format whatever the exact types of the header fields are
#ifndef SILENT
  Message ("Num word stems = %lu\n", (unsigned long) sih.dict_size);
  Message ("Max stem block size = %lu\n", (unsigned long) sih.max_block_size);
  Message ("Number of stem blocks written = %lu\n",
	   (unsigned long) sih.num_blocks);
#endif
}


// main -- command line driver for the stem index builder.
//
// Options:
//   -f name      base name of the index files (input and output)
//   -d dir       base path, handed to set_basepath
//   -b n         number of stems per block (default 16, must be >= 1)
//   -s 1|2|3     stem method (required)
//   -a stemmer   stemmer description, translated by stemmernumber
//   -h           print usage and exit
//
// Returns 0 once the index has been written.  Usage and validation
// problems end the program through exit or FatalError.
int main (int argc, char **argv) {
  unsigned long entriesPerBlock = 16;
  char emptyName[] = ""; // writable default, not a string literal
  char *filename = emptyName;
  int ch;
  int stemMethod = 0; // illegal value (no translation)
  int stemmerNum = 0; // English stemmer
  msg_prefix = argv[0];
  opterr = 0;

  // "a:" has to be listed here or -a is treated as an unknown option
  while ((ch = getopt (argc, argv, "f:d:b:s:a:h")) != -1) {
    switch (ch) {
    case 'f':		// input file
      filename = optarg;
      break;
    case 'd':
      set_basepath (optarg);
      break;
    case 'b': {
      // check before storing in an unsigned variable: zero would cause
      // a division by zero when writing and a negative number would
      // wrap round to an enormous block size
      int requested = atoi (optarg);
      if (requested < 1)
	FatalError (1, "Entries per block must be at least 1");
      entriesPerBlock = requested;
      break;
    }
    case 's':
      stemMethod = atoi (optarg);
      break;
    case 'a':
      stemmerNum = stemmernumber ((unsigned char *) optarg);
      break;
    case 'h':
    case '?':
      fprintf (stderr, "usage: %s [-d directory] "
	       "[-b entries-per-block] [-h] -s 1|2|3 "
	       "[-a stemmer-method] -f name\n", argv[0]);
      exit (1);
    }
  }
  
  if (stemMethod < 1 || stemMethod > 3)
    FatalError (1, "Stem method must be 1, 2 or 3");

  // read in the dictionary and create the in memory dictionary
  StemMapDict stemDict;
  CreateStemDict (filename, stemDict, stemMethod, stemmerNum);
  
  // write out the dictionary as a blocked file
  WriteStemDict (filename, stemDict, stemMethod, stemmerNum, entriesPerBlock);
  
  return 0;
}
