/**************************************************************************
 *
 * mg_invf_dict.cpp -- Program to build the blocked stemmed dictionary
 * Copyright (C) 1999  Rodger McNab
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id: mg_invf_dict.cpp,v 1.2 2000/01/18 03:53:24 rjmcnab Exp $
 *
 **************************************************************************/

#include "sysfuncs.h"
#include "messages.h"

#include "mg_files.h"
#include "invf.h"

/*
   $Log: mg_invf_dict.cpp,v $
   Revision 1.2  2000/01/18 03:53:24  rjmcnab
   Fixed a couple of bugs and made building silent if needed.

   Revision 1.1  2000/01/14 02:26:16  sjboddie
   Rodgers new C++ mg

 */


/*
 * process_files -- build the blocked dictionary for one index.
 *
 * Reads the dictionary (INVF_DICT_SUFFIX) and the inverted index file
 * (INVF_IDX_SUFFIX) belonging to `filename' and creates the blocked
 * dictionary (INVF_DICT_BLOCKED_SUFFIX).  The output consists of a
 * header, the word entries, the tag entries, the word block index and
 * the tag block index.  The header is written twice: once as a
 * placeholder and once more at the end when all offsets, block counts
 * and maximum block sizes are known.
 *
 *   filename        -- base name handed to open_file/create_file
 *   entriesPerBlock -- number of entries in each block; must be > 0
 *
 * The files are opened with MG_ABORT; a zero entriesPerBlock is reported
 * on stderr and terminates the program with exit status 1.
 */
static void process_files (char *filename, unsigned long entriesPerBlock) {
  // a block size of zero would make the "start of a new block" tests
  // below take a modulus by zero, so refuse it before touching any file
  if (entriesPerBlock == 0) {
    fprintf (stderr, "mg_invf_dict: entries per block must be "
             "greater than zero\n");
    exit (1);
  }

  // open the dictionary
  FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
                              MAGIC_STEM_BUILD, MG_ABORT);
  invf_dict_header idh;
  idh.Read (dictFile);

  // open the inverted index file
  FILE *invfIdxFile = open_file (filename, INVF_IDX_SUFFIX, "rb",
                                 MAGIC_INVI, MG_ABORT);

  // create the blocked dictionary
  FILE *blockDictFile = create_file (filename, INVF_DICT_BLOCKED_SUFFIX, "wb",
                                     MAGIC_STEM, MG_ABORT);
  block_dict_header bdh;
  bdh.lookback = idh.lookback;
  bdh.word_dict_start = idh.word_dict_start;
  bdh.word_dict_size = idh.word_dict_size;
  bdh.tag_dict_start = idh.tag_dict_start;
  bdh.tag_dict_size = idh.tag_dict_size;
  bdh.num_docs = idh.num_docs;
  bdh.num_frags = idh.num_frags;
  bdh.num_words = idh.num_words;
  bdh.total_bytes = idh.total_bytes;
  bdh.index_string_bytes = idh.index_string_bytes;
  bdh.num_levels = idh.num_levels;

  // remember where the header starts so that the final rewrite goes to
  // exactly the same place, whatever create_file put in front of it
  long headerPos = ftell (blockDictFile);

  // placeholder header; it is rewritten once all fields are known
  bdh.Write (blockDictFile);


  // write out the word part of the dictionary

  bdh.entries_per_wblk = entriesPerBlock;
  bdh.max_wblk_size = 0;
  bdh.wblk_start = ftell (blockDictFile);

  fseek (dictFile, idh.word_dict_start, SEEK_SET);

  block_idx wordIdx;
  word_block_dict_el wordBlockEl;
  wordBlockEl.SetNumLevels (idh.num_levels);


  unsigned long wordNum;
  unsigned long wordInvfPtr;
  UCArray lastEl;
  word_dict_el wordEl;
  wordEl.SetNumLevels (idh.num_levels);
  for (wordNum=0; wordNum<idh.word_dict_size; wordNum++) {
    // read in the next word and inverted file pointer
    wordEl.Read (dictFile, idh.num_levels);
    ReadUL (invfIdxFile, wordInvfPtr);

    // remember this word (and position) if this is the start
    // of a new block
    if (wordNum % entriesPerBlock == 0) {
      block_idx_info elIdx;
      elIdx.el = wordEl.el;
      elIdx.block_ptr = ftell (blockDictFile);

      // the previous block is now complete; see if it is the
      // longest so far
      if (wordIdx.size() > 0) {
        unsigned long blockLen = elIdx.block_ptr -
          (*(wordIdx.end()-1)).block_ptr;
        if (blockLen > bdh.max_wblk_size) bdh.max_wblk_size = blockLen;
      }

      wordIdx.push_back (elIdx);
      lastEl.erase (lastEl.begin(), lastEl.end()); // output full word
    }

    // copy the information for this word
    wordBlockEl.el = wordEl.el;
    wordBlockEl.frag_occur = wordEl.frag_occur;
    wordBlockEl.freq = wordEl.freq;
    wordBlockEl.invf_ptr = wordInvfPtr;
    unsigned long tempI;
    for (tempI=0; tempI<idh.num_levels; tempI++)
      wordBlockEl.levelFreqs[tempI] = wordEl.levelFreqs[tempI];

    // write out the word; lastEl is the previous word of this block
    // (empty for the first word of a block)
    wordBlockEl.Write (blockDictFile, &lastEl, idh.num_levels);

    lastEl = wordBlockEl.el;
  }

  // the loop only measures a block when the following one starts, so
  // the final (possibly the only) word block has to be measured here
  if (wordIdx.size() > 0) {
    unsigned long wordPartEnd = ftell (blockDictFile);
    unsigned long blockLen = wordPartEnd - (*(wordIdx.end()-1)).block_ptr;
    if (blockLen > bdh.max_wblk_size) bdh.max_wblk_size = blockLen;
  }


  // write out the tag part of the dictionary

  bdh.entries_per_tblk = entriesPerBlock;
  bdh.max_tblk_size = 0;
  bdh.tblk_start = ftell (blockDictFile);

  fseek (dictFile, idh.tag_dict_start, SEEK_SET);

  block_idx tagIdx;
  block_dict_el tagBlockEl;

  unsigned long tagNum;
  unsigned long tagInvfPtr;
  dict_el tagEl;
  lastEl.erase (lastEl.begin(), lastEl.end());
  for (tagNum=0; tagNum<idh.tag_dict_size; tagNum++) {
    // read in the next tag and inverted file pointer
    tagEl.Read (dictFile);
    ReadUL (invfIdxFile, tagInvfPtr);

    // remember this tag (and position) if this is the start
    // of a new block
    if (tagNum % entriesPerBlock == 0) {
      block_idx_info elIdx;
      elIdx.el = tagEl.el;
      elIdx.block_ptr = ftell (blockDictFile);

      // the previous block is now complete; see if it is the
      // longest so far
      if (tagIdx.size() > 0) {
        unsigned long blockLen = elIdx.block_ptr -
          (*(tagIdx.end()-1)).block_ptr;
        if (blockLen > bdh.max_tblk_size) bdh.max_tblk_size = blockLen;
      }

      tagIdx.push_back (elIdx);
      lastEl.erase (lastEl.begin(), lastEl.end()); // output full word
    }

    // copy the information for this tag
    tagBlockEl.el = tagEl.el;
    tagBlockEl.frag_occur = tagEl.frag_occur;
    tagBlockEl.freq = tagEl.freq;
    tagBlockEl.invf_ptr = tagInvfPtr;

    // write out the tag; lastEl is the previous tag of this block
    // (empty for the first tag of a block)
    tagBlockEl.Write (blockDictFile, &lastEl);

    lastEl = tagBlockEl.el;
  }

  // as for the words: measure the final (possibly the only) tag block
  if (tagIdx.size() > 0) {
    unsigned long tagPartEnd = ftell (blockDictFile);
    unsigned long blockLen = tagPartEnd - (*(tagIdx.end()-1)).block_ptr;
    if (blockLen > bdh.max_tblk_size) bdh.max_tblk_size = blockLen;
  }


  // write out the element indexes
  bdh.num_wblks = wordIdx.size();
  bdh.wblk_idx_start = ftell (blockDictFile);
  WriteBlockIdx (blockDictFile, wordIdx);

  bdh.num_tblks = tagIdx.size();
  bdh.tblk_idx_start = ftell (blockDictFile);
  WriteBlockIdx (blockDictFile, tagIdx);

  // write out the completed blocked dictionary header over the
  // placeholder written at the start
  fseek (blockDictFile, headerPos, SEEK_SET);
  bdh.Write (blockDictFile);


  // close open files
  fclose (blockDictFile);
  fclose (invfIdxFile);
  fclose (dictFile);

  // print out information
  // (the values are converted explicitly so that they always match the
  // %lu conversion, whatever integer type the header fields have)
#ifndef SILENT
  Message ("Max word block size = %lu\n",
           static_cast<unsigned long> (bdh.max_wblk_size));
  Message ("Max tag block size = %lu\n",
           static_cast<unsigned long> (bdh.max_tblk_size));
  Message ("Number of word blocks written = %lu\n",
           static_cast<unsigned long> (bdh.num_wblks));
  Message ("Number of tag blocks written = %lu\n",
           static_cast<unsigned long> (bdh.num_tblks));
#endif
}


/*
 * main -- parse the command line and build the blocked dictionary.
 *
 *   -f input_file         base name passed to process_files (default "")
 *   -d data directory     passed to set_basepath
 *   -b entries-per-block  entries in each dictionary block (default 16);
 *                         must be a positive integer
 *   -h                    print the usage line and exit
 *
 * Returns 0 after process_files completes; usage requests and invalid
 * options terminate the program with exit status 1.
 */
int main (int argc, char **argv) {
  unsigned long entriesPerBlock = 16;
  // process_files takes a non-const char *, and a string literal may not
  // be bound to one in standard C++, so the default is a writable array
  static char noFilename[] = "";
  char *filename = noFilename;
  int ch;
  msg_prefix = argv[0];
  opterr = 0;

  while ((ch = getopt (argc, argv, "f:d:b:h")) != -1) {
    switch (ch) {
    case 'f':           // input file
      filename = optarg;
      break;
    case 'd':           // data directory
      set_basepath (optarg);
      break;
    case 'b': {         // entries per block
      // zero would lead to a modulus by zero in process_files and a
      // negative value would wrap to a huge unsigned block size, so
      // both are rejected here along with non-numeric input; trailing
      // characters after the number are ignored, as atoi ignored them
      char *numEnd = NULL;
      long value = strtol (optarg, &numEnd, 10);
      if (numEnd == optarg || value <= 0) {
        fprintf (stderr, "%s: entries-per-block must be a positive "
                 "integer (got \"%s\")\n", argv[0], optarg);
        exit (1);
      }
      entriesPerBlock = static_cast<unsigned long> (value);
      break;
    }
    case 'h':
    case '?':
      fprintf (stderr, "usage: %s [-f input_file] "
               "[-d data directory] [-b entries-per-block] "
               "[-h]\n", argv[0]);
      exit (1);
    }
  }

  process_files (filename, entriesPerBlock);
  return 0;
}
