/**************************************************************************
 *
 * mg_invf_dump.cpp -- Program to dump out an inverted file
 * Copyright (C) 1994  Neil Sharman
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id: mg_invf_dump.cpp,v 1.2 2001/02/02 01:12:29 kjm18 Exp $
 *
 **************************************************************************/

#include "sysfuncs.h"

#include "messages.h"
#include "bitio_m_stdio.h"
#include "bitio_gen.h"
#include "netorder.h"  /* [RPAP - Jan 97: Endian Ordering] */

#include "mg_files.h"
#include "locallib.h"
#include "words.h"
#include "invf.h"
#include "WordData.h"

/*
   $Log: mg_invf_dump.cpp,v $
   Revision 1.2  2001/02/02 01:12:29  kjm18
   added more command line options, and better help message

   Revision 1.1  2000/01/14 02:26:17  sjboddie
   Rodgers new C++ mg

   Revision 1.1  1999/10/11 02:57:55  cs025
   Base install of MG-PP

   Revision 1.1  1999/08/10 21:18:09  sjboddie
   renamed mg-1.3d directory mg

   Revision 1.2  1998/11/25 07:55:46  rjmcnab

   Modified mg to that you can specify the stemmer you want
   to use via a command line option. You specify it to
   mg_passes during the build process. The number of the
   stemmer that you used is stored within the inverted
   dictionary header and the stemmed dictionary header so
   the correct stemmer is used in later stages of building
   and querying.

   Revision 1.1  1998/11/17 09:35:05  rjmcnab
   *** empty log message ***

   * Revision 1.3  1994/11/29  00:32:01  tes
   * Committing the new merged files and changes.
   *
   * Revision 1.2  1994/09/20  04:41:50  tes
   * For version 1.1
   *
 */


static void PrintInvfWord (FILE *invfFile,
			   invf_dict_header &idh,
			   invf_file_header &ifh,
			   word_dict_el &wordEl,
			   unsigned long wordStart,
			   bool printFrags) {
  cout << wordEl.frag_occur << " \"" << wordEl.el << "\"\n";
  
  if (printFrags) {
    // seek to the appropriate place in the inverted file
    fseek (invfFile, wordStart, SEEK_SET);
    
    stdio_bitio_buffer buffer(invfFile);
    
    unsigned long B = BIO_Bblock_Init (idh.num_frags, wordEl.frag_occur);
    unsigned long fragNum = 0;
    unsigned long i;
    for (i=0; i<wordEl.frag_occur; i++) {
      unsigned long delta = buffer.bblock_decode (B, NULL);
      fragNum += delta;
      cout << " " << fragNum;
      
      if (!ifh.word_level_index ) {
	unsigned long count = buffer.gamma_decode (NULL);
	cout << "(" << count << ")";
      } else {
	cout << "(1)";
      }
    }
    
    cout << "\n";
  
    buffer.done();
  }
}

static void PrintInvfTag (FILE *invfFile,
			  invf_dict_header &idh,
			  invf_file_header &/*ifh*/,
			  dict_el &tagEl,
			  unsigned long tagStart,
			  bool printFrags) {
  cout << tagEl.frag_occur << " \"<" << tagEl.el << ">\"\n";
  
  if (printFrags) {
    // seek to the appropriate place in the inverted file
    fseek (invfFile, tagStart, SEEK_SET);
    
    stdio_bitio_buffer buffer(invfFile);
    
    unsigned long pTag = tagEl.frag_occur*2;
    unsigned long B = BIO_Bblock_Init (idh.num_frags+pTag, pTag);
    unsigned long fragNum = 0;
    unsigned long i;
    for (i=0; i<tagEl.frag_occur; i++) {
      unsigned long delta = buffer.bblock_decode (B, NULL)-1;
      fragNum += delta;
      cout << " " << fragNum;
      cout << "-";
      delta = buffer.bblock_decode (B, NULL)-1;
      fragNum += delta;
      cout << fragNum;
    }
    
    cout << "\n";
    
    buffer.done();
  }
}

static void PrintHeaderInfo (invf_dict_header &idh,
			     invf_file_header &ifh) {
  cerr << "Lookback:         " << idh.lookback << "\n";
  cerr << "Word Dict Size:   " << idh.word_dict_size << "\n";
  cerr << "Tag Dict Size:    " << idh.tag_dict_size << "\n";
  cerr << "Num Documents:    " << idh.num_docs << "\n";
  cerr << "Num Fragments:    " << idh.num_frags << "\n";
  cerr << "Num Words:        " << idh.num_words << "\n";

  cerr << "Skip Mode:        " << ifh.skip_mode << "\n";
  cerr << "Word Level Index: " << ifh.word_level_index << "\n";

  cerr << "\n";
}


static void process_files (char *filename,
			   bool printHeader,
			   bool printWords,
			   bool printTags,
			   bool printFrags) {
  // open the dictionary
  FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
			MAGIC_STEM_BUILD, MG_ABORT);
  invf_dict_header idh;
  idh.Read (dictFile);

  // open the inverted file
  FILE *invfFile = open_file (filename, INVF_SUFFIX, "rb",
			MAGIC_INVF, MG_ABORT);
  
  invf_file_header ifh;
  ifh.Read (invfFile);

  if (ifh.skip_mode != SKIP_MODE_NO_SKIPS)
    FatalError (1, "The invf file contains skips. Unable to dump.");

  // print out header information
  if (printHeader) {
    PrintHeaderInfo (idh, ifh);
  }

  // open the inverted index
  FILE *invfIdxFile = open_file (filename, INVF_IDX_SUFFIX, "rb",
				 MAGIC_INVI, MG_ABORT);

  // go to the start of the word dictionary
  fseek (dictFile, idh.word_dict_start, SEEK_SET);

  // process all the words
  if (printWords) {
    unsigned long wordNum;
    unsigned long wordStart;
    word_dict_el wordEl;
    wordEl.SetNumLevels (idh.num_levels);
    for (wordNum=0; wordNum<idh.word_dict_size; wordNum++) {
      wordEl.Read (dictFile, idh.num_levels);
      ReadUL (invfIdxFile, wordStart);
      PrintInvfWord (invfFile, idh, ifh, wordEl, wordStart, printFrags);
    }
  }

  // process all the tags
  if (printTags) {
    unsigned long tagNum;
    unsigned long tagStart;
    dict_el tagEl;
    for (tagNum=0; tagNum<idh.tag_dict_size; tagNum++) {
      tagEl.Read (dictFile);
      ReadUL (invfIdxFile, tagStart);
      PrintInvfTag (invfFile, idh, ifh, tagEl, tagStart, printFrags);
    }
  }
  // close the open files
  fclose (invfIdxFile);
  fclose (invfFile);
  fclose (dictFile);
}


int main (int argc, char **argv) {
  char *dir_name, *filename = "";
  int ch;
  msg_prefix = argv[0];
  dir_name = getenv ("MGDATA");
  opterr = 0;

  bool printHeader = false;
  bool printWords = false;
  bool printTags = false;
  bool printFrags = false;
  
  msg_prefix = argv[0];
  while ((ch = getopt (argc, argv, "hrwtnf:d:")) != -1) {
    switch (ch) {
    case 'f':		// input file
      filename = optarg;
      break;
    case 'd':
      set_basepath(optarg);
      break;
    case 'r':
      printHeader = true;
      break;
    case 'w':
      printWords = true;
      break;
    case 'n':
      printFrags = true;
      break;
    case 't':
      printTags = true;
      break;
    case 'h':
    case '?':
      fprintf (stderr, "usage: %s [-h] [-r] [-w] [-t] [-n] [-f input_file]"
	       "[-d data directory]\n(-rwnt:print header, words, tags, fragnums)\n", 
	       argv[0]);
      exit (1);
    }
  }
  
  process_files (filename, printHeader, printWords, printTags, printFrags);

  return 0;
}
