/**************************************************************************
 *
 * ivf.pass2.cpp -- Memory efficient pass 2 inversion
 * Copyright (C) 1999  Rodger McNab
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id: ivf.pass2.cpp,v 1.1 2000/01/14 02:26:07 sjboddie Exp $
 *
 **************************************************************************/

/*
   $Log: ivf.pass2.cpp,v $
   Revision 1.1  2000/01/14 02:26:07  sjboddie
   Rodgers new C++ mg

 */

#include <stdio.h>
#include <unistd.h>

#include "non_ansi.h"

#include "sysfuncs.h"

#include "mg_files.h"
#include "invf.h"
#include "mg.h"
#include "build.h"
#include "locallib.h"
#include "UCArray.h"
#include "bitio_m_random.h"
#include "bitio_m_stdio.h"
#include "bitio_m_mems.h"
#include "bitio_gen.h"
#include <stdio.h>
#include "words.h"
#include "messages.h"
#include "netorder.h"
#include "FIvfLevelInfo.h"
#include "perf_hash.h"
#include "string.h"

#include "longlong.h"

// <io.h> declares _chsize(), which is used to truncate the inverted
// file on Windows. Test the same macro (__WIN32__) as the code that
// makes that call so the declaration is actually pulled in.
#ifdef __WIN32__
#include <io.h>
#endif

#if defined(GSDL_USE_OBJECTSPACE)
#  include <ospace\std\map>
#elif defined(GSDL_USE_STL_H)
#  include <map.h>
#else
#  include <map>
#endif


// Select the bit buffer positioning calls: the *_LL variants are used
// when USE_LONG_LONG is defined, the plain ones otherwise.
#ifdef USE_LONG_LONG
#define SEEK_X seek_LL
#define TELL_X tell_LL
#else
#define SEEK_X seek
#define TELL_X tell
#endif

// Buffer size passed to random_bitio_buffer::attachFile. Parenthesised
// so the macro expands safely inside a larger expression.
#ifndef RND_BUF_SIZE
#define RND_BUF_SIZE (8*1024)
#endif


// Progress counters shared by the pass 2 routines.

// number of documents processed so far (incremented once per call
// to process_ivf_2)
static unsigned long numDocs = 0;
// number of documents processed in the current chunk (reset by ReadChunk)
static unsigned long numChunkDocs = 0;
// number of documents in the current chunk, as decoded from the chunk file
static unsigned long numDocsInChunk = 0;

// running fragment number (incremented by ProcessOpenTag / ProcessText)
static unsigned long numFrags = 0;
// number of fragments in the current chunk, as decoded from the chunk file
static unsigned long numFragsInChunk = 0;
// value of numFrags at the time the current chunk was read
static unsigned long chunkStartFragNum = 0;




// Book-keeping for one entry of the in-memory inverted buffer.
// "start" and "here" are bit positions within the buffer, "lastFragNum"
// is the last fragment number encoded for the entry and "lgB" is the
// log2 of the block size used to encode it.
struct BitPtr {
  unsigned long start;
  unsigned long here;
  unsigned long lastFragNum;
  unsigned long lgB;

  // a new record starts out with every field zero
  BitPtr () : start (0), here (0), lastFragNum (0), lgB (0) { }

  // set every field back to zero
  void Clear () {
    start = 0;
    here = 0;
    lastFragNum = 0;
    lgB = 0;
  }
};

class WordBitPtrs {
protected:
  unsigned long numWords;
  unsigned long numTags;
  unsigned long size;
  BitPtr *wordBitPtrs;

  void CheckBufOverrun (unsigned long num) {
    if (wordBitPtrs[num].here > wordBitPtrs[num+1].start) {
      cerr << "numDocs: " << numDocs << "\n";
      cerr << "numChunkDocs: " << numChunkDocs << "\n";
      cerr << "numDocsInChunk: " << numDocsInChunk << "\n";
      cerr << "numFrags: " << numFrags << "\n";
      cerr << "numFragsInChunk: " << numFragsInChunk << "\n";
      cerr << "chunkStartFragNum: " << chunkStartFragNum << "\n";
      cerr << "num: " << num << "\n";
      cerr << "[num].start: " << wordBitPtrs[num].start << "\n";
      cerr << "[num].here: " << wordBitPtrs[num].here << "\n";
      cerr << "[num+1].start: " << wordBitPtrs[num+1].start << "\n";
      FatalError (1, "Bit buffer overrun");
    }
  }
  
public:
  void Clear ();
  WordBitPtrs () { wordBitPtrs = NULL; Clear(); }
  ~WordBitPtrs ();
  void SetSize (unsigned long _numWords,
		unsigned long _numTags);

  void ResetPtrs () {
    if (wordBitPtrs == NULL) return;
    unsigned long i;
    for (i=0; i<size; i++) wordBitPtrs[i].Clear();
  }
  
  BitPtr &GetWordBitPtr (unsigned long wordNum)
    { return wordBitPtrs[wordNum]; }
  unsigned long &GetWordStart (unsigned long wordNum)
    { return wordBitPtrs[wordNum].start; }
  unsigned long &GetWordHere (unsigned long wordNum)
    { return wordBitPtrs[wordNum].here; }
  void CheckWordBufOverrun (unsigned long wordNum)
    { CheckBufOverrun (wordNum); }

  BitPtr &GetTagBitPtr (unsigned long tagNum)
    { return wordBitPtrs[tagNum + numWords]; }
  unsigned long &GetTagStart (unsigned long tagNum)
    { return wordBitPtrs[tagNum + numWords].start; }
  unsigned long &GetTagHere (unsigned long tagNum)
    { return wordBitPtrs[tagNum + numWords].here; }
  void CheckTagBufOverrun (unsigned long tagNum)
    { CheckBufOverrun (tagNum + numWords); }

  BitPtr &GetEndBitPtr ()
    { return wordBitPtrs[size-1]; }
  unsigned long &GetEndStart ()
    { return wordBitPtrs[size-1].start; }
  unsigned long &GetEndHere ()
    { return wordBitPtrs[size-1].here; }
};


// Per-tag state kept while documents are processed: whether the tag is
// currently open, the fragment number at which it was opened and the
// tag's number in the tag dictionary.
struct IP2TagInfo {
  bool inTag;
  unsigned long startFrag;
  unsigned long tagNum;

  // a tag starts out closed, with fragment and tag number zero
  IP2TagInfo () : inTag (false), startFrag (0), tagNum (0) { }
};

// maps tags (keyed by tag name, ordered by DictLTUCArray) to their
// tag information
typedef map<UCArray, IP2TagInfo, DictLTUCArray> TagMapDict;


// class to handle the translation of occurrence order
// to dictionary order for words and tags.
//
// The translation table is read sequentially from a file: entries for
// words come first (wordDictSize of them), entries for tags follow.
// Asking for an entry before the current position restarts the scan
// from the beginning of the table.
class OccurToDictConverter {
protected:
  // number of table entries decoded so far
  unsigned long pos;
  // most recently decoded entry
  unsigned long val;
  // translation table file (NULL when closed) and its bit reader
  FILE *transFile;
  random_bitio_buffer rbs;

  unsigned long wordDictSize;
  unsigned long tagDictSize;

  // reposition the reader on the first entry and reset pos
  void SeekStart ();
  // return table entry "num", decoding forward as far as necessary
  unsigned long TranslateNum (unsigned long num);
  
public:
  OccurToDictConverter ();
  ~OccurToDictConverter ();

  // open the translation file belonging to "filename"; any file
  // already open is closed first
  void Open (char *filename, unsigned long _wordDictSize,
	     unsigned long _tagDictSize);

  // Close frees all allocated memory
  void Close ();

  // word entries are stored at the front of the table
  unsigned long TranslateWord (unsigned long occurNum)
    { return TranslateNum (occurNum); }
  // tag entries are stored after the wordDictSize word entries
  unsigned long TranslateTag (unsigned long occurNum)
    { return TranslateNum (occurNum+wordDictSize); }
};


// State of one entry of the on-disk inverted file: the bit position
// where the entry starts, the bit position reached so far, the last
// fragment number written and the B value used to encode the entry.
struct InvfStateRec {
  mg_ullong start;
  mg_ullong here;
  unsigned long lastFragNum;
  unsigned long B;

  InvfStateRec () { Clear (); }

  // set every field back to zero
  void Clear () {
    start = 0;
    here = 0;
    lastFragNum = 0;
    B = 0;
  }
};


// number of state records held in memory at any one time
#define ISR_SIZE 1024

// File backed array of InvfStateRec. A window of ISR_SIZE consecutive
// records, starting at record startNum, is kept in memory; the other
// records live in stateFile.
class InvfStateCache {
protected:
  InvfStateRec recCache [ISR_SIZE];
  unsigned long startNum;

  FILE *stateFile;

  // zero every record in the in-memory window
  void ClearCache () {
    for (unsigned int idx = 0; idx != ISR_SIZE; ++idx) {
      recCache[idx].Clear();
    }
  }

public:
  InvfStateCache ();
  ~InvfStateCache ();

  void Open (char *filename);
  void Close ();

  // previous references to state records may be
  // invalidated calling GetRec
  InvfStateRec &GetRec (unsigned long num);
};


// dictionary header (read by init_ivf_2) and the bit pointers into
// the in-memory inverted buffer
static invf_dict_header idh;
static WordBitPtrs bitPtrs;

// chunk description file and the bit reader attached to it
static FILE *chunkFile = NULL;
static stdio_bitio_buffer chunkBuf;

// in-memory inverted buffer; its size in bytes is read from the
// chunk file by init_ivf_2
static unsigned long ivfMemBufSize = 0;
static char *ivfMemBuf = NULL;

// word and tag dictionaries. a map is used for the tag dictionary
// as it should never be very big (and the perfect hash function
// sometimes has trouble with small values).
static perf_hash_data *wordHashDict = NULL;
static TagMapDict tagMapDict;

// information about all the different levels
static FIvfLevel ivfLevel;

// occurrence order to dictionary order translation
static OccurToDictConverter occurConvert;

// information about the state of the inverted file
static InvfStateCache invfState;

// copy of the filename given to init_ivf_2; passed to DiskMerge by
// process_ivf_2
static char collectFilename[512];


// Releases the table and returns the object to its empty state.
void WordBitPtrs::Clear () {
  // delete [] on a NULL pointer does nothing, so no test is needed
  delete [] wordBitPtrs;
  wordBitPtrs = NULL;

  size = 0;
  numTags = 0;
  numWords = 0;
}

// Frees the table; delete [] on a NULL pointer does nothing.
WordBitPtrs::~WordBitPtrs () {
  delete [] wordBitPtrs;
}

// Discards any existing table and allocates a fresh one with a record
// for every word, a record for every tag and one extra end record.
void WordBitPtrs::SetSize (unsigned long _numWords,
			   unsigned long _numTags) {
  Clear();

  numTags = _numTags;
  numWords = _numWords;
  size = _numWords + _numTags + 1;
  wordBitPtrs = new BitPtr [size];
}


// Moves the reader to the first table entry, which lies just past one
// unsigned long's worth of bits at the head of the file. Does nothing
// when no file is open.
void OccurToDictConverter::SeekStart () {
  if (transFile != NULL) {
    pos = 0;
    rbs.SEEK_X (sizeof (unsigned long) * 8);
  }
}

unsigned long OccurToDictConverter::TranslateNum (unsigned long num) {
  if (num < pos) SeekStart ();
  while (pos <= num) {
    if (pos < wordDictSize)
      val = rbs.binary_decode (wordDictSize + 1, NULL) - 1;
    else 
      val = rbs.binary_decode (tagDictSize + 1, NULL) - 1;
    pos++;
  }
  return val;
}

// Creates a converter with no translation file attached.
OccurToDictConverter::OccurToDictConverter ()
  : pos (0),
    val (0),
    transFile (NULL),
    wordDictSize (0),
    tagDictSize (0) {
}

// Closes the translation file if one is still open.
OccurToDictConverter::~OccurToDictConverter () {
  // Close returns immediately when no file is open
  Close ();
}

// Opens the chunk translation file belonging to "filename" and gets
// ready to decode from its first entry. A file that is already open is
// closed first.
void OccurToDictConverter::Open (char *filename, unsigned long _wordDictSize,
				 unsigned long _tagDictSize) {
  // Close returns immediately when no file is open
  Close ();

  tagDictSize = _tagDictSize;
  wordDictSize = _wordDictSize;

  transFile = open_file (filename, INVF_CHUNK_TRANS_SUFFIX, "rb",
			 MAGIC_CHUNK_TRANS, MG_ABORT);
  rbs.attachFile (transFile, RND_BUF_SIZE);

  SeekStart ();
  val = 0;
}

// Detaches and closes the translation file and resets the converter to
// its freshly constructed state. Does nothing when no file is open.
void OccurToDictConverter::Close () {
  if (transFile != NULL) {
    rbs.done ();
    fclose (transFile);
    transFile = NULL;

    val = 0;
    pos = 0;

    tagDictSize = 0;
    wordDictSize = 0;
  }
}




// Creates a cache with no state file attached.
InvfStateCache::InvfStateCache ()
  : startNum (0),
    stateFile (NULL) {
}

// Closes the state file if one is still open.
InvfStateCache::~InvfStateCache () {
  // Close returns immediately when no file is open
  Close ();
}
  
// Creates the temporary state file for "filename" and empties the
// in-memory window. The file name carries the process id and the file
// is unlinked straight after it is created. Exits the program if the
// file cannot be created.
void InvfStateCache::Open (char *filename) {
  // Close returns immediately when no file is open
  Close ();

  // build the name of the state file and create it
  char path[512];
  sprintf (path, FILE_NAME_FORMAT ".%ld", get_basepath (), filename,
	   ".invf.state", (long) getpid ());
  stateFile = fopen (path, "wb+");
  if (stateFile == NULL) {
    Message ("Unable to create \"%s\"", path);
    exit (1);
  }
  unlink (path); // file will be deleted after it is closed

  // start with an empty window positioned on record 0
  ClearCache();
  startNum = 0;
}

// Closes the state file and repositions the window on record 0. Does
// nothing when no file is open.
void InvfStateCache::Close () {
  if (stateFile != NULL) {
    fclose (stateFile);
    stateFile = NULL;
    startNum = 0;
  }
}

// Returns a reference to state record "num".
//
// If the record is not inside the in-memory window, the current window
// is written back to the state file and the ISR_SIZE aligned window
// containing "num" is loaded in its place. Records that have never been
// written (i.e. that lie beyond the end of the file) come back zeroed.
// Any reference obtained from an earlier call may therefore refer to a
// different record after this call.
//
// Calls FatalError if the state file cannot be positioned, written or
// read, since silently losing state records would corrupt the inverted
// file.
InvfStateRec &InvfStateCache::GetRec (unsigned long num) {
  // see if cached
  if ((num >= startNum) && (num < startNum + ISR_SIZE))
    return recCache[num-startNum];

  // not cached, write out this lot of records and read in
  if (fseek (stateFile, startNum*sizeof (InvfStateRec), SEEK_SET) != 0)
    FatalError (1, "Unable to seek in the inverted file state file");
  if (fwrite ((char *) recCache, sizeof (InvfStateRec), ISR_SIZE,
	      stateFile) != (size_t) ISR_SIZE)
    FatalError (1, "Unable to write to the inverted file state file");

  // read in the new set of records. the window is cleared first so
  // that records past the end of the file are left zeroed by a short
  // read
  ClearCache ();
  startNum = num - (num % ISR_SIZE);
  if (fseek (stateFile, startNum*sizeof (InvfStateRec), SEEK_SET) != 0)
    FatalError (1, "Unable to seek in the inverted file state file");
  size_t numRead = fread ((char *) recCache, sizeof (InvfStateRec),
			  ISR_SIZE, stateFile);
  // a short read is expected at the end of the file; only a genuine
  // read error is fatal
  if (numRead < (size_t) ISR_SIZE && ferror (stateFile))
    FatalError (1, "Unable to read from the inverted file state file");

  return recCache[num-startNum];
}



// Sets the first "size" bytes of "buf" to zero.
//
// buf  - buffer to clear; not touched when size is 0
// size - number of bytes to clear
static void ClearCharBuf (char *buf, unsigned long size) {
  // nothing to clear; also keeps a NULL buffer away from memset
  if (size == 0) return;
  memset (buf, 0, size);
}

static void ReadWordDict (char *filename) {
  // read in the perfect hash function for the word dictionary
  FILE *wordHashFile = open_file (filename, INVF_DICT_HASH_SUFFIX, "rb",
				  MAGIC_HASH, MG_ABORT);
  if (!(wordHashDict = read_perf_hash_data (wordHashFile))) {
    FatalError (1, "Unable to read in hash data for word dictionary");
  }
  fclose (wordHashFile);
}

static void ReadTagDict (char *filename, invf_dict_header &_idh) {
  // open the file
  FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
			      MAGIC_STEM_BUILD, MG_ABORT);

  // seek to the start of the tag dictionary
  fseek (dictFile, _idh.tag_dict_start, SEEK_SET);
  
  unsigned long tagNum;
  dict_el thisEl;
  for (tagNum = 0; tagNum < _idh.tag_dict_size; tagNum++) {
    thisEl.Read (dictFile);
    tagMapDict[thisEl.el].tagNum = tagNum;
  }
  
  fclose (dictFile);
}

// Reads the level information file of "filename" into ivfLevel.
static void ReadLevelFile (char *filename) {
  FILE *levelFile = open_file (filename, INVF_LEVEL_SUFFIX, "rb",
			       MAGIC_INVF_LEVELS, MG_ABORT);
  ivfLevel.Read (levelFile);
  fclose (levelFile);
}

// Aborts the build when the running bit counter has wrapped around.
//
// totalIBits     - counter value after the latest additions
// lastTotalIBits - counter value before them
//
// The counter only ever grows, so a value smaller than the previous one
// means the unsigned counter has overflowed. In that case a message is
// written to stderr and the program exits with status 1.
void CheckIntOverflow (mg_ullong totalIBits, mg_ullong lastTotalIBits) {
  if (totalIBits < lastTotalIBits) {
    // sizeof yields a size_t, which does not match %d; print it as an
    // unsigned long instead
    fprintf(stderr, "ERROR: The totalIBits counter (%lu byte unsigned integer) has overflowed.\n", (unsigned long) sizeof (mg_ullong));
    if (sizeof (mg_ullong) < 8) {
      fprintf(stderr, "       Try compiling with GCC to enable use of 8 bytes for this counter.\n");
    }
    fprintf(stderr, "       Build aborted.\n");
    exit(1);
  }
}

// assumes the inverted file state file has been opened
static void InitInvfState (char *filename,
			   invf_dict_header &_idh,
			   InvfStateCache &_invfState,
			   mg_ullong &totalIBits,
			   bool wordLevelIndex) {
  // read in the dictionary, setting inverted state information
  FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
			      MAGIC_STEM_BUILD, MG_ABORT);

  // seek to the start of the word dictionary
  fseek (dictFile, _idh.word_dict_start, SEEK_SET);

  // add the word entries
  word_dict_el wordEl;
  wordEl.SetNumLevels (_idh.num_levels);
  unsigned long dictWordNum, p;
  mg_ullong lastTotalIBits;
  unsigned long N = _idh.num_frags;
  for (dictWordNum=0; dictWordNum<_idh.word_dict_size; dictWordNum++) {
    // lastTotalIBits is used to detect integer overflow
    lastTotalIBits = totalIBits;

    // read the next word and associated information
    wordEl.Read (dictFile, _idh.num_levels);

    // update the state record
    p = wordEl.frag_occur;
    InvfStateRec &wisr = _invfState.GetRec (dictWordNum);
    wisr.start = totalIBits;
    wisr.here = totalIBits;
    wisr.B = BIO_Bblock_Init (N, p);
    
    // add the length of the fragment numbers
    totalIBits += BIO_Bblock_Bound_b (N, p, wisr.B);

    // if needed, add the length of the fragment frequency information
    if (!wordLevelIndex)
      totalIBits += BIO_Gamma_Bound (wordEl.freq, wordEl.frag_occur);

    // align next byte
#ifdef USE_LONG_LONG
    totalIBits = (totalIBits + 7ull) & 0xfffffffffffffff8ull;
#else
    totalIBits = (totalIBits + 7ul) & 0xfffffff8ul;
#endif

    CheckIntOverflow (totalIBits, lastTotalIBits);
  }

  
  // seek to the start of the tag dictionary
  fseek (dictFile, _idh.tag_dict_start, SEEK_SET);

  // add the tag entries
  dict_el tagEl;
  unsigned long dictTagNum;
  N = _idh.num_frags;
  for (dictTagNum=0; dictTagNum<_idh.tag_dict_size; dictTagNum++) {
    // lastTotalIBits is used to detect integer overflow
    lastTotalIBits = totalIBits;

    // read the next tag and associated information
    tagEl.Read (dictFile);

    // update the state record
    p = tagEl.frag_occur*2;
    InvfStateRec &tisr = _invfState.GetRec (dictTagNum + _idh.word_dict_size);
    tisr.start = totalIBits;
    tisr.here = totalIBits;
    tisr.B = BIO_Bblock_Init (N+p, p);
    
    // add the length of the fragment numbers (two numbers for each
    // tag, one for start and one for end)
    totalIBits += BIO_Bblock_Bound_b (N+p, p, tisr.B);

    // align next byte
#ifdef USE_LONG_LONG
    totalIBits = (totalIBits + 7ull) & 0xfffffffffffffff8ull;
#else
    totalIBits = (totalIBits + 7ul) & 0xfffffff8ul;
#endif

    CheckIntOverflow (totalIBits, lastTotalIBits);
  }

  fclose (dictFile);
}

/*
// assumes the chunk tag information has been placed in .first
static void PrintChunkInfo (unsigned long chunkMem,
			    unsigned long numChunkWords,
			    unsigned long numChunkTags) {
  static unsigned long chunksRead = 0;
  chunksRead++;
  cout << "Chunk Number: " << chunksRead << "\n";
  cout << "numChunkDocs " << numDocsInChunk << "\n";
  cout << "numChunkFrags " << numFragsInChunk << "\n";
  cout << "mem " << chunkMem << "\n";
  cout << "numWords " << numChunkWords << "\n";
  cout << "numTags " << numChunkTags << "\n\n";

  TagMapDict::iterator tagMapHere = tagMapDict.begin();
  TagMapDict::iterator tagMapEnd = tagMapDict.end();
  while (tagMapHere != tagMapEnd) {
    unsigned long tagMapNum = (*tagMapHere).second.tagNum;
    cout << (*tagMapHere).first << " " << tagMapNum << " "
	 << bitPtrs.GetTagBitPtr(tagMapNum).here << "\n";
    tagMapHere++;
  }
}
*/

// Reads the description of the next chunk from chunkBuf and lays out
// the in-memory inverted buffer for it.
//
// _idh           - dictionary header; supplies the sizes of the word
//                  and tag dictionaries
// wordLevelIndex - when false, room is also reserved for the unary
//                  coded frequency of each word
//
// On return every word and tag BitPtr holds the bit position of its
// region in ivfMemBuf (start == here), the chunk's starting fragment
// number and the log2 of its block size, and the end BitPtr marks the
// first bit past the last region.
//
// Calls FatalError if the chunk has no documents, if it needs more
// memory than ivfMemBufSize, or if the regions do not fit in the
// memory size declared by the chunk.
void ReadChunk (invf_dict_header &_idh, bool wordLevelIndex) {
  // reset globals
  numChunkDocs = 0;
  chunkStartFragNum = numFrags;

  // read in information about this chunk
  // (each header value is stored as value+1, hence the "- 1")
  numDocsInChunk = chunkBuf.gamma_decode (NULL) - 1;
  if (numDocsInChunk == 0)
    FatalError (1, "The number of docs in the current chunk is 0");

  numFragsInChunk = chunkBuf.gamma_decode (NULL) - 1;
  unsigned long chunkMem = chunkBuf.gamma_decode (NULL) - 1;

  if (chunkMem > ivfMemBufSize)
    FatalError (1, "Chunk memory size is greater than maximum");
  
  unsigned long numChunkWords = chunkBuf.gamma_decode (NULL) - 1;
  unsigned long numChunkTags = chunkBuf.gamma_decode (NULL) - 1;

  
  // reset stuff
  ClearCharBuf (ivfMemBuf, ivfMemBufSize);
  bitPtrs.ResetPtrs();  

  // read in the entries in occurrence order storing the
  // "chunkWordCount" in "start" and the "chunkFragCount"
  // in "here"
  unsigned long numOccur;
  unsigned long wordNum;
  for (numOccur=0; numOccur<numChunkWords; numOccur++) {
    wordNum = occurConvert.TranslateWord (numOccur);
    BitPtr &wordPtr = bitPtrs.GetWordBitPtr (wordNum);
    wordPtr.start = chunkBuf.gamma_decode (NULL) - 1;
    // the fragment count is only stored when the word count is at
    // least 2; otherwise it is taken to equal the word count
    if (wordPtr.start >= 2)
      wordPtr.here = chunkBuf.gamma_decode (NULL);
    else wordPtr.here = wordPtr.start;
  }
  unsigned long tagNum;
  for (numOccur=0; numOccur<numChunkTags; numOccur++) {
    tagNum = occurConvert.TranslateTag (numOccur);
    BitPtr &tagPtr = bitPtrs.GetTagBitPtr (tagNum);
    // only chunkFragCount is encoded for tags
    tagPtr.start = chunkBuf.gamma_decode (NULL) - 1;
    tagPtr.here = tagPtr.start;
  }

  /*  PrintChunkInfo (chunkMem, numChunkWords, numChunkTags);*/
  
  // create the bit ptrs in dictionary order
  // (the counts stashed in start/here above are read out and then
  // replaced by real bit positions)
  unsigned long totalIBits = 0; // only dealing with memory
  unsigned long chunkWordCount, chunkFragCount;
  for (wordNum=0; wordNum<_idh.word_dict_size; wordNum++) {
    BitPtr &wordPtr = bitPtrs.GetWordBitPtr (wordNum);
    chunkWordCount = wordPtr.start;
    chunkFragCount = wordPtr.here;
    wordPtr.start = totalIBits;
    wordPtr.here = totalIBits;
    wordPtr.lastFragNum = chunkStartFragNum;
    wordPtr.lgB = 0;
    // words that do not occur in this chunk get an empty region
    if (chunkWordCount > 0) {
      wordPtr.lgB = floorlog_2 (BIO_Bblock_Init_W (numFragsInChunk,
						   chunkFragCount));
      totalIBits += BIO_Bblock_Bound (numFragsInChunk, chunkFragCount);
      // use unary encoding for memory buffer encoding of fragment freq
      if (!wordLevelIndex) {
	totalIBits += chunkWordCount;
      }
    }
  }
  for (tagNum=0; tagNum<_idh.tag_dict_size; tagNum++) {
    BitPtr &tagPtr = bitPtrs.GetTagBitPtr (tagNum);
    chunkFragCount = tagPtr.here;
    tagPtr.start = totalIBits;
    tagPtr.here = totalIBits;
    tagPtr.lastFragNum = chunkStartFragNum;
    tagPtr.lgB = 0;
    if (chunkFragCount > 0) {
      // two numbers (start and end) are stored for every tag occurrence
      unsigned long pTag = chunkFragCount*2;
      tagPtr.lgB = floorlog_2 (BIO_Bblock_Init_W (numFragsInChunk+pTag,
						  pTag));
      unsigned long bLen = BIO_Bblock_Bound (numFragsInChunk+pTag,
					     pTag);
//        cout << tagNum + _idh.word_dict_size << " ";
//        cout << "numFrags: " << numFragsInChunk
//    	   << " chunkFragCount: " << chunkFragCount
//    	   << " B: " << 1 << tagPtr.lgB
//    	   << " blen: " << blen << "\n";
      totalIBits += bLen;
    }
  }
  // the end record marks the first bit past the last region
  bitPtrs.GetEndStart() = totalIBits;
  bitPtrs.GetEndHere() = totalIBits;

  // the regions, rounded up to whole bytes, must fit in the memory
  // size declared by the chunk
  if ((totalIBits + 7ul) >> 3ul > chunkMem) {
    cerr << "totalIBits: " << totalIBits << "\n";
    cerr << "bytes: " << ((totalIBits + 7ul) >> 3ul) << "\n";
    cerr << "chunkMem: " << chunkMem << "\n";
    FatalError (1, "Pointers exceed buffer size");
  }
}




int init_ivf_2 (const TagInfo &/*tagInfo*/, char *filename) {
  // read in compressed dictionary header
  FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
			      MAGIC_STEM_BUILD, MG_ABORT);
  idh.Read (dictFile);
  fclose (dictFile);

  // set the size of the bit ptrs
  bitPtrs.SetSize (idh.word_dict_size, idh.tag_dict_size);

  // open the chunk file and read in the maximum memory needed
  // for the inverted memory buffer
  chunkFile = open_file (filename, INVF_CHUNK_SUFFIX, "rb",
			 MAGIC_CHUNK, MG_ABORT);
  ReadUL (chunkFile, ivfMemBufSize);
  chunkBuf.attachFile (chunkFile);

  // allocate memory for the inverted buffer
  ivfMemBuf = new char [ivfMemBufSize];
  ClearCharBuf (ivfMemBuf, ivfMemBufSize);

  // read in the word dictionary
  ReadWordDict (filename);

  // read in the tag dictionary
  ReadTagDict (filename, idh);

  // read in the level information
  ReadLevelFile (filename);
  bool wordLevelIndex = ivfLevel.indexLevel.empty();
  
  // set up the translation table file
  occurConvert.Open (filename, idh.word_dict_size, idh.tag_dict_size);

  // reset some globals
  numDocs = 0;
  numChunkDocs = 0;
  numDocsInChunk = 0;
  numFrags = 0;
  numFragsInChunk = 0;
  chunkStartFragNum = 0;

  strcpy (collectFilename, filename);
  
  
  // create the inverted file
  mg_ullong totalIBits = 0;
  FILE *invfFile = create_file (filename, INVF_SUFFIX, "wb",
				MAGIC_INVF, MG_ABORT);
  totalIBits += sizeof (unsigned long) * 8; // magic number
  totalIBits += 8 * 200;                    // 200 byte gap -- why??????
  fclose (invfFile);

  // init the inverted file state cache
  invfState.Open (filename);
  InitInvfState (filename, idh, invfState, totalIBits, wordLevelIndex);

  return COMPALLOK;
}

// Closes an open tag at the current fragment: appends the tag's start
// and end to its region of the in-memory inverted buffer and marks the
// tag as closed. Does nothing if the tag is not open. The tag name
// parameter is unused.
static void CloseTextTag (IP2TagInfo &tInfo, const UCArray &/*tagName*/) {
  if (!tInfo.inTag) return;

  BitPtr &ptr = bitPtrs.GetTagBitPtr (tInfo.tagNum);
  unsigned long closeFrag = numFrags;
  int blockSize = 1 << ptr.lgB;

  // append to the tag's region, continuing from where it left off
  mems_bitio_buffer buffer ((u_char *) ivfMemBuf, ptr.here);

  // first the distance from the previous end to this start, then the
  // distance from this start to this end (each stored plus one)
  buffer.bblock_encode (tInfo.startFrag - ptr.lastFragNum + 1,
			blockSize, NULL);
  buffer.bblock_encode (closeFrag - tInfo.startFrag + 1, blockSize, NULL);

  ptr.lastFragNum = closeFrag;
  ptr.here = buffer.position();
  buffer.encodeDone();

  // make sure the region has not run into the one after it
  bitPtrs.CheckTagBufOverrun (tInfo.tagNum);

  // the tag is now closed
  tInfo.startFrag = 0;
  tInfo.inTag = false;
}

// Handles an opening tag: closes any still open occurrence of the same
// tag, records the current fragment as the tag's start and, when the
// tag is the index level tag, starts a new fragment.
static void ProcessOpenTag (const TextEl &el, bool &inFrag) {
  IP2TagInfo &tInfo = tagMapDict[el.tagName];

  // CloseTextTag returns immediately if the tag is not open
  CloseTextTag (tInfo, el.tagName);

  // the tag opens at the current fragment number
  tInfo.startFrag = numFrags;
  tInfo.inTag = true;

  // an empty index level means a word level index, which has no
  // fragment tags
  if (ivfLevel.indexLevel.empty()) return;

  if (el.tagName == ivfLevel.indexLevel) {
    inFrag = true;
    numFrags++;
  }
}

// Handles a closing tag: leaves the current fragment when the tag is
// the index level tag, then closes the tag itself.
static void ProcessCloseTag (const TextEl &el, bool &inFrag) {
  // an empty index level means a word level index, which has no
  // fragment tags
  const bool hasIndexLevel = !ivfLevel.indexLevel.empty();
  if (hasIndexLevel && el.tagName == ivfLevel.indexLevel) {
    inFrag = false;
  }

  CloseTextTag (tagMapDict[el.tagName], el.tagName);
}

// Adds every indexable word of a text element to the in-memory
// inverted buffer.
//
// el     - the text element to index
// inFrag - whether the text lies inside an index level fragment; when
//          an index level is in use, text outside fragments is skipped
//
// For a word level index every word starts a new fragment. Otherwise
// the first occurrence of a word in a fragment adds a new entry with a
// frequency of one and later occurrences in the same fragment extend
// the unary coded frequency of that entry.
//
// NOTE(review): textEnd is computed as end() - 1, which for an empty
// text element lies before begin(); confirm that callers never pass
// empty text or that inaword / ParseNonindexWord tolerate it.
static void ProcessText (const TextEl &el, bool &inFrag) {
  // make sure this text is to be indexed
  bool wordLevelIndex = ivfLevel.indexLevel.empty();
  if (!wordLevelIndex && !inFrag) return;

  // textEnd refers to the last character, not one past it
  const unsigned char *textHere = el.text.begin();
  const unsigned char *textEnd = el.text.end() - 1;
  unsigned char mgWord[MAXSTEMLEN + 1];
  
  if (!inaword (textHere, textEnd))
    ParseNonindexWord (textHere, textEnd);

  
  // Alternately parse off words and non-words from the input

  while (textHere <= textEnd) {
    textHere = ParseIndexMGWord (textHere, textEnd, mgWord);
    textHere = ParseNonindexWord (textHere, textEnd);

    // mgWord[0] is tested to skip empty words
    if (mgWord[0] > 0) {
      if (wordLevelIndex) numFrags++;

      unsigned long wordNum = perf_hash (wordHashDict, mgWord);

      /*
      cout << wordNum << " \"";
      cout.write (mgWord+1, *mgWord);
      cout << "\" " << numFrags << "\n";
      */
      
      // add this word to the inverted list
      BitPtr &wordBitPtr = bitPtrs.GetWordBitPtr (wordNum);
      unsigned long fragNum = numFrags;
      int b = 1 << wordBitPtr.lgB;
  
      mems_bitio_buffer buffer ((u_char *) ivfMemBuf, wordBitPtr.here);

      // note: this assumes that fragments don't carry over between
      // chunks (which they don't because all tags are closed at the
      // end of each document and chunks are based on document
      // boundaries), i.e. the first fragment number must be greater
      // than the starting fragment number of the chunk.
      if (fragNum > wordBitPtr.lastFragNum) {
	// first occurrence in this fragment: encode the gap since the
	// last fragment containing the word
	buffer.bblock_encode ((fragNum - wordBitPtr.lastFragNum - 1) + 1,
			      b, NULL);
	if (!wordLevelIndex) buffer.encodeBit (1); // freq = 1
	
      } else if (!wordLevelIndex) {
	// add one to the frequency count for this word
	// (step back over the terminating 1 bit and rewrite it as 0 1)
	buffer.seek (buffer.position()-1);
	buffer.encodeBit (0); // unary encoding -- last = 1
	buffer.encodeBit (1);
      }
      
      wordBitPtr.lastFragNum = fragNum;
      wordBitPtr.here = buffer.position();
      buffer.encodeDone();
      
      // check for buffer overrun
      bitPtrs.CheckWordBufOverrun (wordNum);
    }
  }
}

// combine the in memory inverted buffer with the disk
// based inverted file
//
// For every word and then every tag, in dictionary order, the entries
// encoded in ivfMemBuf for the current chunk are decoded and re-encoded
// at the end of that entry's region of the inverted file, using the B
// value and last fragment number held in the entry's state record. The
// state record is then updated with the new end position.
//
// filename - collection name used to locate the inverted file
//
// Does nothing if no documents have been processed in the current
// chunk.
static void DiskMerge (char *filename) {
  bool wordLevelIndex = ivfLevel.indexLevel.empty();

      // make sure we have something to process
  if (numChunkDocs <= 0) return;
    
  // open the inverted file
  FILE *invfFile = open_file (filename, INVF_SUFFIX, "rb+",
			      MAGIC_INVF, MG_ABORT);
  random_bitio_buffer invfOutBuf (invfFile);

  // set up to decode the entries in memory
  mems_bitio_buffer memInBuf ((u_char *) ivfMemBuf, 0);

  // write out the word information
  unsigned long wordNum;
  int b;
  unsigned long currFragNum;
  unsigned long delta;
  unsigned long currFreq;
  for (wordNum=0; wordNum<idh.word_dict_size; wordNum++) {
    // go to the end of the last inverted file entry
    // (the reference stays valid because GetRec is not called again
    // before the end of this iteration)
    InvfStateRec &wordDiskState = invfState.GetRec (wordNum);
    invfOutBuf.SEEK_X (wordDiskState.here);

    // go to the start of the inverted chunk info in memory
    BitPtr &wordBitPtr = bitPtrs.GetWordBitPtr (wordNum);
    memInBuf.seek(wordBitPtr.start);

    // decode each entry and re-write to disk
    // (in-memory gaps are relative to the start of the chunk)
    currFragNum = chunkStartFragNum;
    while (memInBuf.position() < wordBitPtr.here) {
      // decode word entry
      b = 1 << wordBitPtr.lgB;
      delta = memInBuf.bblock_decode (b, NULL);
      currFragNum += delta;
      if (!wordLevelIndex) currFreq = memInBuf.unary_decode (NULL);
      else currFreq = 1;

      // recode on disk
      // (on-disk gaps are relative to the last fragment written for
      // this word; the frequency is gamma coded rather than unary)
      invfOutBuf.bblock_encode (currFragNum-wordDiskState.lastFragNum,
				wordDiskState.B, NULL);
      if (!wordLevelIndex) invfOutBuf.gamma_encode (currFreq, NULL);
      wordDiskState.lastFragNum = currFragNum;
    }

    wordDiskState.here = invfOutBuf.TELL_X();
  }

  // write out the tag information
  unsigned long tagNum;
  unsigned long currTagStart;
  unsigned long currTagEnd;
  for (tagNum=0; tagNum<idh.tag_dict_size; tagNum++) {
    // go to the end of the last inverted file entry
    // (tag state records follow the word state records)
    InvfStateRec &tagDiskState = invfState.GetRec (tagNum+idh.word_dict_size);
    invfOutBuf.SEEK_X (tagDiskState.here);

    // go to the start of the inverted chunk info in memory
    BitPtr &tagBitPtr = bitPtrs.GetTagBitPtr (tagNum);
    memInBuf.seek(tagBitPtr.start);

    // decode each entry and re-write to disk
    currTagEnd = chunkStartFragNum;
    while (memInBuf.position() < tagBitPtr.here) {
      // decode tag entry
      // (both numbers were stored plus one; the start is relative to
      // the previous end and the end is relative to the start)
      b = 1 << tagBitPtr.lgB;
      delta = memInBuf.bblock_decode (b, NULL) - 1;
      currTagStart = currTagEnd + delta;
      delta = memInBuf.bblock_decode (b, NULL) - 1;
      currTagEnd = currTagStart + delta;

      // recode on disk
      invfOutBuf.bblock_encode (currTagStart-tagDiskState.lastFragNum+1,
				tagDiskState.B, NULL);
      invfOutBuf.bblock_encode (currTagEnd-currTagStart+1,
				tagDiskState.B, NULL);

      tagDiskState.lastFragNum = currTagEnd;
    }

    tagDiskState.here = invfOutBuf.TELL_X();
  }

  memInBuf.done();

  invfOutBuf.encodeDone();
  fclose (invfFile);
}


int process_ivf_2 (const TagInfo &/*tagInfo*/, const TextElArray &doc) {
  bool wordLevelIndex = ivfLevel.indexLevel.empty();
  bool inFrag = false;
  if (wordLevelIndex) inFrag = true; // unconditional
  
  // get next chunk information if need to. the chunk information
  // is needed before the first document is processed
  if (numChunkDocs >= numDocsInChunk) ReadChunk (idh, wordLevelIndex);

  // process each text element
  TextElArray::const_iterator here = doc.begin();
  TextElArray::const_iterator end = doc.end();
  while (here != end) {
    // process this element
    if ((*here).elType == OpenTagE) ProcessOpenTag (*here, inFrag);
    else if ((*here).elType == CloseTagE) ProcessCloseTag (*here, inFrag);
    else ProcessText (*here, inFrag);
    
    here++;
  }

  // close off any unclosed tags
  TagMapDict::iterator tdHere = tagMapDict.begin();
  TagMapDict::iterator tdEnd = tagMapDict.end();
  while (tdHere != tdEnd) {
    CloseTextTag ((*tdHere).second, (*tdHere).first);
    tdHere++;
  }
  
  // we've processed one more document
  numDocs++;
  numChunkDocs++;

  // merge the memory based inverted file with the one on
  // disk if this is the end of this chunk
  if (numChunkDocs >= numDocsInChunk) DiskMerge (collectFilename);
  
  return COMPALLOK;
}


static void CondenseInvfFile (char *filename, unsigned long &bytesOutput) {
  FILE *inInvfFile = open_file (filename, INVF_SUFFIX, "rb",
				MAGIC_INVF, MG_ABORT);
  FILE *outInvfFile = open_file (filename, INVF_SUFFIX, "rb+",
				 MAGIC_INVF, MG_ABORT);

  // skip the magic number
  fseek (outInvfFile, sizeof (unsigned long), SEEK_SET);
  
  // write the inverted file header -- use defaults for most things
  invf_file_header ifh;
  ifh.no_of_words = idh.word_dict_size;
  ifh.no_of_tags = idh.tag_dict_size;
  ifh.word_level_index = (ivfLevel.indexLevel.empty()) ? 1 : 0;
  ifh.Write (outInvfFile);

  bytesOutput = ftell (outInvfFile);

  // process each meaningful byte in the file
  unsigned long numEntries = ifh.no_of_words + ifh.no_of_tags;
  unsigned long entryNum;
  mg_ullong lastStart = 0;
  for (entryNum = 0; entryNum < numEntries; entryNum++) {
    InvfStateRec &stateRec = invfState.GetRec (entryNum);

    // overrun check
    if (stateRec.start < lastStart)
      FatalError (1, "Inverted file Buffer overrun");
    lastStart = stateRec.start;
    
    unsigned long oldEntryStart = stateRec.start >> 3;
    unsigned long oldEntryStartOver = stateRec.start & 7; // should be 0
    unsigned long oldEntryEnd = (stateRec.here + 7) >> 3;  // byte after end
    unsigned long oldEntryEndOver = stateRec.here & 7;

    fseek (inInvfFile, oldEntryStart, SEEK_SET);

    stateRec.here -= stateRec.start;
    stateRec.start = bytesOutput * 8 + oldEntryStartOver;
    stateRec.here += stateRec.start;
    while (oldEntryStart < oldEntryEnd) {
      unsigned char c = getc (inInvfFile);
      if (oldEntryStart == oldEntryEnd - 1) {
	u_char ands[8] =
	{0xff, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
	c &= ands[oldEntryEndOver];
      }
      putc (c, outInvfFile);
      bytesOutput++;
      oldEntryStart++;
    }
  }  

  fclose (inInvfFile);

#ifdef __WIN32__
  if (!(_chsize (_fileno (outInvfFile), bytesOutput)))
    Message ("Could not truncate invf.");
#else
  ftruncate (fileno (outInvfFile), bytesOutput);
#endif

  fclose (outInvfFile);
}

// Writes the inverted file index: the starting byte of every word and
// tag entry, in order, followed by the total size of the inverted file.
// Writing of the entries stops at the first failed write; the total
// size is written regardless.
static void OutputInvfIdx (char *filename, unsigned long invfNumBytes) {
  FILE *invfIdxFile = create_file (filename, INVF_IDX_SUFFIX, "wb",
				   MAGIC_INVI, MG_ABORT);

  // one index entry per word and per tag
  const unsigned long numEntries = idh.word_dict_size + idh.tag_dict_size;
  unsigned long entryNum = 0;
  while (entryNum < numEntries) {
    InvfStateRec &stateRec = invfState.GetRec (entryNum);

    // assumes that inverted entries start at beginning of each byte
    if (!WriteUL (invfIdxFile, (stateRec.start >> 3))) break;

    entryNum++;
  }

  WriteUL (invfIdxFile, invfNumBytes);

  fclose (invfIdxFile);
}


// Finishes pass 2: releases the resources used while processing
// documents, condenses the inverted file, writes its index and closes
// the state file. The first parameter is unused. Returns COMPALLOK.
int done_ivf_2 (const TagInfo &/*tagInfo*/, char *filename) {
  // the chunk file and the translation table are no longer needed
  if (chunkFile != NULL) {
    chunkBuf.done();
    fclose (chunkFile);
    chunkFile = NULL;
  }
  occurConvert.Close();

  // release the memory used while processing documents
  bitPtrs.Clear();

  // delete [] on a NULL pointer does nothing
  delete [] ivfMemBuf;
  ivfMemBuf = NULL;

  free_perf_hash (wordHashDict);
  wordHashDict = NULL;

  tagMapDict.erase (tagMapDict.begin(), tagMapDict.end());

  // condensing also writes the inverted file header and truncates
  // the file; the index is then written from the updated state
  unsigned long invfNumBytes = 0;
  CondenseInvfFile (filename, invfNumBytes);
  OutputInvfIdx (filename, invfNumBytes);

  // the state file was still needed up to this point
  invfState.Close ();

  return COMPALLOK;
}
