/**************************************************************************
 *
 * invf.h -- Data structures for inverted files
 * Copyright (C) 1999  Rodger McNab
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id: invf.h,v 1.4 2000/07/24 02:46:11 kjm18 Exp $
 *
 **************************************************************************/


#ifndef H_INVF
#define H_INVF

#include <stdio.h>

#include "UCArray.h"

// NOTE: This does not include the magic number
// header info for .invf.dict file
struct invf_dict_header {
  unsigned long lookback;
  unsigned long word_dict_start;
  unsigned long word_dict_size;
  unsigned long tag_dict_start;
  unsigned long tag_dict_size;
  unsigned long num_docs;
  unsigned long num_frags;
  unsigned long num_words;
  unsigned long total_bytes;
  unsigned long index_string_bytes;
  unsigned long num_levels;

  invf_dict_header ();
  virtual ~invf_dict_header ();
  virtual void Clear();

  virtual bool Read (FILE *f);
  virtual bool Write (FILE *f) const;
};


struct dict_el {
  UCArray el; // word or tag
  unsigned long frag_occur;
  unsigned long freq;

  virtual void Clear ();
  dict_el () { Clear (); }
  virtual ~dict_el () { }

  // Read assumes that the last word is in el
  bool Read (FILE *f);
  bool Write (FILE *f, const UCArray *lastEl) const;
};


struct word_dict_el : public dict_el {
  unsigned long *levelFreqs;

  void Clear ();
  word_dict_el () { levelFreqs = NULL; Clear (); }
  ~word_dict_el ();

  void SetNumLevels (unsigned long numLevels);

  // SetNumLevels should be called before either
  // reading or writing using Read and Write
  
  // Read assumes that the last word is in el
  bool Read (FILE *f, unsigned long numLevels);
  bool Write (FILE *f, const UCArray *lastEl,
	      unsigned long numLevels) const;
};


// wblk = word block
// tblk = tag block
// this version of the blocked dictionary uses a fixed number
// of entries per block, not a fixed block size
// info for .invf.dict.blocked file
// blocked dict has a heap of blocks, some for words, some for tags
// and an index into each set of blocks. The index has pointers to
// the first entry in each block. Can do a binary search on the index
// to find out which block an elemnet is in
struct block_dict_header : public invf_dict_header {
  // note: word_dict_start and tag_dict_start are undefined
  // for blocked dictionaries

  unsigned long entries_per_wblk; // word blocks
  unsigned long num_wblks;
  unsigned long max_wblk_size;
  unsigned long wblk_start;
  unsigned long wblk_idx_start;
  
  unsigned long entries_per_tblk; // tag blocks
  unsigned long num_tblks;
  unsigned long max_tblk_size;
  unsigned long tblk_start;
  unsigned long tblk_idx_start;
  
  block_dict_header ();
  void Clear ();
  
  bool Read (FILE *f);
  bool Write (FILE *f) const;
};


struct block_dict_el {
  UCArray el; // word or tag
  unsigned long frag_occur; // # entries in invf file - if have a
  // word level index, this is the same as freq, otherwise, its the number
  // of fragments containing this word
  unsigned long freq; // # of times this word occurs
  unsigned long invf_ptr; // pointer into inverted file

  virtual void Clear ();
  block_dict_el () { Clear (); }
  virtual ~block_dict_el () { }

  // Read assumes that the last word is in el
  // set lastEl = NULL when no lookback is wanted (eg
  // for the start of a block
  bool Read (FILE *f);
  bool Write (FILE *f, const UCArray *lastEl) const;
};

struct word_block_dict_el : public block_dict_el {
  unsigned long *levelFreqs; // freq of the word at each level

  void Clear ();
  word_block_dict_el () { levelFreqs = NULL; Clear (); }
  ~word_block_dict_el ();

  void SetNumLevels (unsigned long numLevels);

  // SetNumLevels should be called before either
  // reading or writing using Read and Write
  
  // Read assumes that the last word is in el
  bool Read (FILE *f, unsigned long numLevels);
  bool Write (FILE *f, const UCArray *lastEl,
	      unsigned long numLevels) const;
};

typedef vector<word_block_dict_el> word_block_dict_el_array;

struct block_idx_info {
  UCArray el;
  unsigned long block_ptr;

  block_idx_info ();
  void Clear ();

  bool Read (FILE *f);
  bool Write (FILE *f) const;
};

// used for an index into the word and tag blocks
typedef vector<block_idx_info> block_idx;

bool ReadBlockIdx (FILE *f, block_idx &blockIdx);
bool WriteBlockIdx (FILE *f, const block_idx &blockIdx);



struct stem_idx_header {
  unsigned long lookback;
  unsigned long dict_size;

  unsigned long entries_per_block;
  unsigned long num_blocks;
  unsigned long max_block_size;
  unsigned long blocks_start;
  unsigned long block_idx_start;
  
  unsigned long stemmer_num;
  unsigned long stem_method;

  stem_idx_header ();
  void Clear ();

  bool Read (FILE *f);
  bool Write (FILE *f) const;
};

struct stem_block_dict_el {
  UCArray el; // word or tag
  vector<unsigned long> equivWords;
  
  stem_block_dict_el ();
  void Clear ();

  // Read assumes that the last word is in el
  // set lastEl = NULL when no lookback is wanted (eg
  // for the start of a block
  bool Read (FILE *f);
  bool Write (FILE *f, const UCArray *lastEl) const;
};



#define SKIP_MODE_NO_SKIPS 0

// invf file - has a list of frags for each word, but the word is not 
//  stored in the invf file - the dictionaries store the words, along
// with num entries, and a pointer into invf file
struct invf_file_header {
  unsigned long no_of_words;
  unsigned long no_of_tags;
  unsigned long skip_mode;
  unsigned long word_level_index; // 1 if word level index
  unsigned long params[16];

  invf_file_header ();
  void Clear ();
  
  bool Read (FILE *f);
  bool Write (FILE *f) const;
};






// the search functions returns true if a block that could
// satisfy the request is found. these functions assume that
// the block index is sorted by DictCompare (or DictLTUCArray)
bool SearchElNum (const block_idx &bIdx,
		  unsigned long entriesPerBlock,
		  unsigned long elNum,
		  unsigned long &blockIdxNum,
		  unsigned long &blockStartElNum);
bool SearchEl (const block_idx &bIdx,
	       unsigned long entriesPerBlock,
	       const UCArray &el,
	       unsigned long &blockIdxNum,
	       unsigned long &blockStartElNum);


// The next six functions use SearchElNum and SearchEl
// for a particular type of dictionary (Block, WordBlock, or StemBlock)
// and then look up the entry
bool SearchBlockDictElNum (FILE *dictFile,
			   const block_idx &bIdx,
			   unsigned long entriesPerBlock,
			   unsigned long dictSize,
			   unsigned long elNum,
			   block_dict_el &dictEl);
bool SearchBlockDictEl (FILE *dictFile,
			const block_idx &bIdx,
			unsigned long entriesPerBlock,
			unsigned long dictSize,
			const UCArray &el,
			block_dict_el &dictEl,
			unsigned long &elNum);

// assumes the numLevels has been set for dictEl
bool SearchWordBlockDictElNum (FILE *dictFile,
			       const block_idx &bIdx,
			       unsigned long entriesPerBlock,
			       unsigned long dictSize,
			       unsigned long numLevels,
			       unsigned long elNum,
			       word_block_dict_el &dictEl);
bool SearchWordBlockDictEl (FILE *dictFile,
			    const block_idx &bIdx,
			    unsigned long entriesPerBlock,
			    unsigned long dictSize,
			    unsigned long numLevels,
			    const UCArray &el,
			    word_block_dict_el &dictEl,
			    unsigned long &elNum);

bool SearchStemBlockDictElNum (FILE *dictFile,
			       const block_idx &bIdx,
			       unsigned long entriesPerBlock,
			       unsigned long dictSize,
			       unsigned long elNum,
			       stem_block_dict_el &dictEl);
bool SearchStemBlockDictEl (FILE *dictFile,
			    const block_idx &bIdx,
			    unsigned long entriesPerBlock,
			    unsigned long dictSize,
			    const UCArray &el,
			    stem_block_dict_el &dictEl,
			    unsigned long &elNum);

//----------------------------------------------------------

// new functions for full text browse

bool NearestSearchWordBlockDictEl (FILE *dictFile,
			    const block_idx &bIdx,
			    unsigned long entriesPerBlock,
			    unsigned long dictSize,
			    unsigned long numLevels,
			    const UCArray &el,
			    word_block_dict_el &dictEl,
			    unsigned long &elNum);

// returns a list of word_block_dict_el, with no levelfreqs
bool SearchWordBlockDictElNumRange (FILE *dictFile,
				    const block_idx &bIdx,
				    unsigned long entriesPerBlock,
				    unsigned long dictSize,
				    unsigned long numLevels,
				    unsigned long elNum,
				    unsigned long numWords,
				    word_block_dict_el_array &terms);
				    
// just returns a list of terms
bool SearchWordBlockDictElNumRange (FILE *dictFile,
				    const block_idx &bIdx,
				    unsigned long entriesPerBlock,
				    unsigned long dictSize,
				    unsigned long numLevels,
				    unsigned long elNum,
				    unsigned long numWords,
				    UCArrayVector &terms);
				    


#endif
