/**********************************************************************
 *
 * suffix.cpp -- Extract the repeated phrases in the input using 
 *               suffix and prefix arrays.
 *
 * Copyright 2000 Gordon W. Paynter
 * Copyright 2000 The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/

#include <assert.h>
#include <fstream.h>
#include <iostream.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <algo.h>
#include <heap.h>
#include <vector.h>

#include "suffix.h"
#include "phrase.h"



// Global variables declared in suffix.h

// The number of symbols read from the clauses.numbers file
// (set by readNumbers).
cellcount inputLength;

// symbols holds the input itself.  suffixArray and prefixArray each hold
// one pointer to every cell of symbols, sorted with suffixCompare and
// prefixCompare respectively.  suffixCheck and prefixCheck have one cell
// per array entry and are scratch space for getMinimalExpansions.
symbol   *symbols;
symbol  **suffixArray;
check    *suffixCheck;
symbol  **prefixArray;
check    *prefixCheck;


// How many documents are in this collection?
// (the number of DOCUMENTSTART symbols counted by readNumbers)
cellcount numberOfDocuments;
// Pointers to the DOCUMENTSTART symbols, sorted by address (built in main)
symbol  **documentArray;

// Do we accept any phrase (ANYPHRASE), or do we restrict phrases
// using the stopword list (STOPWORDS)?
int phraseMode = ANYPHRASE; //STOPWORDS;

// The filestem of the collection's phindex directory
char collection[FILENAME_MAX];

// Comparison functions passed to qsort
int suffixCompare(const void *, const void *);
int prefixCompare(const void *, const void *);
int pointerCompare(const void *, const void *);

// Input file readers
int readNumbers();
void readStatistics();

// Phrase expansion and document lookup
void getMinimalExpansions(Phrase &p, vector<Phrase> &results);
cellcount getDocumentOccurrances(Phrase &p, cellcount *frequency);

// The ranges of the stopword and content-word symbols for the collection
// (read from the clauses.stats file by readStatistics)
symbol firstStopSymbol = 0;
symbol lastStopSymbol = 0;
symbol firstContentSymbol = 0;
symbol lastContentSymbol = 0;




// Phrase memory 
// We have to "remember" each phrase that we've expanded
void initialisePhraseMemory();
void rememberThisPhrase(cellindex index, cellcount length);
bool isPhraseStored(cellindex index, cellcount length);
void deletePhraseMemory();


// how much output do we want?
// 0 suppresses the progress messages; values above 1 and above 2
// enable progressively more detail.
int verbosity = 1;


int main (int argc, char * argv[]) {

  // Command-line arguments
  // argv[1] is the phindex directory
  // argv[2] is the maximum array symbol length (optional) 
  // argv[3] is the mode, where 1 is stopword mode (optional) 
  if (argc < 2) {
    cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl;
    exit(1);
  }

  // collection directory
  strcpy(collection, argv[1]);

  // mode parameter
  phraseMode = atoi(argv[2]);
  assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE));

  // optional verbosity parameter
  if (argc == 4) {
    verbosity = atoi(argv[3]);
    assert (verbosity >= 0);
  }

  if (verbosity) {
    cout << "Suffix phrase extraction program" << endl;
  }

  if (verbosity > 1) {
    if (phraseMode == STOPWORDS) {
      cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;
    } else {
      cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;
    }
  }

  // Read the statistics file 
  readStatistics();

  // Read the numbers file
  readNumbers();

  // Create the suffix & prefix arrays
  suffixArray = new (symbol *)[inputLength];
  prefixArray = new (symbol *)[inputLength];
  suffixCheck = new (check)[inputLength];
  prefixCheck = new (check)[inputLength];
  if (prefixCheck == NULL) {
    cerr << "Suffix error: not enough memory to hold " << inputLength
	 << " symbols." << endl;
    exit(2);
  }  

  // Initialise prefix and suffix arrays
  for (cellcount j = 0; j < inputLength; j++) {
    suffixArray[j] = &symbols[j];
    prefixArray[j] = &symbols[j];
  }
  qsort(suffixArray, inputLength, sizeof(symbol *), suffixCompare);
  qsort(prefixArray, inputLength, sizeof(symbol *), prefixCompare);


  // Create the document arrays
  if (numberOfDocuments == 0) {
    cerr << "There are no documents in this collection!" << endl;
    exit(1);
  }
  if (verbosity > 1) {
    cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;
  }

  // The document frequecy array is used to count the number of times
  // each phrase occurs in each document.  The number of documents in
  // which a phrase occurs is stored in df.
  frequency documentFrequency[numberOfDocuments];
  frequency df;

  // documentArray will be searched in order to discover which document
  // each phrase occurs in.
  documentArray = new (symbol *)[numberOfDocuments];  

  // Discover all the DOCUMENTSTART symbols and store as a phrase
  cellindex d = 0;
  while (*suffixArray[d] != DOCUMENTSTART) {
    d++;
  }
  Phrase p(suffixArray[d], 1, SUFFIX);
  p.findFirstAndLastSuffix(d, inputLength-1);
  
  // Insert the document locations (as pointers) into documentArray
  for (cellcount i = 0; i < p.suffixFrequency; i++) {
    documentArray[i] = suffixArray[i + p.firstSuffixIndex];
  }
  
  // Sort the document array into ascending order of raw pointer value
  qsort(documentArray, numberOfDocuments, sizeof(symbol *), pointerCompare);


  // Extract phrases
  //
  // We will make several passesover the data, in each case considering
  // a set of input phrases and generating a set of output phrases, which
  // we will expancd in later passes.
  //
  // The input phrases in the first pass will be the vocabulary.
  // In later passes, the input phrases will be the output phrases of the
  // previous pass.
  //
  // In each pass we will consider each input phrase in turn.  If we
  // have seen it before, we will ignore it.  Otherwise, we will expand
  // it and add its expansions to the set of output phrases.

  // Store the phrase data in the phrases file
  char phraseDataName[FILENAME_MAX];
  sprintf(phraseDataName, "%s/phrases", collection);
  ofstream phraseData(phraseDataName, ios::out);
  if (!phraseData) {
    cout << "File " << phraseDataName << " could not be opened\n";
    exit(1);
  }

  // Count the number of phrases output
  unsigned long int phraseCounter = 0;

  // Set up the phrase expansion memory.
  // We need this so that we don't expand a phrase more than once
  initialisePhraseMemory();

  // The current pass numebr
  int phrasePass = 1;


  // PASS NUMBER 1
  if (verbosity > 1) {
    cout << "Starting pass " << phrasePass << endl;
  }

  // We need an input file, for phrases we are about to examine, and an 
  // output file, for phrases still to come.
  ifstream inPhrase;
  char     inPhraseName[FILENAME_MAX];
  ofstream outPhrase;
  char     outPhraseName[FILENAME_MAX];
  unsigned long int outPhraseCounter = 0;

  // On the first pass, simply work through the vocabulary
  sprintf(outPhraseName, "%s/outPhrase.1", collection);
  outPhrase.open(outPhraseName, ios::out);
  if (!outPhrase) {
    cerr << "File " << outPhraseName << " could not be opened\n";
    exit(1);
  }

  // Iterate over the different symbols by working through the suffix array
  vector<Phrase> result;
  cellindex i = 0;
  char *tmpString;

  while (i < inputLength) {

    // make a new phrase of length 1
    p = Phrase(suffixArray[i], 1, SUFFIX);
    p.findFirstAndLastSuffix(i, inputLength-1);

    // cout << "cell " << i << " - " << p.toString() << endl;

    // We ignore this symbol if it occurs only once, if it is a delimiter,
    // of if we are in stopwords mode and it is a stopword
    //
    // We could imagine a new mode/command-line option, which is like 
    // STOPWORDS but without this restrictrion.  This would let you browse
    // from "the" to "the AGRIS" for example, but not from "AGRIS" to
    // "the AGRIS" (where the is a stopword and AGRIS a content word).
    // The system used to work like this; it is easy to implement, but
    // it explodes the size of the indexes.  So: would it be useful?  
    if (!((p.suffixFrequency <= 1) ||
	  // (*suffixArray[i] != 23054) ||
	  (*suffixArray[i] <= LASTDELIMITER) ||
	  ((phraseMode == STOPWORDS) && (*suffixArray[i] <= lastStopSymbol)))) {

      // Get minimal expansions of the phrase
      getMinimalExpansions(p, result);
      
      if (!result.empty()) {
	
	// Remember that we have expanded this phrase
	rememberThisPhrase(i, 1);

	// write the phrase text
	tmpString = p.toString();
	phraseData << i << "-1:" << tmpString << ":" << p.suffixFrequency << ":"
		   << result.size() << ":";
	delete [] tmpString;

	// write the results
	for (cellcount i = 0; i < result.size(); i++) {
	  if (i) {
	    phraseData << ",";
	  }
	  phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
	  outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
	  outPhraseCounter++;
	}
	result.clear();
	
	// Write the documents in which this phrase occurs
	df = getDocumentOccurrances(p, documentFrequency);
	phraseData << ":" << df << ":";

	// write the documents
	for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {
	  if (documentFrequency[i]) {
	    if (first) {
	      first = 0;
	    } else {
	      phraseData << ";";
	    }
	    // Output the document number.  Note that here we've numbered the 
	    // N documents from 0 to N-1, but later they'll be 1-N.  Thus we
	    // add 1 to the document id when we output it.
	    phraseData << "d" << (i+1);
	    // Next, output the frequency with which the document occurs, but
	    // only if it is > 1.
	    if (documentFrequency[i] > 1) {
	      phraseData << "," << documentFrequency[i];
	    }
	  }
	}

	phraseData << endl;
	phraseCounter++;

	// feedback 
	if (verbosity) {
	  if (phraseCounter % 1000 == 0) {
	    tmpString = p.toString();
	    cout << "phrase " << phraseCounter << ": "
		 << "cell " << p.firstSuffixIndex << " - " << tmpString << endl;
	    delete [] tmpString;
	  }
	}
      }
    }
   i = p.lastSuffixIndex + 1;
  }
  outPhrase.close();

  // REMAINING PASSES
  // The previous outPhrase file forms the input to each new pass
  cellcount start, length;
  while (outPhraseCounter > 0) {

    // Start a new pass
    phrasePass++;
    if (verbosity) {
      cout << "Starting pass " << phrasePass << endl;
    }

    // Open the input file
    sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
    inPhrase.open(inPhraseName, ios::in);
    if (!inPhrase) {
      cerr << "File " << inPhraseName << " could not be opened\n";
      exit(1);
    }

    // Open the output file
    sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
    outPhrase.open(outPhraseName, ios::out);
    if (!outPhrase) {
      cerr << "File " << outPhraseName << " could not be opened\n";
      exit(1);
    }
    outPhraseCounter = 0;

    // Process each phrase
    while(inPhrase >> start >> length) {

      // Ignore the phrase if we have expanded it before
      if (isPhraseStored(start, length)) {
	continue;
      }

      // Remember that we have examined this phrase
      rememberThisPhrase(start, length);

      // Find the phrase in the suffixarray
      p = Phrase(suffixArray[start], length, SUFFIX);
      p.findFirstAndLastSuffix(start, inputLength-1);

      // cout << "index " << start << ", length " << length << " - "  <<  p.toString() << endl;
      

      // Ignore the phrase if it only occurs once
      if (p.suffixFrequency < 2) {
	continue;
      }


      // Write the phrase text	tmpString = p.toString();
      tmpString = p.toString();
      phraseData << start << "-" << length << ":" << tmpString << ":" 
		 << p.suffixFrequency << ":";
      delete [] tmpString;
	

      // Expand the phrase, if it is fewer than 8 words long
      if (length <= 8) {

	// Get the minimal expansions for this phrase
	getMinimalExpansions(p, result);
      
	// write the results
	phraseData << result.size() << ":";

	for (cellcount i = 0; i < result.size(); i++) {
	  if (i) {
	    phraseData << ",";
	  }
	  phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
	  outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
	  outPhraseCounter++;
	}
	result.clear();
	
      } else {
	// phrase is too long to expand further
	phraseData << "0:";
      }

	
      // Write the documents in which this phrase occurs
      df = getDocumentOccurrances(p, documentFrequency);
      phraseData << ":" << df << ":";

      // write the documents
      for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {
	if (documentFrequency[i]) {
	  if (first) {
	    first = 0;
	  } else {
	    phraseData << ";";
	  }
	  // Output the document number.  Note that here we've numbered the 
	  // N documents from 0 to N-1, but later they'll be 1-N.  Thus we
	  // add 1 to the document id when we output it.
	  phraseData << "d" << (i+1);
	  // Next, output the frequency with which the document occurs, but
	  // only if it is > 1.
	  if (documentFrequency[i] > 1) {
	    phraseData << "," << documentFrequency[i];
	  }
	}
      }
      
      phraseData << endl;
      phraseCounter++;

      // feedback 
      if (verbosity) {
	if (phraseCounter % 1000 == 0) {
	  tmpString = p.toString();
	  cout << "phrase " << phraseCounter << ": "<< "start " << start 
	       << ", length " << length << " - " << tmpString << endl;
	  delete [] tmpString;
	}
      }

    }

    inPhrase.close();
    outPhrase.close();
  }
    
  phraseData.close();
  deletePhraseMemory();

  delete [] symbols;
  delete [] suffixArray;
  delete [] prefixArray;
  delete [] suffixCheck;
  delete [] prefixCheck;
  delete [] documentArray;


  
  cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
  return 0;
}


// Get Minimal Expansions
//
// Find the set of "minimal" expansions of a phrase p, using the
// algorithm described in the documentation, and append each of them
// to the results vector.
//
// The global suffixCheck and prefixCheck arrays are cleared on entry.
// Whenever an expansion is accepted, the cells it covers in both arrays
// are set to its length; any later candidate whose first suffix or
// prefix cell is already marked is discarded.

void getMinimalExpansions(Phrase &p, vector<Phrase> &results) {

  // Step 1: wipe the check arrays
  for (cellcount cell = 0; cell < inputLength; cell++) {
    suffixCheck[cell] = 0;
    prefixCheck[cell] = 0;
  }

  // Step 2: gather the initial candidates from both arrays and arrange
  // them as a heap ordered by isLonger
  vector<Phrase> pending;
  p.initialSuffixCandidates(pending);
  p.initialPrefixCandidates(pending);
  make_heap(pending.begin(), pending.end(), isLonger);

  // Step 3: confirm the candidates one at a time, growing them as needed
  while (!pending.empty()) {

    // Take the candidate at the top of the heap
    pop_heap(pending.begin(), pending.end(), isLonger);
    Phrase current = pending.back();
    pending.pop_back();

    // Decide which side may still need extending.  A candidate without
    // a unique right (suffix) extension is examined on its left side;
    // otherwise one without a unique left (prefix) extension is examined
    // on its right side.  A candidate with both is not processed.
    bool examineLeft;
    if (!current.hasUniqueSuffixExtension()) {
      current.ensurePrefixFound();
      examineLeft = true;
    } else if (!current.hasUniquePrefixExtension()) {
      current.ensureSuffixFound();
      examineLeft = false;
    } else {
      continue;
    }

    // Skip the candidate if a subphrase has already been used instead
    if (suffixCheck[current.firstSuffixIndex] || prefixCheck[current.firstPrefixIndex]) {
      continue;
    }

    // If the examined side has a unique extension, extend the candidate
    // by one symbol and return it to the heap
    if (examineLeft) {
      if (current.hasUniquePrefixExtension()) {
        current.expandUniquePrefixExtensionByOne();
        pending.push_back(current);
        push_heap(pending.begin(), pending.end(), isLonger);
        continue;
      }
    } else {
      if (current.hasUniqueSuffixExtension()) {
        current.expandUniqueSuffixExtensionByOne();
        pending.push_back(current);
        push_heap(pending.begin(), pending.end(), isLonger);
        continue;
      }
    }

    // Otherwise the candidate is a result: record it and mark every
    // suffix and prefix cell it covers with its length
    results.push_back(current);
    for (cellcount s = current.firstSuffixIndex; s <= current.lastSuffixIndex; s++) {
      suffixCheck[s] = current.length;
    }
    for (cellcount q = current.firstPrefixIndex; q <= current.lastPrefixIndex; q++) {
      prefixCheck[q] = current.length;
    }
  }
}


// suffixCompare
//
// Compare two pointers into a suffix array.  We use this in the 
// qsort function, so the input are pointers to pointers.  
//
// Return -1 if (a < b), otherwise (a > b) so return +1, 

int suffixCompare(const void *cpa, const void *cpb) {

  // Cast then dereference pointers to suffix array elements
  symbol *pa = (symbol *) cpa;
  symbol *pb = (symbol *) cpb;
  pa = (symbol *) *pa;
  pb = (symbol *) *pb;

  // If the two elements are the same, examine the next one
  while (*pa == *pb) {
    *pa++;
    *pb++;
  }

  // Make the copmparison and return
  if ( *pa < *pb) {
    return -1;
  } else {
    return +1;
  }
}


// prefixCompare
//
// Compare two pointers into a prefix array.  We use this in the 
// qsort function, so the input are pointers to pointers.  
//
// Return -1 if (a > b), otherwise (a < b) so return +1, 

int prefixCompare(const void *cpa, const void *cpb) {

  // Cast then dereference pointers to prefix array elements
  symbol *pa = (symbol *) cpa;
  symbol *pb = (symbol *) cpb;
  pa = (symbol *) *pa;
  pb = (symbol *) *pb;

  // If the two elements are the same, examine the next one
  while (*pa == *pb) {
    *pa--;
    *pb--;
  }

  // Make the copmparison and return
  if ( *pa > *pb) {
    return -1;
  } else {
    return +1;
  }
}

// simpleCompare
//
// Compare two pointers based on the memory location they point to.

int pointerCompare( const void *pa, const void *pb ) {

  symbol **a = (symbol **) pa;
  symbol **b = (symbol **) pb;

  if (*a < *b) {
    return  -1;
  } else if (*a > *b) {
    return 1;
  } else {
    return 0;
  }
}


// Read the clauses.numbers file into the "symbols" array.
//
// Each number in the file is a symbol number; it is essential that 
// the first symbol (and no others) be COLLECTIONSTART and the last
// symbol (and no others) be COLLECTIONEND.
//
// Return the number of numbers in the array.

int readNumbers() {

  char filename[FILENAME_MAX];
  sprintf(filename, "%s/clauses.numbers", collection);
  if (verbosity) {
    cout << "Reading numbers file: " << filename << endl;
  }

  // Open the numbers file
  ifstream inFile(filename, ios::in);
  if (!inFile) {
    cerr << "File " << filename << " could not be opened\n";
    exit(1);
  }

  // Count the number of symbols
  inputLength = 0;
  symbol word;
  while (inFile >> word) {
    inputLength++;
  }
  inFile.close();

  // Allocate the symbbols array
  if (verbosity > 1) {
    cout << "Allocating symbol arrays for " << inputLength << " symbols" << endl;
  }
  symbols = new (symbol)[inputLength];
  if (symbols == NULL) {
    cerr << "Suffix error: not enough memory to hold " << inputLength
	 << " symbols." << endl;
    exit(2);
  }

  // Read the numbers file into the numbers array
  if (verbosity > 2) {
    cout << "Reading the numbers" << endl;
  }
  inFile.open(filename, ios::in);
  cellcount next = 0;
  numberOfDocuments = 0;
  while (inFile >> word) {
    symbols[next++] = word;
    if (word == DOCUMENTSTART) {
      numberOfDocuments++;
    }
  }
  inFile.close();
  
  // Make sure the numbers file is intact
  assert(symbols[0] == COLLECTIONSTART);
  assert(symbols[next-1] == COLLECTIONEND);

  return inputLength;
}



// Get Document Occurrance statistics
//
// Given a phrase, what documents does it occur in?
//
// On return, frequency[d] holds the number of occurrences of p that
// fall in document d, for each d from 0 to numberOfDocuments - 1.  The
// array must have room for numberOfDocuments cells; it is cleared
// before counting starts.
//
// Document d owns every address from documentArray[d] up to, but not
// including, documentArray[d+1]; the last document owns everything from
// its start onwards.  An occurrence at an address below documentArray[0]
// is counted against document 0.
//
// Return the number of documents in which the phrase occurs at least
// once.

cellcount getDocumentOccurrances(Phrase &p, cellcount *frequency) {

  // The number of documents in which this phrase occurs
  cellcount df = 0;

  // Initialise the document frequency array
  for (cellindex doc = 0; doc < numberOfDocuments; doc++) {
    frequency[doc] = 0;
  }

  // With no documents there is nothing to count against
  if (numberOfDocuments == 0) {
    return df;
  }

  // search for the document in which each occurrence of the phrase is found
  for (cellcount i = p.firstSuffixIndex; i <= p.lastSuffixIndex; i++) {

    symbol *target = suffixArray[i];

    // Binary chop on documentArray, which is sorted by address, to
    // count the document starts at or before target.  On every
    // iteration, all cells below low are <= target and all cells from
    // high upwards are > target.  Using a half-open interval means no
    // index is ever decremented below zero.
    cellindex low = 0;
    cellindex high = numberOfDocuments;
    while (low < high) {
      cellindex mid = low + (high - low) / 2;
      if (documentArray[mid] <= target) {
        low = mid + 1;
      } else {
        high = mid;
      }
    }

    // The owning document is the last one that starts at or before
    // target; fall back to the first document if none does.
    cellindex d = (low > 0) ? (low - 1) : 0;

    if (frequency[d] == 0) {
      df++;
    }
    frequency[d]++;
  }
  return df;
}






// phraseExpansionMemory : Which phrases have we expanded?
//
// A set of utilities for keeping track of which phrases we have expanded.
// We don't want to expand a phrase more than once, after all.
//
// This REALLY ought to be in its own class, but it works so that's okay.
//
// Phrases are identified by their firstSuffixPosition and length.
//
// Functions provided are:
//       void initialisePhraseMemory()
//       void rememberThisPhrase(index, length)
//       bool isPhraseStored(index, length)
//       void deletePhraseMemory()
//
// Internally, we will have two separate cases:
//
// Phrases of length 1-8: 
//       unsigned char phraseMemory[inputLength]
// is an array where each cell "remembers" the corresponding index in the 
// suffixArray, and each of the 8 bits of the cell correspond to the phrases 
// of length 1, 2... 8.  
// Eventually, we will make this disk-based (i.e. store the array in a file).
//
// Phrases of length 9+:
//       file hashTableFile
//       file listOfEntries
// The first file is a hash table; each phrase maps to one of its cells, which
// contains either 0 (empty, no occurrence) or a number which is an entry number 
// in the second file.  This file contains a "list" of entries.  Each consists of 
// three numbers, stored in this order: the entry number of the next phrase with
// the same hash (0 if there is none), the suffixArray index of the phrase, and
// the length of the phrase.
//


// One byte per suffix array cell, allocated by initialisePhraseMemory.
// Bit (length - 1) of phraseMemory[index] is set once the phrase with
// that index and length (1 to 8) has been remembered.
unsigned char *phraseMemory;

// The file-based equivalents used for phrases longer than 8 symbols;
// they are defined further down in this file.
void initialiseLongPhraseMemory();
void rememberThisLongPhrase(cellindex index, cellcount length);
bool isLongPhraseStored(cellindex index, cellcount length);
void deleteLongPhraseMemory();


// initialisePhraseMemory
//
// Allocate and clear the in-memory table used for phrases of length
// 1 to 8 (one byte per cell, inputLength cells), then set up the
// file-based store used for longer phrases.
//
// inputLength must already have been set.

void initialisePhraseMemory() {

  // Plain array-new; the parenthesised form "new (type)[n]" is not
  // standard C++ and is rejected by current compilers.
  phraseMemory = new unsigned char[inputLength];

  // to begin with, everything is empty
  memset(phraseMemory, 0, inputLength * sizeof(unsigned char));

  // initialise the hashTable of long phrases
  initialiseLongPhraseMemory();

}

// rememberThisPhrase
//
// Record that the phrase identified by index and length has been seen.
// Phrases of up to 8 symbols are recorded as a single bit in
// phraseMemory[index]; longer phrases are handed to the file-based
// store.

void rememberThisPhrase(cellindex index, cellcount length) {

  if (length <= 8) {

    // Build a mask holding only the bit for this length: start at the
    // lowest bit and move it up one place for each symbol beyond the
    // first
    unsigned char mask = 1;
    cellcount step = 1;
    while (step < length) {
      mask <<= 1;
      step++;
    }

    // Switch that bit on in the cell for this index
    phraseMemory[index] |= mask;

  } else {
    // very long phrases use the file-based system
    rememberThisLongPhrase(index, length);
  }
}


// isPhraseStored
//
// Report whether the phrase identified by index and length has been
// remembered.  Phrases of up to 8 symbols are looked up as a single bit
// in phraseMemory[index]; longer phrases are looked up in the
// file-based store.

bool isPhraseStored(cellindex index, cellcount length) {

  if (length <= 8) {

    // Build a mask holding only the bit for this length: start at the
    // lowest bit and move it up one place for each symbol beyond the
    // first
    unsigned char mask = 1;
    cellcount step = 1;
    while (step < length) {
      mask <<= 1;
      step++;
    }

    // The phrase is stored exactly when that bit is on in the cell
    // for this index
    return (phraseMemory[index] & mask) != 0;
  }

  // very long phrases use the file-based system
  return isLongPhraseStored(index, length);
}

// deletePhraseMemory
//
// Release the in-memory table for short phrases and shut down the
// file-based store for long phrases.

void deletePhraseMemory() {
  // phraseMemory was allocated with new[], so it must be released
  // with delete[]
  delete [] phraseMemory;
  phraseMemory = NULL;
  deleteLongPhraseMemory();
}



// Files etc used to store "long" equivalents of the above

// The hash table file and its name
fstream hashTableFile;
char    hashTableFileName[FILENAME_MAX];
// The file holding the lists of entries, and its name
fstream listOfEntries;
char    listOfEntriesName[FILENAME_MAX];
// The entry number that the next new entry in listOfEntries will be given
cellindex nextEntryNumber;

// The number of cells in the hash table
const cellcount bigPrime = 7919;


void initialiseLongPhraseMemory() {

  cellindex example = 0;

  sprintf(hashTableFileName, "%s/hashTable", collection);
  sprintf(listOfEntriesName, "%s/hashLists", collection);


  // create the new hashtable
  if (verbosity > 1) {
    cout << "Initialising hashTable: " << hashTableFileName << endl;
  }
  hashTableFile.open(hashTableFileName, ios::in | ios::out);
  for (cellcount i = 0; i < bigPrime; i++) {
    hashTableFile.write((char *) &example, sizeof(example));
  }

  // create the list of phrases
  if (verbosity > 1) {
    cout << "Initialising list of hashtable entries: " << listOfEntriesName << endl;
  }
  listOfEntries.open(listOfEntriesName, ios::in | ios::out);
  listOfEntries.write((char *) &example, sizeof(example));
  listOfEntries.write((char *) &example, sizeof(example));
  listOfEntries.write((char *) &example, sizeof(example));
  nextEntryNumber = 1;
}


// rememberThisLongPhrase
//
// Record the phrase identified by index and length in the file-based
// store, unless it is already there.
//
// The phrase hashes to cell (index + length) % bigPrime of the hash
// table.  That cell holds the number of the first entry in a chain of
// entries in listOfEntries, or 0 if the chain is empty.  Each entry is
// three cellindex values: the number of the next entry in the chain
// (0 at the end), the phrase index, and the phrase length.

void rememberThisLongPhrase(cellindex index, cellcount length) {

  cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
  cellindex pointer = 0;
  cellindex zero = 0;
  cellindex readp = 0;
  cellindex readi = 0;
  cellindex readl = 0;

  // Every field in the file is one cellindex wide, so write the length
  // from a variable of exactly that type.
  cellindex storedLength = length;

  hashTableFile.seekg(hashOffset);
  hashTableFile.read((char *) &pointer, sizeof(cellindex));

  if (pointer == 0) {
    // There is no entry at all in the hash table for this entry
    // so create one

    pointer = nextEntryNumber++;
    hashTableFile.seekp(hashOffset);
    hashTableFile.write((char *) &pointer, sizeof(cellindex));
        
    listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
    listOfEntries.write((char *) &zero, sizeof(cellindex));
    listOfEntries.write((char *) &index, sizeof(cellindex));
    listOfEntries.write((char *) &storedLength, sizeof(cellindex));
    return;
  }

  // There is a list starting at this hash value, so the phrase may
  // be already remembered, or it might need to be appended
  while (pointer != 0) {

    // Read the entry pointed to by pointer
    listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
    listOfEntries.read((char *) &readp, sizeof(cellindex));
    listOfEntries.read((char *) &readi, sizeof(cellindex));
    listOfEntries.read((char *) &readl, sizeof(cellindex));

    // Both the index and the length must match.  (This must be a
    // comparison: an assignment here would accept any entry with the
    // same index, whatever its length.)
    if ((readi == index) && (readl == storedLength)) {
      // we've found that we've already stored it
      return;
    }

    if (readp == 0) {
      // we've reached the end of the list.  Add a new entry: first
      // point the last entry at it, then write the entry itself.
      listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
      listOfEntries.write((char *) &nextEntryNumber, sizeof(cellindex));
      pointer = nextEntryNumber++;

      listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
      listOfEntries.write((char *) &zero, sizeof(cellindex));
      listOfEntries.write((char *) &index, sizeof(cellindex));
      listOfEntries.write((char *) &storedLength, sizeof(cellindex));
      return;
    }

    // go on to the next node
    pointer = readp;
  }
}

// isLongPhraseStored
//
// Report whether the phrase identified by index and length is recorded
// in the file-based store.
//
// The phrase hashes to cell (index + length) % bigPrime of the hash
// table; the chain of entries starting at that cell is searched for an
// entry with the same index and length.

bool isLongPhraseStored(cellindex index, cellcount length) {

  cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
  cellindex pointer = 0;
  cellindex readp = 0;
  cellindex readi = 0;
  cellindex readl = 0;

  // Entries store the length as a cellindex, so compare it as one
  cellindex wantedLength = length;

  // Find the phrase in the hashFile
  hashTableFile.seekg(hashOffset);
  hashTableFile.read((char *) &pointer, sizeof(cellindex));

  // If the cell is empty (0) the loop is skipped and nothing is stored.
  // Otherwise there is a list starting at this hash value, so the
  // phrase may be already remembered in that list.
  while (pointer != 0) {

    // Read the entry pointed to by pointer
    listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
    listOfEntries.read((char *) &readp, sizeof(cellindex));
    listOfEntries.read((char *) &readi, sizeof(cellindex));
    listOfEntries.read((char *) &readl, sizeof(cellindex));

    // Both the index and the length must match.  (This must be a
    // comparison: an assignment here would accept any entry with the
    // same index, whatever its length.)
    if ((readi == index) && (readl == wantedLength)) {
      // we've found the phrase stored here
      return true;
    }

    // go on to the next node
    pointer = readp;
  }
  return false;
}

// deleteLongPhraseMemory
//
// Close the two work files used by the long-phrase store and delete
// them from disk.

void deleteLongPhraseMemory() {

  // the hash table
  hashTableFile.close();
  remove(hashTableFileName);

  // the lists of entries
  listOfEntries.close();
  remove(listOfEntriesName);
}




// Read the collection statistics file
//
// The clauses.stats file in the collection directory is read as a
// sequence of "key value" pairs.  The values for the keys
// first_stopword, last_stopword, first_contentword and last_contentword
// are stored in firstStopSymbol, lastStopSymbol, firstContentSymbol and
// lastContentSymbol; every other key is ignored.
//
// Exits with status 1 if the file cannot be opened, or if any of the
// four values is missing or zero.

void readStatistics() {

  // open the statistics file
  char filename[FILENAME_MAX];
  sprintf(filename, "%s/clauses.stats", collection);

  // Open the file
  ifstream inFile(filename, ios::in);
  if (!inFile) {
    cerr << "File " << filename << " could not be opened\n";
    exit(1);
  }

  // Read the key/value pairs
  char key[1000];
  symbol value;
  for (;;) {

    // Limit the next string extraction to the size of the buffer so
    // that an over-long word cannot overflow it.  The width is reset
    // after every extraction, so it has to be set each time round.
    inFile.width(sizeof(key));
    if (!(inFile >> key >> value)) {
      break;
    }

    if (strcmp(key, "first_stopword") == 0) {
      firstStopSymbol = value;
    } else if (strcmp(key, "last_stopword") == 0) {
      lastStopSymbol = value;
    } else if (strcmp(key, "first_contentword") == 0) {
      firstContentSymbol = value;
    } else if (strcmp(key, "last_contentword") == 0) {
      lastContentSymbol = value;
    }
  }
  inFile.close();

  // Make sure we have the information we need
  if (!(firstStopSymbol && lastStopSymbol && firstContentSymbol && lastContentSymbol)) {
    cerr << "Statistics file incomplete" << endl;
    exit(1);
  }
}





