/**********************************************************************
 *
 * phindcgi.cpp -- cgi program to serve phind phrase hierarchies
 *
 * Copyright 2000 Gordon W. Paynter
 * Copyright 2000 The New Zealand Digital Library Project
 *
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/

/*
 * phindcgi.cpp
 *
 * The program itself reads a request for a phrase's data from the
 * QUERY_STRING variable, looks up the phrase (if necessary) in the MGPP
 * pword database, then looks up the phrase's characteristics in the MGPP
 * pdata database, and reports output to STDOUT as crude HTML or XML.
 *
 */


#include <iostream.h>
#include <fstream.h>
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

#include <vector.h>
#include <algo.h>

// Include MGPP functionality.
#include <TextGet.h>
#include <MGQuery.h>
#include <Terms.h>
#include <messages.h>
#include <GSDLQueryParser.h>

// Include GSDL's text_t object, which makes parsing cgi arguments easier.
#include <text_t.h>
// Note that GSDL stores strings as text_t objects (vectors of 16-bit short int), 
// while MGPP stores strings as UCArray objects (vectors of 8-bit unsigned char).



// Read the gsdlhome setting from gsdlsite.cfg in the current directory.
void get_gsdlsite_parameters(char *&gsdlhome);

// Parse the CGI query string into the collection, classifier, phrase
// number/text, the three display ranges and the XML-mode flag.
void get_cgi_parameters(char *&collection, char *&classifier,
			unsigned long &phrasenumber, UCArray &phrasetext,
			unsigned long &first_e, unsigned long &last_e, 
			unsigned long &first_l, unsigned long &last_l, 
			unsigned long &first_d, unsigned long &last_d,
			bool &XMLmode);

// Print entries [first, last) of an expansion list as XML or HTML rows.
void print_expansions(char *cgi_script, char *collection, bool XMLmode, UCArray body,
		      TextData &textdata, vector <unsigned long> elist, 
		      unsigned long first, unsigned long last);

// Print entries [first, last) of the thesaurus link lists as XML or HTML rows.
void print_thesaurus_links(char *cgi_script, char *collection, 
			   bool XMLmode, UCArray body, TextData &textdata, 
			   vector <unsigned long> &linkdest,
			   vector <UCArray> &linktype,
			   unsigned long first, unsigned long last);

// Print entries [first, last) of the document lists as XML or HTML rows.
void print_documents(bool XMLmode, char *basepath, char *cgi_script, 
		     char *collection, 
		     vector <unsigned long> docNums, 
		     vector <unsigned long> docFreq,
		     unsigned long first, unsigned long last);

// Search the "pword" index under basepath for query; matches go in result.
void find_phrase_number_from_word(char *basepath, UCArray &query, DocNumArray &result);

// Fetch a phrase record and extract its text and tf/ef/df counts.
void get_phrase_freq_data(TextData &textdata, unsigned long phrase,
			  UCArray &word, unsigned long &tf, 
			  unsigned long &ef, unsigned long &df);

// Fetch a phrase record and extract every field: text, counts, expansion
// list, thesaurus links and document list.
void get_phrase_all_data(TextData &textdata, unsigned long phrase,
			 UCArray &word, 
			 unsigned long &tf, unsigned long &ef,
			 unsigned long &lf, unsigned long &df,
			 vector <unsigned long> &el,
			 vector <unsigned long> &linkdest,
			 vector <UCArray> &linktype,
			 vector <unsigned long> &docnum, 
			 vector <unsigned long> &docfrq);

// Split word around the first occurrence of body into prefix and suffix.
void split_phrase(UCArray word, UCArray body, UCArray &prefix, UCArray &suffix);
// Test whether text occurs at here; on success here is moved past the match.
bool phrase_match(UCArray text, UCArray::iterator &here, UCArray::iterator end);

// Fetch a document record and extract its title and hash (OID).
void get_document_all_data(TextData &docdata, unsigned long docNum,
			   UCArray &title, UCArray &hash);

// Print an error page (XML or HTML) and exit(0).
void cgi_error(bool XMLmode, char *message);

// Conversions from GSDL text_t values.
void toUCArray(text_t &in, UCArray &out);
unsigned long toLongInt(text_t &value);



int main (int argc, char * argv[]) {


  // the phrase to expand
  unsigned long phrase = 0;
  UCArray word;

  // the frequency and occurances of the phrase
  unsigned long tf;
  vector <unsigned long> el, linkdest, docNums, docfreq;
  vector <UCArray> linktype;

  // the number of occurances to display
  unsigned long ef, first_e, last_e, count_e, 
                lf, first_l, last_l, count_l, 
                df, first_d, last_d, count_d;
  
  // are we in XML mode (as opposed to HTML mode)
  bool XMLmode = false;

  // Read the gsdlsite.cfg file
  char *gsdlhome = NULL;
  get_gsdlsite_parameters(gsdlhome);
  
  if (gsdlhome == NULL) {
    cgi_error(XMLmode, "GSDLHOME not set in gsdlsite.cfg file.");
  }
  
  // Get command-line parameters
  char *collection = NULL;
  char *classifier = NULL;
  text_tmap param;
  get_cgi_parameters(collection, classifier, phrase, word, 
		     first_e, last_e, first_l, last_l, first_d, last_d, XMLmode);

  if (collection == NULL) {
    cgi_error(XMLmode, "No collection");
  }
  
  char basepath[FILENAME_MAX] = "";
  strcat(basepath, gsdlhome);
  strcat(basepath, "/collect/");
  strcat(basepath, collection);
  strcat(basepath, "/index/phind");
  strcat(basepath, classifier);

  // If we don't know the phrase number, look itup
  if (phrase == 0) {
    
    if (word.empty()) {
      cgi_error(XMLmode, "No phrase number or word.");
    }

    DocNumArray result;
    find_phrase_number_from_word(basepath, word, result);
    
    if (result.empty()) {
      cgi_error(XMLmode, "The search term does not occur in the collection.");
      exit(0);
    } else {
      phrase = result[0];
    }
  } 

  // Create a TextData object to read the phrase data (pdata)
  TextData textdata;
  char filename[FILENAME_MAX] = "pdata";
  if (!textdata.LoadData (basepath, filename)) {
    FatalError (1, "Couldn't load text information for \"%s\"", filename);
  }
  get_phrase_all_data(textdata, phrase, word, tf, ef, lf, df, el, 
		      linkdest, linktype, docNums, docfreq);


  // Output the header
  if (XMLmode) {
    cout << "Content-type: text/plain" << endl << endl
	 << "<phinddata id=\"" << phrase 
	 << "\" text=\"" << word 
	 << "\" tf=\"" << tf 
	 << "\" ef=\"" << ef 
	 << "\" df=\"" << df 
	 << "\" lf=\"" << lf 
	 << "\">" << endl;
  } else {
    cout << "Content-type: text/html" << endl << endl
	 << "<html><head><title>" << word << "</title></head>" << endl
	 << "<body><center>" << endl
	 << "<p><h1>" << word << "</h1>" << endl
	 << "<p><b>"<< word << "</b> occurs " 
	 << tf << " times in " << df << " documents" << endl;
  }


  // Output the thesaurus links
  if ((lf > 0) && (first_l < last_l)) {

    // figure out the number of phrases to output
    if (last_l > lf) {
      last_l = lf;
    }
    count_l = last_l - first_l;
    
    if (XMLmode) {
      cout << "<thesauruslist length=\"" << lf 
	   << "\" start=\"" << first_l
	   << "\" end=\"" << last_l << "\">" << endl;
      print_thesaurus_links(argv[0], collection, XMLmode, word, textdata, 
			    linkdest, linktype, first_l, last_l);
      cout << "</thesauruslist>" << endl;
    }

    // output links as HTML
    else {
      if (count_l == lf) {
	cout << "<p><b> " << count_l << " thesaurus links</b>" << endl;
      } else {
	cout << "<p><b>" << count_l << " of " << lf << " thesaurus links</b>" << endl;
      }

      cout << "<p><table border=1><tr><th>type</th><th>topic</th><th>freq</th><th>docs</th></tr>" << endl;
      print_thesaurus_links(argv[0], collection, XMLmode, word, textdata, 
			    linkdest, linktype, first_l, last_l);
      
      cout << "</table>" << endl;

      if (last_l < lf) {
	if ((last_l + 10) < lf) {
	  cout << "<br><a href='" << argv[0] 
	       << "?c=" << collection
	       << "&n=" << phrase 
	       << "&e=" << first_e 
	       << "&f=" << last_e 
	       << "&h=" << first_d 
	       << "&i=" << last_d
	       << "&k=" << first_l 
	       << "&l=" << (last_l + 10)
	       << "'>Get more thesaurus links</a>" 
	       << endl;
	}
	cout << "<br><a href='" << argv[0] 
	     << "?c=" << collection 
	     << "&n=" << phrase 
	     << "&e=" << first_e 
	     << "&f=" << last_e 
	     << "&h=" << first_d 
	     << "&i=" << last_d 
	     << "&k=" << first_l 
	     << "&l=" << lf
	     << "'>Get every thesaurus link</a>" 
	     << endl;
      }
    }
    
  }
  
  // Output the expansions
  if ((ef > 0) && (first_e < last_e)) {

    // figure out the number of phrases to output
    if (last_e > el.size()) {
      last_e = el.size();
    }
    count_e = last_e - first_e;

    // output expansions as XML
    if (XMLmode) {
      cout << "<expansionlist length=\"" << ef 
	   << "\" start=\"" << first_e
	   << "\" end=\"" << last_e << "\">" << endl;

      print_expansions(argv[0], collection, XMLmode, word, textdata, el, first_e, last_e);

      cout << "</expansionlist>" << endl;
    }

    // output expansions as HTML
    else {
      if (count_e == el.size()) {
	cout << "<p><b> " << count_e << " expansions</b>" << endl;
      } else {
	cout << "<p><b>" << count_e << " of " << ef << " expansions</b>" << endl;
      }

      cout << "<p><table border=1><tr><th colspan=3>phrase</th><th>freq</th><th>docs</th></tr>" << endl;
      print_expansions(argv[0], collection, XMLmode, word, textdata, el, first_e, last_e);
      cout << "</table>" << endl;

      if (last_e < ef) {
	if ((last_e + 10) < ef) {
	  cout << "<br><a href='" << argv[0] 
	       << "?c=" << collection 
	       << "&n=" << phrase 
	       << "&e=" << first_e 
	       << "&f=" << (last_e + 10) 
	       << "&h=" << first_d 
	       << "&i=" << last_d 
	       << "&k=" << first_l 
	       << "&l=" << last_l
	       << "'>Get more expansions</a>" 
	       << endl;
	}
	cout << "<br><a href='" << argv[0] 
	     << "?c=" << collection 
	     << "&n=" << phrase 
	     << "&e=" << first_e 
	     << "&f=" << ef
	     << "&h=" << first_d 
	     << "&i=" << last_d 
	     << "&k=" << first_l 
	     << "&l=" << last_l
	     << "'>Get every expansion</a>" 
	     << endl;
      }
    }
  }

  // Output the document occurances
  if ((df > 0) && (first_d < last_d)) {

    // figure out the phrases to output
    if (last_d > docNums.size()) {
      last_d = docNums.size();
    }
    count_d = last_d - first_d;

    // output document list as XML
    if (XMLmode) {
      cout << "<documentlist length=\"" << df 
	   << "\" start=\"" << first_d
	   << "\" end=\"" << last_d << "\">" << endl;

      print_documents(XMLmode, basepath, "library", collection, 
		      docNums, docfreq, first_d, last_d);

      cout << "</documentlist>" << endl;
    }

    // output document list as HTML 
    else {

      if (count_d == docNums.size()) {
	cout << "<p><b> " << count_d << " documents</b>" << endl;
      } else {
	cout << "<p><b>" << count_d << " of " << df << " documents</b>" << endl;
      }

      cout << "<p><table border=1><tr><th align=left>document</th><th>freq</th></tr>" << endl;
      print_documents(XMLmode, basepath, "library", collection, 
		      docNums, docfreq, first_d, last_d);
      cout << "</table>" << endl;
      
      if (last_d < df) {
	if ((last_d + 10) < df) {
	  cout << "<br><a href='" << argv[0] 
	       << "?c=" << collection 
	       << "&n=" << phrase 
	       << "&e=" << first_e 
	       << "&f=" << last_e 
	       << "&h=" << first_d
	       << "&i=" << (last_d + 10)  
	       << "&k=" << first_l 
	       << "&l=" << last_l
	       << "'>Get more documents</a>" << endl;
	}
	cout << "<br><a href='" << argv[0] 
	     << "?c=" << collection 
	     << "&n=" << phrase 
	     << "&e=" << first_e 
	     << "&f=" << last_e 
	     << "&h=" << first_d 
	     << "&i=" << df
	     << "&k=" << first_l 
	     << "&l=" << last_l
	     << "'>Get every document</a>" << endl;
      }
    }
  }

  // Close the document
  if (XMLmode) {
    cout << "</phinddata>" << endl;
  } else {
    cout << "</center></body></html>" << endl;
  }

  textdata.UnloadData ();
  return 0;
}


// Print a list of expansions
//
// Given the textData and a list of phrase numbers, print out each of the
// expansions.

void print_expansions(char *cgi_script, char *collection, bool XMLmode, UCArray body, 
		      TextData &textdata, vector <unsigned long> elist,
		      unsigned long first, unsigned long last) {

  UCArray word;
  unsigned long phrase, tf, df, ef;

  UCArray suffix, prefix;
  
  for (unsigned long e = first; e < last; e++) {

    phrase = elist[e];
    get_phrase_freq_data(textdata, phrase, word, tf, ef, df);

    split_phrase(word, body, prefix, suffix);
    
    if (XMLmode) {
      // body is always the same as the text of the phrase, so no need to send it
      cout << "<expansion num=\"" << e 
	   << "\" id=\"" << phrase
	   << "\" tf=\"" << tf 
	   << "\" df=\"" << df;
      if (!prefix.empty()) {
	cout << "\" prefix=\"" << prefix;
      }
      if (!suffix.empty()) {
	cout << "\" suffix=\"" << suffix;
      }
      cout << "\"/>" << endl;
    } else {
      cout << "<tr valign=top><td align=right><a href='" << cgi_script 
	   << "?c=" << collection << "&n=" << phrase << "'>" << prefix << "</a></td>"
	   << "<td align=center><a href='" << cgi_script 
	   << "?c=" << collection << "&n=" << phrase << "'>" << body << "</a></td>"
	   << "<td align=left><a href='" << cgi_script 
	   << "?c=" << collection << "&n=" << phrase << "'>" << suffix << "</a></td>"
	   << "<td>" << tf << "</td><td>" << df << "</td></tr>" << endl;
    }
  }
}

void print_thesaurus_links(char *cgi_script, char *collection, 
			   bool XMLmode, UCArray body, TextData &textdata, 
			   vector <unsigned long> &linkdest,
			   vector <UCArray> &linktype,
			   unsigned long first, unsigned long last) {

  // information describing each link in the list
  unsigned long phrase, tf, ef, df;
  UCArray type, text, newbody, suffix, prefix;
  
  for (unsigned long l = first; l < last; l++) {

    // get the phrase data
    phrase = linkdest[l];
    type = linktype[l];
    get_phrase_freq_data(textdata, phrase, text, tf, ef, df);
    // split_phrase(text, newbody, prefix, suffix);
    
    if (XMLmode) {
      cout << "<thesaurus num=\"" << l 
	   << "\" id=\"" << phrase
	   << "\" tf=\"" << tf 
	   << "\" df=\"" << df
	   << "\" type=\"" << type
	   << "\" text=\"" << text
	   << "\"/>" << endl;
    } else {
      cout << "<tr valign=top><td>" << type << "</td><td>" 
	   << "<a href='" << cgi_script << "?c=" << collection 
	   << "&n=" << phrase << "'>" << text << "</a>"
	   << "</td><td>" << tf << "</td><td>" << df << "</td></tr>" << endl;
    }
  }
}


// Print a slice of the document lists
//
// docNums and docFreq are parallel lists: entry d is document number
// docNums[d] with frequency docFreq[d].  The "docs" database under basepath
// is loaded to obtain each document's title and hash.  XML mode writes one
// <document .../> element per entry; HTML mode writes one table row linking
// to cgi_script with a=d and d=<hash>.

void print_documents(bool XMLmode, char *basepath, char *cgi_script, char *collection,
                     vector <unsigned long> docNums, vector <unsigned long> docFreq,
                     unsigned long first, unsigned long last) {

  // never index past the end of either list, whatever range was requested
  if (last > docNums.size()) {
    last = docNums.size();
  }
  if (last > docFreq.size()) {
    last = docFreq.size();
  }

  // Create a TextData object to read the document data
  TextData docdata;
  char filename[FILENAME_MAX] = "docs";
  if (!docdata.LoadData (basepath, filename)) {
    FatalError (1, "Couldn't load text information for \"%s\"", filename);
  }

  UCArray title, hash;
  unsigned long freq, doc;

  for (unsigned long d = first; d < last; d++) {
    doc = docNums[d];
    freq = docFreq[d];

    get_document_all_data(docdata, doc, title, hash);

    if (XMLmode) {
      cout << "<document num=\"" << d
           << "\" hash=\"" << hash
           << "\" freq=\"" << freq
           << "\" title=\"" << title << "\"/>" << endl;
    } else {
      cout << "<tr valign=top><td><a href='" << cgi_script << "?c=" << collection
           << "&a=d&d=" << hash << "'>" << title << "</a>"
           << "</td><td>" << freq << "</td></tr>"
           << endl;
    }
  }

  // release the document database, as main() does for the phrase database
  docdata.UnloadData ();
}



// Get the frequency data about a phrase
//
// The phrase is stored in textData as record phrase.
// We retrieve:
//   word - the text of the phrase
//   tf - the total frequency of the phrase
//   ef - the expansion frequency of the phrase
//   df - the document frequency of the phrase

void get_phrase_freq_data(TextData &textdata, unsigned long phrase,
			  UCArray &word, unsigned long &tf, 
			  unsigned long &ef, unsigned long &df) {
  
  UCArray text;
  UCArray docLevel;
  SetCStr(docLevel, "Document");

  // Look the word up in the textData
  if (!GetDocText (textdata, docLevel, phrase, text)) {
    FatalError (1, "Error while trying to get phrase %u", phrase);
  }

  // Ignore everything up to the first colon
  UCArray::iterator next = text.begin();
  while (*next++ != ':');
  
  // Get the word
  word.clear();
  for (; *next != ':'; next++) {
    word.push_back(*next);
  }
  
  // Get total frequency
  tf = 0;
  for (next++; *next != ':'; next++) {
    tf *= 10;
    tf += (*next - '0');
  }
  
  // Get expansion frequency
  ef = 0;
  for (next++; *next != ':'; next++) {
    ef *= 10;
    ef += (*next - '0');
  }
  
  // Get document frequency
  df = 0;
  for (next++; *next != ':'; next++) {
    df *= 10;
    df += (*next - '0');
  }
}

// Get all the data about a phrase
//
// The phrase is stored in textData as record phrase.
// We retrieve:
//   word - the text of the phrase
//   tf - the total frequency of the phrase
//   ef - the expansion frequency of the phrase
//   lf - the thesaurus link frequency of the phrase
//   df - the document frequency of the phrase
//   el - the list of phrases that are expansions of phrase
//   ll - the list of phrases that are thesaurus links 
//   dl - the list of documents that contain phrase

void get_phrase_all_data(TextData &textdata, unsigned long phrase,
			 UCArray &word, 
			 unsigned long &tf, unsigned long &ef,
			 unsigned long &lf, unsigned long &df,
			 vector <unsigned long> &el,
			 vector <unsigned long> &linkdest,
			 vector <UCArray> &linktype,
			 vector <unsigned long> &docnum, 
			 vector <unsigned long> &docfrq) {
  UCArray text;
  UCArray docLevel;
  SetCStr(docLevel, "Document");

  // Look thwe word up in the textData
  if (!GetDocText (textdata, docLevel, phrase, text)) {
    FatalError (1, "Error while trying to get phrase %u", phrase);
  }

  // Ignore everything up to the first colon
  UCArray::iterator next = text.begin();
  while (*next++ != ':');

  // ignore training cariage returns
  while (text.back() == '\n') {
    text.pop_back();
  }
  
  // Get the word
  word.clear();
  for (; *next != ':'; next++) {
    word.push_back(*next);
  }
  
  // Get total frequency
  tf = 0;
  for (next++; *next != ':'; next++) {
    tf *= 10;
    tf += (*next - '0');
  }
  
  // Get expansion frequency
  ef = 0;
  for (next++; *next != ':'; next++) {
    ef *= 10;
    ef += (*next - '0');
  }
  
  // Get document frequency
  df = 0;
  for (next++; *next != ':'; next++) {
    df *= 10;
    df += (*next - '0');
  }
  
  // Get expansion list
  el.clear();
  unsigned long e = 0;
  for (next++; *next != ':'; next++) {
    if (*next == ',') {
      el.push_back(e);
      e = 0;
    } else {
      e *= 10;
      e += (*next - '0');
    }
  }

  // Get document list & the document frequency list
  docnum.clear();
  docfrq.clear();
  bool readnum = false;
  unsigned long d = 0;
  for (next++; *next != ':'; next++) {
    if (*next == ',') {
      docnum.push_back(d);
      readnum = true;
      d = 0;
    } else if (*next == ';') {
      if (readnum) {
	docfrq.push_back(d);
      } else {
	docnum.push_back(d);
	docfrq.push_back(1);
      }
      readnum = false;
      d = 0;
    } else {
      d *= 10;
      d += (*next - '0');
    }
  }

  // Get thesaurus link frequency & link list
  text.push_back(':');
  text.push_back(':');

  // link frequency
  lf = 0;
  for (next++; *next != ':'; next++) {
    lf *= 10;
    lf += (*next - '0');
  }

  // two lists of link data
  linkdest.clear();
  linktype.clear();
  
  UCArray thistype;
  thistype.clear();
  bool typedone = false;
  unsigned long l = 0;
  for (next++; *next != ':'; next++) {
    
    if (!typedone) {
      // first read the link type, a charactor string
      if (*next == ',') {
	typedone = true;
      } else {
	thistype.push_back(*next);
      }
    } else {
      // having read the link type, read the list of link destinations
      if (*next == ',') {
	linkdest.push_back(l);
	linktype.push_back(thistype);
	l = 0;
      } else if (*next == ';') {
	linkdest.push_back(l);
	linktype.push_back(thistype);
	l = 0;
	thistype.clear();
	typedone = false;
      } else {
	l *= 10;
	l += (*next - '0');
      }
    }
  }
}

// Get all the data about a document
//
// The document's details are stored in docData as record docNum.  The
// record text is parsed as tab-separated fields; the first is skipped.
// We retrieve:
//   hash - the document's unique OID (second field)
//   title - the document's title (everything after the second tab, up to
//           the first newline or the end of the record)
// A record with fewer fields leaves the missing outputs empty.

void get_document_all_data(TextData &docdata, unsigned long docNum,
                           UCArray &title, UCArray &hash) {

  UCArray text;
  UCArray docLevel;
  SetCStr(docLevel, "Document");

  // Look the document up in the textData
  if (!GetDocText (docdata, docLevel, docNum, text)) {
    FatalError (1, "Error while trying to get document %lu", docNum);
  }

  // text is not modified below, so these iterators stay valid
  UCArray::iterator next = text.begin();
  UCArray::iterator end = text.end();

  // Ignore everything up to and including the first tab
  while (next != end) {
    if (*next++ == '\t') {
      break;
    }
  }

  // Get the document OID (hash)
  hash.clear();
  for (; next != end && *next != '\t'; ++next) {
    hash.push_back(*next);
  }

  // Get the title: step over the tab, then read to end of line
  title.clear();
  if (next != end) {
    ++next;
  }
  for (; next != end && *next != '\n'; ++next) {
    title.push_back(*next);
  }
}


// Read the gsdlhome setting from gsdlsite.cfg
//
// Opens "gsdlsite.cfg" in the current directory; if it cannot be opened a
// message is written to cerr and the program exits with status 1.  Every
// line starting with "gsdlhome" is examined: the spaces and tabs after the
// keyword are skipped and the rest of the line becomes the value.  If the
// keyword appears several times the last value wins.  On success gsdlhome
// points to a NUL-terminated string allocated with new[]; if no such line
// exists gsdlhome is left untouched.

void get_gsdlsite_parameters(char *&gsdlhome) {

  // open the file
  ifstream gsdl("gsdlsite.cfg", ios::in);
  if (!gsdl) {
    cerr << "File gsdlsite.cfg could not be opened\n";
    exit(1);
  }

  // read each line of the file; the loop ends at end of file or as soon as
  // a read fails (for example on a line too long for the buffer)
  char buffer[2000];
  char *found = NULL;
  while (gsdl.getline(buffer, sizeof(buffer), '\n')) {

    // only the gsdlhome variable is of interest
    if (strncmp(buffer, "gsdlhome", 8) != 0) {
      continue;
    }

    // find the start of the gsdlhome string
    size_t len = strlen(buffer);
    size_t i = 8;
    while (i < len && (buffer[i] == ' ' || buffer[i] == '\t')) {
      i++;
    }

    // store the gsdlhome string, leaving room for the terminator
    char *value = new char[len - i + 1];
    strncpy(value, &(buffer[i]), len - i);
    value[len - i] = '\0';

    // drop the value of any earlier gsdlhome line
    delete [] found;
    found = value;
  }

  if (found != NULL) {
    gsdlhome = found;
  }
}

// Read the CGI parameters
//
// When REQUEST_METHOD is "GET" and QUERY_STRING is set, the query string is
// parsed; otherwise a "? " prompt is printed and one line is read from
// standard input (for debugging from the command line).  The string is
// split into key=value pairs separated by '&'.  Pairs with an empty key or
// value are ignored, and only the first character of the key is examined:
//   c - collection name           d - classifier number (as a string)
//   e - first expansion number    f - last expansion number
//   h - first document number     i - last document number
//   k - first thesaurus number    l - last thesaurus number
//   n - phrase number             p - phrase text
//   x - switch on XML mode
// Defaults: phrase number 0, empty phrase text, every range 0..10, and
// classifier "1".  collection, classifier and XMLmode are only assigned
// when the corresponding key is present (classifier also when it is still
// NULL at the end), so the caller must initialise them.
// NOTE(review): values are used as received; no URL-decoding is done here.

void get_cgi_parameters(char *&collection, char *&classifier,
                        unsigned long &phrasenumber, UCArray &phrasetext,
                        unsigned long &first_e, unsigned long &last_e,
                        unsigned long &first_l, unsigned long &last_l,
                        unsigned long &first_d, unsigned long &last_d,
                        bool &XMLmode) {


  // set the default parameters
  phrasenumber = 0;
  phrasetext.clear();
  first_e = 0;
  last_e = 10;
  first_l = 0;
  last_l = 10;
  first_d = 0;
  last_d = 10;

  // get the query string
  char *request_method_str = getenv("REQUEST_METHOD");
  char *query_string = getenv("QUERY_STRING");
  text_t query;

  if (request_method_str != NULL
      && (strcmp(request_method_str, "GET") == 0)
      && query_string != NULL) {
    // GET cgi args from querystring
    query = query_string;

  } else {
    // debugging from command line
    cout << "? " << endl;
    char query_input[1024];
    cin.get(query_input, 1024, '\n');
    query = query_input;
  }

  // extract out the key=value pairs
  text_t::iterator here = query.begin();
  text_t::iterator end = query.end();
  text_t key, value;

  while (here != end) {
    // get the next key and value pair
    here = getdelimitstr (here, end, '=', key);
    here = getdelimitstr (here, end, '&', value);

    // store this key=value pair
    if (!key.empty() && !value.empty()) {

      // c: the collection name
      if (key[0] == 'c') {
        UCArray tmp;
        toUCArray(value, tmp);
        collection = GetCStr(tmp);
      }

      // d: the classifier number as string
      else if (key[0] == 'd') {
        UCArray tmp;
        toUCArray(value, tmp);
        classifier = GetCStr(tmp);
      }

      // e: the first expansion number
      else if (key[0] == 'e') {
        first_e = toLongInt(value);
      }

      // f: the last expansion number
      else if (key[0] == 'f') {
        last_e = toLongInt(value);
      }

      // h: the first document number
      else if (key[0] == 'h') {
        first_d = toLongInt(value);
      }

      // i: the last document number
      else if (key[0] == 'i') {
        last_d = toLongInt(value);
      }

      // k: the first thesaurus list number
      else if (key[0] == 'k') {
        first_l = toLongInt(value);
      }

      // l: the last thesaurus list number
      else if (key[0] == 'l') {
        last_l = toLongInt(value);
      }

      // n: the phrase number
      else if (key[0] == 'n') {
        phrasenumber = toLongInt(value);
      }

      // p: the phrase text
      else if (key[0] == 'p') {
        toUCArray(value, phrasetext);
      }

      // x: XML mode
      else if (key[0] == 'x') {
        XMLmode = true;
      }

    }
  }

  // if no classifier number is supplied, default to 1.
  if (classifier == NULL) {
    classifier = new char[2];
    strcpy(classifier, "1");
  }
}


// Find the phrase number of a word in the index file

void find_phrase_number_from_word(char *basepath, UCArray &query, DocNumArray &result) {

  // Open the index file for searching
  IndexData indexData;
  char indexfilename[FILENAME_MAX] = "pword";
  if (!indexData.LoadData (basepath, indexfilename)) {
    FatalError (1, "Couldn't load index information for \"%s\"", indexfilename);
  }

  // set up the query object
  QueryInfo queryInfo;
  SetCStr (queryInfo.docLevel, "Document");
  queryInfo.maxDocs = 5;
  queryInfo.sortByRank = true;
  queryInfo.exactWeights = false;
  queryInfo.needRankInfo = true;
  queryInfo.needTermFreqs = true;
  
  // mode 1 = casefolded, unstemmed search
  QueryNode *queryTree = ParseQuery(query, 1, 1);

  // cout << "-- query --" << endl;
  // PrintNode (cout, queryTree);
      
  // perform the query
  ExtQueryResult queryResult;
  MGQuery (indexData, queryInfo, queryTree, queryResult);
  // cout << "-- word lookup result -- " << endl << queryResult << endl ;

  result.clear();
  result = queryResult.docs;

  // delete the query
  if (queryTree != NULL) delete queryTree;
}




// cgi_error
//
// Report a problem to the client and stop: writes a complete response
// (content-type header plus a small XML or HTML document containing
// message) to stdout, then terminates the program with exit(0).

void cgi_error(bool XMLmode, char *message) {

  if (!XMLmode) {
    // HTML error page
    cout << "Content-type: text/html" << endl << endl;
    cout << "<html><head><title>phind error</title></head>" << endl;
    cout << "<body>" << endl;
    cout << "<p><h1>phind error</h1>"
         << "<p> An error occured processing your request: <p><b>"
         << message
         << "</b></body></html>" << endl;
    exit(0);
  }

  // XML error document
  cout << "Content-type: text/plain" << endl << endl;
  cout << "<phinddata>" << endl;
  cout << "<phinderror>" << message << "</phinderror>" << endl;
  cout << "</phinddata>" << endl;
  exit(0);
}


// split an expansion into prefix and suffix
//
// word is scanned from the left for the first position at which body
// occurs (see phrase_match).  Everything before that position becomes
// prefix and everything after the match becomes suffix; a single space
// separating prefix from body, or body from suffix, is dropped.  If body
// does not occur in word, prefix receives all of word and suffix is empty.

void split_phrase(UCArray word, UCArray body, UCArray &prefix, UCArray &suffix) {

  prefix.clear();
  suffix.clear();

  bool readingPrefix = true;
  UCArray::iterator here = word.begin();
  UCArray::iterator end = word.end();

  while (here != end) {

    // if we've not read all the prefix, add the next char to the prefix
    if (readingPrefix) {
      if (phrase_match(body, here, end)) {
        readingPrefix = false;
        // trim whitespace from end of prefix & start of suffix; only a
        // space is removed, so a match in the middle of a word does not
        // lose the character before it
        if (!prefix.empty() && (prefix.back() == ' ')) {
          prefix.pop_back();
        }
        if ((here != end) && (*here == ' ')) {
          here++;
        }
      } else {
        prefix.push_back(*here);
        here++;
      }
    }
    // if we've finished with the prefix, update the suffix
    else {
      suffix.push_back(*here);
      here++;
    }
  }
}

// phrase_match
//
// Check whether the characters of text appear, in order, starting at the
// position here within a sequence that finishes at end.
//
// On a full match here is advanced to just past the matched characters and
// true is returned.  Otherwise (a character differs, or the sequence runs
// out first) false is returned and here is left unchanged.  An empty text
// matches without moving here.

bool phrase_match(UCArray text, UCArray::iterator &here, UCArray::iterator end) {

  // walk a private cursor so here is only updated on success
  UCArray::iterator cursor = here;

  for (UCArray::iterator wanted = text.begin(); wanted != text.end(); ++wanted, ++cursor) {
    if (cursor == end || *cursor != *wanted) {
      return false;
    }
  }

  here = cursor;
  return true;
}


// Convert from text_t format
//
// Conversions from text_t to other types

// Interpret the leading decimal digits of value as an unsigned number.
// Conversion stops at the first character that is not '0'..'9', so a value
// with no leading digits (including an empty one) yields 0.  Overflow is
// not detected.
unsigned long toLongInt(text_t &value) {

  unsigned long result = 0;

  text_t::iterator here = value.begin();
  text_t::iterator end = value.end();
  while (here != end) {
    // request values are untrusted: never fold a non-digit into the number
    if (*here < '0' || *here > '9') {
      break;
    }
    result *= 10;
    result += *here - '0';
    here++;
  }

  return result;
}

// Copy a text_t into a UCArray: out is emptied, then receives one element
// per element of in, each cast to unsigned char.
void toUCArray(text_t &in, UCArray &out) {
  out.clear();
  for (text_t::iterator pos = in.begin(); pos != in.end(); pos++) {
    out.push_back((unsigned char) *pos);
  }
}

