/**********************************************************************
 *
 * mgsearch.cpp -- 
 * Copyright (C) 1999  The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id: mgsearch.cpp,v 1.31 2001/02/19 02:02:00 sjboddie Exp $
 *
 *********************************************************************/

/*
   $Log: mgsearch.cpp,v $
   Revision 1.31  2001/02/19 02:02:00  sjboddie
   Set mg's accumulator method back to 'list' as the recent change appeared
   to introduce a new (and more serious) bug while fixing the old bug. For
   now we'll just have to live with it the way it is.

   Revision 1.30  2001/02/15 22:58:11  kjm18
   added a comment

   Revision 1.29  2001/02/15 03:57:02  kjm18
   changed accumulator_method for mg to be array rather than list - it was
   getting some weird results with ranked searches

   Revision 1.28  2001/01/25 18:26:44  cs025
   Included CORBA branch for first time

   Revision 1.22.2.1  2000/04/04 15:02:32  cs025
   Corba first commit

   Revision 1.22  1999/09/24 02:41:21  rjmcnab
   change to use has_unicode_letdig in text_t

   Revision 1.21  1999/09/21 21:41:41  sjboddie
   fixed an error in what I committed last

   Revision 1.20  1999/09/21 11:59:26  sjboddie
   added Maxdocs queryfilter option (which may be -1 for 'all)

   Revision 1.19  1999/09/07 22:52:52  rjmcnab
   Seems to be an error in mg for retrieving documents using a paragraph
   based index for some cases. Just added a work around (loads the default
   index every time).

   Revision 1.18  1999/09/07 04:57:22  sjboddie
   added gpl notice

   Revision 1.17  1999/08/31 22:42:41  rjmcnab
   A couple of minor things.

   Revision 1.16  1999/08/25 04:51:06  sjboddie
   small change to allow for searching using boolean operators

   Revision 1.15  1999/07/16 08:35:03  rjmcnab
   Fixed a weird bug to do with a faulty case statement.

   Revision 1.14  1999/07/16 03:42:22  sjboddie
   changed isApprox

   Revision 1.13  1999/07/16 00:12:46  sjboddie
   removed all the old post-processing stuff

   Revision 1.12  1999/07/07 06:17:47  rjmcnab
   broke search_index into index+subcollection+language
   within mgsearch

   Revision 1.11  1999/07/05 21:06:43  rjmcnab
   Disabled quoted strings.

   Revision 1.10  1999/07/01 09:29:19  rjmcnab
   Changes for better reporting of number documents which match a query. Changes
   should still work as before with older versions of mg.

   Revision 1.9  1999/07/01 03:54:48  rjmcnab
   Added code to plug in the equivalent terms of each of the query terms.
   Also added a function to get a raw utf8 encoded mg document (for speeding
   up a phrase matching function)

   Revision 1.8  1999/06/30 04:04:12  rjmcnab
   made stemming functions available from mgsearch and made the stems
   for the query terms available in queryinfo

   Revision 1.7  1999/06/27 22:07:27  sjboddie
   got rid of all the old functions for dealing with dir indexes

   Revision 1.6  1999/06/09 00:41:32  sjboddie
   phrase searching now uses case-folding if it's turned on

   Revision 1.5  1999/02/21 22:31:35  rjmcnab

   Removed locateinfo.

   Revision 1.4  1999/02/03 01:13:27  sjboddie

   Got interface to handle subcollections and language subcollections -
   committed changes made to some of the collections

   Revision 1.3  1999/01/19 01:38:17  rjmcnab

   Made the source more portable.

   Revision 1.2  1999/01/12 01:51:02  rjmcnab

   Standard header.

   Revision 1.1  1999/01/08 09:02:16  rjmcnab

   Moved from src/library.

 */

#include "gsdlconf.h"
#include "mgsearch.h"
#include "fileutil.h"

#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>

#if defined(GSDL_USE_OBJECTSPACE)
#  include <ospace\std\iostream>
#elif defined(GSDL_USE_IOS_H)
#  include <iostream.h>
#else
#  include <iostream>
#endif

#if defined(__WIN32__)
// gdbm stuff
#  include "autoconf.h"
#  include "systems.h"
#  include "gdbmconst.h"
#  include "gdbm.h"
#else
#  include <gdbm.h>
#endif

  
#include <assert.h>

#include "mgq.h"
// #include "locateinfo.h"
#include "gsdlunicode.h"
#include "unitool.h"


/////////////
// globals //
/////////////

static char *tempdoc = NULL;
static int templen = 0;


//////////////////////
// useful functions //
//////////////////////


// Stems a single word with mg's stemmer (mgq_stemword).
// Both 'word' and the returned stem are utf8; every element of 'word' is
// copied as a single byte. Anything beyond mgq_getmaxstemlen() characters
// is dropped before stemming.
// Returns an empty string if the working buffer could not be allocated.
text_t mgsearch_stemword (const text_t &word) {
  // the stemmer works on a length-prefixed buffer: byte 0 holds the
  // length, the characters start at byte 1 and a nul is appended
  const int stemlimit = mgq_getmaxstemlen ();
  unsigned char *stembuf = new unsigned char [stemlimit + 2];
  if (stembuf == NULL) return "";

  // fill in the characters, stopping at the stemmer's limit
  int count = 0;
  for (text_t::const_iterator it = word.begin(), stop = word.end();
       count < stemlimit && it != stop; it++) {
    stembuf[count+1] = (unsigned char)(*it);
    count++;
  }
  stembuf[count+1] = '\0';
  stembuf[0] = count;

  // the result is read back from the same buffer, length first
  mgq_stemword (stembuf);

  text_t stem;
  stem.setcarr((char *)(&stembuf[1]), stembuf[0]);

  delete [] stembuf;

  return stem;
}



////////////////////////
// callback functions //
////////////////////////

// This routine is called for each document found in a search.
// 'info' must point to the queryresultsclass collecting the results: the
// document is stored in docs.docset under DocNum and DocNum is appended to
// docs.docorder. The document text arguments are not used.
// Weight is split in two: the whole hundreds give the number of query
// terms matched and what is left over is kept as the document weight.
// A NULL 'info' is ignored (as in termfreqcallback and termequivcallback).
// Always returns 0.
int ourquerycallback(char * /*UDoc*/, int /*ULen*/, int DocNum,
                     float Weight, void *info) {
  queryresultsclass *queryresults = (queryresultsclass *)info;
  if (queryresults == NULL) return 0;

  // build the entry for this document
  docresultclass docresult;
  docresult.docnum = DocNum;
  docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
  docresult.docweight = Weight - docresult.num_query_terms_matched*100;

  // append this entry to the document results
  queryresults->docs.docset[DocNum] = docresult;
  queryresults->docs.docorder.push_back(DocNum);

  return 0;
}

int termequivcallback(char *Word, int ULen,  int /*Freq*/, 
		      float /*Weight*/,  void *info) {
  text_tset *equivterms = (text_tset *)info;
  if (equivterms == NULL) return 0;

  text_t thisterm;
  thisterm.setcarr(Word, ULen);

  equivterms->insert(thisterm);
  
  return 0;
}


void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
  // allocate working stem space
  int maxstemlen = mgq_getmaxstemlen ();
  unsigned char *word_stem = new unsigned char [maxstemlen + 2];
  if (word_stem == NULL) return;

  // copy word to word_stem
  int len = 0;
  text_t::const_iterator here = word.begin();
  text_t::const_iterator end = word.end();
  while (len < maxstemlen && here != end) {
    word_stem[len+1] = (unsigned char)(*here);
    len++; here++;
  }
  word_stem[len+1] = '\0';
  word_stem[0] = len;

  // get the equivalent terms
  mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
  
  delete [] word_stem;

  return;
}

  // NOTE(review): file-scope set that nothing in the visible part of this
  // file reads or writes (termfreqcallback fills the utf8equivterms member
  // of termfreqclass instead). Possibly a leftover -- confirm that no other
  // translation unit refers to it before removing.
  text_tset utf8equivterms; // kept as utf8 string for fast matching


// This callback is called once for each term in the query
int termfreqcallback(char *Word, int ULen,  int Freq, 
		     float /*Weight*/,  void *info) {
  queryresultsclass *queryresults = (queryresultsclass *)info;
  if (queryresults == NULL) return 0;

  text_t term;
  term.setcarr(Word, ULen);
  termfreqclass termfreq;

  termfreq.termstr = to_uni(term);
  text_t utf8termstem = mgsearch_stemword (term);
  termfreq.termstemstr = to_uni (utf8termstem);

  mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
  
  termfreq.termfreq = Freq;
  queryresults->orgterms.push_back(termfreq);
  
  return 0;
}

// This callback is called once for each variation of each term.
// 'info' must point to the queryresultsclass being filled in; the term
// (ULen bytes starting at Word) is converted with to_uni and inserted
// into its termvariants set.
// A NULL 'info' is ignored (as in termfreqcallback and termequivcallback).
// Always returns 0.
int termvariantscallback(char *Word, int ULen, int /*Freq*/,
                         float /*Weight*/, void *info) {
  queryresultsclass *queryresults = (queryresultsclass *)info;
  if (queryresults == NULL) return 0;

  text_t term;
  term.setcarr(Word, ULen);
  queryresults->termvariants.insert(to_uni(term));

  return 0;
}

// This callback is for getting document text.
// It only remembers where the text is: the pointer and length are stored
// in the file-level tempdoc/templen variables, nothing is copied.
// Always returns 0.
int doctextcallback(char *Doc, int ULen,  int /*Freq*/,
                    float /*Weight*/,  void * /*info*/) {
  templen = ULen;
  tempdoc = Doc;
  return 0;
}


// Builds the suffix handed to load_database for one index of a collection:
// "index", then 'index', then 'collection', joined with filename_cat.
static text_t getindexsuffix (const text_t &collection,
                              const text_t &index) {
  const text_t indexroot = "index";
  const text_t indexdir = filename_cat (indexroot, index);
  return filename_cat (indexdir, collection);
}




////////////////////
// mgsearch class //
////////////////////

// Default constructor: runs the searchclass constructor and does nothing
// else.
mgsearchclass::mgsearchclass () : searchclass() {
}

// Destructor: deletes whatever 'cache' points to and resets the pointer.
mgsearchclass::~mgsearchclass () {
  // deleting a null pointer is a no-op, so no explicit check is needed
  delete cache;
  cache = NULL;
}

// Loads the mg database for the given index of 'collection' and makes it
// the current one. You only need to use this function directly before
// doing any stemming.
//
// The index suffix is built from index+subcollection+language, the text
// suffix from the fixed name "text"; the collection directory comes from
// the collectdir member (with a trailing backslash appended on Windows).
//
// casefolding and stemming are only applied when the database loads:
// 0 turns the option off, any positive value turns it on, and a negative
// value leaves mg's current setting alone.
//
// makeindexcurrent returns true if it was able to load the database
bool mgsearchclass::makeindexcurrent (const text_t &index,
                                      const text_t &subcollection,
                                      const text_t &language,
                                      const text_t &collection,
                                      int casefolding,
                                      int stemming) {
  // get the names of the collection, index and text suffixes as c strings;
  // each getcstr() result is ours to free once we are done with it
  char *ccollection = collection.getcstr();
  assert (ccollection != NULL);
  char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
  assert (idxsuffix != NULL);
  char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
  assert (txtsuffix != NULL);

#ifdef __WIN32__
  char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
#else
  char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
#endif

  bool databaseloaded = false;
  if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
    databaseloaded = true;

    if (casefolding == 0) mgq_ask(".set casefold off");
    else if (casefolding > 0) mgq_ask(".set casefold on");
    if (stemming == 0) mgq_ask(".set stem off");
    else if (stemming > 0) mgq_ask(".set stem on");
  }

  // free up the c strings. These are character arrays, so they have to be
  // released with the array form of delete (assumes getcstr() allocates
  // with new[]); plain delete on them is undefined behaviour.
  delete [] ccollection;
  delete [] idxsuffix;
  delete [] txtsuffix;
  delete [] ccollectdir;

  return databaseloaded;
}


// stemword stems 'word' with whatever settings the last call to
// makeindexcurrent left in place. 'word' is assumed to be in unicode:
// it is converted to utf8 for the stemmer and the stem is converted
// back to unicode before being returned.
text_t mgsearchclass::stemword (const text_t &word) {
  const text_t utf8word = to_utf8 (word);
  const text_t utf8stem = mgsearch_stemword (utf8word);
  return to_uni (utf8stem);
}

// Iterator-range form of stemword: stems the text in [here, end), going
// through utf8 for the stemmer and converting the stem back with to_uni.
text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
  const text_t utf8word = to_utf8 (here, end);
  const text_t utf8stem = mgsearch_stemword (utf8word);
  return to_uni (utf8stem);
}

/**
 * search directs the whole execution of a search: it checks its
 * preconditions and then calls the other functions of this class in turn.
 *
 * queryresults is always cleared first. The function returns true
 *  - when the cache (if one is set) already holds results for queryparams,
 *  - when the query string has no unicode letter or digit in it (the
 *    results are then left empty), or
 *  - after the query has been run against the loaded index.
 * It returns false only when the index could not be made current.
 */
bool mgsearchclass::search(const queryparamclass &queryparams,
                           queryresultsclass &queryresults) {
  // start from a clean slate
  queryresults.clear();

  // a cache hit means there is nothing left to do
  if (cache != NULL && cache->find(queryparams, queryresults)) return true;

  // nothing to search for
  if (!has_unicode_letdig(queryparams.querystring)) return true;

  // without a loaded index the query cannot be run
  const bool indexloaded =
    makeindexcurrent (queryparams.index, queryparams.subcollection,
                      queryparams.language, queryparams.collection);
  if (!indexloaded) return false;

  setsearchmode (queryparams);             // initialise the form of results
  submitquery (queryparams);               // execute the query
  getresults (queryparams, queryresults);  // retrieve the results

  return true;
}

/* accumulator_method has been changed to use array rather than list.
list appears to be broken somewhat - for some ranked queries, it returned 
fewer results than it should have (eg 45 instead of 50). The three other
methods (array, splay_tree, hash_table) all return the same number of 
documents, in the same order, with the same ranks. list returns what 
appears to be the same documents (but less of them), but with different ranks,
and in a different order. Minimal time tests don't show any speed improvement
of list over array (maybe because it's broken??).  [02/2001, kjm18]

... [sjboddie, also 02/2001] turns out that changing the accumulator_method
introduced a more serious bug than it fixed (i.e. occasionally when doing a
ranked search for a very common word you get no results at all). I've
changed it back to list for now, one day we should play with other
accumulator_methods but for now I don't have time and don't want to risk
introducing bugs (better the devil you know ;)
*/
/**
 * setsearchmode sends mg the settings a query is to be run with: a fixed
 * set of options first, then the ones taken from queryparams
 * (search_type, casefolding, stemming and maxdocs).
 * Values of search_type, casefolding and stemming other than 0 and 1
 * leave the corresponding mg setting untouched.
 */
void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
{
  // settings that are the same for every query
  mgq_ask(".set expert true");
  mgq_ask(".set sorted_terms true");
  mgq_ask(".set accumulator_method list");
  mgq_ask(".set max_accumulators 500000");
  mgq_ask(".set maxparas 500000");
  mgq_ask(".set verbatim true");
  mgq_ask(".unset skip_dump");
  mgq_ask(".set mode docnums");

  // 0 selects a boolean query, 1 a ranked one
  if (queryparams.search_type == 0) mgq_ask(".set query boolean");
  else if (queryparams.search_type == 1) mgq_ask(".set query ranked");

  if (queryparams.casefolding == 1) mgq_ask(".set casefold on");
  else if (queryparams.casefolding == 0) mgq_ask(".set casefold off");

  if (queryparams.stemming == 1) mgq_ask(".set stem on");
  else if (queryparams.stemming == 0) mgq_ask(".set stem off");

  mgq_ask(".set heads_length 150");

  // a maxdocs of -1 stands for 'all'
  if (queryparams.maxdocs != -1) {
    char maxdocscmd[32];
    sprintf(maxdocscmd, ".set maxdocs %i", queryparams.maxdocs);
    mgq_ask(maxdocscmd);
  } else {
    mgq_ask(".set maxdocs all");
  }
}

/**
 * submitquery constructs the query string (in UTF8 encoding)
 * and submits it to the mg search engine using mgq_ask.
 *
 * The query string of queryparams is copied, run through filterquery
 * to blank out special characters and converted with to_utf8; queryparams
 * itself is not modified.
 */
void mgsearchclass::submitquery (const queryparamclass &queryparams)
{
  // sort out the query string; copy it, remove all special characters
  // and then convert it to a string in UTF8 format
  text_t ttquerystring = queryparams.querystring;
  filterquery (ttquerystring);
  char *querystring = to_utf8(ttquerystring).getcstr();

  // submit the query
  mgq_ask(querystring);

  // destroy the temporary character array. It is an array, so it has to be
  // released with the array form of delete (assumes getcstr() allocates
  // with new[]); plain delete on it is undefined behaviour.
  delete [] querystring;
}

/**
 * getresults is called to retrieve the required data on the docs
 * which responded to the query submitted in submitquery.
 *
 * It calls the local mgquery (mgq) interface to MG several times to
 * obtain, in this order, the document numbers, the term frequencies
 * and the term variants, and then works out how many documents matched:
 *  - when mgq_docsretrieved reports a non-zero total, that total is used
 *    and is marked Approximate or Exact as mg says;
 *  - otherwise the number of documents actually collected is used. It is
 *    marked Exact unless a maxdocs limit was set and was reached, in
 *    which case it is marked MoreThan.
 */
void mgsearchclass::getresults (const queryparamclass &queryparams,
                                queryresultsclass &queryresults) {
  void *resultsink = (void *)(&queryresults);

  // document numbers: up to maxdocs of them, or MAXNUMDOCS when maxdocs
  // is -1 (no limit configured)
  int doclimit = MAXNUMDOCS;
  if (queryparams.maxdocs != -1) doclimit = queryparams.maxdocs;
  mgq_results(result_docnums, 0, doclimit, ourquerycallback, resultsink);

  // term frequencies
  mgq_results(result_termfreqs, 0, MAXNUMTERMS, termfreqcallback, resultsink);
  queryresults.sortuniqqueryterms();

  // term variants
  mgq_results(result_terms, 0, MAXNUMTERMS, termvariantscallback, resultsink);

  // ask mg how many documents it retrieved
  int retrieved = 0, approx = 0;
  mgq_docsretrieved (&retrieved, &approx);

  if (retrieved != 0) {
    queryresults.docs_matched = retrieved;
    if (approx) queryresults.is_approx = Approximate;
    else queryresults.is_approx = Exact;
    return;
  }

  // total not available (or really was zero): count what was collected
  queryresults.docs_matched = queryresults.docs.docset.size();
  const bool limitreached = (queryparams.maxdocs != -1) &&
    !(queryresults.docs_matched < queryparams.maxdocs);
  if (limitreached) queryresults.is_approx = MoreThan;
  else queryresults.is_approx = Exact;
}

/**
 * Tidies the given querystring in place: every character that is neither
 * a unicode letter or digit nor one of the boolean operator characters
 * ! & | ( ) is replaced by a space.
 */
void mgsearchclass::filterquery (text_t &ttquerystring) {
  for (text_t::iterator ch = ttquerystring.begin(), stop = ttquerystring.end();
       ch != stop; ch++) {
    // letters, digits and the boolean operators are kept
    const bool keep = is_unicode_letdig(*ch) ||
      (*ch == '!') || (*ch == '&') || (*ch == '|') ||
      (*ch == '(') || (*ch == ')');
    if (!keep) (*ch) = ' ';
  }
}


// the document text for 'docnum' is placed in 'output'
// docTargetDocument returns 'true' if it was able to
// try to get a document
// collection is needed to see if an index from the 
// collection is loaded. If no index has been loaded
// defaultindex is needed to load one
bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
				      const text_t &defaultsubcollection,
				      const text_t &defaultlanguage,
				      const text_t &collection,
				      int docnum,
				      text_t &output) {
  output.clear();

  // get the mg version of the document
  char *mgdoc = NULL;
  int doclen = 0;
  if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
		   collection, docnum, mgdoc, doclen)) return false;
  if (mgdoc == NULL) return false;

  // replace all control-Cs with spaces
  char *mgdoc_here = mgdoc;
  char *mgdoc_end = mgdoc + doclen;
  while (mgdoc_here < mgdoc_end) {
    if (*mgdoc_here == '\x3') *mgdoc_here = ' ';
    mgdoc_here++;
  }

  // convert this document to unicode
  utf8inconvertclass inconvert;
  convertclass::status_t status;
  inconvert.reset ();
  inconvert.setinput (mgdoc, doclen);
  inconvert.convert (output, status);

  return true;
}


// mgdocument fetches the raw text of document 'docnum' from mg.
// On return UDoc/ULen hold the pointer and length that mg handed to
// doctextcallback; nothing is copied here (presumably the text stays
// owned by mg -- TODO confirm). Both are NULL/0 when the database could
// not be loaded or no text was delivered.
// Returns true if the database was loaded, false otherwise.
bool mgsearchclass::mgdocument (const text_t &defaultindex,
                                const text_t &defaultsubcollection,
                                const text_t &defaultlanguage,
                                const text_t &collection,
                                int docnum,
                                char *&UDoc, int &ULen) {
  UDoc = NULL; ULen = 0;

  // The default index is loaded on every call. An earlier shortcut that
  // first tried load_text_database(collection) is disabled; the revision
  // 1.19 log entry describes this as a work around for an mg problem when
  // retrieving documents through a paragraph based index.
  if (!makeindexcurrent (defaultindex, defaultsubcollection,
                         defaultlanguage, collection)) return false;

  // retrieve the document from mg: switch to text mode and query by
  // document number
  char docnumstr[32];
  sprintf(docnumstr, "%i", docnum);

  mgq_ask(".set mode text");
  mgq_ask(".set query docnums");
  mgq_ask(docnumstr);

  // doctextcallback leaves the text in tempdoc/templen
  tempdoc = NULL;
  templen = 0;
  mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
  UDoc = tempdoc;
  ULen = templen;

  return true;
}

