/**********************************************************************
 *
 * phrasesearch.cpp -- tools to search for a phrase in a larger text
 * Copyright (C) 1999  DigiLib Systems Limited
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/

#include "phrasesearch.h"
#include "gsdlunicode.h"

// Scans forward from `here`, skips any leading non-word material and
// collects the next word into `word`.
//
//   here - first byte to examine
//   end  - last valid byte of the buffer (inclusive: the loops below
//          test `here <= end`)
//   word - cleared on entry; receives the bytes of the word exactly as
//          they appear in the buffer, one byte per element. It is left
//          empty when no word is found before the buffer runs out.
//
// Text from '(' up to and including the next ')' (a note) and from '{'
// up to and including the next '}' (a composite character) is treated
// as non-word material and skipped.
//
// Returns the position to resume scanning from: just past the single
// non-word character that terminated the word, or a position beyond
// `end` when the buffer was exhausted.
inline unsigned char *parse_nonword_word (unsigned char *here, unsigned char *end,
                                          text_t &word) {
  int c_len = 0;
  unsigned short c = 0;

  word.clear();

  // parse non word
  while (here <= end) {
    c_len = parse_utf8_char (here, end, &c);

    if (c == '(' || c == '{') {
      // found a note "(...)" or a composite character "{...}":
      // consume everything up to and including the closing delimiter
      const unsigned short closer = (c == '(') ? ')' : '}';
      while (here <= end && c != closer) {
        c_len = parse_utf8_char (here, end, &c);
        here += c_len;
      }
      // `here` already points past the delimiter (or past `end`), so
      // start again with a freshly parsed character. Falling through
      // would advance by the stale c_len a second time (dropping the
      // byte that follows the delimiter) and, for an unterminated
      // note ending in a letter, would read from beyond `end`.
      continue;
    }

    if (is_unicode_letdig(c)) {
      // first character of the word: copy its bytes and switch to the
      // word loop below
      while (c_len > 0) {
        word.push_back(*here);
        here++; c_len--;
      }
      break;
    }

    here += c_len;
  }

  // parse word
  while (here <= end) {
    c_len = parse_utf8_char (here, end, &c);
    if (!is_unicode_letdig(c)) {
      here += c_len;  // it is ok to skip a nonword character
      break;
    }
    while (c_len > 0) {
      word.push_back(*here);
      here++; c_len--;
    }
  }

  return here;
}


bool doc_phrase_search (unsigned char *doc, int doclen,
			const termfreqclassarray &phrase) {
  // note: this uses the most braindead search routine :-)
  // however its not so bad as there shouldn't be many partial
  // matches

  // a null phrase matches anything
  if (phrase.empty()) return true;

  // if there is nothing then there can't be a match
  if (doc == NULL || doclen == 0) return false;
  
  text_t doc_word;
  doc_word.reserve (16);

  bool first = true;

  unsigned char *doc_here = doc;
  unsigned char *doc_herefirstword = doc;
  unsigned char *doc_end = doc+doclen-1; // unitool conventions :-/

  while (doc_here <= doc_end) {
    first = true;
    
    // there will be at least one member of phrase (see above)
    termfreqclassarray::const_iterator phrase_here = phrase.begin();
    termfreqclassarray::const_iterator phrase_end = phrase.end();
    do {
      // get the next non-word ... and ignore it, then get the next word
      doc_here = parse_nonword_word (doc_here, doc_end, doc_word);
      if (first) {doc_herefirstword = doc_here; first = false;}

      // break if this word is not the next in the phrase
      if ((*phrase_here).utf8equivterms.find (doc_word) ==
	  (*phrase_here).utf8equivterms.end()) break;

      phrase_here++;
    } while (doc_here <= doc_end && phrase_here != phrase_end);

    // see if we found a phrase
    if (phrase_here == phrase_end) return true;
    
    doc_here = doc_herefirstword; // set the counter back
  }

  return false;
}


// looks for the stemmed phrase in the metadata or text associated with
// an OID. This function has not been coded with all situations in mind
//
// longindex is read with two getdelimitstr calls using ':' as the
// delimiter, giving a level part followed by a granularity part. The
// granularity decides what is searched:
//   "text" or "all" - the document obtained from mgsearch.mgdocument
//                     for docnum
//   anything else   - each value of the field named by the granularity,
//                     looked up on the "section" entry that gdbm holds
//                     for docnum
//
// returns true when doc_phrase_search finds the phrase; returns false
// when it does not, when the granularity is empty, or when any of the
// lookups fails
bool OID_phrase_search (mgsearchclass &mgsearch,
                        gdbmclass &gdbm,
                        const text_t &index,
                        const text_t &subcollection,
                        const text_t &language,
                        const text_t &longindex,
                        const text_t &collection,
                        const termfreqclassarray &phrase,
                        int docnum) {
  // dissect the long index to find out where the text should come from
  text_t level, gran;
  text_t::const_iterator longindex_here = longindex.begin();
  text_t::const_iterator longindex_end = longindex.end();
  longindex_here = getdelimitstr (longindex_here, longindex_end, ':', level);
  longindex_here = getdelimitstr (longindex_here, longindex_end, ':', gran);

  if (gran.empty()) return false;

  // note that we're treating indexes with granularity of 'all' (i.e. text,Title,Creator)
  // as if they were simply 'text' indexes
  if (gran == "text" || gran == "all") {
    char *doc = NULL;
    int doclen = 0;

    // get text from mg.
    if (!mgsearch.mgdocument (index, subcollection, language, collection,
                              docnum, doc, doclen)) return false;
    return doc_phrase_search ((unsigned char *)doc, doclen, phrase);
  }

  // get OID
  infodbclass docnum_info;
  if (!gdbm.getinfo (docnum, docnum_info)) return false;
  text_t &OID = docnum_info["section"];
  if (OID.empty()) return false;

  // get field
  infodbclass OID_info;
  if (!gdbm.getinfo (OID, OID_info)) return false;

  text_tarray *tarr_ptr = OID_info.getmultinfo (gran);
  if (tarr_ptr == NULL) return false;

  // search every value of the field; the first hit wins
  text_tarray::const_iterator subvalue_here = (*tarr_ptr).begin();
  text_tarray::const_iterator subvalue_end = (*tarr_ptr).end();
  while (subvalue_here != subvalue_end) {
    // an iterator inside [begin, end) always refers to an element, so no
    // null test is needed (and comparing an iterator with NULL is not
    // portable)
    text_t::size_type metadata_len = 0;
    char *metadata = (to_utf8(*subvalue_here)).getcarr(metadata_len);
    bool found = doc_phrase_search ((unsigned char *)metadata,
                                    (int)metadata_len, phrase);
    // the array handed back by getcarr is released here
    delete [] metadata;

    if (found) return true;

    ++subvalue_here;
  }

  return false;
}
