/**********************************************************************
 *
 * text_t.cpp -- a simple 16-bit character string class
 * Copyright (C) 1999  The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id: text_t.cpp,v 1.20 2001/01/25 18:26:44 cs025 Exp $
 *
 *********************************************************************/

/*
   $Log: text_t.cpp,v $
   Revision 1.20  2001/01/25 18:26:44  cs025
   Included CORBA branch for first time

   Revision 1.15.2.2  2000/04/05 10:19:38  syeates
   added automatic conversion to allow text_t's to be <<'ed to ostreams

   Revision 1.15.2.1  2000/04/04 15:02:29  cs025
   Corba first commit

   Revision 1.15  1999/10/14 22:52:39  sjboddie
   joinchar can join using text_t string now too

   Revision 1.14  1999/09/24 02:30:03  rjmcnab
   added function has_unicode_letdig

   Revision 1.13  1999/09/07 04:57:43  sjboddie
   added gpl notice

   Revision 1.12  1999/08/31 08:04:41  rjmcnab
   Fixed a small but hard to find bug in getcarr

   Revision 1.11  1999/07/01 04:05:09  rjmcnab
   Optimised append functions slightly and added a reserve function.

   Revision 1.10  1999/04/26 03:58:03  sjboddie
   added is_number function

   Revision 1.9  1999/04/06 22:17:24  rjmcnab
   Added splits and joins using text_tset.

   Revision 1.8  1999/02/28 23:14:41  rjmcnab

   Added uc and lc to convert to uppercase and lowercase.

   Revision 1.7  1999/02/21 22:26:39  rjmcnab

   Made getint() a constant function.

   Revision 1.6  1999/02/03 01:13:26  sjboddie

   Got interface to handle subcollections and language subcollections -
   committed changes made to some of the collections

   Revision 1.5  1999/01/19 01:38:14  rjmcnab

   Made the source more portable.

   Revision 1.4  1999/01/12 01:51:00  rjmcnab

   Standard header.

   Revision 1.3  1999/01/08 02:33:16  rjmcnab

   Added standard header to source files.

 */

#include "text_t.h"

#if defined(GSDL_USE_OBJECTSPACE)
#  include <ospace\std\algorithm>
#elif defined(GSDL_USE_STL_H)
#  if defined(GSDL_USE_ALGO_H)
#    include <algo.h>
#  else
#    include <algorithm.h>
#  endif
#else
#  include <algorithm>
#endif

#ifdef HAVE_CONFIG_H
# ifdef __WIN32__
#  include "WIN32cfg.h"
# else
#  include "config.h"
# endif
#endif


#include "unitool.h"

////////////////////////////////////
// text_t methods
////////////////////////////////////

// Stream inserter so that a text_t can be written directly to an ostream.
// Each 16-bit value below 256 is written as a single unsigned char. Any
// wider value is replaced by ' ' when is_unicode_space() accepts it and
// by '?' otherwise, so the reader can see that information was dropped.
ostream& operator<< (ostream &o, const text_t text)
{
  const text_t::const_iterator stop = text.end();

  for (text_t::const_iterator cur = text.begin(); cur != stop; ++cur)
    {
      if (*cur >= 256)
	{
	  // not representable in one byte: substitute a placeholder
	  o << (is_unicode_space (*cur) ? ' ' : '?');
	  continue;
	}
      o << (unsigned char)(*cur);
    }

  return o;
}

// Default constructor: selects encoding 0 and then empties the string.
text_t::text_t ()
{
  this->setencoding (0);
  this->clear ();
}

// Construct from an integer: starts from an empty string with encoding 0
// and appends the decimal representation of i (see appendint).
text_t::text_t (int i)
{
  this->setencoding (0);
  this->clear ();
  this->appendint (i);
}

// Construct from a NUL-terminated C string: starts from an empty string
// with encoding 0 and appends the bytes of s (see appendcstr).
text_t::text_t (char *s)
{
  this->setencoding (0);
  this->clear ();
  this->appendcstr (s);
}


void text_t::append (const text_t &t) 
{
  text.insert(text.end(), t.begin(), t.end());
  //  const_iterator here, end=t.end();
  //  for (here=t.begin(); here!=end;here++)
  //    {
  //      text.push_back(*here);
  //    }
}

// Append the elements in [first, last) to the end of this string
// (mutable-iterator overload).
void text_t::appendrange (iterator first, iterator last)
{
  // one range insert at the tail
  text.insert (text.end(), first, last);
}

// Append the elements in [first, last) to the end of this string
// (const-iterator overload).
void text_t::appendrange (const_iterator first, const_iterator last)
{
  // one range insert at the tail
  text.insert (text.end(), first, last);
}

// Append the decimal representation of i to the end of this string.
// A leading '-' is written for negative values; zero is written as "0".
void text_t::appendint (int i)
{
  // zero has no digits to generate in the loop below
  if (i == 0)
    {
      text.push_back('0');
      return;
    }

  // Work on the magnitude as an unsigned value. Negating the most
  // negative int in signed arithmetic overflows, whereas unsigned
  // negation is well defined and yields the correct magnitude.
  unsigned int magnitude;
  if (i < 0)
    {
      text.push_back('-');
      magnitude = 0u - (unsigned int)i;
    }
  else
    {
      magnitude = (unsigned int)i;
    }

  // Three characters per byte is always enough room for the decimal
  // digits of an int. A stack buffer avoids the heap allocation (and
  // the mismatched new[] / delete that went with it).
  char buf[sizeof(int)*3];
  int len = 0;

  // generate the digits least-significant first
  while (magnitude > 0)
    {
      buf[len++] = (char)('0' + (magnitude % 10));
      magnitude /= 10;
    }

  // emit them in the right order
  while (len > 0)
    {
      text.push_back(buf[--len]);
    }
}

int text_t::getint () const
{
  int i = 0;
  int mult = 1; // become -1 for negative numbers

  const_iterator here = text.begin();
  const_iterator end = text.end();
  
  // do plus and minus signs
  if (here != end)
    {
      if (*here == '-')
	{
	  mult = -1;
	  here++;
	}
      else if (*here == '+')
	{
	  mult = 1;
	  here++;
	}
    }

  // deal with the number
  while ((here != end) && (*here >= '0') && (*here <= '9'))
    {
      i = 10*i + (*here - '0');
      here++;
    }

  i *= mult;
  return i;
}



void text_t::appendcarr (char *s, size_type len)
{
  unsigned char *us = (unsigned char *)s;
  while (len > 0) 
    {
      text.push_back (*us); // append this character
      us++;
      len--;
    }
}

// Append a NUL-terminated C string to the end of this string. Each byte
// is read as an unsigned char so that values above 127 are not
// sign-extended. A NULL pointer is treated as an empty string.
void text_t::appendcstr (char *s) 
{
  // guard against NULL rather than dereferencing it
  if (s == NULL) return;

  unsigned char *us = (unsigned char *)s;
  while (*us != '\0') 
    {
      text.push_back (*us); // append this character
      us++;
    }
}


// strings returned from getcarr and getcstr become the caller's
// responsibility and should be deallocated with "delete []"

// Return a newly allocated 8-bit copy of the string, without a
// terminating NUL; len receives the number of bytes written.
// Values below 256 are copied as-is. Wider values become ' ' when
// is_unicode_space() accepts them and '?' otherwise.
// The buffer is allocated with new[] and is owned by the caller.
char *text_t::getcarr(size_type &len) const
{
  unsigned char *buf = new unsigned char[size()];
  const const_iterator stop = end();

  len = 0;
  for (const_iterator cur = begin(); cur != stop; ++cur, ++len)
    {
      unsigned char byte;
      if (*cur < 256) byte = (unsigned char)(*cur);
      else if (is_unicode_space (*cur)) byte = ' ';
      else byte = '?'; // tells the user that information is missing
      buf[len] = byte;
    }

  return (char *)buf;
}

char *text_t::getcstr() const
{
  unsigned char *cstr = new unsigned char[size() + 1];
  const_iterator ithere = begin();
  const_iterator itend = end();
  int len = 0;

  while (ithere != itend)
    {
      if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
      else {
	// put a space or a question mark depending on what
	// the character is. Question marks tell the user that
	// they are missing some information.
	if (is_unicode_space (*ithere)) cstr[len] = ' ';
	else cstr[len] = '?';
      }
      len++;
      ithere++;
    }

  cstr[len] = '\0';

  return (char *)cstr;
}


// general functions which work on text_ts

// Find the first occurrence of c in [first, last).
// Returns an iterator to the match, or last if c does not occur.
text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last, 
				 unsigned short c)
{
  for (; first != last; ++first)
    {
      if (*first == c) return first;
    }
  return last;
}

// Mutable-iterator overload: find the first occurrence of c in
// [first, last). Returns an iterator to the match, or last if c does
// not occur.
text_t::iterator findchar (text_t::iterator first, text_t::iterator last, 
			   unsigned short c)
{
  for (; first != last; ++first)
    {
      if (*first == c) return first;
    }
  return last;
}

// Find the first occurrence of word inside [first, last).
// Returns an iterator to the start of the match, or last if there is
// no match. An empty word matches at first whenever the range is
// non-empty.
text_t::iterator findword (text_t::iterator first, text_t::iterator last, 
			   const text_t& word)
{
  const text_t::const_iterator word_begin = word.begin();
  const text_t::const_iterator word_end = word.end();

  for (; first != last; ++first)
    {
      text_t::iterator probe = first;
      text_t::const_iterator word_here = word_begin;

      // Compare element by element. The probe must stop at last: a
      // partial match that reaches the end of the range must not read
      // beyond it.
      while (word_here != word_end && probe != last && *probe == *word_here)
	{
	  ++probe;
	  ++word_here;
	}

      // the whole word was consumed, so this position is a match
      if (word_here == word_end) return first;
    }

  return last; // get to here only if there is no match
}

// Copy the text from first up to (not including) the next occurrence of
// c into outstr, replacing its previous contents. If c is not found the
// whole of [first, last) is copied.
// Returns the position just after the delimiter, or last if there was
// no delimiter.
text_t::const_iterator getdelimitstr (text_t::const_iterator first, 
				      text_t::const_iterator last,
				      unsigned short c, text_t &outstr)
{
  const text_t::const_iterator delim = findchar (first, last, c);

  outstr.clear();
  outstr.appendrange (first, delim);

  text_t::const_iterator next = delim;
  if (next != last) ++next; // step over the delimiter
  return next;
}

// Mutable-iterator overload: copy the text from first up to (not
// including) the next occurrence of c into outstr, replacing its
// previous contents. Returns the position just after the delimiter, or
// last if there was no delimiter.
text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
				unsigned short c, text_t &outstr)
{
  const text_t::iterator delim = findchar (first, last, c);

  outstr.clear();
  outstr.appendrange (first, delim);

  text_t::iterator next = delim;
  if (next != last) ++next; // step over the delimiter
  return next;
}

// Split [first, last) at every occurrence of c and store the pieces in
// outlist (a text_tset), replacing whatever it held before.
// An empty range yields an empty result, and a delimiter at the very
// end of the range does not produce a trailing empty piece.
void splitchar (text_t::const_iterator first, text_t::const_iterator last,
		unsigned short c, text_tset &outlist)
{
  outlist.erase (outlist.begin(), outlist.end());

  text_t piece;
  for (text_t::const_iterator cur = first; cur != last; )
    {
      cur = getdelimitstr (cur, last, c, piece);
      outlist.insert (piece);
    }
}

// Split [first, last) at every occurrence of c and store the pieces, in
// order, in outlist (a text_tlist), replacing whatever it held before.
// An empty range yields an empty result, and a delimiter at the very
// end of the range does not produce a trailing empty piece.
void splitchar (text_t::const_iterator first, text_t::const_iterator last,
		unsigned short c, text_tlist &outlist)
{
  outlist.erase (outlist.begin(), outlist.end());

  text_t piece;
  for (text_t::const_iterator cur = first; cur != last; )
    {
      cur = getdelimitstr (cur, last, c, piece);
      outlist.push_back (piece);
    }
}

// Split [first, last) at every occurrence of c and store the pieces, in
// order, in outlist (a text_tarray), replacing whatever it held before.
// An empty range yields an empty result, and a delimiter at the very
// end of the range does not produce a trailing empty piece.
void splitchar (text_t::const_iterator first, text_t::const_iterator last,
		unsigned short c, text_tarray &outlist)
{
  outlist.erase (outlist.begin(), outlist.end());

  text_t piece;
  for (text_t::const_iterator cur = first; cur != last; )
    {
      cur = getdelimitstr (cur, last, c, piece);
      outlist.push_back (piece);
    }
}

// join a string using a character
void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext)
{
  outtext.clear ();

  text_tset::const_iterator here = inlist.begin ();
  text_tset::const_iterator end = inlist.end ();
  bool first = true;
  while (here != end)
    {
      if (!first) outtext.push_back (c);
      first = false;
      outtext += *here;
      here++;
    }
}

void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext)
{
  outtext.clear ();

  text_tlist::const_iterator here = inlist.begin ();
  text_tlist::const_iterator end = inlist.end ();
  bool first = true;
  while (here != end)
    {
      if (!first) outtext.push_back (c);
      first = false;
      outtext += *here;
      here++;
    }
}

void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext)
{
  outtext.clear ();

  text_tarray::const_iterator here = inlist.begin ();
  text_tarray::const_iterator end = inlist.end ();
  bool first = true;
  while (here != end)
    {
      if (!first) outtext.push_back (c);
      first = false;
      outtext += *here;
      here++;
    }
}

void joinchar (const text_tlist &inlist, text_t c, text_t &outtext)
{
  outtext.clear ();

  text_tlist::const_iterator here = inlist.begin ();
  text_tlist::const_iterator end = inlist.end ();
  bool first = true;
  while (here != end)
    {
      if (!first) outtext += c;
      first = false;
      outtext += *here;
      here++;
    }
}

void joinchar (const text_tset &inlist, text_t c, text_t &outtext)
{
  outtext.clear ();

  text_tset::const_iterator here = inlist.begin ();
  text_tset::const_iterator end = inlist.end ();
  bool first = true;
  while (here != end)
    {
      if (!first) outtext += c;
      first = false;
      outtext += *here;
      here++;
    }
}

void joinchar (const text_tarray &inlist, text_t c, text_t &outtext)
{
  outtext.clear ();

  text_tarray::const_iterator here = inlist.begin ();
  text_tarray::const_iterator end = inlist.end ();
  bool first = true;
  while (here != end)
    {
      if (!first) outtext += c;
      first = false;
      outtext += *here;
      here++;
    }
}

// Count the occurrences of the character c within [first, last).
int countchar (text_t::const_iterator first, text_t::const_iterator last,
	       unsigned short c)
{
  int hits = 0;
  for (; first != last; ++first) {
    if (*first == c) ++hits;
  }
  return hits;
}

// Return a new text_t holding a copy of the elements from first up to,
// but not including, last.
text_t substr (text_t::const_iterator first, text_t::const_iterator last) {

  text_t piece;
  for (; first != last; ++first) {
    piece.push_back(*first);
  }
  return piece;
}


// Convert every element of [first, last) to lowercase in place, using
// unicode_tolower.
void lc (text_t::iterator first, text_t::iterator last) {
  for (; first != last; ++first) {
    *first = unicode_tolower(*first);
  }
}

// Convert every element of [first, last) to uppercase in place, using
// unicode_toupper.
void uc (text_t::iterator first, text_t::iterator last) {
  for (; first != last; ++first) {
    *first = unicode_toupper(*first);
  }
}


// checks to see if it is a number (i.e. contains only 0-9)
bool is_number (const text_t &text) {

  text_t::const_iterator here = text.begin();
  text_t::const_iterator end = text.end();

  while (here != end) {
    if ((*here!='0') && (*here!='1') && (*here!='2') &&
	(*here!='3') && (*here!='4') && (*here!='5') &&
	(*here!='6') && (*here!='7') && (*here!='8') &&
	(*here!='9')) return false;
    here ++;
  }
  return true;
}


// checks to see if the text has any letters or digits
bool has_unicode_letdig (const text_t &text) {
  if (text.empty()) return false;
  
  text_t::const_iterator here = text.begin();
  text_t::const_iterator end = text.end();
  while (here != end) {
    if (is_unicode_letdig (*here)) return true;
    here++;
  }

  return false;
}



////////////////////////////////////
// convertclass methods
////////////////////////////////////

// conversion classes used for getting information in to and out of
// the text_t class.

// Constructor for convertclass; this function initialises nothing.
convertclass::convertclass () 
{
  // nothing to do
}

// Reset the converter. This implementation does nothing; the
// inconvertclass and outconvertclass versions in this file clear their
// own members.
void convertclass::reset ()
{
  // nothing to do
}


////////////////////////////////////
// inconvertclass methods
////////////////////////////////////

// convert from a char stream to the text_t class
// the default version assumes the input is an ASCII
// character array

// Construct a converter with no pending input: zero length and a NULL
// start pointer.
inconvertclass::inconvertclass ()
{
  len = 0;
  start = NULL;
}


// Forget any pending input: zero length and a NULL start pointer.
void inconvertclass::reset ()
{
  len = 0;
  start = NULL;
}

// Record the byte buffer that the next call to convert() will consume.
// Only the pointer and length are stored; the buffer is not copied.
void inconvertclass::setinput (char *thestart, size_t thelen)
{
  len = thelen;
  start = thestart;
}

// Convert all pending input bytes into output, one element per byte.
// output is cleared first. Afterwards len is 0, start points just past
// the consumed data, and status is always set to finished.
void inconvertclass::convert (text_t &output, status_t &status)
{
  output.clear();

  if (start != NULL && len != 0)
    {
      // read as unsigned so bytes above 127 are not sign-extended
      unsigned char *cursor = (unsigned char *)start;
      for (; len > 0; --len, ++cursor)
	{
	  output.push_back (*cursor); // append this character
	}

      start = (char *)cursor; // save current position
    }

  status = finished;
}

// Treats the text_t as an 8-bit string (only the low byte of each
// element is used) and converts it to a 16-bit string using the above
// convert method. The input is fed through in chunks of at most 256
// bytes, and the result is given encoding 0.
text_t inconvertclass::convert (const text_t &t) {
  text_t result;
  text_t chunk;
  status_t status;
  unsigned char cbuf[256];

  text_t::const_iterator cur = t.begin();
  const text_t::const_iterator stop = t.end();

  while (cur != stop) {
    // fill the byte buffer; at least one byte is stored because
    // cur != stop on entry
    size_t filled = 0;
    for (; cur != stop && filled < sizeof(cbuf); ++cur) {
      cbuf[filled++] = (unsigned char)(*cur & 0xff);
    }

    // run the converter until it reports that the chunk is consumed
    setinput ((char *)cbuf, filled);
    status = unfinished;
    do {
      convert (chunk, status);
      result += chunk;
    } while (status == unfinished);
  }

  result.setencoding (0); // unicode

  return result;
}

// A shared instance of the default inconvertclass for simple
// conversions. The converter keeps its input pointer and length as
// member state, so any function that uses this instance is not
// reentrant. A function that needs to be reentrant should declare
// its own instance.
inconvertclass ascii2text_t;


////////////////////////////////////
// outconvertclass methods
////////////////////////////////////

// Convert from a text_t class to a char stream.
// This default version assumes the output is an ASCII
// character array. If you set the output stream you
// can use this class to output to a stream using the
// << operator. The << operator can also be conveniently
// used to set the output stream by doing something like
//
// cout << text_t2ascii << text_tstr << anothertext_tstr;
//
// Construct a converter with no output stream and no input text attached.
outconvertclass::outconvertclass ()
{
  outs = NULL;
  input = NULL;
}

// Detach both the output stream and the input text.
void outconvertclass::reset ()
{
  outs = NULL;
  input = NULL;
}

// Attach the text that later convert() calls will read from. When the
// pointer is not NULL the read position is rewound to its beginning.
// Only the pointer is stored; the text is not copied.
void outconvertclass::setinput (text_t *theinput)
{
  input = theinput;
  if (input == NULL) return;

  texthere = input->begin();
}

// Convert as much of the attached input text as fits into output.
//   output - destination byte buffer
//   maxlen - capacity of output in bytes
//   len    - receives the number of bytes written (0 on early exit)
//   status - finished once the whole input has been consumed, unfinished
//            if the buffer filled up first (call again to continue)
// Values below 256 are copied as-is. Wider values become ' ' when
// is_unicode_space() accepts them and '?' otherwise.
void outconvertclass::convert (char *output, size_t maxlen, 
		      size_t &len, status_t &status)
{
  // Always give the caller a defined byte count, including on the
  // early-exit path below where nothing is written.
  len = 0;

  if (input == NULL || output == NULL)
    {
      status = finished;
      return;
    }

  // don't want any funny sign conversions happening
  unsigned char *uoutput = (unsigned char *)output;
  text_t::iterator textend = input->end();
  while ((len < maxlen) && (texthere != textend)) 
    {
      if (*texthere < 256) *uoutput = (unsigned char)(*texthere);
      else {
	// put a space or a question mark depending on what
	// the character is. Question marks tell the user that
	// they are missing some information.
	if (is_unicode_space (*texthere)) *uoutput = ' ';
	else *uoutput = '?';
      }
      ++uoutput;
      ++len;
      ++texthere;
    }
  
  // texthere is a member, so the next call resumes where this one stopped
  if (texthere == textend) status = finished;
  else status = unfinished;
}

// Converts the 16-bit string to an 8-bit stream and returns the result
// in a text_t, one element per output byte. The work is done by the
// above convert function in chunks of at most 256 bytes, and the result
// is given encoding 1.
text_t outconvertclass::convert (const text_t &t) {
  text_t result;
  unsigned char cbuf[256];
  size_t produced = 0;
  status_t status = unfinished;

  setinput ((text_t *)&t); // discard constant

  // always runs at least once, as status starts out unfinished
  do {
    convert ((char *)cbuf, sizeof(cbuf), produced, status);
    result.appendcarr ((char *)cbuf, produced);
  } while (status == unfinished);

  result.setencoding (1); // other encoding

  return result;
}


// Remember the stream that converted output should be written to.
// Only the pointer is stored.
void outconvertclass::setostream (ostream *theouts)
{
  this->outs = theouts;
}

// Return the stream previously given to setostream, or NULL if none
// has been set since construction or the last reset.
ostream *outconvertclass::getostream ()
{
  return this->outs;
}




// A shared instance of the default outconvertclass for simple
// conversions. It keeps its input pointer, read position and output
// stream as member state.
outconvertclass text_t2ascii;



// stream operators for the output class

// "stream << converter": attach the stream to the converter and return
// the converter, so that later << operands are routed through it.
outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter)
{
  ostream *target = &theouts;
  outconverter.setostream (target);
  return outconverter;
}


#define STREAMBUFSIZE 256
// "converter << text": convert t through the converter and write the
// resulting bytes to the converter's attached stream, STREAMBUFSIZE
// bytes at a time. Does nothing if no stream has been attached.
outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t)
{
  ostream *sink = outconverter.getostream();

  if (sink != NULL)
    {
      char outbuf[STREAMBUFSIZE];
      size_t produced;
      outconvertclass::status_t status = outconvertclass::unfinished;

      // assume that there is no data needing converting
      // left in the converter
      outconverter.setinput ((text_t *)(&t)); // note the const -> nonconst conversion

      // always runs at least once, as status starts out unfinished
      do
	{
	  outconverter.convert (outbuf, STREAMBUFSIZE, produced, status);
	  if (produced > 0) sink->write (outbuf, produced);
	}
      while (status == outconvertclass::unfinished);
    }

  return outconverter;
}
