/*
 * This file is part of libswish3
 * Copyright (C) 2010 Peter Karman
 *
 *  libswish3 is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  libswish3 is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with libswish3; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
 /**
  * This file is automatically generated from the individual src .c and .h files
  * that are part of the libswish3 distribution. The guiding purpose of this file
  * is to allow for easier distribution in language bindings. See the
  * bindings/perl/3.xs file in the libswish3 distribution for one example.
  */

#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif

/*
 * Make alloca() available.  <alloca.h> is used when HAVE_ALLOCA_H is defined;
 * otherwise alloca is mapped to a compiler-specific name (GCC, AIX, MSVC).
 * When none of those apply and HAVE_ALLOCA is not defined, a plain
 * prototype is declared by hand.
 */
#ifdef HAVE_ALLOCA_H
# include <alloca.h>
#elif defined __GNUC__
# define alloca __builtin_alloca
#elif defined _AIX
# define alloca __alloca
#elif defined _MSC_VER
# include <malloc.h>
# define alloca _alloca
#else
# ifndef HAVE_ALLOCA
#  ifdef  __cplusplus
extern "C"
#  endif
void *alloca (size_t);
# endif
#endif

#include <stdio.h>
#include <locale.h>
#include <stdarg.h>
#include <assert.h>
#include <wchar.h>
#include <limits.h>
#include <errno.h>
#include <string.h>
#include <ctype.h>
#include <wctype.h>
#include <dirent.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <stdint.h>
#include <inttypes.h>
#include <time.h>

#if defined (HAVE_GETRUSAGE) && defined (HAVE_SYS_RESOURCE_H)
#include <sys/time.h>
#include <sys/resource.h>
#endif

#ifdef HAVE_TIMES
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif
#include <sys/times.h>
#endif

#include <zlib.h>

#include <libxml/parserInternals.h>
#include <libxml/parser.h>
#include <libxml/hash.h>
#include <libxml/xmlstring.h>
#include <libxml/HTMLparser.h>
#include <libxml/globals.h>
#include <libxml/xmlerror.h>
#include <libxml/tree.h>
#include <libxml/debugXML.h>
#include <libxml/xmlmemory.h>
#include <libxml/xmlreader.h>
#include <libxml/xmlwriter.h>
#include <libxml/encoding.h>
#include <libxml/xinclude.h>
#include <libxml/uri.h>


/*
 * Marks this translation unit as the amalgamated build.  Sections further
 * down test this macro with #ifndef and skip their own #include lines when
 * it is defined.
 */
#define LIBSWISH3_SINGLE_FILE 1



/*************** start libswish3.h ************/
/*
 * This file is part of libswish3
 * Copyright (C) 2007 Peter Karman
 *
 *  libswish3 is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  libswish3 is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with libswish3; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */


#ifndef __LIBSWISH3_H__
#define __LIBSWISH3_H__

/*
 * Headers needed by the declarations below.  Skipped when
 * LIBSWISH3_SINGLE_FILE is defined.
 */
#ifndef LIBSWISH3_SINGLE_FILE
#include <sys/types.h>
#include <stdint.h>
#include <inttypes.h>
#include <sys/stat.h>
#include <time.h>
#include <libxml/parser.h>
#include <libxml/hash.h>
#include <libxml/xmlstring.h>
#endif

/* version strings */
#define SWISH_LIB_VERSION           "1.0.200a024"
#define SWISH_VERSION               "3.0.0"

/* sizes and limits */
#define SWISH_BUFFER_CHUNK_SIZE     16384
#define SWISH_TOKEN_LIST_SIZE       1024
#define SWISH_MAXSTRLEN             2048
#define SWISH_MAX_HEADERS           6
#define SWISH_RD_BUFFER_SIZE        65536   /* NOTE(review): original author was unsure whether this is still used */
#define SWISH_MAX_WORD_LEN          256
#define SWISH_MIN_WORD_LEN          1
#define SWISH_STACK_SIZE            255  /* starting size for metaname/tag stack */
#define SWISH_CONTRACTIONS          1
#define SWISH_SPECIAL_ARG           1
#define SWISH_MAX_SORT_STRING_LEN   100

/* integer truth values */
#define SWISH_TRUE                  1
#define SWISH_FALSE                 0

/* strftime-style conversion specifiers; NOTE(review): the consumer is not visible here */
#define SWISH_DATE_FORMAT_STRING    "%Y-%m-%d %H:%M:%S %Z"
#define SWISH_URL_LENGTH            255

/*
 * default config hash key names
 *
 * All values are string literals.  Note that SWISH_DEFAULT_PARSER_TYPE and
 * SWISH_PARSER_HTML expand to the same string ("HTML").
 */
#define SWISH_HEADER_ROOT           "swish"
#define SWISH_INCLUDE_FILE          "IncludeConfigFile"
#define SWISH_CLASS_ATTRIBUTES      "XMLClassAttributes"
#define SWISH_PROP                  "PropertyNames"
#define SWISH_META                  "MetaNames"
#define SWISH_MIME                  "MIME"
#define SWISH_PARSERS               "Parsers"
#define SWISH_INDEX                 "Index"
#define SWISH_ALIAS                 "TagAlias"
#define SWISH_WORDS                 "Words"
#define SWISH_DEFAULT_PARSER        "default"
#define SWISH_PARSER_TXT            "TXT"
#define SWISH_PARSER_XML            "XML"
#define SWISH_PARSER_HTML           "HTML"
#define SWISH_DEFAULT_PARSER_TYPE   "HTML"
#define SWISH_INDEX_FORMAT          "Format"
#define SWISH_INDEX_NAME            "Name"
#define SWISH_INDEX_LOCALE          "Locale"
#define SWISH_INDEX_STEMMER_LANG    "Stemmer"
#define SWISH_DEFAULT_VALUE         "1"
#define SWISH_TOKENIZE              "Tokenize"
#define SWISH_CASCADE_META_CONTEXT  "CascadeMetaContext"
#define SWISH_IGNORE_XMLNS          "IgnoreXMLNameSpaces"
#define SWISH_FOLLOW_XINCLUDE       "FollowXInclude"
#define SWISH_UNDEFINED_METATAGS    "UndefinedMetaTags"
#define SWISH_UNDEFINED_XML_ATTRIBUTES "UndefinedXMLAttributes"

/* tags */
#define SWISH_DEFAULT_METANAME    "swishdefault"
#define SWISH_TITLE_METANAME      "swishtitle"
#define SWISH_TITLE_TAG           "title"
#define SWISH_BODY_TAG            "body"

/* mimes */
#define SWISH_DEFAULT_MIME        "text/html"

/* indexes */
#define SWISH_INDEX_FILENAME      "index.swish"
#define SWISH_XAPIAN_FORMAT       "Xapian"
#define SWISH_SWISH_FORMAT        "Native"
#define SWISH_ESTRAIER_FORMAT     "Estraier"
#define SWISH_KINOSEARCH_FORMAT   "KinoSearch"
#define SWISH_LUCY_FORMAT         "Lucy"
#define SWISH_INDEX_FILEFORMAT    "Native"
#define SWISH_HEADER_FILE         "swish.xml"

/* properties: type codes (presumably the values stored in swish_Property.type -- TODO confirm) */
#define SWISH_PROP_STRING          1
#define SWISH_PROP_DATE            2
#define SWISH_PROP_INT             3

/* names of the built-in properties */
#define SWISH_PROP_RECCNT          "swishreccount"
#define SWISH_PROP_RANK            "swishrank"
#define SWISH_PROP_DOCID           "swishfilenum"
#define SWISH_PROP_DOCPATH         "swishdocpath"
#define SWISH_PROP_DBFILE          "swishdbfile"
#define SWISH_PROP_TITLE           "swishtitle"
#define SWISH_PROP_SIZE            "swishdocsize"
#define SWISH_PROP_MTIME           "swishlastmodified"
#define SWISH_PROP_DESCRIPTION     "swishdescription"
#define SWISH_PROP_MIME            "swishmime"
#define SWISH_PROP_PARSER          "swishparser"
#define SWISH_PROP_NWORDS          "swishwordnum"
#define SWISH_PROP_ENCODING        "swishencoding"

/* single-character and short string constants (not property names) */
#define SWISH_TOKENPOS_BUMPER      "\3"     /* one-byte string holding octal 3 */
#define SWISH_DOT                  '.'
#define SWISH_SPACE                ' '
#define SWISH_DOM_CHAR             '/'
#define SWISH_DOM_STR              "/"
#define SWISH_XMLNS_CHAR           ':'

/*
 * error codes
 *
 * Only one code is defined; numbering starts at 1.
 * NOTE(review): presumably the values accepted by swish_err_msg() -- confirm.
 */
typedef enum {
    SWISH_ERR_NO_SUCH_FILE = 1
} SWISH_ERR_CODES;

/*
 * built-in id values
 *
 * Ids of the built-in metanames, numbered from 0.  The final enumerator
 * therefore equals the number of built-in ids (2); as its name says, it
 * must remain the last entry.
 */
typedef enum {
    SWISH_META_DEFAULT_ID = 0,
    SWISH_META_TITLE_ID,
    SWISH_META_THIS_MUST_COME_LAST_ID
} SWISH_META_ID;

/*
 * special since not stored
 *
 * The rank id is negative so it sits outside the range of SWISH_PROP_ID.
 * The expansion is parenthesized so the macro behaves as a single operand
 * in any expression.
 */
#define SWISH_PROP_RANK_ID  (-1)

/*
 * Ids of the built-in properties, numbered from 0.  The final enumerator
 * equals the number of built-in ids (11); as its name says, it must remain
 * the last entry.
 */
typedef enum {
    SWISH_PROP_DOCID_ID = 0,
    SWISH_PROP_DOCPATH_ID,
    SWISH_PROP_DBFILE_ID,
    SWISH_PROP_TITLE_ID,
    SWISH_PROP_SIZE_ID,
    SWISH_PROP_MTIME_ID,
    SWISH_PROP_DESCRIPTION_ID,
    SWISH_PROP_NWORDS_ID,
    SWISH_PROP_MIME_ID,
    SWISH_PROP_PARSER_ID,
    SWISH_PROP_ENCODING_ID,
    SWISH_PROP_THIS_MUST_COME_LAST_ID
} SWISH_PROP_ID;

/*
 * parser settings for undefined tags and attributes
 *
 * One enum holds two families: SWISH_UNDEF_METAS_* (values 0-4) and
 * SWISH_UNDEF_ATTRS_* (values 5-10).  Each family has its own default.
 * NOTE(review): presumably stored in swish_ConfigFlags.undef_metas and
 * .undef_attrs respectively -- confirm.
 */
typedef enum {
    SWISH_UNDEF_METAS_INDEX = 0,    /* default */
    SWISH_UNDEF_METAS_ERROR,
    SWISH_UNDEF_METAS_IGNORE,
    SWISH_UNDEF_METAS_AUTO,
    SWISH_UNDEF_METAS_AUTOALL,
    SWISH_UNDEF_ATTRS_DISABLE,      /* default */
    SWISH_UNDEF_ATTRS_ERROR,
    SWISH_UNDEF_ATTRS_IGNORE,
    SWISH_UNDEF_ATTRS_INDEX,
    SWISH_UNDEF_ATTRS_AUTO,
    SWISH_UNDEF_ATTRS_AUTOALL
} SWISH_UNDEF;

/* xapian (maybe others) need string prefixes for metanames; each prefix is a one-letter string */
#define SWISH_PREFIX_URL            "U"
#define SWISH_PREFIX_MTIME          "T"


/*
 * utils
 *
 * NOTE: SWISH_MAX_WORD_LEN is also defined, with the identical value, among
 * the size limits near the top of this header.  An identical redefinition
 * is legal, but the two must be kept in sync.
 */
#define SWISH_MAX_WORD_LEN        256
#define SWISH_MAX_FILE_LEN        102400000 /* ~100 mb */

/*
 * Path separator as a char and as a string.  Native Windows builds (WIN32
 * without Cygwin) use a backslash; everything else uses a forward slash.
 * SWISH_EXT_SEP is the separator immediately followed by a dot.
 */
#if defined(WIN32) && !defined (__CYGWIN__)
#define SWISH_PATH_SEP             '\\'
#define SWISH_PATH_SEP_STR         "\\"
#define SWISH_EXT_SEP              "\\."
#else
#define SWISH_PATH_SEP             '/'
#define SWISH_PATH_SEP_STR         "/"
#define SWISH_EXT_SEP              "/."
#endif

/* character that introduces a file extension */
#define SWISH_EXT_CH               '.'

/* encodings: encoding and locale names, plus a numeric code for encoding errors */
#define SWISH_DEFAULT_ENCODING    "UTF-8"
#define SWISH_LATIN1_ENCODING     "ISO8859-1"
#define SWISH_LOCALE              "en_US.UTF-8"
#define SWISH_ENCODING_ERROR      100

/*
 * debugging levels
 *
 * Each value is a distinct power of two, so several levels can be OR-ed
 * together into a single integer.
 */
typedef enum {
    SWISH_DEBUG_DOCINFO     = 1,
    SWISH_DEBUG_TOKENIZER   = 2,
    SWISH_DEBUG_TOKENLIST   = 4,
    SWISH_DEBUG_PARSER      = 8,
    SWISH_DEBUG_CONFIG      = 16,
    SWISH_DEBUG_MEMORY      = 32,
    SWISH_DEBUG_NAMEDBUFFER = 64,
    SWISH_DEBUG_IO          = 128
} SWISH_DEBUG_LEVELS;

/* The FUNCTION__ logic below first appeared in Perl 5.8.8;
 * mostly it is for Win32 compat.
 *
 * FUNCTION__ expands to the name of the enclosing function where the
 * compiler offers one (__func__ or __FUNCTION__), and to "" otherwise.
 * A definition supplied before this point is left untouched.
 */
#ifndef FUNCTION__
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || (defined(__SUNPRO_C))
/* C99 or close enough. */
#  define FUNCTION__ __func__
#else
#  if (defined(_MSC_VER) && _MSC_VER < 1300) || /* Pre-MSVC 7.0 has neither __func__ nor __FUNCTION__ and no good workarounds, either. */ \
      (defined(__DECC_VER)) /* Tru64 or VMS, and strict C89 being used, but not modern enough cc (in Tru64, -c99 not known, only -std1). */
#    define FUNCTION__ ""
#  else
#    define FUNCTION__ __FUNCTION__ /* Common extension. */
#  endif
#endif
#endif

/*
 * Call-site wrappers for swish_debug(), swish_croak() and swish_warn().
 * Each one passes __FILE__, __LINE__ and FUNCTION__ ahead of the caller's
 * own arguments, so the first argument given to the macro becomes the
 * `msg' parameter of the wrapped function.  At least one argument (the
 * message) is required.
 *
 * Written with the standard C99 `...' / __VA_ARGS__ form rather than the
 * GNU-only named variadic parameter (`args...'), so that compilers which
 * lack the GNU extension accept them too.
 */
#define SWISH_DEBUG_MSG(...)                                        \
    swish_debug(__FILE__, __LINE__, FUNCTION__, __VA_ARGS__)

#define SWISH_CROAK(...)                                            \
    swish_croak(__FILE__, __LINE__, FUNCTION__, __VA_ARGS__)

#define SWISH_WARN(...)                                             \
    swish_warn(__FILE__, __LINE__, FUNCTION__, __VA_ARGS__)

#ifdef __cplusplus
extern "C" {
#endif

/* `boolean' is a plain char; SWISH_TRUE / SWISH_FALSE are defined above as 1 / 0. */
typedef char   boolean;

/*
 * Short names for the library's struct types.
 * NOTE(review): swish_MetaStackElement and swish_MetaStack are named here,
 * but no matching struct definition appears in this header.
 */
typedef struct swish_3                  swish_3;
typedef struct swish_StringList         swish_StringList;
typedef struct swish_Config             swish_Config;
typedef struct swish_ConfigFlags        swish_ConfigFlags;
typedef struct swish_ConfigValue        swish_ConfigValue;
typedef struct swish_DocInfo            swish_DocInfo;
typedef struct swish_MetaStackElement   swish_MetaStackElement;
typedef struct swish_MetaStackElement  *swish_MetaStackElementPtr;
typedef struct swish_MetaStack          swish_MetaStack;
typedef struct swish_MetaName           swish_MetaName;
typedef struct swish_Property           swish_Property;
typedef struct swish_Token              swish_Token;
typedef struct swish_TokenList          swish_TokenList;
typedef struct swish_TokenIterator      swish_TokenIterator;
typedef struct swish_ParserData         swish_ParserData;
typedef struct swish_Tag                swish_Tag;
typedef struct swish_TagStack           swish_TagStack;
typedef struct swish_Analyzer           swish_Analyzer;
typedef struct swish_Parser             swish_Parser;
typedef struct swish_NamedBuffer        swish_NamedBuffer;

/*
=head2 Data Structures
*/

/* Top-level handle: bundles one config, one analyzer and one parser. */
struct swish_3
{
    int             ref_cnt;    /* reference count (presumably for bindings, as in swish_Config -- confirm) */
    void           *stash;      /* opaque pointer; not interpreted by anything in this header */
    swish_Config   *config;
    swish_Analyzer *analyzer;
    swish_Parser   *parser;
};

/* An array of xmlChar strings together with two counters. */
struct swish_StringList
{
    unsigned int    n;      /* presumably the number of strings in use -- TODO confirm */
    unsigned int    max;    /* presumably the allocated capacity of `word' -- TODO confirm */
    xmlChar**       word;   /* the strings */
};


/*
 * Configuration: one libxml2 hash table per configuration area, plus a
 * swish_ConfigFlags struct of shortcut values.
 */
struct swish_Config
{
    int                          ref_cnt;
    void                        *stash;      /* for bindings */
    xmlHashTablePtr              misc;
    xmlHashTablePtr              properties;
    xmlHashTablePtr              metanames;
    xmlHashTablePtr              tag_aliases;
    xmlHashTablePtr              parsers;
    xmlHashTablePtr              mimes;
    xmlHashTablePtr              index;
    xmlHashTablePtr              stringlists;
    struct swish_ConfigFlags    *flags;      /* shortcuts for parsing */
};

/*
 * Shortcut values kept alongside a swish_Config (see its `flags' member).
 * NOTE(review): the boolean members share their names with the
 * SWISH_TOKENIZE, SWISH_CASCADE_META_CONTEXT, SWISH_IGNORE_XMLNS and
 * SWISH_FOLLOW_XINCLUDE config keys; the mapping itself is not visible here.
 */
struct swish_ConfigFlags
{
    boolean         tokenize;
    boolean         cascade_meta_context;
    boolean         ignore_xmlns;
    boolean         follow_xinclude;
    int             undef_metas;    /* presumably a SWISH_UNDEF_METAS_* value -- confirm */
    int             undef_attrs;    /* presumably a SWISH_UNDEF_ATTRS_* value -- confirm */
    int             max_meta_id;
    int             max_prop_id;
    xmlHashTablePtr meta_ids;
    xmlHashTablePtr prop_ids;
    //xmlHashTablePtr contexts;
};

/* A hash table wrapped with the usual ref_cnt/stash pair used by bindings. */
struct swish_NamedBuffer
{
    int             ref_cnt;    /* for bindings */
    void           *stash;      /* for bindings */
    xmlHashTablePtr hash;       /* the meat */
};

/*
 * Per-document metadata.
 * NOTE(review): several members mirror built-in property names
 * (swishlastmodified, swishdocsize, swishmime, ...); the exact
 * correspondence is not established in this header.
 */
struct swish_DocInfo
{
    time_t              mtime;
    off_t               size;
    xmlChar *           mime;
    xmlChar *           encoding;
    xmlChar *           uri;
    unsigned int        nwords;
    xmlChar *           ext;
    xmlChar *           parser;
    xmlChar *           action;
    boolean             is_gzipped;
    int                 ref_cnt;
};

/* A metaname definition: numeric id, name, bias and an optional alias target. */
struct swish_MetaName
{
    int                 ref_cnt;
    int                 id;
    xmlChar            *name;
    int                 bias;
    xmlChar            *alias_for;  /* presumably the name this metaname aliases -- TODO confirm */
};

/* A property definition: numeric id, name, type code and handling flags. */
struct swish_Property
{
    int                 ref_cnt;
    int                 id;
    xmlChar            *name;
    boolean             ignore_case;
    int                 type;           /* presumably one of SWISH_PROP_STRING/DATE/INT -- confirm */
    boolean             verbatim;
    xmlChar            *alias_for;      /* presumably the name this property aliases -- TODO confirm */
    unsigned int        max;
    boolean             sort;
    boolean             presort;
    unsigned int        sort_length;
};

/* One token: its document position, metaname, text value and context. */
struct swish_Token
{
    unsigned int        pos;            // this token's position in document
    swish_MetaName     *meta;
    xmlChar            *value;
    xmlChar            *context;
    unsigned int        offset;         // NOTE(review): units and base of offset/len are not visible here
    unsigned int        len;
    int                 ref_cnt;
};

/* A collection of swish_Token pointers plus shared bookkeeping. */
struct swish_TokenList
{
    unsigned int        n;              // presumably the number of entries in `tokens' -- TODO confirm
    unsigned int        pos;            // track position in document
    xmlHashTablePtr     contexts;       // cache contexts
    xmlBufferPtr        buf;
    swish_Token**       tokens;
    int                 ref_cnt;
};

/* Iteration state over a swish_TokenList, paired with a swish_Analyzer. */
struct swish_TokenIterator
{
    swish_TokenList     *tl;
    swish_Analyzer      *a;
    unsigned int         pos;           // position in iteration
    int                  ref_cnt;
};

/* One entry of a swish_TagStack; entries are chained through `next'. */
struct swish_Tag
{
    xmlChar            *raw;            // tag as libxml2 sees it
    xmlChar            *baked;          // tag as libswish3 sees it
    xmlChar            *context;
    struct swish_Tag   *next;
    unsigned int        n;
};

/* A stack of swish_Tag entries. */
struct swish_TagStack
{
    swish_Tag         *head;
    swish_Tag         *temp;
    unsigned int       count;      // presumably the number of entries on the stack -- TODO confirm
    char              *name;       // debugging aid -- name of the stack
};

/*
 * Text-analysis settings: word length limits, lowercase/tokenize switches
 * and the tokenizer and stemmer callbacks.  The tokenizer callback has the
 * same parameter list as swish_tokenize().
 */
struct swish_Analyzer
{
    unsigned int           maxwordlen;         // max word length
    unsigned int           minwordlen;         // min word length
    boolean                tokenize;           // should we parse into TokenList
    int                  (*tokenizer) (swish_TokenIterator*, xmlChar*, swish_MetaName*, xmlChar*);
    xmlChar*             (*stemmer)   (xmlChar*);
    boolean                lc;                 // should tokens be lowercased
    void                  *stash;              // for script bindings
    void                  *regex;              // optional regex
    int                    ref_cnt;            // for script bindings
};

/* Parser object: a handler callback that receives a swish_ParserData pointer. */
struct swish_Parser
{
    int                    ref_cnt;             // for script bindings
    void                 (*handler)(swish_ParserData*); // handler reference
    void                  *stash;               // for script bindings
    int                    verbosity;           // NOTE(review): meaning of the levels is not visible in this header
};

/*
 * Working state for a parse.  A pointer to this struct is what the
 * swish_Parser handler callback receives.
 */
struct swish_ParserData
{
    swish_3               *s3;                 // main object
    xmlBufferPtr           meta_buf;           // tmp MetaName buffer
    xmlBufferPtr           prop_buf;           // tmp Property buffer
    xmlChar               *tag;                // current tag name
    swish_DocInfo         *docinfo;            // document-specific properties
    unsigned int           ignore_content;     // toggle flag. should buffer be indexed.
    boolean                is_html;            // shortcut flag for html parser
    boolean                bump_word;          // boolean for moving word position/adding space
    unsigned int           offset;             // current offset position
    swish_TagStack        *metastack;          // stacks for tracking the tag => metaname
    swish_TagStack        *propstack;          // stacks for tracking the tag => property
    swish_TagStack        *domstack;           // stacks for tracking xml/html dom tree
    xmlParserCtxtPtr       ctxt;               // so we can free at end
    swish_TokenIterator   *token_iterator;     // token container
    swish_NamedBuffer     *properties;         // buffer all properties
    swish_NamedBuffer     *metanames;          // buffer all metanames
};

/*
=cut
*/

/*
=head2 Global Functions
*/
/*
 * Library-wide setup and version queries.  The parameterless functions are
 * declared with (void) so that they are true prototypes in C (an empty
 * parameter list leaves the arguments unchecked), matching the style the
 * Time Functions section already uses.
 * NOTE(review): the `override' argument of swish_setenv presumably follows
 * setenv(3) semantics -- confirm against the definition.
 */
void            swish_setup(void);
const char *    swish_lib_version(void);
const char *    swish_libxml2_version(void);
void            swish_setenv(char * name, char * value, int override);
/*
=cut
*/

/*
=head2 Top-Level Functions
*/
/*
 * Create and destroy a swish_3 handle, and parse input supplied as a file
 * name, an open FILE handle, an in-memory buffer or a directory.
 * swish_3_init takes the handler callback (same signature as
 * swish_Parser.handler) and an opaque stash pointer.
 * NOTE(review): the meaning of the int / unsigned int return values is not
 * visible in this header -- confirm against the definitions.
 */
swish_3 *       swish_3_init( void (*handler) (swish_ParserData *), void *stash );
void            swish_3_free( swish_3 *s3 );
int             swish_parse_file( swish_3 * s3, xmlChar *filename );
unsigned int    swish_parse_fh( swish_3 * s3, FILE * fh );
int             swish_parse_buffer( swish_3 * s3, xmlChar * buf );
unsigned int    swish_parse_directory( swish_3 *s3, xmlChar *dir, boolean follow_symlinks );
/*
=cut
*/

/*
=head2 I/O Functions
*/
/*
 * Read file contents into an xmlChar buffer, plus two line-oriented helpers.
 * Note that swish_io_slurp_gzfile_len takes its length as a pointer (off_t *)
 * whereas the other slurp functions take it by value.
 * NOTE(review): ownership of the returned buffers is not stated here;
 * presumably the caller frees them -- confirm.
 */
xmlChar *   swish_io_slurp_fh( FILE * fh, unsigned long flen, boolean binmode );
xmlChar *   swish_io_slurp_file_len( xmlChar *filename, off_t flen, boolean binmode );
xmlChar *   swish_io_slurp_gzfile_len( xmlChar *filename, off_t *flen, boolean binmode );
xmlChar *   swish_io_slurp_file( xmlChar *filename, off_t flen, boolean is_gzipped, boolean binmode );
long int    swish_io_count_operable_file_lines( xmlChar *filename );
boolean     swish_io_is_skippable_line( xmlChar *str );
/*
=cut
*/

/*
=head2 Filesystem Functions
*/
/*
 * Filesystem queries on a path: existence and type tests returning
 * `boolean', size and mtime lookups, and helpers that return an xmlChar
 * string derived from a URL or path.
 * NOTE(review): whether the returned xmlChar strings are newly allocated is
 * not visible in this header -- confirm.
 */
boolean     swish_fs_file_exists( xmlChar *filename );
boolean     swish_fs_is_dir( xmlChar *path );
boolean     swish_fs_is_file( xmlChar *path );
boolean     swish_fs_is_link( xmlChar *path );
off_t       swish_fs_get_file_size( xmlChar *path );
time_t      swish_fs_get_file_mtime( xmlChar *path );
xmlChar *   swish_fs_get_file_ext( xmlChar *url );
xmlChar *   swish_fs_get_path( xmlChar *url );
boolean     swish_fs_looks_like_gz( xmlChar *file );
/*
=cut
*/


/*
=head2 Hash Functions
*/
/*
 * Operations on libxml2 hash tables (xmlHashTablePtr) keyed by xmlChar
 * strings.  Values are untyped (void *), except in swish_hash_exists_or_add,
 * which takes an xmlChar * value.
 * NOTE(review): the meaning of the int return codes is not visible here.
 */
int         swish_hash_add( xmlHashTablePtr hash, xmlChar *key, void * value );
int         swish_hash_replace( xmlHashTablePtr hash, xmlChar *key, void *value );
int         swish_hash_delete( xmlHashTablePtr hash, xmlChar *key );
boolean     swish_hash_exists( xmlHashTablePtr hash, xmlChar *key );
int         swish_hash_exists_or_add( xmlHashTablePtr hash, xmlChar *key, xmlChar *value );
void        swish_hash_merge( xmlHashTablePtr hash1, xmlHashTablePtr hash2 );
void *      swish_hash_fetch( xmlHashTablePtr hash, xmlChar *key );
void        swish_hash_dump( xmlHashTablePtr hash, const char *label );
xmlHashTablePtr swish_hash_init(int size);
void        swish_hash_free( xmlHashTablePtr hash );
/*
=cut
*/

/*
=head2 Memory Functions
*/
/*
 * Allocation wrappers, string duplication and allocation-count helpers.
 * The parameterless functions are declared with (void) so that they are
 * true prototypes in C, matching the style the Time Functions section
 * already uses.
 * NOTE(review): how swish_xmalloc/swish_xrealloc behave on failure is not
 * visible in this header.
 */
void        swish_mem_init(void);
void *      swish_xrealloc(void *ptr, size_t size);
void *      swish_xmalloc( size_t size );
void        swish_xfree( void *ptr );
void        swish_mem_debug(void);
long int    swish_memcount_get(void);
void        swish_memcount_dec(void);
xmlChar *   swish_xstrdup( const xmlChar * ptr );
xmlChar *   swish_xstrndup( const xmlChar * ptr, int len );
/*
=cut
*/

/*
=head2 Time Functions
*/
/*
 * Timing helpers: two return a double, and three format a double or a
 * time_t into a char string.
 * NOTE(review): the storage behind the returned char * is not visible here
 * (static buffer vs. heap) -- confirm before freeing or retaining it.
 */
double      swish_time_elapsed(void);
double      swish_time_cpu(void);
char *      swish_time_print(double time);
char *      swish_time_print_fine(double time);
char *      swish_time_format(time_t epoch);
/*
=cut
*/

/*
=head2 Error Functions
*/
/*
 * Message reporting.  swish_croak, swish_warn and swish_debug take the call
 * site (file, line, function name) followed by a message and variadic
 * arguments; the SWISH_CROAK / SWISH_WARN / SWISH_DEBUG_MSG macros fill in
 * the call site automatically.
 * NOTE(review): `msg' is presumably a printf-style format, and swish_croak
 * presumably does not return -- confirm against the definitions.
 */
void        swish_set_error_handle( FILE *where );
void        swish_croak(const char *file, int line, const char *func, const char *msg,...);
void        swish_warn(const char *file, int line, const char *func, const char *msg,...);
void        swish_debug(const char *file, int line, const char *func, const char *msg,...);
const char* swish_err_msg(int err_code);
/*
=cut
*/

/*
=head2 String Functions
*/
/*
 * Locale, UTF-8, wide-character, whitespace, string-list and
 * number/string conversion helpers.  The parameterless functions are
 * declared with (void) so that they are true prototypes in C, matching the
 * style the Time Functions section already uses.
 * NOTE(review): whether functions returning xmlChar * or wchar_t * hand back
 * newly allocated memory or their argument is not visible in this header.
 */

/* locale and UTF-8 inspection */
char *              swish_get_locale(void);
void                swish_verify_utf8_locale(void);
boolean             swish_is_ascii( xmlChar *str );
int                 swish_bytes_in_wchar( int wchar );
int                 swish_utf8_chr_len( xmlChar *utf8 );
uint32_t            swish_utf8_codepoint( xmlChar *utf8 );
int                 swish_utf8_num_chrs( xmlChar *utf8 );
void                swish_utf8_next_chr( xmlChar *s, int *i );
void                swish_utf8_prev_chr( xmlChar *s, int *i );
xmlChar *           swish_str_escape_utf8( xmlChar *utf8 );
xmlChar *           swish_str_unescape_utf8( xmlChar *ascii );

/* wide-character conversion and case folding */
wchar_t *           swish_locale_to_wchar(xmlChar * str);
xmlChar *           swish_wchar_to_locale(wchar_t * str);
wchar_t *           swish_wstr_tolower(wchar_t *s);
xmlChar *           swish_str_tolower(xmlChar *s );
xmlChar *           swish_utf8_str_tolower(xmlChar *s);
xmlChar *           swish_ascii_str_tolower(xmlChar *s);

/* whitespace handling */
xmlChar *           swish_str_skip_ws(xmlChar *s);
void                swish_str_trim_ws(xmlChar *string);
void                swish_str_ctrl_to_ws(xmlChar *s);
boolean             swish_str_all_ws(xmlChar * s);
boolean             swish_str_all_ws_len(xmlChar * s, int len);

/* wide-character debugging and sorting (swish_wchar_t_comp has a qsort-compatible signature) */
void                swish_debug_wchars( const wchar_t * widechars );
int                 swish_wchar_t_comp(const void *s1, const void *s2);
int                 swish_sort_wchar(wchar_t *s);

/* swish_StringList construction and manipulation */
swish_StringList *  swish_stringlist_build(xmlChar *line);
swish_StringList *  swish_stringlist_init(void);
void                swish_stringlist_free(swish_StringList *sl);
unsigned int        swish_stringlist_add_string(swish_StringList *sl, xmlChar *str);
void                swish_stringlist_merge(swish_StringList *sl1, swish_StringList *sl2);
swish_StringList *  swish_stringlist_copy(swish_StringList *sl);
swish_StringList *  swish_stringlist_parse_sort_string(xmlChar *sort_string, swish_Config *cfg);
void                swish_stringlist_debug(swish_StringList *sl);

/* conversions between strings and numbers */
int                 swish_string_to_int( char *buf );
boolean             swish_string_to_boolean( char *buf );
xmlChar *           swish_int_to_string( int val );
xmlChar *           swish_long_to_string( long val );
xmlChar *           swish_double_to_string( double val );
xmlChar *           swish_date_to_string( int y, int m, int d );
char                swish_get_C_escaped_char(xmlChar *s, xmlChar **se);
/*
=cut
*/


/*
=head2 Configuration Functions
*/
/*
 * swish_Config lifecycle, parsing and merging, MIME lookups, the
 * swish_ConfigFlags helpers and two consistency checks.
 *
 * The parameterless functions are declared with (void) so that they are
 * true prototypes in C, matching the style the Time Functions section
 * already uses.  swish_config_test_alias_fors used to be declared twice in
 * this list; it is now declared once.
 * NOTE(review): whether swish_config_add / swish_config_parse return the
 * config they were given or a new one is not visible in this header.
 */
swish_Config *      swish_config_init(void);
void                swish_config_set_default( swish_Config *config );
void                swish_config_merge( swish_Config *config1, swish_Config *config2 );
swish_Config *      swish_config_add( swish_Config * config, xmlChar * conf );
swish_Config *      swish_config_parse( swish_Config * config, xmlChar * conf );
void                swish_config_debug( swish_Config * config );
void                swish_config_free( swish_Config * config);
xmlHashTablePtr     swish_mime_defaults(void);
xmlChar *           swish_mime_get_type( swish_Config * config, xmlChar * fileext );
xmlChar *           swish_mime_get_parser( swish_Config * config, xmlChar *mime );
swish_ConfigFlags * swish_config_flags_init(void);
void                swish_config_flags_debug( swish_ConfigFlags *flags );
void                swish_config_flags_free( swish_ConfigFlags *flags );
void                swish_config_test_alias_fors( swish_Config *config );
void                swish_config_test_unique_ids( swish_Config *config );

/*
=cut
*/

/*
=head2 Parser Functions
*/
/*
 * Create and destroy a swish_Parser.  The handler argument has the same
 * type as the swish_Parser.handler member.
 */
swish_Parser *  swish_parser_init( void (*handler) (swish_ParserData *) );
void            swish_parser_free( swish_Parser * parser );
/*
=cut
*/

/*
=head2 Token Functions 
*/
/*
 * Token, token-list and token-iterator lifecycle, plus the tokenizers.
 * swish_tokenize, swish_tokenize_ascii and swish_tokenize_utf8 share the
 * parameter list of the swish_Analyzer.tokenizer callback.
 *
 * The parameterless functions are declared with (void) so that they are
 * true prototypes in C, matching the style the Time Functions section
 * already uses.
 * NOTE(review): the meaning of the int return values is not visible in this
 * header.
 */
swish_TokenList *   swish_token_list_init(void);
void                swish_token_list_free( swish_TokenList *tl );
int                 swish_token_list_add_token(
                                        swish_TokenList *tl,
                                        xmlChar *token,
                                        int token_len,
                                        swish_MetaName *meta,
                                        xmlChar *context );
int                 swish_token_list_set_token(
                                        swish_TokenList *tl,
                                        xmlChar *token,
                                        int len );
swish_Token *       swish_token_init(void);
void                swish_token_free( swish_Token *t );
swish_TokenIterator *swish_token_iterator_init( swish_Analyzer *a );
void                swish_token_iterator_free( swish_TokenIterator *ti );
swish_Token *       swish_token_iterator_next_token( swish_TokenIterator *it );
int                 swish_tokenize(     swish_TokenIterator *ti,
                                        xmlChar *buf,
                                        swish_MetaName *meta,
                                        xmlChar *context );
int                 swish_tokenize_ascii(
                                        swish_TokenIterator *ti,
                                        xmlChar *buf,
                                        swish_MetaName *meta,
                                        xmlChar *context );
int                 swish_tokenize_utf8(
                                        swish_TokenIterator *ti,
                                        xmlChar *buf,
                                        swish_MetaName *meta,
                                        xmlChar *context );
void                swish_token_list_debug( swish_TokenIterator *it );
xmlChar *           swish_token_list_get_token_value( swish_TokenList *tl, swish_Token *t );
void                swish_token_debug( swish_Token *t );

/*
=cut
*/

/*
=head2 Analyzer Functions
*/
/* Create a swish_Analyzer from a swish_Config, and destroy one. */
swish_Analyzer *    swish_analyzer_init( swish_Config * config );
void                swish_analyzer_free( swish_Analyzer * analyzer );
/*
=cut
*/

/*
=head2 DocInfo Functions
*/
/*
 * swish_DocInfo lifecycle, validation against a config, and population
 * from a file on disk.  swish_docinfo_init is declared with (void) so that
 * it is a true prototype in C, matching the style the Time Functions
 * section already uses.
 * NOTE(review): the meaning of the int return values is not visible in this
 * header.
 */
swish_DocInfo *     swish_docinfo_init(void);
void                swish_docinfo_free( swish_DocInfo * ptr );
int                 swish_docinfo_check(swish_DocInfo * docinfo, swish_Config * config);
int                 swish_docinfo_from_filesystem(  xmlChar *filename,
                                                    swish_DocInfo * i,
                                                    swish_ParserData *parser_data );
void                swish_docinfo_debug( swish_DocInfo * docinfo );
/*
=cut
*/

/*
=head2 Buffer Functions
*/
/*
 * swish_NamedBuffer lifecycle and append operations, plus two xmlBuffer
 * helpers.  swish_nb_add_buf and swish_nb_add_str differ only in how the
 * data to append is passed: an xmlBufferPtr versus a string with a length.
 * NOTE(review): the exact effect of `joiner', `cleanwsp' and `autovivify'
 * is not visible in this header -- confirm against the definitions.
 */
swish_NamedBuffer * swish_nb_init( xmlHashTablePtr confhash );
void                swish_nb_free( swish_NamedBuffer *nb );
void                swish_nb_new( swish_NamedBuffer *nb, xmlChar *key );
void                swish_nb_debug( swish_NamedBuffer *nb, xmlChar *label );
void                swish_nb_add_buf( swish_NamedBuffer *nb, 
                                      xmlChar *name,
                                      xmlBufferPtr buf, 
                                      xmlChar *joiner,
                                      boolean cleanwsp,
                                      boolean autovivify);
void                swish_nb_add_str(   swish_NamedBuffer *nb, 
                                        xmlChar *name, 
                                        xmlChar *str,
                                        unsigned int len,
                                        xmlChar *joiner,
                                        boolean cleanwsp,
                                        boolean autovivify);
void                swish_buffer_append( xmlBufferPtr buf, xmlChar * txt, int len );
void                swish_buffer_concat( swish_NamedBuffer *nb1, swish_NamedBuffer *nb2 );
xmlChar*            swish_nb_get_value( swish_NamedBuffer* nb, xmlChar* key );
/*
=cut
*/

/*
=head2 Property Functions
*/
/*
 * swish_Property lifecycle and id lookup by property name.
 * NOTE(review): what the id lookups return for an unknown name is not
 * visible in this header -- confirm before relying on a sentinel.
 */
swish_Property *    swish_property_init( xmlChar *propname );
void                swish_property_new( xmlChar *name, swish_Config *config );
void                swish_property_free( swish_Property *prop );
void                swish_property_debug( swish_Property *prop );
int                 swish_property_get_builtin_id( xmlChar *propname );
int                 swish_property_get_id( xmlChar *propname, xmlHashTablePtr properties );
/*
=cut
*/

/*
=head2 MetaName Functions
*/
/*
 * swish_MetaName lifecycle.  swish_metaname_new takes a name and a config
 * and returns nothing, so its result is presumably recorded in the config
 * -- TODO confirm.
 */
swish_MetaName *    swish_metaname_init( xmlChar *name);
void                swish_metaname_new( xmlChar *name, swish_Config *config );
void                swish_metaname_free( swish_MetaName *m );
void                swish_metaname_debug( swish_MetaName *m );
/*
=cut
*/

/*
=head2 Header Functions
*/
/*
 * Validate, merge, read and write a header file identified by file name.
 * Note that these take plain char * file names, unlike most of this header,
 * which uses xmlChar *.
 */
boolean             swish_header_validate(char *filename);
boolean             swish_header_merge(char *filename, swish_Config *c);
swish_Config *      swish_header_read(char *filename);
void                swish_header_write(char* filename, swish_Config* config);
/*
=cut
*/


#ifdef __cplusplus
}
#endif
#endif /* ! __LIBSWISH3_H__ */


/*************** end libswish3.h ************/


/*************** start getruntime.h ************/
/*
**
$Id: getruntime.h,v 1.6 2005/05/12 15:41:05 karman Exp $

    This file is part of Swish-e.

    Swish-e is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    Swish-e is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along  with Swish-e; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
    
    See the COPYING file that accompanies the Swish-e distribution for details
    of the GNU GPL and the special exception available for linking against
    the Swish-e library.
    
** Mon May  9 18:19:34 CDT 2005
** added GPL


**-------------------------------------------------------
**
**
*/


#ifndef GETRUNTIME_H
#define GETRUNTIME_H 1

#ifdef __cplusplus
extern "C" {
#endif

/* CPU time, expressed as a double. */
typedef double cpu_seconds;

/*
 * Returns a cpu_seconds value.  Declared with (void) so that it is a true
 * prototype in C; an empty parameter list would leave arguments unchecked.
 */
cpu_seconds get_cpu_secs (void);

#ifdef __cplusplus
}
#endif /* __cplusplus */

#endif /* GETRUNTIME_H */



/*************** end getruntime.h ************/


/*************** start getruntime.c ************/
/* Return CPU time used so far, in seconds.
   Copyright (C) 1994, 1999 Free Software Foundation, Inc.

This file is part of the libiberty library.
Libiberty is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public
License as published by the Free Software Foundation; either
version 2 of the License, or (at your option) any later version.

Libiberty is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Library General Public License for more details.

You should have received a copy of the GNU Library General Public
License along with libiberty; see the file COPYING.LIB.  If
not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA.
*/

/* Everything down to the matching #endif is only compiled when getruntime.c is
   built on its own, i.e. when LIBSWISH3_SINGLE_FILE is not defined. */
#ifndef LIBSWISH3_SINGLE_FILE
#include "acconfig.h"

/* For testing */
/* NOTE(review): the "//" markers below sit on their own lines, so the three
   #undef directives that follow them are ACTIVE, not commented out. In a
   stand-alone build this always disables the getrusage() and times() paths
   and forces the clock() fallback in get_cpu_secs(). This looks like a
   reformatting accident of "//#undef ..." -- confirm and rejoin if so. */
//
#undef HAVE_GETRUSAGE
//
#undef HAVE_SYS_RESOURCE_H
//
#undef HAVE_TIMES

/* There are several ways to get elapsed execution time; unfortunately no
   single way is available for all host systems, nor are there reliable
   ways to find out which way is correct for a given host. */

#include "getruntime.h"
#include <time.h>

/* getrusage() needs both the function and its header */
#if defined (HAVE_GETRUSAGE) && defined (HAVE_SYS_RESOURCE_H)
#include <sys/time.h>
#include <sys/resource.h>
#endif

/* times() and, where available, the HZ definition from <sys/param.h> */
#ifdef HAVE_TIMES
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif
#include <sys/times.h>
#endif

/* sysconf() and _SC_CLK_TCK */
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif

/* end #ifndef LIBSWISH3_SINGLE_FILE */
#endif

/* This is a fallback; if wrong, it will likely produce obviously wrong
   results. It only applies when <time.h> did not define CLOCKS_PER_SEC. */

#ifndef CLOCKS_PER_SEC
#define CLOCKS_PER_SEC 1
#endif

/* GNU_HZ is the divisor used to turn times() ticks into seconds.
   Preference order: sysconf(_SC_CLK_TCK) (evaluated at run time on every
   use), then HZ, then CLOCKS_PER_SEC. Because CLOCKS_PER_SEC is always
   defined by this point, the innermost #ifdef always succeeds. */
#ifdef _SC_CLK_TCK
#define GNU_HZ  sysconf(_SC_CLK_TCK)
#else
#ifdef HZ
#define GNU_HZ  HZ
#else
#ifdef CLOCKS_PER_SEC
#define GNU_HZ  CLOCKS_PER_SEC
#endif
#endif
#endif

/*
 * Report the CPU time (user + system) consumed so far, in whole seconds.
 *
 * Exactly one of three strategies is compiled in:
 *   1. getrusage()  - when HAVE_GETRUSAGE and HAVE_SYS_RESOURCE_H are defined
 *   2. times()      - when HAVE_TIMES is defined
 *   3. clock()      - otherwise
 * Every strategy yields an integral number of seconds converted to
 * cpu_seconds; no fractional part is ever returned.
 */
cpu_seconds
get_cpu_secs(
)
{
#if defined (HAVE_GETRUSAGE) && defined (HAVE_SYS_RESOURCE_H)
    struct rusage usage;
    cpu_seconds total;

    getrusage(0, &usage);
    total = (cpu_seconds) (usage.ru_utime.tv_sec + usage.ru_stime.tv_sec);

    /* user and system microseconds are rounded independently: each one
       adds a full second once it exceeds half a second */
    total += (usage.ru_utime.tv_usec > 500000) ? 1 : 0;
    total += (usage.ru_stime.tv_usec > 500000) ? 1 : 0;

    return total;

#elif defined (HAVE_TIMES)

    /* times() reports clock ticks as clock_t values, which may overflow
       on long-running processes. The tick sum is divided by GNU_HZ before
       the conversion to cpu_seconds. */
    struct tms ticks;

    times(&ticks);

    return (cpu_seconds) ((ticks.tms_utime + ticks.tms_stime) / GNU_HZ);

#else

    /* Last resort: clock(). A negative result is clamped to zero. With a
       large CLOCKS_PER_SEC the clock_t value can overflow fairly quickly. */
    clock_t elapsed = clock();

    if (elapsed < 0) {
        elapsed = 0;
    }

    return (cpu_seconds) (elapsed / CLOCKS_PER_SEC);

#endif
}


/*************** end getruntime.c ************/


/*************** start utf8.c ************/
/* see http://cprogramming.com/tutorial/unicode.html */

/*
  Basic UTF-8 manipulation routines
  by Jeff Bezanson
  placed in the public domain Fall 2005

  This code is designed to provide the utilities you need to manipulate
  UTF-8 as an internal string encoding. These functions do not perform the
  error checking normally needed when handling UTF-8 data, so if you happen
  to be from the Unicode Consortium you will want to flay me alive.
  I do this because error checking can be performed at the boundaries (I/O),
  with these routines reserved for higher performance on data known to be
  valid.
  A UTF-8 validation routine is included.
*/

/*
this file is a simple UTF-8 string handling library based on the url above.
the .h and .c file have been combined and all functions labeled 'static'
so you must include utf8.c to get the library.
We include in string.c.
*/

/* http://cprogramming.com/tutorial/utf8.h */
#ifndef LIBSWISH3_SINGLE_FILE
#include <stdarg.h>
#endif

/* is c the start of a utf8 sequence?
   True for every byte whose top two bits are not 10, i.e. anything that is
   not a continuation byte -- this includes plain ASCII and the NUL byte. */
#define isutf(c) (((c)&0xC0)!=0x80)

/* number of bytes needed to encode ch as UTF-8 (1-4), or 0 if ch is
   0x110000 or greater */
static size_t 
u8_charlen(
    uint32_t ch
);

/* total number of UTF-8 bytes needed to encode the first n code points
   of wcstr */
static size_t 
u8_codingsize(
    uint32_t *wcstr, 
    size_t n
);

/* convert UTF-8 data to wide character */
static int u8_toucs(
    uint32_t * dest,
    int sz,
    char *src,
    int srcsz
);

/* the opposite conversion */
static int u8_toutf8(
    char *dest,
    int sz,
    uint32_t * src,
    int srcsz
);

/* single character to UTF-8 */
static int u8_wc_toutf8(
    char *dest,
    uint32_t ch
);

/* character number to byte offset */
static int u8_offset(
    char *str,
    int charnum
);

/* byte offset to character number */
static int u8_charnum(
    char *s,
    int offset
);

/* return next character, updating an index variable */
static uint32_t u8_nextchar(
    char *s,
    int *i
);

/* move to next character */
static void u8_inc(
    char *s,
    int *i
);

/* move to previous character */
static void u8_dec(
    char *s,
    int *i
);

/* returns length of next utf-8 sequence */
static int u8_seqlen(
    char *s
);

/* assuming src points to the character after a backslash, read an
   escape sequence, storing the result in dest and returning the number of
   input characters processed */
static int u8_read_escape_sequence(
    char *src,
    uint32_t * dest
);

/* given a wide character, convert it to an ASCII escape sequence stored in
   buf, where buf is "sz" bytes. returns the number of characters output. */
static int u8_escape_wchar(
    char *buf,
    int sz,
    uint32_t ch
);

/* convert a string "src" containing escape sequences to UTF-8 */
static int u8_unescape(
    char *buf,
    int sz,
    char *src
);

/* convert UTF-8 "src" to ASCII with escape sequences.
   if escape_quotes is nonzero, quote characters will be preceded by
   backslashes as well. */
static int u8_escape(
    char *buf,
    int sz,
    char *src,
    int escape_quotes
);

/* utility predicates used by the above */
static int octal_digit(
    char c
);
static int hex_digit(
    char c
);

/* return a pointer to the first occurrence of ch in s, or NULL if not
   found. character index of found character returned in *charn. */
static char *u8_strchr(
    char *s,
    uint32_t ch,
    int *charn
);

/* same as the above, but searches a buffer of a given size instead of
   a NUL-terminated string. */
static char *u8_memchr(
    char *s,
    uint32_t ch,
    size_t sz,
    int *charn
);

/* count the number of characters in a UTF-8 string */
static int u8_strlen(
    char *s
);

/* returns 1 if the encoding part of a locale name (the text after the
   first '.') is exactly "UTF-8" or "utf8", 0 otherwise */
static int u8_is_locale_utf8(
    char *locale
);

/* printf where the format string and arguments may be in UTF-8.
   you can avoid this function and just use ordinary printf() if the current
   locale is UTF-8. */
static int u8_vprintf(
    char *fmt,
    va_list ap
);
static int u8_printf(
    char *fmt,
    ...
);

/* returns 1 if the first length bytes of str pass the structural UTF-8
   checks implemented in u8_isvalid(), 0 otherwise */
static int 
u8_isvalid(
    const char *str, 
    int length
);

/* writes the len bytes of src into dest with the order of the UTF-8
   sequences reversed, followed by a NUL; returns 0 on success and 1 when
   an unexpected lead byte is met */
static int 
u8_reverse(
    char *dest, 
    char * src, 
    size_t len
);


/* http://cprogramming.com/tutorial/utf8.c */

/*
  Basic UTF-8 manipulation routines
  by Jeff Bezanson
  placed in the public domain Fall 2005

  This code is designed to provide the utilities you need to manipulate
  UTF-8 as an internal string encoding. These functions do not perform the
  error checking normally needed when handling UTF-8 data, so if you happen
  to be from the Unicode Consortium you will want to flay me alive.
  I do this because error checking can be performed at the boundaries (I/O),
  with these routines reserved for higher performance on data known to be
  valid.
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stdarg.h>
/*
#ifdef WIN32
#include <malloc.h>
#else
#include <alloca.h>
#endif
*/

/* Indexed by the number of trailing bytes (sequence length - 1). The decoders
   in this file add up the raw bytes of a sequence, shifting by 6 between
   bytes, and then subtract the entry for that length to obtain the code
   point. */
static const uint32_t offsetsFromUTF8[6] = {
    0x00000000UL, 0x00003080UL, 0x000E2080UL,
    0x03C82080UL, 0xFA082080UL, 0x82082080UL
};

/* Indexed by a byte value; gives the number of bytes that follow it when it
   is used as the first byte of a sequence:
     0x00-0xBF -> 0,  0xC0-0xDF -> 1,  0xE0-0xEF -> 2,
     0xF0-0xF7 -> 3,  0xF8-0xFB -> 4,  0xFC-0xFF -> 5
   Each pair of source lines below covers 32 consecutive byte values. */
static const char trailingBytesForUTF8[256] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4,
        4, 4, 4, 5, 5, 5, 5
};

/* Length in bytes of the UTF-8 sequence introduced by the byte s[0]:
   the number of trailing bytes looked up for that byte, plus the byte
   itself. */
static int
u8_seqlen(char *s)
{
    const unsigned char lead = (unsigned char)s[0];
    const int trailing = trailingBytesForUTF8[(unsigned int)lead];

    return trailing + 1;
}

/* Number of bytes the UTF-8 encoding of ch occupies.
   Returns 1-4 for code points below 0x110000 and 0 for anything larger,
   meaning the value cannot (or should not) be encoded. */
static size_t
u8_charlen(uint32_t ch)
{
    /* exclusive upper bound of the code points that fit in 1, 2, 3, 4 bytes */
    static const uint32_t upper_bound[4] = { 0x80, 0x800, 0x10000, 0x110000 };
    size_t nbytes;

    for (nbytes = 1; nbytes <= 4; nbytes++) {
        if (ch < upper_bound[nbytes - 1]) {
            return nbytes;
        }
    }
    return 0;
}

/* Sum of u8_charlen() over the first n code points of wcstr, i.e. the
   number of bytes their UTF-8 encoding needs (terminator not included).
   Code points u8_charlen() rejects contribute 0. */
static size_t
u8_codingsize(uint32_t *wcstr, size_t n)
{
    const uint32_t *cursor = wcstr;
    size_t remaining = n;
    size_t total = 0;

    while (remaining > 0) {
        total += u8_charlen(*cursor);
        cursor++;
        remaining--;
    }
    return total;
}


/* Convert UTF-8 bytes to an array of code points, without validation.

   dest   receives the code points and is always 0-terminated when sz >= 1
   sz     capacity of dest in code points, terminator included; when
          sz < 1 nothing is written and 0 is returned
   src    UTF-8 input
   srcsz  input size in bytes, or -1 if src is NUL-terminated

   Returns the number of code points stored (terminator not counted).
   If sz = srcsz+1 (i.e. 4*srcsz+4 bytes), there will always be enough space.

   Conversion stops early when dest is full, at the end of the input, or
   when the final sequence is cut short by the end of the input.
   Lead bytes announcing 5- and 6-byte sequences are consumed like the
   shorter forms so that the read position always advances; the values they
   produce are not meaningful Unicode code points.
*/
static int
u8_toucs(
    uint32_t * dest,
    int sz,
    char *src,
    int srcsz
)
{
    uint32_t ch;
    /* only meaningful in bounded mode; a negative size other than -1 is
       treated as an empty input */
    char *src_end = src + (srcsz > 0 ? srcsz : 0);
    int nb;
    int k;
    int i = 0;

    /* no room even for the terminator: leave dest untouched */
    if (sz < 1)
        return 0;

    while (i < sz - 1) {
        /* end-of-input test comes first so that *src is never read at or
           beyond src_end in bounded mode */
        if (srcsz == -1) {
            if (*src == 0)
                goto done_toucs;
        }
        else {
            if (src >= src_end)
                goto done_toucs;
        }

        nb = trailingBytesForUTF8[(unsigned char)*src];

        /* stop if the sequence would run past the end of the input */
        if (srcsz == -1) {
            for (k = 1; k <= nb; k++) {
                if (src[k] == 0)
                    goto done_toucs;
            }
        }
        else {
            if (nb >= src_end - src)
                goto done_toucs;
        }

        ch = 0;
        switch (nb) {
            /*
               these fall through deliberately 
             */
        case 5:
            ch += (unsigned char)*src++;
            ch <<= 6;
            /* fall through */
        case 4:
            ch += (unsigned char)*src++;
            ch <<= 6;
            /* fall through */
        case 3:
            ch += (unsigned char)*src++;
            ch <<= 6;
            /* fall through */
        case 2:
            ch += (unsigned char)*src++;
            ch <<= 6;
            /* fall through */
        case 1:
            ch += (unsigned char)*src++;
            ch <<= 6;
            /* fall through */
        case 0:
            ch += (unsigned char)*src++;
        }
        ch -= offsetsFromUTF8[nb];
        dest[i++] = ch;
    }
  done_toucs:
    dest[i] = 0;
    return i;
}

/* Encode an array of code points as UTF-8.

   dest   output buffer
   sz     size of dest in bytes
   src    code points to encode
   srcsz  number of code points in src, or a negative value if src is
          0-terminated

   Returns the number of code points consumed. Encoding stops at the first
   code point whose bytes do not fit completely into what is left of dest.
   dest is NUL-terminated only if every code point fitted and a byte is
   still free afterwards. Values of 0x110000 and above produce no output
   but are still counted as consumed.
*/
static int
u8_toutf8(char *dest, int sz, uint32_t *src, int srcsz)
{
    char *out = dest;
    char *const limit = dest + sz;
    int consumed = 0;

    for (;;) {
        uint32_t cp;
        int need;

        /* end of input: terminator in 0-terminated mode, count otherwise */
        if (srcsz < 0) {
            if (src[consumed] == 0)
                break;
        }
        else if (consumed >= srcsz) {
            break;
        }

        cp = src[consumed];
        if (cp < 0x80)
            need = 1;
        else if (cp < 0x800)
            need = 2;
        else if (cp < 0x10000)
            need = 3;
        else if (cp < 0x110000)
            need = 4;
        else
            need = 0;           /* not encodable: skipped silently */

        /* never emit a partial sequence */
        if (need > 0 && limit - out < need)
            return consumed;

        switch (need) {
        case 1:
            *out++ = (char)cp;
            break;
        case 2:
            *out++ = (cp >> 6) | 0xC0;
            *out++ = (cp & 0x3F) | 0x80;
            break;
        case 3:
            *out++ = (cp >> 12) | 0xE0;
            *out++ = ((cp >> 6) & 0x3F) | 0x80;
            *out++ = (cp & 0x3F) | 0x80;
            break;
        case 4:
            *out++ = (cp >> 18) | 0xF0;
            *out++ = ((cp >> 12) & 0x3F) | 0x80;
            *out++ = ((cp >> 6) & 0x3F) | 0x80;
            *out++ = (cp & 0x3F) | 0x80;
            break;
        default:
            break;
        }
        consumed++;
    }

    if (out < limit)
        *out = '\0';
    return consumed;
}

/* Encode the single code point ch into dest as UTF-8.
   Returns the number of bytes written (1-4), or 0 without touching dest
   when ch is 0x110000 or larger. dest is not NUL-terminated and must have
   room for up to 4 bytes. */
static int
u8_wc_toutf8(char *dest, uint32_t ch)
{
    int len;
    int idx;
    uint32_t lead_marker;

    if (ch < 0x80) {
        dest[0] = (char)ch;
        return 1;
    }

    if (ch < 0x800) {
        len = 2;
        lead_marker = 0xC0;
    }
    else if (ch < 0x10000) {
        len = 3;
        lead_marker = 0xE0;
    }
    else if (ch < 0x110000) {
        len = 4;
        lead_marker = 0xF0;
    }
    else {
        return 0;
    }

    /* fill the continuation bytes from the tail, six payload bits each */
    for (idx = len - 1; idx > 0; idx--) {
        dest[idx] = (ch & 0x3F) | 0x80;
        ch >>= 6;
    }
    /* whatever bits remain belong to the lead byte */
    dest[0] = ch | lead_marker;

    return len;
}

/* Translate a character index into a byte offset.
   Walks str one character at a time, stopping after charnum characters or
   at the NUL terminator, whichever comes first, and returns the byte
   offset reached. A character is skipped by advancing at most 4 bytes,
   stopping early at the first byte that starts a new sequence. */
static int
u8_offset(char *str, int charnum)
{
    int pos = 0;
    int left;

    for (left = charnum; left > 0 && str[pos]; left--) {
        int step;

        for (step = 1; step <= 4; step++) {
            pos++;
            /* the fourth advance is unconditional and inspects nothing */
            if (step < 4 && isutf(str[pos]))
                break;
        }
    }
    return pos;
}

/* Translate a byte offset into a character index.
   Counts the characters that start before byte position offset, stopping
   early at the NUL terminator. Each character is skipped by advancing at
   most 4 bytes, stopping at the first byte that starts a new sequence. */
static int
u8_charnum(char *s, int offset)
{
    int seen = 0;
    int pos = 0;

    while (pos < offset && s[pos]) {
        int step;

        for (step = 1; step <= 4; step++) {
            pos++;
            /* the fourth advance is unconditional and inspects nothing */
            if (step < 4 && isutf(s[pos]))
                break;
        }
        seen++;
    }
    return seen;
}

/* Number of characters (UTF-8 sequences) in the NUL-terminated string s.

   The terminator is tested here, before each call to u8_nextchar(), so the
   decoder is never invoked on the terminating NUL. Calling it there makes
   it look at the byte after the terminator, which is outside the string
   and could even extend the count if that byte looked like a continuation
   byte. */
static int
u8_strlen(
    char *s
)
{
    int count = 0;
    int i = 0;

    while (s[i] != '\0') {
        (void)u8_nextchar(s, &i);
        count++;
    }

    return count;
}

/* Reads the next UTF-8 sequence out of s starting at byte index *i and
   returns its code point; *i is advanced past the bytes consumed.

   The sequence consists of the byte at *i plus every continuation byte
   that directly follows it. No validation is performed.

   When *i already addresses the NUL terminator, 0 is returned and *i is
   stepped past it (as this routine has always done), but the byte after
   the terminator is no longer inspected.

   At most 6 bytes are folded into one character so that the lookup into
   offsetsFromUTF8 (6 entries) stays in range on malformed input with a
   long run of continuation bytes. */
static uint32_t
u8_nextchar(
    char *s,
    int *i
)
{
    uint32_t ch = 0;
    int sz = 0;

    if (s[*i] == '\0') {
        (*i)++;
        return 0;
    }

    do {
        ch <<= 6;
        ch += (unsigned char)s[(*i)++];
        sz++;
    } while (sz < 6 && s[*i] && !isutf(s[*i]));
    ch -= offsetsFromUTF8[sz - 1];

    return ch;
}

/* Move the byte index *i forward to the start of the next character.
   Advances at most 4 bytes, stopping as soon as the byte reached starts a
   new sequence; the fourth advance is made without inspecting the byte. */
static void
u8_inc(char *s, int *i)
{
    int step;

    for (step = 1; step <= 4; step++) {
        ++(*i);
        if (step < 4 && isutf(s[*i]))
            return;
    }
}

/* Move the byte index *i backward to the start of the previous character.
   Retreats at most 4 bytes, stopping as soon as the byte reached starts a
   sequence; the fourth retreat is made without inspecting the byte.
   No check is made against moving before the start of s. */
static void
u8_dec(char *s, int *i)
{
    int step;

    for (step = 1; step <= 4; step++) {
        --(*i);
        if (step < 4 && isutf(s[*i]))
            return;
    }
}

/* Returns 1 if c is one of the characters '0' through '7', 0 otherwise. */
static int
octal_digit(char c)
{
    if (c < '0' || c > '7') {
        return 0;
    }
    return 1;
}

/* Returns 1 if c is a hexadecimal digit ('0'-'9', 'A'-'F' or 'a'-'f'),
   0 otherwise. */
static int
hex_digit(char c)
{
    if (c >= '0' && c <= '9') {
        return 1;
    }
    if (c >= 'A' && c <= 'F') {
        return 1;
    }
    if (c >= 'a' && c <= 'f') {
        return 1;
    }
    return 0;
}

/* Decode one escape sequence.

   str   points at the character immediately after the backslash
   dest  receives the decoded code point

   Recognised forms: the single-letter escapes n t r b f v a, up to three
   octal digits, \x with up to 2 hex digits, \u with up to 4 and \U with up
   to 8. Any other character (and x/u/U without digits) stands for itself.

   Returns the number of input characters consumed, counted from str.

   The numeric forms are converted with strtoul() rather than strtol():
   a \U escape can carry a full 32-bit value, which strtol() clamps to
   LONG_MAX where long is 32 bits wide. The literal fallback goes through
   unsigned char so that a byte with the high bit set is not sign-extended
   into a huge 32-bit value. */
static int
u8_read_escape_sequence(
    char *str,
    uint32_t * dest
)
{
    uint32_t ch;
    char digs[9] = { 0 };       /* up to 8 digits plus terminator */
    int dno = 0, i = 1;
    int max_hex = 0;            /* > 0 when a hex form was recognised */

    ch = (uint32_t)(unsigned char)str[0];       /* take literal character */
    switch (str[0]) {
    case 'n':
        ch = '\n';
        break;
    case 't':
        ch = '\t';
        break;
    case 'r':
        ch = '\r';
        break;
    case 'b':
        ch = '\b';
        break;
    case 'f':
        ch = '\f';
        break;
    case 'v':
        ch = '\v';
        break;
    case 'a':
        ch = '\a';
        break;
    case 'x':
        max_hex = 2;
        break;
    case 'u':
        max_hex = 4;
        break;
    case 'U':
        max_hex = 8;
        break;
    default:
        if (octal_digit(str[0])) {
            /* the first digit is part of the number, so restart at 0 */
            i = 0;
            do {
                digs[dno++] = str[i++];
            } while (octal_digit(str[i]) && dno < 3);
            ch = (uint32_t)strtoul(digs, NULL, 8);
        }
        break;
    }

    if (max_hex > 0) {
        while (hex_digit(str[i]) && dno < max_hex) {
            digs[dno++] = str[i++];
        }
        /* without digits the letter itself remains the result */
        if (dno > 0)
            ch = (uint32_t)strtoul(digs, NULL, 16);
    }
    *dest = ch;

    return i;
}

/* convert a string with literal \uxxxx or \Uxxxxxxxx characters to UTF-8
   example: u8_unescape(mybuf, 256, "hello\\u220e")
   note the double backslash is needed if called on a C string literal

   buf  output buffer
   sz   size of buf in bytes
   src  NUL-terminated input; every byte that is not part of an escape is
        taken as a code point of its own

   Returns the number of bytes written, not counting the terminator. buf is
   NUL-terminated only if a byte is left over. Conversion stops at the
   first character whose encoding does not fit.

   A backslash that is the very last character of src is copied through as
   a literal backslash. Handing the terminator to the escape decoder would
   advance src beyond the end of the string. */
static int
u8_unescape(
    char *buf,
    int sz,
    char *src
)
{
    int c = 0, amt;
    uint32_t ch;
    char temp[4];

    while (*src && c < sz) {
        if (*src == '\\') {
            src++;
            if (*src == '\0') {
                /* dangling backslash: nothing follows to consume */
                ch = (uint32_t) '\\';
                amt = 0;
            }
            else {
                amt = u8_read_escape_sequence(src, &ch);
            }
        }
        else {
            ch = (uint32_t) * src;
            amt = 1;
        }
        src += amt;
        amt = u8_wc_toutf8(temp, ch);
        if (amt > sz - c)
            break;
        memcpy(&buf[c], temp, amt);
        c += amt;
    }
    if (c < sz)
        buf[c] = '\0';
    return c;
}

/* Write the ASCII escape form of the code point ch into buf (sz bytes).

   - newline, tab, CR, backspace, form feed, vertical tab, bell and
     backslash use their two-character C escapes
   - other values below 32, and 0x7f, become \x followed by hex digits
   - values above 0xFFFF become \U plus 8 hex digits
   - values from 0x80 to 0xFFFF become \u plus 4 hex digits
   - everything else is copied unchanged

   Returns whatever snprintf() returns, i.e. the length the escape needs,
   which may exceed what was actually stored when buf is too small. */
static int
u8_escape_wchar(char *buf, int sz, uint32_t ch)
{
    switch (ch) {
    case '\n':
        return snprintf(buf, sz, "\\n");
    case '\t':
        return snprintf(buf, sz, "\\t");
    case '\r':
        return snprintf(buf, sz, "\\r");
    case '\b':
        return snprintf(buf, sz, "\\b");
    case '\f':
        return snprintf(buf, sz, "\\f");
    case '\v':
        return snprintf(buf, sz, "\\v");
    case '\a':
        return snprintf(buf, sz, "\\a");
    case '\\':
        return snprintf(buf, sz, "\\\\");
    default:
        break;
    }

    /* remaining control characters */
    if (ch < 32 || ch == 0x7f) {
        return snprintf(buf, sz, "\\x%hhX", (unsigned char)ch);
    }
    /* outside the 16-bit range */
    if (ch > 0xFFFF) {
        return snprintf(buf, sz, "\\U%.8X", (uint32_t) ch);
    }
    /* non-ASCII within the 16-bit range */
    if (ch >= 0x80) {
        return snprintf(buf, sz, "\\u%.4hX", (unsigned short)ch);
    }

    /* printable ASCII */
    return snprintf(buf, sz, "%c", (char)ch);
}

/* Convert the UTF-8 string src to ASCII with escape sequences.

   buf            output buffer
   sz             size of buf in bytes
   src            NUL-terminated UTF-8 input
   escape_quotes  when nonzero, '"' is written as \"

   Returns the number of bytes stored in buf, not counting the terminator;
   the result is always NUL-terminated when sz > 0.

   snprintf() reports the length an escape needs, not the length it stored.
   An escape that does not fit completely, terminator included, is therefore
   dropped and conversion stops there. Adding the reported length regardless
   would let the result exceed sz and leave half an escape in buf. */
static int
u8_escape(
    char *buf,
    int sz,
    char *src,
    int escape_quotes
)
{
    int c = 0, i = 0, amt;

    while (src[i] && c < sz) {
        if (escape_quotes && src[i] == '"') {
            amt = snprintf(buf, sz - c, "\\\"");
            i++;
        }
        else {
            amt = u8_escape_wchar(buf, sz - c, u8_nextchar(src, &i));
        }
        if (amt < 0 || amt >= sz - c) {
            /* formatting failed or was truncated: discard the fragment */
            *buf = '\0';
            break;
        }
        c += amt;
        buf += amt;
    }
    if (c < sz)
        *buf = '\0';
    return c;
}

/* Find the first occurrence of the code point ch in the NUL-terminated
   UTF-8 string s.
   Returns a pointer to the first byte of the match, or NULL if ch does not
   occur. *charn receives the character index of the match; when nothing is
   found it holds the number of characters examined. */
static char *
u8_strchr(char *s, uint32_t ch, int *charn)
{
    int pos = 0;

    *charn = 0;
    for (;;) {
        const int start = pos;  /* byte offset of the character decoded next */

        if (!s[pos]) {
            return NULL;
        }
        if (u8_nextchar(s, &pos) == ch) {
            return &s[start];
        }
        (*charn)++;
    }
}

/* Like u8_strchr(), but searches the first sz bytes of s instead of
   stopping at a NUL byte.
   Returns a pointer to the first byte of the match, or NULL if ch does not
   occur. *charn receives the character index of the match; when nothing is
   found it holds the number of characters examined. */
static char *
u8_memchr(char *s, uint32_t ch, size_t sz, int *charn)
{
    int pos = 0;

    *charn = 0;
    while (pos < sz) {
        const int start = pos;  /* byte offset of the character decoded next */
        uint32_t cp = 0;
        int nbytes = 0;

        /* fold the lead byte and all continuation bytes that follow it */
        do {
            cp <<= 6;
            cp += (unsigned char)s[pos++];
            nbytes++;
        } while (pos < sz && !isutf(s[pos]));
        cp -= offsetsFromUTF8[nbytes - 1];

        if (cp == ch) {
            return &s[start];
        }
        (*charn)++;
    }
    return NULL;
}

/* Decide whether a locale name designates UTF-8 (logic based on libutf8).

   The name is scanned up to the first '@', '+' or ',' (or its end). If a
   '.' is met on the way, the text between that '.' and the next '@', '+',
   ',' or end of string is the encoding. Returns 1 when the encoding is
   exactly "UTF-8" or "utf8", 0 in every other case. */
static int
u8_is_locale_utf8(char *locale)
{
    const char *p = locale;
    const char *enc;
    long enc_len;

    /* look for the '.' that introduces the encoding */
    while (*p != '\0' && *p != '@' && *p != '+' && *p != ',' && *p != '.') {
        p++;
    }
    if (*p != '.') {
        return 0;               /* no encoding part at all */
    }

    /* measure the encoding name */
    enc = ++p;
    while (*p != '\0' && *p != '@' && *p != '+' && *p != ',') {
        p++;
    }
    enc_len = (long)(p - enc);

    if (enc_len == 5 && strncmp(enc, "UTF-8", 5) == 0) {
        return 1;               /* it's UTF-8 */
    }
    if (enc_len == 4 && strncmp(enc, "utf8", 4) == 0) {
        return 1;               /* it's UTF-8 */
    }
    return 0;
}

/* printf-style output where the format string and arguments may contain
   UTF-8: the text is formatted into a temporary buffer, converted to code
   points with u8_toucs() and printed with "%ls".

   Returns the number of characters converted, or the negative value
   reported by vsnprintf() if formatting fails.

   - A first attempt uses a 512-byte stack buffer. When that is too small a
     second buffer of exactly cnt + 1 bytes is allocated, which is the size
     the retry passes to vsnprintf().
   - A va_list must not be traversed twice, so the retry works on a copy
     taken with va_copy() before the first attempt.

   NOTE(review): the uint32_t array is handed to printf() as wchar_t *; this
   assumes wchar_t is 32 bits wide on the target -- confirm for platforms
   with a 16-bit wchar_t. */
static int
u8_vprintf(
    char *fmt,
    va_list ap
)
{
    int cnt, sz = 512;
    char *buf;
    uint32_t *wcs;
    va_list ap_retry;

    va_copy(ap_retry, ap);

    buf = (char *)alloca(sz);
    cnt = vsnprintf(buf, sz, fmt, ap);
    if (cnt >= sz) {
        /* output was truncated: retry once with a buffer that fits */
        sz = cnt + 1;
        buf = (char *)alloca(sz);
        cnt = vsnprintf(buf, sz, fmt, ap_retry);
    }
    va_end(ap_retry);

    if (cnt < 0)
        return cnt;

    wcs = (uint32_t *) alloca((cnt + 1) * sizeof(uint32_t));
    cnt = u8_toucs(wcs, cnt + 1, buf, cnt);
    printf("%ls", (wchar_t *) wcs);
    return cnt;
}

/* Variadic front end for u8_vprintf(): collects the arguments that follow
   fmt and returns whatever u8_vprintf() returns. */
static int
u8_printf(char *fmt, ...)
{
    va_list ap;
    int printed;

    va_start(ap, fmt);
    printed = u8_vprintf(fmt, ap);
    va_end(ap);

    return printed;
}

/* based on the valid_utf8 routine from the PCRE library by Philip Hazel

   length is in bytes, since without knowing whether the string is valid
   it's hard to know how many characters there are!

   Returns 1 if the first length bytes of str are structurally valid,
   0 otherwise. The checks are: every lead byte has its top two bits set,
   every byte after it starts with the bits 10, no sequence is overlong,
   and 0xfe / 0xff never occur as lead bytes. 5- and 6-byte forms are
   accepted as long as they pass those checks.

   Before the continuation bytes of a sequence are examined, the number of
   bytes left in the buffer is compared with the number the lead byte
   announces; a sequence cut short by the end of the buffer is rejected
   instead of being read beyond str + length. */
static int 
u8_isvalid(
    const char *str, 
    int length
)
{
    const unsigned char *p, *pend = (const unsigned char*)str + length;
    unsigned char c;
    int ab;

    for (p = (const unsigned char*)str; p < pend; p++) {
        c = *p;
        if (c < 128)
            continue;
        if ((c & 0xc0) != 0xc0)
            return 0;
        ab = trailingBytesForUTF8[c];

        /* all ab continuation bytes must lie inside the buffer */
        if (pend - p - 1 < ab)
            return 0;

        p++;
        /* Check top bits in the second byte */
        if ((*p & 0xc0) != 0x80)
            return 0;

        /* Check for overlong sequences for each different length */
        switch (ab) {
            /* Check for xx00 000x */
        case 1:
            if ((c & 0x3e) == 0) return 0;
            continue;   /* We know there aren't any more bytes to check */

            /* Check for 1110 0000, xx0x xxxx */
        case 2:
            if (c == 0xe0 && (*p & 0x20) == 0) return 0;
            break;

            /* Check for 1111 0000, xx00 xxxx */
        case 3:
            if (c == 0xf0 && (*p & 0x30) == 0) return 0;
            break;

            /* Check for 1111 1000, xx00 0xxx */
        case 4:
            if (c == 0xf8 && (*p & 0x38) == 0) return 0;
            break;

            /* Check for leading 0xfe or 0xff,
               and then for 1111 1100, xx00 00xx */
        case 5:
            if (c == 0xfe || c == 0xff ||
                (c == 0xfc && (*p & 0x3c) == 0)) return 0;
            break;
        }

        /* Check for valid bytes after the 2nd, if any; all must start 10 */
        while (--ab > 0) {
            if ((*(++p) & 0xc0) != 0x80) return 0;
        }
    }

    return 1;
}

/* Reverse the order of the UTF-8 characters in src.

   dest  output buffer with room for len + 1 bytes; must not overlap src
   src   input bytes
   len   number of bytes in src

   The bytes of each multi-byte sequence keep their order; only the
   sequences themselves are reversed. dest[len] is set to NUL.

   Returns 0 on success. Returns 1, leaving dest partially filled, when a
   byte that cannot start a sequence is met where a lead byte is expected,
   or when the last sequence is cut short by the end of the input.

   Sequences are copied with memcpy() because neither buffer is guaranteed
   to be aligned for 16- or 32-bit accesses. The length check keeps the
   write position from moving in front of dest on truncated input. */
static int 
u8_reverse(
    char *dest, 
    char * src, 
    size_t len
)
{
    size_t si = 0, di = len;

    dest[di] = '\0';
    while (si < len) {
        unsigned char c = (unsigned char)src[si];
        size_t n;

        if ((c & 0x80) == 0) {
            n = 1;              /* plain ASCII */
        }
        else {
            /* sequence length from the high nibble of the lead byte */
            switch (c >> 4) {
            case 0xC:
            case 0xD:
                n = 2;
                break;
            case 0xE:
                n = 3;
                break;
            case 0xF:
                n = 4;
                break;
            default:
                /* continuation byte in lead position */
                return 1;
            }
        }

        /* si + di == len holds here, so n <= len - si implies n <= di */
        if (n > len - si)
            return 1;

        di -= n;
        memcpy(&dest[di], &src[si], n);
        si += n;
    }
    return 0;
}


/*************** end utf8.c ************/


/*************** start config.c ************/
/*
 * This file is part of libswish3
 * Copyright (C) 2007 Peter Karman
 *
 *  libswish3 is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  libswish3 is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with libswish3; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
*/

#ifndef LIBSWISH3_SINGLE_FILE
#include <sys/param.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <locale.h>
#include <err.h>

#include "libswish3.h"
#endif

/* debug bit mask defined elsewhere; tested against the SWISH_DEBUG_* flags */
extern int SWISH_DEBUG;

/* --- value deallocators, passed to xmlHashFree() by swish_config_free() --- */
static void free_string(
    xmlChar *payload,
    xmlChar *key
);
static void free_props(
    swish_Property *prop,
    xmlChar *propname
);
static void free_metas(
    swish_MetaName *meta,
    xmlChar *metaname
);
/* --- per-entry printers, passed to xmlHashScan() by swish_config_debug();
       the middle argument is the label given to xmlHashScan() --- */
static void config_printer(
    xmlChar *val,
    xmlChar *str,
    xmlChar *key
);
static void stringlist_printer(
    swish_StringList *strlist,
    xmlChar *str,
    xmlChar *key
);
static void property_printer(
    swish_Property *prop,
    xmlChar *str,
    xmlChar *propname
);
static void metaname_printer(
    swish_MetaName *meta,
    xmlChar *str,
    xmlChar *metaname
);
/* --- helpers for combining two sets of properties / metanames; their
       definitions follow later in config.c --- */
static void copy_property(
    swish_Property *prop2,
    xmlHashTablePtr props1,
    xmlChar *prop2name
);
static void merge_properties(
    xmlHashTablePtr props1,
    xmlHashTablePtr props2
);
static void copy_metaname(
    swish_MetaName *meta2,
    xmlHashTablePtr metas1,
    xmlChar *meta2name
);
static void merge_metanames(
    xmlHashTablePtr metas1,
    xmlHashTablePtr metas2
);
/* deallocator for the values of config->stringlists */
static void
free_stringlist(
    swish_StringList *strlist,
    xmlChar *key
);

/* Hash deallocator for plain string values: frees payload with
   swish_xfree(). key is only used for the debug trace, which is emitted
   when SWISH_DEBUG_CONFIG is set. */
static void
free_string(xmlChar *payload, xmlChar *key)
{
    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("   freeing config %s => %s", key, payload);
    }

    swish_xfree(payload);
}

/* Hash deallocator for swish_StringList values: releases strlist with
   swish_stringlist_free(). When SWISH_DEBUG_CONFIG is set, the key, the
   number of strings and each string are traced first. */
static void
free_stringlist(swish_StringList *strlist, xmlChar *key)
{
    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        int idx = 0;

        SWISH_DEBUG_MSG("   freeing config->stringlists %s [%d strings]", key, strlist->n);
        while (idx < strlist->n) {
            SWISH_DEBUG_MSG("     string: %s", strlist->word[idx]);
            idx++;
        }
    }

    swish_stringlist_free(strlist);
}

/* Hash deallocator for swish_Property values: drops one reference and
   frees the property once its ref_cnt falls below 1. When
   SWISH_DEBUG_CONFIG is set, the property is dumped beforehand. */
static void
free_props(swish_Property *prop, xmlChar *propname)
{
    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("   freeing config->prop %s", propname);
        swish_property_debug(prop);
    }

    prop->ref_cnt -= 1;
    if (prop->ref_cnt >= 1) {
        return;                 /* still referenced from somewhere else */
    }
    swish_property_free(prop);
}

/* Hash deallocator for swish_MetaName values: drops one reference and
   frees the metaname once its ref_cnt falls below 1. When
   SWISH_DEBUG_CONFIG is set, the metaname is dumped beforehand. */
static void
free_metas(swish_MetaName *meta, xmlChar *metaname)
{
    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG(" freeing config->meta %s", metaname);
        swish_metaname_debug(meta);
    }

    meta->ref_cnt -= 1;
    if (meta->ref_cnt >= 1) {
        return;                 /* still referenced from somewhere else */
    }
    swish_metaname_free(meta);
}

/* Release a swish_Config and everything it owns.

   Every hash is freed with the deallocator matching its value type
   (strings, reference-counted properties and metanames, string lists),
   then the flags, and finally the struct itself. A warning is issued if
   ref_cnt is not 0 or if stash is still set at this point; neither stops
   the config from being freed.

   The debug trace prints the pointer as long values, so the conversions
   are %lx / %ld to match the arguments actually passed. */
void
swish_config_free(
    swish_Config *config
)
{
    if (SWISH_DEBUG & SWISH_DEBUG_MEMORY) {
        SWISH_DEBUG_MSG("freeing config");
        SWISH_DEBUG_MSG("ptr addr: 0x%lx  %ld", (unsigned long)config, (long int)config);
        swish_mem_debug();
    }

    xmlHashFree(config->misc, (xmlHashDeallocator)free_string);
    xmlHashFree(config->properties, (xmlHashDeallocator)free_props);
    xmlHashFree(config->metanames, (xmlHashDeallocator)free_metas);
    xmlHashFree(config->tag_aliases, (xmlHashDeallocator)free_string);
    xmlHashFree(config->parsers, (xmlHashDeallocator)free_string);
    /* mimes is NULL unless swish_config_set_default() filled it in */
    xmlHashFree(config->mimes, (xmlHashDeallocator)free_string);
    xmlHashFree(config->index, (xmlHashDeallocator)free_string);
    xmlHashFree(config->stringlists, (xmlHashDeallocator)free_stringlist);
    swish_config_flags_free(config->flags);

    if (config->ref_cnt != 0) {
        SWISH_WARN("config ref_cnt != 0: %d", config->ref_cnt);
    }

    if (config->stash != NULL) {
        SWISH_WARN("possible memory leak: config->stash was not freed");
    }

    swish_xfree(config);
}

/* Allocate a swish_ConfigFlags and fill in its defaults:
   tokenizing, xmlns ignoring and xinclude following are on, meta context
   cascading is off, undefined metanames use SWISH_UNDEF_METAS_INDEX,
   undefined attributes use SWISH_UNDEF_ATTRS_DISABLE, both max ids start
   at -1 and the two id lookup hashes start empty. */
swish_ConfigFlags *
swish_config_init_flags(
)
{
    swish_ConfigFlags *fresh = swish_xmalloc(sizeof(swish_ConfigFlags));

    /* boolean switches */
    fresh->tokenize = SWISH_TRUE;
    fresh->cascade_meta_context = SWISH_FALSE;  /* add tokens to every metaname in the stack */
    fresh->ignore_xmlns = SWISH_TRUE;
    fresh->follow_xinclude = SWISH_TRUE;

    /* handling of undefined names */
    fresh->undef_metas = SWISH_UNDEF_METAS_INDEX;
    fresh->undef_attrs = SWISH_UNDEF_ATTRS_DISABLE;

    /* id bookkeeping: nothing registered yet */
    fresh->max_meta_id = -1;
    fresh->max_prop_id = -1;
    fresh->meta_ids = swish_hash_init(8);
    fresh->prop_ids = swish_hash_init(8);
    //fresh->contexts = swish_hash_init(8); // TODO cache these to save malloc/frees

    return fresh;
}

/* Release a swish_ConfigFlags.
   The two id hashes are freed without a value deallocator: they are lookup
   conveniences whose values are really freed in swish_config_free().
   If any debug bit is set, the scalar flags are dumped before the struct
   itself is released. */
void
swish_config_flags_free(
    swish_ConfigFlags * flags
)
{
    xmlHashFree(flags->meta_ids, NULL);
    xmlHashFree(flags->prop_ids, NULL);

    if (SWISH_DEBUG != 0) {
        swish_config_flags_debug(flags);
    }

    swish_xfree(flags);
}

/* Emit one debug line for each scalar member of flags. The id hashes are
   not printed. */
void
swish_config_flags_debug(
    swish_ConfigFlags *flags
)
{
    const swish_ConfigFlags *f = flags;

    /* boolean switches */
    SWISH_DEBUG_MSG("config->tokenize == %d", f->tokenize);
    SWISH_DEBUG_MSG("config->cascade_meta_context == %d", f->cascade_meta_context);
    SWISH_DEBUG_MSG("config->ignore_xmlns == %d", f->ignore_xmlns);
    SWISH_DEBUG_MSG("config->follow_xinclude == %d", f->follow_xinclude);

    /* handling of undefined names */
    SWISH_DEBUG_MSG("config->undef_metas == %d", f->undef_metas);
    SWISH_DEBUG_MSG("config->undef_attrs == %d", f->undef_attrs);

    /* highest ids handed out so far */
    SWISH_DEBUG_MSG("config->max_meta_id == %d", f->max_meta_id);
    SWISH_DEBUG_MSG("config->max_prop_id == %d", f->max_prop_id);
}

/* init config object

   Allocates a swish_Config with default flags and empty hashes. mimes is
   left NULL (swish_config_set_default() fills it in), ref_cnt starts at 0
   and stash at NULL.

   The debug trace prints the pointer as a long value, so the conversion is
   %lx to match the argument actually passed. */
swish_Config *
swish_config_init(
)
{
    swish_Config *config;

    if (SWISH_DEBUG & SWISH_DEBUG_MEMORY) {
        SWISH_DEBUG_MSG("init config");
    }

/* the hashes will automatically grow as needed so we init with sane starting size */
    config = swish_xmalloc(sizeof(swish_Config));
    config->flags = swish_config_init_flags();
    config->misc = swish_hash_init(8);
    config->metanames = swish_hash_init(8);
    config->properties = swish_hash_init(8);
    config->parsers = swish_hash_init(8);
    config->index = swish_hash_init(8);
    config->tag_aliases = swish_hash_init(8);   /* alias => real */
    config->stringlists = swish_hash_init(8);
    config->mimes = NULL;
    config->ref_cnt = 0;
    config->stash = NULL;

    if (SWISH_DEBUG & SWISH_DEBUG_MEMORY) {
        SWISH_DEBUG_MSG("config ptr 0x%lx", (unsigned long)config);
    }

    return config;

}

/* Populate config with the built-in defaults:
     - the default MIME table
     - the default and title metanames
     - the description and title properties
     - the parser table (by MIME type, plus the default parser)
     - index format, name and locale
     - the tag aliases for the title and body tags
   Each metaname and property is stored under its name and, via the flags
   hashes, under its id rendered as a string; max_meta_id / max_prop_id are
   updated to the highest id seen. */
void
swish_config_set_default(
    swish_Config *config
)
{
    swish_MetaName *meta;
    swish_Property *prop;
    xmlChar *id_key;            /* id rendered as a string, used as hash key */

    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("setting default config");
    }

    /* values are swish_xstrdup'd throughout so that swish_config_free()
       can free them uniformly */

    /* ---- MIME types ---- */
    config->mimes = swish_mime_defaults();

    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("mime hash set");
    }

    /* ---- metaname: default ---- */
    meta = swish_metaname_init(swish_xstrdup((xmlChar *)SWISH_DEFAULT_METANAME));
    meta->ref_cnt++;
    meta->id = SWISH_META_DEFAULT_ID;
    id_key = swish_int_to_string(SWISH_META_DEFAULT_ID);
    swish_hash_add(config->flags->meta_ids, id_key, meta);
    swish_hash_add(config->metanames, (xmlChar *)SWISH_DEFAULT_METANAME, meta);
    swish_xfree(id_key);
    /* first metaname registered: it defines the maximum */
    config->flags->max_meta_id = meta->id;
    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("swishdefault metaname set");
    }

    /* ---- metaname: title ---- */
    meta = swish_metaname_init(swish_xstrdup((xmlChar *)SWISH_TITLE_METANAME));
    meta->ref_cnt++;
    meta->id = SWISH_META_TITLE_ID;
    id_key = swish_int_to_string(SWISH_META_TITLE_ID);
    swish_hash_add(config->flags->meta_ids, id_key, meta);
    swish_hash_add(config->metanames, (xmlChar *)SWISH_TITLE_METANAME, meta);
    swish_xfree(id_key);
    if (config->flags->max_meta_id < meta->id) {
        config->flags->max_meta_id = meta->id;
    }
    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("swishtitle metaname set");
    }

    /* ---- property: description (sorting switched off) ---- */
    prop = swish_property_init(swish_xstrdup((xmlChar *)SWISH_PROP_DESCRIPTION));
    prop->ref_cnt++;
    prop->id = SWISH_PROP_DESCRIPTION_ID;
    prop->sort = SWISH_FALSE;
    swish_hash_add(config->properties, (xmlChar *)SWISH_PROP_DESCRIPTION, prop);
    id_key = swish_int_to_string(SWISH_PROP_DESCRIPTION_ID);
    swish_hash_add(config->flags->prop_ids, id_key, prop);
    swish_xfree(id_key);
    /* first property registered: it defines the maximum */
    config->flags->max_prop_id = prop->id;

    /* ---- property: title ---- */
    prop = swish_property_init(swish_xstrdup((xmlChar *)SWISH_PROP_TITLE));
    prop->ref_cnt++;
    prop->id = SWISH_PROP_TITLE_ID;
    swish_hash_add(config->properties, (xmlChar *)SWISH_PROP_TITLE, prop);
    id_key = swish_int_to_string(SWISH_PROP_TITLE_ID);
    swish_hash_add(config->flags->prop_ids, id_key, prop);
    swish_xfree(id_key);
    if (config->flags->max_prop_id < prop->id) {
        config->flags->max_prop_id = prop->id;
    }

    /* ---- parsers: MIME type => parser name ---- */
    swish_hash_add(config->parsers, (xmlChar *)"text/plain",
                   swish_xstrdup((xmlChar *)SWISH_PARSER_TXT));
    swish_hash_add(config->parsers, (xmlChar *)"application/xml",
                   swish_xstrdup((xmlChar *)SWISH_PARSER_XML));
    swish_hash_add(config->parsers, (xmlChar *)"text/xml",
                   swish_xstrdup((xmlChar *)SWISH_PARSER_XML));
    swish_hash_add(config->parsers, (xmlChar *)"text/html",
                   swish_xstrdup((xmlChar *)SWISH_PARSER_HTML));
    swish_hash_add(config->parsers, (xmlChar *)SWISH_DEFAULT_PARSER,
                   swish_xstrdup((xmlChar *)SWISH_DEFAULT_PARSER_TYPE));

    /* ---- index settings; the locale is the current LC_CTYPE locale ---- */
    swish_hash_add(config->index, (xmlChar *)SWISH_INDEX_FORMAT,
                   swish_xstrdup((xmlChar *)SWISH_INDEX_FILEFORMAT));
    swish_hash_add(config->index, (xmlChar *)SWISH_INDEX_NAME,
                   swish_xstrdup((xmlChar *)SWISH_INDEX_FILENAME));
    swish_hash_add(config->index, (xmlChar *)SWISH_INDEX_LOCALE,
                   swish_xstrdup((xmlChar *)setlocale(LC_CTYPE, NULL)));

    /* ---- aliases: other names a tag might be known as, for matching
       properties and metanames ---- */
    swish_hash_add(config->tag_aliases, (xmlChar *)SWISH_TITLE_TAG,
                   swish_xstrdup((xmlChar *)SWISH_TITLE_METANAME));
    swish_hash_add(config->tag_aliases, (xmlChar *)SWISH_BODY_TAG,
                   swish_xstrdup((xmlChar *)SWISH_PROP_DESCRIPTION));

    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("config_set_default done");
        swish_config_debug(config);
    }

}

/* Apply conf to config via swish_config_parse() and return the resulting
   config. When SWISH_DEBUG_CONFIG is set, the result is dumped as well. */
swish_Config *
swish_config_add(swish_Config *config, xmlChar *conf)
{
    swish_Config *updated = swish_config_parse(config, conf);

    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        swish_config_debug(updated);
    }

    return updated;
}

/* Hand conf (used as the filename argument) and config to
   swish_header_merge() and return config. The boolean result of the merge
   is not examined. */
swish_Config *
swish_config_parse(swish_Config *config, xmlChar *conf)
{
    char *header_file = (char *)conf;

    (void)swish_header_merge(header_file, config);

    return config;
}

/*
 * Hash-scan callback for string-valued hashes: emits one debug line of
 * the form " <label>:  <key> => <value>".
 *
 * val is the stored value, str the label passed to the scan, key the
 * entry's key.
 */
static void
config_printer(
    xmlChar *val,
    xmlChar *str,
    xmlChar *key
)
{
    SWISH_DEBUG_MSG(" %s:  %s => %s",
                    str,
                    key,
                    val);
}

/*
 * Hash-scan callback for swish_StringList values: emits one debug line
 * per word held in the list, each tagged with the label (str) and the
 * hash key.
 */
static void
stringlist_printer(
    swish_StringList *strlist,
    xmlChar *str,
    xmlChar *key
)
{
    int idx = 0;

    while (idx < strlist->n) {
        SWISH_DEBUG_MSG(" %s: %s => %s", str, key, strlist->word[idx]);
        idx++;
    }
}

/*
 * Hash-scan callback for swish_Property values: prints the label and the
 * property name, then delegates the details to swish_property_debug().
 */
static void
property_printer(
    swish_Property *prop,
    xmlChar *str,
    xmlChar *propname
)
{
    SWISH_DEBUG_MSG(" %s:  %s =>",
                    str,
                    propname);

    swish_property_debug(prop);
}

/*
 * Hash-scan callback for swish_MetaName values: prints the label and the
 * metaname's key, then delegates the details to swish_metaname_debug().
 */
static void
metaname_printer(
    swish_MetaName *meta,
    xmlChar *str,
    xmlChar *metaname
)
{
    SWISH_DEBUG_MSG(" %s:  %s =>",
                    str,
                    metaname);

    swish_metaname_debug(meta);
}

/* PUBLIC */
/*
 * Dump a config through the debug channel: its ref count, the addresses
 * of the stash and of the config itself (in hex and in decimal), every
 * hash it holds, and finally its flags.
 */
void
swish_config_debug(
    swish_Config *config
)
{
    SWISH_DEBUG_MSG("config->ref_cnt = %d", config->ref_cnt);

    /* the addresses are passed as long-sized integers, so the conversion
     * specifiers must carry the 'l' length modifier */
    SWISH_DEBUG_MSG("config->stash address = 0x%lx  %ld",
                    (unsigned long)config->stash, (long int)config->stash);
    SWISH_DEBUG_MSG("ptr addr: 0x%lx  %ld",
                    (unsigned long)config, (long int)config);

    xmlHashScan(config->misc, (xmlHashScanner)config_printer, "misc conf");
    xmlHashScan(config->stringlists, (xmlHashScanner)stringlist_printer, "stringlists");
    xmlHashScan(config->properties, (xmlHashScanner)property_printer, "properties");
    xmlHashScan(config->metanames, (xmlHashScanner)metaname_printer, "metanames");
    xmlHashScan(config->parsers, (xmlHashScanner)config_printer, "parsers");
    xmlHashScan(config->mimes, (xmlHashScanner)config_printer, "mimes");
    xmlHashScan(config->index, (xmlHashScanner)config_printer, "index");
    xmlHashScan(config->tag_aliases, (xmlHashScanner)config_printer, "tag_aliases");
    swish_config_flags_debug(config->flags);
}

/*
 * Hash-scan callback used when merging property tables.
 *
 * prop2 is an entry of the source table, props1 the destination table and
 * prop2name the key prop2 is stored under. If props1 already has an entry
 * under that key it is updated in place; otherwise a new property is
 * created and added. In both cases the scalar fields and alias_for are
 * copied from prop2.
 */
static void
copy_property(
    swish_Property *prop2,
    xmlHashTablePtr props1,
    xmlChar *prop2name
)
{
    swish_Property *prop1;

    if (swish_hash_exists(props1, prop2name)) {
        prop1 = swish_hash_fetch(props1, prop2name);
        if (prop1->name != NULL) {
            swish_xfree(prop1->name);
            prop1->name = swish_xstrdup(prop2->name);
        }
    }
    else {
        prop1 = swish_property_init(swish_xstrdup(prop2name));
        prop1->ref_cnt++;
        swish_hash_add(props1, prop1->name, prop1);
    }

    prop1->id = prop2->id;
    prop1->ignore_case = prop2->ignore_case;
    prop1->type = prop2->type;
    prop1->verbatim = prop2->verbatim;

    if (prop1->alias_for != NULL) {
        swish_xfree(prop1->alias_for);
        /* clear the pointer so it does not dangle when prop2 has no alias */
        prop1->alias_for = NULL;
    }
    if (prop2->alias_for != NULL) {
        prop1->alias_for = swish_xstrdup(prop2->alias_for);
    }

    prop1->max = prop2->max;
    prop1->sort = prop2->sort;

}

/*
 * Walk every entry of props2 and copy it into props1 via copy_property().
 */
static void
merge_properties(
    xmlHashTablePtr props1,
    xmlHashTablePtr props2
)
{
    xmlHashScanner per_entry = (xmlHashScanner)copy_property;

    xmlHashScan(props2, per_entry, props1);
}

/*
 * Hash-scan callback used when merging metaname tables.
 *
 * meta2 is an entry of the source table, metas1 the destination table and
 * meta2name the key meta2 is stored under. If metas1 already has an entry
 * under that key it is updated in place; otherwise a new metaname is
 * created and added. In both cases id, bias and alias_for are copied from
 * meta2.
 */
static void
copy_metaname(
    swish_MetaName *meta2,
    xmlHashTablePtr metas1,
    xmlChar *meta2name
)
{
    swish_MetaName *meta1;

    if (swish_hash_exists(metas1, meta2name)) {
        meta1 = swish_hash_fetch(metas1, meta2name);
        if (meta1->name != NULL) {
            swish_xfree(meta1->name);
            meta1->name = swish_xstrdup(meta2->name);
        }
    }
    else {
        meta1 = swish_metaname_init(swish_xstrdup(meta2name));
        meta1->ref_cnt++;
        swish_hash_add(metas1, meta1->name, meta1);
    }

    /* NOTE(review): an earlier comment here said the id should only change
     * when meta2->id is not already spoken for, but the id has always been
     * copied unconditionally -- confirm which is intended. */
    meta1->id = meta2->id;
    meta1->bias = meta2->bias;

    if (meta1->alias_for != NULL) {
        swish_xfree(meta1->alias_for);
        /* clear the pointer so it does not dangle when meta2 has no alias */
        meta1->alias_for = NULL;
    }
    if (meta2->alias_for != NULL) {
        meta1->alias_for = swish_xstrdup(meta2->alias_for);
    }

}

/*
 * Walk every entry of metas2 and copy it into metas1 via copy_metaname().
 */
static void
merge_metanames(
    xmlHashTablePtr metas1,
    xmlHashTablePtr metas2
)
{
    xmlHashScanner per_entry = (xmlHashScanner)copy_metaname;

    xmlHashScan(metas2, per_entry, metas1);
}

/*
 * Hash-scan callback used when merging string-list tables.
 *
 * When strlists1 has no list under key, a copy of strlist2 is stored
 * there; otherwise strlist2 is merged into the existing list with
 * swish_stringlist_merge().
 */
static void
copy_strlist(
    swish_StringList *strlist2,
    xmlHashTablePtr strlists1,
    xmlChar *key
)
{
    swish_StringList *target;

    if (!swish_hash_exists(strlists1, key)) {
        target = swish_stringlist_copy(strlist2);
        swish_hash_add(strlists1, key, target);
        return;
    }

    target = swish_hash_fetch(strlists1, key);
    swish_stringlist_merge(strlist2, target);
}

/*
 * Walk every entry of strlists2 and fold it into strlists1 via
 * copy_strlist().
 */
static void
merge_stringlists(
    xmlHashTablePtr strlists1,
    xmlHashTablePtr strlists2
)
{
    xmlHashScanner per_entry = (xmlHashScanner)copy_strlist;

    xmlHashScan(strlists2, per_entry, strlists1);
}

/*
 * Merge config2 into config1; values in config2 override those in config1.
 *
 * Properties, metanames, parsers, mimes, index, tag_aliases, misc and
 * stringlists are merged hash by hash. Afterwards the flags are synced:
 * for each flag that has a matching key in config2->misc, config2's own
 * flag is first refreshed from that string value, and then config2's flag
 * is copied into config1. max_meta_id and max_prop_id in config1 are only
 * ever raised, never lowered.
 *
 * Note that config2->flags is written to as a side effect. An unknown
 * value for the undefined-metatags or undefined-xml-attributes settings
 * is reported through SWISH_CROAK.
 */
void
swish_config_merge(
    swish_Config *config1,
    swish_Config *config2
)
{

    xmlChar *v;

/* values in config2 override and are set in config1 */

    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        /* %lx needs an unsigned long argument, so convert the pointers */
        SWISH_DEBUG_MSG("Merging config2 0x%lx into config1 0x%lx",
            (unsigned long)config2, (unsigned long)config1);
        swish_config_debug(config2);
        swish_config_debug(config1);
    }


    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("merge properties");
    }
    merge_properties(config1->properties, config2->properties);

    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("merge metanames");
    }
    merge_metanames(config1->metanames, config2->metanames);

    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("merge parsers");
    }
    swish_hash_merge(config1->parsers, config2->parsers);

    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("merge mimes");
    }
    swish_hash_merge(config1->mimes, config2->mimes);

    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("merge index");
    }
    swish_hash_merge(config1->index, config2->index);

    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("merge tag_aliases");
    }
    swish_hash_merge(config1->tag_aliases, config2->tag_aliases);

    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("merge misc");
    }
    swish_hash_merge(config1->misc, config2->misc);
    
    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("merge stringlists");
    }
    merge_stringlists(config1->stringlists, config2->stringlists);

    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("merge complete");
    }

/* set flags: refresh config2's flag from its misc hash when the key is
 * present, then copy config2's flag into config1 */
    if (swish_hash_exists(config2->misc, BAD_CAST SWISH_TOKENIZE)) {
        config2->flags->tokenize = swish_string_to_boolean(swish_hash_fetch(config2->misc, BAD_CAST SWISH_TOKENIZE));
    }
    config1->flags->tokenize = config2->flags->tokenize;
    if (swish_hash_exists(config2->misc, BAD_CAST SWISH_CASCADE_META_CONTEXT)) {
        config2->flags->cascade_meta_context = 
            swish_string_to_boolean(swish_hash_fetch(config2->misc, BAD_CAST SWISH_CASCADE_META_CONTEXT));
    }
    config1->flags->cascade_meta_context = config2->flags->cascade_meta_context;
    if (swish_hash_exists(config2->misc, BAD_CAST SWISH_IGNORE_XMLNS)) {
        config2->flags->ignore_xmlns = 
            swish_string_to_boolean(swish_hash_fetch(config2->misc, BAD_CAST SWISH_IGNORE_XMLNS));
    }
    config1->flags->ignore_xmlns = config2->flags->ignore_xmlns;
    if (swish_hash_exists(config2->misc, BAD_CAST SWISH_FOLLOW_XINCLUDE)) {
        config2->flags->follow_xinclude =
            swish_string_to_boolean(swish_hash_fetch(config2->misc, BAD_CAST SWISH_FOLLOW_XINCLUDE));
    }
    config1->flags->follow_xinclude = config2->flags->follow_xinclude;

    /* undefined metatags: map the textual policy onto its enum value */
    if (swish_hash_exists(config2->misc, BAD_CAST SWISH_UNDEFINED_METATAGS)) {
        v = swish_hash_fetch(config2->misc, BAD_CAST SWISH_UNDEFINED_METATAGS);
        if (xmlStrEqual(v, BAD_CAST "error")) {
            config2->flags->undef_metas = SWISH_UNDEF_METAS_ERROR;
        }
        else if (xmlStrEqual(v, BAD_CAST "ignore")) {
            config2->flags->undef_metas = SWISH_UNDEF_METAS_IGNORE;
        }
        else if (xmlStrEqual(v, BAD_CAST "index")) {
            config2->flags->undef_metas = SWISH_UNDEF_METAS_INDEX;
        }
        else if (xmlStrEqual(v, BAD_CAST "auto")) {
            config2->flags->undef_metas = SWISH_UNDEF_METAS_AUTO;
        }
        else if (xmlStrEqual(v, BAD_CAST "autoall")) {
            config2->flags->undef_metas = SWISH_UNDEF_METAS_AUTOALL;
        }
        else {
            SWISH_CROAK("Unknown value for %s: %s", SWISH_UNDEFINED_METATAGS, v);
        }
    }
    config1->flags->undef_metas = config2->flags->undef_metas;

    /* undefined XML attributes: same mapping, plus a "disable" policy */
    if (swish_hash_exists(config2->misc, BAD_CAST SWISH_UNDEFINED_XML_ATTRIBUTES)) {
        v = swish_hash_fetch(config2->misc, BAD_CAST SWISH_UNDEFINED_XML_ATTRIBUTES);
        if (xmlStrEqual(v, BAD_CAST "error")) {
            config2->flags->undef_attrs = SWISH_UNDEF_ATTRS_ERROR;
        }
        else if (xmlStrEqual(v, BAD_CAST "ignore")) {
            config2->flags->undef_attrs = SWISH_UNDEF_ATTRS_IGNORE;
        }
        else if (xmlStrEqual(v, BAD_CAST "index")) {
            config2->flags->undef_attrs = SWISH_UNDEF_ATTRS_INDEX;
        }
        else if (xmlStrEqual(v, BAD_CAST "auto")) {
            config2->flags->undef_attrs = SWISH_UNDEF_ATTRS_AUTO;
        }
        else if (xmlStrEqual(v, BAD_CAST "autoall")) {
            config2->flags->undef_attrs = SWISH_UNDEF_ATTRS_AUTOALL;
        }
        else if (xmlStrEqual(v, BAD_CAST "disable")) {
            config2->flags->undef_attrs = SWISH_UNDEF_ATTRS_DISABLE;
        }
        else {
            SWISH_CROAK("Unknown value for %s: %s", SWISH_UNDEFINED_XML_ATTRIBUTES, v);
        }
    }
    config1->flags->undef_attrs = config2->flags->undef_attrs;
    
    /* the id high-water marks only ever grow */
    if (config1->flags->max_meta_id < config2->flags->max_meta_id) {
        config1->flags->max_meta_id = config2->flags->max_meta_id;
    }
    if (config1->flags->max_prop_id < config2->flags->max_prop_id) {
        config1->flags->max_prop_id = config2->flags->max_prop_id;
    }


    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("flags set");
    }

}


/*************** end config.c ************/


/*************** start docinfo.c ************/
/*
 * This file is part of libswish3
 * Copyright (C) 2007 Peter Karman
 *
 *  libswish3 is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  libswish3 is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with libswish3; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
*/

/* docinfo.c -- stat and time of files */

#ifndef LIBSWISH3_SINGLE_FILE
#include <sys/param.h>
#include <stdio.h>
#include <string.h>
#include <err.h>
#include <errno.h>
#include <limits.h>
#include <stdlib.h>

#include "libswish3.h"
#endif

/* errno comes from <errno.h>, which is included above; declaring an
 * identifier named errno ourselves is undefined behavior in ISO C, so only
 * the project-wide debug mask is declared here. */
extern int SWISH_DEBUG;

/* PUBLIC */
/*
 * Allocate a swish_DocInfo and give every field its starting value:
 * counters and sizes are zero, encoding is a fresh copy of
 * SWISH_DEFAULT_ENCODING, all other string fields are NULL and
 * is_gzipped is SWISH_FALSE. The new struct is returned with ref_cnt 0.
 */
swish_DocInfo *
swish_docinfo_init(
)
{
    swish_DocInfo *info;

    if (SWISH_DEBUG & SWISH_DEBUG_MEMORY) {
        SWISH_DEBUG_MSG("init'ing docinfo");
    }

    info = swish_xmalloc(sizeof(swish_DocInfo));

    /* numeric state */
    info->ref_cnt = 0;
    info->nwords = 0;
    info->mtime = 0;
    info->size = 0;
    info->is_gzipped = SWISH_FALSE;

    /* string state: only the encoding has a default */
    info->encoding = swish_xstrdup((xmlChar *)SWISH_DEFAULT_ENCODING);
    info->uri = NULL;
    info->mime = NULL;
    info->parser = NULL;
    info->ext = NULL;
    info->action = NULL;

    return info;
}

/* PUBLIC */
/*
 * Release a swish_DocInfo together with its encoding, mime, uri, ext and
 * parser strings. A warning is issued if ref_cnt is not zero, but the
 * struct is freed regardless. With memory debugging on, each step is
 * traced.
 *
 * NOTE(review): ptr->action is not released here -- confirm whether it
 * can ever hold heap memory.
 */
void
swish_docinfo_free(
    swish_DocInfo *ptr
)
{
    const int trace = (SWISH_DEBUG & SWISH_DEBUG_MEMORY) ? 1 : 0;

    if (trace) {
        SWISH_DEBUG_MSG("freeing swish_DocInfo");
        swish_docinfo_debug(ptr);
    }

    if (ptr->ref_cnt != 0) {
        SWISH_WARN("docinfo ref_cnt != 0: %d", ptr->ref_cnt);
    }

    /* scalar fields are reset before the struct goes away */
    ptr->nwords = 0;
    ptr->mtime = 0;
    ptr->size = 0;
    ptr->is_gzipped = SWISH_FALSE;

    /* encoding is freed unconditionally; the remaining strings may be
     * NULL and are only freed when set */
    if (trace) {
        SWISH_DEBUG_MSG("freeing docinfo->encoding");
    }
    swish_xfree(ptr->encoding);

    if (trace) {
        SWISH_DEBUG_MSG("freeing docinfo->mime");
    }
    if (ptr->mime != NULL) {
        swish_xfree(ptr->mime);
    }

    if (trace) {
        SWISH_DEBUG_MSG("freeing docinfo->uri");
    }
    if (ptr->uri != NULL) {
        swish_xfree(ptr->uri);
    }

    if (trace) {
        SWISH_DEBUG_MSG("freeing docinfo->ext");
    }
    if (ptr->ext != NULL) {
        swish_xfree(ptr->ext);
    }

    if (trace) {
        SWISH_DEBUG_MSG("freeing docinfo->parser");
    }
    if (ptr->parser != NULL) {
        swish_xfree(ptr->parser);
    }

    if (trace) {
        SWISH_DEBUG_MSG("freeing docinfo ptr");
    }
    swish_xfree(ptr);

    if (trace) {
        SWISH_DEBUG_MSG("swish_DocInfo all freed");
    }
}

/*
 * Validate a docinfo and fill in whatever can be derived.
 *
 * Croaks when uri is missing, when size is -1, or when size is 0.
 * Otherwise: ext defaults to the uri's file extension (or the literal
 * "none" when the uri has no extension), mime defaults to the type looked
 * up from ext, and parser defaults to the parser looked up from mime.
 * Values already present are left alone.
 *
 * Always returns 1.
 */
int
swish_docinfo_check(
    swish_DocInfo *docinfo,
    swish_Config *config
)
{
    const int trace = (SWISH_DEBUG & SWISH_DEBUG_DOCINFO) ? 1 : 0;
    int ok = 1;
    xmlChar *uri_ext;

    if (trace) {
        swish_docinfo_debug(docinfo);
    }

    if (!docinfo->uri) {
        SWISH_CROAK("Failed to return required header Content-Location:");
    }

    if (docinfo->size == -1) {
        SWISH_CROAK
            ("Failed to return required header Content-Length: for doc '%s'",
             docinfo->uri);
    }

    /* might make this conditional on verbose level */
    if (docinfo->size == 0) {
        SWISH_CROAK("Found zero Content-Length for doc '%s'", docinfo->uri);
    }

    /* this fails with non-filenames like db ids, etc. */
    uri_ext = swish_fs_get_file_ext(docinfo->uri);

    if (docinfo->ext == NULL) {
        docinfo->ext = (uri_ext != NULL)
            ? swish_xstrdup(uri_ext)
            : swish_xstrdup((xmlChar *)"none");
    }

    /* the extension string was only needed for the copy above */
    if (uri_ext != NULL) {
        swish_xfree(uri_ext);
    }

    if (docinfo->mime) {
        if (trace) {
            SWISH_DEBUG_MSG("found MIME type in headers: '%s'", docinfo->mime);
        }
    }
    else {
        if (trace) {
            SWISH_DEBUG_MSG
                ("no MIME known. guessing based on uri extension '%s'",
                 docinfo->ext);
        }
        docinfo->mime = swish_mime_get_type(config, docinfo->ext);
    }

    if (docinfo->parser) {
        if (trace) {
            SWISH_DEBUG_MSG("found parser in headers: '%s'", docinfo->parser);
        }
    }
    else {
        if (trace) {
            SWISH_DEBUG_MSG
                ("no parser defined in headers -- deducing from content type '%s'",
                 docinfo->mime);
        }
        docinfo->parser = swish_mime_get_parser(config, docinfo->mime);
    }

    if (trace) {
        swish_docinfo_debug(docinfo);
    }

    return ok;

}

/* PUBLIC */
/*
 * Populate docinfo i from a file on disk.
 *
 * ext is taken from filename; when that extension is "gz" the docinfo is
 * flagged as gzipped and ext is recomputed from the name with its last
 * three characters removed. If the file cannot be stat'ed a warning is
 * issued and 0 is returned (ext and is_gzipped have already been updated
 * by then). Otherwise uri, mtime, size, mime and parser are replaced and
 * 1 is returned.
 */
int
swish_docinfo_from_filesystem(
    xmlChar *filename,
    swish_DocInfo *i,
    swish_ParserData *parser_data
)
{
    const int trace = (SWISH_DEBUG & SWISH_DEBUG_DOCINFO) ? 1 : 0;
    xmlChar *without_gz;
    unsigned int name_len;

    if (i->ext != NULL) {
        swish_xfree(i->ext);
    }
    i->ext = swish_fs_get_file_ext(filename);

    if (xmlStrEqual(i->ext, BAD_CAST "gz")) {
        i->is_gzipped = SWISH_TRUE;

        /* get new ext: chop the trailing three characters off a copy of
         * the name and look for an extension again */
        without_gz = swish_xstrdup(filename);
        name_len = xmlStrlen(filename);
        without_gz[name_len - 3] = '\0';

        swish_xfree(i->ext);
        i->ext = swish_fs_get_file_ext(without_gz);
        swish_xfree(without_gz);
    }

    if (!swish_fs_file_exists(filename)) {
        SWISH_WARN("Can't stat '%s': %s", filename, strerror(errno));
        return 0;
    }

    if (trace) {
        SWISH_DEBUG_MSG("handling url %s", filename);
    }
    if (i->uri != NULL) {
        swish_xfree(i->uri);
    }
    i->uri = swish_xstrdup(filename);
    i->mtime = swish_fs_get_file_mtime(filename);
    i->size = swish_fs_get_file_size(filename);

    if (trace) {
        SWISH_DEBUG_MSG("handling mime");
    }
    if (i->mime != NULL) {
        swish_xfree(i->mime);
    }
    i->mime = swish_mime_get_type(parser_data->s3->config, i->ext);

    if (trace) {
        SWISH_DEBUG_MSG("handling parser");
    }
    if (i->parser != NULL) {
        swish_xfree(i->parser);
    }
    i->parser = swish_mime_get_parser(parser_data->s3->config, i->mime);

    return 1;

}

/* PUBLIC */
/*
 * Print every field of a docinfo through the debug channel. The mtime is
 * shown both as a raw number and as the string produced by
 * swish_time_format(); that string is freed before returning.
 */
void
swish_docinfo_debug(
    swish_DocInfo *docinfo
)
{
    char *mtime_str = swish_time_format(docinfo->mtime);

    SWISH_DEBUG_MSG("DocInfo");
    SWISH_DEBUG_MSG("  docinfo ptr: %lu", (unsigned long)docinfo);

    /* the number in parentheses is sizeof() of the field itself */
    SWISH_DEBUG_MSG("  uri: %s (%d)",
                    docinfo->uri, (int)sizeof(docinfo->uri));
    SWISH_DEBUG_MSG("  doc size: %lu bytes (%d)",
                    (unsigned long)docinfo->size, (int)sizeof(docinfo->size));
    SWISH_DEBUG_MSG("  doc mtime: %lu (%d)",
                    (unsigned long)docinfo->mtime, (int)sizeof(docinfo->mtime));

    SWISH_DEBUG_MSG("  mtime str: %s", mtime_str);
    SWISH_DEBUG_MSG("  mime type: %s", docinfo->mime);
    /* encoding is only known after parsing has started ... */
    SWISH_DEBUG_MSG("  encoding: %s", docinfo->encoding);
    SWISH_DEBUG_MSG("  file ext: %s", docinfo->ext);
    SWISH_DEBUG_MSG("  parser: %s", docinfo->parser);
    SWISH_DEBUG_MSG("  nwords: %d", docinfo->nwords);
    SWISH_DEBUG_MSG("  is_gzipped: %d", docinfo->is_gzipped);

    swish_xfree(mtime_str);
}


/*************** end docinfo.c ************/


/*************** start error.c ************/
/*
 * This file is part of libswish3
 * Copyright (C) 2007 Peter Karman
 *
 *  libswish3 is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  libswish3 is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with libswish3; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
*/

/* error handling based on Swish-e ver2 error.c */

#ifndef LIBSWISH3_SINGLE_FILE
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <errno.h>
#include <err.h>
#include <string.h>
#include <libxml/globals.h>

#include "libswish3.h"
#endif

extern int SWISH_DEBUG;
extern int SWISH_WARNINGS;

/* stream that croak/warn/debug messages are written to; while NULL the
 * message functions fall back to stderr */
static FILE *error_handle = NULL;

/*
 * Direct all subsequent error, warning and debug output to where.
 */
void
swish_set_error_handle(
    FILE * where
)
{
    FILE *target = where;

    error_handle = target;
}

/*
 * Print a fatal error message and terminate the process.
 *
 * file, line and func identify the call site; msgfmt and the variadic
 * arguments are a printf-style message. The output goes to the configured
 * error handle (stderr when none is set). The process exits with the
 * value errno had when this function was entered, or with 1 if errno was
 * zero. This function does not return.
 */
void
swish_croak(
    const char *file,
    int line,
    const char *func,
    const char *msgfmt,
    ...
)
{
    va_list args;
    /* capture errno before any stdio call gets a chance to overwrite it */
    int saved_errno = errno;

    if (!error_handle)
        error_handle = stderr;

    va_start(args, msgfmt);
    fprintf(error_handle, "Swish ERROR %s:%d %s: ", file, line, func);
    vfprintf(error_handle, msgfmt, args);
    fprintf(error_handle, "\n");
    va_end(args);

    /* never exit with a zero status */
    errno = saved_errno ? saved_errno : 1;

    exit(errno);
}

/*
 * Print a warning message, provided warnings are enabled.
 *
 * file, line and func identify the call site; msgfmt and the variadic
 * arguments are a printf-style message. The error handle is defaulted to
 * stderr first, even when SWISH_WARNINGS is off and nothing is printed.
 */
void
swish_warn(
    const char *file,
    int line,
    const char *func,
    const char *msgfmt,
    ...
)
{
    va_list ap;

    if (error_handle == NULL) {
        error_handle = stderr;
    }

    if (SWISH_WARNINGS) {
        fprintf(error_handle, "Swish WARNING %s:%d %s: ", file, line, func);

        va_start(ap, msgfmt);
        vfprintf(error_handle, msgfmt, ap);
        va_end(ap);

        fputs("\n", error_handle);
    }
}

/*
 * Print a debug message unconditionally.
 *
 * file, line and func identify the call site; msgfmt and the variadic
 * arguments are a printf-style message. Output goes to the configured
 * error handle, which is defaulted to stderr when unset.
 */
void
swish_debug(
    const char *file,
    int line,
    const char *func,
    const char *msgfmt,
    ...
)
{
    va_list ap;

    if (error_handle == NULL) {
        error_handle = stderr;
    }

    fprintf(error_handle, "Swish DEBUG %s:%d %s: ", file, line, func);

    va_start(ap, msgfmt);
    vfprintf(error_handle, msgfmt, ap);
    va_end(ap);

    fputs("\n", error_handle);
}

/*
 * Translate a swish error code into a static, human-readable message.
 * Codes other than SWISH_ERR_NO_SUCH_FILE yield "Unknown error".
 */
const char*
swish_err_msg(
    int err_code
)
{
    if (err_code == SWISH_ERR_NO_SUCH_FILE) {
        return "No such file or directory";
    }

    return "Unknown error";
}



/*************** end error.c ************/


/*************** start hash.c ************/
/*
 * This file is part of libswish3
 * Copyright (C) 2007 Peter Karman
 *
 *  libswish3 is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  libswish3 is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with libswish3; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
*/

/* wrappers to common functions in libxml2 hash */

#ifndef LIBSWISH3_SINGLE_FILE
#include <stdlib.h>

#include "libswish3.h"
#endif

/* project-wide debug bit mask, defined elsewhere */
extern int SWISH_DEBUG;

/* value deallocator handed to the libxml2 hash routines in this file */
static void free_hashval(
    void *val,
    xmlChar *key
);
/* hash-scan callback used by swish_hash_merge(): copies one key/value
 * pair into hash1 */
static void merge_hashes(
    xmlChar *value,
    xmlHashTablePtr hash1,
    xmlChar *key
);

/*
 * Deallocator callback for hash values: releases val with swish_xfree().
 * The key argument is part of the callback signature and is not used.
 */
static void
free_hashval(
    void *val,
    xmlChar *key
)
{
    void *doomed = val;

    swish_xfree(doomed);
}

/*
 * Insert value under key via xmlHashAddEntry(). A return of -1 from
 * libxml2 is treated as fatal and reported through SWISH_CROAK; otherwise
 * the libxml2 return code is passed back to the caller.
 */
int
swish_hash_add(
    xmlHashTablePtr hash,
    xmlChar *key,
    void *value
)
{
    int status = xmlHashAddEntry(hash, key, value);

    if (status == -1) {
        SWISH_CROAK("xmlHashAddEntry for '%s' failed", key);
    }

    return status;
}

/*
 * Ensure key is present in hash. If it already is, nothing changes and 1
 * is returned. Otherwise a copy of value is stored under key and the
 * result of swish_hash_add() is returned.
 */
int
swish_hash_exists_or_add(
    xmlHashTablePtr hash,
    xmlChar *key,
    xmlChar *value
)
{
    if (swish_hash_exists(hash, key)) {
        return 1;
    }

    return swish_hash_add(hash, key, swish_xstrdup( value ));
}

/*
 * Destroy hash, releasing every stored value through free_hashval().
 */
void
swish_hash_free(
    xmlHashTablePtr hash
)
{
    xmlHashDeallocator release_value = (xmlHashDeallocator)free_hashval;

    xmlHashFree(hash, release_value);
}

/*
 * Store value under key via xmlHashUpdateEntry(), with free_hashval() as
 * the deallocator for a value being replaced. A -1 result is fatal
 * (SWISH_CROAK); otherwise the libxml2 return code is passed back.
 */
int
swish_hash_replace(
    xmlHashTablePtr hash,
    xmlChar *key,
    void *value
)
{
    int status = xmlHashUpdateEntry(hash, key, value,
                                    (xmlHashDeallocator) free_hashval);

    if (status == -1) {
        SWISH_CROAK("xmlHashUpdateEntry for %s failed", key);
    }

    return status;
}

/*
 * Remove key from hash via xmlHashRemoveEntry(), with free_hashval() as
 * the deallocator for the removed value. A -1 result is fatal
 * (SWISH_CROAK); otherwise the libxml2 return code is passed back.
 */
int
swish_hash_delete(
    xmlHashTablePtr hash,
    xmlChar *key
)
{
    int status = xmlHashRemoveEntry(hash, key,
                                    (xmlHashDeallocator) free_hashval);

    if (status == -1) {
        SWISH_CROAK("xmlHashRemoveEntry for %s failed", key);
    }

    return status;
}

/*
 * Report whether a lookup of key in hash yields a non-NULL value:
 * 1 if it does, 0 otherwise.
 */
boolean
swish_hash_exists(
    xmlHashTablePtr hash,
    xmlChar *key
)
{
    if (xmlHashLookup(hash, key)) {
        return 1;
    }

    return 0;
}

/*
 * Return whatever xmlHashLookup() finds under key in hash.
 */
void *
swish_hash_fetch(
    xmlHashTablePtr hash,
    xmlChar *key
)
{
    void *found = xmlHashLookup(hash, key);

    return found;
}

/*
 * Create a libxml2 hash table with the requested initial size. Failure
 * to create the table is reported through SWISH_CROAK.
 */
xmlHashTablePtr
swish_hash_init(
    int size
)
{
    xmlHashTablePtr table = xmlHashCreate(size);

    if (table != NULL) {
        return table;
    }

    SWISH_CROAK("error creating hash of size %d", size);
    return NULL; // never get here
}

/*
 * Hash-scan callback behind swish_hash_merge(): stores a fresh copy of
 * value under key in hash1, replacing the existing entry when there is
 * one and adding a new entry when there is not.
 */
static void
merge_hashes(
    xmlChar *value,
    xmlHashTablePtr hash1,
    xmlChar *key
)
{
    if (!swish_hash_exists(hash1, key)) {
        swish_hash_add(hash1, key, swish_xstrdup(value));
        return;
    }

    swish_hash_replace(hash1, key, swish_xstrdup(value));
}

/*
 * Copy every key/value pair of hash2 into hash1 through merge_hashes(),
 * so that on a shared key the entry from hash2 replaces the one in hash1.
 */
void
swish_hash_merge(
    xmlHashTablePtr hash1,
    xmlHashTablePtr hash2
)
{
    xmlHashScanner per_entry = (xmlHashScanner) merge_hashes;

    xmlHashScan(hash2, per_entry, hash1);
}

/*
 * Hash-scan callback behind swish_hash_dump(): prints two debug lines for
 * one entry, first the addresses of key and value alone, then the key and
 * value strings each followed by its address.
 */
static void
dump_hash_value(
    xmlChar *val,
    xmlChar *label,
    xmlChar *key
)
{
    /* pointers cannot be passed for %x; convert them to unsigned long and
     * print with %lx */
    SWISH_DEBUG_MSG(" %s:  [0x%lx] => [0x%lx]", label,
                    (unsigned long)key, (unsigned long)val);
    SWISH_DEBUG_MSG(" %s:  %s [0x%lx] => %s [0x%lx]", label,
                    key, (unsigned long)key, val, (unsigned long)val);
}

/*
 * Dump every entry of hash through the debug channel, bracketed by a
 * start and an end line that carry label and the table's address.
 */
void
swish_hash_dump(
    xmlHashTablePtr hash,
    const char *label
)
{
    /* the table pointer is converted to unsigned long to match %lx */
    SWISH_DEBUG_MSG("start hash_dump for %s [0x%lx]", label, (unsigned long)hash);
    xmlHashScan(hash, (xmlHashScanner) dump_hash_value, (xmlChar*)label);
    SWISH_DEBUG_MSG("end hash_dump for %s [0x%lx]", label, (unsigned long)hash);
}


/*************** end hash.c ************/


/*************** start fs.c ************/
/*
 * This file is part of libswish3
 * Copyright (C) 2009 Peter Karman
 *
 *  libswish3 is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  libswish3 is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with libswish3; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
*/

#ifndef LIBSWISH3_SINGLE_FILE
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <dirent.h>
#include <stdio.h>
#include <errno.h>
#include <err.h>
#include <string.h>
#include <sys/param.h>
#include <limits.h>
#include <assert.h>

#include "libswish3.h"

#endif

/* errno comes from <errno.h>, which is included above; declaring an
 * identifier named errno ourselves is undefined behavior in ISO C, so only
 * the project-wide debug mask is declared here. */
extern int SWISH_DEBUG;

/* returns a pointer to the last character of str that occurs in set, or
 * NULL when there is none */
static xmlChar *findlast(
    xmlChar *str,
    xmlChar *set
);
/* returns a pointer to the last character of str before the terminator,
 * or NULL for an empty string */
static xmlChar *lastptr(
    xmlChar *str
);

/*
 * Return 1 when stat() succeeds on path, 0 when it fails.
 */
boolean
swish_fs_file_exists(
    xmlChar *path
)
{
    struct stat st;

    return stat((char *)path, &st) ? 0 : 1;
}

/*
 * Return 1 when path can be stat'ed and is a directory, 0 otherwise.
 */
boolean
swish_fs_is_dir(
    xmlChar *path
)
{
    struct stat st;
    boolean is_dir = 0;

    if (stat((char *)path, &st) == 0) {
        if ((st.st_mode & S_IFMT) == S_IFDIR) {
            is_dir = 1;
        }
    }

    return is_dir;
}

/*
 * Return 1 when path can be stat'ed and is a regular file, 0 otherwise.
 */
boolean
swish_fs_is_file(
    xmlChar *path
)
{
    struct stat st;
    boolean is_regular = 0;

    if (stat((char *)path, &st) == 0) {
        if ((st.st_mode & S_IFMT) == S_IFREG) {
            is_regular = 1;
        }
    }

    return is_regular;
}

/*
 * Return 1 when path itself is a symbolic link, 0 otherwise. Without
 * lstat() support (HAVE_LSTAT undefined) the answer is always 0.
 */
boolean
swish_fs_is_link(
    xmlChar *path
)
{
#ifdef HAVE_LSTAT
    struct stat stbuf;

    if (lstat((char *)path, &stbuf))
        return 0;
    /* isolate the file-type bits with S_IFMT, as swish_fs_is_dir() and
     * swish_fs_is_file() do; masking with S_IFLNK itself would also accept
     * any other type whose bit pattern contains the S_IFLNK bits */
    return ((stbuf.st_mode & S_IFMT) == S_IFLNK) ? 1 : 0;
#else
    return 0;
#endif
}

/*
 * Return the st_size reported by stat() for path, or -1 when stat()
 * fails.
 */
off_t
swish_fs_get_file_size(
    xmlChar *path
)
{
    struct stat st;

    if (stat((char *)path, &st) != 0) {
        return -1;
    }

    return st.st_size;
}

/*
 * Return the st_mtime reported by stat() for path, or -1 when stat()
 * fails.
 */
time_t
swish_fs_get_file_mtime(
    xmlChar *path
)
{
    struct stat st;

    if (stat((char *)path, &st) != 0) {
        return -1;
    }

    return st.st_mtime;
}

/* parse a URL to determine file ext */
/* inspired by http://www.tug.org/tex-archive/tools/zoo/ by Rahul Dhesi */
/*
 * Finds the last character of url that belongs to SWISH_EXT_SEP. If that
 * character is SWISH_EXT_CH, the text following it is passed through
 * swish_str_tolower() and that result is returned. NULL is returned when
 * no separator is found, or when the last separator is not SWISH_EXT_CH.
 *
 * Callers in this file release a non-NULL result with swish_xfree().
 */
xmlChar *
swish_fs_get_file_ext(
    xmlChar *url
)
{
    xmlChar *p;

    /* a NULL pointer must never reach a %s conversion, so the debug
     * output substitutes a literal marker */
    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
        SWISH_DEBUG_MSG("parsing url %s for extension",
                        url != NULL ? (const char *)url : "(null)");

    p = findlast(url, (xmlChar *)SWISH_EXT_SEP);        /* look for . or /         */

    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
        SWISH_DEBUG_MSG("p = %s", p != NULL ? (const char *)p : "(null)");

    /* no separator at all, or the last separator is not the ext char */
    if (p == NULL || *p != SWISH_EXT_CH)
        return NULL;

    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
        SWISH_DEBUG_MSG("p = %s", p);

    p++;                        /* skip to next char after . */

    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
        SWISH_DEBUG_MSG("ext is %s", p);

    return swish_str_tolower(p);
}

/*
 * Return the leading part of url up to and including its last path
 * separator, as produced by xmlStrsub().
 *
 * NULL is returned when url contains no separator, when the character
 * found is not SWISH_PATH_SEP, or when the text starting at the last
 * separator compares equal to the whole url.
 */
xmlChar *
swish_fs_get_path(
    xmlChar *url
)
{
    xmlChar *sep;

    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER) {
        SWISH_DEBUG_MSG("parsing url %s for path", url);
    }

    sep = findlast(url, (xmlChar *)SWISH_PATH_SEP_STR);

    if (sep == NULL || *sep != SWISH_PATH_SEP) {
        return NULL;
    }
    if (xmlStrEqual(url, sep)) {
        return NULL;
    }

    /* +1 so the separator itself is part of the returned path */
    return xmlStrsub(url, 0, (sep + 1) - url);
}

/*
 * Report whether the extension of file, as computed by
 * swish_fs_get_file_ext(), equals "gz".
 */
boolean
swish_fs_looks_like_gz(
    xmlChar *file
)
{
    xmlChar *suffix = swish_fs_get_file_ext(file);
    boolean matches = xmlStrEqual(suffix, BAD_CAST "gz");

    /* the extension string is ours to release */
    if (suffix != NULL) {
        swish_xfree(suffix);
    }

    return matches;
}

/*******************/
/*
findlast() scans str from its end towards its start and returns a pointer
to the first character it meets that is a member of set (i.e. the last such
character in str).

NULL is returned when str or set is NULL or empty, or when no character of
str is in set.
*/

static xmlChar *
findlast(
    xmlChar *str,
    xmlChar *set
)
{
    xmlChar *cursor;

    if (str == NULL || set == NULL) {
        return NULL;
    }
    if (*str == '\0' || *set == '\0') {
        return NULL;
    }

    cursor = lastptr(str);      /* last char of the (non-empty) string */
    assert(cursor != NULL);

    for (;;) {
        if (xmlStrchr(set, *cursor) != NULL) {
            return cursor;
        }
        if (cursor == str) {
            /* ran off the front without a match */
            return NULL;
        }
        --cursor;
    }
}

/*
lastptr() returns a pointer to the last character of str before its
terminating null. An empty string yields NULL; a NULL pointer is reported
through SWISH_CROAK.
*/

static xmlChar *
lastptr(
    xmlChar *str
)
{
    xmlChar *cursor;

    if (str == NULL) {
        SWISH_CROAK("received null pointer while looking for last NULL");
    }
    if (*str == '\0') {
        return NULL;
    }

    /* advance until the next character is the terminator */
    for (cursor = str; cursor[1] != '\0'; ++cursor) {
        ;
    }

    return cursor;
}


/*************** end fs.c ************/


/*************** start io.c ************/
/*
 * This file is part of libswish3
 * Copyright (C) 2007 Peter Karman
 *
 *  libswish3 is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  libswish3 is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with libswish3; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
*/

/* simple I/O functions */

#ifndef LIBSWISH3_SINGLE_FILE
#include <stdio.h>
#include <errno.h>
#include <err.h>
#include <string.h>
#include <zlib.h>

#include "libswish3.h"
#endif

extern int SWISH_DEBUG;
/* errno comes from <errno.h>, which is included above; declaring an
 * identifier named errno ourselves is undefined behavior in ISO C, so it
 * is not redeclared here. */

/* replaces embedded null bytes in buffer with newlines; filename is only
 * used in the warning text */
static void no_nulls(
    xmlChar *filename,
    xmlChar *buffer,
    long bytes_read
);

/* substitute embedded null chars with a newline so we can treat the buffer as a whole
 * string based on similar code in swish-e ver2 file.c
 *
 * The scan only runs when the string length of buffer is shorter than
 * bytes_read, i.e. when a null byte sits inside the data. During that scan
 * bytes equal to SWISH_TOKENPOS_BUMPER[0] are replaced with a newline as
 * well. If anything was replaced, one warning naming filename and the
 * replacement count is issued.
 *
 * NOTE(review): a buffer holding the bumper character but no null byte is
 * left untouched -- confirm that is intended. */
static void
no_nulls(
    xmlChar *filename,
    xmlChar *buffer,
    long bytes_read
)
{
    long pos;
    int replaced = 0;

    /* nothing embedded: the string already spans the whole buffer */
    if (xmlStrlen(buffer) >= bytes_read) {
        return;
    }

    for (pos = 0; pos < bytes_read; ++pos) {
        if (buffer[pos] == '\0') {
            buffer[pos] = '\n';
            replaced++;
        }
        if (buffer[pos] == SWISH_TOKENPOS_BUMPER[0]) {
            buffer[pos] = '\n';
            replaced++;
        }
    }

    if (replaced != 0) {
        SWISH_WARN
            ("Substituted %d embedded null or connector character(s) in file '%s' with newline(s)",
             replaced, filename);
    }

}

/*
 * Read exactly flen bytes from an already-open stream into a newly
 * allocated, NUL-terminated buffer.
 *
 *  fh       stream to read from
 *  flen     number of bytes expected
 *  binmode  when false, embedded NULs are scrubbed via no_nulls()
 *
 * Returns the buffer (allocated with swish_xmalloc(); caller frees).
 * A short read is reported through SWISH_CROAK().
 */
xmlChar *
swish_io_slurp_fh(
    FILE * fh,
    unsigned long flen,
    boolean binmode
)
{
    size_t bytes_read;
    xmlChar *buffer;

    buffer = swish_xmalloc(flen + 1);
    *buffer = '\0';

    bytes_read = fread(buffer, sizeof(xmlChar), flen, fh);

    if (bytes_read != flen) {
        /* conversion specifiers must match the argument types */
        SWISH_CROAK("did not read expected bytes: %lu expected, %lu read", flen,
                    (unsigned long)bytes_read);
    }
    buffer[bytes_read] = '\0';  /* terminate the string */

    if (!binmode) {
        /* no_nulls() takes a long; do not narrow through int first */
        no_nulls((xmlChar *)"filehandle", buffer, (long)bytes_read);
    }

    return buffer;
}

/*
 * Read up to flen bytes of filename into a newly allocated,
 * NUL-terminated buffer.
 *
 *  filename  path of the file to open
 *  flen      number of bytes to read; capped at SWISH_MAX_FILE_LEN
 *  binmode   when false, embedded NULs are scrubbed via no_nulls()
 *
 * Returns the buffer (allocated with swish_xmalloc(); caller frees).
 * Open, read-length and close failures are reported through SWISH_CROAK().
 */
xmlChar *
swish_io_slurp_file_len(
    xmlChar *filename,
    off_t flen,
    boolean binmode
)
{
    size_t bytes_read;
    FILE *fp;
    xmlChar *buffer;

    if (flen < 0) {
        /* a negative length would wrap to a huge allocation size below */
        SWISH_CROAK("invalid negative file length %ld for %s",
                    (long)flen, filename);
    }

    if (flen > SWISH_MAX_FILE_LEN) {
        /* warn with the requested length BEFORE clamping it */
        SWISH_WARN("max file len %ld exceeded - cannot read %ld bytes from %s",
                   (long)SWISH_MAX_FILE_LEN, (long)flen, filename);
        flen = SWISH_MAX_FILE_LEN;
    }

    if (SWISH_DEBUG & SWISH_DEBUG_IO)
        SWISH_DEBUG_MSG("slurp file '%s' [%ld bytes]", filename, (long)flen);

    buffer = swish_xmalloc(flen + 1);

    if ((fp = fopen((char *)filename, "r")) == 0) {
        SWISH_CROAK("Error reading file %s: %s", filename, strerror(errno));
    }

    bytes_read = fread(buffer, sizeof(xmlChar), flen, fp);

    if (bytes_read != (size_t)flen) {
        SWISH_CROAK("did not read expected bytes: %ld expected, %lu read (%s)",
                    (long)flen, (unsigned long)bytes_read, strerror(errno));
    }
    buffer[bytes_read] = '\0';  /* terminate the string */

    /* close the stream */
    if (fclose(fp))
        SWISH_CROAK("error closing filehandle for %s: %s",
            filename, strerror(errno));

    if (!binmode) {
        no_nulls(filename, buffer, (long)bytes_read);
    }

    return buffer;
}

/*
 * Read and decompress a gzipped file into a newly allocated,
 * NUL-terminated buffer.
 *
 *  filename  path of the gzipped file
 *  flen      in: size used to estimate the initial buffer
 *            (size * compression_rate); out: number of decompressed bytes
 *  binmode   when false, embedded NULs are scrubbed via no_nulls()
 *
 * The file is read in one gzread() call; if the buffer is filled
 * completely, the buffer is enlarged, the stream rewound and the read
 * repeated from the start until a read ends short of the buffer size.
 *
 * Returns the buffer (caller frees). Open and read failures are reported
 * through SWISH_CROAK().
 */
xmlChar *
swish_io_slurp_gzfile_len(
    xmlChar *filename,
    off_t *flen,
    boolean binmode
)
{
    off_t bytes_read, buffer_len;
    int ret;
    gzFile fh;
    xmlChar *buffer;
    unsigned int buf_size;
    int compression_rate = 3;   /* seems about right */

    buf_size = sizeof(xmlChar)*(*flen)*compression_rate;
    if (buf_size == 0) {
        /* never allocate zero bytes: the terminator written after the
         * loop needs at least one byte, and growth multiplies this value */
        buf_size = 1;
    }
    buffer = swish_xmalloc(buf_size);
    buffer_len = 0;
    fh = gzopen((char*)filename, "r");
    if (fh == NULL) {
        SWISH_CROAK("Failed to open file '%s' for read: %s",
            filename, strerror(errno));
    }
    while ((bytes_read = gzread(fh, buffer, buf_size)) != 0) {
        if (bytes_read == -1) {
            SWISH_CROAK("Error reading gzipped file '%s': %s",
                filename, strerror(errno));
        }
        if (SWISH_DEBUG & SWISH_DEBUG_IO) {
            SWISH_DEBUG_MSG("Read %ld bytes from %s", (long)bytes_read, filename);
        }
        /* bytes_read is non-negative here, so the unsigned compare is safe */
        if ((unsigned int)bytes_read < buf_size) {
            if (SWISH_DEBUG & SWISH_DEBUG_IO) {
                SWISH_DEBUG_MSG("Read to end of file");
            }
            buffer_len = bytes_read;
            break;
        }
        /* buffer was filled completely: enlarge and start over */
        buf_size *= compression_rate;
        buffer = swish_xrealloc(buffer, buf_size);
        if (SWISH_DEBUG & SWISH_DEBUG_IO) {
            SWISH_DEBUG_MSG("grew buffer to %u", buf_size);
        }
        buffer_len = bytes_read;
        ret = gzrewind(fh);
        if (SWISH_DEBUG & SWISH_DEBUG_IO) {
            SWISH_DEBUG_MSG("gzrewind ret = %d", ret);
        }
    }
    ret = gzclose(fh);    // TODO check for err?

    buffer[buffer_len] = '\0';

    if (!binmode) {
        no_nulls(filename, buffer, (long)buffer_len);
    }

    if (SWISH_DEBUG & SWISH_DEBUG_IO) {
        SWISH_DEBUG_MSG("slurped gzipped file '%s' buffer_len=%ld buf_size=%u orig flen=%ld",
            filename, (long)buffer_len, buf_size, (long)*flen);
    }

    /* set the flen pointer to the actual length */
    *flen = buffer_len;

    return buffer;
}

/*
 * Slurp a whole file, plain or gzipped, into a newly allocated buffer.
 *
 *  filename    path of the file
 *  file_len    size hint; when 0 it is looked up with
 *              swish_fs_get_file_size()
 *  is_gzipped  selects swish_io_slurp_gzfile_len() over
 *              swish_io_slurp_file_len()
 *  binmode     passed through to the selected reader
 *
 * A resulting length of 0 or -1 is reported through SWISH_CROAK().
 */
xmlChar *
swish_io_slurp_file(
    xmlChar *filename,
    off_t file_len,
    boolean is_gzipped,
    boolean binmode
)
{
    /* no size supplied by the caller: ask the filesystem */
    if (file_len == 0)
        file_len = swish_fs_get_file_size(filename);

    if (file_len == 0 || file_len == -1)
        SWISH_CROAK("Can't stat %s: %s\n", filename, strerror(errno));

    return is_gzipped
        ? swish_io_slurp_gzfile_len(filename, &file_len, binmode)
        : swish_io_slurp_file_len(filename, file_len, binmode);
}

/*
 * Count the lines of filename that are not skippable according to
 * swish_io_is_skippable_line() (blank lines and '#' comments).
 *
 * The file is read with fgets() into a SWISH_MAXSTRLEN-sized buffer, so a
 * line longer than the buffer is delivered, and evaluated, in several
 * pieces.
 *
 * Open and close failures are reported through SWISH_CROAK().
 */
long int
swish_io_count_operable_file_lines(
    xmlChar *filename
)
{
    long int operable = 0;
    xmlChar chunk[SWISH_MAXSTRLEN];
    FILE *list = fopen((const char*)filename, "r");

    if (list == NULL) {
        SWISH_CROAK("failed to open file: %s", filename);
    }

    while (fgets((char*)chunk, SWISH_MAXSTRLEN, list) != NULL) {
        if (!swish_io_is_skippable_line(chunk)) {
            operable++;
        }
    }

    if (fclose(list) != 0) {
        SWISH_CROAK("error closing filelist");
    }

    return operable;
}


/*
 * Decide whether a line carries no content.
 *
 * After leading whitespace is skipped (swish_str_skip_ws()), the line is
 * skippable when it is empty, consists of a lone newline, or starts
 * with '#'.
 *
 * Returns SWISH_TRUE when skippable, SWISH_FALSE otherwise.
 */
boolean
swish_io_is_skippable_line(
    xmlChar *str
)
{
    xmlChar *trimmed = swish_str_skip_ws(str);
    int remaining = xmlStrlen(trimmed);

    /* blank line */
    if (remaining == 0)
        return SWISH_TRUE;
    if (remaining == 1 && trimmed[0] == '\n')
        return SWISH_TRUE;

    /* comment line */
    if (trimmed[0] == '#')
        return SWISH_TRUE;

    return SWISH_FALSE;
}


/*************** end io.c ************/


/*************** start mem.c ************/
/*
 * This file is part of libswish3
 * Copyright (C) 2007 Peter Karman
 *
 *  libswish3 is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  libswish3 is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with libswish3; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
*/

/* mem.c -- graceful memory handling */

#ifndef LIBSWISH3_SINGLE_FILE
#include <sys/types.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <libxml/xmlstring.h>
#include <err.h>

#include "libswish3.h"
#endif

extern int SWISH_DEBUG;

/*
 * Balance of counted allocations: incremented by the allocating wrappers
 * in this file and decremented when memory is released.
 */
static long int memcount = 0;

/* Reset the allocation balance to zero. */
void
swish_mem_init(
)
{
    memcount = 0L;
}

/* Return the current allocation balance. */
long int
swish_memcount_get(
)
{
    long int balance = memcount;

    return balance;
}

/* Lower the allocation balance by one without freeing anything. */
void
swish_memcount_dec(
)
{
    memcount -= 1;
}

/* PUBLIC */
/*
 * Resize a block of memory with realloc().
 *
 *  ptr   block to resize
 *  size  new size in bytes
 *
 * Returns the (possibly moved) block. A NULL result from realloc() is
 * reported through SWISH_CROAK(). The allocation counter is not touched.
 */
void *
swish_xrealloc(
    void *ptr,
    size_t size
)
{
    void *resized;

    resized = realloc(ptr, size);
    if (resized != NULL)
        return resized;

    SWISH_CROAK("Out of memory (could not reallocate %lu more bytes)!",
                (unsigned long)size);

    return resized;
}

/* PUBLIC */
/*
 * Allocate size bytes with malloc() and count the allocation.
 *
 *  size  number of bytes to allocate
 *
 * Returns the new block. A NULL result from malloc() is reported through
 * SWISH_CROAK(). On success the allocation counter is incremented.
 */
void *
swish_xmalloc(
    size_t size
)
{
    void *ptr;

    if (SWISH_DEBUG & SWISH_DEBUG_MEMORY) {
        SWISH_DEBUG_MSG("malloc %ld bytes", (long)size);
    }

    ptr = malloc(size);

    if (ptr == NULL)
        SWISH_CROAK("Out of memory! Can't malloc %lu bytes",
                    (unsigned long)size);

    memcount++;
    if (SWISH_DEBUG & SWISH_DEBUG_MEMORY) {
        SWISH_DEBUG_MSG("memcount = %ld", memcount);
        /* %p is the only conversion defined for object pointers */
        SWISH_DEBUG_MSG("xmalloc address: %p", ptr);
    }

    return ptr;
}

/*
 * Duplicate a string with xmlStrdup() and count the allocation.
 *
 *  ptr  string to copy
 *
 * Returns the copy. The allocation counter is incremented before the copy
 * is attempted; a NULL result is reported through SWISH_CROAK().
 */
xmlChar *
swish_xstrdup(
    const xmlChar *ptr
)
{
    xmlChar *clone;

    ++memcount;
    if (SWISH_DEBUG & SWISH_DEBUG_MEMORY) {
        SWISH_DEBUG_MSG("memcount = %ld", memcount);
    }

    clone = xmlStrdup(ptr);
    if (clone != NULL)
        return clone;

    SWISH_CROAK("strdup returned NULL for %s", ptr);

    return clone;
}

/*
 * Duplicate at most len characters of a string with xmlStrndup() and
 * count the allocation.
 *
 *  ptr  string to copy from
 *  len  length passed on to xmlStrndup()
 *
 * Returns whatever xmlStrndup() returns; the result is not checked and
 * the allocation counter is incremented unconditionally.
 */
xmlChar *
swish_xstrndup(
    const xmlChar *ptr,
    int len
)
{
    xmlChar *clone;

    memcount += 1;
    if (SWISH_DEBUG & SWISH_DEBUG_MEMORY) {
        SWISH_DEBUG_MSG("memcount = %ld", memcount);
    }

    clone = xmlStrndup(ptr, len);
    return (clone);
}

/*
 * Release a counted allocation with xmlFree() and decrement the
 * allocation counter.
 *
 *  ptr  block to release
 *
 * A NULL ptr only produces a warning: nothing is freed and the counter is
 * left unchanged.
 */
void
swish_xfree(
    void *ptr
)
{
    if (ptr == NULL) {
        SWISH_WARN
            (" >>>>>>>>>>>>>> attempt to free NULL pointer <<<<<<<<<<<<<<");
        return;
    }

    /*
     * Log the address only. The block is not necessarily a NUL-terminated
     * string (structs are freed through here too), so printing it with %s
     * could read past the end of the allocation.
     */
    if (SWISH_DEBUG & SWISH_DEBUG_MEMORY)
        SWISH_DEBUG_MSG("freeing %p", ptr);

    xmlFree(ptr);

    memcount--;

    if (SWISH_DEBUG & SWISH_DEBUG_MEMORY)
        SWISH_DEBUG_MSG("memcount = %ld", memcount);
}

/*
 * Warn when the allocation counter is not balanced: a positive value
 * means more counted allocations than frees, a negative value the
 * opposite. Nothing is printed when the counter is zero.
 */
void
swish_mem_debug(
)
{
    if (memcount == 0)
        return;

    if (memcount > 0) {
        SWISH_WARN("%ld more swish_xmalloc()s or swish_xstrdup()s than swish_xfree()s",
                   memcount);
    }
    else {
        SWISH_WARN("too many swish_xfree()s %ld", memcount);
    }
}


/*************** end mem.c ************/


/*************** start mime_types.c ************/
/*
 * This file is part of libswish3
 * Copyright (C) 2007 Peter Karman
 *
 *  libswish3 is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  libswish3 is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with libswish3; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
*/

/* MIME types hash based on Apache 2.0 mime.types listing
see <http://www.iana.org/assignments/media-types/> for official registry.
*/

#ifndef LIBSWISH3_SINGLE_FILE
#include <err.h>
#include <stdlib.h>
#include <string.h>

#include "libswish3.h"
#endif

extern int SWISH_DEBUG;

/*
 * Intended to be the total number of strings (NOT pairs!) in
 * SWISH_MIME_TABLE below.
 * NOTE(review): the table currently holds 154 extension/type pairs
 * (308 strings), which does not match this value -- check every loop
 * bound that uses this constant before changing it.
 */
#define SWISH_MIME_TABLE_COUNT  304

/*
 * Default file-extension => MIME-type mapping, stored as a flat array of
 * alternating strings: even indexes are extensions, odd indexes are the
 * MIME type for the preceding extension.
 */
static const char *SWISH_MIME_TABLE[] = {
    "ai", "application/postscript",
    "aif", "audio/x-aiff",
    "aifc", "audio/x-aiff",
    "aiff", "audio/x-aiff",
    "asc", "text/plain",
    "au", "audio/basic",
    "avi", "video/x-msvideo",
    "bcpio", "application/x-bcpio",
    "bin", "application/octet-stream",
    "bmp", "image/bmp",
    "cdf", "application/x-netcdf",
    "cgm", "image/cgm",
    "class", "application/octet-stream",
    "cpio", "application/x-cpio",
    "cpt", "application/mac-compactpro",
    "csh", "application/x-csh",
    "css", "text/css",
    "dcr", "application/x-director",
    "dir", "application/x-director",
    "djv", "image/vnd.djvu",
    "djvu", "image/vnd.djvu",
    "dll", "application/octet-stream",
    "dmg", "application/octet-stream",
    "dms", "application/octet-stream",
    "doc", "application/msword",
    "dtd", "application/xml-dtd",
    "dvi", "application/x-dvi",
    "dxr", "application/x-director",
    "eps", "application/postscript",
    "etx", "text/x-setext",
    "exe", "application/octet-stream",
    "ez", "application/andrew-inset",
    "gif", "image/gif",
    "gram", "application/srgs",
    "grxml", "application/srgs+xml",
    "gtar", "application/x-gtar",
    "gz", "application/x-gzip",
    "hdf", "application/x-hdf",
    "hqx", "application/mac-binhex40",
    "htm", "text/html",
    "html", "text/html",
    "ice", "x-conference/x-cooltalk",
    "ico", "image/x-icon",
    "ics", "text/calendar",
    "ief", "image/ief",
    "ifb", "text/calendar",
    "iges", "model/iges",
    "igs", "model/iges",
    "jpe", "image/jpeg",
    "jpeg", "image/jpeg",
    "jpg", "image/jpeg",
    "js", "application/x-javascript",
    "kar", "audio/midi",
    "latex", "application/x-latex",
    "lha", "application/octet-stream",
    "lzh", "application/octet-stream",
    "m3u", "audio/x-mpegurl",
    "m4u", "video/vnd.mpegurl",
    "man", "application/x-troff-man",
    "mathml", "application/mathml+xml",
    "me", "application/x-troff-me",
    "mesh", "model/mesh",
    "mid", "audio/midi",
    "midi", "audio/midi",
    "mif", "application/vnd.mif",
    "mov", "video/quicktime",
    "movie", "video/x-sgi-movie",
    "mp2", "audio/mpeg",
    "mp3", "audio/mpeg",
    "mpe", "video/mpeg",
    "mpeg", "video/mpeg",
    "mpg", "video/mpeg",
    "mpga", "audio/mpeg",
    "ms", "application/x-troff-ms",
    "msh", "model/mesh",
    "mxu", "video/vnd.mpegurl",
    "nc", "application/x-netcdf",
    "oda", "application/oda",
    "ogg", "application/ogg",
    "pbm", "image/x-portable-bitmap",
    "pdb", "chemical/x-pdb",
    "pdf", "application/pdf",
    "pgm", "image/x-portable-graymap",
    "pgn", "application/x-chess-pgn",
    "png", "image/png",
    "pnm", "image/x-portable-anymap",
    "ppm", "image/x-portable-pixmap",
    "ppt", "application/vnd.ms-powerpoint",
    "ps", "application/postscript",
    "qt", "video/quicktime",
    "ra", "audio/x-pn-realaudio",
    "ram", "audio/x-pn-realaudio",
    "ras", "image/x-cmu-raster",
    "rdf", "application/rdf+xml",
    "rgb", "image/x-rgb",
    "rm", "application/vnd.rn-realmedia",
    "roff", "application/x-troff",
    "rtf", "text/rtf",
    "rtx", "text/richtext",
    "sgm", "text/sgml",
    "sgml", "text/sgml",
    "sh", "application/x-sh",
    "shar", "application/x-shar",
    "silo", "model/mesh",
    "sit", "application/x-stuffit",
    "skd", "application/x-koan",
    "skm", "application/x-koan",
    "skp", "application/x-koan",
    "skt", "application/x-koan",
    "smi", "application/smil",
    "smil", "application/smil",
    "snd", "audio/basic",
    "so", "application/octet-stream",
    "spl", "application/x-futuresplash",
    "src", "application/x-wais-source",
    "sv4cpio", "application/x-sv4cpio",
    "sv4crc", "application/x-sv4crc",
    "svg", "image/svg+xml",
    "swf", "application/x-shockwave-flash",
    "t", "application/x-troff",
    "tar", "application/x-tar",
    "tcl", "application/x-tcl",
    "tex", "application/x-tex",
    "texi", "application/x-texinfo",
    "texinfo", "application/x-texinfo",
    "tif", "image/tiff",
    "tiff", "image/tiff",
    "tr", "application/x-troff",
    "tsv", "text/tab-separated-values",
    "txt", "text/plain",
    "ustar", "application/x-ustar",
    "vcd", "application/x-cdlink",
    "vrml", "model/vrml",
    "vxml", "application/voicexml+xml",
    "wav", "audio/x-wav",
    "wbmp", "image/vnd.wap.wbmp",
    "wbxml", "application/vnd.wap.wbxml",
    "wml", "text/vnd.wap.wml",
    "wmlc", "application/vnd.wap.wmlc",
    "wmls", "text/vnd.wap.wmlscript",
    "wmlsc", "application/vnd.wap.wmlscriptc",
    "wrl", "model/vrml",
    "xbm", "image/x-xbitmap",
    "xht", "application/xhtml+xml",
    "xhtml", "application/xhtml+xml",
    "xls", "application/vnd.ms-excel",
    "xml",       "application/xml", 
    /*"xml", "text/xml", */ /* w3 standard is application/xml now */
    "xpm", "image/x-xpixmap",
    "xsl", "application/xml",
    "xslt", "application/xslt+xml",
    "xul", "application/vnd.mozilla.xul+xml",
    "xwd", "image/x-xwindowdump",
    "xyz", "chemical/x-xyz",
    "zip", "application/zip"
};

/* create hash of file ext => mime type */
xmlHashTablePtr
swish_mime_defaults(
)
{
    int i;
    xmlChar *ext, *type;
    xmlHashTablePtr mimes;
    mimes = swish_hash_init(SWISH_MIME_TABLE_COUNT / 2);

    for (i = 0; i <= SWISH_MIME_TABLE_COUNT; i += 2) {
        ext = (xmlChar *)SWISH_MIME_TABLE[i];
        type = swish_xstrdup((xmlChar *)SWISH_MIME_TABLE[i + 1]);
        /*
        SWISH_DEBUG_MSG("%s: copy of %s [0x%x] at %s [0x%x]", 
            ext, SWISH_MIME_TABLE[i+1],SWISH_MIME_TABLE[i+1],type,type);
        */
        swish_hash_add(mimes, ext, type);
    }
    
    /*
    SWISH_DEBUG_MSG("mime_hash == 0x%x", mimes);

    for (i = 0; i <= SWISH_MIME_TABLE_COUNT; i += 2) {
        ext = (xmlChar *)SWISH_MIME_TABLE[i];
        if (!swish_hash_exists(mimes, ext)) {
            SWISH_CROAK("%s no in mimes hash", ext);
        }
        else {
            SWISH_DEBUG_MSG("fetched %s for %s", swish_hash_fetch(mimes, ext), ext);
        }
    }

    SWISH_DEBUG_MSG("mime_hash == 0x%x", mimes);
    */
    
    return mimes;
}

/*
 * Retrieve the MIME type for a file extension from config->mimes.
 *
 *  config   configuration holding the mimes hash
 *  fileext  file extension to look up
 *
 * When the extension is unknown a warning is issued and
 * SWISH_DEFAULT_MIME is used instead.
 *
 * Returns a copy made with swish_xstrdup(); the caller must free it.
 */
xmlChar *
swish_mime_get_type(
    swish_Config *config,
    xmlChar *fileext
)
{
    xmlChar *mime;

    mime = swish_hash_fetch(config->mimes, fileext);
    if (mime == NULL) {
        SWISH_WARN("No MIME type known for '%s' -- using '%s'", fileext,
                   SWISH_DEFAULT_MIME);
        /* point at the constant; the single copy is made on return, so
         * no intermediate duplicate is leaked */
        mime = (xmlChar *)SWISH_DEFAULT_MIME;
    }
    return swish_xstrdup(mime);
}

/*
 * Return the parser type (TXT, HTML, XML) registered for a MIME type in
 * config->parsers.
 *
 *  config  configuration holding the parsers hash
 *  mime    MIME type to look up
 *
 * When no parser is registered for mime, a warning is issued and the
 * entry stored under SWISH_DEFAULT_PARSER is used instead (that lookup is
 * not checked).
 *
 * Returns a copy made with swish_xstrdup() so the stored value is never
 * modified; the caller MUST free it.
 */
xmlChar *
swish_mime_get_parser(
    swish_Config *config,
    xmlChar *mime
)
{
    xmlChar *chosen;
    xmlChar *fallback;

    chosen = swish_hash_fetch(config->parsers, mime);

    if (SWISH_DEBUG & SWISH_DEBUG_DOCINFO) {
        SWISH_DEBUG_MSG("using parser '%s' based on MIME '%s'", chosen, mime);
    }

    fallback = swish_hash_fetch(config->parsers, (xmlChar *)SWISH_DEFAULT_PARSER);

    if (chosen != NULL)
        return swish_xstrdup(chosen);

    SWISH_WARN("No parser for MIME '%s' -- using '%s'", mime, fallback);

    return swish_xstrdup(fallback);
}


/*************** end mime_types.c ************/


/*************** start parser.c ************/
/* 
* This file is part of libswish3
* Copyright (C) 2007 Peter Karman
*
*  libswish3 is free software; you can redistribute it and/or modify
*  it under the terms of the GNU General Public License as published by
*  the Free Software Foundation; either version 2 of the License, or
*  (at your option) any later version.
*
*  libswish3 is distributed in the hope that it will be useful,
*  but WITHOUT ANY WARRANTY; without even the implied warranty of
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*  GNU General Public License for more details.
*
*  You should have received a copy of the GNU General Public License
*  along with libswish3; if not, write to the Free Software
*  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
*/

/* 
* parse XML doc from memory using libxml2 SAX2 based on tutorial at
* http://www.jamesh.id.au/articles/libxml-sax/libxml-sax.html
*
* save all character() data to buffer, flushing on new metanames
* flush should split buffer into words, skipping nonwordchars/space, and
* lowercase all
*
* see iswlower(3) man page, etc.
*
* all the mb*() functions rely on locale to recognize multi-byte strings
*
*/

#ifndef LIBSWISH3_SINGLE_FILE

#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <stdio.h>
#include <locale.h>
#include <stdarg.h>
#include <err.h>
#include <errno.h>
#include <string.h>
#include <ctype.h>
#include <wctype.h>
#include <dirent.h>

#include <libxml/parserInternals.h>
#include <libxml/parser.h>
#include <libxml/HTMLparser.h>
#include <libxml/globals.h>
#include <libxml/xmlerror.h>
#include <libxml/tree.h>
#include <libxml/debugXML.h>
#include <libxml/xmlmemory.h>
#include <libxml/xinclude.h>
#include <libxml/uri.h>

#include "libswish3.h"
#endif

extern int errno;
extern int SWISH_DEBUG;

/* flag: should we pass on libxml2 messages via SWISH_WARN()? */
/* default is "on" (1) for consistency with version 2.4.x */
int SWISH_PARSER_WARNINGS = 1;

static void get_env_vars(
);

static void flush_buffer(
    swish_ParserData *parser_data,
    xmlChar *metaname,
    xmlChar *context
);

static void tokenize(
    swish_ParserData *parser_data,
    xmlChar *string,
    int len,
    xmlChar *metaname,
    xmlChar *content
);

static void mystartDocument(
    void *parser_data
);
static void myendDocument(
    void *parser_data
);
static void mystartElement(
    void *parser_data,
    const xmlChar *name,
    const xmlChar **atts
);
static void myendElement(
    void *parser_data,
    const xmlChar *name
);

/* 
* SAX2 support 
*/
static void mystartElementNs(
    void *parser_data,
    const xmlChar *localname,
    const xmlChar *prefix,
    const xmlChar *URI,
    int nb_namespaces,
    const xmlChar **namespaces,
    int nb_attributes,
    int nb_defaulted,
    const xmlChar **attributes
);

static void myendElementNs(
    void *ctx ATTRIBUTE_UNUSED,
    const xmlChar *localname,
    const xmlChar *prefix,
    const xmlChar *URI
);

static void buffer_characters(
    swish_ParserData *parser_data,
    const xmlChar *ch,
    int len
);
static void mycharacters(
    void *parser_data,
    const xmlChar *ch,
    int len
);
static void mycomments(
    void *parser_data,
    const xmlChar *ch
);
static void myerr(
    void *user_data,
    xmlChar *msg,
    ...
);

static void open_tag(
    void *data,
    const xmlChar *tag,
    xmlChar **atts,
    const xmlChar *xmlns_prefix
);
static void close_tag(
    void *data,
    const xmlChar *tag,
    const xmlChar *xmlns_prefix
);
static xmlChar *bake_tag(
    swish_ParserData *parser_data,
    xmlChar *tag,
    xmlChar **atts,
    xmlChar *xmlns_prefix
);

static int docparser(
    swish_ParserData *parser_data,
    xmlChar *filename,
    xmlChar *buffer,
    int size
);
static int xml_parser(
    xmlSAXHandlerPtr sax,
    void *user_data,
    xmlChar *buffer,
    int size
);
static int html_parser(
    xmlSAXHandlerPtr sax,
    void *user_data,
    xmlChar *buffer,
    int size
);
static int txt_parser(
    swish_ParserData *parser_data,
    xmlChar *buffer,
    int size
);

static swish_ParserData *init_parser_data(
    swish_3 *s3
);
static void free_parser_data(
    swish_ParserData *parser_data
);

/*
 * Parsing fh/buffer headers.
 *
 * HEAD is the value produced by buf_to_head(), consumed by
 * head_to_docinfo() and released with free_head() (see the prototypes
 * that follow).
 */
typedef struct
{
    xmlChar **lines;    /* array of strings -- presumably one per header line; confirm in buf_to_head() */
    int body_start;     /* NOTE(review): looks like the offset where the body begins -- confirm units */
    int nlines;         /* presumably the number of entries in lines -- confirm */
} HEAD;

static HEAD *buf_to_head(
    xmlChar *buf
);
static void free_head(
    HEAD * h
);
static swish_DocInfo *head_to_docinfo(
    HEAD * h
);

static xmlChar *document_encoding(
    xmlParserCtxtPtr ctxt
);

static void set_encoding(
    swish_ParserData *parser_data,
    xmlChar *buffer
);

/* tag tracker */
static xmlChar *flatten_tag_stack(
    xmlChar *baked,
    swish_TagStack *stack,
    char flatten_join
);
static void add_stack_to_prop_buf(
    xmlChar *baked,
    swish_ParserData *parser_data
);
static void push_tag_stack(
    swish_TagStack *stack,
    xmlChar *raw,
    xmlChar *baked,
    char flatten_join
);
static swish_Tag *pop_tag_stack(
    swish_TagStack *stack
);
static swish_Tag *pop_tag_stack_on_match(
    swish_TagStack *stack,
    xmlChar *raw
);
static void free_swishTag(
    swish_Tag * st
);
static void
free_swishTagStack(
    swish_TagStack *stack
);

static void
process_xinclude(
    swish_ParserData *parser_data,
    xmlChar *uri,
    boolean is_text
);

static void
xinclude_handler(
    swish_ParserData *parser_data
);

/***********************************************************************
*                end prototypes
***********************************************************************/

/*
 * Allocate and initialise a swish_Parser.
 *
 *  handler  callback stored in the parser, receiving a swish_ParserData
 *
 * Also initialises libxml2 (xmlInitParser(), entity substitution on) and
 * reads the debugging environment via get_env_vars().
 *
 * Returns the new parser with verbosity and ref_cnt set to 0.
 */
swish_Parser *
swish_parser_init(
    void (*handler) (swish_ParserData *)
)
{
    swish_Parser *p = (swish_Parser *)swish_xmalloc(sizeof(swish_Parser));

    p->handler = handler;
    p->verbosity = 0;
    p->ref_cnt = 0;

    /* libxml2 stuff */
    xmlInitParser();
    xmlSubstituteEntitiesDefault(1);    /* resolve text entities */

    /* debugging help */
    get_env_vars();

    if (SWISH_DEBUG & SWISH_DEBUG_MEMORY) {
        /* %p with a void pointer: the conversion must match the argument */
        SWISH_DEBUG_MSG("parser ptr %p", (void *)p);
    }

    return p;
}

/*
 * Tear down a swish_Parser created by swish_parser_init().
 *
 *  p  parser to release
 *
 * Warns when the parser's ref_cnt is not zero, then cleans up libxml2
 * (xmlCleanupParser(), xmlMemoryDump()) and frees p with swish_xfree().
 */
void
swish_parser_free(
    swish_Parser *p
)
{
    const int debug_memory = SWISH_DEBUG & SWISH_DEBUG_MEMORY;

    if (debug_memory) {
        SWISH_DEBUG_MSG("freeing parser");
        swish_mem_debug();
    }

    if (p->ref_cnt) {
        SWISH_WARN("parser ref_cnt != 0: %d\n", p->ref_cnt);
    }

    /* libxml2 teardown first, the parser struct itself last */
    xmlCleanupParser();
    xmlMemoryDump();

    swish_xfree(p);
}

/*
 * Turn the literal xml/html tag into a swish ("baked") tag for matching
 * against metanames and properties.
 *
 * parser_data:  current parse state. bump_word, ignore_content and tag may be
 *               modified, and metanames/properties may be created in the
 *               config depending on the undef_metas / undef_attrs flags.
 * tag:          raw element name as delivered by the SAX callback.
 * atts:         NULL, or a NULL-terminated array of name/value pairs.
 * xmlns_prefix: namespace prefix, or NULL. Prepended to the tag unless the
 *               ignore_xmlns flag is set.
 *
 * Returns a newly allocated string owned by the caller (release with
 * swish_xfree()), or NULL when an HTML <meta name=.. content=..> element was
 * handled completely here (opened, buffered and closed under its 'name').
 */
static xmlChar *
bake_tag(
    swish_ParserData *parser_data,
    xmlChar *tag,
    xmlChar **atts,
    xmlChar *xmlns_prefix
)
{
    int i, j, size, prev_ignore_content;
    boolean prev_bump_word;
    xmlChar *swishtag,
            *swishdomtag,
            *tmpstr,
            *xmlns,
            *attr_lower,
            *attr_val_lower,
            *alias,
            *metaname,
            *metacontent,
            *metaname_from_attr;
    swish_StringList *strlist;

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
        SWISH_DEBUG_MSG(" tag: %s   parser->tag: %s ", tag, parser_data->tag);
        if (atts != NULL) {
            /* atts is an array of pointers, not a string: count the pairs
             * instead of calling xmlStrlen() on it */
            for (i = 0; (atts[i] != NULL); i += 2) {
                /* counting only */
            }
            SWISH_DEBUG_MSG(" has attributes [%d]", i / 2);
            for (i = 0; (atts[i] != NULL); i += 2) {
                SWISH_DEBUG_MSG(" att: %s=", atts[i]);
                if (atts[i + 1] != NULL) {
                    SWISH_DEBUG_MSG(" '%s'", atts[i + 1]);
                }
            }
        }
    }

    metaname = NULL;
    metacontent = NULL;

    // normalize all tags

    swishtag = swish_str_tolower(tag);

    /* XML namespace support optional */
    if (xmlns_prefix != NULL && !parser_data->s3->config->flags->ignore_xmlns) {
        xmlns = swish_str_tolower(xmlns_prefix);
        if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
            SWISH_DEBUG_MSG("xmlns_prefix: '%s' (%s)", xmlns, xmlns_prefix);
        }
        size = xmlStrlen(swishtag) + xmlStrlen(xmlns) + 2;     /*  : + NUL */
        tmpstr = swish_xmalloc(size + 1);
        snprintf((char *)tmpstr, size, "%s%c%s", (char *)xmlns,
            SWISH_XMLNS_CHAR, (char *)swishtag);
        swish_xfree(swishtag);
        swish_xfree(xmlns);
        swishtag = tmpstr;
    }

/*
* html tags
*/
    if (parser_data->is_html) {

/*
           TODO config features about img tags and a/href tags
*/
        if (xmlStrEqual(swishtag, BAD_CAST "br")
            ||
            xmlStrEqual(swishtag, BAD_CAST "img")
        ) {

            if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
                SWISH_DEBUG_MSG("found html tag '%s' ... bump_word = %d", swishtag, SWISH_TRUE);
            }
            parser_data->bump_word = SWISH_TRUE;
        }
        else {
            const htmlElemDesc *element = htmlTagLookup(swishtag);

            if (!element) {
                /* not a known HTML element (it might be a meta name):
                 * leave bump_word untouched */
            }
            else if (!element->isinline) {

/*
* need to bump token position so we don't match across block *
* elements
*/
                if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
                    SWISH_DEBUG_MSG("found html !inline tag '%s' ... bump_word = %d", swishtag, SWISH_TRUE);
                }
                parser_data->bump_word = SWISH_TRUE;

            }
            else {

                if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
                    SWISH_DEBUG_MSG("found html inline tag '%s' ... bump_word = %d", swishtag, SWISH_FALSE);
                }
                parser_data->bump_word = SWISH_FALSE;

            }
        }

/*
* is this an HTML <meta> tag? treat 'name' attribute as a tag
* and 'content' attribute as the tag content.
* we assume 'name' and 'content' are always in english.
*/

        if (xmlStrEqual(swishtag, BAD_CAST "meta") && atts != NULL) {
            /* step over name/value PAIRS: stepping by one would compare
             * attribute values against "name"/"content" and would stop at
             * the first attribute that has no value */
            for (i = 0; (atts[i] != NULL); i += 2) {

                if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
                    SWISH_DEBUG_MSG("%d HTML attr: %s", i, atts[i]);
                }

                if (xmlStrEqual(atts[i], BAD_CAST "name")) {
                    metaname = (xmlChar *)atts[i + 1];
                }
                else if (xmlStrEqual(atts[i],  BAD_CAST "content")) {
                    metacontent = (xmlChar *)atts[i + 1];
                }

            }
        }

        if (metaname != NULL) {

            prev_ignore_content = parser_data->ignore_content;  // remember

            if (!swish_hash_exists(parser_data->s3->config->metanames, metaname)
                &&
                !swish_hash_exists(parser_data->s3->config->tag_aliases, metaname)
            ) {

                switch(parser_data->s3->config->flags->undef_metas) {

                    case SWISH_UNDEF_METAS_ERROR:
                        SWISH_CROAK("HTML <meta> tag with 'name' attribute '%s' is not a defined MetaName and %s == error",
                            metaname, SWISH_UNDEFINED_METATAGS);
                        break;

                    case SWISH_UNDEF_METAS_IGNORE:
                        if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
                            SWISH_DEBUG_MSG("setting ignore_content=%d",
                                (parser_data->ignore_content +1));
                        }
                        parser_data->ignore_content++;
                        break;

                    case SWISH_UNDEF_METAS_AUTO:
                        swish_metaname_new(metaname, parser_data->s3->config);
                        swish_nb_new(parser_data->metanames, metaname);
                        break;

                    case SWISH_UNDEF_METAS_AUTOALL:
                        swish_metaname_new(metaname, parser_data->s3->config);
                        swish_nb_new(parser_data->metanames, metaname);
                        if (!swish_hash_exists(parser_data->s3->config->properties, metaname)) {
                            swish_property_new(metaname, parser_data->s3->config);
                            swish_nb_new(parser_data->properties, metaname);
                        }
                        break;

                    case SWISH_UNDEF_METAS_INDEX:
                    default:
                        break;  // nothing to do

                }   // end switch

            }

            if (metacontent != NULL) {
                if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
                    SWISH_DEBUG_MSG("found HTML meta: %s => %s", metaname, metacontent);
                }

                // do not match across metas
                if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
                    SWISH_DEBUG_MSG("found HTML meta tag '%s' ... bump_word = %d", metaname, SWISH_TRUE);
                }

                prev_bump_word = parser_data->bump_word;    // remember
                parser_data->bump_word = SWISH_TRUE;
                open_tag(parser_data, metaname, NULL, xmlns_prefix);
                buffer_characters(parser_data, metacontent, xmlStrlen(metacontent));
                close_tag(parser_data, metaname, xmlns_prefix);
                parser_data->bump_word = prev_bump_word;    // restore

                /* restore here too: this path returns early, and leaving the
                 * counter raised would ignore the rest of the document */
                if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
                    SWISH_DEBUG_MSG("setting ignore_content=%d", prev_ignore_content);
                }
                parser_data->ignore_content = prev_ignore_content;

                if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
                    SWISH_DEBUG_MSG("close_tag done. swishtag = '%s', parser->tag = '%s'",
                        swishtag, parser_data->tag);
                }

                swish_xfree(parser_data->tag);  // metaname set recursively, so must free
                parser_data->tag = NULL;        // never leave a dangling pointer behind
                swish_xfree(swishtag);          // 'meta'

                return NULL;

            }
            else {
                SWISH_WARN("No content for meta tag '%s'", metaname);
            }

            if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
                SWISH_DEBUG_MSG("setting ignore_content=%d", prev_ignore_content);
            }
            parser_data->ignore_content = prev_ignore_content;  // restore
        }

    }

/*
* xml tags
*/
    else {

        if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
            SWISH_DEBUG_MSG("found xml tag '%s' ... bump_word = %d", swishtag, SWISH_TRUE);
        }

        parser_data->bump_word = SWISH_TRUE;    // TODO make this configurable

/*
    XML attributes are parsed in 2 ways:
    (1) <foo class="bar">text</foo>
        becomes
        <foo.bar>text</foo>

    (2) <foo class="bar">text</foo>
        becomes
        <foo.class>bar</foo.class><foo>text</foo>

    the (2)-style is similar to HTML parser for meta name/content

*/

        if (atts != NULL) {
            strlist = NULL;
            if (swish_hash_exists(parser_data->s3->config->stringlists, (xmlChar *)SWISH_CLASS_ATTRIBUTES)) {
                strlist = swish_hash_fetch(parser_data->s3->config->stringlists, (xmlChar *)SWISH_CLASS_ATTRIBUTES);
            }

            for (i = 0; (atts[i] != NULL); i += 2) {

                if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
                    SWISH_DEBUG_MSG(" %d XML attr: %s=%s [%d]", i, atts[i], atts[i + 1],
                                    xmlStrlen(atts[i + 1]));

                attr_lower = swish_str_tolower(atts[i]);
                attr_val_lower = swish_str_tolower(atts[i + 1]);

/* is this attribute a metaname? */
                if (strlist != NULL) {
                    for (j = 0; j < strlist->n; j++) {
                        if (xmlStrEqual(strlist->word[j], attr_lower)) {
                            if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
                                SWISH_DEBUG_MSG("found %s: %s", attr_lower, attr_val_lower);

/* eligible attribute name, attribute value part of baked tag */
                            size = xmlStrlen(swishtag) + xmlStrlen(attr_val_lower) + 2;     /*  dot + NUL */
                            metaname = swish_xmalloc(size + 1);
                            snprintf((char *)metaname, size, "%s%c%s", (char *)swishtag, SWISH_DOT,
                                     (char *)attr_val_lower);

                            swish_xfree(swishtag);
                            swishtag = metaname;
                        }
                    }
                }

/*
    explicit metaname with dotted notation.
    attribute value considered document content, similar to how HTML parser works.
*/
                size = xmlStrlen(swishtag) + xmlStrlen(attr_lower) + 2;     /*  dot + NUL */
                metaname_from_attr = swish_xmalloc(size + 1);
                snprintf((char *)metaname_from_attr, size, "%s%c%s", (char *)swishtag, SWISH_DOT,
                    (char *)attr_lower);

                if (!swish_hash_exists(parser_data->s3->config->metanames, metaname_from_attr)) {
                    switch(parser_data->s3->config->flags->undef_attrs) {

                        case SWISH_UNDEF_ATTRS_ERROR:
                            SWISH_CROAK("XML tag '%s' is not a defined MetaName and %s == error",
                                metaname_from_attr, SWISH_UNDEFINED_XML_ATTRIBUTES);
                            break;

                        case SWISH_UNDEF_ATTRS_IGNORE:
                            // TODO in the case of attributes, is this needed?
                            //parser_data->ignore_content++;
                            break;

                        case SWISH_UNDEF_ATTRS_AUTO:
                            swish_metaname_new(metaname_from_attr, parser_data->s3->config);
                            swish_nb_new(parser_data->metanames, metaname_from_attr);
                            break;

                        case SWISH_UNDEF_ATTRS_AUTOALL:
                            swish_metaname_new(metaname_from_attr, parser_data->s3->config);
                            swish_nb_new(parser_data->metanames, metaname_from_attr);
                            if (!swish_hash_exists(parser_data->s3->config->properties, metaname_from_attr)) {
                                swish_property_new(metaname_from_attr, parser_data->s3->config);
                                swish_nb_new(parser_data->properties, metaname_from_attr);
                            }
                            break;

                        case SWISH_UNDEF_ATTRS_INDEX:
                            // TODO what metaname to use?
                            prev_bump_word = parser_data->bump_word;
                            parser_data->bump_word = SWISH_TRUE;
                            buffer_characters(parser_data, attr_val_lower, xmlStrlen(attr_val_lower));
                            parser_data->bump_word = prev_bump_word;
                            break;

                        case SWISH_UNDEF_ATTRS_DISABLE:
                        default:
                            break;  // nothing to do

                    }   // end switch
                }

                if (swish_hash_exists(parser_data->s3->config->metanames, metaname_from_attr)) {
                    if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
                        SWISH_DEBUG_MSG("found XML meta tag '%s' with content '%s'",
                            metaname_from_attr, attr_val_lower);

                    parser_data->bump_word = SWISH_TRUE;
                    open_tag(parser_data, metaname_from_attr, NULL, xmlns_prefix);
                    buffer_characters(parser_data, attr_val_lower, xmlStrlen(attr_val_lower));
                    close_tag(parser_data, metaname_from_attr, xmlns_prefix);

                    /* log before freeing so the message never reads freed memory */
                    if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
                        SWISH_DEBUG_MSG("close_tag done. swishtag = '%s', parser->tag = '%s'",
                            metaname_from_attr, parser_data->tag);

                    swish_xfree(parser_data->tag);  // metaname set recursively, so must free
                    /* reset: open_tag() frees a non-NULL parser_data->tag, so a
                     * stale pointer would be freed twice on the next attribute */
                    parser_data->tag = NULL;

                }

                swish_xfree(metaname_from_attr);
                swish_xfree(attr_lower);
                swish_xfree(attr_val_lower);

            }
        }

        if (!swish_hash_exists(parser_data->s3->config->metanames, swishtag)
            &&
            !swish_hash_exists(parser_data->s3->config->tag_aliases, swishtag)
        ) {

            switch(parser_data->s3->config->flags->undef_metas) {

                case SWISH_UNDEF_METAS_ERROR:
                    SWISH_CROAK("XML tag '%s' is not a defined MetaName and %s == error",
                        swishtag, SWISH_UNDEFINED_METATAGS);
                    break;

                case SWISH_UNDEF_METAS_IGNORE:
                    if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
                        SWISH_DEBUG_MSG("setting ignore_content=%d",
                            (parser_data->ignore_content +1));
                    }
                    parser_data->ignore_content++;
                    break;

                case SWISH_UNDEF_METAS_AUTO:
                    swish_metaname_new(swishtag, parser_data->s3->config);
                    swish_nb_new(parser_data->metanames, swishtag);
                    break;

                case SWISH_UNDEF_METAS_AUTOALL:
                    swish_metaname_new(swishtag, parser_data->s3->config);
                    swish_nb_new(parser_data->metanames, swishtag);
                    if (!swish_hash_exists(parser_data->s3->config->properties, swishtag)) {
                        swish_property_new(swishtag, parser_data->s3->config);
                        swish_nb_new(parser_data->properties, swishtag);
                    }
                    break;

                case SWISH_UNDEF_METAS_INDEX:
                default:
                    if (parser_data->ignore_content) {
                        if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
                            SWISH_DEBUG_MSG("ignore_content was %d, setting ignore_content=0",
                                parser_data->ignore_content);
                        }
                        parser_data->ignore_content = 0;
                    }
                    break;

            }   // end switch
        }

    }   // end XML tag

/*
 * change our internal name for this tag if it is aliased in config.
 * test the simple tag first, and if that fails, the whole dom stack.
 */
    alias = swish_hash_fetch(parser_data->s3->config->tag_aliases, swishtag);
    if (alias) {
        if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
            SWISH_DEBUG_MSG("%s alias -> %s", swishtag, alias);
        }
        swish_xfree(swishtag);
        swishtag = swish_xstrdup(alias);
    }
    else {
        swishdomtag = flatten_tag_stack(swishtag, parser_data->domstack, SWISH_DOT);
        alias = swish_hash_fetch(parser_data->s3->config->tag_aliases, swishdomtag);
        if (alias) {
            if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
                SWISH_DEBUG_MSG("%s alias -> %s", swishdomtag, alias);
            }
            swish_xfree(swishtag);
            swishtag = swish_xstrdup(alias);
        }
        swish_xfree(swishdomtag);
    }

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
        SWISH_DEBUG_MSG(" swishtag = %s", swishtag);
    }

    return swishtag;
}

/*
 * Flush the pending character buffer (meta_buf) into the metanames buffer
 * and, when the analyzer is configured to tokenize, into tokens.
 *
 * parser_data: current parse state; meta_buf is emptied on return.
 * metaname:    name to store the text under. If the configured MetaName has
 *              an alias_for target, the target name is used instead.
 * context:     context string handed through to tokenize().
 */
static void
flush_buffer(
    swish_ParserData *parser_data,
    xmlChar *metaname,
    xmlChar *context
)
{
    swish_MetaName *meta;
    xmlChar *metaname_stored_as;
    swish_TagStack *s = parser_data->metastack;

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
        SWISH_DEBUG_MSG("buffer is >>%s<< before flush",
                        xmlBufferContent(parser_data->meta_buf));

/*
* add meta_buf as-is to metanames buffer under current tag. this
* gives us both tokens and raw text de-tagged but organized by
* metaname. If the metaname is an alias_for, use the target of the alias.
*/
    meta = swish_hash_fetch(parser_data->s3->config->metanames, metaname);

    /* the lookup may miss (name not configured): fall back to the name as
     * given instead of dereferencing a NULL entry */
    if (meta != NULL && meta->alias_for != NULL) {
        metaname_stored_as = meta->alias_for;
    }
    else {
        metaname_stored_as = metaname;
    }
    swish_nb_add_buf(parser_data->metanames, metaname_stored_as, parser_data->meta_buf,
                        (xmlChar *)SWISH_TOKENPOS_BUMPER, 0, 1);

/*
*  if cascade_meta_context is true, add tokens (buffer) to every metaname on the stack.
*/

    if (parser_data->s3->config->flags->cascade_meta_context) {
        for (s->temp = s->head; s->temp != NULL; s->temp = s->temp->next) {
            if (xmlStrEqual(s->temp->baked, metaname_stored_as))  /*  already added */
                continue;

            swish_nb_add_buf(parser_data->metanames, s->temp->baked,
                                parser_data->meta_buf, (xmlChar *)SWISH_TOKENPOS_BUMPER,
                                0, 1);
        }
    }

    if (parser_data->s3->analyzer->tokenize) {
        tokenize(parser_data, (xmlChar *)xmlBufferContent(parser_data->meta_buf),
                 xmlBufferLength(parser_data->meta_buf), metaname_stored_as, context);
    }

    xmlBufferEmpty(parser_data->meta_buf);

}

/*
 * SAX2 callback: start of document.
 * Nothing needs to be set up here; the event is only traced when parser
 * debugging is enabled.
 */
static void
mystartDocument(
    void *data
)
{
    (void)data;     /* the parser data is not needed for this event */

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
        SWISH_DEBUG_MSG("startDocument()");
    }
}

/*
 * SAX2 callback: end of document.
 * Flushes whatever text is still buffered, using the default metaname as
 * both the metaname and the context.
 */
static void
myendDocument(
    void *parser_data
)
{
    if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
        SWISH_DEBUG_MSG("endDocument()");
    }

    /* whatever's left goes under the default metaname */
    flush_buffer(
        parser_data,
        (xmlChar *)SWISH_DEFAULT_METANAME,
        (xmlChar *)SWISH_DEFAULT_METANAME
    );
}

/* 
* SAX1 callback 
*/
static void
mystartElement(
    void *data,
    const xmlChar *name,
    const xmlChar **atts
)
{
    open_tag(data, name, (xmlChar **)atts, NULL);
}

/*
 * SAX1 callback: element end.
 * Forwards to close_tag() with no namespace prefix.
 */
static void
myendElement(
    void *data,
    const xmlChar *name
)
{
    const xmlChar *no_prefix = NULL;

    close_tag(data, name, no_prefix);
}

/*
 * SAX2 handler: namespace-aware element start.
 *
 * Converts the SAX2 attribute array (five entries per attribute; entry 0 is
 * the name and entries 3 and 4 delimit the value) into a NULL-terminated
 * array of name/value pairs, processes an XInclude element when
 * follow_xinclude is set, and then forwards to open_tag().
 *
 * Attributes with an empty value are skipped rather than ending the list,
 * so later attributes are still seen and their value copies are freed.
 *
 * nb_namespaces, namespaces and nb_defaulted are not used.
 */
static void
mystartElementNs(
    void *data,
    const xmlChar *localname,
    const xmlChar *xmlns_prefix,
    const xmlChar *URI,
    int nb_namespaces,
    const xmlChar **namespaces,
    int nb_attributes,
    int nb_defaulted,
    const xmlChar **attributes
)
{
    int i, j, len;
    xmlChar **atts;
    xmlChar *value;
    xmlChar *xinclude_uri;
    boolean xinclude_is_text;
    swish_ParserData *parser_data;

    atts = NULL;
    parser_data = (swish_ParserData*)data;

    if (nb_attributes > 0) {
        /* room for every name/value pair plus the NULL terminator */
        atts = swish_xmalloc(((nb_attributes * 2) + 1) * sizeof(xmlChar *));
        j = 0;
        for (i = 0; i < nb_attributes * 5; i += 5) {
            len = (int)(attributes[i + 4] - attributes[i + 3]);
            if (len <= 0) {
                continue;   /* empty value: skip this attribute only */
            }
            value = xmlStrsub(attributes[i + 3], 0, len);
            if (value == NULL) {
                continue;   /* copy failed: do not hand out a NULL value */
            }
            atts[j] = (xmlChar *)attributes[i];
            atts[j + 1] = value;
            j += 2;
        }
        atts[j] = NULL;
    }

    /* check for XInclude */
    if ((xmlStrEqual(URI, XINCLUDE_OLD_NS) || xmlStrEqual(URI, XINCLUDE_NS))
        &&
        xmlStrEqual(localname, XINCLUDE_NODE)
        &&
        atts != NULL
    ) {
        xinclude_is_text = SWISH_FALSE;
        xinclude_uri = NULL;
        for (i = 0; (atts[i] != NULL); i += 2) {
            if (xmlStrEqual(atts[i], XINCLUDE_HREF)) {
                xinclude_uri = atts[i+1];
            }
            if (xmlStrEqual(atts[i], XINCLUDE_PARSE)) {
                xinclude_is_text = (boolean)xmlStrEqual(atts[i+1], XINCLUDE_PARSE_TEXT);
            }
        }
        if (xinclude_uri != NULL && parser_data->s3->config->flags->follow_xinclude) {
            process_xinclude( parser_data, xinclude_uri, xinclude_is_text );
        }
    }

    open_tag(data, localname, atts, xmlns_prefix);

    if (atts != NULL) {
        for (i = 0; (atts[i] != NULL); i += 2) {
            xmlFree(atts[i+1]); /* do not use swish_xfree since we did not malloc it */
        }
        swish_xfree(atts);
    }
}

/*
 * Merge the results of an included (child) document into its parent.
 *
 * The parent parser data is taken from s3->stash. Every token produced for
 * the child is added to the parent's token list, the child's word count is
 * added to the parent's, and the child's property and metaname buffers are
 * concatenated onto the parent's.
 */
static void
xinclude_handler(
    swish_ParserData *parser_data
)
{
    swish_ParserData *parent = (swish_ParserData*)parser_data->s3->stash;
    swish_TokenIterator *iter = parser_data->token_iterator;
    swish_Token *tok;

    for (tok = swish_token_iterator_next_token(iter);
         tok != NULL;
         tok = swish_token_iterator_next_token(iter)
    ) {
        swish_token_list_add_token(
            parent->token_iterator->tl,
            tok->value,
            tok->len + 1,   /* include the NUL */
            tok->meta,
            tok->context
        );
    }

    parent->docinfo->nwords += parser_data->docinfo->nwords;

    swish_buffer_concat(parent->properties, parser_data->properties);
    swish_buffer_concat(parent->metanames, parser_data->metanames);
}

/*
 * Parse the document referenced by an XInclude element and merge its results
 * into the including document.
 *
 * parser_data: parse state of the including document.
 * uri:         href of the XInclude. A value starting with SWISH_PATH_SEP is
 *              used as-is; anything else is resolved against the directory
 *              of the including document (or "./" when it has none).
 * is_text:     when true the included document is parsed with the text
 *              parser regardless of the parser picked for it.
 *
 * s3->stash is pointed at parser_data for the duration of the call so that
 * xinclude_handler() can find the parent, and is restored before returning.
 */
static void
process_xinclude(
    swish_ParserData *parser_data,
    xmlChar *uri,
    boolean is_text
)
{
    xmlChar *xuri;
    xmlChar *path;
    void *cur_stash;
    swish_ParserData *child_data;
    boolean path_is_absolute, path_needs_free;

    path_needs_free = SWISH_FALSE;

    /* test if absolute path */
    if (uri[0] == SWISH_PATH_SEP) {
        xuri = uri;
        path = NULL;
        path_is_absolute = SWISH_TRUE;
    }
    else {
        path = swish_fs_get_path(parser_data->docinfo->uri);
        if (path == NULL) {
            // no path == cwd
            path = swish_xmalloc(3);
            snprintf((char*)path, 3, ".%c", SWISH_PATH_SEP);   /* NUL-terminates */
            path_needs_free = SWISH_TRUE;
        }
        xuri = xmlBuildURI(uri, path);
        if (xuri == NULL) {
            SWISH_CROAK("Unable to build XInclude URI for %s and %s", uri, path);
        }
        path_is_absolute = SWISH_FALSE;
    }

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
        /* path is NULL for absolute URIs; never hand NULL to %s */
        SWISH_DEBUG_MSG("xinclude  uri=%s  path=%s  xuri=%s", parser_data->docinfo->uri,
            (path != NULL ? (char *)path : "(null)"), xuri);
    }

    /*
     * set up our internal handler function,
     * which merges the 2 docs together. This isn't ideal, but since we
     * are using SAX we can't leverage all the built-in XInclude support
     * that is part of libxml2.
     */
    cur_stash = parser_data->s3->stash;
    parser_data->s3->stash = parser_data;

    /* flush text buffered so far so it precedes the included content */
    flush_buffer(   parser_data,
                    parser_data->metastack->head->baked,
                    parser_data->metastack->head->context
                );
    child_data = init_parser_data(parser_data->s3);
    child_data->docinfo = swish_docinfo_init();
    child_data->docinfo->ref_cnt++;

    if (!swish_docinfo_from_filesystem(xuri, child_data->docinfo, child_data)) {
        SWISH_WARN("Skipping XInclude %s", xuri);
    }
    else {
        if (is_text && !xmlStrEqual(child_data->docinfo->parser, BAD_CAST SWISH_PARSER_TXT)) {
            swish_xfree(child_data->docinfo->parser);
            child_data->docinfo->parser = swish_xstrdup( BAD_CAST SWISH_PARSER_TXT );
        }
        /* NOTE(review): the docparser() result was never inspected here;
         * confirm whether a failure should skip the merge below */
        (void)docparser(child_data, xuri, NULL, 0);
        xinclude_handler(child_data);
    }

    /* clean up */
    free_parser_data(child_data);
    if (!path_is_absolute) {
        if (path_needs_free) {
            swish_xfree(path);
        }
        else {
            xmlFree(path);
        }
        xmlFree(xuri);
    }
    /* absolute case: xuri aliases the caller's uri and path is NULL,
     * so there is nothing to release */

    /* restore stash */
    parser_data->s3->stash = cur_stash;
}

/*
 * SAX2 handler: namespace-aware element end.
 * Forwards the local name and prefix to close_tag(); the namespace URI is
 * not needed.
 */
static void
myendElementNs(
    void *data,
    const xmlChar *localname,
    const xmlChar *xmlns_prefix,
    const xmlChar *URI
)
{
    (void)URI;

    close_tag(data, localname, xmlns_prefix);
}

/*
 * Handle the start of an element.
 *
 * data:         the swish_ParserData for the current document.
 * tag:          raw element name.
 * atts:         NULL, or a NULL-terminated array of name/value pairs.
 * xmlns_prefix: namespace prefix, or NULL.
 *
 * Replaces parser_data->tag with the baked form of 'tag', pushes the tag on
 * the domstack, and additionally on the propstack and/or metastack when the
 * baked tag or its dom context is configured as a property / metaname. Text
 * buffered so far is flushed before a new metaname is pushed.
 */
static void
open_tag(
    void *data,
    const xmlChar *tag,
    xmlChar **atts,
    const xmlChar *xmlns_prefix
)
{
    swish_ParserData *parser_data;
    xmlChar *baked;

    parser_data = (swish_ParserData *)data;

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
        SWISH_DEBUG_MSG("<%s>", tag);
    }

    if (parser_data->tag != NULL) {
        if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
            SWISH_DEBUG_MSG("Freeing swishtag (parser_data->tag): '%s'", parser_data->tag);
        }
        swish_xfree(parser_data->tag);
        parser_data->tag = NULL;
    }

    parser_data->tag = bake_tag(
                parser_data,
                (xmlChar *)tag,
                (xmlChar **)atts,
                (xmlChar *)xmlns_prefix);

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
        SWISH_DEBUG_MSG("checking config for '%s' in watched tags", parser_data->tag);
    }

/* all tags on domstack */

    if (parser_data->tag == NULL) {
        /* bake_tag() handled the element itself; track it under its raw name */
        push_tag_stack(parser_data->domstack, (xmlChar *)tag, (xmlChar *)tag, SWISH_DOT);
    }
    else {
        push_tag_stack(parser_data->domstack, (xmlChar *)tag, parser_data->tag, SWISH_DOT);
    }

/*
* set property if this tag is configured for it
*/
    if (swish_hash_exists(parser_data->s3->config->properties, parser_data->tag)
        ||
        swish_hash_exists(parser_data->s3->config->properties, parser_data->domstack->head->context)
    ) {
        if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
            SWISH_DEBUG_MSG(" %s = new property", parser_data->tag);
        }

        add_stack_to_prop_buf(NULL, parser_data);       /* NULL means all properties in the stack are added */
        xmlBufferEmpty(parser_data->prop_buf);

        if (swish_hash_exists(parser_data->s3->config->properties, parser_data->domstack->head->context)) {
            baked = parser_data->domstack->head->context;
        }
        else {
            baked = parser_data->tag;
        }

        push_tag_stack(parser_data->propstack, (xmlChar *)tag, baked, SWISH_DOM_CHAR);

        if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
            SWISH_DEBUG_MSG("%s pushed ok unto propstack", baked);
        }
    }

/*
* likewise for metastack
*/

    if (swish_hash_exists(parser_data->s3->config->metanames, parser_data->tag)
        ||
        swish_hash_exists(parser_data->s3->config->metanames, parser_data->domstack->head->context)
    ) {
        if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
            SWISH_DEBUG_MSG(" %s = new metaname", parser_data->tag);
        }
        flush_buffer(parser_data, parser_data->metastack->head->baked,
                     parser_data->metastack->head->context);

        /* pick whichever name is a configured MetaName. This must consult
         * metanames, not properties: the name pushed here is later looked up
         * in config->metanames by flush_buffer() */
        if (swish_hash_exists(parser_data->s3->config->metanames, parser_data->domstack->head->context)) {
            baked = parser_data->domstack->head->context;
        }
        else {
            baked = parser_data->tag;
        }
        push_tag_stack(parser_data->metastack, (xmlChar *)tag, baked, SWISH_DOM_CHAR);
    }

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
        SWISH_DEBUG_MSG("config check for '%s' done", parser_data->tag);
    }
}

/*
 * Handle the end of an element.
 *
 * data:         the swish_ParserData for the current document.
 * tag:          raw element name.
 * xmlns_prefix: namespace prefix, or NULL.
 *
 * Re-bakes the tag into parser_data->tag, then unwinds the stacks: a
 * matching propstack entry has its text added to the property buffer, a
 * matching metastack entry has the buffered text flushed under it, and the
 * domstack is always popped. Nothing is unwound when the tag bakes to NULL.
 */
static void
close_tag(
    void *data,
    const xmlChar *tag,
    const xmlChar *xmlns_prefix
)
{
    swish_ParserData *parser_data = (swish_ParserData *)data;
    xmlChar *raw = (xmlChar *)tag;
    swish_Tag *popped;

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
        SWISH_DEBUG_MSG("</%s>", tag);
    }

    /* drop the tag baked for the previous event before baking this one */
    if (parser_data->tag != NULL) {
        if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
            SWISH_DEBUG_MSG("freeing parser_data->tag '%s'", parser_data->tag);
        }
        swish_xfree(parser_data->tag);
        parser_data->tag = NULL;
    }

    /* bake_tag() lowercases the name for comparison against metanames */
    parser_data->tag = bake_tag(parser_data, raw, NULL, (xmlChar *)xmlns_prefix);

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
        SWISH_DEBUG_MSG(" endElement(%s) (%s)", (xmlChar *)tag, parser_data->tag);
    }

    if (parser_data->tag == NULL) {
        return;
    }

    /* property stack */
    popped = pop_tag_stack_on_match(parser_data->propstack, raw);
    if (popped != NULL) {
        add_stack_to_prop_buf(popped->baked, parser_data);
        xmlBufferEmpty(parser_data->prop_buf);
        free_swishTag(popped);
    }

    /* metaname stack: flush while the popped tag is still alive */
    popped = pop_tag_stack_on_match(parser_data->metastack, raw);
    if (popped != NULL) {
        flush_buffer(parser_data, popped->baked, popped->context);
        free_swishTag(popped);
    }

    /* always pop the raw domstack */
    popped = pop_tag_stack(parser_data->domstack);
    free_swishTag(popped);
}

/*
 * Append character data to both the metaname buffer and the property buffer.
 *
 * parser_data: current parse state.
 * ch:          the characters to append.
 * len:         number of bytes in ch.
 *
 * Nothing is appended while ignore_content is non-zero. When bump_word is
 * set and a buffer already holds text, SWISH_TOKENPOS_BUMPER is appended to
 * that buffer before the new characters. bump_word is cleared afterwards.
 */
static void
buffer_characters(
    swish_ParserData *parser_data,
    const xmlChar *ch,
    int len
)
{
    const int debugging = (SWISH_DEBUG & SWISH_DEBUG_PARSER) != 0;

    if (parser_data->ignore_content) {
        if (debugging) {
            SWISH_DEBUG_MSG("skipping %d bytes because ignore_content > 0", len);
        }
        return;
    }

    if (debugging) {
        SWISH_DEBUG_MSG("appending %d bytes to buffer (bump_word=%d)", 
            len, parser_data->bump_word);
    }

    /* metaname buffer */
    if (parser_data->bump_word && xmlBufferLength(parser_data->meta_buf)) {
        if (debugging) {
            SWISH_DEBUG_MSG("bump_word is true; appending TOKENPOS_BUMPER to meta_buf");
        }
        swish_buffer_append(parser_data->meta_buf, (xmlChar *)SWISH_TOKENPOS_BUMPER, 1);
    }
    swish_buffer_append(parser_data->meta_buf, BAD_CAST ch, len);

    /* property buffer */
    if (parser_data->bump_word && xmlBufferLength(parser_data->prop_buf)) {
        if (debugging) {
            SWISH_DEBUG_MSG("bump_word is true; appending TOKENPOS_BUMPER to prop_buf");
        }
        swish_buffer_append(parser_data->prop_buf, (xmlChar *)SWISH_TOKENPOS_BUMPER, 1);
    }
    swish_buffer_append(parser_data->prop_buf, BAD_CAST ch, len);

    /* the bump applies to one run of characters only */
    parser_data->bump_word = SWISH_FALSE;
}

/*
 * SAX2 callback: character data.
 * Traces every byte when parser debugging is enabled, then hands the data
 * to buffer_characters().
 */
static void
mycharacters(
    void *parser_data,
    const xmlChar *ch,
    int len
)
{
    if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
        int pos = 0;

        while (pos < len) {
            SWISH_DEBUG_MSG("%c [%d]", ch[pos], pos);
            pos++;
        }
    }

    buffer_characters(parser_data, ch, len);
}

/*
 * SAX2 comment callback.
 *
 * Comments can be used to enable/disable indexing of a block:
 *
 *       <!-- noindex -->
 *       <!-- index -->
 *       <!-- SwishCommand noindex -->
 *       <!-- SwishCommand index -->
 *
 * "noindex" increments parser_data->ignore_content; "index" decrements it
 * while it is greater than zero. A comment that is empty after trimming,
 * or that starts with "SwishCommand" but carries any other command, is
 * ignored. Every other comment only sets bump_word so that the words on
 * either side of the comment do not form a phrase.
 *
 * NOTE(review): swish_str_trim_ws() is called on a pointer into cmt, so it
 * presumably edits the comment text in place -- confirm that is acceptable
 * for the buffer libxml2 passes in.
 */
static void
mycomments(
    void *data,
    const xmlChar *cmt
)
{
    const xmlChar *prefix = (const xmlChar *)"SwishCommand";
    int prefix_len = xmlStrlen(prefix);
    int swishcmd_found = 0;
    xmlChar *comment_text = swish_str_skip_ws((xmlChar*)cmt);
    swish_ParserData *parser_data = (swish_ParserData *)data;

    swish_str_trim_ws(comment_text);
    if (! *comment_text) {
        return;
    }

    /* Strip off a leading SwishCommand (case-insensitive) - might be for future use */
    if ( !xmlStrncasecmp( comment_text, prefix, prefix_len ) ) {
        comment_text = swish_str_skip_ws( comment_text + prefix_len );
        swishcmd_found = 1;
    }

    if ( !xmlStrcasecmp( comment_text, (xmlChar*)"noindex" ) ) {
        if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
            SWISH_DEBUG_MSG("found noindex comment, setting ignore_content=%d", 
                (parser_data->ignore_content +1));
        }
        parser_data->ignore_content++;
        return;
    }
    else if ( !xmlStrcasecmp( comment_text, (xmlChar*)"index" ) ) {
        if ( parser_data->ignore_content > 0 ) {
            if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
                SWISH_DEBUG_MSG("found index comment, setting ignore_content=%d", 
                    (parser_data->ignore_content -1));
            }
            parser_data->ignore_content--;
        }
        return;
    }

    /* an unrecognised SwishCommand is silently dropped */
    if ( swishcmd_found )
        return;

    /* Bump position around comments - hard coded, always done to prevent phrase matching */
    parser_data->bump_word = SWISH_TRUE;

/*
* TODO: make comments indexing optional. Until then the comment text itself
* is never buffered.
*/
}

/*
 * SAX2 fatal-error callback.
 *
 * Does nothing unless SWISH_PARSER_WARNINGS is set. Otherwise it names the
 * document being parsed and forwards the formatted libxml2 message to
 * xmlParserError() so the parser context is reported with it.
 *
 * data is the swish_ParserData registered as the SAX user data; msg is a
 * printf-style format followed by its arguments.
 */
static void
myerr(
    void *data,
    xmlChar *msg,
    ...
)
{
    swish_ParserData *parser_data;
    va_list args;
    char str[1000];

    if (!SWISH_PARSER_WARNINGS)
        return;

    parser_data = (swish_ParserData *)data;

    SWISH_WARN("libxml2 error for %s:", parser_data->docinfo->uri);

    va_start(args, msg);
    vsnprintf(str, sizeof(str), (char *)msg, args);
    va_end(args);

    /*
     * str is already fully formatted and may contain '%' taken from the
     * document, so it must be passed as data, never as the format string.
     */
    xmlParserError(parser_data->ctxt, "%s", str);
}

/*
 * SAX2 warning callback.
 *
 * Does nothing unless SWISH_PARSER_WARNINGS is set. Otherwise it names the
 * document being parsed, notes when no parser context is attached, and
 * forwards the formatted libxml2 message to xmlParserWarning().
 *
 * user_data is the swish_ParserData registered as the SAX user data; msg
 * is a printf-style format followed by its arguments.
 */
static void
mywarn(
    void *user_data,
    xmlChar *msg,
    ...
)
{
    swish_ParserData *parser_data;
    va_list args;
    char str[1000];

    if (!SWISH_PARSER_WARNINGS)
        return;

    parser_data = (swish_ParserData *)user_data;

    SWISH_WARN("libxml2 warning for %s:", parser_data->docinfo->uri);
    if (parser_data->ctxt == NULL) {
        SWISH_WARN("ctxt is null");
    }

    va_start(args, msg);
    vsnprintf(str, sizeof(str), (char *)msg, args);
    va_end(args);

    /*
     * str is already fully formatted and may contain '%' taken from the
     * document, so it must be passed as data, never as the format string.
     */
    xmlParserWarning(parser_data->ctxt, "%s", str);
}

/* 
* SAX2 handler table shared by the HTML and XML parsers.
*
* Entries are positional and must stay in the order of libxml2's
* xmlSAXHandler struct. NULL means the event is not handled here.
*/

xmlSAXHandler my_parser = {
    NULL,                       /* internalSubset */
    NULL,                       /* isStandalone */
    NULL,                       /* hasInternalSubset */
    NULL,                       /* hasExternalSubset */
    NULL,                       /* resolveEntity */
    NULL,                       /* getEntity */
    NULL,                       /* entityDecl */
    NULL,                       /* notationDecl */
    NULL,                       /* attributeDecl */
    NULL,                       /* elementDecl */
    NULL,                       /* unparsedEntityDecl */
    NULL,                       /* setDocumentLocator */
    mystartDocument,            /* startDocument */
    myendDocument,              /* endDocument */
    mystartElement,             /* startElement */
    myendElement,               /* endElement */
    NULL,                       /* reference */
    mycharacters,               /* characters */
    NULL,                       /* ignorableWhitespace */
    NULL,                       /* processingInstruction */
    mycomments,                 /* comment */
    (warningSAXFunc) & mywarn,  /* warning */
    (errorSAXFunc) & mywarn,     /* error: non-fatal errors are reported through mywarn */
    (fatalErrorSAXFunc) & myerr, /* fatalError */
    NULL,                       /* getParameterEntity */
    NULL,                       /* cdataBlock */
    NULL,                       /* externalSubset */
    XML_SAX2_MAGIC,             /* initialized: marks this as a SAX2 handler */
    NULL,                       /* _private */
    mystartElementNs,           /* startElementNs */
    myendElementNs,             /* endElementNs */
    NULL                        /* serror (xmlStructuredErrorFunc) */
};

/* pointer form of the table, as handed to html_parser() and xml_parser() */
xmlSAXHandlerPtr my_parser_ptr = &my_parser;

/*
 * Run the parser selected by parser_data->docinfo->parser over a document.
 *
 * The document is either already in memory (buffer/size) or, when only
 * filename is given, read from disk first (gzip-aware). The first letter
 * of the parser name picks the backend: H/h HTML, X/x XML, T/t text;
 * anything else croaks.
 *
 * Returns 1 when the document appears to be empty, otherwise whatever the
 * selected backend returned. When filename is set the buffer is freed
 * before returning.
 */
static int
docparser(
    swish_ParserData *parser_data,
    xmlChar *filename,
    xmlChar *buffer,
    int size
)
{
    swish_DocInfo *info = parser_data->docinfo;
    xmlChar *mime = (xmlChar *)info->mime;
    xmlChar *parser = (xmlChar *)info->parser;
    int status = 0;

    /* no bytes anywhere: nothing to parse */
    if (!size && !xmlStrlen(buffer) && !info->size) {
        SWISH_WARN("%s appears to be empty -- can't parse it", info->uri);
        return 1;
    }

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
        SWISH_DEBUG_MSG("%s -- using %s parser [%c]", info->uri, parser, parser[0]);
    }

    /* slurp the file if its content is not already in memory */
    if (filename && !buffer) {
        if (info->is_gzipped) {
            buffer = swish_io_slurp_gzfile_len(filename, &(info->size), SWISH_FALSE);
            info->size = xmlStrlen(buffer);
        }
        else {
            buffer = swish_io_slurp_file_len(filename, (off_t)info->size, SWISH_FALSE);
        }
        size = info->size;
    }

    /* dispatch on the first letter of the parser name */
    switch (parser[0]) {
    case 'H':
    case 'h':
        parser_data->is_html = SWISH_TRUE;
        status = html_parser(my_parser_ptr, parser_data, buffer, size);
        break;

    case 'X':
    case 'x':
        status = xml_parser(my_parser_ptr, parser_data, buffer, size);
        break;

    case 'T':
    case 't':
        status = txt_parser(parser_data, (xmlChar *)buffer, size);
        break;

    default:
        SWISH_CROAK("no parser known for MIME '%s' parser '%s'", mime, parser);
        break;
    }

    if (filename) {
        swish_xfree(buffer);
    }

    return status;
}

/*
 * Allocate and initialise the per-document parser state.
 *
 * Takes a reference on s3, creates the meta/property xmlBuffers, the token
 * iterator and the named buffers for properties and metanames, installs
 * the default tokenizer when none is set, and primes the meta, property
 * and DOM tag stacks. docinfo and ctxt start out NULL; callers attach a
 * docinfo before parsing.
 *
 * Returns the new swish_ParserData, to be released with free_parser_data().
 */
static swish_ParserData *
init_parser_data(
    swish_3 *s3
)
{

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
        SWISH_DEBUG_MSG("init parser_data");

    swish_ParserData *ptr = (swish_ParserData *)swish_xmalloc(sizeof(swish_ParserData));

    ptr->s3 = s3;
    ptr->s3->ref_cnt++;

    ptr->meta_buf = xmlBufferCreateSize(SWISH_BUFFER_CHUNK_SIZE);
    ptr->prop_buf = xmlBufferCreateSize(SWISH_BUFFER_CHUNK_SIZE);

    ptr->tag = NULL;

/*
* no document attached yet. free_parser_data() tests this pointer, so it
* must never be left holding whatever the allocator returned.
*/
    ptr->docinfo = NULL;

    ptr->token_iterator = swish_token_iterator_init(s3->analyzer);
    ptr->token_iterator->ref_cnt++;
    ptr->properties = swish_nb_init(s3->config->properties);
    ptr->properties->ref_cnt++;
    ptr->metanames = swish_nb_init(s3->config->metanames);
    ptr->metanames->ref_cnt++;

/*
*   set tokenizer if one has not been explicitly set
*/
    if (s3->analyzer->tokenizer == NULL) {
        s3->analyzer->tokenizer = (&swish_tokenize);
    }

/*
* prime the stacks 
*/
    ptr->metastack = (swish_TagStack *)swish_xmalloc(sizeof(swish_TagStack));
    ptr->metastack->name = "MetaStack";
    ptr->metastack->head = NULL;
    ptr->metastack->temp = NULL;
    ptr->metastack->count = 0;
    push_tag_stack(ptr->metastack, (xmlChar *)SWISH_DEFAULT_METANAME,
                   (xmlChar *)SWISH_DEFAULT_METANAME, SWISH_DOM_CHAR);

    ptr->propstack = (swish_TagStack *)swish_xmalloc(sizeof(swish_TagStack));
    ptr->propstack->name = "PropStack";
    ptr->propstack->head = NULL;
    ptr->propstack->temp = NULL;
    ptr->propstack->count = 0;
    push_tag_stack(ptr->propstack, (xmlChar *)SWISH_DOM_STR, (xmlChar *)SWISH_DOM_STR, SWISH_DOM_CHAR);
    
    ptr->domstack  = (swish_TagStack *)swish_xmalloc(sizeof(swish_TagStack));
    ptr->domstack->name  = "DOMStack";
    ptr->domstack->head  = NULL;
    ptr->domstack->temp  = NULL;
    ptr->domstack->count = 0;

/*
* gets toggled per-tag 
*/
    ptr->bump_word = SWISH_TRUE;

/*
* counter: content is skipped while this is greater than zero
*/
    ptr->ignore_content = 0;

/*
* shortcut rather than looking parser up in hash for each tag event 
*/
    ptr->is_html = SWISH_FALSE;
        
/*
* always start at first byte 
*/
    ptr->offset = 0;

/*
* pointer to the xmlParserCtxt since we want to free it only after
* we're completely done with it. NOTE this is a change per libxml2
* vers > 2.6.16 
*/
    ptr->ctxt = NULL;

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
        SWISH_DEBUG_MSG("init done for parser_data");
    }
    
    return ptr;

}

/*
 * Pop and free every tag left on the stack, then free the stack itself.
 */
static void
free_swishTagStack(
    swish_TagStack *stack
)
{
    swish_Tag *popped = pop_tag_stack(stack);

    while (popped != NULL) {
        if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
            SWISH_DEBUG_MSG("%s %d POP %s [%s] [%s]", stack->name,
                            stack->count, popped->raw, popped->baked, popped->context);
        }

        free_swishTag(popped);
        popped = pop_tag_stack(stack);
    }

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
        SWISH_DEBUG_MSG("freeing stack %s", stack->name);
    }

    swish_xfree(stack);
}

/*
 * Release everything owned by a swish_ParserData created with
 * init_parser_data(): the reference on s3, the three tag stacks, the named
 * buffers, both xmlBuffers, the current tag, a still-attached libxml2
 * parser context, the token iterator, the docinfo and finally the struct
 * itself.
 */
static void
free_parser_data(
    swish_ParserData *ptr
)
{

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
        SWISH_DEBUG_MSG("freeing swish_ParserData");

/*
* dec ref count for shared ptr 
*/
    ptr->s3->ref_cnt--;

/*
* Pop the stacks 
*/
    free_swishTagStack(ptr->metastack);
    free_swishTagStack(ptr->propstack);
    free_swishTagStack(ptr->domstack);
    
/* free named buffers */

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
        SWISH_DEBUG_MSG("freeing swish_ParserData properties");

    ptr->properties->ref_cnt--;
    swish_nb_free(ptr->properties);

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
        SWISH_DEBUG_MSG("freeing swish_ParserData metanames");

    ptr->metanames->ref_cnt--;
    swish_nb_free(ptr->metanames);

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
        SWISH_DEBUG_MSG("freeing swish_ParserData xmlBuffer");

    xmlBufferFree(ptr->meta_buf);

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
        SWISH_DEBUG_MSG("freeing swish_ParserData prop xmlBuffer");

    xmlBufferFree(ptr->prop_buf);

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
        SWISH_DEBUG_MSG("freeing swish_ParserData tag");

    if (ptr->tag != NULL)
        swish_xfree(ptr->tag);

    if (ptr->ctxt != NULL) {

        if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
            SWISH_DEBUG_MSG("freeing swish_ParserData libxml2 parser ctxt");

/*
* the parser name lives in docinfo, which is allowed to be NULL (see the
* check further down), so it must be tested before it is dereferenced.
*/
        if (ptr->docinfo != NULL) {
            if (xmlStrEqual(ptr->docinfo->parser, (xmlChar *)SWISH_PARSER_XML))
                xmlFreeParserCtxt(ptr->ctxt);

            if (xmlStrEqual(ptr->docinfo->parser, (xmlChar *)SWISH_PARSER_HTML))
                htmlFreeParserCtxt(ptr->ctxt);
        }
    }
    else {
        if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
            SWISH_DEBUG_MSG("swish_ParserData libxml2 parser ctxt already freed");

    }

    if (ptr->token_iterator != NULL) {

        if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
            SWISH_DEBUG_MSG("free swish_ParserData TokenIterator");

        ptr->token_iterator->ref_cnt--;
        swish_token_iterator_free(ptr->token_iterator);
    }

    if (ptr->docinfo != NULL) {

        if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
            SWISH_DEBUG_MSG("free swish_ParserData docinfo");

        ptr->docinfo->ref_cnt--;
        swish_docinfo_free(ptr->docinfo);

    }

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
        SWISH_DEBUG_MSG("freeing swish_ParserData ptr");

    swish_xfree(ptr);

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
        SWISH_DEBUG_MSG("swish_ParserData all freed");
}

/*
 * Split the header section at the start of buf into individual lines.
 *
 * Lines are separated by '\n'. Parsing stops at the first blank line, at
 * the end of the buffer, after SWISH_MAX_HEADERS lines, or when a single
 * line reaches SWISH_MAXSTRLEN bytes. A buffer that starts with a newline
 * croaks.
 *
 * Returns a newly allocated HEAD (release with free_head()) whose lines[]
 * holds nlines copies of the header lines and whose body_start is the
 * offset in buf of the first byte after the headers. body_start never
 * points past the buffer's terminating NUL; it stays 0 when the loop ends
 * without reaching a blank line or the end of the buffer.
 */
static HEAD *
buf_to_head(
    xmlChar *buf
)
{
    int i, j, k;
    xmlChar *line;
    const xmlChar *newlines;
    HEAD *h;

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
        SWISH_DEBUG_MSG("parsing head from buffer: %s", buf);

    h = swish_xmalloc(sizeof(HEAD));
    h->lines = swish_xmalloc(SWISH_MAX_HEADERS * sizeof(line));
    h->nlines = 0;
    h->body_start = 0;
    line = swish_xmalloc(SWISH_MAXSTRLEN + 1);
    i = 0;                      /* write position in line */
    j = 0;                      /* number of header lines stored */
    k = 0;                      /* read position in buf */

/*
* i stays below SWISH_MAXSTRLEN so that the terminating NUL written after
* the last copied byte always fits in the SWISH_MAXSTRLEN + 1 byte line.
*/
    while (j < SWISH_MAX_HEADERS && i < SWISH_MAXSTRLEN) {

/*
* end of buffer: never read beyond the terminator. A pending line that
* has no trailing newline is still kept as a header line.
*/
        if (buf[k] == '\0') {
            if (i > 0) {
                line[i] = '\0';
                h->lines[j++] = swish_xstrdup(line);
                h->nlines++;
            }
            h->body_start = k;
            break;
        }

        if (buf[k] == '\n') {
            SWISH_CROAK("illegal newline to start doc header");
        }
        line[i] = buf[k];
        i++;
        k++;

        if (buf[k] == '\n') {

            line[i] = '\0';
            h->lines[j++] = swish_xstrdup(line);
            h->nlines++;

/*
* get to the next char no matter what, then check if == '\n' 
*/
            k++;

            if (buf[k] == '\n' || buf[k] == '\0') {

                if (SWISH_DEBUG & SWISH_DEBUG_DOCINFO) {
                    SWISH_DEBUG_MSG("found blank header line at byte %d\n", k);
                }

/*
* skip the blank line, but stay on the terminator when the headers are
* the last thing in the buffer.
*/
                h->body_start = (buf[k] == '\0') ? k : k + 1;
                break;
            }
            i = 0;
        }
    }
    
    swish_xfree(line);

    /* sanity check: compare against a plain search for the blank line */
    if (SWISH_DEBUG & SWISH_DEBUG_DOCINFO) {
        SWISH_DEBUG_MSG("finished parsing head from buffer");
        newlines = xmlStrstr((const xmlChar*)buf, (const xmlChar*)"\n\n");
        if (newlines != NULL) {
            /* the body starts just past the two newlines */
            SWISH_DEBUG_MSG("strstr found body start at %d; loop at %d",
            (int)(newlines - buf) + 2, h->body_start);
        }
    }


    return h;
}

/*
 * Build a swish_DocInfo from parsed header lines.
 *
 * Every line must contain a ':'; the value is whatever follows it with
 * leading whitespace skipped. Header names are matched case-insensitively
 * by prefix:
 *
 *   Content-Length                   -> size
 *   Last-Modified, Last-Mtime        -> mtime   (Last-Mtime is deprecated)
 *   Content-Location, Path-Name      -> uri     (Path-Name is deprecated)
 *   Parser-Type, Document-Type       -> parser  (Document-Type is deprecated)
 *   Content-Type                     -> mime
 *   Encoding, Charset                -> encoding
 *   Action                           -> action
 *
 * Unknown headers produce a warning; a line without ':' croaks.
 *
 * Returns a new docinfo whose ref_cnt has already been incremented.
 */
static swish_DocInfo *
head_to_docinfo(
    HEAD * h
)
{
    int i;
    xmlChar *val, *line;

    swish_DocInfo *info = swish_docinfo_init();

    info->ref_cnt++;

    if (SWISH_DEBUG & SWISH_DEBUG_DOCINFO)
        SWISH_DEBUG_MSG("preparing to parse %d header lines", h->nlines);

    for (i = 0; i < h->nlines; i++) {

        line = h->lines[i];

        if (SWISH_DEBUG & SWISH_DEBUG_DOCINFO)
            SWISH_DEBUG_MSG("parsing header line: >%s<", line);

        val = (xmlChar *)xmlStrchr(line, ':');
        if(!val) {
            SWISH_CROAK("bad header line: %s", line);
        }

/*
* val now points into line, just past the ':' and any whitespace, so it is
* never NULL below; an absent value shows up as an empty string.
*/
        val = swish_str_skip_ws(++val);

        if (SWISH_DEBUG & SWISH_DEBUG_DOCINFO) {
            SWISH_DEBUG_MSG("%d parsing header line: %s", i, line);

        }

        if (!xmlStrncasecmp(line, (const xmlChar *)"Content-Length", 14)) {
            info->size = swish_string_to_int((char *)val);
            continue;
        }
        if (!xmlStrncasecmp(line, (const xmlChar *)"Last-Modified", 13)) {
            info->mtime = swish_string_to_int((char *)val);
            continue;
        }
        if (!xmlStrncasecmp(line, (const xmlChar *)"Last-Mtime", 10)) {

            SWISH_WARN("%s: Last-Mtime is deprecated in favor of Last-Modified", val);

            info->mtime = swish_string_to_int((char *)val);
            continue;
        }
        if (!xmlStrncasecmp(line, (const xmlChar *)"Content-Location", 16)) {

            if (!*val)
                SWISH_WARN("Failed to find path name in Content-Location header '%s'",
                           line);

            if (info->uri != NULL)
                swish_xfree(info->uri);

            info->uri = swish_xstrdup(val);
            continue;
        }
        if (!xmlStrncasecmp(line, (const xmlChar *)"Path-Name", 9)) {

            SWISH_WARN("%s: Path-Name is deprecated in favor of Content-Location", val);

            if (!*val)
                SWISH_WARN("Failed to find path name in Path-Name header '%s'", line);

            if (info->uri != NULL)
                swish_xfree(info->uri);

            info->uri = swish_xstrdup(val);
            continue;
        }
        if (!xmlStrncasecmp(line, (const xmlChar *)"Document-Type", 13)) {

            SWISH_WARN("%s: Document-Type is deprecated in favor of Parser-Type", val);

            if (!*val)
                SWISH_WARN("Failed to find path name in Document-Type header '%s'", line);

            if (info->parser != NULL)
                swish_xfree(info->parser);

            info->parser = swish_xstrdup(val);
            continue;
        }
        if (!xmlStrncasecmp(line, (const xmlChar *)"Parser-Type", 11)) {

            if (!*val)
                SWISH_WARN("Failed to find path name in Parser-Type header '%s'", line);

            if (info->parser != NULL)
                swish_xfree(info->parser);

            info->parser = swish_xstrdup(val);
            continue;
        }
        if (!xmlStrncasecmp(line, (const xmlChar *)"Content-Type", 12)) {

            if (!*val)
                SWISH_WARN("Failed to find path name in Content-Type header '%s'", line);

/*
* TODO: get encoding out of this line too if
* present. example:   text/xml; charset=ISO-8859-1
*/

            if (info->mime != NULL)
                swish_xfree(info->mime);

            info->mime = swish_xstrdup(val);
            continue;
        }
        if (!xmlStrncasecmp(line, (const xmlChar *)"Encoding", 8)
            || !xmlStrncasecmp(line, (const xmlChar *)"Charset", 7)) {

            if (!*val)
                SWISH_WARN("Failed to find value in Encoding or Charset header '%s'",
                           line);

            if (info->encoding != NULL)
                swish_xfree(info->encoding);

            info->encoding = swish_xstrdup(val);
            continue;
        }

/*
* the compare length must be the length of "Action" (6). A longer length
* also compares the terminating NUL of "Action" against the ':' that
* follows the name, so a real "Action: ..." line could never match.
*/
        if (!xmlStrncasecmp(line, (const xmlChar *)"Action", 6)) {

            if (!*val)
                SWISH_WARN("Failed to find value in Action header '%s'", line);

            if (info->action != NULL)
                swish_xfree(info->action);

            info->action = swish_xstrdup(val);
            continue;
        }

/*
* if we get here, unrecognized header line 
*/
        SWISH_WARN("Unknown header line: '%s'\n", line);

    }

    if (SWISH_DEBUG & SWISH_DEBUG_DOCINFO) {
        SWISH_DEBUG_MSG("returning %d header lines", h->nlines);
        swish_docinfo_debug(info);
    }

    return info;
}

/*
 * Initialise SWISH_PARSER_WARNINGS from the environment variable of the
 * same name. The variable is given the value "1" when it is not already
 * set, and a non-zero SWISH_DEBUG takes precedence over whatever the
 * environment says.
 */
static void
get_env_vars(
)
{
    int from_env;

    /* last argument 0: do not override a value that is already set */
    swish_setenv("SWISH_PARSER_WARNINGS", "1", 0);
    from_env = swish_string_to_int(getenv("SWISH_PARSER_WARNINGS"));

    if (SWISH_DEBUG) {
        SWISH_PARSER_WARNINGS = SWISH_DEBUG;
    }
    else {
        SWISH_PARSER_WARNINGS = from_env;
    }
}

/*
 * Parse a stream of documents from a filehandle (stdin when fh is NULL).
 *
 * Each document is a block of header lines, a blank line, and then a body
 * whose length is taken from the size recorded in the headers. For every
 * document the headers are turned into a docinfo, the body is read and
 * parsed, and the result is passed to s3->parser->handler.
 *
 * Croaks on a blank line before at least min_headers (2) header lines have
 * been read, and when header lines are left over at end of input.
 *
 * Returns the number of documents handled.
 */
unsigned int
swish_parse_fh(
    swish_3 *s3,
    FILE * fh
)
{
    xmlChar *ln;
    HEAD *head;
    int i;
    xmlChar *read_buffer;
    xmlBufferPtr head_buf;
    swish_ParserData *parser_data;
    int xmlErr;
    int min_headers, nheaders;
    double curTime;
    char *etime;
    unsigned int file_cnt;

    i = 0;
    file_cnt = 0;
    nheaders = 0;
    min_headers = 2;

    if (fh == NULL)
        fh = stdin;

    /* ln holds one raw input line; head_buf collects the current header block */
    ln = swish_xmalloc(SWISH_MAXSTRLEN + 1);
    head_buf =
        xmlBufferCreateSize((SWISH_MAX_HEADERS * SWISH_MAXSTRLEN) + SWISH_MAX_HEADERS);

/*
* based on extprog.c 
*/
    while (fgets((char *)ln, SWISH_MAXSTRLEN, fh) != NULL) {

/*
* we don't use fgetws() because we don't care about individual
* characters yet 
*/

        xmlChar *end;
        xmlChar *line;

        line = swish_str_skip_ws(ln);   /* skip leading white space */
        end = (xmlChar *)strrchr((char *)line, '\n');

/*
* trim any white space at the end of the line, including \n 
*/
        if (end) {
            while (end > line && isspace((int)*(end - 1)))
                end--;

            *end = '\0';
        }

        if (nheaders >= min_headers && xmlStrlen(line) == 0) {

/*
* blank line indicates body 
*/
            curTime = swish_time_elapsed();
            parser_data = init_parser_data(s3);
            head = buf_to_head((xmlChar *)xmlBufferContent(head_buf));
            parser_data->docinfo = head_to_docinfo(head);
            swish_docinfo_check(parser_data->docinfo, s3->config);

            if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
                SWISH_DEBUG_MSG("reading %ld bytes from filehandle",
                                (long int)parser_data->docinfo->size);

            /* the body length comes from the size recorded in docinfo */
            read_buffer = swish_io_slurp_fh(fh, parser_data->docinfo->size, SWISH_FALSE);

/*
* parse; filename is NULL, so docparser() leaves read_buffer for us to free
*/
            xmlErr =
                docparser(parser_data, NULL, read_buffer, parser_data->docinfo->size);

            if (xmlErr)
                SWISH_WARN("parser returned error %d", xmlErr);

            if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
                SWISH_DEBUG_MSG
                    ("\n===============================================================\n");
                swish_docinfo_debug(parser_data->docinfo);
                SWISH_DEBUG_MSG("  word buffer length: %d bytes",
                                xmlBufferLength(parser_data->meta_buf));
                SWISH_DEBUG_MSG(" (%d words)", parser_data->docinfo->nwords);
            }
            if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
                SWISH_DEBUG_MSG("passing to handler");

/*
* pass to callback function 
*/
            (*s3->parser->handler) (parser_data);

            if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
                SWISH_DEBUG_MSG("handler done");

/*
* reset everything for next time 
*/

            swish_xfree(read_buffer);
            free_parser_data(parser_data);
            free_head(head);
            xmlBufferEmpty(head_buf);
            nheaders = 0;

/*
* count the file 
*/
            file_cnt++;

            if (SWISH_DEBUG) {
                etime = swish_time_print_fine(swish_time_elapsed() - curTime);
                SWISH_DEBUG_MSG("%s elapsed time", etime);
                swish_xfree(etime);
            }

/*
* timer 
*/
            curTime = swish_time_elapsed();

            if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
                SWISH_DEBUG_MSG
                    ("\n================ filehandle - done with file ===================\n");

        }
        else if (xmlStrlen(line) == 0) {

            /* blank line before min_headers header lines were seen */
            SWISH_CROAK("Not enough header lines reading from filehandle");

        }
        else {

/*
* we are reading headers: store the trimmed line followed by a newline
*/
            if (xmlBufferAdd(head_buf, line, -1))
                SWISH_CROAK("error adding header to buffer");

            if ((xmlBufferCCat(head_buf, "\n")) != 0)
                SWISH_CROAK("can't add newline to end of header buffer");

            nheaders++;
            
            if (SWISH_DEBUG & SWISH_DEBUG_DOCINFO) {
                SWISH_DEBUG_MSG("nheaders = %d for buffer >%s<", 
                    nheaders, xmlBufferContent(head_buf));
            }
        }

    }

    /* input ended in the middle of a header block */
    if (xmlBufferLength(head_buf)) {
        SWISH_CROAK("Some unparsed header lines remaining");
    }

    swish_xfree(ln);
    xmlBufferFree(head_buf);

    return file_cnt;
}

/*
 * Free a HEAD: every stored header line, the lines array, and the struct.
 */
static void
free_head(
    HEAD * h
)
{
    int idx = 0;

    while (idx < h->nlines) {
        swish_xfree(h->lines[idx]);
        idx++;
    }

    swish_xfree(h->lines);
    swish_xfree(h);
}

/* 
* PUBLIC 
*/

/* 
* Parse a single document held in memory. buf starts with the header
* lines, exactly as they would be read by swish_parse_fh(), followed by
* the document body. The parsed result is passed to s3->parser->handler.
*
* Returns the value returned by docparser().
*/
int
swish_parse_buffer(
    swish_3 *s3,
    xmlChar *buf
)
{
    double started = swish_time_elapsed();
    swish_ParserData *pd;
    HEAD *hdr;
    char *elapsed;
    int status;

    hdr = buf_to_head(buf);

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
        SWISH_DEBUG_MSG("number of headlines: %d", hdr->nlines);
    }

    pd = init_parser_data(s3);
    pd->docinfo = head_to_docinfo(hdr);
    swish_docinfo_check(pd->docinfo, s3->config);

    /* step over the headers so that buf points at the body */
    buf += hdr->body_start;

    status = docparser(pd, 0, buf, xmlStrlen(buf));

    /* hand the parsed document to the callback */
    (*s3->parser->handler) (pd);

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
        swish_docinfo_debug(pd->docinfo);
        SWISH_DEBUG_MSG("  word buffer length: %d bytes",
                        xmlBufferLength(pd->meta_buf));
        SWISH_DEBUG_MSG(" (%d words)", pd->docinfo->nwords);
    }

    /* release per-document state */
    free_head(hdr);
    free_parser_data(pd);

    if (SWISH_DEBUG) {
        elapsed = swish_time_print_fine(swish_time_elapsed() - started);
        SWISH_DEBUG_MSG("%s elapsed time", elapsed);
        swish_xfree(elapsed);
    }
    started = swish_time_elapsed();

    return status;
}

/* 
* PUBLIC 
*
* Parse one file from the filesystem and pass the result to
* s3->parser->handler.
*
* Returns SWISH_ERR_NO_SUCH_FILE when the docinfo cannot be built from the
* filesystem (the file is skipped with a warning), otherwise the value
* returned by docparser().
*/
int
swish_parse_file(
    swish_3 *s3,
    xmlChar *filename
)
{
    double started = swish_time_elapsed();
    swish_ParserData *pd = init_parser_data(s3);
    char *elapsed;
    int status;

    pd->docinfo = swish_docinfo_init();
    pd->docinfo->ref_cnt++;

    if (!swish_docinfo_from_filesystem(filename, pd->docinfo, pd)) {
        SWISH_WARN("Skipping %s", filename);
        free_parser_data(pd);
        return SWISH_ERR_NO_SUCH_FILE;
    }

    /* no buffer is supplied: docparser() reads the file itself */
    status = docparser(pd, filename, 0, 0);

    /* hand the parsed document to the callback */
    (*s3->parser->handler) (pd);

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
        swish_docinfo_debug(pd->docinfo);
        SWISH_DEBUG_MSG("  word buffer length: %d bytes",
                        xmlBufferLength(pd->meta_buf));
        SWISH_DEBUG_MSG(" (%d words)", pd->docinfo->nwords);
    }

    /* release per-document state */
    free_parser_data(pd);

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
        elapsed = swish_time_print_fine(swish_time_elapsed() - started);
        SWISH_DEBUG_MSG("%s elapsed time", elapsed);
        swish_xfree(elapsed);
    }

    return status;
}

/*
 * Recursively parse every regular file below dir (depth first).
 * Based on swish-e 2.4 indexadir() in fs.c.
 *
 * Entries whose name starts with '.' are skipped. Symlinks are skipped
 * unless follow_symlinks is set. Anything that is neither a directory, a
 * followed symlink nor a regular file croaks.
 *
 * Returns the number of files for which swish_parse_file() returned 0;
 * 0 when dir cannot be opened.
 */
unsigned int
swish_parse_directory(
    swish_3 *s3,
    xmlChar *dir,
    boolean follow_symlinks
)
{
    DIR *dh;
#ifdef NEXTSTEP
    struct direct   *entry;
#else
    struct dirent   *entry;
#endif
    xmlChar *path;
    unsigned int path_cap;
    unsigned int dir_len;
    unsigned int files_parsed = 0;

    dh = opendir((char*)dir);
    if (dh == NULL) {
        SWISH_WARN("Failed to open directory '%s' : %s", dir, strerror(errno));
        return files_parsed;
    }

    path_cap = SWISH_MAXSTRLEN;
    path = (xmlChar*)swish_xmalloc(path_cap + 1);
    dir_len = xmlStrlen(dir);

    /* root dir: the separator added below is the whole prefix */
    if ( dir_len == 1 && dir[0] == SWISH_PATH_SEP ) {
        dir_len = 0;
    }

    for (entry = readdir(dh); entry != NULL; entry = readdir(dh)) {
        int file_len = strlen( entry->d_name );

        /* For security reasons, don't index dot files */
        /* TODO Check for hidden under Windows? */
        if (entry->d_name[0] == '.') {
            continue;
        }

        /* grow the path buffer if dir + separator + name does not fit */
        if ( (dir_len + file_len + 1) > path_cap ) {
            path_cap = dir_len + file_len + 256;
            path = (xmlChar *)swish_xrealloc(path, path_cap + 1);
        }

        /* build "<dir><sep><name>" */
        if ( dir_len ) {
            memcpy(path, dir, dir_len);
        }
        path[dir_len] = SWISH_PATH_SEP;
        memcpy(path + dir_len + 1, entry->d_name, file_len);
        path[dir_len + file_len + 1] = '\0';

        /* symlinks are ignored unless the caller asked to follow them */
        if ( !follow_symlinks && swish_fs_is_link( path ) ) {
            continue;
        }

        if ( swish_fs_is_dir(path) ) {
            /* recurse immediately. this is a depth-first algorithm */
            if (s3->parser->verbosity) {
                printf("Found directory: %s\n", path);
            }
            files_parsed += swish_parse_directory(s3, path, follow_symlinks);
            continue;
        }

        if (swish_fs_is_link(path) && follow_symlinks) {
            if (s3->parser->verbosity) {
                printf("Found symlink: %s\n", path);
            }
            // TODO?
            continue;
        }

        if (swish_fs_is_file(path)) {
            if (s3->parser->verbosity) {
                printf("Found file: %s\n", path);
            }
            if (!swish_parse_file(s3, path)) {
                files_parsed++;
            }
            continue;
        }

        SWISH_CROAK("Unknown file in directory: %s", path);
    }

    closedir(dh);
    swish_xfree(path);

    return files_parsed;
}


/**
* Parse an in-memory XML document with the given SAX handler.
*
* Based on libxml2 xmlSAXUserParseMemory in parser.c, which we don't use
* directly so that we can read the document encoding from the parser
* context before it is freed.
*
* Returns 0 for a well-formed document, otherwise the context's errNo
* (or -1 when errNo is 0). Also returns -1 when sax is NULL or the parser
* context cannot be created.
*/
static int
xml_parser(
    xmlSAXHandlerPtr sax,
    void *user_data,
    xmlChar *buffer,
    int size
)
{
    swish_ParserData *parser_data = (swish_ParserData *)user_data;
    xmlSAXHandlerPtr saved_sax = NULL;
    xmlParserCtxtPtr ctxt;
    int ret = 0;

    if (sax == NULL) {
        return -1;
    }

    ctxt = xmlCreateMemoryParserCtxt((const char *)buffer, size);
    if (ctxt == NULL) {
        return -1;
    }

    /* swap in our handler; the context's own one is restored before freeing */
    saved_sax = ctxt->sax;
    ctxt->sax = sax;
    ctxt->sax2 = 1;

/*
* always use sax2 -- this pulled from xmlDetectSAX2() 
*/
    ctxt->str_xml = xmlDictLookup(ctxt->dict, BAD_CAST "xml", 3);
    ctxt->str_xmlns = xmlDictLookup(ctxt->dict, BAD_CAST "xmlns", 5);
    ctxt->str_xml_ns = xmlDictLookup(ctxt->dict, XML_XML_NAMESPACE, 36);

/*
* xmlErrMemory is/was not a public func but is in parserInternals.h.
* basically, this is a bad, fatal error, so we'll just die.
*/
    if (ctxt->str_xml == NULL || ctxt->str_xmlns == NULL
        || ctxt->str_xml_ns == NULL) {
        SWISH_CROAK("Fatal libxml2 memory error");
    }

    if (user_data != NULL) {
        ctxt->userData = user_data;
    }

/* track ctxt in parser_data during the actual parsing
   so that warnings/errors show context. But set to NULL
   afterwards so we don't try and free it when parser_data
   gets freed.
*/
    parser_data->ctxt = ctxt;
    if (xmlParseDocument(ctxt) < 0) {
        SWISH_WARN("recovering from libxml2 error for %s", parser_data->docinfo->uri);
    }
    parser_data->ctxt = NULL;

    /* ret stays 0 for a well-formed document */
    if (!ctxt->wellFormed) {
        ret = (ctxt->errNo != 0) ? ctxt->errNo : -1;
    }

    ctxt->sax = saved_sax;
    if (ctxt->myDoc != NULL) {
        xmlFreeDoc(ctxt->myDoc);
        ctxt->myDoc = NULL;
    }

    /* replace any previously recorded encoding with the one the parser saw */
    if (parser_data->docinfo->encoding != NULL) {
        swish_xfree(parser_data->docinfo->encoding);
    }
    parser_data->docinfo->encoding = document_encoding(ctxt);

    xmlFreeParserCtxt(ctxt);

    return ret;
}

/*
 * Parse an in-memory HTML document with libxml2's HTML parser.
 *
 * sax        SAX handler to install for the duration of the parse; when
 *            NULL the context's own handler is left in place.
 * user_data  the swish_ParserData for the document being parsed.
 * buffer     the document text. Its length is taken with xmlStrlen(), so
 *            the text is expected to be NUL-terminated.
 * size       not consulted by this function.
 *
 * Returns the value of htmlParseDocument(), or 0 when no parser context
 * could be created.
 *
 * The context is stored in parser_data->ctxt and is not freed here.
 * NOTE(review): presumably it is released together with parser_data --
 * confirm against the parser_data destructor.
 */
static int
html_parser(
    xmlSAXHandlerPtr sax,
    void *user_data,
    xmlChar *buffer,
    int size
)
{
    int ret;
    htmlParserCtxtPtr ctxt;
    htmlSAXHandlerPtr oldsax = NULL;
    swish_ParserData *parser_data = (swish_ParserData *)user_data;

    xmlInitParser();

    ctxt = htmlCreateMemoryParserCtxt((const char *)buffer, xmlStrlen(buffer));

    parser_data->ctxt = ctxt;

    /* nothing below may touch a context we failed to create */
    if (ctxt == NULL) {
        return 0;
    }

    if (parser_data->docinfo->encoding != NULL) {
        swish_xfree(parser_data->docinfo->encoding);
    }

    parser_data->docinfo->encoding = document_encoding(ctxt);

    if (parser_data->docinfo->encoding == NULL) {
        set_encoding(parser_data, (xmlChar *)buffer);
    }

    /*
     * HTML parser defaults to ISO-8859-1 and that's what Swish-e 2.x does.
     * Leave that default alone, since we assume any HTML docs that actually
     * care about encoding will explicitly specify it in the <meta> header,
     * which libxml2 should respect.
     */

    if (sax != NULL) {
        oldsax = ctxt->sax;
        ctxt->sax = (htmlSAXHandlerPtr) sax;
        ctxt->userData = parser_data;
    }

    ret = htmlParseDocument(ctxt);

    /* put the context's own handler back so it is the one that gets freed */
    if (sax != NULL) {
        ctxt->sax = oldsax;
        ctxt->userData = NULL;
    }

    return ret;
}

/*
 * Index a plain-text document.
 *
 * The text is added under SWISH_DEFAULT_METANAME and the document URI is
 * added under SWISH_TITLE_METANAME. When the detected encoding is not
 * SWISH_DEFAULT_ENCODING, and is either latin1 or the SWISH_ENCODING
 * environment default, the text is first converted from ISO-8859-1 to
 * UTF-8 in a temporary buffer.
 *
 * parser_data  state for the document being parsed
 * buffer       the document text
 * size         number of bytes in buffer
 *
 * Returns 0 on success, or SWISH_ENCODING_ERROR when the latin1 -> UTF-8
 * conversion fails.
 */
static int
txt_parser(
    swish_ParserData *parser_data,
    xmlChar *buffer,
    int size
)
{
    xmlChar *out = NULL;
    xmlChar *enc = (xmlChar *)getenv("SWISH_ENCODING");
    int outlen = 0;
    int is_latin1;
    int is_env_default;

/*
* TODO better encoding detection. for now we assume unknown text
* files are latin1
*/
    set_encoding(parser_data, buffer);

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
        SWISH_DEBUG_MSG("txt parser encoding: %s", parser_data->docinfo->encoding);

    if (!xmlStrEqual(parser_data->docinfo->encoding, (xmlChar *)SWISH_DEFAULT_ENCODING)) {
        SWISH_WARN("%s docinfo->encoding %s != %s",
            parser_data->docinfo->uri, parser_data->docinfo->encoding, SWISH_DEFAULT_ENCODING);

        is_latin1 = !xmlStrncasecmp(parser_data->docinfo->encoding,
                                    (xmlChar *)SWISH_LATIN1_ENCODING, 9);
        is_env_default = !is_latin1
            && xmlStrEqual(parser_data->docinfo->encoding, enc);

        if (is_env_default) {
            if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
                SWISH_DEBUG_MSG("default env encoding -> %s", enc);

            if (xmlStrncasecmp(enc, (xmlChar *)SWISH_LATIN1_ENCODING, 9)) {
                SWISH_WARN
                    ("%s encoding is unknown (not %s) but LC_CTYPE is %s -- assuming file is %s",
                     parser_data->docinfo->uri, SWISH_DEFAULT_ENCODING, enc, SWISH_LATIN1_ENCODING);
            }
        }

        if (is_latin1 || is_env_default) {
            /* a latin1 byte expands to at most two UTF-8 bytes */
            outlen = size * 2;
            out = swish_xmalloc(outlen + 1);

            /* isolat1ToUTF8() reports failure with a negative value;
               zero or more is the number of bytes written */
            if (isolat1ToUTF8(out, &outlen, buffer, &size) < 0) {
                SWISH_WARN("could not convert buf from %s (outlen: %d)",
                    SWISH_LATIN1_ENCODING, outlen);
                swish_xfree(out);
                return SWISH_ENCODING_ERROR;
            }

            out[outlen] = '\0';

            SWISH_WARN("converted %s from %s to %s",
                parser_data->docinfo->uri, SWISH_LATIN1_ENCODING, SWISH_DEFAULT_ENCODING);

            size = outlen;
            buffer = out;
        }
    }

/*
* we obviously haven't any tags on which to trigger our metanames,
* so set default
* TODO check config to determine if we should buffer swish_prop_description etc
*/
    push_tag_stack(parser_data->metastack, (xmlChar *)SWISH_DEFAULT_METANAME,
                   (xmlChar *)SWISH_DEFAULT_METANAME, SWISH_DOM_CHAR);

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
        SWISH_DEBUG_MSG("%s stack PUSH %s", parser_data->metastack->name,
                        parser_data->metastack->head->context);

    buffer_characters(parser_data, buffer, size);
    flush_buffer(parser_data, (xmlChar *)SWISH_DEFAULT_METANAME,
                 (xmlChar *)SWISH_DEFAULT_METANAME);

/* add filename as title */
    push_tag_stack(parser_data->metastack, (xmlChar *)SWISH_TITLE_METANAME,
                   (xmlChar *)SWISH_TITLE_METANAME, SWISH_DOM_CHAR);

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
        SWISH_DEBUG_MSG("%s stack PUSH %s", parser_data->metastack->name,
                        parser_data->metastack->head->context);

    buffer_characters(parser_data, parser_data->docinfo->uri,
                      xmlStrlen(parser_data->docinfo->uri));
    flush_buffer(parser_data, (xmlChar *)SWISH_TITLE_METANAME,
                 (xmlChar *)SWISH_TITLE_METANAME);

/* clean up the temporary conversion buffer, if one was needed */
    if (out != NULL) {
        if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
            SWISH_DEBUG_MSG("tmp text buffer being freed");

        swish_xfree(out);
    }

    return 0;
}

/*
 * Replace parser_data->docinfo->encoding with a guess based on buffer:
 * SWISH_DEFAULT_ENCODING when xmlCheckUTF8() accepts the text, otherwise
 * a copy of the SWISH_ENCODING environment variable.
 *
 * Any previous encoding string is released first. The previous value may
 * be NULL (html_parser() calls this exactly in that case), so the free is
 * guarded in the same way as the other call sites in this file.
 */
static void
set_encoding(
    swish_ParserData *parser_data,
    xmlChar *buffer
)
{

/*
* this feels like it doesn't work ... would iconv() be better ?
*/

    if (parser_data->docinfo->encoding != NULL) {
        swish_xfree(parser_data->docinfo->encoding);
        parser_data->docinfo->encoding = NULL;
    }

    if (xmlCheckUTF8((const xmlChar *)buffer) != 0) {
        parser_data->docinfo->encoding = swish_xstrdup((xmlChar *)SWISH_DEFAULT_ENCODING);
    }
    else {
        parser_data->docinfo->encoding = swish_xstrdup((xmlChar *)getenv("SWISH_ENCODING"));
    }
}

/*
 * Return a newly duplicated encoding name for a parser context.
 *
 * Sources are tried in order: the context's own encoding, the encoding of
 * its first input, and finally the SWISH_ENCODING environment variable
 * (i.e. if nothing was declared we assume the current locale encoding).
 * The copy is made with swish_xstrdup().
 */
static xmlChar *
document_encoding(
    xmlParserCtxtPtr ctxt
)
{
    const xmlChar *source;

    if (ctxt->encoding != NULL)
        source = ctxt->encoding;
    else if (ctxt->inputTab[0]->encoding != NULL)
        source = ctxt->inputTab[0]->encoding;
    else
        source = (xmlChar *)getenv("SWISH_ENCODING");

    return swish_xstrdup(source);
}

/*
 * Run the configured tokenizer over string and add the number of tokens
 * it reports to docinfo->nwords.
 *
 * parser_data  state for the document being parsed
 * string       text to tokenize
 * len          length of string; nothing is done when it is 0
 * metaname     MetaName to tokenize under; NULL means the baked name at
 *              the head of the metastack
 * context      context string handed to the tokenizer; NULL means the
 *              context at the head of the metastack
 */
static void
tokenize(
    swish_ParserData *parser_data,
    xmlChar *string,
    int len,
    xmlChar *metaname,
    xmlChar *context
)
{
    swish_MetaName *meta;

    if (len == 0)
        return;

    if (metaname == NULL)
        metaname = parser_data->metastack->head->baked;

    if (context == NULL)
        context = parser_data->metastack->head->context;

    /* look the MetaName up only after the default has been applied,
       so a NULL metaname resolves to the current stack head */
    meta = swish_hash_fetch(parser_data->s3->config->metanames, metaname);

    parser_data->docinfo->nwords +=
            (*parser_data->s3->analyzer->tokenizer) (parser_data->token_iterator,
                                                    string, meta, context);
}

/*
 * Dump every entry of a tag stack to the debug log and check that the
 * number of linked entries agrees with stack->count.
 *
 * stack->temp is used as the cursor and is NULL on return.
 */
static void
_debug_stack(
    swish_TagStack *stack
)
{
    int seen = 0;

    SWISH_DEBUG_MSG("%s stack->count: %d", stack->name, stack->count);

    stack->temp = stack->head;
    while (stack->temp != NULL) {
        SWISH_DEBUG_MSG("  %d: count %d  tagstack: %s", seen, stack->temp->n,
                        stack->temp->raw);
        seen++;
        stack->temp = stack->temp->next;
    }

    if (seen == stack->count) {
        SWISH_DEBUG_MSG("tagstack looks ok");
        return;
    }

    SWISH_WARN("stack count appears wrong (%d items, but count=%d)", seen, stack->count);
}

/*
 * Return the stack as a single newly allocated string of joiner-separated
 * baked names.
 *
 * The walk runs from the head of the stack towards its tail, and each
 * visited name is prepended, so the head's name ends up last in the
 * result. When baked is non-NULL it seeds the string and every stack
 * entry is visited; otherwise the head's baked name seeds it and the walk
 * starts at the second entry.
 *
 * stack->temp is used as the cursor and is NULL on return.
 */
static xmlChar *
flatten_tag_stack(
    xmlChar *baked,
    swish_TagStack *stack,
    char flatten_join
)
{
    xmlChar *flat;
    xmlChar *joined;
    int needed;
    int written;

    stack->temp = stack->head;

    if (baked == NULL) {
        flat = swish_xstrdup(stack->head->baked);
        stack->temp = stack->temp->next;
    }
    else {
        flat = swish_xstrdup(baked);
    }

    while (stack->temp != NULL) {
        /* both pieces, plus room for the join character and terminators */
        needed = (xmlStrlen(flat) * sizeof(xmlChar))
            + (xmlStrlen(stack->temp->baked) * sizeof(xmlChar))
            + 3;

        joined = swish_xmalloc(needed);

        written = snprintf((char *)joined, needed, "%s%c%s",
                           (char *)stack->temp->baked, flatten_join, (char *)flat);

        if (written > 0) {
            if (flat != NULL)
                swish_xfree(flat);

            flat = joined;
        }
        else {
            SWISH_CROAK("sprintf failed to concat %s -> %s", stack->temp->baked, flat);
        }

        stack->temp = stack->temp->next;
    }

    return flat;
}
/*
 * Copy the current property buffer (parser_data->prop_buf) into the named
 * property buffers.
 *
 * When baked is non-NULL the text is stored under that property, or under
 * its alias target when the property has alias_for set. The text is then
 * also stored under every entry of the property stack except
 * SWISH_DOM_STR (Swish-e 2.x behavior).
 *
 * Whitespace cleaning is on by default and is switched off, for every
 * store made by this call, when the looked-up property is verbatim.
 */
static void
add_stack_to_prop_buf(
    xmlChar *baked,
    swish_ParserData *parser_data
)
{
    swish_TagStack *stack = parser_data->propstack;
    boolean cleanwsp = 1;
    swish_Property *prop;
    xmlChar *prop_to_store;

    if (baked != NULL) {
        prop = swish_hash_fetch(parser_data->s3->config->properties, baked);
        prop_to_store = baked;

        /* An unknown name yields no property record; fall through with the
           defaults and let swish_nb_add_buf() report the missing buffer
           rather than dereferencing NULL here. */
        if (prop != NULL) {
            /* If the propertyname is an alias_for, use the target of the alias. */
            if (prop->alias_for != NULL) {
                prop_to_store = prop->alias_for;
            }

            /* override per-property */
            if (prop->verbatim) {
                cleanwsp = 0;
            }
        }

        if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
            SWISH_DEBUG_MSG("adding property %s to buffer", prop_to_store);
        }

        swish_nb_add_buf(parser_data->properties, prop_to_store, parser_data->prop_buf,
                            (xmlChar *)SWISH_TOKENPOS_BUMPER, cleanwsp, 0);
    }

    /* Swish-e 2.x behavior is to add for each member in the stack */
    for (stack->temp = stack->head; stack->temp != NULL; stack->temp = stack->temp->next) {
        if (xmlStrEqual(stack->temp->baked, (xmlChar *)SWISH_DOM_STR))
            continue;

        swish_nb_add_buf(parser_data->properties, stack->temp->baked,
                         parser_data->prop_buf, (xmlChar *)SWISH_TOKENPOS_BUMPER,
                         cleanwsp, 0);
    }
}

/*
 * Release a tag together with the three strings it owns
 * (raw, baked and context).
 */
static void
free_swishTag(
    swish_Tag * st
)
{
    if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
        SWISH_DEBUG_MSG(" freeing swishTag: (raw)%s (baked)%s (context)%s",
            st->raw, st->baked, st->context);
    }

    /* member strings first, then the struct that holds them */
    swish_xfree(st->raw);
    swish_xfree(st->baked);
    swish_xfree(st->context);

    swish_xfree(st);
}

/*
 * Push a new tag onto the head of a tag stack.
 *
 * The tag keeps its own copies of raw (the tag as given) and baked (the
 * normalized tag), records the stack count at the time of the push in n,
 * and gets a context string built by flattening the stack, new head
 * included, with flatten_join as the separator.
 */
static void
push_tag_stack(
    swish_TagStack *stack,
    xmlChar *raw,
    xmlChar *baked,
    char flatten_join
)
{
    swish_Tag *fresh = swish_xmalloc(sizeof(swish_Tag));

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
        SWISH_DEBUG_MSG("%s PUSH: tag = '%s'", stack->name, raw);
        _debug_stack(stack);
    }

    fresh->raw = swish_xstrdup(raw);
    fresh->baked = swish_xstrdup(baked);

    /* number the tag with the pre-push count, then bump the count */
    fresh->n = stack->count;
    stack->count += 1;

    /* link in as the new head */
    fresh->next = stack->head;
    stack->head = fresh;

    /* the context must be built after linking so that it includes this tag */
    fresh->context = flatten_tag_stack(NULL, stack, flatten_join);

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
        SWISH_DEBUG_MSG("%s size: %d  thistag count: %d  current head tag = '%s'",
                        stack->name, stack->count, fresh->n, stack->head->context);

        _debug_stack(stack);
    }
}

/*
 * Detach and return the head of a tag stack, or NULL when the stack has
 * no head.
 *
 * The popped tag is not freed here. It is also left in stack->temp.
 * When the count says this is the last entry the head becomes NULL,
 * otherwise the head moves to the next entry.
 */
static swish_Tag *
pop_tag_stack(
    swish_TagStack *stack
)
{
    swish_Tag *below;

    if (stack->head == NULL) {
        return NULL;
    }

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
        SWISH_DEBUG_MSG("%s POP: %s", stack->name, stack->head->raw);
        _debug_stack(stack);
    }

    if (stack->count > 1) {
        below = stack->head->next;

        if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
            SWISH_DEBUG_MSG("%s %d: popping '%s'", stack->name, stack->head->n,
                            stack->head->raw);
        }
    }
    else {
        /* the stack has only one member */
        below = NULL;

        if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
            SWISH_DEBUG_MSG("%s %d: popping '%s' will leave stack empty [%s]",
                            stack->name, stack->head->n, stack->head->raw,
                            stack->head->context);
        }
    }

    stack->temp = stack->head;
    stack->head = below;
    stack->count--;

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
        SWISH_DEBUG_MSG("%s stack count = %d", stack->name, stack->count);
    }

    return stack->temp;
}

/*
 * Pop and return the head of the stack if its raw name equals tag.
 *
 * Returns NULL, leaving the stack untouched, when the head does not match
 * or when the stack is empty. As with pop_tag_stack(), the returned tag
 * is not freed here.
 */
static swish_Tag *
pop_tag_stack_on_match(
    swish_TagStack *stack,
    xmlChar *tag
)
{
    swish_Tag *st = NULL;

    /* an empty stack cannot match anything; do not dereference its head */
    if (stack->head == NULL) {
        if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
            SWISH_DEBUG_MSG("%s: no match for '%s'", stack->name, tag);

        return NULL;
    }

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
        SWISH_DEBUG_MSG("%s: POP if %s matches %s", stack->name, tag, stack->head->raw);
        _debug_stack(stack);
    }

    if (xmlStrEqual(stack->head->raw, tag)) {

        if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
            SWISH_DEBUG_MSG("%s POP '%s' == head", stack->name, tag);
        }

        /* the head is known to exist, so this pop always yields a tag */
        st = pop_tag_stack(stack);

        if (st != NULL && (SWISH_DEBUG & SWISH_DEBUG_PARSER)) {
            SWISH_DEBUG_MSG("%s POPPED.  tag = %s  st->raw = %s", stack->name, tag,
                            st->raw);

            _debug_stack(stack);
        }
    }
    else {
        if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
            SWISH_DEBUG_MSG("%s: no match for '%s'", stack->name, tag);
    }

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
        if (st != NULL)
            SWISH_DEBUG_MSG("POP on match returning: %s", st->raw);
        else
            SWISH_DEBUG_MSG("POP on match returning null");
    }

    return st;
}


/*************** end parser.c ************/


/*************** start namedbuffer.c ************/
/*
 * This file is part of libswish3
 * Copyright (C) 2007 Peter Karman
 *
 *  libswish3 is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  libswish3 is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with libswish3; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
*/

/* named buffers are just a hash where each key names a text buffer
*/

#ifndef LIBSWISH3_SINGLE_FILE
#include <stdio.h>
#include <stdlib.h>
#include <locale.h>
#include <err.h>

#include "libswish3.h"
#endif

extern int SWISH_DEBUG;

/* hash deallocator passed to xmlHashFree() in swish_nb_free():
   frees the xmlBuffer stored under name */
static void free_name_from_hash(
    void *buffer,
    xmlChar *name
);
/* hash scanner passed to xmlHashScan() in swish_nb_init(), also called
   directly: creates an empty buffer in nbhash under name */
static void add_name_to_hash(
    void *ignored,
    xmlHashTablePtr nbhash,
    xmlChar *name
);
/* hash scanner passed to xmlHashScan() in swish_nb_debug():
   writes one buffer to the debug log */
static void print_buffer(
    xmlBufferPtr buffer,
    xmlChar *label,
    xmlChar *name
);
/* hash scanner passed to xmlHashScan() in swish_buffer_concat():
   appends nb2's value for name onto buffer */
static void
cat_buffer(
    xmlBufferPtr buffer,
    swish_NamedBuffer *nb2,
    xmlChar *name
);

/*
 * Create an empty buffer of SWISH_BUFFER_CHUNK_SIZE bytes and store it in
 * nbhash under name. A name that is already present is left alone and a
 * warning is issued.
 *
 * The first parameter exists only so that the function can be used as an
 * xmlHashScan() callback; it is never read.
 */
static void
add_name_to_hash(
    void *ignored,
    xmlHashTablePtr nbhash,
    xmlChar *name
)
{
/* make sure we don't already have it */
    if (swish_hash_exists(nbhash, name)) {
        SWISH_WARN("%s is already in NamedBuffer hash -- ignoring", name);
        return;
    }

    /* SWISH_DEBUG is a bitmask: test the flag, as every other check does */
    if (SWISH_DEBUG & SWISH_DEBUG_NAMEDBUFFER)
        SWISH_DEBUG_MSG("  adding %s to NamedBuffer\n", name);

    swish_hash_add(nbhash, name, xmlBufferCreateSize((size_t) SWISH_BUFFER_CHUNK_SIZE));
}

/*
 * Hash deallocator: free the xmlBuffer stored as a hash value.
 * name is used only for the memory-debug message.
 */
static void
free_name_from_hash(
    void *buffer,
    xmlChar *name
)
{
    if (SWISH_DEBUG & SWISH_DEBUG_MEMORY) {
        SWISH_DEBUG_MSG(" freeing NamedBuffer %s\n", name);
    }

    xmlBufferFree(buffer);
}

/*
 * Allocate a NamedBuffer holding one empty buffer per key in confhash.
 *
 * The new object has a NULL stash and a reference count of 0. Buffers are
 * also created for alias_for PropertyNames and MetaNames, which are never
 * used, because creating one per key is simpler.
 */
swish_NamedBuffer *
swish_nb_init(
    xmlHashTablePtr confhash
)
{
    swish_NamedBuffer *named = swish_xmalloc(sizeof(swish_NamedBuffer));

    named->stash = NULL;
    named->ref_cnt = 0;

    /* initial size only; the table grows as needed */
    named->hash = xmlHashCreate(8);

    xmlHashScan(confhash, (xmlHashScanner)add_name_to_hash, named->hash);

    return named;
}

/*
 * Add an empty buffer named key to nb. An existing key is left untouched
 * (add_name_to_hash() warns and ignores it).
 */
void
swish_nb_new(
    swish_NamedBuffer *nb,
    xmlChar *key
)
{
    xmlHashTablePtr table = nb->hash;

    add_name_to_hash(NULL, table, key);
}

/*
 * Free a NamedBuffer and every buffer in its hash.
 *
 * Warnings are issued, but the free still goes ahead, when the reference
 * count is not zero or the stash is not NULL. The stash itself is not
 * freed here.
 */
void
swish_nb_free(
    swish_NamedBuffer * nb
)
{
    xmlHashFree(nb->hash, (xmlHashDeallocator)free_name_from_hash);

    if (nb->ref_cnt != 0)
        SWISH_WARN("freeing NamedBuffer with ref_cnt != 0 (%d)", nb->ref_cnt);

    if (nb->stash != NULL) {
        SWISH_WARN("freeing NamedBuffer with non-null stash");
    }

    swish_xfree(nb);
}

/*
 * Hash scanner used by swish_nb_debug(): write one named buffer to the
 * debug log, first whole and then split at each SWISH_TOKENPOS_BUMPER.
 *
 * buffer  the buffer stored in the hash
 * label   caller-supplied label printed with the whole-buffer line
 * name    the hash key the buffer is stored under
 */
static void
print_buffer(
    xmlBufferPtr buffer,
    xmlChar *label,
    xmlChar *name
)
{
    const xmlChar *bumper = (const xmlChar *)SWISH_TOKENPOS_BUMPER;
    const xmlChar *cursor;
    const xmlChar *hit;
    xmlChar *piece;
    int piece_len;
    int step;

    SWISH_DEBUG_MSG("len=%d %s:<%s>%s</%s>", xmlBufferLength(buffer),
                        label, name, xmlBufferContent(buffer), name);

    /* step over the whole separator; never by less than one byte */
    step = xmlStrlen(bumper);
    if (step < 1)
        step = 1;

    cursor = xmlBufferContent(buffer);
    while ((hit = xmlStrstr(cursor, bumper)) != NULL) {
        piece_len = (int)(hit - cursor);

        /* xmlStrsub() returns a fresh copy that must be released */
        piece = xmlStrsub(cursor, 0, piece_len);
        SWISH_DEBUG_MSG("  len=%d <%s>%s</%s>", piece_len, name, piece, name );
        if (piece != NULL)
            xmlFree(piece);

        cursor = hit + step;
    }
    if (cursor != NULL) {
        SWISH_DEBUG_MSG("  len=%d <%s>%s</%s>", xmlStrlen(cursor), name, cursor, name );
    }
}

/*
 * Write every buffer in nb to the debug log, tagging the output with label.
 */
void
swish_nb_debug(
    swish_NamedBuffer * nb,
    xmlChar *label
)
{
    xmlHashScanner dump_one = (xmlHashScanner)print_buffer;

    xmlHashScan(nb->hash, dump_one, label);
}

/*
 * Append the contents of an xmlBuffer to the named buffer in nb.
 * This forwards buf's content and length to swish_nb_add_str(); the
 * remaining arguments are passed through unchanged.
 */
void
swish_nb_add_buf(
    swish_NamedBuffer * nb,
    xmlChar *name,
    xmlBufferPtr buf,
    xmlChar *joiner,
    boolean cleanwsp,
    boolean autovivify
)
{
    xmlChar *content = (xmlChar *)xmlBufferContent(buf);

    swish_nb_add_str(nb, name, content, xmlBufferLength(buf),
                        joiner, cleanwsp, autovivify);
}

/*
 * Append a string to the buffer called name in nb.
 *
 * nb          the named buffers to add to
 * name        key of the target buffer
 * str         text to append; modified in place when cleanwsp is set
 * len         number of bytes appended when cleanwsp is not set
 * joiner      written before the text when the target is not empty
 * cleanwsp    when set, control characters are turned into spaces and
 *             leading/trailing whitespace is dropped before appending
 * autovivify  when set, a missing buffer is created on demand; otherwise
 *             a missing buffer is a fatal error
 *
 * Strings consisting only of whitespace are ignored.
 */
void
swish_nb_add_str(
    swish_NamedBuffer * nb,
    xmlChar *name,
    xmlChar *str,
    unsigned int len,
    xmlChar *joiner,
    boolean cleanwsp,
    boolean autovivify
)
{
    xmlChar *trimmed;
    xmlBufferPtr target = swish_hash_fetch(nb->hash, name);

    /* nothing but whitespace: nothing to store */
    if (swish_str_all_ws(str)) {
        if (SWISH_DEBUG & SWISH_DEBUG_NAMEDBUFFER)
            SWISH_DEBUG_MSG("skipping all whitespace string '%s'", str);

        return;
    }

    /* spring to life */
    if (!target && autovivify) {
        add_name_to_hash(NULL, nb->hash, name);
        target = swish_hash_fetch(nb->hash, name);
    }

    if (!target)
        SWISH_CROAK("%s is not a named buffer", name);

    /* separate the new text from whatever the buffer already holds */
    if (xmlBufferLength(target)) {
        if (SWISH_DEBUG & SWISH_DEBUG_NAMEDBUFFER) {
            SWISH_DEBUG_MSG("appending string joiner '%s' to '%s' buffer", joiner, name);
        }
        swish_buffer_append(target, joiner, xmlStrlen(joiner));
    }

    if (!cleanwsp) {
        if (SWISH_DEBUG & SWISH_DEBUG_NAMEDBUFFER) {
            SWISH_DEBUG_MSG("adding '%s' to buffer '%s'", str, name);
        }
        swish_buffer_append(target, str, len);
        return;
    }

    if (SWISH_DEBUG & SWISH_DEBUG_NAMEDBUFFER) {
        SWISH_DEBUG_MSG("before cleanwsp: '%s'", str);
    }

    swish_str_ctrl_to_ws(str);
    trimmed = swish_str_skip_ws(str);
    swish_str_trim_ws(trimmed);

    if (SWISH_DEBUG & SWISH_DEBUG_NAMEDBUFFER) {
        SWISH_DEBUG_MSG("after  cleanwsp: adding '%s' to buffer '%s'", trimmed, name);
    }

    swish_buffer_append(target, trimmed, xmlStrlen(trimmed));
}

/*
 * Append txtlen bytes of txt to buf.
 *
 * A zero txtlen is a silent no-op. A NULL buf, or a non-zero result from
 * xmlBufferAdd(), is a fatal error.
 */
void
swish_buffer_append(
    xmlBufferPtr buf,
    xmlChar *txt,
    int txtlen
)
{
    int status;

    /* shouldn't happen */
    if (txtlen == 0)
        return;

    if (buf == NULL)
        SWISH_CROAK("Can't append NULL pointer to buffer.");

    status = xmlBufferAdd(buf, (const xmlChar *)txt, txtlen);
    if (status != 0) {
        SWISH_CROAK("Problem adding \n>>%s<<\n length %d to buffer. Err: %d", txt, txtlen,
                    status);
    }
}

/*
 * Hash scanner used by swish_buffer_concat(): append nb2's value for name
 * onto buffer. A SWISH_TOKENPOS_BUMPER is written first when buffer
 * already has content. Nothing happens when nb2's value is empty.
 */
static void
cat_buffer(
    xmlBufferPtr buffer,
    swish_NamedBuffer *nb2,
    xmlChar *name
)
{
    xmlChar *incoming = swish_nb_get_value(nb2, name);

    if (!xmlStrlen(incoming))
        return;

    if (xmlBufferLength(buffer))
        xmlBufferCat(buffer, (xmlChar *)SWISH_TOKENPOS_BUMPER);

    xmlBufferCat(buffer, incoming);
}

/*
 * For every key in nb1, append nb2's value for that key onto nb1's
 * buffer (see cat_buffer()).
 */
void
swish_buffer_concat(
    swish_NamedBuffer *nb1,
    swish_NamedBuffer *nb2
)
{
    xmlHashScanner append_one = (xmlHashScanner)cat_buffer;

    xmlHashScan(nb1->hash, append_one, nb2);
}

/*
 * Return the content of the buffer stored under key in nb.
 * The pointer is whatever xmlBufferContent() returns for that buffer;
 * no copy is made.
 */
xmlChar *
swish_nb_get_value(
    swish_NamedBuffer *nb,
    xmlChar *key
)
{
    xmlBufferPtr found = swish_hash_fetch(nb->hash, key);

    return (xmlChar *)xmlBufferContent(found);
}


/*************** end namedbuffer.c ************/


/*************** start string.c ************/
/*
 * This file is part of libswish3
 * Copyright (C) 2007 Peter Karman
 *
 *  libswish3 is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  libswish3 is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with libswish3; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
*/

/* string.c -- handle xmlChar and wchar_t strings
 * much of this module based on swstring.c in swish-e vers 2
 * but re-written for UTF-8 support
*/

#ifndef LIBSWISH3_SINGLE_FILE
#include <assert.h>
#include <wchar.h>
#include <wctype.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#include <locale.h>
#include <err.h>
#include <limits.h>
#include <errno.h>

#include "libswish3.h"
#endif

extern int SWISH_DEBUG;

static xmlChar *getword(
    xmlChar **in_buf
);

#ifndef LIBSWISH3_SINGLE_FILE
#include "utf8.c"
#endif

/* these string conversion functions based on code from xapian-omega */
/* size in bytes of the heap buffer that CONVERT_TO_STRING formats into */
#define BUFSIZE 100
/* character count of a DATE_FMT result for a 4-digit year, 2-digit month, 2-digit day */
#define DATE_BUFSIZE 8
/* zero-padded year, month, day with no separators */
#define DATE_FMT "%04d%02d%02d"

/*
 * Complete function body for the swish_*_to_string() converters.
 * It expects a variable named `val` in scope, formats it with FMT into a
 * BUFSIZE-byte buffer obtained from swish_xmalloc(), croaks when
 * snprintf() reports an error, and returns the buffer.
 */
#define CONVERT_TO_STRING(FMT) \
    xmlChar *str;\
    int ret;\
    str = swish_xmalloc(BUFSIZE);\
    ret = snprintf((char*)str, BUFSIZE, (FMT), val);\
    if (ret<0) SWISH_CROAK("snprintf failed with %d", ret);\
    return str;

/*
 * Parse a base-10 integer from buf with strtol().
 *
 * Leading whitespace and trailing non-digit text are accepted, exactly as
 * strtol() accepts them. When strtol() reports an error, or the parsed
 * value does not fit in an int, "strtol" is reported through perror() and
 * the process exits with EXIT_FAILURE.
 */
int
swish_string_to_int(
    char *buf
)
{
    long parsed;

    errno = 0;
    parsed = strtol(buf, (char **)NULL, 10);

    /* Check for various possible errors */
    if ((errno == ERANGE && (parsed == LONG_MAX || parsed == LONG_MIN))
        || (errno != 0 && parsed == 0)) {
        perror("strtol");
        exit(EXIT_FAILURE);
    }

    /* long may be wider than int: refuse to truncate silently */
    if (parsed > INT_MAX || parsed < INT_MIN) {
        errno = ERANGE;
        perror("strtol");
        exit(EXIT_FAILURE);
    }

    return (int)parsed;
}

/*
 * Interpret a configuration string as a boolean.
 *
 * SWISH_TRUE for text starting with 'Y', 'y', '1', "on" or "ON".
 * SWISH_FALSE for everything else, including NULL, the empty string, and
 * the explicit negatives 'N', 'n', '0', "of..." and "OF...".
 */
boolean
swish_string_to_boolean(
    char *buf
)
{
    /* check the pointer first, then the first character */
    if (buf == NULL || *buf == '\0') {
        return SWISH_FALSE;
    }
    if (    buf[0] == 'Y'
        ||  buf[0] == 'y'
        ||  buf[0] == '1'
        ||  (buf[0] == 'o' && buf[1] == 'n')
        ||  (buf[0] == 'O' && buf[1] == 'N')
    ) {
        return SWISH_TRUE;
    }
    if (    buf[0] == 'N'
        ||  buf[0] == 'n'
        ||  buf[0] == '0'
        ||  (buf[0] == 'o' && buf[1] == 'f')
        ||  (buf[0] == 'O' && buf[1] == 'F')
    ) {
        return SWISH_FALSE;
    }

    return SWISH_FALSE; /* default */
}

xmlChar *
swish_int_to_string(
    int val
)
{
    CONVERT_TO_STRING("%d")
}

xmlChar *
swish_long_to_string(
    long val
)
{
    CONVERT_TO_STRING("%ld")
}

xmlChar *
swish_double_to_string(
    double val
)
{
    CONVERT_TO_STRING("%f")
}

/*
 * Format a date as an 8-character YYYYMMDD string (DATE_FMT) and return a
 * duplicate made with swish_xstrdup().
 *
 * Out-of-range parts are clamped rather than rejected: the year to
 * 0..9999, the month to 1..12 and the day to 1..31.
 */
xmlChar *
swish_date_to_string(
    int y,
    int m,
    int d
)
{
    char stamp[DATE_BUFSIZE + 1];

    y = (y < 0) ? 0 : ((y > 9999) ? 9999 : y);
    m = (m < 1) ? 1 : ((m > 12) ? 12 : m);
    d = (d < 1) ? 1 : ((d > 31) ? 31 : d);

#ifdef SNPRINTF
    int len = SNPRINTF(stamp, sizeof(stamp), DATE_FMT, y, m, d);
    if (len == -1 || len >= DATE_BUFSIZE)
        stamp[DATE_BUFSIZE] = '\0';
#else
    /* plant a terminator where the output must end, then verify it survived */
    stamp[DATE_BUFSIZE] = '\0';
    sprintf(stamp, DATE_FMT, y, m, d);
    if (stamp[DATE_BUFSIZE])
        abort();                /* Uh-oh, buffer overrun */
#endif
    return swish_xstrdup((xmlChar *)stamp);
}

/* returns the UCS32 value for a UTF8 string -- the character's Unicode value.
   see http://scripts.sil.org/cms/scripts/page.php?site_id=nrsi&item_id=IWS-AppendixA

   The byte length of the first character comes from swish_utf8_chr_len();
   each byte then contributes its payload after its marker value
   (192/224/240 for a lead byte, 128 for a continuation byte) is subtracted.
   Any length other than 1, 2 or 3 is decoded as a 4-byte sequence.
*/

uint32_t
swish_utf8_codepoint(
    xmlChar *utf8
)
{
    const uint32_t nbytes = swish_utf8_chr_len(utf8);

    if (nbytes == 1)
        return utf8[0];

    if (nbytes == 2)
        return (utf8[0] - 192) * 64 + utf8[1] - 128;

    if (nbytes == 3)
        return (utf8[0] - 224) * 4096 + (utf8[1] - 128) * 64 + utf8[2] - 128;

    return (utf8[0] - 240) * 262144 + (utf8[1] - 128) * 4096 + (utf8[2] - 128) * 64 +
        utf8[3] - 128;
}

/*
 * Thin wrapper around u8_inc(): passes s and the byte offset *i through
 * unchanged. NOTE(review): presumably advances *i to the start of the
 * next UTF-8 character -- confirm against u8_inc().
 */
void
swish_utf8_next_chr(
    xmlChar *s,
    int *i
)
{
    char *bytes = (char *)s;

    u8_inc(bytes, i);
}

/*
 * Thin wrapper around u8_dec(): passes s and the byte offset *i through
 * unchanged. NOTE(review): presumably moves *i back to the start of the
 * previous UTF-8 character -- confirm against u8_dec().
 */
void
swish_utf8_prev_chr(
    xmlChar *s, 
    int *i
)
{
    char *bytes = (char *)s;

    u8_dec(bytes, i);
}


/* returns length of a UTF8 character, based on first byte (see below).
   The length is whatever xmlUTF8Size() reports; a result of -1 is fatal. */
int
swish_utf8_chr_len(
    xmlChar *utf8
)
{
    const int nbytes = xmlUTF8Size(utf8);

    if (nbytes == -1) {
        SWISH_CROAK("Bad UTF8 string: %s", utf8);
    }

    return nbytes;
}

/* returns the number of UCS32 codepoints (characters) in a UTF8 string.
   The count is whatever xmlUTF8Strlen() reports; a result of -1 is fatal. */
int
swish_utf8_num_chrs(
    xmlChar *utf8
)
{
    const int nchars = xmlUTF8Strlen(utf8);

    if (nchars == -1) {
        SWISH_CROAK("Bad UTF8 string: %s", utf8);
    }

    return nchars;
}

/* returns true if all bytes in the *str are in the ascii range.
 * this helps speed up string handling when we don't need to worry
 * about multi-byte chars.
 *
 * NULL and empty strings are reported as not ascii (0).
*/

/* from the libxml2 xmlstring.c file:
     * utf is a string of 1, 2, 3 or 4 bytes.  The valid strings
     * are as follows (in "bit format"):
     *    0xxxxxxx                                      valid 1-byte
     *    110xxxxx 10xxxxxx                             valid 2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
*/

boolean
swish_is_ascii(
    xmlChar *str
)
{
    const xmlChar *cursor;
    const xmlChar *end;
    int nbytes = xmlStrlen(str);

    if (str == NULL || nbytes == 0)
        return 0;

    end = str + nbytes;
    for (cursor = str; cursor < end; cursor++) {
        /* the high bit is set on every byte of a multi-byte character */
        if (*cursor & 0x80)
            return 0;
    }

    return 1;
}

/*
 * Activate the locale named by the environment and return its name.
 *
 * The first non-empty answer from this list is returned:
 *   1. setlocale(LC_ALL, "")
 *   2. setlocale(LC_CTYPE, "")   -- see
 *      http://mail.nl.linux.org/linux-utf8/2001-09/msg00030.html
 *   3. the LANG environment variable
 *   4. SWISH_LOCALE
 *
 * The returned string is owned by the C library, the environment or a
 * constant; the caller must not free or modify it.
 */
char*
swish_get_locale(
)
{
    char *locale;

    /* initialize with LC_ALL sets all the relevant env vars */
    locale = setlocale(LC_ALL, "");
    if (locale != NULL && *locale != '\0')
        return locale;

    locale = setlocale(LC_CTYPE, "");
    if (locale != NULL && *locale != '\0')
        return locale;

    locale = getenv("LANG");
    if (locale != NULL && *locale != '\0')
        return locale;

    return SWISH_LOCALE;
}

void
swish_verify_utf8_locale(
)
{
    char *loc;
    const xmlChar *enc;

/* a bit about encodings: libxml2 takes whatever encoding the input XML is
 * (latin1, ascii, utf8, etc) and standardizes it using iconv (or other) in xmlChar as
 * UTF-8. However, we must ensure we have UTF-8 locale because all the mb* and wc*
 * routines rely on the locale to correctly interpret chars.
 *
 * See also
 * http://www.cl.cam.ac.uk/~mgk25/unicode.html#c
 * and
 * http://www.cl.cam.ac.uk/~mgk25/unicode.html#activate
 */

    loc = swish_get_locale(); 
    enc = xmlStrchr((xmlChar *)loc, (xmlChar)'.');

    if (enc != NULL) {
        enc++;
        if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
            SWISH_DEBUG_MSG("encoding = %s", enc);
    }
    else {
        if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
            SWISH_DEBUG_MSG("no encoding in %s, using %s", loc, SWISH_DEFAULT_ENCODING);

        enc = (xmlChar *)SWISH_DEFAULT_ENCODING;
    }

    swish_setenv("SWISH_ENCODING", (char *)enc, 0);   /* remember in env var, if not already set */

    if (!loc) {
        SWISH_WARN("can't get locale via setlocale()");
    }
    else if (SWISH_DEBUG) {
        SWISH_DEBUG_MSG("current locale and encoding: %s %s", loc, enc);
    }

    if (u8_is_locale_utf8(loc)) {
/* a-ok */

        if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
            SWISH_DEBUG_MSG("locale looks like UTF-8");

    }
    else {
/* must be UTF-8 charset since libxml2 converts everything to UTF-8 */
        if (SWISH_DEBUG)
            SWISH_DEBUG_MSG
                ("Your locale (%s) was not UTF-8 so internally we are using %s", loc,
                 SWISH_LOCALE);

        if (!setlocale(LC_CTYPE, SWISH_LOCALE)) {
            SWISH_WARN("failed to set locale to %s from %s", SWISH_LOCALE, loc);
        }

    }

    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER) 
        SWISH_DEBUG_MSG("active locale is %s", setlocale(LC_CTYPE, NULL));

}

/*
 * Return a newly allocated escaped copy of a UTF-8 string, produced by
 * u8_escape().
 *
 * The output buffer is sized for the worst case of 10 ascii characters
 * per input character:
 *   \Uxxxxxxxx
 *   1234567890
 * plus one byte for the terminating nul.
 */
xmlChar *
swish_str_escape_utf8(
    xmlChar *u8str
)
{
    int nchars = swish_utf8_num_chrs(u8str);
    int capacity = (10 * nchars) + 1;
    xmlChar *escaped = swish_xmalloc(capacity);

    /* TODO quotes? -- the count returned by u8_escape() is not needed */
    (void)u8_escape((char *)escaped, capacity, (char *)u8str, 0);

    return escaped;
}

/*
 * Return a newly allocated copy of ascii with its escapes converted by
 * u8_unescape(). The output buffer is one byte larger than the input
 * string is long.
 */
xmlChar *
swish_str_unescape_utf8(
    xmlChar *ascii
)
{
    int capacity = xmlStrlen(ascii) + 1;
    xmlChar *unescaped = swish_xmalloc(capacity);

    /* the count returned by u8_unescape() is not needed */
    (void)u8_unescape((char *)unescaped, capacity, (char *)ascii);

    return unescaped;
}


/* based on swstring.c  */

/*
 * qsort()-style comparator for wchar_t values.
 *
 * Returns a negative, zero or positive value when *s1 is less than, equal
 * to, or greater than *s2. The result is computed from two comparisons
 * rather than a subtraction, so it cannot overflow or wrap whatever the
 * width and signedness of wchar_t.
 */
int
swish_wchar_t_comp(
    const void *s1,
    const void *s2
)
{
    const wchar_t left = *(const wchar_t *)s1;
    const wchar_t right = *(const wchar_t *)s2;

    return (left > right) - (left < right);
}

/*
 * Sort a wide string in place and drop duplicate characters.
 *
 * The characters of s are ordered with swish_wchar_t_comp(), each run of
 * equal characters is collapsed to one, and the result is nul-terminated.
 *
 * Returns the number of characters left in s.
 */
int
swish_sort_wchar(
    wchar_t * s
)
{
    size_t len = wcslen(s);
    size_t i;
    size_t j;

    /* zero or one character: already sorted and unique */
    if (len < 2)
        return (int)len;

    qsort(s, len, sizeof(wchar_t), &swish_wchar_t_comp);

    /* keep s[0]; copy each later character only when it differs from the
       last one kept */
    j = 1;
    for (i = 1; i < len; i++) {
        if (s[i] != s[j - 1]) {
            s[j++] = s[i];
        }
    }
    s[j] = L'\0';

    return (int)j;
}

/* based on swstring.c in Swish-e but handles wide char strings instead */

/*
 * Lowercase a wide string in place with towlower() and return the same
 * pointer that was passed in.
 */
wchar_t *
swish_wstr_tolower(
    wchar_t * s
)
{
    size_t i;

    for (i = 0; s[i] != L'\0'; i++) {
        s[i] = (wchar_t) towlower(s[i]);
    }

    return s;
}

/* convert a string to lowercase.
 * returns a new malloc'd string, so should be freed eventually.
 * strings that swish_is_ascii() accepts take the byte-wise path;
 * everything else goes through the wide-character path.
*/
xmlChar *
swish_str_tolower(
    xmlChar *s
)
{
    return swish_is_ascii(s)
        ? swish_ascii_str_tolower(s)
        : swish_utf8_str_tolower(s);
}

/* lowercase a multi-byte string by way of wide characters:
   convert to wchar_t with swish_locale_to_wchar(),
   lowercase the wide string in place,
   convert back with swish_wchar_to_locale(),
   and free the temporary wide string.
   The converted string is returned to the caller.
*/
xmlChar *
swish_utf8_str_tolower(
    xmlChar *s
)
{
    xmlChar *lowered;
    wchar_t *wide = swish_locale_to_wchar(s);

    swish_wstr_tolower(wide);
    lowered = swish_wchar_to_locale(wide);

    /* the wide copy was only scratch space */
    swish_xfree(wide);

    return lowered;
}

/* based on swstring.c in Swish-e */

/*
 * Return a copy of s, made with swish_xstrdup(), in which every byte has
 * been passed through tolower(). The input string is not modified.
 */
xmlChar *
swish_ascii_str_tolower(
    xmlChar *s
)
{
    xmlChar *copy = swish_xstrdup(s);
    int i;

    for (i = 0; copy[i] != '\0'; i++) {
        copy[i] = tolower(copy[i]);
    }

    return copy;
}

/*
  -- Skip leading white space
  -- return: pointer into s at the first byte that isspace() rejects,
     which is the terminating \0 when the whole string is white space
  -- 2001-01-30  rasc

  TODO make utf8 safe. 
*/

xmlChar *
swish_str_skip_ws(
    xmlChar *s
)
{
    xmlChar *cursor;

    for (cursor = s; *cursor != '\0'; cursor++) {
        if (!isspace((int)(xmlChar)*cursor))
            break;
    }

    return cursor;
}

/*************************************
* Trim trailing white space in place by overwriting each trailing
* isspace() byte with a nul.
* Returns void
**************************************/

// TODO make utf8 safe
void
swish_str_trim_ws(
    xmlChar *s
)
{
    int end;

    for (end = xmlStrlen(s); end > 0 && isspace((int)s[end - 1]); end--) {
        s[end - 1] = '\0';
    }
}

/*
 * True when every byte of the NUL-terminated string s is white space.
 * Measures s with xmlStrlen() and defers to swish_str_all_ws_len().
 */
boolean
swish_str_all_ws(
    xmlChar *s
)
{
    int len = xmlStrlen(s);

    return swish_str_all_ws_len(s, len);
}

/*
 * Returns 1 when each of the first len bytes of s satisfies isspace()
 * (trivially so when len is not positive), 0 as soon as one does not.
 */
boolean
swish_str_all_ws_len(
    xmlChar * s, 
    int len
)
{
    int pos = 0;

    while (pos < len) {
        if (!isspace((int)s[pos]))
            return 0;
        pos++;
    }

    return 1;
}

/*
 * Replace every ASCII control character (byte value < 32) in s with
 * SWISH_SPACE, in place. Strings that swish_is_ascii() rejects are
 * left untouched.
 */
void
swish_str_ctrl_to_ws(
    xmlChar *s
)
{
    int len, pos;

    if (!swish_is_ascii(s)) // TODO utf8-safe
        return;

    len = xmlStrlen(s);
    pos = 0;
    while (pos < len) {
        if ((int)s[pos] < 32)
            s[pos] = SWISH_SPACE;
        pos++;
    }
}

/*
 * Print one line per wide character to stdout: the character itself,
 * its value in decimal, and its value in hex.
 */
void
swish_debug_wchars(
    const wchar_t * widechars
)
{
    const wchar_t *wc;

    for (wc = widechars; *wc != 0; wc++) {
        printf(" >%lc< %ld %#lx \n", (wint_t) *wc, (long int)*wc,
               (long unsigned int)*wc);
    }
}

/* returns the number of UTF-8 bytes needed to hold the codepoint
   represented by 'ch'.
   similar to swish_utf8_chr_len() except that the arg is already
   a 4-byte container and we want to know how many of the 4 bytes
   we really need.

   Result is 1 for ch < 0x80, 2 for ch < 0x800, 3 for ch < 0x10000,
   4 for ch < 0x110000 and 0 for anything larger. Negative values
   satisfy the first comparison and therefore yield 1.
*/
int
swish_bytes_in_wchar(
    int ch
)
{
    int len = 0;

    /* the ranges must be tested as a chain: with independent tests
     * every later (wider) range would overwrite the earlier answer */
    if (ch < 0x80) {
        len = 1;
    }
    else if (ch < 0x800) {
        len = 2;
    }
    else if (ch < 0x10000) {
        len = 3;
    }
    else if (ch < 0x110000) {
        len = 4;
    }

    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
        SWISH_DEBUG_MSG(" %lc is %d bytes long", (wint_t)ch, len);

    return len;
}


/* from http://www.triptico.com/software/unicode.html
 *
 * Convert the multibyte string str to a newly allocated, NUL-terminated
 * wide string using mbstowcs(). Croaks when mbstowcs() reports that the
 * input cannot be converted. Remember to free the returned buffer.
 */
wchar_t *
swish_locale_to_wchar(
    xmlChar *str
)
{
    wchar_t *ptr;
    size_t s;
    char *loc;

/* a NULL destination means 'calculate needed space' */
    s = mbstowcs(NULL, (const char *)str, 0);

/* mbstowcs() signals an encoding error with (size_t)-1;
 * never happens in ISO-8859-* locales, but possible in UTF-8
 */
    if (s == (size_t)-1) {
        loc = swish_get_locale();
        SWISH_CROAK("error converting mbs to wide str under locale %s : %s", 
            loc, str);
    }

/* malloc the necessary space, plus one for the terminator */
    ptr = swish_xmalloc((s + 1) * sizeof(wchar_t));

/* really do it */
    s = mbstowcs(ptr, (const char *)str, s);

/* ensure NUL termination */
    ptr[s] = L'\0';

/* remember to free() ptr when done */
    return (ptr);
}

/* from http://www.triptico.com/software/unicode.html
 *
 * Convert the wide string str to a newly allocated, NUL-terminated
 * multibyte string using wcstombs(). Croaks when a character cannot be
 * represented. Remember to free the returned buffer.
 */
xmlChar *
swish_wchar_to_locale(
    wchar_t * str
)
{
    xmlChar *out;
    size_t nbytes;

    /* a NULL destination asks wcstombs() for the required size only */
    nbytes = wcstombs(NULL, str, 0);

    /* (size_t)-1 means some character has no representation in the
     * current locale */
    if (nbytes == (size_t)-1)
        SWISH_CROAK("error converting wide chars to mbs: %ls", str);

    /* one extra byte for the terminator */
    out = (xmlChar *)swish_xmalloc(nbytes + 1);

    /* the real conversion */
    nbytes = wcstombs((char *)out, (const wchar_t *)str, nbytes);
    out[nbytes] = '\0';

    return out;
}

/* StringList functions derived from swish-e vers 2 */
/*
 * Allocate an empty StringList: no words yet, room for two pointers
 * (two so that there is space for a terminating NULL entry).
 */
swish_StringList *
swish_stringlist_init(
)
{
    swish_StringList *list;

    list = swish_xmalloc(sizeof(swish_StringList));
    list->max = 2;
    list->n = 0;
    list->word = swish_xmalloc(list->max * sizeof(xmlChar *));

    return list;
}

/*
 * Release a StringList: every stored word (last to first), then the
 * pointer array, then the list struct itself.
 */
void
swish_stringlist_free(
    swish_StringList * sl
)
{
    while (sl->n != 0) {
        sl->n--;
        swish_xfree(sl->word[sl->n]);
    }

    swish_xfree(sl->word);
    swish_xfree(sl);
}

/*
 * Append copies of every word in sl1 to sl2, then free sl1.
 *
 * sl2's pointer array is resized to hold both lists plus one trailing
 * NULL entry, and sl2->max is updated to the new capacity so that a
 * later swish_stringlist_add_string() sees the real array size.
 */
void
swish_stringlist_merge(
    swish_StringList * sl1,
    swish_StringList * sl2
)
{
    int i;
    size_t capacity;

    /* one extra slot (not one extra byte) for the NULL terminator */
    capacity = (size_t)sl1->n + (size_t)sl2->n + 1;

    // add sl1 -> sl2
    sl2->word =
        (xmlChar **)swish_xrealloc(sl2->word, capacity * sizeof(xmlChar *));
    sl2->max = capacity;

    for (i = 0; i < sl1->n; i++) {
        // copy is a little overhead, but keeps mem count simple
        sl2->word[sl2->n++] = swish_xstrdup(sl1->word[i]);
    }
    sl2->word[sl2->n] = NULL;

    swish_stringlist_free(sl1);
}

/*
 * Return a deep copy of sl: a new list holding its own duplicate of
 * every word. The copy's array has room for all words plus one trailing
 * NULL entry, and its max field records that capacity.
 */
swish_StringList *
swish_stringlist_copy(
    swish_StringList * sl
)
{
    swish_StringList *s2;
    int i;
    size_t capacity;

    /* one extra slot (not one extra byte) for the NULL terminator */
    capacity = (size_t)sl->n + 1;

    s2 = swish_stringlist_init();
    s2->word = (xmlChar **)swish_xrealloc(s2->word, capacity * sizeof(xmlChar *));
    s2->max = capacity;

    for (i = 0; i < sl->n; i++) {
        s2->word[i] = swish_xstrdup(sl->word[i]);
    }
    s2->n = sl->n;
    s2->word[s2->n] = NULL;

    return s2;
}

/*
 * Emit one debug message per word in the list, prefixed by its index.
 */
void
swish_stringlist_debug(
    swish_StringList *sl
)
{
    int idx = 0;

    while (idx < sl->n) {
        SWISH_DEBUG_MSG("[%d] %s", idx, sl->word[idx]);
        idx++;
    }
}

/*
 * Parse a sort string such as "title desc date" into a normalized
 * StringList of alternating property / direction words, where a missing
 * direction defaults to "asc".
 *
 * The input is lowercased first, so "ASC"/"DESC" and mixed-case property
 * names are recognized. When cfg is non-NULL each property name is
 * checked with swish_property_get_id(), which croaks on unknown names.
 * The caller owns the returned list.
 */
swish_StringList *
swish_stringlist_parse_sort_string(
    xmlChar *sort_string,
    swish_Config *cfg
)
{
    xmlChar *sort_string_lc, *prop, *dir, *normalized;
    swish_StringList *sl;
    int i, nlen;
    
    /*  normalize so we know we are comparing ASC vs asc 
     *  and since propertynames are always lowercased.
     *  The list is built from the lowercased copy, which also keeps
     *  swish_stringlist_build() from modifying the caller's string.
     */
    sort_string_lc = swish_str_tolower(sort_string);
    sl = swish_stringlist_build(sort_string_lc);
    swish_xfree(sort_string_lc);
    
    /* starting size only; xmlStrncat() reallocates as the string grows.
     * +1 so there is always room for the terminator, even for "" */
    nlen = 2*xmlStrlen(sort_string) + 1;
    normalized = swish_xmalloc(nlen);
    normalized[0] = '\0';
    
    /* create the normalized string */
    for (i=0; i < sl->n; i++) {
        prop = sl->word[i]; /* just for code clarity */
        if (cfg) {
            swish_property_get_id(prop, cfg->properties); /* will croak if invalid */
        }

        /* peek at the following word, if there is one */
        if (i + 1 < sl->n) {
            dir = sl->word[i+1];
        }
        else {
            dir = NULL;
        }

        normalized = xmlStrncat(normalized, BAD_CAST " ", 1);
        normalized = xmlStrncat(normalized, prop, xmlStrlen(prop));
        normalized = xmlStrncat(normalized, BAD_CAST " ", 1);
        if (dir != NULL
            && (xmlStrEqual(dir, BAD_CAST "asc")
                ||
                xmlStrEqual(dir, BAD_CAST "desc"))
        ) {
            normalized = xmlStrncat(normalized, dir, xmlStrlen(dir));
            i++;    /* bump to next prop */
        }
        else {
            normalized = xmlStrncat(normalized, BAD_CAST "asc", 3);
        }
    }
    swish_stringlist_free(sl);
    sl = swish_stringlist_build(normalized);
    swish_xfree(normalized);
    return sl;
}

/*
 * Split line into words with getword() and return them as a new
 * StringList whose word array ends with a NULL entry.
 *
 * Returns NULL when line is NULL. Only the first line is parsed: the
 * first '\n' in the buffer is overwritten with \0, so the caller's
 * buffer is modified.
 */
swish_StringList *
swish_stringlist_build(
    xmlChar *line
)
{
    swish_StringList *list;
    xmlChar *newline;
    xmlChar *word;

    if (line == NULL)
        return (NULL);

    list = swish_stringlist_init();

    /* chop the buffer at the first newline */
    newline = (xmlChar *)strchr((const char *)line, '\n');
    if (newline != NULL)
        *newline = '\0';

    for (;;) {
        word = getword(&line);
        if (word == NULL)
            break;

        /* getword hands back an allocated "" when nothing is left;
         * it is not stored, so it must be freed here */
        if (*word == '\0') {
            swish_xfree(word);
            break;
        }

        swish_stringlist_add_string(list, (xmlChar*)word);
    }

    /* make room for the trailing NULL entry when the array is full */
    if (list->n == list->max) {
        list->max += 1;
        list->word =
            (xmlChar **)swish_xrealloc(list->word, list->max * sizeof(xmlChar *));
    }

    list->word[list->n] = NULL;

    return list;
}

/*
 * Append str to the list (the pointer itself is stored, no copy is
 * made) and return the new word count. The array capacity is doubled
 * whenever it is full.
 */
unsigned int
swish_stringlist_add_string(
    swish_StringList *sl,
    xmlChar *str
)
{
    if (sl->max == sl->n) {
        sl->max *= 2;
        sl->word = (xmlChar **)swish_xrealloc(sl->word, sl->max * sizeof(xmlChar *));
    }

    sl->word[sl->n] = str;
    sl->n++;

    return sl->n;
}

/* Gets the next word in a line. If the word's in quotes,
 * include blank spaces in the word or phrase.
 * should be utf-8 compatible; only pitfall would be if a continuation byte
 * returns true for isspace().
 *
 * *in_buf is advanced past the word that was read. The return value is
 * always a newly allocated string; it is "" when nothing is left to read.
 *
 * At most SWISH_MAX_WORD_LEN bytes of a word are kept. A longer word is
 * still consumed to its natural end (so parsing stays in sync), but the
 * excess bytes are dropped and a single warning is issued.
*/

static xmlChar *
getword(
    xmlChar **in_buf
)
{
    xmlChar quotechar;
    xmlChar uc;
    xmlChar *s = *in_buf;
    xmlChar *start = *in_buf;
    xmlChar buf[SWISH_MAX_WORD_LEN + 1];
    xmlChar *cur_char = buf;
    /* one past the last byte that may hold word data; the final slot of
     * buf is reserved for the terminating NUL */
    xmlChar *buf_limit = buf + SWISH_MAX_WORD_LEN;
    int backslash = 0;
    int warned = 0;

    quotechar = '\0';

    s = swish_str_skip_ws(s);

/* anything to read? */
    if (!*s) {
        *in_buf = s;
        return swish_xstrdup((xmlChar *)"\0");
    }

    if (*s == '\"' || *s == '\'')
        quotechar = *s++;

/* find end of "more words" or word */

    while (*s) {
        uc = (xmlChar)*s;

        if (uc == '\\' && !backslash && quotechar)
/* only enable backslash
         * inside of quotes */
        {
            s++;
            backslash++;
            continue;
        }

/* Can't see why we would need to escape these, can you? - always fed a
         * single line */
        if (uc == '\n' || uc == '\r') {
            s++;
            break;
        }

        if (!backslash) {
/* break on ending quote or unquoted space */

            if (uc == quotechar || (!quotechar && isspace((int)uc))) {
                s++;            /* past quote or space char. */
                break;
            }

        }
        else {
            backslash = 0;
        }

/* store the byte only while there is room; never write past buf */
        if (cur_char < buf_limit) {
            *cur_char++ = uc;
        }
        else if (!warned) {
            SWISH_WARN("Parsed word '%s' exceeded max length of %d", start,
                       SWISH_MAX_WORD_LEN);
            warned = 1;
        }
        s++;

    }

/* a dangling backslash is kept literally, room permitting */
    if (backslash && cur_char < buf_limit)
        *cur_char++ = '\\';

    *cur_char = '\0';

    *in_buf = s;

    return swish_xstrdup(buf);

}

/*
 * based on charDecode_C_Escape() in swstring.c in swish-e
 *
 * Decode one possibly backslash-escaped character starting at s and
 * return it as a char.
 *
 * If *s is not a backslash it is returned unchanged. Otherwise the
 * character after the backslash selects the result:
 *   a b f n r t v   the matching C control character
 *   x               hex digits parsed with strtoul(), base 16
 *   0               octal digits parsed with strtoul(), base 8
 *   \0              (string ends after the backslash) the backslash itself
 *   anything else   that character, taken literally
 *
 * When se is non-NULL, *se is set to the byte following the last byte
 * consumed, so the caller can continue scanning from there.
 */

char    
swish_get_C_escaped_char(xmlChar *s, xmlChar **se)
{
    char    c,
           *se2;

    if (*s != '\\') {
        /* no escape   */
        c = *s;                 /* return same char */

    }
    else {

        /* step onto the character that follows the backslash */
        switch (*(++s))
        {                       /* can be optimized ... */
        case 'a':
            c = '\a';
            break;
        case 'b':
            c = '\b';
            break;
        case 'f':
            c = '\f';
            break;
        case 'n':
            c = '\n';
            break;
        case 'r':
            c = '\r';
            break;
        case 't':
            c = '\t';
            break;
        case 'v':
            c = '\v';
            break;

        // TODO support full UTF-8
        case 'x':              /* Hex  \xff  */
            /* parse the digits after the 'x'; se2 ends up one past the
             * last digit, so step back to leave s ON the last byte used */
            c = (char) strtoul((char*)++s, &se2, 16);
            s = (xmlChar*)--se2;
            break;

        case '0':              /* Oct  \0,  \012 */
            /* parsing starts at the '0' itself; same step-back as above */
            c = (char) strtoul((char*)s, &se2, 8);
            s = (xmlChar*)--se2;
            break;

        case '\0':             /* outch!! null after \ */
            s--;               /* it's a "\"    */
            /* deliberate fall through: return the backslash itself */
        default:
            c = *s;            /* the escaped character */
            break;
        }

    }

    /* s is on the last consumed byte; report the one after it */
    if (se)
        *se = s + 1;
    return c;
}


/*************** end string.c ************/


/*************** start times.c ************/
/*
 * This file is part of libswish3
 * Copyright (C) 2007 Peter Karman
 *
 *  libswish3 is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  libswish3 is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with libswish3; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
*/
/* based on Swish-e version 2 */

#ifndef LIBSWISH3_SINGLE_FILE
#include <stdio.h>
#include <time.h>
#include "getruntime.c"
#include "libswish3.h"
#endif

/*
  -- TimeHiRes returns a ClockTick value (double)
  -- in seconds.fractions
*/

#ifdef HAVE_BSDGETTIMEOFDAY
#define gettimeofday BSDgettimeofday
#endif

#ifdef NO_GETTOD

/*
 * Variant compiled when NO_GETTOD is defined.
 * With HAVE_SYS_TIMEB_H it returns the ftime() timestamp as seconds plus
 * a millisecond fraction; otherwise it returns clock() divided by
 * CLOCKS_PER_SEC.
 */
double
swish_time_elapsed(
    void
)
{
#ifdef HAVE_SYS_TIMEB_H
/* NOTE(review): this header is included from inside the function body */
#include <sys/timeb.h>

    struct timeb ftimebuf;

    ftime(&ftimebuf);
    /* whole seconds plus the millisecond part as a fraction */
    return (double)ftimebuf.time + (double)ftimebuf.millitm / 1000.0;

#else

    return ((double)clock()) / CLOCKS_PER_SEC;

#endif
}

#else

#include <sys/time.h>

/*
 * Return the gettimeofday() timestamp as seconds with a microsecond
 * fraction, or 0 when gettimeofday() reports failure.
 */
double
swish_time_elapsed(
    void
)
{
    struct timeval now;

    if (gettimeofday(&now, NULL) != 0)
        return 0;

    return (double)(now.tv_sec + now.tv_usec / 1000000.0);
}
#endif

/* return CPU time used, as reported by get_cpu_secs() */
double
swish_time_cpu(
    void
)
{
    double cpu_secs = (double)get_cpu_secs();

    return cpu_secs;
}

/*
 * Format a duration given in seconds as "HH:MM:SS" (rounded to the
 * nearest second; the hour field widens beyond two digits if needed).
 *
 * Negative and NaN inputs are treated as 0, and inputs too large for an
 * int are clamped. Returns a newly allocated string the caller must
 * free; if formatting fails the string is "unknown time".
 */
char *
swish_time_print(
    double time
)
{
    /* roomy enough for any int-sized hour count plus ":MM:SS" and NUL */
    const size_t buflen = 32;
    int hh, mm, ss;
    int delta;
    int written;
    char *str;

    /* written as a negated test so that NaN is caught as well */
    if (!(time > 0))
        time = 0;

    /* keep the rounded value representable as int */
    if (time > (double)(INT_MAX - 1))
        time = (double)(INT_MAX - 1);

    delta = (int)(time + 0.5);
    ss = delta % 60;
    delta /= 60;
    hh = delta / 60;
    mm = delta % 60;

    str = swish_xmalloc(buflen);
    written = snprintf(str, buflen, "%02d:%02d:%02d", hh, mm, ss);
    if (written > 0 && (size_t)written < buflen) {
        return str;
    }
    else {
        swish_xfree(str);
        return (char *)swish_xstrdup((xmlChar *)"unknown time");
    }
}

/*
 * Format a short duration in seconds with five decimal places,
 * e.g. "0.01234". Values above 9.99999 are capped at 9.99999.
 *
 * Returns a newly allocated string the caller must free; if the value
 * cannot be formatted into the buffer the string is "unknown fine time".
 */
char *
swish_time_print_fine(
    double time
)
{
    /* large enough that a sign or an unexpected extra digit cannot
     * overrun the buffer */
    const size_t buflen = 32;
    int written;
    char *str;

    /* cap before formatting: anything above 9.99999 could otherwise
     * round up to "10.00000" */
    if (time > 9.99999)
        time = 9.99999;

    str = swish_xmalloc(buflen);
    written = snprintf(str, buflen, "%1.5f", time);
    if (written > 0 && (size_t)written < buflen)
        return str;

    else {
        swish_xfree(str);
        return (char *)swish_xstrdup((xmlChar *)"unknown fine time");
    }

}

/*
 * Format epoch as local time using SWISH_DATE_FORMAT_STRING.
 *
 * Returns a newly allocated 30-byte buffer the caller must free. If the
 * time cannot be converted or the formatted text does not fit, the
 * buffer holds an empty string rather than indeterminate bytes.
 */
char *
swish_time_format(
    time_t epoch
)
{
    const size_t buflen = 30;
    char *h_mtime;
    struct tm *local;

    h_mtime = (char*)swish_xmalloc(buflen);

    /* localtime() may return NULL, and strftime() returns 0 (leaving the
     * buffer contents unspecified) when the result does not fit */
    local = localtime(&epoch);
    if (local == NULL
        || strftime(h_mtime, buflen, SWISH_DATE_FORMAT_STRING, local) == 0) {
        h_mtime[0] = '\0';
    }

    return h_mtime;
}


/*************** end times.c ************/


/*************** start swish.c ************/
/*
 * This file is part of libswish3
 * Copyright (C) 2007 Peter Karman
 *
 *  libswish3 is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  libswish3 is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with libswish3; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
*/

#ifndef LIBSWISH3_SINGLE_FILE
#include <stdlib.h>
#include <errno.h>
#include <err.h>
#include <string.h>
#include "acconfig.h"
#include "libswish3.h"
#endif

/*
 * errno comes from <errno.h>, where it may be a macro; a program must
 * not declare it itself, so there is no "extern int errno" here.
 */
int SWISH_DEBUG = 0;            /* global var: debug flag bits, 0 == off */
int SWISH_WARNINGS = 1;         /* global var: warnings on by default */

/* Return the libswish3 version string (the SWISH_LIB_VERSION macro). */
const char *
swish_lib_version(
)
{
    return (const char*)SWISH_LIB_VERSION;
}

/*
 * Return the LIBXML_DOTTED_VERSION string, i.e. the value that macro
 * had in the libxml2 headers when this file was compiled.
 */
const char *
swish_libxml2_version(
)
{
    return (const char*)LIBXML_DOTTED_VERSION;
}

/*
 * Allocate a swish_3 object together with a fresh config (with defaults
 * applied), analyzer and parser. Each of those three gets its ref_cnt
 * incremented; the swish_3 itself starts at ref_cnt 0.
 *
 * handler is handed to swish_parser_init(); stash is stored as-is.
 */
swish_3 *
swish_3_init(
    void (*handler) (swish_ParserData *),
    void *stash
)
{
    swish_3 *s3;
    s3 = swish_xmalloc(sizeof(swish_3));
    s3->ref_cnt = 0;
    s3->config = swish_config_init();
    s3->config->ref_cnt++;
    swish_config_set_default(s3->config);
    s3->analyzer = swish_analyzer_init(s3->config);
    s3->analyzer->ref_cnt++;
    s3->parser = swish_parser_init(handler);
    s3->parser->ref_cnt++;
    s3->stash = stash;
    
    if (SWISH_DEBUG & SWISH_DEBUG_MEMORY) {
        /* %p with a void* is the matching conversion for a pointer */
        SWISH_DEBUG_MSG("s3 ptr %p", (void *)s3);
    }
    
    return s3;
}

/*
 * Release a swish_3 object. The parser, analyzer and config each have
 * their ref_cnt decremented and are freed once it drops below 1.
 * A non-zero ref_cnt on s3 itself produces a warning, after which s3
 * is freed regardless.
 */
void
swish_3_free(
    swish_3 *s3
)
{
    if (--s3->parser->ref_cnt < 1)
        swish_parser_free(s3->parser);

    if (--s3->analyzer->ref_cnt < 1)
        swish_analyzer_free(s3->analyzer);

    if (--s3->config->ref_cnt < 1)
        swish_config_free(s3->config);

    if (s3->ref_cnt != 0)
        SWISH_WARN("s3 ref_cnt != 0: %d\n", s3->ref_cnt);

    swish_xfree(s3);
}

/*
 * setenv() wrapper that croaks, reporting errno and its message,
 * when the call fails.
 */
void
swish_setenv(
    char * name,
    char * value,
    int override
)
{
    if (setenv(name, value, override) != 0) {
        SWISH_CROAK("setenv failed with %d: %s", errno, strerror(errno));
    }
}

/* MUST call this before instantiating any swish_3 objects.
 *
 * Seeds default values for the SWISH* environment variables (existing
 * values are never overridden), derives the SWISH_DEBUG and
 * SWISH_WARNINGS globals from the environment, then initializes libxml2
 * version checking, memory accounting and the UTF-8 locale check.
 */
void
swish_setup(
)
{
    /* one entry per debug category: the env var that switches it on and
     * the value it contributes to SWISH_DEBUG */
    struct debug_category {
        char *env_name;
        int   flag;
    };
    struct debug_category categories[] = {
        { "SWISH_DEBUG_MEMORY",      SWISH_DEBUG_MEMORY },
        { "SWISH_DEBUG_CONFIG",      SWISH_DEBUG_CONFIG },
        { "SWISH_DEBUG_DOCINFO",     SWISH_DEBUG_DOCINFO },
        { "SWISH_DEBUG_TOKENLIST",   SWISH_DEBUG_TOKENLIST },
        { "SWISH_DEBUG_TOKENIZER",   SWISH_DEBUG_TOKENIZER },
        { "SWISH_DEBUG_PARSER",      SWISH_DEBUG_PARSER },
        { "SWISH_DEBUG_NAMEDBUFFER", SWISH_DEBUG_NAMEDBUFFER },
        { "SWISH_DEBUG_IO",          SWISH_DEBUG_IO }
    };
    const int n_categories = (int)(sizeof(categories) / sizeof(categories[0]));
    int i;

    /* scripts can check SWISH3 to tell which Swish version they run
     * under; the trailing 0 means "do not override if already set" */
    swish_setenv("SWISH3", "1", 0);

    /* defaults for the debug and warning switches */
    swish_setenv("SWISH_DEBUG", "0", 0);
    for (i = 0; i < n_categories; i++) {
        swish_setenv(categories[i].env_name, "0", 0);
    }
    swish_setenv("SWISH_WARNINGS", "1", 0);

    /* only consult the environment when nothing has set SWISH_DEBUG yet */
    if (!SWISH_DEBUG) {

        SWISH_DEBUG += swish_string_to_int(getenv("SWISH_DEBUG"));

        /* each category env var that is non-zero adds its flag value.
         * NOTE(review): the flags are combined with +=, not |=, so a
         * flag already present in SWISH_DEBUG is added a second time --
         * confirm that this is intended */
        for (i = 0; i < n_categories; i++) {
            if (swish_string_to_int(getenv(categories[i].env_name))) {
                SWISH_DEBUG += categories[i].flag;
            }
        }

        /* special value to turn on all debugging.
         * NOTE(review): this yields (sum of all flags) - 1 -- verify */
        if (SWISH_DEBUG == -1) {
            for (i = 0; i < n_categories; i++) {
                SWISH_DEBUG += categories[i].flag;
            }
        }

        if (SWISH_DEBUG) {
            SWISH_DEBUG_MSG("SWISH_DEBUG set to %d", SWISH_DEBUG);
        }
    }

    SWISH_WARNINGS = swish_string_to_int(getenv("SWISH_WARNINGS"));

    /*
     * initialize the library and check potential API mismatches
     * between the version it was compiled for and the actual shared
     * library used.
     */
    LIBXML_TEST_VERSION 
    swish_mem_init();
    swish_verify_utf8_locale();

}


/*************** end swish.c ************/


/*************** start analyzer.c ************/
/*
 * This file is part of libswish3
 * Copyright (C) 2007 Peter Karman
 *
 *  libswish3 is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  libswish3 is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with libswish3; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
*/

/* text analyzer
   tokenize strings, stemming

*/

#ifndef LIBSWISH3_SINGLE_FILE
#include "libswish3.h"
#endif

extern int SWISH_DEBUG;

/*
 * Allocate an analyzer with default settings: SWISH_MAX_WORD_LEN /
 * SWISH_MIN_WORD_LEN word limits, lowercasing on, ref_cnt 0, and the
 * tokenize flag copied from config->flags. The tokenizer, stemmer,
 * regex and stash members all start out NULL.
 */
swish_Analyzer *
swish_analyzer_init(
    swish_Config *config
)
{
    swish_Analyzer *a;
    a = swish_xmalloc(sizeof(swish_Analyzer));

/* TODO get these all from config */
    a->maxwordlen = SWISH_MAX_WORD_LEN;
    a->minwordlen = SWISH_MIN_WORD_LEN;
    a->lc = SWISH_TRUE;
    a->ref_cnt = 0;
    a->tokenize = config->flags->tokenize;

    if (!a->tokenize && SWISH_DEBUG)
        SWISH_DEBUG_MSG("skipping tokenizer");

/* tokenizer set in the parse* function */
    a->tokenizer = NULL;

/* TODO get stemmer via config */
    a->stemmer = NULL;

/* TODO standalone regex lib */
    a->regex = NULL;

    a->stash = NULL;
    
    if (SWISH_DEBUG & SWISH_DEBUG_MEMORY) {
        /* %p with a void* is the matching conversion for a pointer */
        SWISH_DEBUG_MSG("analyzer ptr %p", (void *)a);
    }

    return a;
}

/* 
   IMPORTANT -- any struct members that require unique free()s should
   do that prior to calling this function.
   stemmer, for example, or regex

   Frees only the analyzer struct itself. Warns when ref_cnt is not 0
   and when stash, regex or stemmer is still non-NULL; those members are
   not released here.
*/

void
swish_analyzer_free(
    swish_Analyzer *a
)
{
    if (a->ref_cnt != 0) {
        SWISH_WARN("analyzer ref_cnt != 0: %d\n", a->ref_cnt);
    }
    if (SWISH_DEBUG & SWISH_DEBUG_MEMORY) {
        SWISH_DEBUG_MSG("free analyzer");
        swish_mem_debug();
    }
    
    /* %p with a void* is the matching conversion for a pointer */
    if (a->stash != NULL)
        SWISH_WARN("Analyzer->stash not freed %p", (void *)a->stash);
        
    if (a->regex != NULL)
        SWISH_WARN("Analyzer->regex not freed %p", (void *)a->regex);
        
    if (a->stemmer != NULL)
        SWISH_WARN("Analyzer->stemmer not freed");
        
    
    swish_xfree(a);
}


/*************** end analyzer.c ************/


/*************** start property.c ************/
/*
 * This file is part of libswish3
 * Copyright (C) 2008 Peter Karman
 *
 *  libswish3 is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  libswish3 is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with libswish3; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
*/

#ifndef LIBSWISH3_SINGLE_FILE
#include "libswish3.h"
#endif

extern int SWISH_DEBUG;

/*
 * Allocate a Property with default settings. The name pointer is stored
 * as given (no copy is made here). Defaults: id -1, string type,
 * case-insensitive, not verbatim, sortable and presorted, no alias,
 * max and sort_length 0, ref_cnt 0.
 */
swish_Property *
swish_property_init(
    xmlChar *name
)
{
    swish_Property *prop = swish_xmalloc(sizeof(swish_Property));

    /* identity */
    prop->ref_cnt      = 0;
    prop->id           = -1;
    prop->name         = name;
    prop->alias_for    = NULL;

    /* value handling */
    prop->type         = SWISH_PROP_STRING;
    prop->ignore_case  = SWISH_TRUE;
    prop->verbatim     = SWISH_FALSE;
    prop->max          = 0;

    /* sorting */
    prop->sort         = SWISH_TRUE;
    prop->presort      = SWISH_TRUE;
    prop->sort_length  = 0;

    return prop;
}

/*
 * Create a Property named after a copy of name, give it the next
 * property id from config->flags, and register it in both the
 * id-keyed hash (config->flags->prop_ids) and the name-keyed hash
 * (config->properties).
 */
void
swish_property_new(
    xmlChar *name,
    swish_Config *config
)
{
    swish_Property *prop;
    xmlChar *id_key;

    prop = swish_property_init(swish_xstrdup(name));
    prop->ref_cnt++;

    /* ids are handed out sequentially */
    prop->id = ++config->flags->max_prop_id;

    /* the id hash is keyed by the id rendered as a string */
    id_key = swish_int_to_string(prop->id);
    swish_hash_add(config->flags->prop_ids, id_key, prop);
    swish_hash_add(config->properties, name, prop);
    swish_xfree(id_key);
}

/*
 * Dump every field of a Property as one debug message.
 * name and alias_for may be NULL (alias_for is NULL by default); a
 * "(null)" placeholder is printed for them instead of handing a NULL
 * pointer to %s.
 */
void
swish_property_debug(
    swish_Property *p
)
{
    xmlChar *name = (p->name != NULL) ? p->name : BAD_CAST "(null)";
    xmlChar *alias_for = (p->alias_for != NULL) ? p->alias_for : BAD_CAST "(null)";

    SWISH_DEBUG_MSG("\n\
    p->ref_cnt       = %d\n\
    p->id            = %d\n\
    p->name          = %s\n\
    p->ignore_case   = %d\n\
    p->type          = %d\n\
    p->verbatim      = %d\n\
    p->alias_for     = %s\n\
    p->max           = %d\n\
    p->sort          = %d\n\
    p->presort       = %d\n\
    p->sort_length   = %d\n\
    ", p->ref_cnt, p->id, name, p->ignore_case, p->type, p->verbatim, 
       alias_for, p->max, p->sort, p->presort, p->sort_length);
}

/*
 * Free a Property along with its name and alias_for strings (when set).
 * Warns, but still frees, if ref_cnt is not 0.
 */
void
swish_property_free(
    swish_Property *p
)
{
    if (p->ref_cnt != 0)
        SWISH_WARN("Property ref_cnt != 0: %d", p->ref_cnt);

    if (p->name != NULL)
        swish_xfree(p->name);

    if (p->alias_for != NULL)
        swish_xfree(p->alias_for);

    swish_xfree(p);
}

/*
 * Map a built-in property name to its fixed id.
 * Returns -2 when propname is not one of the built-in names.
 */
int
swish_property_get_builtin_id(
    xmlChar *propname
)
{
    if (xmlStrEqual(propname, BAD_CAST SWISH_PROP_RANK))
        return SWISH_PROP_RANK_ID;

    if (xmlStrEqual(propname, BAD_CAST SWISH_PROP_DOCPATH))
        return SWISH_PROP_DOCPATH_ID;

    if (xmlStrEqual(propname, BAD_CAST SWISH_PROP_MTIME))
        return SWISH_PROP_MTIME_ID;

    if (xmlStrEqual(propname, BAD_CAST SWISH_PROP_SIZE))
        return SWISH_PROP_SIZE_ID;

    if (xmlStrEqual(propname, BAD_CAST SWISH_PROP_MIME))
        return SWISH_PROP_MIME_ID;

    if (xmlStrEqual(propname, BAD_CAST SWISH_PROP_PARSER))
        return SWISH_PROP_PARSER_ID;

    if (xmlStrEqual(propname, BAD_CAST SWISH_PROP_NWORDS))
        return SWISH_PROP_NWORDS_ID;

    /* not a built-in property */
    return -2;
}

/*
 * Resolve a property name to its id: built-in names first, then the
 * supplied properties hash. Croaks when the name is found in neither.
 */
int
swish_property_get_id(
    xmlChar *propname, 
    xmlHashTablePtr properties
)
{
    swish_Property *prop;
    int builtin_id;

    /* special cases: the built-in properties have fixed ids */
    builtin_id = swish_property_get_builtin_id(propname);
    if (builtin_id != -2)
        return builtin_id;

    /* otherwise the name must be defined in the config */
    if (!swish_hash_exists( properties, propname )) {
        SWISH_CROAK("No such PropertyName: %s", propname);
        return -2;
    }

    prop = (swish_Property*)swish_hash_fetch( properties, propname );
    return prop->id;
}


/*************** end property.c ************/


/*************** start metaname.c ************/
/*
 * This file is part of libswish3
 * Copyright (C) 2008 Peter Karman
 *
 *  libswish3 is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  libswish3 is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with libswish3; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
*/

#ifndef LIBSWISH3_SINGLE_FILE
#include "libswish3.h"
#endif

extern int SWISH_DEBUG;

/*
 * Allocate a MetaName with default settings. The name pointer is stored
 * as given (no copy is made here). Defaults: id -1, bias 0, no alias,
 * ref_cnt 0.
 */
swish_MetaName *
swish_metaname_init(
    xmlChar *name
)
{
    swish_MetaName *meta = swish_xmalloc(sizeof(swish_MetaName));

    meta->name = name;
    meta->alias_for = NULL;
    meta->id = -1;
    meta->bias = 0;
    meta->ref_cnt = 0;

    return meta;
}

/*
 * Create a MetaName named after a copy of name, give it the next
 * metaname id from config->flags, and register it in both the id-keyed
 * hash (config->flags->meta_ids) and the name-keyed hash
 * (config->metanames).
 */
void
swish_metaname_new(
    xmlChar *name,
    swish_Config *config
)
{
    swish_MetaName *meta;
    xmlChar *id_key;

    meta = swish_metaname_init(swish_xstrdup(name));
    meta->ref_cnt++;

    /* ids are handed out sequentially */
    meta->id = ++config->flags->max_meta_id;

    /* the id hash is keyed by the id rendered as a string */
    id_key = swish_int_to_string(meta->id);
    swish_hash_add(config->flags->meta_ids, id_key, meta);
    swish_hash_add(config->metanames, name, meta);
    swish_xfree(id_key);
}

/*
 * Dump a MetaName's address and every field as one debug message.
 * name and alias_for may be NULL (alias_for is NULL by default); a
 * "(null)" placeholder is printed for them instead of handing a NULL
 * pointer to %s.
 */
void
swish_metaname_debug(
    swish_MetaName *m
)
{
    xmlChar *name = (m->name != NULL) ? m->name : BAD_CAST "(null)";
    xmlChar *alias_for = (m->alias_for != NULL) ? m->alias_for : BAD_CAST "(null)";

    /* %p with a void* is the matching conversion for a pointer */
    SWISH_DEBUG_MSG("%p\n\
    m->ref_cnt      = %d\n\
    m->id           = %d\n\
    m->name         = %s\n\
    m->bias         = %d\n\
    m->alias_for    = %s\n\
    ", (void *)m, m->ref_cnt, m->id, name, m->bias, alias_for);
}

/*
 * Free a MetaName along with its name and alias_for strings (when set).
 * Warns, but still frees, if ref_cnt is not 0.
 */
void
swish_metaname_free(
    swish_MetaName *m
)
{
    if (m->ref_cnt != 0)
        SWISH_WARN("MetaName ref_cnt != 0: %d", m->ref_cnt);

    if (m->name != NULL)
        swish_xfree(m->name);

    if (m->alias_for != NULL)
        swish_xfree(m->alias_for);

    swish_xfree(m);
}


/*************** end metaname.c ************/


/*************** start header.c ************/
/*
 * This file is part of libswish3
 * Copyright (C) 2008 Peter Karman
 *
 *  libswish3 is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  libswish3 is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with libswish3; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
*/

/* read/write the swish.xml header file */

#ifndef LIBSWISH3_SINGLE_FILE
#include <libxml/xmlreader.h>
#include <libxml/xmlwriter.h>
#include <libxml/encoding.h>
#include <libxml/uri.h>
#include <ctype.h>
#include "libswish3.h"
#endif

extern int SWISH_DEBUG;

/* local struct to ease passing around flags/state.
 * A pointer to it is taken by the read_* helpers, process_node() and
 * handle_special_misc_flags() declared below.
 */
typedef struct
{
    /* the is* members are presumably "currently inside this section"
     * markers used while reading the header -- TODO confirm */
    boolean isprops;
    boolean ismetas;
    boolean isindex;
    boolean isparser;
    boolean isalias;
    boolean ismime;
    const xmlChar *parent_name;
    xmlChar *conf_file;
    swish_Config *config;       /* config whose misc hash and flags are read/updated */
    boolean is_valid;
    unsigned int prop_id;
    unsigned int meta_id;
} headmaker;

/* Bundle of three untyped pointers; write_mime() takes one as its
 * second argument. What each slot carries is up to the code that fills
 * it in.
 */
typedef struct
{
    void *thing1;
    void *thing2;
    void *thing3;
} temp_things;

static void read_metaname_aliases(
    xmlChar *str,
    headmaker * h,
    swish_MetaName *meta
);
static void read_metaname_attr(
    const xmlChar *attr,
    const xmlChar *attr_val,
    swish_MetaName *meta,
    headmaker * h
);
static void read_metaname(
    xmlTextReaderPtr reader,
    headmaker * h
);
static void read_property_aliases(
    xmlChar *str,
    headmaker * h,
    swish_Property *prop
);
static void read_property_attr(
    const xmlChar *attr,
    const xmlChar *attr_val,
    swish_Property *prop,
    headmaker * h
);
static void read_property(
    xmlTextReaderPtr reader,
    headmaker * h
);
/* per-node dispatcher called by read_header() for every node read */
static void process_node(
    xmlTextReaderPtr reader,
    headmaker * h
);
/* readers that store the text of the current element in a hash */
static void read_key_values_pair(
    xmlTextReaderPtr reader,
    xmlHashTablePtr hash,
    xmlChar *name
);
static void read_key_value_pair(
    xmlTextReaderPtr reader,
    xmlHashTablePtr hash,
    xmlChar *name
);
static void
read_key_value_stringlist(
    xmlTextReaderPtr reader,
    xmlHashTablePtr hash,
    xmlChar *name
);
/* parses a header file (or in-memory XML string) into h->config */
static void read_header(
    char *filename,
    headmaker * h
);
/* xmlHashScan callbacks used by swish_config_test_alias_fors() */
static void test_meta_alias_for(
    swish_MetaName *meta,
    swish_Config *c,
    xmlChar *name
);
static void test_prop_alias_for(
    swish_Property *prop,
    swish_Config *c,
    xmlChar *name
);
/* headmaker (reader state) setup and section-flag reset */
static headmaker *init_headmaker(
);
static void
reset_headmaker(
    headmaker *h
);
/* xmlTextWriter helpers */
static void write_open_tag(
    xmlTextWriterPtr writer,
    xmlChar *tag
);
static void write_close_tag(
    xmlTextWriterPtr writer
);
static void write_element_with_content(
    xmlTextWriterPtr writer,
    xmlChar *tag,
    xmlChar *content
);
static void write_metaname(
    swish_MetaName *meta,
    xmlTextWriterPtr writer,
    xmlChar *name
);
static void write_metanames(
    xmlTextWriterPtr writer,
    xmlHashTablePtr metanames
);
static void write_hash_entry(
    xmlChar *value,
    xmlTextWriterPtr writer,
    xmlChar *key
);
static void write_property(
    swish_Property *prop,
    xmlTextWriterPtr writer,
    xmlChar *name
);
static void write_properties(
    xmlTextWriterPtr writer,
    xmlHashTablePtr properties
);
static void write_parser(
    xmlChar *val,
    xmlTextWriterPtr writer,
    xmlChar *key
);
static void write_parsers(
    xmlTextWriterPtr writer,
    xmlHashTablePtr parsers
);
static void write_mime(
    xmlChar *type,
    temp_things *things,
    xmlChar *ext
);
static void write_mimes(
    xmlTextWriterPtr writer,
    xmlHashTablePtr mimes
);
static void write_index(
    xmlTextWriterPtr writer,
    xmlHashTablePtr index
);
static void write_tag_aliases(
    xmlTextWriterPtr writer,
    xmlHashTablePtr tag_aliases
);
static void write_misc(
    xmlTextWriterPtr writer,
    xmlHashTablePtr hash
);
/* copies selected string values from config->misc into config->flags */
static void handle_special_misc_flags(
    headmaker *h
);

/*
 * Mirror selected string entries of h->config->misc into the typed fields
 * of h->config->flags.
 *
 * A key that is absent from the misc hash leaves its flag untouched.
 * The three boolean keys are converted with swish_string_to_boolean().
 * The two "undefined" policy keys must match one of a fixed set of words;
 * any other value is reported through SWISH_CROAK.
 */
static void
handle_special_misc_flags(
    headmaker *h
)
{
    /* accepted words for SWISH_UNDEFINED_METATAGS, tried in this order */
    const struct {
        const char *word;
        int setting;
    } meta_modes[] = {
        { "error",   SWISH_UNDEF_METAS_ERROR },
        { "ignore",  SWISH_UNDEF_METAS_IGNORE },
        { "index",   SWISH_UNDEF_METAS_INDEX },
        { "auto",    SWISH_UNDEF_METAS_AUTO },
        { "autoall", SWISH_UNDEF_METAS_AUTOALL }
    };
    /* accepted words for SWISH_UNDEFINED_XML_ATTRIBUTES, tried in this order */
    const struct {
        const char *word;
        int setting;
    } attr_modes[] = {
        { "error",   SWISH_UNDEF_ATTRS_ERROR },
        { "ignore",  SWISH_UNDEF_ATTRS_IGNORE },
        { "index",   SWISH_UNDEF_ATTRS_INDEX },
        { "auto",    SWISH_UNDEF_ATTRS_AUTO },
        { "autoall", SWISH_UNDEF_ATTRS_AUTOALL },
        { "disable", SWISH_UNDEF_ATTRS_DISABLE }
    };
    xmlHashTablePtr misc = h->config->misc;
    xmlChar *v;
    size_t i, count;

    /* simple on/off switches */
    if (swish_hash_exists(misc, BAD_CAST SWISH_TOKENIZE)) {
        v = swish_hash_fetch(misc, BAD_CAST SWISH_TOKENIZE);
        h->config->flags->tokenize = swish_string_to_boolean((char *)v);
    }
    if (swish_hash_exists(misc, BAD_CAST SWISH_CASCADE_META_CONTEXT)) {
        v = swish_hash_fetch(misc, BAD_CAST SWISH_CASCADE_META_CONTEXT);
        h->config->flags->cascade_meta_context = swish_string_to_boolean((char *)v);
    }
    if (swish_hash_exists(misc, BAD_CAST SWISH_IGNORE_XMLNS)) {
        v = swish_hash_fetch(misc, BAD_CAST SWISH_IGNORE_XMLNS);
        h->config->flags->ignore_xmlns = swish_string_to_boolean((char *)v);
    }

    /* policy for undefined metatags */
    if (swish_hash_exists(misc, BAD_CAST SWISH_UNDEFINED_METATAGS)) {
        v = swish_hash_fetch(misc, BAD_CAST SWISH_UNDEFINED_METATAGS);
        count = sizeof(meta_modes) / sizeof(meta_modes[0]);
        for (i = 0; i < count; i++) {
            if (xmlStrEqual(v, BAD_CAST meta_modes[i].word)) {
                h->config->flags->undef_metas = meta_modes[i].setting;
                break;
            }
        }
        if (i == count) {
            SWISH_CROAK("Unknown value for %s: %s", SWISH_UNDEFINED_METATAGS, v);
        }
    }

    /* policy for undefined XML attributes */
    if (swish_hash_exists(misc, BAD_CAST SWISH_UNDEFINED_XML_ATTRIBUTES)) {
        v = swish_hash_fetch(misc, BAD_CAST SWISH_UNDEFINED_XML_ATTRIBUTES);
        count = sizeof(attr_modes) / sizeof(attr_modes[0]);
        for (i = 0; i < count; i++) {
            if (xmlStrEqual(v, BAD_CAST attr_modes[i].word)) {
                h->config->flags->undef_attrs = attr_modes[i].setting;
                break;
            }
        }
        if (i == count) {
            SWISH_CROAK("Unknown value for %s: %s", SWISH_UNDEFINED_XML_ATTRIBUTES, v);
        }
    }
}

/*
 * Register every word that swish_stringlist_build() extracts from str as an
 * alias of meta.
 *
 * For each word that is not already a key in h->config->metanames, the
 * lowercased word is looked up: an existing MetaName is reused, otherwise a
 * new one is created (next id from h->meta_id, bias copied from meta) and
 * added to the hash. In both cases alias_for is set to a copy of meta->name.
 * A word that is already a key in the hash is reported through SWISH_CROAK.
 *
 * str:  alias list text; ownership stays with the caller.
 * h:    reader state holding the config being built and the id counter.
 * meta: the MetaName the aliases point at.
 */
static void
read_metaname_aliases(
    xmlChar *str,
    headmaker * h,
    swish_MetaName *meta
)
{
    swish_StringList *strlist;
    int i;

    strlist = swish_stringlist_build(str);

    for (i = 0; i < strlist->n; i++) {

        if (!swish_hash_exists(h->config->metanames, strlist->word[i])) {
            swish_MetaName *newmeta;
            xmlChar *newname;
            newname = swish_str_tolower(strlist->word[i]);

            /* is this an existing metaname? pull it from hash and update */
            if (swish_hash_exists(h->config->metanames, newname)) {
                newmeta = swish_hash_fetch(h->config->metanames, newname);

                /* newname was only a lookup key here; nothing keeps it */
                swish_xfree(newname);

                /* release the old target before it is overwritten below */
                if (newmeta->alias_for != NULL) {
                    swish_xfree(newmeta->alias_for);
                }
            }
            /* else new metaname; newname is handed to the new MetaName */
            else {
                newmeta = swish_metaname_init(newname);
                newmeta->ref_cnt++;
                newmeta->id = h->meta_id++;
                newmeta->bias = meta->bias;
                swish_hash_add(h->config->metanames, newmeta->name, newmeta);
            }

            newmeta->alias_for = swish_xstrdup(meta->name);

            /* swish_metaname_debug(newmeta); */
        }
        else {
            SWISH_CROAK
                ("Cannot alias MetaName %s to %s because %s is already a real MetaName",
                 strlist->word[i], meta->name, strlist->word[i]);
        }

    }

    swish_stringlist_free(strlist);
}

/*
 * Apply one XML attribute (attr = attr_val) to meta.
 *
 * Recognized attributes:
 *   bias      - parsed with swish_string_to_int() into meta->bias
 *   alias_for - lowercased copy stored in meta->alias_for
 *   id        - parsed into meta->id and recorded in the
 *               h->config->flags->meta_ids hash; an id that is already
 *               present in that hash is reported through SWISH_CROAK
 * Any other attribute name is reported through SWISH_CROAK.
 */
static void
read_metaname_attr(
    const xmlChar *attr,
    const xmlChar *attr_val,
    swish_MetaName *meta,
    headmaker * h
)
{
    swish_MetaName *owner;

    if (xmlStrEqual(attr, BAD_CAST "bias")) {
        meta->bias = swish_string_to_int((char *)attr_val);
        return;
    }

    if (xmlStrEqual(attr, BAD_CAST "alias_for")) {
        meta->alias_for = swish_str_tolower(BAD_CAST attr_val);
        return;
    }

    if (!xmlStrEqual(attr, BAD_CAST "id")) {
        SWISH_CROAK("Unknown MetaName attribute: %s", attr);
        return;
    }

    /* an explicit id must not collide with one seen earlier */
    if (swish_hash_exists(h->config->flags->meta_ids, (xmlChar *)attr_val)) {
        owner = swish_hash_fetch(h->config->flags->meta_ids, (xmlChar *)attr_val);
        SWISH_CROAK("duplicate id %s on MetaName %s (already assigned to %s)",
            attr_val, meta->name, owner->name);
    }
    meta->id = swish_string_to_int((char *)attr_val);

    /* remember which MetaName holds this id */
    swish_hash_add(h->config->flags->meta_ids, (xmlChar *)attr_val, meta);
}

/*
 * Handle one node inside the MetaNames section.
 *
 * Text node: the text is a list of aliases for the element recorded in
 * h->parent_name; each alias is registered via read_metaname_aliases() and
 * the temporary MetaName built here is freed again. Text with no recorded
 * parent element is reported through SWISH_CROAK.
 *
 * Any other node: a MetaName named after the (lowercased) node name is
 * created, its attributes are applied with read_metaname_attr(), it gets the
 * next free id if none was given, and it is added to h->config->metanames
 * unless that name is already present (warning only). h->parent_name is then
 * set to the node name so a following text node can refer back to it.
 */
static void
read_metaname(
    xmlTextReaderPtr reader,
    headmaker * h
)
{
    const xmlChar *nodename;
    swish_MetaName *meta;
    xmlChar *aliases;
    
    nodename = xmlTextReaderConstName(reader);

    meta = swish_metaname_init(swish_str_tolower((xmlChar *)nodename));
    meta->ref_cnt++;

    if (xmlTextReaderHasValue(reader)
        && xmlTextReaderNodeType(reader) == XML_READER_TYPE_TEXT) 
    {
        if (!h->parent_name) {
            /* the Const accessor avoids allocating a copy just for the message */
            SWISH_CROAK("Illegal text in MetaNames section: '%s'",
                xmlTextReaderConstValue(reader));
        }   
        
        swish_xfree(meta->name);
        meta->name = swish_str_tolower((xmlChar *)h->parent_name);

        /* xmlTextReaderValue() returns a copy the caller must xmlFree() */
        aliases = xmlTextReaderValue(reader);
        if (aliases == NULL) {
            SWISH_CROAK("Unable to read alias list for MetaName %s", meta->name);
        }
        read_metaname_aliases(aliases, h, meta);
        xmlFree(aliases);

        meta->ref_cnt--;
        swish_metaname_free(meta);
        return;
    }

    if (xmlTextReaderHasAttributes(reader)) {

        /* an xmlns declaration in first position is skipped */
        xmlTextReaderMoveToFirstAttribute(reader);
        if (xmlStrEqual(xmlTextReaderConstPrefix(reader),(xmlChar*)"xmlns")) {
            if (xmlTextReaderMoveToNextAttribute(reader) == 1) {
                read_metaname_attr(xmlTextReaderConstName(reader),
                           xmlTextReaderConstValue(reader), meta, h);
            }
        }
        else {
            read_metaname_attr(xmlTextReaderConstName(reader),
                           xmlTextReaderConstValue(reader), meta, h);
        }
        
        /* remaining attributes, stopping at the next xmlns declaration */
        while (
            xmlTextReaderMoveToNextAttribute(reader) == 1
            &&
            !xmlStrEqual(xmlTextReaderConstPrefix(reader),(xmlChar*)"xmlns")
        ) {
            read_metaname_attr(xmlTextReaderConstName(reader),
                               xmlTextReaderConstValue(reader), meta, h);
        }

    }

/*  must have an id */
    if (meta->id == -1) {
        meta->id = h->meta_id++;
        h->config->flags->max_meta_id = h->meta_id;
    }

    if (!swish_hash_exists(h->config->metanames, meta->name)) {
        swish_hash_add(h->config->metanames, meta->name, meta);
    }
    else {
        SWISH_WARN("MetaName %s is already defined", meta->name);
        // TODO could be alias. how to check?
    }

    // swish_metaname_debug(meta);
    
    h->parent_name = nodename;

}

/*
 * Register every word that swish_stringlist_build() extracts from str as an
 * alias of prop.
 *
 * Each word that is not yet a key in h->config->properties becomes a new
 * Property (named with the lowercased word) whose alias_for is a copy of
 * prop->name, whose id is the next value of h->prop_id, and whose
 * ignore_case, type, verbatim, max and sort are copied from prop.
 * A word that is already a key in the hash is reported through SWISH_CROAK.
 */
static void
read_property_aliases(
    xmlChar *str,
    headmaker * h,
    swish_Property *prop
)
{
    swish_StringList *aliases = swish_stringlist_build(str);
    int n;

    for (n = 0; n < aliases->n; n++) {
        swish_Property *alias;

        if (swish_hash_exists(h->config->properties, aliases->word[n])) {
            SWISH_CROAK
                ("Cannot alias Property %s to %s because %s is already a real Property",
                 aliases->word[n], prop->name, aliases->word[n]);
            continue;
        }

        alias = swish_property_init(swish_str_tolower(aliases->word[n]));
        alias->ref_cnt++;
        alias->alias_for = swish_xstrdup(prop->name);
        alias->id = h->prop_id++;

        /* the alias takes over these settings from prop */
        alias->ignore_case = prop->ignore_case;
        alias->type = prop->type;
        alias->verbatim = prop->verbatim;
        alias->max = prop->max;
        alias->sort = prop->sort;

        swish_hash_add(h->config->properties, alias->name, alias);
    }

    swish_stringlist_free(aliases);
}

/*
 * Apply one XML attribute (attr = attr_val) to prop.
 *
 * Recognized attributes:
 *   ignore_case, verbatim, sort, presort - parsed as booleans
 *   max, sort_length                     - parsed as integers
 *   id        - parsed as an integer and recorded in the
 *               h->config->flags->prop_ids hash; an id already present in
 *               that hash is reported through SWISH_CROAK
 *   type      - "int", "date", "string"/"text", or a value starting with a
 *               digit (parsed as an integer); anything else is reported
 *               through SWISH_CROAK
 *   alias_for - lowercased copy stored in prop->alias_for
 * Any other attribute name is reported through SWISH_CROAK.
 */
static void
read_property_attr(
    const xmlChar *attr,
    const xmlChar *attr_val,
    swish_Property *prop,
    headmaker * h
)
{
    swish_Property *dupe;
    
    if (xmlStrEqual(attr, (xmlChar *)"ignore_case")) {
        prop->ignore_case = swish_string_to_boolean((char *)attr_val);
    }
    else if (xmlStrEqual(attr, (xmlChar *)"max")) {
        prop->max = swish_string_to_int((char *)attr_val);
    }
    else if (xmlStrEqual(attr, (xmlChar *)"verbatim")) {
        prop->verbatim = swish_string_to_boolean((char *)attr_val);
    }
    else if (xmlStrEqual(attr, (xmlChar *)"sort")) {
        prop->sort = swish_string_to_boolean((char *)attr_val);
    }
    else if (xmlStrEqual(attr, (xmlChar *)"presort")) {
        prop->presort = swish_string_to_boolean((char *)attr_val);
    }
    else if (xmlStrEqual(attr, (xmlChar *)"sort_length")) {
        prop->sort_length = swish_string_to_int((char *)attr_val);
    }
    else if (xmlStrEqual(attr, (xmlChar *)"id")) {
        /* make sure id is not already assigned */
        if (swish_hash_exists(h->config->flags->prop_ids, (xmlChar*)attr_val)) {
            dupe = swish_hash_fetch(h->config->flags->prop_ids, (xmlChar*)attr_val);
            SWISH_CROAK("duplicate id %s on Property %s (already assigned to %s)",
                attr_val, prop->name, dupe->name);
        }
        prop->id = swish_string_to_int((char*)attr_val);
        /* cache for id lookup */
        swish_hash_add(h->config->flags->prop_ids, (xmlChar*)attr_val, prop);
    }
    else if (xmlStrEqual(attr, (xmlChar *)"type")) {
        if (xmlStrEqual(attr_val, (xmlChar *)"int")) {
            prop->type = SWISH_PROP_INT;
        }
        else if (xmlStrEqual(attr_val, (xmlChar *)"date")) {
            prop->type = SWISH_PROP_DATE;
        }
        else if (xmlStrEqual(attr_val, (xmlChar*)"string")
                ||
                 xmlStrEqual(attr_val, (xmlChar*)"text")
        ) {
            prop->type = SWISH_PROP_STRING;
        }
        /* a numeric type code is taken as-is */
        else if (isdigit(attr_val[0])) {
            prop->type = swish_string_to_int((char*)attr_val);
        }
        else {
            SWISH_CROAK("Invalid value for PropertyName '%s' type: %s",
                prop->name, attr_val);
        }
    }
    else if (xmlStrEqual(attr, (xmlChar *)"alias_for")) {
        prop->alias_for = swish_str_tolower(BAD_CAST attr_val);
    }
    else {
        SWISH_CROAK("unknown Property attribute: %s", attr);
    }

}

/*
 * Handle one node inside the PropertyNames section.
 *
 * Text node: the text is a list of aliases for the element recorded in
 * h->parent_name; each alias is registered via read_property_aliases() and
 * the temporary Property built here is freed again. Text with no recorded
 * parent element is reported through SWISH_CROAK.
 *
 * Any other node: a Property named after the (lowercased) node name is
 * created, its attributes are applied with read_property_attr(), it gets the
 * next free id if none was given, and it is added to h->config->properties.
 * A name that is already present is reported through SWISH_CROAK.
 * h->parent_name is then set to the node name so a following text node can
 * refer back to it.
 */
static void
read_property(
    xmlTextReaderPtr reader,
    headmaker * h
)
{
    const xmlChar *nodename;
    swish_Property *prop;
    xmlChar *aliases;

    nodename = xmlTextReaderConstName(reader);
    prop = swish_property_init(swish_str_tolower((xmlChar *)nodename));
    prop->ref_cnt++;

    if (xmlTextReaderHasValue(reader)
        && xmlTextReaderNodeType(reader) == XML_READER_TYPE_TEXT) {
        
        if (!h->parent_name) {
            /* the Const accessor avoids allocating a copy just for the message */
            SWISH_CROAK("Illegal text in PropertyNames section: '%s'",
                xmlTextReaderConstValue(reader));
        }
        
        swish_xfree(prop->name);
        prop->name = swish_str_tolower((xmlChar *)h->parent_name);

        /* xmlTextReaderValue() returns a copy the caller must xmlFree() */
        aliases = xmlTextReaderValue(reader);
        if (aliases == NULL) {
            SWISH_CROAK("Unable to read alias list for Property %s", prop->name);
        }
        read_property_aliases(aliases, h, prop);
        xmlFree(aliases);

        prop->ref_cnt--;
        swish_property_free(prop);
        return;
    }

    if (xmlTextReaderHasAttributes(reader)) {

        /* an xmlns declaration in first position is skipped */
        xmlTextReaderMoveToFirstAttribute(reader);
        if (xmlStrEqual(xmlTextReaderConstPrefix(reader),(xmlChar*)"xmlns")) {
            if (xmlTextReaderMoveToNextAttribute(reader) == 1) {
                read_property_attr(xmlTextReaderConstName(reader),
                           xmlTextReaderConstValue(reader), prop, h);
            }
        }
        else {
            read_property_attr(xmlTextReaderConstName(reader),
                           xmlTextReaderConstValue(reader), prop, h);
        }

        /* remaining attributes, stopping at the next xmlns declaration */
        while (
            xmlTextReaderMoveToNextAttribute(reader) == 1
            &&
            !xmlStrEqual(xmlTextReaderConstPrefix(reader),(xmlChar*)"xmlns")
        ) {
            read_property_attr(xmlTextReaderConstName(reader),
                               xmlTextReaderConstValue(reader), prop, h);
        }

    }

    /* must have an id */
    if (prop->id == -1) {
        prop->id = h->prop_id++;
        h->config->flags->max_prop_id = h->prop_id;
    }

    if (!swish_hash_exists(h->config->properties, prop->name)) {
        swish_hash_add(h->config->properties, prop->name, prop);
    }
    else {
/* swish_config_debug( h->config ); */
        SWISH_CROAK("Property %s is already defined", prop->name);
    }

/* swish_property_debug(prop); */

    h->parent_name = nodename;

}

/*
 * Dispatch the node the reader is currently positioned on.
 *
 * Order of handling:
 *   1. comments are ignored;
 *   2. the SWISH_HEADER_ROOT element marks the header as valid; any other
 *      node seen before it is reported through SWISH_CROAK;
 *   3. whitespace-only "#text" nodes are ignored;
 *   4. an end tag of one of the section elements clears the section flags;
 *      every other end tag is ignored;
 *   5. SWISH_INCLUDE_FILE merges the named config file immediately (a name
 *      not starting with SWISH_PATH_SEP is resolved against h->conf_file);
 *   6. a start tag of a section element clears the flags and sets the one
 *      for that section;
 *   7. anything else is handed to the reader for the active section, or,
 *      outside any section, stored as a string list
 *      (SWISH_CLASS_ATTRIBUTES) or as a misc key/value pair.
 */
static void
process_node(
    xmlTextReaderPtr reader,
    headmaker * h
)
{
    const xmlChar *name, *value;
    xmlChar *conf_file, *path, *xuri;
    int type;

    type = xmlTextReaderNodeType(reader);
    name = xmlTextReaderConstLocalName(reader);
    value = xmlTextReaderConstValue(reader);

    /* substitute the placeholder before name is formatted or compared */
    if (name == NULL)
        name = BAD_CAST "--";

    /* value is NULL for nodes without text; never hand NULL to %s */
    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG)
        SWISH_DEBUG_MSG("name %s  type %d  value %s", name, type,
                        value != NULL ? value : BAD_CAST "(null)");

    if (type == XML_READER_TYPE_COMMENT)
        return;

    if (xmlStrEqual(name, (const xmlChar *)SWISH_HEADER_ROOT)) {
        h->is_valid = 1;
        return;
    }
    if (!h->is_valid) {
        SWISH_CROAK("invalid header file");
    }

    if (swish_str_all_ws((xmlChar *)value)
        && xmlStrEqual(name, (xmlChar *)"#text")) {
        return;
    }

    if (type == XML_READER_TYPE_END_ELEMENT) {
        /* leaving a section: forget which section we were in */
        if (xmlStrEqual(name, (const xmlChar *)SWISH_PROP)
            || xmlStrEqual(name, (const xmlChar *)SWISH_META)
            || xmlStrEqual(name, (const xmlChar *)SWISH_INDEX)
            || xmlStrEqual(name, (const xmlChar *)SWISH_PARSERS)
            || xmlStrEqual(name, (const xmlChar *)SWISH_MIME)
            || xmlStrEqual(name, (const xmlChar *)SWISH_ALIAS)) {
            reset_headmaker(h);
        }
        return;
    }

    /* the special include directive means we stop and process
     * that config file immediately instead of storing the value
     * in the hash.
     */
    if (xmlStrEqual(name, BAD_CAST SWISH_INCLUDE_FILE)) {
        if (xmlTextReaderRead(reader) == 1) {
            if (xmlTextReaderNodeType(reader) == XML_READER_TYPE_TEXT) {
                value = xmlTextReaderConstValue(reader);
                conf_file = swish_xstrdup(value);
                if (conf_file[0] != SWISH_PATH_SEP) {
                    path = swish_fs_get_path(h->conf_file);
                    if (path == NULL) {
                        SWISH_CROAK("Unable to resolve config file path %s relative to %s", 
                            conf_file, h->conf_file);
                    }
                    xuri = xmlBuildURI(conf_file, path);
                    if (xuri == NULL) {
                        SWISH_CROAK("Unable to build URI for %s and %s", conf_file, path);
                    }
                    swish_xfree(conf_file);
                    conf_file = swish_xstrdup(xuri);
                    xmlFree(xuri); /* because we did not malloc it */
                    xmlFree(path); /* because we did not malloc it */
                }
                swish_header_merge((char*)conf_file, h->config);
                swish_xfree(conf_file);
                return;
            }
        }
        SWISH_CROAK("Invalid value for %s", name);
    }
    else if (xmlStrEqual(name, (const xmlChar *)SWISH_PROP)) {
        reset_headmaker(h);
        h->isprops = 1;
        return;
    }
    else if (xmlStrEqual(name, (const xmlChar *)SWISH_META)) {
        reset_headmaker(h);
        h->ismetas = 1;
        return;
    }
    else if (xmlStrEqual(name, (const xmlChar *)SWISH_INDEX)) {
        reset_headmaker(h);
        h->isindex = 1;
        return;
    }
    else if (xmlStrEqual(name, (const xmlChar *)SWISH_PARSERS)) {
        reset_headmaker(h);
        h->isparser = 1;
        return;
    }
    else if (xmlStrEqual(name, (const xmlChar *)SWISH_MIME)) {
        reset_headmaker(h);
        h->ismime = 1;
        return;
    }
    else if (xmlStrEqual(name, (const xmlChar *)SWISH_ALIAS)) {
        reset_headmaker(h);
        h->isalias = 1;
        return;
    }

    /* not a section boundary: route by the section we are inside */
    if (h->isprops) {
        read_property(reader, h);
    }
    else if (h->ismetas) {
        read_metaname(reader, h);
    }
    else if (h->isindex) {
        read_key_value_pair(reader, h->config->index, (xmlChar *)name);
    }
    else if (h->isparser) {
        read_key_values_pair(reader, h->config->parsers, (xmlChar *)name);
    }
    else if (h->ismime) {
        read_key_value_pair(reader, h->config->mimes, (xmlChar *)name);
    }
    else if (h->isalias) {
        read_key_values_pair(reader, h->config->tag_aliases, (xmlChar *)name);
    }
    else if (xmlStrEqual((xmlChar *)SWISH_CLASS_ATTRIBUTES, (xmlChar *)name)) {
        read_key_value_stringlist(reader, h->config->stringlists, (xmlChar *)name);
    }
    else if (type == XML_READER_TYPE_ELEMENT) {
        read_key_value_pair(reader, h->config->misc, (xmlChar *)name);
        /* a misc entry may carry one of the typed flags; resync them */
        handle_special_misc_flags(h);
    }

}

/*
 * Advance the reader to the text of the current element, lowercase it,
 * build a string list from it and store the list in hash under name.
 *
 * If name is already a key, the new list and the stored list are passed to
 * swish_stringlist_merge(); otherwise the new list is added to the hash.
 * A failed read or a non-text node is reported through SWISH_CROAK.
 */
static void
read_key_value_stringlist(
    xmlTextReaderPtr reader,
    xmlHashTablePtr hash,
    xmlChar *name
)
{
    swish_StringList *words;
    xmlChar *lowered;

    if (xmlTextReaderRead(reader) != 1) {
        SWISH_CROAK("Error reading value for top-level XML element %s", name);
        return;
    }
    if (xmlTextReaderNodeType(reader) != XML_READER_TYPE_TEXT) {
        SWISH_CROAK("Top-level XML element missing value: %s", name);
        return;
    }

    lowered = swish_str_tolower((xmlChar *)xmlTextReaderConstValue(reader));
    words = swish_stringlist_build(lowered);

    if (!swish_hash_exists(hash, name)) {
        swish_hash_add(hash, name, words);
    }
    else {
        swish_stringlist_merge(words, swish_hash_fetch(hash, name));
    }

    swish_xfree(lowered);
}

/*
 * Advance the reader to the text of the current element and map every word
 * of that (lowercased) text to a copy of name in hash.
 *
 * Note the direction: the words of the text are the keys and the element
 * name is the value. A word that is already a key has its value replaced.
 * A failed read or a non-text node is reported through SWISH_CROAK.
 */
static void
read_key_values_pair(
    xmlTextReaderPtr reader,
    xmlHashTablePtr hash,
    xmlChar *name
)
{
    swish_StringList *words;
    xmlChar *lowered;
    int n;

    if (xmlTextReaderRead(reader) != 1) {
        SWISH_CROAK("Error reading value for top-level XML element %s", name);
        return;
    }
    if (xmlTextReaderNodeType(reader) != XML_READER_TYPE_TEXT) {
        SWISH_CROAK("Top-level XML element missing value: %s", name);
        return;
    }

    lowered = swish_str_tolower((xmlChar *)xmlTextReaderConstValue(reader));
    words = swish_stringlist_build(lowered);

    for (n = 0; n < words->n; n++) {
        if (!swish_hash_exists(hash, words->word[n])) {
            swish_hash_add(hash, words->word[n], swish_xstrdup(name));
        }
        else {
            swish_hash_replace(hash, words->word[n], swish_xstrdup(name));
        }
    }

    swish_stringlist_free(words);
    swish_xfree(lowered);
}

/*
 * Advance the reader to the text of the current element and store a copy of
 * that text in hash under name, replacing the value if name is already a
 * key. A failed read or a non-text node is reported through SWISH_CROAK.
 */
static void
read_key_value_pair(
    xmlTextReaderPtr reader,
    xmlHashTablePtr hash,
    xmlChar *name
)
{
    const xmlChar *text;

    if (xmlTextReaderRead(reader) != 1) {
        SWISH_CROAK("Error reading value for top-level XML element %s", name);
        return;
    }
    if (xmlTextReaderNodeType(reader) != XML_READER_TYPE_TEXT) {
        SWISH_CROAK("Top-level XML element missing value: %s", name);
        return;
    }

    text = xmlTextReaderConstValue(reader);

    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("read key %s for value %s", name, text);
    }

    if (!swish_hash_exists(hash, name)) {
        if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
            SWISH_DEBUG_MSG("adding %s => %s to hash", name, text);
        }
        swish_hash_add(hash, name, swish_xstrdup(text));
        return;
    }

    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("replacing %s => %s in hash", name, text);
    }
    swish_hash_replace(hash, name, swish_xstrdup(text));
}

/*
 * Parse a header/config document into h->config.
 *
 * filename is treated as a path if swish_fs_file_exists() says the file
 * exists; otherwise the string itself is parsed as an in-memory XML
 * document. Each node is handed to process_node(). h->conf_file holds the
 * source name for the duration of the parse and is freed and reset to NULL
 * afterwards. A reader that cannot be created, or a parse that ends with an
 * error, is reported through SWISH_CROAK.
 *
 * process_node() can re-enter this function through an include directive
 * (swish_header_merge). The libxml2 global cleanup is therefore deferred
 * until the outermost call finishes, so it never runs while an enclosing
 * reader is still in use. The nesting counter is a plain static and is not
 * protected against concurrent callers.
 */
static void
read_header(
    char *filename,
    headmaker * h
)
{
    static int nested_readers = 0;
    xmlTextReaderPtr reader;
    int ret;

/* parse either a filename, or, if we can't stat it,
 * assume conf is a XML string.
 */
    if (!swish_fs_file_exists((xmlChar*)filename)) {
        reader =
            xmlReaderForMemory((const char *)filename, xmlStrlen((xmlChar *)filename),
                               "[ swish.xml ]", NULL, 0);

        h->conf_file = swish_xstrdup(BAD_CAST "in-memory");
        
        if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
            SWISH_DEBUG_MSG("header parsed in-memory");
        }
    }
    else {
        reader = xmlReaderForFile(filename, NULL, 0);
        h->conf_file = swish_xstrdup(BAD_CAST filename);

        if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
            SWISH_DEBUG_MSG("header parsed from file");
        }
    }

    if (reader != NULL) {
        nested_readers++;
        ret = xmlTextReaderRead(reader);
        while (ret == 1) {
            process_node(reader, h);
            ret = xmlTextReaderRead(reader);
        }
        nested_readers--;
        xmlFreeTextReader(reader);
        if (ret != 0) {
            SWISH_CROAK("%s : failed to parse\n", filename);
        }
        swish_xfree(h->conf_file);
        h->conf_file = NULL;
    }
    else {
        SWISH_CROAK("Unable to open %s\n", filename);
    }

/*
 * Cleanup function for the XML library. Only safe once no reader created
 * by an enclosing read_header() call is still alive.
 */
    if (nested_readers == 0) {
        xmlCleanupParser();
    }
}

/*
 * xmlHashScan callback (see swish_config_test_alias_fors): reports through
 * SWISH_CROAK when meta has an alias_for value that is not a key in
 * c->metanames. name is the hash key of meta.
 */
static void
test_meta_alias_for(
    swish_MetaName *meta,
    swish_Config *c,
    xmlChar *name
)
{
    /* not an alias: nothing to verify */
    if (meta->alias_for == NULL) {
        return;
    }

    if (swish_hash_exists(c->metanames, meta->alias_for)) {
        return;
    }

    SWISH_CROAK
        ("MetaName '%s' has alias_for value of '%s' but no such MetaName defined",
         name, meta->alias_for);
}

/*
 * xmlHashScan callback (see swish_config_test_alias_fors): reports through
 * SWISH_CROAK when prop has an alias_for value that is neither a key in
 * c->properties nor resolvable by swish_property_get_id(). name is the hash
 * key of prop.
 */
static void
test_prop_alias_for(
    swish_Property *prop,
    swish_Config *c,
    xmlChar *name
)
{
    /* not an alias: nothing to verify */
    if (prop->alias_for == NULL) {
        return;
    }

    if (swish_hash_exists(c->properties, prop->alias_for)) {
        return;
    }

    if (swish_property_get_id(prop->alias_for, c->properties)) {
        return;
    }

    SWISH_CROAK("Property '%s' has alias_for value of '%s' but no such Property defined",
         name, prop->alias_for);
}

/*
 * Verify every alias_for link in c: first all MetaNames, then all
 * Properties. A link that does not resolve is reported through SWISH_CROAK
 * by the per-entry callbacks.
 */
void
swish_config_test_alias_fors(
    swish_Config *c
)
{
    xmlHashScanner check_meta = (xmlHashScanner)test_meta_alias_for;
    xmlHashScanner check_prop = (xmlHashScanner)test_prop_alias_for;

    xmlHashScan(c->metanames, check_meta, c);
    xmlHashScan(c->properties, check_prop, c);
}


static headmaker *
init_headmaker(
)
{
    headmaker *h;
    h = swish_xmalloc(sizeof(headmaker));
    h->config = swish_config_init();
/*  mimes is set to NULL in default config but we need it to be a hash here. */
    h->config->mimes = swish_hash_init(8);
    reset_headmaker(h);
    h->prop_id = SWISH_PROP_THIS_MUST_COME_LAST_ID;
    h->meta_id = SWISH_META_THIS_MUST_COME_LAST_ID;
    h->conf_file = NULL;
    return h;
}

/*
 * Clear every "currently inside section X" flag and forget the last
 * element name recorded for alias text. Other fields of h are untouched.
 */
static void
reset_headmaker(
    headmaker *h
)
{
    h->parent_name = NULL;

    h->ismime = 0;
    h->isparser = 0;
    h->isalias = 0;
    h->isindex = 0;
    h->ismetas = 0;
    h->isprops = 0;
}

/*
 * Parse filename into a throw-away config, check that every alias_for link
 * resolves, print the config with swish_config_debug() and free everything.
 *
 * Always returns 1; problems are reported through SWISH_CROAK by the
 * functions called here.
 */
boolean
swish_header_validate(
    char *filename
)
{
    headmaker *maker = init_headmaker();

    read_header(filename, maker);

    /* every alias must point at something that exists */
    swish_config_test_alias_fors(maker->config);

    swish_config_debug(maker->config);

    swish_config_free(maker->config);
    if (maker->conf_file != NULL) {
        swish_xfree(maker->conf_file);
    }
    swish_xfree(maker);

    return 1;                   /* how to test ? */
}

/*
 * Parse filename into a temporary config, merge that config into c with
 * swish_config_merge(), free the temporary state, then check that every
 * alias_for link in c resolves.
 *
 * Always returns 1; problems are reported through SWISH_CROAK by the
 * functions called here.
 */
boolean
swish_header_merge(
    char *filename,
    swish_Config *c
)
{
    headmaker *maker = init_headmaker();

    read_header(filename, maker);
    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("read_header complete");
    }

    swish_config_merge(c, maker->config);
    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("config_merge complete");
    }

    /* the temporary config and reader state are no longer needed */
    swish_config_free(maker->config);
    if (maker->conf_file != NULL) {
        swish_xfree(maker->conf_file);
    }
    swish_xfree(maker);
    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("temp head struct freed");
    }

    /* every alias in the merged config must point at something that exists */
    swish_config_test_alias_fors(c);

    return 1;
}

/*
 * Parse filename and return the resulting config. The temporary reader
 * state is freed here; the returned config is handed to the caller.
 */
swish_Config *
swish_header_read(
    char *filename
)
{
    headmaker *maker = init_headmaker();
    swish_Config *result;

    read_header(filename, maker);

    /* keep the config, discard the rest of the reader state */
    result = maker->config;
    if (maker->conf_file != NULL) {
        swish_xfree(maker->conf_file);
    }
    swish_xfree(maker);

    return result;
}

/*
 * Start an XML element named tag on writer. A negative result from
 * xmlTextWriterStartElement() is reported through SWISH_CROAK.
 */
static void
write_open_tag(
    xmlTextWriterPtr writer,
    xmlChar *tag
)
{
    int status;

    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("writing open tag <%s>", tag);
    }

    status = xmlTextWriterStartElement(writer, tag);

    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("wrote open tag <%s>", tag);
    }

    if (status >= 0) {
        return;
    }
    SWISH_CROAK("Error writing element %s", tag);
}

/*
 * End the element currently open on writer. A negative result from
 * xmlTextWriterEndElement() is reported through SWISH_CROAK.
 */
static void
write_close_tag(
    xmlTextWriterPtr writer
)
{
    int status;

    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("writing close tag");
    }

    status = xmlTextWriterEndElement(writer);

    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("wrote close tag");
    }

    if (status >= 0) {
        return;
    }
    SWISH_CROAK("Error at xmlTextWriterEndElement");
}

/*
 * Write a complete element <tag>content</tag> on writer. A negative result
 * from xmlTextWriterWriteElement() is reported through SWISH_CROAK.
 */
static void
write_element_with_content(
    xmlTextWriterPtr writer,
    xmlChar *tag,
    xmlChar *content
)
{
    if (xmlTextWriterWriteElement(writer, tag, content) < 0) {
        SWISH_CROAK("Error writing element %s with content %s", tag, content);
    }
}

/*
 * xmlHashScan callback (see write_metanames): write one MetaName as an
 * element called name with an "id" attribute, plus either "alias_for" (when
 * the MetaName is an alias) or "bias" (when it is not). A failed attribute
 * write is reported through SWISH_CROAK.
 */
static void
write_metaname(
    swish_MetaName *meta,
    xmlTextWriterPtr writer,
    xmlChar *name
)
{
    int status;

    write_open_tag(writer, name);

    status = xmlTextWriterWriteFormatAttribute(writer, BAD_CAST "id", "%d", meta->id);
    if (status < 0) {
        SWISH_CROAK("Error writing metaname id attribute for %s", name);
    }

    if (meta->alias_for == NULL) {
        /* a real MetaName carries its bias */
        status = xmlTextWriterWriteFormatAttribute(writer, BAD_CAST "bias", "%d", meta->bias);
        if (status < 0) {
            SWISH_CROAK("Error writing metaname bias attribute for %s", name);
        }
    }
    else {
        /* an alias only names its target */
        status = xmlTextWriterWriteAttribute(writer, BAD_CAST "alias_for", meta->alias_for);
        if (status < 0) {
            SWISH_CROAK("Error writing metaname alias_for attribute for %s", name);
        }
    }

    write_close_tag(writer);
}

/*
 * Write every entry of the metanames hash to writer by scanning the hash
 * with write_metaname() as the per-entry callback.
 */
static void
write_metanames(
    xmlTextWriterPtr writer,
    xmlHashTablePtr metanames
)
{
    xmlHashScanner emit = (xmlHashScanner)write_metaname;

    xmlHashScan(metanames, emit, writer);
}

/*
 * xmlHashScan callback: write one hash entry as <key>value</key>.
 */
static void
write_hash_entry(
    xmlChar *value,
    xmlTextWriterPtr writer,
    xmlChar *key
)
{
    xmlChar *tag = key;
    xmlChar *content = value;

    write_element_with_content(writer, tag, content);
}

/*
 * xmlHashScan callback: write one hash entry with key and value swapped,
 * i.e. as <value>key</value>.
 */
static void
write_reverse_hash_entry(
    xmlChar *value,
    xmlTextWriterPtr writer,
    xmlChar *key
)
{
    xmlChar *tag = value;
    xmlChar *content = key;

    write_element_with_content(writer, tag, content);
}

/*
 * xmlHashScan callback (see write_properties): write one Property as an
 * element called name.
 *
 * The element always gets an "id" attribute. An alias additionally gets
 * only "alias_for"; a real Property gets ignore_case, verbatim, type, max,
 * sort, sort_length and presort, all written as integers. A failed
 * attribute write is reported through SWISH_CROAK, naming the attribute.
 */
static void
write_property(
    swish_Property *prop,
    xmlTextWriterPtr writer,
    xmlChar *name
)
{
    int rc;
    write_open_tag(writer, name);
    rc = xmlTextWriterWriteFormatAttribute(writer, BAD_CAST "id", "%d", prop->id);
    if (rc < 0) {
        SWISH_CROAK("Error writing property id attribute for %s", name);
    }

    if (prop->alias_for != NULL) {
        rc = xmlTextWriterWriteAttribute(writer, BAD_CAST "alias_for", prop->alias_for);
        if (rc < 0) {
            SWISH_CROAK("Error writing property alias_for attribute for %s", name);
        }
    }
    else {

/* all other attrs are irrelevant if this is an alias */
        rc = xmlTextWriterWriteFormatAttribute(writer, BAD_CAST "ignore_case", "%d",
                                               prop->ignore_case);
        if (rc < 0) {
            SWISH_CROAK("Error writing property ignore_case attribute for %s", name);
        }
        rc = xmlTextWriterWriteFormatAttribute(writer, BAD_CAST "verbatim", "%d",
                                               prop->verbatim);
        if (rc < 0) {
            SWISH_CROAK("Error writing property verbatim attribute for %s", name);
        }
        rc = xmlTextWriterWriteFormatAttribute(writer, BAD_CAST "type", "%d", prop->type);
        if (rc < 0) {
            SWISH_CROAK("Error writing property type attribute for %s", name);
        }
        rc = xmlTextWriterWriteFormatAttribute(writer, BAD_CAST "max", "%d", prop->max);
        if (rc < 0) {
            SWISH_CROAK("Error writing property max attribute for %s", name);
        }
        rc = xmlTextWriterWriteFormatAttribute(writer, BAD_CAST "sort", "%d", prop->sort);
        if (rc < 0) {
            SWISH_CROAK("Error writing property sort attribute for %s", name);
        }
        rc = xmlTextWriterWriteFormatAttribute(writer, BAD_CAST "sort_length", "%d", prop->sort_length);
        if (rc < 0) {
            SWISH_CROAK("Error writing property sort_length attribute for %s", name);
        }
        rc = xmlTextWriterWriteFormatAttribute(writer, BAD_CAST "presort", "%d", prop->presort);
        if (rc < 0) {
            SWISH_CROAK("Error writing property presort attribute for %s", name);
        }
    }
    write_close_tag(writer);
}

/*
 * Run write_property over every entry of the properties hash, passing
 * the XML writer along as the scan's user data.
 */
static void
write_properties(
    xmlTextWriterPtr writer,
    xmlHashTablePtr properties
)
{
    xmlHashScanner per_entry = (xmlHashScanner)write_property;

    xmlHashScan(properties, per_entry, writer);
}

/*
 * Hash-scan callback for the parsers table (installed by write_parsers).
 * The entry payload (val) and the entry name (key) are forwarded, in that
 * order, to write_element_with_content() together with the writer.
 */
static void
write_parser(
    xmlChar *val,
    xmlTextWriterPtr writer,
    xmlChar *key
)
{
    xmlTextWriterPtr out = writer;

    write_element_with_content(out, val, key);
}

/*
 * Run write_parser over every entry of the parsers hash, passing the
 * XML writer along as the scan's user data.
 */
static void
write_parsers(
    xmlTextWriterPtr writer,
    xmlHashTablePtr parsers
)
{
    xmlHashScanner per_entry = (xmlHashScanner)write_parser;

    xmlHashScan(parsers, per_entry, writer);
}

/*
 * Hash-scan callback for the MIME table (installed by write_mimes).
 *
 * things->thing1 is used as the hash of default MIME mappings and
 * things->thing3 as the XML writer. The ext => type pair is written only
 * when the defaults hash has no entry for ext, or has a different type
 * for it.
 */
static void
write_mime(
    xmlChar *type,
    temp_things  *things,
    xmlChar *ext
)
{
    xmlHashTablePtr defaults = (xmlHashTablePtr) things->thing1;

    /* identical to the default mapping: nothing to record */
    if (swish_hash_exists(defaults, ext)
        && xmlStrEqual(swish_hash_fetch(defaults, ext), type)) {
        return;
    }

    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("writing unique MIME %s => %s", ext, type);
    }

    write_element_with_content((xmlTextWriterPtr) things->thing3, ext, type);

    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("wrote unique MIME %s => %s", ext, type);
    }
}

/*
 * Write the MIME mappings that differ from the built-in defaults.
 *
 * A temporary temp_things bundle carries the defaults hash (thing1), the
 * configured hash (thing2) and the writer (thing3) into the write_mime
 * scan callback. The defaults hash obtained from swish_mime_defaults()
 * and the bundle itself are released before returning.
 */
static void
write_mimes(
    xmlTextWriterPtr writer,
    xmlHashTablePtr mimes
)
{
    temp_things *bundle = swish_xmalloc(sizeof(temp_things));

    bundle->thing1 = swish_mime_defaults();
    bundle->thing2 = mimes;
    bundle->thing3 = writer;

    xmlHashScan(mimes, (xmlHashScanner)write_mime, bundle);
    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("done writing MIMEs");
    }

    swish_hash_free(bundle->thing1);
    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("freed thing1 hash");
    }

    swish_xfree(bundle);
}

/*
 * Run write_hash_entry over every entry of the index hash, passing the
 * XML writer along as the scan's user data.
 */
static void
write_index(
    xmlTextWriterPtr writer,
    xmlHashTablePtr index
)
{
    xmlHashScanner per_entry = (xmlHashScanner)write_hash_entry;

    xmlHashScan(index, per_entry, writer);
}

/*
 * Run write_reverse_hash_entry over every entry of the tag alias hash,
 * passing the XML writer along as the scan's user data.
 */
static void
write_tag_aliases(
    xmlTextWriterPtr writer,
    xmlHashTablePtr tag_aliases
)
{
    xmlHashScanner per_entry = (xmlHashScanner)write_reverse_hash_entry;

    xmlHashScan(tag_aliases, per_entry, writer);
}

/*
 * Run write_hash_entry over every entry of the given hash, passing the
 * XML writer along as the scan's user data.
 */
static void
write_misc(
    xmlTextWriterPtr writer,
    xmlHashTablePtr hash
)
{
    xmlHashScanner per_entry = (xmlHashScanner)write_hash_entry;

    xmlHashScan(hash, per_entry, writer);
}

/*
 * Serialize a swish_Config as an XML header document at uri.
 *
 * The document consists of the SWISH_HEADER_ROOT element holding, in
 * order: a "do not edit" comment, the swish_version/swish_lib_version
 * elements (only when config->misc does not already carry them), the
 * MetaNames, PropertyNames, Parsers, MIME, index and tag alias sections,
 * and finally the misc entries written directly under the root.
 *
 * Every libxml2 writer failure is reported through SWISH_CROAK. When
 * libxml2 was built without writer/output support the function croaks
 * immediately.
 */
void
swish_header_write(
    char *uri,
    swish_Config *config
)
{
#if !defined(LIBXML_WRITER_ENABLED) || !defined(LIBXML_OUTPUT_ENABLED)
    SWISH_CROAK("libxml2 writer not compiled in this version of libxml2");
#else
    int rc;
    xmlTextWriterPtr writer;

    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        swish_config_debug(config);
    }

/* Create a new XmlWriter for uri, with no compression. */
    writer = xmlNewTextWriterFilename((const char *)uri, 0);
    if (writer == NULL) {
        SWISH_CROAK("Error creating the xml writer\n");
    }

/* set some basic formatting rules. these make it easier to debug headers */
    rc = xmlTextWriterSetIndent(writer, 1);
    if (rc < 0) {
        SWISH_CROAK("failed to set indent on XML writer");
    }

/* Start the document with the xml default for the version,
     * encoding UTF-8 (default) and the default for the standalone
     * declaration. */
    rc = xmlTextWriterStartDocument(writer, NULL, NULL, NULL);
    if (rc < 0) {
        SWISH_CROAK("Error at xmlTextWriterStartDocument\n");
    }

/* root element
    NOTE the BAD_CAST macro is xml2 shortcut for (xmlChar*)
*/
    write_open_tag(writer, BAD_CAST SWISH_HEADER_ROOT);

/* Write a comment indicating a computer wrote this file */
    rc = xmlTextWriterWriteComment(writer, BAD_CAST "written by libswish3 - DO NOT EDIT");
    if (rc < 0) {
        SWISH_CROAK("Error at xmlTextWriterWriteComment\n");
    }

    /* version stamps are only synthesized when config->misc lacks them;
     * otherwise they are emitted later together with the misc entries */
    // TODO check for these in reader and croak if mismatch
    if (!swish_hash_exists(config->misc, BAD_CAST "swish_version")) {
        write_element_with_content(writer, BAD_CAST "swish_version",
                                   BAD_CAST SWISH_VERSION);
    }
    if (!swish_hash_exists(config->misc, BAD_CAST "swish_lib_version")) {
        write_element_with_content(writer, BAD_CAST "swish_lib_version",
                                   BAD_CAST swish_lib_version());
    }

/* write MetaNames */
    write_open_tag(writer, BAD_CAST SWISH_META);
    write_metanames(writer, config->metanames);
    write_close_tag(writer);

/* write PropertyNames */
    write_open_tag(writer, BAD_CAST SWISH_PROP);
    write_properties(writer, config->properties);
    write_close_tag(writer);

/* write Parsers */
    write_open_tag(writer, BAD_CAST SWISH_PARSERS);
    write_parsers(writer, config->parsers);
    write_close_tag(writer);

/* write MIMEs (write_mimes only emits entries that differ from the defaults) */

    write_open_tag(writer, BAD_CAST SWISH_MIME);
    write_mimes(writer, config->mimes);
    write_close_tag(writer);

/* write index */
    write_open_tag(writer, BAD_CAST SWISH_INDEX);
    write_index(writer, config->index);
    write_close_tag(writer);

/* write tag aliases */
    write_open_tag(writer, BAD_CAST SWISH_ALIAS);
    write_tag_aliases(writer, config->tag_aliases);
    write_close_tag(writer);

/* misc tags have no parent */
    write_misc(writer, config->misc);

/* this function will close any open tags */
    rc = xmlTextWriterEndDocument(writer);
    if (rc < 0) {
        SWISH_CROAK("Error at xmlTextWriterEndDocument\n");
    }

    xmlFreeTextWriter(writer);
#endif
}


/*************** end header.c ************/


/*************** start tokenizer.c ************/
/*
 * This file is part of libswish3
 * Copyright (C) 2008 Peter Karman
 *
 *  libswish3 is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  libswish3 is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with libswish3; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
*/

/* utf8 tokenizer */
#ifndef LIBSWISH3_SINGLE_FILE
#include <wchar.h>
#include <string.h>
#include <ctype.h>
#include <wctype.h>
#include <err.h>
#include <stdarg.h>

#include "libswish3.h"
#endif

extern int SWISH_DEBUG;

static int is_ignore_start_utf8(
    uint32_t c
);
static int is_ignore_end_utf8(
    uint32_t c
);
static int is_ignore_word_utf8(
    uint32_t c
);
static int is_ignore_start_ascii(
    char c
);
static int is_ignore_end_ascii(
    char c
);
static int is_ignore_word_ascii(
    char c
);
static void make_ascii_tables(
);
static int strip_utf8_chrs(
    xmlChar *token,
    int len
);
static int strip_ascii_chrs(
    xmlChar *word,
    int len
);

/*
 * Return 1 when code point c may not begin a token (NUL, whitespace,
 * control or punctuation as classified by <wctype.h>), otherwise 0.
 */
static int
is_ignore_start_utf8(
    uint32_t c
)
{
    if (!c) {
        return 1;
    }
    if (iswspace(c)) {
        return 1;
    }
    if (iswcntrl(c)) {
        return 1;
    }
    if (iswpunct(c)) {
        return 1;
    }
    return 0;
}

/*
 * Return 1 when code point c may not end a token (NUL, whitespace,
 * control or punctuation as classified by <wctype.h>), otherwise 0.
 */
static int
is_ignore_end_utf8(
    uint32_t c
)
{
    if (!c) {
        return 1;
    }
    if (iswspace(c)) {
        return 1;
    }
    if (iswcntrl(c)) {
        return 1;
    }
    if (iswpunct(c)) {
        return 1;
    }
    return 0;
}

/*
 * Return 1 when code point c cannot be part of a token, otherwise 0.
 *
 * The apostrophe (contractions) and the underscore (a word character, as
 * in regular expressions) are always allowed; apart from those, NUL,
 * whitespace, control and punctuation code points are rejected.
 */
static int
is_ignore_word_utf8(
    uint32_t c
)
{
    switch (c) {
    case '\'':                  /* contractions allowed */
    case '_':                   /* underscore counts as a word character */
        return 0;
    default:
        break;
    }

    return (!c || iswspace(c) || iswcntrl(c) || iswpunct(c)) ? 1 : 0;
}

/*
 * Return 1 when byte c may not begin a token (NUL, whitespace, control
 * or punctuation as classified by <ctype.h>), otherwise 0.
 */
static int
is_ignore_start_ascii(
    char c
)
{
    /* <ctype.h> classifiers are only defined for unsigned char values
     * (or EOF); a negative plain char would be undefined behavior */
    const unsigned char uc = (unsigned char)c;

    return (!uc || isspace(uc) || iscntrl(uc) || ispunct(uc)) ? 1 : 0;
}

/*
 * Return 1 when byte c may not end a token (NUL, whitespace, control or
 * punctuation as classified by <ctype.h>), otherwise 0.
 */
static int
is_ignore_end_ascii(
    char c
)
{
    /* <ctype.h> classifiers are only defined for unsigned char values
     * (or EOF); a negative plain char would be undefined behavior */
    const unsigned char uc = (unsigned char)c;

    return (!uc || isspace(uc) || iscntrl(uc) || ispunct(uc)) ? 1 : 0;
}

/*
 * Return 1 when byte c cannot be part of a token, otherwise 0.
 *
 * The apostrophe (contractions) and the underscore (a word character, as
 * in regular expressions) are always allowed; apart from those, NUL,
 * whitespace, control and punctuation bytes are rejected.
 */
static int
is_ignore_word_ascii(
    char c
)
{
    /* <ctype.h> classifiers are only defined for unsigned char values
     * (or EOF); a negative plain char would be undefined behavior */
    const unsigned char uc = (unsigned char)c;

    if (c == '\'') {              /*  contractions allowed */
        return 0;
    }

    if (c == '_') {               /* consider underscore a wordchar like regex does */
        return 0;
    }

    return (!uc || isspace(uc) || iscntrl(uc) || ispunct(uc)) ? 1 : 0;
}

/************************************************
*   mimic the Swish-e WordCharacters lookup tables
*   using the default is*() functions.
*
*   Each table is indexed by a 7-bit ASCII value and
*   holds 1 when the byte is allowed in that position
*   (inside / at the start of / at the end of a token)
*   and 0 when it must be ignored.
*************************************************/

static boolean ascii_init = 0;
static char ascii_word_table[128];
static char ascii_start_table[128];
static char ascii_end_table[128];

/*
 * Fill the three lookup tables from the is_ignore_*_ascii() predicates
 * and mark them initialized. Every slot is written explicitly, including
 * the last one (127), so the tables do not depend on static
 * zero-initialization.
 */
static void
make_ascii_tables(
)
{
    int i;
    const int table_size = (int)sizeof(ascii_word_table);

    for (i = 0; i < table_size; i++) {
        ascii_word_table[i]  = is_ignore_word_ascii((char)i)  ? 0 : 1;
        ascii_end_table[i]   = is_ignore_end_ascii((char)i)   ? 0 : 1;
        ascii_start_table[i] = is_ignore_start_ascii((char)i) ? 0 : 1;
    }
    ascii_init = 1;
}

/*************************************************
* remove all ignorable start/end chars
* and return new length of token
* based on code in swish-e vers2 swish_words.c
*
* token is modified in place: trailing characters for
* which is_ignore_end_utf8() is true are cut off, then
* leading characters for which is_ignore_start_utf8()
* is true are removed by shifting the remainder to the
* front of the buffer, so the caller's pointer always
* addresses the stripped token.
*
* len is accepted for interface compatibility only; the
* NUL terminator of token is what delimits the input.
*
* Returns the byte length of the stripped token
* INCLUDING its terminating NUL (1 for an empty token).
*************************************************/

static int
strip_utf8_chrs(
    xmlChar *token,
    int len
)
{
    int head, tail, chr_start, chr_len, start, end, token_len, cp;
    xmlChar chr[5];             /* one UTF-8 sequence (max 4 bytes) plus NUL */

    (void)len;
    start = 0;
    end = 0;

    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
        SWISH_DEBUG_MSG("Before: %s", token);

// end chrs -- must do before start chars
    tail = xmlStrlen(token);
    while (tail > 0) {
        /* back up over continuation bytes (10xxxxxx) to the lead byte,
         * never moving before the start of the buffer */
        chr_start = tail - 1;
        while (chr_start > 0 && (token[chr_start] & 0xC0) == 0x80) {
            chr_start--;
        }
        chr_len = tail - chr_start;
        if (chr_len > 4) {
            break;              /* malformed sequence: leave it alone */
        }
        memcpy(chr, &token[chr_start], (size_t)chr_len);
        chr[chr_len] = '\0';

        if (!is_ignore_end_utf8(swish_utf8_codepoint(chr))) {
            break;
        }
        token[chr_start] = '\0';
        tail = chr_start;
        end++;
    }

// start chrs
    head = 0;
    while (head < tail) {
        /* a character is its lead byte plus any continuation bytes */
        chr_len = 1;
        while (head + chr_len < tail && (token[head + chr_len] & 0xC0) == 0x80) {
            chr_len++;
        }
        if (chr_len > 4) {
            break;              /* malformed sequence: leave it alone */
        }
        memcpy(chr, &token[head], (size_t)chr_len);
        chr[chr_len] = '\0';
        cp = swish_utf8_codepoint(chr);

        if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
            SWISH_DEBUG_MSG("start chr_len %d chr: %s  [%d]", chr_len, chr, cp);

        if (!is_ignore_start_utf8(cp)) {
            break;
        }

        if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
            SWISH_DEBUG_MSG("ignore_start %s", chr);

        head += chr_len;
        start++;
    }

    /* shift the surviving bytes (and the NUL at token[tail]) to the front */
    if (head > 0) {
        memmove(token, &token[head], (size_t)(tail - head) + 1);
    }

    token_len = (tail - head) + 1;

    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
        SWISH_DEBUG_MSG("After: %s (stripped %d start chars, %d end chars, len=%d)",
                        token, start, end, token_len);

    return token_len;
}

/*
 * Strip ignorable characters from both ends of word, in place, using the
 * ascii_end_table / ascii_start_table lookups.
 *
 * word: NUL-terminated buffer to modify.
 * len:  number of bytes to consider when scanning backwards from the end
 *       (callers in this file pass the length including the NUL).
 *
 * Bytes >= 128 fall outside the 128-entry tables; they are never looked
 * up and never stripped.
 *
 * Returns the byte length of the stripped word INCLUDING its terminating
 * NUL (1 for an empty word).
 */
static int
strip_ascii_chrs(
    xmlChar *word,
    int len
)
{
    int i, j, k, wlen, start, end;
    start = 0;
    end = 0;

    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
        SWISH_DEBUG_MSG("Before: %s", word);

/* end chars -- must do before start chars */

    i = len;

/* Iteratively strip off the last character if it's an ignore character */
    while (i-- > 0) {

        if (word[i] < 128 && !ascii_end_table[word[i]]) {
            word[i] = '\0';
            end++;
        }
        else {
            break;
        }
    }

/* start chars */
    i = 0;

    while (word[i]) {
        k = i;
        if (word[k] >= 128 || ascii_start_table[word[k]]) {
            break;
        }
        else {
            i = k + 1;
            start++;
        }
    }

/* If all the chars are valid, just leave word alone */
    if (i != 0) {
        for (k = i, j = 0; word[k] != '\0'; j++, k++) {
            word[j] = word[k];
        }

/* Add the NUL */
        word[j] = '\0';
    }

    wlen = xmlStrlen(word) + 1;

    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
        SWISH_DEBUG_MSG("After: %s (stripped %d start chars, %d end chars, wlen=%d)",
                        word, start, end, wlen);

    return wlen;
}

/*
 * Allocate and initialize an empty TokenList.
 *
 * The list starts with a text buffer of SWISH_BUFFER_CHUNK_SIZE bytes,
 * room for SWISH_TOKEN_LIST_SIZE token pointers, an empty contexts hash,
 * and all counters (n, pos, ref_cnt) at zero. The ASCII lookup tables
 * are built on first use.
 *
 * Returns the new list.
 */
swish_TokenList *
swish_token_list_init(
)
{
    swish_TokenList *tl;
    tl = swish_xmalloc(sizeof(swish_TokenList));
    tl->buf = xmlBufferCreateSize((size_t) SWISH_BUFFER_CHUNK_SIZE);
    tl->n = 0;
    tl->pos = 0;
    tl->ref_cnt = 0;
    tl->tokens = swish_xmalloc(sizeof(swish_Token *) * SWISH_TOKEN_LIST_SIZE);
    tl->contexts = swish_hash_init(8);

    if (!ascii_init)
        make_ascii_tables();

    if (SWISH_DEBUG & SWISH_DEBUG_MEMORY) {
        /* %lx matches the unsigned long argument (was %x with a long) */
        SWISH_DEBUG_MSG("TokenList ptr 0x%lx", (unsigned long)tl);
        SWISH_DEBUG_MSG("TokenList->tokens ptr 0x%lx", (unsigned long)tl->tokens);
    }

    return tl;
}

/*
 * Release a TokenList and everything it holds.
 *
 * Warns when the list is still referenced (ref_cnt != 0). Each token has
 * its ref_cnt decremented and is freed once that drops below 1. The
 * contexts hash is released only after the tokens.
 */
void
swish_token_list_free(
    swish_TokenList *tl
)
{
    swish_Token *tok;

    if (tl->ref_cnt != 0) {
        SWISH_WARN("freeing TokenList with ref_cnt != 0 (%d)", tl->ref_cnt);
    }

    /* walk the token array from the last entry down to the first */
    while (tl->n != 0) {
        tl->n--;
        tok = tl->tokens[tl->n];
        tok->ref_cnt--;
        if (tok->ref_cnt < 1) {
            swish_token_free(tok);
        }
    }

    swish_xfree(tl->tokens);
    xmlBufferFree(tl->buf);
    swish_hash_free(tl->contexts); /* MUST free **after** tokens */
    swish_xfree(tl);
}

/*
 * Append a new token to the list.
 *
 * tl:        list to append to.
 * token:     NUL-terminated token text; must not be empty.
 * token_len: number of bytes of token to store, INCLUDING the NUL.
 * meta:      MetaName the token belongs to; its ref_cnt is incremented.
 * context:   context string; cached in tl->contexts and shared by tokens.
 *
 * The token bytes are appended to tl->buf and the token records its
 * offset there. The token pointer array grows by SWISH_TOKEN_LIST_SIZE
 * slots each time it fills up.
 *
 * Croaks on an empty token. Returns the new number of tokens in the list.
 */
int
swish_token_list_add_token(
    swish_TokenList *tl,
    xmlChar *token,
    int token_len,
    swish_MetaName *meta,
    xmlChar *context
)
{
    int num_of_allocs;
    swish_Token *stoken;

    if (!token_len || !xmlStrlen(token)) {
        SWISH_CROAK("can't add empty token to token list");
    }

    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
        SWISH_DEBUG_MSG("adding token: %s  meta=%s", token, meta->name);

    stoken = swish_token_init();
    stoken->offset  = xmlBufferLength(tl->buf);
    stoken->len     = token_len - 1;    // -1 to exclude the NUL
    stoken->pos     = ++tl->pos;
    stoken->meta    = meta;
    stoken->meta->ref_cnt++;
        
    /* add the token str to the token_list buffer */
    swish_token_list_set_token(tl, token, token_len);

    /* cache the context string and point at the cached value */
    swish_hash_exists_or_add( tl->contexts, context, context );
    stoken->context = swish_hash_fetch( tl->contexts, context );
    stoken->value   = swish_token_list_get_token_value( tl, stoken );
    stoken->ref_cnt++;

    /* number of SWISH_TOKEN_LIST_SIZE chunks that are completely full */
    num_of_allocs = tl->n / SWISH_TOKEN_LIST_SIZE;

    if (SWISH_DEBUG & SWISH_DEBUG_TOKENLIST) {
        SWISH_DEBUG_MSG("TokenList size: %d  num_allocs = %d  modulus %d", tl->n,
                        num_of_allocs, tl->n % SWISH_TOKEN_LIST_SIZE);
        swish_token_debug(stoken);
    }

    /* the array is exactly full: grow it by one more chunk */
    if (num_of_allocs && !(tl->n % SWISH_TOKEN_LIST_SIZE)) {
        if (SWISH_DEBUG & SWISH_DEBUG_TOKENLIST) {
            /* %lx matches the unsigned long argument (was %x with a long) */
            SWISH_DEBUG_MSG("realloc for tokens: 0x%lx", (unsigned long)tl->tokens);
        }

        tl->tokens =
            (swish_Token **)swish_xrealloc(tl->tokens,
                                           sizeof(swish_Token*) * (SWISH_TOKEN_LIST_SIZE *
                                                                  ++num_of_allocs));

    }
    tl->tokens[tl->n++] = stoken;
    return tl->n;
}

/*
 * Append len bytes of token to the list's text buffer.
 *
 * Callers pass a len that includes the terminating NUL, so each stored
 * token can later be read straight out of the buffer as a C string.
 *
 * Croaks when xmlBufferAdd() reports an error; returns its result (0 on
 * success).
 */
int
swish_token_list_set_token(
    swish_TokenList *tl,
    xmlChar *token,
    int len
)
{
    const int status = xmlBufferAdd(tl->buf, token, len);

    if (status == 0) {
        return status;
    }

    SWISH_CROAK("error appending token to buffer: %d", status);
    return status;
}

/*
 * Allocate a Token with every field cleared: numeric members are 0 and
 * pointer members are NULL. Returns the new token.
 */
swish_Token *
swish_token_init(
)
{
    swish_Token *tok = swish_xmalloc(sizeof(swish_Token));

    /* pointer members */
    tok->meta = NULL;
    tok->context = NULL;
    tok->value = NULL;

    /* numeric members */
    tok->pos = 0;
    tok->offset = 0;
    tok->len = 0;
    tok->ref_cnt = 0;

    return tok;
}

/*
 * Release a Token.
 *
 * Warns when the token is still referenced (ref_cnt != 0). The token's
 * MetaName has its ref_cnt decremented and is freed as well once that
 * reaches zero. Only the Token struct itself is freed here; context and
 * value are not touched.
 */
void
swish_token_free(
    swish_Token *t
)
{
    if (t->ref_cnt != 0) {
        SWISH_WARN("freeing Token with ref_cnt != 0 (%d)", t->ref_cnt);
    }
    
    if (SWISH_DEBUG & SWISH_DEBUG_MEMORY) {
        /* %lx matches the unsigned long argument (was %x with a long) */
        SWISH_DEBUG_MSG("freeing Token 0x%lx with MetaName ref_cnt %d", 
            (unsigned long)t, t->meta->ref_cnt);
    }
    
    t->meta->ref_cnt--;
    if (t->meta->ref_cnt == 0) {
        if (SWISH_DEBUG & SWISH_DEBUG_MEMORY) {
            SWISH_DEBUG_MSG("Token's MetaName ref_cnt == 0 ... freeing MetaName");
        }
        swish_metaname_free(t->meta);
    }
    
    swish_xfree(t);
}

/*
 * Emit every field of token t through SWISH_DEBUG_MSG, one per line.
 * t->meta is dereferenced for its id and name, so it must be set.
 */
void
swish_token_debug(
    swish_Token *t
)
{
    /* one multi-line format string; the argument order below must match it */
    SWISH_DEBUG_MSG("\n\
    t->ref_cnt      = %d\n\
    t->pos          = %d\n\
    t->context      = %s\n\
    t->meta         = %d [%s]\n\
    t->offset       = %d\n\
    t->len          = %d\n\
    t->value        = %s\n\
    ", t->ref_cnt, t->pos, t->context, t->meta->id, t->meta->name, t->offset, t->len, t->value);

}

/*
 * Dump the iterator's TokenList: the raw text buffer, its length, the
 * token count, and then each remaining token via swish_token_debug().
 *
 * The tokens are fetched with swish_token_iterator_next_token(), so the
 * iterator's position is advanced to the end of the list.
 */
void
swish_token_list_debug(
    swish_TokenIterator *it
)
{
    swish_Token *tok;

    SWISH_DEBUG_MSG("Token buf:\n%s", xmlBufferContent(it->tl->buf));
    SWISH_DEBUG_MSG("Token buf length: %d\n", xmlBufferLength(it->tl->buf));
    SWISH_DEBUG_MSG("Number of tokens: %d", it->tl->n);

    for (tok = swish_token_iterator_next_token(it);
         tok != NULL;
         tok = swish_token_iterator_next_token(it)) {
        swish_token_debug(tok);
    }
}

/*
 * Create a TokenIterator bound to Analyzer a.
 *
 * The analyzer's ref_cnt is incremented, a fresh TokenList is created
 * and referenced, and the iterator starts at position 0 with its own
 * ref_cnt at 0. Returns the new iterator.
 */
swish_TokenIterator *
swish_token_iterator_init(
    swish_Analyzer *a
)
{
    swish_TokenIterator *iter = swish_xmalloc(sizeof(swish_TokenIterator));

    /* take a reference on the analyzer */
    iter->a = a;
    a->ref_cnt++;

    iter->pos = 0;

    /* create the backing list and take a reference on it */
    iter->tl = swish_token_list_init();
    iter->tl->ref_cnt++;

    iter->ref_cnt = 0;
    return iter;
}

/*
 * Release a TokenIterator.
 *
 * Warns when the iterator is still referenced (ref_cnt != 0). The
 * references taken on the Analyzer and the TokenList are dropped, and
 * each of them is freed when its ref_cnt reaches zero.
 */
void
swish_token_iterator_free(
    swish_TokenIterator *it
)
{
    if (it->ref_cnt != 0) {
        SWISH_WARN("freeing TokenIterator with ref_cnt != 0 (%d)", it->ref_cnt);
    }
    
    if (SWISH_DEBUG & SWISH_DEBUG_MEMORY) {
        /* the iterator address is printed with 0x%lx (was a pointer passed to %d) */
        SWISH_DEBUG_MSG(
        "freeing TokenIterator 0x%lx with TokenList ref_cnt %d and Analyzer ref_cnt %d", 
        (unsigned long)it, it->tl->ref_cnt, it->a->ref_cnt);
    }
    
    it->a->ref_cnt--;
    if (SWISH_DEBUG & SWISH_DEBUG_MEMORY) {
        SWISH_DEBUG_MSG("freeing TokenIterator with Analyzer ref_cnt = %d",
            it->a->ref_cnt);
    }
    if (it->a->ref_cnt == 0)
        swish_analyzer_free(it->a);
        
    it->tl->ref_cnt--;
    if (it->tl->ref_cnt == 0)
        swish_token_list_free(it->tl);
    
    swish_xfree(it);
}

/*
 * Return a pointer to token t's text inside the list's buffer, i.e. the
 * buffer content advanced by t->offset. The pointer refers to the
 * buffer's storage; no copy is made.
 */
xmlChar *
swish_token_list_get_token_value(
    swish_TokenList *tl,
    swish_Token *t
)
{
    const xmlChar *content = xmlBufferContent(tl->buf);

    return (xmlChar*)(content + t->offset);
}

/*
 * Return the token at the iterator's current position and advance the
 * position, or NULL once every token has been returned.
 *
 * The token's value pointer is recomputed from the list buffer on each
 * call before the token is handed out.
 */
swish_Token *
swish_token_iterator_next_token(
    swish_TokenIterator *it
)
{
    swish_TokenList *list = it->tl;
    swish_Token *tok;

    if (it->pos >= list->n) {
        return NULL;
    }

    tok = list->tokens[it->pos];
    it->pos++;
    tok->value = swish_token_list_get_token_value(list, tok);
    return tok;
}

/*
 * Tokenize buf into the iterator's TokenList, choosing the ASCII
 * tokenizer when swish_is_ascii(buf) is true and the UTF-8 tokenizer
 * otherwise.
 *
 * Returns the number of tokens added to the TokenList.
 */
int
swish_tokenize(
    swish_TokenIterator *ti, 
    xmlChar *buf, 
    swish_MetaName *meta,
    xmlChar *context
)
{
    return swish_is_ascii(buf)
        ? swish_tokenize_ascii(ti, buf, meta, context)
        : swish_tokenize_utf8(ti, buf, meta, context);
}

/*
 * Split the UTF-8 string buf into tokens and append them to ti->tl.
 *
 * ti:      iterator supplying the TokenList (ti->tl) and the Analyzer
 *          limits (ti->a->maxwordlen / minwordlen).
 * buf:     NUL-terminated UTF-8 text; a lowercased copy is tokenized.
 * meta:    MetaName recorded on every token.
 * context: context string recorded on every token.
 *
 * Characters for which is_ignore_word_utf8() is true separate tokens.
 * A token is also closed when it reaches maxwordlen bytes or when the
 * end of the buffer follows. Closed tokens are trimmed with
 * strip_utf8_chrs() and added when non-empty and token_len (which counts
 * the terminating NUL) is at least minwordlen. The first byte of
 * SWISH_TOKENPOS_BUMPER additionally bumps the list's position counter.
 *
 * Returns the number of tokens added to the TokenList.
 */
int
swish_tokenize_utf8(
    swish_TokenIterator *ti, 
    xmlChar *buf, 
    swish_MetaName *meta,
    xmlChar *context
)
{
    uint32_t cp;
    int nstart, byte_pos, prev_pos, i, chr_len, token_len, maxwordlen, minwordlen;
    swish_TokenList *tl;
    boolean inside_token;
    xmlChar chr[5];             /*  max len of UCS32 plus NUL */
    xmlChar *token, *copy, *buf_lower;
    
    tl          = ti->tl;
    maxwordlen  = ti->a->maxwordlen;
    minwordlen  = ti->a->minwordlen;
    /* +2: a token may hold maxwordlen bytes followed by its NUL, and
     * closing a token writes one more NUL just past that terminator */
    token       = swish_xmalloc(sizeof(xmlChar) * (maxwordlen + 2));
    buf_lower   = swish_utf8_str_tolower(buf);
    nstart      = tl->n;
    inside_token = 0;
    byte_pos    = 0;
    prev_pos    = 0;
    token_len   = 0;

    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
        SWISH_DEBUG_MSG("starting tokenize3 for meta=%s", meta->name);

/*
       iterate over each utf8 character, evaluating its Unicode value,
       and creating tokens
*/

    for (   byte_pos = 0; 
            buf_lower[prev_pos] != '\0';
            swish_utf8_next_chr(buf_lower, &byte_pos)
    ) {
        /* the current character spans [prev_pos, byte_pos); the very
         * first pass has nothing to look at yet */
        chr_len = byte_pos - prev_pos;
        if (!chr_len) {
            prev_pos = byte_pos;
            continue;
        }

        for (i = 0; i < chr_len; i++) {
            chr[i] = buf_lower[prev_pos + i];
        }
        chr[i] = '\0';

        cp = swish_utf8_codepoint(chr);

        if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER) {
            SWISH_DEBUG_MSG("%d %d: ut8 chr '%s' unicode %d  len %d next byte: %d",
                            byte_pos, prev_pos, chr, cp, chr_len, buf_lower[prev_pos + 1]);

        }
        
        prev_pos = byte_pos;

        if (is_ignore_word_utf8(cp)) {

            if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
                SWISH_DEBUG_MSG("%s is ignore_word", chr);

            if (inside_token) {
                if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
                    SWISH_DEBUG_MSG("found end of token: '%s'", chr);

                inside_token = 0;       /*  turn off flag */

                token[++token_len] = '\0';
                copy = token;
                token_len = strip_utf8_chrs(token, token_len);

                if (token[0] != '\0' && token_len >= minwordlen) {

                    swish_token_list_add_token(tl, token, token_len, meta, context);

                }
                else {
                    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
                        SWISH_DEBUG_MSG("skipping token '%s' -- too short: %d", token,
                                        token_len);
                }

                token = copy;   /*  restore to top of array so we do not leak */

                if (cp == SWISH_TOKENPOS_BUMPER[0]) {
                    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
                        SWISH_DEBUG_MSG("found tokenpos bumper byte at pos %d", tl->pos);
                    tl->pos++;
                }

                continue;

            }
            else {
                if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
                    SWISH_DEBUG_MSG("ignoring chr '%s'", chr);

                if (cp == SWISH_TOKENPOS_BUMPER[0]) {
                    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
                        SWISH_DEBUG_MSG("found tokenpos bumper byte at pos %d", tl->pos);
                    tl->pos++;
                }

                continue;
            }

        }
        else {

            if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
                SWISH_DEBUG_MSG("%s is NOT ignore_word", chr);

            if (inside_token) {

                /* edge case: the character does not fit any more, so it
                 * is dropped while the token stays open */
                if ((chr_len + token_len) > maxwordlen) {
                    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
                        SWISH_DEBUG_MSG("token_len = %d  forcing end of token: '%s'",
                                        token_len, chr);
                    continue;
                }

                if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
                    SWISH_DEBUG_MSG("adding to token: '%s'", chr);

                memcpy(&token[token_len], chr, chr_len * sizeof(xmlChar));
                token[token_len + chr_len] = '\0';
                token_len += chr_len;

                /* close the token when it is full or the input ends here */
                if (token_len >= maxwordlen || buf_lower[byte_pos] == '\0') {

                    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
                        SWISH_DEBUG_MSG("token_len = %d  forcing end of token: '%s'",
                                        token_len, chr);

                    inside_token = 0;   /*  turn off flag */

                    token[++token_len] = '\0';
                    copy = token;
                    token_len = strip_utf8_chrs(token, token_len);

                    if (token[0] != '\0' && token_len >= minwordlen) {

                        swish_token_list_add_token(tl, token, token_len, meta, context);

                    }
                    else {
                        if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
                            SWISH_DEBUG_MSG("skipping token '%s' -- too short: %d", token,
                                            token_len);
                    }

                    token = copy;       /*  restore to top of array */

                }

                if (cp == SWISH_TOKENPOS_BUMPER[0]) {
                    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
                        SWISH_DEBUG_MSG("found tokenpos bumper byte at pos %d", tl->pos);
                    tl->pos++;
                }

                continue;

            }
            else {

                if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
                    SWISH_DEBUG_MSG("start a token with '%s'", chr);

                token[0] = '\0';
                token_len = 0;
                inside_token = 1;       /*  turn on flag */
                /* edge case */
                if (chr_len > maxwordlen)
                    continue;

                memcpy(&token[0], chr, chr_len * sizeof(xmlChar));
                token[chr_len] = '\0';
                token_len += chr_len;
                
                /* special case for one-character tokens */
                if (buf_lower[prev_pos] == '\0' && minwordlen == 1) {
                    inside_token        = 0;
                    token[token_len++]  = '\0';
                    swish_token_list_add_token(tl, token, token_len, meta, context);
                }

                if (cp == SWISH_TOKENPOS_BUMPER[0]) {
                    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
                        SWISH_DEBUG_MSG("found tokenpos bumper byte at pos %d", tl->pos);
                    tl->pos++;
                }

                continue;

            }

        }

    }

    swish_xfree(token);
    swish_xfree(buf_lower);
    return tl->n - nstart;
}

/*
 * Split the ASCII string buf into tokens and append them to ti->tl.
 *
 * ti:      iterator supplying the TokenList (ti->tl) and the Analyzer
 *          limits (ti->a->maxwordlen / minwordlen).
 * buf:     NUL-terminated text; each byte is lowercased with tolower().
 * meta:    MetaName recorded on every token.
 * context: context string recorded on every token.
 *
 * Bytes whose ascii_word_table entry is 0 separate tokens. Bytes >= 128
 * lie outside the 128-entry table and are treated as separators as well.
 * A token is also closed when it reaches maxwordlen bytes or when the
 * end of the buffer follows. Closed tokens are trimmed with
 * strip_ascii_chrs() and added when non-empty and token_len (which
 * counts the terminating NUL) is at least minwordlen. The first byte of
 * SWISH_TOKENPOS_BUMPER additionally bumps the list's position counter.
 *
 * Returns the number of tokens added to the TokenList.
 */
int
swish_tokenize_ascii(
    swish_TokenIterator *ti, 
    xmlChar *buf, 
    swish_MetaName *meta,
    xmlChar *context
)
{
    char c, nextc;
    boolean inside_token;
    int i, token_len, nstart, maxwordlen, minwordlen, is_word_chr;
    xmlChar *token, *copy;
    swish_TokenList *tl;
    
    tl              = ti->tl;
    maxwordlen      = ti->a->maxwordlen;
    minwordlen      = ti->a->minwordlen;
    /* +2: with a very small maxwordlen a token can hold one byte more
     * than maxwordlen before it is closed, plus its terminating NUL */
    token           = swish_xmalloc(sizeof(xmlChar) * (maxwordlen + 2));
    nstart          = tl->n;
    token_len       = 0;
    token[0]        = '\0';
    inside_token    = 0;

    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
        SWISH_DEBUG_MSG("tokenizing string: '%s'", buf);

    for (i = 0; buf[i] != '\0'; i++) {
        c = (char)tolower(buf[i]);
        nextc = (char)tolower(buf[i + 1]);

        if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
            SWISH_DEBUG_MSG(" char: %c lower: %c  int: %d %#x (next is %c)", buf[i], c,
                            (int)c, (unsigned int)c, nextc);

        /* never index the 128-entry table with a byte outside 0..127 */
        is_word_chr = ((unsigned char)c < 128)
            && ascii_word_table[(unsigned char)c];
                            
        if (!is_word_chr) {

            if (inside_token) {
                if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
                    SWISH_DEBUG_MSG("found end of token: '%c' at %d", c, i);

                inside_token = 0;
                token[token_len++] = '\0';
                copy = token;
                token_len = strip_ascii_chrs(token, token_len);

                if (token[0] != '\0' && token_len >= minwordlen) {
                    swish_token_list_add_token(tl, token, token_len, meta, context);
                }
                else {
                    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
                        SWISH_DEBUG_MSG("skipping token '%s' -- too short: %d", token,
                                        token_len);
                }

                token = copy;
                
                if (c == SWISH_TOKENPOS_BUMPER[0]) {
                    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
                        SWISH_DEBUG_MSG("found tokenpos bumper byte at pos %d", tl->pos);
                    tl->pos++;
                }


                continue;

            }
            else {
                if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
                    SWISH_DEBUG_MSG("ignoring char '%c'", c);

                if (c == SWISH_TOKENPOS_BUMPER[0]) {
                    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
                        SWISH_DEBUG_MSG("found tokenpos bumper byte at pos %d", tl->pos);
                    tl->pos++;
                }

                continue;
            }

        }
        else {

            if (inside_token) {

                if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
                    SWISH_DEBUG_MSG("adding to token: '%c' %d", c, i);

                token[token_len++] = c;

                /* close the token when it is full or the input ends here */
                if (token_len >= maxwordlen || nextc == '\0') {

                    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
                        SWISH_DEBUG_MSG("forcing end of token: '%c' %d", c, i);

                    inside_token = 0;

                    token[token_len++] = '\0';
                    copy = token;
                    token_len = strip_ascii_chrs(token, token_len);

                    if (token[0] != '\0' && token_len >= minwordlen) {
                        swish_token_list_add_token(tl, token, token_len, meta, context);
                    }
                    else {
                        if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
                            SWISH_DEBUG_MSG("skipping token '%s' -- too short: %d", token,
                                            token_len);
                    }

                    token = copy;

                }

                if (c == SWISH_TOKENPOS_BUMPER[0]) {
                    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
                        SWISH_DEBUG_MSG("found tokenpos bumper byte at pos %d", tl->pos);
                    tl->pos++;
                }

                continue;

            }
            else {

                if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
                    SWISH_DEBUG_MSG("start a token with '%c' %d", c, i);

                token_len = 0;
                inside_token = 1;
                token[token_len++] = c;
                
                /* special case for one-character tokens */
                if (nextc == '\0' && minwordlen == 1) {
                    inside_token        = 0;
                    token[token_len++]  = '\0';
                    swish_token_list_add_token(tl, token, token_len, meta, context);
                }
                
                if (c == SWISH_TOKENPOS_BUMPER[0]) {
                    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
                        SWISH_DEBUG_MSG("found tokenpos bumper byte at pos %d", tl->pos);
                    tl->pos++;
                }

                continue;

            }

        }

    }

    swish_xfree(token);
    return tl->n - nstart;
}


/*************** end tokenizer.c ************/
