/* RCS-style identification string (file name, revision, date, author). */
static char rcsid[] = "localParsers.c,v 1.3 1995/08/04 02:15:28 duane Exp";
/*
 *  Contains a parser for Harvest's Summary Object Interchange Format (SOIF)
 *  http://harvest.cs.colorado.edu/.
 *
 *  There are a few known problems with this parser:
 *    - WAIS parses files line-by-line, so this does not support binary data
 *    - Parsing SOIF attributes is not exact, because for every line the
 *      parser has to guess which attribute the data belongs to.
 *    - Only a total of MAX_FIELD (256) different attributes are supported.
 *      Any SOIF attribute after MAX_FIELD is ignored (on a first-come,
 *      first-served basis).
 *    - It's slow, since it needs to test to see if the current line
 *      starts a new attribute or not (worst case is 2 sscanf's per line).
 *    - Only works with SOIF from a Broker (e.g., attribute names are all
 *      lower case).
 *    - WAIS field names (evidently) cannot contain any
 *      non-alphanumeric characters.
 *
 *  However, this parser does work well for most SOIF data...
 *
 *  Written by Darren Hardy, hardy@cs.colorado.edu, April 1995
 */
#define localParser_c

#include <string.h>
#include <ctype.h>

#include "localParsers.h"

/* define to allow only SOIF attributes that are all lower-case */
/* Harvest Brokers only use lower-case attributes */
#ifndef SOIF_LOWERCASE_ONLY
#define SOIF_LOWERCASE_ONLY
#endif

/* define SOIF_DEBUG for debugging output to stderr (off by default) */
#ifndef SOIF_DEBUG
#undef SOIF_DEBUG
#endif

/* define to strip SOIF attribute names of non-alpha-numeric characters */
#ifndef SOIF_STRIP_ATTR
#define SOIF_STRIP_ATTR
#endif

/*
 *  soif_verify_attr() - Returns 1 if attr looks like a valid SOIF
 *  attribute name, 0 otherwise.
 *
 *  A name is accepted when it is at least two characters long and holds
 *  at least one letter.  With SOIF_LOWERCASE_ONLY defined, that letter
 *  must be lower-case.
 */
static int soif_verify_attr(char *attr)
{
  const char *cursor;
  int has_letter;

  /* reject names shorter than two characters up front */
  if (attr[0] == '\0' || attr[1] == '\0') {
    return(0);
  }

  /* accept as soon as one qualifying letter is seen */
  for (cursor = attr; *cursor != '\0'; cursor++) {
#ifdef SOIF_LOWERCASE_ONLY
    has_letter = islower((unsigned char) *cursor);
#else
    has_letter = isalpha((unsigned char) *cursor);
#endif
    if (has_letter) {
      return(1);
    }
  }
  return(0);
}

/*
 *  strip_attr() - Removes every non-alphanumeric character from attr.
 *
 *  The string is compacted in place: the write position never passes
 *  the read position, so no scratch buffer is needed and the result can
 *  never be longer than the input.
 */
static void strip_attr(char *attr)
{
	char *src;
	char *dst = attr;

	for (src = attr; *src != '\0'; src++) {
		if (isalnum((unsigned char) *src)) {
			*dst++ = *src;
		}
	}
	*dst = '\0';
}

/*
 *  Maps fieldID to attribute name: soif_attrtofid[id] is the name
 *  registered for id, or NULL while id is unassigned.  As a file-scope
 *  static array, every slot starts out as a null pointer.
 */
static char *soif_attrtofid[MAX_FIELD+1];

/*
 *  soif_grab_attrtofid() - Returns the fieldID number for attr.
 *
 *  If attr is already registered, its existing fieldID is returned.
 *  Otherwise a copy of attr is stored in the slot just above the highest
 *  one in use and that index is returned.  Slot MIN_FIELD itself is never
 *  assigned a name here (the first registration lands on MIN_FIELD+1).
 *  MIN_FIELD is instead returned as the "too many attributes" result
 *  once slot MAX_FIELD is taken.
 */
static long soif_grab_attrtofid(char *attr)
{
   long i;
   long last_used = MIN_FIELD;	/* highest occupied slot seen so far */

   for (i = MIN_FIELD; i <= MAX_FIELD; i++) {
     if (soif_attrtofid[i] == NULL) {
       continue;
     }
     last_used = i;
     if (strcmp(attr, soif_attrtofid[i]) == 0) {
       return (i);
     }
   }
   if (last_used == MAX_FIELD && soif_attrtofid[MAX_FIELD] != NULL) {
     return(MIN_FIELD);	/* error: too many attributes */
   }
   soif_attrtofid[++last_used] = safeStrdup(attr);
#ifdef SOIF_DEBUG
   /* last_used is a long, so it must be printed with %ld */
   fprintf(stderr, "ADDED %ld %s\n", last_used, soif_attrtofid[last_used]);
#endif
   return (last_used);
}

/*---------------------------------------------------------------------------*/

/* scanf scanset body listing the characters allowed in an attribute name */
#ifdef SOIF_LOWERCASE_ONLY
#define SOIF_ATTR_SCANSET "a-z0-9-"
#else
#define SOIF_ATTR_SCANSET "A-Za-z0-9-"
#endif

/*
 *  soifField() - Decides which field the given line belongs to.
 *
 *  A line of the form "name{size}:" or "embed<n>-name{size}:" whose name
 *  passes soif_verify_attr() starts a new attribute: *fieldID is set to
 *  that attribute's fieldID and remembered for the following lines.  Any
 *  other line is treated as a continuation and gets the fieldID of the
 *  most recent attribute line (MIN_FIELD before any has been seen).
 *
 *  Returns a freshly made fieldInfo the first time a fieldID is seen
 *  (fieldIsDefined[] is flagged as a side effect), and NULL in every
 *  other case.
 */
fieldInfo*
soifField(char* line,long* fieldID)
{
	char attr[8192];
	int x, vsize;
	int starts_attr;
	static long previous_fieldID = MIN_FIELD;

	attr[0] = '\0';
	x = vsize = -1;

	/*
	 * sscanf's are expensive, so test the first character before each.
	 * The 8191 field width stops %[ from writing past attr[8192] when
	 * a line carries an over-long run of name characters.
	 */
	starts_attr =
	    (line[0] == 'e' &&
	     sscanf(line, "embed<%d>-%8191[" SOIF_ATTR_SCANSET "]{%d}:\t",
	            &x, attr, &vsize) == 3) ||
	    (isalnum((unsigned char) line[0]) &&
	     sscanf(line, "%8191[" SOIF_ATTR_SCANSET "]{%d}:\t",
	            attr, &vsize) == 2);

	if (!starts_attr || !soif_verify_attr(attr)) {
		*fieldID = previous_fieldID;	/* associate data with prev attr */
		return(NULL);
	}

#ifndef SOIF_LOWERCASE_ONLY
	{	/* convert attribute to all lowercase */
		char *p;
		for (p = attr; *p; p++)
			*p = tolower((unsigned char) *p);
	}
#endif
#ifdef SOIF_STRIP_ATTR
	strip_attr(attr);
#endif
	*fieldID = soif_grab_attrtofid(attr);
#ifdef SOIF_DEBUG
	/* *fieldID is a long, so it must be printed with %ld */
	fprintf(stderr, "GRABBED (embed %d) %ld %s\n", x == -1 ?0:1, *fieldID, attr);
#endif
	previous_fieldID = *fieldID;

	if (fieldIsDefined[*fieldID] == false) {
		/* MIN_FIELD is reported under the fixed name "body" */
		char *name = (*fieldID == MIN_FIELD) ? "body" : attr;
		fieldInfo* fd = makeFieldInfo();

		fieldIsDefined[*fieldID] = true;
		fd->fieldID = *fieldID;
		fd->names = collecting(fd->names,safeStrdup(name));
		fd->description = safeStrdup(name);
		fd->fieldType = TEXT_FIELD;
		fd->lowerBoundSet = false;
		fd->upperBoundSet = false;
		return(fd);
	}
	return(NULL);
}

/*---------------------------------------------------------------------------*/

/*
 *  soifGatherHeadline() - Captures headline text from a line that
 *  starts with '@'.
 *
 *  While theHeadline is still empty, the text following the first '{'
 *  on such a line (leading whitespace skipped) is copied into
 *  theHeadline and IN_HEADLINE is returned.  Every other line, including
 *  an '@' line with no '{', yields IN_BODY.
 *
 *  NOTE(review): theHeadline is declared outside this file; it is assumed
 *  to hold at least MAX_HEADLINE_LEN bytes, as the copy always has.
 */
long
soifGatherHeadline(char* line)
{
  char *s;

  /* this is called on every line so be quick about it... */
  if (line[0] != '@' || theHeadline[0] != '\0')
    return IN_BODY;

  s = strchr(line, '{');
  if (s == NULL)
    return IN_BODY;

  /* save away the URL */
  s++;
  while (isspace((unsigned char) *s))
    s++;
  /* strncpy does not terminate on truncation, so terminate explicitly */
  strncpy(theHeadline,s,MAX_HEADLINE_LEN - 1);
  theHeadline[MAX_HEADLINE_LEN - 1] = '\0';
  return(IN_HEADLINE);
}

/*---------------------------------------------------------------------------*/

/*
 *  soifFinishHeadline() - Builds the final headline for a document.
 *
 *  If a headline was captured in theHeadline, the result is
 *  "<basename of filename> <theHeadline>", and theHeadline is then
 *  cleared for the next document.  Otherwise the result is the fixed
 *  text "Unknown Headline".
 *
 *  At most MAX_HEADLINE_LEN bytes, terminator included, are written to
 *  headline on the captured-headline path, however long the basename
 *  or the captured text is.
 */
void 
soifFinishHeadline(char* headline,char* filename)
{
  char *base;
  size_t room;	/* characters that fit before the terminator */
  size_t used;	/* characters written so far */

  if (theHeadline[0] == '\0') {
    strcpy(headline,"Unknown Headline");
    return;
  }

  /* Use the OBJxxx first in the headline */
  base = strrchr(filename, '/');
  base = (base == NULL) ? filename : base + 1;

  /* Headline is ``OBJxxx URL'' */
  room = MAX_HEADLINE_LEN - 1;
  used = strlen(base);
  if (used > room)
    used = room;
  memcpy(headline, base, used);
  if (used < room)
    headline[used++] = ' ';
  headline[used] = '\0';
  /* room - used cannot wrap because used <= room is maintained above */
  strncat(headline,theHeadline,room - used);
  theHeadline[0] = '\0';
}

/*---------------------------------------------------------------------------*/

/*
 *  Table of the parsers provided by this file.
 *
 *  The first entry registers the parser named "soif" and hands over
 *  soifField, soifGatherHeadline and soifFinishHeadline; all of its
 *  other function slots are NULL.  The second entry has a NULL name and
 *  NULL in every slot.
 *
 *  NOTE(review): the meaning of each defParser argument position comes
 *  from the defParser macro, which is defined outside this file --
 *  confirm there before adding or reordering arguments.
 */
parserInfo localParserList[] = 
{
  /* the SOIF parser */
  defParser("soif",
            "Harvest's Summary Object Interchange Format",
            "TEXT",
            true, /* true to index contents, false otherwise */
            NULL,
            soifField,
            soifGatherHeadline,
            NULL,
            NULL,
            NULL,
            NULL,
            soifFinishHeadline,
            NULL
           ),
  /* all-NULL entry */
  defParser(NULL,
            NULL,
            NULL,
            false,
            NULL,
            NULL,
            NULL,
            NULL,
            NULL,
            NULL,
            NULL,
            NULL,
            NULL
           )
};

/*  The localParserList is a null-terminated list of defParser structures;
    DO NOT DELETE the NULL structure at the end! */

/*---------------------------------------------------------------------------*/
