/* RCS identification string recording the revision of summarize.c. */
static char rcsid[] = "summarize.c,v 1.87 1996/01/05 20:28:59 duane Exp";
/*
 *  summarize.c - Summarizing for the Essence system.
 *
 *  DEBUG: section  64, level 1         Gatherer essence object summarizing
 * 
 *  Darren Hardy, hardy@cs.colorado.edu, February 1994
 *
 *  ----------------------------------------------------------------------
 *  Copyright (c) 1994, 1995.  All rights reserved.
 *  
 *    The Harvest software was developed by the Internet Research Task
 *    Force Research Group on Resource Discovery (IRTF-RD):
 *  
 *          Mic Bowman of Transarc Corporation.
 *          Peter Danzig of the University of Southern California.
 *          Darren R. Hardy of the University of Colorado at Boulder.
 *          Udi Manber of the University of Arizona.
 *          Michael F. Schwartz of the University of Colorado at Boulder.
 *          Duane Wessels of the University of Colorado at Boulder.
 *  
 *    This copyright notice applies to software in the Harvest
 *    ``src/'' directory only.  Users should consult the individual
 *    copyright notices in the ``components/'' subdirectories for
 *    copyright information about other software bundled with the
 *    Harvest source code distribution.
 *  
 *  TERMS OF USE
 *    
 *    The Harvest software may be used and re-distributed without
 *    charge, provided that the software origin and research team are
 *    cited in any use of the system.  Most commonly this is
 *    accomplished by including a link to the Harvest Home Page
 *    (http://harvest.cs.colorado.edu/) from the query page of any
 *    Broker you deploy, as well as in the query result pages.  These
 *    links are generated automatically by the standard Broker
 *    software distribution.
 *    
 *    The Harvest software is provided ``as is'', without express or
 *    implied warranty, and with no support nor obligation to assist
 *    in its use, correction, modification or enhancement.  We assume
 *    no liability with respect to the infringement of copyrights,
 *    trade secrets, or any patents, and are not responsible for
 *    consequential damages.  Proper use of the Harvest software is
 *    entirely the responsibility of the user.
 *  
 *  DERIVATIVE WORKS
 *  
 *    Users may make derivative works from the Harvest software, subject 
 *    to the following constraints:
 *  
 *      - You must include the above copyright notice and these 
 *        accompanying paragraphs in all forms of derivative works, 
 *        and any documentation and other materials related to such 
 *        distribution and use acknowledge that the software was 
 *        developed at the above institutions.
 *  
 *      - You must notify IRTF-RD regarding your distribution of 
 *        the derivative work.
 *  
 *      - You must clearly notify users that your are distributing 
 *        a modified version and not the original Harvest software.
 *  
 *      - Any derivative product is also subject to these copyright 
 *        and use restrictions.
 *  
 *    Note that the Harvest software is NOT in the public domain.  We
 *    retain copyright, as specified above.
 *  
 *  HISTORY OF FREE SOFTWARE STATUS
 *  
 *    Originally we required sites to license the software in cases
 *    where they were going to build commercial products/services
 *    around Harvest.  In June 1995 we changed this policy.  We now
 *    allow people to use the core Harvest software (the code found in
 *    the Harvest ``src/'' directory) for free.  We made this change
 *    in the interest of encouraging the widest possible deployment of
 *    the technology.  The Harvest software is really a reference
 *    implementation of a set of protocols and formats, some of which
 *    we intend to standardize.  We encourage commercial
 *    re-implementations of code complying to this set of standards.  
 *  
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <memory.h>
#include <ctype.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/wait.h>
#include <time.h>
#include <GNUregex.h>
#include "util.h"
#include "essence.h"
#include "post_process.h"

/*
 *  Local Functions
 *
 *  Old-style (non-prototype) declarations; the parameter lists are
 *  given at the definitions further down in this file.
 */
static int summarize_file();		/* builds and stores an object's template */
static int read_structured_summary();	/* parses attribute-value summarizer output */
static void grab_fulltext();		/* adds a whole file as one attribute */
static void mkkeywords();		/* builds the keywords attribute */
static void mkdescription();		/* builds the description attribute */
static void mkgid();			/* adds missing Gatherer id attributes */
#ifdef USE_QUICKSUM
/* Regular-expression based in-process summarizer */
static int can_quicksum();
static void init_quicksum();
static void generate_quicksum();
static void finish_quicksum();
#endif

/*
 *  summarize() - Entry point of the Summarize step for one object.
 *
 *  An object whose URL type is one of the supported types is passed
 *  to summarize_file() and that function's result is returned.  Any
 *  other URL type is logged as an internal error and 1 is returned.
 *  When NO_UNIX_RECURSE is defined, objects typed "Directory" are
 *  logged, skipped, and reported as success (0).
 */
int summarize(object)
     DataObject *object;
{
	int supported;

	Debug(64, 1, ("summarize(%s, %s)\n", object->url->url, object->type));

#ifdef NO_UNIX_RECURSE
	/* Directories are not summarized in this configuration */
	if (strcmp(object->type, "Directory") == 0) {
		Log("Skipping %s (%s)\n", object->url->url, object->type);
		return (0);
	}
#endif

	/* Classify the URL type first, then act on the classification */
	switch (object->url->type) {
	case URL_FILE:
	case URL_FTP:
	case URL_GOPHER:
	case URL_NEWS:
	case URL_HTTP:
	case URL_NOP:
		supported = 1;
		break;
	default:
		supported = 0;
		break;
	}

	if (!supported) {
		errorlog("Internal summarize() error.  Unsupported type.\n");
		return (1);
	}
	return (summarize_file(object));
}

/*
 *  init_summarize() - Initializes the Summarize step.
 *
 *  When USE_QUICKSUM is defined this calls init_quicksum(); otherwise
 *  the function body is empty.  Takes no arguments and returns nothing.
 */
void init_summarize()
{
#ifdef USE_QUICKSUM
	init_quicksum();
#endif
}

/*
 *  finish_summarize() - Cleans up after the Summarize step.
 *
 *  When USE_QUICKSUM is defined this calls finish_quicksum(); otherwise
 *  the function body is empty.  Takes no arguments and returns nothing.
 */
void finish_summarize()
{
#ifdef USE_QUICKSUM
	finish_quicksum();
#endif
}

/*
 *  summarize_file() - Builds the summary template for one object and,
 *  unless post-processing vetoes it, adds the template to the storage
 *  manager.
 *
 *  The template always receives the file type.  If the object could be
 *  retrieved and may be inspected, its size (and MD5, with USE_MD5) are
 *  added, and its contents are summarized by the first applicable
 *  method: full-text capture (do_fulltext), the internal quicksum
 *  summarizer (USE_QUICKSUM), or an external summarizer program whose
 *  standard output is read through a pipe.
 *
 *  Failures along the way (pipe, fork, fdopen, malformed summarizer
 *  output) are logged and the template built so far is still stored.
 *  The function therefore always returns 0.
 */
static int summarize_file(obj)
     DataObject *obj;
{
	Template *template = NULL;
	FILE *ifp = NULL;
	struct OID *oid = NULL;
	char buf[BUFSIZ], *s, *q;
	int pipefds[2], pid = 0, err;
	int localobj = 0;
	int pp_code = 0;

	/* 
	 *  We don't really need the object to do a full summary, so
	 *  set a flag to say if we got it or not to make this section
	 *  more clear.
	 */
	localobj = !object_retrieve(obj);

	/*
	 *  Check to see if this object is a nested object.  
	 *  If so, change the URL of the template to the URL 
	 *  of the parent object, and include an Attribute for 
	 *  the name of the nested file (using only the relative pathname)
	 */
	if ((obj->flags & F_NESTED) && obj->parent_url) {
		Debug(64, 1, ("Creating Nested object for %s\n", obj->parent_url));
		template = create_template(obj->ttype, obj->parent_url);
		/* skip past the temporary directory prefix, if present */
		s = strstr(obj->url->url, tmpdir);
		s = (s != NULL) ? s + strlen(tmpdir) + 1 : obj->url->url;
		/* then drop the first remaining path component */
		q = strchr(s, '/');
		q = (q == NULL) ? s : q + 1;
		template->list = create_AVList(T_NESTED, q, strlen(q));
	} else {
		oid = generate_oid(obj->url->url, gatherer_id, obj);
		template = create_template_with_oid(obj->ttype,
		    obj->url->url, oid);
	}
	/* Add some other known Attributes */
	add_AVList(template->list, T_FILETYPE, obj->type, strlen(obj->type));

	/* We can't look at the object so finish up */
	if ((obj->flags & F_NO_ACCESS) || (obj->flags & F_MANUAL) || !localobj) {
		goto finish_summarizing;
	}
	/* below here localobj == non-zero; add a few more attributes... */

	sprintf(buf, "%u", (unsigned int) obj->s->st_size);
	add_AVList(template->list, T_FILESIZE, buf, strlen(buf));
#ifdef USE_MD5
	{
		/* If the file is local, we have its MD5 value */
		if (obj->url->md5) {
			add_AVList(template->list, T_MD5, obj->url->md5,
			    strlen(obj->url->md5));
		}
	}
#endif
	/* If we don't know its type then we can do no more */
	if (!strcmp(obj->type, "Unknown")) {
		goto finish_summarizing;
	}
	/* For full-text indexing try to grab all the data and finish */
	if (do_fulltext) {
		grab_fulltext(template, obj);
		goto finish_summarizing;
	}
	/* 
	 *  Summarize the Object 
	 *
	 *  The quicksum mechanism lets some easy types be summarized 
	 *  very quickly using regular expressions; we save a fork()
	 *  and it's easier to specify how to summarize the object.
	 *  See quicksum.cf for the regular expression syntax.
	 * 
	 *  With USE_QUICKSUM the quick summing is done in a procedure;
	 *  otherwise the external quick-sum program is probed and, if
	 *  the probe succeeds, used as the command to run.  If neither
	 *  applies, the standard external summarizer "<type>.sum" is run.
	 *
	 *  NOTE: the "else" after the #endif pairs with whichever "if"
	 *  the preprocessor selected above it.
	 */
#ifdef USE_QUICKSUM
	if (can_quicksum(obj->type)) {
		generate_quicksum(template, obj);
		goto finish_summarizing;
	}
#else
	buf[0] = '\0';
	sprintf(buf, "quick-sum \"%s\" \"%s\" < /dev/null",
	    quicksum_file, obj->type);
	if (do_system(buf) == 0) {	/* Make sure this works */
		buf[0] = '\0';
		sprintf(buf, "quick-sum \"%s\" \"%s\" \"%s\"",
		    quicksum_file, obj->type, obj->url->filename);
	}
#endif
	else {
		buf[0] = '\0';	/* in case sprintf fails */
		sprintf(buf, "%s.sum \"%s\"", obj->type, obj->url->filename);
	}

	Debug(64, 1, ("Summarizer: RUNNING: %s\n", buf));

	/* 
	 *  Run the external summarizer.  We could use popen(3), but it
	 *  exec's a shell to process the command line.  We build our
	 *  own pipeline and fork/exec to save this extra process.
	 */
	if (pipe(pipefds) < 0) {
		log_errno("pipe");
		goto finish_summarizing;
	}
	/*
	 *  We can't use vfork() here, because otherwise parse_argv 
	 *  introduces a memory leak.
	 */
	if ((pid = fork()) < 0) {
		log_errno("fork");	/* log first: close() may change errno */
		close(pipefds[0]);	/* no child, so nobody will use the pipe */
		close(pipefds[1]);
		goto finish_summarizing;
	}
	if (pid == 0) {		/* child: summarizer */
		char *argv[64], *urlbuf;

		close(pipefds[0]);	/* child wont read from pipe */
		dup2(pipefds[1], 1);	/* stdout -> write:pipe */
		close(pipefds[1]);	/* close pipe, its now stdout */

		/* parse_argv may barf, so initialize */
		memset(argv, '\0', sizeof(argv));
		parse_argv(argv, buf);

		/* add an environment variable for the child */
		urlbuf = xmalloc(strlen(obj->url->url) + 32);
		urlbuf[0] = '\0';
		sprintf(urlbuf, "SUMMARIZER_URL=%s", obj->url->url);
		if (putenv(urlbuf) < 0) {
			log_errno("putenv");
		}
		execvp(argv[0], argv);
		/* only reached when execvp() failed */
		sprintf(buf, "execvp: %s", argv[0]);
		log_errno(buf);
		_exit(1);
	}
	/* parent */
	close(pipefds[1]);
	if ((ifp = fdopen(pipefds[0], "r")) == NULL) {
		errorlog("summarize: Running external summarizer: ");
		log_errno(buf);
		close(pipefds[0]);
		/*
		 *  The child is already running; reap it here because the
		 *  clean up code below only waits when ifp is set.
		 */
		err = waitpid(pid, (int *) NULL, 0);
		if (err != pid) {
			Debug(64, 1, ("WARNING: waiting for child %d got %d...\n",
				pid, err));
		}
		goto finish_summarizing;
	}
	if (!read_structured_summary(ifp, template)) {
		errorlog("Invalid output from %s.sum (url=%s)\n",
		    obj->type, obj->url->url);
	}
	/* Write the Template to the Database */
      finish_summarizing:
	Debug(64, 1, ("Finish building summary for %s\n", obj->url->url));
	if (obj->avl) {		/* add "hardcoded" AVPairs to Template */
		merge_AVList(template->list, obj->avl);
		Debug(64, 1, ("Merging AVList for obj: %s\n", obj->url->url));
	}
	if (do_keywords) {
		mkkeywords(template);
	}
	mkdescription(template);	/* only do description for main tmpl */
	mkgid(template);
	pp_code = post_process(template);
	if (pp_code == SUMMARIZE_DONT_ADD_OBJECT) {
		Debug(64, 1, ("NOT adding %s to the database\n", obj->url->url));
	} else {
		add_template(template, obj);
	}

	/* clean up */
	free_template(template);
	if (oid)
		free_oid(oid);
	if (ifp) {		/* some people came from early on so check */
		/*
		 *  fclose() also closes the underlying pipe descriptor, so
		 *  pipefds[0] must not be closed a second time.
		 */
		fclose(ifp);
		/* explicitly wait for the summarizer to exit */
		err = waitpid(pid, (int *) NULL, 0);
		if (err != pid) {
			Debug(64, 1, ("WARNING: waiting for child %d got %d...\n",
				pid, err));
		}
	}
	return (0);
}

/*
 *  summarize_nested_object() - Stores a stub template for a nested
 *  object.  No content summarizing is done: the template carries the
 *  file type, the file size (and MD5, with USE_MD5) when the object
 *  could be retrieved, plus keywords and Gatherer identification.
 *
 *  If the object is flagged F_NESTED and has a parent URL, the template
 *  is created under the parent's URL and gets a T_NESTED attribute
 *  holding the object's URL with the temporary directory prefix and
 *  the first following path component removed.  Otherwise the template
 *  is created under the object's own URL with a generated OID.
 *
 *  Always returns 0.
 */
int summarize_nested_object(object)
     DataObject *object;
{
	Template *tmpl = NULL;
	struct OID *oid = NULL;
	char sizestr[BUFSIZ];
	char *rel, *slash;
	int have_local;
	int pp_code;

	/* Force retrieval of object */
	have_local = !object_retrieve(object);

	if (!(object->flags & F_NESTED) || !object->parent_url) {
		/* Not nested (or no parent known): use the object's own URL */
		Debug(64, 1, ("Creating Nested object for %s\n", object->url->url));
		oid = generate_oid(object->url->url, gatherer_id, object);
		tmpl = create_template_with_oid(object->ttype,
		    object->url->url, oid);
	} else {
		Debug(64, 1, ("Creating Nested object for %s\n", object->parent_url));
		tmpl = create_template(object->ttype, object->parent_url);

		/* Step over the temporary directory prefix when present */
		rel = strstr(object->url->url, tmpdir);
		if (rel != NULL)
			rel += strlen(tmpdir) + 1;
		else
			rel = object->url->url;

		/* Drop the first path component that follows */
		slash = strchr(rel, '/');
		if (slash != NULL)
			rel = slash + 1;

		tmpl->list = create_AVList(T_NESTED, rel, strlen(rel));
	}

	/* Attributes known without looking inside the object */
	add_AVList(tmpl->list, T_FILETYPE, object->type, strlen(object->type));
	if (have_local) {
		sprintf(sizestr, "%u", (unsigned int) object->s->st_size);
		add_AVList(tmpl->list, T_FILESIZE, sizestr, strlen(sizestr));
#ifdef USE_MD5
		/* The MD5 value is only recorded for retrieved objects */
		if (object->url->md5) {
			add_AVList(tmpl->list, T_MD5, object->url->md5,
			    strlen(object->url->md5));
		}
#endif
	}

	/* A stubby template is enough; finish it and write it out */
	if (do_keywords) {
		mkkeywords(tmpl);
	}
	mkgid(tmpl);

	pp_code = post_process(tmpl);
	if (pp_code != SUMMARIZE_DONT_ADD_OBJECT) {
		add_template(tmpl, object);
	} else {
		Debug(64, 1, ("NOT adding %s to the database\n",
			object->url->url));
	}

	/* clean up */
	free_template(tmpl);
	if (oid)
		free_oid(oid);
	return (0);
}


/* Local functions */

/*
 *  Status codes shared by the summary parsing helpers below.
 */
enum {
	SUM_OK = 0,		/* item consumed; keep parsing */
	SUM_STOP = 1,		/* end of data or buffer full; stop, summary accepted */
	SUM_BAD = 2		/* malformed input; stop, summary rejected */
};

/*
 *  summary_skip_whitespace() - Consumes whitespace from fp.  Returns
 *  SUM_STOP on end of file or when a '}' is read; otherwise pushes the
 *  first non-whitespace character back and returns SUM_OK.
 */
static int summary_skip_whitespace(fp)
     FILE *fp;
{
	int c;

	while ((c = fgetc(fp)) != EOF) {
		if (c == '}')
			return SUM_STOP;
		if (!isspace(c)) {
			ungetc(c, fp);
			return SUM_OK;
		}
	}
	return SUM_STOP;
}

/*
 *  summary_grab_token() - Reads characters from fp into buf until a
 *  '{' or '}' delimiter, which is consumed but not stored.  On SUM_OK
 *  buf holds the NUL-terminated token.  Returns SUM_STOP on end of
 *  file or when the token would not fit in size bytes, and SUM_BAD
 *  when a newline is read before a delimiter.
 */
static int summary_grab_token(fp, buf, size)
     FILE *fp;
     char *buf;
     int size;
{
	char *p = buf;
	int c;

	for (;;) {
		c = fgetc(fp);
		if (c == EOF)
			return SUM_STOP;
		if (c == '\n')
			return SUM_BAD;
		if (c == '{' || c == '}')
			break;
		*p++ = c;
		if (p == &buf[size - 1])
			return SUM_STOP;
	}
	*p = '\0';
	return SUM_OK;
}

/*
 *  summary_grab_ttype() - Handles the optional "@TYPE { URL" header
 *  line.  If the next character is not '@' it is pushed back and
 *  nothing else happens.  Otherwise the template type and then the
 *  URL are read (using buf as scratch space) and replace the ones in
 *  template, and the rest of the line is discarded.
 *
 *  Returns SUM_OK when parsing should continue and SUM_STOP when the
 *  input ended or a field did not fit in size bytes.  End of file is
 *  checked in every loop so that truncated input can neither be stored
 *  as data nor make the function spin forever.
 */
static int summary_grab_ttype(fp, template, buf, size)
     FILE *fp;
     Template *template;
     char *buf;
     int size;
{
	int c, i;

	if ((c = getc(fp)) != '@') {
		ungetc(c, fp);
		return SUM_OK;
	}
	/* template type: runs up to whitespace or the opening brace */
	memset(buf, '\0', size);
	i = 0;
	for (c = getc(fp); c != EOF && !isspace(c) && c != '{'; c = getc(fp)) {
		if (i >= size - 1)
			return SUM_STOP;
		buf[i++] = c;
	}
	if (c == EOF)
		return SUM_STOP;
	xfree(template->template_type);
	template->template_type = xstrdup(buf);

	/* skip the separator between the type and the URL */
	while (isspace(c) || c == '{')
		c = getc(fp);
	ungetc(c, fp);

	/* URL: runs up to the next whitespace */
	memset(buf, '\0', size);
	i = 0;
	for (c = getc(fp); c != EOF && !isspace(c); c = getc(fp)) {
		if (i >= size - 1)
			return SUM_STOP;
		buf[i++] = c;
	}
	if (c == EOF)
		return SUM_STOP;
	xfree(template->url);
	template->url = xstrdup(buf);

	/* discard the remainder of the header line */
	while (c != '\n' && c != EOF)
		c = getc(fp);
	return SUM_OK;
}

/*
 *  read_structured_summary() - Reads the output of a structured
 *  summarizer that outputs its data as attribute value pairs.  It
 *  parses the attribute value pairs and adds them to the given template.
 *
 *  Each pair has the form  Attribute{size}:<TAB>value  where size is
 *  the number of bytes in value.  Parsing ends at end of file or at a
 *  '}' found where an attribute name is expected.
 *
 *  Returns non-zero when parsing ended normally, and 0 when the input
 *  was malformed (newline inside an attribute name or size, negative
 *  size, missing ':' or <TAB>, or a value shorter than announced).
 *  Pairs added before the problem was detected stay in the template.
 *
 *  XXX: NOTE that this doesn't work if the summarizer crashes.  For
 *  example, if it gets a segmentation fault and prints that to stderr,
 *  then the text to stderr will get caught in the attribute name.  Need
 *  to check the exit code to make sure that it returns 0, otherwise
 *  throw away the data that was generated.  The token reader gives up
 *  on '\n', so if there's an error message from the summarizer that
 *  goes to fp, then the parser stops.
 */
static int read_structured_summary(fp, template)
     FILE *fp;
     Template *template;
{
	static char buf[BUFSIZ];
	char *attr = NULL;
	char *value = NULL;
	int vsize;
	int c;
	int rc;

	/* @TYPE { is optional */
	if (summary_grab_ttype(fp, template, buf, BUFSIZ) != SUM_OK)
		return 1;

	for (;;) {
		if (summary_skip_whitespace(fp) != SUM_OK)
			return 1;

		/* Read Attribute */
		rc = summary_grab_token(fp, buf, BUFSIZ);
		if (rc != SUM_OK)
			return (rc == SUM_STOP);
		attr = xstrdup(buf);

		/* Read Value Size; attr is owned by us from here on */
		rc = summary_grab_token(fp, buf, BUFSIZ);
		if (rc != SUM_OK) {
			xfree(attr);
			return (rc == SUM_STOP);
		}
		vsize = atoi(buf);
		if (vsize < 0) {
			xfree(attr);
			return 0;	/* a size can't be negative */
		}
		c = fgetc(fp);
		if (c != ':') {
			xfree(attr);
			return 0;	/* expecting : */
		}
		c = fgetc(fp);
		if (c != '\t') {
			xfree(attr);
			return 0;	/* expecting <TAB> */
		}

		/* Read Value */
		value = xmalloc((size_t) vsize + 1);
		if (fread(value, 1, (size_t) vsize, fp) != (size_t) vsize) {
			xfree(attr);
			xfree(value);
			return 0;
		}
		value[vsize] = '\0';
		if (do_cksumdups)
			add_AVList(template->list, attr, value, vsize);
		else
			FAST_add_AVList(template->list, attr, value, vsize);
		xfree(attr);
		xfree(value);
	}
}

/*
 *  grab_fulltext() - Adds the contents of an entire file to the template
 *  as the T_FULLTEXT attribute.
 *
 *  Does nothing when the object has no local filename or no stat
 *  information.  If the file cannot be opened, or fewer bytes than the
 *  recorded st_size can be read, the problem is logged and the template
 *  is left unchanged.
 */
static void grab_fulltext(template, object)
     Template *template;
     DataObject *object;
{
	FILE *fp;
	char *value = NULL;
	size_t size;

	if (!object->url->filename || !object->s)	/* Object is not local... */
		return;

	if ((fp = fopen(object->url->filename, "r")) == NULL) {
		log_errno(object->url->filename);
		return;
	}
	/* We have the file, so slurp the whole thing in at once */
	size = (size_t) object->s->st_size;
	value = xmalloc(size + 1);
	if (fread(value, 1, size, fp) != size) {
		log_errno(object->url->filename);	/* log before fclose() can touch errno */
		fclose(fp);
		xfree(value);
		return;
	}
	fclose(fp);
	value[size] = '\0';	/* be nice */
	add_AVList(template->list, T_FULLTEXT, value, object->s->st_size);
	xfree(value);		/* don't need this memory anymore */
}


#ifdef USE_QUICKSUM
#define MAX_REGEX 32		/* max number of regular expressions per type */

/* 
 *  For each type, define all of the regular expressions and their
 *  associated attributes.  Entry j of attribute[], regex[] and
 *  compiled[] belong together; the code in this file walks attribute[]
 *  and regex[] until it meets a NULL entry.
 */
struct quicksums {
	char *type;			/* object type name these rules apply to */
	char *attribute[MAX_REGEX];	/* attribute to store matching lines under */
	char *regex[MAX_REGEX];		/* regular expression source text */
	regex_t compiled[MAX_REGEX];	/* compiled version of regex */
};
/* Table of per-type rules; scanned from index 0 up to the first NULL entry */
struct quicksums *qs[MAX_TYPES];

/*
 *  When summarizing data, allocate a structure for each attribute that 
 *  lets us control the value buffer.
 */
struct avbuf {
	char *attribute;	/* attribute name, or NULL for an unused slot */
	Buffer *b;		/* lines collected for that attribute */
};

/* The Buffers in vbuf[] are created by init_quicksum() */
struct avbuf vbuf[MAX_REGEX];	/* value buffers */

/* do_match(s, c) is true when string s matches compiled expression c */
#if defined(USE_POSIX_REGEX)
#define do_match(s, c) (regexec((c), (s), 0, 0, 0) == 0)
#else
#error "unsupported"
#endif

/*
 *  init_quicksum() - Loads the quicksum rules from quicksum_file into
 *  qs[] and creates the value buffers in vbuf[].
 *
 *  Each rule line has the form
 *
 *      type<TAB>attribute<whitespace>regular-expression
 *
 *  Lines starting with '#' are comments.  Lines that lack one of the
 *  fields (including blank lines) are ignored, as are rules whose
 *  regular expression does not compile.  The last slot of qs[] and of
 *  each per-type array is kept as a NULL terminator, so at most
 *  MAX_TYPES - 1 types with MAX_REGEX - 1 rules each are stored; rules
 *  beyond that are dropped with a warning.
 *
 *  If quicksum_file cannot be opened the error is logged, qs[] is left
 *  empty and no buffers are created.
 */
static void init_quicksum()
{
	FILE *fp;
	char buf[BUFSIZ], *type, *attr, *regex, *p, *s;
	int i, j, is_new;

	memset(qs, '\0', MAX_TYPES * sizeof(struct quicksums *));
	if ((fp = fopen(quicksum_file, "r")) == NULL) {
		log_errno(quicksum_file);
		return;
	}
	while (fgets(buf, BUFSIZ, fp)) {
		if (buf[0] == '#')
			continue;	/* skip comments */
		if (buf[0] == '\0')
			continue;	/* nothing to parse */

		/* type name: at least one character, ended by a TAB */
		type = buf;
		if ((p = strchr(buf + 1, '\t')) == NULL)
			continue;	/* no TAB: not a rule line */
		*p++ = '\0';

		/* attribute name: next run of non-whitespace */
		while (isspace((unsigned char) *p))
			p++;
		attr = p;
		while (*p != '\0' && !isspace((unsigned char) *p))
			p++;
		if (*p == '\0')
			continue;	/* attribute or regex field missing */
		*p++ = '\0';

		/* regular expression: rest of the line */
		while (isspace((unsigned char) *p))
			p++;
		regex = p;
		if ((s = strrchr(regex, '\n')) != NULL)
			*s = '\0';	/* remove newline */

		/* find this type's entry, or the first free slot */
		for (i = 0; i < MAX_TYPES - 1 && qs[i] != NULL; i++) {
			if (!strcmp(qs[i]->type, type))
				break;
		}
		if (i == MAX_TYPES - 1) {
			errorlog("WARNING! INCREASE MAX_TYPES");
			continue;
		}
		is_new = (qs[i] == NULL);
		if (is_new) {
			qs[i] = xmalloc(sizeof(struct quicksums));
			/* all-NULL lists are valid, empty, terminated lists */
			memset(qs[i], '\0', sizeof(struct quicksums));
		}

		/* find attribute's place */
		for (j = 0; j < MAX_REGEX - 1 && qs[i]->attribute[j] != NULL; j++);
		if (j == MAX_REGEX - 1) {
			errorlog("WARNING! INCREASE MAX_REGEX");
			continue;
		}
#if defined(USE_POSIX_REGEX)
		/* an expression that failed to compile must never be matched or freed */
		if (regcomp(&qs[i]->compiled[j], regex, USE_RE_SYNTAX) != 0) {
			errorlog("quicksum: cannot compile regular expression: %s\n",
			    regex);
			if (is_new) {
				xfree(qs[i]);
				qs[i] = NULL;
			}
			continue;
		}
#endif
		if (is_new)
			qs[i]->type = strdup(type);
		qs[i]->attribute[j] = strdup(attr);
		qs[i]->regex[j] = strdup(regex);
		qs[i]->attribute[j + 1] = NULL;
		qs[i]->regex[j + 1] = NULL;
	}
	fclose(fp);

	/* Reset the buffers, then allocate the buffers */
	for (i = 0; i < MAX_REGEX; i++) {
		vbuf[i].attribute = NULL;
		vbuf[i].b = create_buffer(BUFSIZ);
	}

	if (debug_ok(64, 1)) {
		for (i = 0; qs[i] != NULL; i++) {
			Log("Type: %s\n", qs[i]->type);
			for (j = 0; qs[i]->attribute[j]; j++)
				Log("Attribute: %s --> RE: %s\n",
				    qs[i]->attribute[j], qs[i]->regex[j]);
		}
	}
}

/*
 *  finish_quicksum() - Releases everything held by the quicksum tables:
 *  each type entry in qs[] with its attribute names, regular expression
 *  strings and compiled expressions, and the attribute names and
 *  buffers in vbuf[].  qs[] is cleared afterwards.
 *
 *  Every scan is bounded by the array size as well as by the NULL
 *  terminator, so a completely full table is not overrun.
 */
static void finish_quicksum()
{
	int i, j;

	for (i = 0; i < MAX_TYPES && qs[i] != NULL; i++) {
		if (qs[i]->type)
			xfree(qs[i]->type);
		for (j = 0; j < MAX_REGEX && qs[i]->attribute[j] != NULL; j++) {
#if defined(USE_POSIX_REGEX)
			regfree(&qs[i]->compiled[j]);
#endif
			xfree(qs[i]->attribute[j]);
		}
		for (j = 0; j < MAX_REGEX && qs[i]->regex[j] != NULL; j++)
			xfree(qs[i]->regex[j]);
		xfree(qs[i]);
	}
	memset(qs, '\0', MAX_TYPES * sizeof(struct quicksums *));

	for (i = 0; i < MAX_REGEX; i++) {
		if (vbuf[i].attribute) {
			xfree(vbuf[i].attribute);
			vbuf[i].attribute = NULL;
		}
		if (vbuf[i].b) {
			free_buffer(vbuf[i].b);
			vbuf[i].b = NULL;
		}
	}
}
/*
 *  generate_quicksum() - Quickly summarizes object and addes attributes to
 *  template.
 */
static void generate_quicksum(template, object)
     Template *template;
     DataObject *object;
{
	char buf[BUFSIZ];
	FILE *fp;
	int i, j, curqs, found;

	Debug(64, 1, ("generate_quicksum(%s, %s)\n", object->type, object->url->url));

	/* Can we quicksum the object?  If so, find the object's type */
	for (i = 0; qs[i] != NULL; i++) {
		if (!strcmp(qs[i]->type, object->type))
			break;
	}
	if (qs[i] == NULL)
		return;
	curqs = i;

	/* Try opening the file to summarize */
	if ((fp = fopen(object->url->filename, "r")) == NULL) {
		log_errno(object->url->filename);
		return;
	}
	/* Reset the buffers */
	for (i = 0; i < MAX_REGEX; i++) {
		vbuf[i].attribute = NULL;
	}

	for (i = 0; qs[curqs]->attribute[i] != NULL; i++) {
		/* check to see if attribute is in value buffer */
		for (found = j = 0; vbuf[j].attribute; j++) {
			if (!strcmp(vbuf[j].attribute,
				qs[curqs]->attribute[i])) {
				found = 1;
				break;
			}
		}
		if (!found) {
			for (j = 0; vbuf[j].attribute != NULL; j++);	/* Find first spot */
			vbuf[j].attribute = strdup(qs[curqs]->attribute[i]);
		}
	}

	/* Now summarize the file and write saved lines to vbuf */
	while (fgets(buf, BUFSIZ, fp)) {
		for (j = 0; qs[curqs]->attribute[j] != NULL; j++) {
			if (do_match(buf, &qs[curqs]->compiled[j])) {
				for (found = i = 0; vbuf[i].attribute; i++) {
					if (!strcmp(qs[curqs]->attribute[j],
						vbuf[i].attribute)) {
						found = 1;
						break;	/* Find vbuf to use */
					}
				}
				if (found) {
					add_buffer(vbuf[i].b, buf, strlen(buf));
				}
			}
		}
	}
	fclose(fp);

	/* Add values to template */
	for (i = 0; vbuf[i].attribute; i++) {
		if (vbuf[i].b->length > 0) {
			add_AVList(template->list, vbuf[i].attribute,
			    vbuf[i].b->data, vbuf[i].b->length);
		}
	}

	/* Clean up */
	for (i = 0; i < MAX_REGEX; i++) {
		if (vbuf[i].attribute) {
			xfree(vbuf[i].attribute);
			vbuf[i].attribute = NULL;
		}
		shrink_buffer(vbuf[i].b);
	}
}

/*
 *  can_quicksum() - Returns non-zero if generate_quicksum() can process type;
 *  returns 0 otherwise;
 */
static int can_quicksum(type)
     char *type;
{
	int i;

	for (i = 0; qs[i] != NULL; i++)
		if (!strcmp(qs[i]->type, type))
			return (1);
	return (0);
}
#endif /* USE_QUICKSUM */

/*
 *  mkdescription() - Generates a Description for the Template.
 *
 *  Nothing is done if the template already has a T_DESCRIPTION
 *  attribute.  Otherwise the first of T_TITLE, T_ABSTRACT and
 *  T_PARTTEXT that is present supplies the text: everything up to the
 *  end of the first line that contains an alphanumeric character is
 *  copied, with newlines removed, into a new T_DESCRIPTION attribute.
 *  Sources shorter than 2 bytes, or without any alphanumeric
 *  character, produce no description.
 */
static void mkdescription(t)
     Template *t;
{
	AVPair *avp;
	int i, j, gotdata, n;
	char *s;

	if (t == NULL || t->list == NULL)
		return;

	/* See if the Summarizer already generated one */
	if (extract_AVPair(t->list, T_DESCRIPTION) != NULL)
		return;

	/* Try to build a Description attribute based on other fields */
	avp = extract_AVPair(t->list, T_TITLE);
	if (avp == NULL)
		avp = extract_AVPair(t->list, T_ABSTRACT);
	if (avp == NULL)
		avp = extract_AVPair(t->list, T_PARTTEXT);

	/* Cannot find any data to use, or not enough */
	if (avp == NULL || avp->vsize < 2)
		return;

	/* Locate the end of the first line that holds real data */
	for (i = 0, gotdata = 0; i < avp->vsize; i++) {
		if (avp->value[i] == '\n') {
			if (gotdata)
				break;
		} else if (!gotdata
		    /* cast: isalnum() is undefined for negative char values */
		    && isalnum((unsigned char) avp->value[i])) {
			gotdata = 1;
		}
	}
	n = i;

	/* If we got a reasonable chunk then use n bytes of it */
	if (gotdata && n > 0) {
		/* Copy the data and strip the newlines */
		s = xmalloc(n + 1);
		for (i = 0, j = 0; j < n; j++) {
			if (avp->value[j] != '\n')
				s[i++] = avp->value[j];
		}
		s[i] = '\0';

		/* We know for sure that T_DESCRIPTION isn't in t->list */
		FAST_add_AVList(t->list, T_DESCRIPTION, s, i);
		xfree(s);
	}
}

/*
 *  mkkeywords() - Generates a keywords list for the Template.
 *
 *  The source text is the first attribute present among T_KEYS,
 *  T_PARTTEXT, "Description", T_ABSTRACT and T_TITLE.  It is run
 *  through mkwordlist(); if the source was T_KEYS its value is replaced
 *  by the word list, otherwise the word list is added as a new T_KEYS
 *  attribute.  The template is left untouched when no source attribute
 *  exists or mkwordlist() returns NULL.
 */
static void mkkeywords(t)
     Template *t;
{
	AVPair *src;
	char *words;
	int replace_keys = 0;

	if (t == NULL)
		return;
	if (t->list == NULL)
		return;

	/* Pick the source attribute; the && chain stops at the first hit */
	if ((src = extract_AVPair(t->list, T_KEYS)) != NULL) {
		replace_keys = 1;
	} else if ((src = extract_AVPair(t->list, T_PARTTEXT)) == NULL
		    && (src = extract_AVPair(t->list, "Description")) == NULL
		    && (src = extract_AVPair(t->list, T_ABSTRACT)) == NULL
	    && (src = extract_AVPair(t->list, T_TITLE)) == NULL) {
		return;		/* don't make any modifications */
	}

	words = mkwordlist(src->value, src->vsize);
	if (words == NULL)
		return;		/* don't make any modifications */

	if (!replace_keys) {
		/* We know for sure that T_KEYS isn't in t->list */
		FAST_add_AVList(t->list, T_KEYS, words, strlen(words));
	} else {
		/* Existing keywords: swap in a private copy of the word list */
		xfree(src->value);
		src->value = strdup(words);
		src->vsize = strlen(words);
	}
	xfree(words);
}

/*
 *  mkgid() - Makes sure the Template carries the Gatherer
 *  Identification attributes.  T_GHOST, T_GVERSION and T_GNAME are
 *  checked in that order; each one that is missing is added from the
 *  host, version and name fields of gatherer_id.  Nothing is done when
 *  the template, its list, or gatherer_id is NULL.
 */
static void mkgid(t)
     Template *t;
{
	if (t == NULL)
		return;
	if (t->list == NULL || gatherer_id == NULL)
		return;

	/* Gatherer host */
	if (!extract_AVPair(t->list, T_GHOST)) {
		add_AVList(t->list, T_GHOST, gatherer_id->host,
		    strlen(gatherer_id->host));
	}
	/* Gatherer version */
	if (!extract_AVPair(t->list, T_GVERSION)) {
		add_AVList(t->list, T_GVERSION, gatherer_id->version,
		    strlen(gatherer_id->version));
	}
	/* Gatherer name */
	if (!extract_AVPair(t->list, T_GNAME)) {
		add_AVList(t->list, T_GNAME, gatherer_id->name,
		    strlen(gatherer_id->name));
	}
}
