/* Revision identification string (file name, revision, date, author). */
static char rcsid[] = "HTML-lax.sum.c,v 1.4 1996/01/16 08:44:27 duane Exp";
/*
 *  HTML-lax.sum.c - Non-strict HTML summarizer
 *
 *  Usage: HTML-lax.sum [--url-only | --text-only] [--body-text] filename
 *
 *  Outputs SOIF
 *
 *  DEBUG: 
 *
 *  Darren Hardy, hardy@cs.colorado.edu, April 1994
 *  Duane Wessels, wessels@cs.colorado.edu, October 1995
 *
 *  ----------------------------------------------------------------------
 *  Copyright (c) 1994, 1995.  All rights reserved.
 *  
 *    The Harvest software was developed by the Internet Research Task
 *    Force Research Group on Resource Discovery (IRTF-RD):
 *  
 *          Mic Bowman of Transarc Corporation.
 *          Peter Danzig of the University of Southern California.
 *          Darren R. Hardy of the University of Colorado at Boulder.
 *          Udi Manber of the University of Arizona.
 *          Michael F. Schwartz of the University of Colorado at Boulder.
 *          Duane Wessels of the University of Colorado at Boulder.
 *  
 *    This copyright notice applies to software in the Harvest
 *    ``src/'' directory only.  Users should consult the individual
 *    copyright notices in the ``components/'' subdirectories for
 *    copyright information about other software bundled with the
 *    Harvest source code distribution.
 *  
 *  TERMS OF USE
 *    
 *    The Harvest software may be used and re-distributed without
 *    charge, provided that the software origin and research team are
 *    cited in any use of the system.  Most commonly this is
 *    accomplished by including a link to the Harvest Home Page
 *    (http://harvest.cs.colorado.edu/) from the query page of any
 *    Broker you deploy, as well as in the query result pages.  These
 *    links are generated automatically by the standard Broker
 *    software distribution.
 *    
 *    The Harvest software is provided ``as is'', without express or
 *    implied warranty, and with no support nor obligation to assist
 *    in its use, correction, modification or enhancement.  We assume
 *    no liability with respect to the infringement of copyrights,
 *    trade secrets, or any patents, and are not responsible for
 *    consequential damages.  Proper use of the Harvest software is
 *    entirely the responsibility of the user.
 *  
 *  DERIVATIVE WORKS
 *  
 *    Users may make derivative works from the Harvest software, subject 
 *    to the following constraints:
 *  
 *      - You must include the above copyright notice and these 
 *        accompanying paragraphs in all forms of derivative works, 
 *        and any documentation and other materials related to such 
 *        distribution and use acknowledge that the software was 
 *        developed at the above institutions.
 *  
 *      - You must notify IRTF-RD regarding your distribution of 
 *        the derivative work.
 *  
 *      - You must clearly notify users that your are distributing 
 *        a modified version and not the original Harvest software.
 *  
 *      - Any derivative product is also subject to these copyright 
 *        and use restrictions.
 *  
 *    Note that the Harvest software is NOT in the public domain.  We
 *    retain copyright, as specified above.
 *  
 *  HISTORY OF FREE SOFTWARE STATUS
 *  
 *    Originally we required sites to license the software in cases
 *    where they were going to build commercial products/services
 *    around Harvest.  In June 1995 we changed this policy.  We now
 *    allow people to use the core Harvest software (the code found in
 *    the Harvest ``src/'' directory) for free.  We made this change
 *    in the interest of encouraging the widest possible deployment of
 *    the technology.  The Harvest software is really a reference
 *    implementation of a set of protocols and formats, some of which
 *    we intend to standardize.  We encourage commercial
 *    re-implementations of code complying to this set of standards.  
 *  
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "HTML.h"
#include "util.h"
#include "template.h"

/* Global */
/*
 *  Url - set by main() to the value of $ENUMERATOR_URL when that is
 *  defined, otherwise to the input filename.  It is not read anywhere
 *  in this file.  NOTE(review): it has external linkage, so presumably
 *  another module (the HTML parser?) consumes it -- confirm.
 */
char *Url = NULL;

/* Local Variables */
/*
 *  intype[] - one counter per mark-up type, indexed by mp->type.
 *  process_node() increments the counter on a start tag and decrements
 *  it on an end tag, so a non-zero entry means "currently inside an
 *  element of that type".
 */
static int intype[64];
/*
 *  Output accumulators.  process_node() appends newline-terminated text
 *  to citations, keywords, title, author and body_text; process_anchor()
 *  appends newline-terminated URLs to urls.  main() creates them
 *  (body_text only when --body-text was given) and prints them.
 */
static Buffer *citations, *keywords, *title, *author, *urls, *body_text;
/* Command-line flags, set by main(). */
static int url_only = 0;	/* --url-only: print only the URL list */
static int text_only = 0;	/* --text-only: print each text node, no summary */
static int html_body_text = 0;	/* --body-text: also collect and print Body */

/*
 *  usage() - Prints the command-line synopsis to stderr and exits with
 *  status 1.  All options are optional; main() looks for them in the
 *  order shown, followed by the mandatory filename.
 */
static void usage()
{
	fprintf(stderr, "Usage: HTML-lax.sum [--url-only | --text-only] [--body-text] filename\n");
	exit(1);
}

/*
 *  read_file() - Reads the file fp into memory and returns a pointer to it.
 *
 *  The data is accumulated BUFSIZ bytes at a time in a Buffer obtained
 *  from create_buffer().  Reading stops as soon as fread() returns no
 *  data, i.e. at end of file or on a read error.  The caller is
 *  responsible for releasing the returned Buffer (main() uses
 *  free_buffer()).
 */
Buffer *read_file(fp)
FILE *fp;
{
	Buffer *b;		/* automatic: nothing needs it to persist */
	char buf[BUFSIZ];
	int nread;

	b = create_buffer(BUFSIZ);

	while ((nread = fread(buf, 1, sizeof(buf), fp)) > 0)
		add_buffer(b, buf, nread);

	return (b);
}

/*
 *  process_anchor() - Extracts the URL from the anchor href tag.
 *
 *  s is the text of an anchor start tag, e.g.  a href="http://host/x".
 *  The value of the first href attribute found (matched without regard
 *  to case) is appended to the urls buffer, followed by a newline.
 *  Leading white space and double quotes are skipped; the value ends at
 *  the next double quote, white-space character or end of string.
 *  Nothing is appended when s is NULL or contains no href attribute.
 */
void process_anchor(s)
char *s;
{
	char *p, *tmps = s;
	size_t len;

	if (s == NULL)
		return;

	while ((p = strchr(tmps, '=')) != NULL) {
		/*
		 *  An attribute name of "href" needs four characters in
		 *  front of the '='.  Compare offsets rather than forming
		 *  p - 4, which could point before the start of the string.
		 */
		if (p - tmps < 4 || strncasecmp(p - 4, "href", 4) != 0) {
			tmps = p + 1;		/* keep looking after this '=' */
			continue;
		}
		p++;					/* skip '=' */
		/* <ctype.h> functions require an unsigned char value */
		while (isspace((unsigned char) *p) || (*p == '\"'))
			p++;				/* skip space '"'s */

		/*
		 *  Measure the URL in place.  Stopping at any white space
		 *  (not just ' ') keeps a newline or tab that follows an
		 *  unquoted value out of the newline-separated URL list.
		 */
		len = strcspn(p, "\" \t\r\n\f\v");
		add_buffer(urls, p, len);		/* Add URL to urls */
		add_buffer(urls, "\n", 1);
		return;
	}
}

/*
 *  print_node() - Debugging aid: dumps the fields of one mark_up node
 *  to stdout and flushes the stream.
 *
 *  The text, start and end members may be NULL (the rest of this file
 *  tests them before use), so they are printed as "(null)" in that case
 *  rather than being handed to %s, which would be undefined behaviour.
 */
void print_node(mp)
struct mark_up *mp;
{
	printf("mp->type: %d\n", mp->type);
	printf("mp->is_end: %d\n", mp->is_end);
	printf("mp->text: %s\n", mp->text ? mp->text : "(null)");
	printf("mp->start: %s\n", mp->start ? mp->start : "(null)");
	printf("mp->end: %s\n", mp->end ? mp->end : "(null)");
	printf("\n");
	fflush(stdout);
}


void process_node(mp)
struct mark_up *mp;
{
	if (mp->type < 0)
		return;

	if (mp->is_end) {
		intype[mp->type]--;
		return;
	} else
		intype[mp->type]++;

	if (mp->text && strlen(mp->text) < 2) 
		return;
	if (mp->start && strlen(mp->start) < 6)
		return;

	switch (mp->type) {
	case M_NONE:
		if (intype[M_TITLE]) {
			add_buffer(title, mp->text, strlen(mp->text));
			add_buffer(title, "\n", 1);
		}
		if (intype[M_STRONG]) {
			add_buffer(keywords, mp->text, strlen(mp->text));
			add_buffer(keywords, "\n", 1);
		}
		if (intype[M_CITATION]) {
			add_buffer(citations, mp->text, strlen(mp->text));
			add_buffer(citations, "\n", 1);
		}
		if (intype[M_ANCHOR]) {
			add_buffer(keywords, mp->text, strlen(mp->text));
			add_buffer(keywords, "\n", 1);
		}
		if (intype[M_ADDRESS]) {
			add_buffer(author, mp->text, strlen(mp->text));
			add_buffer(author, "\n", 1);
		}
		if (intype[M_HEADER_1] || intype[M_HEADER_2]) {
			add_buffer(title, mp->text, strlen(mp->text));
			add_buffer(title, "\n", 1);
		}
		if (html_body_text && mp->text) {
			add_buffer (body_text, mp->text, strlen (mp->text));
			add_buffer (body_text, "\n", 1);
		}
		if (text_only) 
			puts(mp->text);
		break;
	case M_ANCHOR:
		process_anchor(mp->start);
	default:
		/* do nothing */
		break;
	}
}


/*
 *  free_struct_markup() - Frees one mark_up node together with its
 *  text, start and end strings.  A NULL node is ignored.
 */
static void free_struct_markup(x)
struct mark_up *x;
{
	if (x == NULL)
		return;

	/* free(NULL) is a no-op, so the members need no individual tests */
	free(x->text);
	free(x->start);
	free(x->end);
	free(x);
}

/*
 *  main() - Summarizes one HTML file.
 *
 *  Command line:  [--url-only] [--text-only] [--body-text] filename
 *  The options are recognized only in that order.
 *
 *  The file is read into memory, parsed with HTMLParse(), and each node
 *  of the resulting list is passed to process_node() and then freed.
 *  Output on stdout depends on the mode:
 *
 *      --url-only    the collected URLs, one per line
 *      --text-only   only what process_node() printed while walking
 *      otherwise     one "Name{length}:<TAB>data" entry for each
 *                    non-empty buffer: Author, keywords,
 *                    URL-References, Citations, Title and (with
 *                    --body-text) Body
 *
 *  Exits with status 0 on success, and with status 1 on a usage error
 *  or when the file cannot be opened.
 */
int main(argc, argv)
int argc;
char *argv[];
{
	struct mark_up *mp = NULL, *walker, *t, *HTMLParse();
	Buffer *b;
	FILE *fp;
	char *env_url;

	init_log(NULL, stderr);
	argv++;
	argc--;
	if (argc < 1)
		usage();
	if (!strcmp(*argv, "--url-only")) {
		url_only = 1;
		argv++;
		argc--;
		if (argc < 1)
			usage();
	}
	if (!strcmp(*argv, "--text-only")) {
		text_only = 1;
		argv++;
		argc--;
		if (argc < 1)
			usage();
	}
	if (!strcmp(*argv, "--body-text")) {
		html_body_text = 1;
		argv++;
		argc--;
		if (argc < 1)
			usage();
	}

	/* Clear the whole array: its size is in bytes, not in elements */
	memset(intype, 0, sizeof(intype));

	/* Parse the HTML file */
	if ((fp = fopen(*argv, "r")) == NULL) {
		log_errno(*argv);
		exit(1);
	}

	/* Prefer the URL supplied by the environment over the filename */
	if ((env_url = getenv("ENUMERATOR_URL")) != NULL)
		Url = xstrdup(env_url);
	if (Url == (char *) NULL)
		Url = xstrdup(*argv);

	b = read_file(fp);
	fclose(fp);
	mp = HTMLParse(NULL, b->data);
	free_buffer(b);

	author = create_buffer(BUFSIZ);
	keywords = create_buffer(BUFSIZ);
	citations = create_buffer(BUFSIZ);
	urls = create_buffer(BUFSIZ);
	title = create_buffer(BUFSIZ);
	if (html_body_text)
		body_text = create_buffer(BUFSIZ);

	/*
	 *  Extract important information from the parsed HTML.  Each node
	 *  is freed once processed, so fetch the successor first.
	 */
	walker = mp;
	while (walker != NULL) {
		process_node(walker);
		t = walker;
		walker = walker->next;
		free_struct_markup(t);
	}

	if (url_only) {
		fwrite(urls->data, 1, urls->length, stdout);
		exit(0);
	}
	if (text_only) {
		/* process_node() already wrote the text */
		exit(0);
	}

	/* The casts make the argument match %u whatever type length has */
	if (author->length > 0) {
		printf("Author{%u}:\t", (unsigned int) author->length);
		fwrite(author->data, 1, author->length, stdout);
	}
	if (keywords->length > 0) {
		printf("keywords{%u}:\t", (unsigned int) keywords->length);
		fwrite(keywords->data, 1, keywords->length, stdout);
	}
	if (urls->length > 0) {
		printf("URL-References{%u}:\t", (unsigned int) urls->length);
		fwrite(urls->data, 1, urls->length, stdout);
	}
	if (citations->length > 0) {
		printf("Citations{%u}:\t", (unsigned int) citations->length);
		fwrite(citations->data, 1, citations->length, stdout);
	}
	if (title->length > 0) {
		printf("Title{%u}:\t", (unsigned int) title->length);
		fwrite(title->data, 1, title->length, stdout);
	}
	if (html_body_text && body_text->length > 0) {
		printf("Body{%u}:\t", (unsigned int) body_text->length);
		fwrite(body_text->data, 1, body_text->length, stdout);
	}

	exit(0);
}
