static char rcsid[] = "httpenum.c,v 1.98 1996/01/16 08:45:02 duane Exp";
/*
 *  httpenum.c - RootNode URL enumerator for HTTP URLs
 *
 *  Usage: httpenum http-URL
 *
 *  Outputs the following format:
 *
 *      URL of tree root
 *      URL <tab> md5
 *      ...
 *      URL <tab> md5
 * 
 *  DEBUG: section  42, level 1, 5, 9   Gatherer enumeration for HTTP
 *
 *  Darren Hardy, hardy@cs.colorado.edu, April 1994
 *  Duane Wessels, wessels@cs.colorado.edu, January 1996
 *
 *  ----------------------------------------------------------------------
 *  Copyright (c) 1994, 1995.  All rights reserved.
 *  
 *    The Harvest software was developed by the Internet Research Task
 *    Force Research Group on Resource Discovery (IRTF-RD):
 *  
 *          Mic Bowman of Transarc Corporation.
 *          Peter Danzig of the University of Southern California.
 *          Darren R. Hardy of the University of Colorado at Boulder.
 *          Udi Manber of the University of Arizona.
 *          Michael F. Schwartz of the University of Colorado at Boulder.
 *          Duane Wessels of the University of Colorado at Boulder.
 *  
 *    This copyright notice applies to software in the Harvest
 *    ``src/'' directory only.  Users should consult the individual
 *    copyright notices in the ``components/'' subdirectories for
 *    copyright information about other software bundled with the
 *    Harvest source code distribution.
 *  
 *  TERMS OF USE
 *    
 *    The Harvest software may be used and re-distributed without
 *    charge, provided that the software origin and research team are
 *    cited in any use of the system.  Most commonly this is
 *    accomplished by including a link to the Harvest Home Page
 *    (http://harvest.cs.colorado.edu/) from the query page of any
 *    Broker you deploy, as well as in the query result pages.  These
 *    links are generated automatically by the standard Broker
 *    software distribution.
 *    
 *    The Harvest software is provided ``as is'', without express or
 *    implied warranty, and with no support nor obligation to assist
 *    in its use, correction, modification or enhancement.  We assume
 *    no liability with respect to the infringement of copyrights,
 *    trade secrets, or any patents, and are not responsible for
 *    consequential damages.  Proper use of the Harvest software is
 *    entirely the responsibility of the user.
 *  
 *  DERIVATIVE WORKS
 *  
 *    Users may make derivative works from the Harvest software, subject 
 *    to the following constraints:
 *  
 *      - You must include the above copyright notice and these 
 *        accompanying paragraphs in all forms of derivative works, 
 *        and any documentation and other materials related to such 
 *        distribution and use acknowledge that the software was 
 *        developed at the above institutions.
 *  
 *      - You must notify IRTF-RD regarding your distribution of 
 *        the derivative work.
 *  
 *      - You must clearly notify users that your are distributing 
 *        a modified version and not the original Harvest software.
 *  
 *      - Any derivative product is also subject to these copyright 
 *        and use restrictions.
 *  
 *    Note that the Harvest software is NOT in the public domain.  We
 *    retain copyright, as specified above.
 *  
 *  HISTORY OF FREE SOFTWARE STATUS
 *  
 *    Originally we required sites to license the software in cases
 *    where they were going to build commercial products/services
 *    around Harvest.  In June 1995 we changed this policy.  We now
 *    allow people to use the core Harvest software (the code found in
 *    the Harvest ``src/'' directory) for free.  We made this change
 *    in the interest of encouraging the widest possible deployment of
 *    the technology.  The Harvest software is really a reference
 *    implementation of a set of protocols and formats, some of which
 *    we intend to standardize.  We encourage commercial
 *    re-implementations of code complying to this set of standards.  
 *  
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <memory.h>
#include <string.h>
#include <signal.h>
#include <gdbm.h>
#include <GNUregex.h>
#include "util.h"
#include "url.h"
#define PUBLIC extern
#include "filter.h"

/*
 *  list_t - one node of the singly-linked work list of URLs that are
 *  waiting to be processed by main().
 */
typedef struct _list_t {
    void *ptr;			/* xstrdup()'d URL string; released by free_from_list() */
    int depth;			/* depth value passed to add_to_list() for this URL */
    struct _list_t *next;	/* following node, or NULL at the end of the list */
} list_t;

/* First node of the work list; main() walks the list starting here. */
list_t *head = NULL;
/* Address of the link that the next add_to_list() call fills in.  It is
 * NULL until main() points it at head. */
list_t **Tail = NULL;

/* define HOST_COUNT_IP to 'count' visited hosts based on IP, not the   */
/* given hostname.  This way aliased machines will be properly          */
/* enumerated.  When defined, host_in_db() and visit_server() key the   */
/* host database on the dotted address returned by get_host().          */
#define HOST_COUNT_IP

/* Global variables */
int max_depth = 0;		/* from HARVEST_DEPTH_MAX; when > 0, main() skips deeper entries */
int cur_depth = 0;		/* from HARVEST_DEPTH_CUR; depth given to the root URL */
int depth_hist[100];		/* per-depth count of work-list entries seen by main(); logged by sigdie() */

/* Local variables */
static int url_max = 0;		/* from HARVEST_URL_MAX; initialize() defaults it to 250 */
static int nurls = 0;		/* URLs recorded so far; incremented by mark_retrieved() */
static int host_max = 0;	/* from HARVEST_HOST_MAX; initialize() defaults it to 1 */
static int nhosts = 0;		/* host counter maintained by visit_server() */
static char *tree_root = NULL;	/* copy of the root URL, used in the truncation log message */
static char *urldb_filename = NULL;	/* temporary file names, generated in initialize() */
static char *hostdb_filename = NULL;	/* and unlinked in sigdie() */
static char *md5db_filename = NULL;
static GDBM_FILE urldbf = NULL;	/* URL -> MD5, written by mark_retrieved() */
static GDBM_FILE hostdbf = NULL;	/* host key -> URL, written by visit_server() */
static GDBM_FILE md5dbf = NULL;	/* MD5 -> URL, written by mark_retrieved() */

/* Opened from HARVEST_NOT_VISITED_LOG (if set); receives one line for   */
/* each URL that was rejected, tagged with the reason.                   */
static FILE *not_visited = NULL;

/* Local functions */
/* Old-style (unprototyped) forward declarations; the definitions appear
 * further down in this file. */
static void usage();
static void mark_retrieved();
static void sigdie();
static int url_in_db();
static int md5_in_db();
static int http_enum();

/* Defined outside this file.  url_is_allowed() treats a zero return as
 * "disallowed by robots.txt". */
extern int RobotsTxtCheck _PARAMS((URL *));

/*
 *  add_to_list() - Append a private copy of url, tagged with depth, to
 *  the end of the work list and return the new node.  Tail must already
 *  point at the list's terminating link (main() sets it to &head).
 */
list_t *add_to_list(url, depth)
     char *url;
     int depth;
{
    list_t *node = (list_t *) xmalloc(sizeof(list_t));

    node->ptr = (void *) xstrdup(url);
    node->depth = depth;
    node->next = NULL;

    *Tail = node;		/* hook the node onto the current end ... */
    Tail = &node->next;		/* ... and remember where the new end is */
    return node;
}

/*
 *  free_from_list() - Release node l together with the URL string it
 *  owns, and return the node that followed it (NULL at the end of the
 *  list).
 */
list_t *free_from_list(l)
     list_t *l;
{
    list_t *successor = l->next;	/* read before l is released */

    xfree(l->ptr);
    xfree(l);
    return successor;
}

/* ---------------------------------------------------------------------- */

/*
 *  mark_retrieved() - Record the given URL as visited and announce it on
 *  stdout, so that it is not retrieved again (this prevents cycles in
 *  the enumeration).
 *
 *  Stores url -> md5 in the URL database and md5 -> url in the MD5
 *  database, each only when the key is not present yet.  If the URL was
 *  reached through a redirect, redir_from_url -> md5 is stored in the
 *  URL database as well.  Once url_max URLs have been counted, the URL
 *  is closed and the program ends through sigdie().
 */
static void mark_retrieved(up)
     URL *up;
{
    datum url_key, md5_key;

    Debug(42, 9, ("mark_retrieved: url='%s', md5='%s'\n", up->url, up->md5));

    /* Keys and values are stored including their terminating NUL. */
    url_key.dptr = xstrdup(up->url);
    url_key.dsize = strlen(url_key.dptr) + 1;
    md5_key.dptr = xstrdup(up->md5);
    md5_key.dsize = strlen(md5_key.dptr) + 1;

    if (!gdbm_exists(urldbf, url_key)) {
	if (gdbm_store(urldbf, url_key, md5_key, GDBM_INSERT))
	    fatal("GDBM URLDB: %s: %s", url_key.dptr, gdbm_strerror(gdbm_errno));
    }
    if (!gdbm_exists(md5dbf, md5_key)) {
	if (gdbm_store(md5dbf, md5_key, url_key, GDBM_INSERT))
	    fatal("GDBM MD5DB: %s: %s", url_key.dptr, gdbm_strerror(gdbm_errno));
    }
    xfree(url_key.dptr);
    xfree(md5_key.dptr);

    if (up->redir_from_url != (char *) NULL) {
	/* Also remember the URL that redirected us here. */
	Debug(42, 9, ("mark_retrieved: url='%s', md5='%s'\n",
		up->redir_from_url, up->md5));

	url_key.dptr = xstrdup(up->redir_from_url);
	url_key.dsize = strlen(url_key.dptr) + 1;
	md5_key.dptr = xstrdup(up->md5);
	md5_key.dsize = strlen(md5_key.dptr) + 1;

	if (!gdbm_exists(urldbf, url_key)) {
	    if (gdbm_store(urldbf, url_key, md5_key, GDBM_INSERT))
		fatal("GDBM URLDB: %s: %s", url_key.dptr, gdbm_strerror(gdbm_errno));
	}
	xfree(url_key.dptr);
	xfree(md5_key.dptr);
    }
    /* Print URL to stdout to enumerate; flush to keep pipe moving */
    fprintf(stdout, "%s\t%s\n", up->url, up->md5);	/* URL <tab> MD5 */
    fflush(stdout);

    nurls++;
    if (nurls < url_max)
	return;

    /* Limit reached: stop the whole enumeration here. */
    Log("Truncating RootNode %s at %d LeafNode URLs\n",
	tree_root, url_max);
    url_close(up);
    sigdie(0);
}

/*
 *  url_in_db() - Return the gdbm_exists() result for url in the URL
 *  database: non-zero when the URL has been recorded, zero otherwise.
 */
static int url_in_db(url)
     char *url;
{
    datum key;
    int found;

    Debug(42, 9, ("url_in_db: checking for url='%s'\n", url));

    key.dptr = xstrdup(url);
    key.dsize = strlen(key.dptr) + 1;	/* keys include the NUL */
    found = gdbm_exists(urldbf, key);
    xfree(key.dptr);

    return found;
}

/*
 *  md5_in_db() - Return the gdbm_exists() result for md5 in the MD5
 *  database: non-zero when the checksum has been recorded, zero
 *  otherwise.
 */
static int md5_in_db(md5)
     char *md5;
{
    datum key;
    int found;

    key.dptr = xstrdup(md5);
    key.dsize = strlen(key.dptr) + 1;	/* keys include the NUL */
    found = gdbm_exists(md5dbf, key);
    xfree(key.dptr);

    return found;
}

/*
 *  host_in_db() - Return non-zero when the host is present in the host
 *  database.  With HOST_COUNT_IP defined the lookup key is the dotted
 *  address obtained from get_host(); a host that get_host() cannot
 *  resolve is reported as not present (0).
 */
static int host_in_db(host)
     char *host;
{
    datum key;
    int present;
#ifdef HOST_COUNT_IP
    Host *hp = get_host(host);

    if (hp == NULL)
	return 0;
    key.dptr = xstrdup(hp->dotaddr);
#else
    key.dptr = xstrdup(host);
#endif
    key.dsize = strlen(key.dptr) + 1;	/* keys include the NUL */
    present = gdbm_exists(hostdbf, key);
    xfree(key.dptr);

    return present;
}

/*
 *  visit_server() - Determine if we should visit the server.  Return
 *  zero if we should not process the URL; otherwise, return non-zero.
 *
 *  A host that is already in the host database is always accepted.  A
 *  new host is accepted, and recorded together with the URL that first
 *  referenced it, only while fewer than host_max hosts have been
 *  recorded.
 *
 *  nhosts counts hosts that were actually stored.  It is deliberately
 *  not incremented for hosts that are rejected or that get_host()
 *  cannot resolve, so that such hosts do not use up the host budget and
 *  lock out hosts that are seen later.
 */
static int visit_server(up)
     URL *up;
{
    datum k, d;
#ifdef HOST_COUNT_IP
    Host *h = NULL;
#endif

    if (host_in_db(up->host))	/* Host is already in the db */
	return (1);
    if (nhosts >= host_max)	/* No room for another new host */
	return (0);

#ifdef HOST_COUNT_IP
    h = get_host(up->host);
    if (!h)			/* Unresolvable; do not count it */
	return (0);
    k.dptr = xstrdup(h->dotaddr);
#else
    k.dptr = xstrdup(up->host);
#endif
    k.dsize = strlen(k.dptr) + 1;
    d.dptr = xstrdup(up->url);
    d.dsize = strlen(d.dptr) + 1;

    if (gdbm_store(hostdbf, k, d, GDBM_INSERT))
	fatal("GDBM HOSTDB: %s: %s", k.dptr, gdbm_strerror(gdbm_errno));
    xfree(k.dptr);
    xfree(d.dptr);

    nhosts++;			/* one more host recorded */
    return (1);
}

/*
 *  url_is_allowed() - Decide whether url should be added to the work
 *  list.  Returns 1 if it should, 0 otherwise.
 *
 *  A URL is rejected when it cannot be parsed by url_open(), when it is
 *  already in the URL database, when filter_selection() returns a
 *  non-zero filter type for it, when visit_server() refuses its host,
 *  or when RobotsTxtCheck() returns zero.  The last three rejections
 *  are also written to the not-visited log when that log is open.
 *
 *  The temporary URL object is closed on every path, including the
 *  accepting one.
 */
int url_is_allowed(url)
     char *url;
{
    URL *tup = NULL;
    int y;
    int allowed = 0;

    if ((tup = url_open(url)) == NULL)
	return 0;

    if (url_in_db(tup->url)) {	/* Have we been here? */
	Debug(42, 1, ("Already Visited URL: %s\n", tup->url));
    } else if ((y = filter_selection(tup))) {
	Debug(42, 1, ("Removing Candidate: [%s] %s\n",
		Filter_Type_Name[y], tup->url));
	if (not_visited)
	    fprintf(not_visited, "[FILTER] %s\n", tup->url);
    } else if (!visit_server(tup)) {
	Debug(42, 1, ("Server count exceeded: %s\n",
		tup->url));
	if (not_visited)
	    fprintf(not_visited, "[SERVER] %s\n", tup->url);
    } else if (!RobotsTxtCheck(tup)) {
	Debug(42, 1, ("Disallowed by robots.txt: %s\n", tup->url));
	if (not_visited)
	    fprintf(not_visited, "[ROBOTS.TXT] %s\n", tup->url);
    } else {
	allowed = 1;
    }

    url_close(tup);		/* only needed for the checks above */
    return allowed;
}

/*
 *  http_enum() - Retrieve the object behind up and, if it is HTML, queue
 *  the URLs it references.
 *
 *  The URL is skipped when it is already in the URL database, when the
 *  last two components of its pathname are identical (taken as a sign
 *  of a symbolic-link loop), when it cannot be retrieved, or when its
 *  MD5 is already in the MD5 database.  Otherwise it is recorded with
 *  mark_retrieved() (if it has an MD5).  Unless the MIME header is
 *  present and lacks "text/html", the external "HTMLurls" program is
 *  then run on the retrieved file; every output line that passes
 *  url_is_allowed() is appended to the global work list with the given
 *  depth.
 *
 *  Returns 1 when the output of HTMLurls was read, and 0 in every other
 *  case (skipped, not HTML, or pipe/fork/fdopen failure).
 */

static int http_enum(up, depth)
     URL *up;
     int depth;
{
    FILE *fp = NULL;
    char *enum_url = NULL;
    char *s = NULL;
    char *t0 = NULL;
    char *t1 = NULL;
    char *t2 = NULL;
    char *child_argv[5];
    int err;
    int nadded = 0;		/* links appended to the work list */
    int pid;
    int pipefds[2];
    static char buf[BUFSIZ];

    if (url_in_db(up->url)) {	/* Have we been here? */
	Debug(42, 1, ("Already Visited URL: %s\n", up->url));
	return 0;
    }
    /* Ack.  Check for symbolic link loops in server generated HTML listings
     * Do this by comparing the last two pathname components.  If they are
     * the same then guess it's a loop.  strtok() modifies its input, so
     * work on a private copy. */
    s = xstrdup(up->pathname);
    for (t0 = strtok(s, "/"); t0; t0 = strtok(NULL, "/")) {
	t2 = t1;
	t1 = t0;
    }
    if (t1 != NULL && t2 != NULL && strcmp(t1, t2) == 0) {
	Debug(42, 0, ("Possible symlink loop: %s\n", up->url));
	xfree(s);
	return 0;
    }
    xfree(s);
    s = NULL;

    if (url_retrieve(up)) {	/* Grab the URL; success? */
	Debug(42, 1, ("Cannot Retrieve URL: %s\n", up->url));
	return 0;
    }
    if (up->md5 && md5_in_db(up->md5)) {	/* Have we been here? */
	Debug(42, 1, ("Already Visited MD5: %s\n", up->url));
	return 0;
    }
    /* Remember that we've been here before */
    if (up->md5)
	mark_retrieved(up);

    /* Are we dealing with an HTML file, if not we can't get href links */
    if (up->http_mime_hdr != NULL &&
	strstr(up->http_mime_hdr, "text/html") == NULL)
	return 0;

    /* Extract the HREF's by running: HTMLurls --base-url <url> <file> */
    if (pipe(pipefds) < 0) {
	log_errno("pipe");
	return 0;
    }
    if ((pid = fork()) < 0) {
	log_errno("fork");
	close(pipefds[0]);	/* don't leak the pipe */
	close(pipefds[1]);
	return 0;
    }
    if (pid == 0) {		/* child: HTMLurls */
	enum_url = (char *) xmalloc(strlen(up->url) + 20);
	sprintf(enum_url, "ENUMERATOR_URL=%s", up->url);
	putenv(enum_url);

	close(pipefds[0]);	/* child wont read from pipe */
	dup2(pipefds[1], 1);	/* stdout -> write:pipe */
	close(pipefds[1]);	/* close pipe, its now stdout */

	/* Hand the URL and file name over as separate arguments.  Going
	 * through a formatted command line would be limited by the size
	 * of a fixed buffer and would split the URL at whitespace. */
	child_argv[0] = "HTMLurls";
	child_argv[1] = "--base-url";
	child_argv[2] = up->url;
	child_argv[3] = up->filename;
	child_argv[4] = NULL;

	execvp(child_argv[0], child_argv);
	log_errno("execvp: HTMLurls");
	_exit(1);
    }
    close(pipefds[1]);		/* parent wont write */
    if ((fp = fdopen(pipefds[0], "r")) == NULL) {
	log_errno("fdopen");
	close(pipefds[0]);	/* child sees a closed pipe and stops */
	(void) waitpid(pid, (int *) NULL, 0);	/* don't leave a zombie */
	return 0;
    }
    /* 
     *  For each HREF pointer, convert it to a URL, and add it to
     *  the global list of URLs to process.
     */
    while (fgets(buf, sizeof(buf), fp) != NULL) {
	if ((s = strrchr(buf, '\n')) != NULL)
	    *s = '\0';		/* strip newline */
	Debug(42, 1, ("Input: %s\n", buf));
	if (url_is_allowed(buf)) {
	    add_to_list(buf, depth);
	    nadded++;
	}
    }
    fclose(fp);			/* also closes pipefds[0] */
    if ((err = waitpid(pid, (int *) NULL, 0)) != pid) {
	Debug(42, 1, ("WARNING: waiting for child %d got %d...\n",
		pid, err));
    }
    Debug(42, 1, ("Adding %d URLs from %s to workload\n", nadded, up->url));
    return 1;
}

/* ---------------------------------------------------------------------- */

/*
 *  initialize() - Basic init routines
 */
static void initialize()
{
    char *s = NULL;
    extern int liburl_conform_rfc1738;
    FILE *logfp = NULL;

#ifdef USE_HOST_CACHE
    host_cache_init();
#endif

    cur_depth = max_depth = url_max = host_max = 0;
    if ((s = getenv("HARVEST_URL_MAX")) != NULL)
	url_max = atoi(s);
    if ((s = getenv("HARVEST_HOST_MAX")) != NULL)
	host_max = atoi(s);
    if ((s = getenv("HARVEST_DEPTH_MAX")) != NULL)
	max_depth = atoi(s);
    if ((s = getenv("HARVEST_DEPTH_CUR")) != NULL)
	cur_depth = atoi(s);
    if (url_max < 1)
	url_max = 250;		/* hard-coded maximum */
    if (host_max < 1)
	host_max = 1;		/* hard-coded maximum */
    if (max_depth < 1)
	max_depth = 0;		/* hard-coded maximum */
    host_filterfile = getenv("HARVEST_HOST_FILTER");
    url_filterfile = getenv("HARVEST_URL_FILTER");
    access_types = getenv("HARVEST_ACCESS_TYPES");

    if ((s = getenv("HARVEST_GATHERER_LOGFILE")) != (char *) NULL)
	logfp = fopen(s, "a+");
    if (logfp == (FILE *) NULL)
	logfp = stderr;
    init_log3("httpenum", logfp, stderr);
    init_url();
    liburl_conform_rfc1738 = 1;
    filter_initialize();
    Debug(42, 5, ("access_mask: %#02X\n", access_mask));

    /* Open GDBM databases to keep track of where we've been */
    urldb_filename = xstrdup(tempnam(NULL, "Hurl"));
    urldbf = gdbm_open(urldb_filename, 0, GDBM_NEWDB, 0644, NULL);
    if (urldbf == NULL) {
	log_errno(urldb_filename);
	fatal("gdbm_open: %s: %s", urldb_filename,
	    gdbm_strerror(gdbm_errno));
    }
    hostdb_filename = xstrdup(tempnam(NULL, "Hhost"));
    hostdbf = gdbm_open(hostdb_filename, 0, GDBM_NEWDB, 0644, NULL);
    if (hostdbf == NULL) {
	log_errno(hostdb_filename);
	fatal("gdbm_open: %s: %s", hostdb_filename,
	    gdbm_strerror(gdbm_errno));
    }
    md5db_filename = xstrdup(tempnam(NULL, "Hmd5"));
    md5dbf = gdbm_open(md5db_filename, 0, GDBM_NEWDB, 0644, NULL);
    if (md5dbf == NULL) {
	log_errno(md5db_filename);
	fatal("gdbm_open: %s: %s", md5db_filename,
	    gdbm_strerror(gdbm_errno));
    }
    /* open not-visited file */
    if ((s = getenv("HARVEST_NOT_VISITED_LOG")) != NULL)
	not_visited = fopen(s, "a+");
    if (not_visited)
	setbuf(not_visited, NULL);
}

/*
 *  sigdie() - Die gracefully.  Closes the GDBM databases and the
 *  not-visited log, removes the temporary database files, logs the
 *  per-depth object counts and exits with status 0.
 *
 *  Installed as the handler for SIGTERM, SIGINT and SIGPIPE, and also
 *  called directly with x == 0 for a normal shutdown.  Because the
 *  handlers are installed before initialize() runs, every resource is
 *  checked before it is released, and each one is cleared afterwards so
 *  that it is not released a second time if the function is entered
 *  again.
 */
static void sigdie(x)
     int x;
{
    int i;
    int hist_size = (int) (sizeof(depth_hist) / sizeof(depth_hist[0]));

#ifdef USE_HOST_CACHE
    dump_host_cache(42, 9);
#endif
    if (urldbf != NULL) {
	gdbm_close(urldbf);
	urldbf = NULL;
    }
    if (hostdbf != NULL) {
	gdbm_close(hostdbf);
	hostdbf = NULL;
    }
    if (md5dbf != NULL) {
	gdbm_close(md5dbf);
	md5dbf = NULL;
    }
    if (not_visited) {
	fclose(not_visited);
	not_visited = NULL;
    }

    /* The names are still NULL if we get here before initialize() has
     * generated them. */
    if (urldb_filename != NULL) {
	(void) unlink(urldb_filename);
	xfree(urldb_filename);
	urldb_filename = NULL;
    }
    if (hostdb_filename != NULL) {
	(void) unlink(hostdb_filename);
	xfree(hostdb_filename);
	hostdb_filename = NULL;
    }
    if (md5db_filename != NULL) {
	(void) unlink(md5db_filename);
	xfree(md5db_filename);
	md5db_filename = NULL;
    }

    /* Report every depth up to max_depth, then continue only as long as
     * the deeper levels are non-empty. */
    for (i = 0; i < hist_size; i++) {
	if (i > max_depth && depth_hist[i] == 0)
	    break;
	Log("Found %8d objects at depth %d\n", depth_hist[i], i);
    }

    Debug(42, 1, ("httpenum: exiting (signal %d)\n", x));
    exit(0);
}

/* ---------------------------------------------------------------------- */

/*
 *  usage() - Print the command-line synopsis on stderr and exit with
 *  status 1.
 */
static void usage()
{
    fputs("Usage: httpenum http-URL\n", stderr);
    exit(1);
}

/*
 *  main() - Enumerate the tree below the RootNode URL given on the
 *  command line.
 *
 *  Leading arguments that start with '-' are consumed; those starting
 *  with "-D" are passed to debug_flag().  Exactly one argument, an HTTP
 *  URL, must remain.  The root URL is printed on stdout and put on the
 *  work list; the list is then processed front to back while
 *  http_enum() appends the links it finds.  Non-HTTP URLs are not
 *  followed; they are printed with a "Depth=cur:max" field in place of
 *  an MD5 if they pass filter_selection() and are within max_depth.
 *
 *  The program always terminates through sigdie() or usage().
 */
int main(argc, argv)
     int argc;
     char **argv;
{
    URL *up = NULL;
    list_t *l = NULL;
    char *url = NULL;
    int depth = 0;
    int hist_size = (int) (sizeof(depth_hist) / sizeof(depth_hist[0]));

    debug_init();		/* from $HARVEST_DEBUG */
    for (argc--, argv++; argc > 0 && **argv == '-'; argc--, argv++) {
	if (!strncmp(*argv, "-D", 2)) {
	    debug_flag(*argv);
	}
    }

    if (argc != 1)
	usage();

    for (depth = 0; depth < hist_size; depth++)
	depth_hist[depth] = 0;

    signal(SIGTERM, sigdie);	/* Die gracefully */
    signal(SIGINT, sigdie);
    signal(SIGPIPE, sigdie);	/* Quickly clean up on broken pipe */

    initialize();		/* Initialize */

    Debug(42, 1, ("httpenum: Starting...\n"));

    /* Grab the RootNode URL from the command line */
    /* NOTE(review): usage() exits directly, so the temporary database
     * files created by initialize() are left behind on this path. */
    if ((up = url_open(*argv)) == NULL || up->type != URL_HTTP) {
	usage();
    }
    /* Mark the RootNode */
    tree_root = xstrdup(up->url);
    Tail = &head;

    printf("%s\n", up->url);	/* Print tree root */
    add_to_list(up->url, cur_depth);	/* start at depth = cur_depth */
    url_close(up);

    for (l = head; l; l = free_from_list(l)) {
	url = (char *) l->ptr;
	depth = l->depth;

	/* cur_depth comes from the environment, so guard both ends of
	 * the histogram before using depth as an index. */
	if (depth >= 0 && depth < hist_size)
	    depth_hist[depth]++;

	if (max_depth > 0 && depth > max_depth) {
	    if (not_visited)
		fprintf(not_visited, "[DEPTH] %s\n", url);
	    Debug(42, 1, ("Maximum Depth of %d Reached: %s\n",
		    max_depth, url));
	    continue;
	}
	Debug(42, 1, ("Processing: [%2d] %s\n", depth, url));

	if ((up = url_open(url)) == NULL)
	    continue;

	if ((up->type != URL_HTTP)) {
	    Debug(42, 1, ("Examining: [%d:%d] %s\n", depth, max_depth, up->url));
	    /* filter_selection() checks access_mask */
	    if (!filter_selection(up) && (depth <= max_depth)) {
		/* Print URL with bogus MD5 to enumerate; 
		 * flush to keep pipe moving 
		 * URL <tab> MD5 */
		fprintf(stdout, "%s\tDepth=%d:%d\n", up->url,
		    depth, max_depth);
		fflush(stdout);
		Debug(42, 1, ("HTTPENUM Re-enumeration: %s\tDepth=%d:%d\n",
			up->url, depth, max_depth));
	    }
	    url_close(up);
	    continue;
	}
	/* search for more links from this one; they are queued one
	 * level deeper */
	http_enum(up, depth + 1);
	url_close(up);
    }

    finish_url();
    sigdie(0);
    /* NOTREACHED */
    return 0;
}
