/* RCS revision identification string for this source file. */
static char rcsid[] = "gopherenum.c,v 1.58 1996/01/30 20:38:04 duane Exp";
/*
 *  gopherenum.c - RootNode URL enumerator for Gopher URLs
 *
 *  Usage: gopherenum gopher-URL
 *
 *  Outputs the following format:
 *
 *      URL of tree root
 *      URL <tab> md5
 *      ...
 *      URL <tab> md5
 * 
 *  DEBUG: section  43, level 1, 5, 9   Gatherer enumeration for Gopher
 *
 *  Darren Hardy, hardy@cs.colorado.edu, April 1994
 *  Duane Wessels, wessels@cs.colorado.edu, January 1996
 *
 *  ----------------------------------------------------------------------
 *  Copyright (c) 1994, 1995.  All rights reserved.
 *  
 *    The Harvest software was developed by the Internet Research Task
 *    Force Research Group on Resource Discovery (IRTF-RD):
 *  
 *          Mic Bowman of Transarc Corporation.
 *          Peter Danzig of the University of Southern California.
 *          Darren R. Hardy of the University of Colorado at Boulder.
 *          Udi Manber of the University of Arizona.
 *          Michael F. Schwartz of the University of Colorado at Boulder.
 *          Duane Wessels of the University of Colorado at Boulder.
 *  
 *    This copyright notice applies to software in the Harvest
 *    ``src/'' directory only.  Users should consult the individual
 *    copyright notices in the ``components/'' subdirectories for
 *    copyright information about other software bundled with the
 *    Harvest source code distribution.
 *  
 *  TERMS OF USE
 *    
 *    The Harvest software may be used and re-distributed without
 *    charge, provided that the software origin and research team are
 *    cited in any use of the system.  Most commonly this is
 *    accomplished by including a link to the Harvest Home Page
 *    (gopher://harvest.cs.colorado.edu/) from the query page of any
 *    Broker you deploy, as well as in the query result pages.  These
 *    links are generated automatically by the standard Broker
 *    software distribution.
 *    
 *    The Harvest software is provided ``as is'', without express or
 *    implied warranty, and with no support nor obligation to assist
 *    in its use, correction, modification or enhancement.  We assume
 *    no liability with respect to the infringement of copyrights,
 *    trade secrets, or any patents, and are not responsible for
 *    consequential damages.  Proper use of the Harvest software is
 *    entirely the responsibility of the user.
 *  
 *  DERIVATIVE WORKS
 *  
 *    Users may make derivative works from the Harvest software, subject 
 *    to the following constraints:
 *  
 *      - You must include the above copyright notice and these 
 *        accompanying paragraphs in all forms of derivative works, 
 *        and any documentation and other materials related to such 
 *        distribution and use acknowledge that the software was 
 *        developed at the above institutions.
 *  
 *      - You must notify IRTF-RD regarding your distribution of 
 *        the derivative work.
 *  
 *      - You must clearly notify users that your are distributing 
 *        a modified version and not the original Harvest software.
 *  
 *      - Any derivative product is also subject to these copyright 
 *        and use restrictions.
 *  
 *    Note that the Harvest software is NOT in the public domain.  We
 *    retain copyright, as specified above.
 *  
 *  HISTORY OF FREE SOFTWARE STATUS
 *  
 *    Originally we required sites to license the software in cases
 *    where they were going to build commercial products/services
 *    around Harvest.  In June 1995 we changed this policy.  We now
 *    allow people to use the core Harvest software (the code found in
 *    the Harvest ``src/'' directory) for free.  We made this change
 *    in the interest of encouraging the widest possible deployment of
 *    the technology.  The Harvest software is really a reference
 *    implementation of a set of protocols and formats, some of which
 *    we intend to standardize.  We encourage commercial
 *    re-implementations of code complying to this set of standards.  
 *  
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <memory.h>
#include <string.h>
#include <signal.h>
#include <gdbm.h>
#include <GNUregex.h>
#include "util.h"
#include "url.h"
#define PUBLIC extern
#include "filter.h"

/*
 *  list_t - one node of the singly-linked work list of URLs that are
 *  still waiting to be processed.
 */
typedef struct _list_t {
    void *ptr;			/* heap copy of the URL string (made in add_to_list) */
    int depth;			/* depth value supplied when the URL was queued */
    struct _list_t *next;	/* following node, or NULL at the end of the list */
} list_t;

/* First node of the work list; main() starts walking the list here. */
list_t *head = NULL;
/*
 * Address of the link through which the next node is appended.  main()
 * points it at `head' before the first add_to_list() call.
 */
list_t **Tail = NULL;

/*
 *  Define HOST_COUNT_IP to count visited hosts by IP address rather than
 *  by the hostname given in the URL, so that several aliases of the same
 *  machine are treated as a single host.
 */
#define HOST_COUNT_IP

/* Global variables */
int max_depth = 0;		/* from $HARVEST_DEPTH_MAX; 0 disables the depth check in main() */
int cur_depth = 0;		/* depth given to the root URL, from $HARVEST_DEPTH_CUR */
int depth_hist[100];		/* number of URLs taken off the work list, indexed by depth */

/* Local variables */
static int url_max = 0;		/* enumeration stops once this many URLs were output */
static int nurls = 0;		/* URLs output so far; incremented in mark_retrieved() */
static int host_max = 0;	/* limit on the number of hosts visit_server() accepts */
static int nhosts = 0;		/* host counter maintained by visit_server() */
static char *tree_root = NULL;	/* copy of the RootNode URL, used in log messages */
static char *urldb_filename = NULL;	/* temporary file backing urldbf */
static char *hostdb_filename = NULL;	/* temporary file backing hostdbf */
static char *md5db_filename = NULL;	/* temporary file backing md5dbf */
static GDBM_FILE urldbf = NULL;	/* URL -> MD5 for every retrieved object */
static GDBM_FILE hostdbf = NULL;	/* host key -> URL stored when the host was accepted */
static GDBM_FILE md5dbf = NULL;	/* MD5 -> URL for every retrieved object */

/* Optional log of rejected URLs, opened from $HARVEST_NOT_VISITED_LOG */
static FILE *not_visited = NULL;

/* Local functions (old-style declarations; the definitions follow below) */
static void usage();
static void mark_retrieved();
static void sigdie();
static int url_in_db();
static int md5_in_db();
static int gopher_enum();

/* Defined outside this file; url_is_allowed() rejects a URL when it returns 0 */
extern int RobotsTxtCheck _PARAMS((URL *));

/*
 *  add_to_list() - Append a private copy of `url', tagged with `depth',
 *  to the end of the global work list.  Returns the node just created.
 */
list_t *add_to_list(url, depth)
     char *url;
     int depth;
{
    list_t *node = (list_t *) xmalloc(sizeof(list_t));

    node->ptr = (void *) xstrdup(url);
    node->depth = depth;
    node->next = NULL;

    /* Link the node in through Tail, then move Tail to the new end */
    *Tail = node;
    Tail = &node->next;

    return node;
}

/*
 *  free_from_list() - Release node `l' together with the string it owns
 *  and return the node that followed it (NULL at the end of the list).
 */
list_t *free_from_list(l)
     list_t *l;
{
    list_t *successor = l->next;	/* read the link before the node is freed */

    xfree(l->ptr);
    xfree(l);
    return successor;
}

/* ---------------------------------------------------------------------- */

/*
 *  mark_retrieved() - Record that the URL in `up' has been fetched so it
 *  is never fetched a second time (this is what breaks cycles), then
 *  print it on stdout as "URL <tab> MD5".
 *
 *  The URL is stored in the URL database keyed by URL, and in the MD5
 *  database keyed by MD5.  When the number of printed URLs reaches
 *  url_max the URL is closed and the program terminates via sigdie().
 */
static void mark_retrieved(up)
     URL *up;
{
    datum key, val;

    Debug(43, 9, ("mark_retrieved: url='%s', md5='%s'\n", up->url, up->md5));

    /* Keys and values are stored including their terminating NUL */
    key.dptr = xstrdup(up->url);
    key.dsize = strlen(key.dptr) + 1;
    val.dptr = xstrdup(up->md5);
    val.dsize = strlen(val.dptr) + 1;

    if (!gdbm_exists(urldbf, key)) {
	if (gdbm_store(urldbf, key, val, GDBM_INSERT))
	    fatal("GDBM URLDB: %s: %s", key.dptr, gdbm_strerror(gdbm_errno));
    }
    if (!gdbm_exists(md5dbf, val)) {
	if (gdbm_store(md5dbf, val, key, GDBM_INSERT))
	    fatal("GDBM MD5DB: %s: %s", key.dptr, gdbm_strerror(gdbm_errno));
    }

    xfree(key.dptr);
    xfree(val.dptr);

    /* Emit the enumerated URL; flush so a reader on a pipe sees it now */
    printf("%s\t%s\n", up->url, up->md5);
    fflush(stdout);

    nurls++;
    if (nurls < url_max)
	return;

    Log("Truncating RootNode %s at %d LeafNode URLs\n",
	tree_root, url_max);
    url_close(up);
    sigdie(0);
}

/*
 *  url_in_db() - Return non-zero if `url' is already a key in the URL
 *  database, zero otherwise.
 */
static int url_in_db(url)
     char *url;
{
    datum key;
    int found;

    Debug(43, 9, ("url_in_db: checking for url='%s'\n", url));

    /* The key length includes the terminating NUL, as when it was stored */
    key.dptr = xstrdup(url);
    key.dsize = strlen(key.dptr) + 1;

    found = gdbm_exists(urldbf, key);

    xfree(key.dptr);
    return found;
}

/*
 *  md5_in_db() - Return non-zero if `md5' is already a key in the MD5
 *  database, zero otherwise.
 */
static int md5_in_db(md5)
     char *md5;
{
    datum key;
    int found;

    /* The key length includes the terminating NUL, as when it was stored */
    key.dptr = xstrdup(md5);
    key.dsize = strlen(key.dptr) + 1;

    found = gdbm_exists(md5dbf, key);

    xfree(key.dptr);
    return found;
}

/*
 *  host_in_db() - Return non-zero if `host' is already a key in the host
 *  database, zero otherwise.
 *
 *  With HOST_COUNT_IP defined the lookup key is the dotted address
 *  obtained through get_host(); a host for which get_host() returns
 *  NULL is reported as not present.
 */
static int host_in_db(host)
     char *host;
{
    datum key;
    int found;
#ifdef HOST_COUNT_IP
    Host *hp = get_host(host);

    if (hp == NULL)
	return 0;
    key.dptr = xstrdup(hp->dotaddr);
#else
    key.dptr = xstrdup(host);
#endif
    key.dsize = strlen(key.dptr) + 1;

    found = gdbm_exists(hostdbf, key);

    xfree(key.dptr);
    return found;
}

/*
 *  visit_server() - Determine if we should visit the server named in
 *  `up'.  Return zero if we should not process the URL; otherwise,
 *  return non-zero.
 *
 *  A host that is already in the host database is always accepted.  A
 *  new host is accepted, and stored in the database with the URL that
 *  introduced it, only while fewer than host_max hosts have been
 *  accepted.
 *
 *  The host counter is advanced only after the host has been resolved,
 *  so a host that cannot be resolved does not use up one of the
 *  host_max slots, and the counter stops growing once the limit has
 *  been reached.
 */
static int visit_server(up)
     URL *up;
{
    datum k, d;
#ifdef HOST_COUNT_IP
    Host *h = NULL;
#endif

    if (host_in_db(up->host))	/* Host is already in the db */
	return (1);
    if (nhosts >= host_max)	/* No room left for another host */
	return (0);

#ifdef HOST_COUNT_IP
    h = get_host(up->host);
    if (!h)
	return (0);		/* Unresolvable; must not count as visited */
    k.dptr = xstrdup(h->dotaddr);
#else
    k.dptr = xstrdup(up->host);
#endif
    nhosts++;			/* The host is usable: it takes a slot now */

    /* Key and value are stored including their terminating NUL */
    k.dsize = strlen(k.dptr) + 1;
    d.dptr = xstrdup(up->url);
    d.dsize = strlen(d.dptr) + 1;

    if (gdbm_store(hostdbf, k, d, GDBM_INSERT))
	fatal("GDBM HOSTDB: %s: %s", k.dptr, gdbm_strerror(gdbm_errno));
    xfree(k.dptr);
    xfree(d.dptr);
    return (1);
}

/*
 *  url_is_allowed() - Decide whether `url' should be added to the work
 *  list.  Returns 1 if it should, 0 if not.
 *
 *  A URL is rejected when it cannot be parsed by url_open(), is already
 *  in the URL database, is removed by filter_selection(), belongs to a
 *  server that visit_server() refuses, or is turned down by
 *  RobotsTxtCheck().  The last three rejections are also written to the
 *  not-visited log when that log is open.
 *
 *  The temporary URL handle is closed on every path, including the
 *  accepting one.
 */
int url_is_allowed(url)
     char *url;
{
    URL *tup = NULL;
    int y;
    int allowed = 0;

    if ((tup = url_open(url)) == NULL)
	return 0;

    if (url_in_db(tup->url)) {	/* Have we been here? */
	Debug(43, 1, ("Already Visited URL: %s\n", tup->url));
    } else if ((y = filter_selection(tup))) {
	/* Non-zero result identifies the filter that removed the URL */
	Debug(43, 1, ("Removing Candidate: [%s] %s\n",
		Filter_Type_Name[y], tup->url));
	if (not_visited)
	    fprintf(not_visited, "[FILTER] %s\n", tup->url);
    } else if (!visit_server(tup)) {
	Debug(43, 1, ("Server count exceeded: %s\n",
		tup->url));
	if (not_visited)
	    fprintf(not_visited, "[SERVER] %s\n", tup->url);
    } else if (!RobotsTxtCheck(tup)) {
	Debug(43, 1, ("Disallowed by robots.txt: %s\n", tup->url));
	if (not_visited)
	    fprintf(not_visited, "[ROBOTS.TXT] %s\n", tup->url);
    } else {
	allowed = 1;
    }

    url_close(tup);
    return allowed;
}

/*
 *  gopher_enum() - Retrieve the Gopher object named by `up', record it
 *  as visited and, unless up->gophertype is 0, read the retrieved file
 *  as a Gopher menu and queue every acceptable entry on the global work
 *  list with the given `depth'.
 *
 *  Each menu line is parsed as
 *      <type char><name> TAB <path> TAB <host> TAB <port> [TAB ...]
 *  and a line consisting of "." ends the menu.  Malformed lines, lines
 *  without a type character, ftp:/exec: selectors and entries whose URL
 *  would not fit into the URL buffer are skipped.
 *
 *  Returns 1 if the menu file was scanned.  Returns 0 if the URL or its
 *  MD5 was already seen, the URL could not be retrieved, gophertype is
 *  0, or the retrieved file cannot be opened.
 */
static int gopher_enum(up, depth)
     URL *up;
     int depth;
{
    FILE *fp = NULL;
    char *s = NULL;
    char *p = NULL;
    char *q = NULL;
    char *gopher_name = NULL;
    char *gopher_path = NULL;
    char *gopher_host = NULL;
    char *gopher_port = NULL;
    int nqueued = 0;		/* entries actually added to the work list */
    static char buf[BUFSIZ];
    static char urlbuf[BUFSIZ];
    static char newurl[BUFSIZ];

    if (url_in_db(up->url)) {	/* Have we been here? */
	Debug(43, 1, ("Already Visited URL: %s\n", up->url));
	return 0;
    }
    if (url_retrieve(up)) {	/* Grab the URL; success? */
	Debug(43, 1, ("Cannot Retrieve URL: %s\n", up->url));
	return 0;
    }
    if (up->md5 && md5_in_db(up->md5)) {	/* Have we been here? */
	Debug(43, 1, ("Already Visited MD5: %s\n", up->url));
	return 0;
    }
    /* Remember that we've been here before */
    if (up->md5)
	mark_retrieved(up);
    if (up->gophertype == 0)
	return 0;

    /* 
     *  For each menu entry, convert it to a URL, and add it to
     *  the global list of URLs to process.
     */
    if ((fp = fopen(up->filename, "r")) == NULL) {
	log_errno2(__FILE__, __LINE__, up->filename);
	return 0;
    }
    while (fgets(buf, BUFSIZ, fp)) {
	/* Turn a CR into LF so either one terminates the last field */
	if ((s = strchr(buf, '\r')))
	    *s = (char) '\n';
	/* urlbuf gets cut into fields; buf stays whole for messages */
	strcpy(urlbuf, buf);
	if ((s = strchr(buf, '\n')))
	    *s = (char) '\0';
	Debug(43, 3, ("Input: %s\n", buf));
	if (!strcmp(buf, "."))
	    break;

	p = urlbuf;
	if ((q = strchr(p, '\t')) == NULL) {
	    errorlog("Illegal Gopher format: No Name: %s\n", buf);
	    goto gopher_enum_cont;
	}
	*q = (char) '\0';
	gopher_name = xstrdup(p);
	Debug(43, 5, ("gopher_name = '%s'\n", gopher_name));

	/*
	 *  The first byte of the name field is the item type that goes
	 *  into the URL.  An empty field would put a NUL there and cut
	 *  the generated URL short, so reject it.
	 */
	if (gopher_name[0] == '\0') {
	    errorlog("Illegal Gopher format: No Type: %s\n", buf);
	    goto gopher_enum_cont;
	}

	p = q + 1;
	if ((q = strchr(p, '\t')) == NULL) {
	    errorlog("Illegal Gopher format: No Path: %s\n", buf);
	    goto gopher_enum_cont;
	}
	*q = (char) '\0';
	gopher_path = xstrdup(rfc1738_escape(p));
	Debug(43, 5, ("gopher_path = '%s'\n", gopher_path));

	p = q + 1;
	if ((q = strchr(p, '\t')) == NULL) {
	    errorlog("Illegal Gopher format: No Host: %s\n", buf);
	    goto gopher_enum_cont;
	}
	*q = (char) '\0';
	gopher_host = xstrdup(p);
	Debug(43, 5, ("gopher_host = '%s'\n", gopher_host));

	/* The port is ended by a further TAB or by the end of the line */
	p = q + 1;
	q = strchr(p, '\t');
	if (q == NULL)
	    q = strchr(p, '\n');
	if (q == NULL) {
	    errorlog("Illegal Gopher format: No Port: %s\n", buf);
	    goto gopher_enum_cont;
	}
	*q = (char) '\0';
	gopher_port = xstrdup(p);
	Debug(43, 5, ("gopher_port = '%s'\n", gopher_port));

	/* Fix for weird cross-site Gopher links - wessels */
	if (!strncasecmp(gopher_path, "ftp%3a", 6))
	    goto gopher_enum_cont;
	if (!strncasecmp(gopher_path, "ftp:", 4))
	    goto gopher_enum_cont;
	if (!strncasecmp(gopher_path, "exec%3a", 7))
	    goto gopher_enum_cont;
	if (!strncasecmp(gopher_path, "exec:", 5))
	    goto gopher_enum_cont;

	/*
	 *  newurl has to hold "gopher://" + host + ":" + port + "/" +
	 *  type + path + NUL.  Everything except host and path needs at
	 *  most 24 bytes (the port is printed as a decimal int); 32
	 *  leaves some slack.  Host and path come from the remote menu
	 *  and the path may have grown through escaping, so check before
	 *  formatting.
	 */
	if (strlen(gopher_host) + strlen(gopher_path) + 32 > sizeof(newurl)) {
	    errorlog("Gopher URL too long, skipping: %s\n", buf);
	    goto gopher_enum_cont;
	}
	sprintf(newurl, "gopher://%s:%d/%c%s", gopher_host,
	    atoi(gopher_port), gopher_name[0], gopher_path);
	if (url_is_allowed(newurl)) {
	    add_to_list(newurl, depth);
	    nqueued++;
	}

      gopher_enum_cont:
	/* Fields not reached on this line are still NULL here */
	xfree(gopher_name);
	gopher_name = NULL;
	xfree(gopher_path);
	gopher_path = NULL;
	xfree(gopher_host);
	gopher_host = NULL;
	xfree(gopher_port);
	gopher_port = NULL;

    }
    fclose(fp);
    Debug(43, 1, ("Adding %d URLs from %s to workload\n", nqueued, up->url));
    return 1;
}

/* ---------------------------------------------------------------------- */

/*
 *  new_temp_name() - Return an xstrdup()'ed temporary file name that
 *  tempnam() generated with the given prefix.  Reports the problem
 *  through fatal() when no name can be generated.
 */
static char *new_temp_name(prefix)
     char *prefix;
{
    char *generated = tempnam(NULL, prefix);
    char *copy = NULL;

    if (generated == NULL) {
	log_errno("tempnam");
	fatal("tempnam: cannot generate a temporary file name (%s)", prefix);
    }
    /* Keep an xstrdup() copy, since sigdie() releases it with xfree() */
    copy = xstrdup(generated);
    free(generated);		/* tempnam() hands out malloc()ed storage */
    return copy;
}

/*
 *  initialize() - Basic init routines
 *
 *  Reads the limits and filter settings from the HARVEST_* environment
 *  variables, sets up logging, the URL library and the filters, creates
 *  the three temporary GDBM databases that remember visited URLs, hosts
 *  and MD5s, and opens the optional not-visited log.  Failure to create
 *  a database is fatal.
 */
static void initialize()
{
    char *s = NULL;
    extern int liburl_conform_rfc1738;
    FILE *logfp = NULL;

#ifdef USE_HOST_CACHE
    host_cache_init();
#endif

    cur_depth = max_depth = url_max = host_max = 0;
    if ((s = getenv("HARVEST_URL_MAX")) != NULL)
	url_max = atoi(s);
    if ((s = getenv("HARVEST_HOST_MAX")) != NULL)
	host_max = atoi(s);
    if ((s = getenv("HARVEST_DEPTH_MAX")) != NULL)
	max_depth = atoi(s);
    if ((s = getenv("HARVEST_DEPTH_CUR")) != NULL)
	cur_depth = atoi(s);
    /* Missing or non-positive settings fall back to built-in values */
    if (url_max < 1)
	url_max = 250;		/* hard-coded maximum */
    if (host_max < 1)
	host_max = 1;		/* hard-coded maximum */
    if (max_depth < 1)
	max_depth = 0;		/* hard-coded maximum */
    host_filterfile = getenv("HARVEST_HOST_FILTER");
    url_filterfile = getenv("HARVEST_URL_FILTER");
    access_types = getenv("HARVEST_ACCESS_TYPES");

    /* Log to the gatherer log file when it can be opened, else to stderr */
    if ((s = getenv("HARVEST_GATHERER_LOGFILE")) != (char *) NULL)
	logfp = fopen(s, "a+");
    if (logfp == (FILE *) NULL)
	logfp = stderr;
    init_log3("gopherenum", logfp, stderr);
    init_url();
    liburl_conform_rfc1738 = 1;
    filter_initialize();
    Debug(43, 5, ("access_mask: %#02X\n", access_mask));

    /* Open GDBM databases to keep track of where we've been */
    urldb_filename = new_temp_name("Gurl");
    urldbf = gdbm_open(urldb_filename, 0, GDBM_NEWDB, 0644, NULL);
    if (urldbf == NULL) {
	log_errno(urldb_filename);
	fatal("gdbm_open: %s: %s", urldb_filename,
	    gdbm_strerror(gdbm_errno));
    }
    hostdb_filename = new_temp_name("Ghost");
    hostdbf = gdbm_open(hostdb_filename, 0, GDBM_NEWDB, 0644, NULL);
    if (hostdbf == NULL) {
	log_errno(hostdb_filename);
	fatal("gdbm_open: %s: %s", hostdb_filename,
	    gdbm_strerror(gdbm_errno));
    }
    md5db_filename = new_temp_name("Gmd5");
    md5dbf = gdbm_open(md5db_filename, 0, GDBM_NEWDB, 0644, NULL);
    if (md5dbf == NULL) {
	log_errno(md5db_filename);
	fatal("gdbm_open: %s: %s", md5db_filename,
	    gdbm_strerror(gdbm_errno));
    }
    /* open not-visited file */
    if ((s = getenv("HARVEST_NOT_VISITED_LOG")) != NULL)
	not_visited = fopen(s, "a+");
    if (not_visited)
	setbuf(not_visited, NULL);	/* unbuffered */
}

/*
 *  sigdie() - Die gracefully: close the databases and the not-visited
 *  log, remove the temporary database files, log the per-depth object
 *  counts and exit with status 0.
 *
 *  `x' is only used in the final debug message; main() installs this
 *  function as the SIGTERM/SIGINT/SIGPIPE handler and the rest of the
 *  file calls it directly with 0.
 *
 *  Because main() installs the handler before initialize() has run,
 *  any of the handles and file names may still be NULL here.  Each one
 *  is checked, and cleared once released so that a second entry cannot
 *  release it again.
 */
static void sigdie(x)
     int x;
{
    int i;
#ifdef USE_HOST_CACHE
    dump_host_cache(43, 9);
#endif
    if (urldbf != NULL) {
	gdbm_close(urldbf);
	urldbf = NULL;
    }
    if (hostdbf != NULL) {
	gdbm_close(hostdbf);
	hostdbf = NULL;
    }
    if (md5dbf != NULL) {
	gdbm_close(md5dbf);
	md5dbf = NULL;
    }
    if (not_visited) {
	fclose(not_visited);
	not_visited = NULL;
    }

    if (urldb_filename != NULL) {
	(void) unlink(urldb_filename);
	xfree(urldb_filename);
	urldb_filename = NULL;
    }
    if (hostdb_filename != NULL) {
	(void) unlink(hostdb_filename);
	xfree(hostdb_filename);
	hostdb_filename = NULL;
    }
    if (md5db_filename != NULL) {
	(void) unlink(md5db_filename);
	xfree(md5db_filename);
	md5db_filename = NULL;
    }

    /* Report every depth up to max_depth, then on until the first empty one */
    for (i = 0; i < 100; i++) {
	if (i > max_depth && depth_hist[i] == 0)
	    break;
	Log("Found %8d objects at depth %d\n", depth_hist[i], i);
    }

    Debug(43, 1, ("gopherenum: exiting (signal %d)\n", x));
    exit(0);
}

/* ---------------------------------------------------------------------- */

/*
 *  usage() - Print the command-line synopsis on stderr and exit with
 *  status 1.
 */
static void usage()
{
    fputs("Usage: gopherenum gopher-URL\n", stderr);
    exit(1);
}

/*
 *  main() - Enumerate the Gopher tree rooted at the URL given on the
 *  command line.
 *
 *  Leading arguments that start with '-' are consumed; those starting
 *  with "-D" are passed to debug_flag().  Exactly one argument, the
 *  root URL, must remain and it must be a Gopher URL.  The root URL is
 *  printed and queued, and then the work list is processed in order:
 *  gopher_enum() may append further URLs while the list is being
 *  walked.  The program always leaves through sigdie().
 */
int main(argc, argv)
     int argc;
     char **argv;
{
    URL *up = NULL;
    list_t *l = NULL;
    char *url = NULL;
    int depth = 0;

    debug_init();		/* from $HARVEST_DEBUG */
    for (argc--, argv++; argc > 0 && **argv == '-'; argc--, argv++) {
	if (!strncmp(*argv, "-D", 2)) {
	    debug_flag(*argv);
	}
    }

    if (argc != 1)
	usage();

    for (depth = 0; depth < 100; depth++)
	depth_hist[depth] = 0;

    signal(SIGTERM, sigdie);	/* Die gracefully */
    signal(SIGINT, sigdie);
    signal(SIGPIPE, sigdie);	/* Quickly clean up on broken pipe */

    initialize();		/* Initialize */

    Debug(43, 1, ("gopherenum: Starting...\n"));

    /* Grab the RootNode URL from the command line */
    if ((up = url_open(*argv)) == NULL || up->type != URL_GOPHER) {
	usage();
    }
    /* Mark the RootNode */
    tree_root = xstrdup(up->url);
    Tail = &head;

    printf("%s\n", up->url);	/* Print tree root */
    add_to_list(up->url, cur_depth);	/* root starts at $HARVEST_DEPTH_CUR */
    url_close(up);

    for (l = head; l; l = free_from_list(l)) {
	url = (char *) l->ptr;
	depth = l->depth;

	/*
	 *  The depth derives from $HARVEST_DEPTH_CUR and so may be
	 *  negative; only count it when it is a valid histogram index.
	 */
	if (depth >= 0 && depth < 100)
	    depth_hist[depth]++;

	if (max_depth > 0 && depth > max_depth) {
	    if (not_visited)
		fprintf(not_visited, "[DEPTH] %s\n", url);
	    Debug(43, 1, ("Maximum Depth of %d Reached: %s\n",
		    max_depth, url));
	    continue;
	}
	Debug(43, 1, ("Processing: [%2d] %s\n", depth, url));

	if ((up = url_open(url)) == NULL)
	    continue;

	/* search for more links from this one */
	gopher_enum(up, depth + 1);
	url_close(up);
    }

    finish_url();
    sigdie(0);
    /* NOTREACHED */
}
