/* Revision-control identification string for this source file. */
static char rcsid[] = "filter.c,v 1.28 1996/01/08 09:08:22 duane Exp";
/*
 *  filter.c - RootNode URL enumerator filter support
 *
 *  Darren Hardy, hardy@cs.colorado.edu, December 1994
 *
 *  DEBUG: section  44, level 1, 5      Gatherer enumeration filter routines
 *
 *  ----------------------------------------------------------------------
 *  Copyright (c) 1994, 1995.  All rights reserved.
 *  
 *    The Harvest software was developed by the Internet Research Task
 *    Force Research Group on Resource Discovery (IRTF-RD):
 *  
 *          Mic Bowman of Transarc Corporation.
 *          Peter Danzig of the University of Southern California.
 *          Darren R. Hardy of the University of Colorado at Boulder.
 *          Udi Manber of the University of Arizona.
 *          Michael F. Schwartz of the University of Colorado at Boulder.
 *          Duane Wessels of the University of Colorado at Boulder.
 *  
 *    This copyright notice applies to software in the Harvest
 *    ``src/'' directory only.  Users should consult the individual
 *    copyright notices in the ``components/'' subdirectories for
 *    copyright information about other software bundled with the
 *    Harvest source code distribution.
 *  
 *  TERMS OF USE
 *    
 *    The Harvest software may be used and re-distributed without
 *    charge, provided that the software origin and research team are
 *    cited in any use of the system.  Most commonly this is
 *    accomplished by including a link to the Harvest Home Page
 *    (http://harvest.cs.colorado.edu/) from the query page of any
 *    Broker you deploy, as well as in the query result pages.  These
 *    links are generated automatically by the standard Broker
 *    software distribution.
 *    
 *    The Harvest software is provided ``as is'', without express or
 *    implied warranty, and with no support nor obligation to assist
 *    in its use, correction, modification or enhancement.  We assume
 *    no liability with respect to the infringement of copyrights,
 *    trade secrets, or any patents, and are not responsible for
 *    consequential damages.  Proper use of the Harvest software is
 *    entirely the responsibility of the user.
 *  
 *  DERIVATIVE WORKS
 *  
 *    Users may make derivative works from the Harvest software, subject 
 *    to the following constraints:
 *  
 *      - You must include the above copyright notice and these 
 *        accompanying paragraphs in all forms of derivative works, 
 *        and any documentation and other materials related to such 
 *        distribution and use acknowledge that the software was 
 *        developed at the above institutions.
 *  
 *      - You must notify IRTF-RD regarding your distribution of 
 *        the derivative work.
 *  
 *      - You must clearly notify users that your are distributing 
 *        a modified version and not the original Harvest software.
 *  
 *      - Any derivative product is also subject to these copyright 
 *        and use restrictions.
 *  
 *    Note that the Harvest software is NOT in the public domain.  We
 *    retain copyright, as specified above.
 *  
 *  HISTORY OF FREE SOFTWARE STATUS
 *  
 *    Originally we required sites to license the software in cases
 *    where they were going to build commercial products/services
 *    around Harvest.  In June 1995 we changed this policy.  We now
 *    allow people to use the core Harvest software (the code found in
 *    the Harvest ``src/'' directory) for free.  We made this change
 *    in the interest of encouraging the widest possible deployment of
 *    the technology.  The Harvest software is really a reference
 *    implementation of a set of protocols and formats, some of which
 *    we intend to standardize.  We encourage commercial
 *    re-implementations of code complying to this set of standards.  
 *  
 */
#ifndef USE_POSIX_REGEX
#define USE_POSIX_REGEX		/* must be defined before the includes; always use the POSIX regex interface */
#endif
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <GNUregex.h>
#include "util.h"
#include "url.h"
#include "filter.h"

/*
 *  Character sets used by filter_match() to recognize a host pattern
 *  that is written as a numeric address (some digits, no letters).
 */
#define LETTERS "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
#define NUMBERS "0123456789"

static void init_filterre();

/*
 *  Dotted address of the host currently being checked.  Set by
 *  filter_selection() around its host-filter pass and read by
 *  filter_match(); NULL when no address is available.
 */
static char *host_dotaddr = NULL;

/*
 *  Printable names for filter results.
 *  NOTE(review): presumably indexed by the Filter_* codes declared in
 *  filter.h -- confirm the ordering there before changing this table.
 */
char *Filter_Type_Name[] = {
    "Allowed",
    "Denied",
    "Denied Host",
    "Denied URL",
    "Denied Scheme",
    "Unknown"
};


/*
 *  filter_selection() - Returns non-zero if the enumerator should NOT
 *  follow the URL; otherwise returns 0 (Filter_ALLOW).
 *
 *  Three checks are applied in order and the first one that rejects the
 *  URL decides the result:
 *      1. host filter   -> Filter_DENY_HOST   (matched against "host:port"
 *                          for HTTP, FTP and Gopher URLs, and against the
 *                          bare host name for every other type)
 *      2. URL filter    -> Filter_DENY_URL    (matched against up->pathname)
 *      3. access mask   -> Filter_DENY_ACCESS (the bit for up->type is not
 *                          set in a non-zero access_mask)
 *
 *  While the host filter runs, host_dotaddr holds a private copy of the
 *  host's dotted address so that filter_match() can test numeric
 *  patterns against it.  The copy is released, and the pointer cleared,
 *  before this function moves on.
 */
int filter_selection(up)
     URL *up;
{
    int r = Filter_ALLOW;
    char *hostport = NULL;
    Host *h = NULL;

    if (host_filter != NULL && nhost_filter > 0) {
	if ((h = get_host(up->host)))
	    host_dotaddr = xstrdup(h->dotaddr);
	else
	    host_dotaddr = (char *) 0;
	if ((up->type == URL_HTTP) || (up->type == URL_FTP) ||
	    (up->type == URL_GOPHER)) {
	    /*
	     * Room for the host, ':', any int printed in decimal (at
	     * most 3 digits per byte plus a sign) and the trailing NUL.
	     */
	    hostport = xmalloc(strlen(up->host) + 3 * sizeof(int) + 3);
	    sprintf(hostport, "%s:%d", up->host, up->port);
	    if (filter_match(hostport, host_filter, nhost_filter))
		r = Filter_DENY_HOST;
	    xfree(hostport);
	} else {
	    if (filter_match(up->host, host_filter, nhost_filter))
		r = Filter_DENY_HOST;
	}
	if (host_dotaddr) {
	    xfree(host_dotaddr);
	    /* never leave a dangling pointer for a later filter_match() */
	    host_dotaddr = (char *) 0;
	}
    }
    if (r == Filter_ALLOW && url_filter != NULL && nurl_filter > 0) {
	if (filter_match(up->pathname, url_filter, nurl_filter))
	    r = Filter_DENY_URL;
    }
    if (r == Filter_ALLOW && access_mask != 0) {
	if (!(1 << up->type & access_mask))
	    r = Filter_DENY_ACCESS;
    }
    Debug(44, 1, ("filter_selection: returning '%s' for %s\n",
	    r ? "DON'T FOLLOW" : "FOLLOW", up->url));
    return (r);
}

/*
 *  filter_match() - Tests data against the first ntbl entries of tbl,
 *  in order.  The first entry whose expression matches decides the
 *  result: 1 if that entry is a Deny rule, 0 otherwise.  Returns 0 as
 *  well when nothing matches or when any argument is unusable.
 *
 *  Entries marked Filter_UNKNOWN are skipped.
 */
int filter_match(data, tbl, ntbl)
     char *data;
     struct filter_regex *tbl;
     int ntbl;
{
    struct filter_regex *entry;
    char *subject;
    int idx;

    if (tbl == NULL || ntbl < 1 || data == NULL)
	return 0;

    for (idx = 0; idx < ntbl; idx++) {
	entry = &tbl[idx];
	if (entry->filtertype == Filter_UNKNOWN)
	    continue;

	/*
	 * Hack: for the host filter, a pattern with digits but no
	 * letters is taken to be a numeric address, so it is tested
	 * against the host's dotted address (when one is known)
	 * instead of the name passed in.  Only this entry is affected.
	 */
	subject = data;
	if ((tbl == host_filter) &&
	    (strpbrk(entry->pattern, LETTERS) == (char *) 0) &&
	    (strpbrk(entry->pattern, NUMBERS) != (char *) 0) &&
	    (host_dotaddr != (char *) NULL))
	    subject = host_dotaddr;

	if (!do_match(subject, entry->compiled_pattern))
	    continue;

	Debug(44, 5, ("filter_match: '%s' matches expression '%s'\n", subject, entry->pattern));
	Debug(44, 5, ("filter_match: Returning '%s'\n", entry->filtertype == Filter_DENY ? "DENY" : "ALLOW"));
	return (entry->filtertype == Filter_DENY) ? 1 : 0;
    }
    return 0;
}


/*
 *  filter_initialize() - Initializes the RE-based candidate selection.
 *
 *  Clears the host and URL filter tables, then:
 *    - if host_filterfile is set, allocates a MAX_TYPES-entry host table
 *      and loads it with init_filterre();
 *    - if url_filterfile is set, does the same for the URL table;
 *    - if access_types is set, rebuilds access_mask from it.
 *
 *  access_types is a '|'-separated list of scheme names (FILE, FTP,
 *  GOPHER, HTTP, NEWS, TELNET, WAIS), compared without regard to case.
 *  Empty and unrecognized names are ignored.  The list is scanned in
 *  place, so nothing is allocated and access_types is not modified.
 */
void filter_initialize()
{
    static const struct {
	const char *name;
	int type;
    } schemes[] = {
	{"FILE", URL_FILE},
	{"FTP", URL_FTP},
	{"GOPHER", URL_GOPHER},
	{"HTTP", URL_HTTP},
	{"NEWS", URL_NEWS},
	{"TELNET", URL_TELNET},
	{"WAIS", URL_WAIS}
    };
    const char *tok;
    size_t len;
    size_t k;
    int i;

    host_filter = url_filter = NULL;
    nhost_filter = nurl_filter = 0;
    if (host_filterfile != NULL) {
	host_filter = xmalloc(MAX_TYPES * sizeof(struct filter_regex));
	for (i = 0; i < MAX_TYPES; i++)
	    host_filter[i].filtertype = Filter_UNKNOWN;
	init_filterre(host_filterfile, host_filter, &nhost_filter);
    }
    if (url_filterfile != NULL) {
	url_filter = xmalloc(MAX_TYPES * sizeof(struct filter_regex));
	for (i = 0; i < MAX_TYPES; i++)
	    url_filter[i].filtertype = Filter_UNKNOWN;
	init_filterre(url_filterfile, url_filter, &nurl_filter);
    }
    if (access_types != NULL) {
	access_mask = 0;
	tok = access_types;
	while (*tok != '\0') {
	    /* length of the name up to the next '|' or end of string */
	    len = strcspn(tok, "|");
	    for (k = 0; len > 0 && k < sizeof(schemes) / sizeof(schemes[0]); k++) {
		/* whole-name match only: lengths must agree too */
		if (strlen(schemes[k].name) == len &&
		    strncasecmp(tok, schemes[k].name, len) == 0) {
		    access_mask |= 1 << schemes[k].type;
		    break;
		}
	    }
	    tok += len;
	    if (*tok == '|')
		tok++;
	}
    }
}


/*
 *  init_filterre() - Loads filter rules from filename into the table t.
 *  Each rule is stored at t[*nt] and *nt is advanced, so on return *nt
 *  is the number of entries in use.  Nothing is returned; problems are
 *  reported through the logging routines:
 *      - if the file cannot be opened, the table is left untouched;
 *      - a line without both a directive and a pattern is ignored;
 *      - a pattern that does not compile is dropped;
 *      - an unrecognized directive is stored as Filter_UNKNOWN;
 *      - reading stops once MAX_TYPES entries are in use.
 *
 *  File format looks like:
 *      # comment
 *      Allow   Regular-Expression
 *      Deny    Regular-Expression
 *
 *  Leading white space is skipped before a line is examined, so blank
 *  lines and comments may be indented.  Directives are not case
 *  sensitive.
 */
static void init_filterre(filename, t, nt)
     char *filename;
     struct filter_regex *t;
     int *nt;
{
    FILE *fp = NULL;
    char *s = NULL;
    int ret;
    static char buf[BUFSIZ];
    static char pat[BUFSIZ];
    static char what[BUFSIZ];

    if ((fp = fopen(filename, "r")) == NULL) {
	log_errno(filename);
	return;
    }
    while (fgets(buf, BUFSIZ, fp) != NULL) {
	if ((s = strrchr(buf, '\n')) != NULL)
	    *s = '\0';
	/* <ctype.h> functions need an unsigned char value, not plain char */
	for (s = &buf[0]; isspace((unsigned char) *s); s++);
	/* decide blank/comment only after the indentation is skipped */
	if (*s == '\0' || *s == '#')
	    continue;
	/* what[] and pat[] are as large as buf[], so %s cannot overrun */
	if (sscanf(s, "%s %s", what, pat) != 2) {
	    errorlog("Ignoring in %s: %s\n", filename, buf);
	    continue;
	}
	if (!strcasecmp(what, "allow"))
	    t[*nt].filtertype = Filter_ALLOW;
	else if (!strcasecmp(what, "deny"))
	    t[*nt].filtertype = Filter_DENY;
	else {
	    t[*nt].filtertype = Filter_UNKNOWN;
	    errorlog("%s: Unknown filter directive: %s\n",
		filename, what);
	}

	t[*nt].pattern = strdup(pat);
	if (t[*nt].pattern == NULL) {
	    /* never hand a NULL pattern to regcomp() */
	    errorlog("%s: Out of memory storing pattern: %s\n",
		filename, pat);
	    t[*nt].filtertype = Filter_UNKNOWN;
	    break;
	}

	ret = regcomp(&t[*nt].compiled_pattern, t[*nt].pattern,
	    USE_RE_SYNTAX);
	if (ret != 0) {
	    errorlog("Could not compile regular expression: %s\n",
		t[*nt].pattern);
	    xfree(t[*nt].pattern);
	    t[*nt].pattern = NULL;
	    t[*nt].filtertype = Filter_UNKNOWN;
	    continue;		/* slot is reused by the next rule */
	}
	if (++(*nt) >= MAX_TYPES) {
	    Log("WARNING: %s has too many types.\n", filename);
	    break;
	}
    }
    fclose(fp);
}
