
/*
 *  DEBUG: section  48, level 1, 5      Gatherer enumeration robots.txt stuff
 */

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "util.h"
#include "url.h"


/*
 *  Verdicts returned by RobotsTxtCheck():
 *    ROBOTS_TXT_DISALLOW - a Disallow pattern matched the URL's path
 *                          (also returned for a NULL URL).
 *    ROBOTS_TXT_ALLOW    - no record or no pattern applies to the URL.
 */
#define ROBOTS_TXT_DISALLOW 0
#define ROBOTS_TXT_ALLOW 1

/*
 *  word_list - one node of a singly-linked list of strings.  Used for
 *  both the User-Agent names and the Disallow patterns of a record.
 */
typedef struct _word_list {
    /* Value copied from the robots.txt line with xstrdup(); RobotsTxtLoad
     * does not assign it when the line carries no value. */
    char *word;
    /* Next node in the list; nodes are appended in file order. */
    struct _word_list *next;
} word_list;

/*
 *  record - one robots.txt record: a group of User-Agent lines followed
 *  by the Disallow lines that apply to those agents.
 */
typedef struct _record {
    /* Values of the record's User-Agent lines. */
    word_list *user_agent;
    /* Values of the record's Disallow lines (path prefixes). */
    word_list *disallow;
    /* Next record parsed from the same robots.txt file. */
    struct _record *next;
} record;

/*
 *  robots_txt - the parsed robots.txt data kept for one server.
 */
typedef struct _robots_txt {
    /* Lookup key built by RobotsTxtCheck: "scheme://host", with ":port"
     * appended when the port is not the scheme's default. */
    char *server;
    /* Records parsed from the server's robots.txt, in file order. */
    record *record_list;
    /* Next server entry in the global list. */
    struct _robots_txt *next;
} robots_txt;

/* Head of the list of per-server robots.txt entries loaded so far. */
robots_txt *RobotsTxtHead = NULL;
/* Address of the link where RobotsTxtLoad appends the next entry;
 * starts out pointing at the head pointer itself. */
robots_txt **RobotsTxtTail = &RobotsTxtHead;

/* Look up an already-loaded entry by its server string. */
static robots_txt *RobotsTxtFindServer _PARAMS((char *));
/* Find the record of an entry that applies to a User-Agent name. */
static record *RobotsTxtFindRecord _PARAMS((robots_txt *, char *));
/* Fetch and parse robots.txt for a server and URL type. */
static robots_txt *RobotsTxtLoad _PARAMS((char *, int));
/* Public entry point: ROBOTS_TXT_ALLOW or ROBOTS_TXT_DISALLOW for a URL. */
int RobotsTxtCheck _PARAMS((URL * up));

/* User-Agent name that RobotsTxtCheck looks up in robots.txt records. */
static char *this_UA = "Harvest";

/*
 *  pattern_match() - Test whether a Disallow pattern applies to a path.
 *
 *  pattern is the value of one Disallow line, or NULL when the line had
 *  no value; path is the path of the URL being checked.
 *
 *  Returns 1 when path begins with pattern (compared without regard to
 *  case), 0 otherwise.  A NULL pattern never matches: the robots
 *  exclusion standard defines an empty Disallow value as "everything may
 *  be retrieved".  A NULL path never matches either, since there is
 *  nothing to compare against.
 */
static int pattern_match(pattern, path)
     char *pattern;
     char *path;
{
    if (pattern == (char *) NULL || path == (char *) NULL)
	return 0;
    /* Prefix comparison: only the first strlen(pattern) bytes count. */
    if (strncasecmp(pattern, path, strlen(pattern)) == 0)
	return 1;
    return 0;
}

/*
 *  RobotsTxtCheck() - Decide whether robots.txt lets this robot fetch a URL.
 *
 *  Builds the server key "scheme://host[:port]" from the URL, looks up
 *  the robots.txt data for that server (loading it on first use), picks
 *  the record that applies to this_UA and compares each of its Disallow
 *  patterns with the URL's raw pathname.
 *
 *  Returns ROBOTS_TXT_DISALLOW when up is NULL, when the server key does
 *  not fit its buffer, or when a Disallow pattern matches; returns
 *  ROBOTS_TXT_ALLOW in every other case.
 */
int RobotsTxtCheck(up)
     URL *up;
{
    robots_txt *R = NULL;
    record *Q = NULL;
    word_list *W = NULL;
    static char server[BUFSIZ];
    int n;

    if (up == (URL *) NULL) {
	errorlog("RobotsTxtCheck: NULL URL\n");
	return ROBOTS_TXT_DISALLOW;
    }
    Debug(48, 1, ("RobotsTxtCheck: URL %s\n", up->url));

    /* The port is left out of the key when it is the scheme's default. */
    if (up->port == url_table[up->type].port) {
	n = snprintf(server, sizeof(server), "%s://%s",
	    url_table[up->type].scheme,
	    up->host);
    } else {
	n = snprintf(server, sizeof(server), "%s://%s:%d",
	    url_table[up->type].scheme,
	    up->host,
	    up->port);
    }
    /* The host comes from the URL being enumerated; refuse rather than
     * work with a truncated server key. */
    if (n < 0 || (size_t) n >= sizeof(server)) {
	errorlog("RobotsTxtCheck: server name too long\n");
	return ROBOTS_TXT_DISALLOW;
    }
    R = RobotsTxtFindServer(server);
    if (R == NULL)
	R = RobotsTxtLoad(server, up->type);

    /* Can't find a robots.txt file for this server, assume it is okay
     * to visit */
    if (R == NULL)
	return ROBOTS_TXT_ALLOW;

    Debug(48, 5, ("RobotsTxtCheck: Found data for server %s\n", server));

    Q = RobotsTxtFindRecord(R, this_UA);
    if (Q == NULL) {
	Debug(48, 1, ("RobotsTxtCheck: No match for UA '%s'\n", this_UA));
	Debug(48, 1, ("RobotsTxtCheck: Returning ROBOTS_TXT_ALLOW\n"));
	return ROBOTS_TXT_ALLOW;
    }
    Debug(48, 5, ("RobotsTxtCheck: Found record for UA '%s'\n", this_UA));

    /* The first matching Disallow pattern settles the answer. */
    for (W = Q->disallow; W; W = W->next) {
	Debug(48, 5, ("RobotsTxtCheck: Pattern %s\n", W->word));
	if (!pattern_match(W->word, up->raw_pathname))
	    continue;
	Debug(48, 1, ("RobotsTxtCheck: Matched '%s'\n", W->word));
	Debug(48, 1, ("RobotsTxtCheck: Returning ROBOTS_TXT_DISALLOW\n"));
	return ROBOTS_TXT_DISALLOW;
    }

    Debug(48, 1, ("RobotsTxtCheck: No matches.\n"));
    Debug(48, 1, ("RobotsTxtCheck: Returning ROBOTS_TXT_ALLOW\n"));
    return ROBOTS_TXT_ALLOW;
}


/*
 *  RobotsTxtLoad() - Fetch and parse the robots.txt file of a server.
 *
 *  server is the "scheme://host[:port]" key and type the URL type of
 *  the request (URL_HTTP, URL_FTP, URL_GOPHER, ...).
 *
 *  A new entry is appended to the global list before anything is
 *  fetched, so a server whose robots.txt is missing, unreadable or of
 *  an unsupported type is still remembered (with no records) and
 *  RobotsTxtFindServer will find it on later checks.
 *
 *  Parsing rules:
 *    - text after '#' is a comment; blank lines are skipped;
 *    - consecutive User-Agent lines share one record;
 *    - a record is linked into the entry when its first Disallow line
 *      is seen;
 *    - a User-Agent or Disallow line without a value adds no list node
 *      (an empty Disallow value means nothing is disallowed);
 *    - a Disallow line that precedes every User-Agent line is ignored.
 *
 *  Returns the new entry, or NULL only when server is NULL.
 */
static robots_txt *RobotsTxtLoad(server, type)
     char *server;
     int type;
{
    robots_txt *R = NULL;
    record *Q = NULL;
    record **QT = NULL;
    word_list *W = NULL;
    word_list **UAWLT = NULL;
    word_list **DAWLT = NULL;
    URL *up = NULL;
    static char url[BUFSIZ];
    static char buf[256];
    char *t = NULL;
    int n = 0;
    enum {
	none, user_agent, disallow
    } lastline = none;

    if (server == (char *) NULL) {
	errorlog("RobotsTxtLoad: NULL server!\n");
	return NULL;
    }
    /* Initialise every field explicitly instead of relying on the
     * allocator handing back zeroed memory. */
    R = (robots_txt *) xmalloc(sizeof(robots_txt));
    R->server = xstrdup(server);
    R->record_list = NULL;
    R->next = NULL;
    *RobotsTxtTail = R;
    RobotsTxtTail = &R->next;

    switch (type) {
    case URL_HTTP:
    case URL_FTP:
	n = snprintf(url, sizeof(url), "%s/robots.txt", server);
	break;
    case URL_GOPHER:
	n = snprintf(url, sizeof(url), "%s/00/robots.txt", server);
	break;
    default:
	return R;
	/* NOTREACHED */
    }
    if (n < 0 || (size_t) n >= sizeof(url)) {
	Debug(48, 1, ("RobotsTxtLoad: %s: URL too long\n", server));
	return R;
    }

    if ((up = url_open(url)) == (URL *) NULL) {
	Debug(48, 1, ("RobotsTxtLoad: Bad URL: %s\n", url));
	return R;
    }
    if (url_retrieve(up)) {
	Debug(48, 1, ("RobotsTxtLoad: %s: cannot retrieve\n", url));
	url_close(up);
	return R;
    }
    if ((up->fp = fopen(up->filename, "r")) == NULL) {
	log_errno2(__FILE__, __LINE__, up->filename);
	url_close(up);
	return R;
    }
    QT = &R->record_list;

    Debug(48, 1, ("RobotsTxtLoad: Reading %s\n", url));
    while (fgets(buf, sizeof(buf), up->fp)) {
	if ((t = strchr(buf, '\n')))
	    *t = '\0';
	if ((t = strchr(buf, '\r')))
	    *t = '\0';
	Debug(48, 5, ("%s: %s\n", url, buf));
	if ((t = strchr(buf, '#')))
	    *t = '\0';
	if (buf[0] == '\0')
	    continue;
	if ((t = strtok(buf, ":\t ")) == NULL)
	    continue;
	if (!strcasecmp(t, "User-Agent")) {
	    /* A User-Agent line that does not directly follow another
	     * one starts a new record. */
	    if (lastline != user_agent) {
		Q = (record *) xmalloc(sizeof(record));
		Q->user_agent = NULL;
		Q->disallow = NULL;
		Q->next = NULL;
		UAWLT = &Q->user_agent;
		DAWLT = &Q->disallow;
	    }
	    lastline = user_agent;
	    /* No agent name on the line: nothing to remember. */
	    if ((t = strtok(NULL, " \t")) == NULL)
		continue;
	    W = (word_list *) xmalloc(sizeof(word_list));
	    W->word = xstrdup(t);
	    W->next = NULL;
	    *UAWLT = W;
	    UAWLT = &W->next;
	} else if (!strcasecmp(t, "Disallow")) {
	    /* Malformed file: there is no record to attach the rule to. */
	    if (Q == NULL) {
		Debug(48, 1, ("RobotsTxtLoad: %s: Disallow before User-Agent, ignored\n", url));
		continue;
	    }
	    /* The first Disallow line of a record links the record in. */
	    if (lastline != disallow) {
		*QT = Q;
		QT = &Q->next;
	    }
	    lastline = disallow;
	    /* An empty value disallows nothing, so no pattern is stored. */
	    if ((t = strtok(NULL, " \t")) == NULL)
		continue;
	    W = (word_list *) xmalloc(sizeof(word_list));
	    W->word = xstrdup(t);
	    W->next = NULL;
	    *DAWLT = W;
	    DAWLT = &W->next;
	}
    }

    url_close(up);
    return R;
}

/*
 *  RobotsTxtFindServer() - Search the global list of loaded robots.txt
 *  entries for one whose server string equals server, ignoring case.
 *
 *  Returns the matching entry, or NULL when server is NULL or no entry
 *  matches.
 */
static robots_txt *RobotsTxtFindServer(server)
     char *server;
{
    robots_txt *entry;

    if (!server) {
	errorlog("RobotsTxtFind: NULL server!\n");
	return NULL;
    }
    Debug(48, 1, ("RobotsTxtFind: %s: Finding\n", server));

    entry = RobotsTxtHead;
    while (entry != NULL) {
	Debug(48, 1, ("RobotsTxtFind: Checking %s\n", entry->server));
	if (strcasecmp(entry->server, server) == 0) {
	    return entry;
	}
	entry = entry->next;
    }

    /* Walked off the end of the list without a hit. */
    Debug(48, 1, ("RobotsTxtFind: %s: Not Found\n", server));
    return NULL;
}

/*
 *  RobotsTxtFindRecord() - Pick the record of R that applies to UA.
 *
 *  A record applies when one of its User-Agent values contains UA
 *  (case-sensitive substring) or begins with UA (ignoring case).  The
 *  first such record is returned.  When none applies, the last record
 *  carrying the User-Agent value "*" is returned, or NULL if there is
 *  no such record.  Also returns NULL when R or UA is NULL.
 */
static record *RobotsTxtFindRecord(R, UA)
     robots_txt *R;
     char *UA;
{
    record *Q = NULL;
    record *wildcard = NULL;
    word_list *W = NULL;
    size_t ua_len;

    if (R == NULL || UA == NULL)
	return NULL;
    ua_len = strlen(UA);

    for (Q = R->record_list; Q; Q = Q->next) {
	for (W = Q->user_agent; W; W = W->next) {
	    /* A User-Agent line without a value leaves no usable name. */
	    if (W->word == NULL)
		continue;
	    if (!strcmp(W->word, "*"))
		wildcard = Q;
	    if (strstr(W->word, UA))
		return Q;
	    if (!strncasecmp(W->word, UA, ua_len))
		return Q;
	}
    }
    return wildcard;
}
