static char rcsid[] = "url.c,v 1.97 1996/02/01 06:34:05 duane Exp";
/*
 *  url.c - URL processing code 
 *
 *  DEBUG: section  20, level 1         Common liburl URL processing
 *
 *  Darren Hardy, hardy@cs.colorado.edu, March 1994
 *
 *  ----------------------------------------------------------------------
 *  Copyright (c) 1994, 1995.  All rights reserved.
 *  
 *    The Harvest software was developed by the Internet Research Task
 *    Force Research Group on Resource Discovery (IRTF-RD):
 *  
 *          Mic Bowman of Transarc Corporation.
 *          Peter Danzig of the University of Southern California.
 *          Darren R. Hardy of the University of Colorado at Boulder.
 *          Udi Manber of the University of Arizona.
 *          Michael F. Schwartz of the University of Colorado at Boulder.
 *          Duane Wessels of the University of Colorado at Boulder.
 *  
 *    This copyright notice applies to software in the Harvest
 *    ``src/'' directory only.  Users should consult the individual
 *    copyright notices in the ``components/'' subdirectories for
 *    copyright information about other software bundled with the
 *    Harvest source code distribution.
 *  
 *  TERMS OF USE
 *    
 *    The Harvest software may be used and re-distributed without
 *    charge, provided that the software origin and research team are
 *    cited in any use of the system.  Most commonly this is
 *    accomplished by including a link to the Harvest Home Page
 *    (http://harvest.cs.colorado.edu/) from the query page of any
 *    Broker you deploy, as well as in the query result pages.  These
 *    links are generated automatically by the standard Broker
 *    software distribution.
 *    
 *    The Harvest software is provided ``as is'', without express or
 *    implied warranty, and with no support nor obligation to assist
 *    in its use, correction, modification or enhancement.  We assume
 *    no liability with respect to the infringement of copyrights,
 *    trade secrets, or any patents, and are not responsible for
 *    consequential damages.  Proper use of the Harvest software is
 *    entirely the responsibility of the user.
 *  
 *  DERIVATIVE WORKS
 *  
 *    Users may make derivative works from the Harvest software, subject 
 *    to the following constraints:
 *  
 *      - You must include the above copyright notice and these 
 *        accompanying paragraphs in all forms of derivative works, 
 *        and any documentation and other materials related to such 
 *        distribution and use acknowledge that the software was 
 *        developed at the above institutions.
 *  
 *      - You must notify IRTF-RD regarding your distribution of 
 *        the derivative work.
 *  
 *      - You must clearly notify users that your are distributing 
 *        a modified version and not the original Harvest software.
 *  
 *      - Any derivative product is also subject to these copyright 
 *        and use restrictions.
 *  
 *    Note that the Harvest software is NOT in the public domain.  We
 *    retain copyright, as specified above.
 *  
 *  HISTORY OF FREE SOFTWARE STATUS
 *  
 *    Originally we required sites to license the software in cases
 *    where they were going to build commercial products/services
 *    around Harvest.  In June 1995 we changed this policy.  We now
 *    allow people to use the core Harvest software (the code found in
 *    the Harvest ``src/'' directory) for free.  We made this change
 *    in the interest of encouraging the widest possible deployment of
 *    the technology.  The Harvest software is really a reference
 *    implementation of a set of protocols and formats, some of which
 *    we intend to standardize.  We encourage commercial
 *    re-implementations of code complying to this set of standards.  
 *  
 */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <ctype.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <netdb.h>
#include <errno.h>
#include "util.h"
#include "url.h"
#ifdef USE_CCACHE
#include "ccache.h"
#endif

/* Eight times the stdio BUFSIZ; used for large scratch buffers. */
#define BIG_BUFSIZ (BUFSIZ<<3)

/* Global variables */
/* When non-zero, url_parse() re-escapes the raw pathname via rfc1738_escape(). */
int liburl_conform_rfc1738 = 0;
/* Seconds url_retrieve() sleeps before a retrieval; init_url() overrides it
 * from the HARVEST_URL_DELAY environment variable. */
int liburl_sleep_time = 1;
#ifdef USE_LOCAL_CACHE
/* When non-zero, the local cache routines are called from this file. */
int use_local_cache = 1;
#endif

/* Local Functions */
/* K&R-style forward declarations for the static helpers defined below. */
static void Tolower();
static void remove_dot();
static void remove_dotdot();
static URL *url_parse();
static char *shsafe_path();
static void get_lmt();

#ifdef OLD_CODE
static int compare_fullhost();
#endif

/*
 *  url_table - per-scheme name, default port and retrieval function.
 *
 *  NOTE: entries are indexed by 'enum url_types' (see ../include/url.h),
 *  so the order of the rows must match the order of that enum.
 *  A port of 0 means "no default port"; a get function of 0 means that
 *  url_retrieve() cannot fetch the scheme over the network.
 */
struct _url_table url_table[] = {
	{
		"unknown", 0, 0
	},			/* URL_UNKNOWN, */
	{
		"file", 0, 0
	},			/* URL_FILE,    */
	{
		"ftp", 21, ftp_get
	},			/* URL_FTP,     */
	{
		"gopher", 70, gopher_get
	},			/* URL_GOPHER,  */
	{
		"http", 80, http_get
	},			/* URL_HTTP,    */
	{
		"news", 119, news_get
	},			/* URL_NEWS,    */
	{
		"nop", 0, 0
	},			/* URL_NOP,     */
	{
		"telnet", 23, 0	/* 23 is the telnet port (25 is SMTP) */
	},			/* URL_TELNET,  */
	{
		"wais", 0, 0
	},			/* URL_WAIS,    */
	{
		"x-", 0, 0
	},			/* URL_X,       */
	{
		"mailto", 0, 0
	},			/* URL_MAILTO,  */
};


/* Set to 1 by init_url() so that initialization runs only once. */
static int init_called = 0;

/*
 *  One local mapping: URLs that begin with 'from' are served from the
 *  local file name formed by replacing that prefix with 'to'
 *  (see url_open()).  Entries form a singly linked list.
 */
struct local_trans_table {
	char *from;		/* URL prefix to match */
	char *to;		/* local path prefix substituted for 'from' */
	struct local_trans_table *next;
};
/* Head of the mapping list; filled in by url_initLocalServers(). */
static struct local_trans_table *LocalTransTable = NULL;

/*
 *  url_initLocalServers() - Loads the local URL mapping table.
 *
 *  The file named by the HARVEST_URL_LOCAL_MAPPINGS environment variable
 *  is read line by line; every line holding two whitespace-separated
 *  words ("from to") becomes one entry.  Entries are pushed onto the
 *  front of LocalTransTable, so later lines are searched first.
 *  Lines that do not contain two words are ignored.  If the variable is
 *  unset or the file cannot be opened the table is left empty.
 */
void url_initLocalServers()
{
	FILE *table_fp;
	char *table_path;
	char *prefix, *target, *line, *newline;
	struct local_trans_table *entry;

	LocalTransTable = NULL;

	table_path = getenv("HARVEST_URL_LOCAL_MAPPINGS");
	if (table_path == NULL)
		return;
	Debug(20, 1, ("url_initLocalServers: OPEN URLTABLE: %s\n", table_path));

	table_fp = fopen(table_path, "r");
	if (table_fp == NULL)
		return;

	prefix = xmalloc(BUFSIZ);
	target = xmalloc(BUFSIZ);
	line = xmalloc(BUFSIZ);

	while (fgets(line, BUFSIZ, table_fp) != NULL) {
		/* chop the trailing newline, if any */
		newline = strchr(line, '\n');
		if (newline != NULL)
			*newline = '\0';

		if (sscanf(line, "%s %s", prefix, target) == 2) {
			Debug(20, 1, ("url_initLocalServers: READ URLTABLE: %s --> %s\n",
				prefix, target));
			entry = (struct local_trans_table *)
			    xmalloc(sizeof(struct local_trans_table));
			entry->from = xstrdup(prefix);
			entry->to = xstrdup(target);
			/* push on the front of the list */
			entry->next = LocalTransTable;
			LocalTransTable = entry;
		}
	}

	fclose(table_fp);
	xfree(prefix);
	xfree(target);
	xfree(line);
}


/*
 *  init_url() - One-time initialization of the URL library.
 *
 *  Sets the retrieval delay (default 1 second, or the value of the
 *  HARVEST_URL_DELAY environment variable), starts whichever caches were
 *  compiled in, and loads the local mapping table.  Calls after the
 *  first one return immediately.
 */
void init_url()
{
	char *delay;

	if (init_called)
		return;
	init_called = 1;

	/* hard-coded default of 1, unless the environment overrides it */
	delay = getenv("HARVEST_URL_DELAY");
	liburl_sleep_time = (delay != NULL) ? atoi(delay) : 1;

#ifdef USE_LOCAL_CACHE
	if (use_local_cache) {
		init_cache();
	}
#endif
#ifdef USE_CCACHE
	url_initCache(10, 600);
#endif
	url_initLocalServers();
}

/*
 *  url_purge() - Expires the local cache (when it is compiled in and
 *  enabled), initializing the library first if that has not happened yet.
 */
void url_purge()
{
	/* init_url() returns at once when initialization was already done */
	init_url();
#ifdef USE_LOCAL_CACHE
	if (use_local_cache) {
		expire_cache();
	}
#endif
}

/*
 *  finish_url() - Shuts down whichever caches were compiled in.
 */
void finish_url()
{
#ifdef USE_LOCAL_CACHE
	if (use_local_cache) {
		finish_cache();
	}
#endif
#ifdef USE_CCACHE
	url_shutdowncache();
#endif
}

/*
 *  url_open() - Parses and initializes the given url into a URL structure.
 *  Returns a pointer to the structure on success; or returns NULL if the
 *  URL is not parseable, or *  if the URL's host is not valid.
 */
URL *url_open(url)
     char *url;
{
	static URL *up = NULL;
	static char buf[BUFSIZ];
	struct local_trans_table *l;
	char *s, *local_filename = NULL;
	struct stat sb;

	Debug(20, 1, ("url_open: %s\n", url));

	if (!init_called) {
		init_url();
	}
	if ((up = url_parse(url)) == NULL) {
		url_close(up);
		return (NULL);
	}
	for (l = LocalTransTable; !local_filename && l; l = l->next) {
		if (!strncasecmp(up->url, l->from, strlen(l->from))) {
			int fd;

			Debug(20, 1, ("Local Mapping: '%s' matched '%s'\n",
				up->url, l->from));
			s = up->url + strlen(l->from);
			local_filename = (char *) xmalloc(strlen(l->to) +
			    strlen(s) + 1);
			sprintf(local_filename, "%s%s", l->to, s);

			/* no HTTP involved, so unescape URI */
			rfc1738_unescape(local_filename);
			/* 
			 *  Don't use the mapping if the file is unreadable,
			 *  if fstat() fails, if it's a special file, or if
			 *  it's executable.
			 */
			if ((fd = open(local_filename, O_RDONLY, 0)) < 0 ||
			    fstat(fd, &sb) < 0 || !S_ISREG(sb.st_mode) ||
			    (sb.st_mode & S_IXUSR)) {
				xfree(local_filename);
				local_filename = NULL;
			}
			if (fd >= 0) {
				(void) close(fd);
			}
		}
		/* Special hacks for news: URLs.  We want to change     */
		/* news:comp.sex.html into                              */
		/* /var/spool/nov/comp/sex/html/.overview               */
		/* The local mapping should be:                         */
		/*    news:overview     /var/spool/nov/                 */
		if (!strncasecmp("news:overview", l->from, 13))
			if (up->type == URL_NEWS && (strchr(up->url, '@') == 0)) {
				int fd;
				char *group_path;
				local_filename = xmalloc(strlen(l->to) +
				    strlen(up->pathname) + 20);
				group_path = xstrdup(up->pathname);
				for (s = group_path; *s; s++)	/* dots to slashes */
					if (*s == '.')
						*s = '/';
				sprintf(local_filename, "%s%s/.overview",
				    l->to, group_path);
				xfree(group_path);
				if ((fd = open(local_filename, O_RDONLY, 0)) < 0 ||
				    fstat(fd, &sb) < 0 || !S_ISREG(sb.st_mode) ||
				    (sb.st_mode & S_IXUSR)) {
					xfree(local_filename);
					local_filename = NULL;
				}
				if (fd >= 0) {
					(void) close(fd);
				}
			}
	}

	if (local_filename != (char *) NULL)

	if (local_filename != NULL &&
	    !symlink(local_filename, s = tempnam(NULL, "local"))) {
		Debug(20, 1, ("url_open: Local Mapping succeeded: %s -> %s\n",
			up->url, local_filename));
		up->filename = s;	/* point to the symlink */
		up->shsafe_filename = shsafe_path(up->filename);
		up->flags |= URL_FLAG_LOCAL_MAPPED;
		xfree(local_filename);
		local_filename = NULL;
		return (up);
	}
	/* Type-specific additions to the URL */
	Debug(20, 5, ("url_open: type=%d\n", up->type));
	switch (up->type) {
	case URL_FILE:
		up->filename = xstrdup(up->pathname);
		up->shsafe_filename = shsafe_path(up->filename);

		break;
	case URL_FTP:

		/*      If no userinfo yet, see if we can get it from the
		 * **   FTPAuth.cf file (which came from FTP-Auth: in
		 * **   gatherer.cf.                                    */

		if (up->user == (char *) NULL && up->password == (char *) NULL)
			ftp_get_auth(up);

		/*      If still no userinfo, set to defaults           */

		if (up->user == (char *) NULL)
			up->user = xstrdup("anonymous");
		if (up->password == (char *) NULL) {
			sprintf(buf, "%s@", getmylogin());
			up->password = xstrdup(buf);
		}
		break;

	case URL_HTTP:
		break;

	case URL_GOPHER:
		if (strlen(up->pathname) == 1)
			up->gophertype = 1;
		else
			up->gophertype = *(up->pathname + 1) - 0x30;
		up->filename = NULL;
		break;

	case URL_NEWS:
	case URL_NOP:
	case URL_X:
		break;

	default:
		Debug(20, 1, ("url_open: WARNING: Unsupported URL: %s\n",
			up->url));
		url_close(up);
		return (NULL);
		break;
	}
	Debug(20, 9, ("url_open: returning %#08x\n", up));
	return (up);
}


/*
 *  url_read() - Reads up to n bytes of the URL's data, starting at byte
 *  offset off, into buf.  The data is retrieved first if the URL has no
 *  local file yet.  Returns the number of bytes read, or -1 on error
 *  (NULL arguments, negative n, retrieval failure, or open/seek failure).
 *
 *  The file is opened and closed on every call; up->fp is always NULL
 *  again when the function returns.
 */
int url_read(buf, n, off, up)
     char *buf;			/* buffer to place the data */
     int n;			/* read at most n bytes */
     int off;			/* offset into URL data */
     URL *up;			/* URL */
{
	int x;

	/* a negative n would turn into a huge size_t in fread() */
	if (!up || buf == NULL || n < 0)
		return (-1);

	if (up->filename == NULL)
		if (url_retrieve(up))
			return (-1);
	if (up->filename == NULL)	/* nothing to read from */
		return (-1);

	if ((up->fp = fopen(up->filename, "r")) == NULL) {
		log_errno(up->filename);
		return (-1);
	}
	if (off > 0 && fseek(up->fp, off, SEEK_SET)) {
		log_errno(up->filename);
		fclose(up->fp);
		up->fp = NULL;	/* don't leave a closed stream behind */
		return (-1);
	}
	x = fread(buf, 1, n, up->fp);
	fclose(up->fp);
	up->fp = NULL;

	return (x);
}

/*
 *  url_retrieve() - Retrieves the URL's data and places it into a
 *  temporary file.  Returns non-zero if it could not retrieve the data;
 *  otherwise returns 0;
 *
 * Return code indicates severity of error (DW 6/9/95):
 *
 *      1-9     'soft', maybe temporary errors.  Doesn't necessarily
 *              mean the object doesn't exist.
 *      10+     'hard' errors from remote HTTPD.  The URL is invalid
 *              or no longer exists
 *
 * Return codes:
 *      0       Success
 *      1       DNS errors (from get_sockaddr()); also returned for
 *              nop: and x-: URLs, which are never retrieved
 *      2       socket()/bind() errors
 *      3       connect() errors
 *      4       network write/read errors, or no temporary file name
 *              could be generated
 *      10      HTTP/FTP/Gopher/etc protocol errors, unsupported types,
 *              and a file: URL that has no filename (internal error)
 *
 *  On success up->filename names the local copy, up->lmt is set, and,
 *  depending on the compile-time options, the file is added to the
 *  local cache and its MD5 checksum is computed.
 */
int url_retrieve(up)
     URL *up;
{
	int get_code;
#ifdef USE_MD5
	extern char *get_md5();
#endif
#ifdef USE_CCACHE
	DataReturn *dataRec;
#endif
#ifdef USE_LOCAL_CACHE
	int cache_hit = 0;

	/* See if we have the file in the cache already */
	if (use_local_cache && up->type != URL_FILE && up->type != URL_NOP &&
	    up->type != URL_X && up->filename == NULL) {
		char *s = lookup_cache(up->url);
		Debug(20, 1, ("url_retrieve: lookup_cache: returned: %s\n", s ? s : "Null"));

		if (s != NULL) {
			up->filename = s;
			up->shsafe_filename = shsafe_path(up->filename);
			up->lmt = lmt_cache(up->url);
			cache_hit = 1;
		}
	}
#endif
	if (up->type == URL_NOP || up->type == URL_X)
		return (1);


	/* 
	 *  If we don't have the file, then grab it via the access protocol 
	 */
	if (up->filename == NULL) {
		if (liburl_sleep_time > 0)
			sleep(liburl_sleep_time);
		switch (up->type) {
		case URL_FTP:
#ifdef USE_CCACHE
			/*
			 *  Use the FTP connection cache, rather than ftp_get().
			 */
			if ((dataRec = SockGetData(up, TEMP, NULL)) == NULL)
				return (1);
			up->filename = xstrdup(dataRec->fileName);
			up->shsafe_filename = shsafe_path(up->filename);
			free(dataRec);
			break;
#endif
			/* FALLTHROUGH: without the connection cache, FTP is
			 * fetched through its get function like the rest */
		case URL_GOPHER:
		case URL_NEWS:
		case URL_HTTP:
			up->filename = tempnam(NULL, url_table[up->type].scheme);
			if (up->filename == NULL) {
				/* tempnam() could not generate a name */
				errorlog("url_retrieve: Cannot create temporary file name for %s\n",
				    up->url);
				return (4);
			}
			up->shsafe_filename = shsafe_path(up->filename);
			get_code = (*url_table[up->type].get_func) (up);
			if (get_code > 0) {
				Debug(20, 1, ("WARNING: url_retrieve: Cannot access %s\n",
					up->url));
				(void) unlink(up->filename);
				xfree(up->filename);
				xfree(up->shsafe_filename);
				up->filename = NULL;
				up->shsafe_filename = NULL;
				return get_code;
			}
			break;
		case URL_FILE:
			/*
			 *  url_open() always sets the filename of a file:
			 *  URL, so there is nothing to stat or checksum here;
			 *  report failure rather than continue without a file.
			 */
			errorlog("Internal Error: url_retrieve: %s\n", up->url);
			return 10;
		default:
			Debug(20, 1, ("WARNING: url_retrieve: Unsupported type: %s\n", up->url));
			return 10;
		}
	}
	if (up->lmt == 0)
		get_lmt(up);

#ifdef USE_LOCAL_CACHE
	/* Now that we have the file, add it to the cache */
	if (use_local_cache && up->type != URL_FILE && !cache_hit) {
		add_cache(up->url, up->filename, up->lmt);
		if (up->redir_from_url != (char *) NULL)
			add_cache(up->redir_from_url, up->filename, up->lmt);
	}
#endif /* USE_LOCAL_CACHE */

#ifdef USE_MD5
	/* Compute an MD5 checksum, if not done already */
	if (up->md5 == NULL)
		up->md5 = get_md5(up->filename);
#endif
	return 0;
}

/*
 *  url_close() - Releases a URL structure obtained from this library.
 *
 *  The local file named by up->filename is unlinked unless the URL is a
 *  file: URL; then every string member that is set and the structure
 *  itself are freed.  A NULL argument is ignored.
 */
void url_close(up)
     URL *up;
{
/* free one string member of *up, but only when it is set */
#define URL_RELEASE(member) \
	do { if (up->member) xfree(up->member); } while (0)

	if (up == NULL)
		return;

	if (up->filename != NULL && up->type != URL_FILE) {
		(void) unlink(up->filename);
	}

	URL_RELEASE(shsafe_filename);

	URL_RELEASE(url);
	URL_RELEASE(raw_pathname);
	URL_RELEASE(pathname);
	URL_RELEASE(host);

	URL_RELEASE(user);
	URL_RELEASE(password);

	URL_RELEASE(filename);
#ifdef USE_MD5
	URL_RELEASE(md5);
#endif
	URL_RELEASE(http_version);
	URL_RELEASE(http_mime_hdr);
	URL_RELEASE(http_reason_line);

	xfree(up);
#undef URL_RELEASE
}


/*
 *  Tolower() - converts an entire string to lowercase, in place.
 *  A NULL pointer is ignored.
 *
 *  The characters are handled as unsigned char: passing a negative
 *  char value (other than EOF) to tolower() is undefined behavior.
 */
static void Tolower(q)
     char *q;
{
	unsigned char *s;

	if (q == NULL)
		return;
	for (s = (unsigned char *) q; *s != '\0'; s++)
		*s = (unsigned char) tolower(*s);
}


/*
 *  url_parse() - Parses the URL string 'url' into a newly allocated
 *  URL structure.  Returns the structure on success; returns NULL if
 *  the string is NULL, has no scheme, names an unknown or unsupported
 *  scheme, or lacks the "//host" part its scheme requires.
 *
 *  On success the type, port, url (canonical form), pathname and
 *  raw_pathname members are set; host, user and password are set when
 *  present in the URL.  The caller owns the result (see url_close()).
 *
 *  This function does not rely on xmalloc() returning zero-filled
 *  memory: everything it reads is initialized explicitly.
 */
static URL *url_parse(url)
     char *url;
{
	URL *up = NULL;
	char *urlbuf = NULL;
	char *scheme = NULL;
	char *scheme_specific = NULL;
	char *host_part = NULL;
	char *url_path = NULL;
	char *raw_url_path = NULL;
	char *userinfo = NULL;
	char *username = NULL;
	char *password = NULL;
	char *hostinfo = NULL;
	char *hostname = NULL;
	char *t = NULL;
	char portbuf[32];
	size_t urlsz;
	int port = 0;
	int n;

	if (url == (char *) NULL) {
		Debug(20, 1, ("url_parse: Somebody gave me a NULL URL!\n"));
		return (NULL);
	}
	urlbuf = xstrdup(url);	/* working copy */

	if ((t = strrchr(urlbuf, '\n')) != (char *) NULL)
		*t = '\0';
	if ((t = strrchr(urlbuf, '\r')) != (char *) NULL)
		*t = '\0';

	Debug(20, 9, ("url_parse: parsing '%s'\n", url));

	if ((t = strchr(urlbuf, ':')) == (char *) NULL) {
		Log("url_parse: Invalid URL: %s\n", urlbuf);
		xfree(urlbuf);
		return NULL;
	}
	/* split "scheme:rest" at the first colon, then restore it */
	*t = '\0';
	scheme = xstrdup(urlbuf);
	scheme_specific = xstrdup(t + 1);
	*t = ':';

	Debug(20, 9, ("url_parse:          scheme = %s\n", scheme));
	Debug(20, 9, ("url_parse: scheme_specific = %s\n", scheme_specific));

	up = xmalloc(sizeof(URL));	/* Basic initialization */
	memset(up, 0, sizeof(URL));

	Tolower(scheme);
	if (!strncmp(scheme, "x-", 2))	/* any x- type */
		up->type = URL_X;
	else if (!strcmp(scheme, "file"))
		up->type = URL_FILE;
	else if (!strcmp(scheme, "ftp"))
		up->type = URL_FTP;
	else if (!strcmp(scheme, "http"))
		up->type = URL_HTTP;
	else if (!strcmp(scheme, "gopher"))
		up->type = URL_GOPHER;
	else if (!strcmp(scheme, "news"))
		up->type = URL_NEWS;
	else if (!strcmp(scheme, "nop"))
		up->type = URL_NOP;
	else if (!strcmp(scheme, "telnet"))
		up->type = URL_TELNET;
	else if (!strcmp(scheme, "wais"))
		up->type = URL_WAIS;
	else if (!strcmp(scheme, "mailto"))
		up->type = URL_MAILTO;
	else {
		Log("url_parse: Unknown URL scheme: %s\n", scheme);
		xfree(urlbuf);
		xfree(scheme);
		xfree(scheme_specific);
		xfree(up);
		return NULL;
	}

	/*      Do scheme-specific parsing              */

	switch (up->type) {
	case URL_FILE:
	case URL_HTTP:
	case URL_GOPHER:
	case URL_FTP:
		/* neither piece can be longer than scheme_specific itself;
		 * the slack leaves room for the default "/" path */
		host_part = xmalloc(strlen(scheme_specific) + 2);
		url_path = xmalloc(strlen(scheme_specific) + 2);
		host_part[0] = '\0';
		url_path[0] = '\0';	/* sscanf leaves it alone if no path */
		n = sscanf(scheme_specific, "//%[^/]%s", host_part, url_path);
		if (n < 1 || n > 2) {
			Log("url_parse: Invalid URL: %s\n", urlbuf);
			xfree(urlbuf);
			xfree(scheme);
			xfree(scheme_specific);
			xfree(host_part);
			xfree(url_path);
			xfree(up);
			return NULL;
		}
		if (*url_path == '\0')
			strcpy(url_path, "/");
		break;
	case URL_NEWS:
	case URL_X:
	case URL_NOP:
		/* no host part: everything after the colon is the path */
		url_path = scheme_specific;
		scheme_specific = (char *) NULL;
		break;
	default:
		Debug(20, 1, ("Harvest does not support %s URLs\n", scheme));
		xfree(urlbuf);
		xfree(scheme);
		xfree(scheme_specific);
		xfree(host_part);
		xfree(url_path);
		xfree(up);
		return NULL;
	}
	xfree(urlbuf);
	urlbuf = NULL;

	Debug(20, 9, ("url_parse:       host_part = %s\n", host_part ? host_part : "(none)"));
	Debug(20, 9, ("url_parse:        url_path = %s\n", url_path));

	/* start from the scheme's default port, so that URLs without a
	 * host part (news:, nop:, x-:) get a defined value as well */
	port = url_table[up->type].port;

	if (host_part == (char *) NULL)
		goto finish_host_part;

	/* ---  HOST PART PARSING  ---
	 * **
	 * ** In general, the host part may look like:
	 * **
	 * **   [username[:password]@]hostname[:port]
	 */

	/* --- First, separate host_part into userinfo and hostinfo --- */

	if ((t = strchr(host_part, '@')) != (char *) NULL) {
		*t = '\0';
		userinfo = xstrdup(host_part);
		hostinfo = xstrdup(t + 1);
		*t = '@';
	} else {
		/* hand the buffer over; host_part no longer owns it */
		hostinfo = host_part;
		host_part = (char *) NULL;
	}

	/* --- separate userinfo into username and password --- */

	if (userinfo) {
		if ((t = strchr(userinfo, ':')) != (char *) NULL) {
			*t = '\0';
			username = xstrdup(userinfo);
			password = xstrdup(t + 1);
			*t = ':';
		} else {
			username = userinfo;
			userinfo = (char *) NULL;
			password = (char *) NULL;
		}
		if (username)
			rfc1738_unescape(username);
		if (password)
			rfc1738_unescape(password);
	}
	/* --- separate hostinfo into hostname and port --- */

	if ((t = strchr(hostinfo, ':')) != (char *) NULL) {
		*t = '\0';
		hostname = xstrdup(hostinfo);
		port = atoi(t + 1);
		*t = ':';
	} else {
		hostname = hostinfo;
		hostinfo = (char *) NULL;
	}
	Tolower(hostname);


      finish_host_part:

	Debug(20, 9, ("url_parse:        username = %s\n", username ? username : "(none)"));
	Debug(20, 9, ("url_parse:        password = %s\n", password ? password : "(none)"));
	Debug(20, 9, ("url_parse:        hostname = %s\n", hostname ? hostname : "(none)"));
	Debug(20, 9, ("url_parse:            port = %d\n", port));

	/* ---  URL-PATH PART PARSING  --- */

	/* Remove HTML Bookmarks */
	if (up->type == URL_HTTP) {
		if ((t = strchr(url_path, '#')) != (char *) NULL)
			*t = '\0';
	}
	switch (up->type) {
	case URL_HTTP:
	case URL_FTP:
	case URL_GOPHER:
	case URL_FILE:
		remove_dot(url_path);
		remove_dotdot(url_path);
		break;
	default:
		break;
	}

	/* keep the path as written (raw) and an unescaped copy */
	raw_url_path = xstrdup(url_path);
	rfc1738_unescape(url_path);

	/* Conform to RFC 1738 if needed */
	if (liburl_conform_rfc1738) {
		char *x = raw_url_path;

		/* use unescaped pathname for the escape */
		rfc1738_unescape(x);
		raw_url_path = xstrdup(rfc1738_escape(x));
		xfree(x);
	}
	Debug(20, 9, ("url_parse:        url_path = %s\n", url_path));
	Debug(20, 9, ("url_parse:    raw_url_path = %s\n", raw_url_path));


	/* Write the URL */

	/*      Note: Here we write the username and password into
	 * **   the URL string.  So, if the user specifies user:pw
	 * **   in a rootnode URL, it gets passed all throughout
	 * **   the plumbing and will be visible in the query results,
	 * **   etc.  To specify ``hidden'' password info, use the
	 * **   HTTP-Basic-Auth and FTP-Auth lines in the Gatherer
	 * **   config file.    -DW                                      */

	/* Size the buffer from its parts, so that a long URL cannot
	 * overrun it:  scheme ":" ["//" [user] [":" pw] ["@"] host
	 * [":" port]] path NUL */
	urlsz = strlen(scheme) + 1 + strlen(raw_url_path) + 1;
	if (hostname) {
		urlsz += 2 + strlen(hostname) + 1 + sizeof(portbuf);
		if (username)
			urlsz += strlen(username);
		if (password)
			urlsz += 1 + strlen(password);
	}
	urlbuf = xmalloc(urlsz);

	strcpy(urlbuf, scheme);
	strcat(urlbuf, ":");
	if (hostname) {
		strcat(urlbuf, "//");
		if (username)
			strcat(urlbuf, username);
		if (password) {
			strcat(urlbuf, ":");
			strcat(urlbuf, password);
		}
		if (username || password)
			strcat(urlbuf, "@");
		strcat(urlbuf, hostname);
		/* the port is written only when it is not the default */
		if (port != url_table[up->type].port) {
			sprintf(portbuf, ":%d", port);
			strcat(urlbuf, portbuf);
		}
	}
	strcat(urlbuf, raw_url_path);

	up->url = xstrdup(urlbuf);
	Debug(20, 9, ("url_parse:             url = %s\n", up->url));

	up->port = port;
	if (hostname)
		up->host = xstrdup(hostname);
	if (username)
		up->user = xstrdup(username);
	if (password)
		up->password = xstrdup(password);
	if (url_path)
		up->pathname = xstrdup(url_path);
	if (raw_url_path)
		up->raw_pathname = xstrdup(raw_url_path);
	if (username || password)
		URL_FLAG_SET(up->flags, URL_FLAG_PASS_USERINFO);

	/* several of these are NULL by now; as in the error paths above,
	 * xfree() is handed NULL pointers here */
	xfree(urlbuf);
	xfree(scheme);
	xfree(scheme_specific);
	xfree(host_part);
	xfree(url_path);
	xfree(raw_url_path);
	xfree(userinfo);
	xfree(username);
	xfree(password);
	xfree(hostinfo);
	xfree(hostname);

	return (up);
}

/*
 *  remove_dot - Collapses every "/./" in the string to "/", in place.
 */
static void remove_dot(pathname)
     char *pathname;
{
	char *hit;		/* start of the "/./" just found */
	char *rest;		/* text that follows it */

	for (hit = strstr(pathname, "/./"); hit != NULL;
	    hit = strstr(pathname, "/./")) {
		rest = hit + 3;
		/* slide the tail down over "./", terminating null included;
		 * the regions overlap */
#ifdef HAVE_MEMMOVE
		memmove(hit + 1, rest, strlen(rest) + 1);
#else
		bcopy(rest, hit + 1, strlen(rest) + 1);
#endif
	}
}

/*
 *  remove_dotdot - Normalizes a pathname in place by dropping each
 *  "/../" together with the path component in front of it.  Processing
 *  stops at the first "/../" that has no component before it that
 *  starts with a '/'.
 */
static void remove_dotdot(pathname)
     char *pathname;
{
	char *dots;		/* start of the "/../" just found */
	char *prev;		/* '/' opening the component to drop */

	for (;;) {
		dots = strstr(pathname, "/../");
		if (dots == NULL)
			return;
		if (dots == pathname)	/* nothing to strip, bail */
			return;

		/* walk back to the '/' that starts the previous component */
		prev = dots - 1;
		while (prev != pathname && *prev != '/')
			prev--;
		if (*prev != '/')	/* no previous /, bail */
			return;

		/* overlapping copy of the tail, terminating null included */
#ifdef HAVE_MEMMOVE
		memmove(prev + 1, dots + 4, strlen(dots + 4) + 1);
#else
		bcopy(dots + 4, prev + 1, strlen(dots + 4) + 1);
#endif
	}
}

#ifdef USE_CCACHE
/*
 * url_initCache() - inits ftp connection cache to desired parameters
 */
void url_initCache(maxConnect, timeout)
     int maxConnect;
     long timeout;
{
	InitConfigRec *initParam;

	initParam = (InitConfigRec *) xmalloc(sizeof(InitConfigRec));
	if (!initParam)
		return;

	initParam->maxConnections = maxConnect;
	initParam->timeOut = timeout;
	SockInit(initParam);
}

/*
 * url_shutdowncache() - shuts down the ftp connection cache by calling
 * ShutDownCache().  Only compiled when USE_CCACHE is defined; finish_url()
 * is the caller in this file.
 */
void url_shutdowncache()
{
	ShutDownCache();
}

#endif /* USE_CCACHE */


#define safe_strdup(s)	(s) == NULL ? NULL : xstrdup(s)
URL *dup_url(up)
     URL *up;
{
	static URL *newup;

	newup = xmalloc(sizeof(URL));

	newup->url = safe_strdup(up->url);
	newup->raw_pathname = safe_strdup(up->raw_pathname);
	newup->pathname = safe_strdup(up->pathname);
	newup->host = safe_strdup(up->host);
	newup->user = safe_strdup(up->user);
	newup->password = safe_strdup(up->password);
	newup->filename = safe_strdup(up->filename);
#ifdef USE_MD5
	newup->md5 = safe_strdup(up->md5);
#endif
	newup->fp = NULL;	/* can't copy */
	newup->port = up->port;
	newup->type = up->type;
	newup->gophertype = up->gophertype;
	newup->lmt = up->lmt;

	newup->http_status_code = up->http_status_code;
	newup->http_version = safe_strdup(up->http_version);
	newup->http_reason_line = safe_strdup(up->http_reason_line);
	newup->http_mime_hdr = safe_strdup(up->http_mime_hdr);
	return (newup);
}
#undef safe_strdup


#ifdef OLD_CODE
/*
 *  compare_fullhost() - compares two hosts by the names getrealhost()
 *  returns for them.  Returns 0 when those names are equal; non-zero
 *  when they differ or when either host cannot be resolved.
 */
static int compare_fullhost(host1, host2)
     char *host1;
     char *host2;
{
	char *resolved;
	char *first;
	char *second;
	int diff;

	resolved = getrealhost(host1);
	if (resolved == NULL)
		return (1);
	first = xstrdup(resolved);
	xfree(resolved);

	resolved = getrealhost(host2);
	if (resolved == NULL) {
		xfree(first);
		return (1);
	}
	second = xstrdup(resolved);
	xfree(resolved);

	diff = strcmp(first, second);
	xfree(first);
	xfree(second);
	return (diff);
}
#endif



/*
 *  get_lmt() - Sets up->lmt, the last-modification time of the URL.
 *
 *  The time starts as the st_ctime of the local file.  If HTTP headers
 *  were saved in up->http_mime_hdr, every "Last-Modified:" line that has
 *  a value overrides it with the result of parse_rfc850(); the last
 *  such line wins.  up->lmt is left untouched when up or its filename
 *  is NULL or the file cannot be stat()ed.
 *
 *  NOTE(review): st_ctime is the inode change time, not the data
 *  modification time (st_mtime) -- confirm that this is intended.
 */
static void get_lmt(up)
     URL *up;
{
	static char lm_hdr[] = "Last-Modified:";
	struct stat sb;
	size_t hdrlen = sizeof(lm_hdr) - 1;

	if (up == NULL || up->filename == NULL)
		return;

	if (stat(up->filename, &sb) < 0) {
		log_errno(up->filename);
		return;
	}
	up->lmt = sb.st_ctime;

	if (up->http_mime_hdr != NULL) {
		char *tbuf, *line, *next, *val;

		/*
		 *  Walk the header line by line on a private copy.  This
		 *  is done by hand rather than with strtok(), which would
		 *  clobber the tokenizer state of any caller using it.
		 */
		tbuf = xstrdup(up->http_mime_hdr);
		for (line = tbuf; line != NULL; line = next) {
			if ((next = strchr(line, '\n')) != NULL)
				*next++ = '\0';
			if (strncasecmp(line, lm_hdr, hdrlen) != 0)
				continue;
			/* skip the blanks after the colon, never stepping
			 * beyond the end of the line */
			val = line + hdrlen;
			while (*val == ' ' || *val == '\t')
				val++;
			if (*val != '\0' && *val != '\r')
				up->lmt = parse_rfc850(val);
		}
		xfree(tbuf);
	}
	Debug(20, 1, ("get_lmt: %ld %s\n", (long) up->lmt, up->url));
}

/* printable form of a string member that may be NULL; handing a NULL
 * pointer to a %s conversion is undefined behavior */
#define url_str(s)	((s) != NULL ? (s) : "(null)")
/*
 *  print_url() - Writes every member of the URL *up to the log, one
 *  per line.  String members that are not set are shown as "(null)".
 *  A NULL up is ignored.
 */
void print_url(up)
     URL *up;
{
	if (up == NULL)
		return;

	Log("\n--------------\n");
	Log("URL url      : %s\n", url_str(up->url));
	Log("URL Type     : %d\n", (int) up->type);
	Log("URL RPathname: %s\n", url_str(up->raw_pathname));
	Log("URL Pathname : %s\n", url_str(up->pathname));
	Log("URL Host     : %s\n", url_str(up->host));
	Log("URL Port     : %d\n", up->port);
	Log("URL User     : %s\n", url_str(up->user));
	Log("URL Password : %s\n", url_str(up->password));
	Log("URL G Type   : %d\n", up->gophertype);
	Log("URL Filename : %s\n", url_str(up->filename));
	/* lmt is a time value; print it as a long, as get_lmt() does */
	Log("URL LUpdate  : %ld\n", (long) up->lmt);
#ifdef USE_MD5
	Log("URL MD5      : %s\n", url_str(up->md5));
#endif
	Log("URL HTTP Ver : %s\n", url_str(up->http_version));
	Log("URL HTTP Code: %d\n", up->http_status_code);
	Log("URL HTTP RLin: %s\n", url_str(up->http_reason_line));
	Log("URL HTTP MIME: %s\n", url_str(up->http_mime_hdr));
	Log("--------------\n");
}
#undef url_str

/*
 *  url_confirm() - quickly checks to see if the URL is ok.
 *  For http, gopher and ftp URLs the host must be known to
 *  getrealhost(); every other type is accepted as is.
 *  Returns 0 if it is ok; otherwise, returns non-zero.
 */
int url_confirm(up)
     URL *up;
{
	char *realhost;

	/* only these schemes carry a host that needs to be looked up */
	if (up->type != URL_HTTP && up->type != URL_GOPHER &&
	    up->type != URL_FTP)
		return (0);

	realhost = getrealhost(up->host);
	if (realhost == NULL) {
		errorlog("%s: Host unknown.\n", up->host);
		return (1);
	}
	xfree(realhost);
	return (0);
}

/*
 *  shsafe_path - Escapes characters to use text inside "'s for sh.
 *
 *  Returns a newly allocated copy of s in which each of the four
 *  characters that stay special inside double quotes -- the double
 *  quote, backslash, dollar sign and backquote -- is preceded by a
 *  backslash.  Returns NULL if s is NULL.  The caller frees the result.
 *
 *  The result buffer is sized from the input (at most two output bytes
 *  per input byte), so names of any length are handled.
 */
static char *shsafe_path(s)
     char *s;
{
	char *out;
	char *p, *q;

	if (s == NULL)
		return (NULL);

	out = xmalloc(2 * strlen(s) + 1);
	for (p = s, q = out; *p != '\0'; p++) {
		/* the backquote must be escaped too, or sh performs
		 * command substitution on the quoted text */
		if ((*p == '\"') || (*p == '\\') || (*p == '$') ||
		    (*p == '`'))
			*q++ = '\\';	/* escape */
		*q++ = *p;
	}
	*q = '\0';
	return (out);
}
