/* RCS-style revision identification string for this source file. */
static char rcsid[] = "cache.c,v 1.46 1996/01/05 20:28:23 duane Exp";
/*
 *  cache.c - Simple, local disk cache for liburl.  
 *  Uses a GDBM file to map URLs to the cached files.  Uses links to copy
 *  files.  Locks out other processes that might make modifications to the
 *  cache by using the mutual exclusion protection of GDBM.  Maintains a
 *  Cache.size file that has the number of bytes in the cache.
 *
 *  DEBUG: section  22, level 1         Common liburl disk cache routines
 *
 *  Darren Hardy, hardy@cs.colorado.edu, April 1994
 *
 *  ----------------------------------------------------------------------
 *  Copyright (c) 1994, 1995.  All rights reserved.
 *  
 *    The Harvest software was developed by the Internet Research Task
 *    Force Research Group on Resource Discovery (IRTF-RD):
 *  
 *          Mic Bowman of Transarc Corporation.
 *          Peter Danzig of the University of Southern California.
 *          Darren R. Hardy of the University of Colorado at Boulder.
 *          Udi Manber of the University of Arizona.
 *          Michael F. Schwartz of the University of Colorado at Boulder.
 *          Duane Wessels of the University of Colorado at Boulder.
 *  
 *    This copyright notice applies to software in the Harvest
 *    ``src/'' directory only.  Users should consult the individual
 *    copyright notices in the ``components/'' subdirectories for
 *    copyright information about other software bundled with the
 *    Harvest source code distribution.
 *  
 *  TERMS OF USE
 *    
 *    The Harvest software may be used and re-distributed without
 *    charge, provided that the software origin and research team are
 *    cited in any use of the system.  Most commonly this is
 *    accomplished by including a link to the Harvest Home Page
 *    (http://harvest.cs.colorado.edu/) from the query page of any
 *    Broker you deploy, as well as in the query result pages.  These
 *    links are generated automatically by the standard Broker
 *    software distribution.
 *    
 *    The Harvest software is provided ``as is'', without express or
 *    implied warranty, and with no support nor obligation to assist
 *    in its use, correction, modification or enhancement.  We assume
 *    no liability with respect to the infringement of copyrights,
 *    trade secrets, or any patents, and are not responsible for
 *    consequential damages.  Proper use of the Harvest software is
 *    entirely the responsibility of the user.
 *  
 *  DERIVATIVE WORKS
 *  
 *    Users may make derivative works from the Harvest software, subject 
 *    to the following constraints:
 *  
 *      - You must include the above copyright notice and these 
 *        accompanying paragraphs in all forms of derivative works, 
 *        and any documentation and other materials related to such 
 *        distribution and use acknowledge that the software was 
 *        developed at the above institutions.
 *  
 *      - You must notify IRTF-RD regarding your distribution of 
 *        the derivative work.
 *  
 *      - You must clearly notify users that your are distributing 
 *        a modified version and not the original Harvest software.
 *  
 *      - Any derivative product is also subject to these copyright 
 *        and use restrictions.
 *  
 *    Note that the Harvest software is NOT in the public domain.  We
 *    retain copyright, as specified above.
 *  
 *  HISTORY OF FREE SOFTWARE STATUS
 *  
 *    Originally we required sites to license the software in cases
 *    where they were going to build commercial products/services
 *    around Harvest.  In June 1995 we changed this policy.  We now
 *    allow people to use the core Harvest software (the code found in
 *    the Harvest ``src/'' directory) for free.  We made this change
 *    in the interest of encouraging the widest possible deployment of
 *    the technology.  The Harvest software is really a reference
 *    implementation of a set of protocols and formats, some of which
 *    we intend to standardize.  We encourage commercial
 *    re-implementations of code complying to this set of standards.  
 *  
 */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <time.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <gdbm.h>
#include "util.h"
#include "url.h"

/*
 *  Try HAVE_SRAND48, then try HAVE_SRANDOM, otherwise assume HAVE_SRAND
 */

/*
 *  CACHE_TTL - default time-to-live, in seconds, of a cached file.
 *  It is the initial value of cache_ttl; init_cache() replaces it with the
 *  value of the GATHERER_CACHE_TTL environment variable when that is set to
 *  a non-negative number.  expire_cache() removes entries whose file
 *  modification time differs from the current time by more than cache_ttl.
 *  May be predefined at compile time.
 */
#ifndef CACHE_TTL
#define CACHE_TTL		(1 * 7 * 24 * 60 * 60)	/* 1 week */
#endif

/*
 *  USE_CACHE_TMPDIR is the fallback parent directory for the cache; it is
 *  used only when the environment variable TMPDIR is not set.  The cache
 *  itself lives in the "cache-liburl" subdirectory of that parent.
 *  The cache directory should be on the same partition as the files being
 *  cached, since link(2) is tried first when copying: add_cache() falls
 *  back to a byte-for-byte copy and lookup_cache() falls back to symlink(2)
 *  when link(2) fails.
 *  May be predefined at compile time.
 */
#ifndef USE_CACHE_TMPDIR
#define USE_CACHE_TMPDIR	"/tmp"
#endif

/* Local variables */
/* Path of the cache directory ("<TMPDIR>/cache-liburl"); set by init_cache() */
static char cachedir[BUFSIZ];
/* Path of the GDBM table mapping URL -> cached filename ("Cache.gdbm") */
static char cachetable[BUFSIZ];
/* Path of the text file holding the cache's byte count ("Cache.size") */
static char cachesize[BUFSIZ];
/* mtime of the cache table at init_cache() time; only written in this file */
static time_t watermark;
/* Open handle on cachetable; non-NULL only between get_access()/release_access() */
static GDBM_FILE dbf = NULL;
/* Upper bound on cache bytes; overridable via HARVEST_MAX_LOCAL_CACHE */
static int max_cache_size = (32 * 1024 * 1024);		/* 32 MBs */
/* Entry lifetime in seconds; overridable via GATHERER_CACHE_TTL */
static int cache_ttl = CACHE_TTL;

/* Open handle on lmttable; opened and closed together with dbf */
static GDBM_FILE lm_dbf = NULL;
/* Path of the GDBM table mapping URL -> Last-Modification-Time ("LMT.gdbm") */
static char lmttable[BUFSIZ];

/*
 * Local functions
 *
 * These are old-style (non-prototype) declarations: they give no parameter
 * information, so the compiler does not convert arguments at call sites.
 * Arguments must therefore already have the promoted parameter type
 * (get_access: int flag; change_cachesize: int n; delete_cache_url and
 * init_next_filename: char *).
 */
static void delete_cache_entry();
static void get_access();
static void release_access();
static void die();
static int get_cachesize();
static void change_cachesize();
static void delete_cache_url();
static char *next_filename();
static void init_next_filename();

/*
 *  die() - Fatal-error exit: closes whichever GDBM handles are open so
 *  their locks are dropped, then terminates the process with status 1.
 */
static void die()
{
	if (lm_dbf != NULL) {
		gdbm_close(lm_dbf);
		lm_dbf = NULL;
	}
	if (dbf != NULL) {
		gdbm_close(dbf);
		dbf = NULL;
	}
	exit(1);
}

/*
 *  finish_cache() - Shut the cache down by closing any GDBM handle that
 *  is still open and resetting it to NULL.
 */
void finish_cache()
{
	if (lm_dbf) {
		gdbm_close(lm_dbf);
		lm_dbf = NULL;
	}
	if (dbf) {
		gdbm_close(dbf);
		dbf = NULL;
	}
}

/*
 *  init_cache() - Startup the cache.
 *
 *  Creates the cache directory "<base>/cache-liburl", where <base> is the
 *  TMPDIR environment variable or USE_CACHE_TMPDIR, creates empty
 *  Cache.gdbm and LMT.gdbm tables if they do not exist yet, seeds the
 *  random number generator and reads the tunables:
 *
 *    HARVEST_MAX_LOCAL_CACHE  maximum cache size in bytes (default 32 MB)
 *    GATHERER_CACHE_TTL       entry lifetime in seconds (default CACHE_TTL)
 *
 *  Negative tunable values fall back to the defaults.  Exits via die() if
 *  the directory is unusable, the path is too long, or a table cannot be
 *  created.  Both GDBM handles are left closed (NULL) on return.
 */
void init_cache()
{
	char *s = getenv("TMPDIR");
	char *base = (s != NULL) ? s : USE_CACHE_TMPDIR;
	struct stat sb;

	/*
	 * All paths below are built with sprintf() into BUFSIZ-sized buffers,
	 * and TMPDIR is caller-controlled.  Refuse a base directory that
	 * would not leave room for the longest name derived from it
	 * ("/cache-liburl/Cache.gdbm"; sizeof counts the trailing NUL).
	 */
	if (strlen(base) + sizeof("/cache-liburl/Cache.gdbm") > sizeof(cachedir)) {
		errorlog("Cannot use %s: path too long\n", base);
		die();
	}

	/* Create a directory in which to cache the files */
	sprintf(cachedir, "%s/cache-liburl", base);
	(void) mkdir(cachedir, 0755);	/* failure (e.g. EEXIST) is caught by access() below */
	if (access(cachedir, W_OK)) {
		errorlog("Cannot use %s\n", cachedir);
		die();
	}
	init_next_filename(cachedir);
	sprintf(cachetable, "%s/Cache.gdbm", cachedir);
	sprintf(cachesize, "%s/Cache.size", cachedir);

	/* Create an empty URL table if none exists yet */
	if (access(cachetable, F_OK)) {
		dbf = gdbm_open(cachetable, 0, GDBM_NEWDB, 0664, NULL);
		if (dbf == NULL) {
			/* Being locked out by another process is not an error here */
			if ((gdbm_errno != GDBM_CANT_BE_WRITER) &&
			    (gdbm_errno != GDBM_CANT_BE_READER)) {
				errorlog("GDBM ERROR: gdbm_open: %s: %s\n",
				    cachetable, gdbm_strerror(gdbm_errno));
				die();
			}
		} else
			gdbm_close(dbf);
	}
	dbf = NULL;
	/*
	 * watermark was used for comparing cached object time with time on
	 * GDBM file.  Now we use the current time instead so this could
	 * go away -DW
	 */
	if (stat(cachetable, &sb) < 0) {
		log_errno(cachetable);
		watermark = 0;
	} else {
		watermark = sb.st_mtime;
	}
	watermark = watermark > 0 ? watermark : 0;

	/* Create an empty Last-Modification-Time table if none exists yet */
	sprintf(lmttable, "%s/LMT.gdbm", cachedir);
	if (access(lmttable, F_OK)) {
		lm_dbf = gdbm_open(lmttable, 0, GDBM_NEWDB, 0664, NULL);
		if (lm_dbf == NULL) {
			if ((gdbm_errno != GDBM_CANT_BE_WRITER) &&
			    (gdbm_errno != GDBM_CANT_BE_READER)) {
				errorlog("GDBM ERROR: gdbm_open: %s: %s\n",
				    lmttable, gdbm_strerror(gdbm_errno));
				die();
			}
		} else
			gdbm_close(lm_dbf);
	}
	lm_dbf = NULL;

#if   defined(HAVE_SRAND48)
	(void) srand48((long) time(NULL));
#elif defined(HAVE_SRANDOM)
	(void) srandom((unsigned) time(NULL));
#else
	(void) srand(time(NULL));
#endif

	max_cache_size = (32 * 1024 * 1024);
	if ((s = getenv("HARVEST_MAX_LOCAL_CACHE")) != NULL)
		max_cache_size = atoi(s);
	if (max_cache_size < 0)
		max_cache_size = (32 * 1024 * 1024);

	cache_ttl = CACHE_TTL;
	if ((s = getenv("GATHERER_CACHE_TTL")) != NULL)
		cache_ttl = atoi(s);
	if (cache_ttl < 0)
		cache_ttl = CACHE_TTL;
}


/*
 *  get_access() - Opens both GDBM tables with the given open mode
 *  (GDBM_READER, GDBM_WRITER, GDBM_WRCREAT, ...).  If the URL table is
 *  held by another process, retries after a short random pause until the
 *  open succeeds.  Any other open failure terminates the process via
 *  die().  On return dbf and lm_dbf are both open.
 */
static void get_access(flag)
     int flag;
{
	long pause_usec;	/* random back-off before the next attempt */

	for (;;) {
		dbf = gdbm_open(cachetable, 0, flag, 0664, NULL);
		if (dbf != NULL)
			break;
		/* Anything other than "someone else holds it" is fatal */
		if (!(gdbm_errno == GDBM_CANT_BE_WRITER ||
			gdbm_errno == GDBM_CANT_BE_READER)) {
			errorlog("GDBM ERROR: gdbm_open: %s: %s\n",
			    cachetable, gdbm_strerror(gdbm_errno));
			die();
		}
#if   defined(HAVE_SRAND48)
		pause_usec = (lrand48() % 200) + 10;
#elif defined(HAVE_SRANDOM)
		pause_usec = (random() % 200) + 10;
#else
		pause_usec = (rand() % 200) + 10;
#endif

#ifdef HAVE_USLEEP
		(void) usleep(pause_usec);	/* wait a random amount */
#else
		{
			/* No usleep(): use select() with no descriptors as a timer */
			struct timeval tv;

			tv.tv_sec = 0;
			tv.tv_usec = pause_usec;
#ifndef _HARVEST_HPUX_
			select(0, (fd_set *) 0, (fd_set *) 0, (fd_set *) 0, &tv);
#else /* _HARVEST_HPUX_ */
			select(0, (int *) 0, (int *) 0, (int *) 0, &tv);
#endif /* _HARVEST_HPUX_ */
		}
#endif
	}

	/*
	 * this should be safe.  Only open this DB after the other has
	 * been opened.
	 */
	lm_dbf = gdbm_open(lmttable, 0, flag, 0664, NULL);
	if (lm_dbf == NULL) {
		errorlog("GDBM ERROR: gdbm_open: %s: %s\n",
		    lmttable, gdbm_strerror(gdbm_errno));
		die();
	}
}

/*
 *  release_access() - Gives up the access obtained by get_access():
 *  closes the LMT table first, then the URL table, and marks both
 *  handles as closed.  Safe to call when nothing is open.
 */
static void release_access()
{
	if (lm_dbf != NULL) {
		gdbm_close(lm_dbf);
		lm_dbf = NULL;
	}
	if (dbf != NULL) {
		gdbm_close(dbf);
		dbf = NULL;
	}
}

/*
 *  get_cache_filename() - Picks a cache filename that is not in use yet.
 *  Keeps asking next_filename() for candidates until one does not exist
 *  on disk.  Returns a newly allocated string the caller must release
 *  with xfree(), or NULL if next_filename() fails.
 */
static char *get_cache_filename()
{
	char *candidate;

	for (;;) {
		candidate = next_filename();
		if (candidate == NULL)
			break;
		if (access(candidate, F_OK) != 0)
			return (candidate);	/* not present: free to use */
		xfree(candidate);
	}
	return (NULL);
}

/*
 *  add_cache() - Add the URL,filename to the cache.
 */
void add_cache(url, filename, lmt)
     char *url;
     char *filename;
     time_t lmt;
{
	datum k, d;
	char *cfile;
	struct stat sb;
	int ndeletes = 0, current_size;
	int status;

	/* Find out some more about the file */
	if (lstat(filename, &sb) < 0) {
		log_errno(filename);
		return;
	}
	if (!S_ISREG(sb.st_mode))
		return;

	get_access(GDBM_WRCREAT);	/* LOCK */

	/* See if we have room, if not delete an object */
	while (1) {
		if (ndeletes > 3) {	/* Try 3 times to reduce cache size */
			release_access();
			return;	/* RELEASE and give up */
		}
		current_size = get_cachesize();
		Debug(22, 1, ("add_cache: Current Cache Size is %d\n",
			current_size));
		if (sb.st_size + current_size > max_cache_size) {
			delete_cache_entry();
			ndeletes++;
		} else {
			break;	/* We're ready to add */
		}
	}

	/* Cache the file; by link'ing it */
	cfile = get_cache_filename();
	Debug(22, 1, ("add_cache: Linking %s to %s\n", cfile, filename));
	if (link(filename, cfile) < 0) {
		/* must copy the file */
		int n;
		char *buf = NULL;
		int rfd, wfd;
		if ((wfd = open(cfile, O_WRONLY | O_CREAT, 0666)) < 0) {
			log_errno2(__FILE__, __LINE__, cfile);
			release_access();	/* RELEASE  and give up */
			xfree(cfile);
			return;
		}
		if ((rfd = open(filename, O_RDONLY)) < 0) {
			log_errno2(__FILE__, __LINE__, filename);
			release_access();	/* RELEASE  and give up */
			xfree(cfile);
			return;
		}
		buf = (char *) xmalloc(4096);
		while ((n = read(rfd, buf, 4096)) > 0)
			write(wfd, buf, n);
		close(rfd);
		close(wfd);
		xfree(buf);
	}
	/* Add the file to the cache */
	k.dptr = xstrdup(url);
	k.dsize = strlen(k.dptr) + 1;
	d.dptr = xstrdup(cfile);
	d.dsize = strlen(d.dptr) + 1;

	Debug(22, 1, ("add_cache: Adding %s -> %s\n", k.dptr, d.dptr));

	status = gdbm_store(dbf, k, d, GDBM_INSERT);
	xfree(k.dptr);
	xfree(d.dptr);
	xfree(cfile);
	if (status) {
		/* already an entry for URL */
		Debug(22, 1, ("add_cache: GDBM_ERROR: gdbm_store: key (%s): %s: %s\n", k.dptr, cfile, gdbm_strerror(gdbm_errno)));
		(void) unlink(cfile);
		release_access();	/* RELEASE */
		return;
	}
	change_cachesize(sb.st_size);

	/* Add the LMT */
	k.dptr = xstrdup(url);
	k.dsize = strlen(k.dptr) + 1;
	d.dptr = (char *) &lmt;
	d.dsize = sizeof(lmt);

	Debug(22, 5, ("Storing LMT=%d for %s\n", lmt, url));

	status = gdbm_store(lm_dbf, k, d, GDBM_INSERT);
	xfree(k.dptr);
	if (status) {
		/* already an entry for URL */
		Debug(22, 1, ("add_cache: GDBM_ERROR: gdbm_store: url=%s LMT=%ld: %s\n", k.dptr, lmt, gdbm_strerror(gdbm_errno)));
		release_access();	/* RELEASE */
		return;
	}
	release_access();	/* RELEASE */
}


/*
 *  lookup_cache() - Checks to see if the URL is already cached on the
 *  local disk.  If it is, then it returns a pointer to a copy of the file.
 */
char *lookup_cache(url)
     char *url;
{
	static char *filecopy;
	datum k, d;
	struct stat sb;

	Debug(22, 1, ("lookup_cache: %s\n", url));

	k.dptr = xstrdup(url);
	k.dsize = strlen(url) + 1;

	get_access(GDBM_READER);	/* LOCK */

	d = gdbm_fetch(dbf, k);
	xfree(k.dptr);

	if (d.dptr == NULL) {
		release_access();	/* RELEASE */
		xfree(d.dptr);
		return (NULL);
	}
	/* make sure its really there */
	if (stat(d.dptr, &sb) < 0) {
		release_access();	/* RELEASE */
		delete_cache_url(d.dptr);
		xfree(d.dptr);
		return NULL;
	}
	filecopy = tempnam(cachedir, "cache");
	Debug(22, 1, ("lookup_cache: CACHE HIT: Linking %s to %s\n",
		filecopy, d.dptr));
	if (link(d.dptr, filecopy) < 0) {
		if (symlink(d.dptr, filecopy) < 0) {	/* try symlink(2) */
			log_errno(filecopy);
			xfree(filecopy);
			filecopy = NULL;
		}
	}
	release_access();	/* RELEASE */
	xfree(d.dptr);
	return (filecopy);
}

/*
 *  delete_cache_url() - deletes the cache entry for the URL: unlinks the
 *  cached file, subtracts its size from the recorded cache size, and
 *  removes the URL from both the URL table and the LMT table.
 *  ASSUMES that get_access() has already been called with write access.
 *  Logs and returns if the URL is not in the URL table.
 */
static void delete_cache_url(url)
     char *url;
{
	datum k, d;
	struct stat sb;

	Debug(22, 1, ("delete_cache_url: %s\n", url));
	/* GDBM only reads the key, so the caller's string is used directly */
	k.dptr = url;
	k.dsize = strlen(url) + 1;

	d = gdbm_fetch(dbf, k);
	if (d.dptr == NULL) {
		errorlog("URL %s is not in the cache.\n", url);
		return;
	}
	if (stat(d.dptr, &sb) < 0) {
		/* File already gone: nothing to subtract from the cache size */
		log_errno(d.dptr);
		sb.st_size = 0;
	}
	(void) unlink(d.dptr);
	xfree(d.dptr);
	/* change_cachesize() has no prototype in scope: pass a real int */
	change_cachesize(-(int) sb.st_size);
	(void) gdbm_delete(dbf, k);

	Debug(22, 1, ("delete_cache_url (LMT): %s\n", url));
	d = gdbm_fetch(lm_dbf, k);
	if (d.dptr == NULL) {
		errorlog("not in LMT.gdbm: %s\n", url);
		return;
	}
	xfree(d.dptr);
	/* The LMT record lives in lm_dbf, not in the URL table */
	(void) gdbm_delete(lm_dbf, k);
}

/*
 *  delete_cache_entry() - make room by removing randomly chosen entries.
 *  Walks the URL table and deletes each entry with probability 1/13.
 *  If that pass removed nothing, the first entry is deleted so that a
 *  non-empty cache always shrinks.
 *  ASSUMES that get_access() has already been called.
 */
static void delete_cache_entry()
{
	datum k, nk;
	long r;
	int ndeleted = 0;

	Debug(22, 1, ("delete_cache_entry: deleting...\n"));

	/* Randomly select entries to delete */
	k = gdbm_firstkey(dbf);
	while (k.dptr) {
		/* Fetch the successor before k may be removed from the table */
		nk = gdbm_nextkey(dbf, k);
#if   defined(HAVE_SRAND48)
		r = lrand48();
#elif defined(HAVE_SRANDOM)
		r = random();
#else
		r = rand();
#endif
		if (r % 13 == 0) {	/* delete a random entry */
			delete_cache_url(k.dptr);
			ndeleted++;
		}
		xfree(k.dptr);
		k = nk;
	}
	if (ndeleted > 0)
		return;

	/* 
	 *  Oops, we didn't select any during the random traversal, 
	 *  so just delete the first one 
	 */
	k = gdbm_firstkey(dbf);
	if (k.dptr) {
		delete_cache_url(k.dptr);
		xfree(k.dptr);
	}
}

/*
 *  get_cachesize() - Reads the recorded cache size, in bytes, from the
 *  Cache.size file.  Yields 0 when the file cannot be opened, does not
 *  start with an integer, or holds a negative number.
 */
static int get_cachesize()
{
	int bytes = 0;
	int nfields;
	FILE *in = fopen(cachesize, "r");

	if (in == NULL)
		return (0);
	nfields = fscanf(in, "%d", &bytes);
	fclose(in);
	if (nfields != 1 || bytes < 0)
		return (0);
	return (bytes);
}

/*
 *  change_cachesize() - Adjusts the recorded cache size by n bytes
 *  (n may be negative) and rewrites the Cache.size file.  Terminates
 *  the process via die() if the file cannot be opened for writing.
 */
static void change_cachesize(n)
     int n;
{
	/* Read the old value before "w+" truncates the file */
	int updated = get_cachesize() + n;
	FILE *out = fopen(cachesize, "w+");

	if (out == NULL) {
		log_errno(cachesize);
		die();
	}
	fprintf(out, "%d", updated);
	fclose(out);
}

#define myabs(a)	((a) < 0 ? -(a) : (a))

/*
 *  expire_cache() - Removes every cache entry whose file is missing or
 *  whose modification time is more than cache_ttl seconds away from the
 *  current time (cache_ttl defaults to CACHE_TTL).  Holds the cache lock
 *  with write access for the whole sweep.
 */
void expire_cache()
{
	datum k, nk, d;
	struct stat sb;
	time_t now = time(0);

	Debug(22, 1, ("expire_cache: Deleting expired entries...\n"));
	Debug(22, 1, ("expire_cache: local-disk cache in %s...\n", cachedir));

	get_access(GDBM_WRITER);	/* LOCK */

	/* Walk each cache file and delete if necessary */
	k = gdbm_firstkey(dbf);
	while (k.dptr) {
		/* Fetch the successor before k may be removed from the table */
		nk = gdbm_nextkey(dbf, k);
		d = gdbm_fetch(dbf, k);
		if (d.dptr == NULL) {
			errorlog("Internal Error: %s not in cache table\n",
			    k.dptr);
			die();
		}
		if (stat(d.dptr, &sb) < 0) {
			/* cached file vanished: drop the dangling entry */
			log_errno(d.dptr);
			delete_cache_url(k.dptr);
		} else if (myabs(now - sb.st_mtime) > cache_ttl) {
			delete_cache_url(k.dptr);
		}
		xfree(d.dptr);
		xfree(k.dptr);
		k = nk;
	}

	/* release_access() takes no arguments */
	release_access();	/* RELEASE */
}
#undef myabs

/* Private copy of the directory under which next_filename() creates names */
static char *topdir = NULL;

/*
 *  init_next_filename() - Remembers a copy of dirname as the top-level
 *  directory for generated cache filenames and seeds the random number
 *  generator used to pick them.  A NULL dirname is ignored.  Calling it
 *  again replaces (and releases) the previously remembered directory.
 */
static void init_next_filename(dirname)
     char *dirname;		/* name of top level directory to store files */
{
	char *copy;

	if (dirname == NULL)
		return;
	/* Duplicate before releasing, in case dirname is the old topdir */
	copy = xstrdup(dirname);
	if (topdir != NULL)
		xfree(topdir);
	topdir = copy;
#if   defined(HAVE_SRAND48)
	(void) srand48((long) time(NULL));
#elif defined(HAVE_SRANDOM)
	(void) srandom((unsigned) time(NULL));
#else
	(void) srand(time(NULL));
#endif
}

/*
 *  next_filename() - Generates a random candidate filename of the form
 *  "<topdir>/DD/FFFFFF" (2-digit subdirectory, 6-digit name), creating
 *  the subdirectory if needed.  The name is NOT checked for existence.
 *  Returns a newly allocated string to be released with xfree(), or NULL
 *  if init_next_filename() has not been called, the resulting path would
 *  not fit, or the subdirectory cannot be created.
 */
static char *next_filename()
{
	char p[BUFSIZ];
	unsigned n;
	int subdir, leaf;

	if (topdir == NULL)
		return (NULL);
	/* sizeof counts the trailing NUL of the longest suffix */
	if (strlen(topdir) + sizeof("/00/000000") > sizeof(p)) {
		errorlog("Cache directory name too long: %s\n", topdir);
		return (NULL);
	}
#if   defined(HAVE_SRAND48)
	n = lrand48() % 100000000;	/* 8 digits */
#elif defined(HAVE_SRANDOM)
	n = random() % 100000000;	/* 8 digits */
#else
	n = rand() % 100000000;	/* 8 digits */
#endif
	subdir = (int) (n / 1000000);	/* first 2 digits */
	leaf = (int) (n % 1000000);	/* last 6 digits */

	sprintf(p, "%s/%02d", topdir, subdir);
	if (mkdir(p, 0755) < 0) {
		if (errno != EEXIST) {
			log_errno(p);
			return (NULL);
		}
	}
	/* Entire filename uses directory (2 digits) and 6 digits for name */
	sprintf(p, "%s/%02d/%06d", topdir, subdir, leaf);
	return (xstrdup(p));
}

/*
 *  lmt_cache() - Returns the Last-Modification-Time for the cache hit.
 *  The value recorded in the LMT table is preferred.  If the URL has no
 *  usable record there, the modification time of the cached file is
 *  returned instead.  Returns 0 when the URL is not cached or the cached
 *  file cannot be examined.
 */
time_t lmt_cache(url)
     char *url;
{
	datum k, d;
	struct stat sb;
	time_t t;

	Debug(22, 1, ("lmt_cache: %s\n", url));

	/*
	 * gdbm_fetch() only reads the key, so use the caller's string
	 * directly; the same key serves both tables.
	 */
	k.dptr = url;
	k.dsize = strlen(url) + 1;

	get_access(GDBM_READER);	/* LOCK */

	Debug(22, 5, ("Looking up LMT: %s\n", url));
	d = gdbm_fetch(lm_dbf, k);
	if (d.dptr != NULL) {
		if (d.dsize == (int) sizeof(t)) {
			memcpy(&t, d.dptr, sizeof(t));
			Debug(22, 5, ("--> Found LMT=%ld in LMT.gdbm.\n", (long) t));
			release_access();	/* RELEASE */
			xfree(d.dptr);
			return t;
		}
		/* Record of unexpected size: ignore it and use the file time */
		xfree(d.dptr);
	}
	d = gdbm_fetch(dbf, k);
	if (d.dptr == NULL) {
		release_access();	/* RELEASE */
		return ((time_t) 0);
	}
	release_access();	/* RELEASE */

	t = (stat(d.dptr, &sb) < 0) ? 0 : sb.st_mtime;
	xfree(d.dptr);
	return (t);
}
