static char rcsid[] = "db.c,v 1.43 1996/01/05 20:28:53 duane Exp";
/*
 *  db.c - Storage Manager for the Essence system
 *
 *  DEBUG: section  61, level 1         Gatherer essence database routines
 *
 *  Darren Hardy, hardy@cs.colorado.edu, February 1994
 *
 *  ----------------------------------------------------------------------
 *  Copyright (c) 1994, 1995.  All rights reserved.
 *  
 *    The Harvest software was developed by the Internet Research Task
 *    Force Research Group on Resource Discovery (IRTF-RD):
 *  
 *          Mic Bowman of Transarc Corporation.
 *          Peter Danzig of the University of Southern California.
 *          Darren R. Hardy of the University of Colorado at Boulder.
 *          Udi Manber of the University of Arizona.
 *          Michael F. Schwartz of the University of Colorado at Boulder.
 *          Duane Wessels of the University of Colorado at Boulder.
 *  
 *    This copyright notice applies to software in the Harvest
 *    ``src/'' directory only.  Users should consult the individual
 *    copyright notices in the ``components/'' subdirectories for
 *    copyright information about other software bundled with the
 *    Harvest source code distribution.
 *  
 *  TERMS OF USE
 *    
 *    The Harvest software may be used and re-distributed without
 *    charge, provided that the software origin and research team are
 *    cited in any use of the system.  Most commonly this is
 *    accomplished by including a link to the Harvest Home Page
 *    (http://harvest.cs.colorado.edu/) from the query page of any
 *    Broker you deploy, as well as in the query result pages.  These
 *    links are generated automatically by the standard Broker
 *    software distribution.
 *    
 *    The Harvest software is provided ``as is'', without express or
 *    implied warranty, and with no support nor obligation to assist
 *    in its use, correction, modification or enhancement.  We assume
 *    no liability with respect to the infringement of copyrights,
 *    trade secrets, or any patents, and are not responsible for
 *    consequential damages.  Proper use of the Harvest software is
 *    entirely the responsibility of the user.
 *  
 *  DERIVATIVE WORKS
 *  
 *    Users may make derivative works from the Harvest software, subject 
 *    to the following constraints:
 *  
 *      - You must include the above copyright notice and these 
 *        accompanying paragraphs in all forms of derivative works, 
 *        and any documentation and other materials related to such 
 *        distribution and use acknowledge that the software was 
 *        developed at the above institutions.
 *  
 *      - You must notify IRTF-RD regarding your distribution of 
 *        the derivative work.
 *  
 *      - You must clearly notify users that your are distributing 
 *        a modified version and not the original Harvest software.
 *  
 *      - Any derivative product is also subject to these copyright 
 *        and use restrictions.
 *  
 *    Note that the Harvest software is NOT in the public domain.  We
 *    retain copyright, as specified above.
 *  
 *  HISTORY OF FREE SOFTWARE STATUS
 *  
 *    Originally we required sites to license the software in cases
 *    where they were going to build commercial products/services
 *    around Harvest.  In June 1995 we changed this policy.  We now
 *    allow people to use the core Harvest software (the code found in
 *    the Harvest ``src/'' directory) for free.  We made this change
 *    in the interest of encouraging the widest possible deployment of
 *    the technology.  The Harvest software is really a reference
 *    implementation of a set of protocols and formats, some of which
 *    we intend to standardize.  We encourage commercial
 *    re-implementations of code complying to this set of standards.  
 *  
 */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/param.h>
#include <time.h>
#include <gdbm.h>
#include "util.h"
#include "url.h"
#include "template.h"
#include "essence.h"

/*
 *  GDBM does not reclaim deleted space within a file.  Each replace
 *  operation is very likely to cause wasted space within the file.
 *  The storage manager calls gdbm_reorganize() after every
 *  MAX_DELETIONS replacement operations to shrink the GDBM file.
 *  downsj@csos.orst.edu says that this is a bug in 1.7.1 and below, and
 *  that he hopes to have it fixed by the next major version release.
 *
 *  GDBM 1.7.3 seems to work a little better but still grows a lot.
 */
#ifndef MAX_DELETIONS
#define MAX_DELETIONS	32
#endif

/*
 *  Local variables.  The path buffers and database handles are filled
 *  in by init_db(); finish_db() closes the handles and resets them to
 *  NULL.
 */
static char dbfile[MAXPATHLEN + 1];	/* path of WORKING.gdbm */
static char prodbfile[MAXPATHLEN + 1];	/* path of PRODUCTION.gdbm */
static char indexfile[MAXPATHLEN + 1];	/* path of INDEX.gdbm */
static char md5file[MAXPATHLEN + 1];	/* path of MD5.gdbm */
static char reffile[MAXPATHLEN + 1];	/* path of REFRESH.gdbm */
static GDBM_FILE dbf = NULL;	/* WORKING.gdbm: opened with GDBM_NEWDB */
static GDBM_FILE pdbf = NULL;	/* PRODUCTION.gdbm: opened with GDBM_READER */
static GDBM_FILE idbf = NULL;	/* INDEX.gdbm: opened with GDBM_READER */
static GDBM_FILE mdbf = NULL;	/* MD5.gdbm: opened with GDBM_READER */
static GDBM_FILE rdbf = NULL;	/* REFRESH.gdbm: only opened when null_filter is 0 */
static int ndeletions = 0;	/* num of replace operations counted by add_template() */
static int null_filter = 0;	/* non-zero: dbcheck_timestamp()/dbcheck_md5() always return 0 */
static int max_deletions = MAX_DELETIONS;	/* ndeletions value that triggers gdbm_reorganize() */

/* Local functions (old-style declarations without parameter lists) */
static void dbcheck_refresh();
static Buffer *soif_to_buffer();

/*
 *  db_build_path() - Writes "<dir>/<name>" into dst, which must be able
 *  to hold MAXPATHLEN + 1 bytes.  Calls fatal() instead of overflowing
 *  the buffer or silently using a truncated path.
 */
static void db_build_path(dst, dir, name)
     char *dst;
     char *dir;
     char *name;
{
	int len;

	len = snprintf(dst, MAXPATHLEN + 1, "%s/%s", dir, name);
	if (len < 0 || len > MAXPATHLEN)
		fatal("init_db: database path too long: %s/%s\n", dir, name);
}

/*
 *  init_db() - Initialize database routines.
 *
 *  dbdir is the directory holding the GDBM files; if it is NULL, topdir
 *  is used instead.  n is the number of replacement operations allowed
 *  before add_template() reorganizes the WORKING database; if n is not
 *  positive, MAX_DELETIONS is used.
 *
 *  WORKING.gdbm is always created anew and failure to open it is fatal.
 *  PRODUCTION, INDEX and MD5 are opened read-only; if any one of them
 *  cannot be opened the module acts as a null filter (no incremental
 *  gathering) and REFRESH.gdbm is not created.
 */
void init_db(dbdir, n)
     char *dbdir;
     int n;
{
	char *dir = dbdir ? dbdir : topdir;
	int newdb_flags = memefficient ? GDBM_NEWDB : (GDBM_NEWDB | GDBM_FAST);

	max_deletions = (n > 0) ? n : MAX_DELETIONS;
	ndeletions = 0;
	/* Start every run from a clean state; the flag is only set below */
	null_filter = 0;

	db_build_path(dbfile, dir, "WORKING.gdbm");
	db_build_path(prodbfile, dir, "PRODUCTION.gdbm");
	db_build_path(indexfile, dir, "INDEX.gdbm");
	db_build_path(md5file, dir, "MD5.gdbm");
	db_build_path(reffile, dir, "REFRESH.gdbm");

	/* Initialize WORKING.gdbm */
	dbf = gdbm_open(dbfile, 0, newdb_flags, 0644, NULL);
	if (dbf == NULL) {
		/* Cannot run without the working db */
		log_errno(dbfile);
		fatal("gdbm_open: %s: %s\n", dbfile, gdbm_strerror(gdbm_errno));
	}
	pdbf = gdbm_open(prodbfile, 0, GDBM_READER, 0644, NULL);
	idbf = gdbm_open(indexfile, 0, GDBM_READER, 0644, NULL);
	mdbf = gdbm_open(md5file, 0, GDBM_READER, 0644, NULL);
	rdbf = NULL;

	if (pdbf == NULL || idbf == NULL || mdbf == NULL) {
		/* Act as a nop filter */
		Log("WARNING: Incremental Gatherering will NOT be supported on this run.\n");
		Log("\tunable to locate these database(s) needed for incremental gatherering:\n");
		if (pdbf == NULL)
			Log("\t%s\n", prodbfile);
		if (idbf == NULL)
			Log("\t%s\n", indexfile);
		if (mdbf == NULL)
			Log("\t%s\n", md5file);
		null_filter = 1;
	}
	if (null_filter)
		return;

	/* We don't need the refresh database if we have a null filter */
	rdbf = gdbm_open(reffile, 0, newdb_flags, 0644, NULL);
	if (rdbf == NULL) {
		/* Not fatal: dbcheck_refresh() checks for a NULL rdbf */
		Log("WARNING: gdbm_open: %s: %s\n", reffile,
		    gdbm_strerror(gdbm_errno));
		log_errno(reffile);
	}
}

/*
 *  finish_db() - Cleaned up after database routines.
 */
void finish_db()
{
	if (dbf == NULL)
		return;
	gdbm_sync(dbf);		/* sync to disk */

#ifdef GDBM_GROWTH_BUG
	if (ndeletions > 0 && gdbm_reorganize(dbf))
		Log("WARNING: gdbm_reorganize: %s: %s\n", dbfile, gdbm_strerror(gdbm_errno));
#endif
	ndeletions = 0;
	if (dbf != NULL) {
		gdbm_close(dbf);
		dbf = NULL;
	}
	if (pdbf != NULL) {
		gdbm_close(pdbf);
		pdbf = NULL;
	}
	if (idbf != NULL) {
		gdbm_close(idbf);
		idbf = NULL;
	}
	if (mdbf != NULL) {
		gdbm_close(mdbf);
		mdbf = NULL;
	}
	if (rdbf != NULL) {
		gdbm_close(rdbf);
		rdbf = NULL;
	}
}

/*
 *  duplicate_url() - Looks the URL up in the WORKING database.
 *  Returns the result of gdbm_exists(): non-zero if the URL is
 *  already stored there; zero otherwise.
 */
int duplicate_url(url)
     char *url;
{
	datum key;

	/* Keys are stored with their terminating null character */
	key.dsize = strlen(url) + 1;
	key.dptr = url;

	return (gdbm_exists(dbf, key));
}

/*
 *  duplicate_url_any() - Returns non-zero if the URL is already in any
 *  of the databases (WORKING or PRODUCTION); zero otherwise.  The
 *  PRODUCTION database is only consulted when it is open and the URL
 *  was not found in WORKING.
 */
int duplicate_url_any(url)
     char *url;
{
	datum key;
	int found;

	/* Keys are stored with their terminating null character */
	key.dsize = strlen(url) + 1;
	key.dptr = url;

	found = gdbm_exists(dbf, key);
	if (found != 0)
		return (found);
	if (pdbf == NULL)
		return (found);

	return (gdbm_exists(pdbf, key));
}

/*
 *  add_template() - Adds the template to the database.  If should_append
 *  is non-zero, then the template is appended to any existing
 *  template data for the URL.
 */
void add_template(template, object)
     Template *template;
     DataObject *object;
{
	datum k, d;
	Buffer *b = NULL;
	Template *ct = NULL;
	int appending = 0;

	Debug(61, 1, ("add_template(%s)\n", template->url));

	/* Set the key */
	k.dptr = strdup(template->url);
	k.dsize = strlen(k.dptr) + 1;	/* store terminating null char, too */

	if (gdbm_exists(dbf, k)) {
		datum curd;

		/* If a template already exists, then check nested file.  */
		if ((object->flags & F_NESTED) == 0) {
			errorlog("Existing GDBM Entry for non-nested %s\n",
			    template->url);
			xfree(k.dptr);
			return;
		}
		/* Grab the existing template and parse it into a Template */
		curd = gdbm_fetch(dbf, k);
		init_parse_template_string(curd.dptr, curd.dsize);
		ct = parse_template();
		finish_parse_template();
		free(curd.dptr);

		/* Verify that the template was parsable */
		if (ct == NULL) {
			errorlog("Template for %s in %s is malformed.\n",
			    k.dptr, dbfile);
			xfree(k.dptr);
			return;
		}
		/* Embed the current template within old template. */
		if (embed_template(template, ct) == NULL) {
			errorlog("add_template: Failed to embed template: %s\n",
			    template->url);
			xfree(k.dptr);
			free_template(ct);
			return;
		}
		appending = 1;
	}
	b = init_print_template(NULL);
	print_template(appending ? ct : template);
	d.dptr = b->data;
	d.dsize = b->length;

	Debug(61, 1, ("Adding to GDBM file: key{%d}, data{%d}: %s\n",
		k.dsize, d.dsize, (appending) ? "REPLACING" : "INSERTING"));

	/* Store the data into the database */
	if (gdbm_store(dbf, k, d, (appending) ? GDBM_REPLACE : GDBM_INSERT)) {
		errorlog("gdbm_store: %s: %s: %s\n", k.dptr, dbfile, gdbm_strerror(gdbm_errno));
	}
	/* Clean up */
	finish_print_template();	/* frees datum d */
	xfree(k.dptr);
	if (ct != NULL)
		free_template(ct);


	/* Make sure that there haven't been too many deletions */
	if (appending)
		ndeletions++;	/* gdbm_store will cause a deletion */

	if (ndeletions >= max_deletions) {
		Debug(61, 1, ("Reorganizing database after %d deletions\n", ndeletions));
		ndeletions = 0;
		gdbm_sync(dbf);	/* sync to disk */
		if (gdbm_reorganize(dbf))
			errorlog("gdbm_reorganize: %s: %s\n", dbfile, gdbm_strerror(gdbm_errno));
	}
}

/*
 *  dbcheck_timestamp() - Compares the given timestamp against the one
 *  recorded for the URL in the INDEX database.  When the object is
 *  unchanged (the recorded value is not negative and the given
 *  timestamp is not newer than it), a refresh of the object is
 *  attempted and non-zero is returned.  Returns 0 if the object has
 *  changed, is not in the INDEX database, the arguments are unusable
 *  (NULL url, timestamp < 1), or the null filter is active.
 */
int dbcheck_timestamp(url, timestamp)
     char *url;
     int timestamp;
{
	datum key, rec;
	int indexed;

	if (null_filter)
		return (0);
	if (url == NULL || timestamp < 1)
		return (0);

	key.dsize = strlen(url) + 1;
	key.dptr = url;
	rec = gdbm_fetch(idbf, key);
	if (rec.dptr == NULL) {
		/* not in the INDEX database */
		return (0);
	}

	/* INDEX data includes the terminating \0 */
	indexed = atoi(rec.dptr);
	xfree(rec.dptr);

	/* Unchanged only if the recorded stamp is valid and not older */
	if (indexed >= 0 && timestamp <= indexed) {
		dbcheck_refresh(url);
		return (1);
	}
	return (0);		/* changed */
}

/*
 *  dbcheck_md5() - Compares the given md5 string against the one
 *  recorded for the URL in the MD5 database.  When both are equal the
 *  object is unchanged: a refresh is attempted and non-zero is
 *  returned.  Returns 0 if the checksums differ, the URL is not in the
 *  MD5 database, url or md5 is NULL, or the null filter is active.
 */
int dbcheck_md5(url, md5)
     char *url;
     char *md5;
{
	datum key, rec;
	int same;

	if (null_filter)
		return (0);
	if (md5 == NULL || url == NULL)
		return (0);

	key.dsize = strlen(url) + 1;
	key.dptr = url;
	rec = gdbm_fetch(mdbf, key);
	if (rec.dptr == NULL) {
		/* not in the MD5 database */
		return (0);
	}

	/* MD5 from GDBM includes the terminating \0 */
	same = (strcmp(md5, rec.dptr) == 0);
	xfree(rec.dptr);

	if (!same)
		return (0);	/* it has changed */

	dbcheck_refresh(url);	/* it has not changed */
	return (1);
}

/*
 *  dbcheck_refresh() - Update the timestamp for URL.  Logs either
 *  "Unchanged" or "Refreshed".
 */
static void dbcheck_refresh(url)
     char *url;
{
	static int ndone = 0;
	datum d, k;
	Template *t;
	AVPair *avp;
	int refresh_rate, update_time, ttl;
	time_t current_time;
	char tbuf[BUFSIZ];
	Buffer *b;

	if (null_filter || rdbf == NULL) {
		Log("Unchanged: %s\n", url);
		return;
	}
	if (ndone == -1 || (max_refresh > 0 && ndone >= max_refresh)) {
		if (ndone != -1) {
			Log("WARNING: Reached refresh maximum: %d objects.\n",
			    ndone);
		}
		ndone = -1;
		Log("Unchanged (with no refresh): %s\n", url);
		return;
	}
	/* Grab the PRODUCTION copy of the object */
	k.dptr = url;
	k.dsize = strlen(k.dptr) + 1;
	d = gdbm_fetch(pdbf, k);
	if (d.dptr == NULL) {
		errorlog("dbcheck_refresh: refreshing non-existant object?: %s\n", url);
		Log("Unchanged: %s\n", url);
		return;
	}
	/* Parse the object to find it's update time */
	init_parse_template_string(d.dptr, d.dsize);
	if ((t = parse_template()) == NULL) {
		errorlog("dbcheck_refresh: Corrupt SOIF object: %s\n", url);
		xfree(d.dptr);
		Log("Unchanged: %s\n", url);
		return;
	}
	finish_parse_template();
	xfree(d.dptr);

	/* Grab the refresh rate */
	refresh_rate = DEFAULT_REFRESH;		/* default */
	if ((avp = extract_AVPair(t->list, T_REFRESH)) != NULL) {
		refresh_rate = atoi(avp->value);
	}
	/* Grab the TTL */
	ttl = DEFAULT_TTL;	/* default */
	if ((avp = extract_AVPair(t->list, T_TTL)) != NULL) {
		ttl = atoi(avp->value);
	}
	/*      Make sure the refresh rate is never less than   **
	 * **   the TTL.                                        */
	if (ttl < refresh_rate)
		refresh_rate = ttl;

	/* Grab the update time */
	if ((avp = extract_AVPair(t->list, T_UPDATE)) == NULL) {
		free_template(t);
		errorlog("dbcheck_refresh: Illegal SOIF: %s: No %s attribute.\n", url, T_UPDATE);
		Log("Unchanged: %s\n", url);
		return;
	}
	update_time = atoi(avp->value);

	/* 
	 *  See if the object is ready for a refresh.  If it isn't, then
	 *  ignore the request; otherwise, update the timestamp.
	 *
	 *  An object should be refreshed if the time it was created
	 *  (Update-Time) plus the Refresh-Rate has expired.
	 */
	current_time = time(NULL);
	if ((update_time + refresh_rate) > current_time) {
		free_template(t);
		Log("Unchanged: %s\n", url);
		return;
	}
	Log("Refreshing: %s\n", url);

	/* Directly replace the update time */
	xfree(avp->value);
	sprintf(tbuf, "%u", (unsigned int) current_time);
	avp->value = strdup(tbuf);
	avp->vsize = strlen(avp->value);

	/* Write the new object to the refresh database */
	if (((b = soif_to_buffer(t)) == NULL) || (b->length < 1)) {
		errorlog("dbcheck_refresh: Internal error!\n");
		return;
	}
	d.dptr = b->data;
	d.dsize = b->length;
	k.dptr = url;
	k.dsize = strlen(k.dptr) + 1;
	if (!gdbm_exists(rdbf, k) && gdbm_store(rdbf, k, d, GDBM_INSERT)) {
		Log("WARNING: Cannot refresh: %s: %s\n", url,
		    gdbm_strerror(gdbm_errno));
	} else {
		ndone++;
	}

	/* Clean up */
	free_buffer(b);
	b = NULL;
}

/*
 *  soif_to_buffer() - Prints the in-memory SOIF template to an
 *  in-memory buffer.  The given Template is freed on every path and is
 *  no longer valid on exit.  The caller must call free_buffer() on the
 *  result.
 *
 *  When memefficient is set, the template is first written to a
 *  temporary file and freed, and the file is then read back into a new
 *  buffer, so that the template and its printed form are not both held
 *  in memory.  If no temporary file can be named or opened for writing,
 *  the template is printed directly into a buffer instead.
 *
 *  Returns NULL only if the temporary file cannot be reopened for
 *  reading.
 */
static Buffer *soif_to_buffer(t)
     Template *t;
{
	Buffer *b;
	FILE *fp = NULL;
	char *tfile = NULL, buf[BUFSIZ];
	int n;

	if (memefficient) {
		/* tempnam() may fail; never hand a NULL name to fopen() */
		tfile = tempnam(NULL, "stb");
		if (tfile != NULL)
			fp = fopen(tfile, "w");
	}
	if (fp == NULL) {
		/*
		 *  Either memory is not a concern or the temporary file
		 *  is unavailable (fallback): just print the string and
		 *  return the buffer.  For large SOIF objects this needs
		 *  2x the size of the object in memory.
		 */
		b = init_print_template(NULL);
		print_template(t);
		free_template(t);
		if (tfile != NULL) {
			xfree(tfile);
		}
		return (b);
	}
	/* Write the template to the file, then release the template */
	(void) init_print_template(fp);
	print_template(t);
	finish_print_template();
	free_template(t);
	fclose(fp);

	if ((fp = fopen(tfile, "r")) == NULL) {		/* fatal error! */
		(void) unlink(tfile);
		xfree(tfile);
		return (NULL);
	}
	/* Read the printed template back into a fresh buffer */
	b = create_buffer(BUFSIZ);
	while ((n = fread(buf, 1, BUFSIZ, fp)) > 0) {
		add_buffer(b, buf, n);
	}
	fclose(fp);
	(void) unlink(tfile);
	xfree(tfile);
	return (b);
}

/*
 *  db_delete_byurl() - Deletes the entry keyed by the URL from the
 *  WORKING database.  The result of gdbm_delete() is ignored.
 */
void db_delete_byurl(url)
     char *url;
{
	datum key;

	/* Keys are stored with their terminating null character */
	key.dsize = strlen(url) + 1;
	key.dptr = url;

	(void) gdbm_delete(dbf, key);
}
