/* Revision identification string for this file (file name, revision, date, author, state); not referenced by the code visible in this file. */
static char rcsid[] = "mkindex.c,v 1.23 1996/01/17 10:36:04 duane Exp";
/*
 *  mkindex.c - Builds a (URL, Timestamp) and a (URL, MD5) hash table 
 *  from a (URL, Template) hash table.
 *
 *  Usage: mkindex [indb tstmpdb md5db]
 *
 *  With no arguments the default file names are used, so the following
 *  two invocations are equivalent:
 *               mkindex
 *               mkindex PRODUCTION.gdbm INDEX.gdbm MD5.gdbm
 *  
 *  Darren R. Hardy, hardy@cs.colorado.edu, May 1994
 *
 *  ----------------------------------------------------------------------
 *  Copyright (c) 1994, 1995.  All rights reserved.
 *  
 *    The Harvest software was developed by the Internet Research Task
 *    Force Research Group on Resource Discovery (IRTF-RD):
 *  
 *          Mic Bowman of Transarc Corporation.
 *          Peter Danzig of the University of Southern California.
 *          Darren R. Hardy of the University of Colorado at Boulder.
 *          Udi Manber of the University of Arizona.
 *          Michael F. Schwartz of the University of Colorado at Boulder.
 *          Duane Wessels of the University of Colorado at Boulder.
 *  
 *    This copyright notice applies to software in the Harvest
 *    ``src/'' directory only.  Users should consult the individual
 *    copyright notices in the ``components/'' subdirectories for
 *    copyright information about other software bundled with the
 *    Harvest source code distribution.
 *  
 *  TERMS OF USE
 *    
 *    The Harvest software may be used and re-distributed without
 *    charge, provided that the software origin and research team are
 *    cited in any use of the system.  Most commonly this is
 *    accomplished by including a link to the Harvest Home Page
 *    (http://harvest.cs.colorado.edu/) from the query page of any
 *    Broker you deploy, as well as in the query result pages.  These
 *    links are generated automatically by the standard Broker
 *    software distribution.
 *    
 *    The Harvest software is provided ``as is'', without express or
 *    implied warranty, and with no support nor obligation to assist
 *    in its use, correction, modification or enhancement.  We assume
 *    no liability with respect to the infringement of copyrights,
 *    trade secrets, or any patents, and are not responsible for
 *    consequential damages.  Proper use of the Harvest software is
 *    entirely the responsibility of the user.
 *  
 *  DERIVATIVE WORKS
 *  
 *    Users may make derivative works from the Harvest software, subject 
 *    to the following constraints:
 *  
 *      - You must include the above copyright notice and these 
 *        accompanying paragraphs in all forms of derivative works, 
 *        and any documentation and other materials related to such 
 *        distribution and use acknowledge that the software was 
 *        developed at the above institutions.
 *  
 *      - You must notify IRTF-RD regarding your distribution of 
 *        the derivative work.
 *  
 *      - You must clearly notify users that your are distributing 
 *        a modified version and not the original Harvest software.
 *  
 *      - Any derivative product is also subject to these copyright 
 *        and use restrictions.
 *  
 *    Note that the Harvest software is NOT in the public domain.  We
 *    retain copyright, as specified above.
 *  
 *  HISTORY OF FREE SOFTWARE STATUS
 *  
 *    Originally we required sites to license the software in cases
 *    where they were going to build commercial products/services
 *    around Harvest.  In June 1995 we changed this policy.  We now
 *    allow people to use the core Harvest software (the code found in
 *    the Harvest ``src/'' directory) for free.  We made this change
 *    in the interest of encouraging the widest possible deployment of
 *    the technology.  The Harvest software is really a reference
 *    implementation of a set of protocols and formats, some of which
 *    we intend to standardize.  We encourage commercial
 *    re-implementations of code complying to this set of standards.  
 *  
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <gdbm.h>
#include "util.h"
#include "template.h"

/* Local functions */
static void usage(void);

/*
 *  usage - Prints the command-line synopsis to stderr and terminates
 *  the process with exit status 1.  Never returns.
 *
 *  Declared with a (void) prototype so that a call passing arguments
 *  is diagnosed by the compiler.
 */
static void usage(void)
{
	fprintf(stderr, "Usage: mkindex [indb timedb md5db]\n");
	exit(1);
}

/*
 *  Handles for the three GDBM databases used by this program.  Each one
 *  stays NULL until main() has opened it, which is how die() knows
 *  which of them need to be closed.
 */
static GDBM_FILE indbf = NULL;	/* input database */
static GDBM_FILE tsdbf = NULL;	/* output database: timestamps */
static GDBM_FILE mddbf = NULL;	/* output database: MD5 values */

/*
 *  die - Closes every database handle that is currently non-NULL and
 *  then terminates the process with exit status x.  Never returns.
 *
 *  Used for both the fatal-error paths (x != 0) and the normal end of
 *  main() (x == 0), so the databases are always closed before exit.
 */
static void die(int x)
{
	if (indbf != NULL)
		gdbm_close(indbf);
	if (tsdbf != NULL)
		gdbm_close(tsdbf);
	if (mddbf != NULL)
		gdbm_close(mddbf);
	exit(x);
}

int main(argc, argv)
     int argc;
     char *argv[];
{
	char *infile = "PRODUCTION.gdbm";
	char *tsfile = "INDEX.gdbm";
	char *mdfile = "MD5.gdbm";
	datum k;
	datum nextkey;
	datum d;
	datum newd;
	Template *template = NULL;
	AVPair *avp = NULL;
	/* 
	 *  # of internal cache buckets in gdbm.  We can use a large number
	 *  here because each bucket holds a d.dptr, but our d.dptrs are
	 *  very small in this program (max 33 bytes).
	 *  But cannot be too large because GDBM is sloppy in how it does it.
	 */
	int cache_size = 300;	/* 3 times normal amount */


	init_log3("mkindex", stderr, stderr);
	if (argc == 4) {
		infile = strdup(argv[1]);
		tsfile = strdup(argv[2]);
		mdfile = strdup(argv[3]);
	} else if (argc != 1)
		usage();

	indbf = gdbm_open(infile, 0, GDBM_READER, 0644, NULL);
	if (indbf == NULL) {
		errorlog("gdbm_open: %s: %s\n", infile,
		    gdbm_strerror(gdbm_errno));
		die(1);
	}
	tsdbf = gdbm_open(tsfile, 0, GDBM_NEWDB | GDBM_FAST, 0644, NULL);
	if (tsdbf == NULL) {
		errorlog("gdbm_open: %s: %s\n", tsfile,
		    gdbm_strerror(gdbm_errno));
		die(1);
	}
	if (gdbm_setopt(tsdbf, GDBM_CACHESIZE, &cache_size, sizeof(int)))
		    Log("WARNING: Cannot reset GDBM cache size to %d.\n",
		    cache_size);

	mddbf = gdbm_open(mdfile, 0, GDBM_NEWDB | GDBM_FAST, 0644, NULL);
	if (mddbf == NULL) {
		errorlog("gdbm_open: %s: %s\n", mdfile,
		    gdbm_strerror(gdbm_errno));
		die(1);
	}
	if (gdbm_setopt(mddbf, GDBM_CACHESIZE, &cache_size, sizeof(int)))
		    Log("WARNING: Cannot reset GDBM cache size to %d.\n",
		    cache_size);

	/*
	 *  Extract the (URL, Template) from indbf, then extract the
	 *  timestamp from Template, then build the (Timestamp, URL)
	 *  in the tsdbf.
	 */
	k = gdbm_firstkey(indbf);
	while (k.dptr) {
		d = gdbm_fetch(indbf, k);
		if (d.dptr == NULL) {
			errorlog("gdbm_fetch: %s: %s\n", infile,
			    gdbm_strerror(gdbm_errno));
			die(1);
		}
		init_parse_template_string(d.dptr, d.dsize);
		template = parse_template();
		finish_parse_template();
		if (template == NULL) {
			Log("WARNING: %s is not parseable.\n", k.dptr);
			goto next_item;
		}
		avp = extract_AVPair(template->list, T_UPDATE);
		if (avp == NULL) {
			errorlog("%s not in template %s\n", T_UPDATE,
			    template->url);
			die(1);
		}
		/* Store URL->timestamp mapping in the INDEX */
		newd.dptr = xmalloc(avp->vsize + 1);
		memcpy(newd.dptr, avp->value, avp->vsize);
		newd.dptr[avp->vsize] = '\0';
		newd.dsize = avp->vsize + 1;	/* include \0 */
		if (gdbm_store(tsdbf, k, newd, GDBM_INSERT)) {
			errorlog("gdbm_store: %s: %s\n", tsfile,
			    gdbm_strerror(gdbm_errno));
			die(1);
		}
		xfree(newd.dptr);

		avp = extract_AVPair(template->list, T_MD5);
		if (avp != NULL) {
			/* Store URL->timestamp mapping in the INDEX */
			newd.dptr = xmalloc(avp->vsize + 1);
			memcpy(newd.dptr, avp->value, avp->vsize);
			newd.dptr[avp->vsize] = '\0';
			newd.dsize = avp->vsize + 1;	/* include \0 */
			if (gdbm_store(mddbf, k, newd, GDBM_INSERT)) {
				errorlog("gdbm_store: %s: %s\n", mdfile,
				    gdbm_strerror(gdbm_errno));
				die(1);
			}
			xfree(newd.dptr);
		}
	      next_item:
		nextkey = gdbm_nextkey(indbf, k);
		free(k.dptr);
		free(d.dptr);
		free_template(template);
		k = nextkey;
	}
	die(0);
}
