: # *-*-perl-*-*
    eval 'exec perl -S $0 "$@"'
    if $running_under_some_shell;  
#
#  Gatherer - Main interface to the Gatherer.  Parses the configuration file
#  and starts up the gathering process.
#
#  Usage: Gatherer [-manual | -export | -debug | -background] file.cf
#
#  Darren Hardy, hardy@cs.colorado.edu, July 1994
#
#  $Id: Gatherer,v 1.54 1995/11/07 22:09:14 duane Exp $
#
#######################################################################
#
#  Copyright (c) 1994, 1995.  All rights reserved.
#  
#    The Harvest software was developed by the Internet Research Task
#    Force Research Group on Resource Discovery (IRTF-RD):
#  
#          Mic Bowman of Transarc Corporation.
#          Peter Danzig of the University of Southern California.
#          Darren R. Hardy of the University of Colorado at Boulder.
#          Udi Manber of the University of Arizona.
#          Michael F. Schwartz of the University of Colorado at Boulder.
#          Duane Wessels of the University of Colorado at Boulder.
#  
#    This copyright notice applies to software in the Harvest
#    ``src/'' directory only.  Users should consult the individual
#    copyright notices in the ``components/'' subdirectories for
#    copyright information about other software bundled with the
#    Harvest source code distribution.
#  
#  TERMS OF USE
#    
#    The Harvest software may be used and re-distributed without
#    charge, provided that the software origin and research team are
#    cited in any use of the system.  Most commonly this is
#    accomplished by including a link to the Harvest Home Page
#    (http://harvest.cs.colorado.edu/) from the query page of any
#    Broker you deploy, as well as in the query result pages.  These
#    links are generated automatically by the standard Broker
#    software distribution.
#    
#    The Harvest software is provided ``as is'', without express or
#    implied warranty, and with no support nor obligation to assist
#    in its use, correction, modification or enhancement.  We assume
#    no liability with respect to the infringement of copyrights,
#    trade secrets, or any patents, and are not responsible for
#    consequential damages.  Proper use of the Harvest software is
#    entirely the responsibility of the user.
#  
#  DERIVATIVE WORKS
#  
#    Users may make derivative works from the Harvest software, subject 
#    to the following constraints:
#  
#      - You must include the above copyright notice and these 
#        accompanying paragraphs in all forms of derivative works, 
#        and any documentation and other materials related to such 
#        distribution and use acknowledge that the software was 
#        developed at the above institutions.
#  
#      - You must notify IRTF-RD regarding your distribution of 
#        the derivative work.
#  
#      - You must clearly notify users that your are distributing 
#        a modified version and not the original Harvest software.
#  
#      - Any derivative product is also subject to these copyright 
#        and use restrictions.
#  
#    Note that the Harvest software is NOT in the public domain.  We
#    retain copyright, as specified above.
#  
#  HISTORY OF FREE SOFTWARE STATUS
#  
#    Originally we required sites to license the software in cases
#    where they were going to build commercial products/services
#    around Harvest.  In June 1995 we changed this policy.  We now
#    allow people to use the core Harvest software (the code found in
#    the Harvest ``src/'' directory) for free.  We made this change
#    in the interest of encouraging the widest possible deployment of
#    the technology.  The Harvest software is really a reference
#    implementation of a set of protocols and formats, some of which
#    we intend to standardize.  We encourage commercial
#    re-implementations of code complying to this set of standards.  
#  
#
#  Fall back to the stock install location when HARVEST_HOME is not set.
unless (defined($ENV{'HARVEST_HOME'})) {
	$ENV{'HARVEST_HOME'} = "/usr/local/harvest";
}
#  Append the Harvest program directories to the search path; the rest of
#  this script runs programs such as essence and prepurls by bare name.
$ENV{'PATH'} = join(":", $ENV{'PATH'},
			 "$ENV{'HARVEST_HOME'}/bin",
			 "$ENV{'HARVEST_HOME'}/lib/gatherer",
			 "$ENV{'HARVEST_HOME'}/lib");
$debug = 0;		# turned on by the -debug flag

#  usage - Prints the one-line synopsis on stderr and terminates the
#  program with exit status 1.  Never returns.
sub usage {
	local($synopsis) = "Usage: Gatherer [options] config-file\n";
	print STDERR $synopsis;
	exit(1);
}

#
#  Establish the built-in defaults.  Every path default hangs off the
#  Top-Directory, which starts out as the directory reported by pwd.
#
$cdir = &grab_cmd_output("pwd");
chop($cdir);				# drop the trailing newline
%vals = (
	"Top-Directory",	$cdir,
	"Data-Directory",	"$cdir/data",
	"Working-Directory",	"$cdir/tmp",
	"Log-File",		"$cdir/log.gatherer",
	"Errorlog-File",	"$cdir/log.errors",

	"Gatherer-Port",	"8000",
	"Gatherd-Inetd",	"no",
);
$automatic     = 1;		# cleared by -manual
$setupdone     = 0;		# set once startup_prepurls has run
$do_export     = 0;		# set by -export
$do_background = 0;		# set by -background
#  Leading flags, meant for those who know what they're doing.  Each
#  recognized flag is consumed and the following argument becomes the
#  candidate configuration file name; an unknown flag ends in usage().
$configfile = shift(@ARGV);
while ($configfile =~ /^-/) {
	if    ($configfile eq "-debug")      { $debug = 1; }
	elsif ($configfile eq "-manual")     { $automatic = 0; }
	elsif ($configfile eq "-export")     { $do_export = 1; }
	elsif ($configfile eq "-background") { $do_background = 1; }
	else                                 { &usage(); }	# exits
	$configfile = shift(@ARGV);
}

#  Exactly one argument -- the configuration file -- may follow the flags.
&usage() if (@ARGV);			# still args left?
&usage() if ($configfile eq "");	# configfile bogus?
$| = 1 if ($debug);			# unbuffered stdout while debugging

#  Attribute names accepted in "Name: value" lines of the configuration
#  file.  The order below is the order in which they are tried.
@tags = (
	'Data-Directory',	'Debug-Options',	'Essence-Options',
	'Gatherer-Options',	'Gatherd-Inetd',	'Gatherer-Host',
	'Gatherer-Name',	'Gatherer-Port',	'Gatherer-Version',
	'HTTP-Basic-Auth',	'FTP-Auth',		'HTTP-Proxy',
	'Keep-Cache',		'Lib-Directory',	'Local-Mapping',
	'Log-File',		'Errorlog-File',	'Post-Summarizing',
	'Refresh-Rate',		'Time-To-Live',		'Top-Directory',
	'Working-Directory',
);


if ($debug) {
	print "Data Directory is ", $vals{"Data-Directory"}, "\n";
	print "PATH is $ENV{'PATH'}\n";
}

#
#  Looks like this:  Gatherer 
#			|-> prepurls -> essence
#				|-> enum & staturl
#
#  Read in the configuration of the gatherer.  Comment lines and blank
#  lines are skipped.  A <RootNodes> or <LeafNodes> line hands the section
#  to its own reader (or, with -export, ends the scan).  Every other line
#  must be a "Tag: value" pair naming one of @tags; anything else draws a
#  warning.
#
open(CONFIG, "$configfile") || 
	die "Gatherer: Cannot read configuration file: $configfile: $!\n";
while (<CONFIG>) {
	$recog = 0;
	next if (/^#/o);
	next if (/^\s+$/o);
	last if ($do_export && (/^<RootNodes>/io || /^<LeafNodes>/io));
	# Strip only a trailing newline.  A blind chop would eat the last
	# real character of a final line that has no newline.
	s/\n$//;
	&process_rootnodes(), next if (/^<RootNodes>/io);
	&process_leafnodes(), next if (/^<LeafNodes>/io);
	foreach $tag (@tags) {
		if (/^$tag:\s+(.*)$/) {
			$vals{$tag} = $1;
			if ($tag eq "Local-Mapping") {
				# value is "URL-prefix local-path"
				($url, $path) = split(/\s+/, $vals{$tag});
				$mapping{$url} = $path;
			}
			if ($tag eq "HTTP-Basic-Auth") {
				# may be given several times; one entry per line
				$HTTPAuth .= 'Basic ' . $vals{$tag} . "\n";
			}
			if ($tag eq "FTP-Auth") {
				$FTPAuth .= $vals{$tag} . "\n";
			}
			if ($tag eq "Top-Directory") {
				# A new Top-Directory re-derives the path
				# defaults, overwriting any values set earlier.
				&init_dir("Top-Directory", 1);
				$vals{"Data-Directory"} = 	$vals{"Top-Directory"} . "/data";
				$vals{"Working-Directory"} = 	$vals{"Top-Directory"} . "/tmp";
				$vals{"Log-File"} = 		$vals{"Top-Directory"} . "/log.gatherer";
				$vals{"Errorlog-File"} = 	$vals{"Top-Directory"} . "/log.errors";
			}
			$recog = 1;
			last;
		}
	}
	print "WARNING: Unrecognized line: $_\n" if (!$recog);
}
close(CONFIG);
close(URL) if ($setupdone);	# must close URL for process to stop
chdir($vals{'Top-Directory'}) || 
	die "Gatherer: Cannot chdir to $vals{'Top-Directory'}: $!";
if ($setupdone) {
	# actually run the Gatherer
	&run_system("/bin/csh -f $gcmd 2>> $vals{'Errorlog-File'}");
	unlink($gcmd) if ($debug == 0);
	unlink($gcmdinput) if ($debug == 0);
} elsif ($debug) {
	# No node section was processed (-export, or none in the file), so
	# startup_prepurls never wrote a gather script.  Running csh with an
	# empty script name would make it read commands from stdin.
	print "Gatherer: no gather script was prepared; skipping the gather run\n";
}
&install_gatherer() if ($automatic == 1);
exit(0);		# END OF PROGRAM


#
#  process_rootnodes - Reads the body of a <RootNodes> section from the
#  already-open CONFIG handle, up to the closing </RootNodes> line, and
#  writes one "ROOT" record per root URL to the URL handle.
#
#  Each entry is "URL [option ...]" and may be continued over several
#  lines with a trailing backslash.  An entry whose first word starts with
#  "|" names a program instead: the rest of that word is run and every
#  line it prints is used as a root URL with the entry's options.
#
#  Takes no arguments.  Runs startup_prepurls first if it has not been
#  run yet.  Dies if a generator program cannot be started.
#
sub process_rootnodes {
	&startup_prepurls() if (!$setupdone);
	while (<CONFIG>) {
		s/\n$//;	# strip the newline only, never a real character
		next if (/^#/o);
		last if (/^<\/RootNodes>/io);

		# Join continuation lines.  The test looks at the last
		# character of the line itself; the earlier $#_ form is the
		# last index of @_ and matched only when no arguments were
		# passed to this sub.
		while (/\\$/) {
			chop($_);			# remove the backslash
			$nextline = <CONFIG>;
			last unless (defined($nextline));	# EOF mid-entry
			$nextline =~ s/\n$//;
			$_ .= $nextline;
		}

		($rooturl, @options) = split;
		&set_defaults();		# options are per entry
		&parse_options(@options);
		next if ($rooturl =~ /^\s*$/io);	# empty rooturl
		if ($rooturl =~ /^\|(.*)$/) {		# generate URLs from pgm
			$pgm = $1;
			die "$pgm: $!\n" unless open (PGM, "$pgm|");
			while (<PGM>) {
				s/\n$//;
				$rootargs = "$_ $urlmax $urlfilter $hostmax $hostfilter $delay $depth $accesstypes $enumeratepgm";
				print URL "ROOT\t$rootargs\n";
				print "ROOT\t$rootargs\n" if ($debug);
			}
			close PGM;
		} else {
			$rootargs = "$rooturl $urlmax $urlfilter $hostmax $hostfilter $delay $depth $accesstypes $enumeratepgm";
			print URL "ROOT\t$rootargs\n";
			print "ROOT\t$rootargs\n" if ($debug);
		}
	}
}

#
#  process_leafnodes - Copies the body of a <LeafNodes> section from
#  CONFIG to the URL handle, one "LEAF" record per line, stopping at the
#  closing </LeafNodes> line.  A line starting with "|" is run as a
#  program and each line of its output becomes a LEAF record instead.
#  Lines are passed through with their newline intact.
#
sub process_leafnodes {
	&startup_prepurls() unless ($setupdone);
	while (<CONFIG>) {
		next if (/^#/o);
		last if (/^<\/LeafNodes>/io);

		unless (/^\|(.*)$/) {
			# ordinary URL line
			print URL "LEAF\t$_";
			print "LEAF\t$_" if ($debug);
			next;
		}

		# generate URLs from pgm
		$pgm = $1;
		open (PGM, "$pgm|") || die "$pgm: $!\n";
		while (<PGM>) {
			print URL "LEAF\t$_";
			print "LEAF\t$_" if ($debug);
		}
		close PGM;
	}
}

#
#  init_dir - Prepares the directory named by $vals{KEY}.
#
#  Arguments: KEY, the %vals key holding the directory name, and a flag
#  asking for a writability check.
#
#  A relative name (with any leading "./" removed) is made absolute by
#  prefixing $cdir, and the result is stored back into %vals.  The
#  directory is created with mode 0755 if it does not exist.  When the
#  flag is set, a scratch file is created and removed inside it; if that
#  fails the directory is chmod'ed to 0755.  Dies when mkdir or chmod
#  fails.  Does nothing if $vals{KEY} is undefined.
#
sub init_dir {
	local($key, $need_write) = @_;
	local($dir);

	return unless (defined($vals{$key}));

	$dir = $vals{$key};
	unless ($dir =~ /^\//) {
		$dir = $1 if ($dir =~ /^\.\/(.*)$/);
		$dir = $cdir . "/" . $dir;
	}
	$vals{$key} = $dir;

	print "init_dir($dir)\n" if ($debug);
	unless (-d $dir) {
		mkdir($dir, 0755) || die "Gatherer: mkdir: $dir: $!\n";
	}

	return unless ($need_write);

	# Probe writability with a throw-away file.
	if (open (TEST, ">$dir/.write_test")) {
		close TEST;
		unlink "$dir/.write_test";
		return;
	}

	chmod(0755, $dir) ||
		die "Gatherer: Unable to make directory writable.\n\tchmod: $dir: $!\n";
}

#
#  init_essence - Builds the essence command line in $essencecmd from the
#  settings in %vals.  Settings that are undefined contribute nothing.
#  Also exports HARVEST_GATHERER_LOGFILE (from Log-File) and HARVEST_DEBUG
#  (from Debug-Options) when those settings are defined.
#
sub init_essence {
	local($cfgkey, $switch, $q);
	# setting name, essence switch, quote character put around the value
	local(@plan) = (
		"Data-Directory",	"--dbdir",		"",
		"Working-Directory",	"--tmpdir",		"",
		"Lib-Directory",	"--libdir",		"",
		"Log-File",		"--log",		"",
		"Gatherer-Host",	"--gatherer-host",	"'",
		"Gatherer-Name",	"--gatherer-name",	"'",
		"Gatherer-Version",	"--gatherer-version",	"'",
		"Post-Summarizing",	"--post-process",	"'",
		"Refresh-Rate",		"--default-refresh",	"'",
		"Time-To-Live",		"--default-ttl",	"'",
	);

	$essencecmd = "essence";
	while (@plan) {
		($cfgkey, $switch, $q) = splice(@plan, 0, 3);
		next unless (defined($vals{$cfgkey}));
		$essencecmd .= " " . $switch . " " . $q . $vals{$cfgkey} . $q;
	}

	# Free-form option strings are appended verbatim, after the switches.
	if (defined($vals{"Essence-Options"})) {
		$essencecmd .= " " . $vals{"Essence-Options"};
	}
	if (defined($vals{"Debug-Options"})) {
		$essencecmd .= " " . $vals{"Debug-Options"};
	}

	$ENV{'HARVEST_GATHERER_LOGFILE'} = $vals{"Log-File"}
		if (defined($vals{"Log-File"}));
	$ENV{'HARVEST_DEBUG'} = $vals{"Debug-Options"}
		if (defined($vals{"Debug-Options"}));

	$essencecmd .= " --verbose" . " -f -";
}

#
#  startup_prepurls - One-time setup done before the first URL is queued.
#
#  Prepares the Lib, Data and Working directories, writes placeholder
#  index.html files, writes the local-mapping and authentication files
#  and exports their locations through HARVEST_* environment variables.
#  It then builds the prepurls and essence command lines and writes the
#  csh script ($gcmd) that the main program runs.  Finally it opens the
#  URL handle on $gcmdinput, the file the script feeds to prepurls, and
#  sets $setupdone.
#
#  Takes no arguments.  Dies if the mapping, auth, script or input file
#  cannot be written.
#
sub startup_prepurls {
	&init_dir('Lib-Directory', 0)			# dont test writable
		if (defined($vals{'Lib-Directory'}));
	&init_dir('Data-Directory', 1)
		if (defined($vals{'Data-Directory'}));
	&init_dir('Working-Directory', 1)
		if (defined($vals{'Working-Directory'}));

	# Placeholder pages; best effort, so a failed open is not fatal.
	foreach $f ("$vals{'Data-Directory'}/index.html", "$vals{'Working-Directory'}/index.html") {
		if (open(INDEXHTML, "> $f")) {
			print INDEXHTML <<EOM;
<html>
Please use the Harvest Gatherer's interface to retrieve these files.
</html>
EOM
			close(INDEXHTML);
		}
		chmod(0644, $f);
	}
	$ENV{'TMPDIR'} = $vals{"Working-Directory"};

	# Test the hash itself for entries: defined(%hash) is rejected
	# outright by current Perl releases.
	if (%mapping) {
		$tfile = "$vals{'Working-Directory'}/localmap.cf";
		open(MAPPING, "> $tfile") || 
			die "Gatherer: Cannot write Mapping: $tfile: $!\n";
		foreach $k (sort keys %mapping) {
			print MAPPING "$k\t$mapping{$k}\n";
		}
		close(MAPPING);
		$ENV{'HARVEST_URL_LOCAL_MAPPINGS'} = $tfile;
	} else {
		$ENV{'HARVEST_URL_LOCAL_MAPPINGS'} = "/dev/null";
	}

	if (defined($HTTPAuth)) {
		$tfile = "$vals{'Working-Directory'}/HTTPAuth.cf";
		open(AUTH, "> $tfile") || 
			die "Gatherer: Cannot write Auth file: $tfile: $!\n";
		print AUTH $HTTPAuth;
		close(AUTH);
		$ENV{'HARVEST_HTTP_AUTHENTICATIONS'} = $tfile;
	} else {
		$ENV{'HARVEST_HTTP_AUTHENTICATIONS'} = '/dev/null';
	}

	if (defined($FTPAuth)) {
		$tfile = "$vals{'Working-Directory'}/FTPAuth.cf";
		open(AUTH, "> $tfile") || 
			die "Gatherer: Cannot write Auth file: $tfile: $!\n";
		print AUTH $FTPAuth;
		close(AUTH);
		$ENV{'HARVEST_FTP_AUTHENTICATIONS'} = $tfile;
	} else {
		$ENV{'HARVEST_FTP_AUTHENTICATIONS'} = '/dev/null';
	}

	# prepurls is given one quoted command for leaf URLs (staturl) and
	# one for root URLs (enum); the closing quote of the enum command is
	# added together with its -db argument below.
	$proddb = $vals{"Data-Directory"} . "/PRODUCTION.gdbm";
	$indexdb = $vals{"Data-Directory"} . "/INDEX.gdbm";
	$mddb = $vals{"Data-Directory"} . "/MD5.gdbm";
	$prepcmd = "prepurls ";
	$prepcmd .= " --leaf 'staturl ";
	$prepcmd .= " $vals{'Debug-Options'} " . "'";
	$prepcmd .= " --root 'enum";
	$prepcmd .= " $vals{'Debug-Options'} ";
	$prepcmd .= " -tmpdb " . $vals{"Working-Directory"} .  "/tmpdb.gdbm";
	$prepcmd .= " -log " . $vals{"Log-File"};
	if (-r $proddb) {
		$prepcmd .= " -db " . $proddb . "'";
	} else {
		$prepcmd .= " -db /dev/null'";
	}
	&init_essence();

	#
	#  Create a script that will run Essence and the rest of the
	#  programs needed to gather.  This gets around a Solaris 2.3 bug
	#  when trying to use fork/exec to do this.
	#
	$gcmd = $vals{"Working-Directory"} . "/gathercmd.$$";
	$gcmdinput = $vals{"Working-Directory"} . "/gatherinput.$$";
	open(GCMD, "> $gcmd") || die "Gatherer: Cannot write $gcmd: $!\n";
	print GCMD "#\n#  This is the command to run the Gatherer\n#\n";
	# Only the HARVEST* and TMPDIR* variables are copied into the script.
	foreach $k (sort keys %ENV) {
		next if ($k !~ /^(HARVEST|TMPDIR)/o);
		print GCMD "setenv $k '$ENV{$k}'\n";
	}
	$okpath = $ENV{'PATH'};
	$okpath =~ s/:/ /g;		# csh path is a space-separated list
	print GCMD "set path = ( $okpath )\n";
	print GCMD "set clobber\n";
	print GCMD "set noglob\n";
	print GCMD "\n";

	if ($vals{'HTTP-Proxy'} eq "") {
		print GCMD "unsetenv http_proxy\n";
	} else {
		print GCMD "setenv http_proxy http://$vals{'HTTP-Proxy'}/\n";
	}

	# Expire objects from the URL cache
	print GCMD "urlpurge < /dev/null\n";

	# Expire objects from the production database 
	print GCMD <<EOM;
if (-r $proddb) then
	chmod +w $proddb
	expiredb -log $vals{'Log-File'} $proddb
	if (\$status == 1) then
		# expiredb expired some objects, rebuild index
		chmod -w $proddb
		rm -f $indexdb $mddb
		mkindex $proddb $indexdb $mddb
		chmod 444 $indexdb $mddb
	else
		chmod -w $proddb
	endif
endif
EOM

	# Run the Gatherer
	print GCMD "\n";
	print GCMD "/bin/cat $gcmdinput" . " |  \\\n";
	print GCMD $prepcmd . " |  \\\n";
	print GCMD $essencecmd . "\n";
	print GCMD "\n";
	# Cleanup commands are left out of the script when debugging.
	unless ($debug) {
		unless ($vals{'Keep-Cache'} =~ /^y.*/io) {
			print GCMD "if (\$status == 0) then\n";
			print GCMD "\trm -rf $vals{'Working-Directory'}/cache-liburl/\n";
			print GCMD "endif\n";
			print GCMD "\n";
		}
		print GCMD "rm -f $vals{'Working-Directory'}/localmap.cf\n";
		print GCMD "rm -f $vals{'Working-Directory'}/HTTPAuth.cf\n";
		print GCMD "rm -f $vals{'Working-Directory'}/FTPAuth.cf\n";
	}
	print GCMD "exit 0\n";
	close(GCMD);

	open(URL, "> $gcmdinput") || 
		die "Gatherer: Cannot write $gcmdinput: $!\n";

	$setupdone = 1;
}

#
#  install_gatherer - Run after the gather step.  Writes a small /bin/sh
#  script that runs folddb on the Data-Directory and then starts gatherd
#  (or /bin/true when Gatherd-Inetd is "yes"), and executes it -- in the
#  background when -background was given.  A default gatherd.cf access
#  list is created in the Data-Directory if none is readable there.
#
sub install_gatherer {
	# Gatherer identifier "host:port"; the host comes from the
	# configuration or else from the resolved local host name.
	if (defined($vals{'Gatherer-Host'})) {
		$gid = "$vals{'Gatherer-Host'}:$vals{'Gatherer-Port'}";
	} else {
		$h = &grab_cmd_output("hostname");
		chop($h);
		($fullh) = gethostbyname($h);
		$fullh = $h if ($fullh eq "");		# lookup failed
		$gid = "$fullh:$vals{'Gatherer-Port'}";
	}
	$folddbcmd = "folddb $vals{'Gatherer-Options'} \"$gid\" $vals{'Data-Directory'}";

	# prepare the access control list if needed
	$gatherdcf = "$vals{'Data-Directory'}/gatherd.cf";
	unless (-r $gatherdcf) {
		open(GCF, "> $gatherdcf") || 
			die "Gatherer: Cannot create $gatherdcf: $!\n";
		print GCF "#\n",
			  "#  gatherd.cf - Access Control List for gatherd\n",
			  "#\n",
			  "Allow all\n";
		close(GCF);
	}

	$gatherdcmd = ($vals{"Gatherd-Inetd"} eq "yes")
		? "/bin/true"		# ignore it
		: "gatherd -d $vals{'Data-Directory'} $vals{'Gatherer-Port'}";

	$ecmd = "$vals{'Working-Directory'}/exportcmd.$$";
	open(ECMD, "> $ecmd") || die "Cannot write $ecmd: $!\n";
	print ECMD "#!/bin/sh\n", "$folddbcmd\n", "$gatherdcmd\n";
	close(ECMD);
	chmod(0755, $ecmd) || die "Cannot chmod $ecmd: $!\n";

	if ($do_background) {
		# the script is left in place; it may still be running
		&run_system("$ecmd &");
	} else {
		&run_system("$ecmd");
		unlink($ecmd) unless ($debug);
	}
}

#  run_system - Hands COMMAND to system(), echoing it on stdout first
#  when debugging.  Returns whatever system() returns.
sub run_system {
	local($command) = @_;
	if ($debug) {
		print "RUNNING: $command\n";
	}
	return system($command);
}

#
#  grab_cmd_output - Runs COMMAND and returns the first line of its
#  standard output, newline included.  Returns undef if the command
#  printed nothing and the string "none" if it could not be started.
#  The result is also left in the global $the_var.
#
#  Backticks are avoided because they did not work with Perl 4.036 on
#  Solaris 2.3. -Darren.  The output is read through a piped open, as
#  the node-section readers in this file already do, rather than through
#  a scratch file: /tmp/cmdoutput.<pid> is a predictable name in a
#  world-writable directory, so the shell redirection into it could be
#  pointed at another file by a pre-planted symlink.
#
sub grab_cmd_output {
	local($the_cmd) = @_;
	local(@lines);
	undef $the_var;
	open(CMDOUT, "$the_cmd |") || return "none";
	@lines = <CMDOUT>;	# read it all so the command runs to completion
	close(CMDOUT);
	$the_var = $lines[0];
	return $the_var;
}

#
#  set_defaults - Resets the per-RootNode option globals to their
#  defaults.  The URL filter defaults to the URL-filter-default file
#  under $HARVEST_HOME/lib/gatherer when that file exists, and to
#  /dev/null otherwise.
#
sub set_defaults {
	local($stock_filter) = "$ENV{'HARVEST_HOME'}/lib/gatherer/URL-filter-default";

	($urlmax, $hostmax, $delay, $depth) = (250, 1, 1, 0);
	$urlfilter = (-f $stock_filter) ? $stock_filter : "/dev/null";
	$hostfilter = "/dev/null";
	$accesstypes  = "HTTP";		# maybe "HTTP|FTP|Gopher" ?
	$enumeratepgm = "/bin/false";
}

#
#  parse_options - Applies the "Name=value" options of one RootNode entry
#  to the option globals.  Names are matched without regard to case:
#
#	URL=max[,filter]	$urlmax, $urlfilter
#	Host=max[,filter]	$hostmax, $hostfilter  (Site= is a synonym)
#	Access=types		$accesstypes
#	Delay=n			$delay
#	Depth=n			$depth
#	Enumeration=pgm		$enumeratepgm
#
#  Anything else is reported on stderr as an illegal option and skipped.
#
sub parse_options {
	local(@wanted) = @_;
	local($option);

	foreach $option (@wanted) {
		if ($option =~ /^URL=(\d+)(,(\S+))?/i) {
			$urlmax = $1;
			$urlfilter = $3 if (defined($3));
		} elsif ($option =~ /^(Host|Site)=(\d+)(,(\S+))?/i) {
			$hostmax = $2;
			$hostfilter = $4 if (defined($4));
		} elsif ($option =~ /^Access=(.*)/i) {
			$accesstypes = $1;
		} elsif ($option =~ /^Delay=(\d+)/i) {
			$delay = $1;
		} elsif ($option =~ /^Depth=(\d+)/i) {
			$depth = $1;
		} elsif ($option =~ /^Enumeration=(\S+)/i) {
			$enumeratepgm = $1;
		} else {
			print STDERR "Illegal Option: $option\n";
		}
	}
}
