: # *-*-perl-*-*
    eval 'exec perl -S $0 "$@"'
    if $running_under_some_shell;  
#
#  Gatherer - Main interface to the Gatherer.  Parses the configuration file
#  and starts up the gathering process.
#
#  Usage: Gatherer [-manual | -export | -debug] file.cf
#
#  Darren Hardy, hardy@cs.colorado.edu, July 1994
#
#  $Id: Gatherer,v 1.33 1995/03/29 05:36:55 hardy Exp $
#
#######################################################################
#
#  Copyright (c) 1994, 1995.  All rights reserved.
#  
#          Mic Bowman of Transarc Corporation.
#          Peter Danzig of the University of Southern California.
#          Darren R. Hardy of the University of Colorado at Boulder.
#          Udi Manber of the University of Arizona.
#          Michael F. Schwartz of the University of Colorado at Boulder. 
#          Duane Wessels of the University of Colorado at Boulder. 
#  
#  This copyright notice applies to all code in Harvest other than
#  subsystems developed elsewhere, which contain other copyright notices
#  in their source text.
#  
#  The Harvest software was developed by the Internet Research Task
#  Force Research Group on Resource Discovery (IRTF-RD).  The Harvest
#  software may be used for academic, research, government, and internal
#  business purposes without charge.  If you wish to sell or distribute
#  the Harvest software to commercial clients or partners, you must
#  license the software.  See
#  http://harvest.cs.colorado.edu/harvest/copyright,licensing.html#licensing.
#  
#  The Harvest software is provided ``as is'', without express or
#  implied warranty, and with no support nor obligation to assist in its
#  use, correction, modification or enhancement.  We assume no liability
#  with respect to the infringement of copyrights, trade secrets, or any
#  patents, and are not responsible for consequential damages.  Proper
#  use of the Harvest software is entirely the responsibility of the user.
#  
#  For those who are using Harvest for non-commercial purposes, you may
#  make derivative works, subject to the following constraints:
#  
#  - You must include the above copyright notice and these accompanying 
#    paragraphs in all forms of derivative works, and any documentation 
#    and other materials related to such distribution and use acknowledge 
#    that the software was developed at the above institutions.
#  
#  - You must notify IRTF-RD regarding your distribution of the 
#    derivative work.
#  
#  - You must clearly notify users that your are distributing a modified 
#    version and not the original Harvest software.
#  
#  - Any derivative product is also subject to the restrictions of the 
#    copyright, including distribution and use limitations.
#
#
#  Locate the Harvest installation (falling back to /usr/local/harvest)
#  and append its program directories to the command search path.
#
if (!defined($ENV{'HARVEST_HOME'})) {
	$ENV{'HARVEST_HOME'} = "/usr/local/harvest";
}
$ENV{'PATH'} = join(":", $ENV{'PATH'},
			 "$ENV{'HARVEST_HOME'}/bin",
			 "$ENV{'HARVEST_HOME'}/lib/gatherer",
			 "$ENV{'HARVEST_HOME'}/lib");
($debug, $verbose) = (0, 1);	# -debug on the command line turns $debug on

#
#  usage - Prints the command synopsis on STDERR and terminates the
#  program with exit status 1.  Never returns.
#
sub usage {
	local($synopsis) = "Usage: Gatherer [-manual | -export | -debug] config-file\n";

	print STDERR $synopsis;
	exit(1);
}

#
#  Establish the default settings.  Every default location is derived
#  from the Top-Directory, which starts out as the current directory.
#
chop($cdir = &grab_cmd_output("pwd"));
undef %vals;
%vals = (
	"Top-Directory",	$cdir,
	"Data-Directory",	"$cdir/data",
	"Working-Directory",	"$cdir/tmp",
	"Log-File",		"$cdir/log.gatherer",
	"Errorlog-File",	"$cdir/log.errors",
	"Gatherer-Port",	"8000",
	"Gatherd-Inetd",	"no",
);
($automatic, $setupdone, $do_export) = (1, 0, 0);

#
#  Command line: optional flags (voodoo for those who know what they
#  are doing) followed by exactly one configuration file name.
#
$configfile = shift(@ARGV);
while ($configfile =~ /^-/) {
	if ($configfile eq "-debug") {
		$debug = 1;
	} elsif ($configfile eq "-manual") {
		$automatic = 0;
	} elsif ($configfile eq "-export") {
		$do_export = 1;
	} else {
		&usage();		# unknown flag; usage() exits
	}
	$configfile = shift(@ARGV);
}

#  Reject leftover arguments and a missing configuration file name.
if ($#ARGV >= 0) {
	&usage();
}
if ($configfile eq "") {
	&usage();
}

#  Unbuffered output when debugging
if ($debug) {
	$| = 1;
}

#  Valid tags for attribute-value pairs in configuration files
@tags = (
	"Data-Directory",
	"Debug-Options",
	"Essence-Options",
	"Gatherer-Options",
	"Gatherd-Inetd",
	"Gatherer-Host",
	"Gatherer-Name",
	"Gatherer-Port",
	"Gatherer-Version",
	"HTTP-Proxy",
	"Lib-Directory",
	"Local-Mapping",
	"Log-File",
	"Errorlog-File",
	"Top-Directory",
	"Working-Directory",
);


if ($debug) {
	print "Data Directory is $vals{'Data-Directory'}\n";
	print "PATH is $ENV{'PATH'}\n";
}

#
#  Looks like this:  Gatherer 
#			|-> prepurls -> essence
#				|-> enum & staturl
#
#  Read in the configuration of the gatherer.  Comment and blank lines
#  are skipped; <RootNodes> and <LeafNodes> sections are handed to their
#  own readers (which consume lines up to the closing tag); every other
#  line must be a "Tag: value" pair whose tag is listed in @tags.
#  With -export, reading stops at the first node section, so no
#  gathering is set up.
#
open(CONFIG, "$configfile") || 
	die "Gatherer: Cannot read configuration file: $configfile: $!\n";
while (<CONFIG>) {
	$recog = 0;
	next if (/^#/o);
	next if (/^\s+$/o);
	last if ($do_export && (/^<RootNodes>/io || /^<LeafNodes>/io));
	# Strip only the line terminator; a plain chop would eat the last
	# real character of a final line that has no trailing newline.
	s/\n$//;
	&process_rootnodes(), next if (/^<RootNodes>/io);
	&process_leafnodes(), next if (/^<LeafNodes>/io);
	foreach $tag (@tags) {
		if (/^$tag:\s+(.*)$/) {
			$vals{$tag} = $1;
			if ($tag eq "Local-Mapping") {
				# value is "URL-prefix local-path"
				($url, $path) = split(/\s+/, $vals{$tag});
				$mapping{$url} = $path;
			}
			if ($tag eq "Top-Directory") {
				# A new Top-Directory resets every location
				# that is derived from it.
				&init_dir("Top-Directory");
				$vals{"Data-Directory"} = 	$vals{"Top-Directory"} . "/data";
				$vals{"Working-Directory"} = 	$vals{"Top-Directory"} . "/tmp";
				$vals{"Log-File"} = 		$vals{"Top-Directory"} . "/log.gatherer";
				$vals{"Errorlog-File"} = 	$vals{"Top-Directory"} . "/log.errors";
			}
			$recog = 1;
			last;
		}
	}
	print "WARNING: Unrecognized line: $_\n" if (!$recog);
}
close(CONFIG);
close(URL);		# must close URL for process to stop
chdir($vals{'Top-Directory'}) || 
	die "Gatherer: Cannot chdir to $vals{'Top-Directory'}: $!";
#
#  The gather script only exists if startup_prepurls() ran, i.e. a node
#  section was processed.  Without it (-export, or a configuration with
#  no node sections) $gcmd is undefined and "csh -f" with no script
#  would read its commands from standard input, so skip the run.
#
if ($setupdone) {
	&run_system("/bin/csh -f $gcmd 2>> $vals{'Errorlog-File'}");	# actually run the Gatherer
	unlink($gcmd) if ($debug == 0);
	unlink($gcmdinput) if ($debug == 0);
}
&install_gatherer() if ($automatic == 1);
exit(0);		# END OF PROGRAM


#
#  process_rootnodes - Reads the body of a <RootNodes> section from
#  CONFIG, up to the closing </RootNodes> tag (or end of file).  Each
#  entry is a root URL followed by optional options (see parse_options);
#  an entry may span several lines by ending a line with a backslash.
#  For every entry one line is written to the URL handle:
#
#	ROOT<TAB>url urlmax urlfilter hostmax hostfilter delay depth accesstypes
#
#  The first call also performs the one-time setup in startup_prepurls().
#
sub process_rootnodes {
	&startup_prepurls() if (!$setupdone);
	while (<CONFIG>) {
		# Strip only the line terminator, so a final line without
		# a newline does not lose its last character.
		s/\n$//;
		next if (/^#/o);
		last if (/^<\/RootNodes>/io);

		# Join continuation lines.  Test the end of $_ directly
		# rather than through $#_, which is the last index of @_
		# and has nothing to do with the length of $_.
		while (/\\$/) {
			chop($_);		# drop the trailing backslash
			$nextline = <CONFIG>;
			$nextline =~ s/\n$//;
			$_ .= $nextline;
		}

		($rooturl, @options) = split;
		&set_defaults();
		&parse_options(@options);
		next if ($rooturl =~ /^\s*$/io);	# empty rooturl
		print URL "ROOT\t$rooturl $urlmax $urlfilter $hostmax $hostfilter $delay $depth $accesstypes\n";
	}
}

#
#  process_leafnodes - Reads the body of a <LeafNodes> section from
#  CONFIG, up to the closing </LeafNodes> tag (or end of file), and
#  copies every non-comment line to the URL handle prefixed with
#  "LEAF<TAB>".  Lines are echoed to standard output when debugging.
#  The first call also performs the one-time setup in startup_prepurls().
#
sub process_leafnodes {
	unless ($setupdone) {
		&startup_prepurls();
	}
	while (<CONFIG>) {
		if (/^#/) {
			next;
		}
		if (/^<\/LeafNodes>/i) {
			last;
		}
		# $_ still carries its newline, so none is added here
		print URL "LEAF\t$_";
		if ($debug) {
			print "LEAF\t$_";
		}
	}
}

#
#  init_dir - Prepares the directory stored in $vals{TAG}.
#
#  A relative path (with any leading "./" removed) is anchored at $cdir
#  and the absolute result is stored back into %vals.  The directory is
#  created with mode 0755 if it does not exist, and chmod'ed to 0755 if
#  it is not writable.  Dies if mkdir or chmod fails.  Does nothing when
#  $vals{TAG} is undefined.
#
sub init_dir {
	local($name) = @_;
	local($dirpath);

	return if (!defined($vals{$name}));

	$dirpath = $vals{$name};
	if ($dirpath !~ /^\//) {
		if ($dirpath =~ /^\.\/(.*)$/) {
			$dirpath = $1;
		}
		$dirpath = "$cdir/$dirpath";
	}
	$vals{$name} = $dirpath;

	if ($debug) {
		print "init_dir($dirpath)\n";
	}
	if (! -e $dirpath) {
		mkdir($dirpath, 0755) || die "Gatherer: mkdir: $dirpath: $!\n";
	}
	if (! -w $dirpath) {
		chmod(0755, $dirpath) || die "Gatherer: chmod: $dirpath: $!\n";
	}
}

#
#  init_essence - Builds the essence command line in $essencecmd from
#  the settings in %vals.  Settings that are undefined contribute
#  nothing.  Directory and log settings are passed as plain arguments,
#  the Gatherer identification values are wrapped in single quotes, and
#  Essence-Options / Debug-Options are appended verbatim.  The command
#  always ends with "--verbose -f -".
#
sub init_essence {
	local($switch, $key);
	local(@plain) = (
		"--dbdir",		"Data-Directory",
		"--tmpdir",		"Working-Directory",
		"--libdir",		"Lib-Directory",
		"--log",		"Log-File",
	);
	local(@quoted) = (
		"--gatherer-host",	"Gatherer-Host",
		"--gatherer-name",	"Gatherer-Name",
		"--gatherer-version",	"Gatherer-Version",
	);

	$essencecmd = "essence";

	# switch/value pairs passed as-is
	while ($#plain >= 0) {
		$switch = shift(@plain);
		$key = shift(@plain);
		if (defined($vals{$key})) {
			$essencecmd .= " $switch " . $vals{$key};
		}
	}

	# switch/value pairs whose value is single-quoted
	while ($#quoted >= 0) {
		$switch = shift(@quoted);
		$key = shift(@quoted);
		if (defined($vals{$key})) {
			$essencecmd .= " $switch '" . $vals{$key} . "'";
		}
	}

	# free-form option strings
	foreach $key ("Essence-Options", "Debug-Options") {
		if (defined($vals{$key})) {
			$essencecmd .= " " . $vals{$key};
		}
	}

	$essencecmd .= " --verbose";
	$essencecmd .= " -f -";
}

#
#  startup_prepurls - One-time setup performed before the first node
#  section is processed.  It:
#
#	- initializes the library, data and working directories;
#	- drops a placeholder index.html into the data and working
#	  directories;
#	- writes the Local-Mapping table (if any) to localmap.cf and
#	  points HARVEST_URL_LOCAL_MAPPINGS at it (or at /dev/null);
#	- builds the prepurls command ($prepcmd) and, through
#	  init_essence(), the essence command ($essencecmd);
#	- writes a csh script ($gcmd) that purges/expires old data and
#	  then runs "cat $gcmdinput | prepurls | essence";
#	- opens the URL handle on $gcmdinput, to which the node readers
#	  write their ROOT/LEAF lines.
#
#  Sets $setupdone so that it runs only once.
#
sub startup_prepurls {
	@dirs = ("Lib-Directory", "Data-Directory", "Working-Directory");
	foreach $d (@dirs) {
		&init_dir($d) if (defined($vals{$d}));
	}
	# Placeholder pages; written on a best-effort basis (a failed open
	# is not treated as an error).
	foreach $f ("$vals{'Data-Directory'}/index.html", "$vals{'Working-Directory'}/index.html") {
		open(INDEXHTML, "> $f");
		print INDEXHTML <<EOM;
<html>
Please use the Harvest Gatherer's interface to retrieve these files.
</html>
EOM
		close(INDEXHTML);
		chmod(0644, $f);
	}
	$ENV{'TMPDIR'} = $vals{"Working-Directory"};

	# Test the hash for content; defined(%hash) is a fatal error in
	# Perl 5.22 and later.
	if (%mapping) {
		$tfile = "$vals{'Working-Directory'}/localmap.cf";
		open(MAPPING, "> $tfile") || 
			die "Gatherer: Cannot write Mapping: $tfile: $!\n";
		foreach $k (sort keys %mapping) {
			print MAPPING "$k\t$mapping{$k}\n";
		}
		close(MAPPING);
		$ENV{'HARVEST_URL_LOCAL_MAPPINGS'} = $tfile;
	} else {
		$ENV{'HARVEST_URL_LOCAL_MAPPINGS'} = "/dev/null";
	}

	$proddb = $vals{"Data-Directory"} . "/PRODUCTION.gdbm";
	$indexdb = $vals{"Data-Directory"} . "/INDEX.gdbm";
	$mddb = $vals{"Data-Directory"} . "/MD5.gdbm";

	# prepurls is given the leaf command (staturl) and the root
	# command (enum), each as one single-quoted argument.
	$prepcmd = "prepurls ";
	$prepcmd .= " --leaf 'staturl ";
	$prepcmd .= " $vals{'Debug-Options'} " . "'";
	$prepcmd .= " --root 'enum";
	$prepcmd .= " $vals{'Debug-Options'} ";
	$prepcmd .= " -tmpdb " . $vals{"Working-Directory"} .  "/tmpdb.gdbm";
	$prepcmd .= " -log " . $vals{"Log-File"};
	if (-r $proddb) {
		$prepcmd .= " -db " . $proddb . "'";
	} else {
		$prepcmd .= " -db /dev/null'";
	}
	&init_essence();

	#
	#  Create a script that will run Essence and the rest of the
	#  programs needed to gather.  This gets around a Solaris 2.3 bug
	#  when trying to use fork/exec to do this.
	#
	$gcmd = $vals{"Working-Directory"} . "/gathercmd.$$";
	$gcmdinput = $vals{"Working-Directory"} . "/gatherinput.$$";
	open(GCMD, "> $gcmd") || die "Gatherer: Cannot write $gcmd: $!\n";
	print GCMD "#\n#  This is the command to run the Gatherer\n#\n";
	# Carry the Harvest-related environment over into the script
	foreach $k (sort keys %ENV) {
		next if ($k !~ /^(HARVEST|TMPDIR)/o);
		print GCMD "setenv $k $ENV{$k}\n";
	}
	$okpath = $ENV{'PATH'};
	$okpath =~ s/:/ /g;		# csh wants a space-separated list
	print GCMD "set path = ( $okpath )\n";
	print GCMD "set clobber\n";
	print GCMD "set noglob\n";
	print GCMD "\n";

	if ($vals{'HTTP-Proxy'} eq "") {
		print GCMD "unsetenv http_proxy\n"
	} else {
		print GCMD "setenv http_proxy http://$vals{'HTTP-Proxy'}/\n";
	}

	# Expire objects from the URL cache
	print GCMD "urlpurge < /dev/null\n";

	# Expire objects from the production database 
	print GCMD <<EOM;
if (-r $proddb) then
	chmod +w $proddb
	expiredb -log $vals{'Log-File'} $proddb
	if (\$status == 1) then
		# expiredb expired some objects, rebuild index
		chmod -w $proddb
		rm -f $indexdb $mddb
		mkindex $proddb $indexdb $mddb
		chmod 444 $indexdb $mddb
	else
		chmod -w $proddb
	endif
endif
EOM

	# Run the Gatherer
	print GCMD "\n";
	print GCMD "/bin/cat $gcmdinput" . " |  \\\n";
	print GCMD $prepcmd . " |  \\\n";
	print GCMD $essencecmd . "\n";
	print GCMD "\n";
	print GCMD "exit 0\n";
	close(GCMD);

	# The node readers write their ROOT/LEAF lines to this handle
	open(URL, "> $gcmdinput") || 
		die "Gatherer: Cannot write $gcmdinput: $!\n";

	$setupdone = 1;
}

#
#  install_gatherer - Runs after the gathering pass.  Folds the results
#  into the PRODUCTION database with folddb, creates a default access
#  control file (gatherd.cf) if none is readable, and starts gatherd
#  unless the configuration says it is run from inetd.
#
sub install_gatherer {
	# The gatherer id is "host:port"; without a configured host the
	# local host name is looked up (falling back to the short name).
	if (!defined($vals{'Gatherer-Host'})) {
		chop($h = &grab_cmd_output("hostname"));
		($fullh, @blah) = gethostbyname($h);
		undef @blah;
		if ($fullh eq "") {
			$fullh = $h;
		}
		$gid = "$fullh:$vals{'Gatherer-Port'}";
	} else {
		$gid = "$vals{'Gatherer-Host'}:$vals{'Gatherer-Port'}";
	}

	# prepare the database
	$folddbcmd = "folddb $vals{'Gatherer-Options'} \"$gid\" $vals{'Data-Directory'}";
	&run_system($folddbcmd);

	# prepare the access control list if needed
	$gatherdcf = "$vals{'Data-Directory'}/gatherd.cf";
	unless (-r $gatherdcf) {
		open(GCF, "> $gatherdcf") || 
			die "Gatherer: Cannot create $gatherdcf: $!\n";
		print GCF "#\n" .
			  "#  gatherd.cf - Access Control List for gatherd\n" .
			  "#\n" .
			  "Allow all\n";
		close(GCF);
	}

	# inetd starts gatherd on demand, so don't run it by hand
	return if ($vals{"Gatherd-Inetd"} eq "yes");

	# run gatherd now to export data
	$gatherdcmd = "gatherd -d $vals{'Data-Directory'} $vals{'Gatherer-Port'}";
	&run_system($gatherdcmd);
}

#
#  run_system - Runs COMMAND through system(), announcing it on standard
#  output first when debugging.  Returns whatever system() returns.
#
sub run_system {
	local($command) = @_;

	if ($debug) {
		print "RUNNING: $command\n";
	}
	return system($command);
}

#
#  grab_cmd_output - Runs COMMAND with its standard output redirected to
#  a scratch file in /tmp and returns the first line of that output
#  (newline included).  Returns the string "none" if the scratch file
#  cannot be opened.  The result is also left in the global $the_var.
#
#  This is an ugly hack so that it works with Perl 4.036 on Solaris 2.3,
#  where the backticks (`) don't work like they should. -Darren.
#
sub grab_cmd_output {
	local($command) = @_;
	local($scratch) = "/tmp/cmdoutput.$$";

	undef $the_var;
	unlink($scratch);			# start from a clean slate
	system("$command > $scratch");
	if (!open(CMDOUT, "< $scratch")) {
		return "none";
	}
	$the_var = <CMDOUT>;			# first line only
	close(CMDOUT);
	unlink($scratch);
	return $the_var;
}

#
#  set_defaults - Resets the per-RootNode enumeration settings (globals)
#  to their default values before the options of an entry are parsed.
#
sub set_defaults {
	($urlmax, $urlfilter) = (250, "/dev/null");
	($hostmax, $hostfilter) = (1, "/dev/null");
	($delay, $depth) = (1, 0);
	$accesstypes = "HTTP";		# maybe "HTTP|FTP|Gopher" ?
}

#
#  parse_options - Applies the options of one RootNode entry to the
#  enumeration settings (globals).  Option names are case-insensitive:
#
#	URL=max[,filter]	sets $urlmax and optionally $urlfilter
#	Host=max[,filter]	sets $hostmax and optionally $hostfilter
#	Site=max[,filter]	same as Host=
#	Access=types		sets $accesstypes
#	Delay=seconds		sets $delay
#	Depth=levels		sets $depth
#
#  Anything else is reported on STDERR as an illegal option and ignored.
#
sub parse_options {
	local(@wanted) = @_;
	local($item);

	foreach $item (@wanted) {
		if ($item =~ /^URL=(\d+)/i) {
			$urlmax = $1;
			if ($item =~ /^URL=\d+,(\S+)/i) {
				$urlfilter = $1;
			}
		} elsif ($item =~ /^(Host|Site)=(\d+)/i) {
			$hostmax = $2;
			if ($item =~ /^(Host|Site)=\d+,(\S+)/i) {
				$hostfilter = $2;
			}
		} elsif ($item =~ /^Access=(.*)/i) {
			$accesstypes = $1;
		} elsif ($item =~ /^Delay=(\d+)/i) {
			$delay = $1;
		} elsif ($item =~ /^Depth=(\d+)/i) {
			$depth = $1;
		} else {
			print STDERR "Illegal Option: $item\n";
		}
	}
}
