#!/usr/bin/perl
#
# archieplexengine.pl -- ArchiePlex backend
#
# $Id: archieplexengine.pl,v 1.14 1994/09/01 11:01:03 mak Exp $
#
# By Martijn Koster (m.koster@nexor.co.uk)
# This code is placed in the public domain.

package archieplexengine;

$maintainer = '<A HREF="/mak/mak.html">Martijn Koster</A>';
#$debug = 1;

$archie		= "/usr/local/bin/archie";
$newarchie      = 1;		# prospero 5.1
$archie_timeout = 240;

$defaultserver = 'archie.hensa.ac.uk';
$defaulttype = '-c'; # case insensitive substring match

require "plexus-support.pl" unless defined $main'plexus_configured;

# Some sites are reacheable via different paths,
# that maybe more suitable. For example:

%protosites = (
	      'src.doc.ic.ac.uk', 'http://src.doc.ic.ac.uk/public',
	      );
 
# The hostnames are matched against these patterns so they can be
# printed in order. 

@hostorder = ('src\.doc\.ic\.ac\.uk$', 'uk$', 'no$', 'fi$', 'nl$', 'edu$');
# No, this is not going to stop anybody
# No, I'm not a censoring moralist, I just want to 
# prevent searches for things that aren't going to 
# work anyway.
sub censor {
    local($_) = @_;

    return 0 unless (
        /girl.g/i || /a.s.b/i || /rotic/i ||
        /nude/i || /naked/i || /porn/i || /sex/i);

    $output = <<EOM;
<HEAD>
<TITLE>ArchiePlex query '$query' is discouraged.</TITLE>
</HEAD>
<BODY>
<H1>ArchiePlex query '$query' is discouraged.</H1>

The ArchiePlex query '$query' is discouraged. 

<p>

See the page <A href="http://web.nexor.co.uk/archieplex-info/erotic.html">
Looking for Erotic Gifs?</A> to understand why...

</BODY>
EOM
  
    &main'add_header(*main'out_headers, "Content-length: " . length($output))
	if (defined &main'add_header);

    &main'MIME_header('ok', 'text/html');

    print $output;

    return 1;
}

sub execute
{
    local($server, $type, $order, $domain, $query, $max, $nice, $extra) = @_;

    if ($main'plexus_configured) {
	$0 = 'plexus (archieplexengine)';
	$useragent = $main'in_headers{'User-agent'};
    } else {
	$0 = 'archieplex';
	$useragent = $ENV{'HTTP_USER_AGENT'}; # CGI
    }
    $img = ($useragent =~ m!NCSA Mosaic for the X Window System/2!); # has internal images

    &main'debug(<<EOM);
server = $server
type = $type
order = $order
domain = $domain
query = $query
max = $max
nice = $nice
extra = $extra
img = $img
EOM
    
    &main'debug("images = $img");

    # $server is the domain name of the archie server
    #
    # $type is one of:
    #      -c : case sensitive substring search
    #      -e : exact string match (default)
    #      -r : regular expression search
    #      -s : case insensitive substring search
    #
    # $order is one of:
    #      -t : sorted inverted by date
    #  <empty> or undef: grouped by host name
    #
    # $query is the string to search for.
    #
    # $domain is a domain restriction string (only with prospero5 client)
    # $max is the maximum number of hits
    # $nice is the nicefactor

    $server = $defaultserver if (! $server);
    $type = $defaulttype if (!defined $type);
    $order = $defaultorder if (!defined $order);

    return if &censor($query);

    local($cmd);
    if ($newarchie)
    {
	# for Prospero client - Version Beta.5.1 use:
	push(@cmd, $archie);
	push(@cmd, '-m', $max) if ($max);
	push(@cmd, '-N', $nice) if ($nice);
	push(@cmd, '-H', $server) if ($server);
	push(@cmd, $type) if ($type);
	push(@cmd, $order) if ($order eq '-t');
	push(@cmd, '-fd', $domain) if ($domain);
	push(@cmd, '--');
	$query=~s/\s.*//;		# strip trailing words, archie can't cope
	push(@cmd, $query);
    }
    else
    {
	# for client 1.4.1 based on Prospero 4.2e use:

	push(@cmd, $archie);
	push(@cmd, '-h', $server) if ($server);
	push(@cmd, $type) if ($type);
	push(@cmd, $order) if ($order eq '-t');
	push(@cmd, '--', $query);
    }

    &main'debug("Archie command line: ", join(' ', @cmd));

     # fork off the search

    pipe(ARCHIE_IN, WFD);
    if (($cpid = fork()) == 0) {
        close(ARCHIE_IN);
        open(STDOUT,">&WFD");
        open(STDERR,">&WFD");
        exec @cmd;
        die "exec: $archie: $! [$cmd]\n";
    }
    close(WFD);
 
    select((select(STDOUT), $| = 1)[0]);

    &main'clear_timeout;
    $main'SIG{'ALRM'}="archieplexengine'archie_timeout";	
    alarm($archie_timeout);			

    eval ('&results');
    if ($@) {
	if ($@ =~ /Timeout Exceeded/) {
	    undef($@);
	    &main'error('timed_out', "The Archie Search on <CODE>$server</CODE> was aborted after $archie_timeout seconds");
	    die;
	} else {
	    die $@;
	}
    }
    1;
}

sub results
{
    $cachedline = <ARCHIE_IN>;	# checkfirstline for possible errors

    &main'debug("CACHED: $cachedline");
    if ($cachedline =~ /^ld.so: warning:/)
    {
	&main'debug("Loader warning: $cachedline");
	$cachedline = <ARCHIE_IN>; # only a warning, ignore
    }
    elsif (($cachedline =~ /^Usage:/) || ($cachedline =~ /^WARNING/i) || ($cachedline =~ /^$archie:/))
    {
	&main'error('bad_request',"$cachedline");
	die;
    }
    elsif ($timeoutexceeded || $cachedline =~ / failed:.*Timed out/i)
    {
	&main'error('timed_out', "The Archie Search on <CODE>$server</CODE> was aborted after $archie_timeout seconds");
	die;
    }
    elsif ($cachedline =~ /failed:/)
    {
	&main'error('bad_request',"$cachedline");
	die;
    }

    $output = <<EOM;
<HTML>
<HEAD>
<TITLE>ArchiePlex Results</TITLE>
</HEAD>
<BODY>
<H1>ArchiePlex Results</H1>

Results for query '$query' on '$server':

<HR>
EOM

    # loop over the results.
    # If we are sorting by host we put things into an array
    # so we can sort before displaying.

    local($sizedescription);
    local($doneheader) = 0;
    while($_ = ($cachedline || <ARCHIE_IN>))
    {
	undef $cachedline; 			# used up first time
	chop;
	next if ($_ eq '');
	if (! /Host|Location|DIRECTORY|FILE/)
	{
#	    $output .=  "<PRE>\n$_\n</PRE>\n";
	}
	$line = $_;

	if ($line =~ /Host/)
	{
	    if ($host)
	    {
		$out{ $host } .= "</DL>\n<HR>\n";

		if ($order eq '-t')
		{
		    $output .=  $out{ $host };
		    undef $out{ $host };
		}
	    }
	    ($title, $host) = split(' ', $line);
				# By default use the file (ftp) protocol to where archie
				# points you
 	    
 	    push( @hostlist, $host );

	    $urlhead = $protosites{ $host };
	    $urlhead= "ftp://$host" if (!$urlhead);

	    $out{ $host } .= "<H2>Host <A HREF=\"$urlhead\">$host</A></H2>\n";
	    $out{ $host } .= "<DL>\n";
	}
	elsif ($line =~ /Location/)
	{
	    ($title, $location) = split(' ', $line);
	    $out{ $host } .= "<DT>\n";
	    $out{ $host } .= "  ";
	    $out{ $host } .= $img ?
		"<IMG SRC=\"internal-gopher-menu\" ALT=\"In Directory\">\n" :
		    "In Directory\n";
	    $out{ $host } .= "  ";
	    $out{ $host } .= "<A HREF=\"$urlhead$location\">$location</A>\n";
	    
	    $first = 1;
	}
	elsif ($line =~ /DIRECTORY|FILE/)
	{
	    ($title, $permissions, $size, $mon, $date, $time, $name) = split(' ', $line);

	    $sizedescription = "$size bytes";
	    $sizedescription =  int($size / 1000) . "K ($size bytes)" if ($size > 1000);
	    $sizedescription = int($size / 1000000) . "M ($size bytes)" if ($size > 1000000);

	    $out{ $host } .= $first ? "<DD>\n" : "<P>\n";
	    undef($first);

	    $out{ $host } .= "  ";
	    if($title eq 'DIRECTORY')
	    {
		$out{ $host } .= $img ?
		    "<IMG SRC=\"internal-gopher-menu\" ALT=\"Directory\">\n" 
			: "Directory \n";
	    }
	    elsif($title eq 'FILE')
	    {
		$out{ $host } .= $img ? "<IMG SRC=\"internal-gopher-binary\" ALT=\"File\"> \n" : "File \n";
	    }
	    # directories haven't got a trailing slash, unless
	    # it is the root. In that case we temporarily strip
	    # it to prevent doubles
	    $out{ $host } .= "  ";
	    $location='' if ($location eq '/');
	    $out{ $host } .= "<A HREF=\"$urlhead$location/$name\">$name</A> $size $mon $date $time\n";
	    $location='/' if (!$location);
	}
    }

    if ($host )
    {			
	$out{ $host } .= "</DL>\n<HR>\n"; # terminate last

	if ($order eq '-t')
	{
	    $output .=  $out{ $host };
	}
	else			# sort and display
	{
	    local( $hostpatt );
	    local( $host );
	    foreach $hostpatt ( @hostorder ){
		foreach $host ( @hostlist ){
		    if( $host =~ /$hostpatt/ && defined( $out{ $host } ) ){
			$output .=  $out{ $host };
			undef $out{ $host };
		    }
		}
	    }
	    # do the rest
	    foreach (sort {(reverse $a) cmp (reverse $b);} keys %out)	# sort them by domain
	    {			
		$output .=  $out{ $_ };
	    }
	}
    }
    else
    {
	$output .=  "No matches.\n<HR>\n";
    }
    $output .= <<EOM;
    See also: <a href="http://web.nexor.co.uk/archie.html">
    List of Hypertext Archie Servers</a> and <A href=
    "http://web.nexor.co.uk/archieplex-info/info.html">
    Information on ArchiePlex</a>.
    <HR>
  </BODY>
</HTML>
EOM

    &main'add_header(*main'out_headers, "Content-length: " . length($output));

    &main'MIME_header('ok', 'text/html');
    
    print $output;

    return 1;
}

sub archie_timeout {
    kill 15,$cpid if $cpid;	# Kill the archie that we spawned
    &main'debug("archieplex'archie_timeout: Timeout Exceeded\n");
    die "Timeout Exceeded";
}

1;
