#!/usr/local/bin/perl
#
# WebCopy v0.97b 95/05/31
#
# Copyright (C) 1994, 1995 by Victor Parada (vparada@inf.utfsm.cl)
#
# Copy files (recursively) via HTTP protocol.
#
# Syntax: webcopy [options] http://host:port/path/file\n";
#
# Please read this "License Agreement and Lack of Warranty":
# - The author of this program is Victor Parada <vparada@inf.utfsm.cl>.
# - This program is "Freeware", not "Public Domain".
# - This program must be distributed for free, and cannot be included in
#   commercial packages without prior written permisson from the autor.
# - This program cannot be distributed if modified in any way.
# - This program can be used by anyone if the copyright and this notice
#   remains intact in every file.
# - If you modify this program, please e-mail patches to the the author.
# - This is a Beta version of the program. You have been warned!
# - This program is provided ``AS IS'', without any warranty.
# - This program can cause huge file transfers and all the related effects.
# - This program can fill data disks without notice.
# - Neither the author nor UTFSM are responsibles for the use of this program.
# - Bug reports, comments, questions and suggestions are welcome!  But
#   please check first that you have the latest version!
# - The latest version of this program is available at:
#   <URL:ftp://ftp.inf.utfsm.cl/pub/utfsm/perl/webcopy.tgz>
#
# If you use this program, please send e-mail to the author.
# He will try to notify you of any updates made to it.
#

# WebCopy info
#
$version="0.97b";
$agent="WebCopy/$version";

# Setup perl defaults
#
$webcopy=$0;
$0=~s![^/]*/!!g;
$|=1;
$[=0;

# Library routines required
#
require 'sys/socket.ph';
require 'timelocal.pl';

# Global setup
#
@month=('Jan','Feb','Mar','Apr','May','Jun',
	'Jul','Aug','Sep','Oct','Nov','Dec');
@weekday=('Sunday','Monday','Tuesday','Wednesday',
	  'Thursday','Friday','Saturday');
%monthnum=('jan',0,'feb',1,'mar',2,'apr',3,'may',4,'jun',5,
	   'jul',6,'aug',7,'sep',8,'oct',9,'nov',10,'dec',11);

# Configurable defaults
#
$tmpfile="/W.tmp";		# Must begin with "/"
$logfile="/W.log";		# idem.
$index="index.html";
@cgidir=("/cgi-bin/","/htbin/","/cgi/","/usr-cgi/","/bin/");
$rn="\r\n";

# Default options for command line
#
$log=1;
$samepath=1;
$cd="";
$verbose=0;
$map=0;
$form=0;
$delay=30;
$proxy=undef;
$noproxy=undef;
$post=undef;

# Process options in command line
#
while ($_=$ARGV[0],/^-/) {
    shift;
    /^-w(.*)$/ && length($1) && ($cd=$1, ($webcopy.=" $_"), next);
    /^-x(.*)$/ && length($1) && ($index=$1, ($webcopy.=" $_"), next);
    /^-t(.*)$/ && length($1) && ($delay=$1, ($webcopy.=" $_"), next);
    /^-z(.*)$/ && ($recurse || ($post=$cgi=1, $post_data=$1,
				($webcopy.=" $_"), next));
    /^-(.)(.+)$/ && (unshift(@ARGV,"-$2"), $_="-$1");
    $webcopy.=" $_";
    /o/ && ($recurse || $verbose || $query || ($stdout=1, next));
    /v/ && ($stdout || ($verbose+=1, next));
    /s/ && ($log=0, next);
    /q/ && ($stdout || ($query=1, next));
    /r/ && ($stdout || $post || ($recurse=$image=$link=1, next));
    /i/ && ($stdout || $post || ($image=$recurse=1, next));
    /l/ && ($stdout || $post || ($link=$recurse=1, next));
    /m/ && ($map=1, next);
    /c/ && ($cgi=1, next);
    /a/ && ($absolute=1, next);
    /f/ && ($full=$absolute=1, next);
    /p/ && ($samepath=0, next);
    /d/ && ($complete=1, next);
    /u/ && ($uselocal=1, next);
    /n/ && ($noproxy=1, next);
    /h|\?/ && ($help=1, next);
    /(o|v|q|r|i|l)/ && die "$0: Option $_ conflicts with previous ones\n";
    /(x|w|t)/ && die "$0: Option $_ requires argument\n";
    die "$0: Invalid option $_\n";
}

$delay=~/^\d+$/ || die "$0: Option -t requires numerical argument\n";

$cd && $cd=~m!/$! && ($cd.='.');
$cd eq "" || -d $cd || die "$0: Cannot find directory $cd\n";

exit &about if $help;

$url=shift || die "$0: Syntax: $0 [options] http://host:port/path/file" .
    " [http://proxy:port]\n(try \"$0 -h\" for help\n";

$ENV{'http_proxy'}=$_ if $_=shift;

die "$0: Extra arguments specified\n" if shift;

if ($post) {
    if ($post_data) {
	die "$0: Cannot find POST data ".&real($post_data) . "\n"
	    unless ((-r &real($post_data)) || (-r $post_data));
	$post_data=&read_file($post_data);
    } else {
	die "$0: Cannot find POST data in URL\n"
	    unless $url=~/\?/;
	($url,$post_data)=($url=~/^([^\?]*)\?(.*)$/);
#	print "URL($url)\nPD($post_data)\n";
    }
}

unless (($host,$port,$path,$file,$extra)=&split_url($url)) {
    die "$0: Malformed http URL\n" unless $path;
}

if ($log) {
    open(L,">".&real("$logfile")) || die "$0: Cannot log\n";
    $_=select(L); $|=1; select($_);
}

&log("$webcopy $url\ndate ".&now."\n");
&log("server $host port $port ($path)\n");


if (!$noproxy && $ENV{'http_proxy'}) {
    unless (($phost,$pport)=&split_url($ENV{'http_proxy'})) {
	die "$0: Malformed http URL for proxy '$ENV{'http_proxy'}'\n";
    }
    unless ((gethostbyname($phost))[4] eq (gethostbyname($host))[4] &&
	    $pport==$port) {
	&log("proxy $phost port $pport\n");
	$proxy="http://$host:$port";
    }
    &init_sock($phost,$pport) || die "$0: Cannot initialize socket\n";
} else {
    &init_sock($host,$port) || die "$0: Cannot initialize socket\n";
}

$basepath=$path;
$allowfirst=1;
&add_url("$path$file$extra",1,"");
undef $delay_time;

# HTTP/1.0 codes handled:
# 200 Ok (process normally)
# 302 Redirection (new url must be queued)
# 304 Use local copy (not modified in server since last access)

# MAIN LOOP: process each file in the queue.
#
 MAINLOOP: while ($file=shift(@files)) {

    if ($delay_time) {
	sleep($delay_time);
    } else {
	$delay_time=$delay;
    }
     
    $path=$file;
    $path=~s![^/]*$!!;
    $out=$file;
    $out=~s!^$basepath!/!o unless $complete;
    $out.=$index if $out=~m!/$!;
    $in="http://$host:$port$file";
    
    if ($query) {
	print "$in [no/yes/all/quit]?";
	$_=<>;
	(&log("aborted $in\n"),last MAINLOOP) if /^[qQ]/;
	$query=0 if /^[aA]/;
	(&log("discarded $in\n"),next MAINLOOP) unless /^[yYaA]/;
    } elsif ($verbose) {
	print "$in\n";
    }
    &log("getting $in ");
    ($error,$errormsg,$html)=&transfer($file,$out);
    &log("$error $errormsg\n");
    if ($error==302) {
	local($h,$n,$p,$f,$e)=&split_url($html);
	&add_url( ($h eq $host && $n eq $port)?"$p$f$e":$html, 1, "");
    } elsif (($error==200) || ($error==304)) {
	$parse= $html || $file=~m!\.html?$!i || $file=~m!/$!;
	if ($recurse && $parse && open(I,&real($out))) {
	    &log("parsing $out\n");
	    $line="";
	    while (<I>) {
		$line.=$_;
		$line=~tr/\n\t\r/   /;
		while ($line=~/\<([^\>]*)\>(.*)$/) {
		    $fulltag=$1;
		    $line=$2;
		    ($tag,%args)=&split_tag($fulltag);
		    &check_tag("A",   "HREF",  $link);
		    &check_tag("IMG", "SRC",   $image);
		    &check_tag("FIG" ,"SRC",   $image);
		    &check_tag("FORM","ACTION",$form);
		}
	    }
	    close(I);
	} elsif (-e &real($out)) {
	    &log("storing $out\n");
	}
    }
}
&log("end\n");
close(L) if $log;
exit;

sub split_tag {
    local($fulltag)=@_;
    local($tagname,$etc,$var,$quote,$arg,$value,@tagargs);
    undef $etc;
    ($tagname,$etc)=$fulltag=~/\s*(\S+)\s*(.*)$/;
    while ($etc!~/^\s*$/) {
	($arg,$var,$value,$etc)=$etc=~/\s*([^=\s]+)\s*(=)?\s*(\S+)?(.*)$/;
	($value,$etc)=((undef),$value.$etc) unless $var=~/^=$/;
	($quote)=$value=~/^([\"\'])/;
	if ($quote) {
	    if ($value!~/.$quote$/) {
		($_,$etc)=$etc=~/^([^$quote]*)$quote?(.*)$/;
		$value.=$_.$quote;
	    }
	    $value=~s/$quote\s*(.*)\s*$quote$/$1/;
	    $value=~s/\s*$//;
	}
	$arg="\U$arg\E";
	push(@tagargs,$arg,$value);
    }
    $tagname="\U$tagname\E";
    ($tagname,@tagargs);
}

sub check_tag {
    local($thetag,$thearg,$cando)=@_;
    if (($thetag eq $tag) && defined($args{$thearg})) {
	&add_url($args{$thearg},$cando,$thetag);
    }
}

sub add_url {
    local($ref,$cando,$what)=@_;
    local($isdir,$qs,$dir,@new);
    &log("reference $ref");
    unless ($cando) {
	&log(" disabled-$what\n");
	return;
    }
    if ($ref=~/^\s*$/) {
	&log("null reject-invalid\n");
	return;
    }
    if ($full && $ref=~/^\w+:/) {
	local($h,$n,$p,$f,$e)=&split_url($ref);
	if ($h ne "localhost" && ($h ne $host || $n ne $port)) {
	    &log(" reject-remote\n");
	    return;
	}
	$ref="$p$f$e";
    } elsif ($ref=~/^\w+:/) {
	&log(" reject-full\n");
	return;
    }
    if ($ref=~m!^/!) {
	unless ($absolute || $allowfirst) {
	    &log(" reject-absolute\n");
	    return;
	}
    }
    if ($ref=~/^\#/) {
	unless ($allowfirst) {
	    &log(" reject-same\n");
	    return;
	}
    }
    undef $allowfirst;
    $ref=~s/\#.*$//; # Remove NAME references
    unless ($cgi) {
	if ($url=~/\?/) {
	    &log(" reject-cgi\n");
	    return;
	}
    }
    $ref="$path$ref" unless $ref=~m!^/!;
    ($ref,$qs)=$ref=~m!^([^?]*)(.*)$!;
    $isdir=$ref=~m!/$!;
    @new=();
    foreach $dir (split('/',$ref)) {
	if ($dir eq "..") {
	    pop(@new);
	} elsif ($dir ne ".") {
	    push(@new,$dir);
	}
    }
    $url=join('/',@new);
    $url.="/" if $isdir;
    &log(" changed\nnew $url") if ($url ne $ref);
    $url.=$qs;
    unless ($cgi) {
	if ($url=~/\?|\.cgi$/) {
	    &log(" reject-cgi\n");
	    return;
	}	    
	foreach $_ (@cgidir) {
	    if ($url=~/$_/) {
		&log(" reject-cgi\n");
		return;
	    }
	}
    }
    if (($url!~/^$basepath/o) && $samepath) {
	&log(" reject-non-base\n");
	return;
    }
    unless (defined($cache{$url}) || defined($cache{&unescape($url)})) {
	&log(" accept\n");
	push(@files,$url);
	$cache{&unescape($url)}=1;
    } else {
	&log(" previous-ref\n");
    }
}

sub split_url {
    local($url)=@_;
    local($host,$port,$path,$file,$extra,$v);
    ($url,$extra)=$url=~m!^([^\?\#]*)(.*)$!;
    $v=$url=~m!^http:(//([^:/]*)(:(\d*))?)?((/([^/]+/)*)?([^/]*))?$!;
    return () unless $v;
    ($host,$port,$path,$file)=(($2?$2:"localhost"),($4?$4:80),$6,$8);
    ($host,$port,$path,$file,$extra);
}

sub init_sock {
    local($host,$port)=@_;
    $sockaddr='S n a4 x8';
    chop($local_host=`hostname`);
    $local_prot=(getprotobyname('tcp'))[2];
    $local_sock=pack($sockaddr,&AF_INET,0,(gethostbyname($local_host))[4]);
    $remote_sock=pack($sockaddr,&AF_INET,$port,(gethostbyname($host))[4]);
}

sub transfer {
    local($url,$file)=@_;
    local($status,$code,$info,$line,$length,$html,$redirect,$update,$data,
	  $bytes,$step,$slice);
    $update=(($_=&get_date($file)) &&
	     ("If-Modified-Since: $_$rn")) unless $post;
    $uselocal && $update && return(304,"Use local",0);
    socket(S,&PF_INET,&SOCK_STREAM,$local_prot) || die "$0: socket: $!\n";
    bind(S,$local_sock) || die "$0: bind: $!\n";
    connect(S,$remote_sock) || die "$0: connect: $!\n";
    $_=select(S); $|=1; select($_);
    $data="Content-Length: " . length($post_data) .
	"${rn}Content-Type: application/x-www-form-urlencoded$rn" if $post;
    print S ($post?"POST":"GET") . " $proxy$url HTTP/1.0$rn" .
	"User-Agent: $agent${rn}Accept: */*${rn}$update$data$rn";
    print S $post_data if $post;
    chop($status=<S>);
    $status=~s/\r$//;
    undef $code;
    ($code,$info)=($status=~m!^HTTP\S+\s+(\d\d\d)\s*(.*)$!i);
    print "Status: $code - $info -\n" if $verbose;
    if ($code==200 || $code==302) {
	undef $length;
	undef $redirect;
	undef $update;
	chop($line=<S>);
	$line=~s/\r$//;
	while (defined $line) {
	    last if $line=~/^$/;
	    print "$line\n" if $verbose>1;
	    $html=1 if $line=~m!^Content-Type:\s+[^/\s]+/html.*!i;
	    $length=$1 if $line=~/^Content-length:\s*(\d+)/i;
	    $redirect=$1 if $line=~/^Location:\s+(\S+)/i;
	    $update=$1 if $line=~/^Last-modified:\s*(.*)/i;
	    chop($line=<S>);
	    $line=~s/\r$//;
	}
	if ($code==302 && defined($redirect)) {
	    $html=$redirect;
	    while (defined($data=<S>)) { } # chomp!
	} elsif (open(F,$stdout ? ">&STDOUT" : ">".&real($tmpfile))) {
	    if (defined($length)) {
		print "$length bytes " if $verbose;
		$step=$slice=($length/10);
	    } else {
		print "Unknown length " if $verbose;
		$step=$slice=2048;
	    }
	    $bytes=0;
	    binmode(S);
	    while (defined($data=<S>)) {
		print F $data;
		$bytes+=length($data);
		print "." while ($verbose && $bytes>=$step && ($step+=$slice));
	    }
	    close(F);
	    &move($file) unless $stdout;
	    &set_date($file,$update) if ((defined($update)) && !$stdout);
	    if (defined($length) && $bytes!=$length) {
		&log("($bytes<>$length)");
	    }
	    print " $bytes bytes" if ($verbose && !defined($length));
	    print "\n" if $verbose;
	} else {
	    $code=100;
	    $info="Transfer $!";
	}
    }
    close(S);
    ($code,$info,$html);
}

sub move {
    local($file)=@_;
    local($path)=$file;
    local($dir)="";
    local($name);
    $path=~s!^/!!;
    local(@subdirs)=split('/',$path);
    local($sub);
    $name=pop(@subdirs);	# get name of file from path
    for $sub (@subdirs) {
	$dir.= "/".&unescape($sub);
	stat(&real($dir));
	return 0 if -f _;
	mkdir(&real($dir),0755) unless -e _;
    }
    return "$dir/$name" if rename(&real($tmpfile),&real("$dir/$name"));
}

sub read_file {
    local($file)=@_;
    local($data);
    if (open(D,&real($file)) || open(D,$file)) {
	while (<D>) {
	    $data.=$_;
	}
	close(D);
	$data=~s/\s//g;
    }
    $data;
}

sub set_date {
    local($file,$date)=@_;
    local($time,$ss,$mm,$hh,$dd,$nn,$yy,$gmt);
    $date=~s/^\s*\S+\s+//;	# Ignore weekday and leading spaces
    $date=~s/\s+/ /g;
    if (($nn,$dd,$hh,$mm,$ss,$gmt,$yy)=
	($date=~/^(\w+) (\d+) (\d+):(\d+):(\d+) (\S+)? ?(\d+)?/)) {
	$yy || ($yy=$gmt);
    } elsif (($dd,$nn,$yy,$hh,$mm,$ss,$gmt)=
	     ($date=~/^(\d+)-(\w+)-(\d+) (\d+):(\d+):(\d+) ?(\S+)?/)) {
    } elsif (($dd,$nn,$yy,$hh,$mm,$ss,$gmt)=
	     ($date=~/^(\d+) (\w+) (\d+) (\d+):(\d+):(\d+) ?(\S+)?/)) {
    } else {
	return undef;
    }
    $yy>99 && ($yy-=1900);
    unless ($nn=~/^\d+$/) {
	$nn=~s/^(...).*$/$1/;
	$nn=~tr/A-Z/a-z/;
	$nn=$monthnum{$nn};
    }
    $time=&timegm($ss,$mm,$hh,$dd,$nn,$yy);
    utime($time,$time,&real($file));
}

sub get_date {
    local($file)=@_;
    return(undef) unless (-e ($_=&real($file)));
    local($ss,$mm,$hh,$dd,$nn,$yy,$ww)=(gmtime((stat($_))[9]))[0..6];
    return sprintf("%s, %02g-%s-%02g %02g:%02g:%02g GMT",
		   $weekday[$ww], $dd, $month[$nn], $yy, $hh, $mm, $ss);
}

sub now {
    local($ss,$mm,$hh,$dd,$nn,$yy,$ww)=localtime;
    return sprintf("%s, %02g-%s-%02g %02g:%02g:%02g",
		   $weekday[$ww], $dd, $month[$nn], $yy, $hh, $mm, $ss);
}
    
sub unescape {
    ($_)=@_;
    tr/+/ /;
#    s!\\(.)!$1!ge;
    s/%(..)/pack("c",hex($1))/ge;
    $_;
}

sub log {
    local($message)=@_;
    print(L $message) if $log;
}

sub real {
    ($cd || ".") . $_[0];
}

sub about {
    print <<EOH;

WebCopy $version  (C) 1994-1995 by Victor Parada (vparada\@inf.utfsm.cl)

Copy remote files (recursively) using http protocol.

Syntax:  $0 [options] http://host:port/path/file [http://proxy:port]

Options (can be combined):
 -o  output to stdout          -tdelay  set delay seconds
 -v  verbose mode              -wpath   set working directory
 -q  query transfer            -xfile   set default index.html
 -s  suppress log              -zfile   post 'file' or query string
 -r  recurse (same as -il)     -c  allow links to CGI scripts (require -paf)
 -i  include images            -a  allow absolute references
 -l  hypertext links           -f  allow full URL references
 -m  imagemaps (unavailable)   -p  allow other paths (-d recommended)
 -n  don't use proxy           -d  keep directory path in URL for local file
 -u  use local copy of file if exists
 -h  this help (ignores other options specified)

EOH
    undef;
}

__END__
