#!/usr/bin/perl -w
use strict;
use Getopt::Long;

# Master executable for perl MapReduce. This calls and monitors
# map tasks and reduce tasks on all participating machines.

my $usage = "
usage: mr [opts] task.mr input-corpus output-corpus
       mr [ -help | -cmd=cmd | -gather corpus resultdir ]

opts: conf=file, hosts=hostname, repository=dir
";


# Map: 
# - partition input into N files/host
# - Swap parts between hosts. All files k -> kth host
# Reduce:
# - Sort swapped files by partition key to one file / host
# - Reduce task produces output files



my %opts = (
  "conf=s"       => ar("/etc/mapreduce.conf"),
  "master=s"     => ar(''),
  "R=i"          => ar(0),
  "hosts=s@"     => ar(0),
  "repository=s" => ar(''),
  "maps=i"       => ar(0), # num of map procs per machine
  "reduces=i"    => ar(0), # num of reduce procs per machine
  "replicate=i"  => ar(0),  # f-way replication
  "cmd=s"        => ar(''), # execute this command on all hosts
  "gather"       => ar(0),
  "help"         => ar(0),  # print help message
);
 

my %conf = getconf(%opts);

if($conf{help}->[0]){
  print $usage;
  exit;
}

$conf{hosts} = $conf{hosts}->[0] 
  if ref $conf{hosts}->[0]; # hack, fix later

if( $conf{cmd}->[0] ){
  poolcommand(\%conf);
  exit;
}

if( $conf{gather}->[0] ){
  gather(\%conf, @ARGV);
  exit;
}

my $repository = $conf{repository}->[0];
my $R = $conf{R}->[0] || @{ $conf{hosts} };

my($taskfile, $corpus, $result) = @ARGV;
die $usage unless @ARGV == 3;
die "Can't find taskfile $taskfile\n" unless -f $taskfile;

my ($taskfilename) = ($taskfile =~ m|/([^/]+)$|);



# distribute the code to all hosts
system "scp $taskfile $_:/tmp/$taskfilename" 
  for @{ $conf{hosts} };

print STDERR "preparing status dirs and gathering file manifest\n";

my @reposfiles =
  prepare_job( $repository, $corpus, $result, $R, $conf{hosts} );

print STDERR "executing $taskfilename on ", 
              scalar @reposfiles, " files\n";

# execute map tasks
spawn_tasks( "map", $conf{master}->[0], $conf{hosts}, 
             $conf{replicate}->[0], $conf{maps}->[0], $R,
             $taskfilename, $repository, 
             $corpus, $result, '' );


spawn_tasks( "reduce", $conf{master}->[0], $conf{hosts}, 
             $conf{replicate}->[0], $conf{reduces}->[0], $R,
			 $taskfilename, $repository, 
             $corpus, $result, '' );
exit;











sub spawn_tasks {
		
  my ($task, $master, $hosts, $fway, $procs, $R, $taskfilename, 
      $repository, $corpus, $result, $perlopts ) = @_;

  my $start = time;
  my %minions = ();
  for my $k ( 0 .. @$hosts - 1 ) {
    my $host = $hosts->[$k];
    my $pid  =
      spawn_task( $task, $master, $hosts, $k, $fway, $procs, $R, $taskfilename, 
                  $repository, $corpus, $result, $perlopts );

    $minions{$pid} = $host;
    print STDERR "spawned $task task on $k th host ($host)\n";
  }

  print STDERR scalar keys %minions, " $task monkeys left: ",
    join( ", ", values %minions ), "\n";

  while ( my $reaped = wait ) {
    last if $reaped == -1;
    delete $minions{$reaped};
    print STDERR scalar keys %minions, " $task monkeys left: ",
      join( ", ", values %minions ), "\n";
  }

  print STDERR "Finished $task task in ", time - $start, " seconds\n";

}



sub spawn_task {
  my ( $task, $master, $hosts, $k, $fway, $procs, $R, $taskfile,
       $repository, $corpus, $result, $perlopts ) = @_;

  my $pid;
  return $pid if ($pid = fork); # spawn and hand back child id

  my $host = $hosts->[$k];
  $hosts = join(" ", @$hosts);
  my $cmd = join " ", 
    qq{ perl $perlopts /tmp/$taskfile $task $master $repository $corpus $result $k $fway $procs $R $hosts };

  my $start = time;
  print STDERR "$host: $cmd\n";
  open REMOTE, "ssh $host '$cmd' |";
  while(<REMOTE>){
    print STDERR;
  }
  print STDERR "$host finished $task task in ", time - $start, " seconds\n";
  
  exit;
}



sub poolcommand {
  my %conf = %{ $_[0] };
  my $cmd = $conf{cmd}->[0];
  for my $host ( @{ $conf{hosts} } ){
    print STDERR "$host: $cmd\n";
    system "ssh $host '$cmd'";
  }
}


sub gather {
  my %conf = %{ $_[0] };
  my($corpus, $result) = @_[1,2];
  my $repos = $conf{repository}->[0];
  mkdir $result unless -d $result;
  for my $host ( @{ $conf{hosts} } ){
    print STDERR "$host: gathering $corpus to $result\n";
    system "scp $host:$repos/$corpus/* $result/.";
  }
}


sub prepare_job {
  my( $repository, $corpus, $result, $R, $hosts ) = @_;

  system "rm -rf $repository/$result/.status";
  mkdir "$repository/$result";
  mkdir "$repository/$result/.status";
  mkdir "$repository/$result/.status/map";
  mkdir "$repository/$result/.status/reduce";
  mkdir "$repository/$result/.status/machine";

  my %files;
  for my $host (@$hosts){
    open LS, "ssh $host ls $repository/$corpus | cat |";
    while(<LS>){
      chomp;
      next unless $_;
      $files{$_} = 1;
      system "touch $repository/$result/.status/map/$_.unprocessed";
    }
    close LS;
  }

  for my $i (0..$R-1){
    system "touch $repository/$result/.status/reduce/$i.unprocessed";
  }

  return keys %files;
}







# CONFIG

# Make an anonymous variable reference.
# This is necessary to abide by the Getopt::Long way.
sub ar {
  my $var = shift;
  return \$var;
}

# Get all configuration info, with precedence given to cmdline
sub getconf {
  my %opts = @_;  

  GetOptions( %opts );
  
  my %cmdline = 
    map { (split '=', $_)[0] => [ ${$opts{$_}} ] } 
    grep { ${$opts{$_}} } 
    keys %opts;
  
  # Read in config file and overrwrite with cmd line options
  my %conf = readconf( @{$cmdline{conf}} );
  $conf{$_} = $cmdline{$_} for keys %cmdline;

  return %conf;
}


sub readconf {
  my $fname = shift;
  return () unless -e $fname; 
  my @c     = ();
  open F, "<$fname";
  while (<F>) {
    chomp;
    s/\r//;
    next unless $_;
    next if /^#/;
    push @c, [ split /\t+/ ];
  }
  my %conf = ();
  for my $list (@c) {
    if ( @$list > 2 ) {    # this is a vector
      push @{ $conf{ shift @$list } }, [@$list];
    }
    else {                 # this is a single key/value
      push @{ $conf{ $list->[0] } }, $list->[1];
    }
  }
  return %conf;
}



__END__


=head1 mr - a MapReduce front end


This is part of the Perl implementation of the MapReduce distributed processing model.

Please use "perldoc MapReduce" for instructions.



