#!/usr/bin/perl

# rsubsamp.pl -l length [-x extrafile] [infile]
#
# Prints LENGTH lines, chosen at random from INFILE (or STDIN), which 
# must have strictly more than LENGTH lines for the program to work.
# If '-x EXTRAFILE' is used, puts all the lines that weren't chosen 
# into EXTRAFILE, in the original order.
#
# This can be used to create a training set and test set, for instance.
#
# Perform (Perceptron Classifier in Inform) v1.0
# Nick Montfort  http://nickm.com  2004-06-24

use strict;
use warnings;
use Getopt::Std;
use vars qw/ $opt_l $opt_x $opt_u /;

# Parse the command line.  getopts() (not getopt()) is the Getopt::Std
# entry point that understands the "l:x:u" spec: -l and -x take a value,
# -u is a plain flag.  An unknown switch is treated as a usage error.
my $usage = "Usage: $0 -l length [-x extrafile] [infile]\n";
getopts("l:x:u") or die($usage);

# LENGTH must be a positive whole number.  Anything else (0, negative,
# fractional, non-numeric) cannot drive the sampling loop sensibly, so it
# is reported as a usage error, as is an explicit -u.
if ($opt_u or !defined $opt_l or $opt_l !~ /\A[1-9][0-9]*\z/) {
    die($usage);
}

# Read every input line into memory, from infile if one was named,
# otherwise from STDIN.
my @data;
my $source = 'STDIN';
if (defined $ARGV[0] and length $ARGV[0]) {
    $source = $ARGV[0];
    open(my $in, '<', $source)
        or die("<!> Can't open $source for reading: $!");
    @data = <$in>;
    close $in;
}
else {
    @data = <STDIN>;
}

# Check that there are enough lines to sample LENGTH of them.
if (scalar(@data) < $opt_l) {
    die("<!> Only " . scalar(@data)
        . " elements in $source, can't subsample $opt_l.\n");
}

# A final line without a newline would run together with whatever line is
# printed after it, so terminate it before any line gets reordered.
$data[-1] .= "\n" if @data and $data[-1] !~ /\n\z/;

# Open the extras file only now that the input has been read and checked:
# a failed run no longer leaves a truncated file behind, and naming the
# input file itself as EXTRAFILE no longer destroys the input first.
my $extra_fh;
if (defined $opt_x and length $opt_x) {
    open($extra_fh, '>', $opt_x)
        or die("<!> Can't open $opt_x for writing: $!");
}

# Sample line *indices* without replacement.  @pool holds the indices that
# are still available; each draw removes one of them by overwriting it with
# the last pool entry and then shrinking the pool.  Doing the overwrite
# before the pop keeps this correct when the drawn slot is the last one.
my @pool  = (0 .. $#data);
my @taken = (0) x scalar(@data);
for (1 .. $opt_l) {
    my $slot = int(rand(scalar(@pool)));
    my $pick = $pool[$slot];
    $pool[$slot] = $pool[-1];
    pop @pool;
    $taken[$pick] = 1;
    print $data[$pick];
}

# Write out the lines that were not chosen, in their original order.
if ($extra_fh) {
    for my $i (0 .. $#data) {
        print {$extra_fh} $data[$i] unless $taken[$i];
    }
    # Buffered write errors only surface at close, so check it.
    close($extra_fh)
        or die("<!> Can't finish writing $opt_x: $!");
}

__END__
