#!/usr/bin/perl -w

## April 23, 2014: DHA completed new implementation of fassort
## uses Mergesort when going over a memory limit

use POSIX;
use Getopt::Long qw(:config bundling require_order auto_version);
use Pod::Usage;
use FAST;
use FAST::Bio::SeqIO;
use strict;
use Sort::Key qw/ keysort_inplace nkeysort_inplace rkeysort_inplace rnkeysort_inplace / ;
#use Devel::Size qw/ total_size /;
use Sort::MergeSort;
use File::Temp qw/ tempfile /;
use Storable qw/ store_fd /; 
use constant { true => 1, false => 0 };

use vars qw($VERSION $DESC $NAME $COMMAND $DATE);
$VERSION = $FAST::VERSION; 
$DESC    = "Sort sequence records in a multifasta file or data-stream.\n";
$NAME    = $0;
$NAME    =~ s/^.*\///;
$COMMAND = join " ",$NAME,@ARGV;
$DATE = POSIX::strftime("%c",localtime());

## DEFAULT OPTION VALUES
my $def_format  = $FAST::DEF_FORMAT;  #7/1/13 "fasta";
my $def_logname = $FAST::DEF_LOGNAME; #7/1/13 "FAST.log.txt";

my $def_split_on_regex = ' ';

## OPTION VARIABLES
my $man                  = undef;                 # --man
my $help                 = undef;                 # -h
my $moltype              = undef;                 # -m, in case bioperl can't tell
my $format               = $def_format;           # --format
my $log                  = undef;                 # -l
my $logname              = $def_logname;          # -L
my $comment              = undef;                 # -C
my $split_on_regex       = $def_split_on_regex;   # -S 

## in increasing priority:
my $sequence       = undef;  # -s
my $description    = undef;  # -d
my $field          = undef;  # -f
my $regex          = undef;  # -x
my $tag            = undef;  # -t 

my $reverse             = undef;  # -r
my $numeric             = undef;  # -n

my $memory_limit        = 524288;  # -M; one-quarter GB
my $fastq                = undef;


GetOptions('help|h'         		 => \$help, 
	   'man'            		 => \$man,
	   'moltype|m=s'                 => sub{  my (undef,$val) = @_; 
						  die "$NAME: --moltype or -m option expects argument \"dna\", \"rna\" or \"protein\"\n" 
						    unless $val =~ /dna|rna|protein/i; 
						  $moltype = $val;
						},
	   'format=s'                    => \$format,
	   'log|l'                       => \$log,
	   'logname|L=s'                 => \$logname,
	   'comment|C=s'                 => \$comment,


	   'description|d'          => \$description,
	   'sequence|s'             => \$sequence,
	   'field|f=i'              => sub{  my (undef,$val) = @_; 
						  die "$NAME: --field or -f option expects non-zero integer argument\n" 
						    unless $val != 0; 
						  $field = $val;
						},
	   'split-on-regex|S=s'          => \$split_on_regex,
	   'tag|t=s'                => sub{  my (undef,$val) = @_; 
						  die "$NAME: --tag or -t option expects string argument exclusively from [a-zA-Z0-9_-]\n" 
						    unless $val =~ /^[a-zA-Z0-9_-]+$/; 
						  $tag = $val;
						},
	   'regex|x=s'              => sub{  my (undef,$val) = @_; 
						  die "$NAME: --regex or -x option expects regex argument with exactly one capture buffer\n" 
						    unless $val =~ /^[^()]*\([^)]+\)[^()]*$/; 
						  $regex = $val;
						},
	   'reverse|r'                   => \$reverse,
	   'numeric|n'                   => \$numeric,   
	   'memory_limit|M=i'            => \$memory_limit,
           'fastq|q'                     => sub{$format = 'fastq';},

	  ) 
 or exit(1);
		  
#Sequence Length = 542 nt; Size = 1746 bytes; 3.22 bytes/nt
$memory_limit = $memory_limit / 3.22; #convert memory to # of nt/aa 

pod2usage(-verbose => 1) if $help;
pod2usage(-verbose => 2) if $man;
my $fromSTDIN = ((-t STDIN) ? false : true);
pod2usage("$NAME: Requires at least one argument file1 [file2…fileN] unless taking input from STDIN.\n") if ((-t STDIN) && (@ARGV < 1));
pod2usage("$NAME: Requires exactly zero arguments when taking input from STDIN.\n") if (!(-t STDIN) && @ARGV != 0);

&FAST::log($logname, $DATE, $COMMAND, $comment, $fromSTDIN) if ($log);

my $split_re = ' ';
if ($split_on_regex) {
  $split_re = qr/$split_on_regex/;
}

my $index;
if ($field and $field > 0) {
  $index = $field - 1;
}
elsif ($field and $field < 0) { # $match_field < 0
  $index = $field;
}


my $keysort;
if ($numeric){
  if ($reverse) {
    $keysort = sub {my $ar = shift; return rnkeysort_inplace { $_->[0] } @$ar; }
  }
  else {
    $keysort = sub {my $ar = shift; return  nkeysort_inplace { $_->[0] } @$ar; }
  }
}
else {
  if ($reverse) {
    $keysort = sub {my $ar = shift; return  rkeysort_inplace { $_->[0] } @$ar; }
  }
  else {
    $keysort = sub {my $ar = shift; return    keysort_inplace { $_->[0] } @$ar; }
  }
}

my $keyf;
if ($tag) {
  $keyf = sub { my $s = shift; my $re = qr/$tag[:=](\S+)/p; my $r = $s->desc(); if ($r =~ $re){ return $1;} else {return;} };
}
elsif ($regex) {
  my $m = 'id'; 
     $m = 'seq'  if ($sequence);
     $m = 'desc' if ($description); 
  $keyf = sub { my $s = shift;  my $re = qr/$regex/p; my $r = $s->$m(); if ($r =~ $re) { return $1;} else {return;} };
}
elsif ($field) {
  $keyf = sub { my $s = shift; my @fields = split $split_re,$s->desc(); if (exists $fields[$index]) {return $fields[$index]} else {return;} };
}
elsif ($description) {
  $keyf = sub { my $s = shift; $s->desc() };
}
elsif ($sequence) {
  $keyf = sub { my $s = shift; $s->seq() };
}
else {
  $keyf = sub { my $s = shift; $s->id() };
}



my $OUT = FAST::Bio::SeqIO->newFh('-format' => $format);
my $IN;
unless (@ARGV) {
    if ($moltype) {
	$IN = FAST::Bio::SeqIO->new(-fh => *STDIN{IO}, '-format' => $format, '-alphabet' => $moltype);
    }
    else {
	$IN = FAST::Bio::SeqIO->new(-fh => *STDIN{IO}, '-format' => $format);
    }
}

my $seqs = [];
my $seqlength = 0;
my $merge;
my @fnames = ();
while ($IN or @ARGV) {
  if (@ARGV) {
    my $file = shift (@ARGV);
    unless (-e $file) {
      warn "$NAME: Could not find file $file. Skipping.\n";
      next;
    }
    elsif ($moltype) {
      $IN = FAST::Bio::SeqIO->new(-file => $file, '-format' => $format, '-alphabet' => $moltype);
    }
    else {
      $IN = FAST::Bio::SeqIO->new(-file => $file, '-format' => $format);
    }
  }
  if ($IN) { 
    while (my $seq = $IN->next_seq()) {
      $seqlength += $seq->length();
      my $key = &$keyf($seq);
      push @$seqs, [$key,$seq];
      if ($seqlength > $memory_limit) {
        $seqlength = 0;
	$merge = 1;                                     ## PREPARE TO MERGE SORT
	&$keysort(\@$seqs);                             ## SORT DATA IN MEMORY
	my ($fh, $filename) = File::Temp::tempfile();   ## OPEN TEMPFILE 
	push @fnames, $filename;                        ## REMEMBER NAME                 
	Storable::store_fd($seqs, $fh)                  ## STORE DATA
	    or die("$NAME: Could not write data to $filename with Storable");                 
	$seqs = [];                                     ## FREE MEMORY
      }
      
    }
    undef $IN;
  }
}

&$keysort(\@$seqs); 

unless ($merge) {
  foreach my $seq (@$seqs) {
    print $OUT $seq->[1];
  }
}
else {
  my $last_iter_coderef = sub { return shift @$seqs };
  my $last_iter = Sort::MergeSort::Iterator->new($last_iter_coderef);
  my @iters = ();
  push @iters,$last_iter;
  while (@fnames) {
    my $fname = shift (@fnames);
    my $arrayref = Storable::retrieve($fname) 
      or die ("$NAME: Could not retrieve data from $fname with Storable");  
    my $iter_coderef = sub { return shift @$arrayref };
    my $iter = Sort::MergeSort::Iterator->new($iter_coderef);
    push @iters, $iter;
  }
  my $comparefunc = &return_compare_function();
  my $iterator = Sort::MergeSort::mergesort($comparefunc, @iters);
  while (<$iterator>) {
    print $OUT $_->[1];
  }
}


sub return_compare_function {
  if ($numeric){
    if ($reverse) {
      return sub { my($a,$b)=@_; return ($b->[0] <=> $a->[0]); };
    }
    else {
      return sub { my($a,$b)=@_; return ($a->[0] <=> $b->[0]); };
    }
  }
  else {
    if ($reverse) {
      return sub { my($a,$b)=@_; return ($b->[0] cmp $a->[0]); };
    }
    else {
      return sub { my($a,$b)=@_; return ($a->[0] cmp $b->[0]); };
    }
  }
}


__END__

=head1 NAME

B<fassort> - sort sequences based on identifiers

=head1 SYNOPSIS

B<fassort> [OPTION]... [MULTIFASTA-FILE]...

=head1 DESCRIPTION

B<fassort> takes sequence or alignment data as input, and outputs
sequence records sorted by some user-defined criteria. By default,
sequences are sorted ASCIIbetically by their identifiers. Optionally
sequences may be sorted by their sequences, descriptions or components
of descripions.

Options specific to B<fassort>:
  B<-d>, B<--description>	                sort on description
  B<-s>, B<--sequence>                    sort on sequence
  B<-f>, B<--field>=<int>	                sort on field in description
  B<-t>, B<--tag>=<string>                sort on value of a (tag,value) pair in the description
  B<-x>, B<--regex>=<regex>               sort on value of a regex capture
  B<-S>, B<--split-on-regex>=<regex>      split description for field matching using regex 
  B<-r>, B<--reverse>                     reverse result of comparisons
  B<-n>, B<--numeric>                     compare according to string numerical value
  

Options general to FAST:
  -B<h>, B<--help>                        print a brief help message
  B<--man>             	           print full documentation
  B<--version>                         print version
  -B<l>, B<--log>                         create/append to logfile	
  -B<L>, B<--logname>=<string>            use logfile name <string>
  -B<C>, B<--comment>=<string>            save comment <string> to log
  B<--format>=<format>                 use alternative format for input  
  B<--moltype>=<[dna|rna|protein]>     specify input sequence type
  -q, --fastq                       use fastq format as input and output

=head1 INPUT AND OUTPUT

B<fassort> is part of FAST, the FAST Analysis of Sequences Toolbox, based
on Bioperl. Most core FAST utilities expect input and return output in
multifasta format. Input can occur in one file or on STDIN. Output
occurs to STDOUT. The FAST utility B<fasconvert> can reformat other
formats to and from multifasta.

=head1 DEFAULT AND OPTIONAL SORTING BEHAVIOR

By default, B<fassort> sorts sequence records by their
identifiers. Options described below modify which parts of sequence
records are sorted on. These options take effect as follows with
decreasing priority: B<-t> > B<-xd> > B<-xs> > B<-x> > B<-f> > B<-d> > B<-s>

=head1 OPTIONS

=over 8

=item B<-d>,
      B<--description> 		

Sort sequence records by their descriptions. If used in combination with the 
-x, --regex option, sort records by values of a regex capture
applied to their descriptions. 

=item B<-s>,
      B<--match-sequence> 		

Sort sequence records by their sequences. If used in combination with the 
-x, --regex option, sort records by values of a regex capture
applied to their sequences.   

=item B<-f [int]>,
      B<--field=[int]>     

Sort sequence records by values at a specific field in sequence descriptions.
With this option, the description is split into fields using strings of white
space as field delimiters (the default Perl behavior for splitting
lines of data into fields).

This option takes a mandatory positive integer option argument giving
the index for which field the regex should query. One-based indexing
is used, so the first field after the sequence identifier has index
1. As standard in Perl, negative indices count backwards from the last
field in the description; field "-1" is the last field, "-2" is the
second-to-last etc. Sequence records for which the specified field
does not exist will sort on a null key.

=item B<-S [regex]>,
      B<--split-on-regex=[regex]>   

Use regex <regex> to split descriptions for the -f option instead
of the perl default (which splits on one or more whitespace
characters). Special characters must be quoted to protect them from
the shell.

=item B<-t [string]>,
      B<--tag=[string]>   

Sort sequence records by values of a named tag in the description.
Name-value pairs in the description are expected to have the format
"name:value" as generated by FAST tools (such as faslen) or
"name=value" as common in General Feature Format. The "name" must
contain only characters from the set [a-zA-Z0-9_-]. The "value" is any
string of non-whitespace characters. Sequence records for which the
specified tag does not exist will sort on a null key.

=item B<-x [regex]>,
      B<--regex=[regex]>   

Sort sequence records by values of a regex capture applied to
identifers (by default), descriptions (in combination with the -d
option), or sequences (in combination with the -s option). The format
of the regex is a perl regular expression that must contain exactly
one "capture buffer" indicated by a pair or parentheses. The value
captured in this buffer is the sort key. Sequence records that do not
match the regex will sort on a null key.

=item B<-r>,
      B<--reverse> 	   

Reverse the result of comparisons so that greater keys come first.

=item B<-n>,
      B<--numeric> 	   

Compare keys by their string numerical value.

=item B<-h>,
      B<--help>

Print a brief help message and exit.

=item B<--man>

Print the manual page and exit.

=item B<--version>

Print version information and exit.

=item B<-l>,
      B<--log>

Creates, or appends to, a generic FAST logfile in the current working
directory. The logfile records date/time of execution, full command
with options and arguments, and an optional comment.

=item B<-L [string]>,
      B<--logname=[string]>

Use [string] as the name of the logfile. Default is "FAST.log.txt".

=item B<-C [string]>,
      B<--comment=[string]>

Include comment [string] in logfile. No comment is saved by default.

=item B<--format=[format]> 		  

Use alternative format for input. See man page for "fasconvert" for
allowed formats. This is for convenience; the FAST tools are designed
to exchange data in Fasta format, and "fasta" is the default format
for this tool.

=item B<-m [dna|rna|protein]>,
      B<--moltype=[dna|rna|protein]> 		  

Specify the type of sequence on input (should not be needed in most
cases, but sometimes Bioperl cannot guess and complains when
processing data).

=item B<-q>
      B<--fastq>

Use fastq format as input and output.

=back

=head1 EXAMPLES

Sort sequences by their length:

=over 8

B<faslen> data.fas | B<fassort> -nt length

=back

Sort sequences by their third-position adenosine contents:

=over 8

B<fascut> 1:-1:3 cds.fas | B<fascomp> | B<fassort> -nt comp_A

=back

=head1 SEE ALSO

=over 8

=item C<man perlre>

=item C<perldoc perlre>

Documentation on perl regular expressions.

=item C<man FAST>

=item C<perldoc FAST>

Introduction and cookbook for FAST

=item L<The FAST Home Page|http://compbio.ucmerced.edu/ardell/FAST>"

=back 

=head1 CITING

If you use FAST, please cite I<Lawrence et al. (2015). FAST: FAST Analysis of
Sequences Toolbox.> and Bioperl I<Stajich et al.>. 

=cut
