#!/usr/bin/perl -w
use POSIX;
use Getopt::Long qw(:config bundling require_order auto_version);
use Pod::Usage;
use FAST;
use FAST::Bio::SeqIO;
use FAST::Bio::Seq;
use strict;
use FAST::Bio::Tools::CodonTable;

use vars qw($VERSION $DESC $NAME $COMMAND $DATE);
$VERSION = $FAST::VERSION; 
$DESC    = "counts codon usage";
$NAME    = $0;
$NAME    =~ s/^.*\///;
$COMMAND = join " ",$NAME,@ARGV;
$DATE = POSIX::strftime("%c",localtime());

use constant { true => 1, false => 0 };

## DEFAULT OPTION VALUES
my $def_format  = $FAST::DEF_FORMAT;  #7/1/13 "fasta";
my $def_logname = $FAST::DEF_LOGNAME; #7/1/13 "FAST.log.txt";
my $def_join_string = $FAST::DEF_JOIN_STRING;

## OPTION VARIABLES
my $man                  = undef;  # --man
my $help                 = undef;  # -h
my $moltype              = undef;  # -m, in case bioperl can't tell
my $format               = $def_format;  # --format
my $log                  = undef;        # -l
my $logname              = $def_logname; # -L
my $comment              = undef;        # -C
my $base_order           = undef; #-b
my $verbose              = undef; #--verbose
my $amino_order          = undef; #-o
my $code                 = undef; #-c
my $codes                = undef;
my $table                = undef; #-t // table output 
my $join                 = $def_join_string; # -j
my $fastq                = undef;
my $width                = undef;
my $no_starts            = undef;
my $precision            = 3;
my $rscu                 = undef;
my $summary              = undef;

GetOptions('help|h'         		 => \$help, 
	   'man'            		 => \$man,
	   'moltype|m=s'                 => sub{  my (undef,$val) = @_; 
						  die "$NAME: --moltype or -m option argument must be \"dna\", \"rna\" or \"protein\"" 
						    unless $val =~ /dna|rna|protein/i; 
						  $moltype = $val;
						},
	   'format=s'                    => \$format,
	   'log|l'                       => \$log,
	   'logname|L=s'                 => \$logname,
	   'comment|C=s'                 => \$comment,
           'base-order|b=s'              =>  sub{  my (undef,$val) = @_; 
						  die "$NAME: --base-order or -b option takes string of T,C,A, and G in some order" 
						    unless ($val =~ /[TCAG]{4}/ and $val =~ /T/ and $val =~ /C/ and $val =~ /A/ and $val =~ /G/); 
						   $base_order = $val;
						},
           'verbose'                     => \$verbose,
           'amino-order|o:s'             => \$amino_order, #optional option argument. If not given, "ARNDCQEGHILKMFPSTWYV*"
           'no-starts|nostarts'          => \$no_starts,
           'code|c=i'                    => \$code,
           'table|t'                     => \$table,
	   'summary|s'                   => \$summary,
	   'join|j=s'                    => \$join,
	   'width|w=i'                   => \$width,
	   'precision|p=i'               => \$precision,
	   'rscu|r'                      => \$rscu,
           'fastq|q'                     => sub{$format = 'fastq';},
	   'codes'                       => \$codes,

) 
or pod2usage(2);

$join = "\t" if ($join eq '\t');
	  
pod2usage(-verbose => 1) if $help;
pod2usage(-verbose => 2) if $man;

if ($codes) {
  my $tables = FAST::Bio::Tools::CodonTable->tables;
  my ($id, $name);
  foreach $id (sort {$a <=> $b} sort keys %$tables) {
    $name = $$tables{$id};
    print "$id = $name\n";
  }
  die;
}

my $fromSTDIN = ((-t STDIN) ? false : true);
pod2usage("$NAME: Requires at least one argument FILE [FILE2…FILEN] unless input from STDIN.\n") if (!($fromSTDIN) && (@ARGV == 0));
pod2usage("$NAME: Requires exactly zero arguments if input is from STDIN.\n") if ($fromSTDIN && (@ARGV != 0));

&FAST::log($logname, $DATE, $COMMAND, $comment) if ($log); 

my $geneticcode = FAST::Bio::Tools::CodonTable->new(-id => $code);



my @append_seqs;

my @bases;
if ($base_order) {
  @bases = split //, $base_order;
}
else {
  @bases = qw/T C A G/;   
}

if (defined $amino_order and $amino_order eq ""){
  $amino_order = "ARNDCQEGHILKMFPSTWYV*";
}

my %synonyms = ();
foreach my $aa (split //, "ARNDCQEGHILKMFPSTWYV*") {
  push @{ $synonyms{$aa}},  map {uc $_} $geneticcode->revtranslate($aa);
}

my $count = {};
my $startcount = {}; ## start codons
my $totalcount = {};
my $totalstartcount = {}; ## start codons

my $middlecodons = my $startcodons = my $stopcodons = my $ambiguous_middle = my $ambiguous_start = my $ambiguous_stop = my $bad_length = my $numseqs = my $stop_at_middle = my $stop_at_start = my $nonstop = my $odd_length = my $too_short = 0;


my $OUT = FAST::Bio::SeqIO->newFh('-format' => $format);
my $IN;

unless ($width) {
  $width = $precision + 2;
}

unless (@ARGV) {
  if ($moltype) {
    $IN = FAST::Bio::SeqIO->new(-fh => *STDIN{IO}, '-format' => $format, '-alphabet' => $moltype);
  }
  else {
    $IN = FAST::Bio::SeqIO->new(-fh => *STDIN{IO}, '-format' => $format);
  }
}

if ($table) {
  if ($no_starts) {
    printf ":          USAGE %${width}s\n","RSCU";
  }
  else {
    printf ":          USAGE     STARTS %${width}s %${width}s\n","RSCU","S-RSCU";
  }
}
while ($IN or @ARGV) {
  if (@ARGV) {
    my $file = shift (@ARGV);
    unless (-e $file) {
      warn "$NAME: Could not find file $file. Skipping.\n";
      next;
    }
    elsif ($moltype) {
      $IN = FAST::Bio::SeqIO->new(-file => $file, '-format' => $format, '-alphabet' => $moltype);
    }
    else {
      $IN = FAST::Bio::SeqIO->new(-file => $file, '-format' => $format);
    }
  }
  if ($IN) { 
    while (my $seq = $IN->next_seq()) {
      $count = {};
      $startcount = {} unless ($no_starts); ## start codons
      $numseqs++;
      
      my $id=$seq->id();
      my $length = $seq->length(); ## is this slow?
      if ($length % 3) {
        warn "count_codons error_4: $id has length not of modulo 3, skipping this sequence.\n" if ($verbose);
        $odd_length++;
      } 
      if ($length % 3) {
        warn "count_codons error_4: $id has length not of modulo 3, skipping this sequence.\n" if ($verbose);
        $too_short++;
      } 
      
            
      my $seqseq = uc $seq->seq();
      ##    $seqseq =~ y/T/U/;
      my @bases = split//,$seqseq; 

      unless ($no_starts) {      
	## count start codons separately
	my $startcodon = join "",splice(@bases, 0, 3);
	my $aa = $geneticcode->translate($startcodon); 
	if ($aa eq "X") { ## ambiguous amino acid from ambiguous base, bioperl can resolve IUPAC codes
	  $ambiguous_start++;
	}
	elsif ($aa eq "*") {
	  warn "count_codons error_5: $id starts in a stop codon.\n" if ($verbose);
	  $stop_at_start++;
	}
	else { 
	  $$startcount{$startcodon}++;
	  $$totalstartcount{$startcodon}++;
	  $startcodons++;
	}
      }	
      

      next if (@bases < 3);
      
      ## count stop codons together with other middle codons but distinguish them for counting
      my $stopcodon = join "",splice(@bases, -3);
      my $aa = $geneticcode->translate($stopcodon); 
      if ($aa eq "X") {
	$ambiguous_stop++ 
      }
      else {
	if ($aa ne "*") {
	  warn "count_codons error_1: $id does not end in a stop codon, it ends in $stopcodon.\n" if ($verbose);
	  $nonstop++;
	}
	else {
	  $stopcodons++
	}
	$middlecodons++;
	$$count{$stopcodon}++; 
	$$totalcount{$stopcodon}++; 
      }
      
      
      while (@bases) {
	my $codon = join "",splice(@bases, 0, 3);
	$aa = $geneticcode->translate($codon);
	if ($aa eq "X") {
	  $ambiguous_middle++ 
	}
	elsif ($aa eq "*") {
	  warn "count_codons error_2: $id has a premature stop codon.\n" if ($verbose);
	  $stop_at_middle++;
	  $$count{$codon}++;
	  $$totalcount{$codon}++; 
	  $middlecodons++;
	}
	else { ## ambiguous amino acid from ambiguous base, bioperl can resolve IUPAC codes
	  $$count{$codon}++;
	  $$totalcount{$codon}++; 
	  $middlecodons++;
	}
	
      }
      if ($table){
	my $id = $seq->id ;# . $seq->desc();
	&print_output($id,$startcount,$count) unless ($summary);
      }
      else{
	my $old_desc = $seq->desc();
	my $new_desc = &print_output($id,$startcount,$count);
	$new_desc = join $join, $old_desc,$new_desc;
	$seq->desc($new_desc);
	print $OUT $seq;
      }
    }
    undef $IN;
  }
}
if ($table) {
  &print_output("ALL_DATA",$totalstartcount,$totalcount);
  &print_end_totals; 
}

sub print_output {
  my ($seqid,$startcount,$count) = @_;
  # calculate RSCU for middle codons
  my $append_count = 0;
  my $sum;
  my %rscu = ();
  my %startrscu = () unless ($no_starts);
  foreach my $aa (split //, "ARNDCQEGHILKMFPSTWYV*") {
    my @syns = @{$synonyms{$aa}};
    for (@syns) {
      $$count{$_} ||= 0;
    }
    @rscu{@syns} = @$count{@syns};
    foreach my $syn (@syns) { 
      $sum += $rscu{$syn};
    }
    $sum ||= 1;
    foreach my $syn (@syns) { 
      $rscu{$syn} /= $sum;
    }
  }
  
  unless ($no_starts) {
    # calculate RSCU for start codons
    # only consider start codons actually seen
    %startrscu  = %$startcount;
    $sum = 0;
    foreach my $codon (keys %startrscu) { 
      $sum += $startrscu{$codon};
    }
    $sum ||= 1;
    foreach my $codon (keys %startrscu) { 
      $startrscu{$codon} /= $sum;
    }
    map {$sum+=$_} values %startrscu;
    map {$_/$sum}  values %startrscu;
  }
  my @annotations = ();
  if ($amino_order) { ## group output by amino acid
    foreach my $aa (split //, $amino_order) {
      foreach my $codon (@{$synonyms{$aa}}) {
	$rscu{$codon} ||= 0;
	$startrscu{$codon} ||= 0 unless ($no_starts);
	$$count{$codon} ||= 0;
	$$startcount{$codon} ||= 0 unless ($no_starts);
	if ($table){
	  unless ($no_starts) {
	    printf("%s %s %10u %10u %${width}.${precision}f %${width}.${precision}f %s\n",$geneticcode->translate($codon),$codon,$$count{$codon},$$startcount{$codon},$rscu{$codon},$startrscu{$codon},$seqid);
	  }
	  else {
	    printf("%s %s %10u %${width}.${precision}f %s\n",$geneticcode->translate($codon),$codon,$$count{$codon},$rscu{$codon},$seqid);
	  }
	}
	else{ #annotate
	  if ($rscu) {
	    push @annotations, sprintf("RSCU_%s_%s:%${width}.${precision}f",$geneticcode->translate($codon),$codon,$rscu{$codon});
	  }
	  else {
	    push @annotations, sprintf("freq_%s_%s:%u",$geneticcode->translate($codon),$codon,$$count{$codon});
	  }
	}
      }
    }
  }
  else {
    foreach my $first (@bases) { 
      foreach my $second (@bases) { 
	foreach my $third (@bases) { 
	  my $codon = join "",$first,$second,$third;
	  $rscu{$codon} ||= 0;
	  $startrscu{$codon} ||= 0 unless ($no_starts);
	  $$count{$codon} ||= 0;
	  $$startcount{$codon} ||= 0 unless ($no_starts);
	  if ($table){
	    unless ($no_starts) {
	      printf("%s %s %10u %10u %${width}.${precision}f %${width}.${precision}f %s\n",$geneticcode->translate($codon),$codon,$$count{$codon},$$startcount{$codon},$rscu{$codon},$startrscu{$codon},$seqid);
	    }
	    else {
	      printf("%s %s %10u %${width}.${precision}f %s\n",$geneticcode->translate($codon),$codon,$$count{$codon},$rscu{$codon},$seqid);
	    }
	  }
	  else{ #annotate
	    if ($rscu) {
	      push @annotations, sprintf("RSCU_%s_%s:%${width}.${precision}f",$geneticcode->translate($codon),$codon,$rscu{$codon});
	    }
	    else {
	      push @annotations, sprintf("freq_%s_%s:%u",$geneticcode->translate($codon),$codon,$$count{$codon});
	    }
	  }
	}
      }
    }
  }
  unless ($table) {
    return join $join,@annotations;
  }
}
  

sub print_end_totals { 
  unless ($no_starts) {
    printf "::--------------------------------------\n";
    printf ":1    %10u %10u            | unambiguous codons at tail (middle+end) and at start\n",$middlecodons,$startcodons;
    printf ":2    %10u %10u %10u | ambiguous   codons in middle, at start, and at end\n",$ambiguous_middle,$ambiguous_start,$ambiguous_stop;
    printf "::                                     | (ambiguous codons are skipped)\n";
    printf ":3    %10u %10u            | stop        codons in middle and at start\n",$stop_at_middle,$stop_at_start;
    printf ":4                          %10u | stop        codons at end (included in line 1)\n",$stopcodons;
    printf ":4                          %10u | sense       codons at end (included in line 1)\n",$nonstop;
    printf "::--------------------------------------\n";
    printf ":5    %10u of %-10u            sequences had a length non-modulo 3 and were skipped.\n",$bad_length,$numseqs;
  }
  else {
    printf "::---------------------------\n";
    printf ":1    %10u            | unambiguous codons\n",$middlecodons;
    printf ":2    %10u %10u | ambiguous   codons in middles and ends\n",$ambiguous_middle,$ambiguous_stop;
    printf "::                          |  (ambiguous codons are skipped)\n";
    printf ":3    %10u            | stop        codons not at ends\n",$stop_at_middle;
    printf ":4               %10u | stop        codons at end (included in line 1)\n",$stopcodons;
    printf ":4               %10u | sense       codons at end (included in line 1)\n",$nonstop;
    printf "::---------------------------\n";
    printf ":5    %10u of %-10u          sequences had a length non-modulo 3 and were skipped.\n",$bad_length,$numseqs;    
  }
}
    
    
    __END__

=head1 NAME

B<fascodon> - counts codon usage

=head1 SYNOPSIS

B<fascodon> [options] [MULTIFASTA-FILE...]

=head1 DESCRIPTION

B<fascodon> takes multifasta format sequence or alignment data as
input, and analyzes codon usage and other coding characteristics of
the data. By default, B<fascodon> analyzes each sequence record
individually, outputting raw codon frequencies appended to the
description of each record, which allows sorting or filtering of data
with other FAST tools. Optionally, B<fascodon -t> in "table-mode"
outputs data in tabular format, followed by an aggregate summary of
all the data on input. By default, start codons are analyzed
separately, assuming that all sequences are at least two codons long.

Options specific to B<fascodon>:
  B<-r>, B<--rscu>                   annotate with RSCU values rather than raw frequency counts 
  B<-j>, B<--join>=<string>          use <string> to join tagged values in descriptions
  B<--no-starts>                  do not analyze start codons separately
  B<-p>, B<--precision>=<int>        print RSCU with <int> digits after the decimal point 
  B<-w>, B<--width>=<int>            print frequencies in fields of width <int>
  B<-t>, B<--table>                  table-mode, outputs codon frequency table to STDOUT 
  B<-s>, B<--summary>                in table-mode, only output summaries for all data
  B<-b>, B<--base-order>=<string>    use base-order <string> to enumerate codons
                                     default order is "TCAG"              
  B<-o>, B<--amino-order>:<string>   enumerate codons by amino acids they encode
                                     optionally, only for amino acids in <string>, or if
                                     no argument, use default "ARNDCQEGHILKMFPSTWYV*";
  B<-c>, B<--code>=<int>             use NCBI genetic code tableID <int> to translate
  B<--codes>                      print NCBI tableIDs of genetic codes for -c option
  B<--verbose>                    issue warnings about coding properties to STDERR

Options general to FAST:
  B<-h>, B<--help>                  	 print a brief help message
  B<--man>             	           print full documentation
  B<--version>                         print version
  B<-l>, B<--log>                         create/append to logfile	
  B<-L>, B<--logname>=<string>            use logfile name <string>
  B<-C>, B<--comment>=<string>            save comment <string> to log
  B<--format>=<format>                 use alternative format for input  
  B<--moltype>=<[dna|rna|protein]>     specify input sequence type
  B<-q>, B<--fastq>                       use fastq format as input and output

=head1 INPUT AND OUTPUT

B<fascodon> is part of FAST, the FAST Analysis of Sequences Toolbox, based
on Bioperl. Most core FAST utilities expect input and return output in
multifasta format. Input can occur in one or more files or on
STDIN. Output occurs to STDOUT. The FAST utility B<fasconvert> can
reformat other formats to and from multifasta.

=head1 OPTIONS

=over 8

=item B<-r>
      B<--rscu>

Output Relative Synonymous Codon Usage (RSCU) values rather than raw
frequencies (default).

=item B<-c [int]>
      B<--code=[int]>

Use NCBI genetic code tableID <int> for translating sequences.

=item B<--codes>

Output a list of NCBI genetic code tableIDs and exit.

=item B<-b [string]>
      B<--base-order=[string]>

Use bases in [string] order to enumerate codons. Default is "TCAG." 

=item B<--verbose>,

Issue warnings to STDERR about sequences with premature stop
codons, that do not end in stop codons, sequences that are not
divisible by 3, etc.

=item B<-o>
      B<--amino-order>
      B<-o [string]>
      B<--amino-order=[string]>

Enumerate codons by the amino acids they encode. If no option argument
is given, codons are enumerated in the default order
"ARNDCQEGHILKMFPSTWYV*". If option argument is given, it determines
which amino acids (codon families) will be analyzed and in what order.

=item B<-j [string]>
      B<--join=[string]>

Use <string> to join tagged value output in sequence record
descriptions. Use with argument "\t" to indicate a tab-character.


=item B<-t>
      B<--table>

Print output in a table to STDOUT. 

=item B<-h>,
      B<--help>

Print a brief help message and exit.

=item B<--man>

Print the manual page and exit.

=item B<--version>

Print version information and exit.

=item B<-l>,
      B<--log>

Creates, or appends to, a generic FAST logfile in the current working
directory. The logfile records date/time of execution, full command
with options and arguments, and an optional comment.

=item B<-L [string]>,
      B<--logname=[string]>

Use [string] as the name of the logfile. Default is "FAST.log.txt".

=item B<-C [string]>,
      B<--comment=[string]>

Include comment [string] in logfile. No comment is saved by default.

=item B<--format=[format]> 		  

Use alternative format for input. See man page for "fasconvert" for
allowed formats. This is for convenience; the FAST tools are designed
to exchange data in Fasta format, and "fasta" is the default format
for this tool.

=item B<-m [dna|rna|protein]>,
      B<--moltype=[dna|rna|protein]> 		  

Specify the type of sequence on input (should not be needed in most
cases, but sometimes Bioperl cannot guess and complains when
processing data).

=item B<-q>,
      B<--fastq>

use fastq format as input and output.

=back

=head1 EXAMPLES

Print codon usage of sequences:

=over 8

cat data.fas | B<fascodon>

=back



=head1 SEE ALSO

=over 8

=item C<man perlre>

=item C<perldoc perlre>

Documentation on perl regular expressions.

=item C<man FAST>

=item C<perldoc FAST>

Introduction and cookbook for FAST

=item L<The FAST Home Page|http://compbio.ucmerced.edu/ardell/FAST>"

=back 

=head1 CITING

If you use FAST, please cite I<Lawrence et al. (2015). FAST: FAST Analysis of
Sequences Toolbox.> and Bioperl I<Stajich et al.>. 

=cut
