#!/usr/bin/env perl
# ABSTRACT: A FASTA/FASTQ sequence counter
# PODNAME: fqc

use 5.012;
use Pod::Usage;
use Term::ANSIColor;
use Getopt::Long;
use FindBin qw($Bin);
use lib "$Bin/../lib";
use File::Basename;
use File::Spec;
use FASTX::Reader;
use Data::Dumper;
use JSON::PP;

# Refuse to run with a FASTX::Reader release older than 0.3.
die "ERROR: FASTX::Reader required version is >= 0.3\n" if ($FASTX::Reader::VERSION < 0.3);

# Per-file results, keyed by the absolute path of each input file.
my %output;
# Comparator (code reference) used to order the output rows; chosen by init().
my $sort_function;
# Running index of the input files, stored as the 'order' field of each record.
my $counter = 0;

# Placeholder stored when a value is not available (missing or unparsable file).
my $opt_invalid = 'n/a';
# Field separator: tab by default, init() switches it to "," when --csv is given.
my $opt_separator ="\t";
# Field used to sort the output rows (validated by init()).
my $opt_sortby = 'order';
my ($opt_abs, $opt_basename, $opt_thousand, $opt_fastx, $opt_csv, $opt_json, $opt_rev, $opt_screen, $opt_tsv, $opt_help, $opt_pretty_json, $opt_verbose);
# Suffix of the sorter selected by default; init() replaces it with 'asc' when --reverse is given.
my $sort_direction = 'desc';
my $_opt = GetOptions(
  'a|abspath'    => \$opt_abs,
  'b|basename'   => \$opt_basename,
  'c|csv'        => \$opt_csv,
  'd|thousandsep'=> \$opt_thousand,
  'f|fastx'      => \$opt_fastx,
  'h|help'       => \$opt_help,
  'j|json'       => \$opt_json,
  'p|pretty'     => \$opt_pretty_json,
  'r|reverse'    => \$opt_rev,
  's|sortby=s'   => \$opt_sortby,
  't|tsv'        => \$opt_tsv,
  'v|verbose'    => \$opt_verbose,
  'x|screen'     => \$opt_screen,

);

# GetOptions returns false after reporting unknown or malformed options:
# stop here rather than silently running with a partially parsed command line.
die "FATAL ERROR:\nInvalid command line options (see 'fqc --help').\n" unless ($_opt);

init();

# Count the sequences of every file given on the command line. One record per
# file is stored in %output under the file's absolute path, with the fields:
# name, order, count, print_count, parser and compressed.
foreach my $input_file (@ARGV) {
  print STDERR " - Reading $input_file... " if ($opt_verbose);
  $counter++;
  my $abs_path = File::Spec->rel2abs($input_file);

  # Printed name: the path as supplied, unless --abspath or --basename is given
  my $print_name = $input_file;
  if ($opt_abs) {
    $print_name = $abs_path;
  } elsif ($opt_basename) {
    $print_name = basename($input_file);
  }
  $output{$abs_path}{name}  = $print_name;
  $output{$abs_path}{order} = $counter;

  # Missing files keep their row, filled with the "not available" placeholder
  if (! -e "$input_file") {
    $output{$abs_path}{count} = $opt_invalid;
    $output{$abs_path}{print_count} = $opt_invalid;
    $output{$abs_path}{parser} = "NONE";
    $output{$abs_path}{compressed} = "$opt_invalid";
    print STDERR "NOT FOUND\r" if ($opt_verbose);
    print STDERR color('red'), "[WARNING] ", color('reset'), "File <$input_file> not found. Skipping.\n";
    next;
  }
  my $seq_reader = FASTX::Reader->new({filename => "$input_file"});

  # The FASTQ parser is selected from the file name only (never from its
  # directories), and the extension must close the name, optionally followed
  # by .gz: an unanchored match would route e.g. "run.fastq_files/ref.fasta"
  # to the FASTQ-only parser.
  if (basename($input_file) =~ /\.(?:fastq|fq)(?:\.gz)?\z/ and not defined $opt_fastx) {
    # parse FASTQ: the reads are only consumed, the reader keeps the tally
    while (my $seq = $seq_reader->getFastqRead()) {
      next;
    }
    $output{$abs_path}{count}  = $seq_reader->{counter};
    $output{$abs_path}{parser} = 'FASTQ';
    # A reader status of 0 after parsing is reported as a not available count
    $output{$abs_path}{count}  = "$opt_invalid" if ($seq_reader->{status} == 0);
  } else {
    # generic FASTX parser
    while (my $seq = $seq_reader->getRead()) {
      next;
    }
    $output{$abs_path}{parser} = 'FASTX';
    $output{$abs_path}{count}  = $seq_reader->{counter};
  }
  $output{$abs_path}{count} //= $opt_invalid;
  $output{$abs_path}{compressed}  = $seq_reader->{compressed};

  # print_count is the count as displayed, optionally with "," every three digits
  $output{$abs_path}{print_count} = $output{$abs_path}{count};
  $output{$abs_path}{print_count} =~ s/(\d{1,3}?)(?=(\d{3})+$)/$1,/g if ($opt_thousand);
  print STDERR $output{$abs_path}{print_count}, "     \r" if ($opt_verbose);
}

print STDERR "\n" if ($opt_verbose);

# PRINT OUTPUT
# Exactly one of the branches below runs, depending on the selected format;
# rows of the non-JSON formats are ordered with the comparator set by init().

if ($opt_json) {
  print STDERR "Printing output in JSON format:\n" if ($opt_verbose);
  print encode_json(\%output);
} elsif ($opt_pretty_json) {
  print STDERR "Printing output in JSON (PRETTY_PRINT) format:\n" if ($opt_verbose);
  my $json = JSON::PP->new->ascii->pretty->allow_nonref;
  print $json->encode( \%output );

} elsif ($opt_csv or $opt_tsv) {
  # With --csv a field holding a comma, a double quote or a line break is
  # enclosed in double quotes (inner quotes doubled). Without this, a count
  # such as "1,234" (--thousandsep) or a file name containing a comma would
  # add spurious columns to the row. TSV fields are printed unchanged.
  my $format_field = sub {
    my ($value) = @_;
    $value = '' unless (defined $value);
    if ($opt_csv and $value =~ /[",\r\n]/) {
      $value =~ s/"/""/g;
      $value = qq("$value");
    }
    return $value;
  };

  foreach my $r (sort $sort_function  keys %output  ) {
    print join("$opt_separator",
      map { $format_field->($_) }
        @{ $output{$r} }{qw(order name print_count compressed parser)}
    ), "\n";
  }
} elsif ($opt_screen) {
  print STDERR "Printing output in ASCII tabular format:\n" if ($opt_verbose);
  # Text::ASCIITable is loaded on demand by init() when --screen is requested
  my $t = Text::ASCIITable->new();
  $t->setCols('#','Name','Seqs', 'Gz','Parser');
  foreach my $r (sort $sort_function  keys %output  ) {

    $t->addRow(
      $output{$r}->{order},
      $output{$r}->{name},
      $output{$r}->{print_count},
      $output{$r}->{compressed},
      $output{$r}->{parser},
      );
  }
  print $t;

} else {
  # Default: only name and counts
  print STDERR "Printing output in default format:\n" if ($opt_verbose);
  foreach my $r (sort $sort_function  keys %output  ) {
    print join("$opt_separator", $output{$r}->{name}, $output{$r}->{print_count}), "\n";
  }
}

# Validate the parsed command line options and prepare the run.
#
# Takes no arguments and works on the file-level option variables:
#  - prints the full manual and exits when --help is given;
#  - prints a short usage banner to STDERR when no input file is supplied
#    (execution is not interrupted);
#  - dies when more than one output format is requested;
#  - loads Text::ASCIITable for --screen, falling back to the default output
#    when the module cannot be loaded;
#  - sets the field separator to "," for --csv;
#  - stores in $sort_function the comparator matching --sortby/--reverse, and
#    dies when the --sortby field is not supported.
sub init {

  pod2usage({-exitval => 0, -verbose => 2}) if $opt_help;

  if (! defined $ARGV[0]) {
    print STDERR<<END;
    fqc - FASTA/FASTQ counter $FASTX::Reader::VERSION
    ---------------------------------------------------------------
    USAGE:
    fqc [options] FILE1 FILE2 ... FILEn

    fqc --help for full manual

END
  }

  # Number of output format switches present on the command line
  my $count_options = grep { defined } ($opt_screen, $opt_json, $opt_tsv, $opt_csv, $opt_pretty_json);

  die "FATAL ERROR:\n",
  "Just select one (or none) output format: either --csv, --tsv, --screen  or --json (none for default).\n" if ($count_options>1);

  if ($opt_screen) {
    # Optional dependency: only required when the ASCII table is requested
    eval {
      require Text::ASCIITable;
      Text::ASCIITable->import();
    };
    if ($@) {
      print STDERR " WARNING: Text::ASCIITable not found, rolling back to default output.\n";
      $opt_screen = undef;
    }
  }

  $opt_separator = "," if ($opt_csv);

  # SORT BY

  # Comparators for the keys of %output (absolute paths). The '_desc' entries
  # are the defaults ($sort_direction starts as 'desc') and yield ascending
  # order; the '_asc' entries are selected by --reverse. Numeric comparators
  # fall back on the input order when the sorted field is equal, so that ties
  # (e.g. identical counts) are not printed in hash iteration order.
  my %sorters = (
     num_asc         => sub { $output{$b}{$opt_sortby} <=> $output{$a}{$opt_sortby}
                                or $output{$b}{order} <=> $output{$a}{order} },
     num_desc        => sub { $output{$a}{$opt_sortby} <=> $output{$b}{$opt_sortby}
                                or $output{$a}{order} <=> $output{$b}{order} },
     string_desc     => sub { $a cmp $b },
     string_asc      => sub { $b cmp $a },
  );

  # Supported --sortby fields and the comparator family each one uses
  my %valid_sort = (
    'name'  => 'string_',
    'order' => 'num_',
    'count' => 'num_',
  );


  $sort_direction = 'asc' if ($opt_rev);

  if (not defined $valid_sort{$opt_sortby}) {
    die " Fatal error: Sort by '$opt_sortby' not supported. Valid options are: ", join(', ', sort keys %valid_sort), "\n";
  } else {
    my $sort_function_name = $valid_sort{$opt_sortby} . $sort_direction;
    $sort_function = $sorters{$sort_function_name};
  }
}

__END__

=pod

=encoding UTF-8

=head1 NAME

fqc - A FASTA/FASTQ sequence counter

=head1 VERSION

version 1.2.1

=head1 SYNOPSIS

  fqc [options] [FILE1 FILE2 FILE3...]

=head1 DESCRIPTION

This program parses a list of FASTA/FASTQ files printing the number of sequences
found in each file. Reads both uncompressed and GZipped files.
Default output is the filename, tab, sequence count. Can be changed with options.

The table "key" is the absolute path of each input file, but the printed name can be
changed with options.

=head1 PARAMETERS

=head2 FILE NAME

=over 4

=item I<-a, --abspath>

Print the absolute path of the filename (the absolute path is always the table key,
but if relative paths are supplied, they will be printed).

=item I<-b, --basename>

Print the filename without the path.

=item I<-d, --thousandsep>

Print the number of reads using "," as the thousands separator.

=back

=head2 OUTPUT FORMAT

Default output format is the filename and the read count, tab separated. The options formatting
the filename (C<-a>, C<-b>) and the read count (C<-d>) still apply.

=over 4

=item I<-t, --tsv> and I<-c, --csv>

Print a tabular output either tab separated (with C<-t>) or comma separated (with C<-c>).

=item I<-j, --json>

Print full output in JSON format.

=item I<-p,  --pretty>

Same as JSON but in "pretty" format.

=item I<-x, --screen>

This feature requires L<Text::ASCIITable>.
Print an ASCII-art table like:

  .---------------------------------------------------.
  | # | Name                     | Seqs | Gz | Parser |
  +---+--------------------------+------+----+--------+
  | 1 | data/comments.fasta      |    3 |  0 | FASTX  |
  | 2 | data/comments.fastq      |    3 |  0 | FASTQ  |
  | 3 | data/compressed.fasta.gz |    3 |  1 | FASTX  |
  | 4 | data/compressed.fastq.gz |    3 |  1 | FASTQ  |
  '---+--------------------------+------+----+--------'

=back

=head2 SORTING

=over 4

=item I<-s, --sortby>

Sort by field: 'order' (default, that is the order of the input files as supplied by the user),
'count' (number of sequences), 'name' (filename).
By default the sort is ascending for every field.
See C<-r, --reverse>.

=item I<-r, --reverse>

Reverse the sorting order.

=back

=head2 OTHER

=over 4

=item I<-f, --fastx>

Force the generic FASTX reader also for files with a .fastq or .fq extension (by default these are read with getFastqRead() ).

=item I<-v, --verbose>

Increase verbosity

=item I<-h, --help>

Display this help page

=back

=head1 AUTHOR

Andrea Telatin <andrea@telatin.com>

=head1 COPYRIGHT AND LICENSE

This software is Copyright (c) 2019 by Andrea Telatin.

This is free software, licensed under:

  The MIT (X11) License

=cut
