#!/usr/bin/env perl
package Bio::AssemblyImprovement::Bin::ReadCorrectionWithSGA;
# ABSTRACT: Given two fastq files (forward and reverse), try and correct the reads using SGA before doing the assembly
# PODNAME: read_correction_with_sga


BEGIN { unshift( @INC, '../lib' ) }
use lib "/software/pathogen/internal/prod/lib";
use Moose;
use Getopt::Long;
use Cwd;
use Cwd 'abs_path';

use Bio::AssemblyImprovement::Assemble::SGA::Main;

# Command-line option targets. Each stays undef unless the matching option is
# supplied; defaults are filled in after the usage check below.
my ( $forward_reads_file, $reverse_reads_file, $sga_exec, $min_length, $quality_filter, $quality_trim );
my ( $algorithm, $threads, $disk, $kmer_threshold, $kmer_length, $output_directory, $output_filename, $debug, $help );

# kmer_threshold was previously declared as 'h|kmer_threshold=i', which clashed
# with 'h|help' on the name "h", so the threshold could not be set reliably.
# It now has its own short flag (-c); -h is reserved for help.
# The return value is kept so that unknown or malformed options abort with the
# usage text instead of silently running with defaults.
my $options_ok = GetOptions(
    'f|forward_fastq=s'    => \$forward_reads_file,
    'r|reverse_fastq=s'    => \$reverse_reads_file,
    's|sga_exec=s'         => \$sga_exec,
    'm|min_length=i'       => \$min_length,
    'q|quality_trim=i'     => \$quality_trim,
    'l|quality_filter=i'   => \$quality_filter,
    'a|algorithm=s'        => \$algorithm,
    't|threads=i'          => \$threads,
    'i|disk=i'             => \$disk,
    'c|kmer_threshold=i'   => \$kmer_threshold,
    'k|kmer_length=i'      => \$kmer_length,
    'o|output_directory=s' => \$output_directory,
    'z|output_filename=s'  => \$output_filename,
    'd|debug'              => \$debug,
    'h|help'               => \$help,
);

# Show the usage text and stop when option parsing failed, when either reads
# file is missing from the command line or from disk, or when help was asked for.
(      $options_ok
    && defined($forward_reads_file)
    && defined($reverse_reads_file)
    && ( -e $forward_reads_file )
    && ( -e $reverse_reads_file )
    && !$help ) or die <<USAGE;
Usage: read_correction_with_sga [options]
	
        -f|forward_fastq       <forward reads file - zipped or unzipped>
        -r|reverse_fastq       <reverse reads files - zipped or unzipped >
        -s|sga_exec	 		   <path to sga script>
        
        -m|min_length	       <discard sequences that are shorter than this during preprocess>
        -q|quality_trim	       <trim reads using Heng Li's BWT trimming algorithm>
        -l|quality_filter	   <discard read if it contains more than this many low-quality bases>
        
        -a|algorithm		   <indexing algorithm - ropebwt or sais>
        -t|threads	           <number of threads to use for computation>
        -i|disk				   <use disk-based BWT construction algorithm. The suffix array/BWT will be constructed
                                for batchs of this many reads at a time. To construct the suffix array of 200 megabases of sequence
                                requires ~2GB of memory, set this parameter accordingly.>
        -c|kmer_threshold	   <kmer threshold. Attempt to correct kmers that appear less than this many times>
        -k|kmer_length	       <the length of kmer to be used>
        
        -o|output_directory	   <output directory for results file(s)>
        -z|output_filename     <output filename if not using default of _sga_error_corrected.fastq.gz>
        -d|debug			   <debug>
        -h|help      		   <this message>
        
Takes in two FASTQ files (forward and reverse), and then performs read correction using SGA

# outputs a file called _sga_error_corrected.fastq.gz
read_correction_with_sga -f 123_1.fastq -r 123_2.fastq 

# Gzipped input files are accepted
read_correction_with_sga -f 123_1.fastq.gz -r 123_2.fastq.gz

# Sga_exec defaults to standard sga installation, threads default to 1, indexing algorithm defaults to ropebwt and kmer_length defaults to 41 if not specified
read_correction_with_sga -f 123_1.fastq -r 123_2.fastq -a sais -t 8 -k 41

# This help message
read_correction_with_sga -h

USAGE

# Fill in defaults for every option that was not given on the command line.
$sga_exec ||= '/software/pathogen/external/apps/usr/local/src/SGA/sga'; #Do we need to check if executable available?

# Zero is a meaningful setting for the preprocessing filters, so the default is
# applied only when the option is absent. With `||=` an explicit 0 was silently
# replaced by the default.
$min_length     = 66 unless defined $min_length;
$quality_trim   = 20 unless defined $quality_trim;
$quality_filter = 3  unless defined $quality_filter;

# For the remaining options an empty or zero value is not usable, so it falls
# back to the default as well.
$algorithm        ||= 'ropebwt';
$threads          ||= 1;
$disk             ||= 1000000;
$kmer_threshold   ||= 5;
$kmer_length      ||= 41;
$output_directory ||= abs_path( getcwd() );
$output_filename  ||= '_sga_error_corrected.fastq';
$debug            ||= 0;

# Collect the constructor arguments for the SGA wrapper in one place: the two
# reads files (forward first, then reverse) plus every tuning option resolved above.
my %sga_arguments = (
    input_files      => [ $forward_reads_file, $reverse_reads_file ],
    sga_exec         => $sga_exec,
    algorithm        => $algorithm,
    threads          => $threads,
    disk             => $disk,
    min_length       => $min_length,
    quality_trim     => $quality_trim,
    quality_filter   => $quality_filter,
    kmer_threshold   => $kmer_threshold,
    kmer_length      => $kmer_length,
    output_directory => $output_directory,
    output_filename  => $output_filename,
    debug            => $debug,
);

# Build the wrapper, then run it; $sga receives whatever run() returns.
my $read_corrector = Bio::AssemblyImprovement::Assemble::SGA::Main->new(%sga_arguments);
my $sga            = $read_corrector->run();

__END__

=pod

=head1 NAME

read_correction_with_sga - Given two fastq files (forward and reverse), try and correct the reads using SGA before doing the assembly

=head1 VERSION

version 1.131060

=head1 SYNOPSIS

Given two input fastq files, perform error correction on them. The results are put into a fastq file called _sga_error_corrected.fastq in the current
working directory (or, if specified, a directory of your choice).

Usage: read_correction_with_sga [options]

        -f|forward_fastq       <forward reads file - zipped or unzipped>
        -r|reverse_fastq       <reverse reads files - zipped or unzipped >
        -s|sga_exec	 		   <path to sga script>
        
        -m|min_length	       <discard sequences that are shorter than this during preprocess>
        -q|quality_trim	       <trim reads using Heng Li's BWT trimming algorithm>
        -l|quality_filter	   <discard read if it contains more than this many low-quality bases>
        
        -a|algorithm		   <indexing algorithm - ropebwt or sais>
        -t|threads	           <number of threads to use for computation. Choose carefully>
        -i|disk				   <for disk based index algorithm>
        -h|kmer_threshold	   <kmer threshold. Attempt to correct kmers that appear less than this many times>
        -k|kmer_length	       <the length of kmer to be used>
        -o|output_directory	   <output directory for results file>
        -z|output_filename	   <name of output filename if not using default of _sga_error_corrected.fastq.gz>
        -d|debug			   <debug>
        -h|help      		   <this message>

Takes in two FASTQ files (forward and reverse), and then performs read correction using SGA

# outputs a file called _sga_error_corrected.fastq
read_correction_with_sga -f 123_1.fastq -r 123_2.fastq 

# Gzipped input files are accepted
read_correction_with_sga -f 123_1.fastq.gz -r 123_2.fastq.gz

# This help message
read_correction_with_sga -h 

=head1 AUTHOR

Andrew J. Page <ap13@sanger.ac.uk>

=head1 COPYRIGHT AND LICENSE

This software is Copyright (c) 2012 by Wellcome Trust Sanger Institute.

This is free software, licensed under:

  The GNU General Public License, Version 3, June 2007

=cut
