# TOOL tophat2.R: "TopHat2 for paired end reads" (Aligns paired end Illumina RNA-seq reads to a genome in order to identify exon-exon splice junctions. The alignment process consists of several steps. If annotation is available as a GTF file, TopHat will extract the transcript sequences and use Bowtie to align reads to this virtual transcriptome first. Only the reads that do not fully map to the transcriptome will then be mapped on the genome. The reads that remain still unmapped are split into shorter segments, which are then aligned to the genome. Alignment results are given in a BAM file, which is automatically indexed and hence ready to be viewed in Chipster genome browser.)
# INPUT reads1.fq: "Read file 1" TYPE GENERIC
# INPUT reads2.fq: "Read file 2 with mates in matching order" TYPE GENERIC
# INPUT OPTIONAL genes.gtf: "Optional GTF file" TYPE GENERIC
# OUTPUT OPTIONAL tophat.bam
# OUTPUT OPTIONAL tophat.bam.bai
# OUTPUT OPTIONAL junctions.bed
# OUTPUT OPTIONAL insertions.bed
# OUTPUT OPTIONAL deletions.bed
# OUTPUT OPTIONAL tophat-summary.txt
# PARAMETER organism: "Genome or transcriptome" TYPE [Arabidopsis_thaliana.TAIR10.23, Bos_taurus.UMD3.1, Canis_familiaris.CanFam3.1, Drosophila_melanogaster.BDGP5, Gallus_gallus.Galgal4, Gasterosteus_aculeatus.BROADS1, Halorubrum_lacusprofundi_atcc_49239.GCA_000022205.1.23, Homo_sapiens.GRCh37.75, Homo_sapiens.GRCh38, Mus_musculus.GRCm38, Ovis_aries.Oar_v3.1, Rattus_norvegicus.Rnor_5.0, Schizosaccharomyces_pombe.ASM294v2.23, Sus_scrofa.Sscrofa10.2, Vitis_vinifera.IGGP_12x.23, Yersinia_enterocolitica_subsp_palearctica_y11.GCA_000253175.1.23] DEFAULT Homo_sapiens.GRCh38 (Genome or transcriptome that you would like to align your reads against.)
# PARAMETER OPTIONAL use.gtf: "Use annotation GTF" TYPE [yes, no] DEFAULT yes (If this option is provided, TopHat will extract the transcript sequences and use Bowtie to align reads to this virtual transcriptome first. Only the reads that do not fully map to the transcriptome will then be mapped on the genome. The reads that did map on the transcriptome will be converted to genomic mappings (spliced as needed\) and merged with the novel mappings and junctions in the final TopHat output. If no GTF file is provided by user, internal annotation file will be used if available. Internal annotation is provided for human, mouse, rat, dog and fruitfly.)
# PARAMETER OPTIONAL no.novel.juncs: "When GTF file is used, ignore novel junctions" TYPE [yes, no] DEFAULT no (Only look for reads across junctions indicated in the supplied GTF file.)
# PARAMETER OPTIONAL quality.format: "Base quality encoding used" TYPE [sanger: "Sanger - Phred+33", phred64: "Phred+64"] DEFAULT sanger (Quality encoding used in the fastq file.)
# PARAMETER OPTIONAL mate.inner.distance: "Expected inner distance between mate pairs" TYPE INTEGER DEFAULT 200 (Expected mean inner distance between mate pairs. For example, if your fragment size is 300 bp and read length is 50 bp, the inner distance is 200.)
# PARAMETER OPTIONAL mate.std.dev: "Standard deviation for the inner distances between mate pairs" TYPE INTEGER DEFAULT 20 (The standard deviation for the distribution on inner distances between mate pairs. The default is 20bp.)
# PARAMETER OPTIONAL max.multihits: "How many hits is a read allowed to have" TYPE INTEGER FROM 1 TO 1000000 DEFAULT 20 (Instructs TopHat to allow up to this many alignments to the reference for a given read.)
# PARAMETER OPTIONAL mismatches: "Number of mismatches allowed in final alignment" TYPE INTEGER FROM 0 TO 5 DEFAULT 2 (Final read alignments having more than this many mismatches are discarded.)
# PARAMETER OPTIONAL min.anchor.length: "Minimum anchor length" TYPE INTEGER FROM 3 TO 1000 DEFAULT 8 (TopHat2 will report junctions spanned by reads with at least this many bases on each side of the junction. Note that individual spliced alignments may span a junction with fewer than this many bases on one side. However, every junction involved in spliced alignments is supported by at least one read with this many bases on each side.)
# PARAMETER OPTIONAL splice.mismatches: "Maximum number of mismatches allowed in the anchor" TYPE INTEGER FROM 0 TO 2 DEFAULT 0 (The maximum number of mismatches that may appear in the anchor region of a spliced alignment.)
# PARAMETER OPTIONAL min.intron.length: "Minimum intron length" TYPE INTEGER FROM 10 TO 1000 DEFAULT 70 (TopHat2 will ignore donor-acceptor pairs closer than this many bases apart.)
# PARAMETER OPTIONAL max.intron.length: "Maximum intron length" TYPE INTEGER FROM 1000 TO 1000000 DEFAULT 500000 (TopHat2 will ignore donor-acceptor pairs farther than this many bases apart, except when such a pair is supported by a split segment alignment of a long read.)
# PARAMETER OPTIONAL no.discordant: "Report only concordant alignments" TYPE [yes, no] DEFAULT yes (Report only concordant mappings.) 
# PARAMETER OPTIONAL no.mixed: "Report only paired alignments" TYPE [yes, no] DEFAULT yes (Only report read alignments if both reads in a pair can be mapped.)

# EK 17.4.2012 added -G and -g options
# MG 24.4.2012 added ability to use gtf files from Chipster server
# AMS 19.6.2012 added unzipping
# AMS 27.6.2012 added parameter mate.std.dev, allow negative values for mate.inner.distance
# AMS 4.10.2012 added BED sorting
# KM 10.7. 2012 added RN5
# AMS 11.11.2013 added thread support
# AMS 3.1.2014 added transcriptome index for human
# EK 3.1.2014 added alignment summary to output, added quality and mismatch parameter
# EK 3.6.2014 rn4 commented out
# AMS 04.07.2014 New genome/gtf/index locations & names

# OUTPUT OPTIONAL tophat2.log

# check out if the file is compressed and if so unzip it
source(file.path(chipster.common.path, "zip-utils.R"))
unzipIfGZipFile("reads1.fq")
unzipIfGZipFile("reads2.fq")

options(scipen = 10)
# max.intron.length <- formatC(max.intron.length, "f", digits = 0)

# setting up TopHat
tophat.binary <- c(file.path(chipster.tools.path, "tophat2", "tophat2"))
path.bowtie <- c(file.path(chipster.tools.path, "bowtie2"))
path.samtools <- c(file.path(chipster.tools.path, "samtools"))
set.path <-paste(sep="", "PATH=", path.bowtie, ":", path.samtools, ":$PATH")
path.bowtie.index <- c(file.path(chipster.tools.path, "genomes", "indexes", "bowtie2", organism))
path.tophat.index <- c(file.path(chipster.tools.path, "genomes", "indexes", "tophat2", organism))

# command start
command.start <- paste("bash -c '", set.path, tophat.binary)

# parameters
#command.parameters <- paste("--bowtie1 -r", mate.inner.distance, "--mate-std-dev", mate.std.dev, "-a", min.anchor.length, "-m", splice.mismatches, "-i", min.intron.length, "-I", max.intron.length, "-g", max.multihits, "--library-type fr-unstranded")
command.parameters <- paste("-p", chipster.threads.max, "-r", mate.inner.distance, "--mate-std-dev", mate.std.dev, "--read-mismatches", mismatches, "-a", min.anchor.length, "-m", splice.mismatches, "-i", min.intron.length, "-I", max.intron.length, "-g", max.multihits, "--library-type fr-unstranded")

if ( quality.format == "phred64") {
	command.parameters <- paste(command.parameters, "--phred64-quals")
}

if (no.discordant == "yes"){
	command.parameters <- paste(command.parameters, "--no-discordant")
}

if (no.mixed == "yes"){
	command.parameters <- paste(command.parameters, "--no-mixed")
}

# Options --no-novel-juncs and --transcriptome-idex are only valid when -G (use.gtf) option is selected
if (use.gtf == "yes") {
	if (file.exists("annotation.gtf")){
		# If user has provided a gtf we use that
		annotation.file <- "annotation.gtf"
	}else{
		# If not, we use the internal one.
		annotation.file <- file.path(chipster.tools.path, "genomes", "gtf", paste(organism, "*.gtf" ,sep="" ,collapse=""))
	}
	command.parameters <- paste(command.parameters, "-G", annotation.file)
	if (no.novel.juncs == "yes") {
		command.parameters <- paste(command.parameters, "--no-novel-juncs")
	}
	command.parameters <- paste(command.parameters, "--transcriptome-index", path.tophat.index)
}

# command ending
command.end <- paste(path.bowtie.index, "reads1.fq reads2.fq >> tophat2.log '")

# run tophat
command <- paste(command.start, command.parameters, command.end)

echo.command <- paste("echo '",command ,"' > tophat2.log " )
system(echo.command)
#stop(paste('CHIPSTER-NOTE: ', command))
system(command)

# samtools binary
samtools.binary <- c(file.path(chipster.tools.path, "samtools", "samtools"))

# sort bam (removed because TopHat itself does the sorting)
# system(paste(samtools.binary, "sort tophat_out/accepted_hits.bam tophat"))
system("mv tophat_out/accepted_hits.bam tophat.bam") 

# index bam
system(paste(samtools.binary, "index tophat.bam"))

system("mv tophat_out/junctions.bed junctions.u.bed")
system("mv tophat_out/insertions.bed insertions.u.bed")
system("mv tophat_out/deletions.bed deletions.u.bed")
system("mv tophat_out/align_summary.txt tophat-summary.txt")

# sorting BEDs
source(file.path(chipster.common.path, "bed-utils.R"))

size <- 0
size <- file.info("junctions.u.bed")$size

echo.command <- paste("echo ",size ," >> tophat2.log" )
system(echo.command)
#system("sleep 120")
#stop(paste('CHIPSTER-NOTE: ', size))

if (file.exists("junctions.u.bed")){
	size <- file.info("junctions.u.bed")$size
	if (size > 100){	
		bed <- read.table(file="junctions.u.bed", skip=1, sep="\t")
		colnames(bed)[1:2] <- c("chr", "start")
		sorted.bed <- sort.bed(bed)
		write.table(sorted.bed, file="junctions.bed", sep="\t", row.names=F, col.names=F, quote=F)
	}	
}

if (file.exists("insertions.u.bed")){
	size <- file.info("insertions.u.bed")$size
	if (size > 100){
		bed <- read.table(file="insertions.u.bed", skip=1, sep="\t")
		colnames(bed)[1:2] <- c("chr", "start")
		sorted.bed <- sort.bed(bed)
		write.table(sorted.bed, file="insertions.bed", sep="\t", row.names=F, col.names=F, quote=F)
	}
}

if (file.exists("insertions.u.bed")){
	size <- file.info("deletions.u.bed")$size
	if (size > 100){
		bed <- read.table(file="deletions.u.bed", skip=1, sep="\t")
		colnames(bed)[1:2] <- c("chr", "start")
		sorted.bed <- sort.bed(bed)
		write.table(sorted.bed, file="deletions.bed", sep="\t", row.names=F, col.names=F, quote=F)
	}
}
