# ANALYSIS Pathways/"Correlation analysis of miRNA targets" (Performs a statistical test
# to detect miRNA targets whose expression is significantly negatively correlated
# to the expression of the miRNA.)
# INPUT GENE_EXPRS normalized_gene.tsv, GENE_EXPRS normalized_mirna.tsv, GENERIC phenodata_gene.tsv, GENERIC phenodata_mirna.tsv
# OUTPUT mirna-gene-positive-correlation.tsv, mirna-gene-negative-correlation.tsv
# PARAMETER order.column.mirna METACOLUMN_SEL DEFAULT group (Phenodata column describing the order of the samples, so that the gene
# expression and miRNA expression arrays can be correctly matched in the analysis. For time course
# experiments the actual time can be used, for multiple-condition type of experiments it is
# advised to encode the different condition with a number, e.g. 1, 2, 3, 4 and 5 for
# an experiment where five different conditions have been assessed. NOTE: If a custom array was used
# for assessing the gene expression it is crucial that ENTREZ gene ID or HUGO gene symbols have
# been specified as identifier when importing the data into CHIPSTER.)
# PARAMETER order.column.gene METACOLUMN_SEL DEFAULT group (Phenodata column describing the order of the samples, so that the gene
# expression and miRNA expression arrays can be correctly matched in the analysis. For time course
# experiments the actual time can be used, for multiple-condition type of experiments it is
# advised to encode the different condition with a number, e.g. 1, 2, 3, 4 and 5 for
# an experiment where five different conditions have been assessed. NOTE: If a custom array was used
# for assessing the gene expression it is crucial that ENTREZ gene ID or HUGO gene symbols have
# been specified as identifier when importing the data into CHIPSTER.)
# PARAMETER correlation.method [pearson, spearman, kendall] DEFAULT pearson (Method for calculating the correlation. Pearson's method is parametric, 
# whereas Spearman's correlation is a non-parametric rank-based method that is less sensitive to outliers.
# Kendall's method is suitable in those cases one is interested in the sign of the changes in expression between adjacent
# data points, rather than the magnitude.)
# PARAMETER p.value.threshold DECIMAL FROM 0 TO 1 DEFAULT 0.05 (P-value cut-off for significant results)
# PARAMETER p.value.adjustment.method [none, Bonferroni, Holm, Hochberg, BH, BY] DEFAULT BH (Multiple testing correction method)

# setup parameters for script development
#order.column.mirna <- "order"
#order.column.gene <- "order"
#correlation.method <- "pearson"
#p.value.threshold <- "0.05"
#p.value.adjustment.method <- "none"

# Correlation analysis of miRNA targets
# MG, 23.12.2009

# Loads the libraries
library(RmiR)

# Read the normalized expression tables; the first column of each file
# supplies the row names (miRNA names / gene probe identifiers)
mirna.data <- read.table("normalized_mirna.tsv", header=TRUE, sep="\t", row.names=1)
gene.data <- read.table("normalized_gene.tsv", header=TRUE, sep="\t", row.names=1)

# Keep only the columns whose name contains "chip", i.e. the expression
# values; any other columns (such as flags) are left out
mirna.data.2 <- mirna.data[, grepl("chip", names(mirna.data))]
gene.data.2 <- gene.data[, grepl("chip", names(gene.data))]

# Load the sample descriptions for both data sets
mirna.phenodata <- read.table("phenodata_mirna.tsv", header=TRUE, sep="\t")
gene.phenodata <- read.table("phenodata_gene.tsv", header=TRUE, sep="\t")

# The order columns are looked up by exact name: a regular expression match
# could select several columns at once (e.g. "group" and "group2")
if (!(order.column.mirna %in% colnames(mirna.phenodata))) {
	stop("The miRNA phenodata does not contain the selected order column!")
}
if (!(order.column.gene %in% colnames(gene.phenodata))) {
	stop("The gene phenodata does not contain the selected order column!")
}

# Get the sample order used for matching the two data sets. The gene order
# must be read from the gene phenodata, not from the miRNA phenodata.
mirna.order <- mirna.phenodata[[order.column.mirna]]
gene.order <- gene.phenodata[[order.column.gene]]

# Read the chiptype that was used for the gene expression data
# (taken from the first sample row of the gene phenodata)
chip.type <- as.character(gene.phenodata[1, grep("chiptype", names(gene.phenodata))])

# A correlation cannot be computed when either data set contains
# a single condition (or time point) only
if (length(unique(mirna.order)) == 1 || length(unique(gene.order)) == 1) {
	stop("You need to have at least 2 conditions or time points to run this analysis!")
}

# Both data sets have to describe the same number of distinct
# conditions, otherwise the samples cannot be paired
if (length(unique(gene.order)) != length(unique(mirna.order))) {
	stop("You need to have the same number of conditions, or time points, in the two data sets!")
}

# Define number of conditions
number.conditions <- length(mirna.order)

# Number of samples to pair up; the merging loop further down iterates
# over this value, which was previously never defined
number.samples <- number.conditions

# Load the annotation package of the gene expression chip so that a missing
# package is reported here. The former mget() call was removed: it referenced
# an undefined object ("genes") and its result was discarded. The chip type is
# handed to read.mir() through its annotation argument instead.
library(package=chip.type, character.only=TRUE)

# Arrange the columns in the two data sets so they match
mirna.data.3 <- mirna.data.2[,order(mirna.order)]
gene.data.3 <- gene.data.2[,order(gene.order)]

# Create data sets appropriate for correlation testing.
#
# For every sample position a two-column table (identifier, expression value)
# is built for the miRNA and the gene data and handed to read.mir(). The
# per-sample results are then placed side by side in merged.table.
# NOTE(review): the column-wise merge assumes that read.mir() returns the
# miRNA-gene pairs in the same row order for every sample - confirm.

# Builds the two-column table for one sample. The table is created directly
# as a data frame so that the expression values stay numeric; going through
# a character matrix turns them into factor level codes on R versions where
# strings are converted to factors by default.
make.sample.table <- function(expression.data, sample.index) {
	data.frame(
			V1=rownames(expression.data),
			V2=as.numeric(expression.data[,sample.index]),
			stringsAsFactors=FALSE)
}

# Samples are paired by column position, so both data sets need to
# contain the same number of samples
if (ncol(mirna.data.3) != ncol(gene.data.3)) {
	stop("You need to have the same number of samples in the two data sets!")
}

# One read.mir() result per sample
sample.tables <- lapply(seq_len(ncol(mirna.data.3)), function(count) {
	read.mir(gene=make.sample.table(gene.data.3, count),
			mirna=make.sample.table(mirna.data.3, count),
			annotation=chip.type, verbose=TRUE)
})

# Column names are repeated once per sample on purpose: the expression
# columns are later selected by name pattern
merged.table <- do.call(cbind, sample.tables)

# Extract the matching mirna and gene expression values into two tables
# with one row per miRNA-gene pair and one column per sample
mirna.expression <- merged.table[, grep("mirExpr", names(merged.table)), drop=FALSE]
gene.expression <- merged.table[, grep("geneExpr", names(merged.table)), drop=FALSE]

# Calculate the correlation (method given by correlation.method) and its
# p-value for each mirna-gene pair. The result columns are pre-filled with
# one NA per pair so that an empty merged table gives an empty result table.
number.mirna <- nrow(merged.table)
results.table <- data.frame(merged.table$mature_miRNA, merged.table$gene_id, merged.table$symbol,
		correlation.coefficient=rep(NA_real_, number.mirna),
		correlation.p.value=rep(NA_real_, number.mirna))
names (results.table) <- c("miRNA", "gene id", "gene symbol", "correlation coefficient", "p-value")
for (mirna.count in seq_len(number.mirna)) {
	# A single cor.test() call provides both the coefficient and the p-value,
	# so the two numbers always refer to the same set of observations
	correlation.test <- cor.test (as.numeric(mirna.expression[mirna.count,]),
			as.numeric(gene.expression[mirna.count,]), method=correlation.method)
	results.table[mirna.count,4] <- unname(correlation.test$estimate)
	results.table[mirna.count,5] <- correlation.test$p.value
}

# Apply the selected multiple testing correction; the p-value column holds
# the adjusted values from here on ("none" leaves them unchanged).
# p.adjust() expects lower case names for some methods (e.g. "bonferroni"),
# whereas the tool parameter offers them capitalized.
adjustment.method <- as.character(p.value.adjustment.method)
if (!(adjustment.method %in% p.adjust.methods)) {
	adjustment.method <- tolower(adjustment.method)
}
results.table[,5] <- p.adjust(results.table[,5], method=adjustment.method)

# The threshold is converted explicitly so that a value supplied as text
# is compared as a number and not as a string
significance.threshold <- as.numeric(p.value.threshold)

# which() drops pairs whose correlation or p-value is NA instead of
# carrying them into the output as rows filled with NA

# Find genes with statistically significant positive correlation
# (correlation coefficient above zero)
results.positive <- results.table[which(results.table[,4]>0),]
results.positive.significant <- results.positive[which(results.positive[,5]<=significance.threshold),]

# Find genes with statistically significant negative correlation
# (correlation coefficient below zero)
results.negative <- results.table[which(results.table[,4]<0),]
results.negative.significant <- results.negative[which(results.negative[,5]<=significance.threshold),]

# Write the results to tables to be read into Chipster
write.table(results.positive.significant, file="mirna-gene-positive-correlation.tsv", sep="\t", quote=FALSE)
write.table(results.negative.significant, file="mirna-gene-negative-correlation.tsv", sep="\t", quote=FALSE)



