intCNGEan.tune <-
function(CNdata, GEdata, test.statistic, ngenetune=250, nperm_tuning=250, minCallProbMass=0.05, trace=TRUE){
######################################################################################################
# function that filters genes that have a low probability of becoming significant
# and determines which test is performed: loss-vs-no.loss or no.gain-vs-gain.
######################################################################################################

countth <- function(statlist){
################################################################################################
# Counts the number of values in statlist exceed threshold.
################################################################################################

threshold <- as.numeric(statlist[length(statlist)])
statlist <- as.numeric(statlist[c(1:(length(statlist)-1))])
return(length(statlist[statlist>=threshold]))
}

pval.perm.marg <- function(observed, permuted, nperm){
################################################################################################
# Calculates the marginal p-values.
# The p-value of gene is calculated using the null-ditribution 
# resulting from the permutations of that gene.
################################################################################################

perm.and.obs <- cbind(permuted, observed)
return(apply(perm.and.obs, 1, "countth")/nperm)
}

rawps <- function(stats.obs, nulldists, nperm){
################################################################################################
# calculate the raw p-values using the observed test statistics
# and their null distribution.
################################################################################################

pval.ln <- pval.perm.marg(stats.obs, nulldists, nperm)
return(pval.ln)
}

wcvm.test.stats <- function(cgh.em, nosamp, a){
################################################################################################
# function that calculates the wcvm test statistics for one clone.
################################################################################################

# makes a matrix of call probs
cgh.2cat <- matrix(cgh.em[c(1:(a*nosamp))], ncol=a, byrow=TRUE)

# calculate contrast coefficients
alphaas <- t(cgh.2cat) %*% cgh.2cat
cs <- as.numeric(solve(alphaas) %*% matrix(c(-1,1), ncol=1))

cgh.em <- cbind(cgh.2cat, cgh.em[c((a*nosamp+1):((a+1)*nosamp))])
cgh.em <- cbind(cgh.em[order(cgh.em[,3]),], rep(1/dim(cgh.em)[1],dim(cgh.em)[1]))
cgh.em <- cbind(cgh.em, cumsum(cgh.em[,4]),cumsum(cgh.em[,1]*cgh.em[,dim(cgh.em)[2]])*cs[1],cumsum(cgh.em[,2]*cgh.em[,dim(cgh.em)[2]])*cs[2])

test.stat <- -sum(cgh.em[,7]+cgh.em[,6])/nosamp
return(test.stat)
}

wmw.test.stats <- function(cgh.em, nosamp, a){
################################################################################################
# function that calculates the wMW-like test statistics for one clone.
################################################################################################

# makes a matrix; first 2 call probs, last is expression
cgh.em <- cbind(matrix(cgh.em[c(1:(a*nosamp))],ncol=a,byrow=TRUE), cgh.em[c((a*nosamp+1):((a+1)*nosamp))]) 

# sort by expression
cgh.em <- cgh.em[order(cgh.em[,(a+1)]),]  

# shift probs 1 position, because I_{Xi < Xj}
cgh.em <- cbind(cgh.em, rbind(rep(0,a), apply(cgh.em[,1:a], 2, cumsum)[-nosamp,])) 

# '2' = prob. class 1, a+2 = cum. prob class 2
test.stat <- sum(cgh.em[,a+2]*cgh.em[,2]) 
return(test.stat)
}

prob.test.stats <- function(cgh.em, nosamp, a){
################################################################################################
# function that calculates the wMW-like test statistics for one clone.
################################################################################################

# makes a matrix of call probs
cgh.2cat <- matrix(cgh.em[c(1:(a*nosamp))], ncol=a, byrow=TRUE)

# calculate contrast coefficients
alphaas <- t(cgh.2cat) %*% cgh.2cat
cs <- c(det(alphaas), alphaas[1,2]*sum(alphaas)/2)

# makes a matrix; first 2 call probs, last is expression
cgh.em <- cbind(matrix(cgh.em[c(1:(a*nosamp))], ncol=a,byrow=TRUE), cgh.em[c((a*nosamp+1):((a+1)*nosamp))]) 

# sort by expression
cgh.em <- cgh.em[order(cgh.em[,(a+1)]),]  

# shift probs 1 position, because I_{Xi < Xj}
cgh.em <- cbind(cgh.em,rbind(rep(0,a), apply(cgh.em[,1:a], 2, cumsum)[-nosamp,])) 

# '2' = prob. class 1, a+2 = cum. prob class 2
test.stat <- ( sum(cgh.em[,a+2]*cgh.em[,2]) - cs[2])/cs[1]
return(test.stat)
}

pretest <- function(alphascgh){
################################################################################################
# function that determines which hypothesis is tested (loss vs. no-loss or no-gain vs. gain).
# also merges the call probabilities of the aberrated class that is not-dominant.
################################################################################################

alphas <- alphascgh[1:3]
probs <- matrix(alphascgh[-(1:3)], ncol=3, byrow=TRUE)
if (alphas[1] >= alphas[3]){
probs2 <- cbind(probs[,1], probs[,2]+probs[,3])
alphas2 <- c(alphas[1], alphas[2]+alphas[3])
return(c(1, alphas2, as.vector(t(probs2))))
} else {
probs2 <- cbind(probs[,1]+probs[,2],probs[,3])
alphas2 <- c(alphas[1]+alphas[2], alphas[3])
return(c(2, alphas2, as.vector(t(probs2))))
} 
}

alphaest <- function(cgh.em, nosamp, a){
################################################################################################
# function that calculated the first order moments of the call probabilities.
################################################################################################

cgh.em <- matrix(cgh.em[c(1:(a*nosamp))], ncol=a,byrow=TRUE) 
return(apply(cgh.em,2,mean))
}

alphabivariate <- function(cgh.em, nosamp, a){
################################################################################################
# function that calculated the second order moments of the call probabilities.
################################################################################################
    
cgh.em <- matrix(cgh.em[c(1:(a*nosamp))], ncol=a, byrow=TRUE) 
cgh.em1 <- cgh.em[,1]
cgh.em2 <- cgh.em[,2]
return(c(1/nosamp*(cgh.em1%*%cgh.em1), 1/nosamp*(cgh.em2%*%cgh.em2), 1/nosamp*(cgh.em1%*%cgh.em2)))
}

probs2calls <- function(problist){
################################################################################################
# function that converts sof calls to hard calls
################################################################################################

maxprobposition <- which.max(problist)
call.list <- rep(0, length(problist))
call.list[maxprobposition] <- 1
return(call.list)
}

shift.est <- function(row, nosamp, a, data2, alphabmat, alphanmat, minalphathr){
################################################################################################
# function that estimates the effect size.
# if a gene exceeds the power unbalance threshold NA is returned.
################################################################################################

cgh.em <- cbind(matrix(data2[row,c(1:(a*nosamp))],ncol=a,byrow=TRUE),data2[row,c((a*nosamp+1):((a+1)*nosamp))])
alphasbiv<-alphabmat[row,]
alphas <- alphanmat[row,]
c1 <- (alphasbiv[1]/alphasbiv[3]-alphas[1]/alphas[2])^(-1)
c2 <- (alphasbiv[2]/alphasbiv[3]-alphas[2]/alphas[1])^(-1)
if (is.na(c1) | is.na(c2)) {
return(NA)
} else {
if (max(c1,c2) >= minalphathr) {
return(NA)
} else {
mu1 <- 1 / nosamp*sum(cgh.em[,3]*(cgh.em[,1]*alphas[2] - alphasbiv[3])/(alphasbiv[1]*alphas[2]-alphas[1]*alphasbiv[3]))
mu2 <- 1 / nosamp*sum(cgh.em[,3]*(cgh.em[,2]*alphas[1] - alphasbiv[3])/(alphasbiv[2]*alphas[1]-alphas[2]*alphasbiv[3]))
shiftest <- mu2 - mu1
return(shiftest)
}
}
}
   
powerunbalance <- function(row, alphabmat){
################################################################################################
# function that calculates the power unbalance
################################################################################################

alphasbiv <- alphabmat[row,]
unbalance <- alphasbiv[1]*alphasbiv[2]-alphasbiv[3]^2
return(unbalance)
}

countdiscoveries <- function(powerquant, rawp_powerunbal, fdrcutoff){
################################################################################################
# function that converts sof calls to hard calls
################################################################################################

rawp_filt <- rawp_powerunbal[rawp_powerunbal[,2]>=powerquant,][,1]
m <- length(rawp_filt)
adpjrawp_filt <- cbind(rawp_filt, p.adjust(rawp_filt, "BH"))
selected <- matrix(adpjrawp_filt[adpjrawp_filt[,2] <= fdrcutoff,],ncol=2)
count <- nrow(selected) 
false <- ifelse(count>0, max(selected[,1])*m, 0)
countS <- count - false
return(countS)
}

redraw <- function(col, shiftest, shiftsam, alpharow, dataexp, datacgh){
################################################################################################
# function that .....
################################################################################################

if(is.na(shiftest)){
return(sample(dataexp,1)) 
} else {  
#this returns a re-sample from the data when there is basically only one group
alpha <- alpharow[1]
Ig <- sample(c(1,2),1,prob=c(alpha,1-alpha))
if (Ig==1){
xib <- sample(dataexp,1,prob=datacgh[,1]) + shiftest 
} else {
xib <- sample(dataexp,1,prob=datacgh[,2])
}
Ig2 <- sample(c(1,2), 1, prob = c( max(0, min(1,datacgh[col,1])), max(0, min(1, 1-datacgh[col,1]))))
if (Ig2==1){
xib2 <- xib
} else {
xib2 <- xib + shiftsam
}
return(xib2)
}
}

redrawcol <- function(row, allest, alphanmat, shiftsampled, data2, a, nosamp){
################################################################################################
# function that .......
################################################################################################

shiftest <- allest[row]
datacgh <- matrix(data2[row,c(1:(a*nosamp))],ncol=a,byrow=TRUE)
dataexp <- data2[row,c((a*nosamp+1):((a+1)*nosamp))]
shiftsam <- shiftsampled[row]
alpharow <- alphanmat[row,]
result <- sapply(1:nosamp,redraw,shiftest=shiftest,alpharow=alpharow,shiftsam=shiftsam,dataexp=dataexp,datacgh=datacgh)
return(result)
}

pi0est <- function(data.in, test.stat="wmw", seqg=seqgenes, nperm=nperm_tuning, a, nosamp){
################################################################################################
# function that estimates the proportion of rejected null-hypothesis, i.e., the number
# of gene whose expression is afffected by copy number changes.
################################################################################################

data.in2 <- data.in[seqg,]
if(test.stat=="wmw") test.stats <- apply(data.in2, 1, wmw.test.stats, nosamp, a) else test.stats <- apply(data.in2, 1, wcvm.test.stats, nosamp, a)
rawpvals.test <- rawps(test.stats, null.dists.tune, nperm)
pi0 <- convest(rawpvals.test)
return(pi0)
}

tuning <- function(data.in, datacgh.in, test.stat="wmw", seqg=seqgenes, nperm=nperm_tuning, pi0, fdrcut=0.25, nresamp=100, gridnr=30, minim=10, a=a, nosamp=nosamp, shiftsam=shiftsam, trace){
################################################################################################
# function that does the tuning
################################################################################################

probs <- seq(0,gridnr/(gridnr+1), 1/(gridnr+1))
probstruncate <- probs[probs<=((100-minim)/100)]
allest2 <- allest[seqg]
alphanewmat2<-alphanewmat[seqg,]
data.in2=data.in[seqg,]
datacgh.in2 <- datacgh.in[seqg,]
powerunbal2 <- powerunbal[seqg]
quant <- quantile(powerunbal2, probs=probstruncate)
whatthrmat <- c()
for (j in 1:nresamp){
if ((j %% 50) == 0){ if (trace){ cat(paste(j, "of", nresamp, "resamples done, and counting...", sep=" "), "\n") } }
shiftsampled <- sample(shiftsam, nrow(data.in2), replace=TRUE)
pi0cutoff <- quantile(shiftsampled, pi0)
shiftsampled <- sapply(shiftsampled, function(x){if(x <= pi0cutoff) return(0) else return(x)})
redrawall <- t(sapply(1:nrow(data.in2), redrawcol, allest=allest2, alphanmat=alphanewmat2, shiftsampled=shiftsampled, data2=data.in2, a=a, nosamp=nosamp))
newdata <- cbind(datacgh.in2[,-(1:3)], redrawall) 
if(test.stat=="wmw") test.resamp <- apply(newdata, 1, wmw.test.stats, nosamp, a) else test.resamp <- apply(newdata, 1, wcvm.test.stats, nosamp, a)
rawpvals.test.resamp <- rawps(test.resamp, null.dists.tune, nperm)
rawp_powerunbal <- cbind(rawpvals.test.resamp, powerunbal2)
whatthresh <- sapply(quant, countdiscoveries, rawp_powerunbal=rawp_powerunbal, fdrcutoff=fdrcut)
whatthrmat <- rbind(whatthrmat, whatthresh)  
}
whatthrmean <- apply(whatthrmat, 2, mean)
# print(whatthrmean)
bestthr <- which.max(whatthrmean)    
return(quant[bestthr])
}

datareduce <- function(powerunbal, unbalthr){
################################################################################################
# function that selects rows that pass the power unbalance criterion
################################################################################################

datarows <- which(powerunbal >= unbalthr)
return(datarows)
}

nulldist.all.wcvm <- function(data.both, nosamp, a, nperm, trace){
################################################################################################
# function that calculates the null distribution of the weighted CvM test statistics for tuning.
################################################################################################
    
# Permutes data and calculates test statistic on permuted data
cvm.like.mat <- NULL
for(i in 1:nperm){
if ((i %% 50) == 0){ if (trace){ cat(i," of ", nperm, " permutations done, and counting...", "\n") } }
x <- sample(1:nosamp,nosamp) + a*nosamp
data.ran <- cbind(data.both[,c(1:(a*nosamp))], data.both[,x])
cvm.like.ran <- apply(data.ran, 1, wcvm.test.stats, nosamp, a)
cvm.like.mat <- cbind(cvm.like.mat,cvm.like.ran)
}
return(cvm.like.mat)
}

nulldist.all.wmw <- function(data.both, nosamp, a, nperm, trace){
################################################################################################
# function that calculates the null distribution of the weighted CvM test statistics for tuning.
################################################################################################
    
# Permutes data and calculates test statistic on permuted data
wmw.ln.mat <- NULL
cghdata <- data.both[,c(1:(a*nosamp))]
for(i in 1:nperm){
if ((i %% 50) == 0){ if (trace){ cat(i," of ", nperm, " permutations done, and counting...", "\n") } }
x <- sample(1:nosamp, nosamp) + a*nosamp
data.ran <- cbind(cghdata, data.both[,x])
wmw.ran <- apply(data.ran, 1, wmw.test.stats, nosamp, a)
wmw.ln.mat <- cbind(wmw.ln.mat, wmw.ran)
}
return(wmw.ln.mat)
}

# check for presence missing values.
if (sum(is.na(exprs(GEdata))) > 0){ 
stop("Gene expression matrix contains missing values: not allowed.")
}

# check for equal number of samples.
if (dim(exprs(GEdata))[2] != dim(copynumber(CNdata))[2]){ 
stop("Gene expression and copy number matrices contain unequal number of samples.")
}

# check for equal number of probes.
if (dim(exprs(GEdata))[1] != dim(copynumber(CNdata))[1]){ 
stop("Gene expression and copy number matrices contain unequal number of samples: impossible after matching.")
}

# extract data from objects
data <- list()
data$ann <- fData(GEdata)
data$em <- exprs(GEdata)
nosamp <- ncol(exprs(GEdata))
cghdata.probs <- numeric()
for (i in 1:dim(calls(CNdata))[2]){
if (is.null(probamp(CNdata)[,i])){
cghdata.probs <- cbind(cghdata.probs, cbind(probloss(CNdata)[,i], probnorm(CNdata)[,i], probgain(CNdata)[,i]))
} else {
cghdata.probs <- cbind(cghdata.probs, cbind(probloss(CNdata)[,i], probnorm(CNdata)[,i], probgain(CNdata)[,i] + probamp(CNdata)[,i]))
}
}
data$cgh <- cghdata.probs
nclass <- dim(cghdata.probs)[2] / dim(calls(CNdata))[2]

# Estimate alpha parameters per clone, a0 classes
if (trace){ cat("pre-test...", "\n") }
alphamat <- t(apply(data$cgh, 1, alphaest, nosamp=nosamp, a=nclass))

# Perform pre-test and merge columns
datacgh2 <- as.matrix(t(apply(cbind(alphamat, data$cgh), 1, pretest)))
lossorgain <- datacgh2[,1]
alphas2 <- datacgh2[,2:3]
data.both <- as.matrix(cbind(datacgh2, data$em)[,-(1:3)])

# remove features with too little spread in call probs mass
suffCPM <- which(apply(alphas2, 1, min) > minCallProbMass)
data$ann <- data$ann[suffCPM,]
data$cgh <- data$cgh[suffCPM,]
data$em <- data$em[suffCPM,]
alphamat <- alphamat[suffCPM,]
datacgh2 <- datacgh2[suffCPM,]
lossorgain <- lossorgain[suffCPM]
alphas2 <- alphas2[suffCPM,]
data.both <- data.both[suffCPM,]
init.prop.kept <- length(suffCPM)/dim(exprs(GEdata))[1]

# Estimate new alpha and bivariate alpha parameters per clone
alphanewmat <- alphas2
alphabivmat <- t(apply(data.both, 1, alphabivariate, nosamp=nosamp, a=2))
colnames(alphabivmat) <- c("a11", "a22", "a12")

# start of tuning
if (trace){ cat("tuning started...", "\n") }

# Calculate the measure of unbalance and estimate the effect sizes
# UNBALANCE, SHIFT ESTIMATES AND TUNING, needs to be done AFTER null distr computation, BEFORE final p-value and fdr computation.
powerunbal <- sapply(1:nrow(data.both), powerunbalance, alphabmat=alphabivmat)
allest <- sapply(1:nrow(data.both), shift.est, data2=data.both, alphabmat=alphabivmat, alphanmat=alphanewmat, nosamp=nosamp, a=2, minalphathr=10)

# "Robustly" estimate effect sizes. These are used for generating empirical distribution of shift parameters
estrobust <- sapply(1:nrow(data.both), shift.est, data2=data.both, alphabmat=alphabivmat, alphanmat=alphanewmat, nosamp=nosamp, a=2, minalphathr=3) 

# Draw histogram of robust effect sizes
shiftsam <- estrobust[!is.na(estrobust)]

# Determine null distribution for tuning genes.
seqgenes <- floor(seq(1,nrow(data.both), length.out=ngenetune))
if (test.statistic == "wcvm") {
null.dists.tune <- nulldist.all.wcvm(data.both[seqgenes,], nosamp, 2, nperm=nperm_tuning, trace)
} else {
null.dists.tune <- nulldist.all.wmw(data.both[seqgenes,], nosamp, 2, nperm=nperm_tuning, trace)
}   

# Determine the proportion of diff exp genes on small data set
pi0 <- pi0est(data.in=data.both, seqg=seqgenes, test.stat=test.statistic, nperm=nperm_tuning, a=2, nosamp=nosamp) 

# Do actual tuning, returns list of genes that are propagated into the test
data.tuned <- tuning(data.in=data.both, datacgh.in=datacgh2, seqg=seqgenes, test.stat=test.statistic, nperm=nperm_tuning, pi0=pi0, fdrcut=0.05, nresamp=100, gridnr=200, minim=10, a=2, nosamp=nosamp, shiftsam=shiftsam, trace)
genestotest <- datareduce(powerunbal, as.real(data.tuned))
prop_kept <- length(genestotest)/nrow(data.both)

if (trace){ cat("ready: tuning done", "\n") }

datacgh2 <- datacgh2[genestotest,]
lossorgain <- lossorgain[genestotest]
alphas2 <- alphas2[genestotest,]
data.both <- data.both[genestotest,]
alphanewmat <- alphanewmat[genestotest,]
alphabivmat <- alphabivmat[genestotest,]
powerunbal <- powerunbal[genestotest]
allest <- allest[genestotest]
estrobust <- estrobust[genestotest]

data.tuned <- list()
data.tuned$datafortest <- data.both
data.tuned$lossorgain <- lossorgain
data.tuned$genestotest <- suffCPM[genestotest]
data.tuned$callprobs <- alphas2
data.tuned$alleffects <- allest
data.tuned$ann <- data$ann[genestotest,]
data.tuned$nosamp <- nosamp

return(data.tuned)
}

