intCNGEan.match <-
function(CNdata, GEdata, CNbpend="no", GEbpend="no", method="distance"){
   ############################################################################
# function that matches features from the copy number and gene expression
# platforms on genomic location
############################################################################

min.dist.cgh.feature <- function(GEfeature.ann, CNann){
CNfeatures.kept.temp <- which(CNann[,1] == as.numeric(GEfeature.ann[1]))
if (length(CNfeatures.kept.temp) >= 1){
first.feature <- min(CNfeatures.kept.temp) - 1
CNann <- CNann[CNfeatures.kept.temp, , drop=FALSE]
CNfeature.closest <- which.min(abs(CNann[,2] - GEfeature.ann[2]))[1] + first.feature
return(CNfeature.closest)
} else {
return(NA)
}
}

overlap.percentage <- function(GEfeature.ann, CNann){
CNfeatures.kept.temp <- which(CNann[,1] == as.numeric(GEfeature.ann[1]))
first.feature <- min(CNfeatures.kept.temp) - 1
CNann <- CNann[CNfeatures.kept.temp, , drop=FALSE]
junk <- sapply(CNann[,3], function(x,y){ min(x,y) }, y=GEfeature.ann[3]) - sapply(CNann[,2], function(x,y){ max(x,y) }, y=GEfeature.ann[2]) + 1
perc.overlap <- as.numeric(sapply(junk, function(x){ max(x, 0) }) / (GEfeature.ann[3] - GEfeature.ann[2] +1))
candidate <- as.numeric(which.max(perc.overlap)[1])
if (perc.overlap[candidate] == 0){
return(NA)
} else {
return(candidate + first.feature)
}
}

overlap.plus.interpolation <- function(GEfeature.ann, CNann, CNprobs.temp){
CNfeatures.kept.temp <- which(CNann[,1] == as.numeric(GEfeature.ann[1]))
first.feature <- min(CNfeatures.kept.temp) - 1
CNann <- CNann[CNfeatures.kept.temp, , drop=FALSE]
CNprobs.temp <- CNprobs.temp[CNfeatures.kept.temp, , drop=FALSE]
junk <- sapply(CNann[,3], function(x,y){ min(x,y) }, y=as.numeric(GEfeature.ann[3])) - sapply(CNann[,2], function(x,y){ max(x,y) }, y=as.numeric(GEfeature.ann[2])) + 1
perc.overlap <- as.numeric(sapply(junk, function(x){ max(x, 0) }) / (as.numeric(GEfeature.ann[3]) - as.numeric(GEfeature.ann[2]) +1))
candidate <- as.numeric(which.max(perc.overlap)[1])
if (perc.overlap[candidate] == 0){
dists <- cbind(CNann[,3]-as.numeric(GEfeature.ann[3]), as.numeric(GEfeature.ann[2]) - CNann[,2])
ids.temp <- which(dists[,1] > 0)
if (length(ids.temp) > 0){ id1 <- ids.temp[which.min(dists[ids.temp,1])] } else { id1 <- which.max(dists[,1]) }
ids.temp <- which(dists[,2] > 0)
if (length(ids.temp) > 0){ id2 <- ids.temp[which.min(dists[ids.temp,2])] } else { id2 <- which.max(dists[,2]) }
ids <- sort(c(id1, id2))
if ( all(CNprobs.temp[ids,][1,] == CNprobs.temp[ids,][2,]) ){ 
return(ids[which.min(apply(abs(dists[ids,]), 1, mean))[1]] + first.feature)
} else {
return(NA)
}
} else {
return(candidate + first.feature)
}
}

# extract annotation info
cat("perform input checks...", "\n")
if (GEbpend == "no"){ 
if (any(colnames(fData(GEdata)) == "End")){ stop(cat("Error: column name End already in use.", "\n", "This indicates GE bp end position is available, while specified as unavailable.", "\n", "Either change column name or input. ", "\n", sep="")) }
fData(GEdata) <- fData(GEdata)[,sort(c(1:ncol(fData(GEdata)), 2))]
colnames(fData(GEdata))[3] <- "End" 
}
if (CNbpend == "no"){ 
if (any(colnames(fData(CNdata)) == "End")){ stop(cat("Error: column name End already in use.", "\n", "This indicates GE bp end position is available, while specified as unavailable.", "\n", "Either change column name or input.", "\n", sep="")) }
fData(CNdata) <- fData(CNdata)[,sort(c(1:ncol(fData(CNdata)), 2))]
colnames(fData(CNdata))[3] <- "End" 
}
if (GEbpend == "yes"){ 
if (!any(colnames(fData(GEdata)) == "End")){ cat("Warning: column name End not found.", "\n", "This indicates GE bp end position is unavailable, while specified as available.", "\n", "Proceeded as if unavailable. ", "\n", sep="") 
fData(GEdata) <- fData(GEdata)[,sort(c(1:ncol(fData(GEdata)), 2))]
colnames(fData(GEdata))[3] <- "End" 
}
}
if (CNbpend == "yes"){ 
if (!any(colnames(fData(CNdata)) == "End")){ cat("Warning: column name End not found.", "\n", "This indicates GE bp end position is unavailable, while specified as available.", "\n", "Proceeded as if unavailable. ", "\n", sep="")
fData(CNdata) <- fData(CNdata)[,sort(c(1:ncol(fData(CNdata)), 2))]
colnames(fData(CNdata))[3] <- "End" 
}
}
anncols <- c(1:3)
CNann <- fData(CNdata)[, anncols, drop=FALSE]
GEann <- fData(GEdata)[, anncols, drop=FALSE]

# in case columns are of wrong class ....
for (i in 1:3){
if (is.factor(CNann[,i])){ CNann[,i] <- as.numeric(levels(CNann[,i]))[CNann[,i]] }
if (is.factor(GEann[,i])){ GEann[,i] <- as.numeric(levels(GEann[,i]))[GEann[,i]] }
if (is.character(CNann[,i])){ CNann[,i] <- as.numeric(CNann[,i]) }
if (is.character(GEann[,i])){ GEann[,i] <- as.numeric(GEann[,i]) }
}

# ensure data from gene expression platform is genomicly ordered.
cat("order both data sets genomically...", "\n")
exprs(GEdata) <- exprs(GEdata)[order(GEann[,1], GEann[,2]), ]
fData(GEdata) <- fData(GEdata)[order(GEann[,1], GEann[,2]), ]
GEann <- GEann[order(GEann[,1], GEann[,2]), ]

# ensure data from copy number platform is genomicly ordered.
fData(CNdata) <- fData(CNdata)[order(CNann[,1], CNann[,2]),]
copynumber(CNdata) <- copynumber(CNdata)[order(CNann[,1], CNann[,2]),]
segmented(CNdata) <- segmented(CNdata)[order(CNann[,1], CNann[,2]),]
calls(CNdata) <- calls(CNdata)[order(CNann[,1], CNann[,2]),]
probloss(CNdata) <- probloss(CNdata)[order(CNann[,1], CNann[,2]),]
probnorm(CNdata) <- probnorm(CNdata)[order(CNann[,1], CNann[,2]),]
probgain(CNdata) <- probgain(CNdata)[order(CNann[,1], CNann[,2]),]
if (is.null(probamp(CNdata))){ } else { probamp(CNdata) <- probamp(CNdata)[order(CNann[,1], CNann[,2]),] }

# make temporary copy of call probabilities
if (method == "overlapplus"){
# create temporary call probs data
CNprobs.temp <- probloss(CNdata)
CNprobs.temp <- cbind(CNprobs.temp, probnorm(CNdata))
CNprobs.temp <- cbind(CNprobs.temp, probgain(CNdata))
if (is.null(probamp(CNdata))){ } else { CNprobs.temp <- cbind(CNprobs.temp, probamp(CNdata)) }
}

# filter genes with complete annotation
cat("select probes with complete annotation...", "\n")
GEfeatures.kept <- which(apply(GEann, 1, function(x){ all(is.na(x) == FALSE) }))
if (length(GEfeatures.kept) < 1){ stop("no annotation info for GE-array features provided") }
GEann <- GEann[GEfeatures.kept, , drop=FALSE]
CNfeatures.kept <- which(apply(CNann, 1, function(x){ all(is.na(x) == FALSE) }))
if (length(CNfeatures.kept) < 1){ stop("no annotation info for CN-array features provided") }
CNann <- CNann[CNfeatures.kept, , drop=FALSE]
if (method == "overlapplus"){CNprobs.temp <- CNprobs.temp[CNfeatures.kept, , drop=FALSE] }

# chromosomes with genes on the expression array
cat("select probes on chromosomes present in both data sets...", "\n")
GEchrs <- sort(unique(GEann[,1]))
CNfeatures.kept.2 <- which(CNann[,1] %in% GEchrs)
CNann <- CNann[CNfeatures.kept.2, , drop=FALSE]
if (method == "overlapplus"){CNprobs.temp <- CNprobs.temp[CNfeatures.kept.2, , drop=FALSE] }
CNfeatures.kept <- CNfeatures.kept[CNfeatures.kept.2]

# chromosomes with features on the CN array
CNchrs <- sort(unique(CNann[,1]))
GEfeatures.kept.2 <- which(GEann[,1] %in% CNchrs)
GEann <- GEann[GEfeatures.kept.2, , drop=FALSE]
GEfeatures.kept <- GEfeatures.kept[GEfeatures.kept.2]

# match by percentage of overlap
cat("start actual matching procedure...", "\n")
if (method == "overlap"){
if (GEbpend == "no" & CNbpend == "no"){
cat("no base pair end positions provided for both platform:", "\n")
cat("distance matching method performed instead of overlap!", "\n")
method <- "distance"
} else {
# find features with biggest overlap and remove features without any overlap
CNGEmatched.features <- apply(GEann, 1, overlap.percentage, CNann)
no.match.found <- (is.na(CNGEmatched.features) == FALSE)
GEfeatures.kept <- GEfeatures.kept[no.match.found]
CNGEmatched.features <- CNGEmatched.features[no.match.found]
CNfeatures.kept <- CNfeatures.kept[CNGEmatched.features]
}
}

# match by percentage of overlap with some interpolation
if (method == "overlapplus"){
if (GEbpend == "no" & CNbpend == "no"){
cat("no base pair end positions provided for both platform:", "\n")
cat("distance matching method performed instead of overlapplus!", "\n")
method <- "distance"
} else {
# find features with biggest overlap and remove features without any overlap
CNGEmatched.features <- apply(GEann, 1, overlap.plus.interpolation, CNann, CNprobs.temp)
no.match.found <- (is.na(CNGEmatched.features) == FALSE)
GEfeatures.kept <- GEfeatures.kept[no.match.found]
CNGEmatched.features <- CNGEmatched.features[no.match.found]
CNfeatures.kept <- CNfeatures.kept[CNGEmatched.features]
}
}

# match by distance of mid base pair
if (method == "distance"){
# determine mid base pair
CNann <- cbind(CNann[,1], apply(CNann[,2:3,drop=FALSE], 1, mean, na.rm=TRUE))
GEann <- cbind(GEann[,1], apply(GEann[,2:3,drop=FALSE], 1, mean, na.rm=TRUE))

# per chromosome find CN-features closed to GE-features
CNGEmatched.features <- apply(GEann, 1, min.dist.cgh.feature, CNann)
CNfeatures.kept <- CNfeatures.kept[CNGEmatched.features]
}
cat("matching done, reformat data sets...", "\n")

# in the unlikely event that some genes could not be matched: remove
if (sum(is.na(CNfeatures.kept)) > 0){
GEfeatures.kept <- GEfeatures.kept[is.na(CNfeatures.kept) == FALSE]
CNfeatures.kept <- CNfeatures.kept[is.na(CNfeatures.kept) == FALSE]
}

# in the unlikely event that nothing could be matched: stop
if ((length(CNfeatures.kept) == 0) | (length(GEfeatures.kept) == 0)){
stop("No features could be matched. Check format of annotation information.")
}

# sort out matched cghCall & ExpressionSet objects to be returned
fData(GEdata) <- fData(GEdata)[GEfeatures.kept, , drop=FALSE]
exprs(GEdata) <- exprs(GEdata)[GEfeatures.kept, , drop=FALSE]

# sort out matched cghCall & ExpressionSet objects to be returned
fd <- fData(CNdata)[CNfeatures.kept, , drop=FALSE]
newRowNames <- paste(rownames(copynumber(CNdata))[CNfeatures.kept], "_", rownames(fData(GEdata)), sep="")
rownames(fd) <- newRowNames
metaData <- data.frame(labelDescription=c( "Chromosome", "Start", "End")) 
fd <- new("AnnotatedDataFrame", data=data.frame(fd), varMetadata=metaData)
cn <- copynumber(CNdata)[CNfeatures.kept, , drop=FALSE]
rownames(cn) <- newRowNames
sg <- segmented(CNdata)[CNfeatures.kept, , drop=FALSE]
rownames(sg) <- newRowNames
hc <- calls(CNdata)[CNfeatures.kept, , drop=FALSE]
rownames(hc) <- newRowNames
pl <- probloss(CNdata)[CNfeatures.kept, , drop=FALSE]
rownames(pl) <- newRowNames
pn <- probnorm(CNdata)[CNfeatures.kept, , drop=FALSE]
rownames(pn) <- newRowNames
pg <- probgain(CNdata)[CNfeatures.kept, , drop=FALSE]
rownames(pg) <- newRowNames
if (is.null(probamp(CNdata)[CNfeatures.kept, , drop=FALSE])){ pa <- NULL } else { pa <- probamp(CNdata)[CNfeatures.kept, , drop=FALSE]; rownames(pa) <- newRowNames }
if (is.null(pa)){
CNdata <- new('cghCall', copynumber = cn, segmented = sg, calls = hc, probloss = pl, probnorm = pn, probgain = pg, featureData = fd) 
} else {
CNdata <- new('cghCall', copynumber = cn, segmented = sg, calls = hc, probloss = pl, probnorm = pn, probgain = pg, probamp = pa, featureData = fd) 
}

return(list(CNdata.matched = CNdata, GEdata.matched = GEdata))
}

