# TOOL merge-datasets.R: "Merge data sets" (Merges two data sets. Only rows common to both are kept, and all annotation data is taken from the first data set. A new phenodata file is generated by combining the two original phenodata files.)
# INPUT normalized_1.tsv: normalized_1.tsv TYPE GENE_EXPRS 
# INPUT normalized_2.tsv: normalized_2.tsv TYPE GENE_EXPRS 
# INPUT phenodata_1.tsv: phenodata_1.tsv TYPE GENERIC 
# INPUT phenodata_2.tsv: phenodata_2.tsv TYPE GENERIC 
# OUTPUT merged.tsv: merged.tsv 
# OUTPUT phenodata-merged.tsv: phenodata-merged.tsv 

# Ilari Scheinin <firstname.lastname@gmail.com>
# 2012-10-12

# read data set 1
# The first column holds the row identifiers; check.names=FALSE keeps the
# column names exactly as they appear in the file.
file1 <- 'normalized_1.tsv'
dat <- read.table(file1, header=TRUE, sep='\t', quote='', row.names=1, check.names=FALSE)

# read data set 2
file2 <- 'normalized_2.tsv'
dat2 <- read.table(file2, header=TRUE, sep='\t', quote='', row.names=1, check.names=FALSE)

# take common rows
# drop=FALSE is required: without it, a data set that has only one column
# would collapse into a plain vector here and lose its column names, which
# breaks every colnames()-based step that follows.
common.rows <- intersect(rownames(dat), rownames(dat2))
dat <- dat[common.rows, , drop=FALSE]
dat2 <- dat2[common.rows, , drop=FALSE]

# extract data
# Columns whose name contains 'chip' form the ratios matrix and columns whose
# name contains 'flag' form the calls matrix; data set 2 is appended to the
# right of data set 1. drop=FALSE keeps every selection a data frame even when
# it matches exactly one column.
ratios <- dat[, grep('chip', colnames(dat)), drop=FALSE]
calls <- dat[, grep('flag', colnames(dat)), drop=FALSE]
ratios <- cbind(ratios, dat2[, grep('chip', colnames(dat2)), drop=FALSE])
calls <- cbind(calls, dat2[, grep('flag', colnames(dat2)), drop=FALSE])

# generate new identifiers
# seq_len() (rather than 1:n) yields no identifiers when there are no columns.
microarrays <- sprintf('microarray%.3i', seq_len(ncol(ratios)))
colnames(ratios) <- paste('chip.', microarrays, sep='')
# calls are only renamed when there is exactly one call column per ratio column
if (ncol(ratios) == ncol(calls)) {
  colnames(calls) <- paste('flag.', microarrays, sep='')
}

# remove old matrices from data set 1
# A logical mask is safe when nothing matches (negative indexing with an empty
# grep() result would select zero columns), and drop=FALSE preserves the
# column name when only a single annotation column remains.
dat <- dat[, !grepl('chip|flag', colnames(dat)), drop=FALSE]

# calculate new frequencies
# Frequencies are recomputed only if data set 1 already carried a loss.freq
# column and there is one call column per ratio column; otherwise any existing
# frequency columns are removed because they would no longer describe the
# merged set of samples.
if ('loss.freq' %in% colnames(dat) && ncol(ratios) == ncol(calls)) {
  # Work on a matrix: mean() has no data frame method (it returns NA with a
  # warning), and %in% on a data frame compares against whole columns rather
  # than individual cells.
  calls.matrix <- as.matrix(calls)
  # per-row fraction of samples whose call equals -1 / 1
  dat$loss.freq <- round(rowMeans(calls.matrix == -1), digits=3)
  dat$gain.freq <- round(rowMeans(calls.matrix == 1), digits=3)
  # amp.freq is only reported when at least one call equals 2
  if (any(calls.matrix == 2, na.rm=TRUE)) {
    dat$amp.freq <- round(rowMeans(calls.matrix == 2), digits=3)
  } else {
    dat$amp.freq <- NULL
  }
} else {
  dat$loss.freq <- NULL
  dat$gain.freq <- NULL
  dat$amp.freq <- NULL
}

# generate new table
# Remaining columns of data set 1 come first, followed by the merged ratios;
# the merged calls are appended only when there is one call column per ratio
# column.
dat <- cbind(dat, ratios)
calls.are.complete <- ncol(ratios) == ncol(calls)
if (calls.are.complete) {
  dat <- cbind(dat, calls)
}

# process phenodata
# as.is=TRUE keeps text columns as character vectors instead of factors.
phenodata1 <- read.table('phenodata_1.tsv', header=TRUE, sep='\t', as.is=TRUE)
phenodata2 <- read.table('phenodata_2.tsv', header=TRUE, sep='\t', as.is=TRUE)

# fill in columns present only in one phenodata table
# rbind() on data frames needs the same set of column names on both sides, so
# a column missing from one table is added there filled with NA.
for (col in setdiff(colnames(phenodata1), colnames(phenodata2))) {
  phenodata2[,col] <- NA
}
for (col in setdiff(colnames(phenodata2), colnames(phenodata1))) {
  phenodata1[,col] <- NA
}

# combine phenodata tables and update sample identifiers
phenodata <- rbind(phenodata1, phenodata2)
# Check the row count explicitly: assigning a shorter vector to a data frame
# column recycles it silently whenever the lengths divide evenly, which would
# produce duplicated sample identifiers.
if (nrow(phenodata) != length(microarrays)) {
  stop(sprintf('The combined phenodata has %d rows, but the merged data has %d samples.',
    nrow(phenodata), length(microarrays)), call.=FALSE)
}
phenodata$sample <- microarrays

# write files
# A scipen of 10 biases number formatting towards fixed notation, so values
# are written out in full rather than in scientific notation.
options(scipen=10)

# Both outputs are unquoted, tab-separated tables with a header line; they
# differ only in whether row names are written and how NA is rendered.
write.tsv <- function(x, file, ...) {
  write.table(x, file=file, quote=FALSE, sep='\t', col.names=TRUE, ...)
}
write.tsv(dat, 'merged.tsv', row.names=TRUE)
write.tsv(phenodata, 'phenodata-merged.tsv', na='', row.names=FALSE)

# EOF
