# TOOL extract-samples-from-dataset.R: "Extract samples from dataset" (Extracts samples from a dataset. Saves the extracted samples as a new dataset. If there are missing values in the specified phenodata column, the samples that do have a value are extracted. If there are no missing values, the samples to be extracted have to be coded with 1, and the samples to be deleted with 0 in the same column.)
# INPUT normalized.tsv: normalized.tsv TYPE GENE_EXPRS 
# INPUT META phenodata.tsv: phenodata.tsv TYPE GENERIC 
# OUTPUT extract.tsv: extract.tsv 
# OUTPUT META phenodata.tsv: phenodata.tsv 
# PARAMETER column.extract: column.extract TYPE METACOLUMN_SEL DEFAULT group (Phenodata column containing the samples to be extracted)

# Extracts subset of samples from a dataset
# JTT, 19.10.2007
#
# Modified to also include annotation info in the output dataset.
# MG, 20.4.2010
#
# Modified to first filter out samples with missing values. And in the case of no missing values, follow
# the earlier behaviour of coding samples to be extracted with 1 and samples to be removed with 0.
# IS, 28.7.2010
#
# Modified to handle 2-color array data when extracting single sample.
# MG, 17.6.2011

# Parameter settings (default) for testing purposes
#column.extract<-"group"

# Loads the data file
file<-c("normalized.tsv")
dat<-read.table(file, header=T, sep="\t", row.names=1)

# Separates expression values and flags
calls<-dat[,grep("flag", names(dat))]
dat2<-dat[,grep("chip", names(dat))]

# Handle average columns for 2-color arrays
dat_average <- dat[,grep("average", names(dat))]

# Check if there is annotation info available and if so extract it
annotations <- dat[,-c(grep("chip",names(dat)), grep("flag", names(dat)), grep("average", names(dat)))]
if (length(annotations)>0) {
	rownames(annotations) <- rownames(dat)
}

# Loads phenodata
phenodata<-read.table("phenodata.tsv", header=T, sep="\t")

# Extract the data from the phenodata column
extract<-phenodata[,pmatch(column.extract,colnames(phenodata))]

# If there are samples with missing values, extract the ones that do have values.
if (length(extract[is.na(extract)])>0) {
	extract[!is.na(extract)] <- 1
	extract[is.na(extract)] <- 0
}

# Sanity checks
if(length(unique(extract))>2) {
	stop("CHIPSTER-NOTE: You have specified more than two groups! You need to define exactly two groups.")
}
if(max(extract>1)) {
	stop("CHIPSTER-NOTE: The groups should be defined with values of 0 and 1! You have numbers larger than 1 in the definitions.")
}

# Extracting the samples
dat3<-dat2[,which(extract==1)]
if (dim(dat_average)[2]>0) {
	dat3<-data.frame (dat2[,which(extract==1)], dat_average[,which(extract==1)])
	colnames(dat3) <- c(names(dat2)[which(extract==1)], names(dat_average)[which(extract==1)])
} 
if(ncol(calls)>=1) {
	calls2<-calls[,which(extract==1)]
}
phenodata2<-phenodata[which(extract==1),]

# Writing the data to disk
if (length(annotations)>0) {
	dat3 <- data.frame(annotations,dat3)
}
if(ncol(calls)>=1) {
	write.table(data.frame(dat3, calls2), file="extract.tsv", sep="\t", row.names=T, col.names=T, quote=F)
} else {
	write.table(data.frame(dat3), file="extract.tsv", sep="\t", row.names=T, col.names=T, quote=F)
}
write.table(phenodata2, file="phenodata.tsv", sep="\t", row.names=F, col.names=T, quote=F, na='')
