/********************************************************************
 * FILE: gomo.c
 * AUTHOR: Fabian Buske
 * CREATE DATE: 18/06/2008
 * PROJECT: MEME suite
 * COPYRIGHT: 2008, UQ
 *
 * GOMO is an implementation of the algorithm described in
 * "Associating transcription factor binding site motifs with target
 * Go terms and target genes"
 * authors: Mikael Boden and Timothy L. Bailey
 * published: Nucl. Acids Res (2008)
 *
 ********************************************************************/

#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <float.h>
#include <math.h>
#include <ctype.h>
#include <time.h>
#include "matrix.h"
#include "alphabet.h"
#include "cisml.h"
#include "fasta-io.h"
#include "meme-io.h"
#include "string-list.h"
#include "hash_table.h"
#include "simple-getopt.h"
#include "read_csv.h"
#include "ranksum_test.h"
#include "io.h"
#include "read_tamo.h"
#include "dir.h"
#include "qvalue.h"
#include "xml-util.h"
#include "config.h"
#include "projrel.h"

#include <libxslt/xslt.h>
#include <libxslt/xsltInternals.h>
#include <libxslt/transform.h>
#include <libxslt/xsltutils.h>

/*
 * Smaller / larger of two values.
 * Arguments and the whole expansion are parenthesized so the macros can be
 * used safely inside larger expressions (e.g. min(a,b)+1 or 2*max(a,b)).
 * Each argument may still be evaluated twice, so do not pass expressions
 * with side effects.
 */
#define min(a,b)      (((a)<(b))?(a):(b))
#define max(a,b)      (((a)<(b))?(b):(a))
//#define concat(a, b)  a ## b

/* Name of this program. */
char* program_name = "gomo";
/* Current verbosity level; diagnostic output in this file is gated on it. */
VERBOSE_T verbosity = NORMAL_VERBOSE;
const int GFF_FORMAT=0;   // GFF file format
const int CISML_FORMAT=1; // cisML file format
#define HASH_BINS 1000 	/* Number of bins used for hashing. */
#define SHOW_EVALUE 10 	/* E-value threshold */
#define MCHUNK 10 			/* growth increment (in elements) for arrays that are enlarged on demand */
const int DEFAULT_MAX_ENTRIES = 10; /* number of slots by which an entry list grows whenever it is full */
static BOOLEAN clobber = FALSE; /* default is not to overwrite existing files */
static char *default_output_dirname = "gomo_out";  /* default name of output */

/*
 * One GOMO result: a GO term together with the score computed for it and
 * the position of that term in the GO term list it was selected from.
 */
typedef struct gomo_result {
  char* goterm; /* GO term identifier */
  double score; /* score obtained for the GO term */
  int index;    /* position of the GO term in its source list */
} GOMO_RESULT_T;

/*************************************************************************
 * Growable array of hash table entry pointers (indexing data structure)
 *************************************************************************/
typedef struct entry_list_t {
  int size;                       /* number of entries currently stored */
  int max_size;                   /* number of slots currently allocated */
  HASH_TABLE_ENTRY** index_list;  /* the stored entry pointers */
} ENTRY_LIST_T;
/**
 * Allocate an empty entry list (no entries, no storage reserved yet).
 * @return the newly allocated list; release it with free_entry_list()
 */
ENTRY_LIST_T* new_entry_list () {
  ENTRY_LIST_T* fresh = (ENTRY_LIST_T*)mm_calloc(1, sizeof(ENTRY_LIST_T));

  fresh->index_list = NULL;
  fresh->max_size = 0;
  fresh->size = 0;

  return fresh;
}
/**
 * Abort via die() when the given list pointer is NULL.
 * @param a_list list pointer to validate
 */
static void check_null_list (ENTRY_LIST_T*  a_list){
  if (a_list != NULL) {
    return;
  }
  die("Attempted to access null index list.\n");
}
/**
 * Append a hash table entry to the list.
 * Calls die() if the very same entry pointer is already stored. The
 * backing array grows by DEFAULT_MAX_ENTRIES slots whenever it is full.
 * @param entry  entry to append
 * @param a_list list receiving the entry
 */
void add_entry (HASH_TABLE_ENTRY* entry, ENTRY_LIST_T* a_list){
  check_null_list(a_list);

  /* reject duplicates (pointer identity) */
  int pos = 0;
  while (pos < a_list->size) {
    if (entry == a_list->index_list[pos]) {
      die("try to put entry twice in entrylist");
    }
    ++pos;
  }

  /* enlarge the array when every allocated slot is in use */
  if (a_list->max_size <= a_list->size) {
    Resize(a_list->index_list, a_list->max_size+DEFAULT_MAX_ENTRIES, HASH_TABLE_ENTRY*);
    a_list->max_size = a_list->max_size + DEFAULT_MAX_ENTRIES;
  }

  a_list->index_list[a_list->size] = entry;
  a_list->size += 1;
}
/**
 * Report how many entries the list currently holds.
 * Calls die() if the list pointer is NULL.
 * @param a_list list to inspect
 * @return number of stored entries
 */
int get_amount_entries (ENTRY_LIST_T*  a_list){
  check_null_list(a_list);
  const int stored = a_list->size;
  return stored;
}
/**
 * Return the entry stored at a given position of the list.
 * Calls die() if the list is NULL or if n is not a valid index, i.e. not
 * in the range [0, size). Valid indices are zero-based.
 * @param n      zero-based position
 * @param a_list list of interest
 * @return entry at position n
 */
HASH_TABLE_ENTRY* get_nth_entry (int n, ENTRY_LIST_T*  a_list){
  check_null_list(a_list);
  /* index == max_size / index == size are already out of range */
  if (n < 0 || n >= a_list->max_size) {
    die("Attempted to access entry outside of list bounds.\n");
  } else if (n >= a_list->size) {
    die("Attempted to access uninitialized entry.\n");
  }

  return(a_list->index_list[n]);
}
/**
 * Release an entry list and its backing array.
 * The entries themselves are not freed. A NULL list is ignored.
 * @param a_list list to release
 */
void free_entry_list (ENTRY_LIST_T* a_list){
  if (a_list != NULL) {
    myfree(a_list->index_list);
    myfree(a_list);
  }
}

/*************************************************************************
 * gomo_dts
 *
 * XML prolog, stylesheet reference and inline document type definition
 * written at the very beginning of the GOMO XML output
 * (see start_xml_output()).
 *************************************************************************/
static char *gomo_dts = "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n"
	"<?xml-stylesheet type=\"text/xsl\" href=\"gomo.xsl\"?>\n"
	"<!-- Document definition -->\n"
	"<!DOCTYPE gomo[\n"
	"<!ELEMENT gomo (program-name, model, species )>\n"
	"<!ATTLIST gomo xmlns:xsi CDATA #IMPLIED\n"
	"	version CDATA #REQUIRED\n"
	"	release CDATA #REQUIRED\n"
	">\n"
	"<!ELEMENT program-name (#PCDATA)>\n"
	"<!ELEMENT model (command_line, significance_threshold,score_E_threshold,use_Evalues)>\n"
	"<!ELEMENT command_line (#PCDATA)>\n"
	"<!ELEMENT significance_threshold (#PCDATA)>\n"
	"<!ELEMENT score_E_threshold (#PCDATA)>\n"
	"<!ELEMENT use_Evalues (#PCDATA)>\n"
	"<!ELEMENT species (input-files, (pattern+) )>\n"
	"<!ATTLIST species name CDATA #REQUIRED\n"
	"   id	CDATA #REQUIRED\n"
	">\n"
	"<!ELEMENT input-files (scoring-file, godb-file)>\n"
	"<!ELEMENT scoring-file (#PCDATA)>\n"
	"<!ELEMENT godb-file (#PCDATA)>\n"
	"<!ELEMENT pattern (goterm*)>\n"
	"<!ATTLIST pattern accession CDATA #REQUIRED\n"
	"	name CDATA #REQUIRED\n"
	">\n"
	"<!ELEMENT goterm (#PCDATA)>\n"
	"<!ATTLIST goterm id CDATA #REQUIRED\n"
	"	description CDATA #REQUIRED\n"
	"	E-value CDATA #REQUIRED\n"
	"	qvalue CDATA #REQUIRED\n"
	">\n"
	"]>\n";

/*************************************************************************
 * add_to_hash()
 *
 * Add one sequence / GO term pair to the hash.
 *
 * The GO term is inserted into go_ht if it is not present yet. The value
 * of its hash entry is an ENTRY_LIST_T (created on first use) to which
 * seq_entry is appended. Calls die() if the GO term cannot be hashed.
 *
 * Declared static inline: a plain C99 "inline" definition provides no
 * external definition, so a call that is not inlined would fail to link.
 *************************************************************************/
static inline void add_to_hash(char* go_term, HASH_TABLE* go_ht, HASH_TABLE_ENTRY* seq_entry){
  assert(go_term != NULL);
  assert(go_ht != NULL);
  assert(seq_entry != NULL);

  int dummy; // dummy: required by hash_lookup interface.
  HASH_TABLE_ENTRY* he = hash_lookup_str(go_term, *go_ht, &dummy);
  if (he == NULL) {
    // first time this GO term is seen: record it
    if (hash_insert_str(go_term, *go_ht) == FALSE) {
      die("Hashing of go term failed!");
    } else {
      he = hash_lookup_str(go_term, *go_ht, &dummy);
    }
  }
  // lazily create the list of sequences assigned to this GO term
  if (hash_get_entry_value(he) == NULL){
    hash_set_entry_value(new_entry_list(),he);
  }
  // add sequence (hash entry) to list of sequences assigned to this go term
  add_entry(seq_entry, (ENTRY_LIST_T*)(hash_get_entry_value(he)));
}

/*************************************************************************
 * add_to_description_hash()
 *
 * Adds one GO term / description pair to the hash.
 *
 * If the GO term is already present nothing happens (the description
 * stored first is kept). Otherwise the term is inserted and a heap copy
 * of go_description becomes the value of its hash entry. Calls die() if
 * the term cannot be hashed.
 *
 * Declared static inline: a plain C99 "inline" definition provides no
 * external definition, so a call that is not inlined would fail to link.
 *************************************************************************/
static inline void add_to_description_hash(char* go_term, HASH_TABLE* godesc_ht,
    char* go_description){

  assert(go_term != NULL);
  assert(go_description != NULL);
  assert(godesc_ht != NULL);

  int dummy; // dummy: required by hash_lookup interface.
  if (hash_lookup_str(go_term, *godesc_ht, &dummy) != NULL) {
    return; // description already recorded for this GO term
  }

  // add go term to go-description hash
  if (hash_insert_str(go_term, *godesc_ht)){
    HASH_TABLE_ENTRY* he = hash_lookup_str(go_term, *godesc_ht, &dummy);
    // add description copy as the value
    char* go_desc_copy = NULL;
    Resize(go_desc_copy, strlen(go_description)+1, char);
    strcpy(go_desc_copy,go_description);
    hash_set_entry_value(go_desc_copy,he);
  } else {
    die("ERROR: Hashing of go description failed.");
  }
}

/*************************************************************************
 * get_unique_go_terms()
 *
 * Reads all go files for a gomo run and returns a unique list of the union
 * of all the go_terms contained in the files.
 *
 * Each file is parsed with read_csv() ('#' and tab are passed as its
 * character arguments); the first field of every non-empty record is
 * taken as the GO term. A file name that occurs more than once in
 * gofiles is read only once. Calls die() if a file cannot be opened.
 *************************************************************************/
void get_unique_go_terms(
    STRING_LIST_T* gofiles, // all go files to scan IN
    STRING_LIST_T* go_terms // STRING_LIST holding the set of unique go terms OUT
){
  assert(go_terms != NULL);
  assert(get_num_strings(go_terms) == 0);
  assert(gofiles != NULL);

  if (verbosity >= HIGH_VERBOSE){
    fprintf(stderr,"Read in %d go files in total.\n", get_num_strings(gofiles));
  }

  STRING_LIST_T* processedFiles = new_string_list();
  int c,i;

  for (c=0; c<get_num_strings(gofiles);++c){
    char* filename = get_nth_string(c,gofiles);
    STRING_LIST_T **content = NULL; /* content of input file */
    int length = 0;                 /* number of records read */

    // if file already processed then skip it
    if (have_string(filename,processedFiles)){
      if (verbosity >= HIGH_VERBOSE){
        fprintf(stderr,"Skip file %s... (already processed)\n",filename);
      }
      continue;
    }

    if (verbosity >= HIGH_VERBOSE){
      fprintf(stderr,"Read file %s...\n",filename);
    }

    FILE *inputFilePtr = fopen(filename, "r"); /* Open in TEXT mode */
    if (inputFilePtr == NULL){
      // fail with a clear message instead of handing NULL to read_csv/fclose
      die("Could not open GO file %s for reading.\n", filename);
    }
    read_csv(inputFilePtr,'#','\t',&content,&length); /* Get csv content*/
    fclose(inputFilePtr);

    for (i=0;i<length;++i){ /* read in each and every line */
      if (get_num_strings(content[i])>0){
        char* go_term = get_nth_string(0,content[i]);
        if (!have_string(go_term,go_terms)){
          add_string(go_term, go_terms);
        }
      }
      // free content
      free_string_list(content[i]);
    }
    add_string(filename, processedFiles);
    myfree(content);
  }

  free_string_list(processedFiles);

  if (verbosity >= NORMAL_VERBOSE){
    fprintf(stderr, "Total number of unique go terms %d\n", get_num_strings(go_terms));
  }
}

/*************************************************************************
 * read_go_file()
 *
 * Reads in a GO term to sequence file.
 *
 * The file is parsed with read_csv() ('#' and tab are passed as its
 * character arguments). In every record field 0 is the GO term, field 1
 * its description and fields 2.. are sequence ids. Records with two or
 * fewer fields are skipped. For every other record:
 *   - each sequence id is inserted into sequence_list if missing,
 *   - its hash entry is appended to the GO term's list in go_ht,
 *   - the GO term is appended to go_terms if not yet contained,
 *   - the description is stored in godesc_ht.
 * Calls die() if the file cannot be opened or hashing fails.
 *************************************************************************/
void read_go_file(char* filename, STRING_LIST_T* go_terms, HASH_TABLE* go_ht,
    HASH_TABLE* godesc_ht, HASH_TABLE* sequence_list){
  STRING_LIST_T **content = NULL; /* parsed records of the input file */

  assert(filename != NULL);
  assert(go_terms != NULL);
  assert(go_ht != NULL);
  assert(godesc_ht != NULL);
  assert(sequence_list != NULL);

  int i,j,length = 0;
  FILE *inputFilePtr = fopen(filename, "r"); /* Open in TEXT mode */
  if (inputFilePtr == NULL){
    // fail with a clear message instead of handing NULL to read_csv/fclose
    die("Could not open GO file %s for reading.\n", filename);
  }
  read_csv(inputFilePtr,'#','\t',&content,&length); /* Get csv content*/
  fclose(inputFilePtr);

  if (verbosity >= NORMAL_VERBOSE){
    fprintf(stderr, "Read in %d lines from %s\n", length,filename);
  }

  for (i=0;i<length;++i){ /* read in each and every line */
    if (get_num_strings(content[i])<=2){
      if (verbosity >= NORMAL_VERBOSE){
        fprintf(stderr, "Go-term does not exists or lacks assigned sequences. Skip this entry!\n");
      }
    } else {
      char* go_term = get_nth_string(0,content[i]);
      // link sequence ids to go term via hash
      for (j=2;j<get_num_strings(content[i]);++j){
        char* seq_id = get_nth_string(j,content[i]);
        // get the sequence hash entry from the sequence_list or put in
        // there if not in yet
        int dummy; // required by hash_lookup interface
        HASH_TABLE_ENTRY* he = hash_lookup_str(seq_id, *sequence_list, &dummy);
        if (he == NULL) {
          // Record current sequence id
          if (hash_insert_str(seq_id, *sequence_list) == FALSE) {
            die("Hashing of sequence id failed!");
          } else {
            he = hash_lookup_str(seq_id, *sequence_list, &dummy);
          }
        }
        add_to_hash(go_term, go_ht, he);
      }
      /* add go-term to key list if not included yet */
      if (!have_string(go_term,go_terms)){
        add_string(go_term, go_terms);
      }
      /* add description */
      add_to_description_hash(go_term, godesc_ht, get_nth_string(1,content[i]));
    }
  }
  // free content
  for (i=0;i<length;++i){
    free_string_list(content[i]);
  }
  myfree(content);
}

/*************************************************************************
 * remove_empty_goterms()
 *
 * Drops every GO term from go_terms that has no entry in go_ht, i.e. for
 * which no associated sequence was recorded.
 *************************************************************************/
void remove_empty_goterms(STRING_LIST_T* go_terms, HASH_TABLE* go_ht){
  STRING_LIST_T *unmapped = new_string_list(); // terms scheduled for removal
  int unused;                                  // required by hash_lookup interface
  int pos = 0;

  // collect first, remove afterwards, so the list is not changed while scanned
  while (pos < get_num_strings(go_terms)) {
    char* term = get_nth_string(pos,go_terms);
    HASH_TABLE_ENTRY* found = hash_lookup_str(term, *go_ht, &unused);
    if (found == NULL){
      add_string(term,unmapped);
    }
    ++pos;
  }

  if (get_num_strings(unmapped) >= 1){
    remove_strings(unmapped, go_terms);
  }
  free_string_list(unmapped);
}

/*************************************************************************
 * gomo_result_cmp()
 *
 * qsort() comparator for GOMO_RESULT_T elements: orders results by
 * ascending score. Returns 1, -1 or 0.
 *
 * Declared static inline because its address is passed to qsort(); a
 * plain C99 "inline" definition provides no external definition to take
 * the address of at link time. The elements are compared in place
 * through const pointers instead of being copied.
 *************************************************************************/
static inline int gomo_result_cmp(const void *v1, const void *v2)
{
  assert(v1 != NULL);
  assert(v2 != NULL);
  const GOMO_RESULT_T *r1 = (const GOMO_RESULT_T *)v1;
  const GOMO_RESULT_T *r2 = (const GOMO_RESULT_T *)v2;
  if (r1->score > r2->score){
    return 1;
  } else if (r1->score < r2->score){
    return -1;
  } else {
    return 0;
  }
}

/*************************************************************************
 * sequence_pvalue_cmp()
 *
 * Compares two scanned sequences with regard to their p-values
 * P-values are sorted ascending
 *************************************************************************/
inline int sequence_pvalue_cmp(const void *v1, const void *v2)
{
  assert(v1 != NULL);
  assert(v2 != NULL);
  SCANNED_SEQUENCE_T* s1 = *(SCANNED_SEQUENCE_T **)v1;
  SCANNED_SEQUENCE_T* s2 = *(SCANNED_SEQUENCE_T **)v2;
  double diff = get_scanned_sequence_pvalue(s1)-get_scanned_sequence_pvalue(s2);
  if (diff > 0.0){
	  return 1;
  } else if (diff < 0.0){
	  return -1;
  } else {
	  return 0;
  }
}

/*************************************************************************
 * sequence_score_cmp()
 *
 * Compares two scanned sequences with regard to their scores
 * Scores are sorted decending
 *************************************************************************/
inline int sequence_score_cmp(const void *v1, const void *v2)
{
  assert(v1 != NULL);
  assert(v2 != NULL);
  SCANNED_SEQUENCE_T* s1 = *(SCANNED_SEQUENCE_T **)v1;
  SCANNED_SEQUENCE_T* s2 = *(SCANNED_SEQUENCE_T **)v2;
  double diff = get_scanned_sequence_score(s1)-get_scanned_sequence_score(s2);
  if (diff < 0.0){
	  return 1;
  } else if (diff > 0.0){
	  return -1;
  } else {
	  return 0;
  }
}

/*************************************************************************
 * sort_scanned_sequences()
 *
 * Sorts, in place, the scanned sequences stored in the cisml pattern:
 * by ascending p-value when use_Evalues is set, by descending score
 * otherwise. At HIGHER_VERBOSE verbosity the resulting order is listed
 * on stderr.
 *************************************************************************/
void sort_scanned_sequences(PATTERN_T *pattern, BOOLEAN_T use_Evalues){
  assert(pattern != NULL);

  SCANNED_SEQUENCE_T **sequences = get_pattern_scanned_sequences(pattern);
  const int count = get_pattern_num_scanned_sequences(pattern);

  // choose the ordering once, then sort
  int (*compare)(const void *, const void *) = sequence_score_cmp;
  if (use_Evalues) {
    compare = sequence_pvalue_cmp;
  }
  qsort(sequences, count, sizeof(sequences[0]), compare);

  if (verbosity < HIGHER_VERBOSE) {
    return;
  }

  // diagnostic listing of the sorted sequences
  fprintf(stderr,"Pattern: %s\nOrdered Sequences:\n",get_pattern_name(pattern));
  int pos;
  for (pos = 0; pos < count; ++pos) {
    SCANNED_SEQUENCE_T* current = get_pattern_scanned_sequences(pattern)[pos];
    if (use_Evalues) {
      fprintf(stderr,"%s : %f\n",get_scanned_sequence_accession(current),get_scanned_sequence_pvalue(current));
    } else {
      fprintf(stderr,"%s : %f\n",get_scanned_sequence_accession(current),get_scanned_sequence_score(current));
    }
  }
}

/*************************************************************************
 * hash_sequence_ranks()
 *
 * Creates a hash table (stored in *seqRanks_ht) that maps the accession
 * of every scanned sequence of the pattern to its position in the
 * pattern's sequence array. The position is stored as a heap allocated
 * int in the value of the hash entry. Calls die() if an accession occurs
 * twice or cannot be hashed.
 *************************************************************************/
void hash_sequence_ranks(HASH_TABLE* seqRanks_ht, PATTERN_T* pattern){
  assert(seqRanks_ht != NULL);
  assert(pattern != NULL);

  const int seqs = get_pattern_num_scanned_sequences(pattern);
  *seqRanks_ht = hash_create(seqs*4);

  int unused; // required by hash_lookup interface
  int rank = 0;
  while (rank < seqs) {
    SCANNED_SEQUENCE_T *seq = get_pattern_scanned_sequences(pattern)[rank];
    char* seqName = get_scanned_sequence_accession(seq);
    HASH_TABLE_ENTRY *entry = hash_lookup_str(seqName,*seqRanks_ht,&unused);

    if (entry != NULL){
      // every accession may appear only once per pattern
      die("Error! Key already exists in the hash table: %s (%s hash_sequence_ranks pattern %s)\n",
          seqName,hash_get_entry_key(entry),get_pattern_accession(pattern));
    } else {
      BOOLEAN inserted = hash_insert_str(seqName, *seqRanks_ht);
      entry = hash_lookup_str(seqName, *seqRanks_ht, &unused);
      if (inserted == FALSE) {
        die("Hashing of sequences to its rank did not work out! bad luck!");
      }

      // attach the rank to the freshly created entry
      if (hash_get_entry_value(entry) == NULL){
        int* stored_rank = mm_malloc(sizeof(int));
        *stored_rank = rank;
        hash_set_entry_value(stored_rank,entry);
      }
    }
    ++rank;
  }
}


/*************************************************************************
 * add_to_motifht()
 *
 * Registers a motif in the motif hash table unless it is already there.
 *
 * For a new motif the accession is appended to motifs and inserted into
 * motif_ht. The value of its entry is a new hash table keyed by every
 * GO term of all_go_terms; each of those entries holds an array of
 * num_organisms doubles, all initialised to 1.0.
 *************************************************************************/
void add_to_motifht(
    HASH_TABLE *motif_ht,
    STRING_LIST_T* motifs,
    STRING_LIST_T* all_go_terms,
    char* accession,
    int num_organisms
){
  int unused; // required by hash_lookup interface

  // nothing to do when the motif is known already
  if (hash_lookup_str(accession,*motif_ht, &unused) != NULL) {
    return;
  }

  // add motif to keyset and to the hashtable
  add_string(accession, motifs);
  hash_insert_str(accession, *motif_ht);
  HASH_TABLE_ENTRY* motif_entry = hash_lookup_str(accession, *motif_ht, &unused);

  // the value of the motif entry is a hashtable keyed by go term
  HASH_TABLE term_table = hash_create(get_num_strings(all_go_terms)*2);
  hash_set_entry_value(term_table,motif_entry);

  // one array of per-organism values for every go term
  int term_idx;
  for (term_idx = 0; term_idx < get_num_strings(all_go_terms); ++term_idx){
    char* term = get_nth_string(term_idx,all_go_terms);
    hash_insert_str(term, term_table);
    HASH_TABLE_ENTRY* term_entry = hash_lookup_str(term, term_table, &unused);

    double* per_organism = mm_malloc(sizeof(double)*num_organisms);
    int org;
    for (org = 0; org < num_organisms; ++org){
      per_organism[org] = 1.0; // set to worst case
    }
    hash_set_entry_value(per_organism,term_entry);
  }
}

/*************************************************************************
 * run_gomo_on_organism()
 *
 * Scores every GO term of one organism against one motif (pattern) and
 * writes the significant terms to the text (and optionally XML) output.
 *
 * Steps:
 *  1. sort the scanned sequences of the pattern (by p-value or score),
 *  2. if several organisms are analysed, register the motif in motif_ht,
 *  3. map every sequence accession to its rank in the sorted list,
 *  4. for every GO term mark the ranked sequences that belong to it and
 *     run the rank sum test; the left-tailed p-value is the term's score
 *     (also stored in motif_ht when several organisms are analysed),
 *  5. terms without usable information are counted as "orphants" and
 *     excluded; the remaining number of terms is the multiple-testing
 *     factor applied to obtain E-values,
 *  6. terms with E-value <= e_threshold (all terms if e_threshold <= 0)
 *     are written in order of ascending score.
 *
 * The q-value computation is currently disabled; the value written in
 * the q-value column is the j-th smallest p-value of all tested terms.
 *
 * NOTE(review): GO term ids, descriptions and the pattern name are
 * written into XML attributes without escaping.
 *************************************************************************/
void run_gomo_on_organism(
    CISML_T* cisml,           // CISML containing scored sequences and patterns IN
    PATTERN_T *pattern,       // pattern of interest IN
    HASH_TABLE go_ht,         // GO-term 2 seq id hashtable IN
    HASH_TABLE godesc_ht,     // GO-term 2 GO description hashtable IN
    HASH_TABLE sequence_list, // list of all sequences IN (not used in this function)
    STRING_LIST_T* organism_go_terms, // all GO terms specific to this species IN
    STRING_LIST_T* all_go_terms, // all unique Go terms across all species IN
    double e_threshold,       // threshold for E-value IN
    int organism,             // current species of interest
    int num_organisms,        // all organisms
    HASH_TABLE motif_ht,      // motif -> (GO term -> per-organism p-values) hashtable
    BOOLEAN_T status,         // FLAG whether to output the program status IN
    BOOLEAN_T text_only,      // FLAG whether to output text only IN
    FILE* xml_output,         // target for xml output IN
    FILE* text_output,        // target for txt output IN
    BOOLEAN_T use_Evalues,    // FLAG whether to use the E-values rather than (Ama) scores
    STRING_LIST_T* motifs     // motifs processed for this species OUT
){
  // NOTE(review): i is never advanced in this function, so the final
  // status line always reports 0.0 %.
  int i = 0;
  int j = 0;
  int k = 0;
  int dummy = 0;                  // required by hash_lookup interface
  int s_length = 0;               // number of selected GO terms
  int class_a = 0;                // number of ranked sequences assigned to a GO term
  double p_value;
  HASH_TABLE seqRanks_ht = NULL;  // sequence accession -> rank
  HASH_TABLE_ENTRY *he = NULL;
  char* go_term = NULL;
  int* selected = NULL;           // indices of the GO terms passing the threshold
  double* score_pred = NULL;      // p-value per GO term, -1.0 if not tested
  double* scores = NULL;
  BOOLEAN_T* group = NULL;
  RSD_T* ranksum_dataset = NULL;
  RSR_T *ranksum_result = NULL;
  ENTRY_LIST_T* gene_entry_list = NULL;
  char* accession = NULL;         // pattern accession
  int seqs;                       // number of sequences in a pattern
  ARRAY_T* pv = NULL;             // pvalue array
  double* v = NULL;               // p-values
  int orphants=0;                 // GO terms with gene id but lacking any sequence for these

  if (status){
    fprintf(stderr,"Process organism %d : 0.0 %% \r",organism+1);
  }

  seqs = get_pattern_num_scanned_sequences(pattern);
  accession = get_pattern_accession(pattern);

  if (!text_only){
    fprintf(xml_output,"<pattern accession=\"%s\" name=\"%s\">\n",accession,get_pattern_name(pattern));
  }

  // sort scanned sequences with regard to score
  sort_scanned_sequences(pattern, use_Evalues);

  // add motif to motif hashtable if more than one organism is scanned
  if (num_organisms > 1){
    add_to_motifht(&motif_ht, motifs, all_go_terms, accession, num_organisms);
  }

  // create hash that maps gene ids to its rank
  hash_sequence_ranks(&seqRanks_ht,pattern);

  // predicted score, initialised with -1 ("not tested")
  score_pred = mm_malloc(sizeof(double)*get_num_strings(organism_go_terms));
  for (j=0;j<get_num_strings(organism_go_terms);++j){
    score_pred[j] = -1.0;
  }
  // class of each sequence
  group = mm_malloc(sizeof(BOOLEAN_T)*seqs);
  // get the ordered scores list
  scores = mm_malloc(sizeof(double)*seqs);
  for (j=0;j<seqs;++j){
    SCANNED_SEQUENCE_T *seq = get_pattern_scanned_sequences(pattern)[j];
    if (use_Evalues) {
      scores[j] = get_scanned_sequence_pvalue(seq);
    } else {
      scores[j] = get_scanned_sequence_score(seq);
    }
  }

  // Copy the scores and the uninitialized classes (group) to
  // a special object for ranksum routines.  Calculate the
  // (adjusted) rank of each score.
  // The group of each score will be reassigned differently for
  // each GO term.
  ranksum_dataset = get_ranksum_dataset(scores, group, seqs);

  /* container for the list of sorted p-values */
  Resize(pv,1,ARRAY_T);

  /* check each go-term */
  for (j=0;j<get_num_strings(organism_go_terms);++j){

    // number of elements in class a
    class_a = 0;

    // get go-term (key)
    go_term = get_nth_string(j,organism_go_terms);

    // define all entries as not belonging to the GO-term by default
    for (k=0;k<seqs;++k){
      group[k] = FALSE;
    }

    // look up go-term associated gene ids
    he = hash_lookup_str(go_term,go_ht,&dummy);
    if (he == NULL){
      // No gene list is known for this GO term: there is nothing to test,
      // so treat it like every other term without information.
      ++orphants;
      if (verbosity >= HIGHER_VERBOSE){
        fprintf(stderr,"Skip GO term without assigned genes: %s\n",go_term);
      }
      continue;
    }
    gene_entry_list = (ENTRY_LIST_T*)(hash_get_entry_value(he));

    // for each gene id look up its rank
    for (k=0;k<get_amount_entries(gene_entry_list);++k){
      // Get the geneid from the entry list and use it as a key
      // to get the position (rank) of the gene in the sorted list.
      HASH_TABLE_ENTRY *hlp = hash_lookup_str(hash_get_entry_key(get_nth_entry(k,gene_entry_list)),seqRanks_ht,&dummy);
      if (hlp != NULL){
        group[*((int*)hash_get_entry_value(hlp))] = TRUE;
        ++class_a;
      }
    }
    // if not all genes are assigned to this go term but at least one is
    // i.e. some information is found ... proceed
    if (seqs > get_amount_entries(gene_entry_list) && class_a > 0){

      set_ranksum_groups(ranksum_dataset, group);
      /* perform ranksum test on ordered dataset */
      ranksum_result = run_ranksum_test_on_ordered_dataset(ranksum_dataset);

      // returns p-value of left-tailed tests
      p_value = RSR_get_p_left(ranksum_result);

      // look up motif in motif-hashtable if more than one organism investigated
      if (num_organisms > 1){
        he = hash_lookup_str(accession, motif_ht, &dummy);
        if (he != NULL){
          // look up go term for motif
          he = hash_lookup_str(go_term, (HASH_TABLE)he->value, &dummy);
          if (he != NULL){
            ((double*)he->value)[organism] = p_value;
          } else {
            die("Error: Could not lookup go term in hashtable. "
                "Something during initialization must have failed.");
          }
        } else {
          die("Error: Could not lookup motif in hashtable. "
              "Something during initialization must have failed.");
        }
      }

      // save p_value
      score_pred[j] = p_value;
      destroy_rsr(ranksum_result);
    } else{
      ++orphants;
      if (verbosity >= HIGHER_VERBOSE){
        fprintf(stderr,"Skip due to all or no GO term assigned to this gene:%d %d %s\n",get_amount_entries(gene_entry_list), class_a,go_term);
      }
    }
  }

  if (verbosity >= HIGH_VERBOSE){
    if (orphants>0){
      fprintf(stderr,"%d GO terms have assigned genes which lack a sequence entry or for other reasons have no information to work with. These are skipped!\n",orphants);
    }
  }
  // number of GO terms actually tested = multiple testing factor
  int num_go_terms = get_num_strings(organism_go_terms)-orphants;

  myfree(scores);
  myfree(group);
  destroy_rsd(ranksum_dataset);

  /* order results with respect to p-value */
  GOMO_RESULT_T* res = NULL;

  Resize(v,num_go_terms,double);
  pv->items = v;
  pv->num_items = num_go_terms;

  /* correcting for multiple tests */
  int c=0;
  for (j=0;j<get_num_strings(organism_go_terms);++j){
    if (score_pred[j]!= -1){
      set_array_item(c++, score_pred[j], pv);
      if (e_threshold <= 0.0 || score_pred[j]*num_go_terms <= e_threshold){
        /* resize array of indices if necessary */
        if ((s_length % MCHUNK) == 0){
          Resize(selected, s_length+MCHUNK, int);
        }
        selected[s_length++] = j;
      } else if (verbosity >= HIGHER_VERBOSE){
        // report the term that is being rejected (term j), not the term
        // left over from the scoring loop above
        fprintf(stderr,"Not satisfying go term:%s %g\n", get_nth_string(j,organism_go_terms), score_pred[j]*num_go_terms);
      }
    }
  }
  sort_array(FALSE,pv);

  /* output results with respect to e-value */
  if (s_length > 0){
    Resize(res,s_length,GOMO_RESULT_T);
    for (j=0;j<s_length;++j){
      res[j].score = score_pred[selected[j]];
      res[j].goterm = get_nth_string(selected[j],organism_go_terms);
      res[j].index = selected[j];
    }
    qsort(res,s_length,sizeof(GOMO_RESULT_T),gomo_result_cmp);

    /* output go-terms*/
    for (j=0;j<s_length;++j){
      char* go_desc = "";
      he = hash_lookup_str(res[j].goterm,godesc_ht,&dummy);
      if (he != NULL && hash_get_entry_value(he) != NULL){
        go_desc = (char*)(hash_get_entry_value(he));
      }
      fprintf(text_output,"%s\t%s\t%.3e\t%.3e\t%s\n",get_pattern_accession(pattern),
          get_nth_string(res[j].index,organism_go_terms),
          res[j].score*num_go_terms,get_array_item(j,pv),go_desc);

      if (!text_only){
        fprintf(xml_output,"<goterm id=\"%s\" E-value=\"%.3e\" qvalue=\"%.3e\" description=\"%s\"/>\n",
            res[j].goterm,res[j].score*num_go_terms,get_array_item(j,pv),go_desc);
      }
    }
  }

  fprintf(text_output,"# %d total go-terms out of %d satisfying E <= %g for motif %s \n",s_length,num_go_terms,e_threshold,get_pattern_accession(pattern));

  /**********
  * clean up
  ***********/

  // free the rank stored with every sequence in the rank hashmap
  for (k=0;k<seqs;++k){
    SCANNED_SEQUENCE_T *seq = get_pattern_scanned_sequences(pattern)[k];
    char* seqName = get_scanned_sequence_accession(seq);
    he = hash_lookup_str(seqName,seqRanks_ht,&dummy);
    if (he != NULL && hash_get_entry_value(he) != NULL){
      int* x = (int *)hash_get_entry_value(he);
      myfree(x);
    }
  }

  hash_destroy(seqRanks_ht);
  myfree(score_pred);
  myfree(selected);
  myfree(res);
  free_array(pv);

  if (!text_only){
    fprintf(xml_output,"</pattern>\n");
  }

  if (status){
    fprintf(stderr, "Process organism %d: %3.1f %% \r",organism+1,(i)*100.0/(float)get_cisml_num_patterns(cisml));
  }
}

/*************************************************************************
 * start_xml_output()
 *
 * Initiates the XML output: writes the document type definition, the
 * opening <gomo> element with version and release information, the
 * program name and the <model> section (reconstructed command line,
 * significance threshold, score E-value threshold, use_Evalues flag).
 *
 * The command line is rebuilt as "gomo" followed by every argument
 * argv[1..argc-1], each preceded by a single blank.
 *
 * NOTE(review): program name and command line are written without XML
 * escaping.
 *************************************************************************/
void start_xml_output(FILE *xml_output,
    char* program_name,
    double e_threshold,
    float score_E_thresh,
    BOOLEAN_T use_Evalues,
    int argc,
    char *argv[]
){
  static const char command_name[] = "gomo";
  char* command = NULL;
  int i = 0;

  const char *archive_date = ARCHIVE_DATE;
  int a = strlen(archive_date);

  // the DTD is plain text, not a format string
  fputs(gomo_dts, xml_output);
  fprintf(xml_output,
    "<gomo xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" version=\"%s\" release=\"%.*s\">\n",
    VERSION,
    a,
    archive_date
  );
  fprintf(xml_output,"<program-name>%s</program-name>\n",program_name);

  // Buffer size: program name + terminating NUL, plus one separating
  // blank and the argument itself for every argument.
  int alloc_size = (int)strlen(command_name) + 1;
  for (i=1;i<argc;++i){
    alloc_size += strlen(argv[i])+1;
  }

  // create command line
  Resize(command, alloc_size, char);
  strcpy(command,command_name);
  int lc = (int)strlen(command_name); // write position in command
  for (i=1;i<argc;++i){
    size_t arg_len = strlen(argv[i]);
    command[lc++] = ' ';
    memcpy(command+lc, argv[i], arg_len);
    lc += (int)arg_len;
  }
  command[lc] = '\0';

  fprintf(xml_output,"<model>\n");
  fprintf(xml_output,"<command_line>%s</command_line>\n",command);
  fprintf(xml_output,"<significance_threshold>%g</significance_threshold>\n",e_threshold);
  fprintf(xml_output,"<score_E_threshold>%g</score_E_threshold>\n",score_E_thresh);
  fprintf(xml_output,"<use_Evalues>%s</use_Evalues>\n",use_Evalues?"1":"0");
  fprintf(xml_output,"</model>\n");

  myfree(command);
}

/*************************************************************************
 * write_combined_organism_output()
 *
 * Write the output for the combined p-values of several organisms.
 *
 * For every motif in motifs that is found in motif_ht, each GO term of
 * all_go_terms that has an entry in the motif's GO-term table is given
 * a combined score: the num_organisms-th root of the product of its
 * per-organism values (i.e. their geometric mean). The results are
 * sorted with gomo_result_cmp and every result whose score multiplied
 * by the number of GO terms does not exceed e_threshold is written
 * (all results are written when e_threshold <= 0).
 *
 * Result lines are always written to text_output; unless text_only is
 * set, the same results are also written as a <species id="0"> element
 * to xml_output.
 *
 * The value written in the q-value column is the entry of the sorted
 * combined p-value array at the same rank; no q-value estimation is
 * performed here.
 *************************************************************************/
void write_combined_organism_output(
		FILE* text_output,			/* the text file to write the results in */
		FILE* xml_output,			/* the xml file to write the results in */
		HASH_TABLE motif_ht, 		/* the motif hashtable */
		HASH_TABLE godesc_ht, 		/* the go description hashtable */
		STRING_LIST_T* motifs, 		/* the list of motifs */
		STRING_LIST_T* all_go_terms, /* the list of go terms */
		int num_organisms,			/* the number of organisms */
		double e_threshold,			/* the  E-value threshold */
		BOOLEAN_T text_only
){
  int c,num_comb_goterms,i,j,k,dummy;	/* bunch of counter variables */
  HASH_TABLE_ENTRY* he;
  GOMO_RESULT_T* res = NULL;	// combined results of the current motif
  double pvalue;				// double: products of small p-values underflow in float
  ARRAY_T* pv = NULL;			// pvalue array
  double* v = NULL;				// storage behind pv
  char *accession;

  if (text_only) {
	  fprintf(text_output,"# species combined\n");
  } else {
	  fprintf(xml_output,"<species id=\"0\" name=\"combined\">\n");
	  fprintf(xml_output,"<input-files>\n<scoring-file>none</scoring-file>\n");
	  fprintf(xml_output,"<godb-file>none</godb-file>\n</input-files>\n");
  }

  /* The number of go terms is the same for every motif, so the work
   * buffers are allocated once here and released once at the end. */
  num_comb_goterms = get_num_strings(all_go_terms);
  if (num_comb_goterms>0){
	  Resize(res,num_comb_goterms,GOMO_RESULT_T);
	  Resize(v,num_comb_goterms,double);
  }
  Resize(pv,1,ARRAY_T);
  pv->items = v;

  // consider each motif
  for (i=0;i<get_num_strings(motifs);++i){
	  // get motif accession
	  accession = get_nth_string(i,motifs);
	  // lookup motif in hashtable
	  he = hash_lookup_str(accession, motif_ht, &dummy);
	  if (he == NULL) {
		  continue;
	  }

	  if (!text_only){
		  fprintf(xml_output,"<pattern accession=\"%s\" name=\"%s\">\n",accession,accession);
	  }
	  HASH_TABLE goterms = (HASH_TABLE)hash_get_entry_value(he);

	  /* Combine the p-values of each go term. Results are stored
	   * compactly (index c) so that go terms without an entry for this
	   * motif never leave uninitialised slots behind. */
	  pv->num_items = num_comb_goterms;
	  c=0;
	  for (j=0;j<num_comb_goterms;++j){
		  char* go_term = get_nth_string(j,all_go_terms);
		  he = hash_lookup_str(go_term, goterms, &dummy);
		  if (he != NULL) {
			  double *pvalues = (double*)hash_get_entry_value(he);
			  pvalue = 1.0;
			  for (k=0;k<num_organisms;++k){
				  pvalue *= pvalues[k];
			  }
			  // take nth root
			  pvalue = pow(pvalue,1.0/num_organisms);
			  res[c].score = pvalue;
			  res[c].goterm = go_term;
			  res[c].index = j;	// keep the record fully initialised
			  set_array_item(c, pvalue, pv);
			  ++c;
		  }
	  }

	  // only the c entries that were actually filled take part from here on
	  pv->num_items = c;
	  if (c>0){
		  qsort(res,c,sizeof(GOMO_RESULT_T),gomo_result_cmp);
		  sort_array(FALSE,pv);
	  }

	  /* output go-terms*/
	  for (j=0;j<c;++j){
		  char* go_desc = "";
		  he = hash_lookup_str(res[j].goterm,godesc_ht,&dummy);
		  if (he != NULL && hash_get_entry_value(he) != NULL){
			  go_desc = (char*)(hash_get_entry_value(he));
		  }

		  // multiple hypothesis testing: scale by the number of go terms
		  if (e_threshold<=0.0 || res[j].score*num_comb_goterms <= e_threshold) {
			  fprintf(text_output,"%s\t%s\t%1.3e\t%.3e\t%s\n",
					  accession,
					  res[j].goterm,
					  res[j].score*num_comb_goterms,
					  get_array_item(j,pv),
					  go_desc
			  );
			  if (!text_only){
				  fprintf(xml_output,"<goterm id=\"%s\" E-value=\"%1.3e\" qvalue=\"%.3e\" description=\"%s\"/>\n",
					  res[j].goterm,res[j].score*num_comb_goterms,get_array_item(j,pv),go_desc);
			  }
		  }
	  }

	  if (!text_only){
		  fprintf(xml_output,"</pattern>\n");
	  }
  }

  // release the work buffers
  myfree(res);
  myfree(v);
  myfree(pv);

  if (!text_only){
	  fprintf(xml_output,"</species>\n");
	  fflush(xml_output);
  }
  fflush(text_output);
}


/*************************************************************************
 * gomo_open_text_output()
 *
 * Open "<output_dirname>/<filename>" for writing and return the stream.
 * Terminates the program via die() if the file cannot be opened.
 *************************************************************************/
static FILE *gomo_open_text_output(char *output_dirname, char *filename) {
  char *txt_path = make_path_to_file(output_dirname, filename);
  FILE *fp = fopen(txt_path, "w");
  if (fp == NULL) {
	  die("Unable to open output file %s for writing.\n", txt_path);
  }
  myfree(txt_path);
  return fp;
}

/*************************************************************************
 * Entry point for gomo
 *
 * Parses the command line, then for every <scoring file> <go-term file>
 * pair reads the CisML scoring file and the GO mapping and runs
 * run_gomo_on_organism() on each (selected) motif. If more than one
 * pair was given, the combined result over all organisms is written as
 * well.
 *
 * With --text all results go to standard output. Otherwise an output
 * directory is created that receives gomo.xml, one text file per
 * organism, a combined text file (for more than one organism) and,
 * if the E-value threshold is positive, gomo.html.
 *
 * Returns 0 on success; fatal errors terminate the program.
 *************************************************************************/
int main(int argc, char *argv[]) {

  STRING_LIST_T* selected_motifs = NULL;

  char *output_dirname = default_output_dirname;
  const char *TEXT_FILENAME = "gomo_species";
  const char *TEXT_FILESUFFIX = ".txt";
  const char *XML_FILENAME = "gomo.xml";
  const char *HTML_STYLESHEET = "gomo.xsl";
  const char *HTML_FILENAME = "gomo.html";
  char *xml_path = NULL;

  double score_E_thresh = 0.0;
  char* go_term = NULL;
  FILE *text_output = NULL;       	/* destination for text output */
  FILE *xml_output = NULL;       	/* destination for xml output */
  BOOLEAN_T text_only = FALSE;
  BOOLEAN_T status = TRUE;
  BOOLEAN_T use_Evalues = TRUE;		/* indicates that E-values show be used rather than (Ama) scores */
  float e_threshold = SHOW_EVALUE;

  HASH_TABLE motif_ht = NULL;	// A hash_table recording gomo results for each motif
  STRING_LIST_T* motifs = new_string_list(); // all motifs used as keys in motif_ht

  clock_t c0, c1; /* measuring cpu_time */

  /**********************************************
   * COMMAND LINE PROCESSING
   **********************************************/

  const int num_options = 9;
  cmdoption const motif_scan_options[] = {
	{"gs", NO_VALUE},
	{"motif",REQUIRED_VALUE},
	{"nostatus", NO_VALUE},
	{"o", REQUIRED_VALUE},
	{"oc", REQUIRED_VALUE},
	{"score-E-thresh", REQUIRED_VALUE},
	{"text", NO_VALUE},
	{"t", REQUIRED_VALUE},
	{"verbosity", REQUIRED_VALUE}
  };

  int option_index = 0;

  // Define the usage message.
  char usage[] =
    "USAGE: gomo [options] <scoring file> <go-term file> [<scoring file> <go-term file>]*\n"
    "\n"
    "   Options:\n"
	"     --gs\t\t\tFLAG use gene scores rather than p-values for the calculations\n"
	"     --motif <id>\t\t\t(default=all)\n"
	"     --nostatus\t\t\tFLAG prevent output of the progress report to the terminal\n"
	"     --o <output dir>\t\t\tname of the directory for output, will not replacing existing dir\n"
	"     --oc <output dir>\t\t\tname of the directory for output, will replace existing dir\n"
	"     --score-E-thresh\t\t\t<float> E-value threshold above which all scored sequences obtain the same rank\n"
	"     --text\t\t\tFLAG output in text format only\n"
	"     --t <float>\t\t\tE-value threshold considered significant (default = 10, t <= 0.0 shows all results)\n"
	"     --verbosity [1|2|3|4]\t\t\t(default 2)\n"
	"\n";

  // Parse the command line.
  if (simple_setopt(argc, argv, num_options, motif_scan_options) != NO_ERROR) {
    die("Error processing command line options: option name too long.\n");
  }

  while (TRUE) {
    int c = 0;
    char* option_name = NULL;
    char* option_value = NULL;
    const char * message = NULL;

    // Read the next option, and break if we're done.
    c = simple_getopt(&option_name, &option_value, &option_index);
    if (c == 0) {
      break;
    }
    else if (c < 0) {
      (void) simple_getopterror(&message);
      die("Error processing command line options (%s)\n", message);
    }
    if (strcmp(option_name, "verbosity") == 0){
      verbosity = atoi(option_value);
    }
    else if (strcmp(option_name, "gs") == 0){
    	use_Evalues = FALSE;
	}
    else if (strcmp(option_name, "motif") == 0){
      if (selected_motifs == NULL) {
        selected_motifs = new_string_list();
      }
      add_string(option_value, selected_motifs);
    }
    else if (strcmp(option_name, "o") == 0){
    	output_dirname = option_value;
    }
    else if (strcmp(option_name, "t") == 0){
		float x = atof(option_value);
		e_threshold = x;
    }
    else if (strcmp(option_name, "oc") == 0){
    	output_dirname = option_value;
    	clobber = TRUE;
    }
    else if (strcmp(option_name, "score-E-thresh") == 0){
		float x = atof(option_value);
		score_E_thresh = x;
		// check score-E-thresh > 0
		if (score_E_thresh <= 0) die("score-E-thresh (%f) must be larger than 0.\n", score_E_thresh);
	}
    else if (strcmp(option_name, "text") == 0){
    	text_only = TRUE;
	}
    else if (strcmp(option_name, "nostatus") == 0){
    	status = FALSE;
    }
  }

  // Must have scoring file and go term file (always given in pairs)
  if (argc < option_index + 2 || (argc - option_index) % 2 != 0) {
    // the usage text is data, not a format string
    fputs(usage, stderr);
    exit(EXIT_FAILURE);
  }


  // the remaining arguments alternate: scoring file, go-term file
  STRING_LIST_T* scoring_filenames = new_string_list();
  STRING_LIST_T* goterm_filenames = new_string_list();
  while (option_index < argc){
	  add_string(argv[option_index],scoring_filenames);
	  option_index++;
	  add_string(argv[option_index],goterm_filenames);
	  option_index++;
  }

  set_alphabet(verbosity, "ACGT");

  // measure time
  c0 = clock();


  /**********************************************
   * Get an idea about the number of go terms to
   * handle
   **********************************************/
  STRING_LIST_T* all_go_terms = new_string_list();
  get_unique_go_terms(goterm_filenames, all_go_terms);
  // A hash_table mapping of go terms to its description
  HASH_TABLE godesc_ht = hash_create(get_num_strings(all_go_terms)*2);

  /**********************************************
   * prepare output
   **********************************************/
  if (text_only) {
    // Legacy: plain text output to standard out.
    text_output = stdout;
  }
  else {
	  // allow clobbering of the default output directory
	  if (strcmp(output_dirname,default_output_dirname)==0) {
	 	  clobber = TRUE;
	   }
	   if (create_output_directory(output_dirname, clobber, status)) {
	 	  // Failed to create output directory.
	 	  exit(1);
	   }
	   // Create the name of the XML output file
	   // "<dir>/XML_FILENAME" and open it for writing
	   xml_path = make_path_to_file(output_dirname, XML_FILENAME);
	   xml_output = fopen(xml_path, "w");
	   if (xml_output == NULL) {
		   die("Unable to open output file %s for writing.\n", xml_path);
	   }
  }

  /**********************************************
   * Start output
   **********************************************/

  if (text_only){
	  fprintf(text_output,"# GOMO %s\n",VERSION);
	  fprintf(text_output,"# Motif accession number \tGO-ID\tE-value\tq-value\tGO description\n");
  } else {
	  start_xml_output(xml_output, program_name, e_threshold, score_E_thresh, use_Evalues, argc, argv);
  }

  /**********************************************
   * Run gomo on each dataset
   **********************************************/
  int org_index,i,j,dummy;
  CISML_T *cisml;

  // process each species
  int num_organisms = get_num_strings(scoring_filenames);
  for (org_index=0; org_index<num_organisms;++org_index){
	  // prepare text output for individual species
	  if (!text_only) {
		  // large enough for any int, so the index can never overflow it
		  char org_label[32];
		  snprintf(org_label, sizeof(org_label), "%d", org_index);
		  // keep the intermediate string so that it can be released
		  char *stem = concat_string(TEXT_FILENAME,org_label);
		  char *fname = concat_string(stem,TEXT_FILESUFFIX);
		  text_output = gomo_open_text_output(output_dirname, fname);
		  myfree(fname);
		  myfree(stem);
		  fprintf(text_output,"# GOMO %s\n",VERSION);
		  fprintf(text_output,"# Motif accession number \tGO-ID\tE-value\tq-value\tGO description\n");
	  }

	  // a key list for the hash containing all the go-terms for one species
	  STRING_LIST_T* organism_go_terms = new_string_list();

	  /**********************************************
	   * Read the scored sequence list.
	   **********************************************/

	  if (verbosity >= HIGH_VERBOSE){
		  fprintf(stderr, "Reading in %s \n", get_nth_string(org_index, scoring_filenames));
	  }
	  cisml = read_cisml(get_nth_string(org_index,scoring_filenames));

	  // must be checked before cisml is used for the first time
	  if (cisml == NULL) {
		// Wasn't cisml XML
		die("The scoring file needs to be in CisML format: %s",
				get_nth_string(org_index,scoring_filenames));
	  }

	  // if all sequences contain p-values
	  // convert all p-values to E-value and set all E-value above the
	  // score_E_threshold to the maximum (equal their ranks)
	  double evalue = 0.0;
	  PATTERN_T ** patterns = get_cisml_patterns(cisml);
	  for (i=0; i<get_cisml_num_patterns(cisml);++i){
		  SCANNED_SEQUENCE_T ** sequences = get_pattern_scanned_sequences(patterns[i]);
		  int num_sequences = get_pattern_num_scanned_sequences(patterns[i]);
		  for (j=0;j<num_sequences;++j){
			  if (use_Evalues){
				  // check if p-value is available otherwise use scores only
				  if (has_scanned_sequence_pvalue(sequences[j]) == FALSE){
					  die("The sequence score file does not provide a "
						  "p-value for sequence %s (pattern %s). "
						  "Use --gs to prompt GOMO working on "
						  "gene scores rather than gene-score p-values.\n",
						  get_scanned_sequence_name(sequences[j]),
						  get_pattern_accession(patterns[i]));
				  }
				  evalue = get_scanned_sequence_pvalue(sequences[j])*num_sequences;
				  if (score_E_thresh > 0 && evalue > score_E_thresh) {
					  set_scanned_sequence_pvalue(sequences[j],num_sequences);
				  } else {
					  set_scanned_sequence_pvalue(sequences[j],evalue);
				  }
			  } else { // use gene scores
				  if (has_scanned_sequence_score(sequences[j]) == FALSE){
					  die("The sequence score file does not provide a "
							  "score for sequence %s (pattern %s). "
							  "Remove --gs to prompt GOMO working on "
							  "gene-score p-values rather than gene scores.\n",
							  get_scanned_sequence_name(sequences[j]),
							  get_pattern_accession(patterns[i]));
				  }
			  }
		  }
	  }

	  if (verbosity >= NORMAL_VERBOSE){
		  fprintf(stderr, "Sequence file from %s \n", get_cisml_program_name(cisml));
	  }

	  // estimate number of motifs and create hashtable
	  if (num_organisms >= 1 && org_index==0){
		  motif_ht = hash_create(get_cisml_num_patterns(cisml)*2);
	  }

	  /**********************************************
	   * Read the go term mapping.
	   **********************************************/
	  // A hash_table recording go terms
	  HASH_TABLE go_ht = hash_create(get_num_strings(all_go_terms)*2);
	  // A hash_table recording all sequence ids
	  HASH_TABLE sequence_list = hash_create(get_num_strings(all_go_terms)*2);
	  read_go_file(get_nth_string(org_index,goterm_filenames), organism_go_terms,
			  &go_ht, &godesc_ht, &sequence_list);

	  /**********************************************
	   * Perform gomo calculations
	   **********************************************/

	  if (text_only) {
		  fprintf(text_output,"# species %d\n",org_index);
	  } else {
		  fprintf(xml_output,"<species id=\"%d\" name=\"%d\">\n",org_index+1, org_index+1);
		  fprintf(xml_output,"<input-files>\n<scoring-file>%s</scoring-file>\n",
				  get_nth_string(org_index,scoring_filenames));
		  fprintf(xml_output,"<godb-file>%s</godb-file>\n</input-files>\n",
				  get_nth_string(org_index,goterm_filenames));
	  }

	  // process each motif
	  for (i=0; i<get_cisml_num_patterns(cisml);++i){

		  char* accession = get_pattern_accession(get_cisml_patterns(cisml)[i]);
		  // check if only a particular motif should be scanned
		  if (selected_motifs != NULL &&  !have_string(accession,selected_motifs))
			  continue;

		  run_gomo_on_organism(
	  		cisml,
	  		get_cisml_patterns(cisml)[i],
	  		go_ht,
	  		godesc_ht,
	  		sequence_list,
	  		organism_go_terms,
	  		all_go_terms,
	  		e_threshold,
	  		org_index,
	  		num_organisms,
	  		motif_ht,
	  		status,
	  		text_only,
	  		xml_output,
	  		text_output,
	  		use_Evalues,
	  		motifs
		  );
	  }

	  if (status){
		  fprintf(stderr, "Process organism %d: 100 %%\n",org_index+1);
		  fprintf(stderr, "Writing output\n");
	  }

	  if (!text_only){
		  fprintf(xml_output,"</species>\n");
	  }

	  // clean: the values stored in go_ht are released before the table itself
	  for (j=0;j<get_num_strings(organism_go_terms);++j){
		  go_term = get_nth_string(j,organism_go_terms);
		  HASH_TABLE_ENTRY *he = hash_lookup_str(go_term,go_ht,&dummy);
		  if (he != NULL && hash_get_entry_value(he) != NULL){
			  char* v = (char*)(hash_get_entry_value(he));
			  myfree(v);
		  }
	  }
	  hash_destroy(go_ht);
	  free_cisml(cisml);
	  free_string_list(organism_go_terms);
	  hash_destroy(sequence_list);

	  // the per-species text file is complete
	  if (!text_only) {
		  fflush(text_output);
		  fclose(text_output);
	  }
  }

  // write combined organism results
  if (num_organisms > 1){
	  // prepare text output for the combined results
	  if (!text_only) {
		  // keep the intermediate string so that it can be released
		  char *stem = concat_string(TEXT_FILENAME,"_combined");
		  char *fname = concat_string(stem,TEXT_FILESUFFIX);
		  text_output = gomo_open_text_output(output_dirname, fname);
		  myfree(fname);
		  myfree(stem);
	  }
	  fprintf(text_output,"# GOMO %s\n",VERSION);
	  fprintf(text_output,"# Motif accession number \tGO-ID\tE-value\tq-value\tGO description\n");
	  write_combined_organism_output(text_output, xml_output, motif_ht, godesc_ht,
			  motifs, all_go_terms, num_organisms, e_threshold,text_only);
	  if (!text_only) {
		  fflush(text_output);
		  fclose(text_output);
	  }
  }

  // finish output
  if (!text_only){
	  fprintf(xml_output,"</gomo>\n");
	  fclose(xml_output);

	  /* html files become to big for non-constrainted E values*/
	  if (e_threshold>0.0){
		  char *stylesheet_path = make_path_to_file(ETC_DIR, HTML_STYLESHEET);
		  char *html_path = make_path_to_file(output_dirname, HTML_FILENAME);
		  print_xml_filename_to_filename_using_stylesheet(
			    xml_path,        	/* path to XML input file IN */
			    stylesheet_path,	/* path to MEME XSL stylesheet IN */
			    html_path   		/* path to HTML output file IN */
		  );

		  myfree(html_path);
		  myfree(stylesheet_path);
	  } else {
		  fprintf(stderr,"Threshold value does not allow HTML output due to size and memory constraint.\n");
	  }

	  myfree(xml_path);
  }



  // measure time
  c1 = clock();
  if (verbosity >= NORMAL_VERBOSE) { // starting time
 	  fprintf(stderr,"cycles (CPU);            %ld cycles\n", (long) c1);
 	  fprintf(stderr,"elapsed CPU time:        %f seconds\n", (float) (c1 - c0)/CLOCKS_PER_SEC);
  }

  /**************************************
   * Clean up.
   **************************************/
  // free hashtables

  // release the go descriptions before destroying their table
  for (j=0;j<get_num_strings(all_go_terms);++j){
	  char* go_term = get_nth_string(j,all_go_terms);
	  HASH_TABLE_ENTRY *he = hash_lookup_str(go_term,godesc_ht,&dummy);
	  if (he != NULL && hash_get_entry_value(he) != NULL){
		  char* v = (char*)(hash_get_entry_value(he));
		  myfree(v);
	  }
  }
  hash_destroy(godesc_ht);

  //FIXME: free hash appropriately
  hash_destroy(motif_ht);

  free_string_list(all_go_terms);
  free_string_list(motifs);
  free_string_list(scoring_filenames);
  free_string_list(goterm_filenames);
  if (selected_motifs != NULL) {
	  free_string_list(selected_motifs);
  }
  return(0);
}
