/**********************************************************************
 * FILE: ama_scan.c
 * AUTHOR: Fabian Buske / Robert McLeay for refactoring / T L Bailey
 * PROJECT: MEME
 * COPYRIGHT: 2007-2008, UQ
 * VERSION: $Revision: 1.0$
 * DESCRIPTION: Routines to perform average motif affinity scans
 *
 **********************************************************************/

#include "ama_scan.h"

// Square root of two, recomputed through sqrt() at every expansion.
// NOTE(review): not referenced by the code visible in this file; presumably
// kept for code elsewhere -- confirm before removing.
#define sqrt2 sqrt(2.0)

/*************************************************************************
 * Calculate the odds score for each motif-sized window at each
 * site in the sequence using the given nucleotide frequencies.
 *
 * This function is a lightweight version based on the one contained in
 * motiph-scoring. Several calculations that are unnecessary for gomo
 * have been removed in order to speed up the process.
 * Scores sequence with up to two motifs: the positive PSSM and, if the
 * pair carries one, the negative PSSM.
 *
 * The odds of a window is the product of the PSSM cells selected by the
 * window's characters. A window is not scored (and not counted) when it
 * contains a gap character ('-' or '.') or a character whose alphabet
 * index lies outside [0, alph_size).
 *
 * Returns:
 *   AVG_ODDS - sum of odds divided by the number of scored windows
 *   MAX_ODDS - largest odds over all scored windows
 *   other    - 0.0
 *
 * If no window could be scored at all (including the case where the
 * sequence is shorter than the motif), a message is printed to stderr,
 * *isFeasible is set to FALSE and 0.0 is returned. *isFeasible is never
 * set to TRUE here, so the caller has to initialise it.
 *************************************************************************/
double score_sequence(
  SEQ_T*        seq,		// sequence to scan (IN)
  PSSM_PAIR_T*  pssm_pair,	// pos and neg pssms (IN)
  int method, 			// method used for scoring (IN)
  BOOLEAN_T* isFeasible	// FLAG indicating whether there is at least one
			// position the motif could be matched against (OUT)
)
{
  assert(pssm_pair != NULL);
  assert(seq != NULL);

  PSSM_T* pos_pssm = pssm_pair->pos_pssm;
  assert(pos_pssm != NULL);
  PSSM_T* neg_pssm = pssm_pair->neg_pssm;
  int n_motifs = neg_pssm ? 2 : 1;

  char* raw_seq = get_raw_sequence(seq);
  int seq_length = get_seq_length(seq);
  int w = get_num_rows(pos_pssm->matrix);

  // Number of motif-sized windows in the sequence. A sequence shorter than
  // the motif would yield a negative count; clamp it so that the
  // "nothing scored" check below fires instead of dividing by a negative
  // number and reporting the sequence as feasible.
  int n = seq_length - w + 1;
  if (n < 0) {
    n = 0;
  }

  int N_scored = n_motifs * n;		// number of sites scored
  char* alphabet = get_alphabet(FALSE);
  int alph_size = get_alph_size(ALPH_SIZE);

  double max_odds = 0.0;
  double sum_odds = 0.0;
  double requested_odds = 0.0;

  // For each motif (positive and reverse complement)
  int i;
  for (i = 0; i < n_motifs; i++) {
    PSSM_T* pssm = (i == 0 ? pos_pssm : neg_pssm);	// choose +/- motif

    // For each site in the sequence
    int seq_index;
    for (seq_index = 0; seq_index < n; seq_index++) {
      double odds = 1.0;

      // For each position in the motif window
      int motif_position;
      for (motif_position = 0; motif_position < w; motif_position++) {
        char c = raw_seq[seq_index + motif_position];

        // Gaps make the whole window unscorable.
        if (c == '-' || c == '.') {
          N_scored--;
          odds = 0;
          break;
        }

        // So do ambiguity codes / characters outside the core alphabet.
        int alph_index = alphabet_index(c, alphabet);
        if (alph_index >= alph_size || alph_index < 0) {
          N_scored--;
          odds = 0;
          break;
        }

        odds *= get_matrix_cell(motif_position, alph_index, pssm->matrix);
      } // position

      // Unscorable windows contribute odds == 0 and thus change neither
      // the sum nor the maximum.
      sum_odds += odds;
      if (odds > max_odds) {
        max_odds = odds;
      }
    } // site
  } // motif

  // has there been anything matched at all?
  if (N_scored <= 0) {
    fprintf(stderr,"Sequence \'%s\' offers no location to match the motif against (sequence length too short?)\n",get_seq_name(seq));
    if (isFeasible != NULL) {
      *isFeasible = FALSE;
    }
    return 0.0;
  }

  // return odds as requested (MAX or AVG scoring)
  if (method == AVG_ODDS) {
    requested_odds = sum_odds / N_scored;	// mean
  } else if (method == MAX_ODDS) {
    requested_odds = max_odds;			// maximum
  }

  return(requested_odds);
} // score_sequence

/**********************************************************************
  ama_sequence_scan()

  Scan one sequence with the given motif (PSSM pair) using either
  average (AVG_ODDS) or maximum (MAX_ODDS) odds scoring and store the
  outcome in scanned_seq.

  The motif has to be converted to odds in advance (in order
  to speed up the scanning).

  Without z-scoring (zscoring == 0) the raw odds are stored as score
  and, for AVG_ODDS, a p-value is stored as well. Sequences that could
  not be scored, or that produced a negative odds score, get p-value 1.0.

  With z-scoring (zscoring > 0) the sequence is shuffled zscoring times
  using seeds 0 .. zscoring-1, every shuffle is scored with the same
  method, and the stored score is replaced by the z-score of the real
  odds against the shuffled scores.
 **********************************************************************/
void ama_sequence_scan(
  SEQ_T* sequence,		// the sequence to scan (IN)
  PSSM_PAIR_T* pssm_pair,	// the pos/neg pssms (IN)
  int scoring,			// AVG_ODDS or MAX_ODDS (IN)
  int zscoring,			// the number of shuffled sequences
				// used for z-score computation (IN)
  SCANNED_SEQUENCE_T* scanned_seq // the scanned sequence results (OUT)
)
{
  assert(sequence != NULL);
  assert(pssm_pair != NULL);
  assert(zscoring >= 0);

  // score_sequence() only ever clears this flag, so start optimistic.
  BOOLEAN_T feasible = TRUE;

  // Score the real sequence and record the raw odds.
  double real_odds = score_sequence(sequence, pssm_pair, scoring, &feasible);
  set_scanned_sequence_score(scanned_seq, real_odds);

  // Decide on the p-value.
  if (feasible && !(real_odds < 0.0)) {
    // A p-value is only computed for plain AVG_ODDS scoring.
    if (scoring == AVG_ODDS && zscoring == 0) {
      double pv = get_ama_pv(real_odds, get_seq_length(sequence), pssm_pair);
      set_scanned_sequence_pvalue(scanned_seq, pv);
    }
  } else {
    if (!feasible) {
      fprintf(stderr,"Sequence '%s' not suited for motif. P-value set to 1.0!\n",get_scanned_sequence_accession(scanned_seq));
    } else {
      fprintf(stderr,"Sequence '%s' got invalid (negative) odds score. P-value set to 1.0!\n",get_scanned_sequence_accession(scanned_seq));
    }
    set_scanned_sequence_pvalue(scanned_seq, 1.0);
  }

  // Nothing more to do unless z-scores were requested.
  if (zscoring <= 0) {
    return;
  }

  // Temporary pattern that collects the records of the shuffled sequences.
  PATTERN_T *tmp_pattern = allocate_pattern("shuffled", "shuffled");

  // One background score per shuffled copy.
  double* background = mm_malloc(sizeof(double)*zscoring);

  const int base_seed = 0;
  int round = 0;
  while (round < zscoring) {
    // Produce the next shuffled copy of the input sequence.
    SEQ_T* shuffled = NULL;
    shuffle_sequence(sequence, base_seed + round, &shuffled);

    char* label = "shuffled sequences";

    // Register a scanned-sequence record for the copy with the pattern.
    SCANNED_SEQUENCE_T *record =
      allocate_scanned_sequence(label, label, tmp_pattern);
    set_scanned_sequence_length(record, get_seq_length(shuffled));

    // Score the shuffled copy; its feasibility flag is not looked at.
    BOOLEAN_T shuffled_feasible = TRUE;
    double shuffled_odds =
      score_sequence(shuffled, pssm_pair, scoring, &shuffled_feasible);

    free_seq(shuffled);
    background[round] = shuffled_odds;

    ++round;
  } // sequence shuffling

  // Replace the stored raw score by the z-score of the real odds.
  ZSCORE_T *zs = allocate_zscore(background, zscoring);
  double z_value = get_zscore_z(zs, real_odds);
  set_scanned_sequence_score(scanned_seq, z_value);

  /* release everything that was only needed for the shuffling */
  free_pattern(tmp_pattern);
  myfree(background);
  destroy_zscore(zs);
} // ama_sequence_scan
