#include <errno.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include "easy.h"

/* Program name used as the prefix of this file's usage/error messages. */
#define MATMUL_HOST "matmul_host"
/* Passed as the `nodeprog` argument of the first matmul() call
 * (presumably the name of the node-side executable -- confirm against
 * the matmul() implementation). */
#define MATMUL_NODE "matmul_node"

/* Defined elsewhere.  Called from main() with a text label, the result
 * matrix and the global matrix dimension; presumably prints the matrix. */
extern void print_mat(char *text, double *C, int lsize);
/* Defined elsewhere.  Called from main() to obtain C from A and B (the
 * timing labels in main() describe it as "C = A * B").  msize is the
 * global matrix dimension and nproc the requested processor count; the
 * exact meaning of nodeprog/machine is not visible from this file. */
extern void matmul(double *C, double *A, double *B,
		   int msize,int nproc, char *nodeprog,
		   char *machine);

/*
 * Print a timing checkpoint and return the time elapsed since the very
 * first call of this function.
 *
 *   label  - text printed in front of the timings; when NULL nothing is
 *            printed, but the "previous call" timestamp is still advanced.
 *   mflops - despite the name this is divided by 1.0E6 before printing, so
 *            it is treated as a raw operation count for the interval since
 *            the previous call.  Pass 0 (or any value <= 0) to suppress the
 *            rate.
 *
 * Timestamps come from dclock() (declared outside this file; the output
 * format labels the values with an "s" suffix).  State is kept in
 * function-local statics.
 */
static double timing(char *label, double mflops)
{
  static double start = -1;   /* timestamp latched on the first call */
  static double last = -1;    /* timestamp of the previous call      */
  double now, since_start;

  /* First call: latch the reference point before sampling "now". */
  if (start == -1) {
    start = dclock();
    last = start;
  }

  now = dclock();
  since_start = now - start;

  if (label != NULL) {
    const double since_last = now - last;
    const int show_rate = (since_last > 0 && mflops > 0);

    if (show_rate)
      printf("%s:  %.2fs (%.2fs -- %.2f MFlop/s)\n",
             label, since_start, since_last,
             mflops/since_last/1.0E6);
    else
      printf("%s:  %.2fs (%.2fs)\n", label, since_start, since_last);

    fflush(stdout);
  }

  last = now;
  return since_start;
}


/*
 * Parse `s` as a base-10 int.  Returns 0 and stores the value in *out on
 * success; returns -1 (leaving *out untouched) when `s` is empty, has
 * trailing garbage, or does not fit in an int.
 */
static int parse_int_arg(const char *s, int *out)
{
  char *end;
  long v;

  errno = 0;
  v = strtol(s, &end, 10);
  if (end == s || *end != '\0' || errno == ERANGE ||
      v < INT_MIN || v > INT_MAX)
    return -1;

  *out = (int)v;
  return 0;
}

/*
 * Host driver: matmul_host matrix_size number_of_procs
 *
 * Validates the two command-line arguments, allocates three
 * msize x msize matrices, fills A and B with a deterministic pattern,
 * calls matmul() once (and a second time for small problem sizes),
 * prints the result via print_mat() and reports timings after each phase.
 *
 * number_of_procs must be 0 or a perfect square, and matrix_size must be
 * divisible by sqrt(number_of_procs).  Exits with status 1 on any
 * argument or allocation error, 0 on success.
 */
int main(int argc, char *argv[])
{
  double *A=NULL, *B=NULL, *C=NULL;
  int i,j;
  long long k;          /* running element counter; can exceed INT_MAX */
  int msize, lsize;
  int nproc, sqrt_nproc;
  size_t nelem;
  double mflops;

  timing("start time",0);

  if (argc < 3) {
    fprintf(stderr,"Usage: %s matrix_size number_of_procs\n",
            MATMUL_HOST);
    exit(1);
  }

  if (parse_int_arg(argv[1], &msize) != 0) {
    fprintf(stderr,"%s: Matrix size '%s' is not a valid integer.\n",
            MATMUL_HOST,argv[1]);
    exit(1);
  }
  if (msize < 1) {
    fprintf(stderr,"%s: Matrix size (=%d) must be > 0.\n",
            MATMUL_HOST,msize);
    exit(1);
  }

  if (parse_int_arg(argv[2], &nproc) != 0) {
    fprintf(stderr,"%s: Number of procs '%s' is not a valid integer.\n",
            MATMUL_HOST,argv[2]);
    exit(1);
  }

  /* Reject negatives BEFORE calling sqrt(): sqrt of a negative value is
   * NaN and converting NaN to int is undefined. */
  sqrt_nproc = (nproc < 0) ? 0 : (int)sqrt((double)nproc);

  if (nproc < 0 || nproc != sqrt_nproc * sqrt_nproc) {
    fprintf(stderr,"%s: Number of procs (=%d) must be {0,1,4,9,16,25,..,k^2}.\n",
            MATMUL_HOST,nproc);
    exit(1);
  }

  if ( sqrt_nproc > 0 ) {
    if (msize % sqrt_nproc) {
      fprintf(stderr,
              "%s: Matrix size (=%d) not divisible by SQRT of # of procs (=%d)\n",
              MATMUL_HOST,msize,sqrt_nproc);
      exit(1);
    }
    lsize = msize / sqrt_nproc;
  }
  else {
    /* nproc == 0: the local size equals the global size. */
    lsize = msize;
  }

  printf(">>Global matrix size = %d\n",msize);
  printf(">>   Number of procs = %d\n",nproc);
  printf(">> Local matrix size = %d\n",lsize);

  /* Compute the byte count in size_t and make sure msize*msize*sizeof(double)
   * cannot wrap around before handing it to malloc(). */
  if ((size_t)msize > ((size_t)-1 / sizeof(*A)) / (size_t)msize) {
    fprintf(stderr,"%s: Matrix size (=%d) is too large to allocate.\n",
            MATMUL_HOST,msize);
    exit(1);
  }
  nelem = (size_t)msize * (size_t)msize;

  A = malloc(nelem * sizeof(*A));
  B = malloc(nelem * sizeof(*B));
  C = malloc(nelem * sizeof(*C));

  if (!A || !B || !C) {
    perror("malloc() of {A,B,C}");
    exit(1);
  }

  timing("mallocs done",0);

  /* Column-major (Fortran-order) storage with 1-based (i,j) indices:
   * element (i,j) lives at offset (i-1) + (j-1)*msize.  The offset is
   * computed in size_t so it cannot overflow int for large msize. */

#define a(i,j) A[(size_t)((i) - 1) + (size_t)((j) - 1)*(size_t)msize]
#define b(i,j) B[(size_t)((i) - 1) + (size_t)((j) - 1)*(size_t)msize]

  /* A(i,j) = (i+j) mod msize + k and B(i,j) = -A(i,j), where k counts the
   * elements in storage order.  The value is formed as an integer and
   * then converted, so a zero entry of B is +0.0 as before. */
  k = 0;
  for (j=1; j<=msize; j++)
    for (i=1; i<=msize; i++) {
      long long v = ((long long)i + j) % msize + k;

      a(i,j) = (double)v;
      b(i,j) = (double)(-v);
      k++;
    }

  timing("A & B set. Start pipe_multiply_roll",0);

  matmul(C,A,B,msize,nproc,MATMUL_NODE,"$MATMUL_NODE");

  /* Operation count for a dense msize^3 multiply (2 flops per inner step);
   * timing() divides by 1.0E6 and the elapsed time to print MFlop/s. */
  mflops = (double)2*msize*msize*msize;
  timing("matmul(1st): C = A * B",mflops);

  print_mat(">> C",C,msize);

  /* Repeat the multiply for small problems only.  The second call passes
   * placeholder program/machine names -- presumably the node program
   * started by the first call is reused; confirm in matmul(). */
  if ( (nproc > 0 && msize < 500) || (nproc > 1 && msize < 1000)) {
    timing("Start pipe_multiply_roll(2nd)",0);

    matmul(C,A,B,msize,nproc,"noprogram","nomachine");

    timing("matmul(2nd): C = A * B",mflops);

    print_mat(">> C",C,msize);
  }

  free(A);
  free(B);
  free(C);

  timing("end time",0);

  return 0;
}
