#if defined(rs6000)
#define fdaxpy2_ fdaxpy2
#endif
/*
   daxpy - Reference (non-unrolled) kernel: y[i] += alpha * x[i], i = 0..n-1.

   Parameters:
   x     - source vector; only read
   y     - destination vector; updated in place
   n     - number of elements to process (nothing is done when n <= 0)
   alpha - scale factor applied to every element of x

   The routine is defined with a prototype and an explicit void return type;
   the old-style definition relied on implicit int, which is not valid in
   C99 and later.  No value was ever returned, so callers are unaffected.
 */
void daxpy( const double *x, double *y, int n, double alpha )
{
int i;

for (i=0; i<n; i++) y[i] += alpha * x[i];
}
/*
   daxpy1 - y[i] += alpha * x[i], i = 0..n-1, with the loop unrolled by two.

   Parameters:
   x     - source vector; only read
   y     - destination vector; updated in place
   n     - number of elements to process (nothing is done when n <= 0)
   alpha - scale factor applied to every element of x

   Within each unrolled step both products are formed before either element
   of y is loaded, and both loads precede both stores.
 */
void daxpy1( const double *x, double *y, int n, double alpha )
{
int    i;
double s1, s2;

/* "i+1 < n" is the same test as "i < n-1" but cannot overflow for any n */
for (i=0; i+1<n; i+=2) {
   s1     = alpha * x[i];
   s2     = alpha * x[i+1];
   s1     += y[i];
   s2     += y[i+1];
   y[i]   = s1;
   y[i+1] = s2;
   }
/* At most one element remains (odd n).  Testing i < n rather than the low
   bit of n keeps a negative odd n from touching y[0]. */
if (i < n) {
    y[i] += alpha * x[i];
    }
}
/*
   daxpy2 - y[i] += alpha * x[i], i = 0..n-1, with the loop unrolled by four.

   Parameters:
   x     - source vector; only read
   y     - destination vector; updated in place
   n     - number of elements to process (nothing is done when n <= 0)
   alpha - scale factor applied to every element of x

   Within each unrolled step the four products are formed first, then the
   four elements of y are added in, then the four results are stored.
 */
void daxpy2( const double *x, double *y, int n, double alpha )
{
int    i;
double s1, s2, s3, s4;

/* "i+3 < n" is the same test as "i < n-3" but cannot overflow for any n */
for (i=0; i+3<n; i+=4) {
   s1     = alpha * x[i];
   s2     = alpha * x[i+1];
   s3     = alpha * x[i+2];
   s4     = alpha * x[i+3];
   s1     += y[i];
   s2     += y[i+1];
   s3     += y[i+2];
   s4     += y[i+3];
   y[i]   = s1;
   y[i+1] = s2;
   y[i+2] = s3;
   y[i+3] = s4;
   }
/* Clean up the 0..3 leftover elements.  Bounding by i < n (instead of
   counting n & 0x3 elements) makes a negative n a no-op. */
for (; i<n; i++) {
    y[i] += alpha * x[i];
    }
}
/*
   daxpy4 - y[i] += alpha * x[i], i = 0..n-1, staged through a local buffer.

   Parameters:
   x     - source vector; only read
   y     - destination vector; updated in place
   n     - number of elements to process (nothing is done when n <= 0)
   alpha - scale factor applied to every element of x

   x and y are first copied into the two halves of a function-static buffer,
   the four-way unrolled update is run on the copies, and the result is
   copied back to y.  Vectors longer than one half of the buffer are handled
   in successive chunks so the buffer is never overrun.

   Because the buffer is static, this routine is not reentrant.

   Author's note: this variant doesn't work (as an optimization) because the
   compiler won't generate pipelined loads.
 */
void daxpy4( const double *x, double *y, int n, double alpha )
{
enum { CHUNK = 2048 };              /* elements staged per operand */
static double cache[2 * CHUNK];
double *c1 = cache, *c2 = cache + CHUNK;
double s1, s2, s3, s4;
int    i, len;

while (n > 0) {
    len = (n < CHUNK) ? n : CHUNK;

    /* Stage this chunk of x and y */
    for (i=0; i<len; i++) c1[i] = x[i];
    for (i=0; i<len; i++) c2[i] = y[i];

    for (i=0; i+3<len; i+=4) {
       s1     = c1[i];
       s2     = c1[i+1];
       s3     = c1[i+2];
       s4     = c1[i+3];
       s1     *= alpha;
       s2     *= alpha;
       s3     *= alpha;
       s4     *= alpha;
       s1     += c2[i];
       s2     += c2[i+1];
       s3     += c2[i+2];
       s4     += c2[i+3];
       c2[i]   = s1;
       c2[i+1] = s2;
       c2[i+2] = s3;
       c2[i+3] = s4;
       }
    /* 0..3 leftover elements of this chunk */
    for (; i<len; i++) {
        c2[i] += alpha * c1[i];
        }

    /* Write the updated chunk back and advance to the next one */
    for (i=0; i<len; i++) y[i] = c2[i];
    x += len;
    y += len;
    n -= len;
    }
}

#include <stdio.h>
#include "tools.h"
extern double SYGetCPUTime();
/* RunTests is defined later in this file; declare it before it is called */
int RunTests( double *, double *, int, int, int );

/*
   main - Benchmark driver.

   Runs the timing tests once on a pair of freshly allocated vectors of
   n = 200000 doubles, then four more times on a second pair in which ys is
   shifted by 0..3 elements relative to xs (the second pair is allocated
   with 4 extra elements to leave room for the shift).

   Each timed call works on k = 100 elements and there are m = 2000 such
   calls per pass, so one pass covers all n elements.

   Returns 0 on success and 1 if a vector could not be allocated.
 */
int main( void )
{
double *xs, *ys;
int    n, m, k, i;

fprintf( stdout, "\n" );
n = 200000;
m = n / 100;
k = n / m;

xs = (double *)MALLOC( n * sizeof(double) );
ys = (double *)MALLOC( n * sizeof(double) );
/* NOTE(review): assumes MALLOC (from tools.h) yields a null pointer on
   failure - confirm.  Nothing is released on this path because the
   process is about to exit. */
if (!xs || !ys) {
    fprintf( stderr, "Unable to allocate test vectors\n" );
    return 1;
    }
RunTests( xs, ys, n, m, k );
FREE( xs );
FREE( ys );

xs = (double *)MALLOC( (n + 4) * sizeof(double) );
ys = (double *)MALLOC( (n + 4) * sizeof(double) );
if (!xs || !ys) {
    fprintf( stderr, "Unable to allocate test vectors\n" );
    return 1;
    }

for (i=0; i<4; i++) {
    printf( "offset by %d\n", i );
    RunTests( xs, ys+i, n, m, k );
    }
FREE( xs );
FREE( ys );

return 0;
}

/* Print the elapsed time and the resulting rate for one kernel.  The rate
   counts 2 floating-point operations per element, n elements per pass and
   reps passes, scaled to millions per second. */
static void RunTestsReport( const char *label, double elapsed, int n, int reps )
{
fprintf( stdout, "%s time = %e, = %e Mf\n", label, elapsed,
	 1.0e-6*n*2*reps/elapsed );
}

/*
   RunTests - Time each daxpy variant on the same data and print the results.

   Parameters:
   xs, ys - vectors of at least n doubles; both are overwritten with zeros
            before timing starts
   n      - total vector length, used for initialization and for the rate
   m      - number of kernel calls per pass
   k      - number of elements handled by each kernel call

   For every kernel (daxpy4, daxpy2, daxpy1, daxpy and the external
   fdaxpy2_) the vectors are swept 10 times in m calls of k elements each,
   and the CPU time reported by SYGetCPUTime around the sweeps is printed.

   Note that the kernels are called with (y, x, ...), i.e. the vector that
   starts at xs is the one being updated.

   Returns 0.  The return type stays int so that callers relying on an
   implicit declaration of RunTests remain valid.
 */
int RunTests( double *xs, double *ys, int n, int m, int k )
{
enum { REPS = 10 };                 /* timed passes per kernel */
/* NOTE(review): fdaxpy2_ is not declared anywhere in this file; it is
   assumed to be an external routine with no return value that takes all
   of its arguments by address - confirm against its definition. */
extern void fdaxpy2_( double *, double *, int *, double * );
double *x, *y, alpha, t, t1;
int    i, j;

/* alpha must hold a defined value before it is passed to the kernels;
   since both vectors are zero the particular value does not change the
   numerical result. */
alpha = 1.0;

x = xs;
y = ys;
for (i=0; i<n; i++) {
    x[i] = 0.0;
    y[i] = 0.0;
    }

t = SYGetCPUTime();
for (j=0; j<REPS; j++) {
    y = ys;
    x = xs;
    for (i=0; i<m; i++) {
        daxpy4( y, x, k, alpha );
        y += k;
        x += k;
        }
    }
t1 = SYGetCPUTime();
RunTestsReport( "(daxpy4)", t1-t, n, REPS );

t = SYGetCPUTime();
for (j=0; j<REPS; j++) {
    y = ys;
    x = xs;
    for (i=0; i<m; i++) {
        daxpy2( y, x, k, alpha );
        y += k;
        x += k;
        }
    }
t1 = SYGetCPUTime();
RunTestsReport( "(daxpy2)", t1-t, n, REPS );

t = SYGetCPUTime();
for (j=0; j<REPS; j++) {
    y = ys;
    x = xs;
    for (i=0; i<m; i++) {
        daxpy1( y, x, k, alpha );
        y += k;
        x += k;
        }
    }
t1 = SYGetCPUTime();
RunTestsReport( "(daxpy1)", t1-t, n, REPS );

t = SYGetCPUTime();
for (j=0; j<REPS; j++) {
    y = ys;
    x = xs;
    for (i=0; i<m; i++) {
        daxpy( y, x, k, alpha );
        y += k;
        x += k;
        }
    }
t1 = SYGetCPUTime();
RunTestsReport( "(daxpy )", t1-t, n, REPS );

/* The external routine receives the length and the scale factor by
   address rather than by value. */
t = SYGetCPUTime();
for (j=0; j<REPS; j++) {
    y = ys;
    x = xs;
    for (i=0; i<m; i++) {
        fdaxpy2_( y, x, &k, &alpha );
        y += k;
        x += k;
        }
    }
t1 = SYGetCPUTime();
RunTestsReport( "(fdaxpy2_)", t1-t, n, REPS );

return 0;
}
/* The winner is daxpy2 (4.6 Mf vs 3.7 Mf on i860) */


