#ifndef lint
static char SCCSid[] = "@(#) ./blkcm/compile/sortsend.c 07/23/93";
#endif

/*
   This routine reorganizes sends so that they have a better chance of 
   not bottlenecking.  In general, we could try to solve the general
   scheduling problem to ensure minimum communications cost.  Since
   I'm trying to do this Friday morning 4/17/92, I'll use a simple
   heuristic:  sort the sends for each phase in increasing order of
   destination processor, then rotate until the first send is to the
   processor with the smallest processor number GREATER than the processor
   doing the sending.

   A refinement to this is to also order the RECEIVES:  each processor
   could send to its destination when it is actually going to do the send;
   the receives can be ordered to reflect this (in case of ties, either 
   choosing an arbitrary order or negotiating with the processors to 
   choose a new order).

   To increase the utility of these routines, there are two parts:
   A general re-organization routine, and a scheduling routine that
   implements the above algorithm.
 */
#include "tools.h" 
#include "blkcm/bcp.h"
#include "blkcm/bc.h"

/* 
   Reorder the communication steps of a program, one phase at a time

   A phase is a run of consecutive program lines with the same phase value
   (its length is found with BCSizeOfPhase).  For each phase:

   1. Allocate a block of BC_Sblock records, one per line in the phase
   2. Call collect to fill the block with the lines that are to be 
      reordered; collect also sets the sort key (order) of each record
   3. Sort the collected records by that key (BCSortSblock)
   4. Re-write the phase of the program (BCCopyPhaseWithSend)

   Steps 3 and 4 are skipped for a phase in which collect finds nothing.

   Complications:
   In the presence of buffered sends, there will be multiple sends to 
   a given processor.  The last one MUST come last; the others are 
   filling the buffer to be sent.

   Note:
   This is the first of a collection of scheduling routines.  Some
   schedules may need to run with specialized send/recv/wait routines.

   Input Parameters:
.  Program - the program to re-order (its lines are re-written in place)
.  collect - routine to collect those parts of the program to reorder.
             It is called as collect( sblock, Program, pgm, ninphase )
             and returns the number of records of sblock that it filled in
.  flag    - Indicates what IS reordered (that is, what collect gathers):
$            0x1 sends reordered
$            0x2 recvs reordered
$            The bits may be or'd together (i.e., 0x3 = both sends and
             receives reordered).
 */
void BCReorderProgram( Program, collect, flag )
BCPGM *Program;
int  (*collect)();
int  flag;
{
int        n, nsend, ninphase;
BCentry    *pgm;
BC_Sblock  *sblock;

n   = Program->n;
pgm = Program->pgm;
/* Each phase is reordered independently.  Testing n > 0 (rather than
   n != 0) keeps a bad line count from walking past the program */
while (n > 0) {
    ninphase = BCSizeOfPhase( pgm, n );
    /* fprintf( stderr, "[%d] n in phase %d\n", MYPROCID, ninphase ); */
    /* An empty phase would never advance pgm; stop instead of spinning */
    if (ninphase <= 0) break;
    sblock   = (BC_Sblock *)MALLOC( ninphase * sizeof(BC_Sblock) );
    CHKPTR(sblock);
    /* Collect and set the order of the steps to sort */
    nsend    = collect( sblock, Program, pgm, ninphase );
    
    /* fprintf( stderr, "[%d] found %d sends\n", MYPROCID, nsend ); */
    if (nsend > 0) {
	/* Sort */
	BCSortSblock( sblock, nsend );
	/* Re-write the phase (through a temporary copy) */
	BCCopyPhaseWithSend( pgm, ninphase, sblock, nsend, flag );
	}

    /* Recover space */
    FREE( sblock );
    /* Move on to the first line of the next phase */
    pgm += ninphase;
    n   -= ninphase;
    }
}

/* 
   Return the number of entries in the phase to which pgm belongs; that is,
   the number of consecutive lines, starting at pgm, whose phase value is 
   the same as that of the first line.

   There are n lines in the program from pgm on; no more than n lines are
   examined.  If n is not positive, 0 is returned and pgm is not read.
 */
int BCSizeOfPhase( pgm, n )
BCentry *pgm;
int n;
{
int cnt = 0;
int cphase;

/* With no lines left, pgm may point past the end of the program, so
   do not look at pgm->phase */
if (n <= 0) return 0;

cphase = pgm->phase;
while (cnt < n && pgm->phase == cphase) {
    cnt ++;
    NEXTLINE(pgm);
    }
return cnt;
}

static int BCdifSend( p1, p2 )
BC_Sblock *p1, *p2;
{
if (p1->order != p2->order)
    return p1->order - p2->order;
/* Keep the blocks in the same order otherwise */
return (int)(p1-p2);
}
/*
   Sort an array of BC_Sblock records with qsort, using BCdifSend as the
   comparison routine.

   Input Parameters:
.  sb - records to sort (sorted in place)
.  n  - number of records in sb

   Always returns 0.
 */
int BCSortSblock( sb, n )
BC_Sblock *sb;
int       n;
{
/* Fewer than two records are already in order.  This test also keeps a
   non-positive n from being handed to qsort as a count */
if (n > 1)
    qsort( (char *)sb, n, sizeof(BC_Sblock), (int (*)())BCdifSend );
return 0;
}

/*
   Re-write one phase of a program so that the lines referenced by sblock 
   appear in the order given by sblock.

   The new phase is built in a temporary copy, in this order:
   1. if the 0x2 bit of flag is clear, every line that is neither IS_SRC
      nor IS_LOCAL_SRC, in the original order
   2. the ns lines referenced by sblock, in sblock order
   3. if the 0x1 bit of flag is clear, every line that is neither IS_DEST
      nor IS_LOCAL_DEST, in the original order
   4. every IS_LOCAL_SRC line, in the original order (local sources go 
      last to allow the best overlap of IO with local ops)
   The temporary copy is then copied back over pgm.

   Input Parameters:
.  pgm    - first line of the phase (overwritten)
.  n      - number of lines in the phase
.  sblock - records whose entry field points at the lines to place in
            step 2
.  ns     - number of records in sblock
.  flag   - 0x1 set: step 3 is skipped; 0x2 set: step 1 is skipped

   Returns 0 when the phase has been re-written.

   NOTE(review): the temporary copy has room for exactly n lines, and 
   exactly n lines are copied back, but nothing here checks that steps 1-4 
   produce exactly n lines.  That depends on flag matching what was 
   collected into sblock - confirm for each caller.
 */
int BCCopyPhaseWithSend( pgm, n, sblock, ns, flag )
BCentry    *pgm;
int        n, ns;
BC_Sblock  *sblock;
int        flag;
{
BCentry *npgm;
int     i, j;

npgm = (BCentry *)MALLOC( n * sizeof(BCentry) );   CHKPTR(npgm);
/* j is the next free line in the new phase */
j    = 0;
/* Step 1: lines that are not sources of any kind come first, unless the 
   0x2 bit says that they are being supplied through sblock */
if (!(flag & 0x2)) {
    for (i=0; i<n; i++) {
        if (!IS_SRC(pgm+i) && !IS_LOCAL_SRC(pgm+i))
	    npgm[j++] = pgm[i];
        }
    }
/* Step 2: now, add the re-ordered events.  The entries still point into 
   pgm, which is not changed until the copy back at the end */
for (i=0; i<ns; i++) {
    /* fprintf( stderr, "[%d] %d\n", MYPROCID, sblock[i].entry->processor ); */
    npgm[j++] = *(sblock[i].entry);
    }
/* Step 3: lines that are not destinations of any kind, unless the 0x1 bit
   is set */    
if (!(flag & 0x1)) {
    for (i=0; i<n; i++) {
        if (!IS_DEST(pgm+i) && !IS_LOCAL_DEST(pgm+i))
	    npgm[j++] = pgm[i];
        }
    }

/* Step 4: add the local sources */
for (i=0; i<n; i++) {
    if (IS_LOCAL_SRC(pgm+i)) {
	/* This is a little special, since we need to update the offsets:
	   the line moves from position i to position j, and processor is 
	   changed by the same amount (presumably it holds an offset 
	   relative to the line itself - confirm) */
	npgm[j]           = pgm[i];
	npgm[j].processor += (i-j);
	j++;
	}
    }

/* Copy back */
for (i=0; i<n; i++) {
    pgm[i] = npgm[i];
    }

FREE( npgm );
return 0;
}

/*
   Collect the remote sends of a phase (lines that are IS_SRC but not
   IS_LOCAL_SRC) and order them by rotating the destinations by the rank 
   of this task: the sort key of a send is (destination - my rank), with
   the number of nodes added whenever that difference is negative.

   Input Parameters:
.  sblock   - block to fill in, one record per send found
.  program  - program that the phase belongs to
.  pgm      - first line of the phase
.  ninphase - number of lines in the phase

   Returns the number of sends collected.
 */
int BCCollectSends( sblock, program, pgm, ninphase )
BC_Sblock *sblock;
BCPGM     *program;
BCentry   *pgm;
int       ninphase;
{
int       found = 0, line;
BCentry   *cur;
BC_Sblock *slot;

for (line = 0; line < ninphase; line++) {
    cur = pgm + line;
    /* Skip everything except the remote sends */
    if (!IS_SRC(cur) || IS_LOCAL_SRC(cur))
        continue;

    slot        = sblock + found;
    slot->idx   = line;
    slot->entry = cur;
    /* Distance from this task to the destination, wrapped to be 
       non-negative */
    slot->order = cur->processor - PSMYPROCID(program->ps);
    if (slot->order < 0)
        slot->order += PSNUMNODES(program->ps);
    found++;
    }
return found;
}

/*
    Reorder the sends of a program so that there MAY be a better chance of
    reducing collisions.  The method is very, very simple: the program is 
    handed to BCReorderProgram with BCCollectSends as the collection 
    routine.  It is selected by using the SCHEDULE option to BCCompile.

    Input Parameters:
.   Program - the program to re-order
 */
void BCSortSends( Program )
BCPGM *Program;
{
/* 0x1 - only the sends are reordered */
int what = 0x1;

BCReorderProgram( Program, BCCollectSends, what );
}

/*
   Collect sends and receives and order them so that they are ordered in
   send/receive pairs.

   Every remote send (IS_SRC but not IS_LOCAL_SRC) and remote receive 
   (IS_DEST but not IS_LOCAL_DEST) of the phase is collected.  The order 
   of the operations is then set by looking at the bits of the ranks, 
   starting with the least significant.  An operation is scheduled at the 
   first bit in which the rank of this processor and the rank of its 
   partner differ:

   - if that bit is set in my rank, my sends to such partners come first, 
     followed by my receives from them
   - if that bit is clear in my rank, my receives from such partners come 
     first, followed by my sends to them

   so that the two ends of a pair do the opposite operation at each step.

   This should be the default for systems without nonblocking communication.

   Input Parameters:
.  sblock   - block to fill in, one record per operation found
.  program  - program that the phase belongs to
.  pgm      - first line of the phase
.  ninphase - number of lines in the phase

   Returns the number of operations collected.

   Note:
   An operation whose partner has the same rank as this processor differs
   from it in no bit, so it is never scheduled and its order is left at -1.
 */
int BCCollectSRPair( sblock, program, pgm, ninphase )
BC_Sblock *sblock;
BCPGM     *program;
BCentry   *pgm;
int       ninphase;
{
int      nsend = 0, i;
int      nbr, issend, cnt, myrank;
int      pass, mybit, nbrbit;
/* Unsigned, so that shifting the bit off the top is well defined and 
   simply ends the loop */
unsigned mask;

/* First, collect all of the send/receives */
for (i=0; i<ninphase; i++) {
    /* Collect all of the remote sends and receives */
    if ((IS_SRC(pgm+i) && !IS_LOCAL_SRC(pgm+i)) ||
        (IS_DEST(pgm+i) && !IS_LOCAL_DEST(pgm+i))) {
        sblock[nsend].entry = pgm+i;
        /* -1 marks an operation that has not been scheduled yet */
        sblock[nsend].order = -1;
	sblock[nsend].idx   = i;
	nsend++;
	}
    }
/* Now, order them so that send/receives are paired */
cnt    = 0;
myrank = PSMYPROCID(program->ps);
for (mask = 0x1; mask && cnt < nsend; mask <<= 1) {
    mybit = (myrank & mask) ? 1 : 0;
    /* pass 0: bit set in my rank   - my sends;    bit clear - my receives
       pass 1: bit set in my rank   - my receives; bit clear - my sends */
    for (pass=0; pass<2; pass++) {
        for (i=0; i<nsend; i++) {
            if (sblock[i].order != -1) continue;
            nbr    = sblock[i].entry->processor;
            /* Need to convert nbr to rank in group */
            nbrbit = (nbr & mask) ? 1 : 0;
            /* Partners that agree with me in this bit are left for a
               later bit */
            if (nbrbit == mybit) continue;
            issend = (IS_SRC( sblock[i].entry )) ? 1 : 0;
            if ((issend == mybit) == (pass == 0))
                sblock[i].order = cnt++;
            }
        }
    }
return nsend;
}

/*
    Reorder the sends and receives of a program so that they match up in 
    the sense that two nodes do not send to each other (one will receive 
    first).  The method is very, very simple: the program is handed to
    BCReorderProgram with BCCollectSRPair as the collection routine.  
    It is selected by using the ?? option to BCCompile.

    Input Parameters:
.   Program - the program to re-order
 */
void BCSortSRPairs( Program )
BCPGM *Program;
{
/* 0x3 = 0x1 | 0x2 - both the sends and the receives are reordered */
int what = 0x3;

BCReorderProgram( Program, BCCollectSRPair, what );
}

