/* SCCS identification string for this file (the "@(#)" prefix is the marker
   that what(1) searches for).  It is compiled out when lint is defined. */
#ifndef lint
static char SCCSid[] = "@(#) ./comm/global/global.c 07/23/93";
#endif


/*
   This file contains some routines for applying global operations to
   subsets of processors.  No attempt is made to make these optimally
   efficient (for example, using pipelining to improve the performance).
 */

#include "comm/comm.h"
#include "comm/procset.h"
#include "comm/global/global.h"
#include <stdio.h>

/* 
   The following will allow us to control at runtime which combination
   routine is used in the global reductions.  There should also be
   entries for the other routines (scatter etc) and a way for the user
   to add to this collection of routines.
 */
/* Default implementations.  They are defined outside this file; the
   documentation of the PISet...Func routines below describes them as the
   tree-oriented variants. */
extern void gsetopT(), gscattersetT(), gscattersetsrcT(), 
            gcolsetT(), gsyncsetT();
/* Reduction routine: called by PIcombine, replaced by PISetCombFunc */
void (*PIcomb)()       = gsetopT;
/* Scatter routine: called by gscatterset, replaced by PISetScatterFunc */
void (*PIscatter)()    = gscattersetT;
/* Scatter-with-source routine: called by gscattersetsrc, replaced by
   PISetScatterFunc */
void (*PIscattersrc)() = gscattersetsrcT;
/* Collection routine: called by gcolset, replaced by PISetCollectionFunc */
void (*PIcol)()        = gcolsetT;
/* Synchronization routine: called by gsyncset, replaced by PISetSyncFunc */
void (*PIsync)()       = gsyncsetT;

/* Routine that computes a processor's tree neighbors; replaced by
   PISetCollectiveTree and called by PISetupCollectiveTree and
   PIGetCollectiveTree.  The default, PISetTreeNodes, is defined later in this
   file.  NOTE(review): this initializer needs a declaration of PISetTreeNodes
   already in scope, presumably from one of the included headers -- confirm. */
static void (*PIGetTreeNodes)() = PISetTreeNodes;

/* These allow the global operations to use precomputed neighbors.
   PISetupCollectiveTree() fills in _PILCHILD, _PIRCHILD, _PIPARENT and
   _PIAM_LEFT; the tree routines in this file store -1 for "no such neighbor".
   NOTE(review): the initial _PIPARENT value of -2 presumably marks the tree
   as not yet set up -- confirm against the code that reads these variables.
   _PIPHASE, _PIRIGHT and _PILEFT are not set by any routine shown here. */
int _PILCHILD, _PIRCHILD, _PIPARENT = -2, _PIPHASE = 0;
int _PIRIGHT, _PILEFT, _PIAM_LEFT;

/* This says whether binary operations should be assumed non-associative
   (i.e., (a+b)+c != a+(b+c)); a non-zero value forces a certain order on the
   collection operations.  The flag starts out as 0 and is only defined here;
   no routine shown in this file reads or changes it. */
int _PINONASSOCIATIVE = 0;

/*
    Collective operations can be effectively pipelined in most cases.
    We give the user control over the packet size used, with a well-chosen
    default.  Below is a very simple-minded analysis that may be used
    to understand why pipelining is so effective.

    Consider a distribution in a ring of depth D (eventually, we'll use
    log p for this).  No processor can begin sending until it has
    received data from its parent (a similar analysis holds for trees
    and other structures).
    If the time to send a single message of length n is (s + rn), then
    the time to fully distribute the message is D(s + rn).
    Now, divide the message into packets of size k.  The time to
    distribute the message is the time until just before the
    last packet leaves the originating node ((n/k-1)(s+rk)) + the time
    to distribute that packet (D(s+rk)).  The time to distribute the
    packetized data is less than sending a single large message when
        D(s+rk) + ((n/k)-1)(s+rk) < D(s+rn)
    Simplifying, this becomes
        k > (s/(r(D-1)))
    Note that this is independent of n, and depends only on the ratio s/r and
    the depth of the distribution D.  Further, even for fairly heavy-weight
    message-passing systems (large s/r), this value is relatively small.
 */    
/* Packet size in bytes for collective operations; changed by
   PISetPacketSize().  NOTE(review): the initial -1 presumably means "not
   set, use the default" -- confirm against the code that reads it. */
int  _PIPKTSIZE = -1;
/* Message size at which ready-receiver (force) is used; changed by
   PISetRRSize(), whose documentation says a negative value disables
   ready-receiver.  NOTE(review): whether the initial -1 means "disabled" or
   "use the default" cannot be told from this file -- confirm. */
int  _PIRRSIZE  = -1;
/*@
    PISetPacketSize - Choose the packet size used by collective operations

    Input Parameter:
.   val - packet size, in bytes

    Note:
    When this routine is never called, a reasonable default is used.
    The value is stored in _PIPKTSIZE exactly as given; it is not checked.
@*/
void PISetPacketSize( val )
int val;
{
    _PIPKTSIZE = val;
}

/*@
    PISetRRSize - Choose the message size at which collective operations
                  switch to ready-receiver (force)

    Input Parameter:
.   val - message size at which ready-receiver should be used

    Note:
    When this routine is never called, a reasonable default is used.
    A negative value means that ready-receiver is not used.
    The value is stored in _PIRRSIZE exactly as given.
@*/
void PISetRRSize( val )
int val;
{
    _PIRRSIZE = val;
}

/* ProcSet *ALLNODES = 0  -- eventually change this to allocate a procset
   over the entire machine -- */
#ifdef p4
/* Hold the globals for p4 here */
/* Only defined when building for p4.  It starts out as MSG_GLOBAL, which is
   not defined in this file (NOTE(review): presumably supplied by comm/comm.h
   or the p4 headers -- confirm). */
int __P4GLOBALTYPE=MSG_GLOBAL;
#endif

/*
   PISetTreeNodes - Compute a processor's neighbors in a binary tree that is
   numbered like a heap: node i has children 2i+1 and 2i+2 and parent (i-1)/2.

   Input Parameters:
.  myid - rank of this processor within the tree (0 is the root)
.  np   - number of processors in the tree

   Output Parameters:
.  l_child - rank of the left child, or -1 if there is none
.  r_child - rank of the right child, or -1 if there is none
.  parent  - rank of the parent, or -1 for the root
.  am_left - 1 if this node is its parent's left child, 0 otherwise
             (0 for the root, which has no parent)
 */
void PISetTreeNodes( myid, np, l_child, r_child, parent, am_left )
int myid, np, *l_child, *r_child, *parent, *am_left;
{
    int left  = 2 * myid + 1;
    int right = left + 1;

    if (myid == 0) {
        *parent  = -1;
        /* The root used to leave am_left untouched, so the caller read
           whatever happened to be stored there; give it a defined value. */
        *am_left = 0;
    }
    else {
        *parent  = (myid - 1) / 2;
        /* By definition, the left child is always ODD and the right child
           EVEN */
        *am_left = myid & 0x1;
    }

    /* Children that fall outside 0..np-1 do not exist */
    *l_child = (left  < np) ? left  : -1;
    *r_child = (right < np) ? right : -1;
}

/* 
   PISetTreeNodesFast - alternate to PISetTreeNodes that finds tree positions
   such that the tree is divided into non-overlapping sections.  This produces
   a better schedule on mesh-connected processors.

   A node that owns the contiguous range [low,high] of ranks is itself "low"
   (the root owns [0,np-1]).  With t = (low+high)/2 it hands [low+1,t] to its
   right child low+1 and [t+1,high] to its left child t+1.

   Input Parameters:
.  myid - rank of this processor; must satisfy 0 <= myid < np (the search
          below does not terminate for ranks outside that range)
.  np   - number of processors in the tree

   Output Parameters:
.  l_child - rank of the left child, or -1 if there is none
.  r_child - rank of the right child, or -1 if there is none
.  parent  - rank of the parent, or -1 for the root
.  am_left - 1 if this node is its parent's left child, 0 if it is the
             right child or the root
 */
void PISetTreeNodesFast( myid, np, l_child, r_child, parent, am_left )
int myid, np, *l_child, *r_child, *parent, *am_left;
{
    int t, low, high;

    *r_child = -1;
    *l_child = -1;
    /* Always give am_left a defined value: the root has no parent, and with
       np == 2 the search loop below is never entered for node 1 (which is a
       right child). */
    *am_left = 0;

    if (myid == 0) {
        *parent = -1;
        if (np > 1)
            *r_child = 1;
        /* The left subtree starts at t + 1 with t = (np-1)/2, i.e. at
           (np+1)/2.  That differs from the right child as soon as np > 2;
           testing np > 3 here left node 2 of a three-node tree with a
           parent (0) that did not list it as a child. */
        if (np > 2)
            *l_child = (np + 1) / 2;
    }
    else {
        *parent = 0;
        low     = 0;
        high    = np - 1;
        /* t needs to be the last element of the "low" tree */
        t       = (low + high) / 2;
        while (t) {
            if (myid == low) {
                /* Found my position in the tree.  Parent was previous low.
                   Note that right child is ALWAYS myid + 1 */
                *am_left = (myid != *parent + 1);

                *r_child = myid + 1;
                *l_child = t + 1;
                /* Make sure that the children are valid */
                if (*r_child > high) *r_child = -1;
                if (*l_child > high) *l_child = -1;
                if (*l_child == *r_child) *l_child = -1;
                if (*l_child == myid)     *l_child = -1;
                break;
            }
            /* Not found yet: the current "low" becomes the candidate parent
               and the search continues in the half that contains myid. */
            *parent = low++;
            if (myid <= t) {
                high = t;
            }
            else {
                low = t + 1;
            }
            t = (low + high) / 2;
        }
    }
}


/*@
  PISetCombFunc - Select the function used for reductions (GDSUM etc)

  Input Parameters:
. func - pointer to the reduction function

  Note:
  The following routines are provided in Tools
. gsetopT - Tree-oriented
. gsetopL - Line (row) oriented
$ Other routines may be defined by the user.

  The pointer is stored as given (no null check); PIcombine calls through it.
@*/
void PISetCombFunc( func )
void (*func)();
{
    PIcomb = func;
}

/*@
  PISetScatterFunc - Select the functions used for scatters (GSCATTER)

  Input Parameters:
. func    - pointer to the function for GSCATTER
. funcsrc - pointer to the function for GSCATTERSRC

  Note:
  The following routines are provided in Tools
. gscattersetT - Tree-oriented
. gscattersetR - Recursive subdivision
. gscattersetsrcT - Tree-oriented with known source
$ Other routines may be defined by the user.

  A null pointer leaves the corresponding routine unchanged, so either
  function can be replaced without touching the other.
@*/
void PISetScatterFunc( func, funcsrc )
void (*func)(), (*funcsrc)();
{
    if (func != 0) {
        PIscatter = func;
    }
    if (funcsrc != 0) {
        PIscattersrc = funcsrc;
    }
}

/*@
  PISetCollectionFunc - Select the function used for collections (GCOL)

  Input Parameters:
. func - pointer to the collection function

  Note:
  The following routines are provided in Tools
. gcolsetT - Tree-oriented
$ Other routines may be defined by the user.

  The pointer is stored as given (no null check); gcolset calls through it.
@*/
void PISetCollectionFunc( func )
void (*func)();
{
    PIcol = func;
}

/*@
  PISetSyncFunc - Select the function used for synchronizations (GSYNC)

  Input Parameters:
. func - pointer to the synchronization function

  Note:
  The following routines are provided in Tools
. gsyncsetT - Tree-oriented
$ Other routines may be defined by the user.

  The pointer is stored as given (no null check); gsyncset calls through it.
@*/
void PISetSyncFunc( func )
void (*func)();
{
    PIsync = func;
}

/*
    Select the routine that computes a processor's neighbors in the
    collective tree.  The routine is called as
        r( myid, np, &l_child, &r_child, &parent, &am_left )
    PISetTreeNodes (the default) and PISetTreeNodesFast in this file have
    that form.

    This only records the routine; the precomputed neighbors are not
    refreshed until PISetupCollectiveTree is called.
 */     
void PISetCollectiveTree( r )
void (*r)();
{
    PIGetTreeNodes = r;
}

/*
    Compute this processor's tree neighbors with the currently selected
    routine (see PISetCollectiveTree) and store them in _PILCHILD,
    _PIRCHILD, _PIPARENT and _PIAM_LEFT.  MYPROCID and NUMNODES are defined
    outside this file (presumably the calling processor's rank and the
    number of processors -- confirm).
 */
void PISetupCollectiveTree( )
{
    PIGetTreeNodes( MYPROCID, NUMNODES,
                    &_PILCHILD, &_PIRCHILD, &_PIPARENT, &_PIAM_LEFT );
    /* Debugging aid:
       printf( "[%d]%d,%d,%d\n", MYPROCID, _PIPARENT, _PILCHILD, _PIRCHILD ); */
}

/* Evaluate the currently selected tree routine (see PISetCollectiveTree) for
   an arbitrary rank myid among np processors.  The results go only to the
   caller's l_child, r_child, parent and am_left; the precomputed globals are
   left alone.  Used in procset.c to set the tree. */
void PIGetCollectiveTree( myid, np, l_child, r_child, parent, am_left )
int myid, np, *l_child, *r_child, *parent, *am_left;
{
    PIGetTreeNodes( myid, np, l_child, r_child, parent, am_left );
}


/* 
   Interfaces to the global routines: scatter, sync, col.  
   scattersrc, colx to come.
 */
/* Scatter over a processor set: forwards every argument unchanged to the
   routine installed with PISetScatterFunc (gscattersetT by default).  The
   call is bracketed by LOGPUSHATOMIC/LOGPOPATOMIC, macros defined outside
   this file (presumably event-logging hooks -- confirm). */
void gscatterset( buf, size, issrc, procset, datatype )
char    *buf;
int     size, issrc;
ProcSet *procset;
int     datatype;
{
    LOGPUSHATOMIC;
    PIscatter( buf, size, issrc, procset, datatype );
    LOGPOPATOMIC;
}

/* Scatter with a known source over a processor set: forwards every argument
   unchanged to the routine installed as funcsrc with PISetScatterFunc
   (gscattersetsrcT by default).  The call is bracketed by
   LOGPUSHATOMIC/LOGPOPATOMIC, macros defined outside this file (presumably
   event-logging hooks -- confirm). */
void gscattersetsrc( buf, size, src, procset, datatype )
char    *buf;
int     size, src;
ProcSet *procset;
int     datatype;
{
    LOGPUSHATOMIC;
    PIscattersrc( buf, size, src, procset, datatype );
    LOGPOPATOMIC;
}

/* Synchronize a processor set: passes procset to the routine installed with
   PISetSyncFunc (gsyncsetT by default).  The call is bracketed by
   LOGPUSHATOMIC/LOGPOPATOMIC, macros defined outside this file (presumably
   event-logging hooks -- confirm). */
void gsyncset( procset )
ProcSet *procset;
{
    LOGPUSHATOMIC;
    PIsync( procset );
    LOGPOPATOMIC;
}

/* Collect over a processor set: forwards every argument unchanged to the
   routine installed with PISetCollectionFunc (gcolsetT by default).  The
   call is bracketed by LOGPUSHATOMIC/LOGPOPATOMIC, macros defined outside
   this file (presumably event-logging hooks -- confirm). */
void gcolset( lbuf, lsize, gbuf, gsiz, glen, procset, datatype )
char    *lbuf, *gbuf;
int     lsize, gsiz, *glen, datatype;
ProcSet *procset;
{
    LOGPUSHATOMIC;
    PIcol( lbuf, lsize, gbuf, gsiz, glen, procset, datatype );
    LOGPOPATOMIC;
}

/* Reduction over a processor set: forwards every argument unchanged to the
   routine installed with PISetCombFunc (gsetopT by default).  Unlike the
   scatter, sync and collection wrappers in this file, this call is not
   bracketed by LOGPUSHATOMIC/LOGPOPATOMIC. */
void PIcombine( val, n, work, procset, elmsize, datatype, op )
void    *val, *work;
int     n, elmsize, datatype;
ProcSet *procset;
void    (*op)();
{
    PIcomb( val, n, work, procset, elmsize, datatype, op );
}
