/*****************************************************************************
 * $Source: /cvsroot/charm/src/libs/ck-libs/ampi/ampi.C,v $
 * $Author: chao $
 * $Date: 2003/08/08 22:56:33 $
 * $Revision: 1.141 $
 *****************************************************************************/

#define exit exit /*Suppress definition of exit in ampi.h*/
#include <iostream.h>
#include "ampiimpl.h"
#include "tcharm.h"
#include "ampiEvents.h" /*** for trace generation for projector *****/
#include "ampiProjections.h"

#define CART_TOPOL 1

/* change this define to "x" to trace all send/recv's */
#define MSG_ORDER_DEBUG(x) /* empty */
#define STARTUP_DEBUG(x)  /* ckout<<"ampi[pe "<<CkMyPe()<<"] "<< x <<endl; */

#if 0
#define AMPI_DEBUG CkPrintf
#else
#define AMPI_DEBUG /* empty */
#endif

#ifndef AMPI_COMLIB
#  define AMPI_COMLIB 0
#endif

//------------- startup -------------
/* Table of world communicators; entries are written by ampiWorlds::add
   and read by universeComm2proxy. */
static mpi_comm_worlds mpi_worlds;

/* Number of world slots in use; reset in ampiNodeInit, grown by ampiWorlds::add */
int mpi_nworlds; /*Accessed by ampif*/
/* Communicator numbers MPI_COMM_WORLD+1+i, filled in by ampiNodeInit */
int MPI_COMM_UNIVERSE[MPI_MAX_COMM_WORLDS]; /*Accessed by user code*/

// ------------ maxLoc/minLoc reduction support -----------
// The Sun CC compiler (and possibly others) *can't* build
//  a function pointer from a template without a templated
//  argument in the argument list.  Hence these dummy arguments.
// When this is nonzero, the reducer templates below take extra unused
//  parameters of the template types, and registration selects the
//  instantiation by casting through a function-pointer typedef.
#define STUPID_SUN_TEMPLATES 1

/**
 * Reducer for (value,index) pairs that keeps, element by element, the pair
 * with the largest value among all contributions.
 *
 *  nMsg  number of contributions in msgs (at least one is assumed)
 *  msgs  the contributions; msgs[0] determines the size of the result
 *  The trailing dummy parameters only exist when STUPID_SUN_TEMPLATES is set.
 *
 * A later contribution only replaces the current pair when its value is
 * strictly greater, so ties keep the earlier contribution's index.
 * Returns a freshly built reduction message with the same byte size as msgs[0].
 *
 * NOTE(review): the pair count is derived from sizeof(VType)+sizeof(IType),
 * but elements are indexed as a struct of the two; these differ if the
 * struct is padded -- confirm against the layout used by contributors.
 */
template <class VType,class IType>
CkReductionMsg *maxLoc(int nMsg,CkReductionMsg **msgs
#if STUPID_SUN_TEMPLATES
		       ,VType ignored1,IType ignored2
#endif
		       )
{
  class ValIdx{
  public:
    VType val;
    IType idx;
  };
  const int nBytes = msgs[0]->getSize();
  const int nPairs = nBytes/(sizeof(VType)+sizeof(IType));
  ValIdx *best = new ValIdx [nPairs];

  // Seed the running result from the first contribution
  const ValIdx *in = (const ValIdx *)msgs[0]->getData();
  for (int k=0; k<nPairs; k++) {
    best[k].val = in[k].val;
    best[k].idx = in[k].idx;
  }

  // Fold in every remaining contribution
  for (int c=1; c<nMsg; c++) {
    in = (const ValIdx *)msgs[c]->getData();
    for (int k=0; k<nPairs; k++) {
      if (!(best[k].val < in[k].val)) continue; // current pair still wins
      best[k].val = in[k].val;
      best[k].idx = in[k].idx;
    }
  }

  CkReductionMsg *result = CkReductionMsg::buildNew(nBytes,best);
  delete [] best;
  return result;
}

/**
 * Reducer for (value,index) pairs that keeps, element by element, the pair
 * with the smallest value among all contributions.
 *
 *  nMsg  number of contributions in msgs (at least one is assumed)
 *  msgs  the contributions; msgs[0] determines the size of the result
 *  The trailing dummy parameters only exist when STUPID_SUN_TEMPLATES is set.
 *
 * A later contribution only replaces the current pair when its value is
 * strictly smaller, so ties keep the earlier contribution's index.
 * Returns a freshly built reduction message with the same byte size as msgs[0].
 *
 * NOTE(review): the pair count is derived from sizeof(VType)+sizeof(IType),
 * but elements are indexed as a struct of the two; these differ if the
 * struct is padded -- confirm against the layout used by contributors.
 */
template <class VType,class IType>
CkReductionMsg *minLoc(int nMsg,CkReductionMsg **msgs
#if STUPID_SUN_TEMPLATES
		       ,VType ignored1,IType ignored2
#endif
		       )
{
  class ValIdx{
  public:
    VType val;
    IType idx;
  };
  const int nBytes = msgs[0]->getSize();
  const int nPairs = nBytes/(sizeof(VType)+sizeof(IType));
  ValIdx *best = new ValIdx [nPairs];

  // Seed the running result from the first contribution
  const ValIdx *in = (const ValIdx *)msgs[0]->getData();
  for (int k=0; k<nPairs; k++) {
    best[k].val = in[k].val;
    best[k].idx = in[k].idx;
  }

  // Fold in every remaining contribution
  for (int c=1; c<nMsg; c++) {
    in = (const ValIdx *)msgs[c]->getData();
    for (int k=0; k<nPairs; k++) {
      if (!(best[k].val > in[k].val)) continue; // current pair still wins
      best[k].val = in[k].val;
      best[k].idx = in[k].idx;
    }
  }

  CkReductionMsg *result = CkReductionMsg::buildNew(nBytes,best);
  delete [] best;
  return result;
}

// ------------ logical/bitwise and/or/xor reduction support -----------
// Logical operations are for integer (sometimes disguised as logical)
// Bitwise operations are for both integer and byte
// Logical AND and OR have been implemented in Charm++

/// Logical-XOR reducer over arrays of int.  Each output element is 1 when
/// an odd number of contributions were nonzero at that position once at
/// least two contributions are combined; with a single contribution the
/// data is copied through unchanged.  Assumes nMsg > 0.
CkReductionMsg *LXOR(int nMsg,CkReductionMsg **msgs)
{
	const int nBytes = msgs[0]->getSize();
	const int nInts = nBytes/sizeof(int);
	int *acc = new int [nInts];

	const int *src = (const int *)msgs[0]->getData();
	for (int k=0; k<nInts; k++) acc[k] = src[k];

	for (int c=1; c<nMsg; c++) {
		src = (const int *)msgs[c]->getData();
		for (int k=0; k<nInts; k++) {
			// logical xor: true exactly when the two truth values differ
			const bool lhs = (acc[k]!=0);
			const bool rhs = (src[k]!=0);
			acc[k] = (lhs != rhs);
		}
	}

	CkReductionMsg *result = CkReductionMsg::buildNew(nBytes,acc);
	delete [] acc;
	return result;
}

/// Bitwise-AND reducer: element-wise AND of every contribution, treating
/// each message as an array of Type.  Assumes nMsg > 0; the result has the
/// byte size of msgs[0].  The dummy parameter only exists when
/// STUPID_SUN_TEMPLATES is set.
template <class Type>
CkReductionMsg *BAND(int nMsg,CkReductionMsg **msgs
#if STUPID_SUN_TEMPLATES
		       ,Type ignored
#endif
			)
{
	const int nBytes = msgs[0]->getSize();
	const int nElems = nBytes/sizeof(Type);
	Type *acc = new Type [nElems];

	const Type *src = (const Type *)msgs[0]->getData();
	for (int k=0; k<nElems; k++) acc[k] = src[k];

	for (int c=1; c<nMsg; c++) {
		src = (const Type *)msgs[c]->getData();
		for (int k=0; k<nElems; k++) acc[k] &= src[k];
	}

	CkReductionMsg *result = CkReductionMsg::buildNew(nBytes,acc);
	delete [] acc;
	return result;
}

/// Bitwise-OR reducer: element-wise OR of every contribution, treating
/// each message as an array of Type.  Assumes nMsg > 0; the result has the
/// byte size of msgs[0].  The dummy parameter only exists when
/// STUPID_SUN_TEMPLATES is set.
template <class Type>
CkReductionMsg *BOR(int nMsg,CkReductionMsg **msgs
#if STUPID_SUN_TEMPLATES
		       ,Type ignored
#endif
			)
{
	const int nBytes = msgs[0]->getSize();
	const int nElems = nBytes/sizeof(Type);
	Type *acc = new Type [nElems];

	const Type *src = (const Type *)msgs[0]->getData();
	for (int k=0; k<nElems; k++) acc[k] = src[k];

	for (int c=1; c<nMsg; c++) {
		src = (const Type *)msgs[c]->getData();
		for (int k=0; k<nElems; k++) acc[k] |= src[k];
	}

	CkReductionMsg *result = CkReductionMsg::buildNew(nBytes,acc);
	delete [] acc;
	return result;
}

/// Bitwise-XOR reducer: element-wise XOR of every contribution, treating
/// each message as an array of Type.  Assumes nMsg > 0; the result has the
/// byte size of msgs[0].  The dummy parameter only exists when
/// STUPID_SUN_TEMPLATES is set.
template <class Type>
CkReductionMsg *BXOR(int nMsg,CkReductionMsg **msgs
#if STUPID_SUN_TEMPLATES
		       ,Type ignored
#endif
			)
{
	const int nBytes = msgs[0]->getSize();
	const int nElems = nBytes/sizeof(Type);
	Type *acc = new Type [nElems];

	const Type *src = (const Type *)msgs[0]->getData();
	for (int k=0; k<nElems; k++) acc[k] = src[k];

	for (int c=1; c<nMsg; c++) {
		src = (const Type *)msgs[c]->getData();
		for (int k=0; k<nElems; k++) acc[k] ^= src[k];
	}

	CkReductionMsg *result = CkReductionMsg::buildNew(nBytes,acc);
	delete [] acc;
	return result;
}

// Single-token alias so "long double" can be pasted into identifiers
// by the ## operators in the macros below.
typedef long double longdouble;

// This hideous little macro calls its argument for
//   every valid combination of maxLoc/minLoc types.
// This keeps us from having to repeat the list of types over
//   and over again.
// MACRO is invoked as MACRO(function, valueType, indexType).
#define MAXMIN_MAP_TYPES(MACRO) \
  MACRO(maxLoc,float,int) MACRO(minLoc,float,int) \
  MACRO(maxLoc,double,int) MACRO(minLoc,double,int) \
  MACRO(maxLoc,long,int) MACRO(minLoc,long,int) \
  MACRO(maxLoc,int,int) MACRO(minLoc,int,int) \
  MACRO(maxLoc,short,int) MACRO(minLoc,short,int) \
  MACRO(maxLoc,longdouble,int) MACRO(minLoc,longdouble,int) \
  MACRO(maxLoc,float,float) MACRO(minLoc,float,float) \
  MACRO(maxLoc,double,double) MACRO(minLoc,double,double)
// Same idea for the bitwise reducers: MACRO(function, elementType).
#define BITWISE_MAP_TYPES(MACRO) \
  MACRO(BAND,int) MACRO(BAND,char) \
  MACRO(BOR,int) MACRO(BOR,char) \
  MACRO(BXOR,int) MACRO(BXOR,char)

// Declare the maxLoc/minLoc reducerTypes
// Each expansion defines one global named <fn><V><I>Reducer (for example
// maxLocfloatintReducer) or <fn><T>Reducer (for example BANDintReducer).
// The values are assigned in ampiSetupReductions.
#define MAXMIN_REDUCER(fn,V,I) \
	CkReduction::reducerType fn##V##I##Reducer;
#define BITWISE_REDUCER(fn,T) \
	CkReduction::reducerType fn##T##Reducer;
MAXMIN_MAP_TYPES(MAXMIN_REDUCER)
BITWISE_MAP_TYPES(BITWISE_REDUCER)
CkReduction::reducerType LXORReducer;

// Instantiate the maxLoc/minLoc templates
// With STUPID_SUN_TEMPLATES, each expansion also declares a function-pointer
// typedef (<fn><V><I>Type / <fn><T>Type) matching the dummy-argument
// signature; ampiSetupReductions casts through it to pick the instantiation.
#if STUPID_SUN_TEMPLATES
#  define MAXMIN_INSTANTIATE(fn,V,I) \
  template CkReductionMsg *fn<V,I>(int n,CkReductionMsg **m,V,I); \
  typedef CkReductionMsg *(* fn##V##I##Type)(int n,CkReductionMsg **m,V,I);
#  define BITWISE_INSTANTIATE(fn,T) \
  template CkReductionMsg *fn<T>(int n,CkReductionMsg **m,T);\
  typedef CkReductionMsg *(* fn##T##Type)(int n,CkReductionMsg **m,T);
#else
#  define MAXMIN_INSTANTIATE(fn,V,I) \
  template CkReductionMsg *fn<V,I>(int n,CkReductionMsg **m);
#  define BITWISE_INSTANTIATE(fn,T) \
  template CkReductionMsg *fn<T>(int n,CkReductionMsg **m);
#endif
MAXMIN_MAP_TYPES(MAXMIN_INSTANTIATE)
BITWISE_MAP_TYPES(BITWISE_INSTANTIATE)

/// Registers every custom reducer defined above with CkReduction and stores
/// the returned reducerType in the matching global (<fn><types>Reducer and
/// LXORReducer).  Called from ampiNodeInit.
static void ampiSetupReductions(void) {
  // add reducers for MPI_MINLOC/MPI_MAXLOC
#if STUPID_SUN_TEMPLATES
  /* Hideous: specify which version of the template
     we want by type-casting a function pointer! */
#  define MAXMIN_REGISTER(fn,V,I) \
    fn##V##I##Reducer = CkReduction::addReducer( \
	(CkReduction::reducerFn)(fn##V##I##Type)fn);
#  define BITWISE_REGISTER(fn,T) \
    fn##T##Reducer = CkReduction::addReducer( \
	(CkReduction::reducerFn)(fn##T##Type)fn);
#else
  /* Sane compiler: just specify template using <> */
#  define MAXMIN_REGISTER(fn,V,I) \
    fn##V##I##Reducer = CkReduction::addReducer(fn<V,I>);
#  define BITWISE_REGISTER(fn,T) \
    fn##T##Reducer = CkReduction::addReducer(fn<T>);
#endif
  // One addReducer call per entry of the type tables
  MAXMIN_MAP_TYPES(MAXMIN_REGISTER)
  BITWISE_MAP_TYPES(BITWISE_REGISTER)
  LXORReducer=CkReduction::addReducer(LXOR);
}


// ------------ startup support -----------

/* Reset to 0 by MPI_Setup_Switch before it calls both setup routines; a
   value of 2 afterwards is treated as "no user setup routine in C or
   Fortran".  NOTE(review): presumably incremented by default setup stubs
   defined elsewhere -- confirm. */
int _ampi_fallback_setup_count;
CDECL void MPI_Setup(void);
FDECL void FTN_NAME(MPI_SETUP,mpi_setup)(void);

/* Candidate main routines tried in turn by MPI_Fallback_Main */
int MPI_Main_cpp(int argc,char **argv);
CDECL int MPI_Main(int argc,char **argv);
FDECL void FTN_NAME(MPI_MAIN,mpi_main)(void);

/*Main routine used when missing MPI_Setup routine*/
/* Calls the C++, C and Fortran main entry points one after another with
   the same argc/argv; the return values of the first two are ignored. */
CDECL void MPI_Fallback_Main(int argc,char **argv)
{
  MPI_Main_cpp(argc,argv);
  MPI_Main(argc,argv);
  FTN_NAME(MPI_MAIN,mpi_main)();
}

void ampiCreateMain(MPI_MainFn mainFn, const char *name,int nameLen);
/*Startup routine used if user *doesn't* write
  a TCHARM_User_setup routine.
  Runs the Fortran and then the C MPI_Setup; if the fallback counter
  reports 2 afterwards (setup missing in both C and Fortran), creates
  the default main thread set running MPI_Fallback_Main.
 */
CDECL void MPI_Setup_Switch(void) {
  _ampi_fallback_setup_count=0;
  FTN_NAME(MPI_SETUP,mpi_setup)();
  MPI_Setup();

  // At least one user-provided setup routine ran: nothing more to do
  if (_ampi_fallback_setup_count!=2) return;

  static const char fallbackName[]="default";
  ampiCreateMain(MPI_Fallback_Main,fallbackName,strlen(fallbackName));
}

/* Set to 1 at the end of ampiNodeInit */
static int nodeinit_has_been_called=0;
/* Ctv variables: the ampiParent registered for a thread (written in
   ampiParent::prepareCtv) and the "ampiInit already ran" flag. */
CtvDeclare(ampiParent*, ampiPtr);
CtvDeclare(int, ampiInitDone);
/// One-time setup: clears the world count, fills MPI_COMM_UNIVERSE with the
/// communicator numbers MPI_COMM_WORLD+1+i, installs MPI_Setup_Switch as the
/// TCharm fallback setup routine and registers the custom reducers.
static void ampiNodeInit(void)
{
  mpi_nworlds=0;

  int w=0;
  while (w<MPI_MAX_COMM_WORLDS) {
    MPI_COMM_UNIVERSE[w] = MPI_COMM_WORLD+1+w;
    w++;
  }

  TCHARM_Set_fallback_setup(MPI_Setup_Switch);
  ampiSetupReductions();
  nodeinit_has_been_called=1;
}

/// Initializes the two Ctv variables declared above, then runs the
/// REGISTER_AMPI macro and initAmpiProjections().
static void ampiProcInit(void){
  CtvInitialize(ampiParent*, ampiPtr);
  CtvInitialize(int,ampiInitDone);
  REGISTER_AMPI
  initAmpiProjections();
}

PUPbytes(MPI_MainFn);

class MPI_threadstart_t {
public:
	MPI_MainFn fn;
	MPI_threadstart_t() {}
	MPI_threadstart_t(MPI_MainFn fn_)
		:fn(fn_) {}
	void start(void) {
		char **argv=CmiCopyArgs(CkGetArgv());
		int argc=CkGetArgc();
		(fn)(argc,argv);
	}
	void pup(PUP::er &p) {
		p|fn;
	}
};
PUPmarshall(MPI_threadstart_t);

/// Thread start routine handed to TCHARM_Create_data by ampiCreateMain:
/// unpacks the MPI_threadstart_t from the data buffer and runs it.
extern "C" void MPI_threadstart(void *data)
{
	STARTUP_DEBUG("MPI_threadstart")
	MPI_threadstart_t t;
	pupFromBuf(data,t);
	t.start();
}

/// Creates the TCharm threads that will run mainFn: packs mainFn into a
/// buffer and passes it, with MPI_threadstart as the start routine, to
/// TCHARM_Create_data.  name and nameLen are not used by this body.
void ampiCreateMain(MPI_MainFn mainFn, const char *name,int nameLen)
{
	STARTUP_DEBUG("ampiCreateMain")
	const int nThreads=TCHARM_Get_num_chunks();

	//Serialize the start record for the new threads array:
	MPI_threadstart_t startInfo(mainFn);
	memBuf packed;
	pupIntoBuf(packed,startInfo);

	TCHARM_Create_data(nThreads,MPI_threadstart,
			  packed.getData(), packed.getSize());
}

/* TCharm Semaphore ID for AMPI startup */
/* ampiParent::registerAmpi does semaPut with this id; ampiInit does semaGet. */
#define AMPI_TCHARM_SEMAID 0x00A34100 /* __AMPI__ */

/* Group proxy used to announce new world communicators; created on first
   use in ampiInit and also set from the ampiWorlds constructor. */
static CProxy_ampiWorlds ampiWorldsGroup;

/*
Called from MPI_Init, a collective initialization call:
 creates a new AMPI array and attaches it to the current
 set of TCHARM threads.

 argv is scanned for a "+strategy <name>" option selecting the comlib
 strategy (USE_DIRECT, USE_MESH, USE_GRID or USE_HYPERCUBE; anything else
 leaves the default USE_DIRECT).

 Returns the ampi object for the calling thread, or NULL when ampiInit
 has already been run for this thread.
*/
static ampi *ampiInit(char **argv)
{
  if (CtvAccess(ampiInitDone)) return NULL; /* Already called ampiInit */
  STARTUP_DEBUG("ampiInit> begin")

  // Parse command-line arguments (Commlib)
  int strat = USE_DIRECT;
  char *comlibStrat;
  if(0!=CmiGetArgString(argv, "+strategy", &comlibStrat)){
		//CkPrintf("AMPI: Comlib initialized with %s\n",comlibStrat);
		if(0==strcmp(comlibStrat,"USE_DIRECT")){
			strat = USE_DIRECT;
		} else if(0==strcmp(comlibStrat,"USE_MESH")){
			strat = USE_MESH;
		} else if(0==strcmp(comlibStrat,"USE_GRID")){
			strat = USE_GRID;
		} else if(0==strcmp(comlibStrat,"USE_HYPERCUBE")){
			strat = USE_HYPERCUBE;
		}
  }

  if (TCHARM_Element()==0)
  { /* I'm responsible for building the arrays: */
	STARTUP_DEBUG("ampiInit> creating arrays")

// FIXME: Need to serialize global communicator allocation in one place.
	//Allocate the next communicator
	if(mpi_nworlds == MPI_MAX_COMM_WORLDS)
	{
		CkAbort("AMPI> Number of registered comm_worlds exceeded limit.\n");
	}
	int new_idx=mpi_nworlds;
	MPI_Comm new_world=MPI_COMM_WORLD+1+new_idx;

        ComlibInstanceHandle cinst;

#if AMPI_COMLIB
	// The terminating semicolon was missing here, which broke the
	// AMPI_COMLIB build.
	cinst=CkGetComlibInstance();
#endif
        
        //Create and attach the ampiParent array
	CkArrayID threads; int _nchunks;
        CkArrayOptions opts=TCHARM_Attach_start(&threads,&_nchunks);
	CProxy_ampiParent parent;
	parent=CProxy_ampiParent::ckNew(new_world,threads,cinst, opts);

#if AMPI_COMLIB
        //CProxy_ComlibManager comlib = CProxy_ComlibManager::ckNew(strat, 1);
	//comlib.ckLocalBranch()->createId();
	EachToManyMulticastStrategy *strategy = new EachToManyMulticastStrategy(strat, parent.ckGetArrayID(), parent.ckGetArrayID());
        cinst.setStrategy(strategy);

        //strategy->setSourceArray(parent.ckGetArrayID());
        //strategy->setDestArray(parent.ckGetArrayID());
#endif
        
	//Make a new ampi array
	CkArrayID empty;
	CkPupBasicVec<int> _indices;
	for(int i=0;i<_nchunks;i++) _indices.push_back(i);
	ampiCommStruct emptyComm(new_world,empty,_nchunks,_indices);
	CProxy_ampi arr;
	arr=CProxy_ampi::ckNew(parent,emptyComm,opts);

	//Broadcast info. to the mpi_worlds array
	// FIXME: remove race condition from MPI_COMM_UNIVERSE broadcast
	ampiCommStruct newComm(new_world,arr,_nchunks);
	if (ampiWorldsGroup.ckGetGroupID().isZero())
		ampiWorldsGroup=CProxy_ampiWorlds::ckNew(newComm);
	else
		ampiWorldsGroup.add(newComm);
	STARTUP_DEBUG("ampiInit> arrays created")
  }

  // Find our ampi object:
  //  (the matching semaPut is done in ampiParent::registerAmpi)
  ampi *ptr=(ampi *)TCharm::get()->semaGet(AMPI_TCHARM_SEMAID);
  CtvAccess(ampiInitDone)=1;
  STARTUP_DEBUG("ampiInit> complete")

  return ptr;
}

/// This group is used to broadcast the MPI_COMM_UNIVERSE communicators.
class ampiWorlds : public CBase_ampiWorlds {
public:
    ampiWorlds(const ampiCommStruct &nextWorld) {
        ampiWorldsGroup=thisgroup;
        add(nextWorld);
    }
    ampiWorlds(){ /* this group is not meant to be PUP'ed(?) */ }
    int useDefCtor(void){ return 1; }
    void add(const ampiCommStruct &nextWorld) {
        int new_idx=nextWorld.getComm()-(MPI_COMM_WORLD+1);
        mpi_worlds[new_idx].comm=nextWorld;
	if (mpi_nworlds<=new_idx) mpi_nworlds=new_idx+1;
	STARTUP_DEBUG("ampiInit> listed MPI_COMM_UNIVERSE "<<new_idx)
    }
};

//-------------------- ampiParent -------------------------
/// Normal constructor: stores the world number, the TCharm thread proxy and
/// the comlib handle, clears the cached pointers, points myDDT at the
/// embedded myDDTsto, then looks up the local thread via prepareCtv().
ampiParent::ampiParent(MPI_Comm worldNo_,CProxy_TCharm threads_, ComlibInstanceHandle comlib_)
    :threads(threads_), worldNo(worldNo_), comlib(comlib_)
{
  STARTUP_DEBUG("ampiParent> starting up")
  thread=NULL;
  worldPtr=NULL;
  myDDT=&myDDTsto;
  prepareCtv();
}
/// Migration constructor: only resets the cached pointers.  The thread
/// pointer is re-established later by prepareCtv() (called from
/// ckJustMigrated, or from registerAmpi if that runs first).
ampiParent::ampiParent(CkMigrateMessage *msg):CBase_ampiParent(msg) {
  thread=NULL;
  worldPtr=NULL;
  myDDT=&myDDTsto;
}
/// Packs/unpacks the parent's state after the superclass.
/// NOTE(review): worldNo, cartComm, graphComm and kvlist are used elsewhere
/// in this file but are not pup'd here -- confirm whether that is intended.
void ampiParent::pup(PUP::er &p) {
  ArrayElement1D::pup(p);
  p|threads;
  p|comlib;
  p|worldStruct;
  myDDT->pup(p);
  p|splitComm;
  p|groupComm;
  p|groups;
  p|ampiReqs;
}
void ampiParent::prepareCtv(void) {
  thread=threads[thisIndex].ckLocal();
  if (thread==NULL) CkAbort("AMPIParent cannot find its thread!\n");
  CtvAccessOther(thread->getThread(),ampiPtr) = this;
  STARTUP_DEBUG("ampiParent> found TCharm")
}

/// After migration: run the superclass hook, then re-find the local thread.
void ampiParent::ckJustMigrated(void) {
  ArrayElement1D::ckJustMigrated();
  prepareCtv();
}

/// Destructor: releases nothing explicitly.
/// NOTE(review): the ampiCommStruct and KeyvalNode objects allocated with
/// new in the *ChildRegister and createKeyval methods are not freed here --
/// confirm whether that is intended.
ampiParent::~ampiParent() {
}

//Children call this when they are first created or just migrated
/// ptr is the registering ampi element and s its communicator.  World
/// communicators are recorded (and MPI_COMM_SELF derived from them) on
/// every call; the per-kind child registration only happens when
/// forMigration is false.  Returns the parent's TCharm thread.
TCharm *ampiParent::registerAmpi(ampi *ptr,ampiCommStruct s,bool forMigration)
{
  if (thread==NULL) prepareCtv(); //Prevents CkJustMigrated race condition

  const MPI_Comm comm = s.getComm();
  const bool isWorld = (comm>=MPI_COMM_WORLD);

  if (isWorld)
  { //We now have our COMM_WORLD-- register it
    //Note that split communicators don't keep a raw pointer, so
    //they don't need to re-register on migration.
    if (worldPtr!=NULL) CkAbort("One ampiParent has two MPI_COMM_WORLDs");
    worldPtr=ptr;
    worldStruct=s;

    //MPI_COMM_SELF has the same member as MPI_COMM_WORLD, but it's alone:
    CkPupBasicVec<int> selfIndex;
    selfIndex.push_back(thisIndex);
    selfStruct = ampiCommStruct(MPI_COMM_SELF,s.getProxy(),1,selfIndex);
  }

  //A re-registration after migration stops here
  if (forMigration) return thread;

  //Register the new communicator:
  STARTUP_DEBUG("ampiParent> registering new communicator "<<comm)
  if (isWorld) {
    // Pass the new ampi to the waiting ampiInit
    thread->semaPut(AMPI_TCHARM_SEMAID, ptr);
  }
  else if (isSplit(comm)) splitChildRegister(s);
  else if (isGroup(comm)) groupChildRegister(s);
  else if (isCart(comm))  cartChildRegister(s);
  else if (isGraph(comm)) graphChildRegister(s);
  else
    CkAbort("ampiParent recieved child with bad communicator");

  return thread;
}

/// Element 0 sends itself a Checkpoint message carrying dname (strlen(dname)
/// characters, i.e. without the terminating NUL); every caller then stops
/// its thread.  ResumeThread() restarts it.
void ampiParent::startCheckpoint(char* dname){
  if(thisIndex==0) thisProxy[thisIndex].Checkpoint(strlen(dname),dname);
  thread->stop();
}
void ampiParent::Checkpoint(int len, char* dname){
  char dirname[256];
  strncpy(dirname,dname,len);
  dirname[len]='\0';
  CkCallback cb(CkIndex_ampiParent::ResumeThread(),thisArrayID);
  CkStartCheckpoint(dirname,cb);
}
/// Checkpoint-completion callback target: resumes the thread that was
/// stopped in startCheckpoint.
void ampiParent::ResumeThread(void){
  thread->resume();
}

/// Appends a new KeyvalNode built from the given callbacks and extra state
/// to kvlist and stores its position in *keyval.  Always returns 0.
int ampiParent::createKeyval(MPI_Copy_function *copy_fn, MPI_Delete_function *delete_fn,
                             int *keyval, void* extra_state){
	KeyvalNode* fresh = new KeyvalNode(copy_fn, delete_fn, extra_state);
	const int slot = kvlist.size();   // the new key is the next free position
	kvlist.resize(slot+1);
	kvlist[slot] = fresh;
	*keyval = slot;
	return 0;
}
/// Destroys the KeyvalNode stored under *keyval and clears its slot.
/// Returns 0 on success, or -1 when *keyval is MPI_KEYVAL_INVALID, outside
/// kvlist, or refers to a slot that is already empty.
/// (The previous check chained its conditions with && and so could only
/// trigger after indexing kvlist out of range.)
/// NOTE(review): the node's `valid` flag is deliberately not consulted
/// here; confirm how it is initialized before relying on it.
int ampiParent::freeKeyval(int *keyval){
	const int key = *keyval;
	if(key == MPI_KEYVAL_INVALID || key < 0 || key >= (int)kvlist.size() || kvlist[key] == NULL) return -1;
	delete kvlist[key];
	kvlist[key] = NULL;
	return 0;
}
/// Stores attribute_val, together with the communicator it belongs to, in
/// the node for keyval.  Returns 0 on success, or -1 when keyval is
/// MPI_KEYVAL_INVALID, outside kvlist, or refers to a freed (NULL) slot.
/// (The previous check chained its conditions with && and so could only
/// trigger after indexing kvlist out of range; a freed slot was then
/// dereferenced.)
/// NOTE(review): the node's `valid` flag is deliberately not consulted
/// here; confirm how it is initialized before relying on it.
int ampiParent::putAttr(MPI_Comm comm, int keyval, void* attribute_val){
	if(keyval == MPI_KEYVAL_INVALID || keyval < 0 || keyval >= (int)kvlist.size() || kvlist[keyval] == NULL) return -1;
	KeyvalNode* node = kvlist[keyval];
	node->comm = comm;
	node->value = attribute_val;
	return 0;
}
/// Looks up the attribute stored under keyval for comm.
///  attribute_val  address of a pointer variable (as in MPI_Attr_get); the
///                 stored value is written through it when found
///  flag           set to true when the node's communicator equals comm,
///                 false otherwise
/// Returns 0, or -1 when keyval is MPI_KEYVAL_INVALID, outside kvlist, or
/// refers to a freed (NULL) slot.
/// Two defects are fixed: the validity check chained its conditions with
/// && and so never rejected anything safely, and the value was assigned to
/// the local parameter, so the caller never received it.
/// NOTE(review): the node's `valid` flag is deliberately not consulted
/// here; confirm how it is initialized before relying on it.
int ampiParent::getAttr(MPI_Comm comm, int keyval, void *attribute_val, int *flag){
	if(keyval == MPI_KEYVAL_INVALID || keyval < 0 || keyval >= (int)kvlist.size() || kvlist[keyval] == NULL) return -1;
	KeyvalNode* node = kvlist[keyval];
	if(comm == node->comm) {
		*flag = true;
		// attribute_val really is a void**: hand the value back to the caller
		if(attribute_val != NULL) *(void **)attribute_val = node->value;
	}else{
		*flag = false;
	}
	return 0;
}
/// Marks the node for keyval as no longer valid.  comm is not examined.
/// Returns 0 on success, or -1 when keyval is MPI_KEYVAL_INVALID, outside
/// kvlist, or refers to a freed (NULL) slot.
/// (The previous check chained its conditions with && and so could only
/// trigger after indexing kvlist out of range; a freed slot was then
/// dereferenced.)
int ampiParent::deleteAttr(MPI_Comm comm, int keyval){
	if(keyval == MPI_KEYVAL_INVALID || keyval < 0 || keyval >= (int)kvlist.size() || kvlist[keyval] == NULL) return -1;
	kvlist[keyval]->valid = false;
	return 0;
}

//----------------------- ampi -------------------------
/// Shared constructor helper: clears the cached parent/thread pointers,
/// turns off resume-on-receive and sets comlibEnabled from AMPI_COMLIB.
void ampi::init(void) {
  thread=NULL;
  parent=NULL;
  comlibEnabled=AMPI_COMLIB;
  resumeOnRecv=false;
}

/// Default constructor: builds an empty element with no message table and
/// no sequencing arrays (seqEntries of -1 marks "none").
/// The array pointers are cleared so that the destructor's delete[] is
/// well-defined for such an element, and nbcasts is zeroed so pup() never
/// packs an indeterminate value.
ampi::ampi()
{
  init();
  msgs=NULL;
  oorder=NULL;
  nextseq=NULL;
  nbcasts=0;
  seqEntries=-1;
}

/// Normal constructor: adopts communicator s (re-pointed at this array),
/// finds and registers with the local parent, and allocates the message
/// table plus one sequencing queue / next-sequence counter per element of
/// the parent array.
ampi::ampi(CkArrayID parent_,const ampiCommStruct &s)
   :parentProxy(parent_)
{
  init();

  myComm=s;
  myComm.setArrayID(thisArrayID);
  myRank=myComm.getRankForIndex(thisIndex);

  findParent(false);

  msgs = CmmNew();
  nbcasts = 0;

  // Per-source ordering state, all starting at sequence number zero
  seqEntries=parent->numElements;
  oorder = new AmpiSeqQ[seqEntries];
  nextseq = new int[seqEntries];
  int src=0;
  while (src<seqEntries) {
    nextseq[src] = 0;
    oorder[src].init();
    src++;
  }
}

/// Migration constructor: the real state arrives later through pup().
/// init() is run so resumeOnRecv/comlibEnabled (which pup() does not
/// transfer) and the parent/thread pointers have defined values, and the
/// array pointers are cleared so the destructor is safe even if pup() never
/// allocates them.
ampi::ampi(CkMigrateMessage *msg):CBase_ampi(msg)
{
  init();
  msgs=NULL;
  oorder=NULL;
  nextseq=NULL;
  seqEntries=-1;
}

/// After migration: run the superclass hook, then re-attach to the local
/// parent (forMigration=true, so no new communicator is registered).
void ampi::ckJustMigrated(void)
{
	ArrayElement1D::ckJustMigrated();
	findParent(true);
}

/// Locates the local ampiParent with our index, registers this element
/// with it and caches the thread it returns.  Aborts if either the parent
/// or the thread cannot be found.
void ampi::findParent(bool forMigration) {
	STARTUP_DEBUG("ampi> finding my parent")

	parent=parentProxy[thisIndex].ckLocal();
	if (!parent) {
		CkAbort("AMPI can't find its parent!");
	}

	thread=parent->registerAmpi(this,myComm,forMigration);
	if (!thread) {
		CkAbort("AMPI can't find its thread!");
	}
}

/// Packs/unpacks this element.  The superclass is skipped for user-level
/// PUP::ers.  When unpacking, the sequencing arrays are allocated here
/// from the just-unpacked seqEntries before their contents are read.
/// NOTE(review): parentProxy is pup'd twice below; this is symmetric for
/// pack and unpack, but looks unintended.  resumeOnRecv and comlibEnabled
/// are not transferred.
void ampi::pup(PUP::er &p)
{
  if(!p.isUserlevel())
    ArrayElement1D::pup(p);//Pack superclass
  p|parentProxy;
  p|myComm;
  p|myRank;
  p|parentProxy;
  p|nbcasts;
  p|tmpVec;

  msgs=CmmPup((pup_er)&p,msgs);

  p|seqEntries;
  if(p.isUnpacking())
  {
    // NOTE(review): assumes seqEntries is non-negative here; a
    // default-constructed element has seqEntries == -1 -- confirm such
    // elements are never packed.
    oorder = new AmpiSeqQ[seqEntries];
    nextseq = new int[seqEntries];
  }
  for(int i=0; i<seqEntries; i++) p | oorder[i];
  p(nextseq, seqEntries);
}

/// Releases the sequencing arrays and the message table.
/// NOTE(review): oorder/nextseq are only assigned by the main constructor
/// and by pup() when unpacking -- confirm they are defined on every path
/// that reaches this destructor.
ampi::~ampi()
{
  delete[] oorder;
  delete[] nextseq;
  CmmFree(msgs);
}

//------------------------ Communicator Splitting ---------------------
/// One rank's contribution to a communicator split.  Instances are
/// concatenated by a reduction in ampi::split and sorted in
/// ampi::splitPhase1 using compareAmpiSplitKey.
class ampiSplitKey {
public:
	int nextSplitComm; //Next communicator number as seen by the contributor
	int color; //New class of processes we'll belong to
	int key; //To determine rank in new ordering
	int rank; //Rank in old ordering
	ampiSplitKey() {} //Leaves all fields unset
	ampiSplitKey(int nextSplitComm_,int color_,int key_,int rank_)
		:nextSplitComm(nextSplitComm_), color(color_), key(key_), rank(rank_) {}
};

/* "type" may indicate whether call is for a cartesian topology etc. */

void ampi::split(int color,int key,MPI_Comm *dest, int type)
{
  if (type == CART_TOPOL) {
	ampiSplitKey splitKey(parent->getNextCart(),color,key,getRank());
	int rootIdx=myComm.getIndexForRank(0);
	CkCallback cb(CkIndex_ampi::splitPhase1(0),CkArrayIndex1D(rootIdx),myComm.getProxy());
	contribute(sizeof(splitKey),&splitKey,CkReduction::concat,cb);

	thread->suspend(); //Resumed by ampiParent::cartChildRegister
	MPI_Comm newComm=parent->getNextCart()-1;
	*dest=newComm;

  }
  else {
	ampiSplitKey splitKey(parent->getNextSplit(),color,key,getRank());
	int rootIdx=myComm.getIndexForRank(0);
	CkCallback cb(CkIndex_ampi::splitPhase1(0),CkArrayIndex1D(rootIdx),myComm.getProxy());
	contribute(sizeof(splitKey),&splitKey,CkReduction::concat,cb);

	thread->suspend(); //Resumed by ampiParent::splitChildRegister
	MPI_Comm newComm=parent->getNextSplit()-1;
	*dest=newComm;
  }

}

/// qsort comparator for ampiSplitKey: orders by color, then key, then old
/// rank.  Returns a negative, zero or positive value.
/// Uses explicit comparisons rather than subtraction, because a difference
/// of two ints can overflow (e.g. a large negative color against a large
/// positive one) and report the wrong order.
extern "C" int compareAmpiSplitKey(const void *a_, const void *b_) {
	const ampiSplitKey *a=(const ampiSplitKey *)a_;
	const ampiSplitKey *b=(const ampiSplitKey *)b_;
	if (a->color!=b->color) return (a->color<b->color)?-1:1;
	if (a->key!=b->key) return (a->key<b->key)?-1:1;
	if (a->rank!=b->rank) return (a->rank<b->rank)?-1:1;
	return 0;
}

/// Reduction target for ampi::split.  msg holds one ampiSplitKey per rank
/// of this communicator.  The keys are sorted (color, key, old rank); for
/// each distinct color a new, initially empty ampi array bound to the
/// parent array is created, and one element per member is inserted at that
/// member's array index.  The new communicator number is the largest
/// nextSplitComm contributed.  The message is deleted before returning.
void ampi::splitPhase1(CkReductionMsg *msg)
{
	//Order the keys, which orders the ranks properly:
	int nKeys=msg->getSize()/sizeof(ampiSplitKey);
	ampiSplitKey *keys=(ampiSplitKey *)msg->getData();
	if (nKeys!=getSize()) CkAbort("ampi::splitReduce expected a split contribution from every rank!");
	qsort(keys,nKeys,sizeof(ampiSplitKey),compareAmpiSplitKey);

	//Agree on the communicator number: the maximum any rank proposed
	MPI_Comm newComm = -1;
	for(int i=0;i<nKeys;i++)
		if(keys[i].nextSplitComm>newComm)
			newComm = keys[i].nextSplitComm;

	//Loop over the sorted keys, which gives us the new arrays:
	int lastColor=keys[0].color-1; //The color we're building an array for
	                               // (starts different from the first color)
	CProxy_ampi lastAmpi; //The array for lastColor
	int lastRoot=0; //Position in keys[] of the new rank 0 process for latest color
	ampiCommStruct lastComm; //Communicator info. for latest color
	for (int c=0;c<nKeys;c++) {
		if (keys[c].color!=lastColor)
		{ //Hit a new color-- need to build a new communicator and array
			lastColor=keys[c].color;
			lastRoot=c;
			CkArrayOptions opts;
        		opts.bindTo(parentProxy);
			opts.setNumInitial(0);
			CkArrayID unusedAID; ampiCommStruct unusedComm;
			lastAmpi=CProxy_ampi::ckNew(unusedAID,unusedComm,opts);
			lastAmpi.doneInserting(); //<- Meaning, I need to do my own creation race resolution

			CkPupBasicVec<int> indices; //Maps rank to array indices for new array
			for (int i=c;i<nKeys;i++) {
				if (keys[i].color!=lastColor) break; //Done with this color
				int idx=myComm.getIndexForRank(keys[i].rank);
				indices.push_back(idx);
			}

			//FIXME: create a new communicator for each color, instead of
			// (confusingly) re-using the same MPI_Comm number for each.
			lastComm=ampiCommStruct(newComm,lastAmpi,indices.size(),indices);
		}
		//Keys of one color are contiguous, so the new rank is the offset
		// from the first key of that color
		int newRank=c-lastRoot;
		int newIdx=lastComm.getIndexForRank(newRank);

		//CkPrintf("[%d (%d)] Split (%d,%d) %d insert\n",newIdx,newRank,keys[c].color,keys[c].key,newComm);
		lastAmpi[newIdx].insert(parentProxy,lastComm);
	}

	delete msg;
}

//...newly created array elements register with the parent, which calls:
/// Stores a copy of s in the split-communicator table, growing the table
/// if the slot is beyond its current size, then resumes the thread
/// suspended in ampi::split.
void ampiParent::splitChildRegister(const ampiCommStruct &s) {
	const int slot=s.getComm()-MPI_COMM_FIRST_SPLIT;
	if (splitComm.size()<=slot) {
		splitComm.resize(slot+1);
	}
	splitComm[slot]=new ampiCommStruct(s);
	thread->resume(); //Matches suspend at end of ampi::split
}

//-----------------create communicator from group--------------
// The procedure is like that of comm_split very much,
// so the code is shamelessly copied from above
//   1. reduction to make sure all members have called
//   2. the root in the old communicator create the new array
//   3. ampiParent::register is called to register new array as new comm

/// Pairs a group communicator number with the group's member list.
/// NOTE(review): not referenced by the code visible in this part of the
/// file -- confirm whether it is still used.
class vecStruct {
public:
  int nextgroup; // -1 when default-constructed
  groupStruct vec;
  vecStruct():nextgroup(-1){}
  vecStruct(int nextgroup_, groupStruct vec_)
    : nextgroup(nextgroup_), vec(vec_) { }
};

void ampi::commCreate(const groupStruct vec,MPI_Comm* newcomm){
  int rootIdx=vec[0];
  tmpVec = vec;
  CkCallback cb(CkIndex_ampi::commCreatePhase1(NULL),CkArrayIndex1D(rootIdx),myComm.getProxy());
  MPI_Comm nextgroup = parent->getNextGroup();
  contribute(sizeof(nextgroup), &nextgroup,CkReduction::max_int,cb);

  if(getPosOp(thisIndex,vec)>=0){
    thread->suspend(); //Resumed by ampiParent::groupChildRegister
    MPI_Comm retcomm = parent->getNextGroup()-1;
    *newcomm = retcomm;
  }else{
    *newcomm = MPI_COMM_NULL;
  }
}

/// Reduction target for commCreate.  msg carries the agreed communicator
/// number.  Creates a new, initially empty ampi array bound to the parent
/// array and inserts one element at each index listed in tmpVec.
/// The message is deleted before returning.
void ampi::commCreatePhase1(CkReductionMsg *msg){
  const MPI_Comm agreedComm = *(int *)msg->getData();

  CkArrayOptions opts;
  opts.bindTo(parentProxy);
  opts.setNumInitial(0);
  CkArrayID unusedAID;
  ampiCommStruct unusedComm;
  CProxy_ampi newAmpi=CProxy_ampi::ckNew(unusedAID,unusedComm,opts);
  newAmpi.doneInserting(); //<- Meaning, I need to do my own creation race resolution

  groupStruct members = tmpVec;
  ampiCommStruct newCommstruct = ampiCommStruct(agreedComm,newAmpi,members.size(),members);
  for(int r=0;r<members.size();r++){
    newAmpi[members[r]].insert(parentProxy,newCommstruct);
  }
  delete msg;
}

/// Stores a copy of s in the group-communicator table, growing the table
/// if the slot is beyond its current size, then resumes the thread
/// suspended in ampi::commCreate.
void ampiParent::groupChildRegister(const ampiCommStruct &s) {
  const int slot=s.getComm()-MPI_COMM_FIRST_GROUP;
  if (groupComm.size()<=slot) {
    groupComm.resize(slot+1);
  }
  groupComm[slot]=new ampiCommStruct(s);
  thread->resume(); //Matches suspend in ampi::commCreate
}

/* Virtual topology communicator creation */

void ampi::cartCreate(const groupStruct vec,MPI_Comm* newcomm){
  int rootIdx=vec[0];
  tmpVec = vec;
  CkCallback cb(CkIndex_ampi::cartCreatePhase1(NULL),CkArrayIndex1D(rootIdx),myComm.getProxy());

  MPI_Comm nextcart = parent->getNextCart();
  contribute(sizeof(nextcart), &nextcart,CkReduction::max_int,cb);
  
  if(getPosOp(thisIndex,vec)>=0){
    thread->suspend(); //Resumed by ampiParent::cartChildRegister
     MPI_Comm retcomm = parent->getNextCart()-1;
     *newcomm = retcomm;
  }else
    *newcomm = MPI_COMM_NULL;
}

/// Reduction target for cartCreate.  msg carries the agreed communicator
/// number.  Creates a new, initially empty ampi array bound to the parent
/// array and inserts one element at each index listed in tmpVec.
/// The message is deleted before returning.
void ampi::cartCreatePhase1(CkReductionMsg *msg){
  const MPI_Comm agreedComm = *(int *)msg->getData();

  CkArrayOptions opts;
  opts.bindTo(parentProxy);
  opts.setNumInitial(0);
  CkArrayID unusedAID;
  ampiCommStruct unusedComm;
  CProxy_ampi newAmpi=CProxy_ampi::ckNew(unusedAID,unusedComm,opts);
  newAmpi.doneInserting(); //<- Meaning, I need to do my own creation race resolution

  groupStruct members = tmpVec;
  ampiCommStruct newCommstruct = ampiCommStruct(agreedComm,newAmpi,members.size(),members);
  for(int r=0;r<members.size();r++){
    newAmpi[members[r]].insert(parentProxy,newCommstruct);
  }
  delete msg;
}

/// Stores a copy of s in the cartesian-communicator table, growing the
/// table (and setting its length) if the slot is beyond its current size,
/// then resumes the thread suspended while creating the communicator.
void ampiParent::cartChildRegister(const ampiCommStruct &s) {
  const int slot=s.getComm()-MPI_COMM_FIRST_CART;
  if (cartComm.size()<=slot) {
    const int wanted=slot+1;
    cartComm.resize(wanted);
    cartComm.length()=wanted;
  }
  cartComm[slot]=new ampiCommStruct(s);
  thread->resume(); //Matches suspend at end of ampi::cartCreate
}

void ampi::graphCreate(const groupStruct vec,MPI_Comm* newcomm){
  int rootIdx=vec[0];
  tmpVec = vec;
  CkCallback cb(CkIndex_ampi::graphCreatePhase1(NULL),CkArrayIndex1D(rootIdx),
		myComm.getProxy());
  MPI_Comm nextgraph = parent->getNextGraph();
  contribute(sizeof(nextgraph), &nextgraph,CkReduction::max_int,cb);
  
  if(getPosOp(thisIndex,vec)>=0){
    thread->suspend(); //Resumed by ampiParent::graphChildRegister
    MPI_Comm retcomm = parent->getNextGraph()-1;
    *newcomm = retcomm;
  }else
    *newcomm = MPI_COMM_NULL;
}

/// Reduction target for graphCreate.  msg carries the agreed communicator
/// number.  Creates a new, initially empty ampi array bound to the parent
/// array and inserts one element at each index listed in tmpVec.
/// The message is deleted before returning.
void ampi::graphCreatePhase1(CkReductionMsg *msg){
  const MPI_Comm agreedComm = *(int *)msg->getData();

  CkArrayOptions opts;
  opts.bindTo(parentProxy);
  opts.setNumInitial(0);
  CkArrayID unusedAID;
  ampiCommStruct unusedComm;
  CProxy_ampi newAmpi=CProxy_ampi::ckNew(unusedAID,unusedComm,opts);
  newAmpi.doneInserting(); //<- Meaning, I need to do my own creation race resolution

  groupStruct members = tmpVec;
  ampiCommStruct newCommstruct = ampiCommStruct(agreedComm,newAmpi,members.size(),members);
  for(int r=0;r<members.size();r++){
    newAmpi[members[r]].insert(parentProxy,newCommstruct);
  }
  delete msg;
}

/// Stores a copy of s in the graph-communicator table, growing the table
/// (and setting its length) if the slot is beyond its current size, then
/// resumes the thread suspended while creating the communicator.
void ampiParent::graphChildRegister(const ampiCommStruct &s) {
  const int slot=s.getComm()-MPI_COMM_FIRST_GRAPH;
  if (graphComm.size()<=slot) {
    const int wanted=slot+1;
    graphComm.resize(wanted);
    graphComm.length()=wanted;
  }
  graphComm[slot]=new ampiCommStruct(s);
  thread->resume(); //Matches suspend at end of ampi::graphCreate
}


//------------------------ communication -----------------------
/// Maps a universe communicator number (anything above MPI_COMM_WORLD) to
/// the recorded communicator for that world.  Aborts when the number is
/// not above MPI_COMM_WORLD or refers to a world that has not been listed.
const ampiCommStruct &universeComm2proxy(MPI_Comm universeNo)
{
  if (universeNo<=MPI_COMM_WORLD)
    CkAbort("Bad communicator passed to universeComm2proxy");

  const int worldDex=universeNo-MPI_COMM_WORLD-1;
  if (worldDex>=mpi_nworlds)
    CkAbort("Bad world communicator passed to universeComm2proxy");
  return mpi_worlds[worldDex].comm;
}

/// Entry point for every incoming message.  Sequenced messages (seq != -1)
/// go through the per-source reordering queue and are handed to inorder()
/// as they become deliverable; unsequenced ones are delivered at once.
/// Finally the thread is resumed if it is blocked waiting for a message.
void
ampi::generic(AmpiMsg* msg)
{
MSG_ORDER_DEBUG(
  CkPrintf("AMPI Rank %d arrival: tag=%d, src=%d, comm=%d  (from %d, seq %d)\n",
  	getRank(),msg->tag,msg->srcRank,msg->comm, msg->srcIdx, msg->seq);
)
  if(msg->seq == -1) {
    //Cross-world or system messages are unordered
    inorder(msg);
  } else {
    const int srcIdx = msg->srcIdx;
    oorder[srcIdx].put(msg->seq, msg);
    //Drain everything from this source that is now in sequence
    for (AmpiMsg *ready=oorder[srcIdx].get(); ready!=0; ready=oorder[srcIdx].get()) {
      inorder(ready);
    }
  }

  if(resumeOnRecv){
    thread->resume();
  }
}

/// Files an in-sequence message in the message table under the key
/// (tag, source rank, communicator) so recv/probe can match it.
void
ampi::inorder(AmpiMsg* msg)
{
MSG_ORDER_DEBUG(
  CkPrintf("AMPI Rank %d inorder: tag=%d, src=%d, comm=%d  (from %d, seq %d)\n",
  	getRank(),msg->tag,msg->srcRank,msg->comm, msg->srcIdx, msg->seq);
)
  int matchKey[3];
  matchKey[0] = msg->tag;
  matchKey[1] = msg->srcRank;
  matchKey[2] = msg->comm;
  CmmPut(msgs, 3, matchKey, msg);
}

/// Builds an outgoing message holding `count` items of datatype `type`
/// serialized from buf.  A per-destination sequence number is assigned
/// (and advanced) only when destIdx is non-negative, destcomm is at most
/// MPI_COMM_WORLD and the tag is at most MPI_TAG_UB; otherwise the
/// message is marked unordered with seq -1.
AmpiMsg *ampi::makeAmpiMsg(int destIdx,
	int t,int sRank,const void *buf,int count,int type,MPI_Comm destcomm)
{
  CkDDT_DataType *ddt = getDDT()->getType(type);
  int len = ddt->getSize(count);
  int sIdx=thisIndex;

  //Not cross-module: set seqno
  const bool sequenced = (destIdx>=0 && destcomm<=MPI_COMM_WORLD && t<=MPI_TAG_UB);
  int seq;
  if (sequenced)
    seq = nextseq[destIdx]++;
  else
    seq = -1;

  AmpiMsg *msg = new (&len, 0) AmpiMsg(seq, t, sIdx, sRank, len, destcomm);
  ddt->serialize((char*)buf, (char*)msg->data, count, 1);
  return msg;
}

/// Sends `count` items of datatype `type` from buf to `rank` of destcomm,
/// with tag t and source rank sRank.  Resolves destcomm to its array proxy
/// and forwards everything to delesend().
void
ampi::send(int t, int sRank, const void* buf, int count, int type,  int rank, MPI_Comm destcomm)
{
  const ampiCommStruct &dest=comm2proxy(destcomm);
  delesend(t,sRank,buf,count,type,rank,destcomm,dest.getProxy());
}

void
ampi::sendraw(int t, int sRank, void* buf, int len, CkArrayID aid, int idx)
{
  AmpiMsg *msg = new (&len, 0) AmpiMsg(-1, t, -1, sRank, len, MPI_COMM_WORLD);
  memcpy(msg->data, buf, len);
  CProxy_ampi pa(aid);
  pa[idx].generic(msg);
}

void
ampi::delesend(int t, int sRank, const void* buf, int count, int type,  int rank, MPI_Comm destcomm, CProxy_ampi arrproxy)
{
  const ampiCommStruct &dest=comm2proxy(destcomm);
  int destIdx = dest.getIndexForRank(rank);
MSG_ORDER_DEBUG(
  CkPrintf("AMPI Rank %d send: tag=%d, src=%d, comm=%d (to %d)\n",
  	getRank(),t,sRank,destcomm,destIdx);
)
  arrproxy[destIdx].generic(makeAmpiMsg(destIdx,t,sRank,buf,count,type,destcomm));
  
#ifndef CMK_OPTIMIZE
  int size=0;
  MPI_Type_size(type,&size);
  _LOG_E_AMPI_MSG_SEND(t,destIdx,count,size)
#endif
}

void
ampi::recv(int t, int s, void* buf, int count, int type, int comm, int *sts)
{
  _LOG_E_END_AMPI_PROCESSING(thisIndex)

  int tags[3];
  AmpiMsg *msg = 0;
  CkDDT_DataType *ddt = getDDT()->getType(type);
  int len = ddt->getSize(count);
MSG_ORDER_DEBUG(
  CkPrintf("AMPI Rank %d blocking recv: tag=%d, src=%d, comm=%d\n",getRank(),t,s,comm);
)
  resumeOnRecv=true;
  while(1) {
    tags[0] = t; tags[1] = s; tags[2] = comm;
    msg = (AmpiMsg *) CmmGet(msgs, 3, tags, sts);
    if (msg) break;
    thread->suspend();
  }
  resumeOnRecv=false;
  if(sts)
    ((MPI_Status*)sts)->MPI_LENGTH = msg->length;
  if (msg->length > len)
  { /* Received more data than we were expecting */
    char einfo[1024];
    sprintf(einfo, "FATAL ERROR in rank %d MPI_Recv (tag=%d, source=%d)\n"
    	"  Expecting only %d bytes (%d items of type %d), \n"
	"  but received %d bytes from rank %d\nAMPI> MPI_Send was longer than matching MPI_Recv.",
            thisIndex,t,s,
	    len, count, type,
	    msg->length, msg->srcRank);
    CkAbort(einfo);
  }
  ddt->serialize((char*)buf, (char*)msg->data, msg->length/(ddt->getSize(1)), (-1));
  delete msg;
  
  _LOG_E_BEGIN_AMPI_PROCESSING(thisIndex,s,count)
}

void
ampi::probe(int t, int s, int comm, int *sts)
{
  int tags[3];
  AmpiMsg *msg = 0;
  resumeOnRecv=true;
  while(1) {
    tags[0] = t; tags[1] = s; tags[2] = comm;
    msg = (AmpiMsg *) CmmProbe(msgs, 3, tags, sts);
    if (msg) break;
    thread->suspend();
  }
  resumeOnRecv=false;
  if(sts)
    ((MPI_Status*)sts)->MPI_LENGTH = msg->length;
}

int
ampi::iprobe(int t, int s, int comm, int *sts)
{
  int tags[3];
  AmpiMsg *msg = 0;
  tags[0] = t; tags[1] = s; tags[2] = comm;
  msg = (AmpiMsg *) CmmProbe(msgs, 3, tags, sts);
  if (msg) {
    if(sts)
      ((MPI_Status*)sts)->MPI_LENGTH = msg->length;
    return 1;
  }
  thread->schedule();
  return 0;
}


const int MPI_BCAST_COMM=MPI_COMM_WORLD+1000;
void
ampi::bcast(int root, void* buf, int count, int type,MPI_Comm destcomm)
{
  const ampiCommStruct &dest=comm2proxy(destcomm);
  int rootIdx=dest.getIndexForRank(root);
  if(rootIdx==thisIndex) {
    /* Broadcast my message to the array proxy */
    dest.getProxy().generic(makeAmpiMsg(-1,MPI_BCAST_TAG,0, buf,count,type, MPI_BCAST_COMM));
  }
  recv(MPI_BCAST_TAG,0, buf,count,type, MPI_BCAST_COMM);
  nbcasts++;
}

void
ampi::bcastraw(void* buf, int len, CkArrayID aid)
{
  AmpiMsg *msg = new (&len, 0) AmpiMsg(-1, MPI_BCAST_TAG, -1, 0, len, 0);
  memcpy(msg->data, buf, len);
  CProxy_ampi pa(aid);
  pa.generic(msg);
}

/// Null copy function: ignores its attribute arguments, sets *flag to 0
/// and returns MPI_SUCCESS.
int MPI_null_copy_fn (MPI_Comm comm, int keyval, void *extra_state,
			void *attr_in, void *attr_out, int *flag){
  *flag = 0;
  return MPI_SUCCESS;
}
/// Duplicating copy function: stores the attr_in pointer into the slot
/// that attr_out points to, sets *flag to 1 and returns MPI_SUCCESS.
int MPI_dup_fn(MPI_Comm comm, int keyval, void *extra_state,
			void *attr_in, void *attr_out, int *flag){
  void **slot = (void **)attr_out;
  *slot = attr_in;
  *flag = 1;
  return MPI_SUCCESS;
}
/// Null delete function: does nothing and returns MPI_SUCCESS.
int MPI_null_delete_fn (MPI_Comm comm, int keyval, void *attr, void *extra_state ){
  return MPI_SUCCESS;
}

//------------------ External Interface -----------------

/// Return the ampiParent stored in the ampiPtr thread-private variable.
/// Unless CMK_OPTIMIZE is defined, aborts when it is still NULL.
static ampiParent *getAmpiParent(void) {
  ampiParent *parent = CtvAccess(ampiPtr);
#ifndef CMK_OPTIMIZE
  if (parent==NULL) {
    CkAbort("Cannot call MPI routines before AMPI is initialized.\n");
  }
#endif
  return parent;
}

/// Look up the ampi object serving communicator `comm` via the parent.
/// Unless CMK_OPTIMIZE is defined, aborts when the lookup yields NULL.
ampi *getAmpiInstance(MPI_Comm comm) {
  ampi *instance=getAmpiParent()->comm2ampi(comm);
#ifndef CMK_OPTIMIZE
  if (instance==NULL) {
    CkAbort("AMPI's getAmpiInstance> null pointer\n");
  }
#endif
  return instance;
}

/// Return the request list owned by the current ampiParent.
static AmpiRequestList *getReqs(void) {
  ampiParent *parent = getAmpiParent();
  return &(parent->ampiReqs);
}

/// AMPI extension: forwards to TCHARM_Migrate().
CDECL void MPI_Migrate(void)
{
  AMPIAPI("MPI_Migrate");

  TCHARM_Migrate();
}

/// Initialize AMPI. Aborts if nodeinit_has_been_called is not set.
/// Uses *p_argv when p_argv is non-null, otherwise CkGetArgv(); if
/// p_argc is non-null it is updated with CmiGetArgc() of that vector.
CDECL int MPI_Init(int *p_argc, char*** p_argv)
{
  if (!nodeinit_has_been_called)
  { /* Charm hasn't been started yet! */
    CkAbort("AMPI_Init> Charm is not initialized!");
    return 0;
  }

  AMPIAPI("MPI_Init");
  char **argv = p_argv ? *p_argv : CkGetArgv();
  ampiInit(argv);
  if (p_argc) {
    *p_argc=CmiGetArgc(argv);
  }
  return 0;
}

/// Store in *isInit the value of ampiInitDone, or the (false) value of
/// nodeinit_has_been_called when node init has not run yet.
CDECL int MPI_Initialized(int *isInit)
{
  AMPIAPI("MPI_Initialized");
  if (!nodeinit_has_been_called) {
    *isInit=nodeinit_has_been_called;
    return 0;
  }
  *isInit=CtvAccess(ampiInitDone);
  return 0;
}

/// Store the caller's rank in `comm` into *rank (always 0 for MPI_COMM_SELF).
CDECL int MPI_Comm_rank(MPI_Comm comm, int *rank)
{
  AMPIAPI("MPI_Comm_rank");
  if (comm==MPI_COMM_SELF) {
    *rank = 0;
  } else {
    *rank = getAmpiInstance(comm)->getRank();
  }
  return 0;
}

/// Store the number of ranks in `comm` into *size (always 1 for MPI_COMM_SELF).
CDECL
int MPI_Comm_size(MPI_Comm comm, int *size)
{
  AMPIAPI("MPI_Comm_size");
  if (comm==MPI_COMM_SELF) {
    *size = 1;
  } else {
    *size = getAmpiInstance(comm)->getSize();
  }
  return 0;
}

/// Finish this rank by calling TCHARM_Done(); the exit code is ignored.
CDECL void MPI_Exit(int /*exitCode*/)
{
  AMPIAPI("MPI_Exit");

  TCHARM_Done();
}
/// Fortran binding for MPI_Exit: dereferences the exit code and forwards it.
FDECL void FTN_NAME(MPI_EXIT,mpi_exit)(int *exitCode)
{
  const int code = *exitCode;
  MPI_Exit(code);
}

/// Finalize AMPI; implemented as MPI_Exit(0).
CDECL
int MPI_Finalize(void)
{
  AMPIAPI("MPI_Finalize");

  MPI_Exit(0);
  return 0;
}

/// Shared send path: sends `count` items of `type` to rank `dest` of
/// `comm`, stamping the message with the caller's rank in `comm`
/// (0 for MPI_COMM_SELF). Always returns 0.
static int common_send(void *msg, int count, MPI_Datatype type, int dest,
                        int tag, MPI_Comm comm)
{
  ampi *ptr = getAmpiInstance(comm);
  const int myRank = ptr->getRank();
  /* our rank in MPI_COMM_SELF is zero*/
  const int srcRank = (comm==MPI_COMM_SELF) ? 0 : myRank;
  ptr->send(tag, srcRank, msg, count, type, dest, comm);
  return 0;
}

/// Send entry point; delegates to common_send() and returns its result.
CDECL
int MPI_Send(void *msg, int count, MPI_Datatype type, int dest,
                        int tag, MPI_Comm comm)
{
  AMPIAPI("MPI_Send");
  const int rc = common_send(msg,count,type,dest,tag,comm);
  return rc;
}

/// Blocking receive: forwards to ampi::recv() on the instance serving
/// `comm`, passing `status` through as the status buffer.
CDECL
int MPI_Recv(void *msg, int count, MPI_Datatype type, int src, int tag,
              MPI_Comm comm, MPI_Status *status)
{
  AMPIAPI("MPI_Recv");
  ampi *instance = getAmpiInstance(comm);
  int *rawStatus = (int*) status;
  instance->recv(tag,src,msg,count,type, comm, rawStatus);
  return 0;
}

/// Blocking probe: forwards to ampi::probe() on the instance serving `comm`.
CDECL
int MPI_Probe(int src, int tag, MPI_Comm comm, MPI_Status *status)
{
  AMPIAPI("MPI_Probe");
  ampi *instance = getAmpiInstance(comm);
  int *rawStatus = (int*) status;
  instance->probe(tag,src, comm, rawStatus);
  return 0;
}

/// Non-blocking probe: *flag receives the result of ampi::iprobe()
/// (1 if a matching message is present, 0 otherwise).
CDECL
int MPI_Iprobe(int src,int tag,MPI_Comm comm,int *flag,MPI_Status *status)
{
  AMPIAPI("MPI_Iprobe");
  ampi *instance = getAmpiInstance(comm);
  int *rawStatus = (int*) status;
  *flag = instance->iprobe(tag,src,comm,rawStatus);
  return 0;
}

/// Send followed by a blocking receive on the same communicator.
/// Returns the send's error code when it is non-zero, otherwise the
/// receive's.
CDECL
int MPI_Sendrecv(void *sbuf, int scount, int stype, int dest,
                  int stag, void *rbuf, int rcount, int rtype,
                  int src, int rtag, MPI_Comm comm, MPI_Status *sts)
{
  AMPIAPI("MPI_Sendrecv");
  const int sendErr=MPI_Send(sbuf,scount,stype,dest,stag,comm);
  const int recvErr=MPI_Recv(rbuf,rcount,rtype,src,rtag,comm,sts);
  return sendErr ? sendErr : recvErr;
}

/// Send-receive using one buffer: calls MPI_Sendrecv with `buf`, `count`
/// and `datatype` on both the send and the receive side.
CDECL
int MPI_Sendrecv_replace(void* buf, int count, MPI_Datatype datatype,
                         int dest, int sendtag, int source, int recvtag,
                         MPI_Comm comm, MPI_Status *status)
{
  AMPIAPI("MPI_Sendrecv_replace");
  const int rc = MPI_Sendrecv(buf, count, datatype, dest, sendtag,
                              buf, count, datatype, source, recvtag,
                              comm, status);
  return rc;
}


/// Barrier over `comm`, implemented as a zero-element MPI_Allreduce.
CDECL
int MPI_Barrier(MPI_Comm comm)
{
  AMPIAPI("MPI_Barrier");
  //HACK: an empty collective doubles as the barrier.
  void *noData = NULL;
  MPI_Allreduce(noData,noData,0,MPI_INT,MPI_SUM,comm);
  return 0;
}

/// Broadcast from `root` over `comm`; nothing to do for MPI_COMM_SELF.
CDECL
int MPI_Bcast(void *buf, int count, MPI_Datatype type, int root,
                         MPI_Comm comm)
{
  AMPIAPI("MPI_Bcast");
  if(comm!=MPI_COMM_SELF) {
    ampi *instance = getAmpiInstance(comm);
    instance->bcast(root, buf, count, type,comm);
  }
  return 0;
}

/// Map an (MPI datatype, MPI operation) pair onto the Charm++ reducer
/// that implements it.
/// Every supported pair returns directly from the nested switch. Any
/// other pair falls through to the single error site at the bottom,
/// which reports it on ckerr and calls CmiAbort.
static CkReduction::reducerType
getReductionType(int type, int op)
{
  switch(op) {
    case MPI_MAX :
      switch(type) {
        case MPI_FLOAT : return CkReduction::max_float;
        case MPI_INT : return CkReduction::max_int;
        case MPI_DOUBLE : return CkReduction::max_double;
        default: break;
      }
      break;
    case MPI_MIN :
      switch(type) {
        case MPI_FLOAT : return CkReduction::min_float;
        case MPI_INT : return CkReduction::min_int;
        case MPI_DOUBLE : return CkReduction::min_double;
        default: break;
      }
      break;
    case MPI_SUM :
      switch(type) {
        case MPI_FLOAT : return CkReduction::sum_float;
        case MPI_INT : return CkReduction::sum_int;
        case MPI_DOUBLE : return CkReduction::sum_double;
        default: break;
      }
      break;
    case MPI_PROD :
      switch(type) {
        case MPI_FLOAT : return CkReduction::product_float;
        case MPI_INT : return CkReduction::product_int;
        case MPI_DOUBLE : return CkReduction::product_double;
        default: break;
      }
      break;
    case MPI_MAXLOC:
      switch(type) {
        case MPI_FLOAT_INT : return maxLocfloatintReducer;
        case MPI_DOUBLE_INT : return maxLocdoubleintReducer;
        case MPI_LONG_INT : return maxLoclongintReducer;
        case MPI_2INT : return maxLocintintReducer;
        case MPI_SHORT_INT : return maxLocshortintReducer;
        case MPI_LONG_DOUBLE_INT : return maxLoclongdoubleintReducer;
        case MPI_2FLOAT : return maxLocfloatfloatReducer;
        case MPI_2DOUBLE : return maxLocdoubledoubleReducer;
        default: break;
      }
      break;
    case MPI_MINLOC:
      switch(type) {
        case MPI_FLOAT_INT : return minLocfloatintReducer;
        case MPI_DOUBLE_INT : return minLocdoubleintReducer;
        case MPI_LONG_INT : return minLoclongintReducer;
        case MPI_2INT : return minLocintintReducer;
        case MPI_SHORT_INT : return minLocshortintReducer;
        case MPI_LONG_DOUBLE_INT : return minLoclongdoubleintReducer;
        case MPI_2FLOAT : return minLocfloatfloatReducer;
        case MPI_2DOUBLE : return minLocdoubledoubleReducer;
        default: break;
      }
      break;
    case MPI_BAND :
      switch(type) {
        case MPI_INT : return BANDintReducer;
        case MPI_BYTE : return BANDcharReducer;
        default: break;
      }
      break;
    case MPI_BOR :
      switch(type) {
        case MPI_INT : return BORintReducer;
        case MPI_BYTE : return BORcharReducer;
        default: break;
      }
      break;
    case MPI_BXOR :
      switch(type) {
        case MPI_INT : return BXORintReducer;
        case MPI_BYTE : return BXORcharReducer;
        default: break;
      }
      break;
    case MPI_LAND :
      switch(type) {
        case MPI_INT :
        case MPI_LOGICAL : return CkReduction::logical_and;
        default: break;
      }
      break;
    case MPI_LOR :
      switch(type) {
        case MPI_INT :
        case MPI_LOGICAL : return CkReduction::logical_or;
        default: break;
      }
      break;
    case MPI_LXOR :
      switch(type) {
        case MPI_INT :
        case MPI_LOGICAL : return LXORReducer;
        default: break;
      }
      break;
    case MPI_CONCAT :
      return CkReduction::concat;
    default:
      break;
  }

  // Only unsupported (type, op) combinations reach this point.
  ckerr << "Type " << type << " with Op " << op << " not supported." << endl;
  CmiAbort("exiting");
  CkReduction::reducerType unsupported; // never assigned: only returned after the abort call
  return unsupported;
}

/// Source rank and communicator stamped on (and matched against) the
/// message that carries a reduction result.
const int MPI_REDUCE_SOURCE=0;
const int MPI_REDUCE_COMM=MPI_COMM_WORLD;

/// This routine is called with the results of a Reduce or AllReduce.
/// It forwards the reduced bytes to this same array element as an
/// MPI_REDUCE_TAG message, then frees the reduction message.
void ampi::reduceResult(CkReductionMsg *msg)
{
  void *payload = msg->getData();
  int nBytes = msg->getSize();
  ampi::sendraw(MPI_REDUCE_TAG, MPI_REDUCE_SOURCE, payload, nBytes,
             thisArrayID,thisIndex);
  delete msg;
}

/// Build a reduction message for (type, op) and serialize `count` items
/// from `inbuf` into it using `ddt`.
static CkReductionMsg *makeRednMsg(CkDDT_DataType *ddt,const void *inbuf,int count,int type,MPI_Op op)
{
  CkReduction::reducerType reducer = getReductionType(type,op);
  int nBytes = ddt->getSize(count);
  CkReductionMsg *redn = CkReductionMsg::buildNew(nBytes,NULL,reducer);
  char *dst = (char*)redn->getData();
  ddt->serialize((char*)inbuf, dst, count, 1);
  return redn;
}

/// Copy `count` items of MPI datatype `type` from inbuf to outbuf.
/// ddts have no "copy" operation, so the data is serialized into a
/// scratch buffer and then deserialized into the output.
/// Always returns MPI_SUCCESS.
static int copyDatatype(MPI_Comm comm,MPI_Datatype type,int count,const void *inbuf,void *outbuf) {
  ampi *instance = getAmpiInstance(comm);
  CkDDT_DataType *ddt = instance->getDDT()->getType(type);
  int nBytes = ddt->getSize(count);
  char *scratch = new char[nBytes];
  ddt->serialize((char*)inbuf, scratch, count, 1);     // inbuf -> scratch
  ddt->serialize((char*)outbuf, scratch, count, -1);   // scratch -> outbuf
  delete [] scratch;   // release the scratch buffer

  return MPI_SUCCESS;
}

/// Reduce `count` items of `type` with `op` onto rank `root` of `comm`.
/// MPI_COMM_SELF degenerates to a local copy. Otherwise every rank
/// contributes to a reduction, and the root blocks in recv() until the
/// result arrives.
CDECL
int MPI_Reduce(void *inbuf, void *outbuf, int count, int type, MPI_Op op,
               int root, MPI_Comm comm)
{
  AMPIAPI("MPI_Reduce");
  if(comm==MPI_COMM_SELF) {
    return copyDatatype(comm,type,count,inbuf,outbuf);
  }
  ampi *instance = getAmpiInstance(comm);
  CkReductionMsg *contribution=makeRednMsg(instance->getDDT()->getType(type),inbuf,count,type,op);
  int rootIdx=instance->comm2proxy(comm).getIndexForRank(root);
  CkCallback reduceCB(CkIndex_ampi::reduceResult(0),CkArrayIndex1D(rootIdx),instance->getProxy(),true);
  contribution->setCallback(reduceCB);
  instance->contribute(contribution);

  if (instance->thisIndex != rootIdx) {
    return 0;
  }

  /*HACK: Use recv() to block until reduction data comes back*/
  // A concatenation returns one block of `count` items per rank.
  int recvCount = (op==MPI_CONCAT) ? count*instance->getSize() : count;
  instance->recv(MPI_REDUCE_TAG, MPI_REDUCE_SOURCE, outbuf, recvCount, type, MPI_REDUCE_COMM);
  return 0;
}

/// Reduce `count` items of `type` with `op` and give every rank the
/// result. MPI_COMM_SELF degenerates to a local copy. Otherwise each
/// rank contributes, then blocks in recv() until the result arrives.
CDECL
int MPI_Allreduce(void *inbuf, void *outbuf, int count, int type,
                  MPI_Op op, MPI_Comm comm)
{
  AMPIAPI("MPI_Allreduce");
  if(comm==MPI_COMM_SELF) {
    return copyDatatype(comm,type,count,inbuf,outbuf);
  }
  ampi *instance = getAmpiInstance(comm);
  CkReductionMsg *contribution=makeRednMsg(instance->getDDT()->getType(type),inbuf,count,type,op);
  CkCallback allreduceCB(CkIndex_ampi::reduceResult(0),instance->getProxy());
  contribution->setCallback(allreduceCB);
  instance->contribute(contribution);

  /*HACK: Use recv() to block until the reduction data comes back*/
  instance->recv(MPI_REDUCE_TAG, MPI_REDUCE_SOURCE, outbuf, count, type, MPI_REDUCE_COMM);
  return 0;
}

/// Non-blocking all-reduce: contributes to the reduction and returns a
/// request that completes once the result has arrived.
///
/// For MPI_COMM_SELF the data is copied locally and *request is set to
/// MPI_REQUEST_NULL, which MPI_Wait/MPI_Test treat as already complete.
/// It used to be left unset on that path.
CDECL
int MPI_Iallreduce(void *inbuf, void *outbuf, int count, int type,
                   MPI_Op op, MPI_Comm comm, MPI_Request* request)
{
  AMPIAPI("MPI_Iallreduce");
  ampi *ptr = getAmpiInstance(comm);
  if(comm==MPI_COMM_SELF) {
    // Just do a local copy: no need to do a reduction.
    // Nothing is pending, so hand back the null request.
    *request = MPI_REQUEST_NULL;
    return copyDatatype(comm,type,count,inbuf,outbuf);
  }
  CkReductionMsg *msg=makeRednMsg(ptr->getDDT()->getType(type),inbuf,count,type,op);
  CkCallback allreduceCB(CkIndex_ampi::reduceResult(0),ptr->getProxy());
  msg->setCallback(allreduceCB);
  ptr->contribute(msg);

  // Post an IReq instead of calling recv(), so the call does not block
  // and the caller gets a request to wait on.
  AmpiRequestList* reqs = getReqs();
  IReq *newreq = new IReq(outbuf,count,type,MPI_REDUCE_SOURCE,MPI_REDUCE_TAG,MPI_REDUCE_COMM);
  *request = reqs->insert(newreq);
  return 0;
}

/// Reduce-scatter built from two collectives: an MPI_Reduce of the whole
/// vector onto rank 0, followed by an MPI_Scatterv that gives rank i
/// its recvcounts[i] items.
/// MPI_COMM_SELF degenerates to a local copy of recvcounts[0] items.
CDECL
int MPI_Reduce_scatter(void* sendbuf, void* recvbuf, int *recvcounts,
                       MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
{
  AMPIAPI("MPI_Reduce_scatter");
  if(comm==MPI_COMM_SELF) {
    return copyDatatype(comm,datatype,recvcounts[0],sendbuf,recvbuf);
  }
  ampi *instance = getAmpiInstance(comm);
  int nRanks = instance->getSize();

  //under construction
  // displs[i] is the running total of the counts before rank i.
  int *displs = new int [nRanks];
  int total=0;
  for(int r=0;r<nRanks;r++){
    displs[r] = total;
    total += recvcounts[r];
  }

  int nBytes = instance->getDDT()->getType(datatype)->getSize(total);
  void *reduced = malloc(nBytes);
  MPI_Reduce(sendbuf, reduced, total, datatype, op, 0, comm);
  MPI_Scatterv(reduced, recvcounts, displs, datatype,
               recvbuf, recvcounts[instance->getRank()], datatype, 0, comm);
  free(reduced);
  delete [] displs;
  return 0;
}

CDECL
int MPI_Scan(void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm ){
  AMPIAPI("MPI_Scan");

  ampi *ptr = getAmpiInstance(comm);
  int numvps = ptr->getSize();
  CkDDT_DataType *ddt = ptr->getDDT()->getType(datatype);
  int blklen = ddt->getSize(count);
  void* tmpbuf = malloc(blklen*numvps); // holds P*count*sizeof(datatype)

  MPI_Reduce(sendbuf, tmpbuf, count, datatype, MPI_CONCAT, 0, comm);

  if(ptr->getRank()==0){
    switch(datatype){
    case MPI_FLOAT:
      for(int i=1;i<numvps;i++)
        for(int j=0;j<count;j++)
        switch(op){
	  case MPI_MAX:
	    if(((float *)tmpbuf)[count*(i-1)+j] > ((float *)tmpbuf)[count*i+j]) ((float *)tmpbuf)[count*i+j] = ((float *)tmpbuf)[count*(i-1)+j];
	    break;
	  case MPI_MIN:
	    if(((float *)tmpbuf)[count*(i-1)+j] < ((float *)tmpbuf)[count*i+j]) ((float *)tmpbuf)[count*i+j] = ((float *)tmpbuf)[count*(i-1)+j];
	    break;
	  case MPI_SUM:
	    ((float *)tmpbuf)[count*i+j] += ((float *)tmpbuf)[count*(i-1)+j];
	    break;
	  case MPI_PROD:
	    ((float *)tmpbuf)[count*i+j] *= ((float *)tmpbuf)[count*(i-1)+j];
	    break;
	  default:
            ckerr << "Scan on type " << datatype << " with Op " << op << " not supported." << endl;
	    CmiAbort("MPI_Scan()");
	  }
      break;
    case MPI_INT:
      for(int i=1;i<numvps;i++)
        for(int j=0;j<count;j++)
          switch(op){
	  case MPI_MAX:
	    if(((int *)tmpbuf)[count*(i-1)+j] > ((int *)tmpbuf)[count*i+j]) ((int *)tmpbuf)[count*i+j] = ((int *)tmpbuf)[count*(i-1)+j];
	    break;
	  case MPI_MIN:
	    if(((int *)tmpbuf)[count*(i-1)+j] < ((int *)tmpbuf)[count*i+j]) ((int *)tmpbuf)[count*i+j] = ((int *)tmpbuf)[count*(i-1)+j];
	    break;
	  case MPI_SUM:
	    ((int *)tmpbuf)[count*i+j] += ((int *)tmpbuf)[count*(i-1)+j];
	    break;
	  case MPI_PROD:
	    ((int *)tmpbuf)[count*i+j] *= ((int *)tmpbuf)[count*(i-1)+j];
	    break;
	  default:
            ckerr << "Scan on type " << datatype << " with Op " << op << " not supported." << endl;
	    CmiAbort("MPI_Scan()");
	  }
      break;
    case MPI_DOUBLE:
      for(int i=1;i<numvps;i++)
        for(int j=0;j<count;j++)
          switch(op){
	  case MPI_MAX:
	    if(((double *)tmpbuf)[count*(i-1)+j] > ((double *)tmpbuf)[count*i+j]) ((double *)tmpbuf)[count*i+j] = ((double *)tmpbuf)[count*(i-1)+j];
	    break;
	  case MPI_MIN:
	    if(((double *)tmpbuf)[count*(i-1)+j] < ((double *)tmpbuf)[count*i+j]) ((double *)tmpbuf)[count*i+j] = ((double *)tmpbuf)[count*(i-1)+j];
	    break;
	  case MPI_SUM:
	    ((double *)tmpbuf)[count*i+j] += ((double *)tmpbuf)[count*(i-1)+j];
	    break;
	  case MPI_PROD:
	    ((double *)tmpbuf)[count*i+j] *= ((double *)tmpbuf)[count*(i-1)+j];
	    break;
	  default:
            ckerr << "Scan on type " << datatype << " with Op " << op << " not supported." << endl;
	    CmiAbort("MPI_Scan()");
	  }
      break;
    default:
      ckerr << "Scan on type " << datatype << " with Op " << op << " not supported." << endl;
      CmiAbort("MPI_Scan()");
    }
  }

  MPI_Scatter(tmpbuf, count, datatype, recvbuf, count, datatype, 0, comm);
  free(tmpbuf);
  return 0;
}

/// Return the wall-clock time reported by TCHARM_Wall_timer().
CDECL
double MPI_Wtime(void)
{
  AMPIAPI("MPI_Wtime");
  const double now = TCHARM_Wall_timer();
  return now;
}

/// Return the fixed timer tick value 1e-6.
CDECL
double MPI_Wtick(void)
{
  AMPIAPI("MPI_Wtick");
  const double tick = 1e-6;
  return tick;
}

int PersReq::start(){
  if(sndrcv == 1) { // send request
    ampi *aptr=getAmpiInstance(comm);
    aptr->send(tag, aptr->getRank(), buf, count, type, src, comm);
  }
  return 0;
}

/// Start the request `*request`; aborts if its start() returns -1.
CDECL
int MPI_Start(MPI_Request *request)
{
  AMPIAPI("MPI_Start");
  AmpiRequestList *reqs = getReqs();
  const int rc = (*reqs)[*request]->start();
  if(rc == -1){
    CkAbort("MPI_Start could be used only on persistent communication requests!");
  }
  return 0;
}

/// Start each of the `count` requests in order; aborts on the first one
/// whose start() returns -1.
CDECL
int MPI_Startall(int count, MPI_Request *requests){
  AMPIAPI("MPI_Startall");
  AmpiRequestList *reqs = getReqs();
  for(int k=0;k<count;k++){
    const int rc = (*reqs)[requests[k]]->start();
    if(rc == -1) {
      CkAbort("MPI_Start could be used only on persistent communication requests!");
    }
  }
  return 0;
}

/// Wait on a persistent request: a receive request (sndrcv==2) does its
/// blocking receive here; any other request returns at once.
int PersReq::wait(MPI_Status *sts){
	if(sndrcv != 2) {
		return 0;
	}
	ampi *instance = getAmpiInstance(comm);
	instance->recv(tag, src, buf, count, type, comm, (int*)sts);
	return 0;
}
/// Wait on a non-blocking receive by doing the blocking receive now.
int IReq::wait(MPI_Status *sts){
	ampi *instance = getAmpiInstance(comm);
	instance->recv(tag, src, buf, count, type, comm, (int*)sts);
	return 0;
}
/// Wait on a multi-part request by receiving each of its `count` parts
/// in order; `sts` is overwritten by every receive.
int ATAReq::wait(MPI_Status *sts){
	for(int k=0;k<count;k++){
		ampi *instance = getAmpiInstance(myreqs[k].comm);
		instance->recv(myreqs[k].tag, myreqs[k].src, myreqs[k].buf,
				myreqs[k].count, myreqs[k].type, myreqs[k].comm, (int *)sts);
	}
	return 0;
}

/// Block until `*request` completes, then free it.
/// MPI_REQUEST_NULL is treated as already complete.
CDECL
int MPI_Wait(MPI_Request *request, MPI_Status *sts)
{
  AMPIAPI("MPI_Wait");
  if(*request != MPI_REQUEST_NULL) {
    AmpiRequestList* reqs = getReqs();
    AMPI_DEBUG("MPI_Wait: request=%d, reqs.size=%d, &reqs=%d\n",*request,reqs->size(),reqs);
    (*reqs)[*request]->wait(sts);
    (*reqs)[*request]->free();
  }
  return 0;
}

/// Wait on each of the `count` requests in order, storing the status of
/// request k in sts[k].
CDECL
int MPI_Waitall(int count, MPI_Request *request, MPI_Status *sts)
{
  AMPIAPI("MPI_Waitall");
  int k = 0;
  while(k<count) {
    MPI_Wait(request+k, sts+k);
    k++;
  }
  return 0;
}

/// Repeatedly MPI_Test the requests in order until one reports
/// completion, and store its position in *idx.
/// With count<=0 nothing is tested and *idx becomes MPI_UNDEFINED.
CDECL
int MPI_Waitany(int count, MPI_Request *request, int *idx, MPI_Status *sts)
{
  AMPIAPI("MPI_Waitany");
  int done=0;
  if(count<=0) {
    *idx = MPI_UNDEFINED;
    return 0;
  }
  for(;;){
    for(int k=0;k<count;k++) {
      MPI_Test(&request[k], &done, sts);
      if(done == 1){
        *idx = k;
        return 0;
      }
    }
  }
}

/// Block until at least one of the requests has completed.
/// Each pass MPI_Tests every request in order. The positions of the
/// completed ones go into array_of_indices and their statuses into
/// array_of_statuses, with *outcount giving how many were recorded.
///
/// The loop used to test the `outcount` pointer instead of the count it
/// points to, so it always stopped after one pass, even with nothing
/// completed.
/// With incount<=0 there is nothing to wait for, so *outcount is set to
/// MPI_UNDEFINED and the call returns immediately.
CDECL
int MPI_Waitsome(int incount, MPI_Request *array_of_requests, int *outcount,
                 int *array_of_indices, MPI_Status *array_of_statuses)
{
  AMPIAPI("MPI_Waitsome");
  MPI_Status sts;
  int flag;
  *outcount = 0;
  if(incount <= 0){
    // An empty list could never satisfy the loop below.
    *outcount = MPI_UNDEFINED;
    return 0;
  }
  while(1){
    for(int i=0;i<incount;i++) {
      MPI_Test(&array_of_requests[i], &flag, &sts);
      if(flag == 1){
        array_of_indices[(*outcount)]=i;
        array_of_statuses[(*outcount)++]=sts;
      }
    }
    // Compare the count, not the pointer.
    if(*outcount > 0) break; // alternative: turn around and test [0 ~ AOI[0]-1]
  }
  return 0;
}

/// Test a persistent request: a receive request (sndrcv==2) reports
/// whether a matching message is available; any other request is
/// reported as complete.
CmiBool PersReq::test(MPI_Status *sts){
	if(sndrcv != 2) {
		// send request
		return 1;
	}
	// recv request
	return getAmpiInstance(comm)->iprobe(tag, src, comm, (int*)sts);
}
/// Complete a persistent request after test() reported it as done.
/// Only a receive request (sndrcv==2) has data to collect. A send
/// request is reported complete by test() without any pending message,
/// so receiving for it would block and overwrite the send buffer. This
/// matches the guard in PersReq::wait().
void PersReq::complete(MPI_Status *sts){
	if(sndrcv == 2) {	// recv request
		getAmpiInstance(comm)->recv(tag, src, buf, count, type, comm, (int*)sts);
	}
}

/// Report whether a message matching this receive is available.
CmiBool IReq::test(MPI_Status *sts){
	ampi *instance = getAmpiInstance(comm);
	return instance->iprobe(tag, src, comm, (int*)sts);
}
/// Complete this receive by receiving the message into the request's buffer.
void IReq::complete(MPI_Status *sts){
	ampi *instance = getAmpiInstance(comm);
	instance->recv(tag, src, buf, count, type, comm, (int*)sts);
}

/// Report whether a message is available for every part of this
/// multi-part request. Every part is probed, even after one has come
/// back empty.
CmiBool ATAReq::test(MPI_Status *sts){
	int allReady=1;
	for(int k=0;k<count;k++){
		ampi *instance = getAmpiInstance(myreqs[k].comm);
		// iprobe() yields 0 or 1, so one miss clears the result.
		if(!instance->iprobe(myreqs[k].tag, myreqs[k].src,
					myreqs[k].comm, (int*) sts)) {
			allReady = 0;
		}
	}
	return allReady;
}
/// Complete this multi-part request by receiving each part in order.
void ATAReq::complete(MPI_Status *sts){
	for(int k=0;k<count;k++){
		ampi *instance = getAmpiInstance(myreqs[k].comm);
		instance->recv(myreqs[k].tag, myreqs[k].src, myreqs[k].buf,
				myreqs[k].count, myreqs[k].type, myreqs[k].comm, (int*)sts);
	}
}

/// Test `*request` without blocking. *flag receives the request's
/// test() result; when that is 1 the request is completed and freed.
/// MPI_REQUEST_NULL always tests as complete.
CDECL
int MPI_Test(MPI_Request *request, int *flag, MPI_Status *sts)
{
  AMPIAPI("MPI_Test");
  if(*request==MPI_REQUEST_NULL) {
    *flag = 1;
    return 0;
  }
  AmpiRequestList* reqs = getReqs();
  *flag = (*reqs)[*request]->test(sts);
  if(*flag != 1) {
    return 0;
  }
  (*reqs)[*request]->complete(sts);
  (*reqs)[*request]->free();
  return 0;
}

/// MPI_Test the requests in order and stop at the first that completes,
/// storing its position in *index. If none completes, *index becomes
/// MPI_UNDEFINED and *flag holds the result of the last test (0 when
/// count is 0).
CDECL
int MPI_Testany(int count, MPI_Request *request, int *index, int *flag, MPI_Status *sts){
  AMPIAPI("MPI_Testany");
  *flag=0;
  int k = 0;
  while(k<count)
  {
    MPI_Test(request+k, flag, sts);
    if(*flag==1){
      *index = k;
      return 0;
    }
    k++;
  }
  *index = MPI_UNDEFINED;
  return 0;
}

/// MPI_Test every request in order (status k goes to sts[k]); *flag ends
/// up as the product of the individual test flags.
CDECL
int MPI_Testall(int count, MPI_Request *request, int *flag, MPI_Status *sts)
{
  AMPIAPI("MPI_Testall");
  int oneFlag;
  *flag = 1;
  int k = 0;
  while(k<count)
  {
    MPI_Test(request+k, &oneFlag, sts+k);
    *flag *= oneFlag;
    k++;
  }
  return 0;
}

/// Make a single MPI_Test pass over the requests. The positions of the
/// completed ones go into array_of_indices and their statuses into
/// array_of_statuses, with *outcount giving how many were recorded.
CDECL
int MPI_Testsome(int incount, MPI_Request *array_of_requests, int *outcount,
                 int *array_of_indices, MPI_Status *array_of_statuses)
{
  AMPIAPI("MPI_Testsome");
  MPI_Status oneStatus;
  int done;
  *outcount = 0;
  for(int k=0;k<incount;k++) {
    MPI_Test(&array_of_requests[k], &done, &oneStatus);
    if(done != 1) continue;
    array_of_indices[*outcount]=k;
    array_of_statuses[*outcount]=oneStatus;
    (*outcount)++;
  }
  return 0;
}

/// Free the request `*request`; MPI_REQUEST_NULL is ignored.
CDECL
int MPI_Request_free(MPI_Request *request){
  AMPIAPI("MPI_Request_free");
  if(*request!=MPI_REQUEST_NULL) {
    AmpiRequestList* reqs = getReqs();
    (*reqs)[*request]->free();
  }
  return 0;
}

/// Cancel a request; implemented by freeing it via MPI_Request_free().
/// The routine now identifies itself to AMPIAPI as "MPI_Cancel" rather
/// than the copied "MPI_Request_free".
CDECL
int MPI_Cancel(MPI_Request *request){
  AMPIAPI("MPI_Cancel");
  return MPI_Request_free(request);
}

/// Create a persistent receive request (PersReq with sndrcv=2), insert
/// it into the request list and store its handle in *req.
CDECL
int MPI_Recv_init(void *buf, int count, int type, int src, int tag,
                   MPI_Comm comm, MPI_Request *req)
{
  AMPIAPI("MPI_Recv_init");
  AmpiRequestList* reqList = getReqs();
  PersReq *persistent = new PersReq(buf,count,type,src,tag,comm,2);
  *req = reqList->insert(persistent);
  return 0;
}

/// Create a persistent send request (PersReq with sndrcv=1, with `dest`
/// stored in the request's rank slot), insert it into the request list
/// and store its handle in *req.
CDECL
int MPI_Send_init(void *buf, int count, int type, int dest, int tag,
                   MPI_Comm comm, MPI_Request *req)
{
  AMPIAPI("MPI_Send_init");
  AmpiRequestList* reqList = getReqs();
  PersReq *persistent = new PersReq(buf,count,type,dest,tag,comm,1);
  *req = reqList->insert(persistent);
  return 0;
}

/// Return the datatype table (myDDT) of the current ampiParent.
static CkDDT *getDDT(void) {
  ampiParent *parent = getAmpiParent();
  return parent->myDDT;
}

/// Create a contiguous derived datatype via the datatype table's
/// newContiguous(); the new handle is stored in *newtype.
CDECL
int MPI_Type_contiguous(int count, MPI_Datatype oldtype,
                         MPI_Datatype *newtype)
{
  AMPIAPI("MPI_Type_contiguous");
  CkDDT *table = getDDT();
  table->newContiguous(count, oldtype, newtype);
  return 0;
}

extern  "C"
int MPI_Type_vector(int count, int blocklength, int stride,
                     MPI_Datatype oldtype, MPI_Datatype*  newtype)
{
  AMPIAPI("MPI_Type_vector");
  getDDT()->newVector(count, blocklength, stride, oldtype, newtype);
  return 0 ;
}

extern  "C"  
int MPI_Type_hvector(int count, int blocklength, MPI_Aint stride, 
                      MPI_Datatype oldtype, MPI_Datatype*  newtype)
{
  AMPIAPI("MPI_Type_hvector");
  getDDT()->newHVector(count, blocklength, stride, oldtype, newtype);
  return 0 ;
}

extern  "C"  
int MPI_Type_indexed(int count, int* arrBlength, int* arrDisp, 
                      MPI_Datatype oldtype, MPI_Datatype*  newtype)
{
  AMPIAPI("MPI_Type_indexed");
  getDDT()->newIndexed(count, arrBlength, arrDisp, oldtype, newtype);
  return 0 ;
}

extern  "C"  
int MPI_Type_hindexed(int count, int* arrBlength, MPI_Aint* arrDisp,
                       MPI_Datatype oldtype, MPI_Datatype*  newtype)
{
  AMPIAPI("MPI_Type_hindexed");
  getDDT()->newHIndexed(count, arrBlength, arrDisp, oldtype, newtype);
  return 0 ;
}

extern  "C"  
int MPI_Type_struct(int count, int* arrBlength, int* arrDisp, 
                     MPI_Datatype* oldtype, MPI_Datatype*  newtype)
{
  AMPIAPI("MPI_Type_struct");
  getDDT()->newStruct(count, arrBlength, arrDisp, oldtype, newtype);
  return 0 ;
}

/// Commit a datatype. Nothing is done here beyond the AMPIAPI entry
/// hook; the call always returns 0.
CDECL
int MPI_Type_commit(MPI_Datatype *datatype)
{
  AMPIAPI("MPI_Type_commit");

  return 0;
}

/// Release a datatype via the datatype table's freeType().
CDECL
int MPI_Type_free(MPI_Datatype *datatype)
{
  AMPIAPI("MPI_Type_free");
  CkDDT *table = getDDT();
  table->freeType(datatype);
  return 0;
}


/// Store the extent of `datatype`, as reported by the datatype table,
/// into *extent.
CDECL
int MPI_Type_extent(MPI_Datatype datatype, MPI_Aint *extent)
{
  AMPIAPI("MPI_Type_extent");
  CkDDT *table = getDDT();
  *extent = table->getExtent(datatype);
  return 0;
}

/// Store the size of `datatype`, as reported by the datatype table,
/// into *size.
CDECL
int MPI_Type_size(MPI_Datatype datatype, int *size)
{
  AMPIAPI("MPI_Type_size");
  CkDDT *table = getDDT();
  *size = table->getSize(datatype);
  return 0;
}

/// Non-blocking send. The send is issued immediately, so there is
/// nothing left to wait for and *request is set to MPI_REQUEST_NULL.
CDECL
int MPI_Isend(void *buf, int count, MPI_Datatype type, int dest,
              int tag, MPI_Comm comm, MPI_Request *request)
{
  AMPIAPI("MPI_Isend");
  ampi *instance = getAmpiInstance(comm);
  const int myRank = instance->getRank();
  instance->send(tag, myRank, buf, count, type, dest, comm);
  *request = MPI_REQUEST_NULL;
  return 0;
}

/// Non-blocking receive: record the receive as an IReq in the request
/// list and store its handle in *request. The data is received when the
/// request is waited on or tested.
CDECL
int MPI_Irecv(void *buf, int count, MPI_Datatype type, int src,
              int tag, MPI_Comm comm, MPI_Request *request)
{
  AMPIAPI("MPI_Irecv");
  AmpiRequestList* reqList = getReqs();
  IReq *pending = new IReq(buf,count,type,src,tag,comm);
  *request = reqList->insert(pending);
  return 0;
}

/// Non-blocking reduce onto rank `root` of `comm`.
/// Every rank contributes to the reduction. Only the root posts a
/// receive request for the result.
///
/// *request is initialised to MPI_REQUEST_NULL, so the MPI_COMM_SELF
/// path and non-root ranks hand back a handle that MPI_Wait/MPI_Test
/// treat as already complete. It used to be left unset on those paths.
CDECL
int MPI_Ireduce(void *sendbuf, void *recvbuf, int count, int type, MPI_Op op,
                 int root, MPI_Comm comm, MPI_Request *request)
{
  AMPIAPI("MPI_Ireduce");
  // Default: nothing pending for this caller.
  *request = MPI_REQUEST_NULL;
  if(comm==MPI_COMM_SELF) return copyDatatype(comm,type,count,sendbuf,recvbuf);
  ampi *ptr = getAmpiInstance(comm);
  CkReductionMsg *msg=makeRednMsg(ptr->getDDT()->getType(type),sendbuf,count,type,op);
  int rootIdx=ptr->comm2proxy(comm).getIndexForRank(root);
  CkCallback reduceCB(CkIndex_ampi::reduceResult(0),CkArrayIndex1D(rootIdx),ptr->getProxy(),true);
  msg->setCallback(reduceCB);
  ptr->contribute(msg);

  if (ptr->thisIndex == rootIdx){
    // Post an IReq instead of calling recv(), so the call does not block
    // and the caller gets a request to wait on.
    // NOTE(review): reduceResult() delivers the result stamped with
    // MPI_COMM_WORLD (via sendraw), while this request matches on `comm`;
    // the two look like they only coincide when comm is MPI_COMM_WORLD -- confirm.
    AmpiRequestList* reqs = getReqs();
    IReq *newreq = new IReq(recvbuf,count,type,0,MPI_REDUCE_TAG,comm);
    *request = reqs->insert(newreq);
  }
  return 0;
}

/// All-gather: every rank sends its block to every rank of `comm`
/// (itself included), then receives the block from rank r into slot r
/// of recvbuf. MPI_COMM_SELF degenerates to a local copy.
CDECL
int MPI_Allgather(void *sendbuf, int sendcount, MPI_Datatype sendtype,
                  void *recvbuf, int recvcount, MPI_Datatype recvtype,
                  MPI_Comm comm)
{
  AMPIAPI("MPI_Allgather");
  if(comm==MPI_COMM_SELF) {
    return copyDatatype(comm,sendtype,sendcount,sendbuf,recvbuf);
  }
  ampi *instance = getAmpiInstance(comm);
  int nRanks = instance->getSize();
  int r;

  // commlib support
  CProxy_ampi arrproxy = instance->comlibBegin();
  for(r=0;r<nRanks;r++) {
    instance->delesend(MPI_GATHER_TAG, instance->getRank(), sendbuf, sendcount,
                       sendtype, r, comm, arrproxy);
  }
  instance->comlibEnd();

  MPI_Status status;
  CkDDT_DataType* rtype = instance->getDDT()->getType(recvtype) ;
  int slotBytes = rtype->getSize(recvcount) ;
  char *slot = (char*)recvbuf;

  for(r=0;r<nRanks;r++) {
    MPI_Recv(slot+(slotBytes*r), recvcount, recvtype,
             r, MPI_GATHER_TAG, comm, &status);
  }
  return 0;
}

/// Non-blocking all-gather: every rank sends its block to every rank of
/// `comm` right away, and the receives are bundled into one ATAReq
/// whose handle is stored in *request.
///
/// For MPI_COMM_SELF the data is copied locally and *request is set to
/// MPI_REQUEST_NULL, which MPI_Wait/MPI_Test treat as already complete.
/// It used to be left unset on that path.
CDECL
int MPI_Iallgather(void *sendbuf, int sendcount, MPI_Datatype sendtype,
                    void *recvbuf, int recvcount, MPI_Datatype recvtype,
                    MPI_Comm comm, MPI_Request* request)
{
  AMPIAPI("MPI_Iallgather");
  if(comm==MPI_COMM_SELF) {
    // Nothing is pending after the local copy.
    *request = MPI_REQUEST_NULL;
    return copyDatatype(comm,sendtype,sendcount,sendbuf,recvbuf);
  }
  ampi *ptr = getAmpiInstance(comm);
  int size = ptr->getSize();
  int i;

  // commlib support
  CProxy_ampi arrproxy = ptr->comlibBegin();
  for(i=0;i<size;i++) {
    ptr->delesend(MPI_GATHER_TAG, ptr->getRank(), sendbuf, sendcount,
                  sendtype, i, comm, arrproxy);
  }
  ptr->comlibEnd();

  CkDDT_DataType* dttype = ptr->getDDT()->getType(recvtype) ;
  int itemsize = dttype->getSize(recvcount) ;

  // One sub-request per source rank; addReq() is expected to return the
  // number of sub-requests held so far.
  AmpiRequestList* reqs = getReqs();
  ATAReq *newreq = new ATAReq(size);
  for(i=0;i<size;i++){
    if(newreq->addReq(((char*)recvbuf)+(itemsize*i),recvcount,recvtype,i,MPI_GATHER_TAG,comm)!=(i+1))
      CkAbort("MPI_Iallgather: Error adding requests into ATAReq!");
  }
  *request = reqs->insert(newreq);
  AMPI_DEBUG("MPI_Iallgather: request=%d, reqs.size=%d, &reqs=%d\n",*request,reqs->size(),reqs);

  return 0;
}

/// All-gather with per-rank counts: every rank sends its block to every
/// rank of `comm`, then receives recvcounts[r] items from rank r at
/// displs[r] items (of recvtype) into recvbuf.
/// MPI_COMM_SELF degenerates to a local copy.
CDECL
int MPI_Allgatherv(void *sendbuf, int sendcount, MPI_Datatype sendtype,
                   void *recvbuf, int *recvcounts, int *displs,
                   MPI_Datatype recvtype, MPI_Comm comm)
{
  AMPIAPI("MPI_Allgatherv");
  if(comm==MPI_COMM_SELF) {
    return copyDatatype(comm,sendtype,sendcount,sendbuf,recvbuf);
  }
  ampi *instance = getAmpiInstance(comm);
  int nRanks = instance->getSize();
  int r;

  // commlib support
  CProxy_ampi arrproxy = instance->comlibBegin();
  for(r=0;r<nRanks;r++) {
    instance->delesend(MPI_GATHER_TAG, instance->getRank(), sendbuf, sendcount,
                       sendtype, r, comm, arrproxy);
  }
  instance->comlibEnd();

  MPI_Status status;
  CkDDT_DataType* rtype = instance->getDDT()->getType(recvtype) ;
  int itemBytes = rtype->getSize() ;
  char *base = (char*)recvbuf;

  for(r=0;r<nRanks;r++) {
    MPI_Recv(base+(itemBytes*displs[r]), recvcounts[r], recvtype,
             r, MPI_GATHER_TAG, comm, &status);
  }
  return 0;
}

CDECL
int MPI_Gather(void *sendbuf, int sendcount, MPI_Datatype sendtype,
               void *recvbuf, int recvcount, MPI_Datatype recvtype,
               int root, MPI_Comm comm)
{
  AMPIAPI("MPI_Gather");
  if(comm==MPI_COMM_SELF) return copyDatatype(comm,sendtype,sendcount,sendbuf,recvbuf);
  ampi *ptr = getAmpiInstance(comm);
  const int nRanks = ptr->getSize();

  // Every rank (root included) sends its contribution to root.
  MPI_Send(sendbuf, sendcount, sendtype, root, MPI_GATHER_TAG, comm);

  // Only root collects the pieces.
  if(ptr->getRank()!=root)
    return 0;

  MPI_Status status;
  // Byte stride between consecutive per-rank slots in recvbuf.
  const int slotBytes = ptr->getDDT()->getType(recvtype)->getSize(recvcount);
  char *recvBase = (char*)recvbuf;
  for(int src=0;src<nRanks;src++) {
    MPI_Recv(recvBase+(slotBytes*src), recvcount, recvtype,
             src, MPI_GATHER_TAG, comm, &status);
  }
  return 0;
}

CDECL
int MPI_Gatherv(void *sendbuf, int sendcount, MPI_Datatype sendtype,
                void *recvbuf, int *recvcounts, int *displs,
                MPI_Datatype recvtype, int root, MPI_Comm comm)
{
  AMPIAPI("MPI_Gatherv");
  if(comm==MPI_COMM_SELF) return copyDatatype(comm,sendtype,sendcount,sendbuf,recvbuf);
  ampi *ptr = getAmpiInstance(comm);
  const int nRanks = ptr->getSize();

  // Every rank (root included) sends its contribution to root.
  MPI_Send(sendbuf, sendcount, sendtype, root, MPI_GATHER_TAG, comm);

  // Non-root ranks are done once their send has been issued.
  if(ptr->getRank() != root)
    return 0;

  MPI_Status status;
  // displs[] entries are scaled by getSize() of recvtype to obtain byte offsets.
  const int elemBytes = ptr->getDDT()->getType(recvtype)->getSize();
  char *recvBase = (char*)recvbuf;
  for(int src=0;src<nRanks;src++) {
    MPI_Recv(recvBase+(elemBytes*displs[src]), recvcounts[src], recvtype,
             src, MPI_GATHER_TAG, comm, &status);
  }
  return 0;
}

CDECL
int MPI_Scatter(void *sendbuf, int sendcount, MPI_Datatype sendtype,
                void *recvbuf, int recvcount, MPI_Datatype recvtype,
                int root, MPI_Comm comm)
{
  AMPIAPI("MPI_Scatter");
  if(comm==MPI_COMM_SELF) return copyDatatype(comm,sendtype,sendcount,sendbuf,recvbuf);
  ampi *ptr = getAmpiInstance(comm);
  const int nRanks = ptr->getSize();

  // Root hands rank `dest` the dest-th chunk of sendbuf; chunks are
  // getSize(sendcount) bytes apart.
  if(ptr->getRank()==root) {
    CProxy_ampi arrproxy = ptr->comlibBegin();
    const int chunkBytes = ptr->getDDT()->getType(sendtype)->getSize(sendcount);
    char *sendBase = (char*)sendbuf;
    for(int dest=0;dest<nRanks;dest++) {
      ptr->delesend(MPI_SCATTER_TAG, ptr->getRank(), sendBase+(chunkBytes*dest),
                    sendcount, sendtype, dest, comm, arrproxy);
    }
    ptr->comlibEnd();
  }

  // Every rank, root included, receives its chunk from root.
  MPI_Status status;
  MPI_Recv(recvbuf, recvcount, recvtype, root, MPI_SCATTER_TAG, comm, &status);

  return 0;
}

CDECL
int MPI_Scatterv(void *sendbuf, int *sendcounts, int *displs, MPI_Datatype sendtype,
                 void *recvbuf, int recvcount, MPI_Datatype recvtype,
                 int root, MPI_Comm comm)
{
  AMPIAPI("MPI_Scatterv");
  // On MPI_COMM_SELF, sendcounts[0] elements are copied from the start of
  // sendbuf (displs is not consulted on this path).
  if(comm==MPI_COMM_SELF) return copyDatatype(comm,sendtype,sendcounts[0],sendbuf,recvbuf);
  ampi *ptr = getAmpiInstance(comm);
  const int nRanks = ptr->getSize();

  // Root sends each rank its own piece; displs[] entries are scaled by
  // getSize() of sendtype to obtain byte offsets into sendbuf.
  if(ptr->getRank() == root) {
    CProxy_ampi arrproxy = ptr->comlibBegin();
    const int elemBytes = ptr->getDDT()->getType(sendtype)->getSize();
    char *sendBase = (char*)sendbuf;
    for(int dest=0;dest<nRanks;dest++) {
      ptr->delesend(MPI_SCATTER_TAG, ptr->getRank(), sendBase+(elemBytes*displs[dest]),
                    sendcounts[dest], sendtype, dest, comm, arrproxy);
    }
    ptr->comlibEnd();
  }

  // Every rank, root included, receives its piece from root.
  MPI_Status status;
  MPI_Recv(recvbuf, recvcount, recvtype, root, MPI_SCATTER_TAG, comm, &status);

  return 0;
}

CDECL
int MPI_Alltoall(void *sendbuf, int sendcount, MPI_Datatype sendtype,
                 void *recvbuf, int recvcount, MPI_Datatype recvtype,
                 MPI_Comm comm)
{
  AMPIAPI("MPI_Alltoall");
  if(comm==MPI_COMM_SELF) return copyDatatype(comm,sendtype,sendcount,sendbuf,recvbuf);
  ampi *ptr = getAmpiInstance(comm);
  const int nRanks = ptr->getSize();
  // Byte stride between the per-destination chunks of sendbuf.
  const int sendChunkBytes = ptr->getDDT()->getType(sendtype)->getSize(sendcount);
  char *sendBase = (char*)sendbuf;
  int peer;

  // commlib support: chunk `peer` of sendbuf goes to rank `peer`.
  CProxy_ampi arrproxy = ptr->comlibBegin();
  for(peer=0;peer<nRanks;peer++) {
    ptr->delesend(MPI_GATHER_TAG, ptr->getRank(), sendBase+(sendChunkBytes*peer), sendcount,
                  sendtype, peer, comm, arrproxy);
  }
  ptr->comlibEnd();

  // The receive side uses its own stride, derived from recvtype/recvcount.
  MPI_Status status;
  const int recvChunkBytes = ptr->getDDT()->getType(recvtype)->getSize(recvcount);
  char *recvBase = (char*)recvbuf;

  for(peer=0;peer<nRanks;peer++) {
    MPI_Recv(recvBase+(recvChunkBytes*peer), recvcount, recvtype,
              peer, MPI_GATHER_TAG, comm, &status);
  }
  return 0;
}

CDECL
int MPI_Ialltoall(void *sendbuf, int sendcount, MPI_Datatype sendtype,
                 void *recvbuf, int recvcount, MPI_Datatype recvtype,
                 MPI_Comm comm, MPI_Request *request)
{
  AMPIAPI("MPI_Ialltoall");
  // On MPI_COMM_SELF the send buffer is copied directly into the receive buffer.
  // NOTE(review): *request is not written on this path -- confirm callers cope.
  if(comm==MPI_COMM_SELF) return copyDatatype(comm,sendtype,sendcount,sendbuf,recvbuf);
  ampi *ptr = getAmpiInstance(comm);
  int size = ptr->getSize();
  CkDDT_DataType* dttype = ptr->getDDT()->getType(sendtype) ;
  // Byte stride between the per-destination chunks of sendbuf.
  int itemsize = dttype->getSize(sendcount) ;
  int i;

  // commlib support: chunk i of sendbuf goes to rank i.
  CProxy_ampi arrproxy = ptr->comlibBegin();
  for(i=0;i<size;i++) {
    ptr->delesend(MPI_GATHER_TAG, ptr->getRank(), ((char*)sendbuf)+(itemsize*i), sendcount,
                  sendtype, i, comm, arrproxy);
  }
  ptr->comlibEnd();

  // The receive slots must be strided by the receive-side type and count
  // (as MPI_Alltoall does), not by the send-side size computed above.
  dttype = ptr->getDDT()->getType(recvtype) ;
  itemsize = dttype->getSize(recvcount) ;

  // One ATAReq collects a receive entry per source rank; addReq is expected
  // to return the running entry count (i+1), otherwise we abort.
  AmpiRequestList* reqs = getReqs();
  ATAReq *newreq = new ATAReq(size);
  for(i=0;i<size;i++){
    if(newreq->addReq(((char*)recvbuf)+(itemsize*i),recvcount,recvtype,i,MPI_GATHER_TAG,comm)!=(i+1))
      CkAbort("MPI_Ialltoall: Error adding requests into ATAReq!");
  }
  *request = reqs->insert(newreq);
  AMPI_DEBUG("MPI_Ialltoall: request=%d, reqs.size=%d, &reqs=%d\n",*request,reqs->size(),reqs);
  return 0;
}

CDECL
int MPI_Alltoallv(void *sendbuf, int *sendcounts, int *sdispls,
                  MPI_Datatype sendtype, void *recvbuf, int *recvcounts,
                  int *rdispls, MPI_Datatype recvtype, MPI_Comm comm)
{
  AMPIAPI("MPI_Alltoallv");
  // NOTE(review): on MPI_COMM_SELF nothing is copied into recvbuf -- confirm
  // this is intended.
  if(comm==MPI_COMM_SELF) return 0;
  ampi *ptr = getAmpiInstance(comm);
  const int nRanks = ptr->getSize();
  // sdispls[] entries are scaled by getSize() of sendtype to obtain byte offsets.
  const int sendElemBytes = ptr->getDDT()->getType(sendtype)->getSize();
  char *sendBase = (char*)sendbuf;
  int peer;

  // commlib support: each rank gets its own piece of sendbuf.
  CProxy_ampi arrproxy = ptr->comlibBegin();
  for(peer=0;peer<nRanks;peer++) {
    ptr->delesend(MPI_GATHER_TAG,ptr->getRank(),sendBase+(sendElemBytes*sdispls[peer]),sendcounts[peer],
                  sendtype, peer, comm, arrproxy);
  }
  ptr->comlibEnd();

  // rdispls[] entries are scaled by getSize() of recvtype.
  MPI_Status status;
  const int recvElemBytes = ptr->getDDT()->getType(recvtype)->getSize();
  char *recvBase = (char*)recvbuf;

  for(peer=0;peer<nRanks;peer++) {
    MPI_Recv(recvBase+(recvElemBytes*rdispls[peer]), recvcounts[peer], recvtype,
             peer, MPI_GATHER_TAG, comm, &status);
  }
  return 0;
}

CDECL
int MPI_Comm_dup(int comm, int *newcomm)
{
  AMPIAPI("MPI_Comm_dup");
  /* No new communicator is built: the output handle is the input handle. */
  newcomm[0] = comm;
  return 0;
}

CDECL
int MPI_Comm_split(int src,int color,int key,int *dest)
{
  AMPIAPI("MPI_Comm_split");
  /* Delegate to the ampi instance of the source communicator; the last
     argument (0) is the same slot MPI_Cart_sub fills with CART_TOPOL. */
  ampi *instance = getAmpiInstance(src);
  instance->split(color,key,dest, 0);
  return 0;
}

CDECL
int MPI_Comm_free(int *comm)
{
  /* Stub: records the API call and reports success; the communicator
     handle is left untouched. */
  AMPIAPI("MPI_Comm_free");
  (void)comm;
  return 0;
}

CDECL
int MPI_Comm_test_inter(MPI_Comm comm, int *flag)
{
  /* Always reports "false" for the given communicator. */
  AMPIAPI("MPI_Comm_test_inter");
  (void)comm;
  flag[0] = false;
  return 0;
}

CDECL
int MPI_Abort(int comm, int errorcode)
{
  /* Aborts through CkAbort; comm is ignored. The return statement only
     matters if CkAbort returns control. */
  AMPIAPI("MPI_Abort");
  (void)comm;
  CkAbort("AMPI: User called MPI_Abort!\n");
  return errorcode;
}

CDECL
int MPI_Get_count(MPI_Status *sts, MPI_Datatype dtype, int *count)
{
  AMPIAPI("MPI_Get_count");
  /* count = length recorded in the status divided by getSize() of dtype. */
  const int itemBytes = getDDT()->getType(dtype)->getSize();
  *count = sts->MPI_LENGTH/itemBytes;
  return 0;
}

CDECL
int MPI_Type_lb(MPI_Datatype dtype, MPI_Aint* displacement)
{
  AMPIAPI("MPI_Type_lb");
  /* Forward the query to the datatype table's getLB(). */
  MPI_Aint lower = getDDT()->getLB(dtype);
  *displacement = lower;
  return 0;
}

CDECL
int MPI_Type_ub(MPI_Datatype dtype, MPI_Aint* displacement)
{
  AMPIAPI("MPI_Type_ub");
  /* Forward the query to the datatype table's getUB(). */
  MPI_Aint upper = getDDT()->getUB(dtype);
  *displacement = upper;
  return 0;
}

CDECL
int MPI_Address(void* location, MPI_Aint *address)
{
  AMPIAPI("MPI_Address");
  /* The pointer is converted via char* and unsigned long before being
     stored as an MPI_Aint. */
  char *asBytes = (char *)location;
  unsigned long asInteger = (unsigned long)asBytes;
  *address = (MPI_Aint)asInteger;
  return 0;
}

CDECL
int MPI_Get_elements(MPI_Status *sts, MPI_Datatype dtype, int *count)
{
  AMPIAPI("MPI_Get_elements");
  /* count = length recorded in the status divided by getBaseSize() of dtype. */
  const int baseBytes = getDDT()->getType(dtype)->getBaseSize();
  *count = sts->MPI_LENGTH/baseBytes;
  return 0;
}

CDECL
int MPI_Pack(void *inbuf, int incount, MPI_Datatype dtype, void *outbuf,
              int outsize, int *position, MPI_Comm comm)
{
  AMPIAPI("MPI_Pack");
  /* Serializes incount items from inbuf into outbuf starting at byte
     *position, then advances *position by incount*getSize().
     outsize and comm are not consulted. */
  CkDDT_DataType* packType = getDDT()->getType(dtype) ;
  const int itemBytes = packType->getSize();
  char *source = (char*)inbuf;
  char *target = ((char*)outbuf)+(*position);
  packType->serialize(source, target, incount, 1);
  *position += (itemBytes*incount);
  return 0;
}

CDECL
int MPI_Unpack(void *inbuf, int insize, int *position, void *outbuf,
              int outcount, MPI_Datatype dtype, MPI_Comm comm)
{
  AMPIAPI("MPI_Unpack");
  /* Serializes outcount items from inbuf (starting at byte *position) into
     outbuf, then advances *position by outcount*getSize().
     insize and comm are not consulted.
     NOTE(review): serialize() is called with the same final argument (1) as
     in MPI_Pack -- confirm this is the right direction for non-contiguous
     datatypes. */
  CkDDT_DataType* packType = getDDT()->getType(dtype) ;
  const int itemBytes = packType->getSize();
  char *source = ((char*)inbuf+(*position));
  char *target = (char*)outbuf;
  packType->serialize(source, target, outcount, 1);
  *position += (itemBytes*outcount);
  return 0;
}

CDECL
int MPI_Pack_size(int incount,MPI_Datatype datatype,MPI_Comm comm,int *sz)
{
  AMPIAPI("MPI_Pack_size");
  /* Reports, through *sz, the number of bytes MPI_Pack advances its position
     by for incount items of this datatype (incount * getSize()).
     The size goes in the output parameter and the return value is the status
     code (0), matching the other entry points in this file; previously *sz
     was never written and the size was returned as the status. */
  CkDDT_DataType* dttype = getDDT()->getType(datatype) ;
  *sz = incount*dttype->getSize() ;
  return 0;
}

CDECL
int MPI_Get_processor_name(char *name, int *resultlen)
{
  AMPIAPI("MPI_Get_processor_name");
  /* The name is built from the parent's array index and getMyPe(); note that
     the format string ends in a newline, which is counted in *resultlen. */
  ampiParent *parent = getAmpiParent();
  sprintf(name,"AMPI_VP[%d]_PE[%d]\n",parent->thisIndex,parent->getMyPe());
  resultlen[0] = strlen(name);
  return 0;
}

/* Error handling */
/* Prototype of error_handler: the variadic form is declared when USE_STDARG
   is defined, the two-argument form otherwise. No definition of it appears
   in this part of the file. */
#if defined(USE_STDARG)
void error_handler(MPI_Comm *, int *, ...);
#else
void error_handler ( MPI_Comm *, int * );
#endif

CDECL
int MPI_Errhandler_create(MPI_Handler_function *function, MPI_Errhandler *errhandler)
{
	/* Stub: records the API call and reports success; neither argument is
	   used and *errhandler is not written. */
	AMPIAPI("MPI_Errhandler_create");
	(void)function;
	(void)errhandler;
	return MPI_SUCCESS;
}

CDECL
int MPI_Errhandler_set(MPI_Comm comm, MPI_Errhandler errhandler)
{
	/* Stub: records the API call and reports success; no handler is stored. */
	AMPIAPI("MPI_Errhandler_set");
	(void)comm;
	(void)errhandler;
	return MPI_SUCCESS;
}

CDECL
int MPI_Errhandler_get(MPI_Comm comm, MPI_Errhandler *errhandler)
{
	/* Stub: records the API call and reports success; *errhandler is not
	   written. */
	AMPIAPI("MPI_Errhandler_get");
	(void)comm;
	(void)errhandler;
	return MPI_SUCCESS;
}

CDECL
int MPI_Errhandler_free(MPI_Errhandler *errhandler)
{
	/* Stub: records the API call and reports success; the handle is left
	   untouched. */
	AMPIAPI("MPI_Errhandler_free");
	(void)errhandler;
	return MPI_SUCCESS;
}

CDECL
int MPI_Error_class(int errorcode, int *errorclass)
{
	/* The error class reported is the error code itself. */
	AMPIAPI("MPI_Error_class");
	errorclass[0] = errorcode;
	return MPI_SUCCESS;
}


CDECL
int MPI_Error_string(int errorcode, char *string, int *resultlen)
{
  AMPIAPI("MPI_Error_string");
  /* Only MPI_SUCCESS has a message. For any other code the outputs are left
     untouched and 1 is returned. */
  if (errorcode != MPI_SUCCESS)
     return 1;/*LIE: should be MPI_ERR_something */

  const char *text="Success";
  *resultlen=strlen(text);
  strcpy(string,text);
  return MPI_SUCCESS;
}

/* Group operations */
CDECL
int MPI_Comm_group(MPI_Comm comm, MPI_Group *group)
{
  AMPIAPI("MPI_Comm_Group");
  /* The communicator-to-group lookup is done by the ampiParent. */
  ampiParent *parent = getAmpiParent();
  *group = parent->comm2group(comm);
  return 0;
}

CDECL
int MPI_Group_union(MPI_Group group1, MPI_Group group2, MPI_Group *newgroup)
{
  AMPIAPI("MPI_Group_union");
  /* Look up both groups' member vectors, combine them with unionOp, and
     register the result as a new group handle. */
  ampiParent *parent = getAmpiParent();
  groupStruct first = parent->group2vec(group1);
  groupStruct second = parent->group2vec(group2);
  groupStruct combined;
  combined = unionOp(first,second);
  *newgroup = parent->saveGroupStruct(combined);
  return 0;
}

CDECL
int MPI_Group_intersection(MPI_Group group1, MPI_Group group2, MPI_Group *newgroup)
{
  AMPIAPI("MPI_Group_intersection");
  /* Look up both groups' member vectors, combine them with intersectOp, and
     register the result as a new group handle. */
  ampiParent *parent = getAmpiParent();
  groupStruct first = parent->group2vec(group1);
  groupStruct second = parent->group2vec(group2);
  groupStruct common;
  common = intersectOp(first,second);
  *newgroup = parent->saveGroupStruct(common);
  return 0;
}

CDECL
int MPI_Group_difference(MPI_Group group1, MPI_Group group2, MPI_Group *newgroup)
{
  AMPIAPI("MPI_Group_difference");
  /* Look up both groups' member vectors, combine them with diffOp, and
     register the result as a new group handle. */
  ampiParent *parent = getAmpiParent();
  groupStruct first = parent->group2vec(group1);
  groupStruct second = parent->group2vec(group2);
  groupStruct remaining;
  remaining = diffOp(first,second);
  *newgroup = parent->saveGroupStruct(remaining);
  return 0;
}

CDECL
int MPI_Group_size(MPI_Group group, int *size)
{
  AMPIAPI("MPI_Group_size");
  /* The group size is the length of its member vector. */
  groupStruct members = getAmpiParent()->group2vec(group);
  *size = members.size();
  return 0;
}

CDECL
int MPI_Group_rank(MPI_Group group, int *rank)
{
  AMPIAPI("MPI_Group_rank");
  /* The rank lookup is delegated to ampiParent::getRank. */
  ampiParent *parent = getAmpiParent();
  *rank = parent->getRank(group);
  return 0;
}

CDECL
int MPI_Group_translate_ranks (MPI_Group group1, int n, int *ranks1, MPI_Group group2, int *ranks2)
{
  AMPIAPI("MPI_Group_translate_ranks");
  /* Translates the n ranks in ranks1 (relative to group1) into ranks relative
     to group2 and stores them in the caller's ranks2 array. */
  ampiParent *ptr = getAmpiParent();
  groupStruct vec1, vec2;
  vec1 = ptr->group2vec(group1);
  vec2 = ptr->group2vec(group2);

  /* Copy element by element: assigning the returned pointer to the ranks2
     parameter would only change the local copy of the pointer and leave the
     caller's array untouched. */
  int *translated = translateRanksOp(n, vec1, ranks1, vec2);
  for (int i = 0; i < n; i++)
    ranks2[i] = translated[i];
  /* NOTE(review): ownership of the array returned by translateRanksOp is not
     visible here; if it is allocated with new[] it should be released with
     delete[] at this point -- confirm against its definition. */
  return 0;
}

CDECL
int MPI_Group_compare(MPI_Group group1,MPI_Group group2, int *result)
{
  AMPIAPI("MPI_Group_compare");
  /* Compare the member vectors of the two groups with compareVecOp. */
  ampiParent *parent = getAmpiParent();
  groupStruct first = parent->group2vec(group1);
  groupStruct second = parent->group2vec(group2);
  result[0] = compareVecOp(first, second);
  return 0;
}

CDECL
int MPI_Group_incl(MPI_Group group, int n, int *ranks, MPI_Group *newgroup)
{
  AMPIAPI("MPI_Group_incl");
  /* Build the new member vector with inclOp and register it as a new group. */
  ampiParent *parent = getAmpiParent();
  groupStruct members = parent->group2vec(group);
  groupStruct selected;
  selected = inclOp(n,ranks,members);
  *newgroup = parent->saveGroupStruct(selected);
  return 0;
}
CDECL
int MPI_Group_excl(MPI_Group group, int n, int *ranks, MPI_Group *newgroup)
{
  AMPIAPI("MPI_Group_excl");
  /* Build the new member vector with exclOp and register it as a new group. */
  ampiParent *parent = getAmpiParent();
  groupStruct members = parent->group2vec(group);
  groupStruct kept;
  kept = exclOp(n,ranks,members);
  *newgroup = parent->saveGroupStruct(kept);
  /* debugging aid: outputOp(members); outputOp(kept); */
  return 0;
}
CDECL
int MPI_Group_range_incl(MPI_Group group, int n, int ranges[][3], MPI_Group *newgroup)
{
  AMPIAPI("MPI_Group_range_incl");
  /* Build the new member vector with rangeInclOp from the n range triplets
     and register it as a new group. */
  ampiParent *parent = getAmpiParent();
  groupStruct members = parent->group2vec(group);
  groupStruct selected;
  selected = rangeInclOp(n,ranges,members);
  *newgroup = parent->saveGroupStruct(selected);
  return 0;
}
CDECL
int MPI_Group_range_excl(MPI_Group group, int n, int ranges[][3], MPI_Group *newgroup)
{
  AMPIAPI("MPI_Group_range_excl");
  /* Build the new member vector with rangeExclOp from the n range triplets
     and register it as a new group. */
  ampiParent *parent = getAmpiParent();
  groupStruct members = parent->group2vec(group);
  groupStruct kept;
  kept = rangeExclOp(n,ranges,members);
  *newgroup = parent->saveGroupStruct(kept);
  return 0;
}
CDECL
int MPI_Group_free(MPI_Group *group)
{
  /* Stub: records the API call and reports success; the group handle is
     left untouched. */
  AMPIAPI("MPI_Group_free");
  (void)group;
  return 0;
}
CDECL
int MPI_Comm_create(MPI_Comm comm, MPI_Group group, MPI_Comm* newcomm)
{
  AMPIAPI("MPI_Comm_create");
  /* Look up the group's member vector, then let the ampi instance of comm
     build the new communicator from it. */
  groupStruct members = getAmpiParent()->group2vec(group);
  ampi *instance = getAmpiInstance(comm);
  instance->commCreate(members, newcomm);
  return 0;
}

/* Charm++ Extensions to MPI standard: */
CDECL
void MPI_Checkpoint(char *dname)
{
  AMPIAPI("MPI_Checkpoint");
  /* Synchronize on MPI_COMM_WORLD first, then start the checkpoint into dname. */
  MPI_Barrier(MPI_COMM_WORLD);
  ampiParent *parent = getAmpiParent();
  parent->startCheckpoint(dname);
}

CDECL
void MPI_Print(char *str)
{
  AMPIAPI("MPI_Print");
  /* Prints str prefixed with the parent's array index in brackets. */
  CkPrintf("[%d] %s\n", getAmpiParent()->thisIndex, str);
}

CDECL
int MPI_Register(void *d, MPI_PupFn f)
{
  AMPIAPI("MPI_Register");
  /* Thin wrapper: forwards to TCHARM_Register and returns its result. */
  int result = TCHARM_Register(d,f);
  return result;
}

CDECL
void *MPI_Get_userdata(int idx)
{
  AMPIAPI("MPI_Get_userdata");
  /* Thin wrapper: forwards to TCHARM_Get_userdata and returns its result. */
  void *data = TCHARM_Get_userdata(idx);
  return data;
}

CDECL
void MPI_Register_main(MPI_MainFn mainFn,const char *name)
{
  AMPIAPI("MPI_Register_main");
  /* Only TCHARM element 0 is responsible for building the TCHARM threads. */
  if (TCHARM_Element()!=0)
    return;
  ampiCreateMain(mainFn,name,strlen(name));
}
FDECL
void FTN_NAME(MPI_REGISTER_MAIN,mpi_register_main)
  (MPI_MainFn mainFn,const char *name,int nameLen)
{
  AMPIAPI("MPI_register_main");
  /* Fortran entry point: the name length arrives as an explicit argument
     instead of being computed with strlen. Only TCHARM element 0 is
     responsible for building the TCHARM threads. */
  if (TCHARM_Element()!=0)
    return;
  ampiCreateMain(mainFn,name,nameLen);
}

CDECL
int MPI_Keyval_create(MPI_Copy_function *copy_fn, MPI_Delete_function *delete_fn, int *keyval, void* extra_state)
{
  AMPIAPI("MPI_Keyval_create");
  /* Thin wrapper around ampiParent::createKeyval; its result is the status. */
  ampiParent *parent = getAmpiParent();
  return parent->createKeyval(copy_fn,delete_fn,keyval,extra_state);
}

CDECL
int MPI_Keyval_free(int *keyval)
{
  AMPIAPI("MPI_Keyval_free");
  /* Thin wrapper around ampiParent::freeKeyval; its result is the status. */
  ampiParent *parent = getAmpiParent();
  return parent->freeKeyval(keyval);
}

CDECL
int MPI_Attr_put(MPI_Comm comm, int keyval, void* attribute_val)
{
  AMPIAPI("MPI_Attr_put");
  /* Thin wrapper around ampiParent::putAttr; its result is the status. */
  ampiParent *parent = getAmpiParent();
  return parent->putAttr(comm,keyval,attribute_val);
}

CDECL
int MPI_Attr_get(MPI_Comm comm, int keyval, void *attribute_val, int *flag)
{
  AMPIAPI("MPI_Attr_get");
  /* Thin wrapper around ampiParent::getAttr; its result is the status. */
  ampiParent *parent = getAmpiParent();
  return parent->getAttr(comm,keyval,attribute_val,flag);
}

CDECL
int MPI_Attr_delete(MPI_Comm comm, int keyval)
{
  AMPIAPI("MPI_Attr_delete");
  /* Thin wrapper around ampiParent::deleteAttr; its result is the status. */
  ampiParent *parent = getAmpiParent();
  return parent->deleteAttr(comm,keyval);
}

CDECL
int MPI_Cart_map(MPI_Comm comm, int ndims, int *dims, int *periods,
		 int *newrank)
{
  AMPIAPI("MPI_Cart_map");
  /* No remapping is performed: the "new" rank is the caller's current rank
     in comm. The topology arguments are not consulted. */
  (void)ndims;
  (void)dims;
  (void)periods;
  MPI_Comm_rank(comm, newrank);
  return 0;
}

CDECL
int MPI_Graph_map(MPI_Comm comm, int nnodes, int *index, int *edges,
		  int *newrank)
{
  AMPIAPI("MPI_Graph_map");
  /* No remapping is performed: the "new" rank is the caller's current rank
     in comm. The topology arguments are not consulted. */
  (void)nnodes;
  (void)index;
  (void)edges;
  MPI_Comm_rank(comm, newrank);
  return 0;
}

CDECL
int MPI_Cart_create(MPI_Comm comm_old, int ndims, int *dims, int *periods,
                   int reorder, MPI_Comm *comm_cart)
{
  AMPIAPI("MPI_Cart_create");

  /* Builds a cartesian communicator over the members of comm_old. Virtual
     processes are not mapped to processors here; ideally the load balancer
     would do that using the virtual topology information.

     The reorder argument is not acted upon in this function. */

  /* MPI_Cart_map leaves the rank unchanged; the value is not used further. */
  int unchangedRank;
  MPI_Cart_map(comm_old, ndims, dims, periods, &unchangedRank);

  ampiParent *parent = getAmpiParent();
  groupStruct members = parent->group2vec(parent->comm2group(comm_old));
  getAmpiInstance(comm_old)->cartCreate(members, comm_cart);

  /* Attach the topology description to the freshly created communicator. */
  ampiCommStruct &cart = parent->getCart(*comm_cart);
  cart.setndims(ndims);

  CkPupBasicVec<int> extents;
  CkPupBasicVec<int> wraps;
  int d = 0;
  while (d < ndims) {
    extents.push_back(dims[d]);
    wraps.push_back(periods[d]);
    /* Each period flag has to be exactly 0 or 1. */
    if (!((periods[d] == 0) || (periods[d] == 1)))
      CkAbort("MPI_Cart_create: periods should be all booleans\n");
    d++;
  }

  cart.setdims(extents);
  cart.setperiods(wraps);

  return 0;
}

CDECL
int MPI_Graph_create(MPI_Comm comm_old, int nnodes, int *index, int *edges,
		     int reorder, MPI_Comm *comm_graph)
{
  AMPIAPI("MPI_Graph_create");

  /* No mapping done: MPI_Graph_map leaves the rank unchanged and the value
     is not used further. */
  int unchangedRank;
  MPI_Graph_map(comm_old, nnodes, index, edges, &unchangedRank);

  ampiParent *parent = getAmpiParent();
  groupStruct members = parent->group2vec(parent->comm2group(comm_old));
  getAmpiInstance(comm_old)->graphCreate(members, comm_graph);

  /* Attach the topology description to the freshly created communicator. */
  ampiCommStruct &graph = parent->getGraph(*comm_graph);
  graph.setnvertices(nnodes);

  /* index[] has one entry per node. */
  CkPupBasicVec<int> indexCopy;
  for (int v = 0; v < nnodes; v++)
    indexCopy.push_back(index[v]);
  graph.setindex(indexCopy);

  /* The last index entry gives the number of entries copied from edges[]. */
  CkPupBasicVec<int> edgesCopy;
  for (int e = 0; e < index[nnodes - 1]; e++)
    edgesCopy.push_back(edges[e]);
  graph.setedges(edgesCopy);

  return 0;
}

CDECL
int MPI_Topo_test(MPI_Comm comm, int *status)
{
  AMPIAPI("MPI_Topo_test");

  /* Cartesian is tested first; the graph test only runs when the
     communicator is not cartesian. MPI_UNDEFINED means neither. */
  ampiParent *parent = getAmpiParent();
  int kind = MPI_UNDEFINED;
  if (parent->isCart(comm)) {
    kind = MPI_CART;
  } else if (parent->isGraph(comm)) {
    kind = MPI_GRAPH;
  }
  *status = kind;

  return 0;
}

CDECL
int MPI_Cartdim_get(MPI_Comm comm, int *ndims)
{
  AMPIAPI("MPI_Cartdim_get");

  /* Report the dimension count stored with the cartesian communicator. */
  ampiCommStruct &cart = getAmpiParent()->getCart(comm);
  *ndims = cart.getndims();

  return 0;
}

CDECL
int MPI_Cart_get(MPI_Comm comm, int maxdims, int *dims, int *periods, 
		 int *coords){
  AMPIAPI("MPI_Cart_get");

  /* Copies the stored extents and period flags of the cartesian communicator
     into dims[]/periods[] and computes the caller's coordinates into
     coords[]. At most maxdims entries of each output array are written. */
  ampiCommStruct &c = getAmpiParent()->getCart(comm);
  const int ndims = c.getndims();
  int rank;

  MPI_Comm_rank(comm, &rank);

  const CkPupBasicVec<int> &dims_ = c.getdims();
  const CkPupBasicVec<int> &periods_ = c.getperiods();

  /* Never read past the stored topology: a caller may pass a maxdims that is
     larger than the communicator's actual number of dimensions. */
  const int ncopy = (maxdims < ndims) ? maxdims : ndims;
  for (int i = 0; i < ncopy; i++) {
    dims[i] = dims_[i];
    periods[i] = periods_[i];
  }

  /* Peel coordinates off the rank starting with the last dimension. The
     division is done for every dimension so that the lower ones come out
     right even when the higher ones do not fit into coords[]. */
  for (int i = ndims - 1; i >= 0; i--) {
    if (i < maxdims)
      coords[i] = rank % dims_[i];
    rank = (int) (rank / dims_[i]);
  }

  return 0;
}

CDECL
int MPI_Cart_rank(MPI_Comm comm, int *coords, int *rank) {
  AMPIAPI("MPI_Cart_rank");

  /* Converts cartesian coordinates into a rank, with the last dimension
     varying fastest. Out-of-range coordinates in a periodic dimension are
     wrapped into [0, dims[i]). The caller's coords array is not modified. */
  ampiCommStruct &c = getAmpiParent()->getCart(comm);
  int ndims = c.getndims();
  const CkPupBasicVec<int> &dims = c.getdims();
  const CkPupBasicVec<int> &periods = c.getperiods();

  int prod = 1;
  int r = 0;

  for (int i = ndims - 1; i >= 0; i--) {
    int coord = coords[i];
    if ((coord < 0) || (coord >= dims[i])) {
      if (periods[i] == 1) {
        /* Wrap into range. The second step handles negative remainders; the
           previous expression "c += (...) % dims" always added 0 because %
           binds tighter than +=, so negative coordinates were never wrapped. */
        coord %= dims[i];
        if (coord < 0)
          coord += dims[i];
      }
      /* NOTE(review): an out-of-range coordinate in a non-periodic dimension
         is used as-is, as before; consider reporting an error here. */
    }
    r += prod * coord;
    prod *= dims[i];
  }

  *rank = r;

  return 0;
}

CDECL
int MPI_Cart_coords(MPI_Comm comm, int rank, int maxdims, int *coords)
{
  AMPIAPI("MPI_Cart_coords");

  ampiCommStruct &cart = getAmpiParent()->getCart(comm);
  const CkPupBasicVec<int> &extents = cart.getdims();

  /* Peel coordinates off the rank starting with the last dimension. The
     division happens for every dimension, but only coordinates with an index
     below maxdims are stored. */
  int d = cart.getndims();
  while (d-- > 0) {
    if (d < maxdims)
      coords[d] = rank % extents[d];
    rank = (int) (rank / extents[d]);
  }

  return 0;
}

CDECL
int MPI_Cart_shift(MPI_Comm comm, int direction, int disp, int *rank_source, 
		   int *rank_dest) {
  AMPIAPI("MPI_Cart_shift");
  
  ampiCommStruct &c = getAmpiParent()->getCart(comm);
  int ndims = c.getndims();
  const CkPupBasicVec<int> &dims = c.getdims();
  const CkPupBasicVec<int> &periods = c.getperiods();
  int *coords = new int[ndims];

  MPI_Comm_rank(comm, rank_source);
  MPI_Cart_coords(comm, *rank_source, ndims, coords);

  if ((direction < 0) || (direction >= ndims))
    CkAbort("MPI_Cart_shift: direction not within dimensions range");

  coords[direction] += disp;
  if (coords[direction] < 0)
    if (periods[direction] == 1) {
      coords[direction] += dims[direction];
      MPI_Cart_rank(comm, coords, rank_dest);
    }
    else
      *rank_dest = MPI_PROC_NULL;
  else
    MPI_Cart_rank(comm, coords, rank_dest);

  delete [] coords;
  return 0;
}

CDECL
int MPI_Graphdims_get(MPI_Comm comm, int *nnodes, int *nedges)
{
  AMPIAPI("MPI_Graphdim_get");

  /* The node count is stored directly; the edge count is read from the last
     entry of the stored index array. */
  ampiCommStruct &graph = getAmpiParent()->getGraph(comm);
  *nnodes = graph.getnvertices();
  const CkPupBasicVec<int> &storedIndex = graph.getindex();
  const int last = (*nnodes) - 1;
  *nedges = storedIndex[last];

  return 0;
}

CDECL
int MPI_Graph_get(MPI_Comm comm, int maxindex, int maxedges, int *index, 
		  int *edges) {
  AMPIAPI("MPI_Graph_get");

  /* Copies the stored index and edges arrays of the graph communicator into
     the caller's buffers, writing at most maxindex / maxedges entries and
     never more than are actually stored. */
  ampiCommStruct &c = getAmpiParent()->getGraph(comm);

  const CkPupBasicVec<int> &index_ = c.getindex();
  const CkPupBasicVec<int> &edges_ = c.getedges();

  if (maxindex > index_.size())
    maxindex = index_.size();

  /* Clamp the edge count the same way as the index count; without this a
     large maxedges would read past the end of the stored edge list. */
  if (maxedges > edges_.size())
    maxedges = edges_.size();

  for (int i = 0; i < maxindex; i++)
    index[i] = index_[i];

  for (int i = 0; i < maxedges; i++)
    edges[i] = edges_[i];

  return 0;
} 

CDECL
int MPI_Graph_neighbors_count(MPI_Comm comm, int rank, int *nneighbors)
{
  AMPIAPI("MPI_Graph_neighbors_count");

  ampiCommStruct &graph = getAmpiParent()->getGraph(comm);
  const CkPupBasicVec<int> &storedIndex = graph.getindex();

  if ((rank >= storedIndex.size()) || (rank < 0))
    CkAbort("MPI_Graph_neighbors_count: rank not within range");

  /* A node's neighbor count is the difference between its index entry and
     the previous node's entry; node 0 uses its own entry directly. */
  *nneighbors = (rank == 0) ? storedIndex[rank]
                            : storedIndex[rank] - storedIndex[rank - 1];

  return 0;
}

CDECL
int MPI_Graph_neighbors(MPI_Comm comm, int rank, int maxneighbors,
			int *neighbors) {
  AMPIAPI("MPI_Graph_neighbors");

  /* Copies up to maxneighbors neighbor ranks of node `rank` into neighbors[].
     A node's neighbors occupy edges[index[rank-1] .. index[rank]-1], with the
     range starting at 0 for node 0. */
  ampiCommStruct &c = getAmpiParent()->getGraph(comm);
  const CkPupBasicVec<int> &index = c.getindex();
  const CkPupBasicVec<int> &edges = c.getedges();

  /* Validate the arguments before index[] is touched; previously index[rank]
     was read before the range check. */
  if ((rank >= index.size()) || (rank < 0))
    CkAbort("MPI_Graph_neighbors: rank not within range");

  if (maxneighbors < 0)
    CkAbort("MPI_Graph_neighbors: maxneighbors < 0");

  const int first = (rank == 0) ? 0 : index[rank - 1];
  const int numneighbors = index[rank] - first;
  if (maxneighbors > numneighbors)
    maxneighbors = numneighbors;

  for (int i = 0; i < maxneighbors; i++)
    neighbors[i] = edges[first + i];

  return 0;
}

/* Factorization code by Orion. Idea thrashed out by Orion and Prakash */

/**
  Integer d'th root of n: computed as floor((n+epsilon)^(1/d)), intended to
  give the largest integer r with
        r^d <= n
*/
int integerRoot(int n,int d) {
        /* The small offset keeps an exact root (e.g. 27^(1/3)) from landing
           just below the integer because of floating-point roundoff. */
        const double epsilon=0.001;
        const double exponent=1.0/d;
        const double root=pow(n+epsilon,exponent);
        return (int)floor(root);
}

/**
  Split "n" into "d" factors written to "dims[0..d-1]", each factor being at
  least m. Returns true on success and false when no such factorization
  exists.

  Candidates for the first factor are tried from integerRoot(n,d) downwards
  and the remaining factors are required to be at least as large as the one
  just chosen, so the factors end up as close together as possible
  (technically: the increasing-size ordering is lexicographically as large
  as possible).
*/

bool factors(int n, int d, int *dims, int m) {
        if (d==1) {
                /* Base case: n itself is the only candidate. */
                if (n<m)
                        return false;
                dims[0]=n;
                return true;
        }

        /* Induction case: pick the first factor, recurse for the rest. */
        const int largest=integerRoot(n,d);
        for (int candidate=largest;candidate>=m;candidate--) {
                if (n%candidate!=0)
                        continue; /* does not divide n */
                dims[0]=candidate;
                if (factors(n/candidate,d-1,&dims[1],candidate))
                        return true;
        }

        /* No candidate led to a complete factorization. */
        return false;
}

CDECL
int MPI_Dims_create(int nnodes, int ndims, int *dims) {
  AMPIAPI("MPI_Dims_create");

  /* Fills in the zero entries of dims[0..ndims-1] so that the product of all
     entries equals nnodes. Non-zero entries are taken as fixed by the caller
     and must divide the remaining node count. */
  int n = nnodes;   /* node count still to be distributed */
  int d = ndims;    /* number of entries still to be chosen */

  for (int i = 0; i < ndims; i++) {
    if (dims[i] == 0)
      continue;
    /* A negative entry can never be part of a valid factorization. */
    if ((dims[i] < 0) || (n % dims[i] != 0))
      CkAbort("MPI_Dims_Create: Value in dimensions array infeasible!");
    n = n / dims[i];
    d--;
  }

  if (d == 0) {
    /* Every entry was fixed by the caller, so there is nothing to factorize.
       Calling factors() with d==0 would write into a zero-length array. */
    if (n != 1)
      CkAbort("MPI_Dims_Create: Value in dimensions array infeasible!");
    return 0;
  }

  int *pdims = new int[d];

  if (!factors(n, d, pdims, 1))
    CkAbort("MPI_Dims_Create: Factorization failed. Wonder why?");

  /* Hand the computed factors to the free slots in order.
     NOTE(review): factors() yields them in increasing order, whereas the MPI
     standard describes the filled-in values as non-increasing -- confirm
     whether the order should be reversed here. */
  int j = 0;
  for (int i = 0; i < ndims; i++)
    if (dims[i] == 0) {
      dims[i] = pdims[j];
      j++;
    }

  delete [] pdims;

  return 0;
}

/* Implemented with call to MPI_Comm_Split. Color and key are single integer
   encodings of the lost and preserved dimensions, respectively,
   of the subgraphs.
*/

CDECL
int MPI_Cart_sub(MPI_Comm comm, int *remain_dims, MPI_Comm *newcomm) {
  AMPIAPI("MPI_Cart_sub");

  /* Splits a cartesian communicator into sub-grids. Dimensions flagged in
     remain_dims[] are kept; the coordinates along the dropped dimensions are
     encoded into the split color and the coordinates along the kept
     dimensions into the split key. */
  int *coords, ndims, rank;
  /* Both encodings are accumulated with "x = x * dims[i] + coords[i]" and
     therefore have to start at 0; they were previously left uninitialized. */
  int color = 0;
  int key = 0;

  MPI_Comm_rank(comm, &rank);
  ampiCommStruct &c = getAmpiParent()->getCart(comm);
  ndims = c.getndims();
  const CkPupBasicVec<int> &dims = c.getdims();
  int num_remain_dims = 0;

  coords = new int [ndims];
  MPI_Cart_coords(comm, rank, ndims, coords);

  for (int i = 0; i < ndims; i++)
    if (remain_dims[i]) {
      /* key single integer encoding*/
      key = key * dims[i] + coords[i];
      num_remain_dims++;
    }
    else
      /* color */
      color = color * dims[i] + coords[i];

  getAmpiInstance(comm)->split(color, key, newcomm, CART_TOPOL);

  /* Describe the new communicator's topology: only the kept dimensions and
     their period flags are carried over. */
  ampiCommStruct &newc = getAmpiParent()->getCart(*newcomm);
  newc.setndims(num_remain_dims);
  CkPupBasicVec<int> dimsv;
  const CkPupBasicVec<int> &periods = c.getperiods();
  CkPupBasicVec<int> periodsv;

  for (int i = 0; i < ndims; i++)
    if (remain_dims[i]) {
      dimsv.push_back(dims[i]);
      periodsv.push_back(periods[i]);
    }

  newc.setdims(dimsv);
  newc.setperiods(periodsv);

  delete [] coords;
  return 0;
}

void _registerampif(void)
{
  /* Registration under this name simply forwards to _registerampi(). */
  _registerampi();
  return;
}

void MPI_Datatype_iscontig(MPI_Datatype datatype, int *flag)
{
  /* Stores the result of the datatype table's iscontig() query in *flag. */
  flag[0] = getDDT()->iscontig(datatype);
}

CDECL
int MPI_Type_get_envelope(MPI_Datatype datatype, int *ni, int *na, int *nd, int *combiner)
{
  AMPIAPI("MPI_Type_get_envelope");
  /* Thin wrapper: the datatype table's getEnvelope() fills the outputs and
     its result is returned as the status. */
  int status = getDDT()->getEnvelope(datatype,ni,na,nd,combiner);
  return status;
}

CDECL
int MPI_Type_get_contents(MPI_Datatype datatype, int ni, int na, int nd, int i[], MPI_Aint a[], MPI_Datatype d[])
{
  AMPIAPI("MPI_Type_get_contents");
  /* Thin wrapper: the datatype table's getContents() fills the output arrays
     and its result is returned as the status. */
  int status = getDDT()->getContents(datatype,ni,na,nd,i,a,d);
  return status;
}

#include "ampi.def.h"

