#include "charm++.h"
#include "envelope.h"
#include "register.h"

#include "ckmulticast.h"

// Debug trace macro: expands to nothing by default. Call sites pass a
// parenthesised printf-style argument list, e.g. DEBUGF(("x=%d\n", x)).
#define DEBUGF(x)  // CkPrintf x;

// Set to 1 to send multicast messages as fragments (packets) instead of as
// one whole message; selects the #if SPLIT_MULTICAST code paths below.
#define SPLIT_MULTICAST 0
// Number of fragments each multicast message is split into when
// SPLIT_MULTICAST is enabled.
#define SPLIT_NUM 2

// Maximum number of fragments into which a reduction contribution can be
// broken; sizes the per-fragment arrays in reductionInfo.
#define MAXFRAGS 5

// Container aliases used throughout this file.
typedef CkQ<multicastGrpMsg *>   multicastGrpMsgBuf;
typedef CkVec<CkArrayIndexMax>   arrayIndexList;
typedef CkVec<CkSectionInfo>     sectionIdList;
typedef CkVec<CkReductionMsg *>  reductionMsgs;
typedef CkQ<int>                 PieceSize;
typedef CkVec<LDObjid>          ObjKeyList;
typedef unsigned char            byte;

class reductionInfo {
public:
  int            lcount [MAXFRAGS]; /**< local elem collected */
  int            ccount [MAXFRAGS]; /**< children node collected */
  int            gcount [MAXFRAGS]; /**< total elem collected */
  int            npProcessed;
  CkCallback*    storedCallback;    /**< user callback */
  redClientFn    storedClient;      /**< reduction client function */
  void*          storedClientParam; /**< user provided data */
  int            redNo;             /**< reduction sequence number */
  reductionMsgs  msgs [MAXFRAGS];   /**< messages for this reduction */
  reductionMsgs  futureMsgs;        /**< messages of future reductions */
public:
  reductionInfo(): storedCallback(NULL), storedClientParam(NULL), redNo(0),
                   npProcessed(0) {
    for (int i=0; i<MAXFRAGS; i++) 
      lcount [i] = ccount [i] = gcount [i] = 0;
  }
};

/// cookie status values stored in mCastEntry::flag
#define COOKIE_NOTREADY 0   // initial state set by the mCastEntry constructors
#define COOKIE_READY    1   // set by mCastEntry::setReady()
#define COOKIE_OLD      2   // set by mCastEntry::setObsolete()

/// One buffered fragment of a packetized multicast message.
/// Instances are queued in mCastEntry::packetBuf by recvPacket() and
/// replayed by childrenReady().
class mCastPacket {
public:
  CkSectionInfo cookie;   ///< section cookie the fragment arrived with
  int n;                  ///< number of bytes in data
  char *data;             ///< fragment payload
  int seqno;              ///< index of this fragment
  int count;              ///< total number of fragments
  int totalsize;          ///< size of the complete message

  mCastPacket(CkSectionInfo &_cookie, int _n, char *_d, int _s, int _c, int _t)
    : cookie(_cookie),
      n(_n),
      data(_d),
      seqno(_s),
      count(_c),
      totalsize(_t)
  {
  }
};

typedef CkQ<mCastPacket *> multicastGrpPacketBuf;

/// Pairs an mCastEntry pointer with the processor it belongs to.
/// An empty location is (NULL, -1).
class SectionLocation {
public:
  mCastEntry *entry;
  int         pe;
public:
  SectionLocation(): entry(NULL), pe(-1) {}
  SectionLocation(mCastEntry *e, int p): entry(e), pe(p) {}

  /// Point this location at entry e on processor p.
  inline void set(mCastEntry *e, int p) {
    entry = e;
    pe = p;
  }

  /// Reset to the empty location.
  inline void clear() {
    entry = NULL;
    pe = -1;
  }
};

/// cookie for an array section: one node of the section's spanning tree
/// on this processor, plus multicast buffers and reduction state.
class mCastEntry {
public:
  CkArrayID     aid;		/**< array ID */
  CkSectionInfo parentGrp;	/**< spanning tree parent */
  sectionIdList children;       /**< children section list */
  int numChild;                 /**< number of children expected */
  arrayIndexList allElem;	// only useful on root
  ObjKeyList     allObjKeys;    // only useful on root for LB
  arrayIndexList localElem;     /**< section elements on this processor */
  int pe;			/**< should always be mype */
  CkSectionInfo rootSid;      /**< section ID of the root */
  multicastGrpMsgBuf msgBuf;    /**< multicasts buffered until ready */
  multicastGrpPacketBuf packetBuf;   /**< pending packets */
  char *asm_msg;		/**< for multicast packetization */
  int   asm_fill;               /**< bytes assembled into asm_msg so far */
  mCastEntry *oldc, *newc;    /**< link list of entries on same processor */
  SectionLocation   oldtree;    /**< old spanning tree */
  // for reduction
  reductionInfo red;
  char needRebuild;             /**< 0: no, 1: rebuild requested, 2: rebuild on next send */

private:
  char flag;                    /**< one of the COOKIE_* status values */
public:
  /// Create a fresh, not-yet-ready entry for array _aid.
  /// pe is initialised to the creating processor so that print() and
  /// rebuild() never read an indeterminate value before setup() runs.
  mCastEntry(CkArrayID _aid): aid(_aid), numChild(0), pe(CkMyPe()),
                    asm_msg(NULL), asm_fill(0),
                    oldc(NULL), newc(NULL),
                    needRebuild(0), flag(COOKIE_NOTREADY) {}
  /// Create a new entry carrying over section data from an old one
  /// (defined out of line).
  mCastEntry(mCastEntry *);
  /// Non-zero when the parent cookie holds a non-NULL entry pointer.
  inline int hasParent() { return parentGrp.get_val()?1:0; }
  inline int isObsolete() { return (flag == COOKIE_OLD); }
  inline void setObsolete() { flag=COOKIE_OLD; }
  inline int notReady() { return (flag == COOKIE_NOTREADY); }
  inline void setReady() { flag=COOKIE_READY; }
  /// Advance the reduction number of this entry and of every newer entry
  /// chained through newc.
  inline void incReduceNo() {
                red.redNo ++;
                for (mCastEntry *next = newc; next; next=next->newc) 
                   next->red.redNo++;
              }
  inline CkArrayID getAid() { return aid; }
  inline int hasOldtree() { return oldtree.entry != NULL; }
  /// Debug dump of the main fields.
  inline void print() {
    CmiPrintf("[%d] mCastEntry: %p, numChild: %d pe: %d flag: %d asm_msg:%p asm_fill:%d\n", CkMyPe(), this, numChild, pe, flag, asm_msg, asm_fill);
  }
};

/// Message that carries a single section cookie.
class cookieMsg: public CMessage_cookieMsg {
public:
  CkSectionInfo cookie;
public:
  cookieMsg() {}
  cookieMsg(CkSectionInfo m): cookie(m) {}
};


/// multicast tree setup message
/// Built in initCookie() and setup(); arrIdx and lastKnown are filled with
/// nIdx entries each.
class multicastSetupMsg: public CMessage_multicastSetupMsg {
public:
  int  nIdx;                  // number of entries in arrIdx / lastKnown
  CkArrayIndexMax *arrIdx;    // indices of the section elements covered
  int      *lastKnown;        // last known processor of each element
  CkSectionInfo parent;       // cookie of the sender's tree node
  CkSectionInfo rootSid;      // cookie of the section root
  int redNo;                  // reduction number to start from
};

/// message sent down the spanning tree; adds nothing to its two bases
class multicastGrpMsg: public CkMcastBaseMsg, public CMessage_multicastGrpMsg {
};

extern void CkPackMessage(envelope **pEnv);
extern void CkUnpackMessage(envelope **pEnv);

/// Initialization hook for this module. Currently does nothing: the
/// tracing-disable calls below are commented out.
void _ckMulticastInit(void)
{
/*
  CkDisableTracing(CkIndex_CkMulticastMgr::recvMsg(0));
  CkDisableTracing(CkIndex_CkMulticastMgr::recvRedMsg(0));
*/
}

/// Build a replacement entry from an existing one.
/// Copies the array id, parent cookie, full element list (and LB object
/// keys when CMK_LBDB_ON), pe, reduction client settings and reduction
/// number. The tree links, assembly buffer and status start out fresh.
mCastEntry::mCastEntry (mCastEntry *old):
  numChild(0),
  asm_msg(NULL), asm_fill(0),
  oldc(NULL), newc(NULL),
  needRebuild(0), flag(COOKIE_NOTREADY)
{
  aid       = old->aid;
  parentGrp = old->parentGrp;
  pe        = old->pe;

  // section membership
  for (int e=0; e<old->allElem.length(); e++)
    allElem.push_back(old->allElem[e]);
#if CMK_LBDB_ON
  CmiAssert(old->allElem.length() == old->allObjKeys.length());
  for (int k=0; k<old->allObjKeys.length(); k++)
    allObjKeys.push_back(old->allObjKeys[k]);
#endif

  // reduction client and sequence number carry over
  reductionInfo &from = old->red;
  red.storedCallback    = from.storedCallback;
  red.storedClient      = from.storedClient;
  red.storedClientParam = from.storedClientParam;
  red.redNo             = from.redNo;
}

extern LDObjid idx2LDObjid(const CkArrayIndex &idx);    // cklocation.C

// Register a section given as an explicit index list: allocate its
// mCastEntry, store it in _id and start building the spanning tree.
void CkMulticastMgr::setSection(CkSectionInfo &_id, CkArrayID aid, CkArrayIndexMax *al, int n)
{
  mCastEntry *sect = new mCastEntry(aid);

  for (int k=0; k<n; ++k) {
    const CkArrayIndexMax &elem = al[k];
    sect->allElem.push_back(elem);
#if CMK_LBDB_ON
    const LDObjid objKey = idx2LDObjid(elem);
    sect->allObjKeys.push_back(objKey);
#endif
  }

  // the cookie now refers to the freshly allocated entry
  _id.aid = aid;
  _id.get_val() = sect;
  initCookie(_id);
}

// Start the spanning tree build for a cookie that already holds an entry;
// simply forwards to initCookie().
void CkMulticastMgr::setSection(CkSectionInfo &id)
{
  initCookie(id);
}

// Deprecated: register the section described by a section proxy.
// Fills the proxy's own section info and starts the tree build.
void CkMulticastMgr::setSection(CProxySection_ArrayElement &proxy)
{
  CkArrayID aid = proxy.ckGetArrayID();
  CkSectionInfo &cookie = proxy.ckGetSectionInfo();

  mCastEntry *sect = new mCastEntry(aid);

  const CkArrayIndexMax *elems = proxy.ckGetArrayElements();
  for (int k=0; k<proxy.ckGetNumElements(); ++k) {
    const CkArrayIndexMax &elem = elems[k];
    sect->allElem.push_back(elem);
#if CMK_LBDB_ON
    const LDObjid objKey = idx2LDObjid(elem);
    sect->allObjKeys.push_back(objKey);
#endif
  }

  cookie.type = MulticastMsg;
  cookie.aid = aid;
  cookie.get_val() = sect;
  initCookie(cookie);
}

// Recreate a section after its root has migrated to this processor.
// Nothing is done when the cookie already belongs to this processor.
void CkMulticastMgr::resetSection(CProxySection_ArrayElement &proxy)
{
  CkSectionInfo &info = proxy.ckGetSectionInfo();

  const int oldpe = info.get_pe();
  if (oldpe == CkMyPe()) return;	// we don't have to recreate one

  // remember the entry on the old processor before the cookie is overwritten
  mCastEntry *oldentry = (mCastEntry *)info.get_val();

  CkArrayID aid = proxy.ckGetArrayID();
  CkSectionID &sid = proxy.ckGetSectionID();
  mCastEntry *entry = new mCastEntry(aid);
  DEBUGF(("[%d] resetSection: old entry:%p new entry:%p\n", CkMyPe(), oldentry, entry));

  CmiAssert(info.aid == aid);
  prepareCookie(entry, sid, sid._elems, sid._nElems, aid);

  // store old tree info
  entry->oldtree.set(oldentry, oldpe);

  CProxy_CkMulticastMgr  mCastGrp(thisgroup);

  // obsolete old tree
  mCastGrp[oldpe].retire(CkSectionInfo(oldpe, oldentry, 0, entry->getAid()), info);

  // ask the old root for the current reduction number; the reply
  // (recvCookieInfo) continues the tree build
  mCastGrp[oldpe].retrieveCookie(CkSectionInfo(oldpe, oldentry, 0, aid), info);
}

// Fill entry with the section's element list and make sid's cookie refer
// to it as a multicast cookie owned by this processor.
void CkMulticastMgr::prepareCookie(mCastEntry *entry, CkSectionID &sid, const CkArrayIndexMax *al, int count, CkArrayID aid)
{
  for (int k=0; k<count; ++k) {
    const CkArrayIndexMax &elem = al[k];
    entry->allElem.push_back(elem);
#if CMK_LBDB_ON
    const LDObjid objKey = idx2LDObjid(elem);
    entry->allObjKeys.push_back(objKey);
#endif
  }

  CkSectionInfo &cookie = sid._cookie;
  cookie.type = MulticastMsg;
  cookie.aid = aid;
  cookie.get_val() = entry;
  cookie.get_pe() = CkMyPe();
}

// Set up a section proxy for this manager: create the section's entry,
// record it in the proxy's cookie and start building the spanning tree.
void CkMulticastMgr::initDelegateMgr(CProxy *cproxy)
{
  CProxySection_ArrayBase *sectProxy = (CProxySection_ArrayBase *)cproxy;
  CkArrayID aid = sectProxy->ckGetArrayID();
  CkSectionID &sid = sectProxy->ckGetSectionID();

  mCastEntry *sect = new mCastEntry(aid);
  const CkArrayIndexMax *elems = sectProxy->ckGetArrayElements();
  const int nElems = sectProxy->ckGetNumElements();

  prepareCookie(sect, sid, elems, nElems, aid);
  initCookie(sid._cookie);
}

// Reply to the processor named in srcInfo with the current reduction
// number of the entry referenced by s.
void CkMulticastMgr::retrieveCookie(CkSectionInfo s, CkSectionInfo srcInfo)
{
  mCastEntry *sect = (mCastEntry *)s.get_val();
  const int requester = srcInfo.get_pe();

  CProxy_CkMulticastMgr  mCastGrp(thisgroup);
  mCastGrp[requester].recvCookieInfo(srcInfo, sect->red.redNo);
}

// The old cookie has reported its reduction number: adopt it for the
// entry referenced by s and continue building the spanning tree.
void CkMulticastMgr::recvCookieInfo(CkSectionInfo s, int red)
{
  mCastEntry *sect = (mCastEntry *)s.get_val();
  sect->red.redNo = red;

  initCookie(s);

  // TODO delete old tree
}

// Start building the spanning tree for the section referenced by s:
// pack every element index with its last known processor into a setup
// message and send it to this processor's own setup().
void CkMulticastMgr::initCookie(CkSectionInfo s)
{
  mCastEntry *sect = (mCastEntry *)s.get_val();
  int nElems = sect->allElem.length();
  DEBUGF(("init: %d elems %p\n", nElems, s.get_val()));

  multicastSetupMsg *setupMsg = new (nElems, nElems, 0) multicastSetupMsg;
  setupMsg->nIdx    = nElems;
  setupMsg->parent  = CkSectionInfo(sect->getAid());
  setupMsg->rootSid = s;
  setupMsg->redNo   = sect->red.redNo;

  CkArray *arrMgr = CProxy_ArrayBase(s.aid).ckLocalBranch();
  for (int k=0; k<nElems; k++) {
    setupMsg->arrIdx[k] = sect->allElem[k];
    int home = arrMgr->lastKnown(sect->allElem[k]);
    CmiAssert(home >=0 && home < CkNumPes());
    setupMsg->lastKnown[k] = home;
  }

  CProxy_CkMulticastMgr  mCastGrp(thisgroup);
  mCastGrp[CkMyPe()].setup(setupMsg);
}

// Mark the entry obsolete, release its buffered reduction messages and
// propagate the teardown to every child in the spanning tree.
void CkMulticastMgr::teardown(CkSectionInfo cookie)
{
  mCastEntry *sect = (mCastEntry *)cookie.get_val();

  sect->setObsolete();
  releaseBufferedReduceMsgs(sect);

  CProxy_CkMulticastMgr mp(thisgroup);
  for (int k=0; k<sect->children.length(); k++) {
    CkSectionInfo &kid = sect->children[k];
    mp[kid.get_pe()].teardown(kid);
  }
}

// Like teardown(), but first redirect the entry's root cookie to newroot.
// The children are torn down with teardown().
void CkMulticastMgr::retire(CkSectionInfo cookie, CkSectionInfo newroot)
{
  mCastEntry *sect = (mCastEntry *)cookie.get_val();

  sect->rootSid = newroot;
  sect->setObsolete();
  releaseBufferedReduceMsgs(sect);

  CProxy_CkMulticastMgr mp(thisgroup);
  for (int k=0; k<sect->children.length(); k++) {
    CkSectionInfo &kid = sect->children[k];
    mp[kid.get_pe()].teardown(kid);
  }
}

// Delete the entry referenced by cookie and every older entry chained
// through oldc, asking the children of each one to free themselves too.
void CkMulticastMgr::freeup(CkSectionInfo cookie)
{
  CProxy_CkMulticastMgr mp(thisgroup);

  mCastEntry *older = NULL;
  for (mCastEntry *sect = (mCastEntry *)cookie.get_val(); sect; sect = older) {
    for (int k=0; k<sect->children.length(); k++) {
      CkSectionInfo &kid = sect->children[k];
      mp[kid.get_pe()].freeup(kid);
    }
    // free cookie itself
    DEBUGF(("[%d] Free up on %p\n", CkMyPe(), sect));
    older = sect->oldc;   // fetch the link before the entry goes away
    delete sect;
  }
}

// Build this processor's node of the section spanning tree.
//
// msg lists the elements this subtree covers and each element's last known
// processor. Elements on this processor become local elements; the others
// are grouped by processor, dealt round-robin into at most `factor` child
// subtrees (all of them when factor <= 0), and each subtree's first
// processor receives its own setup message. With no children the entry is
// ready immediately.
//
// The setup message is deleted before returning; it used to be leaked.
void CkMulticastMgr::setup(multicastSetupMsg *msg)
{
  int i,j;
  mCastEntry *entry;
  CkArrayID aid = msg->rootSid.aid;
  // when the sender's cookie is on this processor, reuse the entry stored
  // in the root cookie; otherwise this is a new tree node
  if (msg->parent.get_pe() == CkMyPe()) 
	entry = (mCastEntry *)msg->rootSid.get_val(); //sid.val;
  else 
	entry = new mCastEntry(aid);
  entry->aid = aid;
  entry->pe = CkMyPe();
  entry->rootSid = msg->rootSid;
  entry->parentGrp = msg->parent;
  DEBUGF(("[%d] setup: %p redNo: %d => %d with %d elems\n", CkMyPe(), entry, entry->red.redNo, msg->redNo, msg->nIdx));
  entry->red.redNo = msg->redNo;

  // bucket the elements by last known processor
  int numpes = CkNumPes();
  arrayIndexPosList *lists = new arrayIndexPosList[numpes];
  for (i=0; i<msg->nIdx; i++) {
    // msg->arrIdx[i] is local ?
    int lastKnown = msg->lastKnown[i];
    if (lastKnown == CkMyPe()) {
      entry->localElem.insertAtEnd(msg->arrIdx[i]);
    }
    else {
      lists[lastKnown].push_back(IndexPos(msg->arrIdx[i], lastKnown));
    }
  }
  // count the remote processors involved and cap the fan-out at `factor`
  int numchild = 0;
  int num = 0;
  for (i=0; i<numpes; i++) {
    if (i==CkMyPe()) continue;
    if (lists[i].length()) num++;
  }
  if (factor <= 0) numchild = num;
  else numchild = num<factor?num:factor;

  entry->numChild = numchild;

  if (numchild) {
    // deal the per-processor buckets round-robin into numchild slots
    arrayIndexPosList *slots = new arrayIndexPosList[numchild];
    num = 0;
    for (i=0; i<numpes; i++) {
      if (i==CkMyPe()) continue;
      if (lists[i].length() == 0) continue;
      for (j=0; j<lists[i].length(); j++)
	slots[num].push_back(lists[i][j]);
      num = (num+1) % numchild;
    }

    // send messages
    CProxy_CkMulticastMgr  mCastGrp(thisgroup);
    for (i=0; i<numchild; i++) {
      int n = slots[i].length();
      multicastSetupMsg *m = new (n, n, 0) multicastSetupMsg;
      m->parent = CkSectionInfo(aid, entry);
      m->nIdx = slots[i].length();
      m->rootSid = msg->rootSid;
      m->redNo = msg->redNo;
      for (j=0; j<slots[i].length(); j++) {
        m->arrIdx[j] = slots[i][j].idx;
        m->lastKnown[j] = slots[i][j].pe;
      }
      // the processor of the slot's first element roots the child subtree
      int childroot = slots[i][0].pe;
      DEBUGF(("[%d] call set up %d numelem:%d\n", CkMyPe(), childroot, n));
      mCastGrp[childroot].setup(m);
    }
    delete [] slots;
  }
  else {
    childrenReady(entry);
  }
  delete [] lists;
  // the receiving entry method owns the message
  delete msg;
}

// Called once every child of entry has reported in (or there are none):
// mark the entry ready, report to the parent if there is one, then replay
// everything that was buffered while the tree was being built.
void CkMulticastMgr::childrenReady(mCastEntry *entry)
{
  entry->setReady();

  CProxy_CkMulticastMgr  mCastGrp(thisgroup);
  DEBUGF(("[%d] entry %p childrenReady with %d elems.\n", CkMyPe(), entry, entry->allElem.length()));

  if (entry->hasParent()) {
    CkSectionInfo &parent = entry->parentGrp;
    mCastGrp[parent.get_pe()].recvCookie(parent, CkSectionInfo(entry->getAid(), entry));
  }

#if SPLIT_MULTICAST
  // clear packet buffer
  for (; !entry->packetBuf.isEmpty(); ) {
    mCastPacket *pkt = entry->packetBuf.deq();
    pkt->cookie.get_val() = entry;
    mCastGrp[CkMyPe()].recvPacket(pkt->cookie, pkt->n, pkt->data, pkt->seqno, pkt->count, pkt->totalsize, 1);
    delete [] pkt->data;
    delete pkt;
  }
#else
  // clear msg buffer
  for (; !entry->msgBuf.isEmpty(); ) {
    multicastGrpMsg *pending = entry->msgBuf.deq();
    DEBUGF(("[%d] release buffer %p %d\n", CkMyPe(), pending, pending->ep));
    pending->_cookie.get_val() = entry;
    mCastGrp[CkMyPe()].recvMsg(pending);
  }
#endif

  // release reduction msgs
  releaseFutureReduceMsgs(entry);
}

// A child subtree is ready: record its cookie, and once all expected
// children have reported, declare this entry ready as well.
void CkMulticastMgr::recvCookie(CkSectionInfo sid, CkSectionInfo child)
{
  mCastEntry *parent = (mCastEntry *)sid.get_val();
  parent->children.push_back(child);

  if (parent->children.length() != parent->numChild) return;
  childrenReady(parent);
}

// Rebuild the spanning tree when the root has not migrated.
// A new entry is chained after the newest existing one, sectId is pointed
// at it, the previous entry is marked obsolete and the tree is rebuilt.
// While rebuilding, all multicast msgs will be buffered.
void CkMulticastMgr::rebuild(CkSectionInfo &sectId)
{
  mCastEntry *latest = (mCastEntry*)sectId.get_val();
  CkAssert(latest->pe == CkMyPe());

  // walk to the newest entry in the chain
  for (; latest->newc; latest = latest->newc) {}
  if (latest->isObsolete()) return;

  //CmiPrintf("tree rebuild\n");
  mCastEntry *fresh = new mCastEntry(latest);  // allocate table for this section

  // build a chain
  fresh->oldc = latest;
  latest->newc = fresh;

  sectId.get_val() = fresh;

  DEBUGF(("rebuild: redNo:%d oldc:%p newc;%p\n", fresh->red.redNo, latest, fresh));

  latest->setObsolete();

  resetCookie(sectId);
}

// Tear down the spanning tree of the entry preceding s (its oldc) and
// start building a new tree for s.
void CkMulticastMgr::resetCookie(CkSectionInfo s)
{
  mCastEntry *current = (mCastEntry*)s.get_val();
  mCastEntry *stale = current->oldc;

  // get rid of old one
  DEBUGF(("reset: oldc: %p\n", stale));
  const int mype = CkMyPe();
  CProxy_CkMulticastMgr  mCastGrp(thisgroup);
  mCastGrp[mype].teardown(CkSectionInfo(mype, stale, 0, stale->getAid()));

  // build a new one
  initCookie(s);
}

void CkMulticastMgr::SimpleSend(int ep,void *m, CkArrayID a, CkSectionID &sid, int opts)
{
  DEBUGF(("[%d] SimpleSend: nElems:%d\n", CkMyPe(), sid._nElems));
    // set an invalid cookie since we don't have it
  ((multicastGrpMsg *)m)->_cookie = CkSectionInfo(-1, NULL, 0, a);
  for (int i=0; i< sid._nElems-1; i++) {
     CProxyElement_ArrayBase ap(a, sid._elems[i]);
     void *newMsg=CkCopyMsg((void **)&m);
     ap.ckSend((CkArrayMessage *)newMsg,ep,opts|CK_MSG_LB_NOTRACE);
  }
  if (sid._nElems > 0) {
     CProxyElement_ArrayBase ap(a, sid._elems[sid._nElems-1]);
     ap.ckSend((CkArrayMessage *)m,ep,opts|CK_MSG_LB_NOTRACE);
  }
}

// Multicast message m to entry point ep of every element of the section.
//
// The message is stamped with ep and the section cookie and handed to the
// processor named in the cookie (recvMsg, or recvPacket when
// SPLIT_MULTICAST is on). When the cookie is on this processor the entry
// pointer is first advanced to the newest entry in its chain, and the
// needRebuild state is honoured: 1 -> one SimpleSend round, then state 2;
// 2 -> rebuild the tree before sending. pd is not used in this function.
void CkMulticastMgr::ArraySectionSend(CkDelegateData *pd,int ep,void *m, CkArrayID a, CkSectionID &sid, int opts)
{
  DEBUGF(("ArraySectionSend\n"));

  multicastGrpMsg *msg = (multicastGrpMsg *)m;
//  msg->aid = a;
  msg->ep = ep;

  CkSectionInfo &s = sid._cookie;
  CmiAssert(a == s.aid);

  mCastEntry *entry;
  if (s.get_pe() == CkMyPe()) {
    entry = (mCastEntry *)s.get_val();   
    if (entry == NULL) {
      CmiAbort("Unknown array section, Did you forget to register the array section to CkMulticastMgr using setSection()?");
    }

    // update entry pointer in case there is a newer one.
    if (entry->newc) {
      do { entry=entry->newc; } while (entry->newc);
      s.get_val() = entry;
    }

#if CMK_LBDB_ON
    // report the multicast to the load balancing database
    // fixme: running obj?
    envelope *env = UsrToEnv(msg);
    const LDOMHandle &om = CProxy_ArrayBase(a).ckLocMgr()->getOMHandle();
    LBDatabaseObj()->MulticastSend(om,entry->allObjKeys.getVec(),entry->allObjKeys.size(),env->getTotalsize());
#endif

    // first time need to rebuild, we do simple send to refresh lastKnown
    if (entry->needRebuild == 1) {
      msg->_cookie = s;
      SimpleSend(ep, msg, a, sid, opts);
      entry->needRebuild = 2;
      return;
    }
    else if (entry->needRebuild == 2) rebuild(s);
  }
  else {
    // fixme - in this case, not recorded in LB
    CmiPrintf("Warning: Multicast not optimized after multicast root migrated. \n");
  }

  // don't need packing here
/*
  register envelope *env = UsrToEnv(m);
  CkPackMessage(&env);
  m = EnvToUsr(env);
*/

  // update cookie (rebuild() above may have changed s)
  msg->_cookie = s;

#if SPLIT_MULTICAST
  // split multicast msg into SPLIT_NUM copies
  register envelope *env = UsrToEnv(m);
  CkPackMessage(&env);
  int totalsize = env->getTotalsize();
  // fragment size rounded up so SPLIT_NUM fragments cover the message
  int packetSize = totalsize/SPLIT_NUM;
  if (totalsize%SPLIT_NUM) packetSize ++;
  int totalcount = SPLIT_NUM;
  CProxy_CkMulticastMgr  mCastGrp(thisgroup);
  int sizesofar = 0;
  char *data = (char*) env;
  for (int i=0; i<totalcount; i++) {
    // the last fragment may be shorter than packetSize
    int mysize = packetSize;
    if (mysize + sizesofar > totalsize) {
      mysize = totalsize-sizesofar;
    }
    //CmiPrintf("[%d] send to %d : mysize: %d total: %d \n", CkMyPe(), s.get_pe(), mysize, totalsize);
    mCastGrp[s.get_pe()].recvPacket(s, mysize, data, i, totalcount, totalsize, 0);
    sizesofar += mysize;
    data += mysize;
  }
  CmiFree(env);
#else
  // whole-message path: call recvMsg directly when the cookie is on this
  // processor, otherwise send to the processor named in the cookie
  if (s.get_pe() == CkMyPe()) {
    recvMsg(msg);
  }
  else {
    CProxy_CkMulticastMgr  mCastGrp(thisgroup);
    mCastGrp[s.get_pe()].recvMsg(msg);
  }
#endif
}

// Receive one fragment of a packetized multicast message.
//
//   _cookie    cookie of the tree node the fragment is addressed to
//   n, data    fragment length in bytes and payload
//   seqno      index of this fragment; count is the number of fragments
//   totalsize  size of the reassembled message
//   fromBuffer non-zero when replayed from packetBuf by childrenReady()
//
// Fragments arriving while the entry is not ready (or while earlier
// fragments are still buffered) are copied and queued. Otherwise the
// fragment is forwarded to every child and appended to the assembly
// buffer; after the last fragment the message is unpacked and passed to
// recvMsg().
void CkMulticastMgr::recvPacket(CkSectionInfo &_cookie, int n, char *data, int seqno, int count, int totalsize, int fromBuffer)
{
  int i;
  mCastEntry *entry = (mCastEntry *)_cookie.get_val();


  if (!fromBuffer && (entry->notReady() || !entry->packetBuf.isEmpty())) {
    // keep a private copy of the payload for the buffered packet
    char *newdata = new char[n];
    memcpy(newdata, data, n);
    entry->packetBuf.enq(new mCastPacket(_cookie, n, newdata, seqno, count, totalsize));
//CmiPrintf("[%d] Buffered recvPacket: seqno: %d %d frombuf:%d empty:%d entry:%p\n", CkMyPe(), seqno, count, fromBuffer, entry->packetBuf.isEmpty(),entry);
    return;
  }

//CmiPrintf("[%d] recvPacket ready: seqno: %d %d buffer: %d entry:%p\n", CkMyPe(), seqno, count, fromBuffer, entry);

  // send to spanning tree children
  // can not optimize using list send because the difference in cookie
  CProxy_CkMulticastMgr  mCastGrp(thisgroup);
  for (i=0; i<entry->children.length(); i++) {
    mCastGrp[entry->children[i].get_pe()].recvPacket(entry->children[i], n, data, seqno, count, totalsize, 0);
  }

  if (seqno == 0) {
    // first fragment: no assembly may be in progress
    if (entry->asm_msg != NULL || entry->asm_fill != 0) {
      entry->print();
      CmiAssert(entry->asm_msg == NULL && entry->asm_fill==0);
    }
    entry->asm_msg = (char *)CmiAlloc(totalsize);
  }
  // append this fragment to the assembly buffer
  memcpy(entry->asm_msg+entry->asm_fill, data, n);
  entry->asm_fill += n;
  if (seqno + 1 == count) {
    // last fragment: the message is complete
    CmiAssert(entry->asm_fill == totalsize);
    CkUnpackMessage((envelope **)&entry->asm_msg);
    multicastGrpMsg *msg = (multicastGrpMsg *)EnvToUsr((envelope*)entry->asm_msg);
    msg->_cookie = _cookie;
//    mCastGrp[CkMyPe()].recvMsg(msg);
    recvMsg(msg);
    // reset the assembly state for the next message
    entry->asm_msg = NULL;
    entry->asm_fill = 0;
  }
//  if (fromBuffer) delete [] data;
}

// Deliver a multicast message at this tree node.
//
// Without SPLIT_MULTICAST the message is buffered while the entry is not
// ready; otherwise a copy goes to each child with that child's cookie.
// The message is then delivered to every local element: all but the last
// get an inline delivery (noKeep entry methods) or a copy through the
// scheduler, and the last one gets the original message. With no local
// elements the message is deleted.
void CkMulticastMgr::recvMsg(multicastGrpMsg *msg)
{
  int i;
  CkSectionInfo &sectionInfo = msg->_cookie;
  mCastEntry *entry = (mCastEntry *)msg->_cookie.get_val();
  CmiAssert(entry->getAid() == sectionInfo.aid);

#if ! SPLIT_MULTICAST
  if (entry->notReady()) {
    DEBUGF(("entry not ready, enq buffer %p\n", msg));
    entry->msgBuf.enq(msg);
    return;
  }

  // send to spanning tree children
  // can not optimize using list send because the difference in cookie
  CProxy_CkMulticastMgr  mCastGrp(thisgroup);
  for (i=0; i<entry->children.length(); i++) {
    multicastGrpMsg *newmsg = (multicastGrpMsg *)CkCopyMsg((void **)&msg);
    newmsg->_cookie = entry->children[i];
    mCastGrp[entry->children[i].get_pe()].recvMsg(newmsg);
  }
#endif

  // send to local
  int nLocal = entry->localElem.length();
  DEBUGF(("send to local %d\n", nLocal));
  // every local element except the last one
  for (i=0; i<nLocal-1; i++) {
    CProxyElement_ArrayBase ap(sectionInfo.aid, entry->localElem[i]);
    if (_entryTable[msg->ep]->noKeep) {
      // inline delivery; CK_MSG_KEEP so msg can be reused for the next element
      CkSendMsgArrayInline(msg->ep, msg, sectionInfo.aid, entry->localElem[i], CK_MSG_KEEP);
    }
    else {
      // send through scheduler queue
      multicastGrpMsg *newm = (multicastGrpMsg *)CkCopyMsg((void **)&msg);
      ap.ckSend((CkArrayMessage *)newm, msg->ep, CK_MSG_LB_NOTRACE);
    }
    // use CK_MSG_DONTFREE so that the message can be reused
    // the drawback of this scheme bypassing queue is that 
    // if # of local element is huge, this leads to a long time occupying CPU
    // also load balancer seems not be able to correctly instrument load
//    CkSendMsgArrayInline(msg->ep, msg, msg->aid, entry->localElem[i], CK_MSG_KEEP);
    //CmiNetworkProgressAfter(3);
  }
  if (nLocal) {
    // the last local element receives the original message
    CProxyElement_ArrayBase ap(sectionInfo.aid, entry->localElem[nLocal-1]);
    ap.ckSend((CkArrayMessage *)msg, msg->ep, CK_MSG_LB_NOTRACE);
//    CkSendMsgArrayInline(msg->ep, msg, msg->aid, entry->localElem[nLocal-1]);
  }
  else {
    // no local element: asserted to happen only on the root's processor
    CkAssert (entry->rootSid.get_pe() == CkMyPe());
    delete msg;
  }
}

// User function: extract the section cookie carried by a multicast
// message into id. Aborts when msg fails the CkMcastBaseMsg magic check.
// A message carrying the invalid cookie set by SimpleSend (pe == -1)
// leaves id untouched.
void CkGetSectionInfo(CkSectionInfo &id, void *msg)
{
  CkMcastBaseMsg *mcast = (CkMcastBaseMsg *)msg;
  if (CkMcastBaseMsg::checkMagic(mcast) == 0) {
    CmiPrintf("ERROR: This is not a CkMulticast message!\n");
    CmiAbort("Did you remember to do CkMulticast delegation, and inherit multicast message from CkMcastBaseMsg in correct order?");
  }

  // ignore invalid cookie sent by SimpleSend
  if (mcast->gpe() == -1) return;

  id.type = MulticastMsg;
  id.get_pe() = mcast->gpe();
  id.get_val() = mcast->cookie();
  // note: retain old redNo
}

// Reduction

// Register a callback as the reduction client of the proxy's section.
// Only the pointer is stored.
void CkMulticastMgr::setReductionClient(CProxySection_ArrayElement &proxy, CkCallback *cb)
{
  mCastEntry *sect = (mCastEntry *)proxy.ckGetSectionInfo().get_val();
  sect->red.storedCallback = cb;
}

// Register a client function, with its user parameter, as the reduction
// client of the proxy's section.
void CkMulticastMgr::setReductionClient(CProxySection_ArrayElement &proxy, redClientFn fn,void *param)
{
  mCastEntry *sect = (mCastEntry *)proxy.ckGetSectionInfo().get_val();
  reductionInfo &red = sect->red;
  red.storedClient = fn;
  red.storedClientParam = param;
}

// Build a reduction message for one array element's contribution.
// rebuilt is set when the cookie's processor is not this processor.
inline CkReductionMsg *CkMulticastMgr::buildContributeMsg(int dataSize,void *data,CkReduction::reducerType type, CkSectionInfo &id, CkCallback &cb)
{
  CkReductionMsg *contrib = CkReductionMsg::buildNew(dataSize, data);

  contrib->reducer    = type;
  contrib->sid        = id;
  contrib->sourceFlag = 1;   // from array element
  contrib->redNo      = id.get_redNo();
  contrib->gcount     = 1;
  contrib->rebuilt    = (id.get_pe() == CkMyPe())?0:1;
  contrib->callback   = cb;

  return contrib;
}

void CkMulticastMgr::contribute(int dataSize,void *data,CkReduction::reducerType type, CkSectionInfo &id, int fragSize)
{
  CkCallback cb;
  contribute(dataSize, data, type, id, cb, fragSize);
}

// Contribute an array element's data to a section reduction.
//
//   dataSize, data  the contribution
//   type            reducer to apply
//   id              the element's section cookie; its reduction number is
//                   incremented on return
//   cb              callback carried in the message
//   fragSize        -1 sends the data as one message; otherwise the data
//                   is cut into fragments of fragSize bytes (the last may
//                   be shorter), at most MAXFRAGS of them
//
// Aborts when id is uninitialized, when fragSize is zero or negative
// (other than -1), or when more than MAXFRAGS fragments would be needed.
void CkMulticastMgr::contribute(int dataSize,void *data,CkReduction::reducerType type, CkSectionInfo &id, CkCallback &cb, int fragSize)
{
  if (id.get_val() == NULL || id.get_redNo() == -1) 
    CmiAbort("contribute: SectionID is not initialized\n");

  int nFrags;
  if (-1 == fragSize) {		// no frag
    nFrags = 1;
    fragSize = dataSize;
  }
  else {
    // a zero fragSize would divide by zero below; a negative one yields a
    // non-positive fragment count, so nothing would be sent while redNo
    // still advances
    if (fragSize <= 0)
      CmiAbort("contribute: fragSize must be positive, or -1 for no fragmentation\n");
    CmiAssert (dataSize >= fragSize);
    nFrags = dataSize/fragSize;
    if (dataSize%fragSize) nFrags++;
  }

  if (MAXFRAGS < nFrags) {
    CmiPrintf ("Recompile CkMulticast library for fragmenting msgs into more than %d fragments\n", MAXFRAGS);
    CmiAbort ("frag size too small\n");
  }

  int mpe = id.get_pe();
  CProxy_CkMulticastMgr  mCastGrp(thisgroup);

  // break the message into k-piece fragments
  int fSize = fragSize;
  for (int i=0; i<nFrags; i++) {
    // the last fragment carries the remainder when the sizes don't divide evenly
    if ((0 != i) && ((nFrags-1) == i) && (0 != dataSize%fragSize)) {
      fSize = dataSize%fragSize;
    }

    CkReductionMsg *msg = CkReductionMsg::buildNew(fSize, data);

    // initialize the new msg
    msg->reducer            = type;
    msg->sid                = id;
    msg->nFrags             = nFrags;
    msg->fragNo             = i;
    msg->sourceFlag         = 1;
    msg->redNo              = id.get_redNo();
    msg->gcount             = 1;
    msg->rebuilt            = (mpe == CkMyPe())?0:1;
    msg->callback           = cb;

    mCastGrp[mpe].recvRedMsg(msg);

    // advance to the next fragment's data
    data = (void*)(((char*)data) + fSize);
  }

  id.get_redNo()++;
  DEBUGF(("[%d] val: %d %p\n", CkMyPe(), id.get_pe(), id.get_val()));
}

// Join the reduced fragments held in redInfo.msgs[0..nFrags-1] (slot 0 of
// each list) into one reduction message and empty those lists.
// With a single fragment the stored message itself is returned.
CkReductionMsg* CkMulticastMgr::combineFrags (CkSectionInfo& id, 
                                              mCastEntry* entry,
                                              reductionInfo& redInfo) {
  CkReductionMsg* first = redInfo.msgs[0][0];
  int nFrags = first->nFrags;

  // to avoid memcpy and allocation cost for non-pipelined reductions
  if (1 == nFrags) {
    // free up the msg slot
    redInfo.msgs[0].length() = 0;
    return first;
  }

  // total payload over all fragments
  int totalSize = 0;
  for (int f=0; f<nFrags; f++) {
    totalSize += redInfo.msgs[f][0]->dataSize;
  }

  CkReductionMsg *combined = CkReductionMsg::buildNew(totalSize, NULL);

  // initialize msg header from the first fragment
  combined->redNo      = first->redNo;
  combined->reducer    = first->reducer;
  combined->sid        = id;
  combined->nFrags     = nFrags;

  // I guess following fields need not be initialized
  combined->sourceFlag = 2;
  combined->rebuilt    = first->rebuilt;
  combined->callback   = first->callback;

  // copy the fragments back to back, freeing each as we go
  byte* out = (byte*)combined->getData ();
  for (int f=0; f<nFrags; f++) {
    CkReductionMsg* piece = redInfo.msgs[f][0];
    memcpy(out, piece->getData(), piece->dataSize);
    out += piece->dataSize;

    delete piece;
    redInfo.msgs[f].length() = 0;
  }

  return combined;
}

// Reduce the messages collected for fragment `index` of the current
// reduction and pass the result on.
//
// The messages in redInfo.msgs[index] are combined with the registered
// reducer function and then deleted. A non-root entry sends the result to
// its parent. The root buffers it, and once all fragments have been
// processed combines them and delivers the result to the message's
// callback, the stored callback, or the stored client function (in that
// order), aborting if none is registered.
//
//   updateReduceNo  when non-zero, updateRedNo() is invoked on this
//                   processor for entry
//   currentTreeUp   when non-zero, old trees attached to entry are freed
//                   after the client has been called
void CkMulticastMgr::reduceFragment (int index, CkSectionInfo& id,
                                     mCastEntry* entry, reductionInfo& redInfo,
                                     int& updateReduceNo, int currentTreeUp){

  CProxy_CkMulticastMgr  mCastGrp(thisgroup);
  reductionMsgs& rmsgs = redInfo.msgs[index];
  // header values are taken from the first collected message
  int dataSize         = rmsgs[0]->dataSize;
  CkReduction::reducerType reducer = rmsgs[0]->reducer;
  int i;
  int oldRedNo = redInfo.redNo;
  int nFrags   = rmsgs[0]->nFrags;
  int fragNo   = rmsgs[0]->fragNo;
                                                                                
  // reduce msgs
  CkReduction::reducerFn f= CkReduction::reducerTable[reducer];
  CkAssert(NULL != f);

  // check valid callback in msg and check if migration happened
  CkCallback msg_cb;
  int rebuilt = 0;
  for (i=0; i<rmsgs.length(); i++) {
    if (rmsgs[i]->rebuilt) rebuilt = 1;
    if (!rmsgs[i]->callback.isInvalid()) msg_cb = rmsgs[i]->callback;
  }

  CkReductionMsg *newmsg = (*f)(rmsgs.length(), rmsgs.getVec()); 
  newmsg->redNo  = redInfo.redNo;
  newmsg->nFrags = nFrags;
  newmsg->fragNo = fragNo;

  // increment num-frags processed
  redInfo.npProcessed ++;

  // free the input messages now that they have been reduced
  for (i=0; i<rmsgs.length(); i++) {
    delete rmsgs[i];
  }
  rmsgs.length() = 0;

  // all fragments done: move on to the next reduction number
  if (redInfo.npProcessed == nFrags) {
    entry->incReduceNo();
    DEBUGF(("Advanced entry:%p redNo: %d\n", entry, entry->red.redNo));
  }
  if (updateReduceNo) mCastGrp[CkMyPe()].updateRedNo(entry, redInfo.redNo);
                                                                                
  if (entry->hasParent()) {
    // send up to parent
    newmsg->sid        = entry->parentGrp;
    newmsg->reducer    = reducer;
    newmsg->sourceFlag = 2;
    // oldRedNo: the reduction number before any increment above
    newmsg->redNo      = oldRedNo;
    newmsg->gcount     = redInfo.gcount [index];
    newmsg->rebuilt    = rebuilt;
    newmsg->callback   = msg_cb;
    DEBUGF(("send to parent %p: %d\n", entry->parentGrp.get_val(), entry->parentGrp.get_pe()));
    mCastGrp[entry->parentGrp.get_pe()].recvRedMsg(newmsg);
  } else { // root
    newmsg->sid = id;
    // buffer message
    rmsgs.push_back (newmsg);

    //if (entry->allElem.length() == redInfo.gcount) {
    if (redInfo.npProcessed == nFrags) {

      newmsg = combineFrags (id, entry, redInfo);

      if (!msg_cb.isInvalid()) {
        msg_cb.send(newmsg);
      }
      else if (redInfo.storedCallback != NULL) {
        redInfo.storedCallback->send(newmsg);
      }
      else if (redInfo.storedClient != NULL) {
        // NOTE(review): dataSize is the size of this fragment's first input
        // message, not of the combined message - confirm this is intended
        // when nFrags > 1.
        redInfo.storedClient(id, redInfo.storedClientParam, dataSize,
           newmsg->data);
        delete newmsg;
      }
      else
        CmiAbort("Did you forget to register a reduction client?");
                                                                                
      DEBUGF(("Reduction client called - currentTreeUp: %d entry:%p oldc: %p\n", currentTreeUp, entry, entry->oldc));
      if (currentTreeUp) {
        if (entry->oldc) {
            // free old tree on same processor;
          mCastGrp[CkMyPe()].freeup(CkSectionInfo(id.get_pe(), entry->oldc, 0, entry->getAid()));
          entry->oldc = NULL;
        }
        if (entry->hasOldtree()) {
            // free old tree on old processor
          int oldpe = entry->oldtree.pe;
          mCastGrp[oldpe].freeup(CkSectionInfo(oldpe, entry->oldtree.entry, 0, entry->getAid()));
          entry->oldtree.clear();
        }
      }
      // a contribution flagged rebuilt requests a tree rebuild on a later send
      if (rebuilt && !entry->needRebuild) entry->needRebuild = 1;
      // NOTE(review): redInfo.npProcessed is not reset in this function;
      // presumably done by the caller - confirm.
    }
  }
}

/**
 * Receive one reduction contribution for a multicast section.
 *
 * The message carries the section cookie (msg->sid) whose value is the local
 * mCastEntry the contribution belongs to.  The handling is, in order:
 *
 *  1. If the entry is obsolete: at the root, switch to the newest entry on the
 *     newc chain; if that entry is usable the message is processed here
 *     (and the reduction number of the new tree is flagged for update),
 *     otherwise the message is forwarded to the entry's root and we return.
 *  2. A message whose redNo is behind the entry's current redNo aborts.
 *  3. A message for a future reduction, or for an entry that is not ready,
 *     is parked in redInfo.futureMsgs and we return.
 *  4. Otherwise the per-fragment counters are updated, the message is buffered
 *     in redInfo.msgs[fragNo], and once this fragment is complete it is
 *     handed to reduceFragment().  When all fragments of the reduction have
 *     been processed the counters are reset and parked future messages are
 *     released.
 *
 * @param msg reduction message; sourceFlag 1 marks a contribution from a
 *            local array element, 2 one sent up by a child node, and 0 one
 *            that was forwarded off an old spanning tree.
 */
void CkMulticastMgr::recvRedMsg(CkReductionMsg *msg)
{
  CkSectionInfo id = msg->sid;
  mCastEntry *entry = (mCastEntry *)id.get_val();
  CmiAssert(entry!=NULL);

  CProxy_CkMulticastMgr  mCastGrp(thisgroup);

  // set when a message from an old tree is folded into the new root entry
  int updateReduceNo = 0;

  // update entry if obsolete
  if (entry->isObsolete()) {
      // send up to root
    DEBUGF(("[%d] entry obsolete-send to root %d\n", CkMyPe(), entry->rootSid.get_pe()));
    if (!entry->hasParent()) { //rootSid.pe == CkMyPe()
      // I am root: follow the chain of newer cookies to the latest one, if any
      mCastEntry *newentry = entry->newc;
      while (newentry && newentry->newc) newentry=newentry->newc;
      if (newentry) entry = newentry;
      CmiAssert(entry!=NULL);
    }
    if (!entry->hasParent() && !entry->isObsolete()) {
      // root found the latest cookie that is not obsolete
      msg->sourceFlag = 0;       // indicate it is not on old spanning tree
      updateReduceNo = 1;        // reduce from old tree, new entry need update.
    }
    else {
      // forwarding to ourselves with the same entry would loop forever
      CmiAssert(entry->rootSid.get_pe() != CkMyPe() || entry->rootSid.get_val() != entry);
      // entry is obsolete, send to root directly
      msg->sid = entry->rootSid;

      msg->sourceFlag = 0;
      mCastGrp[entry->rootSid.get_pe()].recvRedMsg(msg);
      return;
    }
  }

  reductionInfo &redInfo = entry->red;

  DEBUGF(("[%d] msg %p red:%d, entry:%p redno:%d\n", CkMyPe(), msg, msg->redNo, entry, entry->red.redNo));
  // a message for an already finished reduction must never show up
  if (msg->redNo < redInfo.redNo) {
    CmiPrintf("[%d] msg redNo:%d, msg:%p, entry:%p redno:%d\n", CkMyPe(), msg->redNo, msg, entry, redInfo.redNo);
    CmiAbort("Could never happen! \n");
  }
  // not ready yet, or the message belongs to a later reduction: park it
  if (entry->notReady() || msg->redNo > redInfo.redNo) {
    DEBUGF(("[%d] Future redmsgs, buffered! msg:%p entry:%p ready:%d msg red:%d sys redno:%d\n", CkMyPe(), msg, entry, entry->notReady(), msg->redNo, redInfo.redNo));
    redInfo.futureMsgs.push_back(msg);
    return;
  }

  DEBUGF(("[%d] recvRedMsg rebuilt:%d red:%d\n", CkMyPe(), msg->rebuilt, redInfo.redNo));

  const int index = msg->fragNo;
  // index is used to subscript the fixed-size per-fragment arrays below
  CmiAssert(index >= 0 && index < MAXFRAGS);

  // account for where this contribution came from
  if (msg->sourceFlag == 1) {
    // new reduction message from ArrayElement
    redInfo.lcount [index] ++;
  }
  else if (msg->sourceFlag == 2) {
    // partial result sent up by a child node
    redInfo.ccount [index] ++;
  }

  redInfo.gcount [index] += msg->gcount;

  // buffer the msg
  // first check if message is of proper size
  if ((0 != redInfo.msgs[index].length()) && 
      (msg->dataSize != (redInfo.msgs [index][0]->dataSize))) {
    CmiAbort("Reduction data are not of same length!");
  }

  redInfo.msgs [index].push_back(msg);

  DEBUGF(("[%d] index:%d lcount:%d-%d, ccount:%d-%d, gcount:%d-%d root:%d\n", CkMyPe(),index, entry->red.lcount[index],entry->localElem.length(), entry->red.ccount[index], entry->children.length(), entry->red.gcount[index], entry->allElem.length(), !entry->hasParent()));

  // this fragment is complete on the current tree when every local element
  // and every child has contributed
  int currentTreeUp = 0;
  if (redInfo.lcount [index] == entry->localElem.length() &&
      redInfo.ccount [index] == entry->children.length())
      currentTreeUp = 1;

  // at the root, the reduction is also complete when every fragment has
  // collected contributions from all section elements, whichever tree they
  // arrived through
  int mixTreeUp = 0;
  if (!entry->hasParent()) {
    const int numElems = entry->allElem.length();
    // nFrags bounds the subscript into the fixed-size gcount array
    CmiAssert(msg->nFrags <= MAXFRAGS);
    mixTreeUp = 1;
    for (int frag=0; frag<msg->nFrags; frag++) {
      if (numElems != redInfo.gcount [frag]) {
        mixTreeUp = 0;
        break;
      }
    }
  }

  if (currentTreeUp || mixTreeUp)
  {
    // read nFrags before reducing; presumably reduceFragment may consume the
    // buffered messages (including msg) -- TODO confirm
    const int nFrags = msg->nFrags;  
    
    // msg from children contain only one fragment
    reduceFragment (index, id, entry, redInfo, updateReduceNo, 
                    currentTreeUp);

    if (redInfo.npProcessed == nFrags) {
      // all fragments done: reset counters for the next reduction
      for (int frag=0; frag<nFrags; frag++) {
        redInfo.lcount [frag] = 0;
        redInfo.ccount [frag] = 0;
        redInfo.gcount [frag] = 0;
      }
      redInfo.npProcessed = 0;

      // release future msgs
      releaseFutureReduceMsgs(entry);
    }
  }
}

/// Hand every message parked in entry->red.futureMsgs back to recvRedMsg on
/// this processor (through the group proxy), then empty the parking list.
void CkMulticastMgr::releaseFutureReduceMsgs(mCastEntryPtr entry)
{
  CProxy_CkMulticastMgr  selfProxy(thisgroup);
  const int myPe = CkMyPe();

  int slot = 0;
  while (slot < entry->red.futureMsgs.length()) {
    DEBUGF(("releaseFutureReduceMsgs: %p\n", entry->red.futureMsgs[slot]));
    selfProxy[myPe].recvRedMsg(entry->red.futureMsgs[slot]);
    ++slot;
  }

  // the messages now belong to the runtime; forget our pointers to them
  entry->red.futureMsgs.length() = 0;
}

/// Flush everything this entry is holding on to: the per-fragment messages of
/// the reduction in progress and the parked future messages.  Each message is
/// re-addressed to the section root (sid = rootSid), marked with
/// sourceFlag 0 and sent to recvRedMsg on the root's processor.
/// (these messages have to be sent to root)
void CkMulticastMgr::releaseBufferedReduceMsgs(mCastEntryPtr entry)
{
  CProxy_CkMulticastMgr  mgrProxy(thisgroup);

  // messages of the reduction in progress, one fragment slot at a time
  for (int frag = 0; frag < MAXFRAGS; ++frag) {
    int k = 0;
    while (k < entry->red.msgs[frag].length()) {
      CkReductionMsg *buffered = entry->red.msgs[frag][k];
      DEBUGF(("releaseBufferedReduceMsgs:%p red:%d in entry:%p\n", buffered, buffered->redNo, entry));
      buffered->sourceFlag = 0;
      buffered->sid = entry->rootSid;
      mgrProxy[entry->rootSid.get_pe()].recvRedMsg(buffered);
      ++k;
    }
    entry->red.msgs[frag].length() = 0;
  }

  // messages that were parked for later reductions
  int k = 0;
  while (k < entry->red.futureMsgs.length()) {
    CkReductionMsg *parked = entry->red.futureMsgs[k];
    DEBUGF(("releaseBufferedFutureReduceMsgs: %p red:%d in entry: %p\n", parked,parked->redNo, entry));
    parked->sourceFlag = 0;
    parked->sid = entry->rootSid;
    mgrProxy[entry->rootSid.get_pe()].recvRedMsg(parked);
    ++k;
  }
  entry->red.futureMsgs.length() = 0;
}

/// Bring the reduction sequence number of this entry up to at least `red`,
/// pass the same request on to every child of the entry through the group
/// proxy, and finally re-deliver any reduction messages parked on this entry.
void CkMulticastMgr::updateRedNo(mCastEntryPtr entry, int red)
{
  DEBUGF(("[%d] updateRedNo entry:%p to %d\n", CkMyPe(), entry, red));

  // the sequence number only ever moves forward
  if (red > entry->red.redNo) {
    entry->red.redNo = red;
  }

  CProxy_CkMulticastMgr mgrProxy(thisgroup);
  int child = 0;
  while (child < entry->children.length()) {
    mCastEntry *childEntry = (mCastEntry *)entry->children[child].get_val();
    mgrProxy[entry->children[child].get_pe()].updateRedNo(childEntry, red);
    ++child;
  }

  releaseFutureReduceMsgs(entry);
}

// NOTE: everything from this #if 0 to the matching #endif is compiled out.
// It holds a set of builtin reducer functions and the table that indexes
// them; none of it is part of the built library.
#if 0
////////////////////////////////////////////////////////////////////////////////
/////
///////////////// Builtin Reducer Functions //////////////
// Placeholder reducer (slot 0 of reducerTable below): aborts when invoked.
static CkReductionMsg *invalid_reducer(int nMsg,CkReductionMsg **msg)
{CkAbort("ERROR! Called the invalid reducer!\n");return NULL;}

/* A simple reducer, like sum_int, looks like this:
static CkReductionMsg *sum_int(int nMsg,CkReductionMsg **msg)
{
  int i,ret=0;
  for (i=0;i<nMsg;i++)
    ret+=*(int *)(msg[i]->data);
  return CkReductionMsg::buildNew(sizeof(int),(void *)&ret);
}
*/

// SIMPLE_REDUCTION(name, dataType, typeStr, loop) defines a static reducer
// `name` over arrays of `dataType`:
//   - the element count is taken from the size of msg[0];
//   - `ret` aliases the data of msg[0] and `value` the data of msg[m], and
//     the `loop` statement is run for every element i of every msg[1..nMsg-1],
//     so the result is accumulated in place in msg[0]'s buffer;
//   - the result message is built from that buffer with
//     CkReductionMsg::buildNew.
// `typeStr` is not referenced by the macro body.
// NOTE(review): msg[0] is read unconditionally and the other messages are
// assumed to hold at least as many elements as msg[0] -- confirm with callers.
#define SIMPLE_REDUCTION(name,dataType,typeStr,loop) \
static CkReductionMsg *name(int nMsg, CkReductionMsg **msg)\
{\
  int m,i;\
  int nElem=msg[0]->getSize()/sizeof(dataType);\
  dataType *ret=(dataType *)(msg[0]->getData());\
  for (m=1;m<nMsg;m++)\
  {\
    dataType *value=(dataType *)(msg[m]->getData());\
    for (i=0;i<nElem;i++)\
    {\
      loop\
    }\
  }\
  return CkReductionMsg::buildNew(nElem*sizeof(dataType),(void *)ret);\
}

// Use this macro for reductions that have the same type for all inputs.
// It instantiates SIMPLE_REDUCTION three times, producing nameBase_int,
// nameBase_float and nameBase_double.
#define SIMPLE_POLYMORPH_REDUCTION(nameBase,loop) \
  SIMPLE_REDUCTION(nameBase##_int,int,"%d",loop) \
  SIMPLE_REDUCTION(nameBase##_float,float,"%f",loop) \
  SIMPLE_REDUCTION(nameBase##_double,double,"%f",loop)


// Compute the element-wise sum of the numbers passed by each element.
SIMPLE_POLYMORPH_REDUCTION(sum,ret[i]+=value[i];)

// Compute the element-wise product of the numbers passed by each element.
SIMPLE_POLYMORPH_REDUCTION(product,ret[i]*=value[i];)

// Keep the element-wise maximum of the numbers passed by each element.
SIMPLE_POLYMORPH_REDUCTION(max,if (ret[i]<value[i]) ret[i]=value[i];)

// Keep the element-wise minimum of the numbers passed by each element.
SIMPLE_POLYMORPH_REDUCTION(min,if (ret[i]>value[i]) ret[i]=value[i];)

// Table of the builtin reducers: slot 0 is the invalid reducer, followed by
// the int/float/double variants of sum, product, max and min, in that order.
CkReduction::reducerFn CkMulticastMgr::reducerTable[CkMulticastMgr::MAXREDUCERS]={
    ::invalid_reducer,
  // Compute the sum of the numbers passed by each element.
    ::sum_int,::sum_float,::sum_double,
    ::product_int,::product_float,::product_double,
    ::max_int,::max_float,::max_double,
    ::min_int,::min_float,::min_double
};
#endif

#include "CkMulticast.def.h"
