OpenAtom  Version1.5a
ckPairCalculator.C
1 #include "ckPairCalculator.h"
2 #include "utility/matrix2file.h"
3 #include <sstream>
4 
5 ComlibInstanceHandle mcastInstanceCP;
6 ComlibInstanceHandle mcastInstanceACP;
7 
8 CkReduction::reducerType sumMatrixDoubleType;
9 
10 
11 void registersumMatrixDouble(void)
12 {
13  sumMatrixDoubleType=CkReduction::addReducer(sumMatrixDouble);
14 }
15 /*CkReduction::reducerType sumFastDoubleType;
16 CkReductionMsg *sumFastDouble(int nMsg, CkReductionMsg **msgs);*/
17 void fastAdd (double *a, double *b, int nelem);
18 
19 
20 /** \brief sum together matrices of doubles */
21 /// possibly faster than CkReduction::sum_double due to minimizing copies
22 /// and calling CmiNetworkProgress
23 inline CkReductionMsg *sumMatrixDouble(int nMsg, CkReductionMsg **msgs)
24 {
25  double *ret=(double *)msgs[0]->getData();
26 
27  // CkAssert ((unsigned int) ret % 8 == 0);
28  int size0=msgs[0]->getSize();
29  int size=size0/sizeof(double);
30 
31  double *inmatrix;
32  // int progcount=0;
33  if(nMsg>3) // switch loops and unroll
34  {
35  int i=1;
36  // idea here is to have only 1 store for 4 loads
37  for(int d=0;d<size;d++)
38  {
39  for(i=1; i<nMsg-3;i+=3)
40  ret[d]+= ((double *) msgs[i]->getData())[d] + ((double *) msgs[i+1]->getData())[d] + ((double *) msgs[i+2]->getData())[d];
41  for(; i<nMsg;i++)
42  {
43  ret[d]+=((double *) msgs[i]->getData())[d];
44  }
45  }
46  }
47  else
48  for(int i=1; i<nMsg;i++)
49  {
50 
51  inmatrix=(double *) msgs[i]->getData();
52  for(int d=0;d<size;d++)
53  ret[d]+=inmatrix[d];
54  }
55  // CmiNetworkProgress();
56  return CkReductionMsg::buildNew(size*sizeof(double),ret);
57 }
58 
59 /** \brief A functor to simply delegate a gemm to either zgemm or dgemm based on how its instantiated */
60 void myGEMM(char *opA, char *opB, int *m, int *n, int *k, double *alpha, complex *A, int *lda, complex *B, int *ldb, double *beta, complex *C, int *ldc)
61 {
62  complex cAlpha(*alpha,0.), cBeta(*beta,0.);
63  ZGEMM(opA, opB, m, n, k, &cAlpha, A, lda, B, ldb, &cBeta, C, ldc);
64 }
65 
66 void myGEMM(char *opA, char *opB, int *m, int *n, int *k, double *alpha, double *A, int *lda, double *B, int *ldb, double *beta, double *C, int *ldc)
67 {
68  DGEMM(opA, opB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
69 }
70 
71 
72 /*
73  * @addtogroup PairCalculator
74  * @{
75  */
76 
77 PairCalculator::PairCalculator(CkMigrateMessage *m) { }
78 
79 /** \brief constructor */
81 {
82 #ifdef _PAIRCALC_DEBUG_PLACE_
83  CkPrintf("{%d} [PAIRCALC] [%d,%d,%d,%d,%d] inited on pe %d \n", _instance,thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,_sym, CkMyPe());
84 #endif
85 
86 
87  int remainder = cfg.numStates%cfg.grainSize;
88  grainSizeX=(cfg.numStates- thisIndex.x == cfg.grainSize+remainder) ? cfg.grainSize+remainder: cfg.grainSize;
89  grainSizeY=(cfg.numStates- thisIndex.y == cfg.grainSize+remainder) ? cfg.grainSize+remainder: cfg.grainSize;
90 
91  this->numPoints = -1;
92  this->cb_aid = cfg.gSpaceAID;
93  this->cb_ep = cfg.gSpaceEP;
94  this->cb_ep_tol = cfg.PsiVEP;
98  gemmSplitFWk = cfg.gemmSplitFWm;
99  gemmSplitBW = cfg.gemmSplitBW;
100  existsLeft=false;
101  existsRight=false;
102  existsOut=false;
103  existsNew=false;
104  numRecd = 0;
105  numRecdBW = 0;
106  numRecdBWOT = 0;
107  numRecRight = 0;
108  numRecLeft = 0;
109  streamCaughtR=0;
110  streamCaughtL=0;
112  amPhantom=(cfg.arePhantomsOn && (thisIndex.y<thisIndex.x) && cfg.isSymmetric ) ? true : false;
113  /* if(amPhantom)
114  { // ye old switcheroo for the phantoms
115  numExpectedX=grainSizeY;
116  grainSizeY=grainSizeX;
117  grainSizeX=numExpectedX;
118  }
119  */
123 
124  notOnDiagonal= (thisIndex.x!=thisIndex.y) ? true: false;
125  symmetricOnDiagonal=(cfg.isSymmetric && thisIndex.x==thisIndex.y) ? true: false;
126  // If I lie on the chare array diagonal, I expect to get only left matrix data
129  // If I am a phantom chare, I expect to get only right matrix data
130  if(amPhantom)
131  {
132  ///@todo: This is a hack to ensure that phantoms which get matrix blocks
133  // which should actually be *their* left matrix blocks but arrive as right
134  // data dont choke when running remaindery grainSizes. Read commit log for info
137  numExpectedY= numExpected;
138  }
139  resumed=true;
140 
141  touchedTiles=NULL;
142  msgLeft = msgRight = 0;
143  inDataLeft = NULL;
144  inDataRight = NULL;
145  allCaughtLeft=NULL;
146  allCaughtRight=NULL;
147  outData = NULL;
148  mynewData= NULL;
149  othernewData= NULL;
150  inResult1=NULL;
151  inResult2=NULL;
152  usesAtSync=true;
153  if(cfg.isLBon)
154  setMigratable(true);
155  else
156  setMigratable(false);
157  resultCookies=NULL;
158  otherResultCookies=NULL;
159  // TODO: technically we can make fewer of these if cfg.grainSize>grainSizeX || grainSizeY but you'll only save a few bytes
160  resultCookies=new CkSectionInfo[numExpectedX];
164  orthoCookies=new CkSectionInfo[numOrtho*2];
165  orthoCB=new CkCallback[numOrtho*2];
167  {
168  columnCount= new int[numOrthoCol];
169  columnCountOther= new int[numOrthoCol];
170  bzero(columnCount, sizeof(int) * numOrthoCol);
171  bzero(columnCountOther, sizeof(int) * numOrthoCol);
172  }
173  else
174  {
175  columnCountOther=NULL;
176  columnCount=NULL;
177  }
178  if(notOnDiagonal)
179  // we don't actually use these in the asymmetric minimization case
180  // but we make them anyway
181  otherResultCookies=new CkSectionInfo[numExpectedY];
182 
183  /** -------- Setup the forward path input message handling -------- **/
184  /// Set isDataReady flags to false only for those inputs this chare will be getting
185  if (amPhantom)
186  {
187  /// Phantoms will get only a right matrix input from their mirror chares
188  isLeftReady = true;
189  isRightReady = false;
190  }
191  else
192  {
193  /// Non-phantoms will always get at least a left matrix input
194  isLeftReady = false;
195  /// Chares on the chare array diagonals will get ONLY a left matrix (which will also be the right).
197  isRightReady = true;
198  else
199  isRightReady = false;
200  }
201  /// Setup the callbacks
202  CkArrayIndex4D myIndex(thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z);
203  CkCallback leftTrigger (CkIndex_PairCalculator::acceptLeftData (NULL),myIndex,thisProxy,true);
204  CkCallback rightTrigger(CkIndex_PairCalculator::acceptRightData(NULL),myIndex,thisProxy,true);
205  /// Create a string that holds the chare ID and pass it to the message handlers
206  std::ostringstream idStream;
207  idStream<<"["<<thisIndex.w<<","<<thisIndex.x<<","<<thisIndex.y<<","<<thisIndex.z<<","<<cfg.isSymmetric<<"]";
208  /// Create the message handlers for the left and right input matrix blocks
209  leftCollator = new CollatorType (idStream.str()+" LeftHandler" , leftTrigger, numExpectedX,(cfg.conserveMemory<=0),thisIndex.x);
210  rightCollator= new CollatorType (idStream.str()+" RightHandler",rightTrigger, numExpectedY,(cfg.conserveMemory<=0),thisIndex.y);
211  #ifdef DEBUG_CP_PAIRCALC_INPUTDATAHANDLER
212  CkPrintf("[%d,%d,%d,%d,%d] My left and right data collators: %p %p\n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,cfg.isSymmetric,leftCollator,rightCollator);
213  #endif
214  /// This is the first point during execution when I can supply my InputDataHandler with pointers to the msg handlers, hence
215  /// it is (now) safe to insert the [w,x,y,z]th element of the InputDataHandler chare array (as it will immediately clamor
216  /// for access to these handlers)
217  myMsgHandler = inProxy;
218  #ifdef DEBUG_CP_PAIRCALC_CREATION
219  CkPrintf("[%d,%d,%d,%d,%d] Inserting my InputDataHandler\n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,cfg.isSymmetric);
220  #endif
221  myMsgHandler(thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z).insert(thisProxy);
222  myMsgHandler.doneInserting();
223 }
224 
225 
226 /** \brief pack and unpack */
227 void
229 {
230  ArrayElement4D::pup(p);
231  p|numRecd;
232  p|numRecdBW;
233  p|numRecdBWOT;
234  p|numExpected;
235  p|numExpectedX;
236  p|numExpectedY;
237  p|grainSizeX;
238  p|grainSizeY;
239  p|numPoints;
241  p|notOnDiagonal;
242  p|cb_aid;
243  p|cb_ep;
244  p|cb_ep_tol;
245  p|existsLeft;
246  p|existsRight;
247  p|existsOut;
248  p|existsNew;
249  p|mCastGrpId;
250  p|mCastGrpIdOrtho;
251  p|resumed;
252  p|rck;
253  p|actionType;
256  p|expectOrthoT;
257  p|amPhantom;
258  p|numOrthoCol;
259  p|numOrthoRow;
260  p|numOrtho;
261  if (p.isUnpacking())
262  {
263  mynewData=NULL;
264  othernewData=NULL;
265  resultCookies=new CkSectionInfo[cfg.grainSize];
266  if(notOnDiagonal)
267  otherResultCookies= new CkSectionInfo[cfg.grainSize];
268  else
269  otherResultCookies=NULL;
270  if(existsOut)
271  outData= new internalType[grainSizeX*grainSizeY];
272  else
273  outData=NULL;
274  /// @todo: Fix this to allocate or grab a msgLeft and msgRight. inDataLeft/Right is no longer allocated directly
275  if(existsLeft)
276  inDataLeft = new internalType[numExpectedX*numPoints*pcDataSizeFactor];
277  else
278  inDataLeft=NULL;
279  if(existsRight)
280  inDataRight = new internalType[numExpectedY*numPoints*pcDataSizeFactor];
281  else
282  inDataRight=NULL;
283  orthoCookies= new CkSectionInfo[numOrtho*2];
284  orthoCB= new CkCallback[numOrtho*2];
286  {
287  columnCount= new int[numOrthoCol];
288  columnCountOther= new int[numOrthoCol];
289 
290  }
291 
292  }
293  int i;
294  for (i=0; i<cfg.grainSize; i++) p|resultCookies[i];
295  if(notOnDiagonal)
296  for (i=0; i<cfg.grainSize; i++) p|otherResultCookies[i];
297  for (int i=0; i<cfg.grainSize; i++)
298  CmiAssert(resultCookies[i].get_redNo() > 0);
299  //if(existsOut)
300  //p(outData, cfg.grainSize * cfg.grainSize);
301  /** @todo: Fix this to pack msgLeft and msgRight directly. inDataLeft/Right is no longer allocated directly
302  * msgLeft and msgRight will already be packed and available as msgs. They just wont be packed together with the rest of paircalc.
303  * Is there any way we could simply hand the msgs back to charm and ask it to deliver them to me after I am done migrating.?
304  * Or am I talking crap?
305  if(existsLeft)
306  p(inDataLeft, numExpectedX * numPoints * 2);
307  if(existsRight)
308  p(inDataRight, numExpectedY* numPoints * 2);
309  */
310  PUParray(p,orthoCookies,numOrtho*2);
311  PUParray(p,orthoCB,numOrtho*2);
313  {
314  PUParray(p,columnCount,numOrthoCol);
315  PUParray(p,columnCountOther,numOrthoCol);
316  }
317 #ifdef _PAIRCALC_DEBUG_
318  if (p.isUnpacking())
319  {
320  CkPrintf("[%d,%d,%d,%d,%d] pup unpacking on %d resumed=%d memory %d\n",thisIndex.w,thisIndex.x, thisIndex.y, thisIndex.z,cfg.isSymmetric,CkMyPe(),resumed, CmiMemoryUsage());
321  CkPrintf("[%d,%d,%d,%d,%d] pupped : %d,%d,%d,%d,%d %d %d %d %d %d %d cb cb_aid %d %d %d cb_lb inDataLeft inDataRight outData %d \n",thisIndex.w,thisIndex.x, thisIndex.y, thisIndex.z, cfg.isSymmetric, numRecd, numExpected, cfg.grainSize, cfg.numStates, cfg.numChunks, numPoints, cfg.isSymmetric, cfg.conserveMemory, cfg.isLBon, cfg.reduce, cb_ep, existsLeft, existsRight, resumed);
322 
323  }
324  else
325  CkPrintf("[%d,%d,%d,%d,%d] pup called on %d\n",thisIndex.w,thisIndex.x, thisIndex.y, thisIndex.z,cfg.isSymmetric,CkMyPe());
326 #endif
327 
328 
329 }
330 
331 /** \brief destructor */
333 {
334 
335 #ifdef _PAIRCALC_DEBUG_
336  CkPrintf("[%d,%d,%d,%d,%d] destructs on [%d]\n",thisIndex.w,thisIndex.x,thisIndex.y, thisIndex.z, cfg.isSymmetric, CkMyPe());
337 #endif
338  if(outData!=NULL)
339  delete [] outData;
340  if(msgLeft)
341  delete msgLeft;
342  if(msgRight)
343  delete msgRight;
344  if(mynewData!=NULL)
345  delete [] mynewData;
346  if(othernewData!=NULL)
347  delete [] othernewData;
348  // redundant paranoia
349  outData=NULL;
350  inDataRight=NULL;
351  inDataLeft=NULL;
352  existsLeft=false;
353  existsRight=false;
354  existsNew=false;
355  existsOut=false;
356  CkAbort("paircalc pup needs fw streaming work");
357  if(orthoCookies!=NULL)
358  delete [] orthoCookies;
359  if(orthoCB!=NULL)
360  delete [] orthoCB;
361  if(resultCookies!=NULL)
362  delete [] resultCookies;
363  if(notOnDiagonal && otherResultCookies !=NULL)
364  delete [] otherResultCookies;
365 }
366 
367 
368 
369 /** \brief initialize the ortho reduction trees and cookies */
371 {
372 //////////////////////////////////////////////////////////////////////////////
373 /// Do not delete msg. Its a nokeep.
374 //////////////////////////////////////////////////////////////////////////////
375 
376  int maxorthostateindex=(cfg.numStates/cfg.orthoGrainSize-1)*cfg.orthoGrainSize;
377  // int orthoIndexX=(msg->orthoX*cfg.orthoGrainSize-thisIndex.x)/cfg.orthoGrainSize;
378  // int orthoIndexY=(msg->orthoY*cfg.orthoGrainSize-thisIndex.y)/cfg.orthoGrainSize;
379  int orthoIndexX=msg->orthoX*cfg.orthoGrainSize;
380  orthoIndexX= (orthoIndexX>maxorthostateindex) ? maxorthostateindex : orthoIndexX;
381  int orthoIndexY=msg->orthoY*cfg.orthoGrainSize;
382  orthoIndexY= (orthoIndexY>maxorthostateindex) ? maxorthostateindex : orthoIndexY;
383  orthoIndexX=(orthoIndexX-thisIndex.x)/cfg.orthoGrainSize;
384  orthoIndexY=(orthoIndexY-thisIndex.y)/cfg.orthoGrainSize;
385 
386  int orthoIndex=orthoIndexX*numOrthoCol+orthoIndexY;
387 
388 #ifdef _PAIRCALC_DEBUG_
389  CkPrintf("[%d,%d,%d,%d,%d] initGRed ox %d oy %d oindex %d oxindex %d oyindex %d numRecd %d numOrtho %d\n",thisIndex.w,thisIndex.x,thisIndex.y, thisIndex.z, cfg.isSymmetric,msg->orthoX, msg->orthoY,orthoIndex, orthoIndexX, orthoIndexY, numRecd, numOrtho);
390 #endif
391  // numOrtho here is numOrtho per sGrain
392  CkAssert(orthoIndex<numOrtho*2);
393  CkGetSectionInfo(orthoCookies[orthoIndex],msg);
394  orthoCB[orthoIndex]=msg->cb;
395  mCastGrpIdOrtho=msg->mCastGrpId;
396  /* cfg.reduce=section;
397  if(msg->lbsync)
398  {
399  int foo=1;
400  contribute(sizeof(int), &foo , CkReduction::sum_int, msg->synccb);
401  }
402 
403  */
404 
405  /// @note: numRecd here is just used as some counter during the init phase. Not related to its usual purpose
406  ++numRecd;
407  // CkPrintf("[%d,%d,%d,%d,%d] initGRed ox %d oy %d oindex %d oxindex %d oyindex %d numRecd %d numOrtho %d\n",thisIndex.w,thisIndex.x,thisIndex.y, thisIndex.z, cfg.isSymmetric,msg->orthoX, msg->orthoY,orthoIndex, orthoIndexX, orthoIndexY, numRecd, numOrtho);
408  if(numRecd==numOrtho)
409  {
410  contribute(sizeof(int), &numRecd , CkReduction::sum_int, cfg.uponSetupCompletion, cfg.instanceIndex);
411  numRecd=0;
412  }
414  {
415 
416  // CkPrintf("[%d,%d,%d,%d,%d] phantom trigger\n");
417  thisProxy(thisIndex.w,thisIndex.y, thisIndex.x,thisIndex.z).phantomDone();
418  }
419 
420  // do not delete nokeep msg
421 }
422 
423 /** \brief setup of phantom is done */
425 {
426  // CkPrintf("[%d,%d,%d,%d,%d] phantom contrib\n");
427  contribute(sizeof(int), &numOrtho , CkReduction::sum_int, cfg.uponSetupCompletion, cfg.instanceIndex);
428 }
429 
430 /** \brief initialize the multicast tree and cookies for backward path */
432 {
433 //////////////////////////////////////////////////////////////////////////////
434 /// Do not delete msg. Its a nokeep.
435 //////////////////////////////////////////////////////////////////////////////
436 
437  CkAssert(msg->offset<cfg.grainSize);
438  if(msg->dest == thisIndex.x && thisIndex.x != thisIndex.y)
439  {
440  CkGetSectionInfo(otherResultCookies[msg->offset],msg);
441 
442 #ifdef _PAIRCALC_DEBUG_SPROXY_
443  CkPrintf("[%d,%d,%d,%d,%d] other initResultSection for dest %d offset %d\n",thisIndex.w,thisIndex.x,thisIndex.y, thisIndex.z, cfg.isSymmetric,msg->dest, msg->offset);
444 #endif
445  }
446  else
447  {
448  CkGetSectionInfo(resultCookies[msg->offset],msg);
449 
450 #ifdef _PAIRCALC_DEBUG_SPROXY_
451  CkPrintf("[%d,%d,%d,%d,%d] initResultSection for dest %d offset %d\n",thisIndex.w,thisIndex.x,thisIndex.y, thisIndex.z, cfg.isSymmetric,msg->dest, msg->offset);
452 #endif
453  }
454  rck++;
455 
456  mCastGrpId=msg->mCastGrpId;
457  //to force synchronize in lb resumption
458  if(msg->lbsync && rck==cfg.grainSize)
459  {
460  contribute(sizeof(int), &rck, CkReduction::sum_int, msg->synccb);
461  }
462  // do not delete nokeep msg
463 }
464 
465 void PairCalculator::ResumeFromSync() {
466  resumed=true;
467  // we own no proxies so we have nothing to reset
468  if(!resumed){
469 
470 #ifdef _PAIRCALC_DEBUG_
471  CkPrintf("[%d,%d,%d,%d,%d] resumes from sync\n",thisIndex.w,thisIndex.x,thisIndex.y, thisIndex.z, cfg.isSymmetric);
472 #endif
473  }
474 }
475 
476 
477 /** \brief accept data for left side of pair */
479 {
480  inputType *data = msg->data();
481  const int numRows = msg->numRows();
482  const int numCols = msg->numCols();
483  /// Assert that data is a valid pointer
484  CkAssert(data != NULL);
485  /// Assert that numRows is as expected
486  CkAssert(numRows == numExpectedX);
487  /// Check data validity
488  #ifdef _NAN_CHECK_
489  for(int i=0; i < numRows*numCols; i++)
490  CkAssert( isfinite(data[i]) );
491  #endif
492  /// Once the basic checks have passed, and if we're debugging print status info
493  #ifdef _PAIRCALC_DEBUG_
494  CkPrintf("[%d,%d,%d,%d,%d] Received left matrix block of size %d x %d at %p\n",
495  thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,cfg.isSymmetric,numRows,numCols,data);
496  #endif
497 
498  /// Set member data pertinent to the left block
499  msgLeft = msg;
500  inDataLeft = reinterpret_cast<internalType*> (data);
501  existsLeft = true;
502  numRecd += numRows;
503  numPoints = numCols;
504  isLeftReady = true;
505 
506  /// If all data is ready
507  if (isLeftReady && isRightReady)
508  {
509  // Obtain a sample incoming msg from the collator and extract relevant data from it
511  if (sampleMsg)
512  {
513  msgLeft->doPsiV = sampleMsg->doPsiV;
514  msgLeft->blkSize= sampleMsg->blkSize;
515  msgLeft->flag_dp= sampleMsg->flag_dp;
516  }
517  else
518  {
519  // If RDMA is enabled, there will be no sample msgs available for any non-PsiV loop. Hence doPsiV is false
520  #ifdef PC_USE_RDMA
521  msgLeft->doPsiV = false;
522  // If RDMA is NOT enabled, we should have obtained a sample message. Something must be wrong
523  #else
524  std::stringstream dbgStr;
525  dbgStr<<"["<<thisIndex.w<<","<<thisIndex.x<<","<<thisIndex.y<<","<<thisIndex.z<<","<<cfg.isSymmetric<<"]"
526  <<" My collator was not able to give me a sample message. Aborting...";
527  CkAbort(dbgStr.str().c_str());
528  #endif
529  }
530  delete sampleMsg;
531  /// Trigger the computation
532  launchComputations(msgLeft);
533  }
534 }
535 
536 
537 /** \brief accept data for right side of pair */
539 {
540  inputType *data = msg->data();
541  const int numRows = msg->numRows();
542  const int numCols = msg->numCols();
543  /// Assert that data is a valid pointer
544  CkAssert(data != NULL);
545  /// Assert that numRows is as expected
546  CkAssert(numRows == numExpectedY);
547  /// Check data validity
548  #ifdef _NAN_CHECK_
549  for(int i=0; i < numRows*numCols; i++)
550  CkAssert( isfinite(data[i]) );
551  #endif
552  /// Once the basic checks have passed, and if we're debugging print status info
553  #ifdef _PAIRCALC_DEBUG_
554  CkPrintf("[%d,%d,%d,%d,%d] Received right matrix block of size %d x %d at %p\n",
555  thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,cfg.isSymmetric,numRows,numCols,data);
556  #endif
557 
558  /// Set member data pertinent to the right block
559  msgRight = msg;
560  inDataRight = reinterpret_cast<internalType*> (data);
561  existsRight = true;
562  numRecd += numRows;
563  numPoints = numCols;
564  isRightReady = true;
565 
566  /// If all data is ready
567  if (isLeftReady && isRightReady)
568  {
569  /// Phantom chares already have correct info in the incoming msg. Only non-phantoms have to be handled
570  if (!amPhantom)
571  {
572  // Obtain a sample incoming msg from the collator and extract relevant data from it
573  paircalcInputMsg *sampleMsg = rightCollator->getSampleMsg();
574  if (sampleMsg)
575  {
576  msgRight->doPsiV = sampleMsg->doPsiV;
577  msgRight->blkSize= sampleMsg->blkSize;
578  msgRight->flag_dp= sampleMsg->flag_dp;
579  }
580  else
581  {
582  // If RDMA is enabled, there will be no sample msgs available for any non-PsiV loop. Hence doPsiV is false
583  #ifdef PC_USE_RDMA
584  msgRight->doPsiV = false;
585  // If RDMA is NOT enabled, we should have obtained a sample message. Something must be wrong
586  #else
587  std::stringstream dbgStr;
588  dbgStr<<"["<<thisIndex.w<<","<<thisIndex.x<<","<<thisIndex.y<<","<<thisIndex.z<<","<<cfg.isSymmetric<<"]"
589  <<" My collator was not able to give me a sample message. Aborting...";
590  CkAbort(dbgStr.str().c_str());
591  #endif
592  }
593  delete sampleMsg;
594  }
595  /// Trigger the computation
597  }
598 }
599 
600 
601 /** once the data has arrived, launch the forward path */
603 {
604  #ifdef _PAIRCALC_DEBUG_
605  CkPrintf("[%d,%d,%d,%d,%d] Going to launch computations... numRecd = %d numExpected = %d numExpectedX = %d, numExpectedY = %d\n",
606  thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,cfg.isSymmetric,
608  #endif
609  /// Ensure that we're really ready to launch computations
610  CkAssert(numRecd == numExpected);
611  blkSize = aMsg->blkSize;
612  bool isForwardPathPending = false; ///< Flag to make sure we cleanup only if FW path starts
613 
614  // If this is not a PsiV loop, trigger the forward path for just the non-phantom chares
615  if(!aMsg->doPsiV)
616  {
617  // This iteration is a normal loop. Hence normal behavior
618  actionType = NORMALPC;
619 
620  // Start the forward path substep timer
621  #ifdef _CP_SUBSTEP_TIMING_
622  if(cfg.forwardTimerID > 0)
623  {
624  double pstart=CmiWallTimer();
625  contribute(sizeof(double),&pstart,CkReduction::min_double, cfg.beginTimerCB , cfg.forwardTimerID);
626  }
627  #endif
628 
629  if (!amPhantom)
630  {
631  /** expectOrthoT is false in any scenario other than asymmetric, dynamics.
632  * numRecdBWOT is equal to numOrtho only when it is asymm, dynamics and T has been
633  * received completely (from Ortho). So this condition, invokes multiplyForward() on
634  * all cases except (asymm, dynamics when T has not been received)
635  *
636  * In that exception scenario, we dont do anything now. Later, when all of T is received,
637  * both multiplyForward() and bwMultiplyDynOrthoT() are called. Look for these calls in acceptOrthoT().
638  */
640  thisProxy(thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z).multiplyForward(aMsg->flag_dp);
641  else
642  {
643  isForwardPathPending = true;
644  CkPrintf("[%d,%d,%d,%d,%d] Gamma beat OrthoT. Waiting for T to arrive before proceeding with forward path\n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,cfg.isSymmetric);
645  }
646  }
647  else
648  {
649  /// Do nothing for the phantom chare, non-psiv loops. Computation will be triggered only in the backward path
650  // Just stop the forward path substep timer for the phantom chares
651  #ifdef _CP_SUBSTEP_TIMING_
652  if(cfg.forwardTimerID > 0)
653  {
654  double pstart=CmiWallTimer();
655  contribute(sizeof(double),&pstart,CkReduction::max_double, cfg.endTimerCB , cfg.forwardTimerID);
656  }
657  #endif
658  }
659  }
660  // else, if this is a PsiV loop (there is no forward path, only backward path computations to update PsiV)
661  else
662  {
663  // This is a PsiV loop. Hence behave accordingly
664  actionType = PSIV;
665  thisProxy(thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z).multiplyPsiV();
666  #ifdef PC_USE_RDMA
667  // Let the collators know that they should now expect the next batch of (non-PsiV) data via RDMA
669  rightCollator->expectNext();
670  #endif
671  }
672 
673  /// Reset the counters and flags for the next iteration
674 
675  /** If asymm,dyn and T has not yet arrived completely, fw path has not yet been triggered.
676  * In this case numRecd will be reset in acceptOrthoT() after the fw path has been triggered.
677  * numRecd is reset here for all other cases.
678  */
679  if (!isForwardPathPending)
680  numRecd = 0;
681  /// All non-phantoms should expect left matrix data again
682  if (!amPhantom)
683  isLeftReady = false;
684  /// All non(symm, on-diagonal) chares should expect right matrix data again
685  if (!symmetricOnDiagonal)
686  isRightReady = false;
687 }
688 
689 
690 
691 
692 void PairCalculator::reorder(int * offsetMap, int *revOffsetMap, double *data, double *scratch)
693 {
694 
695  int actualPoints= numPoints*2;
696  // rather ugly, but each iteration correctly places 2 rows
697  CkAssert(numExpectedY==numExpectedX);
698  //TODO HANDLE BORDER grainSizeX|Y funky cases
699 
700  int datasize=actualPoints*sizeof(double);
701  for(int off=0;off<numExpected;off++)
702  {
703  if(off!=offsetMap[off])
704  { // gotta move
705  int want = revOffsetMap[off];
706  int found = offsetMap[off];
707  int currentOffset = off * actualPoints;
708  int wantOffset = want * actualPoints;
709 
710  CmiMemcpy(scratch, &(data[currentOffset]), datasize);
711  CmiMemcpy(&(data[currentOffset]), &(data[wantOffset]), datasize);
712  if(want==found) //simple exchange
713  {
714  CmiMemcpy(&(data[wantOffset]),scratch, datasize);
715  }
716  else // three card shuffle
717  {
718  int foundOffset = found * actualPoints;
719  CmiMemcpy(&(data[wantOffset]), &(data[foundOffset]), datasize);
720  CmiMemcpy(&(data[foundOffset]),scratch, datasize);
721  // 1 more entry is changed
722  offsetMap[want]=offsetMap[found];
723  revOffsetMap[offsetMap[found]]=want;
724 
725  }
726  revOffsetMap[off]=off;
727  offsetMap[off]=off;
728  offsetMap[found]=found;
729  revOffsetMap[found]=found;
730  }
731  }
732  bzero(offsetMap, numExpected*sizeof(int));
733  bzero(revOffsetMap, numExpected*sizeof(int));
734 }
735 
736 /** @todo: The only use of flag_dp has been commented out. Check if this argument is still needed, else weed it out. */
737 void
739 {
740  // TODO: Changes necessary for remainder logic
741  // border tiles aren't of uniform size!
742  // For now we aren't supporting fw streaming with remainder
743  // fw streaming produces incorrect results under as yet unisolated conditions
744  CkAssert(orthoGrainSizeRemX==0 && orthoGrainSizeRemY==0);
745  int tilesq = cfg.orthoGrainSize * cfg.orthoGrainSize;
746  CkMulticastMgr *mcastGrp=CProxy_CkMulticastMgr(mCastGrpIdOrtho).ckLocalBranch();
747  // for(int orthoX=0; orthoX<numOrtho; orthoX++)
748  // for(int orthoY=0; orthoY<numOrtho; orthoY++)
749 
750  int progcounter=0;
751  for(int orthoIndex=0;orthoIndex<numOrtho;orthoIndex++)
752  {
753  // copy into submatrix, contribute
754  // we need to stride by cfg.grainSize and copy by orthoGrainSize
755  // int orthoIndex=orthoX*numOrthoCol+orthoY;
756 
757  if(touchedTiles[orthoIndex]==tilesq)
758  {
759 #ifdef _PAIRCALC_DEBUG_CONTRIB_
760  if(cfg.isSymmetric)
761  CkPrintf("[%d,%d,%d,%d,%d]: contributes %d \n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z, cfg.isSymmetric, orthoIndex);
762 #endif
763  // if (flag_dp) {
764  // for (int i = 0; i < cfg.orthoGrainSize * cfg.orthoGrainSize; i++)
765  // outTiles[orthoIndex][i] *= 2.0;
766  // }
767 #ifdef _PAIRCALC_DEBUG_PARANOID_FW_
768  int orthoX=orthoIndex/numOrthoCol;
769  int orthoY=orthoIndex%numOrthoCol;
770  char filename[80];
771  snprintf(filename,80,"fwoutTile_%d_%d:",orthoX,orthoY);
772  dumpMatrix(filename, outTiles[orthoIndex], cfg.orthoGrainSize, cfg.orthoGrainSize,thisIndex.x+orthoX*cfg.orthoGrainSize, thisIndex.y+orthoY*cfg.orthoGrainSize);
773 #endif
774 
775 #ifdef _NAN_CHECK_
776  for(int i=0; i<cfg.orthoGrainSize * cfg.orthoGrainSize; i++)
777  CkAssert( isfinite(outTiles[orthoIndex][i]) );
778 #endif
779 
780  mcastGrp->contribute(cfg.orthoGrainSize * cfg.orthoGrainSize*sizeof(double), outTiles[orthoIndex], sumMatrixDoubleType, orthoCookies[orthoIndex], orthoCB[orthoIndex]);
781  //mcastGrp->contribute(cfg.orthoGrainSize*orthoGrainSize*sizeof(double), outTiles[orthoIndex], CkReduction::sum_double, orthoCookies[orthoIndex], orthoCB[orthoIndex]);
782  touchedTiles[orthoIndex]=0;
783  if(++progcounter>8)
784  {progcounter=0;CmiNetworkProgress();}
785  }
786  else if(touchedTiles[orthoIndex]>tilesq)
787  {
788  CkPrintf("tile orthoIndex %d has %d vs %d\n",orthoIndex, touchedTiles[orthoIndex], tilesq);
789  CkAbort("invalid large number of tiles touched");
790  }
791  else
792  {
793 #ifdef _PAIRCALC_DEBUG_CONTRIB_
794  CkPrintf("[%d,%d,%d,%d,%d]: %i not ready with %d \n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z, cfg.isSymmetric, orthoIndex, touchedTiles[orthoIndex]);
795 #endif
796  }
797  }
798 }
799 
800 
801 /**
802  * Forward path multiply. Essentially does a simple GEMM on the input matrices.
803  *
804  * A = [numExpectedX, numPoints] (left matrix, input)
805  * B = [numExpectedY, numPoints] (right matrix, input)
806  * C = [numExpectedX, numExpectedY] (output matrix, also called S matrix)
807  *
808  * In terms of the input matrix dimensions, we need:
809  * [numExpectedX, numExpectedY] = [numExpectedX, numPoints] X [numPoints, numExpectedY]
810  *
811  *
812  * GEMM performs: C = alpha * op(A).op(B) + beta * C
813  * where the matrix dimensions are:
814  * op(A) = [m,k]
815  * op(B) = [k,n]
816  * C = [m,n]
817  * with
818  * m = numExpectedX
819  * k = numPoints
820  * n = numExpectedY
821  *
822  * At first glance, it appears we need to transpose matrix B to make this work.
823  * However, since GEMMs are fortran routines, they have a transposed view of the data
824  * i.e, a matrix M[r,c] in C++ appears to be a matrix M'[c,r] if the raw array is
825  * directly passed into fortran.
826  *
827  * The input arrays passed directly into fortran routines will appear to be of dimensions
828  * [numPoints, numExpectedX] and [numPoints, numExpectedY]. Hence, to achieve the desired multiply,
829  * we transpose the first matrix (A). This gives us the solution matrix we want in one step.
830  */
832 {
833 #ifdef _PAIRCALC_DEBUG_
834  CkPrintf("[%d,%d,%d,%d,%d] PairCalculator::multiplyForward() Starting forward path computations.\n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,cfg.isSymmetric);
835 #endif
836 
837  // Allocate space for the fw path results if needed
838  if(!existsOut)
839  {
840  CkAssert(outData==NULL);
841  outData = new internalType[grainSizeX * grainSizeY];
842  bzero(outData, sizeof(internalType)* grainSizeX * grainSizeY);
843  existsOut=true;
844 #ifdef _PAIRCALC_DEBUG_
845  CkPrintf("[%d,%d,%d,%d,%d] Allocated outData %d * %d\n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,cfg.isSymmetric,grainSizeX, grainSizeY);
846 #endif
847  }
848 
849  // Configure the inputs to the GEMM describing the matrix dimensions and operations
850 #ifdef CP_PAIRCALC_USES_COMPLEX_MATH
851  char transformT = 'C'; // Transpose and conjugate of matrix A
852 #else
853  char transformT = 'T'; // Transpose matrix A
854 #endif
855  char transform = 'N'; // Retain matrix B as it is
856  int m_in = numExpectedY; // Rows of op(A) = Rows of C
857  int k_in = numPoints; // Columns of op(A) = Rows of op(B)
858  int n_in = numExpectedX; // Columns of op(B) = Columns of C
859  double alpha = double(1.0); // Scale B.A by this scalar factor
860  double beta = double(0.0); // Scale initial value of C by this factor
861 
862  // Get handles to the input and output matrices
863  internalType *matrixC = outData;
864  internalType *matrixB = reinterpret_cast<internalType*> ( msgLeft->data() );
865  internalType *matrixA;
867  matrixA = reinterpret_cast<internalType*> ( msgRight->data() );
868  else
869  {
870  // Symm PC chares on the array diagonal only get a left matrix. For these B serves as A too
871  matrixA = matrixB;
872  // Redundant, as numExpectedX == numExpectedY (except for the border chares?)
873  m_in = numExpectedX;
874  }
875 #ifdef TEST_ALIGN
876  CkAssert((unsigned int)matrixA%16==0);
877  CkAssert((unsigned int)matrixB%16==0);
878  CkAssert((unsigned int)matrixC%16==0);
879 #endif
880 
881  // If internal representation is as doubles, treat each complex as 2 doubles
882  k_in *= pcDataSizeFactor;
883  // Double packing (possible only in symm PC for real input) entails a scaling factor for psi
884  if (flag_dp)
885  alpha = 2.0;
886 
887 #ifdef _PAIRCALC_DEBUG_PARANOID_FW_
888  dumpMatrix("fwlmdata", matrixB, numExpectedX, numPoints*2, thisIndex.x, 0);
889  dumpMatrix("fwrmdata", matrixA, numExpectedY, numPoints*2, thisIndex.y, 0);
890 #endif
891 #ifdef PRINT_DGEMM_PARAMS
892  CkPrintf("HEY-DGEMM %c %c %d %d %d %f %f %d %d %d\n", transformT, transform, m_in, n_in, k_in, alpha, beta, k_in, k_in, m_in);
893 #endif
894 
895  // Invoke the DGEMM (and bracket it in projections)
896 #ifdef CMK_TRACE_ENABLED
897  double StartTime=CmiWallTimer();
898 #endif
899  myGEMM(&transformT, &transform, &m_in, &n_in, &k_in, &alpha, matrixA, &k_in, matrixB, &k_in, &beta, matrixC, &m_in);
900 #ifdef CMK_TRACE_ENABLED
901  traceUserBracketEvent(210, StartTime, CmiWallTimer());
902 #endif
903 
904 #ifdef CMK_TRACE_ENABLED
905  StartTime=CmiWallTimer();
906 #endif
907 
908  // Do the slicing and dicing, and contribute the appropriate bits to the redn that reaches Ortho
909  contributeSubTiles(matrixC);
910 #ifdef _CP_SUBSTEP_TIMING_
911  if(cfg.forwardTimerID > 0)
912  {
913  double pstart=CmiWallTimer();
914  contribute(sizeof(double),&pstart,CkReduction::max_double, cfg.endTimerCB , cfg.forwardTimerID);
915  }
916 #endif
917 #ifdef CMK_TRACE_ENABLED
918  traceUserBracketEvent(220, StartTime, CmiWallTimer());
919 #endif
920 
921  // Mirror our input data to the phantoms so that they can use it in the bw path
923  {
924  CkAssert(existsRight);
925  paircalcInputMsg *msg2phantom = new (numExpectedY*numPoints, 8*sizeof(int)) paircalcInputMsg(numPoints,0,false,flag_dp,msgRight->data(),false,blkSize,numExpectedY);
926  bool prioPhan=false;
927  if(prioPhan)
928  {
929  CkSetQueueing(msg2phantom, CK_QUEUEING_IFIFO);
930  *(int*)CkPriorityPtr(msg2phantom) = 1; // just make it slower than non prioritized
931  }
932  thisProxy(thisIndex.w,thisIndex.y, thisIndex.x,thisIndex.z).acceptRightData(msg2phantom);
933  }
934 
935  /** If this is an asymmetric loop, dynamics case AND Ortho has already sent T,
936  * call bwMultiplyDynOrthoT() as we must also multiply orthoT by Fpsi
937  *
938  * @note: This if condition originally lived in acceptPairData(). Has been shoveled here
939  * to reduce branching over there.
940  */
942  thisProxy(thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z).bwMultiplyDynOrthoT();
943 }
944 
945 
946 
947 
948 void PairCalculator::contributeSubTiles(internalType *fullOutput)
949 {
950  #ifdef _PAIRCALC_DEBUG_
951  CkPrintf("[%d,%d,%d,%d,%d]: contributeSubTiles \n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z, cfg.isSymmetric);
952  #endif
953  /**
954  * Done:
955  * necessary changes:
956  * 1: border tiles are not uniform size
957  * 2: offset calculation must handle non-uniformity
958  * solutions, use sGrainSize for all row skips
959  * use orthoGrainSizeX or orthoGrainSizeY as needed for row and column iteration
960  */
961 
962  CkMulticastMgr *mcastGrp=CProxy_CkMulticastMgr(mCastGrpIdOrtho).ckLocalBranch();
963  #ifdef _PAIRCALC_DEBUG_PARANOID_FW_
964  dumpMatrix("fullOutput", fullOutput, grainSizeX, grainSizeY, thisIndex.x, thisIndex.y);
965  #endif
966  internalType *outTile;
967  bool reuseTile = false;
968  bool borderX = orthoGrainSizeRemX != 0;
969  bool borderY = orthoGrainSizeRemY != 0;
970  bool borderXY = borderX && borderY;
971 
972  // remainder logic happens if you are a border sGrain
973  if(! borderX && ! borderY)
974  { // only do once to cut down on new/delete
975  reuseTile = true;
976  outTile = new internalType[cfg.orthoGrainSize * cfg.orthoGrainSize];
977  bzero(outTile,sizeof(internalType) * cfg.orthoGrainSize * cfg.orthoGrainSize);
978  }
979  // forward multiply ldc
980  int bigGindex=grainSizeY;
981 
982  for(int orthoX = 0; orthoX < numOrthoCol ; orthoX++)
983  {
984  // advance tilestart to new column
985  // only the size of tiles in last column is affected
986  int orthoGrainSizeX=(orthoX == numOrthoCol-1) ? cfg.orthoGrainSize + orthoGrainSizeRemX : cfg.orthoGrainSize;
987  int orthoXoff=cfg.orthoGrainSize*bigGindex*orthoX;
988  for(int orthoY=0; orthoY<numOrthoRow; orthoY++)
989  {
990  int orthoYoff=orthoY*cfg.orthoGrainSize;
991  int tileStart=orthoYoff+orthoXoff;
992  // only the last row is affected
993  int orthoGrainSizeY=(orthoY==numOrthoRow-1) ? cfg.orthoGrainSize+orthoGrainSizeRemY : cfg.orthoGrainSize;
994  int tileSize=orthoGrainSizeX*orthoGrainSizeY;
995  int bigOindex=orthoGrainSizeY;
996  int ocopySize=bigOindex*sizeof(internalType);
997  int orthoIndex=orthoX*numOrthoCol+orthoY;
998  if(! reuseTile)
999  {
1000  outTile = new internalType[tileSize];
1001  bzero(outTile,sizeof(internalType)*tileSize);
1002  }
1003  // copy into submatrix, contribute
1004  // we need to stride by cfg.grainSize and copy by orthoGrainSize
1005  CkAssert(orthoIndex<numOrtho);
1006  for(int ystart=0, itileStart=tileStart; ystart<tileSize; ystart+=bigOindex, itileStart+=bigGindex)
1007  CmiMemcpy(&(outTile[ystart]),&(fullOutput[itileStart]),ocopySize);
1008 
1009  #ifdef _PAIRCALC_DEBUG_PARANOID_FW_
1010  char filename[80];
1011  snprintf(filename,80,"fwoutTile_%d_%d:",orthoX,orthoY);
1012  dumpMatrix(filename, outTile, orthoGrainSizeX, orthoGrainSizeY,thisIndex.x+orthoX*cfg.orthoGrainSize, thisIndex.y+orthoY*cfg.orthoGrainSize);
1013  #endif
1014 
1015  mcastGrp->contribute(tileSize*sizeof(internalType), outTile, sumMatrixDoubleType, orthoCookies[orthoIndex], orthoCB[orthoIndex]);
1016  if(! reuseTile)
1017  delete [] outTile;
1018  }
1019  }
1020  if(reuseTile)
1021  delete [] outTile;
1022 }
1023 
1024 
1025 
1026 
1027 /** Basically calls collectTile to accept incoming ortho data from an Ortho chare
1028  * @note This method is used only during dynamics in the asymm loop. Talk to EB for more info
1029  */
1031 {
1032 //////////////////////////////////////////////////////////////////////////////
1033 /// Do not delete msg. Its a nokeep.
1034 //////////////////////////////////////////////////////////////////////////////
1035  ///ollect orthoT from matrix2
1036  CkAssert(expectOrthoT);
1037 
1038  numRecdBWOT++;
1039  // CkPrintf("[%d,%d,%d,%d,%d] acceptOrthoT, numRecdBWOT now %d \n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,cfg.isSymmetric,numRecdBWOT);
1040 #ifdef _NAN_CHECK_
1041  for(int i=0;i<msg->size;i++)
1042  CkAssert( isfinite(msg->matrix1[i]) );
1043 #endif
1044 
1045  int size=msg->size;
1046  int size2=msg->size2;
1047  internalType *matrix1=msg->matrix1;
1048  internalType *matrix2=msg->matrix2;
1049 #ifdef TEST_ALIGN
1050  CkAssert((unsigned int) msg->matrix2 %16 ==0);
1051 #endif
1052  int maxorthostateindex=(cfg.numStates/cfg.orthoGrainSize-1)*cfg.orthoGrainSize;
1053  // find our tile indices within this sGrain
1054  int orthoX=msg->orthoX*cfg.orthoGrainSize;
1055 
1056  int orthoY=msg->orthoY*cfg.orthoGrainSize;
1057  ///? @todo Document this after talking with EB. Shouldnt it be an error if orthoX/Y > maxorthostateindex?
1058  orthoX= (orthoX>maxorthostateindex) ? maxorthostateindex : orthoX;
1059  orthoY= (orthoY>maxorthostateindex) ? maxorthostateindex : orthoY;
1060  orthoX=(orthoX-thisIndex.x)/cfg.orthoGrainSize;
1061  orthoY=(orthoY-thisIndex.y)/cfg.orthoGrainSize;
1062 
1063  int orthoGrainSizeY=(orthoY==numOrthoRow-1) ? cfg.orthoGrainSize+orthoGrainSizeRemY : cfg.orthoGrainSize;
1064 
1065  int orthoGrainSizeX=(orthoX == numOrthoCol-1) ? cfg.orthoGrainSize + orthoGrainSizeRemX : cfg.orthoGrainSize;
1066  int matrixSize=grainSizeX*grainSizeY;
1067 
1068  collectTile(false, true, false,orthoX, orthoY, orthoGrainSizeX, orthoGrainSizeY, numRecdBWOT, matrixSize, matrix2, matrix1);
1069  if ((numRecdBWOT==numOrtho) && (numRecd == numExpected))
1070  { // forward path beat orthoT
1071  CkPrintf("[%d,%d,%d,%d,%d] Just received all of OrthoT. Triggering forward path multiply which has been pending since Gamma arrival\n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,cfg.isSymmetric);
1072  actionType = NORMALPC;
1073  bool myfalse= false;
1074  thisProxy(thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z).multiplyForward(myfalse);
1075  /** @note: multiplyForward() already checks for numRecdBWOT and calls bwMultiplyDynOrthoT().
1076  * There is no need to call it again here. It appears, this would have been a correctness bug
1077  * that would have showed up if gamma did beat OrthoT. Verify.
1078  */
1079  //thisProxy(thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z).bwMultiplyDynOrthoT();
1080  numRecd = 0;
1081  }
1082 }
1083 
1084 
1085 
1086 
1087 inline void PairCalculator::enqueueBWsend(bool unitcoef, int priority)
1088 {
1089  /// Create a signal message
1090  sendBWsignalMsg *sigmsg;
1092  {
1093  sigmsg = new (8*sizeof(int)) sendBWsignalMsg;
1094  CkSetQueueing(sigmsg, CK_QUEUEING_IFIFO);
1095  *(int*)CkPriorityPtr(sigmsg) = priority; // Just make it slower (default value is 1)
1096  }
1097  else
1098  sigmsg = new sendBWsignalMsg;
1099 
1100  /// Collapse this into 1 flag
1101  if(amPhantom)
1102  sigmsg->otherdata= true;
1103  else if(((!cfg.arePhantomsOn && cfg.isSymmetric) || !unitcoef) && notOnDiagonal)
1104  sigmsg->otherdata=true;
1105  else
1106  sigmsg->otherdata= false;
1107 
1108  /// Either reduce the results or sum them direct in GSpace
1109  if(cfg.isOutputReduced)
1110  thisProxy(thisIndex.w,thisIndex.x, thisIndex.y,thisIndex.z).sendBWResult(sigmsg);
1111  else
1112  thisProxy(thisIndex.w,thisIndex.x, thisIndex.y,thisIndex.z).sendBWResultDirect(sigmsg);
1113 }
1114 
1115 
1116 
1117 
1118 ///PairCalculator::multiplyResult(int size, double *matrix1, double *matrix2)
1120 {
1121 //////////////////////////////////////////////////////////////////////////////
1122 /// Do not delete msg. Its a nokeep.
1123 //////////////////////////////////////////////////////////////////////////////
1124  multiplyResult(msg);
1125 }
1126 
1127 
1128 /**
1129  * Tolerance correction PsiV Backward path multiplication
1130  * This is the same as the regular one matrix backward path with the following exceptions:
1131  * - inDataLeft and inDataRight contain PsiV
1132  * - outData contains the orthoT from the previous (standard) backward path invocation
1133  */
1135 {
1136  #ifdef DEBUG_CP_PAIRCALC_PSIV
1137  CkPrintf("[%d,%d,%d,%d,%d] In multiplyPsiV\n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,cfg.isSymmetric);
1138  #endif
1139 
1140  // If I am a non-phantom chare in the symmetric instance, send a copy of my data to my mirror phantom chare
1142  {
1143  CkAssert(existsRight);
1144  paircalcInputMsg *msg2phantom = new (numExpectedY*numPoints, 8*sizeof(int)) paircalcInputMsg(numPoints,0,false,true,msgRight->data(),true,blkSize,numExpectedY);
1145  bool prioPhan=false;
1146  if(prioPhan)
1147  {
1148  CkSetQueueing(msg2phantom, CK_QUEUEING_IFIFO);
1149  *(int*)CkPriorityPtr(msg2phantom) = 1; // just make it slower than non prioritized
1150  }
1151  thisProxy(thisIndex.w,thisIndex.y, thisIndex.x,thisIndex.z).acceptRightData(msg2phantom);
1152  }
1153 
1154  /// We do not need to go through the all of multiplyresult for PsiV. All we really need is the setup for multiplyHelper
1155  // call helper function to do the math
1156  int size=grainSizeX*grainSizeY;
1157  bool unitcoef=true;
1158  // TODO: figure out relationship between n_in k_in and grainSizeX grainSizeY
1159  int m_in=numPoints*2; // rows of op(A)==rows C
1160  int n_in=grainSizeY; // columns of op(B)==columns C
1161  int k_in=grainSizeX; // columns op(A) == rows op(B)
1162  /*
1163  if(amPhantom)
1164  {
1165  n_in=grainSizeX;
1166  k_in=grainSizeY;
1167  }
1168  */
1169  double beta(0.0);
1170  int orthoX=0;
1171  int orthoY=0;
1172  //BTransform=T offsets for C and A matrices
1173  int BTCoffset=0;
1174  int BTAoffset=0;
1175  //BTransform=N offsets for C and A matrices
1176  int BNCoffset=0;
1177  int BNAoffset=0;
1178  actionType=PSIV;
1179  bwMultiplyHelper(size, outData, NULL, outData, NULL, unitcoef, m_in, n_in, k_in, BNAoffset, BNCoffset, BTAoffset, BTCoffset, orthoX, orthoY, beta, grainSizeX, grainSizeY);
1180  /// Schedule the entry methods that will send the bw results out
1181  enqueueBWsend(unitcoef);
1182 }
1183 
1184 
1185 
1186 
1187 /**
1188  * Backward path multiplication. Involves more GEMMs
1189  *
1190  * The basic idea is to apply the orthonormalized 'correction' matrix to the
1191  * input matrices from the forward path and ship them back to GSpace.
1192  *
1193  * For the symmetric instance, since the input matrices (left and right) are
1194  * the same, there's only one input matrix to apply the correction to. We
1195  * achieve better load balance by making the phantom chares also participate
1196  * in this process. This is the reason the phantoms exist. And this is why
1197  * input data is shipped off to the phantoms at the end of the forward path.
1198  *
1199  * For symm instance:
1200  * phantoms do: inDataRight . transpose(amatrix)
1201  * non-phantoms do: inDataLeft . amatrix
1202  *
1203  * For asymm instance:
1204  * inDataLeft . amatrix
1205  * - inDataRight . amatrix2
1206  */
1208 {
1209  //////////////////////////////////////////////////////////////////////////////
1210  // Do not delete msg. Its a nokeep.
1211  //////////////////////////////////////////////////////////////////////////////
1212 
1213  #ifdef _PAIRCALC_DEBUG_
1214  CkPrintf("[%d,%d,%d,%d,%d]: MultiplyResult from orthoX %d orthoY %d size %d numRecdBW %d actionType %d amPhantom %d notOnDiagonal %d cfg.arePhantomsOn %d symmetricOnDiagonal %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z, cfg.isSymmetric, msg->orthoX, msg->orthoY, msg->size, numRecdBW, msg->actionType, amPhantom, notOnDiagonal, cfg.arePhantomsOn, symmetricOnDiagonal);
1215  #endif
1216  #ifdef _CP_SUBSTEP_TIMING_
1217  if(cfg.backwardTimerID>0)
1218  if(numRecdBW==0)
1219  {
1220  double pstart=CmiWallTimer();
1221  contribute(sizeof(double),&pstart,CkReduction::min_double, cfg.beginTimerCB , cfg.backwardTimerID);
1222  }
1223  #endif
1224 
1225  //-------------------- Set the stage for the bw path work. Counters, flags, buffers etc -------
1226 
1227  /// Increment the number of tiles received in the backward path
1228  numRecdBW++;
1229 
1230  #ifdef _NAN_CHECK_
1231  for(int i=0;i<msg->size;i++)
1232  CkAssert( isfinite(msg->matrix1[i]) );
1233  #endif
1234 
1235  int size=msg->size;
1236  int size2=msg->size2;
1237  internalType *matrix1=msg->matrix1;
1238  internalType *matrix2=msg->matrix2;
1239 
1240  #ifdef TEST_ALIGN
1241  if((unsigned int) msg->matrix1 %16 !=0)
1242  {
1243  CkPrintf("msg->matrix1 is %p\n",msg->matrix1);
1244  }
1245  CkAssert((unsigned int) msg->matrix1 %16 ==0);
1246  CkAssert((unsigned int) msg->matrix2 %16 ==0);
1247  #endif
1248 
1249  actionType=msg->actionType;
1250  bool unitcoef = false;
1251 
1252  /// Get the indices of the ortho that chare that sent this data to us
1253  int orthoX=msg->orthoX*cfg.orthoGrainSize;
1254  int orthoY=msg->orthoY*cfg.orthoGrainSize;
1255  // Phantom chares get the T matrix from the orthos corresponding to their mirrors. Hence swap their indices
1256  if(amPhantom)
1257  {
1258  int swap=orthoY;
1259  orthoY=orthoX;
1260  orthoX=swap;
1261  }
1262 
1263  // The state index corresponding to the last ortho tile
1264  int maxorthostateindex=(cfg.numStates/cfg.orthoGrainSize-1)*cfg.orthoGrainSize;
1265  // Find our tile indices within this sGrain
1266  orthoX= (orthoX>maxorthostateindex) ? maxorthostateindex : orthoX;
1267  orthoY= (orthoY>maxorthostateindex) ? maxorthostateindex : orthoY;
1268  orthoX=(orthoX-thisIndex.x)/cfg.orthoGrainSize;
1269  orthoY=(orthoY-thisIndex.y)/cfg.orthoGrainSize;
1270 
1271  // Compute the grainsize of the ortho that just sent this data
1272  int orthoGrainSizeY=(orthoY==numOrthoRow-1) ? cfg.orthoGrainSize+orthoGrainSizeRemY : cfg.orthoGrainSize;
1273  int orthoGrainSizeX=(orthoX == numOrthoCol-1) ? cfg.orthoGrainSize + orthoGrainSizeRemX : cfg.orthoGrainSize;
1274 
1275  // Determine if ortho sent us a matrix2
1276  if(matrix2==NULL||size2<1)
1277  unitcoef = true;
1278 
1279  /// If I am a phantom chare and have not yet received the right matrix data from my non-phantom mirror
1280  if(amPhantom && !existsRight)
1281  {
1282  cfg.areBWTilesCollected=true;
1283  cfg.isBWstreaming=false;
1284  CkPrintf("[%d,%d,%d,%d,%d] Warning! phantom got bw before fw, forcing tile collection\n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,cfg.isSymmetric);
1285  }
1286 
1287  int matrixSize=grainSizeX*grainSizeY;
1288 
1289  /// @note: ASSUMING TMATRIX IS REAL (LOSS OF GENERALITY)
1290 
1291  #ifdef _PAIRCALC_DEBUG_PARANOID_BW_
1292  CkPrintf("orthoGrainSizeX %d orthoGrainSizeY %d orthoX %d orthoY %d e1 %.10g\n",orthoGrainSizeX, orthoGrainSizeY, orthoX, orthoY, msg->matrix1[0]);
1294  {
1295  dumpMatrix("bwm1idata",msg->matrix1,grainSizeX,grainSizeY,0,0,orthoX, orthoY);
1296  if(!unitcoef)
1297  { // CG non minimization case
1298  dumpMatrix("bwm2idata",msg->matrix2,grainSizeX,grainSizeY);
1299  }
1300  }
1301  else
1302  {
1303  dumpMatrix("bwm1idata",msg->matrix1,orthoGrainSizeX,orthoGrainSizeY,0,0,orthoX, orthoY);
1304  if(!unitcoef)
1305  { // CG non minimization case
1306  dumpMatrix("bwm2idata",msg->matrix2,orthoGrainSizeX,orthoGrainSizeY,0,0,orthoX, orthoY);
1307  }
1308  }
1309  #endif
1310 
1311  // default DGEMM for non streaming comp case
1312  int m_in = numPoints * pcDataSizeFactor; // rows of op(A)==rows C
1313  // TODO: is n_in grainSizeX or is k_in?
1314  int n_in=grainSizeY; // columns of op(B)==columns C
1315  int k_in=grainSizeX; // columns op(A) == rows op(B)
1316  double beta(0.0);
1317 
1318  // expectOrthoT is true only in asymm, dynamics.
1319  // Hence only for the asymm, off-diagonal chares in dynamics
1320  // we need to subtract fpsi*orthoT (ie, subtract the result of the GEMM)
1321  if(expectOrthoT && !notOnDiagonal)
1322  beta=1.0;
1323 
1324  // These default to 0, will be set to appropriate value if we're streaming the computation in the BW path
1325  //BTransform=T offsets for C and A matrices
1326  int BTCoffset=0;
1327  int BTAoffset=0;
1328  //BTransform=N offsets for C and A matrices
1329  int BNCoffset=0;
1330  int BNAoffset=0;
1331 
1332 
1333  internalType *amatrix=NULL;
1334  internalType *amatrix2=matrix2; ///< @note: may be overridden later
1335 
1336  // If there's only one paircalc chare (All at once no malloc. index is 0 in this case, so this is silly)
1337  if(cfg.numStates==cfg.grainSize)
1338  amatrix=matrix1;
1339 
1340  /// If the grain size for paircalc and ortho are the same
1342  {
1343  amatrix=matrix1;
1344  // Only one ortho will send to us. Ensure that the appropriate counters are set
1346  }
1347  /// else, if this is a PsiV loop
1348  else if(actionType==PSIV)
1349  {
1350  amatrix=matrix1;
1351  // The other tiles were already collected for PSIV
1353  }
1354  /// else, if PC is collecting all the tiles from Ortho
1355  else if (cfg.areBWTilesCollected)
1356  {
1357  collectTile(true, !unitcoef, false,orthoX, orthoY, orthoGrainSizeX, orthoGrainSizeY, numRecdBW, matrixSize, matrix1, matrix2);
1358  amatrix = inResult1;
1359  amatrix2 = inResult2;
1360  }
1361  else
1362  { // settings for streaming computation on each tile
1363  /* For Reference to collect tiles we do this
1364  * if(cfg.isSymmetric && notOnDiagonal) //swap the non diagonals
1365  * { // this is trickier to do than one might expect
1366  * // because the tiles can have funny shapes
1367  * bigGindex=grainSizeX;
1368  * bigOindex=orthoGrainSizeX;
1369  * orthoXoff=cfg.orthoGrainSize*bigGindex*orthoY;
1370  * orthoYoff=orthoX*cfg.orthoGrainSize;
1371  *
1372  * }
1373  * // which means we set up our multiply for oY lines of oX size
1374  * // embedded in gY lines of gX size
1375  */
1376 
1377  amatrix=matrix1;
1378  if(!unitcoef)
1379  amatrix2=matrix2;
1380  // fix n_in and k_in
1381  n_in=orthoGrainSizeY;
1382  k_in=orthoGrainSizeX;
1383  // For the non-phantom, off-diagonal chares, swap the ortho tile indices
1385  {
1386  int swap=orthoX;
1387  orthoX=orthoY;
1388  orthoY=swap;
1389  }
1390 
1391  // skip to the rows which apply to this ortho
1392  BTCoffset=orthoY * m_in * cfg.orthoGrainSize;
1393  BTAoffset=orthoX * m_in * cfg.orthoGrainSize;
1394  BNCoffset=orthoX * m_in * cfg.orthoGrainSize;
1395  BNAoffset=orthoY * m_in * cfg.orthoGrainSize;
1396 
1397  beta=1.0; // need to sum over tiles within orthoY columns
1398  }
1399 
1400 
1401 
1402  //---------------------------------- Perform the actual computations --------------------------
1403  /// If we have all the input we need (received all the tiles, or streaming the computations)
1405  {
1406  if(actionType!=PSIV && !cfg.areBWTilesCollected && n_in*k_in > size)
1407  {
1408  CkPrintf("[%d,%d,%d,%d,%d] Warning! your n_in %d and k_in %d is larger than matrix1->size %d\n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,cfg.isSymmetric, n_in,k_in,size);
1409  n_in=cfg.orthoGrainSize;
1410  k_in=cfg.orthoGrainSize;
1411  }
1412  // Call helper function to do the math
1413  bwMultiplyHelper(size, matrix1, matrix2, amatrix, amatrix2, unitcoef, m_in, n_in, k_in, BNAoffset, BNCoffset, BTAoffset, BTCoffset, orthoX, orthoY, beta, orthoGrainSizeX, orthoGrainSizeY);
1414  }
1415 
1416 
1417  //------------------------------ Send out the results as appropriate --------------------------
1418  //#define SKIP_PARTIAL_SEND
1419  #ifndef SKIP_PARTIAL_SEND
1420  /// If we're stream computing without any barriers, simply send whatever results are ready now
1422  {
1423  bwSendHelper( orthoX, orthoY, k_in, n_in, orthoGrainSizeX, orthoGrainSizeY);
1424  /// If we're done with all the paircalc work in this loop (iteration), then cleanup
1425  if(numRecdBW==numOrtho)
1427  }
1428  #else
1429  /// If SKIP_PARTIAL_SEND is defined, and we're streaming, then we send only if we've received all the tiles
1431  enqueueBWsend(unitcoef);
1432  else
1433  {
1434  CkPrintf("[%d,%d,%d,%d,%d] not sending cfg.isBWstreaming %d cfg.areBWTilesCollected %d cfg.isBWbarriered %d actionType %d numRecdBW %d numOrtho%d \n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,cfg.isSymmetric,cfg.isBWstreaming,cfg.areBWTilesCollected,cfg.isBWbarriered,actionType, numRecdBW,numOrtho);
1435  }
1436  #endif
1437 
1438 
1439  /**
1440  * If all the computations are done, and we're either collecting tiles or are barriered, then we need to send our results to GSpace now
1441  *
1442  * @note: We do not enter this if we're streaming without barriers, as in that case data will be sent out as it is ready
1443  *
1444  * @note: cfg.orthoGrainSize == cfg.grainSize implies this PC chare will get only one msg in the bw path.
1445  * This is covered by the check numRecdBW==numOrtho, hence is just for paranoia.
1446  *
1447  * @todo: It appears actionType = PSIV will not enter multiplyResult. Needs to be verified as lots of conditions here chk for actionType==PSIV
1448  */
1451  || actionType==PSIV
1452  )
1453  {
1454  /// If we're barriered on the backward path
1455  if(cfg.isBWbarriered)
1456  {
1457  int wehaveours=1;
1458  contribute(sizeof(int),&wehaveours,CkReduction::sum_int,CkCallback(CkIndex_PairCalculator::bwbarrier(NULL),thisProxy));
1459  }
1460  else
1461  enqueueBWsend(unitcoef);
1462 
1463  /// If we're not streaming, delete inResult*
1464  if(cfg.conserveMemory>=0 && (cfg.areBWTilesCollected || !unitcoef))
1465  {
1466  if(inResult2!=NULL)
1467  delete [] inResult2;
1468  if(inResult1!=NULL)
1469  delete [] inResult1;
1470  inResult1=NULL;
1471  inResult2=NULL;
1472  }
1473  }
1474 }
1475 
1476 
1477 
1478 /** Resets counters, zeroes out data structures, frees mem when appropriate.
1479  * Readies other components (collators) for the next iteration. Ends the backward path substep
1480  * timing. Paircalc behavior under different cfg.conserveMemory settings are defined solely in this
1481  * method.
1482  */
1484 {
1485  #ifdef _PAIRCALC_DEBUG_
1486  CkPrintf("[%d,%d,%d,%d,%d] Cleaning up at end of BW path\n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,cfg.isSymmetric);
1487  #endif
1488 
1489  /// Reset the backward path tile counter
1490  numRecdBW = 0;
1491 
1492  /// For all mem usage modes
1493  {
1494  /// If we're stream computing, zero out these containers
1495  if (cfg.isBWstreaming)
1496  {
1497  bzero(columnCount, sizeof(int) * numOrthoCol);
1498  bzero(columnCountOther, sizeof(int) * numOrthoCol);
1499  }
1500 
1501  /// Phantom chares will get a fresh right matrix every time in an incoming msg. Hence, delete it irrespective of cfg.conserveMemory or RDMA settings
1502  if (amPhantom)
1503  {
1504  delete msgRight;
1505  msgRight = 0;
1506  inDataRight = 0;
1507  existsRight = false;
1508  }
1509  }
1510 
1511  /// For only the speed-optimized (high-mem) mode
1512  if (cfg.conserveMemory == -1)
1513  {
1514  if (!amPhantom)
1515  {
1516  bzero(mynewData,numPoints*numExpectedY* sizeof(inputType));
1517  if (othernewData) ///< @todo: is this redundant? isnt othernewData always allocated?
1518  bzero(othernewData,numPoints*numExpectedX * sizeof(inputType));
1519  }
1520  else
1521  {
1522  if (othernewData) ///< @todo: is this redundant? isnt othernewData always allocated?
1523  bzero(othernewData,numPoints*numExpectedY * sizeof(inputType));
1524  }
1525  }
1526 
1527 
1528  /// For the normal and strict low-mem modes
1529  if (cfg.conserveMemory >= 0)
1530  {
1531  /// These deletes lived in sendBWResult() and sendBWResultDirect()
1532  if (!amPhantom)
1533  {
1534  if (mynewData)
1535  delete [] mynewData;
1536  mynewData = 0;
1537  existsNew = false;
1538  }
1539  if (othernewData)
1540  delete [] othernewData;
1541  othernewData = 0;
1542  }
1543 
1544 
1545  /// For the strictest of the low-mem footprint modes,
1546  if (cfg.conserveMemory == 1)
1547  {
1548  /** Delete the input matrices and they'll get reallocated on the next pass.
1549  * Only do this if we dont use RDMA because we dont want to setup an RDMA channel every iteration
1550  */
1551  #ifndef PC_USE_RDMA
1552  delete msgLeft;
1553  msgLeft = 0;
1554  inDataLeft = NULL;
1555  existsLeft = false;
1556  /// If this chare received a right matrix message and it still exists
1558  {
1559  delete msgRight;
1560  msgRight = 0;
1561  inDataRight = 0;
1562  existsRight = false;
1563  }
1564  #endif
1565 
1566  /// Delete the storage for the output matrix too
1567  if(outData!=NULL && actionType!=KEEPORTHO)
1568  {
1569  delete [] outData;
1570  outData = NULL;
1571  existsOut=false;
1572  }
1573  }
1574 
1575 
1576  #ifdef PC_USE_RDMA
1577  // Unless a PsiV step is next, let the collators know that they should now expect the next batch of data via RDMA. If a PsiV step is next, then PsiV data will come in via traditional messages. expectNext() will be called later at the end of multiplyPsiV().
1578  if ( !(cfg.isSymmetric && actionType == KEEPORTHO) )
1579  {
1581  rightCollator->expectNext();
1582  }
1583  else
1584  {
1585  #ifdef DEBUG_CP_PAIRCALC_PSIV
1586  CkPrintf("[%d,%d,%d,%d,%d]: Am NOT notifying the message handlers to expectNext() as a PsiV step is next (actionType=%d). Data should be arriving in messages. \n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z, cfg.isSymmetric, numRecdBW, actionType);
1587  #endif
1588  }
1589  #endif
1590 
1591 
1592  #ifdef _CP_SUBSTEP_TIMING_
1593  /// End the timing for this sub-step.
1594  if(cfg.backwardTimerID > 0)
1595  {
1596  double pstart=CmiWallTimer();
1597  contribute(sizeof(double),&pstart,CkReduction::max_double, cfg.endTimerCB , cfg.backwardTimerID);
1598  }
1599  #endif
1600 }
1601 
1602 
1603 
1604 
1605 void PairCalculator::collectTile(bool doMatrix1, bool doMatrix2, bool doOrthoT, int orthoX, int orthoY, int orthoGrainSizeX, int orthoGrainSizeY, int numRecdBW, int matrixSize, internalType *matrix1, internalType* matrix2)
1606 {
1607 #ifdef _PAIRCALC_DEBUG_
1608  CkPrintf("[%d,%d,%d,%d,%d]: collectTile aggregating numRecdBW %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z, cfg.isSymmetric, numRecdBW);
1609 #endif
1610 
1611 
1612  //do strided copy
1613  // stride amatrix by cfg.grainSize,
1614  // stride matrix1 by orthoGrainSize
1615  // copy out the orthograinSize section
1616  // into amatrix[goffset]
1617  // goffset+=grainSize
1618  if(!doOrthoT && doMatrix1 && numRecdBW==1 && inResult1==NULL) //alloc on first receipt
1619  {
1620  CkAssert(inResult1==NULL);
1621  inResult1 = new internalType[matrixSize];
1622  bzero(inResult1,sizeof(internalType)*matrixSize);
1623  }
1624 
1625  // advance tilestart to new row and column
1626  int bigGindex=grainSizeY;
1627  int bigOindex=orthoGrainSizeY;
1628 
1629  int orthoXoff=cfg.orthoGrainSize*bigGindex*orthoX;
1630  int orthoYoff=orthoY*cfg.orthoGrainSize;
1631 
1632  if(cfg.isSymmetric && notOnDiagonal) //swap the non diagonals
1633  {
1634  if(amPhantom)
1635  {
1636  bigGindex=grainSizeY;
1637  bigOindex=orthoGrainSizeY;
1638  orthoXoff=cfg.orthoGrainSize*bigGindex*orthoX;
1639  orthoYoff=orthoY*cfg.orthoGrainSize;
1640  }
1641  else{
1642  bigGindex=grainSizeX;
1643  bigOindex=orthoGrainSizeX;
1644  orthoXoff=cfg.orthoGrainSize*bigGindex*orthoY;
1645  orthoYoff=orthoX*cfg.orthoGrainSize;
1646  }
1647  }
1648  int tileStart=orthoYoff+orthoXoff;
1649 
1650 
1651  int tileSize=orthoGrainSizeX*orthoGrainSizeY;
1652 
1653  int ocopySize=bigOindex*sizeof(internalType);
1654 
1655  int savetileStart=tileStart;
1656 
1657  if(doMatrix2){ // CG non minimization case have GAMMA
1658  if(inResult2==NULL && numRecdBW==1) //alloc on first receipt
1659  {
1660  inResult2 = new internalType[matrixSize];
1661  bzero(inResult2,sizeof(internalType)*matrixSize);
1662  }
1663  for(int i=0; i<tileSize; i+=bigOindex,tileStart+=bigGindex)
1664  CmiMemcpy(&(inResult2[tileStart]),&(matrix2[i]),ocopySize);
1665 
1666 #ifdef _PAIRCALC_DEBUG_PARANOID_BW_
1667  char filename[80];
1668  snprintf(filename,80,"bwinResult2_%d_%d:",orthoX,orthoY);
1669  dumpMatrix(filename, matrix2, orthoGrainSizeX, orthoGrainSizeY,orthoX*cfg.orthoGrainSize, orthoY*cfg.orthoGrainSize);
1670 #endif
1671 
1672  }
1673  internalType *dest= (doOrthoT) ? outData : inResult1;
1674 
1675  tileStart=savetileStart;
1676 
1677  if(doMatrix1)
1678  {
1679 #ifdef _PAIRCALC_DEBUG_PARANOID_BW_
1680  CkPrintf("[%d,%d,%d,%d,%d]: collectTile copying tile bigOindex %d bigGindex %d tileSize %d tileStart %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z, cfg.isSymmetric, bigOindex, bigGindex, tileSize,tileStart);
1681 #endif
1682 
1683  for(int i=0; i< tileSize; i+=bigOindex,tileStart+=bigGindex)
1684  CmiMemcpy(&(dest[tileStart]),&(matrix1[i]),ocopySize);
1685 #ifdef _PAIRCALC_DEBUG_PARANOID_BW_
1686 
1687  char filename[80];
1688  if(doOrthoT)
1689  snprintf(filename,80,"bworthoT_%d_%d:",orthoX,orthoY);
1690  else
1691  snprintf(filename,80,"bwinResult1_%d_%d:",orthoX,orthoY);
1692  dumpMatrix(filename, matrix1, orthoGrainSizeX, orthoGrainSizeY,orthoX*cfg.orthoGrainSize, orthoY*cfg.orthoGrainSize);
1693 #endif
1694  }
1695 
1696 
1697 }
1698 
1699 
1700 
1701 void PairCalculator::bwbarrier(CkReductionMsg *msg)
1702 {
1703  // everyone is done
1704  delete msg;
1705  /// @todo: Comments says this is a minimization only hack! Figure out!
1706  bool unitcoef=true; ///heap hack for minimzation only
1707  enqueueBWsend(unitcoef);
1708 }
1709 
1710 
1711 
1712 /**
1713  * PRE: amatrix contains the entire matrix for the multiplication.
1714  * matrix contains just what was sent in the trigger message.
1715  */
1716 void PairCalculator::bwMultiplyHelper(int size, internalType *matrix1, internalType *matrix2, internalType *amatrix, internalType *amatrix2, bool unitcoef, int m_in, int n_in, int k_in, int BNAoffset, int BNCoffset, int BTAoffset, int BTCoffset, int orthoX, int orthoY, double beta, int orthoGrainSizeX, int orthoGrainSizeY)
1717 {
1718 #ifdef _PAIRCALC_DEBUG_
1720  streamCaughtR++;
1721  CkPrintf("[%d,%d,%d,%d,%d]: bwMultiplyHelper with size %d numRecdBW %d actionType %d orthoX %d orthoY %d orthoGrainSizeX %d orthoGrainSizeY %d BTCoffset %d BNCoffset %d m_in %d n_in %d k_in %d iter %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z, cfg.isSymmetric, size, numRecdBW, actionType, orthoX, orthoY,orthoGrainSizeX, orthoGrainSizeY, BTCoffset, BNCoffset, m_in, n_in, k_in, streamCaughtR);
1722 #endif
1723 #ifdef _PAIRCALC_DEBUG_PARANOID_BW_
1725  {
1726  dumpMatrix("bwm1cidata",amatrix,grainSizeX,grainSizeY,0,0,0,streamCaughtR);
1727  // CG non minimization case
1728  if(!unitcoef)
1729  dumpMatrix("bwm2cidata",amatrix2,grainSizeX, grainSizeY,0,0,0,streamCaughtR);
1730  }
1731 #endif
1732 
1733  int matrixSize=grainSizeX*cfg.grainSize;
1734 
1735  /* If I will be running a PsiV step immediately after this, reuse outData to hold onto ortho
1736  * It is safe to reuse this memory. The normal backward path has no use for outData
1737  * and the forward path won't be called again until after we're done with it
1738  */
1739  if(cfg.isSymmetric && actionType==KEEPORTHO)
1740  {
1741  if(outData==NULL)
1742  {
1743  CkAssert(!existsOut);
1744  outData=new internalType[matrixSize];
1745  bzero(outData,sizeof(internalType)*matrixSize);
1746  existsOut=true;
1747  }
1748  // Keep the orthoT we just received in matrix1
1750  collectTile(true, false, true,orthoX, orthoY, orthoGrainSizeX, orthoGrainSizeY, numRecdBW, matrixSize, matrix1, matrix2);
1751  else
1752  CmiMemcpy(outData, amatrix, size*sizeof(internalType));
1753  }
1754 
1755 
1756  // Allocate memory for mynewData and othernewData
1757  if(!amPhantom && mynewData==NULL)
1758  {
1759  CkAssert(numPoints>0);
1760 #ifdef _PAIRCALC_DEBUG_
1761  CkPrintf("[%d,%d,%d,%d,%d] Allocated mynewData %d * %d *2 \n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,cfg.isSymmetric,numExpectedY,numPoints);
1762 #endif
1764  bzero(mynewData,numPoints*numExpectedY* sizeof(inputType));
1765  existsNew=true;
1766  if(!amPhantom && ((cfg.isSymmetric || !unitcoef) && notOnDiagonal))
1767  {
1768  if(othernewData==NULL)
1769  {
1770 #ifdef _PAIRCALC_DEBUG_
1771  CkPrintf("[%d,%d,%d,%d,%d] Allocated other %d * %d *2 \n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,cfg.isSymmetric,numExpectedX,numPoints);
1772 #endif
1774  bzero(othernewData,numPoints*numExpectedX * sizeof(inputType));
1775  }
1776  }
1777  }
1778  else if(amPhantom)
1779  {
1780  if(othernewData==NULL)
1781  {
1782 #ifdef _PAIRCALC_DEBUG_
1783  CkPrintf("[%d,%d,%d,%d,%d] Allocated other %d * %d *2 \n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,cfg.isSymmetric,numExpectedY,numPoints);
1784 #endif
1785  // Phantoms live in a bizarre reverso world
1787  bzero(othernewData,numPoints*numExpectedY * sizeof(inputType));
1788  }
1789  }
1790 
1791 
1792  internalType *mynewDatad= reinterpret_cast<internalType*> (mynewData);
1793 
1794  // GEMM configuration
1795  double alpha(1.0);
1796  char transform='N';
1797  // Configure the inputs to the GEMM describing the matrix dimensions and operations
1798 #ifdef CP_PAIRCALC_USES_COMPLEX_MATH
1799  char transformT = 'C'; // Transpose and conjugate of amatrix
1800 #else
1801  char transformT = 'T'; // Transpose amatrix
1802 #endif
1803 
1804 #ifdef _PAIRCALC_DEBUG_PARANOID_BW_
1805  int chunksize=blkSize/cfg.numChunks;
1806  int ystart=chunksize*thisIndex.z;
1807  if(!amPhantom)
1808  dumpMatrix("bwmlodata",inDataLeft,numExpectedX,numPoints*2,thisIndex.x,0);
1809  if(!unitcoef||amPhantom)
1810  { // CG non minimization case
1811  dumpMatrix("bwmrodata",inDataRight,numExpectedY,numPoints*2,thisIndex.y,0);
1812  }
1813 #endif
1814 #ifdef CMK_TRACE_ENABLED
1815  double StartTime=CmiWallTimer();
1816 #endif
1817 
1818  // First multiply to apply the T or L matrix
1819  if(!amPhantom)
1820  {
1821  int lk_in=k_in;
1822  int ln_in=n_in;
1823  if(symmetricOnDiagonal && orthoX!=orthoY && !cfg.areBWTilesCollected && actionType!=PSIV )
1824  {
1825  // here is where we need some more juggling
1826  lk_in=n_in;
1827  ln_in=k_in;
1828  }
1829 #ifdef TEST_ALIGN
1830  CkAssert((unsigned int) &(inDataLeft[BNAoffset] )%16==0);
1831  CkAssert((unsigned int) amatrix %16==0);
1832  CkAssert((unsigned int)&(mynewDatad[BNCoffset] )%16==0);
1833 #endif
1834  if(cfg.isSymmetric)
1835  {
1836 #ifdef PRINT_DGEMM_PARAMS
1837  CkPrintf("HEY-DGEMM %c %c %d %d %d %f %f %d %d %d BNAoffset %d BNCoffset %d\n", transform, transform, m_in, n_in, k_in, alpha, beta, m_in, n_in, m_in, BNAoffset, BNCoffset);
1838 #endif
1839  // Note, your choice of k_in or n_in only matters on the border column, elsewhere k_in==n_in
1840  //orig no valgrind invalid read 8 on A
1841  myGEMM(&transform, &transform, &m_in, &ln_in, &lk_in, &alpha, &(inDataLeft[BNAoffset]), &m_in, amatrix, &lk_in, &beta, &(mynewDatad[BNCoffset]), &m_in);
1842  }
1843  else
1844  {
1845 #ifdef PRINT_DGEMM_PARAMS
1846  CkPrintf("HEY-DGEMM %c %c %d %d %d %f %f %d %d %d\n", transform, transformT, m_in, n_in, k_in, alpha, beta, m_in, n_in, m_in);
1847 #endif
1848  myGEMM(&transform, &transformT, &m_in, &n_in, &k_in, &alpha, &(inDataLeft[BTAoffset]), &m_in, amatrix, &n_in, &beta, &(mynewDatad[BTCoffset]), &m_in);
1849  }
1850 #ifdef CMK_TRACE_ENABLED
1851  traceUserBracketEvent(230, StartTime, CmiWallTimer());
1852 #endif
1853 #ifdef _PAIRCALC_DEBUG_PARANOID_BW_
1854  char snark[80];
1855  snprintf(snark,80,"bwgmodata_%d_%d:",orthoX,orthoY);
1856  if(cfg.isSymmetric)
1857  dumpMatrixComplex(snark,mynewData,numExpectedY,numPoints,0,ystart,streamCaughtR);
1858  else
1859  dumpMatrixComplex(snark,mynewData,numExpectedY,numPoints,0,ystart,streamCaughtR);
1860 #endif
1861  }// end of !amPhantom
1862 
1863 #ifdef CMK_TRACE_ENABLED
1864  StartTime=CmiWallTimer();
1865 #endif
1866 
1867  // Multiply to compensate for the missing triangle in symmetric case
1869  {
1870  internalType *othernewDatad = reinterpret_cast <internalType*> (othernewData);
1871  if(amPhantom)
1872  {
1873  int swap= n_in;
1874  n_in=k_in;
1875  k_in=swap;
1876  }
1877 #ifdef TEST_ALIGN
1878  CkAssert((unsigned int) &(inDataRight[BTAoffset] )%16==0);
1879  CkAssert((unsigned int) amatrix %16==0);
1880  CkAssert((unsigned int)&(othernewDatad[BTCoffset] )%16==0);
1881 #endif
1882 #ifdef PRINT_DGEMM_PARAMS
1883  CkPrintf("HEY-DGEMM %c %c %d %d %d %f %f %d %d %d\n", transform, transformT, m_in, n_in, k_in, alpha, beta, m_in, k_in, m_in);
1884 #endif
1885  myGEMM(&transform, &transformT, &m_in, &k_in, &n_in, &alpha, &(inDataRight[BTAoffset]), &m_in, amatrix, &k_in, &beta, &(othernewDatad[BTCoffset]), &m_in);
1886 #ifdef CMK_TRACE_ENABLED
1887  traceUserBracketEvent(250, StartTime, CmiWallTimer());
1888 #endif
1889 #ifdef _PAIRCALC_DEBUG_PARANOID_BW_
1890  char snark[80];
1891  snprintf(snark,80,"bwgomodata_%d_%d:",orthoX,orthoY);
1892  if(amPhantom)
1893  dumpMatrixComplex(snark,othernewData,numExpectedY,numPoints,0,ystart,streamCaughtR);
1894  else
1895  dumpMatrixComplex(snark,othernewData,numExpectedX,numPoints,0,ystart,streamCaughtR);
1896 #endif
1897  }
1898 
1899  // CG non minimization case GAMMA
1900  if(!unitcoef)
1901  {
1903  // this code is only correct in the non streaming case
1904  // output modified by subtracting an application of orthoT
1905  // C = alpha*A*B + beta*C
1906  // C= -1 * inRight * orthoT + C
1907  internalType *othernewDatad;
1908  alpha=-1.0; ///omes in with a minus sign
1909  if(notOnDiagonal)
1910  {
1911  // setting beta to 0 here means you cannot stream process the
1912  // application of orthoT by tile, you can only process them once
1913  // the completed column has been accumulated in inResult2
1914  beta=0.0; // new contribution off-diagonal
1916  beta=1.0; // need to accumulate
1917  othernewDatad= reinterpret_cast <internalType *> (othernewData);
1918  }
1919  else
1920  {
1921  beta=1.0; //subtract contribution from existing on diagonal
1922  othernewDatad=mynewDatad;
1923  }//endif
1924 
1925  // Funny thing here, this logic works unchanged for remainder case.
1926  // off diagonals use the usual funny size othernewData
1927  // diagonals use a MxM newData
1928  CmiNetworkProgress();
1929 #ifdef PRINT_DGEMM_PARAMS
1930  CkPrintf("HEY-DGEMM %c %c %d %d %d %f %f %d %d %d\n", transform, transform, m_in, n_in, k_in, alpha, beta, m_in, k_in, m_in);
1931 #endif
1932 
1933 #ifdef CMK_TRACE_ENABLED
1934  StartTime=CmiWallTimer();
1935 #endif
1936  myGEMM(&transform, &transform, &m_in, &k_in, &n_in, &alpha, &(inDataRight[BNAoffset]), &m_in, amatrix2, &n_in, &beta, &(othernewDatad[BNCoffset]), &m_in);
1937 #ifdef CMK_TRACE_ENABLED
1938  traceUserBracketEvent(240, StartTime, CmiWallTimer());
1939 #endif
1940 
1941 #ifdef _PAIRCALC_DEBUG_PARANOID_BW_
1942  if(notOnDiagonal) // exists right matrix
1943  if(amPhantom)
1944  dumpMatrixComplex("bwg2modata",othernewData,numExpectedY,numPoints,0,ystart,streamCaughtR);
1945  else
1946  dumpMatrixComplex("bwg2modata",othernewData,numExpectedX,numPoints,0,ystart,streamCaughtR);
1947 #endif
1948  } // end CG case
1949 
1950 #ifdef _PAIRCALC_VALID_OUT_
1951  CkPrintf("[PAIRCALC] [%d,%d,%d,%d,%d] backward gemm out %.10g %.10g %.10g %.10g \n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,cfg.isSymmetric, mynewDatad[0],mynewDatad[1],mynewData[numPoints*grainSizeX-1].re,mynewData[numPoints*grainSizeX-1].im);
1952 #endif
1953 }
1954 
1955 
1956 
1957 /* when streaming and dynamics and ograin<sgrain
1958  * You cannot stream process both gamma and orthoT at the same time.
1959  * Due to the different tranpose the tile ox=1 oy=0 and ox=0 oy=1
1960  * operates on one axis of the gamma operation and a different axes
1961  * for the orthoT. So the gamma output rows you want to use in orthoT
1962  * may not be ready yet.
1963  *
1964  * To avoid this race condition we send orthoT ahead of time (end of
1965  * Symm S->T) and multiply it with Fpsi at the end of the Asymm
1966  * forward path.
1967  *
1968  * Then when gamma is ready for the backward path we do the gamma
1969  * multiply, streaming if desired, and simply perform the subtraction
1970  * in place by using a nonzero beta in the gamma multiply.
1971  */
1973 {
1974 
1975  /// Ensure that we're the correct paircalc instance
1976  CkAssert(expectOrthoT);
1977  /// Ensure that we really have received all the OrthoT pieces
1978  CkAssert( numRecdBWOT == numOrtho);
1979  /// Reset the counter
1980  numRecdBWOT = 0;
1981 
1982  // output modified by subtracting an application of orthoT
1983  // C = alpha*A*B + beta*C
1984  // C= -1 * inRight * orthoT + C
1985 
1986  if(othernewData==NULL)
1987  {
1988 #ifdef _PAIRCALC_DEBUG_
1989  CkPrintf("[%d,%d,%d,%d,%d] Allocated other %d * %d\n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,cfg.isSymmetric,numExpectedX,numPoints);
1990 #endif
1992  bzero(othernewData,numPoints*numExpectedX * sizeof(inputType));
1993  existsNew=true;
1994  }
1995  if(mynewData==NULL)
1996  {
1997  CkAssert(numPoints>0);
1998 #ifdef _PAIRCALC_DEBUG_
1999  CkPrintf("[%d,%d,%d,%d,%d] Allocated mynewData %d * %d\n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,cfg.isSymmetric,numExpectedY,numPoints);
2000 #endif
2001 
2003  bzero(mynewData,numPoints*numExpectedY* sizeof(inputType));
2004  existsNew=true;
2005  }
2006 
2007 #ifdef CMK_TRACE_ENABLED
2008  double StartTime=CmiWallTimer();
2009 #endif
2010  double alpha=-1.0; //set it up with a minus sign
2011  double beta=0.0; // new contribution off-diagonal
2012  internalType *othernewDatad= reinterpret_cast <internalType*> (mynewData);
2013  if(notOnDiagonal)
2014  othernewDatad= reinterpret_cast <internalType*> (othernewData);
2015 
2016  CmiNetworkProgress();
2017 
2018  char transform='N';
2019  // If internal representation is as doubles, treat each complex as 2 doubles
2020  int m_in=numPoints * pcDataSizeFactor;
2021  int n_in=grainSizeY; // columns of op(B)==columns C
2022  int k_in=grainSizeX; // columns op(A) == rows op(B)
2023 #ifdef PRINT_DGEMM_PARAMS
2024  CkPrintf("HEY-DGEMM %c %c %d %d %d %f %f %d %d %d\n", transform, transform, m_in, n_in, k_in, alpha, beta, m_in, k_in, m_in);
2025 #endif
2026  myGEMM(&transform, &transform, &m_in, &k_in, &n_in, &alpha, inDataRight,
2027  &m_in, inResult2, &n_in, &beta, othernewDatad, &m_in);
2028 
2029 #ifdef CMK_TRACE_ENABLED
2030  traceUserBracketEvent(240, StartTime, CmiWallTimer());
2031 #endif
2032 #ifdef _PAIRCALC_DEBUG_PARANOID_BW_
2033  int chunksize=blkSize/cfg.numChunks;
2034  int ystart=chunksize*thisIndex.z;
2035  if(notOnDiagonal)
2036  dumpMatrixComplex("bwg2modata",othernewData,numExpectedX,numPoints,0,ystart,streamCaughtR);
2037 #endif
2038 
2039 }
2040 
2041 
2042 
2043 
2044 void PairCalculator::bwSendHelper(int orthoX, int orthoY, int sizeX, int sizeY, int orthoGrainSizeX, int orthoGrainSizeY)
2045 {
2046 #ifdef _PAIRCALC_DEBUG_
2047  CkPrintf("[%d,%d,%d,%d,%d]: bwSendHelper with numRecdBW %d actionType %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z, cfg.isSymmetric, numRecdBW, actionType);
2048 #endif
2049 
2050 
2051  // Check to see if this column is complete, if so send it.
2052 
2053  // symmetric uses BNCoffset on newData for diagonal.
2054  // symmetric also BTCoffset on otherNewData for off diagonal.
2055  // asymmetric uses BTC offset on newData
2056 
2057  // determine the orthograinsizeXY for the orthoX,orthoY tile
2058  // do the calculation based on numExpectedX numExpectedY
2059  // to account for last row col remainder expansion in PC
2060 
2061 
2062  int maxorthostateindex=(cfg.numStates/cfg.orthoGrainSize-1)*cfg.orthoGrainSize;
2063 
2064  int orthoXgrain=orthoX*cfg.orthoGrainSize;
2065  int orthoYgrain=orthoY*cfg.orthoGrainSize;
2066 
2067  orthoXgrain= (orthoXgrain>maxorthostateindex) ? maxorthostateindex : orthoXgrain;
2068  orthoYgrain= (orthoYgrain>maxorthostateindex) ? maxorthostateindex : orthoYgrain;
2069 
2070  // CkAssert(orthoGrainSizeY==sizeY);
2071  // CkAssert(orthoGrainSizeX==sizeX);
2072  if(cfg.isSymmetric)
2073  {
2074  if(!amPhantom)
2075  {
2076  /* if(notOnDiagonal)
2077  {
2078  int size=sizeX;
2079  sizeX=sizeY;
2080  sizeY=size;
2081  }*/
2082  int index=orthoY;
2083  if(notOnDiagonal)
2084  index=orthoX;
2085  if(++columnCount[index]==numOrthoCol ) // BNC
2086  {
2087 #ifdef _PAIRCALC_DEBUG_
2088  CkPrintf("[%d,%d,%d,%d,%d]: bwSendHelper !amPhantom orthoXgrain %d sizeX %d orthoYgrain %d sizeY %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z, cfg.isSymmetric, orthoXgrain, sizeX, orthoYgrain, sizeY);
2089 #endif
2090 
2091  int startGrain=orthoYgrain;
2092  int endGrain=startGrain+sizeY;
2093  if(notOnDiagonal)
2094  {
2095  startGrain=orthoXgrain;
2096  endGrain=startGrain+sizeY;
2097  }
2098  // send orthoX in newData
2099  if(cfg.isOutputReduced)
2100  sendBWResultColumn(false, startGrain, endGrain);
2101  else
2102  sendBWResultColumnDirect(false, startGrain, endGrain);
2103  }
2104  CkAssert(columnCount[index]<=numOrthoCol);
2105  }
2106 
2107  }
2108  else //asymm
2109  {
2110  if(++columnCount[orthoY]==numOrthoCol) // BTC
2111  {
2112 #ifdef _PAIRCALC_DEBUG_
2113  CkPrintf("[%d,%d,%d,%d,%d]: bwSendHelper asymm orthoYgrain %d sizeY %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z, cfg.isSymmetric, orthoYgrain, sizeY);
2114 #endif
2115 
2116  //send orthoY in newdata
2117  if(cfg.isOutputReduced)
2118  sendBWResultColumn(false, orthoYgrain, orthoYgrain+sizeY);
2119  else
2120  sendBWResultColumnDirect(false, orthoYgrain, orthoYgrain+sizeY);
2121  }
2122  CkAssert(columnCount[orthoY]<=numOrthoCol);
2123  }
2124  // send the othernewData for symm off diag (including phantom) and
2125  // asymm off diag dynamics
2127  {
2128  int index =orthoX;
2129  if(cfg.isSymmetric)
2130  index=orthoY;
2131  if(++columnCountOther[index]==numOrthoCol) // BTC
2132  {
2133 #ifdef _PAIRCALC_DEBUG_
2134  CkPrintf("[%d,%d,%d,%d,%d]: bwSendHelper amPhantom | other orthoXgrain %d sizeX %d orthoYgrain %d sizeY %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z, cfg.isSymmetric, orthoXgrain, sizeX, orthoYgrain, sizeY);
2135 #endif
2136 
2137 
2138  int startGrain=orthoXgrain;
2139  int endGrain=sizeX+startGrain;
2140 
2141  //int startGrain=orthoYgrain;
2142  //int endGrain=sizeY+startGrain;
2143  if(cfg.isSymmetric)
2144  {
2145  startGrain=orthoYgrain;
2146  endGrain=sizeX+startGrain;
2147  }
2148  if(amPhantom)
2149  {
2150  startGrain=orthoYgrain;
2151  //startGrain=orthoYgrain;
2152  // endGrain=startGrain+orthoGrainSizeY;
2153  // endGrain=startGrain+sizeY;
2154  endGrain=startGrain+sizeY;
2155  }
2156 #ifdef _PAIRCALC_DEBUG_
2157  CkPrintf("[%d,%d,%d,%d,%d]: bwSendHelper amPhantom | other startGrain %d endGrain %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z, cfg.isSymmetric, startGrain, endGrain );
2158 #endif
2159  //send orthoY in otherNewData
2160  if(cfg.isOutputReduced)
2161  sendBWResultColumn(true, startGrain, endGrain);
2162  else
2163  sendBWResultColumnDirect(true, startGrain, endGrain);
2164  }
2165  CkAssert(columnCountOther[index]<=numOrthoCol);
2166  }
2167 
2168  // this could be refined to track an array of completed columns
2169  // and send them in some grouping scheme
2170  CkAssert(numRecdBW<=numOrtho);
2171 }
2172 
2173 
2174 
2175 
2176 void
2177 PairCalculator::sendBWResultColumnDirect(bool otherdata, int startGrain, int endGrain )
2178 {
2179 #ifdef _PAIRCALC_DEBUG_CONTRIB_
2180  CkPrintf("[%d,%d,%d,%d,%d]: sendBWResultColumnDirect with otherdata %d actionType %d startGrain %d endGrain %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z, cfg.isSymmetric, otherdata, actionType, startGrain, endGrain);
2181 #endif
2182 
2183 
2184  int cp_entry=cb_ep;
2185  if(actionType==PSIV)
2186  {
2187  cp_entry= cb_ep_tol;
2188  }
2189  CkAssert(endGrain<=cfg.numStates);
2190  int numToSend=endGrain-startGrain;
2191  int permuter=(int) ((float) thisIndex.z/ (float) cfg.numChunks) * (float) numToSend;
2192  if(otherdata){ // we have this othernewdata issue for the symmetric case
2193  // and the asymmetric dynamic case
2194  // for the off diagonal elements
2195  CkAssert(othernewData!=NULL);
2196  int index=thisIndex.x;
2197  if(amPhantom)
2198  index=thisIndex.y;
2199 
2200  for(int j=startGrain;j<endGrain;j++)
2201  {
2202  // shift the send order proportional to chunk index
2203  int jPermuted=(j+permuter>endGrain) ? (j+permuter-numToSend) : (j+permuter) ;
2204 
2205  inputType *computed=&(othernewData[jPermuted*numPoints]);
2206  CkCallback mycb(cp_entry, CkArrayIndex2D(jPermuted+ index ,thisIndex.w), cb_aid);
2207  partialResultMsg *msg=new (numPoints, 8*sizeof(int) ) partialResultMsg;
2209  {
2210  *((int*)CkPriorityPtr(msg)) = cfg.resultMsgPriority;
2211  CkSetQueueing(msg, CK_QUEUEING_IFIFO);
2212  }
2213  msg->init(thisIndex,numPoints, thisIndex.z, computed);
2214 
2215 #ifdef _PAIRCALC_DEBUG_
2216  CkPrintf("[%d,%d,%d,%d,%d]: sending partial other of size %d offset %d to [%d %d]\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z, cfg.isSymmetric, numPoints,jPermuted,index+jPermuted,thisIndex.w);
2217 #endif
2218 
2219 #ifdef _NAN_CHECK_
2220  for(int i=0;i<msg->N ;i++)
2221  {
2222  if( !isfinite(msg->result[i]) )
2223  CkPrintf("[%d,%d,%d,%d,%d]: sendBWResultColumnDirect nan at %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z, cfg.isSymmetric, i);
2224  CkAssert( isfinite(msg->result[i]) );
2225  }
2226 #endif
2227  mycb.send(msg);
2228  }
2229  }
2230  else
2231  {
2232  CkAssert(!amPhantom);
2233  CkAssert(mynewData!=NULL);
2234  for(int j=startGrain;j<endGrain;j++) //mynewdata
2235  {
2236  int jPermuted=(j+permuter>endGrain) ? (j+permuter-numToSend) : (j+permuter) ;
2237  inputType *computed=&(mynewData[jPermuted*numPoints]);
2238 #ifdef _NAN_CHECK_
2239  for(int i=0;i<numPoints ;i++)
2240  {
2241  if( !isfinite(computed[i]) )
2242  {
2243  fprintf(stderr,"[%d,%d,%d,%d,%d]: sendBWResultColumnDirect nan in computed at %d %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z, cfg.isSymmetric, jPermuted,i);
2244  CkAssert( isfinite(computed[i]) );
2245  }
2246  }
2247 #endif
2248  int index=thisIndex.y;
2249  CkCallback mycb(cp_entry, CkArrayIndex2D(jPermuted+index ,thisIndex.w), cb_aid);
2250  partialResultMsg *msg=new (numPoints, 8*sizeof(int) ) partialResultMsg;
2252  {
2253  *((int*)CkPriorityPtr(msg)) = cfg.resultMsgPriority;
2254  CkSetQueueing(msg, CK_QUEUEING_IFIFO);
2255  }
2256  msg->init(thisIndex,numPoints, thisIndex.z, computed);
2257  /*
2258  msg->N=numPoints;
2259  msg->myoffset = thisIndex.z; // chunkth
2260  CmiMemcpy(msg->result,mynewData+jPermuted*numPoints,msg->N*sizeof(inputType));
2261  */
2262 #ifdef _PAIRCALC_DEBUG_
2263  CkPrintf("[%d,%d,%d,%d,%d]:sending partial of size %d offset %d to [%d %d]\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z, cfg.isSymmetric, numPoints,jPermuted,index+jPermuted,thisIndex.w);
2264 #endif
2265 
2266 #ifdef _NAN_CHECK_
2267  for(int i=0;i<msg->N ;i++)
2268  {
2269  if( !isfinite(msg->result[i]) )
2270  CkPrintf("[%d,%d,%d,%d,%d]: sendBWResultColumnDirect nan at %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z, cfg.isSymmetric, i);
2271  CkAssert( isfinite(msg->result[i]) );
2272  }
2273 #endif
2274  mycb.send(msg);
2275  }
2276  }
2277 }
2278 
2279 
2280 
2281 /**
2282  * Send the result for this column
2283  */
2284 
2285 void PairCalculator::sendBWResultColumn(bool otherdata, int startGrain, int endGrain )
2286 {
2287 
2288 
2289 #ifdef _PAIRCALC_DEBUG_CONTRIB_
2290  CkPrintf("[%d,%d,%d,%d,%d]: sendBWResultColumn with actionType %d startGrain %d sendGrain %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z, cfg.isSymmetric, actionType, startGrain, endGrain);
2291 #endif
2292 
2293  // Now we have results in mynewData and if(otherData) othernewData
2294  CkMulticastMgr *mcastGrp=CProxy_CkMulticastMgr(mCastGrpId).ckLocalBranch();
2295  int cp_entry=cb_ep;
2296  if(actionType==PSIV)
2297  {
2298  cp_entry= cb_ep_tol;
2299  }
2300  if(otherdata){ // we have this othernewdata issue for the symmetric case
2301  // and the asymmetric dynamic case
2302  // for the off diagonal elements
2303  CkAssert(othernewData!=NULL);
2304  int index=thisIndex.x;
2305  if(amPhantom)
2306  index=thisIndex.y;
2307  for(int j=startGrain;j<endGrain;j++)
2308  {
2309  //this callback creation could be obviated by keeping an
2310  //array of callbacks, not clearly worth doing
2311  inputType *computed=&(othernewData[j*numPoints]);
2312 #ifdef CMK_TRACE_ENABLED
2313  double StartTime=CmiWallTimer();
2314 #endif
2315 
2316  CkCallback mycb(cp_entry, CkArrayIndex2D(j+index ,thisIndex.w), cb_aid);
2317 #ifdef _PAIRCALC_DEBUG_CONTRIB_
2318  CkPrintf("[%d,%d,%d,%d,%d] contributing other %d offset %d to [%d %d]\n",thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z, cfg.isSymmetric, numPoints,j,thisIndex.x+j,thisIndex.w);
2319 #endif
2320 
2321  int outOffset=thisIndex.z;
2322  mcastGrp->contribute(numPoints*sizeof(inputType), computed, sumMatrixDoubleType, otherResultCookies[j], mycb, outOffset);
2323 
2324 #ifdef CMK_TRACE_ENABLED
2325  traceUserBracketEvent(220, StartTime, CmiWallTimer());
2326 #endif
2327  // if((j-startGrain) % 8)
2328  CmiNetworkProgress();
2329 
2330  }
2331  }
2332  else
2333  {
2334  CkAssert(mynewData!=NULL);
2335  int index=thisIndex.y;
2336  for(int j=startGrain;j<endGrain;j++) //mynewdata
2337  {
2338 
2339  inputType *computed=&(mynewData[j*numPoints]);
2340 #ifdef CMK_TRACE_ENABLED
2341  double StartTime=CmiWallTimer();
2342 #endif
2343  CkCallback mycb(cp_entry, CkArrayIndex2D(j+thisIndex.y ,thisIndex.w), cb_aid);
2344 
2345 #ifdef _PAIRCALC_DEBUG_CONTRIB_
2346  CkPrintf("[%d,%d,%d,%d,%d] contributing %d offset %d to [%d %d]\n",thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z, cfg.isSymmetric,numPoints,j,thisIndex.y+j,thisIndex.w);
2347 #endif
2348 
2349  int outOffset=thisIndex.z;
2350  mcastGrp->contribute(numPoints*sizeof(inputType), computed, sumMatrixDoubleType, resultCookies[j], mycb, outOffset);
2351 
2352 #ifdef CMK_TRACE_ENABLED
2353  traceUserBracketEvent(220, StartTime, CmiWallTimer());
2354 #endif
2355 
2356  // if((j-startGrain) % 8)
2357  CmiNetworkProgress();
2358  }
2359  }
2360 }
2361 
2362 
2363 
2365 {
2366  #ifdef _PAIRCALC_DEBUG_
2367  CkPrintf("[%d,%d,%d,%d,%d]: sendBWResultDirect with actionType %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z, cfg.isSymmetric, actionType);
2368  #endif
2369  // Now we have results in mynewData and if(cfg.isSymmetric) othernewData
2370  bool otherdata=msg->otherdata;
2371  delete msg;
2372  /// Determine the callback to use
2373  int cp_entry=cb_ep;
2374  if(actionType==PSIV)
2375  cp_entry= cb_ep_tol;
2376 
2377  /// Off-diagonal PCs in the symm (and asymm, in dynamics) instance, have to handle othernewdata
2378  if(otherdata)
2379  {
2380  CkAssert(othernewData!=NULL);
2381  int size=grainSizeX;
2382  int index=thisIndex.x;
2383  if(amPhantom)
2384  {
2385  index=thisIndex.y;
2386  size=grainSizeY;
2387  }
2388  for(int j=0;j<size;j++)
2389  {
2390  inputType *computed=&(othernewData[j*numPoints]);
2391  //this callback creation could be obviated by keeping an
2392  //array of callbacks, not clearly worth doing
2393  CkCallback mycb(cp_entry, CkArrayIndex2D(j+ index ,thisIndex.w), cb_aid);
2394  partialResultMsg *omsg= new (numPoints, 8*sizeof(int) ) partialResultMsg;
2396  {
2397  *((int*)CkPriorityPtr(omsg)) = cfg.resultMsgPriority;
2398  CkSetQueueing(omsg, CK_QUEUEING_IFIFO);
2399  }
2400  omsg->init(thisIndex,numPoints, thisIndex.z, computed);
2401  #ifdef _PAIRCALC_DEBUG_
2402  CkPrintf("[%d,%d,%d,%d,%d]:sending other partial of size %d offset %d to [%d %d]\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z, cfg.isSymmetric, numPoints,j,index+j,thisIndex.w);
2403  #endif
2404  #ifdef _NAN_CHECK_
2405  for(int i=0;i<omsg->N ;i++)
2406  {
2407  if( !isfinite(omsg->result[i]) )
2408  CkPrintf("[%d,%d,%d,%d,%d]: sendBWResultDirect nan at %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z, cfg.isSymmetric, i);
2409  CkAssert( isfinite(omsg->result[i]) );
2410  }
2411  #endif
2412  mycb.send(omsg);
2413  }
2414  }
2415 
2416  ///
2417  if(!amPhantom)
2418  {
2419  CkAssert(mynewData!=NULL);
2420  int outsize=grainSizeY;
2421  int index=thisIndex.y;
2422  for(int j=0;j<outsize;j++) //mynewdata
2423  {
2424  inputType *computed=&(mynewData[j*numPoints]);
2425  CkCallback mycb(cp_entry, CkArrayIndex2D(j+index ,thisIndex.w), cb_aid);
2426  partialResultMsg *omsg= new (numPoints, 8*sizeof(int) ) partialResultMsg;
2428  {
2429  *((int*)CkPriorityPtr(omsg)) = cfg.resultMsgPriority;
2430  CkSetQueueing(omsg, CK_QUEUEING_IFIFO);
2431  }
2432  omsg->init(thisIndex,numPoints, thisIndex.z, computed);
2433  #ifdef _PAIRCALC_DEBUG_
2434  CkPrintf("[%d,%d,%d,%d,%d]:sending partial of size %d offset %d to [%d %d]\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z, cfg.isSymmetric,numPoints,j,index+j,thisIndex.w);
2435  #endif
2436  #ifdef _NAN_CHECK_
2437  for(int i=0;i<omsg->N ;i++)
2438  {
2439  if( !isfinite(omsg->result[i]) )
2440  CkPrintf("[%d,%d,%d,%d,%d]: sendBWResultDirect nan at %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z, cfg.isSymmetric, i);
2441  CkAssert( isfinite(omsg->result[i]) );
2442  }
2443  #endif
2444  mycb.send(omsg);
2445  }
2446  }
2447 
2448  /// If we're done with all the paircalc work in this loop (iteration), then cleanup
2449  if (numRecdBW == numOrtho || actionType == PSIV)
2451 }
2452 
2453 
2454 
2455 
2456 /** This is an entry method to allow us to delay this outbound communication
2457  * to minimize brain dead BG/L interference we have a signal to prioritize this
2458  */
2460 {
2461  #ifdef _PAIRCALC_DEBUG_
2462  CkPrintf("[%d,%d,%d,%d,%d]: sendBWResult with actionType %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z, cfg.isSymmetric, actionType);
2463  #endif
2464 
2465  bool otherdata=msg->otherdata;
2466  delete msg;
2467  // Now we have results in mynewData and if(cfg.isSymmetric) othernewData
2468  CkMulticastMgr *mcastGrp=CProxy_CkMulticastMgr(mCastGrpId).ckLocalBranch();
2469  int cp_entry=cb_ep;
2470  if(actionType==PSIV)
2471  cp_entry= cb_ep_tol;
2472 
2473  if(otherdata)
2474  {
2475  // We have this othernewdata issue for the symmetric case
2476  // and the asymmetric dynamic case for the off diagonal elements
2477  CkAssert(othernewData!=NULL);
2478  int size=grainSizeX;
2479  int index=thisIndex.x;
2480  if(amPhantom)
2481  {
2482  index=thisIndex.y;
2483  size=grainSizeY;
2484  }
2485  for(int j=0;j<size;j++)
2486  {
2487  // This callback creation could be obviated by keeping an
2488  // array of callbacks, not clearly worth doing
2489  inputType *computed=&(othernewData[j*numPoints]);
2490  CkCallback mycb(cp_entry, CkArrayIndex2D(j+index ,thisIndex.w), cb_aid);
2491  #ifdef _PAIRCALC_DEBUG_CONTRIB_
2492  CkPrintf("[%d,%d,%d,%d,%d] contributing other %d offset %d to [%d %d]\n",thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z, cfg.isSymmetric, numPoints,j,thisIndex.x+j,thisIndex.w);
2493  #endif
2494  int outOffset=thisIndex.z;
2495  mcastGrp->contribute(numPoints*sizeof(inputType),computed, sumMatrixDoubleType, otherResultCookies[j], mycb, outOffset);
2496  }
2497  }
2498 
2499  if(!amPhantom)
2500  {
2501  int outsize=grainSizeY;
2502  int index=thisIndex.y;
2503  CkAssert(mynewData!=NULL);
2504  for(int j=0;j<outsize;j++) //mynewdata
2505  {
2506  inputType *computed=&(mynewData[j*numPoints]);
2507  CkCallback mycb(cp_entry, CkArrayIndex2D(j+index,thisIndex.w), cb_aid);
2508  #ifdef _PAIRCALC_DEBUG_CONTRIB_
2509  CkPrintf("[%d,%d,%d,%d,%d] contributing %d offset %d to [%d %d]\n",thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z, cfg.isSymmetric,numPoints,j,thisIndex.y+j,thisIndex.w);
2510  #endif
2511  int outOffset=thisIndex.z;
2512  mcastGrp->contribute(numPoints*sizeof(inputType), computed, sumMatrixDoubleType, resultCookies[j], mycb, outOffset);
2513  }
2514  }
2515 
2516  /// If we're done with all the paircalc work in this loop (iteration), then cleanup
2517  if (numRecdBW == numOrtho || actionType == PSIV)
2519 }
2520 
2521 
2522 
2523 void PairCalculator::dumpMatrix(const char *infilename, double *matrix, int xdim, int ydim, int xstart, int ystart, int xtra1, int xtra2)
2524 {
2525  char fmt[1000];
2526  char filename[1000];
2527  memset(fmt, 0 , 1000);
2528  memset(filename, 0 , 1000);
2529  strncpy(fmt,infilename,999);
2530  strncat(fmt,"_%d_%d_%d_%d_%d_%d_%d.out\0",999);
2531  sprintf(filename, fmt, thisIndex.w,thisIndex.x, thisIndex.y, thisIndex.z,
2532  xtra1, xtra2, cfg.isSymmetric);
2533  FILE *loutfile = fopen(filename, "w");
2534  for(int i=0;i<xdim;i++)
2535  for(int j=0;j<ydim;j++)
2536  fprintf(loutfile,"%d %d %.12g\n",i+xstart,j+ystart,matrix[i*ydim+j]);
2537  fclose(loutfile);
2538 }
2539 
2540 void PairCalculator::dumpMatrix(const char *infilename, complex *matrix, int xdim, int ydim, int xstart, int ystart, int xtra1, int xtra2)
2541 {
2542  char fmt[1000];
2543  char filename[1000];
2544  memset(fmt, 0 , 1000);
2545  memset(filename, 0 , 1000);
2546  strncpy(fmt,infilename,999);
2547  strncat(fmt,"_%d_%d_%d_%d_%d_%d_%d.out",999);
2548  sprintf(filename, fmt, thisIndex.w,thisIndex.x, thisIndex.y, thisIndex.z,
2549  xtra1, xtra2, cfg.isSymmetric);
2550  FILE *loutfile = fopen(filename, "w");
2551  for(int i=0;i<xdim;i++)
2552  for(int j=0;j<ydim;j++)
2553  fprintf(loutfile,"%d %d %.12g %.12g\n", i+xstart, j+ystart, matrix[i*ydim+j].re, matrix[i*ydim+j].im);
2554  fclose(loutfile);
2555 }
2556 
2557 void PairCalculator::dumpMatrixComplex(const char *infilename, complex *matrix, int xdim, int ydim, int xstart, int ystart, int iter)
2558 {
2559  char fmt[1000];
2560  char filename[1000];
2561  memset(fmt, 0 , 1000);
2562  memset(filename, 0 , 1000);
2563  strncpy(fmt,infilename,999);
2564  strncat(fmt,"_%d_%d_%d_%d_%d_%d.out",999);
2565  sprintf(filename, fmt, thisIndex.w,thisIndex.x, thisIndex.y,thisIndex.z, iter ,cfg.isSymmetric);
2566  FILE *loutfile = fopen(filename, "w");
2567  for(int i=0;i<xdim;i++)
2568  for(int j=0;j<ydim;j++)
2569  fprintf(loutfile,"%d %d %.12g %.12g \n",i+xstart,j+ystart,matrix[i*ydim+j].re, matrix[i*ydim+j].im);
2570  fclose(loutfile);
2571 }
2572 
2573 void PairCalculator::copyIntoTiles(double *source, double**dest, int sourceRows, int sourceCols, int *offsetsRow, int *offsetsCol, int *touched, int tileSize, int tilesPerRow )
2574 
2575 {
2576 
2577  // foreach element
2578  for(int j=0;j<sourceRows;j++) // x is inner index
2579  for(int i=0;i<sourceCols;i++)
2580  {
2581  int x=offsetsRow[i];
2582  int y=offsetsCol[j];
2583  int sourceOffset=j*sourceCols+i;
2584  double value=source[sourceOffset];
2585  int tilex=x/tileSize;
2586  int tiley=y/tileSize*tilesPerRow;
2587  int tilei=x%tileSize;
2588  int tilej=y%tileSize;
2589  touched[tiley+tilex]++;
2590 #ifdef _NAN_CHECK_
2591  CkAssert( isfinite(value) );
2592  CkAssert(touched[tiley+tilex]<=tileSize*tileSize);
2593  CkAssert(tilex+tiley<numOrtho);
2594 #endif
2595  dest[tiley+tilex][tilej*tileSize+tilei]=value;
2596  // if(cfg.isSymmetric)
2597  // CkPrintf(" j %d i %d, x %d y %d copy %g into tilex %d tiley %d, offset %d touched %d\n", j,i, x, y, value, tilex, tiley, tilej*tileSize+tilei, touched[tiley+tilex]);
2598 
2599  }
2600 }
2601 
2602 void PairCalculator::dgemmSplitFwdStreamMK(int m, int n, int k, char *trans, char *transT, double *alpha, double *A, int *lda, double *B, int *ldb, double *C, int *ldc)
2603 {
2604 #ifdef CMK_TRACE_ENABLED
2605  double StartTime=CmiWallTimer();
2606 #endif
2607  double betap = 1.0;
2608  int Ksplit_m = gemmSplitFWk;
2609  int Ksplit = ( (k > Ksplit_m) ? Ksplit_m : k);
2610  int Krem = (k % Ksplit);
2611  int Kloop = k/Ksplit-1;
2612  int Msplit_m = gemmSplitFWm;
2613  int Msplit = ( (m > Msplit_m) ? Msplit_m : m);
2614  int Mrem = (m % Msplit);
2615  int Mloop = m/Msplit;
2616 
2617  for(int ms=1;ms<=Mloop;ms++)
2618  {
2619  int moff = (ms-1)*k*Msplit;
2620  int moffc = (ms-1)*Msplit;
2621  int MsplitU = (ms==Mloop ? Msplit+Mrem : Msplit);
2622 #ifndef BUNDLE_USER_EVENT
2623 #ifdef CMK_TRACE_ENABLED
2624  StartTime=CmiWallTimer();
2625 #endif
2626 #endif
2627 #ifdef TEST_ALIGN
2628  CkAssert((unsigned int) A[moff] % 16==0);
2629  CkAssert((unsigned int) B % 16==0);
2630  CkAssert((unsigned int) C[moffc] % 16==0);
2631 #endif
2632 
2633 #ifdef PRINT_DGEMM_PARAMS
2634  CkPrintf("HEY-DGEMM %c %c %d %d %d %f %f %d %d %d\n", *transT, *trans, MsplitU, n, Ksplit, *alpha, betap, *lda, *ldb, *ldc);
2635 #endif
2636  DGEMM(transT, trans, &MsplitU, &n, &Ksplit, alpha, &A[moff], lda, B, ldb, &betap, &C[moffc], ldc);
2637 
2638 #ifndef BUNDLE_USER_EVENT
2639 #ifdef CMK_TRACE_ENABLED
2640  traceUserBracketEvent(210, StartTime, CmiWallTimer());
2641 #endif
2642 #endif
2643  CmiNetworkProgress();
2644  for(int ks=1;ks<=Kloop;ks++)
2645  {
2646  int koff = ks*Ksplit;
2647  int KsplitU = (ks==Kloop ? Ksplit+Krem : Ksplit);
2648 #ifndef BUNDLE_USER_EVENT
2649 #ifdef CMK_TRACE_ENABLED
2650  StartTime=CmiWallTimer();
2651 #endif
2652 #endif
2653 #ifdef TEST_ALIGN
2654  CkAssert((unsigned int) A[koff+moff] % 16==0);
2655  CkAssert((unsigned int) B[koff] % 16==0);
2656  CkAssert((unsigned int) C[moffc] % 16==0);
2657 #endif
2658 
2659 #ifdef PRINT_DGEMM_PARAMS
2660  CkPrintf("HEY-DGEMM %c %c %d %d %d %f %f %d %d %d\n", *transT, *trans, MsplitU, n, KsplitU, *alpha, betap, *lda, *ldb, *ldc);
2661 #endif
2662  DGEMM(transT, trans, &MsplitU, &n, &KsplitU, alpha, &A[koff+moff], lda, &B[koff], ldb, &betap, &C[moffc], ldc);
2663 #ifndef BUNDLE_USER_EVENT
2664 #ifdef CMK_TRACE_ENABLED
2665 
2666  traceUserBracketEvent(210, StartTime, CmiWallTimer());
2667 #endif
2668 #endif
2669  CmiNetworkProgress();
2670  }//endfor
2671  }//endfor
2672 
2673 #ifdef BUNDLE_USER_EVENT
2674 #ifdef CMK_TRACE_ENABLED
2675  traceUserBracketEvent(210, StartTime, CmiWallTimer());
2676 #endif
2677 #endif
2678 }
2679 
2680 void PairCalculator::dgemmSplitFwdStreamNK(int m, int n, int k, char *trans, char *transT, double *alpha, double *A, int *lda, double *B, int *ldb, double *C, int *ldc)
2681 {
2682 #ifdef CMK_TRACE_ENABLED
2683  double StartTime=CmiWallTimer();
2684 #endif
2685  double betap = 1.0;
2686  int Ksplit_m = gemmSplitFWk;
2687  int Ksplit = ( (k > Ksplit_m) ? Ksplit_m : k);
2688  int Krem = (k % Ksplit);
2689  int Kloop = k/Ksplit-1;
2690  int Nsplit_m = gemmSplitFWm;
2691  int Nsplit = ( (n > Nsplit_m) ? Nsplit_m : n);
2692  int Nrem = (n % Nsplit);
2693  int Nloop = n/Nsplit;
2694 #ifdef CMK_TRACE_ENABLED
2695  StartTime=CmiWallTimer();
2696 #endif
2697  for(int ns=1;ns<=Nloop;ns++)
2698  {
2699  int noff = (ns-1)*k*Nsplit;
2700  int noffc = (ns-1)*(*ldc)*Nsplit;
2701  int NsplitU = (ns==Nloop ? Nsplit+Nrem : Nsplit);
2702 #ifndef BUNDLE_USER_EVENT
2703 #ifdef CMK_TRACE_ENABLED
2704  StartTime=CmiWallTimer();
2705 #endif
2706 #endif
2707 #ifdef TEST_ALIGN
2708  CkAssert((unsigned int) A % 16==0);
2709  CkAssert((unsigned int) B[noff] % 16==0);
2710  CkAssert((unsigned int) C[noffc] % 16==0);
2711 #endif
2712 
2713 #ifdef PRINT_DGEMM_PARAMS
2714  CkPrintf("HEY-DGEMM %c %c %d %d %d %f %f %d %d %d\n", *transT, *trans, m, NsplitU, Ksplit, *alpha, betap, *lda, *ldb, *ldc);
2715 #endif
2716  DGEMM(transT, trans, &m, &NsplitU, &Ksplit, alpha, A, lda, &B[noff], ldb, &betap, &C[noffc], ldc);
2717 #ifndef BUNDLE_USER_EVENT
2718 #ifdef CMK_TRACE_ENABLED
2719  traceUserBracketEvent(210, StartTime, CmiWallTimer());
2720 #endif
2721 #endif
2722  CmiNetworkProgress();
2723  for(int ks=1;ks<=Kloop;ks++){
2724  int koff = ks*Ksplit;
2725  int KsplitU = (ks==Kloop ? Ksplit+Krem : Ksplit);
2726 #ifndef BUNDLE_USER_EVENT
2727 #ifdef CMK_TRACE_ENABLED
2728  StartTime=CmiWallTimer();
2729 #endif
2730 #endif
2731 #ifdef TEST_ALIGN
2732  CkAssert((unsigned int) A[koff] % 16==0);
2733  CkAssert((unsigned int) B[koff+noff] % 16==0);
2734  CkAssert((unsigned int) C[noffc] % 16==0);
2735 #endif
2736 
2737 #ifdef PRINT_DGEMM_PARAMS
2738  CkPrintf("HEY-DGEMM %c %c %d %d %d %f %f %d %d %d\n", *transT, *trans, m, NsplitU, KsplitU, *alpha, betap, *lda, *ldb, *ldc);
2739 #endif
2740  DGEMM(transT, trans, &m, &NsplitU, &KsplitU, alpha, &A[koff], lda, &B[koff+noff], ldb, &betap, &C[noffc], ldc);
2741 
2742 #ifndef BUNDLE_USER_EVENT
2743 #ifdef CMK_TRACE_ENABLED
2744  traceUserBracketEvent(210, StartTime, CmiWallTimer());
2745 #endif
2746 #endif
2747  CmiNetworkProgress();
2748  }//endfor
2749  }//endfor
2750 
2751 #ifdef BUNDLE_USER_EVENT
2752 #ifdef CMK_TRACE_ENABLED
2753  traceUserBracketEvent(210, StartTime, CmiWallTimer());
2754 #endif
2755 #endif
2756 }
2757 
2758 void PairCalculator::dgemmSplitBwdM(int m, int n, int k, char *trans, char *transT, double *alpha, double *A, double *B, double *bt, double *C)
2759 {
2760 #ifdef CMK_TRACE_ENABLED
2761  double StartTime=CmiWallTimer();
2762 #endif
2763  int Msplit_m = gemmSplitBW;
2764  int Msplit = ( (m > Msplit_m) ? Msplit_m : m);
2765  int Mrem = (m % Msplit);
2766  int Mloop = m/Msplit-1;
2767 
2768 #ifdef TEST_ALIGN
2769  CkAssert((unsigned int) A % 16==0);
2770  CkAssert((unsigned int) B % 16==0);
2771  // CkAssert((unsigned int) C % 16==0);
2772 #endif
2773 
2774 #ifdef PRINT_DGEMM_PARAMS
2775  CkPrintf("HEY-DGEMM %c %c %d %d %d %f %f %d %d %d\n", *trans, *transT, Msplit, n, k, *alpha, *bt, m, k, m);
2776 #endif
2777  int ldb=k;
2778  if(*transT=='T')
2779  ldb=n;
2780  DGEMM(trans, transT, &Msplit, &n, &k, alpha, A, &m, B, &ldb, bt, C, &m);
2781 #ifndef BUNDLE_USER_EVENT
2782 #ifdef CMK_TRACE_ENABLED
2783  traceUserBracketEvent(230, StartTime, CmiWallTimer());
2784 #endif
2785 #endif
2786 
2787  CmiNetworkProgress();
2788  for(int i=1;i<=Mloop;i++)
2789  {
2790  int off = i*Msplit;
2791  if(i==Mloop) { Msplit+=Mrem; }
2792 #ifndef BUNDLE_USER_EVENT
2793 #ifdef CMK_TRACE_ENABLED
2794  StartTime=CmiWallTimer();
2795 #endif
2796 #endif
2797 #ifdef TEST_ALIGN
2798  CkAssert((unsigned int) A[off] % 16==0);
2799  CkAssert((unsigned int) B % 16==0);
2800  // CkAssert((unsigned int) C[off] % 16==0);
2801 #endif
2802 
2803 #ifdef PRINT_DGEMM_PARAMS
2804  CkPrintf("HEY-DGEMM %c %c %d %d %d %f %f %d %d %d\n", *trans, *transT, Msplit, n, k, *alpha, *bt, m, k, m);
2805 #endif
2806  DGEMM(trans, transT, &Msplit, &n, &k, alpha, &A[off], &m, B, &ldb, bt, &C[off], &m);
2807 
2808 #ifndef BUNDLE_USER_EVENT
2809 #ifdef CMK_TRACE_ENABLED
2810  traceUserBracketEvent(230, StartTime, CmiWallTimer());
2811 #endif
2812 #endif
2813  CmiNetworkProgress();
2814  } //endfor
2815 #ifdef BUNDLE_USER_EVENT
2816 #ifdef CMK_TRACE_ENABLED
2817  traceUserBracketEvent(230, StartTime, CmiWallTimer());
2818 #endif
2819 #endif
2820 }
2821 
2822 void manmult(int numRowsA, int numRowsB, int rowLength, double *A, double *B, double *C, double alpha)
2823 {
2824  // foreach row multiply all members
2825  for( int i=0; i<numRowsA; i++)
2826  {
2827  double *athrow=&(A[i*rowLength]);
2828  for( int j=0; j<numRowsB; j++)
2829  {
2830  double *bthrow=&(B[j*rowLength]);
2831  for( int e=0; e<rowLength; e++)
2832  {
2833  // C[i*numRowsB +j]+=alpha*athrow[e]*bthrow[e];
2834  C[j*numRowsA +i]+=alpha*athrow[e]*bthrow[e];
2835  }
2836  }
2837  }
2838 }
2839 /*@}*/
2840 
2841 #include "pcMessages.def.h"
2842 #include "InputDataHandler.h"
2843 #include "inputDataHandler.def.h"
2844 #include "ckPairCalculator.def.h"
2845 
bool isSymmetric
Is this a symmetric or asymmetric paircalc instance.
Definition: pcConfig.h:30
CkSectionInfo * resultCookies
array of bw path section cookies
int streamCaughtL
number of rows caught so far L stream
bool expectOrthoT
Is true only in asymmetric, dynamics scenario. For PC instances in the asymmetric chare array...
int streamCaughtR
number of rows caught so far R stream
void sendBWResultColumn(bool other, int startGrain, int endGrain)
Send the result for this column.
int grainSizeY
number of states per chare y-axis
int orthoGrainSizeRemX
sgrainSizeX % orthoGrainSize
CkCallback * orthoCB
forward path callbacks
internalType * inDataRight
the input pair to be transformed
int gSpaceEP
The entry point to which this instance should send results to.
Definition: pcConfig.h:63
int numRecdBW
number of messages received BW
void bwSendHelper(int orthoX, int orthoY, int sizeX, int sizeY, int ogx, int ogy)
Called on the normal backward path (non-psiV) to set up the data sends to GSpace. ...
void bwMultiplyDynOrthoT()
Multiplies Fpsi by T (from Ortho)
void sendTiles(bool flag_dp)
Contribute orthoGrainSized tiles of data (that are ready?) to the corresponding ortho chares...
double * allCaughtRight
unordered rows of FW input
inputType * data()
A pointer to the message data. No checks on pointer validity. Use with a pinch of salt...
Definition: pcMessages.h:133
int orthoGrainSizeRemY
sgrainSizeY % orthoGrainSize
inputType * othernewData
results of sym off diagonal multiply,
CkReductionMsg * sumMatrixDouble(int nMsg, CkReductionMsg **msgs)
sum together matrices of doubles
internalType * outData
results of fw multiply
bool isBWbarriered
Should we impose a hard barrier in the BW path to sync all PC chares?
Definition: pcConfig.h:101
int cb_ep
bw path callback entry point
void collectTile(bool doMatrix1, bool doMatrix2, bool doOrthoT, int orthoX, int orthoY, int orthoGrainSizeX, int orthoGrainSizeY, int numRecdBW, int matrixSize, internalType *matrix1, internalType *matrix2)
Receive data from ortho chares and copy into matrix.
int numExpectedY
number of messages expected y-axis
int numCols() const
The number of columns of data units in the data array stored in this msg.
Definition: pcMessages.h:129
bool existsLeft
inDataLeft allocated
CkArrayID gSpaceAID
The array ID of the GSpace chare array this instance talks to.
Definition: pcConfig.h:61
bool existsNew
newData allocated
int numRecRight
number of rows so far total right
int numExpected
number of messages expected all
int numRows() const
The number of rows in the data array stored in this msg.
Definition: pcMessages.h:127
bool existsRight
inDataRight allocated
void multiplyResult(multiplyResultMsg *msg)
Entry Method. Backward path multiplication.
CkCallback uponSetupCompletion
Callback to trigger at the end of a paircalc array's init.
Definition: pcConfig.h:59
int numRecLeft
number of rows so far total left
void sendBWResult(sendBWsignalMsg *msg)
Entry Method. Send the results via multiple reductions as triggered by a prioritized message...
void sendBWResultDirect(sendBWsignalMsg *msg)
Entry Method.
int * columnCountOther
count of processed rows in BW by column
bool symmetricOnDiagonal
diagonal symmetric special case
int gemmSplitFWk
{ BGL's painful NIC forces us to split long computations.
Definition: pcConfig.h:121
int gemmSplitFWk
number of rows in split FW dgemm
inputType * mynewData
results of bw multiply
int numStates
The total number of states in the system.
Definition: pcConfig.h:41
int gemmSplitBW
number of rows in split BW dgemm
void multiplyResultI(multiplyResultMsg *msg)
Entry Method. Simply forwards the call to multiplyResult(). Dont seem to be any instances in source ...
paircalcInputMsg * msgRight
Incoming messages with left and right matrix data that are kept around so that we can directly comput...
CollatorType * leftCollator
Data collators for the left and right matrix blocks.
int conserveMemory
The mem footprint vs performance setting for the paircalcs (tribool)
Definition: pcConfig.h:75
double ** outTiles
in output streaming we populate the tiles directly
CProxy_InputDataHandler< CollatorType, CollatorType > myMsgHandler
A handle to the co-located chare array that handles data input.
CkGroupID mCastGrpId
group id for multicast manager bw
int gemmSplitFWm
number of columns in split FW dgemm
int grainSize
The grain size along the states dimensions (plural) (number of states per PC chare) ...
Definition: pcConfig.h:45
int numOrthoCol
sGrainSizeX/orthoGrainSize
bool isDynamics
Is this a minimization or dynamics run.
Definition: pcConfig.h:28
CkArrayID cb_aid
bw path callback array ID
int PsiVEP
The entry point to which this instance should send PsiV tolerance update results to.
Definition: pcConfig.h:65
CkGroupID mCastGrpIdOrtho
group id for multicast manager ortho
void initResultSection(initResultMsg *msg)
Entry Method. Initialize the section cookie for each slice of the result.
void multiplyPsiV()
Dynamics: PsiV Tolerance correction loop called on symmetric instances. Technically, backward path.
internalType * inDataLeft
or the C=-1 inRight orthoT +c in dynamics
int actionType
matrix usage control [NORMAL, KEEPORTHO, PSIV]
Class that buffers incoming data (via messages/RDMA) till it counts a pre-specified number of arrival...
int numOrthoRow
sGrainSizeY/orthoGrainSize
int instanceIndex
The proxyOffset value of thisInstance of OpenAtom computations.
Definition: pcConfig.h:57
PairCalculator(CProxy_InputDataHandler< CollatorType, CollatorType > inProxy, const pc::pcConfig _cfg)
Entry Method. (obviously)
bool isOutputReduced
Should the results from each PC chare be reduced or delivered individually to GSpace?
Definition: pcConfig.h:91
int numChunks
The number of chunks (4th dimension of decomposition)
Definition: pcConfig.h:43
int numRecd
number of messages received
int numPoints
number of points in this chunk
int grainSizeX
number of states per chare x-axis
The new message for sending input data to the PairCalculator.
Definition: pcMessages.h:117
int cb_ep_tol
bw path callback entry point for psiV tolerance
bool amPhantom
consolidate thisIndex.x<thisIndex.y && cfg.isSymmetric && phantomsym
void acceptOrthoT(multiplyResultMsg *msg)
Entry Method. During dynamics, each Ortho calls this on the Asymm loop PC instances to send its share...
int numRecdBWOT
number of messages received BW orthoT
void bwMultiplyHelper(int size, internalType *matrix1, internalType *matrix2, internalType *amatrix, internalType *amatrix2, bool unitcoef, int m_in, int n_in, int k_in, int BNAoffset, int BNCoffset, int BTAoffset, int BTCoffset, int orthoX, int orthoY, double beta, int ogx, int ogy)
multiplyPsiV() and multiplyResult() call this to perform the matrix multiply math on the backward pat...
bool resumed
have resumed from load balancing
void launchComputations(paircalcInputMsg *aMsg)
NOT an entry method. Called locally from the acceptData* methods to launch the appropriate number-cru...
bool isLeftReady
Flags indicating if the left and right matrix blocks have been received.
msgType * getSampleMsg()
Get a copy of a sample message from the last completed batch of messages. Copy managed by the user...
bool isLBon
Should the paircalcs worry about load balancing.
Definition: pcConfig.h:77
bool shouldDelayBWsend
Should we tweak msg priority to delay msgs to GSpace carrying the results?
Definition: pcConfig.h:103
Dumb structure that holds all the configuration inputs required for paircalc instantiation, functioning and interaction.
Definition: pcConfig.h:23
Input handler chare array proxies.
bool isBWstreaming
Should this instance stream the result fragments in the BW path as they become ready?
Definition: pcConfig.h:99
int orthoGrainSize
The grain size along the states dimensions for Ortho chares.
Definition: pcConfig.h:52
void phantomDone()
Entry Method. To handle correct startup for symm PC w/phantoms.
void bwbarrier(CkReductionMsg *msg)
Entry Method. a debugging tool: a barrier at the end of the backward path before anything is sent ove...
bool arePhantomsOn
If this is a symmetric instance, should it use phantom chares to balance the BW path.
Definition: pcConfig.h:32
void copyIntoTiles(double *source, double **dest, int sourceRows, int sourceCols, int *offsetsRow, int *offsetsCol, int *touched, int tileSize, int tilesPerRow)
Copy the results from outdata1 and outdata2 into the tiles.
bool notOnDiagonal
being on or off diagonal changes many things
void acceptLeftData(paircalcInputMsg *msg)
Entry Method. Method to send in the complete block of the left matrix.
void pup(PUP::er &)
PUP method.
~PairCalculator()
Destructor (nothing needs to be done?)
int * touchedTiles
tracker to detect when tiles are full
void initGRed(initGRedMsg *msg)
Entry Method. Initializes the section cookie and the reduction client. Called on startup as the chare...
int blkSize
number points in gspace plane
void cleanupAfterBWPath()
Cleans up at end of an iteration (fw-bw computation loop); frees mem, resets counters etc...
double * allCaughtLeft
unordered rows of FW input
internalType * inResult2
used in gamma calc (non minimization)
void enqueueBWsend(bool unitcoef, int priority=1)
Schedules the entry methods that send out the results to GSpace with appropriate priority.
CkSectionInfo * otherResultCookies
extra array of bw path section cookies for sym off diag, or dynamics
internalType * inResult1
accumulate ortho or lambda
int * columnCount
count of processed rows in BW by column
int numExpectedX
number of messages expected x-axis
void contributeSubTiles(internalType *fullOutput)
Piece up a tile and send all the pieces as this PC's contribution to the Ortho chares.
int resultMsgPriority
If shouldDelayBWsend, what priority should this instance use for the result msgs. ...
Definition: pcConfig.h:93
int numOrtho
number of orthos in our grain = numOrthoCol*numOrthoRow
CkSectionInfo * orthoCookies
forward path reduction cookie
bool existsOut
outData allocated
pc::pcConfig cfg
A private copy of the input configurations.
int rck
count of received cookies
void expectNext()
Ask me to notify converse that we're ready for the next batch of messages.
void acceptRightData(paircalcInputMsg *msg)
Entry Method. Method to send in the complete block of the right matrix.
bool areBWTilesCollected
Should this instance collect result fragments and send them out together in the BW path...
Definition: pcConfig.h:97
void multiplyForward(bool flag_dp)
Forward path multiply driver. Prepares matrices, calls DGEMM, contributes results to Ortho subTiles a...