1 #include "ckPairCalculator.h"
2 #include "utility/matrix2file.h"
5 ComlibInstanceHandle mcastInstanceCP;
6 ComlibInstanceHandle mcastInstanceACP;
8 CkReduction::reducerType sumMatrixDoubleType;
11 void registersumMatrixDouble(
void)
17 void fastAdd (
double *a,
double *b,
int nelem);
25 double *ret=(
double *)msgs[0]->getData();
28 int size0=msgs[0]->getSize();
29 int size=size0/
sizeof(double);
37 for(
int d=0;d<size;d++)
39 for(i=1; i<nMsg-3;i+=3)
40 ret[d]+= ((
double *) msgs[i]->getData())[d] + ((
double *) msgs[i+1]->getData())[d] + ((
double *) msgs[i+2]->getData())[d];
43 ret[d]+=((
double *) msgs[i]->getData())[d];
48 for(
int i=1; i<nMsg;i++)
51 inmatrix=(
double *) msgs[i]->getData();
52 for(
int d=0;d<size;d++)
56 return CkReductionMsg::buildNew(size*
sizeof(
double),ret);
60 void myGEMM(
char *opA,
char *opB,
int *m,
int *n,
int *k,
double *alpha,
complex *A,
int *lda,
complex *B,
int *ldb,
double *beta,
complex *C,
int *ldc)
62 complex cAlpha(*alpha,0.), cBeta(*beta,0.);
63 ZGEMM(opA, opB, m, n, k, &cAlpha, A, lda, B, ldb, &cBeta, C, ldc);
66 void myGEMM(
char *opA,
char *opB,
int *m,
int *n,
int *k,
double *alpha,
double *A,
int *lda,
double *B,
int *ldb,
double *beta,
double *C,
int *ldc)
68 DGEMM(opA, opB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
82 #ifdef _PAIRCALC_DEBUG_PLACE_
83 CkPrintf(
"{%d} [PAIRCALC] [%d,%d,%d,%d,%d] inited on pe %d \n", _instance,thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,_sym, CkMyPe());
156 setMigratable(
false);
189 isRightReady =
false;
199 isRightReady =
false;
202 CkArrayIndex4D myIndex(thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z);
203 CkCallback leftTrigger (CkIndex_PairCalculator::acceptLeftData (NULL),myIndex,thisProxy,
true);
204 CkCallback rightTrigger(CkIndex_PairCalculator::acceptRightData(NULL),myIndex,thisProxy,
true);
206 std::ostringstream idStream;
207 idStream<<
"["<<thisIndex.w<<
","<<thisIndex.x<<
","<<thisIndex.y<<
","<<thisIndex.z<<
","<<
cfg.
isSymmetric<<
"]";
211 #ifdef DEBUG_CP_PAIRCALC_INPUTDATAHANDLER
212 CkPrintf(
"[%d,%d,%d,%d,%d] My left and right data collators: %p %p\n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,
cfg.
isSymmetric,
leftCollator,rightCollator);
218 #ifdef DEBUG_CP_PAIRCALC_CREATION
219 CkPrintf(
"[%d,%d,%d,%d,%d] Inserting my InputDataHandler\n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,
cfg.
isSymmetric);
221 myMsgHandler(thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z).insert(thisProxy);
230 ArrayElement4D::pup(p);
317 #ifdef _PAIRCALC_DEBUG_
320 CkPrintf(
"[%d,%d,%d,%d,%d] pup unpacking on %d resumed=%d memory %d\n",thisIndex.w,thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric,CkMyPe(),
resumed, CmiMemoryUsage());
321 CkPrintf(
"[%d,%d,%d,%d,%d] pupped : %d,%d,%d,%d,%d %d %d %d %d %d %d cb cb_aid %d %d %d cb_lb inDataLeft inDataRight outData %d \n",thisIndex.w,thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric,
numRecd,
numExpected,
cfg.
grainSize,
cfg.
numStates,
cfg.
numChunks,
numPoints,
cfg.
isSymmetric,
cfg.
conserveMemory,
cfg.
isLBon,
cfg.reduce,
cb_ep,
existsLeft,
existsRight,
resumed);
325 CkPrintf(
"[%d,%d,%d,%d,%d] pup called on %d\n",thisIndex.w,thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric,CkMyPe());
335 #ifdef _PAIRCALC_DEBUG_
336 CkPrintf(
"[%d,%d,%d,%d,%d] destructs on [%d]\n",thisIndex.w,thisIndex.x,thisIndex.y, thisIndex.z,
cfg.
isSymmetric, CkMyPe());
356 CkAbort(
"paircalc pup needs fw streaming work");
380 orthoIndexX= (orthoIndexX>maxorthostateindex) ? maxorthostateindex : orthoIndexX;
382 orthoIndexY= (orthoIndexY>maxorthostateindex) ? maxorthostateindex : orthoIndexY;
386 int orthoIndex=orthoIndexX*
numOrthoCol+orthoIndexY;
388 #ifdef _PAIRCALC_DEBUG_
389 CkPrintf(
"[%d,%d,%d,%d,%d] initGRed ox %d oy %d oindex %d oxindex %d oyindex %d numRecd %d numOrtho %d\n",thisIndex.w,thisIndex.x,thisIndex.y, thisIndex.z,
cfg.
isSymmetric,msg->orthoX, msg->orthoY,orthoIndex, orthoIndexX, orthoIndexY,
numRecd,
numOrtho);
417 thisProxy(thisIndex.w,thisIndex.y, thisIndex.x,thisIndex.z).phantomDone();
438 if(msg->dest == thisIndex.x && thisIndex.x != thisIndex.y)
442 #ifdef _PAIRCALC_DEBUG_SPROXY_
443 CkPrintf(
"[%d,%d,%d,%d,%d] other initResultSection for dest %d offset %d\n",thisIndex.w,thisIndex.x,thisIndex.y, thisIndex.z,
cfg.
isSymmetric,msg->dest, msg->offset);
450 #ifdef _PAIRCALC_DEBUG_SPROXY_
451 CkPrintf(
"[%d,%d,%d,%d,%d] initResultSection for dest %d offset %d\n",thisIndex.w,thisIndex.x,thisIndex.y, thisIndex.z,
cfg.
isSymmetric,msg->dest, msg->offset);
460 contribute(
sizeof(
int), &
rck, CkReduction::sum_int, msg->synccb);
465 void PairCalculator::ResumeFromSync() {
470 #ifdef _PAIRCALC_DEBUG_
471 CkPrintf(
"[%d,%d,%d,%d,%d] resumes from sync\n",thisIndex.w,thisIndex.x,thisIndex.y, thisIndex.z,
cfg.
isSymmetric);
481 const int numRows = msg->
numRows();
482 const int numCols = msg->
numCols();
484 CkAssert(data != NULL);
489 for(
int i=0; i < numRows*numCols; i++)
490 CkAssert( isfinite(data[i]) );
493 #ifdef _PAIRCALC_DEBUG_
494 CkPrintf(
"[%d,%d,%d,%d,%d] Received left matrix block of size %d x %d at %p\n",
495 thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,
cfg.
isSymmetric,numRows,numCols,data);
500 inDataLeft =
reinterpret_cast<internalType*
> (data);
513 msgLeft->doPsiV = sampleMsg->doPsiV;
515 msgLeft->flag_dp= sampleMsg->flag_dp;
521 msgLeft->doPsiV =
false;
524 std::stringstream dbgStr;
525 dbgStr<<
"["<<thisIndex.w<<
","<<thisIndex.x<<
","<<thisIndex.y<<
","<<thisIndex.z<<
","<<
cfg.
isSymmetric<<
"]"
526 <<
" My collator was not able to give me a sample message. Aborting...";
527 CkAbort(dbgStr.str().c_str());
541 const int numRows = msg->
numRows();
542 const int numCols = msg->
numCols();
544 CkAssert(data != NULL);
549 for(
int i=0; i < numRows*numCols; i++)
550 CkAssert( isfinite(data[i]) );
553 #ifdef _PAIRCALC_DEBUG_
554 CkPrintf(
"[%d,%d,%d,%d,%d] Received right matrix block of size %d x %d at %p\n",
555 thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,
cfg.
isSymmetric,numRows,numCols,data);
560 inDataRight =
reinterpret_cast<internalType*
> (data);
576 msgRight->doPsiV = sampleMsg->doPsiV;
578 msgRight->flag_dp= sampleMsg->flag_dp;
587 std::stringstream dbgStr;
588 dbgStr<<
"["<<thisIndex.w<<
","<<thisIndex.x<<
","<<thisIndex.y<<
","<<thisIndex.z<<
","<<
cfg.
isSymmetric<<
"]"
589 <<
" My collator was not able to give me a sample message. Aborting...";
590 CkAbort(dbgStr.str().c_str());
604 #ifdef _PAIRCALC_DEBUG_
605 CkPrintf(
"[%d,%d,%d,%d,%d] Going to launch computations... numRecd = %d numExpected = %d numExpectedX = %d, numExpectedY = %d\n",
612 bool isForwardPathPending =
false;
621 #ifdef _CP_SUBSTEP_TIMING_
622 if(
cfg.forwardTimerID > 0)
624 double pstart=CmiWallTimer();
625 contribute(
sizeof(
double),&pstart,CkReduction::min_double,
cfg.beginTimerCB ,
cfg.forwardTimerID);
640 thisProxy(thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z).multiplyForward(aMsg->flag_dp);
643 isForwardPathPending =
true;
644 CkPrintf(
"[%d,%d,%d,%d,%d] Gamma beat OrthoT. Waiting for T to arrive before proceeding with forward path\n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,
cfg.
isSymmetric);
651 #ifdef _CP_SUBSTEP_TIMING_
652 if(
cfg.forwardTimerID > 0)
654 double pstart=CmiWallTimer();
655 contribute(
sizeof(
double),&pstart,CkReduction::max_double,
cfg.endTimerCB ,
cfg.forwardTimerID);
665 thisProxy(thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z).multiplyPsiV();
679 if (!isForwardPathPending)
686 isRightReady =
false;
692 void PairCalculator::reorder(
int * offsetMap,
int *revOffsetMap,
double *data,
double *scratch)
700 int datasize=actualPoints*
sizeof(double);
703 if(off!=offsetMap[off])
705 int want = revOffsetMap[off];
706 int found = offsetMap[off];
707 int currentOffset = off * actualPoints;
708 int wantOffset = want * actualPoints;
710 CmiMemcpy(scratch, &(data[currentOffset]), datasize);
711 CmiMemcpy(&(data[currentOffset]), &(data[wantOffset]), datasize);
714 CmiMemcpy(&(data[wantOffset]),scratch, datasize);
718 int foundOffset = found * actualPoints;
719 CmiMemcpy(&(data[wantOffset]), &(data[foundOffset]), datasize);
720 CmiMemcpy(&(data[foundOffset]),scratch, datasize);
722 offsetMap[want]=offsetMap[found];
723 revOffsetMap[offsetMap[found]]=want;
726 revOffsetMap[off]=off;
728 offsetMap[found]=found;
729 revOffsetMap[found]=found;
732 bzero(offsetMap, numExpected*
sizeof(
int));
733 bzero(revOffsetMap, numExpected*
sizeof(
int));
746 CkMulticastMgr *mcastGrp=CProxy_CkMulticastMgr(
mCastGrpIdOrtho).ckLocalBranch();
751 for(
int orthoIndex=0;orthoIndex<
numOrtho;orthoIndex++)
759 #ifdef _PAIRCALC_DEBUG_CONTRIB_
761 CkPrintf(
"[%d,%d,%d,%d,%d]: contributes %d \n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric, orthoIndex);
767 #ifdef _PAIRCALC_DEBUG_PARANOID_FW_
771 snprintf(filename,80,
"fwoutTile_%d_%d:",orthoX,orthoY);
777 CkAssert( isfinite(
outTiles[orthoIndex][i]) );
784 {progcounter=0;CmiNetworkProgress();}
788 CkPrintf(
"tile orthoIndex %d has %d vs %d\n",orthoIndex,
touchedTiles[orthoIndex], tilesq);
789 CkAbort(
"invalid large number of tiles touched");
793 #ifdef _PAIRCALC_DEBUG_CONTRIB_
794 CkPrintf(
"[%d,%d,%d,%d,%d]: %i not ready with %d \n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric, orthoIndex,
touchedTiles[orthoIndex]);
833 #ifdef _PAIRCALC_DEBUG_
834 CkPrintf(
"[%d,%d,%d,%d,%d] PairCalculator::multiplyForward() Starting forward path computations.\n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,
cfg.
isSymmetric);
844 #ifdef _PAIRCALC_DEBUG_
845 CkPrintf(
"[%d,%d,%d,%d,%d] Allocated outData %d * %d\n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,
cfg.
isSymmetric,
grainSizeX, grainSizeY);
850 #ifdef CP_PAIRCALC_USES_COMPLEX_MATH
851 char transformT =
'C';
853 char transformT =
'T';
855 char transform =
'N';
859 double alpha = double(1.0);
860 double beta = double(0.0);
863 internalType *matrixC =
outData;
864 internalType *matrixB =
reinterpret_cast<internalType*
> ( msgLeft->
data() );
865 internalType *matrixA;
867 matrixA =
reinterpret_cast<internalType*
> (
msgRight->
data() );
876 CkAssert((
unsigned int)matrixA%16==0);
877 CkAssert((
unsigned int)matrixB%16==0);
878 CkAssert((
unsigned int)matrixC%16==0);
882 k_in *= pcDataSizeFactor;
887 #ifdef _PAIRCALC_DEBUG_PARANOID_FW_
891 #ifdef PRINT_DGEMM_PARAMS
892 CkPrintf(
"HEY-DGEMM %c %c %d %d %d %f %f %d %d %d\n", transformT, transform, m_in, n_in, k_in, alpha, beta, k_in, k_in, m_in);
896 #ifdef CMK_TRACE_ENABLED
897 double StartTime=CmiWallTimer();
899 myGEMM(&transformT, &transform, &m_in, &n_in, &k_in, &alpha, matrixA, &k_in, matrixB, &k_in, &beta, matrixC, &m_in);
900 #ifdef CMK_TRACE_ENABLED
901 traceUserBracketEvent(210, StartTime, CmiWallTimer());
904 #ifdef CMK_TRACE_ENABLED
905 StartTime=CmiWallTimer();
910 #ifdef _CP_SUBSTEP_TIMING_
911 if(
cfg.forwardTimerID > 0)
913 double pstart=CmiWallTimer();
914 contribute(
sizeof(
double),&pstart,CkReduction::max_double,
cfg.endTimerCB ,
cfg.forwardTimerID);
917 #ifdef CMK_TRACE_ENABLED
918 traceUserBracketEvent(220, StartTime, CmiWallTimer());
929 CkSetQueueing(msg2phantom, CK_QUEUEING_IFIFO);
930 *(
int*)CkPriorityPtr(msg2phantom) = 1;
932 thisProxy(thisIndex.w,thisIndex.y, thisIndex.x,thisIndex.z).acceptRightData(msg2phantom);
942 thisProxy(thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z).bwMultiplyDynOrthoT();
950 #ifdef _PAIRCALC_DEBUG_
951 CkPrintf(
"[%d,%d,%d,%d,%d]: contributeSubTiles \n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric);
962 CkMulticastMgr *mcastGrp=CProxy_CkMulticastMgr(
mCastGrpIdOrtho).ckLocalBranch();
963 #ifdef _PAIRCALC_DEBUG_PARANOID_FW_
966 internalType *outTile;
967 bool reuseTile =
false;
970 bool borderXY = borderX && borderY;
973 if(! borderX && ! borderY)
982 for(
int orthoX = 0; orthoX <
numOrthoCol ; orthoX++)
991 int tileStart=orthoYoff+orthoXoff;
994 int tileSize=orthoGrainSizeX*orthoGrainSizeY;
995 int bigOindex=orthoGrainSizeY;
996 int ocopySize=bigOindex*
sizeof(internalType);
997 int orthoIndex=orthoX*numOrthoCol+orthoY;
1000 outTile =
new internalType[tileSize];
1001 bzero(outTile,
sizeof(internalType)*tileSize);
1006 for(
int ystart=0, itileStart=tileStart; ystart<tileSize; ystart+=bigOindex, itileStart+=bigGindex)
1007 CmiMemcpy(&(outTile[ystart]),&(fullOutput[itileStart]),ocopySize);
1009 #ifdef _PAIRCALC_DEBUG_PARANOID_FW_
1011 snprintf(filename,80,
"fwoutTile_%d_%d:",orthoX,orthoY);
1015 mcastGrp->contribute(tileSize*
sizeof(internalType), outTile, sumMatrixDoubleType,
orthoCookies[orthoIndex],
orthoCB[orthoIndex]);
1041 for(
int i=0;i<msg->size;i++)
1042 CkAssert( isfinite(msg->matrix1[i]) );
1046 int size2=msg->size2;
1047 internalType *matrix1=msg->matrix1;
1048 internalType *matrix2=msg->matrix2;
1050 CkAssert((
unsigned int) msg->matrix2 %16 ==0);
1058 orthoX= (orthoX>maxorthostateindex) ? maxorthostateindex : orthoX;
1059 orthoY= (orthoY>maxorthostateindex) ? maxorthostateindex : orthoY;
1068 collectTile(
false,
true,
false,orthoX, orthoY, orthoGrainSizeX, orthoGrainSizeY,
numRecdBWOT, matrixSize, matrix2, matrix1);
1071 CkPrintf(
"[%d,%d,%d,%d,%d] Just received all of OrthoT. Triggering forward path multiply which has been pending since Gamma arrival\n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,
cfg.
isSymmetric);
1073 bool myfalse=
false;
1074 thisProxy(thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z).multiplyForward(myfalse);
1094 CkSetQueueing(sigmsg, CK_QUEUEING_IFIFO);
1095 *(
int*)CkPriorityPtr(sigmsg) = priority;
1102 sigmsg->otherdata=
true;
1104 sigmsg->otherdata=
true;
1106 sigmsg->otherdata=
false;
1110 thisProxy(thisIndex.w,thisIndex.x, thisIndex.y,thisIndex.z).sendBWResult(sigmsg);
1112 thisProxy(thisIndex.w,thisIndex.x, thisIndex.y,thisIndex.z).sendBWResultDirect(sigmsg);
1136 #ifdef DEBUG_CP_PAIRCALC_PSIV
1137 CkPrintf(
"[%d,%d,%d,%d,%d] In multiplyPsiV\n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,
cfg.
isSymmetric);
1145 bool prioPhan=
false;
1148 CkSetQueueing(msg2phantom, CK_QUEUEING_IFIFO);
1149 *(
int*)CkPriorityPtr(msg2phantom) = 1;
1151 thisProxy(thisIndex.w,thisIndex.y, thisIndex.x,thisIndex.z).acceptRightData(msg2phantom);
1179 bwMultiplyHelper(size,
outData, NULL,
outData, NULL, unitcoef, m_in, n_in, k_in, BNAoffset, BNCoffset, BTAoffset, BTCoffset, orthoX, orthoY, beta,
grainSizeX, grainSizeY);
1213 #ifdef _PAIRCALC_DEBUG_
1214 CkPrintf(
"[%d,%d,%d,%d,%d]: MultiplyResult from orthoX %d orthoY %d size %d numRecdBW %d actionType %d amPhantom %d notOnDiagonal %d cfg.arePhantomsOn %d symmetricOnDiagonal %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric, msg->orthoX, msg->orthoY, msg->size,
numRecdBW, msg->actionType,
amPhantom,
notOnDiagonal,
cfg.
arePhantomsOn,
symmetricOnDiagonal);
1216 #ifdef _CP_SUBSTEP_TIMING_
1217 if(
cfg.backwardTimerID>0)
1220 double pstart=CmiWallTimer();
1221 contribute(
sizeof(
double),&pstart,CkReduction::min_double,
cfg.beginTimerCB ,
cfg.backwardTimerID);
1231 for(
int i=0;i<msg->size;i++)
1232 CkAssert( isfinite(msg->matrix1[i]) );
1236 int size2=msg->size2;
1237 internalType *matrix1=msg->matrix1;
1238 internalType *matrix2=msg->matrix2;
1241 if((
unsigned int) msg->matrix1 %16 !=0)
1243 CkPrintf(
"msg->matrix1 is %p\n",msg->matrix1);
1245 CkAssert((
unsigned int) msg->matrix1 %16 ==0);
1246 CkAssert((
unsigned int) msg->matrix2 %16 ==0);
1250 bool unitcoef =
false;
1266 orthoX= (orthoX>maxorthostateindex) ? maxorthostateindex : orthoX;
1267 orthoY= (orthoY>maxorthostateindex) ? maxorthostateindex : orthoY;
1276 if(matrix2==NULL||size2<1)
1284 CkPrintf(
"[%d,%d,%d,%d,%d] Warning! phantom got bw before fw, forcing tile collection\n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,
cfg.
isSymmetric);
1291 #ifdef _PAIRCALC_DEBUG_PARANOID_BW_
1292 CkPrintf(
"orthoGrainSizeX %d orthoGrainSizeY %d orthoX %d orthoY %d e1 %.10g\n",orthoGrainSizeX, orthoGrainSizeY, orthoX, orthoY, msg->matrix1[0]);
1295 dumpMatrix(
"bwm1idata",msg->matrix1,
grainSizeX,grainSizeY,0,0,orthoX, orthoY);
1298 dumpMatrix(
"bwm2idata",msg->matrix2,
grainSizeX,grainSizeY);
1303 dumpMatrix(
"bwm1idata",msg->matrix1,orthoGrainSizeX,orthoGrainSizeY,0,0,orthoX, orthoY);
1306 dumpMatrix(
"bwm2idata",msg->matrix2,orthoGrainSizeX,orthoGrainSizeY,0,0,orthoX, orthoY);
1312 int m_in =
numPoints * pcDataSizeFactor;
1333 internalType *amatrix=NULL;
1334 internalType *amatrix2=matrix2;
1357 collectTile(
true, !unitcoef,
false,orthoX, orthoY, orthoGrainSizeX, orthoGrainSizeY,
numRecdBW, matrixSize, matrix1, matrix2);
1381 n_in=orthoGrainSizeY;
1382 k_in=orthoGrainSizeX;
1408 CkPrintf(
"[%d,%d,%d,%d,%d] Warning! your n_in %d and k_in %d is larger than matrix1->size %d\n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,
cfg.
isSymmetric, n_in,k_in,size);
1413 bwMultiplyHelper(size, matrix1, matrix2, amatrix, amatrix2, unitcoef, m_in, n_in, k_in, BNAoffset, BNCoffset, BTAoffset, BTCoffset, orthoX, orthoY, beta, orthoGrainSizeX, orthoGrainSizeY);
1419 #ifndef SKIP_PARTIAL_SEND
1423 bwSendHelper( orthoX, orthoY, k_in, n_in, orthoGrainSizeX, orthoGrainSizeY);
1434 CkPrintf(
"[%d,%d,%d,%d,%d] not sending cfg.isBWstreaming %d cfg.areBWTilesCollected %d cfg.isBWbarriered %d actionType %d numRecdBW %d numOrtho%d \n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,
cfg.
isSymmetric,
cfg.
isBWstreaming,
cfg.
areBWTilesCollected,
cfg.
isBWbarriered,
actionType,
numRecdBW,
numOrtho);
1458 contribute(
sizeof(
int),&wehaveours,CkReduction::sum_int,CkCallback(CkIndex_PairCalculator::bwbarrier(NULL),thisProxy));
1485 #ifdef _PAIRCALC_DEBUG_
1486 CkPrintf(
"[%d,%d,%d,%d,%d] Cleaning up at end of BW path\n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,
cfg.
isSymmetric);
1585 #ifdef DEBUG_CP_PAIRCALC_PSIV
1586 CkPrintf(
"[%d,%d,%d,%d,%d]: Am NOT notifying the message handlers to expectNext() as a PsiV step is next (actionType=%d). Data should be arriving in messages. \n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric,
numRecdBW,
actionType);
1592 #ifdef _CP_SUBSTEP_TIMING_
1594 if(
cfg.backwardTimerID > 0)
1596 double pstart=CmiWallTimer();
1597 contribute(
sizeof(
double),&pstart,CkReduction::max_double,
cfg.endTimerCB ,
cfg.backwardTimerID);
1605 void PairCalculator::collectTile(
bool doMatrix1,
bool doMatrix2,
bool doOrthoT,
int orthoX,
int orthoY,
int orthoGrainSizeX,
int orthoGrainSizeY,
int numRecdBW,
int matrixSize, internalType *matrix1, internalType* matrix2)
1607 #ifdef _PAIRCALC_DEBUG_
1608 CkPrintf(
"[%d,%d,%d,%d,%d]: collectTile aggregating numRecdBW %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric, numRecdBW);
1618 if(!doOrthoT && doMatrix1 && numRecdBW==1 &&
inResult1==NULL)
1621 inResult1 =
new internalType[matrixSize];
1622 bzero(
inResult1,
sizeof(internalType)*matrixSize);
1627 int bigOindex=orthoGrainSizeY;
1637 bigOindex=orthoGrainSizeY;
1643 bigOindex=orthoGrainSizeX;
1648 int tileStart=orthoYoff+orthoXoff;
1651 int tileSize=orthoGrainSizeX*orthoGrainSizeY;
1653 int ocopySize=bigOindex*
sizeof(internalType);
1655 int savetileStart=tileStart;
1660 inResult2 =
new internalType[matrixSize];
1661 bzero(
inResult2,
sizeof(internalType)*matrixSize);
1663 for(
int i=0; i<tileSize; i+=bigOindex,tileStart+=bigGindex)
1664 CmiMemcpy(&(
inResult2[tileStart]),&(matrix2[i]),ocopySize);
1666 #ifdef _PAIRCALC_DEBUG_PARANOID_BW_
1668 snprintf(filename,80,
"bwinResult2_%d_%d:",orthoX,orthoY);
1675 tileStart=savetileStart;
1679 #ifdef _PAIRCALC_DEBUG_PARANOID_BW_
1680 CkPrintf(
"[%d,%d,%d,%d,%d]: collectTile copying tile bigOindex %d bigGindex %d tileSize %d tileStart %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric, bigOindex, bigGindex, tileSize,tileStart);
1683 for(
int i=0; i< tileSize; i+=bigOindex,tileStart+=bigGindex)
1684 CmiMemcpy(&(dest[tileStart]),&(matrix1[i]),ocopySize);
1685 #ifdef _PAIRCALC_DEBUG_PARANOID_BW_
1689 snprintf(filename,80,
"bworthoT_%d_%d:",orthoX,orthoY);
1691 snprintf(filename,80,
"bwinResult1_%d_%d:",orthoX,orthoY);
1716 void PairCalculator::bwMultiplyHelper(
int size, internalType *matrix1, internalType *matrix2, internalType *amatrix, internalType *amatrix2,
bool unitcoef,
int m_in,
int n_in,
int k_in,
int BNAoffset,
int BNCoffset,
int BTAoffset,
int BTCoffset,
int orthoX,
int orthoY,
double beta,
int orthoGrainSizeX,
int orthoGrainSizeY)
1718 #ifdef _PAIRCALC_DEBUG_
1721 CkPrintf(
"[%d,%d,%d,%d,%d]: bwMultiplyHelper with size %d numRecdBW %d actionType %d orthoX %d orthoY %d orthoGrainSizeX %d orthoGrainSizeY %d BTCoffset %d BNCoffset %d m_in %d n_in %d k_in %d iter %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric, size,
numRecdBW,
actionType, orthoX, orthoY,orthoGrainSizeX, orthoGrainSizeY, BTCoffset, BNCoffset, m_in, n_in, k_in,
streamCaughtR);
1723 #ifdef _PAIRCALC_DEBUG_PARANOID_BW_
1744 outData=
new internalType[matrixSize];
1745 bzero(
outData,
sizeof(internalType)*matrixSize);
1750 collectTile(
true,
false,
true,orthoX, orthoY, orthoGrainSizeX, orthoGrainSizeY,
numRecdBW, matrixSize, matrix1, matrix2);
1752 CmiMemcpy(
outData, amatrix, size*
sizeof(internalType));
1760 #ifdef _PAIRCALC_DEBUG_
1761 CkPrintf(
"[%d,%d,%d,%d,%d] Allocated mynewData %d * %d *2 \n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,
cfg.
isSymmetric,
numExpectedY,
numPoints);
1770 #ifdef _PAIRCALC_DEBUG_
1771 CkPrintf(
"[%d,%d,%d,%d,%d] Allocated other %d * %d *2 \n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,
cfg.
isSymmetric,
numExpectedX,
numPoints);
1782 #ifdef _PAIRCALC_DEBUG_
1783 CkPrintf(
"[%d,%d,%d,%d,%d] Allocated other %d * %d *2 \n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,
cfg.
isSymmetric,
numExpectedY,
numPoints);
1792 internalType *mynewDatad=
reinterpret_cast<internalType*
> (
mynewData);
1798 #ifdef CP_PAIRCALC_USES_COMPLEX_MATH
1799 char transformT =
'C';
1801 char transformT =
'T';
1804 #ifdef _PAIRCALC_DEBUG_PARANOID_BW_
1806 int ystart=chunksize*thisIndex.z;
1814 #ifdef CMK_TRACE_ENABLED
1815 double StartTime=CmiWallTimer();
1830 CkAssert((
unsigned int) &(
inDataLeft[BNAoffset] )%16==0);
1831 CkAssert((
unsigned int) amatrix %16==0);
1832 CkAssert((
unsigned int)&(mynewDatad[BNCoffset] )%16==0);
1836 #ifdef PRINT_DGEMM_PARAMS
1837 CkPrintf(
"HEY-DGEMM %c %c %d %d %d %f %f %d %d %d BNAoffset %d BNCoffset %d\n", transform, transform, m_in, n_in, k_in, alpha, beta, m_in, n_in, m_in, BNAoffset, BNCoffset);
1841 myGEMM(&transform, &transform, &m_in, &ln_in, &lk_in, &alpha, &(
inDataLeft[BNAoffset]), &m_in, amatrix, &lk_in, &beta, &(mynewDatad[BNCoffset]), &m_in);
1845 #ifdef PRINT_DGEMM_PARAMS
1846 CkPrintf(
"HEY-DGEMM %c %c %d %d %d %f %f %d %d %d\n", transform, transformT, m_in, n_in, k_in, alpha, beta, m_in, n_in, m_in);
1848 myGEMM(&transform, &transformT, &m_in, &n_in, &k_in, &alpha, &(
inDataLeft[BTAoffset]), &m_in, amatrix, &n_in, &beta, &(mynewDatad[BTCoffset]), &m_in);
1850 #ifdef CMK_TRACE_ENABLED
1851 traceUserBracketEvent(230, StartTime, CmiWallTimer());
1853 #ifdef _PAIRCALC_DEBUG_PARANOID_BW_
1855 snprintf(snark,80,
"bwgmodata_%d_%d:",orthoX,orthoY);
1863 #ifdef CMK_TRACE_ENABLED
1864 StartTime=CmiWallTimer();
1870 internalType *othernewDatad = reinterpret_cast <internalType*> (
othernewData);
1878 CkAssert((
unsigned int) &(
inDataRight[BTAoffset] )%16==0);
1879 CkAssert((
unsigned int) amatrix %16==0);
1880 CkAssert((
unsigned int)&(othernewDatad[BTCoffset] )%16==0);
1882 #ifdef PRINT_DGEMM_PARAMS
1883 CkPrintf(
"HEY-DGEMM %c %c %d %d %d %f %f %d %d %d\n", transform, transformT, m_in, n_in, k_in, alpha, beta, m_in, k_in, m_in);
1885 myGEMM(&transform, &transformT, &m_in, &k_in, &n_in, &alpha, &(
inDataRight[BTAoffset]), &m_in, amatrix, &k_in, &beta, &(othernewDatad[BTCoffset]), &m_in);
1886 #ifdef CMK_TRACE_ENABLED
1887 traceUserBracketEvent(250, StartTime, CmiWallTimer());
1889 #ifdef _PAIRCALC_DEBUG_PARANOID_BW_
1891 snprintf(snark,80,
"bwgomodata_%d_%d:",orthoX,orthoY);
1907 internalType *othernewDatad;
1917 othernewDatad= reinterpret_cast <internalType *> (
othernewData);
1922 othernewDatad=mynewDatad;
1928 CmiNetworkProgress();
1929 #ifdef PRINT_DGEMM_PARAMS
1930 CkPrintf(
"HEY-DGEMM %c %c %d %d %d %f %f %d %d %d\n", transform, transform, m_in, n_in, k_in, alpha, beta, m_in, k_in, m_in);
1933 #ifdef CMK_TRACE_ENABLED
1934 StartTime=CmiWallTimer();
1936 myGEMM(&transform, &transform, &m_in, &k_in, &n_in, &alpha, &(
inDataRight[BNAoffset]), &m_in, amatrix2, &n_in, &beta, &(othernewDatad[BNCoffset]), &m_in);
1937 #ifdef CMK_TRACE_ENABLED
1938 traceUserBracketEvent(240, StartTime, CmiWallTimer());
1941 #ifdef _PAIRCALC_DEBUG_PARANOID_BW_
1950 #ifdef _PAIRCALC_VALID_OUT_
1951 CkPrintf(
"[PAIRCALC] [%d,%d,%d,%d,%d] backward gemm out %.10g %.10g %.10g %.10g \n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric, mynewDatad[0],mynewDatad[1],
mynewData[
numPoints*
grainSizeX-1].re,
mynewData[
numPoints*
grainSizeX-1].im);
1988 #ifdef _PAIRCALC_DEBUG_
1989 CkPrintf(
"[%d,%d,%d,%d,%d] Allocated other %d * %d\n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,
cfg.
isSymmetric,
numExpectedX,
numPoints);
1998 #ifdef _PAIRCALC_DEBUG_
1999 CkPrintf(
"[%d,%d,%d,%d,%d] Allocated mynewData %d * %d\n",thisIndex.w,thisIndex.x,thisIndex.y,thisIndex.z,
cfg.
isSymmetric,
numExpectedY,
numPoints);
2007 #ifdef CMK_TRACE_ENABLED
2008 double StartTime=CmiWallTimer();
2012 internalType *othernewDatad= reinterpret_cast <internalType*> (
mynewData);
2014 othernewDatad= reinterpret_cast <internalType*> (
othernewData);
2016 CmiNetworkProgress();
2023 #ifdef PRINT_DGEMM_PARAMS
2024 CkPrintf(
"HEY-DGEMM %c %c %d %d %d %f %f %d %d %d\n", transform, transform, m_in, n_in, k_in, alpha, beta, m_in, k_in, m_in);
2026 myGEMM(&transform, &transform, &m_in, &k_in, &n_in, &alpha,
inDataRight,
2027 &m_in,
inResult2, &n_in, &beta, othernewDatad, &m_in);
2029 #ifdef CMK_TRACE_ENABLED
2030 traceUserBracketEvent(240, StartTime, CmiWallTimer());
2032 #ifdef _PAIRCALC_DEBUG_PARANOID_BW_
2034 int ystart=chunksize*thisIndex.z;
2046 #ifdef _PAIRCALC_DEBUG_
2047 CkPrintf(
"[%d,%d,%d,%d,%d]: bwSendHelper with numRecdBW %d actionType %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric,
numRecdBW,
actionType);
2067 orthoXgrain= (orthoXgrain>maxorthostateindex) ? maxorthostateindex : orthoXgrain;
2068 orthoYgrain= (orthoYgrain>maxorthostateindex) ? maxorthostateindex : orthoYgrain;
2087 #ifdef _PAIRCALC_DEBUG_
2088 CkPrintf(
"[%d,%d,%d,%d,%d]: bwSendHelper !amPhantom orthoXgrain %d sizeX %d orthoYgrain %d sizeY %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric, orthoXgrain, sizeX, orthoYgrain, sizeY);
2091 int startGrain=orthoYgrain;
2092 int endGrain=startGrain+sizeY;
2095 startGrain=orthoXgrain;
2096 endGrain=startGrain+sizeY;
2102 sendBWResultColumnDirect(
false, startGrain, endGrain);
2112 #ifdef _PAIRCALC_DEBUG_
2113 CkPrintf(
"[%d,%d,%d,%d,%d]: bwSendHelper asymm orthoYgrain %d sizeY %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric, orthoYgrain, sizeY);
2120 sendBWResultColumnDirect(
false, orthoYgrain, orthoYgrain+sizeY);
2133 #ifdef _PAIRCALC_DEBUG_
2134 CkPrintf(
"[%d,%d,%d,%d,%d]: bwSendHelper amPhantom | other orthoXgrain %d sizeX %d orthoYgrain %d sizeY %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric, orthoXgrain, sizeX, orthoYgrain, sizeY);
2138 int startGrain=orthoXgrain;
2139 int endGrain=sizeX+startGrain;
2145 startGrain=orthoYgrain;
2146 endGrain=sizeX+startGrain;
2150 startGrain=orthoYgrain;
2154 endGrain=startGrain+sizeY;
2156 #ifdef _PAIRCALC_DEBUG_
2157 CkPrintf(
"[%d,%d,%d,%d,%d]: bwSendHelper amPhantom | other startGrain %d endGrain %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric, startGrain, endGrain );
2163 sendBWResultColumnDirect(
true, startGrain, endGrain);
2177 PairCalculator::sendBWResultColumnDirect(
bool otherdata,
int startGrain,
int endGrain )
2179 #ifdef _PAIRCALC_DEBUG_CONTRIB_
2180 CkPrintf(
"[%d,%d,%d,%d,%d]: sendBWResultColumnDirect with otherdata %d actionType %d startGrain %d endGrain %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric, otherdata,
actionType, startGrain, endGrain);
2190 int numToSend=endGrain-startGrain;
2191 int permuter=(int) ((
float) thisIndex.z/ (float)
cfg.
numChunks) * (float) numToSend;
2196 int index=thisIndex.x;
2200 for(
int j=startGrain;j<endGrain;j++)
2203 int jPermuted=(j+permuter>endGrain) ? (j+permuter-numToSend) : (j+permuter) ;
2206 CkCallback mycb(cp_entry, CkArrayIndex2D(jPermuted+ index ,thisIndex.w),
cb_aid);
2211 CkSetQueueing(msg, CK_QUEUEING_IFIFO);
2213 msg->init(thisIndex,
numPoints, thisIndex.z, computed);
2215 #ifdef _PAIRCALC_DEBUG_
2216 CkPrintf(
"[%d,%d,%d,%d,%d]: sending partial other of size %d offset %d to [%d %d]\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric,
numPoints,jPermuted,index+jPermuted,thisIndex.w);
2220 for(
int i=0;i<msg->N ;i++)
2222 if( !isfinite(msg->result[i]) )
2223 CkPrintf(
"[%d,%d,%d,%d,%d]: sendBWResultColumnDirect nan at %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric, i);
2224 CkAssert( isfinite(msg->result[i]) );
2234 for(
int j=startGrain;j<endGrain;j++)
2236 int jPermuted=(j+permuter>endGrain) ? (j+permuter-numToSend) : (j+permuter) ;
2241 if( !isfinite(computed[i]) )
2243 fprintf(stderr,
"[%d,%d,%d,%d,%d]: sendBWResultColumnDirect nan in computed at %d %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric, jPermuted,i);
2244 CkAssert( isfinite(computed[i]) );
2248 int index=thisIndex.y;
2249 CkCallback mycb(cp_entry, CkArrayIndex2D(jPermuted+index ,thisIndex.w),
cb_aid);
2254 CkSetQueueing(msg, CK_QUEUEING_IFIFO);
2256 msg->init(thisIndex,numPoints, thisIndex.z, computed);
2262 #ifdef _PAIRCALC_DEBUG_
2263 CkPrintf(
"[%d,%d,%d,%d,%d]:sending partial of size %d offset %d to [%d %d]\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric, numPoints,jPermuted,index+jPermuted,thisIndex.w);
2267 for(
int i=0;i<msg->N ;i++)
2269 if( !isfinite(msg->result[i]) )
2270 CkPrintf(
"[%d,%d,%d,%d,%d]: sendBWResultColumnDirect nan at %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric, i);
2271 CkAssert( isfinite(msg->result[i]) );
2289 #ifdef _PAIRCALC_DEBUG_CONTRIB_
2290 CkPrintf(
"[%d,%d,%d,%d,%d]: sendBWResultColumn with actionType %d startGrain %d sendGrain %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric,
actionType, startGrain, endGrain);
2294 CkMulticastMgr *mcastGrp=CProxy_CkMulticastMgr(
mCastGrpId).ckLocalBranch();
2304 int index=thisIndex.x;
2307 for(
int j=startGrain;j<endGrain;j++)
2312 #ifdef CMK_TRACE_ENABLED
2313 double StartTime=CmiWallTimer();
2316 CkCallback mycb(cp_entry, CkArrayIndex2D(j+index ,thisIndex.w),
cb_aid);
2317 #ifdef _PAIRCALC_DEBUG_CONTRIB_
2318 CkPrintf(
"[%d,%d,%d,%d,%d] contributing other %d offset %d to [%d %d]\n",thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric, numPoints,j,thisIndex.x+j,thisIndex.w);
2321 int outOffset=thisIndex.z;
2324 #ifdef CMK_TRACE_ENABLED
2325 traceUserBracketEvent(220, StartTime, CmiWallTimer());
2328 CmiNetworkProgress();
2335 int index=thisIndex.y;
2336 for(
int j=startGrain;j<endGrain;j++)
2340 #ifdef CMK_TRACE_ENABLED
2341 double StartTime=CmiWallTimer();
2343 CkCallback mycb(cp_entry, CkArrayIndex2D(j+thisIndex.y ,thisIndex.w),
cb_aid);
2345 #ifdef _PAIRCALC_DEBUG_CONTRIB_
2346 CkPrintf(
"[%d,%d,%d,%d,%d] contributing %d offset %d to [%d %d]\n",thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric,numPoints,j,thisIndex.y+j,thisIndex.w);
2349 int outOffset=thisIndex.z;
2350 mcastGrp->contribute(numPoints*
sizeof(
inputType), computed, sumMatrixDoubleType,
resultCookies[j], mycb, outOffset);
2352 #ifdef CMK_TRACE_ENABLED
2353 traceUserBracketEvent(220, StartTime, CmiWallTimer());
2357 CmiNetworkProgress();
2366 #ifdef _PAIRCALC_DEBUG_
2367 CkPrintf(
"[%d,%d,%d,%d,%d]: sendBWResultDirect with actionType %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric,
actionType);
2370 bool otherdata=msg->otherdata;
2382 int index=thisIndex.x;
2388 for(
int j=0;j<size;j++)
2393 CkCallback mycb(cp_entry, CkArrayIndex2D(j+ index ,thisIndex.w),
cb_aid);
2398 CkSetQueueing(omsg, CK_QUEUEING_IFIFO);
2400 omsg->init(thisIndex,numPoints, thisIndex.z, computed);
2401 #ifdef _PAIRCALC_DEBUG_
2402 CkPrintf(
"[%d,%d,%d,%d,%d]:sending other partial of size %d offset %d to [%d %d]\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric, numPoints,j,index+j,thisIndex.w);
2405 for(
int i=0;i<omsg->N ;i++)
2407 if( !isfinite(omsg->result[i]) )
2408 CkPrintf(
"[%d,%d,%d,%d,%d]: sendBWResultDirect nan at %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric, i);
2409 CkAssert( isfinite(omsg->result[i]) );
2421 int index=thisIndex.y;
2422 for(
int j=0;j<outsize;j++)
2425 CkCallback mycb(cp_entry, CkArrayIndex2D(j+index ,thisIndex.w),
cb_aid);
2430 CkSetQueueing(omsg, CK_QUEUEING_IFIFO);
2432 omsg->init(thisIndex,numPoints, thisIndex.z, computed);
2433 #ifdef _PAIRCALC_DEBUG_
2434 CkPrintf(
"[%d,%d,%d,%d,%d]:sending partial of size %d offset %d to [%d %d]\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric,numPoints,j,index+j,thisIndex.w);
2437 for(
int i=0;i<omsg->N ;i++)
2439 if( !isfinite(omsg->result[i]) )
2440 CkPrintf(
"[%d,%d,%d,%d,%d]: sendBWResultDirect nan at %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric, i);
2441 CkAssert( isfinite(omsg->result[i]) );
2461 #ifdef _PAIRCALC_DEBUG_
2462 CkPrintf(
"[%d,%d,%d,%d,%d]: sendBWResult with actionType %d\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric,
actionType);
2465 bool otherdata=msg->otherdata;
2468 CkMulticastMgr *mcastGrp=CProxy_CkMulticastMgr(
mCastGrpId).ckLocalBranch();
2479 int index=thisIndex.x;
2485 for(
int j=0;j<size;j++)
2490 CkCallback mycb(cp_entry, CkArrayIndex2D(j+index ,thisIndex.w),
cb_aid);
2491 #ifdef _PAIRCALC_DEBUG_CONTRIB_
2492 CkPrintf(
"[%d,%d,%d,%d,%d] contributing other %d offset %d to [%d %d]\n",thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric, numPoints,j,thisIndex.x+j,thisIndex.w);
2494 int outOffset=thisIndex.z;
2502 int index=thisIndex.y;
2504 for(
int j=0;j<outsize;j++)
2507 CkCallback mycb(cp_entry, CkArrayIndex2D(j+index,thisIndex.w),
cb_aid);
2508 #ifdef _PAIRCALC_DEBUG_CONTRIB_
2509 CkPrintf(
"[%d,%d,%d,%d,%d] contributing %d offset %d to [%d %d]\n",thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z,
cfg.
isSymmetric,numPoints,j,thisIndex.y+j,thisIndex.w);
2511 int outOffset=thisIndex.z;
2512 mcastGrp->contribute(numPoints*
sizeof(
inputType), computed, sumMatrixDoubleType,
resultCookies[j], mycb, outOffset);
2523 void PairCalculator::dumpMatrix(
const char *infilename,
double *matrix,
int xdim,
int ydim,
int xstart,
int ystart,
int xtra1,
int xtra2)
2526 char filename[1000];
2527 memset(fmt, 0 , 1000);
2528 memset(filename, 0 , 1000);
2529 strncpy(fmt,infilename,999);
2530 strncat(fmt,
"_%d_%d_%d_%d_%d_%d_%d.out\0",999);
2531 sprintf(filename, fmt, thisIndex.w,thisIndex.x, thisIndex.y, thisIndex.z,
2533 FILE *loutfile = fopen(filename,
"w");
2534 for(
int i=0;i<xdim;i++)
2535 for(
int j=0;j<ydim;j++)
2536 fprintf(loutfile,
"%d %d %.12g\n",i+xstart,j+ystart,matrix[i*ydim+j]);
2540 void PairCalculator::dumpMatrix(
const char *infilename,
complex *matrix,
int xdim,
int ydim,
int xstart,
int ystart,
int xtra1,
int xtra2)
2543 char filename[1000];
2544 memset(fmt, 0 , 1000);
2545 memset(filename, 0 , 1000);
2546 strncpy(fmt,infilename,999);
2547 strncat(fmt,
"_%d_%d_%d_%d_%d_%d_%d.out",999);
2548 sprintf(filename, fmt, thisIndex.w,thisIndex.x, thisIndex.y, thisIndex.z,
2550 FILE *loutfile = fopen(filename,
"w");
2551 for(
int i=0;i<xdim;i++)
2552 for(
int j=0;j<ydim;j++)
2553 fprintf(loutfile,
"%d %d %.12g %.12g\n", i+xstart, j+ystart, matrix[i*ydim+j].re, matrix[i*ydim+j].im);
2557 void PairCalculator::dumpMatrixComplex(
const char *infilename,
complex *matrix,
int xdim,
int ydim,
int xstart,
int ystart,
int iter)
2560 char filename[1000];
2561 memset(fmt, 0 , 1000);
2562 memset(filename, 0 , 1000);
2563 strncpy(fmt,infilename,999);
2564 strncat(fmt,
"_%d_%d_%d_%d_%d_%d.out",999);
2565 sprintf(filename, fmt, thisIndex.w,thisIndex.x, thisIndex.y,thisIndex.z, iter ,
cfg.
isSymmetric);
2566 FILE *loutfile = fopen(filename,
"w");
2567 for(
int i=0;i<xdim;i++)
2568 for(
int j=0;j<ydim;j++)
2569 fprintf(loutfile,
"%d %d %.12g %.12g \n",i+xstart,j+ystart,matrix[i*ydim+j].re, matrix[i*ydim+j].im);
2573 void PairCalculator::copyIntoTiles(
double *source,
double**dest,
int sourceRows,
int sourceCols,
int *offsetsRow,
int *offsetsCol,
int *touched,
int tileSize,
int tilesPerRow )
2578 for(
int j=0;j<sourceRows;j++)
2579 for(
int i=0;i<sourceCols;i++)
2581 int x=offsetsRow[i];
2582 int y=offsetsCol[j];
2583 int sourceOffset=j*sourceCols+i;
2584 double value=source[sourceOffset];
2585 int tilex=x/tileSize;
2586 int tiley=y/tileSize*tilesPerRow;
2587 int tilei=x%tileSize;
2588 int tilej=y%tileSize;
2589 touched[tiley+tilex]++;
2591 CkAssert( isfinite(value) );
2592 CkAssert(touched[tiley+tilex]<=tileSize*tileSize);
2595 dest[tiley+tilex][tilej*tileSize+tilei]=value;
2602 void PairCalculator::dgemmSplitFwdStreamMK(
int m,
int n,
int k,
char *trans,
char *transT,
double *alpha,
double *A,
int *lda,
double *B,
int *ldb,
double *C,
int *ldc)
2604 #ifdef CMK_TRACE_ENABLED
2605 double StartTime=CmiWallTimer();
2609 int Ksplit = ( (k > Ksplit_m) ? Ksplit_m : k);
2610 int Krem = (k % Ksplit);
2611 int Kloop = k/Ksplit-1;
2613 int Msplit = ( (m > Msplit_m) ? Msplit_m : m);
2614 int Mrem = (m % Msplit);
2615 int Mloop = m/Msplit;
2617 for(
int ms=1;ms<=Mloop;ms++)
2619 int moff = (ms-1)*k*Msplit;
2620 int moffc = (ms-1)*Msplit;
2621 int MsplitU = (ms==Mloop ? Msplit+Mrem : Msplit);
2622 #ifndef BUNDLE_USER_EVENT
2623 #ifdef CMK_TRACE_ENABLED
2624 StartTime=CmiWallTimer();
2628 CkAssert((
unsigned int) A[moff] % 16==0);
2629 CkAssert((
unsigned int) B % 16==0);
2630 CkAssert((
unsigned int) C[moffc] % 16==0);
2633 #ifdef PRINT_DGEMM_PARAMS
2634 CkPrintf(
"HEY-DGEMM %c %c %d %d %d %f %f %d %d %d\n", *transT, *trans, MsplitU, n, Ksplit, *alpha, betap, *lda, *ldb, *ldc);
2636 DGEMM(transT, trans, &MsplitU, &n, &Ksplit, alpha, &A[moff], lda, B, ldb, &betap, &C[moffc], ldc);
2638 #ifndef BUNDLE_USER_EVENT
2639 #ifdef CMK_TRACE_ENABLED
2640 traceUserBracketEvent(210, StartTime, CmiWallTimer());
2643 CmiNetworkProgress();
2644 for(
int ks=1;ks<=Kloop;ks++)
2646 int koff = ks*Ksplit;
2647 int KsplitU = (ks==Kloop ? Ksplit+Krem : Ksplit);
2648 #ifndef BUNDLE_USER_EVENT
2649 #ifdef CMK_TRACE_ENABLED
2650 StartTime=CmiWallTimer();
2654 CkAssert((
unsigned int) A[koff+moff] % 16==0);
2655 CkAssert((
unsigned int) B[koff] % 16==0);
2656 CkAssert((
unsigned int) C[moffc] % 16==0);
2659 #ifdef PRINT_DGEMM_PARAMS
2660 CkPrintf(
"HEY-DGEMM %c %c %d %d %d %f %f %d %d %d\n", *transT, *trans, MsplitU, n, KsplitU, *alpha, betap, *lda, *ldb, *ldc);
2662 DGEMM(transT, trans, &MsplitU, &n, &KsplitU, alpha, &A[koff+moff], lda, &B[koff], ldb, &betap, &C[moffc], ldc);
2663 #ifndef BUNDLE_USER_EVENT
2664 #ifdef CMK_TRACE_ENABLED
2666 traceUserBracketEvent(210, StartTime, CmiWallTimer());
2669 CmiNetworkProgress();
2673 #ifdef BUNDLE_USER_EVENT
2674 #ifdef CMK_TRACE_ENABLED
2675 traceUserBracketEvent(210, StartTime, CmiWallTimer());
2680 void PairCalculator::dgemmSplitFwdStreamNK(
int m,
int n,
int k,
char *trans,
char *transT,
double *alpha,
double *A,
int *lda,
double *B,
int *ldb,
double *C,
int *ldc)
2682 #ifdef CMK_TRACE_ENABLED
2683 double StartTime=CmiWallTimer();
2687 int Ksplit = ( (k > Ksplit_m) ? Ksplit_m : k);
2688 int Krem = (k % Ksplit);
2689 int Kloop = k/Ksplit-1;
2691 int Nsplit = ( (n > Nsplit_m) ? Nsplit_m : n);
2692 int Nrem = (n % Nsplit);
2693 int Nloop = n/Nsplit;
2694 #ifdef CMK_TRACE_ENABLED
2695 StartTime=CmiWallTimer();
2697 for(
int ns=1;ns<=Nloop;ns++)
2699 int noff = (ns-1)*k*Nsplit;
2700 int noffc = (ns-1)*(*ldc)*Nsplit;
2701 int NsplitU = (ns==Nloop ? Nsplit+Nrem : Nsplit);
2702 #ifndef BUNDLE_USER_EVENT
2703 #ifdef CMK_TRACE_ENABLED
2704 StartTime=CmiWallTimer();
2708 CkAssert((
unsigned int) A % 16==0);
2709 CkAssert((
unsigned int) B[noff] % 16==0);
2710 CkAssert((
unsigned int) C[noffc] % 16==0);
2713 #ifdef PRINT_DGEMM_PARAMS
2714 CkPrintf(
"HEY-DGEMM %c %c %d %d %d %f %f %d %d %d\n", *transT, *trans, m, NsplitU, Ksplit, *alpha, betap, *lda, *ldb, *ldc);
2716 DGEMM(transT, trans, &m, &NsplitU, &Ksplit, alpha, A, lda, &B[noff], ldb, &betap, &C[noffc], ldc);
2717 #ifndef BUNDLE_USER_EVENT
2718 #ifdef CMK_TRACE_ENABLED
2719 traceUserBracketEvent(210, StartTime, CmiWallTimer());
2722 CmiNetworkProgress();
2723 for(
int ks=1;ks<=Kloop;ks++){
2724 int koff = ks*Ksplit;
2725 int KsplitU = (ks==Kloop ? Ksplit+Krem : Ksplit);
2726 #ifndef BUNDLE_USER_EVENT
2727 #ifdef CMK_TRACE_ENABLED
2728 StartTime=CmiWallTimer();
2732 CkAssert((
unsigned int) A[koff] % 16==0);
2733 CkAssert((
unsigned int) B[koff+noff] % 16==0);
2734 CkAssert((
unsigned int) C[noffc] % 16==0);
2737 #ifdef PRINT_DGEMM_PARAMS
2738 CkPrintf(
"HEY-DGEMM %c %c %d %d %d %f %f %d %d %d\n", *transT, *trans, m, NsplitU, KsplitU, *alpha, betap, *lda, *ldb, *ldc);
2740 DGEMM(transT, trans, &m, &NsplitU, &KsplitU, alpha, &A[koff], lda, &B[koff+noff], ldb, &betap, &C[noffc], ldc);
2742 #ifndef BUNDLE_USER_EVENT
2743 #ifdef CMK_TRACE_ENABLED
2744 traceUserBracketEvent(210, StartTime, CmiWallTimer());
2747 CmiNetworkProgress();
2751 #ifdef BUNDLE_USER_EVENT
2752 #ifdef CMK_TRACE_ENABLED
2753 traceUserBracketEvent(210, StartTime, CmiWallTimer());
2758 void PairCalculator::dgemmSplitBwdM(
int m,
int n,
int k,
char *trans,
char *transT,
double *alpha,
double *A,
double *B,
double *bt,
double *C)
2760 #ifdef CMK_TRACE_ENABLED
2761 double StartTime=CmiWallTimer();
2764 int Msplit = ( (m > Msplit_m) ? Msplit_m : m);
2765 int Mrem = (m % Msplit);
2766 int Mloop = m/Msplit-1;
2769 CkAssert((
unsigned int) A % 16==0);
2770 CkAssert((
unsigned int) B % 16==0);
2774 #ifdef PRINT_DGEMM_PARAMS
2775 CkPrintf(
"HEY-DGEMM %c %c %d %d %d %f %f %d %d %d\n", *trans, *transT, Msplit, n, k, *alpha, *bt, m, k, m);
2780 DGEMM(trans, transT, &Msplit, &n, &k, alpha, A, &m, B, &ldb, bt, C, &m);
2781 #ifndef BUNDLE_USER_EVENT
2782 #ifdef CMK_TRACE_ENABLED
2783 traceUserBracketEvent(230, StartTime, CmiWallTimer());
2787 CmiNetworkProgress();
2788 for(
int i=1;i<=Mloop;i++)
2791 if(i==Mloop) { Msplit+=Mrem; }
2792 #ifndef BUNDLE_USER_EVENT
2793 #ifdef CMK_TRACE_ENABLED
2794 StartTime=CmiWallTimer();
2798 CkAssert((
unsigned int) A[off] % 16==0);
2799 CkAssert((
unsigned int) B % 16==0);
2803 #ifdef PRINT_DGEMM_PARAMS
2804 CkPrintf(
"HEY-DGEMM %c %c %d %d %d %f %f %d %d %d\n", *trans, *transT, Msplit, n, k, *alpha, *bt, m, k, m);
2806 DGEMM(trans, transT, &Msplit, &n, &k, alpha, &A[off], &m, B, &ldb, bt, &C[off], &m);
2808 #ifndef BUNDLE_USER_EVENT
2809 #ifdef CMK_TRACE_ENABLED
2810 traceUserBracketEvent(230, StartTime, CmiWallTimer());
2813 CmiNetworkProgress();
2815 #ifdef BUNDLE_USER_EVENT
2816 #ifdef CMK_TRACE_ENABLED
2817 traceUserBracketEvent(230, StartTime, CmiWallTimer());
2822 void manmult(
int numRowsA,
int numRowsB,
int rowLength,
double *A,
double *B,
double *C,
double alpha)
2825 for(
int i=0; i<numRowsA; i++)
2827 double *athrow=&(A[i*rowLength]);
2828 for(
int j=0; j<numRowsB; j++)
2830 double *bthrow=&(B[j*rowLength]);
2831 for(
int e=0; e<rowLength; e++)
2834 C[j*numRowsA +i]+=alpha*athrow[e]*bthrow[e];
2841 #include "pcMessages.def.h"
2842 #include "InputDataHandler.h"
2843 #include "inputDataHandler.def.h"
2844 #include "ckPairCalculator.def.h"
bool isSymmetric
Is this a symmetric or asymmetric paircalc instance.
CkSectionInfo * resultCookies
array of bw path section cookies
int streamCaughtL
number of rows caught so far L stream
bool expectOrthoT
Is true only in asymmetric, dynamics scenario. For PC instances in the asymmetric chare array...
int streamCaughtR
number of rows caught so far R stream
void sendBWResultColumn(bool other, int startGrain, int endGrain)
Send the result for this column.
int grainSizeY
number of states per chare y-axis
int orthoGrainSizeRemX
sgrainSizeX % orthoGrainSize
CkCallback * orthoCB
forward path callbacks
internalType * inDataRight
the input pair to be transformed
int gSpaceEP
The entry point to which this instance should send results to.
int numRecdBW
number of messages received BW
void bwSendHelper(int orthoX, int orthoY, int sizeX, int sizeY, int ogx, int ogy)
Called on the normal backward path (non-psiV) to set up the data sends to GSpace. ...
void bwMultiplyDynOrthoT()
Multiplies Fpsi by T (from Ortho)
void sendTiles(bool flag_dp)
Contribute orthoGrainSized tiles of data (that are ready?) to the corresponding ortho chares...
double * allCaughtRight
unordered rows of FW input
int orthoGrainSizeRemY
sgrainSizeY % orthoGrainSize
inputType * othernewData
results of sym off diagonal multiply,
CkReductionMsg * sumMatrixDouble(int nMsg, CkReductionMsg **msgs)
sum together matrices of doubles
internalType * outData
results of fw multiply
bool isBWbarriered
Should we impose a hard barrier in the BW path to sync all PC chares?
int cb_ep
bw path callback entry point
void collectTile(bool doMatrix1, bool doMatrix2, bool doOrthoT, int orthoX, int orthoY, int orthoGrainSizeX, int orthoGrainSizeY, int numRecdBW, int matrixSize, internalType *matrix1, internalType *matrix2)
Receive data from ortho chares and copy into matrix.
int numExpectedY
number of messages expected y-axis
bool existsLeft
inDataLeft allocated
CkArrayID gSpaceAID
The array ID of the GSpace chare array this instance talks to.
bool existsNew
newData allocated
int numRecRight
number of rows so far total right
int numExpected
number of messages expected all
bool existsRight
inDataRight allocated
void multiplyResult(multiplyResultMsg *msg)
Entry Method. Backward path multiplication.
CkCallback uponSetupCompletion
Callback to trigger at the end of a paircalc array's init.
int numRecLeft
number of rows so far total left
void sendBWResult(sendBWsignalMsg *msg)
Entry Method. Send the results via multiple reductions as triggered by a prioritized message...
void sendBWResultDirect(sendBWsignalMsg *msg)
Entry Method.
int * columnCountOther
count of processed rows in BW by column
bool symmetricOnDiagonal
diagonal symmetric special case
int gemmSplitFWk
{ BGL's painful NIC forces us to split long computations.
int gemmSplitFWk
number of rows in split FW dgemm
inputType * mynewData
results of bw multiply
int numStates
The total number of states in the system.
int gemmSplitBW
number of rows in split BW dgemm
void multiplyResultI(multiplyResultMsg *msg)
Entry Method. Simply forwards the call to multiplyResult(). Dont seem to be any instances in source ...
paircalcInputMsg * msgRight
Incoming messages with left and right matrix data that are kept around so that we can directly comput...
CollatorType * leftCollator
Data collators for the left and right matrix blocks.
int conserveMemory
The mem footprint vs performance setting for the paircalcs (tribool)
double ** outTiles
in output streaming we populate the tiles directly
CProxy_InputDataHandler< CollatorType, CollatorType > myMsgHandler
A handle to the co-located chare array that handles data input.
CkGroupID mCastGrpId
group id for multicast manager bw
int gemmSplitFWm
number of columns in split FW dgemm
int grainSize
The grain size along the states dimensions (plural) (number of states per PC chare) ...
int numOrthoCol
sGrainSizeX/orthoGrainSize
bool isDynamics
Is this a minimization or dynamics run.
CkArrayID cb_aid
bw path callback array ID
int PsiVEP
The entry point to which this instance should send PsiV tolerance update results to.
CkGroupID mCastGrpIdOrtho
group id for multicast manager ortho
void initResultSection(initResultMsg *msg)
Entry Method. Initialize the section cookie for each slice of the result.
void multiplyPsiV()
Dynamics: PsiV Tolerance correction loop called on symmetric instances. Technically, backward path.
internalType * inDataLeft
or the C=-1 inRight orthoT +c in dynamics
int actionType
matrix usage control [NORMAL, KEEPORTHO, PSIV]
Class that buffers incoming data (via messages/RDMA) till it counts a pre-specified number of arrival...
int numOrthoRow
sGrainSizeY/orthoGrainSize
int instanceIndex
The proxyOffset value of thisInstance of OpenAtom computations.
PairCalculator(CProxy_InputDataHandler< CollatorType, CollatorType > inProxy, const pc::pcConfig _cfg)
Entry Method. (obviously)
bool isOutputReduced
Should the results from each PC chare be reduced or delivered individually to GSpace?
int numChunks
The number of chunks (4th dimension of decomposition)
int numRecd
number of messages received
int numPoints
number of points in this chunk
int grainSizeX
number of states per chare x-axis
int cb_ep_tol
bw path callback entry point for psiV tolerance
bool amPhantom
consolidate thisIndex.x<thisIndex.y && cfg.isSymmetric && phantomsym
void acceptOrthoT(multiplyResultMsg *msg)
Entry Method. During dynamics, each Ortho calls this on the Asymm loop PC instances to send its share...
int numRecdBWOT
number of messages received BW orthoT
void bwMultiplyHelper(int size, internalType *matrix1, internalType *matrix2, internalType *amatrix, internalType *amatrix2, bool unitcoef, int m_in, int n_in, int k_in, int BNAoffset, int BNCoffset, int BTAoffset, int BTCoffset, int orthoX, int orthoY, double beta, int ogx, int ogy)
multiplyPsiV() and multiplyResult() call this to perform the matrix multiply math on the backward pat...
bool resumed
have resumed from load balancing
void launchComputations(paircalcInputMsg *aMsg)
NOT an entry method. Called locally from the acceptData* methods to launch the appropriate number-cru...
bool isLeftReady
Flags indicating if the left and right matrix blocks have been received.
msgType * getSampleMsg()
Get a copy of a sample message from the last completed batch of messages. Copy managed by the user...
bool isLBon
Should the paircalcs worry about load balancing.
bool shouldDelayBWsend
Should we tweak msg priority to delay msgs to GSpace carrying the results?
Dumb structure that holds all the configuration inputs required for paircalc instantiation, functioning and interaction.
bool isBWstreaming
Should this instance stream the result fragments in the BW path as they become ready?
int orthoGrainSize
The grain size along the states dimensions for Ortho chares.
void phantomDone()
Entry Method. To handle correct startup for symm PC w/phantoms.
void bwbarrier(CkReductionMsg *msg)
Entry Method. a debugging tool: a barrier at the end of the backward path before anything is sent ove...
bool arePhantomsOn
If this is a symmetric instance, should it use phantom chares to balance the BW path.
void copyIntoTiles(double *source, double **dest, int sourceRows, int sourceCols, int *offsetsRow, int *offsetsCol, int *touched, int tileSize, int tilesPerRow)
Copy the results from outdata1 and outdata2 into the tiles.
bool notOnDiagonal
being on or off diagonal changes many things
void acceptLeftData(paircalcInputMsg *msg)
Entry Method. Method to send in the complete block of the left matrix.
void pup(PUP::er &)
PUP method.
~PairCalculator()
Destructor (nothing needs to be done?)
int * touchedTiles
tracker to detect when tiles are full
void initGRed(initGRedMsg *msg)
Entry Method. Initializes the section cookie and the reduction client. Called on startup as the chare...
int blkSize
number points in gspace plane
void cleanupAfterBWPath()
Cleans up at end of an iteration (fw-bw computation loop); frees mem, resets counters etc...
double * allCaughtLeft
unordered rows of FW input
internalType * inResult2
used in gamma calc (non minimization)
void enqueueBWsend(bool unitcoef, int priority=1)
Schedules the entry methods that send out the results to GSpace with appropriate priority.
CkSectionInfo * otherResultCookies
extra array of bw path section cookies for sym off diag, or dynamics
internalType * inResult1
accumulate ortho or lambda
int * columnCount
count of processed rows in BW by column
int numExpectedX
number of messages expected x-axis
void contributeSubTiles(internalType *fullOutput)
Piece up a tile and send all the pieces as this PC's contribution to the Ortho chares.
int resultMsgPriority
If shouldDelayBWsend, what priority should this instance use for the result msgs. ...
int numOrtho
number of orthos in our grain = numOrthoCol*numOrthoRow
CkSectionInfo * orthoCookies
forward path reduction cookie
bool existsOut
outData allocated
pc::pcConfig cfg
A private copy of the input configurations.
int rck
count of received cookies
void expectNext()
Ask me to notify converse that we're ready for the next batch of messages.
void acceptRightData(paircalcInputMsg *msg)
Entry Method. Method to send in the complete block of the right matrix.
bool areBWTilesCollected
Should this instance collect result fragments and send them out together in the BW path...
void multiplyForward(bool flag_dp)
Forward path multiply driver. Prepares matrices, calls DGEMM, contributes results to Ortho subTiles a...