OpenAtom Version 1.5a
ckPairCalculator.h
1 #ifndef CK_PAIR_CALCULATOR_H
2 #define CK_PAIR_CALCULATOR_H
3 
4 #undef OLD_COMMLIB
5 //#define OLD_COMMLIB 1
6 #include "debug_flags.h"
7 
8 #include "ckmulticast.h"
9 #include "ckhashtable.h"
10 #include "ckcomplex.h"
11 
12 #include "PipeBroadcastStrategy.h"
13 #include "BroadcastStrategy.h"
14 #include "DirectMulticastStrategy.h"
15 #include "RingMulticastStrategy.h"
16 #include "MultiRingMulticast.h"
17 #include "NodeMulticast.h"
18 
19 
20 /// If the machine is capable of RDMA...
21 #if CMK_DIRECT
22  // Enable GSpace-PairCalc RDMA
23  //#define PC_USE_RDMA
24  #ifdef PC_USE_RDMA
25  #include "cmidirect.h"
26  // Turn RDMA on for the message data collator
27  #define COLLATOR_ENABLE_RDMA
28  // If RDMA debugging is toggled on, propagate the toggle
29  #ifdef DEBUG_CP_PAIRCALC_RDMA
30  #define DEBUG_MESSAGEDATACOLLATOR_RDMA
31  #endif
32  #endif
33 #endif
34 
35 /// Define the GEMM macros that paircalc will use to invoke the appropriate matrix multiplies
36 #ifdef FORTRANUNDERSCORE
37  #define ZGEMM zgemm_
38  #define DGEMM dgemm_
39  #define DCOPY dcopy_
40  #define ZTODO ztodo_
41 #else
42  #define ZGEMM zgemm
43  #define DGEMM dgemm
44  #define DCOPY dcopy
45  #define ZTODO ztodo
46 #endif
47 
48 
49 /// Flags to control the semantics of matrix contents along different paths through the PairCalcs
50 #define NORMALPC 0 ///< standard
51 #define KEEPORTHO 1 ///< retain orthoT
52 #define PSIV 2 ///< multiply new psiV by retained orthoT
53 
54 extern ComlibInstanceHandle mcastInstanceCP;
55 
56 extern "C" {
57 
58 void DGEMM (char *, char *, int *, int *, int *,double *,double *, int *, double *, int *, double *, double *, int * );
59 void ZGEMM (char *, char *, int *, int *, int *,complex *,complex *, int *, complex *, int *, complex *, complex *, int * );
60 }
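/* Illustrative only, not part of the build: a minimal sketch of calling the
 * Fortran BLAS dgemm through the DGEMM macro and the extern "C" prototype
 * above. Matrices are column-major and every argument is passed by pointer,
 * per the Fortran calling convention; the sizes here are made up.
 *
 *   char transA = 'N', transB = 'N';
 *   int m = 2, n = 2, k = 3;                  // C is m x n, inner dimension k
 *   double A[6] = {0}, B[6] = {0}, C[4];      // 2x3, 3x2, 2x2, column-major
 *   double alpha = 1.0, beta = 0.0;
 *   DGEMM(&transA, &transB, &m, &n, &k, &alpha, A, &m, B, &k, &beta, C, &m);
 */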
61 
62 #include "MessageDataCollator.h"
63 #include "pcMessages.h"
64 #include "pcConfig.h"
65 #include "ckPairCalculator.decl.h"
66 
67 
68 
69 
70 /**
71  * @defgroup PairCalculator PairCalculator
72  * @addtogroup PairCalculator
73  * @{
74  * \brief PairCalculator computes the electron state pairwise psi and force updates for CP_State_GSpacePlane. It can be directed to enact orthonormalization constraints on Psi via \ref Ortho, which are then applied to the input states and returned to CP_State_GSpacePlane. In the dynamics case it performs additional functions based on additional inputs.
75  *
76  * Paircalculator (and ortho) are used to satisfy the orthogonality constraint.
77  * \f$\sum_{g_x,g_y,g_z \in |{\bf g}|<g_{cut}}
78  * \Psi(s,g_x,g_y,g_z)\Psi^*(s',g_x,g_y,g_z) = f_s\delta_{ss'}\f$
79 
80  *
81  * PairCalculator (PC) is a 4D chare array that is, at its heart, a glorified wrapper
82  * for a bunch of matrix multiplications. It serves the function of managing the
83  * complexity of the decomposition and communication logic that is required to
84  * scale well. In the process, it introduces some more :).
85  *
86  * There are two separate PairCalc chare arrays instantiated per OpenAtom simulation
87  * instance. Both of these serve GSpace and process data whenever invoked by GSpace.
88  * Both these PC arrays share a single Ortho chare array to which some of the multiply
89  * work is delegated.
90  *
91  * The extra complications are for parallelization and the multiplication
92  * of the forces and energies.
93  *
94  * In normal use the calculator is created. Then forces are sent to it
95  * and multiplied in a big dgemm. Then this result is reduced to the
96  * answer matrix and shipped back. The received left and/or right data is
97  * retained for the backward pass which is triggered by the finishPairCalc
98  * call. This carries in another set of matrices for multiplication.
99  * The results are again reduced and cast back, terminating the life
100  * cycle of the data in the pair calculator. As the calculator is
101  * reused throughout each iteration, the calculators themselves are
102  * created only once.
103  *
104  *
105  * The paircalculator is a 4 dimensional array. Those dimensions are:
106  * w: gspace state plane (the second index of the 2D gspace)
107  * x: coordinate offset within plane (a factor of grainsize)
108  * y: coordinate offset within plane (a factor of grainsize)
109  * z: chunk offset within array of nonzero points
110  * So, for an example grainsize of 64 for a 128x128 problem:
111 \verbatim
112  * numStates/grainsize gives us a 2x2 decomposition.
113  * 1st quadrant ranges from [0,0] to [63,63] index [w,0,0,0]
114  * 2nd quadrant ranges from [0,64] to [63,127] index [w,0,64,0]
115  * 3rd quadrant ranges from [64,0] to [127,63] index [w,64,0,0]
116  * 4th quadrant ranges from [64,64] to [127,127] index [w,64,64,0]
117 
118        0         64        127
119     0   ___________________
120        |         |         |
121        |    1    |    2    |
122    64   -------------------
123        |         |         |
124        |    3    |    4    |
125   127   -------------------
126 \endverbatim
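 *
 * As a sketch only (the real index math lives in the creation and mapping
 * code; s1, s2, w and c are illustrative names), the chare handling the
 * pair of states (s1,s2) for gspace plane w and nonzero chunk c under the
 * decomposition above would be:
 * \code
 * int grainSize = 64;                    // from the 128x128 example above
 * int x = (s1 / grainSize) * grainSize;  // 0 or 64: row offset within the plane
 * int y = (s2 / grainSize) * grainSize;  // 0 or 64: column offset within the plane
 * // -> paircalc element [w, x, y, c]
 * \endcode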
127  *
128  *
129  * Further complication arises from the fact that each plane is a
130  * cross-section of a sphere. So the actual data is sparse and is
131  * represented by a contiguous set of the nonzero elements. This is
132  * commonly referred to as numPoints or size within the calculator.
133  *
134  * In the dynamics case there are two additional complications. In the
135  * backward path of the asymmetric pairCalculator we will receive two input
136  * matrices, gamma and orthoT, where orthoT comes from the
137  * orthonormalization following the symmetric pairCalculator and
138  * gamma is produced by a multiplication with orthoT in the Ortho
139  * object.
140  *
141  * If ortho falls out of tolerance then Ortho will signal the GSP that
142  * a tolerance update is needed. We then proceed with the psi
143  * calculation as normal. On receipt of newpsi, Gspace will then
144  * react by sending the PC the Psi velocities (PsiV) in the same way
145  * (acceptLeft/RightData) that it sends Psi, but with the psiv flag set
146  * true. These will be recorded in the left data array. We will then
147  * multiply with the orthoT we kept from the previous invocation (psi)
148  * of the backward path. We then ship the corrected velocities back
149  * to gspace via the acceptnewVpsi reduction. Same procedure as for
150  * acceptnewpsi, just a different entry method.
151  *
152  * Fourth dimension decomposition is along the axis of the nonzero
153  * values in gspace. Therefore it is fundamentally different from the
154  * 2nd and 3rd dimensions which divide the states up into
155  * (states/grainsize)^2 pieces. The fourth dimension divides along
156  * the nonzeros of gspace. An [X,0,0,N] division will have the entirety
157  * of a state, but only a K/Nth chunk (where K is the number of nonzero
158  * elements) of the nonzero values. It can therefore perform
159  * the dgemm on that chunk with a multicast degree of 1 and hold a
160  * portion of the total solution, thereby reducing the PC inbound
161  * communication volume substantially. This comes at the cost of an
162  * additional reduction. The result for the nonzero chunks has to be
163  * pasted together to form the result for the entire nonzero. Then
164  * the results are summed together across the planes to produce the
165  * complete S or L matrix. Only the first of those reductions is new.
166  *
167  * More about this "extra" reduction. Consider the multiply as
168  * C = AB, where A is nstates x numpoints and B is numpoints x
169  * nstates, resulting in C of nstates x nstates. The 4th dim
170  * decomposition chops only the inner index numpoints, thereby
171  * resulting in numblock C submatrices, all of size nstates x nstates.
172  * Making C from the numblock C(i) is just matrix addition. So for the
173  * forward path there is in fact no "extra" reduction or stitching
174  * necessary for the 4th dim decomposition. All the "stitching" is in
175  * the statewise decomposition for the forward path. So the only
176  * change for the forward path is in adding the numblock elements to
177  * the reduction tree.
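 *
 * In code form, summing the per-chunk partial products is plain elementwise
 * addition (a sketch with assumed names, not the actual implementation):
 * \code
 * // Cpartial[i] holds chunk i's nstates x nstates product,
 * // i.e. Cpartial[i] = A(:,chunk_i) * B(chunk_i,:).
 * for (int i = 0; i < numBlocks; ++i)
 *   for (int e = 0; e < nstates * nstates; ++e)
 *     C[e] += Cpartial[i][e];
 * \endcode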
178  *
179  *
180  * An important distinction between these methods is that in the
181  * absence of a grainsize decomposition the sections for the second
182  * reduction to ortho are essentially arbitrary with respect to the
183  * PairCalculator decomposition.
184  *
185  *
186  * Similarly, the backward path for a chunk with grainsize==nstates
187  * needs input matrices of size nstates x nstates, which means that
188  * the backward path cannot proceed until all ortho sections broadcast
189  * their pieces of the input matrices. The backward path reduction to
190  * gspace becomes richer in that now each chunk is contributing at an
191  * offset. So the acceptnew[psi|lambda|vpsi] methods would all need
192  * to paste the input of a contribution at the correct offset. This
193  * recalls the old contiguousreducer logic which did that pasting
194  * together in its reduction client, a functionality that is arguably
195  * worth resurrecting, just in a form that can actually be
196  * comprehended by people not from planet brainiac.
197  *
198  * This means we should add a distinct parameter for the number of ortho
199  * objects. We'll also need to come up with a way for it to map its
200  * grainsize sections onto the chunketized PC chare array. The
201  * constraints on this mapping are only that they should use as many
202  * PCs as we can. The PCs will use the section reduction in their
203  * forward path reduction to deposit the S (or lambda) matrix in ortho.
204  * Ortho will have to broadcast its T (or lambda, or orthoT and gamma)
205  * to the PairCalculator.
206  *
207  * This returns us to the bad old days of broadcasting matrices
208  * around. This can be ameliorated slightly by using [nokeep]
209  * messages so that we reduce the number of copies to 1 per PE. But
210  * you still have to send numPE*nstates*nstates doubles around (times
211  * 2 in the dynamics symmetric case to carry ortho and gamma).
212  * Luckily broadcasts turn out to be pretty fast on BG/L. So this may
213  * not be so bad. The tradeoff against the much larger nonzero
214  * multicast is net positive due to the larger constant size of the
215  * nonzeros compared to nstates.
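 *
 * For scale (illustrative numbers, not measurements): with nstates = 128,
 * one nstates x nstates matrix of doubles is 128 * 128 * 8 bytes = 128 KB
 * per PE per broadcast, and twice that in the dynamics case where both
 * orthoT and gamma travel.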
216  *
217  * These communication patterns become more complex in the hybrid case
218  * where we have both grainsize and chunksize. The ortho->PC section
219  * mapping could revert to using grainsize, but now has to sum across
220  * all the chunks in that grain.
221  *
222  * If we want independent control over the number of ortho objects
223  * then we need to support the overlap issues where grainsize objects
224  * do not map nicely onto ortho objects.
225  *
226  * The reduction out of the backward path of the paircalculator is the
227  * one which is made more complicated by 4th dimension decomposition.
228  * Here we are sending the transformed Psi, which is necessarily
229  * numpoints in size. Therefore the reduction requires stitching of
230  * blocks while summing within a block to assemble the entire g-chare
231  * matrix of points. In practice this is just one big reduction with
232  * the userfield used to indicate the chunk number so gspace can lay
233  * the results out where they belong.
234  *
235  * In its current form orthoGrainSize must evenly divide
236  * sGrainSize. This restriction avoids overlap and boundary issues
237  * for tiles between ortho and the calculator, thereby avoiding some
238  * rather gnarly issues in the setup of multicasts and reductions.
239  * This is a fairly minor restriction as long as we do not require
240  * nstates % sGrainSize == 0 or nstates % orthoGrainSize == 0.
241  *
242  * sGrainSize need not evenly divide the number of states. A nonzero
243  * remainder (nstates % sGrainSize) requires some careful handling in the
244  * code. Whenever this occurs the multiplies and communications for
245  * the border edges which carry the remainder have to cope with
246  * asymmetric multiplies and funky remainder logic in communicating
247  * the result.
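 *
 * A minimal sketch of the bookkeeping this implies (the member names match
 * those declared below; the local names and logic are illustrative):
 * \code
 * int orthoGrainSizeRemX = sGrainSizeX % orthoGrainSize; // 0 when ortho tiles fit evenly
 * int stateRemainder     = nstates % sGrainSize;         // nonzero => border chares handle
 *                                                        // asymmetric edge multiplies
 * \endcode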
248  *
249  *
250  *
251  * Data from GSP travels to the PairCalculators via an InputDataHandler chare
252  * array of the same dimensions as, and bound to, the PairCalculator array.
253  * Appropriate sections/lists of this input handler array for multicasting
254  * the data from GSP to are built in makeLeftTree() and makeRightTree().
255  *
256  * Each iteration of the GSP-PC-Ortho-PC-GSP loop is started by GSP calling
257  * startPairCalcLeft() and startPairCalcRight(). These are simply #defines
258  * that turn into the appropriate function: sendLeftData() and sendRightData()
259  * or their RDMA equivalents. The input handler chares then collate all the
260  * incoming data and then wake their corresponding PC chares once all the data
261  * is available. The PCs then do their thing and the result is returned via
262  * the callback set in the create routine (which happens to be Ortho).
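 *
 * Conceptually the mapping looks like the following sketch; the RDMA entry
 * method name and the macro arguments are hypothetical, and the real macros
 * live in the GSpace-paircalc interface headers:
 * \code
 * #ifdef PC_USE_RDMA
 *   #define startPairCalcLeft(msg)  sendLeftDataRDMA(msg)  // hypothetical RDMA variant
 * #else
 *   #define startPairCalcLeft(msg)  sendLeftData(msg)
 * #endif
 * \endcode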
263  *
264  * The backward path is triggered by multiplyResult().
265  * Its result is returned via the callback entry point passed in
266  * during creation.
267  *
268  * The results of the backward path are returned in a set of section
269  * reductions. The reduction contributes its slice of its matrix of
270  * doubles with the offset=thisIndex.z. The client then returns the
271  * sum of each slice to the GSP element that created the section with
272  * the offset so it can be copied into the correct place in the points
273  * array.
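 *
 * The client-side pasting amounts to the following sketch (the function and
 * argument names are hypothetical):
 * \code
 * // offset is derived from the contributing chunk's thisIndex.z, which is
 * // carried with the reduction.
 * void pasteSlice(complex *points, const complex *slice, int offset, int sliceSize)
 * {
 *   for (int i = 0; i < sliceSize; ++i)
 *     points[offset + i] = slice[i];
 * }
 * \endcode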
274  *
275  *
276  * NOTE: The magic number 2 appears in two contexts. Either we have
277  * twice as many inputs in the off-diagonal elements of the matrix,
278  * or we are transforming our arrays of complex into arrays of
279  * doubles. The latter transformation is done so we can use DGEMM instead
280  * of ZGEMM, motivated by the empirical discovery that BLAS
281  * implementors do all their work on DGEMM and leave ZGEMM out in the
282  * unoptimized cold. This issue crops up everywhere we have to
283  * do allocation or manipulation of the input, and could arguably be
284  * abstracted away by absorbing it into a size field.
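 *
 * Concretely, the complex-to-double view is just a reinterpretation (a
 * sketch; the variable names are illustrative):
 * \code
 * // psi points at numPoints complex coefficients
 * double *asReal   = reinterpret_cast<double *>(psi);  // seen as 2 * numPoints doubles
 * int     innerDim = 2 * numPoints;                    // the k handed to DGEMM
 * \endcode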
285  *
286  *
287  * After the main::main proc 0 phase the array sections used to populate
288  * and return data from the paircalculator are created.
289  * The forward path section reduction (to and from ortho) is initialized
290  * via initOneRedSect() the backward path is initialized via the
291  * appropriate makeOneResultSection_X() call. In each case the call
292  * should be made by each GSP or Ortho object. That way each one has its
293  * own proxy and the section tree will only include relevant processors.
294  *
295  * The chunk decomposition changes the setup substantially. In chunk
296  * decomposition we send a piece of the nonzero points of gspace to
297  * the paircalculators. It is not a multicast. Each GSP[P,S] will
298  * send its ith chunk to PC[P,0,0,i]. Once nstate chunks arrive at a
299  * PC the multiply can proceed.
300  *
301  * In the hybrid case this becomes a multicast of chunks to the
302  * appropriate state decomposition destination as before.
303  *
304  */
305 class PairCalculator : public CBase_PairCalculator
306 {
307  public:
308  /// @entry (obviously)
309  PairCalculator(CProxy_InputDataHandler<CollatorType,CollatorType> inProxy, const pc::pcConfig _cfg);
310  /// Constructor for migration
311  PairCalculator(CkMigrateMessage *);
312  /// Destructor (nothing needs to be done?)
313  ~PairCalculator();
314  /// Returns a pointer to the collator that will buffer the left matrix data (only for use by the corresponding InputDataHandler chare)
315  inline CollatorType* leftHandler() const { return leftCollator; }
316  /// Returns a pointer to the collator that will buffer the right matrix data (only for use by the corresponding InputDataHandler chare)
317  inline CollatorType* rightHandler() const { return rightCollator; }
318 
319  /// @entry To handle correct startup for symm PC w/phantoms
320  void phantomDone();
321  /// @entry Method to send in the complete block of the left matrix
322  void acceptLeftData(paircalcInputMsg *msg);
323  /// @entry Method to send in the complete block of the right matrix
324  void acceptRightData(paircalcInputMsg *msg);
325  /// NOT an entry method. Called locally from the acceptData* methods to launch the appropriate number-crunching method
326  void launchComputations(paircalcInputMsg *aMsg);
327  /// Forward path multiply driver. Prepares matrices, calls DGEMM, contributes results to Ortho subTiles and also passes relevant data to phantom PC chares
328  void multiplyForward(bool flag_dp);
329  /// @entry Simply redirects call to multiplyForward()
330  void multiplyForwardRDMA();
331  /// Piece up a tile and send all the pieces as this PC's contribution to the Ortho chares
332  void contributeSubTiles(internalType *fullOutput);
333  /// Contribute orthoGrainSized tiles of data (that are ready?) to the corresponding ortho chares
334  void sendTiles(bool flag_dp);
335  /// Receive data from ortho chares and copy into matrix
336  void collectTile(bool doMatrix1, bool doMatrix2, bool doOrthoT, int orthoX, int orthoY, int orthoGrainSizeX, int orthoGrainSizeY, int numRecdBW, int matrixSize, internalType *matrix1, internalType* matrix2);
337  /// @entry Initializes the section cookie and the reduction client. Called on startup as the chare world is being created
338  void initGRed(initGRedMsg *msg);
339  /// @entry During dynamics, each Ortho calls this on the Asymm loop PC instances to send its share of T back to avoid a race condition between Gamma and T.
340  void acceptOrthoT(multiplyResultMsg *msg);
341  /// @entry Backward path multiplication
342  void multiplyResult(multiplyResultMsg *msg);
343  /// Dynamics: PsiV Tolerance correction loop called on symmetric instances. Technically, backward path
344  void multiplyPsiV();
345  /// Multiplies Fpsi by T (from Ortho)
346  void bwMultiplyDynOrthoT();
347  /// @entry Simply forwards the call to multiplyResult(). @todo There don't seem to be any instances in the source which call this method. Check.
348  void multiplyResultI(multiplyResultMsg *msg);
349  /// @entry a debugging tool: a barrier at the end of the backward path before anything is sent over to GSP
350  void bwbarrier(CkReductionMsg *msg);
351  /// multiplyPsiV() and multiplyResult() call this to perform the matrix multiply math on the backward path. This calls DGEMM routines for the same.
352  void bwMultiplyHelper(int size, internalType *matrix1, internalType *matrix2, internalType *amatrix, internalType *amatrix2, bool unitcoef, int m_in, int n_in, int k_in, int BNAoffset, int BNCoffset, int BTAoffset, int BTCoffset, int orthoX, int orthoY, double beta, int ogx, int ogy);
353  /// Called on the normal backward path (non-psiV) to set up the data sends to GSpace
354  void bwSendHelper(int orthoX, int orthoY, int sizeX, int sizeY, int ogx, int ogy);
355  /// @entry Send the results via multiple reductions as triggered by a prioritized message
356  void sendBWResult(sendBWsignalMsg *msg);
357  /// @entry
358  void sendBWResultDirect(sendBWsignalMsg *msg);
359  /// Send the result for this column
360  void sendBWResultColumn(bool other, int startGrain, int endGrain);
361  ///
362  void sendBWResultColumnDirect(bool other, int startGrain, int endGrain);
363  /// @entry Initialize the section cookie for each slice of the result
364  void initResultSection(initResultMsg *msg);
365  /// PUP method
366  void pup(PUP::er &);
367  ///
368  void reorder(int *offsetMap, int *revOffsetMap, double *data, double *scratch);
369  ///
370  void dumpMatrix(const char *, double*, int, int, int xstart=0,int ystart=0, int xtra1=0, int xtra2=0);
371  ///
372  void dumpMatrix(const char *, complex*, int, int, int xstart=0,int ystart=0, int xtra1=0, int xtra2=0);
373  ///
374  void dumpMatrixComplex(const char *, complex *,int,int, int xstart=0, int ystart=0, int iter=0);
375  ///
376  void dgemmSplitBwdM(int m, int n, int k, char *trans, char *transT, double *alpha, double *A, double *B, double *bt, double *C);
377  ///
378  void dgemmSplitFwdStreamMK(int m, int n, int k, char *trans, char *transT, double *alpha, double *A, int *lda, double *B, int *ldb, double *C, int *ldc);
379  ///
380  void dgemmSplitFwdStreamNK(int m, int n, int k, char *trans, char *transT, double *alpha, double *A, int *lda, double *B, int *ldb, double *C, int *ldc);
381  ///
382  void ResumeFromSync();
383  /// @entry Something to sync?
384  void lbsync()
385  {
386  #ifdef _PAIRCALC_DEBUG_
387  CkPrintf("[%d,%d,%d,%d] atsyncs\n", thisIndex.w, thisIndex.x, thisIndex.y, thisIndex.z);
388  #endif
389  resumed=false;
390  rck=0;
391  AtSync();
392  };
393 
394  private:
395 
396  /// Schedules the entry methods that send out the results to GSpace with appropriate priority
397  void enqueueBWsend(bool unitcoef, int priority=1);
398  /// Cleans up at end of an iteration (fw-bw computation loop); frees mem, resets counters etc
399  void cleanupAfterBWPath();
400 
401 
402  /// A private copy of the input configurations
403  pc::pcConfig cfg;
404  /// A handle to the co-located chare array that handles data input
405  CProxy_InputDataHandler<CollatorType,CollatorType> myMsgHandler;
406  /// Data collators for the left and right matrix blocks
407  CollatorType *leftCollator, *rightCollator;
408  /// Flags indicating if the left and right matrix blocks have been received
409  bool isLeftReady, isRightReady;
410  int numRecd; ///< number of messages received
411  int numRecdBW; ///< number of messages received BW
412  int numRecdBWOT; ///< number of messages received BW orthoT
413  int numExpected; ///< number of messages expected all
414  int numExpectedX; ///< number of messages expected x-axis
415  int numExpectedY; ///< number of messages expected y-axis
416  int grainSizeX; ///< number of states per chare x-axis
417  int grainSizeY; ///< number of states per chare y-axis
418  int orthoGrainSizeRemX; ///< sgrainSizeX % orthoGrainSize
419  int orthoGrainSizeRemY; ///< sgrainSizeY % orthoGrainSize
420  int blkSize; ///< number points in gspace plane
421  int numPoints; ///< number of points in this chunk
422 
423  int streamCaughtR; ///< number of rows caught so far R stream
424  int streamCaughtL; ///< number of rows caught so far L stream
425 
426  int numRecLeft; ///< number of rows so far total left
427  int numRecRight; ///< number of rows so far total right
428 
429  int gemmSplitFWk; ///< number of rows in split FW dgemm
430  int gemmSplitFWm; ///< number of columns in split FW dgemm
431  int gemmSplitBW; ///< number of rows in split BW dgemm
432 
433 
434  int *LeftOffsets; ///< index numbers of caught stream elements
435  int *RightOffsets; ///< index numbers of caught stream elements
436 
437  int *LeftRev; ///< reverse index numbers of caught stream elements
438  int *RightRev; ///< reverse index numbers of caught stream elements
439 
440  double **outTiles; ///< in output streaming we populate the tiles directly
441 
442  int *touchedTiles; ///< tracker to detect when tiles are full
443 
444  bool notOnDiagonal; ///< being on or off diagonal changes many things
445  bool symmetricOnDiagonal; ///< diagonal symmetric special case
446 
447  bool expectOrthoT; ///< Is true only in asymmetric, dynamics scenario. For PC instances in the asymmetric chare array, orthoT should arrive before end of fwd path
448  bool amPhantom; ///< consolidate thisIndex.x<thisIndex.y && cfg.isSymmetric && phantomsym
449 
450  CkArrayID cb_aid; ///< bw path callback array ID
451  int cb_ep; ///< bw path callback entry point
452  int cb_ep_tol; ///< bw path callback entry point for psiV tolerance
453  bool existsLeft; ///< inDataLeft allocated
454  bool existsRight; ///< inDataRight allocated
455  bool existsOut; ///< outData allocated
456  bool existsNew; ///< newData allocated
457  bool resumed; ///< have resumed from load balancing
458 
459  inputType *mynewData; ///< results of bw multiply
460  inputType *othernewData; ///< results of sym off diagonal multiply,
461  //! or the C=-1 *inRight* orthoT +c in dynamics
462  internalType *inDataLeft; ///< the input pair to be transformed
463  internalType *inDataRight; ///< the input pair to be transformed
464  paircalcInputMsg *msgLeft, *msgRight; ///< Incoming messages with left and right matrix data that are kept around so that we can directly compute on them
465  internalType *outData; ///< results of fw multiply
466  int actionType; ///< matrix usage control [NORMAL, KEEPORTHO, PSIV]
467 
468  double *allCaughtLeft; ///< unordered rows of FW input
469  double *allCaughtRight; ///< unordered rows of FW input
470 
471 
472  internalType *inResult1; ///< accumulate ortho or lambda
473  internalType *inResult2; ///< used in gamma calc (non minimization)
474 
475  /* to support the simpler section reduction*/
476  int rck; ///< count of received cookies
477  CkGroupID mCastGrpIdOrtho; ///< group id for multicast manager ortho
478  int numOrthoCol; ///< sGrainSizeX/orthoGrainSize
479  int numOrthoRow; ///< sGrainSizeY/orthoGrainSize
480  int numOrtho; ///< number of orthos in our grain = numOrthoCol*numOrthoRow
481 
482  CkGroupID mCastGrpId; ///< group id for multicast manager bw
483 
484  CkSectionInfo *resultCookies; ///< array of bw path section cookies
485  CkSectionInfo *otherResultCookies; ///< extra array of bw path section cookies for sym off diag, or dynamics
486 
487  CkCallback *orthoCB; ///< forward path callbacks
488  CkSectionInfo *orthoCookies; ///< forward path reduction cookie
489  int *columnCount; ///< count of processed rows in BW by column
490  int *columnCountOther; ///< count of processed rows in BW by column
491 
492  /** Copy the results from outdata1 and outdata2 into the tiles.
493  * Iterate through the source array, look up the destination row in offsetsRow, destination
494  * col in offsetsCol. This will be the destination row and column for the output if the
495  * output were considered as a single matrix. Use tileSize to map these values into the destination tile.
496  */
497  void copyIntoTiles(double *source, double**dest, int sourceRows, int sourceCols, int *offsetsRow, int *offsetsCol, int *touched, int tileSize, int tilesPerRow );
498 };
499 
500 ///forward declaration
501 CkReductionMsg *sumMatrixDouble(int nMsg, CkReductionMsg **msgs);
502 CkReductionMsg *sumBlockGrain(int nMsg, CkReductionMsg **msgs);
503 
504 void manmult(int numrowsA, int numRowsB, int rowLength, double *A, double *B, double *C, double alpha);
505 /*@}*/
506 
507 #endif // CK_PAIR_CALCULATOR_H
508 