OpenAtom  Version1.5a
ortho.C
Go to the documentation of this file.
1 //////////////////////////////////////////////////////////////////////////////
2 //////////////////////////////////////////////////////////////////////////////
3 //////////////////////////////////////////////////////////////////////////////
4 /** \file ortho.C
5  * @defgroup Ortho Ortho
6  * The Ortho object is basically used to accept the rows of the "S" matrix,
7  * use the orthoTransform subroutine to obtain the "T" matrix, and send the
8  * rows of the "T" matrix back to all the GSpace states.
9  *
10  * The S_to_T(...) method is invoked when the S-Calculators send their
11  * contributions either through the S_Reducer group or otherwise.
12  * Once the conversion from S to T is done, the preparation for the
13  * broadcast back to the S-Calculators is done.
14  *
15  * This method also takes of performing the load-balancing step for
16  * the experimental cases where that is in play.
17  *
18  * 2D version of ortho permits better parallelization of the diagonal
19  * and to facilitate pair calculator communication.
20  *
21  * Each pair calculator will perform its multiply and reduce the
22  * result within its "column" AKA quadrant via the Ortho array.
23  * Resulting in sub matrices on Ortho which could be pasted together
24  * on ortho[0,0] for the complete matrix.
25  *
26  * For the dynamics (non minization) case we have an additional
27  * multiplication of gamma= lambda*orthoT. And we pass both gamma and
28  * orthoT to the PC instead.
29  *
30  * For dynamics we also have a tolerance check for orthoT. The
31  * tolerance check is a min reduction on OrthoT. If it falls out of
32  * the tolerance range we have to correct the velocities. We notify
33  * the PC of this via a flag in the finishsection call. It will then
34  * be handled by gspace. We will keep the tolerance flag and set
35  * orthoT to the unit AKA identity matrix for the gamma calculation.
36  *
37  * We will perform that tolerance check every config.toleranceInterval
38  * steps.
39  *
40  * This complicates interaction with PC a bit since we can no longer
41  * just multiply our index by the sGrainSize to determine it.
42  *
43  */
44 //////////////////////////////////////////////////////////////////////////////
45 
46 #include "ortho.h"
47 #include "orthoHelper.h"
48 //#include "gSpaceDriver.decl.h"
49 #include "timeKeeper.decl.h"
50 
51 #include "charm++.h"
52 #include "utility/matrix2file.h"
53 #include "utility/util.h"
54 
55 //#include "fft_slab_ctrl/fftCacheSlab.h"
56 #include <unistd.h>
57 
58 #ifdef TRACE_MEMORY
59  #include <trace-projections.h>
60 #endif
61 
62 #include "src_mathlib/mathlib.h"
63 #include "src_piny_physics_v1.0/include/class_defs/CP_OPERATIONS/class_cporthog.h"
64 #include "src_piny_physics_v1.0/include/class_defs/piny_constants.h"
65 #define PRINTF CkPrintf
66 //////////////////////////////////////////////////////////////////////////////
67 extern Config config;
68 extern CProxy_TimeKeeper TimeKeeperProxy;
69 extern ComlibInstanceHandle orthoInstance;
70 
71 //////////////////////////////////////////////////////////////////////////////
72 
73 /** @addtogroup Ortho
74  @{
75 */
76 
77 Ortho::~Ortho()
78 {
79  delete [] A;
80  delete [] B;
81  delete [] C;
82  delete [] tmp_arr;
83  if(thisIndex.x==0 && thisIndex.y==0)
84  {
85  delete [] ortho;
86  delete [] orthoT;
87  }
88 }
89 
90 
91 
92 
93 //////////////////////////////////////////////////////////////////////////////
94 //////////////////////////////////////////////////////////////////////////////
95 //////////////////////////////////////////////////////////////////////////////
96 void Ortho::collect_error(CkReductionMsg *msg) {
97  #ifdef VERBOSE_ORTHO
98  CkPrintf("[%d,%d] Ortho::collect_error \n", thisIndex.x, thisIndex.y);
99  #endif
100  CmiAssert(thisIndex.x == 0 && thisIndex.y == 0);
101  // end_t = CmiWallTimer();
102  double error = *((double *) msg->getData());
103  error = sqrt(error / (cfg.numStates * cfg.numStates));
104  // CkPrintf("%d\t%f\t%g\n", iterations, end_t - start_t, error);
105  if(error > invsqr_tolerance && iterations < invsqr_max_iter){
106  // start_t = CmiWallTimer();
107  if(config.useOrthoSection)
108  {
109  orthoMtrigger *tmsg= new orthoMtrigger;
110  multiproxy.do_iteration(tmsg);
111  }
112  else
113  thisProxy.do_iteration();
114  }
115  else{
116  if(iterations>=config.invsqr_max_iter)
117  {
118  CkPrintf("Ortho reached max_iter %d with residual of %.10g, which is greater than invsqr_tolerance %.10g!\nEither increase invsqr_max_iter or invsqr_tolerance.\n If this is not the first step in a GenWave try lowering your timestep\n",config.invsqr_max_iter, error, config.invsqr_tolerance);
119  CkAbort("invsqr_tolerance not met within invsqr_max_iter\n");
120  }
121 
122 
123  if(config.useOrthoSection)
124  {
125  orthoMtrigger *tmsg= new orthoMtrigger;
126  multiproxy.collect_results(tmsg);
127  }
128  else
129  thisProxy.collect_results();
130  }
131  delete msg;
132 
133  }
134 
135 //////////////////////////////////////////////////////////////////////////////
136 
137 //////////////////////////////////////////////////////////////////////////////
138 //////////////////////////////////////////////////////////////////////////////
139 //////////////////////////////////////////////////////////////////////////////
140 void Ortho::start_calc(CkReductionMsg *msg){
141  #ifdef VERBOSE_ORTHO
142  CkPrintf("[%d,%d] Ortho::start_calc \n", thisIndex.x, thisIndex.y);
143  #endif
144 #ifdef _CP_SUBSTEP_TIMING_
145  if(timeKeep>0)
146  {
147  double ostart=CmiWallTimer();
148  CkCallback cb(CkIndex_TimeKeeper::collectStart(NULL),0,TimeKeeperProxy);
149  contribute(sizeof(double),&ostart,CkReduction::min_double, cb , timeKeep);
150  }
151 #endif
152  got_start = true;
153  internalType *S = (internalType*) msg->getData();
154 #ifdef _NAN_CHECK_
155  for(int i=0;i<msg->getSize()/sizeof(internalType) ;i++)
156  CkAssert( isfinite(S[i]) );
157 #endif
158 
159  step = 0;
160  iterations = 0;
161  CkIndex2D pc = symmSectionMgr.computePCStateIndices();
162  if(pc.x < pc.y)
163  {
164  //we get the reduction and //we have a spare to copy to
165  //make a copy
166  CkReductionMsg *omsg=CkReductionMsg::buildNew(msg->getSize(),msg->getData());
167  // transpose it
168  /*
169  double *dest= (double*) omsg->getData();;
170  double tmp;
171  for(int i = 0; i < m; i++)
172  for(int j = i + 1; j < n; j++){
173  tmp = dest[i * n + j];
174  dest[i * n + j] = dest[j*m + i];
175  dest[j * m + i] = tmp;
176  }
177  */
178  // simple out of place scheme
179 
180  internalType *dest= (internalType*) omsg->getData();;
181  internalType tmp;
182  for(int i = 0; i < m; i++)
183  for(int j = 0; j < n; j++)
184  dest[j * m + i] = S[i *n + j];
185 
186  thisProxy(thisIndex.y,thisIndex.x).start_calc(omsg);
187 
188  }
189  else if(pc.y < pc.x)
190  { //we get a transposed copy be happy
191 
192  }
193  else if((pc.x==pc.y) && (thisIndex.x > thisIndex.y))
194  { //we are a spare, got our matrix direct from Scalc
195 
196  }
197 
198 #ifdef _CP_ORTHO_DUMP_SMAT_
199  dumpMatrix("smat",(double *)S, m, n, numGlobalIter, thisIndex.x * cfg.grainSize, thisIndex.y * cfg.grainSize, 0, false);
200 
201 #endif
202 
203 #ifdef _CP_ORTHO_DEBUG_COMPARE_SMAT_
204  if(savedsmat==NULL)
205  { // load it
206  savedsmat= new double[m*n];
207  loadMatrix("smat",(double *)savedsmat, m, n,numGlobalIter, thisIndex.x*cfg.grainSize,thisIndex.y*cfg.grainSize,0,false);
208  }
209  for(int i=0;i<m*n;i++)
210  {
211  if(fabs(S[i]-savedsmat[i])>0.0001)
212  {
213  fprintf(stderr, "O [%d,%d] %d element ortho %.10g not %.10g\n",thisIndex.x, thisIndex.y,i, S[i], savedsmat[i]);
214  }
215 
216  CkAssert(fabs(S[i]-savedsmat[i])<0.0001);
217  CkAssert(fabs(S[i]-savedsmat[i])<0.0001);
218  }
219 #endif
220  if(config.UberMmax ==1) // if we ignore spin we divide by 2
221  {
222  for(int i = 0; i < m * n; i++){
223  B[i] = S[i] / 2.0;
224  }
225  }
226  else
227  {
228  for(int i = 0; i < m * n; i++){
229  B[i] = S[i];
230  }
231  }
232  memset(A, 0, sizeof(internalType) * m * n);
233  step = 0;
234  iterations = 0;
235  /* see if we have a non-zero part of I or T (=3I) */
236  if(thisIndex.x == thisIndex.y){
237  for(int i = 0; i < m; i++){
238  A[i * m + i] = 1;
239  }//endfor
240  }//endif
241  // do tolerance check on smat, do_iteration will be called by reduction root
242  if(cfg.isDynamics && (numGlobalIter % config.toleranceInterval)==0 && numGlobalIter>1){
243  double max =array_diag_max(m,n,S);
244  contribute(sizeof(double),&max, CkReduction::max_double,
245  CkCallback(CkIndex_Ortho::maxCheck(NULL),CkArrayIndex2D(0,0),
246  thisProxy.ckGetArrayID()));
247  }else{
248  if(num_ready == 10){do_iteration();}
249  }//endif
250  delete msg;
251 
252  //////////////////////////////////////////////////////////////////////////////
253 }//end routine
254 //////////////////////////////////////////////////////////////////////////////
255 
256 
257 
258 //////////////////////////////////////////////////////////////////////////////
259 //////////////////////////////////////////////////////////////////////////////
260 //////////////////////////////////////////////////////////////////////////////
262 //////////////////////////////////////////////////////////////////////////////
263 /// Output debug information then increment iteration counter
264 
265 #ifdef _CP_DEBUG_TMAT_
266  print_results();
267 #endif
268 #ifdef TRACE_MEMORY
269  traceMemoryUsage();
270 #endif
271  numGlobalIter++;
272 ///////////////////////////////////////////////////////////////////////==
273 /// Load balance controller
274 
275 
276  if (numGlobalIter <= config.maxIter+1)
277  resume();
278 //----------------------------------------------------------------------
279  }//end routine
280 ///////////////////////////////////////////////////////////////////////==
281 
282 
283 //////////////////////////////////////////////////////////////////////////////
284 /**
285  * Resume execution by finishing the backward path of the Psi calculator
286  */
287 //////////////////////////////////////////////////////////////////////////////
288 //////////////////////////////////////////////////////////////////////////////
289 //////////////////////////////////////////////////////////////////////////////
291 //////////////////////////////////////////////////////////////////////////////
292  int actionType=0; //normal
293  if(toleranceCheckOrthoT)
294  {
295  actionType=1; //keepOrtho
296  }
297  if(cfg.isDynamics){
298  // copy orthoT for use in gamma computation
299  // CkPrintf("O [%d %d] making copy of orthoT m %d n %d\n",thisIndex.x,thisIndex.y,m,n);
300  if(orthoT==NULL) //allocate if null
301  { orthoT = new internalType[m * n];}
302  CmiMemcpy(orthoT,A,m*n*sizeof(internalType));
303  }
304  CkIndex2D pc = symmSectionMgr.computePCStateIndices();
305  // if(thisIndex.y <= thisIndex.x) //we have the answer scalc wants
306  // if((pc.y < pc.x) || ((pc.y==pc.x)&&())) //we have the answer scalc wants
307 #ifdef _CP_ORTHO_DUMP_TMAT_
308  dumpMatrix("tmat",(double *)A, m, n,numGlobalIter,thisIndex.x * cfg.grainSize, thisIndex.y * cfg.grainSize, 0, false);
309 #endif
310 
311 #ifdef _CP_ORTHO_DEBUG_COMPARE_TMAT_
312  if(savedtmat==NULL)
313  { // load it
314  savedtmat= new double[m*n];
315  loadMatrix("tmat",(double *)savedtmat, m, n, numGlobalIter, thisIndex.x * cfg.grainSize, thisIndex.y*cfg.grainSize,0,false);
316  }
317  for(int i=0;i<m*n;i++)
318  {
319  if(fabs(A[i]-savedtmat[i])>0.0001)
320  {
321  fprintf(stderr, "O [%d,%d] %d element ortho %.10g not %.10g\n",thisIndex.x, thisIndex.y,i, A[i], savedtmat[i]);
322  }
323 
324  CkAssert(fabs(A[i]-savedtmat[i])<0.0001);
325  CkAssert(fabs(A[i]-savedtmat[i])<0.0001);
326  }
327 #endif
328 
329  /**
330  * orthos talking to non-phantoms in the symm pc instance will have to perform an extra transpose of their data.
331  * Hence, they do this only when they have to. We avoid this when phantoms are turned off by rigging the pc
332  * sections to make the mirror orthos deliver the data to our non-phantoms.
333  * Refer PCSectionManager::setupArraySection for more info.
334  */
335  if(pc.x == pc.y) //we have the answer scalc wants
336  symmSectionMgr.sendResults(m*n, A, 0, thisIndex.x, thisIndex.y, actionType, 0);
337  else if(thisIndex.y < thisIndex.x) //we have the answer scalc wants
338  symmSectionMgr.sendResults(m*n, A, 0, thisIndex.y, thisIndex.x, actionType, 0);
339  else if(thisIndex.y > thisIndex.x && config.phantomSym)
340  {
341  transpose(A,m,n);
342  /* double *dest= (double*) A;
343  double tmp;
344  for(int i = 0; i < m; i++)
345  for(int j = i + 1; j < n; j++){
346  tmp = dest[i * n + j];
347  dest[i * n + j] = dest[j * m + i];
348  dest[j * m + i] = tmp;
349  }
350  */
351  // we have a transposed copy of what scalc wants
352  symmSectionMgr.sendResults(m*n, A, 0, thisIndex.x, thisIndex.y, actionType, 0);
353 #ifdef _CP_ORTHO_DUMP_TMAT_
354  dumpMatrix("tmatT",(double *)A, m, n,numGlobalIter,thisIndex.x * cfg.grainSize, thisIndex.y * cfg.grainSize, 0, false);
355 
356 #endif
357  }
358 
359 
360 #ifdef _CP_SUBSTEP_TIMING_
361  if(timeKeep>0)
362  {
363  double oend=CmiWallTimer();
364  CkCallback cb(CkIndex_TimeKeeper::collectEnd(NULL),0,TimeKeeperProxy);
365  contribute(sizeof(double),&oend,CkReduction::max_double, cb , timeKeep);
366  }
367 #endif
368  /* this should be triggered by psi after sym fw path is done */
369  if(cfg.isDynamics && !config.PCCollectTiles)
370  {
372  }
373 
374 //----------------------------------------------------------------------------
375  }//end routine
376 //////////////////////////////////////////////////////////////////////////////
377 
378 
379 //////////////////////////////////////////////////////////////////////////////
380 //////////////////////////////////////////////////////////////////////////////
381 //////////////////////////////////////////////////////////////////////////////
383 //////////////////////////////////////////////////////////////////////////////
384  int actionType=0; //normal
385  // CkPrintf("[%d,%d] sending orthoT\n",thisIndex.x,thisIndex.y);
386  asymmSectionMgr.sendMatrix(m*n, orthoT, 0, thisIndex.x, thisIndex.y, actionType, asymmSectionMgr.msgPriority-1);
387 
388 }
389 //////////////////////////////////////////////////////////////////////////////
390 
391 
392 //////////////////////////////////////////////////////////////////////////////
393 //////////////////////////////////////////////////////////////////////////////
394 //////////////////////////////////////////////////////////////////////////////
395 void Ortho::maxCheck(CkReductionMsg *msg){
396 //////////////////////////////////////////////////////////////////////////////
397 
398  double tolMax=fabs(((double *) msg->getData())[0]);
399  delete msg;
400  CkPrintf("S matrix tolerance = %f while maxTolerance = %f\n",tolMax, cfg.maxTolerance);
401 
402  // CkPrintf("SMAT tol = %g\n", tolMax);
403  if(tolMax < cfg.maxTolerance){
404  toleranceCheckOrthoT=false;
405  thisProxy.do_iteration();
406  }else{
407  // smat is outside of the tolerance range need new PsiV
408  toleranceCheckOrthoT=true;
409  CkPrintf("recalculating PsiV due to tolerance failure \n");
410  // Use the callback to trigger tolerance failure events.
411  cfg.uponToleranceFailure.send();
412  // Simply suspend all work. We'll be resumed (by GSpaceDriver) when tolerance updates are done.
413  }//endif
414 
415 //////////////////////////////////////////////////////////////////////////////
416  }//end routine
417 //////////////////////////////////////////////////////////////////////////////
418 
419 /**
420  * Resume execution with the vpsi tolerance correction flag on
421  */
422 //////////////////////////////////////////////////////////////////////////////
423 //////////////////////////////////////////////////////////////////////////////
424 //////////////////////////////////////////////////////////////////////////////
425 void Ortho::resumeV(CkReductionMsg *msg){ // gspace tolerance check entry
426  delete msg;
427  toleranceCheckOrthoT=true;
428  do_iteration();
429 }
430 //////////////////////////////////////////////////////////////////////////////
431 
432 
433 
434 
435 //////////////////////////////////////////////////////////////////////////////
436 //////////////////////////////////////////////////////////////////////////////
437 //////////////////////////////////////////////////////////////////////////////
438 void Ortho::acceptSectionLambda(CkReductionMsg *msg) {
439 //////////////////////////////////////////////////////////////////////////////
440 
441  internalType *lambda = (internalType*)msg->getData();
442  int lambdaCount = msg->getSize()/sizeof(internalType);
443 
444 #ifdef _CP_ORTHO_DUMP_LMAT_
445  dumpMatrix("lmat",lambda, m, n, numGlobalIter, thisIndex.x * cfg.grainSize, thisIndex.y * cfg.grainSize, 0, false);
446 
447 #endif
448 #ifdef _CP_ORTHO_DEBUG_COMPARE_LMAT_
449  if(savedlmat==NULL)
450  { // load it
451  savedlmat= new double[m*n];
452  loadMatrix("lmat",(double *)savedlmat, m, n,numGlobalIter, thisIndex.x*cfg.grainSize,thisIndex.y*cfg.grainSize,0,false);
453  }
454  for(int i=0;i<m*n;i++)
455  {
456  if(fabs(lambda[i]-savedlmat[i])>0.0001)
457  {
458  fprintf(stderr, "O [%d,%d] %d element ortho %.10g not %.10g\n",thisIndex.x, thisIndex.y,i, lambda[i], savedlmat[i]);
459  }
460 
461  CkAssert(fabs(lambda[i]-savedlmat[i])<0.0001);
462  CkAssert(fabs(lambda[i]-savedlmat[i])<0.0001);
463  }
464 #endif
465 
466  // revise this to do a matmul replacing multiplyforgamma
467  if(cfg.isDynamics){
468 
469  if(toleranceCheckOrthoT)
470  {// replace orthoT with the identity matrix
471  if(thisIndex.x!=thisIndex.y)
472  { //not on diagonal
473  bzero(orthoT,m*n*sizeof(double));
474  }
475  else
476  { // on diagonal
477  for(int i=0;i<m;i++)
478  for(int j=0;j<n;j++)
479  {
480  if(i!=j)
481  orthoT[i*n+j]=0.0;
482  else
483  orthoT[i*n+j]=1.0;
484  }
485  }
486  toleranceCheckOrthoT=false;
487  }
488  if(ortho==NULL)
489  ortho= new internalType[m*n];
490  CmiMemcpy(ortho,orthoT,m*n*sizeof(internalType));
491  matA1.multiply(1, 0, orthoT, Ortho::gamma_done_cb, (void*) this,
492  thisIndex.x, thisIndex.y);
493  matB1.multiply(1, 0, lambda, Ortho::gamma_done_cb, (void*) this,
494  thisIndex.x, thisIndex.y);
495  matC1.multiply(1, 0, B, Ortho::gamma_done_cb, (void*) this,
496  thisIndex.x, thisIndex.y);
497 
498  ///ompleted gamma will call finishPairCalcSection2
499  }
500  else
501  {
502 
503 #ifdef _NAN_CHECK_
504  CkAssert(lambdaCount==m*n);
505  for(int i=0; i<m*n; i++)
506  CkAssert( isfinite(lambda[i]) );
507 #endif
508 
509  // finish pair calc
510  asymmSectionMgr.sendResults(lambdaCount, lambda, 0, thisIndex.x, thisIndex.y, 0, asymmSectionMgr.msgPriority+1);
511 #ifdef _CP_DEBUG_ORTHO_VERBOSE_
512  if(thisIndex.x==0 && thisIndex.y==0)
513  CkPrintf("[%d,%d] finishing asymm\n",thisIndex.x, thisIndex.y);
514 #endif
515  }
516  delete msg;
517 
518 //----------------------------------------------------------------------------
519  }// end routine
520 ///////////////////////////////////////////////////////////////////////////////=
521 
522 
523 //////////////////////////////////////////////////////////////////////////////
524 //////////////////////////////////////////////////////////////////////////////
525 //////////////////////////////////////////////////////////////////////////////
526 void Ortho::makeSections(const pc::pcConfig &cfgSymmPC, const pc::pcConfig &cfgAsymmPC, CkArrayID symAID, CkArrayID asymAID)
527 {
528  /** For runs using a large numPE, Orthos chares are typically mapped onto a small fraction of the cores
529  * However, array broadcasts in charm++ involve all PEs (due to some legacy quirk present because of any-time
530  * migration support). Hence, to avoid this anti-scaling overhead, we delegate array collectives to comlib / CkMulticast
531  * by building a section that includes all chare elements! :)
532  *
533  * The (0,0) Ortho chare sets up these sections and delegates them
534  */
535  if( (thisIndex.x==0 && thisIndex.y==0) && (config.useOrthoSection || config.useOrthoSectionRed))
536  {
537  /// Create an array section that includes the whole Ortho chare array
538  int numOrtho = cfg.numStates / cfg.grainSize;
539  multiproxy = CProxySection_Ortho::ckNew(thisProxy.ckGetArrayID(), 0, numOrtho-1,1, 0, numOrtho-1, 1);
540  /// If reductions are being delegated to a comm library
541  if(config.useOrthoSectionRed)
542  {
543  CProxySection_Ortho rproxy = multiproxy;
544  CkMulticastMgr *mcastGrp = CProxy_CkMulticastMgr(oRedGID).ckLocalBranch();
545  CkAssert(mcastGrp != NULL);
546  rproxy.ckSectionDelegate(mcastGrp);
547  initCookieMsg *redMsg=new initCookieMsg;
548  /// Ask the rest of the section (the whole array) to init their CkSectionInfo cookies that identify the mgr etc
549  rproxy.orthoCookieinit(redMsg);
550  }
551  /// If multicasts are being delegated to a comm library
552  if(config.useOrthoSection)
553  {
554  /// Use the appropriate library for multicasts
555  if( config.useCommlib && config.useOrthoDirect)
556  {
557  #ifdef USE_COMLIB
558  ComlibAssociateProxy(orthoInstance,multiproxy);
559  #endif
560  }
561  else
562  {
563  CkMulticastMgr *mcastGrp = CProxy_CkMulticastMgr(oMCastGID).ckLocalBranch();
564  CkAssert(mcastGrp != NULL);
565  multiproxy.ckSectionDelegate(mcastGrp);
566  }
567  }
568  }
569 
570  // Initialize the paircalc section managers with data from the paircalc config objects
571  symmSectionMgr.init (thisIndex, cfgSymmPC , symAID , oMCastGID, oRedGID);
572  asymmSectionMgr.init(thisIndex, cfgAsymmPC, asymAID, oMCastGID, oRedGID);
573 
574  /// Symmetric PC sections should trigger S -> T computations in Ortho via this method
575  CkCallback orthoCB(CkIndex_Ortho::start_calc(NULL), thisProxy(thisIndex.x, thisIndex.y));
576  /// Asymmetric sections should simply drop off lambda at this method
577  CkCallback orthoLambdaCB(CkIndex_Ortho::acceptSectionLambda(NULL), thisProxy(thisIndex.x, thisIndex.y));
578 
579  /// Setup the symmetric instance paircalc array section for communication with the symm PC chares
580  symmSectionMgr.setupArraySection(orthoCB,config.phantomSym,config.useOrthoDirect);
581  /// Setup the asymmetric instance paircalc array section for gather/scatter of lambda data from/to the asymm PC chares
582  asymmSectionMgr.setupArraySection(orthoLambdaCB, config.phantomSym, config.useOrthoDirect);
583 }
584 
585 
586 //////////////////////////////////////////////////////////////////////////////
587 //////////////////////////////////////////////////////////////////////////////
588 //////////////////////////////////////////////////////////////////////////////
590 //////////////////////////////////////////////////////////////////////////////
591 /// CkPrintf("[%d %d] sending ortho %g %g %g %g gamma %g %g %g
592 //%g\n",thisIndex.x,
593 ///thisIndex.y,orthoT[0],orthoT[1],orthoT[m*n-2],orthoT[m*n-1],B[0],B[1],B[m*n-2],B[m*n-1]);
594 
595 #ifdef _CP_ORTHO_DUMP_GMAT_
596  dumpMatrix("gmat",(double *)B, m, n, numGlobalIter, thisIndex.x * cfg.grainSize, thisIndex.y * cfg.grainSize, 0, false);
597 
598 #endif
599 
600  //CkAbort("HEY I ONLY WANT ONE DYNAMICS STEP");
601 
602 #ifdef _CP_ORTHO_DEBUG_COMPARE_GMAT_
603  if(savedgmat==NULL)
604  { // load it
605  savedgmat= new double[m*n];
606  loadMatrix("gmat",(double *)savedgmat, m, n,numGlobalIter,thisIndex.x*cfg.grainSize,thisIndex.y*cfg.grainSize,0,false);
607  }
608  for(int i=0;i<m*n;i++)
609  {
610  if(fabs(S[i]-savedgmat[i])>0.0001)
611  {
612  fprintf(stderr, "O [%d,%d] %d element ortho %.10g not %.10g\n",thisIndex.x, thisIndex.y,i, S[i], savedgmat[i]);
613  }
614 
615  CkAssert(fabs(S[i]-savedgmat[i])<0.0001);
616  CkAssert(fabs(S[i]-savedgmat[i])<0.0001);
617  }
618 #endif
619  if(config.PCCollectTiles)
620  {
621  // if not streaming we might as well just lockstep these
622  asymmSectionMgr.sendResults(m*n, B, ortho, thisIndex.x, thisIndex.y, 0, asymmSectionMgr.msgPriority+1);
623  }
624  else // orthoT was already sent ahead for processing
625  {
626  asymmSectionMgr.sendResults(m*n, B, 0, thisIndex.x, thisIndex.y, 0, asymmSectionMgr.msgPriority+1);
627  }
628 
629 //----------------------------------------------------------------------------
630  }// end routine
631 ///////////////////////////////////////////////////////////////////////////////=
632 
633 
634 
635 
636 //////////////////////////////////////////////////////////////////////////////
637 ///New functions necessary for using CLA_Matrix
638 //////////////////////////////////////////////////////////////////////////////
639 //////////////////////////////////////////////////////////////////////////////
640 //////////////////////////////////////////////////////////////////////////////
641 
642 Ortho::Ortho(int _m, int _n, CLA_Matrix_interface _matA1,
647  orthoConfig &_cfg,
648  CkArrayID _step2Helper,
649  int timekeep, CkGroupID _oMCastGID, CkGroupID _oRedGID):
650  cfg(_cfg),
651  oMCastGID(_oMCastGID), oRedGID(_oRedGID),
652  step2Helper(_step2Helper)
653 {
654 
655 /* do basic initialization */
656  parallelStep2=config.useOrthoHelpers;
657 
658  invsqr_max_iter=config.invsqr_max_iter;
659  invsqr_tolerance=config.invsqr_tolerance;
660  if(invsqr_tolerance==0)
661  invsqr_tolerance=INVSQR_TOLERANCE;
662  if(invsqr_max_iter==0)
663  invsqr_max_iter=INVSQR_MAX_ITER;
664  this->matA1 = _matA1; this->matB1 = _matB1; this->matC1 = _matC1;
665  this->matA2 = _matA2; this->matB2 = _matB2; this->matC2 = _matC2;
666  this->matA3 = _matA3; this->matB3 = _matB3; this->matC3 = _matC3;
667  timeKeep=timekeep;
668  int borderOrtho = cfg.numStates / cfg.grainSize - 1;
669  int remOrtho = cfg.numStates%cfg.grainSize;
670  if(thisIndex.x==borderOrtho)
671  this->m = _m + remOrtho;
672  else
673  this->m = _m;
674  if(thisIndex.y==borderOrtho)
675  this->n = _n + remOrtho;
676  else
677  this->n = _n;
678  A = new internalType[this->m * this->n];
679  B = new internalType[this->m * this->n];
680  C = new internalType[this->m * this->n];
681  tmp_arr = new internalType[this->m * this->n];
682  step2done=false;
683  step3done=false;
684  step = 0;
685  lbcaught=0;
686  num_ready = 0;
687  usesAtSync=true;
688  setMigratable(false);
689  got_start = false;
690  toleranceCheckOrthoT=false;
691  ortho=NULL;
692  orthoT=NULL;
693 
694  numGlobalIter = 0;
695 #ifdef _CP_ORTHO_DEBUG_COMPARE_GMAT_
696  savedgmat=NULL;
697 #endif
698 #ifdef _CP_ORTHO_DEBUG_COMPARE_SMAT_
699  savedsmat=NULL;
700 #endif
701 #ifdef _CP_ORTHO_DEBUG_COMPARE_LMAT_
702  savedlmat=NULL;
703 #endif
704 
705 #ifdef _CP_ORTHO_DEBUG_COMPARE_TMAT_
706  savedtmat=NULL;
707 #endif
708 }//end routine
709 
710 
711 
712 
713 /** start step 1 on proxy 1, the callback will be for step 2
714  * S2 = 3 * I - T * S1
715  * currently A has T, B has S1, need to construct 3*I in C
716  */
718  #ifdef VERBOSE_ORTHO
719  CkPrintf("[%d,%d] Ortho::do_iteration \n", thisIndex.x, thisIndex.y);
720  #endif
721  step = 1;
722  memset(C, 0, m * n * sizeof(internalType));
723  if(thisIndex.x == thisIndex.y){
724  for(int i = 0; i < n; i++)
725  C[i * n + i] = 3;
726  }
727 #ifdef _CP_ORTHO_DUMP_SMAT_
728  dumpMatrix("step1:A:",(double *)A, m, n, iterations, thisIndex.x * cfg.grainSize, thisIndex.y * cfg.grainSize, 0, false);
729  dumpMatrix("step1:B:",(double *)B, m, n, iterations, thisIndex.x * cfg.grainSize, thisIndex.y * cfg.grainSize, 0, false);
730  dumpMatrix("step1:C:",(double *)C, m, n, iterations, thisIndex.x * cfg.grainSize, thisIndex.y * cfg.grainSize, 0, false);
731 #endif
732 
733  matA1.multiply(-1, 1, A, Ortho::step_2_cb, (void*) this,
734  thisIndex.x, thisIndex.y);
735  CmiNetworkProgress();
736  matB1.multiply(-1, 1, B, Ortho::step_2_cb, (void*) this,
737  thisIndex.x, thisIndex.y);
738  CmiNetworkProgress();
739  matC1.multiply(-1, 1, C, Ortho::step_2_cb, (void*) this,
740  thisIndex.x, thisIndex.y);
741 }
742 
743 
744 
745 
746 /* S1 = 0.5 * S3 * S2 (on proxy 2)
747  * currently A has T, B has S1, C has S2
748  * Multiply tmp_arr = B*C
749  * tmp_arr not used in step3, therefore no data dependence
750  */
751 void Ortho::step_2(void){
752 
753  step = 2;
754  if(config.useOrthoHelpers)
755  {
756  // Send our data to the helper and await results which will arrive in recvStep2
757  OrthoHelperMsg *omsg= new (m*n, m*n, 0) OrthoHelperMsg;
758  omsg->init(m*n, B,C,0.5, 0.5, 0.5);
759  step2Helper(thisIndex.x,thisIndex.y).recvAB(omsg);
760  step_3();
761  }
762  else
763  {
764 #ifdef _CP_ORTHO_DUMP_SMAT_
765  dumpMatrix("step2:A:",(double *)B, m, n, iterations, thisIndex.x * cfg.grainSize, thisIndex.y * cfg.grainSize, 0, false);
766  dumpMatrix("step2:B:",(double *)C, m, n, iterations, thisIndex.x * cfg.grainSize, thisIndex.y * cfg.grainSize, 0, false);
767  // dumpMatrix("step2:C:",(double *)tmp_arr, m, n, iterations, thisIndex.x * cfg.grainSize, thisIndex.y * cfg.grainSize, 0, false);
768 #endif
769 
770  matA2.multiply(0.5, 0, B, Ortho::step_3_cb, (void*) this,
771  thisIndex.x, thisIndex.y);
772  matB2.multiply(0.5, 0, C, Ortho::step_3_cb, (void*) this,
773  thisIndex.x, thisIndex.y);
774  matC2.multiply(0.5, 0, tmp_arr, Ortho::step_3_cb, (void*) this,
775  thisIndex.x, thisIndex.y);
776  }
777 }
778 
779 
780 
781 /**
782  * T = 0.5 * S2 * S3 (do S3 = T before) (on proxy 3)
783  * currently A has T, B has S1 (old), C has S2, tmp_arr has new S1
784  */
786  step = 3;
787  CmiMemcpy(B, A, m * n * sizeof(internalType));
788 #ifdef _CP_ORTHO_DUMP_SMAT_
789  dumpMatrix("step3:A:",(double *)C, m, n, iterations, thisIndex.x * cfg.grainSize, thisIndex.y * cfg.grainSize, 0, false);
790  dumpMatrix("step3:B:",(double *)B, m, n, iterations, thisIndex.x * cfg.grainSize, thisIndex.y * cfg.grainSize, 0, false);
791  dumpMatrix("step3:C:",(double *)A, m, n, iterations, thisIndex.x * cfg.grainSize, thisIndex.y * cfg.grainSize, 0, false);
792 #endif
793 
794  matA3.multiply(0.5, 0, C, Ortho::tol_cb, (void*) this,
795  thisIndex.x, thisIndex.y);
796  matB3.multiply(0.5, 0, B, Ortho::tol_cb, (void*) this,
797  thisIndex.x, thisIndex.y);
798  matC3.multiply(0.5, 0, A, Ortho::tol_cb, (void*) this,
799  thisIndex.x, thisIndex.y);
800 }
801 //////////////////////////////////////////////////////////////////////////////
802 
803 
804 /** calculate error and reset pointers (for step 1)
805  * current: T -> A, S1 -> tmp_arr, S2 -> C, S3 -> B
806  */
808  step = 4;
809  step2done=false;
810  step3done=false;
811 
812 #ifdef _NAN_CHECK_
813  for(int i = 0; i < m * n; i++)
814  {
815  CkAssert( isfinite(A[i]) );
816  CkAssert( isfinite(B[i]) );
817  }
818 #endif
819 
820  internalType ret = 0;
821  for(int i = 0; i < m * n; i++){
822  internalType tmp = B[i] - A[i];
823  ret += tmp * tmp;
824  }
825  internalType *tmp = B;
826  B = tmp_arr;
827  tmp_arr = tmp;
828  if(config.useOrthoSectionRed)
829  {
830  CkCallback mycb(CkIndex_Ortho::collect_error(NULL), thisProxy(0, 0));
831  CkMulticastMgr *mcastGrp = CProxy_CkMulticastMgr(symmSectionMgr.orthomCastGrpID).ckLocalBranch();
832  mcastGrp->contribute(sizeof(internalType), &ret, CkReduction::sum_double, orthoCookie, mycb);
833  }
834  else
835  contribute(sizeof(internalType), &ret, CkReduction::sum_double);
836  iterations++;
837 }
838 
839 
840 
841 
842 void Ortho::pup(PUP::er &p){
843 /// CBase_Ortho::pup(p);
844  ArrayElement2D::pup(p);
845  p | m;
846  p | n;
847  p | step;
848  p | iterations;
849  p | num_ready;
850  p | got_start;
851  p | multiproxy;
852  p | numGlobalIter;
853  p | lbcaught;
854  p | orthoCookie;
855  p | toleranceCheckOrthoT;
856  p | step2done;
857  p | step3done;
858  if(p.isUnpacking() && thisIndex.x==0 &&thisIndex.y==0)
859  {
860  ortho = new internalType[cfg.numStates * cfg.numStates];
861  orthoT = new internalType[cfg.numStates * cfg.numStates];
862  }
863  if(thisIndex.x==0 && thisIndex.y==0)
864  {
865  PUParray(p,ortho,cfg.numStates*cfg.numStates);
866  PUParray(p,orthoT,cfg.numStates*cfg.numStates);
867  }
868  if(p.isUnpacking()){
869  A = new internalType[m * n];
870  B = new internalType[m * n];
871  C = new internalType[m * n];
872  tmp_arr = new internalType[m * n];
873  }
874  PUParray(p,A, m * n);
875  PUParray(p,B, m * n);
876  PUParray(p,C, m * n);
877  PUParray(p,tmp_arr, m * n);
878  }
879 
880 #include "orthoMap.h"
881 #include "ortho.def.h"
882 
883 /*@}*/
CkIndex2D computePCStateIndices(const int orthoX, const int orthoY)
Identify the state indices of the Paircalc chares this ortho chare needs to talk to.
void do_iteration(void)
Triggers the matrix multiplies in step 1 of an ortho iteration.
Definition: ortho.C:717
int grainSize
The block size for parallelization.
Definition: orthoConfig.h:31
PCSectionManager asymmSectionMgr
Section of asymmetric PC chare array used by an Ortho chare.
Definition: ortho.h:214
virtual void pup(PUP::er &p)
pack/unpack method
Definition: ortho.C:842
int numStates
The number of states in the simulation (the dimension of the input square matrix) ...
Definition: orthoConfig.h:29
CProxy_OrthoHelper step2Helper
The proxy of the step 2 helper chare array.
Definition: ortho.h:218
PCSectionManager symmSectionMgr
Section of symmetric PC chare array used by an Ortho chare.
Definition: ortho.h:212
static void step_2_cb(void *obj)
Static methods used as callbacks. Could be replaced by CkCallbacks.
Definition: ortho.h:178
double maxTolerance
The tolerance threshold for the S->T iterations in Ortho at which to trigger a PsiV update...
Definition: orthoConfig.h:36
Ortho()
Default empty constructor. For?
Definition: ortho.h:109
void resumeV(CkReductionMsg *msg)
Once all GSpaceDriver chares are notified, they resume Ortho execution via a redn broadcast to this m...
Definition: ortho.C:425
void step_2()
Triggers step 2, and optionally step 3 (if ortho helpers are being used)
Definition: ortho.C:751
Configuration settings for the ortho world.
Definition: orthoConfig.h:17
void collect_results(void)
Computes walltimes, prints simulation status msgs and calls resume() if more openatom iterations are ...
Definition: ortho.C:261
void print_results(void)
Dumps the T matrix to an appropriately named file.
Definition: ortho.h:262
void gamma_done()
Used in dynamics, to accept computed gamma and send it to the asymm PC instance. Also sends T if it h...
Definition: ortho.C:589
void step_3()
Triggers step 3 in the S->T process.
Definition: ortho.C:785
void sendMatrix(int n, internalType *ptr1, internalType *ptr2, int orthoX, int orthoY, int actionType, int priority)
Used to send OrthoT to the asymm instance. Replaces sendMatrix()
void sendOrthoTtoAsymm()
Used in dynamics, to send the results of S->T to the asymm paircalc instance which will use them...
Definition: ortho.C:382
bool isDynamics
Is this a minimization or dynamics run.
Definition: orthoConfig.h:21
void collect_error(CkReductionMsg *msg)
Computes the RMS error and either launches the next ortho iteration (if needed) or calls collect_resu...
Definition: ortho.C:96
void tolerance_check()
Computes square of the residuals and contributes to a reduction rooted at Ortho(0,0)::collect_error()
Definition: ortho.C:807
#define INVSQR_TOLERANCE
<
Definition: ortho.h:85
Made by Eric Bohm Login bohm@localhost.localdomain
CkGroupID orthomCastGrpID
The multicast and reduction groups that handle comm.
void maxCheck(CkReductionMsg *msg)
Called on ortho(0,0). Checks if PsiV update is needed based on the max deviation in S received via a ...
Definition: ortho.C:395
void sendResults(int n, internalType *ptr1, internalType *ptr2, int orthoX, int orthoY, int actionType, int priority)
Sends out the results to the paircalc section. Replaces finishPairCalcSection()
void makeSections(const pc::pcConfig &cfgSymmPC, const pc::pcConfig &cfgAsymmPC, CkArrayID symAID, CkArrayID asymAID)
Trigger the creation of appropriate sections of paircalcs to talk to. Also setup internal comm sectio...
Definition: ortho.C:526
CkGroupID oMCastGID
Group IDs for the multicast manager groups.
Definition: ortho.h:216
double array_diag_max(int sizem, int sizen, internalType *array)
S should be equal to 2I. This returns max value of deviation from that in this ortho's portion of the...
Definition: ortho.h:279
CkCallback uponToleranceFailure
Callback to notify bubble owner (GSpace) that a tolerance update is needed.
Definition: orthoConfig.h:38
void acceptSectionLambda(CkReductionMsg *msg)
Definition: ortho.C:438
Dumb structure that holds all the configuration inputs required for paircalc instantiation, functioning and interaction.
Definition: pcConfig.h:23
void resume()
Sends results (T matrix) to the symm PCs (and also the asymms if this is dynamics) ...
Definition: ortho.C:290
void start_calc(CkReductionMsg *msg)
Symmetric PCs contribute data that is summed via this reduction to deposit a portion of the S matrix ...
Definition: ortho.C:140
void init(const CkIndex2D orthoIdx, const pc::pcConfig &pcCfg, CkArrayID pcAID, CkGroupID oMCastGID, CkGroupID oRedGID)
An initializer method that fills this with data.
int msgPriority
The priority to use for messages to PC.
void setupArraySection(CkCallback cb, bool arePhantomsOn, bool useComlibForOrthoToPC)
Creates a paircalc array section given the necessary info. Replaces initOneRedSect() ...
Ortho is decomposed by orthoGrainSize.