00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042 #include "unistd.h"
00043
00044 #include "charm++.h"
00045 #include "ck.h"
00046 #include "register.h"
00047 #include "conv-ccs.h"
00048 #include <signal.h>
00049
/// printf-style sink that accepts a format string plus arbitrary arguments
/// and does nothing with them. DEBUGF is mapped onto this function, so debug
/// trace calls in this file produce no output.
void noopck(const char*, ...)
{}
00052
00053
00054
// Debug tracing is routed to the no-op noopck().
#define DEBUGF noopck

// When nonzero, BuddyPE() and isMaster() choose buddies across physical
// nodes rather than among neighbouring PEs.
#define NODE_CHECKPOINT 0

// CK_NO_PROC_POOL is 0 on the MPI Converse layer and 1 otherwise. Where it is
// 1, restart() records the dead PE in the failed list and recoverBuddies()
// picks a fresh buddy instead of reusing the failed PE number.
#if CMK_CONVERSE_MPI
#define CK_NO_PROC_POOL 0
#else
#define CK_NO_PROC_POOL 1
#endif

// When nonzero, recoverArrayElements() collects home-PE location updates and
// ships them in batches through updateLocations().
#define STREAMING_INFORMHOME 1
// Per-PE id of the crashed node; init_memcheckpt() sets it to -1 and
// CkInRestarting() treats any other value as "restart in progress".
CpvDeclare(int, _crashedNode);

// Definitions of CkMemCheckPT's static members.
int CkMemCheckPT::inRestarting = 0;        // nonzero while a restart is running
double CkMemCheckPT::startTime;            // start of the phase currently being timed
char *CkMemCheckPT::stage;                 // name of the current restart phase (for log lines)
CkCallback CkMemCheckPT::cpCallback;       // callback fired by cpFinish() and finishUp()

// Cleared (set to 0) when there are too few PEs/nodes to keep a buddy copy.
int _memChkptOn = 1;

// Group id of the CkMemCheckPT group; assigned in CkMemCheckPTInit.
CkGroupID ckCheckPTGroupID;

// Set to 1 by doItNow() and reset to 0 by restartBcastHandler().
static int checkpointed = 0;
00083
00085
00086
#ifdef CMK_MEM_CHECKPOINT
// Fault-injection settings. They are not referenced in the visible part of
// this file; presumably consumed elsewhere — TODO confirm.
char *killFile;
int killFlag=0;
double killTime=0.0;
#endif

// Per-PE slot holding the most recent processor-level checkpoint message
// stored by recvProcData(); askProcDataHandler() sends it away and sets the
// slot back to NULL.
CpvDeclare(CkProcCheckPTMessage*, procChkptBuf);
00098
00099
00100
00101 inline int ChkptOnPe(int pe) { return (pe+CmiMyNodeSize())%CkNumPes(); }
00102
00103 inline int CkMemCheckPT::BuddyPE(int pe)
00104 {
00105 int budpe;
00106 #if NODE_CHECKPOINT
00107
00108 int r1 = CmiPhysicalRank(pe);
00109 int budnode = CmiPhysicalNodeID(pe);
00110 do {
00111 budnode = (budnode+1)%CmiNumPhysicalNodes();
00112 int *pelist;
00113 int num;
00114 CmiGetPesOnPhysicalNode(budnode, &pelist, &num);
00115 budpe = pelist[r1 % num];
00116 } while (isFailed(budpe));
00117 if (budpe == pe) {
00118 CmiPrintf("[%d] Error: failed to find a buddy processor on a different node.\n", pe);
00119 CmiAbort("Failed to find a buddy processor");
00120 }
00121 #else
00122 budpe = pe;
00123 while (budpe == pe || isFailed(budpe))
00124 budpe = (budpe+1)%CkNumPes();
00125 #endif
00126 return budpe;
00127 }
00128
00129
00130
#if CMK_MEM_CHECKPOINT
/// Register this array element with the checkpoint managers.
///
/// Does nothing when in-memory checkpointing is switched off, or when this
/// element's array is not the first manager of its location manager.
/// Otherwise records the two PEs that keep the element's checkpoint
/// (this PE and its buddy) in budPEs and asks the CkMemCheckPT branch on
/// each of them to create a table entry that names the other as its buddy.
void ArrayElement::init_checkpt() {
  if (_memChkptOn == 0) return;
  if (CkInRestarting()) {
    // The format has a %d conversion, so the PE number must be supplied.
    CkPrintf("[%d] Warning: init_checkpt called during restart, possible bug in migration constructor!\n", CkMyPe());
  }

  // only the location manager's first array manager registers entries
  if (thisArray->getLocMgr()->firstManager->mgr!=thisArray) return;

  budPEs[0] = CkMyPe();
  budPEs[1] = CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->BuddyPE(CkMyPe());
  CmiAssert(budPEs[0] != budPEs[1]);

  CProxy_CkMemCheckPT checkptMgr(ckCheckPTGroupID);

  // each side gets an entry whose buddy field points at the other side
  checkptMgr[budPEs[0]].createEntry(thisArrayID, thisArray->getLocMgr()->getGroupID(), thisIndexMax, budPEs[1]);
  checkptMgr[budPEs[1]].createEntry(thisArrayID, thisArray->getLocMgr()->getGroupID(), thisIndexMax, budPEs[0]);
}
#endif
00150
00151
00152 void ArrayElement::inmem_checkpoint(CkArrayCheckPTReqMessage *m) {
00153 #if CMK_MEM_CHECKPOINT
00154
00155
00156
00157 CkLocMgr *locMgr = thisArray->getLocMgr();
00158 CmiAssert(myRec!=NULL);
00159 int size;
00160 {
00161 PUP::sizer p;
00162 locMgr->pupElementsFor (p, myRec, CkElementCreation_migrate);
00163 size = p.size();
00164 }
00165 int packSize = size/sizeof(double) +1;
00166 CkArrayCheckPTMessage *msg =
00167 new (packSize, 0) CkArrayCheckPTMessage;
00168 msg->len = size;
00169 msg->index =thisIndexMax;
00170 msg->aid = thisArrayID;
00171 msg->locMgr = locMgr->getGroupID();
00172 msg->cp_flag = 1;
00173 {
00174 PUP::toMem p(msg->packData);
00175 locMgr->pupElementsFor (p, myRec, CkElementCreation_migrate);
00176 }
00177
00178 CProxy_CkMemCheckPT checkptMgr(ckCheckPTGroupID);
00179 checkptMgr.recvData(msg, 2, budPEs);
00180 delete m;
00181 #endif
00182 }
00183
00184
00185 class CkMemCheckPTInfo: public CkCheckPTInfo
00186 {
00187 CkArrayCheckPTMessage *ckBuffer;
00188 public:
00189 CkMemCheckPTInfo(CkArrayID a, CkGroupID loc, CkArrayIndex idx, int pno):
00190 CkCheckPTInfo(a, loc, idx, pno)
00191 {
00192 ckBuffer = NULL;
00193 }
00194 ~CkMemCheckPTInfo()
00195 {
00196 if (ckBuffer) delete ckBuffer;
00197 }
00198 inline void updateBuffer(CkArrayCheckPTMessage *data)
00199 {
00200 CmiAssert(data!=NULL);
00201 if (ckBuffer) delete ckBuffer;
00202 ckBuffer = data;
00203 }
00204 inline CkArrayCheckPTMessage * getCopy()
00205 {
00206 if (ckBuffer == NULL) {
00207 CmiPrintf("[%d] recoverArrayElements: element does not have checkpoint data.", CkMyPe());
00208 CmiAbort("Abort!");
00209 }
00210 return (CkArrayCheckPTMessage *)CkCopyMsg((void **)&ckBuffer);
00211 }
00212 inline void updateBuddy(int b1, int b2) {
00213 CmiAssert(ckBuffer);
00214 ckBuffer->bud1 = b1; ckBuffer->bud2 = b2;
00215 pNo = b1; if (pNo == CkMyPe()) pNo = b2;
00216 CmiAssert(pNo != CkMyPe());
00217 }
00218 inline int getSize() {
00219 CmiAssert(ckBuffer);
00220 return ckBuffer->len;
00221 }
00222 };
00223
00224
00225 class CkDiskCheckPTInfo: public CkCheckPTInfo
00226 {
00227 char *fname;
00228 int bud1, bud2;
00229 int len;
00230 public:
00231 CkDiskCheckPTInfo(CkArrayID a, CkGroupID loc, CkArrayIndex idx, int pno, int myidx): CkCheckPTInfo(a, loc, idx, pno)
00232 {
00233 #if CMK_USE_MKSTEMP
00234 fname = new char[64];
00235 sprintf(fname, "/tmp/ckpt%d-%d-XXXXXX", CkMyPe(), myidx);
00236 mkstemp(fname);
00237 #else
00238 fname=tmpnam(NULL);
00239 #endif
00240 bud1 = bud2 = -1;
00241 len = 0;
00242 }
00243 ~CkDiskCheckPTInfo()
00244 {
00245 remove(fname);
00246 }
00247 inline void updateBuffer(CkArrayCheckPTMessage *data)
00248 {
00249 double t = CmiWallTimer();
00250
00251 envelope *env = UsrToEnv(data);
00252 CkUnpackMessage(&env);
00253 data = (CkArrayCheckPTMessage *)EnvToUsr(env);
00254 FILE *f = fopen(fname,"wb");
00255 PUP::toDisk p(f);
00256 CkPupMessage(p, (void **)&data);
00257
00258
00259 fclose(f);
00260 bud1 = data->bud1;
00261 bud2 = data->bud2;
00262 len = data->len;
00263 delete data;
00264
00265 }
00266 inline CkArrayCheckPTMessage * getCopy()
00267 {
00268 CkArrayCheckPTMessage *data;
00269 FILE *f = fopen(fname,"rb");
00270 PUP::fromDisk p(f);
00271 CkPupMessage(p, (void **)&data);
00272 fclose(f);
00273 data->bud1 = bud1;
00274 data->bud2 = bud2;
00275 return data;
00276 }
00277 inline void updateBuddy(int b1, int b2) {
00278 bud1 = b1; bud2 = b2;
00279 pNo = b1; if (pNo == CkMyPe()) pNo = b2;
00280 CmiAssert(pNo != CkMyPe());
00281 }
00282 inline int getSize() {
00283 return len;
00284 }
00285 };
00286
00287 CkMemCheckPT::CkMemCheckPT(int w)
00288 {
00289 int numnodes = 0;
00290 #if NODE_CHECKPOINT
00291 numnodes = CmiNumPhysicalNodes();
00292 #else
00293 numnodes = CkNumPes();
00294 #endif
00295 #if CK_NO_PROC_POOL
00296 if (numnodes <= 2)
00297 #else
00298 if (numnodes == 1)
00299 #endif
00300 {
00301 if (CkMyPe() == 0) CkPrintf("Warning: CkMemCheckPT is disabled due to too few nodes.\n");
00302 _memChkptOn = 0;
00303 }
00304 inRestarting = 0;
00305 recvCount = peCount = 0;
00306 ackCount = 0;
00307 expectCount = -1;
00308 where = w;
00309
00310 #if CMK_CONVERSE_MPI
00311 void pingBuddy();
00312 void pingCheckHandler();
00313 CcdCallOnCondition(CcdPERIODIC_100ms,(CcdVoidFn)pingBuddy,NULL);
00314 CcdCallOnCondition(CcdPERIODIC_5s,(CcdVoidFn)pingCheckHandler,NULL);
00315 #endif
00316 }
00317
00318 CkMemCheckPT::~CkMemCheckPT()
00319 {
00320 int len = ckTable.length();
00321 for (int i=0; i<len; i++) {
00322 delete ckTable[i];
00323 }
00324 }
00325
/// Serialise / restore the group's state.
///
/// The order of the `p|` operations defines the packed layout, so it must
/// stay the same for packing and unpacking. ckTable, recvCount, ackCount and
/// expectCount are not pupped; on unpacking only recvCount is reset here.
void CkMemCheckPT::pup(PUP::er& p)
{
  CBase_CkMemCheckPT::pup(p);
  p|cpStarter;
  p|thisFailedPe;
  p|failedPes;
  p|ckCheckPTGroupID;     // global, restored together with the group
  p|cpCallback;           // static member
  p|where;
  p|peCount;
  if (p.isUnpacking()) {
    recvCount = 0;
#if CMK_CONVERSE_MPI
    // re-register the periodic ping callbacks, as the constructor does
    void pingBuddy();
    void pingCheckHandler();
    CcdCallOnCondition(CcdPERIODIC_100ms,(CcdVoidFn)pingBuddy,NULL);
    CcdCallOnCondition(CcdPERIODIC_5s,(CcdVoidFn)pingCheckHandler,NULL);
#endif
  }
}
00346
00347
00348 void CkMemCheckPT::inmem_restore(CkArrayCheckPTMessage *m)
00349 {
00350 #if CMK_MEM_CHECKPOINT
00351 DEBUGF("[%d] inmem_restore restore: mgr: %d \n", CmiMyPe(), m->locMgr);
00352
00353 PUP::fromMem p(m->packData);
00354 CkLocMgr *mgr = CProxy_CkLocMgr(m->locMgr).ckLocalBranch();
00355 CmiAssert(mgr);
00356 #if STREAMING_INFORMHOME
00357 mgr->resume(m->index, p, CmiFalse);
00358 #else
00359 mgr->resume(m->index, p, CmiTrue);
00360 #endif
00361
00362
00363 ArrayElement *elt = (ArrayElement *)mgr->lookup(m->index, m->aid);
00364 CmiAssert(elt);
00365 CkLocRec_local *rec = elt->myRec;
00366 CkVec<CkMigratable *> list;
00367 mgr->migratableList(rec, list);
00368 CmiAssert(list.length() > 0);
00369 for (int l=0; l<list.length(); l++) {
00370 elt = (ArrayElement *)list[l];
00371 elt->budPEs[0] = m->bud1;
00372 elt->budPEs[1] = m->bud2;
00373
00374
00375 for (int i=0; i<CK_ARRAYLISTENER_MAXLEN; i++) {
00376 contributorInfo *c=(contributorInfo *)&elt->listenerData[i];
00377 if (c) c->redNo = 0;
00378 }
00379 }
00380 #endif
00381 }
00382
00383
00384 int CkMemCheckPT::isFailed(int pe)
00385 {
00386 for (int i=0; i<failedPes.length(); i++)
00387 if (failedPes[i] == pe) return 1;
00388 return 0;
00389 }
00390
00391
00392 void CkMemCheckPT::failed(int pe)
00393 {
00394 if (isFailed(pe)) return;
00395 failedPes.push_back(pe);
00396 }
00397
00398 int CkMemCheckPT::totalFailed()
00399 {
00400 return failedPes.length();
00401 }
00402
00403
00404 void CkMemCheckPT::createEntry(CkArrayID aid, CkGroupID loc, CkArrayIndex index, int buddy)
00405 {
00406
00407 int idx, len = ckTable.size();
00408 for (idx=0; idx<len; idx++) {
00409 CkCheckPTInfo *entry = ckTable[idx];
00410 if (index == entry->index) {
00411 if (loc == entry->locMgr) {
00412
00413 return;
00414 }
00415
00416
00417 if (aid == entry->aid) {
00418 CkPrintf("[%d] CkMemCheckPT::createEntry a duplciated entry for arrayID %d:", CkMyPe(), ((CkGroupID)aid).idx); index.print(); CkPrintf("\n");
00419 CmiAbort("CkMemCheckPT::createEntry a duplciated entry");
00420 }
00421 }
00422 }
00423 CkCheckPTInfo *newEntry;
00424 if (where == CkCheckPoint_inMEM)
00425 newEntry = new CkMemCheckPTInfo(aid, loc, index, buddy);
00426 else
00427 newEntry = new CkDiskCheckPTInfo(aid, loc, index, buddy, len+1);
00428 ckTable.push_back(newEntry);
00429
00430 }
00431
00432 void CkMemCheckPT::recoverEntry(CkArrayCheckPTMessage *msg)
00433 {
00434 int buddy = msg->bud1;
00435 if (buddy == CkMyPe()) buddy = msg->bud2;
00436 createEntry(msg->aid, msg->locMgr, msg->index, buddy);
00437 recvData(msg);
00438
00439 thisProxy[buddy].gotData();
00440 }
00441
00442
00443
00444 void CkMemCheckPT::doItNow(int starter, CkCallback &cb)
00445 {
00446 checkpointed = 1;
00447 cpCallback = cb;
00448 cpStarter = starter;
00449 if (CkMyPe() == cpStarter) {
00450 startTime = CmiWallTimer();
00451 CkPrintf("[%d] Start checkpointing starter: %d... \n", CkMyPe(), cpStarter);
00452 }
00453
00454 int len = ckTable.length();
00455 for (int i=0; i<len; i++) {
00456 CkCheckPTInfo *entry = ckTable[i];
00457
00458
00459
00460 if (!isMaster(entry->pNo)) continue;
00461
00462
00463 CkArrayCheckPTReqMessage *msg = new CkArrayCheckPTReqMessage;
00464 CkSendMsgArray(CkIndex_ArrayElement::inmem_checkpoint(NULL),(CkArrayMessage *)msg,entry->aid,entry->index);
00465 }
00466
00467 if (len == 0) contribute(CkCallback(CkReductionTarget(CkMemCheckPT, cpFinish), thisProxy[cpStarter]));
00468
00469
00470 sendProcData();
00471 }
00472
00473
/// Pup the processor-level state through `p`. The same routine is used for
/// sizing, packing (sendProcData) and unpacking (recoverProcDataHandler), so
/// the order of the calls below is the on-wire layout and must not change.
static inline void _handleProcData(PUP::er &p)
{
    // read-only data
    CkPupROData(p);

    // main chares, on PE 0 only
    if(CkMyPe()==0) CkPupMainChareData(p, (CkArgMsg*)NULL);

#ifndef CMK_CHARE_USE_PTR
    // singleton chares, unless chares are addressed by pointer
    CkPupChareData(p);
#endif

    // groups
    CkPupGroupData(p);

    // node groups, on rank 0 of each node only
    if(CkMyRank()==0) CkPupNodeGroupData(p);
}
00493
00494 void CkMemCheckPT::sendProcData()
00495 {
00496
00497 int size;
00498 {
00499 PUP::sizer p;
00500 _handleProcData(p);
00501 size = p.size();
00502 }
00503 int packSize = size;
00504 CkProcCheckPTMessage *msg = new (packSize, 0) CkProcCheckPTMessage;
00505 DEBUGF("[%d] CkMemCheckPT::sendProcData - size: %d to %d\n", CkMyPe(), size, ChkptOnPe(CkMyPe()));
00506 {
00507 PUP::toMem p(msg->packData);
00508 _handleProcData(p);
00509 }
00510 msg->pe = CkMyPe();
00511 msg->len = size;
00512 msg->reportPe = cpStarter;
00513 thisProxy[ChkptOnPe(CkMyPe())].recvProcData(msg);
00514 }
00515
00516 void CkMemCheckPT::recvProcData(CkProcCheckPTMessage *msg)
00517 {
00518 if (CpvAccess(procChkptBuf)) delete CpvAccess(procChkptBuf);
00519 CpvAccess(procChkptBuf) = msg;
00520 DEBUGF("[%d] CkMemCheckPT::recvProcData report to %d\n", CkMyPe(), msg->reportPe);
00521 contribute(CkCallback(CkReductionTarget(CkMemCheckPT, cpFinish), thisProxy[msg->reportPe]));
00522 }
00523
00524
00525 void CkMemCheckPT::recvData(CkArrayCheckPTMessage *msg)
00526 {
00527 int len = ckTable.length();
00528 int idx;
00529 for (idx=0; idx<len; idx++) {
00530 CkCheckPTInfo *entry = ckTable[idx];
00531 if (msg->locMgr == entry->locMgr && msg->index == entry->index) break;
00532 }
00533 CkAssert(idx < len);
00534 int isChkpting = msg->cp_flag;
00535 ckTable[idx]->updateBuffer(msg);
00536 if (isChkpting) {
00537
00538
00539 recvCount ++;
00540 if (recvCount == ckTable.length()) {
00541 if (where == CkCheckPoint_inMEM) {
00542 contribute(CkCallback(CkReductionTarget(CkMemCheckPT, cpFinish), thisProxy[cpStarter]));
00543 }
00544 else if (where == CkCheckPoint_inDISK) {
00545
00546 CkCallback localcb(CkIndex_CkMemCheckPT::syncFiles(NULL),thisgroup);
00547 contribute(0,NULL,CkReduction::sum_int,localcb);
00548 }
00549 else
00550 CmiAbort("Unknown checkpoint scheme");
00551 recvCount = 0;
00552 }
00553 }
00554 }
00555
00556
/// Reduction target used in disk mode: runs the `sync` command (when the
/// platform has it and it is not disabled) and then contributes to the
/// cpFinish reduction on the starter PE. The reduction message is freed.
void CkMemCheckPT::syncFiles(CkReductionMsg *m)
{
  delete m;
#if CMK_HAS_SYNC && ! CMK_DISABLE_SYNC
  system("sync");
#endif
  contribute(CkCallback(CkReductionTarget(CkMemCheckPT, cpFinish), thisProxy[cpStarter]));
}
00565
00566
00567 void CkMemCheckPT::cpFinish()
00568 {
00569 CmiAssert(CkMyPe() == cpStarter);
00570 peCount++;
00571
00572 if (peCount == 2) {
00573 CmiPrintf("[%d] Checkpoint finished in %f seconds, sending callback ... \n", CkMyPe(), CmiWallTimer()-startTime);
00574 cpCallback.send();
00575 peCount = 0;
00576 thisProxy.report();
00577 }
00578 }
00579
00580
00581 void CkMemCheckPT::report()
00582 {
00583 int objsize = 0;
00584 int len = ckTable.length();
00585 for (int i=0; i<len; i++) {
00586 CkCheckPTInfo *entry = ckTable[i];
00587 CmiAssert(entry);
00588 objsize += entry->getSize();
00589 }
00590 CmiAssert(CpvAccess(procChkptBuf));
00591 CkPrintf("[%d] Checkpoint object size: %d len: %d Processor data: %d \n", CkMyPe(), objsize, len, CpvAccess(procChkptBuf)->len);
00592 }
00593
00594
00595
00596
00597
00598
/// Decide whether this PE is the "master" of the pair (this PE, buddype).
/// doItNow() and recoverArrayElements() act only on entries for which this
/// returns nonzero, so exactly one side of a buddy pair is meant to act.
///
/// Active (#else) variant: when only two PEs are alive the lower-numbered
/// PE is master. Otherwise walk forward from this PE (one PE at a time, or
/// one node's worth of PEs with NODE_CHECKPOINT), skip failed PEs, and
/// return 1 only if the first live PE reached is `buddype`.
inline int CkMemCheckPT::isMaster(int buddype)
{
#if 0
  // disabled older variant: walks forward from the buddy instead
  int mype = CkMyPe();

  if (CkNumPes() - totalFailed() == 2) {
    return mype > buddype;
  }
  for (int i=1; i<CkNumPes(); i++) {
    int me = (buddype+i)%CkNumPes();
    if (isFailed(me)) continue;
    if (me == mype) return 1;
    else return 0;
  }
  return 0;
#else

  int mype = CkMyPe();

  // special case: with two survivors each is the other's next live PE
  if (CkNumPes() - totalFailed() == 2) {
    return mype < buddype;
  }
  // note: the loop header is selected by the preprocessor, the body is shared
#if NODE_CHECKPOINT
  int pe_per_node = CmiNumPesOnPhysicalNode(CmiPhysicalNodeID(mype));
  for (int i=pe_per_node; i<CkNumPes(); i+=pe_per_node) {
#else
  for (int i=1; i<CkNumPes(); i++) {
#endif
    int me = (mype+i)%CkNumPes();
    if (isFailed(me)) continue;
    // the first live PE decides the answer
    if (me == buddype) return 1;
    else return 0;
  }
  return 0;
#endif
}
00635
// Drop any earlier definition before installing this file's version.
#ifdef CKLOCMGR_LOOP
#undef CKLOCMGR_LOOP
#endif

// Run `code` once for every group on this PE that is a location manager.
// Inside `code` the manager is available as `CkLocMgr *mgr`; the macro also
// introduces the names numGroups, i and obj in its own scope.
// NOTE(review): obj is dereferenced without a NULL check — confirm that
// every id in _groupIDTable has a local object at the call sites.
#define  CKLOCMGR_LOOP(code)    {       \
  int numGroups = CkpvAccess(_groupIDTable)->size();    \
  for(int i=0;i<numGroups;i++) {        \
    IrrGroup *obj = CkpvAccess(_groupTable)->find((*CkpvAccess(_groupIDTable))[i]).getObj();    \
    if(obj->isLocMgr())  {      \
      CkLocMgr *mgr = (CkLocMgr*)obj;   \
      code      \
    }   \
  }     \
 }
00651
#if 0
// Disabled code: location iterator that prints and destroys every location
// it visits. Kept for reference only; nothing below is compiled.
class ElementDestoryer : public CkLocIterator {
private:
        CkLocMgr *locMgr;
public:
        ElementDestoryer(CkLocMgr* mgr_):locMgr(mgr_){};
        void addLocation(CkLocation &loc) {
                CkArrayIndex idx=loc.getIndex();
                CkPrintf("[%d] destroy: ", CkMyPe()); idx.print();
                loc.destroy();
        }
};
#endif
00666
00667
00668 void CkMemCheckPT::resetLB(int diepe)
00669 {
00670 #if CMK_LBDB_ON
00671 int i;
00672 char *bitmap = new char[CkNumPes()];
00673
00674 get_avail_vector(bitmap);
00675
00676 for (i=0; i<failedPes.length(); i++)
00677 bitmap[failedPes[i]] = 0;
00678 bitmap[diepe] = 0;
00679
00680 #if CK_NO_PROC_POOL
00681 set_avail_vector(bitmap);
00682 #endif
00683
00684
00685 if (CkMyNode() == diepe)
00686 for (i=0; i<CkNumPes(); i++)
00687 if (bitmap[i]==0) failed(i);
00688
00689 delete [] bitmap;
00690 #endif
00691 }
00692
00693
00694
00695
00696
00697
/// First phase of recovery, run on every PE after PE `diePe` has been
/// restarted. Records the failed PE, marks the system as restarting, updates
/// load-balancer availability on the surviving PEs, puts every location
/// manager into inserting mode and then reports to PE 0, which starts the
/// removeArrayElements phase once all PEs have reported.
void CkMemCheckPT::restart(int diePe)
{
#if CMK_MEM_CHECKPOINT
  double curTime = CmiWallTimer();
  if (CkMyPe() == diePe)
    CkPrintf("[%d] Process data restored in %f seconds\n", CkMyPe(), curTime - startTime);
  stage = (char*)"resetLB";
  startTime = curTime;
  if (CkMyPe() == diePe)
    CkPrintf("[%d] CkMemCheckPT ----- restart.\n",CkMyPe());

#if CK_NO_PROC_POOL
  // without a processor pool the dead PE number is retired
  failed(diePe);
#endif
  thisFailedPe = diePe;

  // the restarted PE starts with an empty checkpoint table
  if (CkMyPe() == diePe) CmiAssert(ckTable.length() == 0);

  inRestarting = 1;

  // the restarted PE already called resetLB() in recoverProcDataHandler()
  if (CkMyPe() != diePe) resetLB(diePe);

  CKLOCMGR_LOOP(mgr->startInserting(););

  // barrier through PE 0; the callback is broadcast to the whole group
  thisProxy[0].quiescence(CkCallback(CkIndex_CkMemCheckPT::removeArrayElements(), thisProxy));

#endif
}
00730
00731
00732 void CkMemCheckPT::removeArrayElements()
00733 {
00734 #if CMK_MEM_CHECKPOINT
00735 int len = ckTable.length();
00736 double curTime = CmiWallTimer();
00737 if (CkMyPe() == thisFailedPe)
00738 CkPrintf("[%d] CkMemCheckPT ----- %s len:%d in %f seconds.\n",CkMyPe(),stage,len,curTime-startTime);
00739 stage = (char*)"removeArrayElements";
00740 startTime = curTime;
00741
00742 if (cpCallback.isInvalid()) CkAbort("Didn't set restart callback\n");;
00743 if (CkMyPe()==thisFailedPe) CmiAssert(len == 0);
00744
00745
00746
00747 CKLOCMGR_LOOP(mgr->flushAllRecs(););
00748
00749
00750
00751 thisProxy[0].quiescence(CkCallback(CkIndex_CkMemCheckPT::resetReductionMgr(), thisProxy));
00752 #endif
00753 }
00754
00755
00756 void CkMemCheckPT::resetReductionMgr()
00757 {
00758
00759 int numGroups = CkpvAccess(_groupIDTable)->size();
00760 for(int i=0;i<numGroups;i++) {
00761 CkGroupID gID = (*CkpvAccess(_groupIDTable))[i];
00762 IrrGroup *obj = CkpvAccess(_groupTable)->find(gID).getObj();
00763 obj->flushStates();
00764 obj->ckJustMigrated();
00765 }
00766
00767
00768
00769 #if 1
00770 thisProxy[0].quiescence(CkCallback(CkIndex_CkMemCheckPT::recoverBuddies(), thisProxy));
00771 #else
00772 if (CkMyPe() == 0)
00773 CkStartQD(CkCallback(CkIndex_CkMemCheckPT::recoverBuddies(), thisProxy));
00774 #endif
00775 }
00776
00777
/// Fourth phase of recovery: re-create the checkpoint copies that were lost
/// with the failed PE.
///
/// For every table entry whose buddy was the failed PE, a new buddy is
/// chosen (BuddyPE() with CK_NO_PROC_POOL, otherwise the restarted PE
/// itself), the entry is updated, and a copy of the checkpoint is sent to
/// the new buddy via recoverEntry(). expectCount counts the copies sent;
/// gotData() counts the acknowledgements. A PE that sent nothing
/// contributes to the recoverArrayElements reduction right away.
void CkMemCheckPT::recoverBuddies()
{
  int idx;
  int len = ckTable.length();

  // timing / progress lines are printed on the failed PE only
  double curTime = CmiWallTimer();
  if (CkMyPe() == thisFailedPe)
  CkPrintf("[%d] CkMemCheckPT ----- %s  in %f seconds\n",CkMyPe(), stage, curTime-startTime);
  stage = (char *)"recoverBuddies";
  if (CkMyPe() == thisFailedPe)
  CkPrintf("[%d] CkMemCheckPT ----- %s  starts at %f\n",CkMyPe(), stage, curTime);
  startTime = curTime;

  expectCount = 0;
  for (idx=0; idx<len; idx++) {
    CkCheckPTInfo *entry = ckTable[idx];
    if (entry->pNo == thisFailedPe) {
#if CK_NO_PROC_POOL
      // the failed PE number is retired, so pick a fresh buddy
      int budPe = BuddyPE(CkMyPe());
#else
      // the failed PE number has been re-occupied by a replacement
      int budPe = thisFailedPe;
#endif
      entry->updateBuddy(CkMyPe(), budPe);
#if 0
      thisProxy[budPe].createEntry(entry->aid, entry->locMgr, entry->index, CkMyPe());
      CkArrayCheckPTMessage *msg = entry->getCopy();
      msg->cp_flag = 0;
      thisProxy[budPe].recvData(msg);
#else
      CkArrayCheckPTMessage *msg = entry->getCopy();
      msg->bud1 = budPe;
      msg->bud2 = CkMyPe();
      msg->cp_flag = 0;          // not a checkpoint: recvData() must not count it
      thisProxy[budPe].recoverEntry(msg);
#endif
      expectCount ++;
    }
  }

#if 1
  // nothing to wait for on this PE; otherwise gotData() contributes later
  if (expectCount == 0) {
    contribute(CkCallback(CkReductionTarget(CkMemCheckPT, recoverArrayElements), thisProxy));
  }
#else
  if (CkMyPe() == 0) {
    CkStartQD(CkCallback(CkIndex_CkMemCheckPT::recoverArrayElements(), thisProxy));
  }
#endif
}
00840
00841 void CkMemCheckPT::gotData()
00842 {
00843 ackCount ++;
00844 if (ackCount == expectCount) {
00845 ackCount = 0;
00846 expectCount = -1;
00847
00848 contribute(CkCallback(CkReductionTarget(CkMemCheckPT, recoverArrayElements), thisProxy));
00849 }
00850 }
00851
00852 void CkMemCheckPT::updateLocations(int n, CkGroupID *g, CkArrayIndex *idx,int nowOnPe)
00853 {
00854 for (int i=0; i<n; i++) {
00855 CkLocMgr *mgr = CProxy_CkLocMgr(g[i]).ckLocalBranch();
00856 mgr->updateLocation(idx[i], nowOnPe);
00857 }
00858 }
00859
00860
/// Fifth phase of recovery: rebuild array elements from the checkpoint
/// table.
///
/// Each selected entry is restored on this PE through inmem_restore(). With
/// STREAMING_INFORMHOME, the (location manager, index) pairs whose home PE
/// is another PE are collected per home PE and sent in one updateLocations()
/// call each. Afterwards inserting mode is ended on all location managers,
/// the restart flags are cleared, and PE 0 starts quiescence detection that
/// leads to finishUp().
void CkMemCheckPT::recoverArrayElements()
{
  double curTime = CmiWallTimer();
  int len = ckTable.length();
  // note: this timing line is printed on every PE, not just the failed one
  CkPrintf("[%d] CkMemCheckPT ----- %s len: %d in %f seconds \n",CkMyPe(), stage, len, curTime-startTime);
  stage = (char *)"recoverArrayElements";
  if (CkMyPe() == thisFailedPe)
  CkPrintf("[%d] CkMemCheckPT ----- %s starts at %f \n",CkMyPe(), stage, curTime);
  startTime = curTime;

  int count = 0;
#if STREAMING_INFORMHOME
  // one bucket per destination home PE
  CkVec<CkGroupID> * gmap = new CkVec<CkGroupID>[CkNumPes()];
  CkVec<CkArrayIndex> * imap = new CkVec<CkArrayIndex>[CkNumPes()];
#endif
  for (int idx=0; idx<len; idx++)
  {
    CkCheckPTInfo *entry = ckTable[idx];
#if CK_NO_PROC_POOL
    // only the master side of a buddy pair restores the element
    if (!isMaster(entry->pNo)) continue;
#else
    // skip the entry when this PE is the PE right after the buddy (cyclically)
    if (CkMyPe() == entry->pNo+1 ||
        CkMyPe()+CkNumPes() == entry->pNo+1) continue;
#endif

    entry->updateBuddy(CkMyPe(), entry->pNo);
    CkArrayCheckPTMessage *msg = entry->getCopy();

    inmem_restore(msg);
#if STREAMING_INFORMHOME
    CkLocMgr *mgr = CProxy_CkLocMgr(msg->locMgr).ckLocalBranch();
    int homePe = mgr->homePe(msg->index);
    if (homePe != CkMyPe()) {
      gmap[homePe].push_back(msg->locMgr);
      imap[homePe].push_back(msg->index);
    }
#endif
    // inmem_restore() does not free the copy
    CkFreeMsg(msg);
    count ++;
  }
#if STREAMING_INFORMHOME
  // one batched location update per remote home PE
  for (int i=0; i<CkNumPes(); i++) {
    if (gmap[i].size() && i!=CkMyPe()) {
      thisProxy[i].updateLocations(gmap[i].size(), gmap[i].getVec(), imap[i].getVec(), CkMyPe());
    }
  }
  delete [] imap;
  delete [] gmap;
#endif
  DEBUGF("[%d] recoverArrayElements restore %d objects\n", CkMyPe(), count);

  CKLOCMGR_LOOP(mgr->doneInserting(););

  // restart is over from this PE's point of view
  inRestarting = 0;

  CpvAccess(_crashedNode) = -1;

  if (CkMyPe() == 0)
    CkStartQD(CkCallback(CkIndex_CkMemCheckPT::finishUp(), thisProxy));
}
00927
00928 static double restartT;
00929
00930
00931
00932 void CkMemCheckPT::finishUp()
00933 {
00934
00935
00936
00937 if (CkMyPe() == thisFailedPe)
00938 {
00939 CkPrintf("[%d] CkMemCheckPT ----- %s in %f seconds, callback triggered\n",CkMyPe(), stage, CmiWallTimer()-startTime);
00940
00941 cpCallback.send();
00942 CkPrintf("[%d] Restart finished in %f seconds at %f.\n", CkMyPe(), CkWallTimer()-restartT, CkWallTimer());
00943 }
00944
00945 #if CK_NO_PROC_POOL
00946 #if NODE_CHECKPOINT
00947 int numnodes = CmiNumPhysicalNodes();
00948 #else
00949 int numnodes = CkNumPes();
00950 #endif
00951 if (numnodes-totalFailed() <=2) {
00952 if (CkMyPe()==0) CkPrintf("Warning: CkMemCheckPT disabled!\n");
00953 _memChkptOn = 0;
00954 }
00955 #endif
00956 }
00957
00958
00959 void CkMemCheckPT::quiescence(CkCallback &cb)
00960 {
00961 static int pe_count = 0;
00962 pe_count ++;
00963 CmiAssert(CkMyPe() == 0);
00964
00965 if (pe_count == CkNumPes()) {
00966 pe_count = 0;
00967 cb.send();
00968 }
00969 }
00970
00971
00972
00973 void CkStartMemCheckpoint(CkCallback &cb)
00974 {
00975 #if CMK_MEM_CHECKPOINT
00976 if (_memChkptOn == 0) {
00977 CkPrintf("Warning: In-Memory checkpoint has been disabled! \n");
00978 cb.send();
00979 return;
00980 }
00981 if (CkInRestarting()) {
00982
00983 cb.send();
00984 return;
00985 }
00986
00987 CkMemCheckPT::cpCallback = cb;
00988
00989
00990 CProxy_CkMemCheckPT checkptMgr(ckCheckPTGroupID);
00991 checkptMgr.doItNow(CkMyPe(), cb);
00992 #else
00993
00994 cb.send();
00995 #endif
00996 }
00997
00998 void CkRestartCheckPoint(int diePe)
00999 {
01000 CkPrintf("CkRestartCheckPoint CkMemCheckPT GID:%d at time %f\n", ckCheckPTGroupID.idx, CkWallTimer());
01001 CProxy_CkMemCheckPT checkptMgr(ckCheckPTGroupID);
01002
01003 checkptMgr.restart(diePe);
01004 }
01005
01006 static int _diePE = -1;
01007
01008
01009 static void CkRestartCheckPointCallback(void *ignore, void *msg)
01010 {
01011 CkPrintf("[%d] CkRestartCheckPointCallback activated for diePe: %d at %f\n", CkMyPe(), _diePE, CkWallTimer());
01012 CkRestartCheckPoint(_diePE);
01013 }
01014
01015
// Converse handler indices used with CmiSetHandler() in this file. They are
// presumably assigned during handler registration outside the visible part
// of the file — TODO confirm.
static int askPhaseHandlerIdx;
static int recvPhaseHandlerIdx;
static int askProcDataHandlerIdx;       // -> askProcDataHandler
static int restartBcastHandlerIdx;      // -> restartBcastHandler
static int recoverProcDataHandlerIdx;   // -> recoverProcDataHandler
static int restartBeginHandlerIdx;      // -> restartBeginHandler
static int notifyHandlerIdx;            // -> notifyHandler
01023
01024
/// Converse handler running on the failed PE: counts one message per PE and,
/// once all CkNumPes() have arrived, starts the restart protocol. The
/// message is freed.
static void restartBeginHandler(char *msg)
{
#if CMK_MEM_CHECKPOINT
  static int arrived = 0;

  CmiFree(msg);
  CmiAssert(CkMyPe() == _diePE);

  ++arrived;
  if (arrived != CkNumPes()) return;

  CkRestartCheckPointCallback(NULL, NULL);
  arrived = 0;
#endif
}
01038
// Defined elsewhere; judging by their names they switch dropping of Charm++
// messages on and off — TODO confirm against their definitions.
extern void _discard_charm_message();
extern void _resume_charm_message();

/// Converse handler run on every PE once the failed PE has restored its
/// processor data. Records the failed PE in _diePE, resumes Charm++ message
/// processing, and replies to the failed PE with a restartBeginHandler
/// message. Also clears the `checkpointed` flag.
static void restartBcastHandler(char *msg)
{
#if CMK_MEM_CHECKPOINT
  // advance phase counter
  CkMemCheckPT::inRestarting = 1;
  // the payload is a single int right after the Converse header
  _diePE = *(int *)(msg+CmiMsgHeaderSizeBytes);

  if (CkMyPe()==_diePE)
    CkPrintf("[%d] restartBcastHandler cur_restart_phase=%d _diePE:%d at %f.\n", CkMyPe(), CpvAccess(_curRestartPhase), _diePE, CkWallTimer());

  // payload has been read; the message is no longer needed
  CmiFree(msg);

  _resume_charm_message();

    // reduction
  char *restartmsg = (char*)CmiAlloc(CmiMsgHeaderSizeBytes);
  CmiSetHandler(restartmsg, restartBeginHandlerIdx);
  CmiSyncSendAndFree(_diePE, CmiMsgHeaderSizeBytes, (char *)restartmsg);

  checkpointed = 0;
#endif
}
01075
extern void _initDone();

/// Converse handler run on the restarted PE when its saved processor-level
/// checkpoint arrives. Restores the restart phase and the processor data,
/// updates load-balancer availability, puts the location managers into
/// inserting mode, broadcasts restartBcastHandler to all PEs and finally
/// calls _initDone().
static void recoverProcDataHandler(char *msg)
{
#if CMK_MEM_CHECKPOINT
  // the Converse message is a Charm++ envelope; unpack it to reach the payload
  envelope *env = (envelope *)msg;
  CkUnpackMessage(&env);
  CkProcCheckPTMessage *saved = (CkProcCheckPTMessage *)(EnvToUsr(env));

  CpvAccess(_curRestartPhase) = saved->cur_restart_phase;
  CmiPrintf("[%d] ----- recoverProcDataHandler  cur_restart_phase:%d at time: %f\n", CkMyPe(), CpvAccess(_curRestartPhase), CkWallTimer());

  // restore readonlies, chares, groups and node groups
  PUP::fromMem unpacker(saved->packData);
  _handleProcData(unpacker);

  CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->resetLB(CkMyPe());

  CKLOCMGR_LOOP(mgr->startInserting(););

  // tell every PE (including this one) which PE is restarting
  const int bcastBytes = CmiMsgHeaderSizeBytes+sizeof(int);
  char *reqmsg = (char*)CmiAlloc(bcastBytes);
  *(int *)(reqmsg+CmiMsgHeaderSizeBytes) = CkMyPe();
  CmiSetHandler(reqmsg, restartBcastHandlerIdx);
  CmiSyncBroadcastAllAndFree(bcastBytes, (char *)reqmsg);

  _initDone();

  CmiPrintf("[%d] ----- recoverProcDataHandler  done at %f\n", CkMyPe(), CkWallTimer());
#endif
}
01112
01113
01114
/// Converse handler run on the PE that keeps the failed PE's processor-level
/// checkpoint: sends that checkpoint back to its owner, where it is handled
/// by recoverProcDataHandler.
///
/// Changes relative to the previous version:
///  - the incoming request message is freed (it used to be leaked);
///  - the log line names this handler instead of restartBcastHandler;
///  - a missing checkpoint aborts explicitly instead of relying on
///    CmiAssert, which may be compiled out and would leave a NULL
///    dereference;
///  - the destination PE is read before the message is packed and sent.
static void askProcDataHandler(char *msg)
{
#if CMK_MEM_CHECKPOINT
  // the payload is a single int right after the Converse header
  int diePe = *(int *)(msg+CmiMsgHeaderSizeBytes);
  CmiFree(msg);

  CkPrintf("[%d] askProcDataHandler called with '%d' cur_restart_phase:%d at time %f.\n",CmiMyPe(),diePe, CpvAccess(_curRestartPhase), CkWallTimer());

  CkProcCheckPTMessage *saved = CpvAccess(procChkptBuf);
  if (saved == NULL) {
    CkPrintf("[%d] no checkpoint found for processor %d. This could be due to a crash before the first checkpointing.\n", CkMyPe(), diePe);
    CmiAbort("askProcDataHandler: no processor checkpoint available");
  }
  CmiAssert(saved->pe == diePe);

  // the checkpoint goes back to the PE recorded in it, as before
  const int destPe = saved->pe;
  // let the restarted PE adopt the current restart phase
  saved->cur_restart_phase = CpvAccess(_curRestartPhase);

  envelope *env = (envelope *)(UsrToEnv(saved));
  CkPackMessage(&env);
  CmiSetHandler(env, recoverProcDataHandlerIdx);
  CmiSyncSendAndFree(destPe, env->getTotalsize(), (char *)env);

  // the message has been handed to the network layer
  CpvAccess(procChkptBuf) = NULL;
#endif
}
01134
01135
/// Quiescence callback on the restarting process (registered in
/// CkMemRestart): asks the PE(s) holding the processor-level checkpoint(s)
/// of the crashed node to send them back, by sending askProcDataHandler
/// messages carrying the crashed node id.
void qd_callback(void *m)
{
   CmiPrintf("[%d] callback after QD for crashed node: %d. \n", CkMyPe(), CpvAccess(_crashedNode));
   CkFreeMsg(m);
   // NOTE(review): this tests whether CMK_SMP is defined, not whether it is
   // nonzero; if the build always defines it, the code after the #endif is
   // unreachable — confirm which is intended.
#ifdef CMK_SMP
   // one request per PE of the crashed node
   for(int i=0;i<CmiMyNodeSize();i++){
   char *msg = (char*)CmiAlloc(CmiMsgHeaderSizeBytes+sizeof(int));
   *(int *)(msg+CmiMsgHeaderSizeBytes) =CpvAccess(_crashedNode);
        CmiSetHandler(msg, askProcDataHandlerIdx);
        int pe = ChkptOnPe(CpvAccess(_crashedNode)*CmiMyNodeSize()+i);
        CmiSyncSendAndFree(pe, CmiMsgHeaderSizeBytes+sizeof(int), (char *)msg);
   }
   return;
#endif
   // non-SMP: a single request, treating the node id as the PE number
   char *msg = (char*)CmiAlloc(CmiMsgHeaderSizeBytes+sizeof(int));
   *(int *)(msg+CmiMsgHeaderSizeBytes) = CpvAccess(_crashedNode);

   CmiSetHandler(msg, askProcDataHandlerIdx);
   int pe = ChkptOnPe(CpvAccess(_crashedNode));
   CmiSyncSendAndFree(pe, CmiMsgHeaderSizeBytes+sizeof(int), (char *)msg);

}
01158
01159
/// Entry point executed by a process that has been restarted after a crash.
/// Records itself as the crashed node, starts the restart timers, begins
/// discarding Charm++ messages and, on rank 0, starts quiescence detection
/// that leads to qd_callback(). Both parameters are unused. Aborts when the
/// build lacks CMK_MEM_CHECKPOINT.
void CkMemRestart(const char *dummy, CkArgMsg *args)
{
#if CMK_MEM_CHECKPOINT
   // note: _diePE receives a node number here
   _diePE = CmiMyNode();
   CkMemCheckPT::startTime = restartT = CmiWallTimer();
   CmiPrintf("[%d] I am restarting  cur_restart_phase:%d at time: %f\n",CmiMyPe(), CpvAccess(_curRestartPhase), CkMemCheckPT::startTime);
   CkMemCheckPT::inRestarting = 1;

   CpvAccess( _crashedNode )= CmiMyNode();

   _discard_charm_message();

  // only one PE per node starts quiescence detection
  if(CmiMyRank()==0){
    CkCallback cb(qd_callback);
    CkStartQD(cb);
    CkPrintf("crash_node:%d\n",CpvAccess( _crashedNode));
  }
#else
   CmiAbort("Fault tolerance is not support, rebuild charm++ with 'syncft' option");
#endif
}
01181
01182
01183
/// Return nonzero while a restart is in progress: either a crashed node is
/// recorded on this PE, or CkMemCheckPT::inRestarting is set. Always 0 in
/// builds without CMK_MEM_CHECKPOINT.
extern "C"
int CkInRestarting()
{
#if CMK_MEM_CHECKPOINT
  const bool crashRecorded = (CpvAccess( _crashedNode) != -1);
  return crashRecorded ? 1 : CkMemCheckPT::inRestarting;
#else
  return 0;
#endif
}
01197
01198
01199
01200
01201
01202 static int arg_where = CkCheckPoint_inMEM;
01203
01204 #if CMK_MEM_CHECKPOINT
01205 void init_memcheckpt(char **argv)
01206 {
01207 if (CmiGetArgFlagDesc(argv, "+ftc_disk", "Double-disk Checkpointing")) {
01208 arg_where = CkCheckPoint_inDISK;
01209 }
01210
01211
01212 CpvInitialize(int, _crashedNode);
01213 CpvAccess(_crashedNode) = -1;
01214
01215 }
01216 #endif
01217
01218 class CkMemCheckPTInit: public Chare {
01219 public:
01220 CkMemCheckPTInit(CkArgMsg *m) {
01221 #if CMK_MEM_CHECKPOINT
01222 if (arg_where == CkCheckPoint_inDISK) {
01223 CkPrintf("Charm++> Double-disk Checkpointing. \n");
01224 }
01225 ckCheckPTGroupID = CProxy_CkMemCheckPT::ckNew(arg_where);
01226 CkPrintf("Charm++> CkMemCheckPTInit mainchare is created!\n");
01227 #endif
01228 }
01229 };
01230
/// Converse handler sent to every PE of this node by notify_crash():
/// advances the restart phase counter, flushes the quiescence-detection
/// state and calls _discard_charm_message(). The message is freed.
static void notifyHandler(char *msg)
{
#if CMK_MEM_CHECKPOINT
  CmiFree(msg);
      /* immediately increase restart phase to filter old messages */
  CpvAccess(_curRestartPhase) ++;
  CpvAccess(_qd)->flushStates();
  _discard_charm_message();
#endif
}
01241
/**
 * Records that `node` has crashed and notifies every PE of the local node.
 *
 * Stores the crashed node in the per-PE variable (and, under CMK_SMP, in the
 * copy of every rank on this node), marks the restart as in progress, and
 * sends a header-only notifyHandler message to each PE of the local node.
 *
 * @param node  Id of the crashed node; asserted to differ from the local one.
 */
extern "C"
void notify_crash(int node)
{
#ifdef CMK_MEM_CHECKPOINT
  CpvAccess(_crashedNode) = node;
#ifdef CMK_SMP
  for (int rank = 0; rank < CkMyNodeSize(); ++rank)
    CpvAccessOther(_crashedNode, rank) = node;
#endif
  CmiAssert(CmiMyNode() != CpvAccess(_crashedNode));
  CkMemCheckPT::inRestarting = 1;

  // One notification per PE on this node, starting at the node's first PE.
  const int firstPe = CmiNodeFirst(CkMyNode());
  for (int offset = 0; offset < CkMyNodeSize(); ++offset) {
    char *note = (char *)CmiAlloc(CmiMsgHeaderSizeBytes);
    CmiSetHandler(note, notifyHandlerIdx);
    CmiSyncSendAndFree(firstPe + offset, CmiMsgHeaderSizeBytes, (char *)note);
  }
#endif
}
01264
01265 extern "C" void (*notify_crash_fn)(int node);
01266
01267 #if CMK_CONVERSE_MPI
// Converse handler indices; assigned in CkRegisterRestartHandler().
static int pingHandlerIdx;
static int pingCheckHandlerIdx;
static int buddyDieHandlerIdx;

// Wall-clock time of the most recent ping received; negative until the
// first ping arrives (pingCheckHandler ignores non-positive values).
static double lastPingTime = -1.0;

// Implemented outside this file (C linkage).
extern "C" void mpi_restart_crashed(int pe, int rank);
extern "C" int  find_spare_mpirank(int pe);

// Periodic callbacks defined further down in this file.
void pingBuddy();
void pingCheckHandler();
01278
01279 void buddyDieHandler(char *msg)
01280 {
01281 #if CMK_MEM_CHECKPOINT
01282
01283 int diepe = *(int *)(msg+CmiMsgHeaderSizeBytes);
01284 notify_crash(diepe);
01285
01286 CkMemCheckPT *obj = CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch();
01287 int newrank = find_spare_mpirank(diepe);
01288 int buddy = obj->BuddyPE(CmiMyPe());
01289 if (buddy == diepe) {
01290 mpi_restart_crashed(diepe, newrank);
01291 CcdCallOnCondition(CcdPERIODIC_5s,(CcdVoidFn)pingCheckHandler,NULL);
01292 }
01293 #endif
01294 }
01295
01296 void pingHandler(void *msg)
01297 {
01298 lastPingTime = CmiWallTimer();
01299 CmiFree(msg);
01300 }
01301
/**
 * Periodic check that the PE which pings this one is still alive.
 *
 * If at least one ping has been received and the last one is more than 4
 * timer units old (CmiWallTimer, presumably seconds), the PE whose buddy is
 * this PE is declared dead and a buddyDieHandler message carrying its id is
 * sent to every PE that is neither failed nor the dead PE itself. Otherwise
 * the check re-arms itself on CcdPERIODIC_5s.
 *
 * Fix over the original: the search result is tracked explicitly. The old
 * code read `pe` after the search loop even when the loop body never ran
 * (uninitialised read) or never matched (stale last candidate reported as
 * dead). When no PE names this PE as its buddy, nothing is reported and the
 * check simply keeps polling.
 */
void pingCheckHandler()
{
#if CMK_MEM_CHECKPOINT
  double now = CmiWallTimer();
  if (lastPingTime > 0 && now - lastPingTime > 4) {
    CkMemCheckPT *obj = CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch();

    // Walk backwards from this PE looking for the PE whose buddy is us.
    int buddy = -1;
    for (int i = 1; i < CmiNumPes(); i++) {
      const int candidate = (CmiMyPe() - i + CmiNumPes()) % CmiNumPes();
      if (obj->BuddyPE(candidate) == CmiMyPe()) {
        buddy = candidate;
        break;
      }
    }

    if (buddy < 0) {
      // Nobody pings this PE, so there is no one to declare dead.
      CcdCallOnCondition(CcdPERIODIC_5s,(CcdVoidFn)pingCheckHandler,NULL);
      return;
    }

    CmiPrintf("[%d] detected buddy processor %d died %f %f. \n", CmiMyPe(), buddy, now, lastPingTime);

    // Tell every live PE (this one included) which processor died.
    for (int pe = 0; pe < CmiNumPes(); pe++) {
      if (obj->isFailed(pe) || pe == buddy) continue;
      char *msg = (char*)CmiAlloc(CmiMsgHeaderSizeBytes+sizeof(int));
      *(int *)(msg+CmiMsgHeaderSizeBytes) = buddy;
      CmiSetHandler(msg, buddyDieHandlerIdx);
      CmiSyncSendAndFree(pe, CmiMsgHeaderSizeBytes+sizeof(int), (char *)msg);
    }
  }
  else
    CcdCallOnCondition(CcdPERIODIC_5s,(CcdVoidFn)pingCheckHandler,NULL);
#endif
}
01328
/**
 * Sends one ping to this PE's buddy and re-arms itself on CcdPERIODIC_100ms.
 *
 * The ping is skipped (but the timer is still re-armed) while the local
 * CkMemCheckPT branch does not exist.
 */
void pingBuddy()
{
#if CMK_MEM_CHECKPOINT
  CkMemCheckPT *ckpt = CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch();
  if (ckpt != NULL) {
    const int target = ckpt->BuddyPE(CkMyPe());

    // Message layout: Converse header followed by the sender's PE id.
    const int pingBytes = CmiMsgHeaderSizeBytes + sizeof(int);
    char *ping = (char *)CmiAlloc(pingBytes);
    *(int *)(ping + CmiMsgHeaderSizeBytes) = CmiMyPe();
    CmiSetHandler(ping, pingHandlerIdx);
    // NOTE(review): 9999 looks like a sentinel restart phase so that pings
    // are not discarded as stale - confirm against the message filter.
    CmiGetRestartPhase(ping) = 9999;
    CmiSyncSendAndFree(target, pingBytes, (char *)ping);
  }
  CcdCallOnCondition(CcdPERIODIC_100ms, (CcdVoidFn)pingBuddy, NULL);
#endif
}
01345 #endif
01346
01347
/**
 * Registers the Converse handlers used by the checkpoint/restart protocol,
 * initialises the per-PE processor-checkpoint buffer to NULL, and installs
 * notify_crash as the crash-notification hook. A no-op when
 * CMK_MEM_CHECKPOINT is off.
 *
 * The registration order is kept exactly as in the original.
 */
void CkRegisterRestartHandler( )
{
#if CMK_MEM_CHECKPOINT
  // Restart protocol handlers.
  notifyHandlerIdx          = CkRegisterHandler((CmiHandler)notifyHandler);
  askProcDataHandlerIdx     = CkRegisterHandler((CmiHandler)askProcDataHandler);
  recoverProcDataHandlerIdx = CkRegisterHandler((CmiHandler)recoverProcDataHandler);
  restartBcastHandlerIdx    = CkRegisterHandler((CmiHandler)restartBcastHandler);
  restartBeginHandlerIdx    = CkRegisterHandler((CmiHandler)restartBeginHandler);

#if CMK_CONVERSE_MPI
  // Buddy-ping failure detection handlers (MPI machine layer only).
  pingHandlerIdx      = CkRegisterHandler((CmiHandler)pingHandler);
  pingCheckHandlerIdx = CkRegisterHandler((CmiHandler)pingCheckHandler);
  buddyDieHandlerIdx  = CkRegisterHandler((CmiHandler)buddyDieHandler);
#endif

  CpvInitialize(CkProcCheckPTMessage *, procChkptBuf);
  CpvAccess(procChkptBuf) = NULL;

  notify_crash_fn = notify_crash;

#if ! CMK_CONVERSE_MPI
  // Print each PE's process id.
  CkPrintf("[%d] PID %d \n", CkMyPe(), getpid());
#endif
#endif
}
01375
01376
01377 extern "C"
01378 int CkHasCheckpoints()
01379 {
01380 return checkpointed;
01381 }
01382
01384
01385
#ifdef CMK_MEM_CHECKPOINT
/**
 * Timer callback that kills the local process once killTime is reached.
 *
 * If the callback fires more than one timer unit before killTime it
 * reschedules itself for the remaining interval; otherwise it sends SIGKILL
 * to its own process. On platforms without CMK_HAS_GETPID it aborts.
 *
 * @param _dummy       Unused.
 * @param curWallTime  Unused; the time is re-read with CmiWallTimer().
 */
void killLocal(void *_dummy,double curWallTime){
#if CMK_HAS_GETPID
  printf("[%d] KillLocal called at %.6lf \n",CkMyPe(),CmiWallTimer());

  const bool deadlineReached = !(CmiWallTimer() < killTime - 1);
  if (deadlineReached) {
    kill(getpid(), SIGKILL);
  } else {
    // Fired early: wait out the remaining time (converted to milliseconds).
    CcdCallFnAfter(killLocal, NULL, (killTime - CmiWallTimer()) * 1000);
  }
#else
  CmiAbort("kill() not supported!");
#endif
}
#endif
01405
01406 #ifdef CMK_MEM_CHECKPOINT
01407
01410 void readKillFile(){
01411 FILE *fp=fopen(killFile,"r");
01412 if(!fp){
01413 return;
01414 }
01415 int proc;
01416 double sec;
01417 while(fscanf(fp,"%d %lf",&proc,&sec)==2){
01418 if(proc == CkMyNode() && CkMyRank() == 0){
01419 killTime = CmiWallTimer()+sec;
01420 printf("[%d] To be killed after %.6lf s (MEMCKPT) \n",CkMyPe(),sec);
01421 CcdCallFnAfter(killLocal,NULL,sec*1000);
01422 }
01423 }
01424 fclose(fp);
01425 }
01426
01427 #if ! CMK_CONVERSE_MPI
01428 void CkDieNow()
01429 {
01430
01431 CmiPrintf("[%d] die now.\n", CmiMyPe());
01432 killTime = CmiWallTimer()+0.001;
01433 CcdCallFnAfter(killLocal,NULL,1);
01434 }
01435 #endif
01436
01437 #endif
01438
01439 #include "CkMemCheckpoint.def.h"
01440
01441