00001
00002
00003
00004
00005
00006
00007
00008
00009
00010 #include <stdio.h>
00011 #include <stdlib.h>
00012 #ifndef _WIN32
00013 #include <unistd.h>
00014 #endif
00015 #include <string.h>
00016 #include <sstream>
00017 using std::ostringstream;
00018 #include <errno.h>
00019 #include "charm++.h"
00020 #include "ck.h"
00021 #include "ckcheckpoint.h"
00022 #include "CkCheckpoint.decl.h"
00023
/// printf-shaped sink that ignores its format string and every argument.
/// It is the target of DEBCHK below, so debug tracing compiles but emits nothing.
void noopit(const char* /*format*/, ...)
{
}

/// Checkpoint debug tracing hook; currently routed to noopit().
#define DEBCHK noopit

/// Expands to its argument unchanged.
#define DEBUGC(x) x
00031
00032
00033 CkGroupID _sysChkptMgr;
00034
00035 typedef struct _GroupInfo{
00036 CkGroupID gID;
00037 int MigCtor;
00038 char name[256];
00039 bool present;
00040 } GroupInfo;
00041 PUPbytes(GroupInfo)
00042 PUPmarshall(GroupInfo)
00043
00044 bool _inrestart = false;
00045 bool _restarted = false;
00046 int _oldNumPes = 0;
00047 bool _chareRestored = false;
00048 double chkptStartTimer = 0;
00049 #if CMK_SHRINK_EXPAND
00050 int originalnumGroups = -1;
00051 extern int Cmi_isOldProcess;
00052 extern int Cmi_myoldpe;
00053 extern char *_shrinkexpand_basedir;
00054 #endif
00055
00056 #if CMK_ONESIDED_IMPL
00057 extern UInt numZerocopyROops;
00058 #endif
00059
00060 void CkCreateLocalChare(int epIdx, envelope *env);
00061
00062
00063 class ElementCounter : public CkLocIterator {
00064 private:
00065 int count;
00066 public:
00067 ElementCounter():count(0){};
00068 void addLocation(CkLocation &loc) { count++; }
00069 int getCount() { return count; }
00070 };
00071
00072
00073 class ElementCheckpointer : public CkLocIterator {
00074 private:
00075 CkLocMgr *locMgr;
00076 PUP::er &p;
00077 public:
00078 ElementCheckpointer(CkLocMgr* mgr_, PUP::er &p_):locMgr(mgr_),p(p_){};
00079 void addLocation(CkLocation &loc) {
00080 CkArrayIndex idx=loc.getIndex();
00081 CkGroupID gID = locMgr->ckGetGroupID();
00082 CmiUInt8 id = loc.getID();
00083 p|gID;
00084 p|idx;
00085 p|id;
00086 p|loc;
00087
00088 }
00089 };
00090
00091
00092 extern void _initDone();
00093
00094 static void bdcastRO(void){
00095 int i;
00096
00097 PUP::sizer ps;
00098 for(i=0;i<_readonlyTable.size();i++) _readonlyTable[i]->pupData(ps);
00099
00100
00101 envelope *env = _allocEnv(RODataMsg, ps.size());
00102 PUP::toMem pp((char *)EnvToUsr(env));
00103 #if CMK_ONESIDED_IMPL
00104 pp|numZerocopyROops;
00105
00106 #endif
00107 for(i=0;i<_readonlyTable.size();i++) _readonlyTable[i]->pupData(pp);
00108
00109 env->setCount(++_numInitMsgs);
00110 env->setSrcPe(CkMyPe());
00111 CmiSetHandler(env, _roRestartHandlerIdx);
00112 CmiSyncBroadcastAndFree(env->getTotalsize(), (char *)env);
00113 }
00114
00115 #if CMK_SHRINK_EXPAND
00116 static void bdcastROGroupData(void){
00117 int i;
00118
00119 PUP::sizer ps, ps1;
00120 CkPupROData(ps);
00121 int ROSize = ps.size();
00122
00123 CkPupGroupData(ps1);
00124 int GroupSize = ps1.size();
00125
00126 char *msg = (char *)CmiAlloc(CmiMsgHeaderSizeBytes + 2*sizeof(int) + ps.size() + ps1.size());
00127 char *payloadOffset = msg + CmiMsgHeaderSizeBytes;
00128
00129
00130 *(int*)payloadOffset = ps.size();
00131 payloadOffset += sizeof(int);
00132 *(int*)payloadOffset = ps1.size();
00133 payloadOffset += sizeof(int);
00134
00135
00136 PUP::toMem pp((char *)payloadOffset);
00137 CkPupROData(pp);
00138
00139 CkPupGroupData(pp);
00140
00141 CmiSetHandler(msg, _ROGroupRestartHandlerIdx);
00142 CmiSyncBroadcastAllAndFree(CmiMsgHeaderSizeBytes + 2*sizeof(int) + pp.size(), msg);
00143 }
00144 #endif
00145
00146
00147
00148 void printIndex(const CkArrayIndex &idx,char *dest) {
00149 const int *idxData=idx.data();
00150 for (int i=0;i<idx.nInts;i++) {
00151 sprintf(dest,"%s%d",i==0?"":"_", idxData[i]);
00152 dest+=strlen(dest);
00153 }
00154 }
00155
00156 static bool checkpointOne(const char* dirname, CkCallback& cb, bool requestStatus);
00157
00158 static void addPartitionDirectory(ostringstream &path) {
00159 if (CmiNumPartitions() > 1) {
00160 path << "/part-" << CmiMyPartition() << '/';
00161 }
00162 }
00163
00164 static FILE* openCheckpointFile(const char *dirname, const char *basename,
00165 const char *mode, int id = -1) {
00166 ostringstream out;
00167 out << dirname << '/';
00168 addPartitionDirectory(out);
00169 out << basename;
00170 if (id != -1)
00171 out << '_' << id;
00172 out << ".dat";
00173
00174 FILE *fp = CmiFopen(out.str().c_str(), mode);
00175 if (!fp) {
00176 ostringstream error;
00177 error << "PE " << CkMyPe() << " failed to open checkpoint file: " << out.str()
00178 << ", mode: " << mode << " status: " << strerror(errno);
00179 CkAbort(error.str().c_str());
00180 }
00181 return fp;
00182 }
00183
00187 class CkCheckpointMgr : public CBase_CkCheckpointMgr {
00188 private:
00189 CkCallback restartCB;
00190 double chkptStartTimer;
00191 bool requestStatus;
00192 int chkpStatus;
00193 public:
00194 CkCheckpointMgr() { }
00195 CkCheckpointMgr(CkMigrateMessage *m):CBase_CkCheckpointMgr(m) { }
00196 void Checkpoint(const char *dirname, CkCallback cb, bool requestStatus = false);
00197 void SendRestartCB(void);
00198 void pup(PUP::er& p){ p|restartCB; }
00199 };
00200
00201
00202 void CkCheckpointMgr::Checkpoint(const char *dirname, CkCallback cb, bool _requestStatus){
00203 chkptStartTimer = CmiWallTimer();
00204 requestStatus = _requestStatus;
00205
00206 CmiMkdir(dirname);
00207 bool success = true;
00208 if (CmiNumPartitions() > 1) {
00209 ostringstream partDir;
00210 partDir << dirname;
00211 addPartitionDirectory(partDir);
00212 CmiMkdir(partDir.str().c_str());
00213 }
00214
00215 if (CkMyPe() == 0) {
00216 #if CMK_SHRINK_EXPAND
00217 if (pending_realloc_state == REALLOC_IN_PROGRESS) {
00218
00219
00220 CkCallback resumeFromSyncCB(CkIndex_LBDatabase::ResumeClients(), _lbdb);
00221 success &= checkpointOne(dirname, resumeFromSyncCB, requestStatus);
00222 } else
00223 #endif
00224 {
00225 success &= checkpointOne(dirname, cb, requestStatus);
00226 }
00227 }
00228
00229 #ifndef CMK_CHARE_USE_PTR
00230
00231 FILE* fChares = openCheckpointFile(dirname, "Chares", "wb", CkMyPe());
00232 PUP::toDisk pChares(fChares);
00233 CkPupChareData(pChares);
00234 if(pChares.checkError())
00235 success = false;
00236 if(CmiFclose(fChares)!=0)
00237 success = false;
00238 #endif
00239
00240
00241
00242 FILE* fGroups = openCheckpointFile(dirname, "Groups", "wb", CkMyPe());
00243 PUP::toDisk pGroups(fGroups);
00244 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
00245 CkPupGroupData(pGroups, true);
00246 #else
00247 CkPupGroupData(pGroups);
00248 #endif
00249 if(pGroups.checkError())
00250 success = false;
00251 if(CmiFclose(fGroups)!=0)
00252 success = false;
00253
00254
00255
00256 if (CkMyRank() == 0) {
00257 FILE* fNodeGroups = openCheckpointFile(dirname, "NodeGroups", "wb", CkMyNode());
00258 PUP::toDisk pNodeGroups(fNodeGroups);
00259 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
00260 CkPupNodeGroupData(pNodeGroups, true);
00261 #else
00262 CkPupNodeGroupData(pNodeGroups);
00263 #endif
00264 if(pNodeGroups.checkError())
00265 success = false;
00266 if(CmiFclose(fNodeGroups)!=0)
00267 success = false;
00268 }
00269
00270
00271 FILE *datFile = openCheckpointFile(dirname, "arr", "wb", CkMyPe());
00272 PUP::toDisk p(datFile);
00273 CkPupArrayElementsData(p);
00274 if(p.checkError())
00275 success = false;
00276 if(CmiFclose(datFile)!=0)
00277 success = false;
00278
00279 #if ! CMK_DISABLE_SYNC
00280 #if CMK_HAS_SYNC_FUNC
00281 sync();
00282 #elif CMK_HAS_SYNC
00283 system("sync");
00284 #endif
00285 #endif
00286 chkpStatus = success?CK_CHECKPOINT_SUCCESS:CK_CHECKPOINT_FAILURE;
00287 restartCB = cb;
00288 DEBCHK("[%d]restartCB installed\n",CkMyPe());
00289
00290
00291
00292 barrier(CkCallback(CkReductionTarget(CkCheckpointMgr, SendRestartCB), 0, thisgroup));
00293 }
00294
00295 void CkCheckpointMgr::SendRestartCB(void){
00296 DEBCHK("[%d]Sending out the cb\n",CkMyPe());
00297 CkPrintf("Checkpoint to disk finished in %fs, sending out the cb...\n", CmiWallTimer() - chkptStartTimer);
00298 if(requestStatus)
00299 {
00300 CkCheckpointStatusMsg * m = new CkCheckpointStatusMsg(chkpStatus);
00301 restartCB.send(m);
00302 }
00303 else
00304 restartCB.send();
00305 }
00306
00307 void CkPupROData(PUP::er &p)
00308 {
00309 int _numReadonlies = 0;
00310 int _numReadonlyMsgs = 0;
00311 if (!p.isUnpacking()) _numReadonlies=_readonlyTable.size();
00312
00313 p|_numReadonlies;
00314
00315 if (p.isUnpacking()) {
00316 if (_numReadonlies != _readonlyTable.size())
00317 CkAbort("You cannot add readonlies and restore from checkpoint...");
00318 }
00319 for(int i=0;i<_numReadonlies;i++) _readonlyTable[i]->pupData(p);
00320 if (!p.isUnpacking()) _numReadonlyMsgs=_readonlyMsgs.size();
00321 p|_numReadonlyMsgs;
00322 for(int i=0;i<_numReadonlyMsgs; i++){
00323 ReadonlyMsgInfo *c = _readonlyMsgs[i];
00324 CkPupMessage(p,c->pMsg);
00325 }
00326 }
00327
00328
00329 void CkPupMainChareData(PUP::er &p, CkArgMsg *args)
00330 {
00331 int nMains=_mainTable.size();
00332 DEBCHK("[%d] CkPupMainChareData %s: nMains = %d\n", CkMyPe(),p.typeString(),nMains);
00333 for(int i=0;i<nMains;i++){
00334 ChareInfo *entry = _chareTable[_mainTable[i]->chareIdx];
00335 int entryMigCtor = entry->getMigCtor();
00336 if(entryMigCtor!=-1) {
00337 Chare* obj;
00338 if (p.isUnpacking()) {
00339 int size = entry->size;
00340 DEBCHK("MainChare PUP'ed: name = %s, idx = %d, size = %d\n", entry->name, i, size);
00341 obj = (Chare*)malloc(size);
00342 _MEMCHECK(obj);
00343 _mainTable[i]->setObj(obj);
00344
00345 _entryTable[entryMigCtor]->call(args, obj);
00346 }
00347 else
00348 obj = (Chare *)_mainTable[i]->getObj();
00349 obj->virtual_pup(p);
00350 }
00351 }
00352
00353
00354
00355
00356
00357 #if !CMK_SHRINK_EXPAND
00358 if (p.isUnpacking() && CkMyPe()==0)
00359 bdcastRO();
00360 #endif
00361
00362 }
00363
00364 #ifndef CMK_CHARE_USE_PTR
00365
00366 CkpvExtern(std::vector<void *>, chare_objs);
00367 CkpvExtern(std::vector<int>, chare_types);
00368 CkpvExtern(std::vector<VidBlock *>, vidblocks);
00369
00370
00371 void CkPupChareData(PUP::er &p)
00372 {
00373 int i, n = 0;
00374 if (!p.isUnpacking()) n = CkpvAccess(chare_objs).size();
00375 p|n;
00376 for (i=0; i<n; i++) {
00377 int chare_type = 0;
00378 if (!p.isUnpacking()) {
00379 chare_type = CkpvAccess(chare_types)[i];
00380 }
00381 p | chare_type;
00382 bool pup_flag = true;
00383 if (!p.isUnpacking()) {
00384 if(CkpvAccess(chare_objs)[i] == NULL){
00385 pup_flag = false;
00386 }
00387 }
00388 p|pup_flag;
00389 if(pup_flag)
00390 {
00391 if (p.isUnpacking()) {
00392 int migCtor = _chareTable[chare_type]->migCtor;
00393 if(migCtor==-1) {
00394 char buf[512];
00395 sprintf(buf,"Chare %s needs a migration constructor and PUP'er routine for restart.\n", _chareTable[chare_type]->name);
00396 CkAbort(buf);
00397 }
00398 void *m = CkAllocSysMsg();
00399 envelope* env = UsrToEnv((CkMessage *)m);
00400 CkCreateLocalChare(migCtor, env);
00401 CkFreeSysMsg(m);
00402 }
00403 Chare *obj = (Chare*)CkpvAccess(chare_objs)[i];
00404 obj->virtual_pup(p);
00405 }
00406 else
00407 {
00408 CkpvAccess(chare_objs)[i] = NULL;
00409 }
00410 }
00411
00412 if (!p.isUnpacking()) n = CkpvAccess(vidblocks).size();
00413 p|n;
00414 for (i=0; i<n; i++) {
00415 VidBlock *v;
00416 bool pup_flag = true;
00417 if (!p.isUnpacking()) {
00418 if(CkpvAccess(vidblocks)[i]==NULL)
00419 {
00420 pup_flag = false;
00421 }
00422 }
00423 p|pup_flag;
00424 if(pup_flag)
00425 {
00426 if (p.isUnpacking()) {
00427 v = new VidBlock();
00428 CkpvAccess(vidblocks).push_back(v);
00429 }
00430 else{
00431 v = CkpvAccess(vidblocks)[i];
00432 }
00433 v->pup(p);
00434 }
00435 }
00436 }
00437 #else
00438 void CkPupChareData(PUP::er &p)
00439 {
00440
00441 }
00442 #endif
00443
00444 typedef void GroupCreationFn(CkGroupID groupID, int constructorIdx, envelope *env);
00445
00446 static void CkPupPerPlaceData(PUP::er &p, GroupIDTable *idTable, GroupTable *objectTable,
00447 unsigned int &numObjects, int constructionMsgType,
00448 GroupCreationFn creationFn
00449 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
00450 , bool create
00451 #endif
00452 )
00453 {
00454 int numGroups = 0, i;
00455
00456 if (!p.isUnpacking()) {
00457 numGroups = idTable->size();
00458 }
00459 p|numGroups;
00460 if (p.isUnpacking()) {
00461 if(CkMyPe()==0)
00462 numObjects = numGroups+1;
00463 else
00464 numObjects = 1;
00465 }
00466 DEBCHK("[%d] CkPupPerPlaceData %s: numGroups = %d\n", CkMyPe(),p.typeString(),numGroups);
00467
00468 std::vector<GroupInfo> tmpInfo(numGroups);
00469 if (!p.isUnpacking()) {
00470 for (i = 0; i < numGroups; i++) {
00471 tmpInfo[i].gID = (*idTable)[i];
00472 TableEntry ent = objectTable->find(tmpInfo[i].gID);
00473 tmpInfo[i].present = ent.getObj() != NULL;
00474 tmpInfo[i].MigCtor = _chareTable[ent.getcIdx()]->migCtor;
00475 strncpy(tmpInfo[i].name,_chareTable[ent.getcIdx()]->name,255);
00476
00477
00478 if(tmpInfo[i].MigCtor==-1) {
00479 char buf[512];
00480 sprintf(buf,"(Node)Group %s needs a migration constructor and PUP'er routine for restart.\n", tmpInfo[i].name);
00481 CkAbort(buf);
00482 }
00483 }
00484 }
00485 p|tmpInfo;
00486
00487 for (i = 0; i < numGroups; i++)
00488 {
00489 if (!tmpInfo[i].present)
00490 continue;
00491
00492 CkGroupID gID = tmpInfo[i].gID;
00493 if (p.isUnpacking()) {
00494 int eIdx = tmpInfo[i].MigCtor;
00495 if (eIdx == -1) {
00496 CkPrintf("[%d] ERROR> (Node)Group %s's migration constructor is not defined!\n", CkMyPe(), tmpInfo[i].name);
00497 CkAbort("Abort");
00498 }
00499 void *m = CkAllocSysMsg();
00500 envelope* env = UsrToEnv((CkMessage *)m);
00501 env->setMsgtype(constructionMsgType);
00502
00503 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
00504 if(create)
00505 #endif
00506 {
00507 creationFn(gID, eIdx, env);
00508 }
00509 }
00510 IrrGroup *gobj = objectTable->find(gID).getObj();
00511
00512 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
00513 if(creationFn == CkCreateLocalGroup && !create)
00514 {
00515 gobj->mlogData->teamRecoveryFlag = 1;
00516 }
00517 #endif
00518
00519
00520 gobj->virtual_pup(p);
00521 }
00522 }
00523
00524 void CkPupGroupData(PUP::er &p
00525 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
00526 , bool create
00527 #endif
00528 )
00529 {
00530 CkPupPerPlaceData(p, CkpvAccess(_groupIDTable), CkpvAccess(_groupTable),
00531 CkpvAccess(_numGroups), BocInitMsg, &CkCreateLocalGroup
00532 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
00533 , create
00534 #endif
00535 );
00536 }
00537
00538 void CkPupNodeGroupData(PUP::er &p
00539 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
00540 , bool create
00541 #endif
00542 )
00543 {
00544 CkPupPerPlaceData(p, &CksvAccess(_nodeGroupIDTable),
00545 CksvAccess(_nodeGroupTable), CksvAccess(_numNodeGroups),
00546 NodeBocInitMsg, &CkCreateLocalNodeGroup
00547 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
00548 , create
00549 #endif
00550 );
00551 }
00552
00553
00554 void CkPupArrayElementsData(PUP::er &p, int notifyListeners)
00555 {
00556 int i;
00557
00558 int numGroups = CkpvAccess(_groupIDTable)->size();
00559
00560
00561 int numElements = 0;
00562 if (!p.isUnpacking()) {
00563 ElementCounter counter;
00564 CKLOCMGR_LOOP(mgr->iterate(counter););
00565 numElements = counter.getCount();
00566 }
00567 p|numElements;
00568
00569 DEBCHK("[%d] CkPupArrayElementsData %s numGroups:%d numElements:%d \n",CkMyPe(),p.typeString(), numGroups, numElements);
00570
00571 if (!p.isUnpacking())
00572 {
00573
00574 CKLOCMGR_LOOP(ElementCheckpointer chk(mgr, p); mgr->iterate(chk););
00575 }
00576 else {
00577
00578
00579 for (int i=0; i<numElements; i++) {
00580 CkGroupID gID;
00581 CkArrayIndex idx;
00582 CmiUInt8 id;
00583 p|gID;
00584 p|idx;
00585 p|id;
00586 CkLocMgr *mgr = (CkLocMgr*)CkpvAccess(_groupTable)->find(gID).getObj();
00587 if (notifyListeners){
00588 mgr->resume(idx, id, p, true);
00589 }
00590 else{
00591 mgr->restore(idx, id, p);
00592 }
00593 }
00594 }
00595
00596 if (notifyListeners)
00597 for(i=0;i<numGroups;i++) {
00598 IrrGroup *obj = CkpvAccess(_groupTable)->find((*CkpvAccess(_groupIDTable))[i]).getObj();
00599 if (obj)
00600 obj->ckJustMigrated();
00601 }
00602 }
00603
00604 #if __FAULT__
00605 int CkCountArrayElements(){
00606 int numGroups = CkpvAccess(_groupIDTable)->size();
00607 int i;
00608 ElementCounter counter;
00609 CKLOCMGR_LOOP(mgr->iterate(counter););
00610 int numElements = counter.getCount();
00611 return numElements;
00612 }
00613 #endif
00614
00615 void CkPupProcessorData(PUP::er &p)
00616 {
00617
00618 if(CkMyRank()==0) {
00619 CkPupROData(p);
00620 }
00621
00622
00623 if(CkMyPe()==0) {
00624 CkPupMainChareData(p, NULL);
00625 }
00626
00627
00628 CkPupChareData(p);
00629
00630
00631 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
00632 CkPupGroupData(p,true);
00633 #else
00634 CkPupGroupData(p);
00635 #endif
00636
00637
00638 if(CkMyRank()==0) {
00639 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
00640 CkPupNodeGroupData(p,true);
00641 #else
00642 CkPupNodeGroupData(p);
00643 #endif
00644 }
00645
00646
00647 CkPupArrayElementsData(p);
00648 }
00649
00650
00651 static bool checkpointOne(const char* dirname, CkCallback& cb, bool requestStatus){
00652 CmiAssert(CkMyPe()==0);
00653 char filename[1024];
00654
00655
00656 FILE* fRO = openCheckpointFile(dirname, "RO", "wb", -1);
00657 PUP::toDisk pRO(fRO);
00658 int _numPes = CkNumPes();
00659 pRO|_numPes;
00660 int _numNodes = CkNumNodes();
00661
00662 pRO|_numNodes;
00663 pRO|cb;
00664 CkPupROData(pRO);
00665 pRO|requestStatus;
00666
00667 if(pRO.checkError())
00668 {
00669 return false;
00670 }
00671
00672 if(CmiFclose(fRO)!=0)
00673 {
00674 return false;
00675 }
00676
00677
00678 {
00679 FILE* fMain = openCheckpointFile(dirname, "MainChares", "wb", -1);
00680 PUP::toDisk pMain(fMain);
00681 CkPupMainChareData(pMain, NULL);
00682 if(pMain.checkError())
00683 {
00684 return false;
00685 }
00686 if(CmiFclose(fMain) != 0)
00687 {
00688 return false;
00689 }
00690 }
00691 return true;
00692 }
00693
00694 void CkRemoveArrayElements()
00695 {
00696 int i;
00697 int numGroups = CkpvAccess(_groupIDTable)->size();
00698 CKLOCMGR_LOOP(mgr->flushAllRecs(););
00699
00700
00701
00702
00703
00704
00705
00706
00707 }
00708
00709
00710
00711
00712
00713
00714
00715
00716
00717
00718
00719
00720
00721
00722
00723 void CkStartCheckpoint(const char* dirname,const CkCallback& cb, bool requestStatus)
00724 {
00725 if(cb.isInvalid())
00726 CkAbort("callback after checkpoint is not set properly");
00727
00728 if(cb.containsPointer())
00729 CkAbort("Cannot restart from a callback based on a pointer");
00730
00731
00732 CkPrintf("[%d] Checkpoint starting in %s\n", CkMyPe(), dirname);
00733
00734
00735 CProxy_CkCheckpointMgr(_sysChkptMgr).Checkpoint(dirname, cb, requestStatus);
00736 }
00737
00745 CkCallback cb;
00746 void CkRestartMain(const char* dirname, CkArgMsg *args){
00747 int i;
00748 char filename[1024];
00749
00750 if (CmiMyRank() == 0) {
00751 _inrestart = true;
00752 _restarted = true;
00753 CkMemCheckPT::inRestarting = true;
00754 }
00755
00756
00757 FILE* fRO = openCheckpointFile(dirname, "RO", "rb", -1);
00758 int _numPes = -1;
00759 PUP::fromDisk pRO(fRO);
00760 pRO|_numPes;
00761 int _numNodes = -1;
00762 pRO|_numNodes;
00763 pRO|cb;
00764 if (CmiMyRank() == 0) CkPupROData(pRO);
00765 bool requestStatus = false;
00766 pRO|requestStatus;
00767 CmiFclose(fRO);
00768 DEBCHK("[%d]CkRestartMain: readonlys restored\n",CkMyPe());
00769 _oldNumPes = _numPes;
00770
00771 CmiNodeBarrier();
00772
00773
00774 FILE* fMain = openCheckpointFile(dirname, "MainChares", "rb");
00775 if(fMain && CkMyPe()==0){
00776 PUP::fromDisk pMain(fMain);
00777 CkPupMainChareData(pMain, args);
00778 CmiFclose(fMain);
00779 DEBCHK("[%d]CkRestartMain: mainchares restored\n",CkMyPe());
00780
00781 }
00782
00783 #ifndef CMK_CHARE_USE_PTR
00784
00785 if(CkNumPes() == _numPes) {
00786 FILE* fChares = openCheckpointFile(dirname, "Chares", "rb", CkMyPe());
00787 PUP::fromDisk pChares(fChares);
00788 CkPupChareData(pChares);
00789 CmiFclose(fChares);
00790 if (CmiMyRank() == 0) _chareRestored = true;
00791 }
00792 #endif
00793
00794
00795
00796
00797 FILE* fGroups = openCheckpointFile(dirname, "Groups", "rb",
00798 (CkNumPes() == _numPes) ? CkMyPe() : 0);
00799 PUP::fromDisk pGroups(fGroups);
00800 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
00801 CkPupGroupData(pGroups,true);
00802 #else
00803 CkPupGroupData(pGroups);
00804 #endif
00805 CmiFclose(fGroups);
00806
00807
00808
00809 if(CkMyRank()==0){
00810 FILE* fNodeGroups = openCheckpointFile(dirname, "NodeGroups", "rb",
00811 (CkNumNodes() == _numNodes) ? CkMyNode() : 0);
00812 PUP::fromDisk pNodeGroups(fNodeGroups);
00813 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
00814 CkPupNodeGroupData(pNodeGroups,true);
00815 #else
00816 CkPupNodeGroupData(pNodeGroups);
00817 #endif
00818 CmiFclose(fNodeGroups);
00819 }
00820
00821
00822
00823 DEBCHK("[%d]Number of PE: %d -> %d\n",CkMyPe(),_numPes,CkNumPes());
00824 if(CkMyPe() < _numPes)
00825 for (i=0; i<_numPes;i++) {
00826 if (i%CkNumPes() == CkMyPe()) {
00827 FILE *datFile = openCheckpointFile(dirname, "arr", "rb", i);
00828 PUP::fromDisk p(datFile);
00829 CkPupArrayElementsData(p);
00830 CmiFclose(datFile);
00831 }
00832 }
00833
00834 _inrestart = false;
00835
00836 if (CmiMyRank()==0) _initDone();
00837
00838 CkMemCheckPT::inRestarting = false;
00839 if(CkMyPe()==0) {
00840 CmiPrintf("[%d]CkRestartMain done. sending out callback.\n",CkMyPe());
00841 if(requestStatus)
00842 {
00843 CkCheckpointStatusMsg * m = new CkCheckpointStatusMsg(CK_CHECKPOINT_SUCCESS);
00844 cb.send(m);
00845 }
00846 else
00847 {
00848 cb.send();
00849 }
00850 }
00851 }
00852
00853 #if CMK_SHRINK_EXPAND
00854
00855 void CkResumeRestartMain(char * msg) {
00856 int i;
00857 char filename[1024];
00858 const char * dirname = "";
00859 _inrestart = true;
00860 _restarted = true;
00861 CkMemCheckPT::inRestarting = true;
00862 CmiPrintf("[%d]CkResumeRestartMain: Inside Resume Restart\n",CkMyPe());
00863 CmiPrintf("[%d]CkResumeRestartMain: Group restored %d\n",CkMyPe(), CkpvAccess(_numGroups)-1);
00864
00865 int _numPes = -1;
00866 if(CkMyPe()!=0) {
00867 PUP::fromMem pRO((char *)(msg+CmiMsgHeaderSizeBytes+2*sizeof(int)));
00868
00869 CkPupROData(pRO);
00870 CmiPrintf("[%d]CkRestartMain: readonlys restored\n",CkMyPe());
00871
00872 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
00873 CkPupGroupData(pRO,true);
00874 #else
00875 CkPupGroupData(pRO);
00876 #endif
00877 CmiPrintf("[%d]CkResumeRestartMain: Group restored %d\n",CkMyPe(), CkpvAccess(_numGroups)-1);
00878 }
00879
00880 CmiFree(msg);
00881 CmiNodeBarrier();
00882 if(Cmi_isOldProcess) {
00883
00884
00885 FILE *datFile = openCheckpointFile(dirname, "arr", "rb", Cmi_myoldpe);
00886 PUP::fromDisk p(datFile);
00887 CkPupArrayElementsData(p);
00888 CmiFclose(datFile);
00889 }
00890 _initDone();
00891 _inrestart = false;
00892 CkMemCheckPT::inRestarting = false;
00893 if(CkMyPe()==0) {
00894 CmiPrintf("[%d]CkResumeRestartMain done. sending out callback.\n",CkMyPe());
00895 CkPrintf("Restart from shared memory finished in %fs, sending out the cb...\n", CmiWallTimer() - chkptStartTimer);
00896 cb.send();
00897 }
00898 }
00899 #endif
00900
00901
00902 class CkCheckpointInit : public Chare {
00903 public:
00904 CkCheckpointInit(CkArgMsg *msg) {
00905 _sysChkptMgr = CProxy_CkCheckpointMgr::ckNew();
00906 delete msg;
00907 }
00908 CkCheckpointInit(CkMigrateMessage *m) {delete m;}
00909 };
00910
00911 #include "CkCheckpoint.def.h"
00912 #include "CkCheckpointStatus.def.h"
00913