00001
00002
00003
00004
00005
00006
00007
00008
00009
00010 #include <stdio.h>
00011 #include <stdlib.h>
00012 #include <string.h>
00013 #include "charm++.h"
00014 #include "ck.h"
00015 #include "ckcheckpoint.h"
00016
/// printf-shaped sink that ignores its format string and every argument.
/// DEBCHK (defined just below) expands to this function.
void noopit(const char* /*format*/, ...)
{
}
00019
00020
// Debug trace hook used throughout this file. It is mapped to noopit(), so
// the calls compile but print nothing.
#define DEBCHK noopit

// Expands to its argument unchanged.
#define DEBUGC(x) x
00024
00025
// Group ID of the checkpoint manager group. Assigned in CkCheckpointInit's
// constructor and used by CkStartCheckpoint() to build the manager proxy.
CkGroupID _sysChkptMgr;
00027
// Per-group record written to (and read back from) the group and nodegroup
// checkpoint files ahead of the group objects themselves.
typedef struct _GroupInfo{
        CkGroupID gID;          // ID of the group this record describes
        int MigCtor, DefCtor;   // migration / default constructor indices, copied from _chareTable
        char name[256];         // chare type name, copied with strncpy (at most 255 characters)
} GroupInfo;
// NOTE(review): presumably these make GroupInfo pup as a raw byte blob and
// marshallable — confirm against the PUP headers.
PUPbytes(GroupInfo)
PUPmarshall(GroupInfo)
00035
// Restart state, all written by CkRestartMain():
int _inrestart = 0;      // 1 while CkRestartMain() is reading the checkpoint, 0 otherwise
int _restarted = 0;      // set to 1 once CkRestartMain() has been entered; never reset in this file
int _oldNumPes = 0;      // PE count stored in RO.dat, i.e. of the run that wrote the checkpoint
int _chareRestored = 0;  // set to 1 after Chares_<pe>.dat has been read back

// Defined elsewhere; CkPupChareData() uses it to re-create a chare on unpack.
void CkCreateLocalChare(int epIdx, envelope *env);
00042
00043
00044 class ElementCounter : public CkLocIterator {
00045 private:
00046 int count;
00047 public:
00048 ElementCounter():count(0){};
00049 void addLocation(CkLocation &loc) { count++; }
00050 int getCount() { return count; }
00051 };
00052
00053
00054 class ElementCheckpointer : public CkLocIterator {
00055 private:
00056 CkLocMgr *locMgr;
00057 PUP::er &p;
00058 public:
00059 ElementCheckpointer(CkLocMgr* mgr_, PUP::er &p_):locMgr(mgr_),p(p_){};
00060 void addLocation(CkLocation &loc) {
00061 CkArrayIndex idx=loc.getIndex();
00062 CkGroupID gID = locMgr->ckGetGroupID();
00063 p|gID;
00064 p|idx;
00065 p|loc;
00066
00067 }
00068 };
00069
00070
// Defined elsewhere. CkRestartMain() calls it after all checkpoint data has
// been read back.
extern void _initDone();
00072
00073 static void bdcastRO(void){
00074 int i;
00075
00076 PUP::sizer ps;
00077 for(i=0;i<_readonlyTable.size();i++) _readonlyTable[i]->pupData(ps);
00078
00079
00080 envelope *env = _allocEnv(RODataMsg, ps.size());
00081 PUP::toMem pp((char *)EnvToUsr(env));
00082 for(i=0;i<_readonlyTable.size();i++) _readonlyTable[i]->pupData(pp);
00083
00084 env->setCount(++_numInitMsgs);
00085 env->setSrcPe(CkMyPe());
00086 CmiSetHandler(env, _roRestartHandlerIdx);
00087 CmiSyncBroadcastAndFree(env->getTotalsize(), (char *)env);
00088 }
00089
00090
00091
00092 void printIndex(const CkArrayIndex &idx,char *dest) {
00093 const int *idxData=idx.data();
00094 for (int i=0;i<idx.nInts;i++) {
00095 sprintf(dest,"%s%d",i==0?"":"_", idxData[i]);
00096 dest+=strlen(dest);
00097 }
00098 }
00099
// Forward declaration; defined later in this file. Writes the checkpoint
// files that only PE 0 produces (RO.dat and MainChares.dat).
static void checkpointOne(const char* dirname, CkCallback& cb);
00101
00102
00103 void CkCheckpointMgr::Checkpoint(const char *dirname, CkCallback& cb){
00104 chkptStartTimer = CmiWallTimer();
00105
00106 CmiMkdir(dirname);
00107
00108 if (CkMyPe() == 0) {
00109 checkpointOne(dirname, cb);
00110 }
00111
00112 char fileName[1024];
00113
00114 #ifndef CMK_CHARE_USE_PTR
00115
00116 sprintf(fileName,"%s/Chares_%d.dat",dirname,CkMyPe());
00117 FILE* fChares = CmiFopen(fileName,"wb");
00118 if(!fChares) CkAbort("Failed to create checkpoint file for chares!");
00119 PUP::toDisk pChares(fChares);
00120 CkPupChareData(pChares);
00121 CmiFclose(fChares);
00122 #endif
00123
00124
00125
00126 sprintf(fileName,"%s/Groups_%d.dat",dirname,CkMyPe());
00127 FILE* fGroups = CmiFopen(fileName,"wb");
00128 if(!fGroups) CkAbort("Failed to create checkpoint file for group table!");
00129 PUP::toDisk pGroups(fGroups);
00130 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
00131 CkPupGroupData(pGroups,CmiTrue);
00132 #else
00133 CkPupGroupData(pGroups);
00134 #endif
00135 CmiFclose(fGroups);
00136
00137
00138
00139 if (CkMyRank() == 0) {
00140 sprintf(fileName,"%s/NodeGroups_%d.dat",dirname,CkMyNode());
00141 FILE* fNodeGroups = CmiFopen(fileName,"wb");
00142 if(!fNodeGroups)
00143 CkAbort("Failed to create checkpoint file for nodegroup table!");
00144 PUP::toDisk pNodeGroups(fNodeGroups);
00145 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
00146 CkPupNodeGroupData(pNodeGroups,CmiTrue);
00147 #else
00148 CkPupNodeGroupData(pNodeGroups);
00149 #endif
00150 CmiFclose(fNodeGroups);
00151 }
00152
00153
00154 sprintf(fileName,"%s/arr_%d.dat",dirname, CkMyPe());
00155 FILE *datFile=CmiFopen(fileName,"wb");
00156 if (datFile==NULL) CkAbort("Could not create data file");
00157 PUP::toDisk p(datFile);
00158 CkPupArrayElementsData(p);
00159 CmiFclose(datFile);
00160
00161 #if CMK_HAS_SYNC && ! CMK_DISABLE_SYNC
00162 system("sync");
00163 #endif
00164
00165 restartCB = cb;
00166 DEBCHK("[%d]restartCB installed\n",CkMyPe());
00167 CkCallback localcb(CkIndex_CkCheckpointMgr::SendRestartCB(NULL),0,thisgroup);
00168 contribute(0,NULL,CkReduction::sum_int,localcb);
00169 }
00170
00171 void CkCheckpointMgr::SendRestartCB(CkReductionMsg *m){
00172 delete m;
00173 DEBCHK("[%d]Sending out the cb\n",CkMyPe());
00174 CkPrintf("Checkpoint to disk finished in %fs, sending out the cb...\n", CmiWallTimer() - chkptStartTimer);
00175 restartCB.send();
00176 }
00177
00178 void CkPupROData(PUP::er &p)
00179 {
00180 int _numReadonlies;
00181 if (!p.isUnpacking()) _numReadonlies=_readonlyTable.size();
00182 p|_numReadonlies;
00183 if (p.isUnpacking()) {
00184 if (_numReadonlies != _readonlyTable.size())
00185 CkAbort("You cannot add readonlies and restore from checkpoint...");
00186 }
00187 for(int i=0;i<_numReadonlies;i++) _readonlyTable[i]->pupData(p);
00188 }
00189
00190
00191 void CkPupMainChareData(PUP::er &p, CkArgMsg *args)
00192 {
00193 int nMains=_mainTable.size();
00194 DEBCHK("[%d] CkPupMainChareData %s: nMains = %d\n", CkMyPe(),p.typeString(),nMains);
00195 for(int i=0;i<nMains;i++){
00196 ChareInfo *entry = _chareTable[_mainTable[i]->chareIdx];
00197 int entryMigCtor = entry->getMigCtor();
00198 if(entryMigCtor!=-1) {
00199 Chare* obj;
00200 if (p.isUnpacking()) {
00201 int size = entry->size;
00202 DEBCHK("MainChare PUP'ed: name = %s, idx = %d, size = %d\n", entry->name, i, size);
00203 obj = (Chare*)malloc(size);
00204 _MEMCHECK(obj);
00205 _mainTable[i]->setObj(obj);
00206
00207 _entryTable[entryMigCtor]->call(args, obj);
00208 }
00209 else
00210 obj = (Chare *)_mainTable[i]->getObj();
00211 obj->pup(p);
00212 }
00213 }
00214
00215
00216
00217
00218
00219 if (p.isUnpacking() && CkMyPe()==0)
00220 bdcastRO();
00221 }
00222
00223 #ifndef CMK_CHARE_USE_PTR
00224
// Tables declared elsewhere and used by CkPupChareData():
CkpvExtern(CkVec<void *>, chare_objs);     // chare object pointers
CkpvExtern(CkVec<int>, chare_types);       // for each chare object, its index into _chareTable
CkpvExtern(CkVec<VidBlock *>, vidblocks);  // VidBlock pointers
00228
00229
00230 void CkPupChareData(PUP::er &p)
00231 {
00232 int i, n;
00233 if (!p.isUnpacking()) n = CkpvAccess(chare_objs).size();
00234 p|n;
00235 for (i=0; i<n; i++) {
00236 int chare_type;
00237 if (!p.isUnpacking()) {
00238 chare_type = CkpvAccess(chare_types)[i];
00239 }
00240 p | chare_type;
00241 if (p.isUnpacking()) {
00242 int migCtor = _chareTable[chare_type]->migCtor;
00243 if(migCtor==-1) {
00244 char buf[512];
00245 sprintf(buf,"Chare %s needs a migration constructor and PUP'er routine for restart.\n", _chareTable[chare_type]->name);
00246 CkAbort(buf);
00247 }
00248 void *m = CkAllocSysMsg();
00249 envelope* env = UsrToEnv((CkMessage *)m);
00250 CkCreateLocalChare(migCtor, env);
00251 CkFreeSysMsg(m);
00252 }
00253 Chare *obj = (Chare*)CkpvAccess(chare_objs)[i];
00254 obj->pup(p);
00255 }
00256
00257 if (!p.isUnpacking()) n = CkpvAccess(vidblocks).size();
00258 p|n;
00259 for (i=0; i<n; i++) {
00260 VidBlock *v;
00261 if (p.isUnpacking()) {
00262 v = new VidBlock();
00263 CkpvAccess(vidblocks).push_back(v);
00264 }
00265 else
00266 v = CkpvAccess(vidblocks)[i];
00267 v->pup(p);
00268 }
00269 }
00270 #else
00271 void CkPupChareData(PUP::er &p)
00272 {
00273
00274 }
00275 #endif
00276
00277 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
00278
00279 void CkPupGroupData(PUP::er &p, CmiBool create)
00280 {
00281 int numGroups, i;
00282
00283 if (!p.isUnpacking()) {
00284 numGroups = CkpvAccess(_groupIDTable)->size();
00285 }
00286 p|numGroups;
00287 if (p.isUnpacking()) {
00288 if(CkMyPe()==0)
00289 CkpvAccess(_numGroups) = numGroups+1;
00290 else
00291 CkpvAccess(_numGroups) = 1;
00292 }
00293 DEBCHK("[%d] CkPupGroupData %s: numGroups = %d\n", CkMyPe(),p.typeString(),numGroups);
00294
00295 GroupInfo *tmpInfo = new GroupInfo [numGroups];
00296 if (!p.isUnpacking()) {
00297 for(i=0;i<numGroups;i++) {
00298 tmpInfo[i].gID = (*CkpvAccess(_groupIDTable))[i];
00299 TableEntry ent = CkpvAccess(_groupTable)->find(tmpInfo[i].gID);
00300 tmpInfo[i].MigCtor = _chareTable[ent.getcIdx()]->migCtor;
00301 tmpInfo[i].DefCtor = _chareTable[ent.getcIdx()]->defCtor;
00302 strncpy(tmpInfo[i].name,_chareTable[ent.getcIdx()]->name,255);
00303
00304
00305 if(tmpInfo[i].MigCtor==-1) {
00306 char buf[512];
00307 sprintf(buf,"Group %s needs a migration constructor and PUP'er routine for restart.\n", tmpInfo[i].name);
00308 CkAbort(buf);
00309 }
00310 }
00311 }
00312 for (i=0; i<numGroups; i++) p|tmpInfo[i];
00313
00314 for(i=0;i<numGroups;i++)
00315 {
00316 CkGroupID gID = tmpInfo[i].gID;
00317 if (p.isUnpacking()) {
00318
00319 int eIdx = tmpInfo[i].MigCtor;
00320
00321 if (eIdx == -1) {
00322 CkPrintf("[%d] ERROR> Group %s's migration constructor is not defined!\n", CkMyPe(), tmpInfo[i].name); CkAbort("Abort");
00323 }
00324 void *m = CkAllocSysMsg();
00325 envelope* env = UsrToEnv((CkMessage *)m);
00326 if(create)
00327 CkCreateLocalGroup(gID, eIdx, env);
00328 }
00329 IrrGroup *gobj = CkpvAccess(_groupTable)->find(gID).getObj();
00330
00331 if(!create)
00332 gobj->mlogData->teamRecoveryFlag = 1;
00333 gobj->pup(p);
00334
00335 }
00336 delete [] tmpInfo;
00337 }
00338
00339
00340 void CkPupNodeGroupData(PUP::er &p, CmiBool create)
00341 {
00342 int numNodeGroups, i;
00343 if (!p.isUnpacking()) {
00344 numNodeGroups = CksvAccess(_nodeGroupIDTable).size();
00345 }
00346 p|numNodeGroups;
00347 if (p.isUnpacking()) {
00348 if(CkMyPe()==0){ CksvAccess(_numNodeGroups) = numNodeGroups+1; }
00349 else { CksvAccess(_numNodeGroups) = 1; }
00350 }
00351 if(CkMyPe() == 3)
00352 CkPrintf("[%d] CkPupNodeGroupData %s: numNodeGroups = %d\n",CkMyPe(),p.typeString(),numNodeGroups);
00353
00354 GroupInfo *tmpInfo = new GroupInfo [numNodeGroups];
00355 if (!p.isUnpacking()) {
00356 for(i=0;i<numNodeGroups;i++) {
00357 tmpInfo[i].gID = CksvAccess(_nodeGroupIDTable)[i];
00358 TableEntry ent2 = CksvAccess(_nodeGroupTable)->find(tmpInfo[i].gID);
00359 tmpInfo[i].MigCtor = _chareTable[ent2.getcIdx()]->migCtor;
00360 if(tmpInfo[i].MigCtor==-1) {
00361 char buf[512];
00362 sprintf(buf,"NodeGroup %s either need a migration constructor and\n\
00363 declared as [migratable] in .ci to be able to checkpoint.",\
00364 _chareTable[ent2.getcIdx()]->name);
00365 CkAbort(buf);
00366 }
00367 }
00368 }
00369 for (i=0; i<numNodeGroups; i++) p|tmpInfo[i];
00370 for (i=0;i<numNodeGroups;i++) {
00371 CkGroupID gID = tmpInfo[i].gID;
00372 if (p.isUnpacking()) {
00373
00374 int eIdx = tmpInfo[i].MigCtor;
00375 void *m = CkAllocSysMsg();
00376 envelope* env = UsrToEnv((CkMessage *)m);
00377 if(create){
00378 CkCreateLocalNodeGroup(gID, eIdx, env);
00379 }
00380 }
00381 TableEntry ent2 = CksvAccess(_nodeGroupTable)->find(gID);
00382 IrrGroup *obj = ent2.getObj();
00383 obj->pup(p);
00384 if(CkMyPe() == 3) CkPrintf("Nodegroup PUP'ed: gid = %d, name = %s\n",
00385 obj->ckGetGroupID().idx,
00386 _chareTable[ent2.getcIdx()]->name);
00387 }
00388 delete [] tmpInfo;
00389 }
00390 #else
00391
00392 void CkPupGroupData(PUP::er &p)
00393 {
00394 int numGroups, i;
00395
00396 if (!p.isUnpacking()) {
00397 numGroups = CkpvAccess(_groupIDTable)->size();
00398 }
00399 p|numGroups;
00400 if (p.isUnpacking()) {
00401 if(CkMyPe()==0)
00402 CkpvAccess(_numGroups) = numGroups+1;
00403 else
00404 CkpvAccess(_numGroups) = 1;
00405 }
00406 DEBCHK("[%d] CkPupGroupData %s: numGroups = %d\n", CkMyPe(),p.typeString(),numGroups);
00407
00408 GroupInfo *tmpInfo = new GroupInfo [numGroups];
00409 if (!p.isUnpacking()) {
00410 for(i=0;i<numGroups;i++) {
00411 tmpInfo[i].gID = (*CkpvAccess(_groupIDTable))[i];
00412 TableEntry ent = CkpvAccess(_groupTable)->find(tmpInfo[i].gID);
00413 tmpInfo[i].MigCtor = _chareTable[ent.getcIdx()]->migCtor;
00414 tmpInfo[i].DefCtor = _chareTable[ent.getcIdx()]->defCtor;
00415 strncpy(tmpInfo[i].name,_chareTable[ent.getcIdx()]->name,255);
00416 DEBCHK("[%d] CkPupGroupData: %s group %s \n",
00417 CkMyPe(), p.typeString(), tmpInfo[i].name);
00418
00419 if(tmpInfo[i].MigCtor==-1) {
00420 char buf[512];
00421 sprintf(buf,"Group %s needs a migration constructor and PUP'er routine for restart.\n", tmpInfo[i].name);
00422 CkAbort(buf);
00423 }
00424 }
00425 }
00426 for (i=0; i<numGroups; i++) p|tmpInfo[i];
00427
00428 for(i=0;i<numGroups;i++)
00429 {
00430 CkGroupID gID = tmpInfo[i].gID;
00431 if (p.isUnpacking()) {
00432
00433 int eIdx = tmpInfo[i].MigCtor;
00434
00435 if (eIdx == -1) {
00436 CkPrintf("[%d] ERROR> Group %s's migration constructor is not defined!\n", CkMyPe(), tmpInfo[i].name); CkAbort("Abort");
00437 }
00438 void *m = CkAllocSysMsg();
00439 envelope* env = UsrToEnv((CkMessage *)m);
00440 CkCreateLocalGroup(gID, eIdx, env);
00441 }
00442 IrrGroup *gobj = CkpvAccess(_groupTable)->find(gID).getObj();
00443
00444 gobj->pup(p);
00445 DEBCHK("Group PUP'ed: gid = %d, name = %s\n",
00446 gobj->ckGetGroupID().idx, tmpInfo[i].name);
00447 }
00448 delete [] tmpInfo;
00449 }
00450
00451
00452 void CkPupNodeGroupData(PUP::er &p)
00453 {
00454 int numNodeGroups, i;
00455 if (!p.isUnpacking()) {
00456 numNodeGroups = CksvAccess(_nodeGroupIDTable).size();
00457 }
00458 p|numNodeGroups;
00459 if (p.isUnpacking()) {
00460 if(CkMyPe()==0){ CksvAccess(_numNodeGroups) = numNodeGroups+1; }
00461 else { CksvAccess(_numNodeGroups) = 1; }
00462 }
00463 DEBCHK("[%d] CkPupNodeGroupData %s: numNodeGroups = %d\n",CkMyPe(),p.typeString(),numNodeGroups);
00464
00465 GroupInfo *tmpInfo = new GroupInfo [numNodeGroups];
00466 if (!p.isUnpacking()) {
00467 for(i=0;i<numNodeGroups;i++) {
00468 tmpInfo[i].gID = CksvAccess(_nodeGroupIDTable)[i];
00469 TableEntry ent2 = CksvAccess(_nodeGroupTable)->find(tmpInfo[i].gID);
00470 tmpInfo[i].MigCtor = _chareTable[ent2.getcIdx()]->migCtor;
00471 if(tmpInfo[i].MigCtor==-1) {
00472 char buf[512];
00473 sprintf(buf,"NodeGroup %s either need a migration constructor and\n\
00474 declared as [migratable] in .ci to be able to checkpoint.",\
00475 _chareTable[ent2.getcIdx()]->name);
00476 CkAbort(buf);
00477 }
00478 }
00479 }
00480 for (i=0; i<numNodeGroups; i++) p|tmpInfo[i];
00481 for (i=0;i<numNodeGroups;i++) {
00482 CkGroupID gID = tmpInfo[i].gID;
00483 if (p.isUnpacking()) {
00484
00485 int eIdx = tmpInfo[i].MigCtor;
00486 void *m = CkAllocSysMsg();
00487 envelope* env = UsrToEnv((CkMessage *)m);
00488 CkCreateLocalNodeGroup(gID, eIdx, env);
00489 }
00490 TableEntry ent2 = CksvAccess(_nodeGroupTable)->find(gID);
00491 IrrGroup *obj = ent2.getObj();
00492 obj->pup(p);
00493 DEBCHK("Nodegroup PUP'ed: gid = %d, name = %s\n",
00494 obj->ckGetGroupID().idx,
00495 _chareTable[ent2.getcIdx()]->name);
00496 }
00497 delete [] tmpInfo;
00498 }
00499 #endif
00500
00501
00502 void CkPupArrayElementsData(PUP::er &p, int notifyListeners)
00503 {
00504 int i;
00505
00506 int numGroups = CkpvAccess(_groupIDTable)->size();
00507
00508
00509 int numElements;
00510 if (!p.isUnpacking()) {
00511 ElementCounter counter;
00512 CKLOCMGR_LOOP(mgr->iterate(counter););
00513 numElements = counter.getCount();
00514 }
00515 p|numElements;
00516
00517 DEBCHK("[%d] CkPupArrayElementsData %s numGroups:%d numElements:%d \n",CkMyPe(),p.typeString(), numGroups, numElements);
00518
00519 if (!p.isUnpacking())
00520 {
00521
00522 CKLOCMGR_LOOP(ElementCheckpointer chk(mgr, p); mgr->iterate(chk););
00523 }
00524 else {
00525
00526
00527 for (int i=0; i<numElements; i++) {
00528 CkGroupID gID;
00529 CkArrayIndex idx;
00530 p|gID;
00531 p|idx;
00532 CkLocMgr *mgr = (CkLocMgr*)CkpvAccess(_groupTable)->find(gID).getObj();
00533 if (notifyListeners){
00534 mgr->resume(idx,p,CmiTrue);
00535 }
00536 else{
00537 mgr->restore(idx,p);
00538 }
00539 }
00540 }
00541
00542 if (notifyListeners)
00543 for(i=0;i<numGroups;i++) {
00544 IrrGroup *obj = CkpvAccess(_groupTable)->find((*CkpvAccess(_groupIDTable))[i]).getObj();
00545 obj->ckJustMigrated();
00546 }
00547 }
00548
00549 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
00550 int CkCountArrayElements(){
00551 int numGroups = CkpvAccess(_groupIDTable)->size();
00552 int i;
00553 ElementCounter counter;
00554 CKLOCMGR_LOOP(mgr->iterate(counter););
00555 int numElements = counter.getCount();
00556 return numElements;
00557 }
00558 #endif
00559
00560 void CkPupProcessorData(PUP::er &p)
00561 {
00562
00563 if(CkMyRank()==0) {
00564 CkPupROData(p);
00565 }
00566
00567
00568 if(CkMyPe()==0) {
00569 CkPupMainChareData(p, NULL);
00570 }
00571
00572
00573 CkPupChareData(p);
00574
00575
00576 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
00577 CkPupGroupData(p,CmiTrue);
00578 #else
00579 CkPupGroupData(p);
00580 #endif
00581
00582
00583 if(CkMyRank()==0) {
00584 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
00585 CkPupNodeGroupData(p,CmiTrue);
00586 #else
00587 CkPupNodeGroupData(p);
00588 #endif
00589 }
00590
00591
00592 CkPupArrayElementsData(p);
00593 }
00594
00595
00596 static void checkpointOne(const char* dirname, CkCallback& cb){
00597 CmiAssert(CkMyPe()==0);
00598 char filename[1024];
00599
00600
00601 sprintf(filename,"%s/RO.dat",dirname);
00602 FILE* fRO = CmiFopen(filename,"wb");
00603 if(!fRO) CkAbort("Failed to create checkpoint file for readonly data!");
00604 PUP::toDisk pRO(fRO);
00605 int _numPes = CkNumPes();
00606 pRO|_numPes;
00607 CkPupROData(pRO);
00608 pRO|cb;
00609 CmiFclose(fRO);
00610
00611
00612 {
00613 sprintf(filename,"%s/MainChares.dat",dirname);
00614 FILE* fMain = CmiFopen(filename,"wb");
00615 if(!fMain) CkAbort("Failed to open checkpoint file for mainchare data!");
00616 PUP::toDisk pMain(fMain);
00617 CkPupMainChareData(pMain, NULL);
00618 CmiFclose(fMain);
00619 }
00620 }
00621
00622 void CkRemoveArrayElements()
00623 {
00624 int i;
00625 int numGroups = CkpvAccess(_groupIDTable)->size();
00626 CKLOCMGR_LOOP(mgr->flushAllRecs(););
00627
00628
00629
00630
00631
00632
00633
00634
00635 }
00636
00637
00638
00639
00640
00641
00642
00643
00644
00645
00646
00647
00648
00649
00650
00651 void CkStartCheckpoint(const char* dirname,const CkCallback& cb)
00652 {
00653 CkPrintf("[%d] Checkpoint starting in %s\n", CkMyPe(), dirname);
00654
00655
00656 CProxy_CkCheckpointMgr(_sysChkptMgr).Checkpoint(dirname, cb);
00657 }
00658
00666 void CkRestartMain(const char* dirname, CkArgMsg *args){
00667 int i;
00668 char filename[1024];
00669 CkCallback cb;
00670
00671 _inrestart = 1;
00672 _restarted = 1;
00673
00674
00675 sprintf(filename,"%s/RO.dat",dirname);
00676 FILE* fRO = CmiFopen(filename,"rb");
00677 if(!fRO) CkAbort("Failed to open checkpoint file for readonly data!");
00678 int _numPes = -1;
00679 PUP::fromDisk pRO(fRO);
00680 pRO|_numPes;
00681 CkPupROData(pRO);
00682 pRO|cb;
00683 CmiFclose(fRO);
00684 DEBCHK("[%d]CkRestartMain: readonlys restored\n",CkMyPe());
00685 _oldNumPes = _numPes;
00686
00687 CmiNodeBarrier();
00688
00689
00690 sprintf(filename,"%s/MainChares.dat",dirname);
00691 FILE* fMain = CmiFopen(filename,"rb");
00692 if(fMain && CkMyPe()==0){
00693 PUP::fromDisk pMain(fMain);
00694 CkPupMainChareData(pMain, args);
00695 CmiFclose(fMain);
00696 DEBCHK("[%d]CkRestartMain: mainchares restored\n",CkMyPe());
00697
00698 }
00699
00700 #ifndef CMK_CHARE_USE_PTR
00701
00702 if(CkNumPes() == _numPes) {
00703 sprintf(filename,"%s/Chares_%d.dat",dirname,CkMyPe());
00704 FILE* fChares = CmiFopen(filename,"rb");
00705 if(!fChares) CkAbort("Failed to open checkpoint file for chares!");
00706 PUP::fromDisk pChares(fChares);
00707 CkPupChareData(pChares);
00708 CmiFclose(fChares);
00709 _chareRestored = 1;
00710 }
00711 #endif
00712
00713
00714
00715
00716 if(CkNumPes() != _numPes)
00717 sprintf(filename,"%s/Groups_0.dat",dirname);
00718 else
00719 sprintf(filename,"%s/Groups_%d.dat",dirname,CkMyPe());
00720 FILE* fGroups = CmiFopen(filename,"rb");
00721 if(!fGroups) CkAbort("Failed to open checkpoint file for group table!");
00722 PUP::fromDisk pGroups(fGroups);
00723 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
00724 CkPupGroupData(pGroups,CmiTrue);
00725 #else
00726 CkPupGroupData(pGroups);
00727 #endif
00728 CmiFclose(fGroups);
00729
00730
00731
00732 if(CkMyRank()==0){
00733 if(CkNumPes() != _numPes)
00734 sprintf(filename,"%s/NodeGroups_0.dat",dirname);
00735 else
00736 sprintf(filename,"%s/NodeGroups_%d.dat",dirname,CkMyNode());
00737 FILE* fNodeGroups = CmiFopen(filename,"rb");
00738 if(!fNodeGroups) CkAbort("Failed to open checkpoint file for nodegroup table!");
00739 PUP::fromDisk pNodeGroups(fNodeGroups);
00740 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
00741 CkPupNodeGroupData(pNodeGroups,CmiTrue);
00742 #else
00743 CkPupNodeGroupData(pNodeGroups);
00744 #endif
00745 CmiFclose(fNodeGroups);
00746 }
00747
00748
00749
00750 DEBCHK("[%d]Number of PE: %d -> %d\n",CkMyPe(),_numPes,CkNumPes());
00751 if(CkMyPe() < _numPes)
00752 for (i=0; i<_numPes;i++) {
00753 if (i%CkNumPes() == CkMyPe()) {
00754 sprintf(filename,"%s/arr_%d.dat",dirname, i);
00755 FILE *datFile=CmiFopen(filename,"rb");
00756 if (datFile==NULL) CkAbort("Could not read data file");
00757 PUP::fromDisk p(datFile);
00758 CkPupArrayElementsData(p);
00759 CmiFclose(datFile);
00760 }
00761 }
00762
00763 _inrestart = 0;
00764
00765 _initDone();
00766
00767 if(CkMyPe()==0) {
00768 CmiPrintf("[%d]CkRestartMain done. sending out callback.\n",CkMyPe());
00769 cb.send();
00770 }
00771 }
00772
00773
00774 class CkCheckpointInit : public Chare {
00775 public:
00776 CkCheckpointInit(CkArgMsg *msg) {
00777 _sysChkptMgr = CProxy_CkCheckpointMgr::ckNew();
00778 delete msg;
00779 }
00780 CkCheckpointInit(CkMigrateMessage *m) {delete m;}
00781 };
00782
00783 #include "CkCheckpoint.def.h"
00784