
00001 /* 00002 Charm++ File: Checkpoint Library 00003 added 01/03/2003 by Chao Huang, chuang10@uiuc.edu 00004 00005 CkStartCheckpoint() is a function to start the procedure 00006 of saving the status of a Charm++ program into disk files. 00007 A corresponding restarting mechanism can later use the 00008 files saved to restore the execution. A callback should 00009 be provided to continue after the checkpoint is done. 00010 00011 Checkpoint manager is a Group to aid the saving and 00012 restarting of Charm++ programs. ... 00013 00014 --- Updated 12/14/2003 by Gengbin, gzheng@uiuc.edu 00015 rewrote to allow code reuse with following 5 functions, 00016 these functions each handle both packing and unpacking of a system data: 00017 void CkPupROData(PUP::er &p); 00018 void CkPupMainChareData(PUP::er &p); 00019 void CkPupGroupData(PUP::er &p); 00020 void CkPupNodeGroupData(PUP::er &p); 00021 void CkPupArrayElementsData(PUP::er &p); 00022 Completely changed the data file format for array elements to become 00023 one file for each processor. 00024 Two main checkpoint/restart subroutines are greatly simplified. 
00025 */ 00026 #ifndef _CKCHECKPOINT_H 00027 #define _CKCHECKPOINT_H 00028 00029 #include "CkCheckpoint.decl.h" 00030 // loop over all CkLocMgr and do "code" 00031 #define CKLOCMGR_LOOP(code) \ 00032 for(i=0;i<numGroups;i++) { \ 00033 IrrGroup *obj = CkpvAccess(_groupTable)->find((*CkpvAccess(_groupIDTable))[i]).getObj(); \ 00034 if(obj->isLocMgr()) { \ 00035 CkLocMgr *mgr = (CkLocMgr*)obj; \ 00036 code \ 00037 } \ 00038 } 00039 /*** 00040 * Location iterator that save each location 00041 ***/ 00042 void printIndex(const CkArrayIndex &idx,char *dest); 00043 00047 class CkCheckpointMgr : public CBase_CkCheckpointMgr { 00048 private: 00049 CkCallback restartCB; 00050 double chkptStartTimer; 00051 public: 00052 CkCheckpointMgr() { } 00053 CkCheckpointMgr(CkMigrateMessage *m):CBase_CkCheckpointMgr(m) { } 00054 void Checkpoint(const char *dirname,CkCallback& cb); 00055 void SendRestartCB(CkReductionMsg *m); 00056 void pup(PUP::er& p){ CBase_CkCheckpointMgr::pup(p); p|restartCB; } 00057 }; 00058 00059 // utility functions to pup system global tables 00060 void CkPupROData(PUP::er &p); 00061 void CkPupMainChareData(PUP::er &p, CkArgMsg *args); 00062 void CkPupChareData(PUP::er &p); 00063 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_)) 00064 void CkPupGroupData(PUP::er &p,CmiBool create=CmiTrue); 00065 void CkPupNodeGroupData(PUP::er &p,CmiBool create=CmiTrue); 00066 #else 00067 void CkPupGroupData(PUP::er &p); 00068 void CkPupNodeGroupData(PUP::er &p); 00069 #endif 00070 void CkPupArrayElementsData(PUP::er &p, int notifyListeners=1); 00071 void CkPupProcessorData(PUP::er &p); 00072 void CkRemoveArrayElements(); 00073 //void CkTestArrayElements(); 00074 00075 void CkStartCheckpoint(const char* dirname,const CkCallback& cb); 00076 void CkRestartMain(const char* dirname, CkArgMsg *args); 00077 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_)) 00078 int CkCountArrayElements(); 00079 #endif 00080 00081 // some useful flags (for disk checkpointing) 00082 extern int 
_inrestart; // 1: if is during restart process 00083 extern int _restarted; // 1: if this run is after restart 00084 extern int _oldNumPes; // number of processors in the last run 00085 extern int _chareRestored; // 1: if chare is restored at restart 00086 00087 #endif //_CKCHECKPOINT_H
// 1.5.5  -- stray version footer, presumably left by the documentation extractor; not part of the header