00001 #ifndef _CK_MEM_CHECKPT_
00002 #define _CK_MEM_CHECKPT_
00003
#include "CkMemCheckpoint.decl.h"

#include <cstdio>
#include <cstdlib>
#include <string>
#include <vector>

#if CMK_USE_MKSTEMP
#include <unistd.h>  // close(), used on the descriptor returned by mkstemp()
#endif
00005
00006 extern CkGroupID ckCheckPTGroupID;
00007 class CkArrayCheckPTReqMessage: public CMessage_CkArrayCheckPTReqMessage {
00008 public:
00009 CkArrayCheckPTReqMessage() {}
00010 };
00011
00012 class CkArrayCheckPTMessage: public CMessage_CkArrayCheckPTMessage {
00013 public:
00014 CkArrayID aid;
00015 CkGroupID locMgr;
00016 CkArrayIndex index;
00017 double *packData;
00018 int bud1, bud2;
00019 size_t len;
00020 bool cp_flag;
00021 };
00022
00023
00024 class CkProcCheckPTMessage: public CMessage_CkProcCheckPTMessage {
00025 public:
00026 int pe;
00027 int reportPe;
00028 int failedpe;
00029 int cur_restart_phase;
00030 size_t len;
00031 int pointer;
00032 char *packData;
00033 };
00034
00035
00036 class CkCheckPTInfo {
00037 friend class CkMemCheckPT;
00038 protected:
00039 CkArrayID aid;
00040 CkGroupID locMgr;
00041 CkArrayIndex index;
00042 int pNo;
00043 public:
00044 CkCheckPTInfo();
00045 CkCheckPTInfo(CkArrayID a, CkGroupID loc, CkArrayIndex idx, int pno):
00046 aid(a), locMgr(loc), index(idx), pNo(pno) {}
00047 virtual ~CkCheckPTInfo() {}
00048 virtual void updateBuffer(CkArrayCheckPTMessage *data) = 0;
00049 virtual CkArrayCheckPTMessage * getCopy() = 0;
00050 virtual void updateBuddy(int b1, int b2) = 0;
00051 virtual size_t getSize() = 0;
00052 };
00053
00055 #define CkCheckPoint_inMEM 1
00056 #define CkCheckPoint_inDISK 2
00057
00058 class CkCheckPTEntry{
00059 std::vector<CkArrayCheckPTMessage *> data;
00060 std::string fname;
00061 public:
00062 int bud1, bud2;
00063 int where;
00064 void init(int _where, int idx)
00065 {
00066 data.resize(2, NULL);
00067 where = _where;
00068 if(where == CkCheckPoint_inDISK)
00069 {
00070 #if CMK_USE_MKSTEMP
00071 #if CMK_CONVERSE_MPI
00072 fname = "/tmp/ckpt" + std::to_string(CmiMyPartition()) + "-" + std::to_string(CkMyPe()) + "-" + std::to_string(idx) + "-XXXXXX";
00073 #else
00074 fname = "/tmp/ckpt" + std::to_string(CkMyPe()) + "-" + std::to_string(idx) + "-XXXXXX";
00075 #endif
00076 if(mkstemp(&fname[0])<0)
00077 {
00078 CmiAbort("mkstemp fail in checkpoint");
00079 }
00080 #else
00081 fname = tmpnam(NULL);
00082 #endif
00083 }
00084 }
00085
00086 void updateBuffer(int pointer, CkArrayCheckPTMessage * msg)
00087 {
00088 if(where == CkCheckPoint_inDISK)
00089 {
00090 envelope *env = UsrToEnv(msg);
00091 CkUnpackMessage(&env);
00092 data[pointer] = (CkArrayCheckPTMessage *)EnvToUsr(env);
00093 FILE *f = fopen(fname.c_str(),"wb");
00094 PUP::toDisk p(f);
00095 CkPupMessage(p, (void **)&msg);
00096
00097
00098 fclose(f);
00099 bud1 = msg->bud1;
00100 bud2 = msg->bud2;
00101 delete msg;
00102 }else
00103 {
00104 CmiAssert(where == CkCheckPoint_inMEM);
00105 CmiAssert(msg!=NULL);
00106 delete data[pointer];
00107 data[pointer] = msg;
00108 bud1 = msg->bud1;
00109 bud2 = msg->bud2;
00110 }
00111 }
00112
00113 CkArrayCheckPTMessage * getCopy(int pointer)
00114 {
00115 if(where == CkCheckPoint_inDISK)
00116 {
00117 CkArrayCheckPTMessage *msg;
00118 FILE *f = fopen(fname.c_str(),"rb");
00119 PUP::fromDisk p(f);
00120 CkPupMessage(p, (void **)&msg);
00121 fclose(f);
00122 msg->bud1 = bud1;
00123 msg->bud2 = bud2;
00124 return msg;
00125 }else
00126 {
00127 CmiAssert(where == CkCheckPoint_inMEM);
00128 if (data[pointer] == NULL) {
00129 CmiPrintf("[%d] recoverArrayElements: element does not have checkpoint data.", CkMyPe());
00130 CmiAbort("Abort!");
00131 }
00132 return (CkArrayCheckPTMessage *)CkCopyMsg((void **)&data[pointer]);
00133 }
00134 }
00135 };
00136
00137
00138 class CkMemCheckPT: public CBase_CkMemCheckPT {
00139 public:
00140 CkMemCheckPT(int w);
00141 CkMemCheckPT(CkMigrateMessage *m):CBase_CkMemCheckPT(m) {};
00142 virtual ~CkMemCheckPT();
00143 void pup(PUP::er& p);
00144 inline int BuddyPE(int pe);
00145 void doItNow(int sp, CkCallback &&);
00146 void restart(int diePe);
00147 void removeArrayElements();
00148 void createEntry(CkArrayID aid, CkGroupID loc, CkArrayIndex index, int buddy);
00149 void recvData(CkArrayCheckPTMessage *);
00150 void gotData();
00151 void recvProcData(CkProcCheckPTMessage *);
00152 void cpFinish();
00153 void syncFiles(void);
00154 void report();
00155 void recoverBuddies();
00156 void recoverEntry(CkArrayCheckPTMessage *msg);
00157 void recoverArrayElements();
00158 void quiescence(CkCallback &&);
00159 void resetReductionMgr();
00160 void finishUp();
00161 void gotReply();
00162 void inmem_restore(CkArrayCheckPTMessage *m);
00163 void updateLocations(int n, CkGroupID *g, CkArrayIndex *idx, CmiUInt8 *id, int nowOnPe);
00164 void resetLB(int diepe);
00165 bool isFailed(int pe);
00166 void pupAllElements(PUP::er &p);
00167 void startArrayCheckpoint();
00168 void recvArrayCheckpoint(CkArrayCheckPTMessage *m);
00169 void recoverAll(CkArrayCheckPTMessage * msg, std::vector<CkGroupID> * gmap=NULL, std::vector<CkArrayIndex> * imap=NULL);
00170 public:
00171 static CkCallback cpCallback;
00172
00173 static bool inRestarting;
00174 static bool inCheckpointing;
00175 static bool inLoadbalancing;
00176 static double startTime;
00177 static char* stage;
00178
00179 private:
00180 std::vector<CkCheckPTInfo *> ckTable;
00181 CkCheckPTEntry chkpTable[2];
00182
00183 int recvCount, peCount;
00184 int expectCount, ackCount;
00185 int recvChkpCount;
00187 int cpStarter;
00188 std::vector<int> failedPes;
00189 int thisFailedPe;
00190
00192 int where;
00193 private:
00194 void initEntry();
00195 inline bool isMaster(int pe);
00196
00197 void failed(int pe);
00198 int totalFailed();
00199
00200 void sendProcData();
00201 };
00202
00203
00204 void CkMemRestart(const char *, CkArgMsg *);
00205
00206
00207
00208 void CkStartMemCheckpoint(CkCallback &cb);
00209
00210
00211 extern "C" int CkInRestarting();
00212 extern "C" int CkInLdb();
00213 extern "C" void CkSetInLdb();
00214 extern "C" void CkResetInLdb();
00215
00216 extern "C" int CkHasCheckpoints();
00217
00218 void CkDieNow();
00219
00220 #endif