Go to the source code of this file.
Data Structures | |
class | ElementPacker |
class | ElementDistributor |
class | ElementCaller |
Functions | |
const char * | idx2str (const CkArrayIndex &ind) |
const char * | idx2str (const ArrayElement *el) |
void | getGlobalStep (CkGroupID gID) |
bool | fault_aware (CkObjID &recver) |
void | createObjIDList (void *data, ChareMlogData *mlogData) |
bool | isLocal (int destPE) |
Determines if the message is local or not. | |
bool | isTeamLocal (int destPE) |
Determines if the message is group local or not. | |
void | printLog (CkObjID *log) |
void | readKillFile () |
CpvDeclare (Chare *, _currentObj) | |
CpvDeclare (StoredCheckpoint *, _storedCheckpointData) | |
CpvDeclare (char *, _incarnation) | |
CpvDeclare (int, _numEmigrantRecObjs) | |
CpvDeclare (int, _numImmigrantRecObjs) | |
CpvDeclare (std::vector< CkLocation * > *, _immigrantRecObjs) | |
void | setTeamRecovery (void *data, ChareMlogData *mlogData) |
void | unsetTeamRecovery (void *data, ChareMlogData *mlogData) |
void | mpi_restart_crashed (int pe, int rank) |
int | find_spare_mpirank (int pe, int partition) |
void | heartBeatPartner () |
Pings buddy to let it know this PE is alive. | |
void | heartBeatHandler (void *msg) |
Registers last time it knew about the PE that checkpoints on it. | |
void | heartBeatCheckHandler () |
Checks whether the PE that checkpoints on it is still alive. | |
void | partnerFailureHandler (char *msg) |
Receives the notification of a failure and updates pe-to-rank mapping. | |
int | getReverseCheckPointPE () |
Getting the pe that checkpoints on this pe. | |
static void * | doNothingMsg (int *size, void *data, void **remote, int count) |
void | _messageLoggingInit () |
Initialize message logging data structures and register handlers. | |
void | killLocal (void *_dummy, double curWallTime) |
void | readFaultFile () |
Reads the PE that will be failing throughout the execution and the mean time between failures. | |
void | CkDieNow () |
void | sendGroupMsg (envelope *env, int destPE, int _infoIdx) |
Sends a group message that might be a broadcast. | |
void | sendNodeGroupMsg (envelope *env, int destNode, int _infoIdx) |
Sends a nodegroup message that might be a broadcast. | |
void | sendArrayMsg (envelope *env, int destPE, int _infoIdx) |
Sends a message to an array element. | |
void | sendChareMsg (envelope *env, int destPE, int _infoIdx, const CkChareID *pCid) |
Sends a message to a singleton chare. | |
void | sendCommonMsg (CkObjID &recver, envelope *_env, int destPE, int _infoIdx) |
A method to generate the actual ticket requests for groups, nodegroups or arrays. | |
void | sendRemoteMsg (CkObjID &sender, CkObjID &recver, int destPE, MlogEntry *entry, MCount SN, int resend) |
Method that does the actual send by creating a ticket request filling it up and sending it. | |
void | sendLocalMsg (envelope *env, int _infoIdx) |
Function to send a local message. | |
int | preProcessReceivedMessage (envelope *env, Chare **objPointer, MlogEntry **logEntryPointer) |
void | postProcessReceivedMessage (Chare *obj, CkObjID &sender, MCount SN, MlogEntry *entry) |
Updates a few variables once a message has been processed. | |
void | generalCldEnqueue (int destPE, envelope *env, int _infoIdx) |
void | _pingHandler (CkPingMsg *msg) |
void | buildProcessedTicketLog (void *data, ChareMlogData *mlogData) |
void | clearUpMigratedRetainedLists (int PE) |
void | checkpointAlarm (void *_dummy, double curWallTime) |
void | _checkpointRequestHandler (CheckpointRequest *request) |
void | CkStartMlogCheckpoint (CkCallback &cb) |
Starts checkpoint phase at PE 0. | |
void | _startCheckpointHandler (CheckpointBarrierMsg *startMsg) |
Starts checkpoint: send its checkpoint to its partner. | |
void | _endCheckpointHandler (char *msg) |
Finishes checkpoint process by making the callback. | |
void | startMlogCheckpoint (void *_dummy, double curWallTime) |
Starts the checkpoint phase after migration. | |
void | pupArrayElementsSkip (PUP::er &p, bool create, MigrationRecord *listToSkip, int listsize) |
Pups all the array elements in this processor. | |
void | readCheckpointFromDisk (int size, char *data) |
Reads a checkpoint from disk. | |
void | writeCheckpointToDisk (int size, char *data) |
Writes a checkpoint to disk. | |
void | _storeCheckpointHandler (char *msg) |
void | _checkpointAckHandler (CheckPointAck *ackMsg) |
void | CkMlogRestart (const char *dummy, CkArgMsg *dummyMsg) |
Function for restarting the crashed processor. | |
void | CkMlogRestartDouble (void *, double) |
void | _getCheckpointHandler (RestartRequest *restartMsg) |
Gets the stored checkpoint for its buddy processor. | |
void | _recvCheckpointHandler (char *_restartData) |
Receives the checkpoint data from its buddy, restores the state of all the objects and asks everyone else to update its home. | |
void | initializeRestart (void *data, ChareMlogData *mlogData) |
Initializes variables and flags for restarting procedure. | |
void | updateHomePE (void *data, ChareMlogData *mlogData) |
Updates the homePe of chare array elements. | |
void | printLog (CkObjID &recver) |
Prints a processed log. | |
void | printMsg (envelope *env, const char *par) |
Prints information about a message. | |
void | resendMessageForChare (void *data, ChareMlogData *mlogData) |
Resends all the logged messages to a particular chare list. | |
void | _resendMessagesHandler (char *msg) |
Resends messages since last checkpoint to the list of objects included in the request. | |
void | distributeRestartedObjects () |
Distributes objects to accelerate recovery after a failure. | |
void | _sendBackLocationHandler (char *receivedMsg) |
Handler to receive back a location. | |
void | _distributedLocationHandler (char *receivedMsg) |
Handler to update information about an object just received. | |
void | sendDummyMigration (int restartPE, CkGroupID lbID, CkGroupID locMgrID, CkArrayIndexMax &idx, int locationPE) |
This method is used to send messages to a restarted processor to tell it that a particular expected object is not going to reach it. | |
void | sendDummyMigrationCounts (int *dummyCounts) |
This method is used by a restarted processor to tell other processors that they are not going to receive this many objects. | |
void | _dummyMigrationHandler (DummyMigrationMsg *msg) |
This handler is used to process a dummy migration message. | |
void | forAllCharesDo (MlogFn fnPointer, void *data) |
Map function pointed by fnPointer over all the chares living in this processor. | |
void | initMlogLBStep (CkGroupID gid) |
This is the first time Converse is called after AtSync method has been called by every local object. | |
void | pupLocation (CkLocation *loc, CkLocMgr *locMgr, PUP::er &p) |
Pups a location. | |
void | sendBackImmigrantRecObjs () |
Sends back the immigrant recovering objects to their origin PE. | |
void | restoreParallelRecovery (void(*_fnPtr)(void *), void *_centralLb) |
Restores objects after parallel recovery, either by sending back the immigrant objects or by waiting for all emigrant objects to be back. | |
void | startLoadBalancingMlog (void(*_fnPtr)(void *), void *_centralLb) |
Load Balancing. | |
void | finishedCheckpointLoadBalancing () |
void | _receiveMlogLocationHandler (void *buf) |
void | _checkpointBarrierHandler (CheckpointBarrierMsg *barrierMsg) |
Processor 0 receives a contribution from every other processor after checkpoint. | |
void | _checkpointBarrierAckHandler (CheckpointBarrierMsg *msg) |
void | garbageCollectMlogForChare (void *data, ChareMlogData *mlogData) |
Function to remove all messages in the message log of a particular chare. | |
void | garbageCollectMlog () |
Garbage collects the message log and other data structures. | |
void | informLocationHome (CkGroupID locMgrID, CkArrayIndexMax idx, int homePE, int currentPE) |
Method that informs an array element's home processor of its current location. It is a Converse method, used to bypass the Charm++ message logging framework. | |
void | _receiveLocationHandler (CurrentLocationMsg *data) |
void | _getGlobalStepHandler (LBStepMsg *msg) |
void | _recvGlobalStepHandler (LBStepMsg *msg) |
Receives the global step handler from PE 0. | |
void | _messageLoggingExit () |
Function to wrap up performance information. | |
int | getCheckPointPE () |
Getting the pe number of the current processor's buddy. | |
envelope * | copyEnvelope (envelope *env) |
Variables | |
int | _restartFlag = false |
int | _numRestartResponses = 0 |
char * | checkpointDirectory = "." |
int | unAckedCheckpoint = 0 |
int | countUpdateHomeAcks = 0 |
int | teamSize |
int | chkptPeriod |
bool | fastRecovery |
int | parallelRecovery |
char * | killFile |
char * | faultFile |
int | killFlag = 0 |
int | faultFlag = 0 |
int | restartingMlogFlag = 0 |
double | killTime = 0.0 |
double | faultMean |
int | checkpointCount = 0 |
int | diskCkptFlag = 0 |
static char | fName [100] |
int * | numMsgsTarget |
int * | sizeMsgsTarget |
int | totalMsgsTarget |
float | totalMsgsSize |
int | msgLogSize |
int | bufferedDetsSize |
int | storedDetsSize |
int | _pingHandlerIdx |
int | _checkpointRequestHandlerIdx |
int | _storeCheckpointHandlerIdx |
int | _checkpointAckHandlerIdx |
int | _getCheckpointHandlerIdx |
int | _recvCheckpointHandlerIdx |
int | _dummyMigrationHandlerIdx |
int | _getGlobalStepHandlerIdx |
int | _recvGlobalStepHandlerIdx |
int | _updateHomeRequestHandlerIdx |
int | _updateHomeAckHandlerIdx |
int | _resendMessagesHandlerIdx |
int | _receivedDetDataHandlerIdx |
int | _distributedLocationHandlerIdx |
int | _sendBackLocationHandlerIdx |
int | _falseRestart = 0 |
int | onGoingLoadBalancing = 0 |
For testing on clusters we might carry out restarts on a processor without actually starting it: 1 -> false restart, 0 -> restart after an actual crash. | |
void * | centralLb |
void(* | resumeLbFnPtr )(void *) |
int | _receiveMlogLocationHandlerIdx |
int | _checkpointBarrierHandlerIdx |
int | _checkpointBarrierAckHandlerIdx |
int | _startCheckpointIdx |
int | _endCheckpointIdx |
int | donotCountMigration = 0 |
int | countLBMigratedAway = 0 |
int | countLBToMigrate = 0 |
int | migrationDoneCalled = 0 |
int | checkpointBarrierCount = 0 |
int | globalResumeCount = 0 |
CkGroupID | globalLBID |
int | restartDecisionNumber = -1 |
double | lastCompletedAlarm = 0 |
double | lastRestart = 0 |
CkCallback | ckptCallback |
int | _receiveLocationHandlerIdx |
static int | heartBeatHandlerIdx |
static int | heartBeatCheckHandlerIdx |
static int | partnerFailureHandlerIdx |
static double | lastPingTime = -1 |
int | inCkptFlag = 0 |
const char* idx2str | ( | const CkArrayIndex & | ind | ) |
const char* idx2str | ( | const ArrayElement * | el | ) |
void getGlobalStep | ( | CkGroupID | gID | ) |
void createObjIDList | ( | void * | data, | |
ChareMlogData * | mlogData | |||
) |
Determines if the message is local or not.
A message is local if: 1) Both the destination and origin are the same PE.
Determines if the message is group local or not.
A message is group local if: 1) The destination and origin PEs belong to the same team in team-based message logging.
void printLog | ( | CkObjID * | log | ) |
void readKillFile | ( | ) |
CpvDeclare | ( | Chare * | , | |
_currentObj | ||||
) |
CpvDeclare | ( | StoredCheckpoint * | , | |
_storedCheckpointData | ||||
) |
CpvDeclare | ( | char * | , | |
_incarnation | ||||
) |
CpvDeclare | ( | int | , | |
_numEmigrantRecObjs | ||||
) |
CpvDeclare | ( | int | , | |
_numImmigrantRecObjs | ||||
) |
CpvDeclare | ( | std::vector< CkLocation * > * | , | |
_immigrantRecObjs | ||||
) |
void setTeamRecovery | ( | void * | data, | |
ChareMlogData * | mlogData | |||
) |
void unsetTeamRecovery | ( | void * | data, | |
ChareMlogData * | mlogData | |||
) |
void heartBeatPartner | ( | ) |
Pings buddy to let it know this PE is alive.
Used for failure detection.
void heartBeatHandler | ( | void * | msg | ) |
Registers last time it knew about the PE that checkpoints on it.
void heartBeatCheckHandler | ( | ) |
Checks whether the PE that checkpoints on it is still alive.
void partnerFailureHandler | ( | char * | msg | ) |
Receives the notification of a failure and updates pe-to-rank mapping.
int getReverseCheckPointPE | ( | ) |
Getting the pe that checkpoints on this pe.
Definition at line 172 of file ckmessagelogging.C.
void _messageLoggingInit | ( | ) |
Initialize message logging data structures and register handlers.
Definition at line 181 of file ckmessagelogging.C.
References _checkpointAckHandler(), _checkpointAckHandlerIdx, _checkpointBarrierAckHandler(), _checkpointBarrierAckHandlerIdx, _checkpointBarrierHandler(), _checkpointBarrierHandlerIdx, _checkpointRequestHandler(), _checkpointRequestHandlerIdx, _distributedLocationHandler(), _distributedLocationHandlerIdx, _dummyMigrationHandler(), _dummyMigrationHandlerIdx, _endCheckpointHandler(), _endCheckpointIdx, _getCheckpointHandler(), _getCheckpointHandlerIdx, _getGlobalStepHandler(), _getGlobalStepHandlerIdx, _getRestartCheckpointHandler(), _getRestartCheckpointHandlerIdx, _indexBufferedDets, _maxBufferedDets, _numBufferedDets, _phaseBufferedDets, _pingHandler(), _pingHandlerIdx, _receivedDetDataHandler(), _receivedDetDataHandlerIdx, _receivedTNDataHandler(), _receivedTNDataHandlerIdx, _receiveLocationHandler(), _receiveLocationHandlerIdx, _receiveMigrationNoticeAckHandler(), _receiveMigrationNoticeAckHandlerIdx, _receiveMigrationNoticeHandler(), _receiveMigrationNoticeHandlerIdx, _receiveMlogLocationHandler(), _receiveMlogLocationHandlerIdx, _recvCheckpointHandler(), _recvCheckpointHandlerIdx, _recvGlobalStepHandler(), _recvGlobalStepHandlerIdx, _recvRestartCheckpointHandler(), _recvRestartCheckpointHandlerIdx, _removeDeterminantsHandler(), _removeDeterminantsHandlerIdx, _removeProcessedLogHandler(), _removeProcessedLogHandlerIdx, _resendMessagesHandler(), _resendMessagesHandlerIdx, _restartHandler(), _restartHandlerIdx, _sendBackLocationHandler(), _sendBackLocationHandlerIdx, _sendDetsHandler(), _sendDetsHandlerIdx, _sendDetsReplyHandler(), _sendDetsReplyHandlerIdx, _startCheckpointHandler(), _startCheckpointIdx, _storeCheckpointHandler(), _storeCheckpointHandlerIdx, _storeDeterminantsHandler(), _storeDeterminantsHandlerIdx, _updateHomeAckHandler(), _updateHomeAckHandlerIdx, _updateHomeRequestHandler(), _updateHomeRequestHandlerIdx, _verifyAckHandler(), _verifyAckHandlerIdx, _verifyAckRequestHandler(), _verifyAckRequestHandlerIdx, bufferedDetsSize, CcdCallOnCondition(), 
Converse::CkMyPe(), Converse::CkNumPes(), CmiAlloc(), CmiWallTimer(), CqsCreate(), diskCkptFlag, fName, heartBeatCheckHandler(), heartBeatCheckHandlerIdx, heartBeatHandler(), heartBeatHandlerIdx, heartBeatPartner(), int, lastCompletedAlarm, lastRestart, msgLogSize, numDets, numDupDets, numMsgsTarget, numPiggyDets, partnerFailureHandler(), partnerFailureHandlerIdx, sizeMsgsTarget, storedDetsSize, totalMsgsSize, totalMsgsTarget, and traceRegisterUserEvent().
void killLocal | ( | void * | _dummy, | |
double | curWallTime | |||
) |
void readFaultFile | ( | ) |
Reads the PE that will be failing throughout the execution and the mean time between failures.
We assume an exponential distribution for the mean-time-between-failures.
Definition at line 395 of file ckmessagelogging.C.
References CcdCallFnAfter(), Converse::CkMyPe(), faultFile, faultMean, and killLocal().
void CkDieNow | ( | ) |
Definition at line 425 of file ckmessagelogging.C.
References CcdCallFnAfter(), CmiMyPe(), CmiPrintf(), CmiWallTimer(), killLocal(), and killTime.
Sends a group message that might be a broadcast.
Definition at line 442 of file ckmessagelogging.C.
References CkCopyMsg(), Converse::CkMyPe(), CmiMyPe(), CkObjID::data, EnvToUsr(), envelope::getGroupNum(), _ObjectID::group, _ObjectID::id, _ObjectID::onPE, sendCommonMsg(), sendGroupMsg(), CkObjID::type, TypeGroup, TypeInvalid, and UsrToEnv().
Sends a nodegroup message that might be a broadcast.
Definition at line 473 of file ckmessagelogging.C.
References CkCopyMsg(), Converse::CkMyPe(), CkObjID::data, EnvToUsr(), envelope::getGroupNum(), _ObjectID::group, _ObjectID::id, _ObjectID::onPE, sendCommonMsg(), sendNodeGroupMsg(), CkObjID::type, TypeInvalid, TypeNodeGroup, and UsrToEnv().
Sends a message to an array element.
Definition at line 503 of file ckmessagelogging.C.
References _ObjectID::array, CkArrayIndexBase::asChild(), Converse::CkMyPe(), CkObjID::data, envelope::getArrayMgr(), _ObjectID::s_array::id, _ObjectID::s_array::idx, sendCommonMsg(), CkObjID::toString(), CkObjID::type, and TypeArray.
Sends a message to a singleton chare.
Definition at line 524 of file ckmessagelogging.C.
References _ObjectID::chare, Converse::CkMyPe(), CkObjID::data, _ObjectID::id, sendCommonMsg(), CkObjID::toString(), CkObjID::type, TypeArray, and TypeChare.
A method to generate the actual ticket requests for groups, nodegroups or arrays.
Definition at line 544 of file ckmessagelogging.C.
References Converse::CkMyPe(), CmiMemoryCheck(), generalCldEnqueue(), SNToTicket::get(), CkHashtableT< KEY, OBJ >::get(), CkObjID::getObject(), isLocal(), isTeamLocal(), Chare::mlogData, ChareMlogData::nextSN(), sendLocalMsg(), sendMsg(), sendRemoteMsg(), teamSize, ChareMlogData::teamTable, Ticket::TN, CkObjID::toString(), and TypeInvalid.
void sendRemoteMsg | ( | CkObjID & | sender, | |
CkObjID & | recver, | |||
int | destPE, | |||
MlogEntry * | entry, | |||
MCount | SN, | |||
int | resend | |||
) |
Method that does the actual send by creating a ticket request filling it up and sending it.
Definition at line 624 of file ckmessagelogging.C.
References MlogEntry::_infoIdx, ChareMlogData::addLogEntry(), Converse::CkMyPe(), CmiMemoryCheck(), MlogEntry::env, float, generalCldEnqueue(), envelope::getTotalsize(), Chare::mlogData, MLOGFT_totalLogSize, MLOGFT_totalMessages, msgLogSize, numMsgsTarget, sizeMsgsTarget, totalMsgsSize, and totalMsgsTarget.
Referenced by sendCommonMsg().
Function to send a local message.
It first gets a ticket and then enqueues the message. If we are recovering, then the message is enqueued in a delay queue.
Definition at line 667 of file ckmessagelogging.C.
References _skipCldEnqueue(), CmiMemoryCheck(), and CmiMyPe().
Definition at line 714 of file ckmessagelogging.C.
References _getTicket(), _recoveryFlag, addBufferedDeterminant(), ChareMlogData::checkAndStoreSsn(), Converse::CkMyPe(), CmiFree(), CmiMemoryCheck(), Converse::CmiSyncSendAndFree(), CqsDequeue(), CqsEmpty(), CqsEnqueue(), CqsEnqueueGeneral(), fault_aware(), flag, CkObjID::getObject(), envelope::getPriobits(), envelope::getPrioPtr(), envelope::getSrcPe(), envelope::getTotalsize(), CkObjID::guessPE(), Chare::mlogData, CkObjID::toString(), and ChareMlogData::tProcessed.
Updates a few variables once a message has been processed.
Definition at line 770 of file ckmessagelogging.C.
References Converse::CkMyPe(), CmiMemoryCheck(), MlogEntry::env, CkObjID::guessPE(), Chare::mlogData, and ChareMlogData::tProcessed.
Definition at line 777 of file ckmessagelogging.C.
References _noCldNodeEnqueue(), _skipCldEnqueue(), and TypeNodeGroup.
void _pingHandler | ( | CkPingMsg * | msg | ) |
Definition at line 793 of file ckmessagelogging.C.
References Converse::CkMyPe(), CmiFree(), and RestartRequest::PE.
void buildProcessedTicketLog | ( | void * | data, | |
ChareMlogData * | mlogData | |||
) |
void clearUpMigratedRetainedLists | ( | int | PE | ) |
void checkpointAlarm | ( | void * | _dummy, | |
double | curWallTime | |||
) |
Definition at line 807 of file ckmessagelogging.C.
References _checkpointRequestHandlerIdx, CcdCallFnAfter(), checkpointAlarm(), chkptPeriod, Converse::CkMyPe(), CmiInitMsgHeader(), Converse::CmiSyncBroadcastAll(), RestartRequest::header, lastCompletedAlarm, RestartRequest::PE, and request.
void _checkpointRequestHandler | ( | CheckpointRequest * | request | ) |
Definition at line 825 of file ckmessagelogging.C.
References CmiWallTimer(), and startMlogCheckpoint().
void CkStartMlogCheckpoint | ( | CkCallback & | cb | ) |
Starts checkpoint phase at PE 0.
Definition at line 832 of file ckmessagelogging.C.
References _startCheckpointIdx, CmiAlloc(), and Converse::CmiSyncBroadcastAllAndFree().
void _startCheckpointHandler | ( | CheckpointBarrierMsg * | startMsg | ) |
Starts checkpoint: send its checkpoint to its partner.
This checkpointing strategy is NOT connected to the load balancer, hence onGoingLoadBalancing==0.
Definition at line 847 of file ckmessagelogging.C.
References _storeCheckpointHandlerIdx, buf, bufferedDetsSize, checkpointCount, Converse::CkMyPe(), CkPupGroupData(), CkPupNodeGroupData(), CkPupROData(), CmiAlloc(), CmiFree(), CmiMemoryCheck(), CmiMyPe(), Converse::CmiSyncSendAndFree(), CmiTimer(), CmiWallTimer(), CheckPointDataMsg::dataSize, dataSize, garbageCollectMlog(), getCheckPointPE(), _ckGroupID::idx, inCkptFlag, msgLogSize, CheckPointDataMsg::PE, pupArrayElementsSkip(), PUP::sizer::size(), storedDetsSize, and unAckedCheckpoint.
Referenced by _messageLoggingInit().
void _endCheckpointHandler | ( | char * | msg | ) |
Finishes checkpoint process by making the callback.
Definition at line 926 of file ckmessagelogging.C.
References CmiFree(), and CkCallback::send().
Referenced by _messageLoggingInit().
void startMlogCheckpoint | ( | void * | _dummy, | |
double | curWallTime | |||
) |
Starts the checkpoint phase after migration.
Definition at line 936 of file ckmessagelogging.C.
References _recoveryFlag, _storeCheckpointHandlerIdx, buf, bufferedDetsSize, buildProcessedTicketLog(), CcdCallFnAfter(), checkpointAlarm(), checkpointCount, chkptPeriod, Converse::CkMyPe(), CkPupGroupData(), CkPupNodeGroupData(), CkPupROData(), CmiAlloc(), CmiMemoryCheck(), CmiMyPe(), Converse::CmiSyncSendAndFree(), CmiTimer(), CmiWallTimer(), CheckPointDataMsg::dataSize, dataSize, forAllCharesDo(), getCheckPointPE(), inCkptFlag, lastCompletedAlarm, msg, msgLogSize, onGoingLoadBalancing, CheckPointDataMsg::PE, processedTicketLog, pupArrayElementsSkip(), PUP::sizer::size(), storedDetsSize, traceUserBracketEvent(), and unAckedCheckpoint.
void pupArrayElementsSkip | ( | PUP::er & | p, | |
bool | create, | |||
MigrationRecord * | listToSkip, | |||
int | listsize | |||
) |
Pups all the array elements in this processor.
Definition at line 1030 of file ckmessagelogging.C.
References CkCountArrayElements(), Converse::CkMyPe(), CmiMyPe(), flag, CkLocMgr::homePe(), _ckGroupID::idx, idx, MigrationRecord::idx, idx2str(), informLocationHome(), PUP::er::isUnpacking(), CkLocMgr::numLocalElements(), and CkLocMgr::resume().
void readCheckpointFromDisk | ( | int | size, | |
char * | data | |||
) |
Reads a checkpoint from disk.
Assumes variable fName contains the name of the file.
Definition at line 1089 of file ckmessagelogging.C.
References fName.
void writeCheckpointToDisk | ( | int | size, | |
char * | data | |||
) |
Writes a checkpoint to disk.
Assumes variable fName contains the name of the file.
Definition at line 1098 of file ckmessagelogging.C.
References fName.
void _storeCheckpointHandler | ( | char * | msg | ) |
Definition at line 1106 of file ckmessagelogging.C.
References _checkpointAckHandlerIdx, Converse::CkMyPe(), CmiFree(), CmiInitMsgHeader(), CmiMyPe(), Converse::CmiSyncSend(), count, CheckPointDataMsg::dataSize, diskCkptFlag, CheckPointDataMsg::header, migratedNoticeList, CheckPointDataMsg::PE, traceUserBracketEvent(), and writeCheckpointToDisk().
void _checkpointAckHandler | ( | CheckPointAck * | ackMsg | ) |
Definition at line 1144 of file ckmessagelogging.C.
References _endCheckpointIdx, Converse::CkMyPe(), CmiAlloc(), CmiFree(), CmiMemoryCheck(), CmiReduce(), CheckPointDataMsg::dataSize, doNothingMsg(), finishedCheckpointLoadBalancing(), inCkptFlag, onGoingLoadBalancing, CheckPointDataMsg::PE, sendRemoveLogRequests(), and unAckedCheckpoint.
void CkMlogRestart | ( | const char * | dummy, | |
CkArgMsg * | dummyMsg | |||
) |
Function for restarting the crashed processor.
It sets the restart flag and contacts the buddy processor to get the latest checkpoint.
Definition at line 1172 of file ckmessagelogging.C.
References _getCheckpointHandlerIdx, _numRestartResponses, _recoveryFlag, _restartFlag, _restartHandlerIdx, Converse::CkMyPe(), Converse::CkNumPes(), CmiInitMsgHeader(), Converse::CmiSyncSend(), CmiWallTimer(), getCheckPointPE(), RestartRequest::header, msg, RestartRequest::PE, and teamSize.
void CkMlogRestartDouble | ( | void * | , | |
double | ||||
) |
Definition at line 1188 of file ckmessagelogging.C.
References CkMlogRestart().
void _getCheckpointHandler | ( | RestartRequest * | restartMsg | ) |
Gets the stored checkpoint for its buddy processor.
Definition at line 1195 of file ckmessagelogging.C.
References _recvCheckpointHandlerIdx, _verifyAckRequestHandlerIdx, StoredCheckpoint::buf, buf, StoredCheckpoint::bufSize, RestartProcessorData::checkPointSize, Converse::CkMyPe(), CmiAlloc(), CmiFree(), CmiInitMsgHeader(), CmiMyPe(), CmiPrintf(), Converse::CmiSyncSend(), Converse::CmiSyncSendAndFree(), CmiTimer(), diskCkptFlag, VerifyAckMsg::fromPE, VerifyAckMsg::header, idx, idx2str(), VerifyAckMsg::index, RestartProcessorData::lbGroupID, migratedNoticeList, VerifyAckMsg::migRecord, msg, RestartProcessorData::PE, StoredCheckpoint::PE, RestartRequest::PE, readCheckpointFromDisk(), RestartProcessorData::restartWallTime, sendCheckpointData(), verifyAckCount, and verifyAckTotal.
void _recvCheckpointHandler | ( | char * | _restartData | ) |
Receives the checkpoint data from its buddy, restores the state of all the objects and asks everyone else to update its home.
Definition at line 1246 of file ckmessagelogging.C.
References _initDone(), _numRestartResponses, _sendDetsHandlerIdx, adjustChkptPeriod, buf, checkpointCount, RestartProcessorData::checkPointSize, chkptPeriod, Converse::CkMyPe(), CkPupGroupData(), CkPupNodeGroupData(), CkPupROData(), CmiAlloc(), CmiFree(), CmiMyPe(), Converse::CmiSyncBroadcastAndFree(), CmiWallTimer(), createObjIDList(), forAllCharesDo(), getGlobalStep(), initializeRestart(), RestartProcessorData::lbGroupID, ResendRequest::numberObjects, RestartProcessorData::numMigratedAwayElements, ResendRequest::PE, RestartProcessorData::PE, pupArrayElementsSkip(), RestartProcessorData::restartWallTime, and PUP::mem::size().
void initializeRestart | ( | void * | data, | |
ChareMlogData * | mlogData | |||
) |
Initializes variables and flags for restarting procedure.
Definition at line 1283 of file ckmessagelogging.C.
References ChareMlogData::receivedTNs, ChareMlogData::resendReplyRecvd, and ChareMlogData::restartFlag.
void updateHomePE | ( | void * | data, | |
ChareMlogData * | mlogData | |||
) |
Updates the homePe of chare array elements.
Definition at line 1291 of file ckmessagelogging.C.
References _ObjectID::array, CkArrayIndexBase::asChild(), CkArrayID::ckLocalBranch(), Converse::CkMyPe(), CkObjID::data, CkLocMgr::getGroupID(), CkArray::getLocMgr(), CkLocMgr::homePe(), _ObjectID::s_array::id, _ObjectID::s_array::idx, informLocationHome(), ChareMlogData::objID, RestartRequest::PE, CkObjID::type, and TypeArray.
void printLog | ( | CkObjID & | recver | ) |
Prints a processed log.
Definition at line 1314 of file ckmessagelogging.C.
References Converse::CkMyPe(), and CkObjID::toString().
void printMsg | ( | envelope * | env, | |
const char * | par | |||
) |
Prints information about a message.
Definition at line 1322 of file ckmessagelogging.C.
References Converse::CkMyPe().
void resendMessageForChare | ( | void * | data, | |
ChareMlogData * | mlogData | |||
) |
Resends all the logged messages to a particular chare list.
data | is of type ResendData which contains the array of objects on the restartedProcessor. | |
mlogData | a particular chare living in this processor. |
Definition at line 1333 of file ckmessagelogging.C.
References Converse::CkMyPe(), Converse::CmiSyncSend(), copyEnvelope(), count, CqsEnqueueGeneral(), MlogEntry::env, ChareMlogData::getMlog(), envelope::getPriobits(), envelope::getPrioPtr(), envelope::getQueueing(), envelope::getTotalsize(), CkQ< T >::length(), ResendData::listObjects, ResendData::numberObjects, ChareMlogData::objID, ResendData::PE, printMsg(), CkObjID::toString(), TypeInvalid, and TypeNodeGroup.
void _resendMessagesHandler | ( | char * | msg | ) |
Resends messages since last checkpoint to the list of objects included in the request.
It also sends stored remote determinants to the particular failed PE.
Definition at line 1387 of file ckmessagelogging.C.
References Converse::CkMyPe(), CmiFree(), CmiMemoryCheck(), CmiResetGlobalReduceSeqID(), CmiWallTimer(), PUP::d, fillTicketForChare(), forAllCharesDo(), isTeamLocal(), lastRestart, ResendData::listObjects, ResendRequest::numberObjects, ResendData::numberObjects, ResendRequest::PE, ResendData::PE, and resendMessageForChare().
void distributeRestartedObjects | ( | ) |
Distributes objects to accelerate recovery after a failure.
Definition at line 1492 of file ckmessagelogging.C.
References Converse::CkMyPe().
void _sendBackLocationHandler | ( | char * | receivedMsg | ) |
Handler to receive back a location.
Definition at line 1502 of file ckmessagelogging.C.
References _ObjectID::array, buf, centralLb, Converse::CkMyPe(), CkObjID::data, CkReductionMgr::decGCount(), CkReductionMgr::decNumEmigrantRecObjs(), donotCountMigration, CkLocMgr::elementRec(), CkLocMgr::homePe(), _ObjectID::s_array::id, idx, informLocationHome(), CkLocMgr::migratableList(), ChareMlogData::objID, DistributeObjectMsg::PE, and CkLocMgr::resume().
void _distributedLocationHandler | ( | char * | receivedMsg | ) |
Handler to update information about an object just received.
Definition at line 1539 of file ckmessagelogging.C.
References _ObjectID::array, buf, Converse::CkMyPe(), CkObjID::data, CkReductionMgr::decGCount(), donotCountMigration, CkLocMgr::elementRec(), globalResumeCount, CkLocMgr::homePe(), _ObjectID::s_array::id, idx, CkReductionMgr::incNumImmigrantRecObjs(), informLocationHome(), CkLocMgr::migratableList(), ChareMlogData::objID, DistributeObjectMsg::PE, CkLocMgr::resume(), and ChareMlogData::toResumeOrNot.
void sendDummyMigration | ( | int | restartPE, | |
CkGroupID | lbID, | |||
CkGroupID | locMgrID, | |||
CkArrayIndexMax & | idx, | |||
int | locationPE | |||
) |
This method is used to send messages to a restarted processor to tell it that a particular expected object is not going to reach it.
Definition at line 1585 of file ckmessagelogging.C.
References _dummyMigrationHandlerIdx, buf, CmiInitMsgHeader(), Converse::CmiSyncSend(), DummyMigrationMsg::flag, DummyMigrationMsg::header, DummyMigrationMsg::idx, DummyMigrationMsg::lbID, DummyMigrationMsg::locationPE, and DummyMigrationMsg::mgrID.
void _dummyMigrationHandler | ( | DummyMigrationMsg * | msg | ) |
This handler is used to process a dummy migration message.
It looks up the load balancer and calls Migrated() on it.
Definition at line 1620 of file ckmessagelogging.C.
References CmiFree(), CmiMyPe(), CmiPrintf(), DummyMigrationMsg::count, DummyMigrationMsg::flag, h, DummyMigrationMsg::idx, _ckGroupID::idx, idx2str(), lb, DummyMigrationMsg::lbID, DummyMigrationMsg::locationPE, DummyMigrationMsg::mgrID, CentralLB::Migrated(), and verifyAckedRequests.
void forAllCharesDo | ( | MlogFn | fnPointer, | |
void * | data | |||
) |
Map function pointed by fnPointer over all the chares living in this processor.
Definition at line 1669 of file ckmessagelogging.C.
References caller, and Chare::mlogData.
void pupLocation | ( | CkLocation * | loc, | |
CkLocMgr * | locMgr, | |||
PUP::er & | p | |||
) |
Pups a location.
Definition at line 1713 of file ckmessagelogging.C.
References IrrGroup::ckGetGroupID(), CkLocation::getIndex(), and idx.
void sendBackImmigrantRecObjs | ( | ) |
Sends back the immigrant recovering objects to their origin PE.
Definition at line 1724 of file ckmessagelogging.C.
References _sendBackLocationHandlerIdx, _ObjectID::array, buf, CkLocMgr::callMethod(), CkMigratable::ckAboutToMigrate(), Converse::CkMyPe(), CmiAlloc(), Converse::CmiSyncSendAndFree(), CkObjID::data, CkReductionMgr::decNumImmigrantRecObjs(), CkLocation::getIndex(), CkLocation::getLocalRecord(), CkLocation::getManager(), _ObjectID::s_array::id, idx, CkLocMgr::inform(), CkLocMgr::lastKnown(), CkLocMgr::migratableList(), ChareMlogData::objID, DistributeObjectMsg::PE, pupLocation(), CkLocMgr::setDuringMigration(), and PUP::sizer::size().
void restoreParallelRecovery | ( | void(*)(void *) | _fnPtr, | |
void * | _centralLb | |||
) |
Restores objects after parallel recovery, either by sending back the immigrant objects or by waiting for all emigrant objects to be back.
Definition at line 1792 of file ckmessagelogging.C.
References centralLb, resumeLbFnPtr, and sendBackImmigrantRecObjs().
void startLoadBalancingMlog | ( | void(*)(void *) | _fnPtr, | |
void * | _centralLb | |||
) |
Load Balancing.
Definition at line 1809 of file ckmessagelogging.C.
References centralLb, CmiMyPe(), CmiWallTimer(), countLBMigratedAway, countLBToMigrate, migrationDoneCalled, resumeLbFnPtr, and startMlogCheckpoint().
void finishedCheckpointLoadBalancing | ( | ) |
Definition at line 1822 of file ckmessagelogging.C.
References _checkpointBarrierHandlerIdx, CmiAlloc(), CmiMyPe(), CmiReduce(), and doNothingMsg().
void _receiveMlogLocationHandler | ( | void * | buf | ) |
Definition at line 1830 of file ckmessagelogging.C.
References Converse::CkMyPe(), CkUnpackMessage(), EnvToUsr(), envelope::getTotalsize(), CkArrayElementMigrateMessage::gid, _ckGroupID::idx, and CkLocMgr::immigrate().
void _checkpointBarrierHandler | ( | CheckpointBarrierMsg * | barrierMsg | ) |
Processor 0 receives a contribution from every other processor after checkpoint.
Definition at line 1843 of file ckmessagelogging.C.
References _checkpointBarrierAckHandlerIdx, CmiAlloc(), CmiFree(), and Converse::CmiSyncBroadcastAllAndFree().
void _checkpointBarrierAckHandler | ( | CheckpointBarrierMsg * | msg | ) |
Definition at line 1854 of file ckmessagelogging.C.
References centralLb, Converse::CkMyPe(), CmiFree(), CmiMyPe(), CmiPrintf(), inCkptFlag, and sendRemoveLogRequests().
void garbageCollectMlogForChare | ( | void * | data, | |
ChareMlogData * | mlogData | |||
) |
Function to remove all messages in the message log of a particular chare.
Definition at line 1872 of file ckmessagelogging.C.
References CkQ< T >::deq(), ChareMlogData::getMlog(), and CkQ< T >::length().
void garbageCollectMlog | ( | ) |
Garbage collects the message log and other data structures.
In the case of a synchronized checkpoint, we use an optimization that avoids having the causal message logging protocol communicate all determinants to the rest of the processors.
Definition at line 1889 of file ckmessagelogging.C.
References _indexBufferedDets, _numBufferedDets, _phaseBufferedDets, Converse::CkMyPe(), forAllCharesDo(), garbageCollectMlogForChare(), CkHashtableIterator::hasNext(), and CkHashtableIterator::next().
void informLocationHome | ( | CkGroupID | locMgrID, | |
CkArrayIndexMax | idx, | |||
int | homePE, | |||
int | currentPE | |||
) |
Method that informs an array element's home processor of its current location. It is a Converse method, used to bypass the Charm++ message logging framework.
Definition at line 1901 of file ckmessagelogging.C.
References _receiveLocationHandlerIdx, Converse::CkMyPe(), CmiInitMsgHeader(), CmiMyPe(), CmiPrintf(), Converse::CmiSyncSend(), CmiWallTimer(), CurrentLocationMsg::fromPE, CurrentLocationMsg::header, _ckGroupID::idx, CurrentLocationMsg::idx, idx2str(), CurrentLocationMsg::locationPE, CurrentLocationMsg::mgrID, and traceUserBracketEvent().
void _receiveLocationHandler | ( | CurrentLocationMsg * | data | ) |
Definition at line 1917 of file ckmessagelogging.C.
References Converse::CkMyPe(), CmiAbort(), CmiFree(), CmiMyPe(), CmiPrintf(), Converse::CmiSyncSend(), CmiWallTimer(), CkLocMgr::elementNrec(), CurrentLocationMsg::fromPE, CurrentLocationMsg::idx, idx2str(), CkLocMgr::inform(), CkLocMgr::lastKnown(), CurrentLocationMsg::locationPE, CurrentLocationMsg::mgrID, and traceUserBracketEvent().
void _getGlobalStepHandler | ( | LBStepMsg * | msg | ) |
Definition at line 1959 of file ckmessagelogging.C.
References _recvGlobalStepHandlerIdx, CmiMyPe(), CmiPrintf(), Converse::CmiSyncSend(), LBStepMsg::fromPE, _ckGroupID::idx, lb, LBStepMsg::lbID, and LBStepMsg::step.
void _recvGlobalStepHandler | ( | LBStepMsg * | msg | ) |
Receives the global step handler from PE 0.
Definition at line 1976 of file ckmessagelogging.C.
References _resendMessagesHandlerIdx, _sendDetsReplyHandler(), Converse::CkMyPe(), CmiAlloc(), CmiFree(), CmiMyPe(), CmiPrintf(), Converse::CmiSyncBroadcastAllAndFree(), CmiWallTimer(), createObjIDList(), distributeRestartedObjects(), fastRecovery, forAllCharesDo(), lb, ResendRequest::numberObjects, ResendRequest::PE, CentralLB::ReceiveDummyMigration(), restartDecisionNumber, and LBStepMsg::step.
int getCheckPointPE | ( | ) |
Getting the pe number of the current processor's buddy.
In the team-based approach each processor might checkpoint in the next team, but currently teams are only meant to reduce memory overhead. Note: function getReverseCheckPointPE performs the reverse map. It must be changed accordingly.
Definition at line 2187 of file ckmessagelogging.C.
References CmiMyPe().
Definition at line 2199 of file ckmessagelogging.C.
References CmiAlloc(), and envelope::getTotalsize().
Definition at line 55 of file ckmessagelogging.C.
char* checkpointDirectory = "." |
Definition at line 57 of file ckmessagelogging.C.
int unAckedCheckpoint = 0 |
Definition at line 58 of file ckmessagelogging.C.
Definition at line 59 of file ckmessagelogging.C.
char* faultFile |
Definition at line 67 of file ckmessagelogging.C.
Definition at line 69 of file ckmessagelogging.C.
Definition at line 70 of file ckmessagelogging.C.
double killTime = 0.0 |
Definition at line 72 of file ckmessagelogging.C.
double faultMean |
Definition at line 73 of file ckmessagelogging.C.
int checkpointCount = 0 |
Definition at line 74 of file ckmessagelogging.C.
char fName[100] [static] |
Definition at line 76 of file ckmessagelogging.C.
Definition at line 94 of file ckmessagelogging.C.
Definition at line 95 of file ckmessagelogging.C.
Definition at line 96 of file ckmessagelogging.C.
Definition at line 97 of file ckmessagelogging.C.
Definition at line 100 of file ckmessagelogging.C.
Definition at line 101 of file ckmessagelogging.C.
Definition at line 102 of file ckmessagelogging.C.
Definition at line 106 of file ckmessagelogging.C.
Definition at line 107 of file ckmessagelogging.C.
Definition at line 108 of file ckmessagelogging.C.
Definition at line 109 of file ckmessagelogging.C.
Definition at line 110 of file ckmessagelogging.C.
Definition at line 111 of file ckmessagelogging.C.
Definition at line 112 of file ckmessagelogging.C.
Definition at line 113 of file ckmessagelogging.C.
Definition at line 114 of file ckmessagelogging.C.
Definition at line 115 of file ckmessagelogging.C.
Definition at line 116 of file ckmessagelogging.C.
Definition at line 117 of file ckmessagelogging.C.
Definition at line 118 of file ckmessagelogging.C.
Definition at line 119 of file ckmessagelogging.C.
Definition at line 120 of file ckmessagelogging.C.
int _falseRestart = 0 |
Definition at line 124 of file ckmessagelogging.C.
For testing on clusters we might carry out restarts on a processor without actually starting it: 1 -> false restart, 0 -> restart after an actual crash.
Definition at line 132 of file ckmessagelogging.C.
void* centralLb |
Definition at line 133 of file ckmessagelogging.C.
void(* resumeLbFnPtr)(void *) |
Definition at line 135 of file ckmessagelogging.C.
Definition at line 136 of file ckmessagelogging.C.
Definition at line 137 of file ckmessagelogging.C.
Definition at line 138 of file ckmessagelogging.C.
Referenced by _messageLoggingInit(), and CkStartMlogCheckpoint().
Definition at line 139 of file ckmessagelogging.C.
Referenced by _checkpointAckHandler(), and _messageLoggingInit().
Definition at line 141 of file ckmessagelogging.C.
int countLBToMigrate = 0 |
Definition at line 142 of file ckmessagelogging.C.
Definition at line 143 of file ckmessagelogging.C.
Definition at line 144 of file ckmessagelogging.C.
Definition at line 146 of file ckmessagelogging.C.
int restartDecisionNumber = -1 |
Definition at line 147 of file ckmessagelogging.C.
double lastCompletedAlarm = 0 |
Definition at line 148 of file ckmessagelogging.C.
double lastRestart = 0 |
Definition at line 149 of file ckmessagelogging.C.
Definition at line 150 of file ckmessagelogging.C.
Definition at line 153 of file ckmessagelogging.C.
int heartBeatHandlerIdx [static] |
Definition at line 156 of file ckmessagelogging.C.
int heartBeatCheckHandlerIdx [static] |
Definition at line 157 of file ckmessagelogging.C.
int partnerFailureHandlerIdx [static] |
Definition at line 158 of file ckmessagelogging.C.
double lastPingTime = -1 [static] |
Definition at line 159 of file ckmessagelogging.C.
int inCkptFlag = 0 |
Definition at line 169 of file ckmessagelogging.C.