Go to the source code of this file.
Data Structures | |
class | ElementPacker |
class | ElementDistributor |
class | ElementCaller |
Functions | |
const char * | idx2str (const CkArrayIndex &ind) |
const char * | idx2str (const ArrayElement *el) |
void | getGlobalStep (CkGroupID gID) |
bool | fault_aware (CkObjID &recver) |
void | sendCheckpointData (int mode) |
Sends the checkpoint to its buddy. | |
void | createObjIDList (void *data, ChareMlogData *mlogData) |
bool | isLocal (int destPE) |
Determines if the message is local or not. | |
bool | isTeamLocal (int destPE) |
Determines if the message is group local or not. | |
void | printLog (TProcessedLog *log) |
Prints a processed log. | |
void | readKillFile () |
CpvDeclare (Chare *, _currentObj) | |
CpvDeclare (StoredCheckpoint *, _storedCheckpointData) | |
CpvDeclare (CkQ< MlogEntry * > *, _delayedLocalMsgs) | |
CpvDeclare (Queue, _outOfOrderMessageQueue) | |
CpvDeclare (Queue, _delayedRemoteMessageQueue) | |
CpvDeclare (char **, _bufferedTicketRequests) | |
CpvDeclare (int *, _numBufferedTicketRequests) | |
CpvDeclare (char *, _localDets) | |
CpvDeclare (CkDeterminantHashtableT *, _remoteDets) | |
CpvDeclare (char *, _incarnation) | |
CpvDeclare (RemoveDeterminantsHeader *, _removeDetsHeader) | |
CpvDeclare (StoreDeterminantsHeader *, _storeDetsHeader) | |
CpvDeclare (int *, _storeDetsSizes) | |
CpvDeclare (char **, _storeDetsPtrs) | |
CpvDeclare (int, _numEmigrantRecObjs) | |
CpvDeclare (int, _numImmigrantRecObjs) | |
CpvDeclare (std::vector< CkLocation * > *, _immigrantRecObjs) | |
void | setTeamRecovery (void *data, ChareMlogData *mlogData) |
Turns on the flag for team recovery that selectively restores particular metadata information. | |
void | unsetTeamRecovery (void *data, ChareMlogData *mlogData) |
Turns off the flag for team recovery. | |
void | mpi_restart_crashed (int pe, int rank) |
int | find_spare_mpirank (int pe, int partition) |
void | heartBeatPartner () |
Pings buddy to let it know this PE is alive. | |
void | heartBeatHandler (void *msg) |
Registers last time it knew about the PE that checkpoints on it. | |
void | heartBeatCheckHandler () |
Checks whether the PE that checkpoints on it is still alive. | |
void | partnerFailureHandler (char *msg) |
Receives the notification of a failure and updates pe-to-rank mapping. | |
int | getReverseCheckPointPE () |
Getting the pe that checkpoints on this pe. | |
static void * | doNothingMsg (int *size, void *data, void **remote, int count) |
void | _messageLoggingInit () |
Initialize message logging data structures and register handlers. | |
void | killLocal (void *_dummy, double curWallTime) |
void | readFaultFile () |
Reads the PE that will be failing throughout the execution and the mean time between failures. | |
void | CkDieNow () |
void | addBufferedDeterminant (CkObjID sender, CkObjID receiver, MCount SN, MCount TN) |
Adds a determinant to the buffered determinants and checks whether the array of buffered determinants needs to be extended. | |
void | sendGroupMsg (envelope *env, int destPE, int _infoIdx) |
Sends a group message that might be a broadcast. | |
void | sendNodeGroupMsg (envelope *env, int destNode, int _infoIdx) |
Sends a nodegroup message that might be a broadcast. | |
void | sendArrayMsg (envelope *env, int destPE, int _infoIdx) |
Sends a message to an array element. | |
void | sendChareMsg (envelope *env, int destPE, int _infoIdx, const CkChareID *pCid) |
Sends a message to a singleton chare. | |
void | sendCommonMsg (CkObjID &recver, envelope *_env, int destPE, int _infoIdx) |
A method to generate the actual ticket requests for groups, nodegroups or arrays. | |
void | sendMsg (CkObjID &sender, CkObjID &recver, int destPE, MlogEntry *entry, MCount SN, MCount TN, int resend) |
Method that does the actual send by creating a ticket request filling it up and sending it. | |
void | sendLocalMsg (envelope *env, int _infoIdx) |
Function to send a local message. | |
void | _removeDeterminantsHandler (char *buffer) |
Removes the determinants after a particular index in the _localDets array. | |
void | _storeDeterminantsHandler (char *buffer) |
Stores the determinants coming from another processor. | |
void | _ticketRequestHandler (TicketRequest *ticketRequest) |
If there are any delayed requests, process them first before processing this request. | |
bool | _getTicket (envelope *env, int *flag) |
Gets a ticket for a recently received message. | |
int | preProcessReceivedMessage (envelope *env, Chare **objPointer, MlogEntry **logEntryPointer) |
void | postProcessReceivedMessage (Chare *obj, CkObjID &sender, MCount SN, MlogEntry *entry) |
Updates a few variables once a message has been processed. | |
void | generalCldEnqueue (int destPE, envelope *env, int _infoIdx) |
void | _pingHandler (CkPingMsg *msg) |
void | buildProcessedTicketLog (void *data, ChareMlogData *mlogData) |
A chare adds the latest ticket number processed. | |
void | clearUpMigratedRetainedLists (int PE) |
void | checkpointAlarm (void *_dummy, double curWallTime) |
void | _checkpointRequestHandler (CheckpointRequest *request) |
void | startMlogCheckpoint (void *_dummy, double curWallTime) |
Starts the checkpoint phase after migration. | |
void | pupArrayElementsSkip (PUP::er &p, bool create, MigrationRecord *listToSkip, int listsize) |
Pups all the array elements in this processor. | |
void | readCheckpointFromDisk (int size, char *data) |
Reads a checkpoint from disk. | |
void | writeCheckpointToDisk (int size, char *data) |
Writes a checkpoint to disk. | |
void | _storeCheckpointHandler (char *msg) |
void | sendRemoveLogRequests () |
Sends out the messages asking senders to throw away message logs below a certain ticket number. | |
void | _checkpointAckHandler (CheckPointAck *ackMsg) |
void | populateDeterminantTable (char *data) |
Inserts all the determinants into a hash table. | |
void | removeProcessedLogs (void *_data, ChareMlogData *mlogData) |
void | _removeProcessedLogHandler (char *requestMsg) |
Removes messages in the log according to the received ticket numbers. | |
void | CkMlogRestart (const char *dummy, CkArgMsg *dummyMsg) |
Function for restarting the crashed processor. | |
void | _restartHandler (RestartRequest *restartMsg) |
Function to restart this processor. | |
void | _getRestartCheckpointHandler (RestartRequest *restartMsg) |
Gets the stored checkpoint but calls another function in the sender. | |
void | _recvRestartCheckpointHandler (char *_restartData) |
Receives the checkpoint coming from its buddy. | |
void | CkMlogRestartDouble (void *, double) |
void | CkMlogRestartLocal () |
void | _getCheckpointHandler (RestartRequest *restartMsg) |
Gets the stored checkpoint for its buddy processor. | |
void | _verifyAckRequestHandler (VerifyAckMsg *verifyRequest) |
void | _verifyAckHandler (VerifyAckMsg *verifyReply) |
void | _recvCheckpointHandler (char *_restartData) |
Receives the checkpoint data from its buddy, restores the state of all the objects and asks everyone else to update its home. | |
void | _updateHomeAckHandler (RestartRequest *updateHomeAck) |
Receives the updateHome ACKs from all other processors. | |
void | initializeRestart (void *data, ChareMlogData *mlogData) |
Initializes variables and flags for restarting procedure. | |
void | updateHomePE (void *data, ChareMlogData *mlogData) |
Updates the homePe of chare array elements. | |
void | _updateHomeRequestHandler (RestartRequest *updateRequest) |
Updates the homePe for all chares in this processor. | |
void | fillTicketForChare (void *data, ChareMlogData *mlogData) |
Fills up the ticket vector for each chare. | |
void | printMsg (envelope *env, const char *par) |
Prints information about a message. | |
void | printDet (Determinant *det, const char *par) |
Prints information about a determinant. | |
void | resendMessageForChare (void *data, ChareMlogData *mlogData) |
Resends all the logged messages to a particular chare list. | |
void | _sendDetsHandler (char *msg) |
Send all remote determinants to a particular failed PE. | |
void | _resendMessagesHandler (char *msg) |
Resends messages since last checkpoint to the list of objects included in the request. | |
MCount | maxVec (std::vector< MCount > *TNvec) |
Returns the maximum ticket from a vector. | |
void | sortVec (std::vector< MCount > *TNvec) |
int | searchVec (std::vector< MCount > *TNVec, MCount searchTN) |
void | processDelayedRemoteMsgQueue () |
Processes the messages in the delayed remote message queue. | |
void | _sendDetsReplyHandler (char *msg) |
Receives determinants stored on remote nodes. | |
void | _receivedDetDataHandler (ReceivedDetData *msg) |
Receives a list of determinants coming from the home PE of a migrated object (parallel restart). | |
void | _receivedTNDataHandler (ReceivedTNData *msg) |
Receives a list of TNs coming from the home PE of a migrated object (parallel restart). | |
void | processReceivedDet (Chare *obj, int listSize, Determinant *listDets) |
Processes the received list of determinants from a particular PE. | |
void | processReceivedTN (Chare *obj, int listSize, MCount *listTNs) |
Processes the received list of tickets from a particular PE. | |
void | distributeRestartedObjects () |
Distributes objects to accelerate recovery after a failure. | |
void | _sendBackLocationHandler (char *receivedMsg) |
Handler to receive back a location. | |
void | _distributedLocationHandler (char *receivedMsg) |
Handler to update information about an object just received. | |
void | sendDummyMigration (int restartPE, CkGroupID lbID, CkGroupID locMgrID, CkArrayIndexMax &idx, int locationPE) |
This method is used to send messages to a restarted processor to tell it that a particular expected object is not going to get to it. | |
void | sendDummyMigrationCounts (int *dummyCounts) |
This method is used by a restarted processor to tell other processors that they are not going to receive this many objects. | |
void | _dummyMigrationHandler (DummyMigrationMsg *msg) |
This handler is used to process a dummy migration message. | |
void | forAllCharesDo (MlogFn fnPointer, void *data) |
Map function pointed by fnPointer over all the chares living in this processor. | |
void | initMlogLBStep (CkGroupID gid) |
This is the first time Converse is called after AtSync method has been called by every local object. | |
void | pupLocation (CkLocation *loc, CkLocMgr *locMgr, PUP::er &p) |
Pups a location. | |
void | sendBackImmigrantRecObjs () |
Sends back the immigrant recovering object to their origin PE. | |
void | restoreParallelRecovery (void(*_fnPtr)(void *), void *_centralLb) |
Restores objects after parallel recovery, either by sending back the immigrant objects or by waiting for all emigrant objects to be back. | |
void | startLoadBalancingMlog (void(*_fnPtr)(void *), void *_centralLb) |
Load Balancing. | |
void | finishedCheckpointLoadBalancing () |
void | sendMlogLocation (int targetPE, envelope *env) |
void | _receiveMigrationNoticeHandler (MigrationNotice *msg) |
void | _receiveMigrationNoticeAckHandler (MigrationNoticeAck *msg) |
void | _receiveMlogLocationHandler (void *buf) |
void | resumeFromSyncRestart (void *data, ChareMlogData *mlogData) |
void | _checkpointBarrierHandler (CheckpointBarrierMsg *barrierMsg) |
Processor 0 receives a contribution from every other processor after checkpoint. | |
void | _checkpointBarrierAckHandler (CheckpointBarrierMsg *msg) |
void | garbageCollectMlogForChare (void *data, ChareMlogData *mlogData) |
Function to remove all messages in the message log of a particular chare. | |
void | garbageCollectMlog () |
Garbage collects the message log and other data structures. | |
void | informLocationHome (CkGroupID locMgrID, CkArrayIndexMax idx, int homePE, int currentPE) |
Method that informs an array element's home processor of its current location. It is a Converse method to bypass the Charm++ message logging framework. | |
void | _receiveLocationHandler (CurrentLocationMsg *data) |
void | _getGlobalStepHandler (LBStepMsg *msg) |
void | _recvGlobalStepHandler (LBStepMsg *msg) |
Receives the global step handler from PE 0. | |
void | _messageLoggingExit () |
Function to wrap up performance information. | |
int | getCheckPointPE () |
Getting the pe number of the current processor's buddy. | |
envelope * | copyEnvelope (envelope *env) |
bool | isSameDet (Determinant *first, Determinant *second) |
Variables | |
bool | _recoveryFlag = false |
bool | _restartFlag = false |
int | _numRestartResponses = 0 |
int | countHashRefs = 0 |
int | countHashCollisions = 0 |
char * | checkpointDirectory = "." |
int | unAckedCheckpoint = 0 |
int | countLocal = 0 |
int | countBuffered = 0 |
int | countPiggy = 0 |
int | countClearBufferedLocalCalls = 0 |
int | countUpdateHomeAcks = 0 |
int | teamSize |
int | chkptPeriod |
bool | fastRecovery |
int | parallelRecovery |
char * | killFile |
char * | faultFile |
int | killFlag = 0 |
int | faultFlag = 0 |
int | restartingMlogFlag = 0 |
double | killTime = 0.0 |
double | faultMean |
int | checkpointCount = 0 |
int | diskCkptFlag = 0 |
static char | fName [100] |
int | _numBufferedDets |
int | _indexBufferedDets |
int | _phaseBufferedDets |
int | _maxBufferedDets |
int * | numMsgsTarget |
int * | sizeMsgsTarget |
int | totalMsgsTarget |
float | totalMsgsSize |
int | numPiggyDets |
int | numDets |
int | numDupDets |
int | msgLogSize |
int | bufferedDetsSize |
int | storedDetsSize |
float | MLOGFT_totalLogSize = 0.0 |
float | MLOGFT_totalMessages = 0.0 |
float | MLOGFT_totalMcastLogSize = 0.0 |
float | MLOGFT_totalReductionLogSize = 0.0 |
static double | adjustChkptPeriod = 0.0 |
static double | nextCheckpointTime = 0.0 |
static CkHashtableT < CkHashtableAdaptorT< CkObjID > , CkHashtableT < CkHashtableAdaptorT< CkObjID > , SNToTicket * > * > | detTable (1000, 0.3) |
int | _pingHandlerIdx |
char | objString [100] |
int | _checkpointRequestHandlerIdx |
int | _storeCheckpointHandlerIdx |
int | _checkpointAckHandlerIdx |
int | _getCheckpointHandlerIdx |
int | _recvCheckpointHandlerIdx |
int | _removeProcessedLogHandlerIdx |
int | _verifyAckRequestHandlerIdx |
int | _verifyAckHandlerIdx |
int | _dummyMigrationHandlerIdx |
int | _getGlobalStepHandlerIdx |
int | _recvGlobalStepHandlerIdx |
int | _updateHomeRequestHandlerIdx |
int | _updateHomeAckHandlerIdx |
int | _resendMessagesHandlerIdx |
int | _sendDetsHandlerIdx |
int | _sendDetsReplyHandlerIdx |
int | _receivedTNDataHandlerIdx |
int | _receivedDetDataHandlerIdx |
int | _distributedLocationHandlerIdx |
int | _sendBackLocationHandlerIdx |
int | _storeDeterminantsHandlerIdx |
int | _removeDeterminantsHandlerIdx |
int | _restartHandlerIdx |
int | _getRestartCheckpointHandlerIdx |
int | _recvRestartCheckpointHandlerIdx |
int | verifyAckTotal |
int | verifyAckCount |
int | verifyAckedRequests = 0 |
RestartRequest * | storedRequest |
int | _falseRestart = 0 |
int | onGoingLoadBalancing = 0 |
For testing on clusters we might carry out restarts on a processor without actually starting it: 1 -> false restart, 0 -> restart after an actual crash. | |
void * | centralLb |
void(* | resumeLbFnPtr )(void *) |
int | _receiveMlogLocationHandlerIdx |
int | _receiveMigrationNoticeHandlerIdx |
int | _receiveMigrationNoticeAckHandlerIdx |
int | _checkpointBarrierHandlerIdx |
int | _checkpointBarrierAckHandlerIdx |
std::vector< MigrationRecord > | migratedNoticeList |
std::vector < RetainedMigratedObject * > | retainedObjectList |
int | donotCountMigration = 0 |
int | countLBMigratedAway = 0 |
int | countLBToMigrate = 0 |
int | migrationDoneCalled = 0 |
int | checkpointBarrierCount = 0 |
int | globalResumeCount = 0 |
CkGroupID | globalLBID |
int | restartDecisionNumber = -1 |
double | lastCompletedAlarm = 0 |
double | lastRestart = 0 |
int | _receiveLocationHandlerIdx |
static int | heartBeatHandlerIdx |
static int | heartBeatCheckHandlerIdx |
static int | partnerFailureHandlerIdx |
static double | lastPingTime = -1 |
int | inCkptFlag = 0 |
int | calledRetryTicketRequest = 0 |
This method is used to retry the ticket requests that had been queued up earlier. | |
std::vector< TProcessedLog > | processedTicketLog |
double | totalSearchRestoredTime = 0 |
double | totalSearchRestoredCount = 0 |
const char* idx2str | ( | const CkArrayIndex & | ind | ) |
const char* idx2str | ( | const ArrayElement * | el | ) |
void getGlobalStep | ( | CkGroupID | gID | ) |
Definition at line 3657 of file ckcausalmlog.C.
References _getGlobalStepHandlerIdx, CmiInitMsgHeader(), CmiMyPe(), Converse::CmiSyncSend(), LBStepMsg::fromPE, LBStepMsg::header, LBStepMsg::lbID, and LBStepMsg::step.
Referenced by _recvCheckpointHandler().
Definition at line 1165 of file ckcausalmlog.C.
References CkObjID::type, TypeArray, TypeChare, TypeGroup, TypeMainChare, and TypeNodeGroup.
Referenced by preProcessReceivedMessage().
void sendCheckpointData | ( | int | mode | ) |
Sends the checkpoint to its buddy.
The mode distinguishes between the two cases: MLOG_RESTARTED: sending the checkpoint to a team member that did not crash but is restarting. MLOG_CRASHED: sending the checkpoint to the processor that crashed.
Definition at line 2155 of file ckcausalmlog.C.
References _recvCheckpointHandlerIdx, _recvRestartCheckpointHandlerIdx, StoredCheckpoint::buf, buf, StoredCheckpoint::bufSize, RestartProcessorData::checkPointSize, Converse::CkMyPe(), CmiAlloc(), CmiFree(), CmiMyPe(), Converse::CmiSyncSendAndFree(), CmiTimer(), diskCkptFlag, RestartProcessorData::lbGroupID, RestartProcessorData::migratedElementSize, migratedNoticeList, msg, RestartProcessorData::numMigratedAwayElements, RestartProcessorData::numMigratedInElements, RestartProcessorData::PE, RestartRequest::PE, readCheckpointFromDisk(), and RestartProcessorData::restartWallTime.
Referenced by _getCheckpointHandler(), _getRestartCheckpointHandler(), and _verifyAckHandler().
void createObjIDList | ( | void * | data, | |
ChareMlogData * | mlogData | |||
) |
Definition at line 2218 of file ckcausalmlog.C.
References Converse::CkMyPe(), list, ChareMlogData::objID, objString, printLog(), TProcessedLog::recver, CkObjID::toString(), ChareMlogData::tProcessed, and TProcessedLog::tProcessed.
Referenced by _recvCheckpointHandler(), _recvGlobalStepHandler(), _recvRestartCheckpointHandler(), _sendDetsReplyHandler(), and _updateHomeAckHandler().
Determines if the message is local or not.
A message is local if: 1) Both the destination and origin are the same PE.
Definition at line 828 of file ckcausalmlog.C.
References Converse::CkMyPe().
Referenced by sendCommonMsg().
Determines if the message is group local or not.
A message is group local if: 1) They belong to the same group in the group-based message logging.
A message is group local if: 1) They belong to the same team in the team-based message logging.
Definition at line 840 of file ckcausalmlog.C.
References Converse::CkMyPe(), and teamSize.
Referenced by _resendMessagesHandler(), sendCommonMsg(), and sendMsg().
void printLog | ( | TProcessedLog * | log | ) |
Prints a processed log.
Definition at line 2475 of file ckcausalmlog.C.
References Converse::CkMyPe(), TProcessedLog::recver, CkObjID::toString(), and TProcessedLog::tProcessed.
Referenced by createObjIDList().
void readKillFile | ( | ) |
Definition at line 525 of file ckcausalmlog.C.
References CcdCallFnAfter(), Converse::CkMyPe(), CmiWallTimer(), killFile, killLocal(), and killTime.
Referenced by _initCharm(), and CkMemCheckPT::isMaster().
CpvDeclare | ( | Chare * | , | |
_currentObj | ||||
) |
CpvDeclare | ( | StoredCheckpoint * | , | |
_storedCheckpointData | ||||
) |
CpvDeclare | ( | Queue | , | |
_outOfOrderMessageQueue | ||||
) |
CpvDeclare | ( | Queue | , | |
_delayedRemoteMessageQueue | ||||
) |
CpvDeclare | ( | char ** | , | |
_bufferedTicketRequests | ||||
) |
CpvDeclare | ( | int * | , | |
_numBufferedTicketRequests | ||||
) |
CpvDeclare | ( | char * | , | |
_localDets | ||||
) |
CpvDeclare | ( | CkDeterminantHashtableT * | , | |
_remoteDets | ||||
) |
CpvDeclare | ( | char * | , | |
_incarnation | ||||
) |
CpvDeclare | ( | RemoveDeterminantsHeader * | , | |
_removeDetsHeader | ||||
) |
CpvDeclare | ( | StoreDeterminantsHeader * | , | |
_storeDetsHeader | ||||
) |
CpvDeclare | ( | int * | , | |
_storeDetsSizes | ||||
) |
CpvDeclare | ( | char ** | , | |
_storeDetsPtrs | ||||
) |
CpvDeclare | ( | int | , | |
_numEmigrantRecObjs | ||||
) |
CpvDeclare | ( | int | , | |
_numImmigrantRecObjs | ||||
) |
CpvDeclare | ( | std::vector< CkLocation * > * | , | |
_immigrantRecObjs | ||||
) |
void setTeamRecovery | ( | void * | data, | |
ChareMlogData * | mlogData | |||
) |
Turns on the flag for team recovery that selectively restores particular metadata information.
Definition at line 2460 of file ckcausalmlog.C.
References name, and ChareMlogData::teamRecoveryFlag.
Referenced by _recvRestartCheckpointHandler().
void unsetTeamRecovery | ( | void * | data, | |
ChareMlogData * | mlogData | |||
) |
Turns off the flag for team recovery.
Definition at line 2468 of file ckcausalmlog.C.
References ChareMlogData::teamRecoveryFlag.
Referenced by _recvRestartCheckpointHandler().
Referenced by CkMemCheckPT::isMaster(), partnerFailureHandler(), and SendMsgBuf().
Referenced by CkMemCheckPT::isMaster(), partnerFailureHandler(), and SendMsgBuf().
void heartBeatPartner | ( | ) |
Pings buddy to let it know this PE is alive.
Used for failure detection.
Definition at line 511 of file ckcausalmlog.C.
References CcdCallOnCondition(), CmiAlloc(), CmiMyPe(), Converse::CmiSyncSendAndFree(), getCheckPointPE(), heartBeatHandlerIdx, and msg.
Referenced by _messageLoggingInit().
void heartBeatHandler | ( | void * | msg | ) |
Registers last time it knew about the PE that checkpoints on it.
Definition at line 478 of file ckcausalmlog.C.
References CmiFree(), CmiWallTimer(), and lastPingTime.
Referenced by _messageLoggingInit().
void heartBeatCheckHandler | ( | ) |
Checks whether the PE that checkpoints on it is still alive.
Definition at line 487 of file ckcausalmlog.C.
References CcdCallOnCondition(), CmiAlloc(), CmiMyPe(), CmiPrintf(), Converse::CmiSyncSendAndFree(), CmiWallTimer(), getReverseCheckPointPE(), inCkptFlag, lastPingTime, msg, and partnerFailureHandlerIdx.
Referenced by _messageLoggingInit(), and partnerFailureHandler().
void partnerFailureHandler | ( | char * | msg | ) |
Receives the notification of a failure and updates pe-to-rank mapping.
Definition at line 462 of file ckcausalmlog.C.
References CcdCallOnCondition(), find_spare_mpirank(), getReverseCheckPointPE(), heartBeatCheckHandler(), and mpi_restart_crashed().
Referenced by _messageLoggingInit().
int getReverseCheckPointPE | ( | ) |
Getting the pe that checkpoints on this pe.
Definition at line 4104 of file ckcausalmlog.C.
References CmiMyPe().
Referenced by heartBeatCheckHandler(), and partnerFailureHandler().
Definition at line 270 of file ckcausalmlog.C.
Referenced by _checkpointAckHandler(), finishedCheckpointLoadBalancing(), and CkMemCheckPT::isMaster().
void _messageLoggingInit | ( | ) |
Initialize message logging data structures and register handlers.
Definition at line 277 of file ckcausalmlog.C.
Referenced by _initCharm().
void killLocal | ( | void * | _dummy, | |
double | curWallTime | |||
) |
Definition at line 562 of file ckcausalmlog.C.
References CcdCallFnAfter(), CkDieNow(), Converse::CkMyPe(), CmiWallTimer(), and killTime.
Referenced by CkDieNow(), CkMemCheckPT::isMaster(), readFaultFile(), and readKillFile().
void readFaultFile | ( | ) |
Reads the PE that will be failing throughout the execution and the mean time between failures.
We assume an exponential distribution for the mean-time-between-failures.
Definition at line 546 of file ckcausalmlog.C.
References CcdCallFnAfter(), Converse::CkMyPe(), faultFile, faultMean, and killLocal().
void CkDieNow | ( | ) |
Definition at line 576 of file ckcausalmlog.C.
Referenced by CkMemCheckPT::isMaster(), killLocal(), and SendMsgBuf().
Adds a determinant to the buffered determinants and checks whether the array of buffered determinants needs to be extended.
Definition at line 591 of file ckcausalmlog.C.
References _indexBufferedDets, _maxBufferedDets, _numBufferedDets, bufferedDetsSize, Converse::CkMyPe(), CmiAlloc(), CmiFree(), numDets, Determinant::receiver, Determinant::sender, Determinant::SN, and Determinant::TN.
Referenced by preProcessReceivedMessage().
Sends a group message that might be a broadcast.
Definition at line 628 of file ckcausalmlog.C.
Referenced by _sendMsgBranch(), and sendGroupMsg().
Sends a nodegroup message that might be a broadcast.
Definition at line 661 of file ckcausalmlog.C.
Referenced by _sendMsgNodeBranch(), and sendNodeGroupMsg().
Sends a message to an array element.
Definition at line 693 of file ckcausalmlog.C.
Referenced by CkArrayManagerDeliver().
Sends a message to a singleton chare.
Definition at line 715 of file ckcausalmlog.C.
Referenced by CkSendMsg().
A method to generate the actual ticket requests for groups, nodegroups or arrays.
Definition at line 736 of file ckcausalmlog.C.
Referenced by sendArrayMsg(), sendChareMsg(), sendGroupMsg(), and sendNodeGroupMsg().
void sendMsg | ( | CkObjID & | sender, | |
CkObjID & | recver, | |||
int | destPE, | |||
MlogEntry * | entry, | |||
MCount | SN, | |||
MCount | TN, | |||
int | resend | |||
) |
Method that does the actual send by creating a ticket request filling it up and sending it.
Definition at line 852 of file ckcausalmlog.C.
References _indexBufferedDets, MlogEntry::_infoIdx, _numBufferedDets, _phaseBufferedDets, _storeDeterminantsHandlerIdx, ChareMlogData::addLogEntry(), Converse::CkMyPe(), CmiMemoryCheck(), CmiMyPe(), CmiSyncVectorSend(), MlogEntry::destPE, MlogEntry::env, float, generalCldEnqueue(), envelope::getTotalsize(), MlogEntry::indexBufDets, isTeamLocal(), Chare::mlogData, MLOGFT_totalLogSize, MLOGFT_totalMcastLogSize, MLOGFT_totalMessages, MLOGFT_totalReductionLogSize, msgLogSize, MlogEntry::numBufDets, numMsgsTarget, numPiggyDets, sizeMsgsTarget, totalMsgsSize, and totalMsgsTarget.
Referenced by sendCommonMsg().
Function to send a local message.
It first gets a ticket and then enqueues the message. If we are recovering, then the message is enqueued in a delay queue.
Definition at line 937 of file ckcausalmlog.C.
Referenced by sendCommonMsg().
void _removeDeterminantsHandler | ( | char * | buffer | ) |
Removes the determinants after a particular index in the _localDets array.
Definition at line 1006 of file ckcausalmlog.C.
References _indexBufferedDets, _numBufferedDets, _phaseBufferedDets, CmiFree(), RemoveDeterminantsHeader::index, index, and RemoveDeterminantsHeader::phase.
Referenced by _messageLoggingInit().
void _storeDeterminantsHandler | ( | char * | buffer | ) |
Stores the determinants coming from another processor.
Definition at line 1033 of file ckcausalmlog.C.
References _removeDeterminantsHandlerIdx, Converse::CkMyPe(), CmiFree(), CmiMemoryCheck(), Converse::CmiSyncSend(), StoreDeterminantsHeader::index, index, isSameDet(), n, StoreDeterminantsHeader::number, numDupDets, StoreDeterminantsHeader::PE, StoreDeterminantsHeader::phase, Determinant::receiver, Determinant::sender, Determinant::SN, storedDetsSize, and Determinant::TN.
Referenced by _messageLoggingInit().
void _ticketRequestHandler | ( | TicketRequest * | ticketRequest | ) | [inline] |
If there are any delayed requests, process them first before processing this request.
Definition at line 1097 of file ckcausalmlog.C.
References Converse::CkMyPe(), and CmiFree().
Gets a ticket for a recently received message.
Definition at line 1110 of file ckcausalmlog.C.
References Converse::CkMyPe(), CmiMemoryCheck(), CmiWallTimer(), CkObjID::getObject(), ChareMlogData::getTicket(), Chare::mlogData, ChareMlogData::next_ticket(), ChareMlogData::restartFlag, Ticket::state, teamSize, Ticket::TN, CkObjID::toString(), ChareMlogData::tProcessed, and ChareMlogData::verifyTicket().
Referenced by preProcessReceivedMessage().
Definition at line 1181 of file ckcausalmlog.C.
Referenced by _processHandler().
Updates a few variables once a message has been processed.
Definition at line 1302 of file ckcausalmlog.C.
Referenced by _processHandler().
Definition at line 1322 of file ckcausalmlog.C.
Referenced by sendCommonMsg(), sendMsg(), and sendRemoteMsg().
void _pingHandler | ( | CkPingMsg * | msg | ) |
Definition at line 1343 of file ckcausalmlog.C.
Referenced by _messageLoggingInit().
void buildProcessedTicketLog | ( | void * | data, | |
ChareMlogData * | mlogData | |||
) |
A chare adds the latest ticket number processed.
Definition at line 1468 of file ckcausalmlog.C.
References Converse::CkMyPe(), ChareMlogData::objID, objString, TProcessedLog::recver, CkObjID::toString(), ChareMlogData::tProcessed, and TProcessedLog::tProcessed.
Referenced by startMlogCheckpoint().
void clearUpMigratedRetainedLists | ( | int | PE | ) |
Definition at line 1809 of file ckcausalmlog.C.
References CmiFree(), CmiMemoryCheck(), CmiMyPe(), count, migratedNoticeList, RetainedMigratedObject::msg, and retainedObjectList.
Referenced by _removeProcessedLogHandler(), and sendRemoveLogRequests().
void checkpointAlarm | ( | void * | _dummy, | |
double | curWallTime | |||
) |
Definition at line 1358 of file ckcausalmlog.C.
Referenced by checkpointAlarm(), and startMlogCheckpoint().
void _checkpointRequestHandler | ( | CheckpointRequest * | request | ) |
Definition at line 1376 of file ckcausalmlog.C.
Referenced by _messageLoggingInit().
void startMlogCheckpoint | ( | void * | _dummy, | |
double | curWallTime | |||
) |
Starts the checkpoint phase after migration.
Definition at line 1383 of file ckcausalmlog.C.
Referenced by _checkpointRequestHandler(), _receiveMigrationNoticeAckHandler(), _updateHomeRequestHandler(), and startLoadBalancingMlog().
void pupArrayElementsSkip | ( | PUP::er & | p, | |
bool | create, | |||
MigrationRecord * | listToSkip, | |||
int | listsize | |||
) |
Pups all the array elements in this processor.
Definition at line 1498 of file ckcausalmlog.C.
Referenced by _recvCheckpointHandler(), _recvRestartCheckpointHandler(), _startCheckpointHandler(), and startMlogCheckpoint().
void readCheckpointFromDisk | ( | int | size, | |
char * | data | |||
) |
Reads a checkpoint from disk.
Assumes variable fName contains the name of the file.
Definition at line 1557 of file ckcausalmlog.C.
References fName.
Referenced by _getCheckpointHandler(), and sendCheckpointData().
void writeCheckpointToDisk | ( | int | size, | |
char * | data | |||
) |
Writes a checkpoint to disk.
Assumes variable fName contains the name of the file.
Definition at line 1566 of file ckcausalmlog.C.
References fName.
Referenced by _storeCheckpointHandler().
void _storeCheckpointHandler | ( | char * | msg | ) |
Definition at line 1574 of file ckcausalmlog.C.
Referenced by _messageLoggingInit().
void sendRemoveLogRequests | ( | ) |
Sends out the messages asking senders to throw away message logs below a certain ticket number.
Definition at line 1632 of file ckcausalmlog.C.
References _removeProcessedLogHandlerIdx, Converse::CkMyPe(), Converse::CkNumPes(), clearUpMigratedRetainedLists(), CmiAbort(), CmiAlloc(), CmiFree(), CmiMemoryCheck(), CmiMyPe(), Converse::CmiSyncSend(), ResendRequest::numberObjects, ResendRequest::PE, processedTicketLog, request, and traceUserBracketEvent().
Referenced by _checkpointAckHandler(), and _checkpointBarrierAckHandler().
void _checkpointAckHandler | ( | CheckPointAck * | ackMsg | ) |
Definition at line 1671 of file ckcausalmlog.C.
Referenced by _messageLoggingInit().
void populateDeterminantTable | ( | char * | data | ) | [inline] |
Inserts all the determinants into a hash table.
Definition at line 1690 of file ckcausalmlog.C.
References CmiMemoryCheck(), detTable, CkHashtableT< KEY, OBJ >::get(), list, ResendRequest::numberObjects, numDets, SNToTicket::put(), CkHashtableTslow< KEY, OBJ >::put(), request, Determinant::TN, and Ticket::TN.
Referenced by _removeProcessedLogHandler().
void removeProcessedLogs | ( | void * | _data, | |
ChareMlogData * | mlogData | |||
) |
Definition at line 1726 of file ckcausalmlog.C.
References Converse::CkMyPe(), CmiMemoryCheck(), count, data, CkQ< T >::deq(), detTable, CkQ< T >::enq(), MlogEntry::env, SNToTicket::get(), CkHashtableT< KEY, OBJ >::get(), ChareMlogData::getMlog(), CkQ< T >::length(), list, match(), ResendRequest::numberObjects, ChareMlogData::objID, TProcessedLog::recver, request, Ticket::TN, CkObjID::toString(), and TProcessedLog::tProcessed.
Referenced by _removeProcessedLogHandler().
void _removeProcessedLogHandler | ( | char * | requestMsg | ) |
Removes messages in the log according to the received ticket numbers.
Definition at line 1783 of file ckcausalmlog.C.
References Converse::CkMyPe(), clearUpMigratedRetainedLists(), CmiFree(), CmiMemoryCheck(), forAllCharesDo(), ResendRequest::PE, populateDeterminantTable(), removeProcessedLogs(), request, and traceUserBracketEvent().
Referenced by _messageLoggingInit().
void CkMlogRestart | ( | const char * | dummy, | |
CkArgMsg * | dummyMsg | |||
) |
Function for restarting the crashed processor.
It sets the restart flag and contacts the buddy processor to get the latest checkpoint.
Definition at line 1847 of file ckcausalmlog.C.
Referenced by _initCharm(), _parseCommandLineOpts(), CkMlogRestartDouble(), and CkMlogRestartLocal().
void _restartHandler | ( | RestartRequest * | restartMsg | ) |
Function to restart this processor.
The handler is invoked by a member of its own team in message logging.
Definition at line 1879 of file ckcausalmlog.C.
References _getRestartCheckpointHandlerIdx, _numRestartResponses, _restartFlag, Converse::CkMyPe(), CmiInitMsgHeader(), Converse::CmiSyncSend(), CmiWallTimer(), getCheckPointPE(), RestartRequest::header, msg, and RestartRequest::PE.
Referenced by _messageLoggingInit().
void _getRestartCheckpointHandler | ( | RestartRequest * | restartMsg | ) |
Gets the stored checkpoint but calls another function in the sender.
Definition at line 1911 of file ckcausalmlog.C.
References _verifyAckRequestHandlerIdx, CmiInitMsgHeader(), CmiMyPe(), CmiPrintf(), Converse::CmiSyncSend(), VerifyAckMsg::fromPE, VerifyAckMsg::header, idx, idx2str(), VerifyAckMsg::index, migratedNoticeList, VerifyAckMsg::migRecord, msg, StoredCheckpoint::PE, RestartRequest::PE, sendCheckpointData(), verifyAckCount, and verifyAckTotal.
Referenced by _messageLoggingInit().
void _recvRestartCheckpointHandler | ( | char * | _restartData | ) |
Receives the checkpoint coming from its buddy.
This is the case of restart for one team member that did not crash.
Definition at line 1951 of file ckcausalmlog.C.
References _resendMessagesHandler(), _resendMessagesHandlerIdx, adjustChkptPeriod, buf, checkpointCount, RestartProcessorData::checkPointSize, chkptPeriod, Converse::CkMyPe(), Converse::CkNumPes(), CkPupGroupData(), CkPupNodeGroupData(), CkPupROData(), CmiAlloc(), CmiFree(), CmiMyPe(), Converse::CmiSyncSend(), CmiWallTimer(), createObjIDList(), forAllCharesDo(), initializeRestart(), lb, RestartProcessorData::lbGroupID, ResendRequest::numberObjects, RestartProcessorData::numMigratedAwayElements, ResendRequest::PE, RestartProcessorData::PE, pupArrayElementsSkip(), CentralLB::ReceiveDummyMigration(), restartDecisionNumber, RestartProcessorData::restartWallTime, setTeamRecovery(), PUP::mem::size(), sleep(), teamSize, and unsetTeamRecovery().
Referenced by _messageLoggingInit().
void CkMlogRestartDouble | ( | void * | , | |
double | ||||
) |
Definition at line 2063 of file ckcausalmlog.C.
void CkMlogRestartLocal | ( | ) |
Definition at line 2068 of file ckcausalmlog.C.
References CkMlogRestart().
void _getCheckpointHandler | ( | RestartRequest * | restartMsg | ) |
Gets the stored checkpoint for its buddy processor.
Definition at line 2075 of file ckcausalmlog.C.
Referenced by _messageLoggingInit().
void _verifyAckRequestHandler | ( | VerifyAckMsg * | verifyRequest | ) |
Definition at line 2113 of file ckcausalmlog.C.
References _verifyAckHandlerIdx, CmiMyPe(), CmiPrintf(), Converse::CmiSyncSendAndFree(), CkLocMgr::elementNrec(), VerifyAckMsg::fromPE, CkLocRec::getLBDB(), CkLocRec::getLdHandle(), MigrationRecord::gID, _ckGroupID::idx, MigrationRecord::idx, idx2str(), VerifyAckMsg::migRecord, CkLocMgr::reclaim(), CkLocMgr::setDuringMigration(), LBDatabase::UnregisterObj(), and verifyAckedRequests.
Referenced by _messageLoggingInit().
void _verifyAckHandler | ( | VerifyAckMsg * | verifyReply | ) |
Definition at line 2139 of file ckcausalmlog.C.
References CmiMyPe(), CmiPrintf(), idx, idx2str(), VerifyAckMsg::index, index, migratedNoticeList, VerifyAckMsg::migRecord, sendCheckpointData(), verifyAckCount, and verifyAckTotal.
Referenced by _messageLoggingInit().
void _recvCheckpointHandler | ( | char * | _restartData | ) |
Receives the checkpoint data from its buddy, restores the state of all the objects and asks everyone else to update its home.
Definition at line 2234 of file ckcausalmlog.C.
Referenced by _messageLoggingInit().
void _updateHomeAckHandler | ( | RestartRequest * | updateHomeAck | ) |
Receives the updateHome ACKs from all other processors.
Once everybody has replied, it sends a request to resend the logged messages.
Definition at line 2307 of file ckcausalmlog.C.
References _resendMessagesHandlerIdx, Converse::CkMyPe(), CmiAlloc(), CmiFree(), Converse::CmiSyncBroadcastAllAndFree(), CmiWallTimer(), countUpdateHomeAcks, createObjIDList(), distributeRestartedObjects(), fastRecovery, forAllCharesDo(), lb, ResendRequest::numberObjects, ResendRequest::PE, CentralLB::ReceiveDummyMigration(), and restartDecisionNumber.
Referenced by _messageLoggingInit().
void initializeRestart | ( | void * | data, | |
ChareMlogData * | mlogData | |||
) |
Initializes variables and flags for restarting procedure.
Definition at line 2355 of file ckcausalmlog.C.
Referenced by _recvCheckpointHandler(), and _recvRestartCheckpointHandler().
void updateHomePE | ( | void * | data, | |
ChareMlogData * | mlogData | |||
) |
Updates the homePe of chare array elements.
Definition at line 2364 of file ckcausalmlog.C.
References _ObjectID::array, CkArrayIndexBase::asChild(), CkArrayID::ckLocalBranch(), Converse::CkMyPe(), CkObjID::data, CkLocMgr::getGroupID(), CkArray::getLocMgr(), CkLocMgr::homePe(), _ObjectID::s_array::id, _ObjectID::s_array::idx, informLocationHome(), ChareMlogData::objID, RestartRequest::PE, CkObjID::type, and TypeArray.
Referenced by _updateHomeRequestHandler().
void _updateHomeRequestHandler | ( | RestartRequest * | updateRequest | ) |
Updates the homePe for all chares in this processor.
Definition at line 2388 of file ckcausalmlog.C.
References _receiveMigrationNoticeHandlerIdx, _updateHomeAckHandlerIdx, checkpointCount, CmiInitMsgHeader(), CmiMyPe(), CmiPrintf(), Converse::CmiSyncSend(), Converse::CmiSyncSendAndFree(), forAllCharesDo(), getCheckPointPE(), MigrationNotice::header, MigrationNotice::migRecord, RestartRequest::PE, MigrationNotice::record, retainedObjectList, startMlogCheckpoint(), unAckedCheckpoint, and updateHomePE().
Referenced by _messageLoggingInit().
void fillTicketForChare | ( | void * | data, | |
ChareMlogData * | mlogData | |||
) |
Fills up the ticket vector for each chare.
Definition at line 2418 of file ckcausalmlog.C.
References count, SNToTicket::get(), SNToTicket::getFinishSN(), SNToTicket::getStartSN(), CkHashtable::iterator(), ResendData::listObjects, name, CkHashtableIterator::next(), ResendData::numberObjects, ResendData::PE, ChareMlogData::teamTable, ResendData::ticketVecs, and Ticket::TN.
Referenced by _resendMessagesHandler().
void printMsg | ( | envelope * | env, | |
const char * | par | |||
) |
Prints information about a message.
Definition at line 2483 of file ckcausalmlog.C.
References Converse::CkMyPe().
Referenced by processDelayedRemoteMsgQueue(), and resendMessageForChare().
void printDet | ( | Determinant * | det, | |
const char * | par | |||
) |
Prints information about a determinant.
Definition at line 2492 of file ckcausalmlog.C.
References Converse::CkMyPe(), Determinant::receiver, Determinant::sender, Determinant::SN, Determinant::TN, and CkObjID::toString().
Referenced by _sendDetsHandler(), and processReceivedDet().
void resendMessageForChare | ( | void * | data, | |
ChareMlogData * | mlogData | |||
) |
Resends all the logged messages to a particular chare list.
data | is of type ResendData which contains the array of objects on the restartedProcessor. | |
mlogData | a particular chare living in this processor. |
Definition at line 2503 of file ckcausalmlog.C.
References Converse::CkMyPe(), Converse::CmiSyncSend(), copyEnvelope(), count, CqsEnqueueGeneral(), MlogEntry::env, ChareMlogData::getMlog(), envelope::getPriobits(), envelope::getPrioPtr(), envelope::getQueueing(), envelope::getTotalsize(), CkQ< T >::length(), ResendData::listObjects, ResendData::numberObjects, ChareMlogData::objID, ResendData::PE, printMsg(), CkObjID::toString(), TypeInvalid, and TypeNodeGroup.
Referenced by _resendMessagesHandler().
void _sendDetsHandler | ( | char * | msg | ) |
Sends all remote determinants to a particular failed PE.
It only sends determinants to those objects on the list.
Definition at line 2557 of file ckcausalmlog.C.
References _sendDetsReplyHandlerIdx, Converse::CkMyPe(), CmiAlloc(), CmiFree(), CmiMemoryCheck(), CmiResetGlobalReduceSeqID(), Converse::CmiSyncSendAndFree(), CmiWallTimer(), PUP::d, int, lastRestart, ResendData::listObjects, ResendRequest::numberObjects, ResendData::numberObjects, ResendRequest::PE, ResendData::PE, printDet(), TProcessedLog::recver, ResendData::ticketVecs, and TProcessedLog::tProcessed.
Referenced by _messageLoggingInit().
void _resendMessagesHandler | ( | char * | msg | ) |
Resends messages since last checkpoint to the list of objects included in the request.
It also sends stored remote determinants to the particular failed PE.
Definition at line 2663 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), and _recvRestartCheckpointHandler().
MCount maxVec | ( | std::vector< MCount > * | TNvec | ) |
Returns the maximum ticket from a vector.
Definition at line 2969 of file ckcausalmlog.C.
References max().
Referenced by processReceivedTN().
void sortVec | ( | std::vector< MCount > * | TNvec | ) |
Definition at line 2978 of file ckcausalmlog.C.
References sort().
Referenced by processReceivedTN().
int searchVec | ( | std::vector< MCount > * | TNVec, | |
MCount | searchTN | |||
) |
Definition at line 2983 of file ckcausalmlog.C.
Referenced by processReceivedTN().
void processDelayedRemoteMsgQueue | ( | ) |
Processes the messages in the delayed remote message queue.
Definition at line 2700 of file ckcausalmlog.C.
References Converse::CkMyPe(), CmiMemoryCheck(), CqsDequeue(), CqsEmpty(), CqsEnqueueGeneral(), envelope::getPriobits(), envelope::getPrioPtr(), and printMsg().
void _sendDetsReplyHandler | ( | char * | msg | ) |
Receives determinants stored on remote nodes.
Message format: |Header|ObjID list|TN list|Determinant list| TN list = |number of TNs|list of TNs|...|
Definition at line 2719 of file ckcausalmlog.C.
References _numRestartResponses, _receivedDetDataHandlerIdx, _receivedTNDataHandlerIdx, _resendMessagesHandlerIdx, Converse::CkMyPe(), Converse::CkNumPes(), CmiAlloc(), CmiMyPe(), Converse::CmiSyncBroadcastAllAndFree(), Converse::CmiSyncSendAndFree(), CmiWallTimer(), createObjIDList(), distributeRestartedObjects(), fastRecovery, forAllCharesDo(), int, lb, ResendRequest::numberObjects, ReceivedDetData::numDets, ReceivedTNData::numTNs, ResendRequest::PE, processReceivedDet(), processReceivedTN(), CentralLB::ReceiveDummyMigration(), ReceivedDetData::recver, ReceivedTNData::recver, and restartDecisionNumber.
Referenced by _messageLoggingInit(), and _recvGlobalStepHandler().
void _receivedDetDataHandler | ( | ReceivedDetData * | msg | ) |
Receives a list of determinants coming from the home PE of a migrated object (parallel restart).
Definition at line 2828 of file ckcausalmlog.C.
References CmiFree(), CmiMyPe(), Converse::CmiSyncSendAndFree(), CkObjID::getObject(), CkObjID::guessPE(), Chare::mlogData, ReceivedDetData::numDets, ChareMlogData::objID, processReceivedDet(), ReceivedDetData::recver, and CkObjID::toString().
Referenced by _messageLoggingInit().
void _receivedTNDataHandler | ( | ReceivedTNData * | msg | ) |
Receives a list of TNs coming from the home PE of a migrated object (parallel restart).
Definition at line 2846 of file ckcausalmlog.C.
References CmiFree(), CmiMyPe(), Converse::CmiSyncSendAndFree(), CkObjID::getObject(), CkObjID::guessPE(), Chare::mlogData, ReceivedTNData::numTNs, ChareMlogData::objID, processReceivedTN(), ReceivedTNData::recver, and CkObjID::toString().
Referenced by _messageLoggingInit().
void processReceivedDet | ( | Chare * | obj, | |
int | listSize, | |||
Determinant * | listDets | |||
) |
Processes the received list of determinants from a particular PE.
Definition at line 2864 of file ckcausalmlog.C.
References Converse::CkMyPe(), CmiMemoryCheck(), Chare::mlogData, printDet(), Determinant::sender, Determinant::SN, Determinant::TN, and ChareMlogData::verifyTicket().
Referenced by _receivedDetDataHandler(), and _sendDetsReplyHandler().
Processes the received list of tickets from a particular PE.
Definition at line 2881 of file ckcausalmlog.C.
References Converse::CkMyPe(), Converse::CkNumPes(), CmiMyPe(), CmiWallTimer(), ChareMlogData::currentHoles, maxVec(), Chare::mlogData, ChareMlogData::numberHoles, ChareMlogData::objID, objString, ChareMlogData::receivedTNs, ChareMlogData::resendReplyRecvd, ChareMlogData::restartFlag, searchVec(), sortVec(), ChareMlogData::tCount, teamSize, ChareMlogData::ticketHoles, CkObjID::toString(), and ChareMlogData::tProcessed.
Referenced by _receivedTNDataHandler(), and _sendDetsReplyHandler().
void distributeRestartedObjects | ( | ) |
Distributes objects to accelerate recovery after a failure.
Definition at line 3075 of file ckcausalmlog.C.
Referenced by _recvGlobalStepHandler(), _sendDetsReplyHandler(), and _updateHomeAckHandler().
void _sendBackLocationHandler | ( | char * | receivedMsg | ) |
Handler to receive back a location.
Definition at line 3085 of file ckcausalmlog.C.
Referenced by _messageLoggingInit().
void _distributedLocationHandler | ( | char * | receivedMsg | ) |
Handler to update information about an object just received.
Definition at line 3122 of file ckcausalmlog.C.
Referenced by _messageLoggingInit().
void sendDummyMigration | ( | int | restartPE, | |
CkGroupID | lbID, | |||
CkGroupID | locMgrID, | |||
CkArrayIndexMax & | idx, | |||
int | locationPE | |||
) |
This method is used to send messages to a restarted processor to tell it that a particular expected object is not going to get to it.
Definition at line 3168 of file ckcausalmlog.C.
void _dummyMigrationHandler | ( | DummyMigrationMsg * | msg | ) |
This handler is used to process a dummy migration msg.
It looks up the load balancer and calls migrated for it.
Definition at line 3203 of file ckcausalmlog.C.
Referenced by _messageLoggingInit().
void forAllCharesDo | ( | MlogFn | fnPointer, | |
void * | data | |||
) |
Maps the function pointed to by fnPointer over all the chares living in this processor.
Definition at line 3254 of file ckcausalmlog.C.
Referenced by _recvCheckpointHandler(), _recvGlobalStepHandler(), _recvRestartCheckpointHandler(), _removeProcessedLogHandler(), _resendMessagesHandler(), _sendDetsReplyHandler(), _updateHomeAckHandler(), _updateHomeRequestHandler(), garbageCollectMlog(), and startMlogCheckpoint().
void pupLocation | ( | CkLocation * | loc, | |
CkLocMgr * | locMgr, | |||
PUP::er & | p | |||
) |
Pups a location.
Definition at line 3298 of file ckcausalmlog.C.
References IrrGroup::ckGetGroupID(), CkLocation::getIndex(), and idx.
Referenced by sendBackImmigrantRecObjs().
void sendBackImmigrantRecObjs | ( | ) |
Sends back the immigrant recovering objects to their origin PE.
Definition at line 3309 of file ckcausalmlog.C.
References _sendBackLocationHandlerIdx, _ObjectID::array, buf, CkLocMgr::callMethod(), CkMigratable::ckAboutToMigrate(), Converse::CkMyPe(), CmiAlloc(), Converse::CmiSyncSendAndFree(), CkObjID::data, CkReductionMgr::decNumImmigrantRecObjs(), CkLocation::getIndex(), CkLocation::getLocalRecord(), CkLocation::getManager(), _ObjectID::s_array::id, idx, CkLocMgr::inform(), CkLocMgr::lastKnown(), CkLocMgr::migratableList(), ChareMlogData::objID, DistributeObjectMsg::PE, pupLocation(), CkLocMgr::setDuringMigration(), and PUP::sizer::size().
Referenced by restoreParallelRecovery().
void restoreParallelRecovery | ( | void(*)(void *) | _fnPtr, | |
void * | _centralLb | |||
) |
Restores objects after parallel recovery, either by sending back the immigrant objects or by waiting for all emigrant objects to be back.
Definition at line 3377 of file ckcausalmlog.C.
Referenced by CentralLB::ReceiveMigration().
void startLoadBalancingMlog | ( | void(*)(void *) | _fnPtr, | |
void * | _centralLb | |||
) |
Load Balancing.
Definition at line 3394 of file ckcausalmlog.C.
Referenced by CentralLB::MigrationDoneImpl().
void finishedCheckpointLoadBalancing | ( | ) |
Definition at line 3407 of file ckcausalmlog.C.
Referenced by _checkpointAckHandler().
Definition at line 3416 of file ckcausalmlog.C.
References _receiveMigrationNoticeHandlerIdx, RetainedMigratedObject::acked, Converse::CkMyPe(), CkPackMessage(), CmiInitMsgHeader(), CmiMyPe(), CmiPrintf(), Converse::CmiSyncSend(), countLBToMigrate, EnvToUsr(), MigrationRecord::fromPE, getCheckPointPE(), envelope::getTotalsize(), CkArrayElementMigrateMessage::gid, MigrationRecord::gID, MigrationNotice::header, _ckGroupID::idx, CkArrayElementMigrateMessage::idx, MigrationRecord::idx, idx2str(), RetainedMigratedObject::migRecord, MigrationNotice::migRecord, RetainedMigratedObject::msg, MigrationNotice::record, retainedObjectList, RetainedMigratedObject::size, size, and MigrationRecord::toPE.
void _receiveMigrationNoticeHandler | ( | MigrationNotice * | msg | ) |
Definition at line 3468 of file ckcausalmlog.C.
References _receiveMigrationNoticeAckHandlerIdx, MigrationRecord::ackFrom, MigrationRecord::ackTo, buf, CmiInitMsgHeader(), Converse::CmiSyncSend(), getCheckPointPE(), MigrationNoticeAck::header, migratedNoticeList, MigrationNotice::migRecord, MigrationNotice::record, and MigrationNoticeAck::record.
Referenced by _messageLoggingInit().
void _receiveMigrationNoticeAckHandler | ( | MigrationNoticeAck * | msg | ) |
Definition at line 3479 of file ckcausalmlog.C.
References _receiveMlogLocationHandlerIdx, RetainedMigratedObject::acked, CmiMyPe(), Converse::CmiSyncSend(), CmiWallTimer(), countLBMigratedAway, countLBToMigrate, MigrationRecord::gID, CkLocMgr::homePe(), MigrationRecord::idx, informLocationHome(), migrationDoneCalled, RetainedMigratedObject::migRecord, RetainedMigratedObject::msg, MigrationNoticeAck::record, RetainedMigratedObject::size, startMlogCheckpoint(), and MigrationRecord::toPE.
Referenced by _messageLoggingInit().
void _receiveMlogLocationHandler | ( | void * | buf | ) |
Definition at line 3499 of file ckcausalmlog.C.
Referenced by _messageLoggingInit().
void resumeFromSyncRestart | ( | void * | data, | |
ChareMlogData * | mlogData | |||
) |
Definition at line 3513 of file ckcausalmlog.C.
void _checkpointBarrierHandler | ( | CheckpointBarrierMsg * | barrierMsg | ) |
Processor 0 receives a contribution from every other processor after checkpoint.
Definition at line 3527 of file ckcausalmlog.C.
Referenced by _messageLoggingInit().
void _checkpointBarrierAckHandler | ( | CheckpointBarrierMsg * | msg | ) |
Definition at line 3538 of file ckcausalmlog.C.
Referenced by _messageLoggingInit().
void garbageCollectMlogForChare | ( | void * | data, | |
ChareMlogData * | mlogData | |||
) |
Function to remove all messages in the message log of a particular chare.
Definition at line 3561 of file ckcausalmlog.C.
References CkQ< T >::deq(), ChareMlogData::getMlog(), and CkQ< T >::length().
Referenced by garbageCollectMlog().
void garbageCollectMlog | ( | ) |
Garbage collects the message log and other data structures.
In case of synchronized checkpoint, we use an optimization to avoid having the causal message logging protocol communicate all determinants to the rest of the processors.
Definition at line 3580 of file ckcausalmlog.C.
Referenced by _startCheckpointHandler(), and initMlogLBStep().
void informLocationHome | ( | CkGroupID | locMgrID, | |
CkArrayIndexMax | idx, | |||
int | homePE, | |||
int | currentPE | |||
) |
Method that informs an array element's home processor of its current location. It is a converse method to bypass the charm++ message logging framework.
Definition at line 3610 of file ckcausalmlog.C.
Referenced by _distributedLocationHandler(), _receiveMigrationNoticeAckHandler(), _sendBackLocationHandler(), pupArrayElementsSkip(), and updateHomePE().
void _receiveLocationHandler | ( | CurrentLocationMsg * | data | ) |
Definition at line 3626 of file ckcausalmlog.C.
Referenced by _messageLoggingInit().
void _getGlobalStepHandler | ( | LBStepMsg * | msg | ) |
Definition at line 3668 of file ckcausalmlog.C.
Referenced by _messageLoggingInit().
void _recvGlobalStepHandler | ( | LBStepMsg * | msg | ) |
Receives the global step handler from PE 0.
Definition at line 3680 of file ckcausalmlog.C.
Referenced by _messageLoggingInit().
int getCheckPointPE | ( | ) |
Getting the pe number of the current processor's buddy.
In the team-based approach each processor might checkpoint in the next team, but currently teams are only meant to reduce memory overhead. Note: function getReverseCheckPointPE performs the reverse map. It must be changed accordingly.
Definition at line 4097 of file ckcausalmlog.C.
Referenced by _receiveMigrationNoticeHandler(), _restartHandler(), _startCheckpointHandler(), _updateHomeRequestHandler(), CkMlogRestart(), heartBeatPartner(), sendMlogLocation(), and startMlogCheckpoint().
Definition at line 4109 of file ckcausalmlog.C.
Referenced by resendMessageForChare().
bool isSameDet | ( | Determinant * | first, | |
Determinant * | second | |||
) | [inline] |
Definition at line 4116 of file ckcausalmlog.C.
References Determinant::receiver, Determinant::sender, Determinant::SN, and Determinant::TN.
Referenced by _storeDeterminantsHandler().
bool _recoveryFlag = false |
Definition at line 58 of file ckcausalmlog.C.
Referenced by CkMlogRestart(), preProcessReceivedMessage(), and startMlogCheckpoint().
Definition at line 60 of file ckcausalmlog.C.
Referenced by _recvCheckpointHandler(), _restartHandler(), _sendDetsReplyHandler(), and CkMlogRestart().
int countHashRefs = 0 |
Definition at line 63 of file ckcausalmlog.C.
Definition at line 64 of file ckcausalmlog.C.
char* checkpointDirectory = "." |
Definition at line 66 of file ckcausalmlog.C.
int unAckedCheckpoint = 0 |
Definition at line 67 of file ckcausalmlog.C.
Referenced by _checkpointAckHandler(), _startCheckpointHandler(), _updateHomeRequestHandler(), and startMlogCheckpoint().
int countLocal = 0 |
Definition at line 69 of file ckcausalmlog.C.
int countBuffered = 0 |
Definition at line 69 of file ckcausalmlog.C.
int countPiggy = 0 |
Definition at line 70 of file ckcausalmlog.C.
Definition at line 71 of file ckcausalmlog.C.
char* faultFile |
Definition at line 83 of file ckcausalmlog.C.
Definition at line 84 of file ckcausalmlog.C.
double killTime = 0.0 |
Definition at line 86 of file ckcausalmlog.C.
Referenced by CkDieNow(), CkMemCheckPT::isMaster(), killLocal(), and readKillFile().
double faultMean |
int checkpointCount = 0 |
Definition at line 88 of file ckcausalmlog.C.
Referenced by _recvCheckpointHandler(), _recvRestartCheckpointHandler(), _startCheckpointHandler(), _updateHomeRequestHandler(), and startMlogCheckpoint().
char fName[100] [static] |
Definition at line 90 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), block::block(), LV3D_save_init(), NetFEM_End(), openReplayFile(), POSE_init(), readCheckpointFromDisk(), and writeCheckpointToDisk().
Definition at line 116 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), _removeDeterminantsHandler(), addBufferedDeterminant(), garbageCollectMlog(), and sendMsg().
Definition at line 118 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), _removeDeterminantsHandler(), addBufferedDeterminant(), garbageCollectMlog(), and sendMsg().
Definition at line 120 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), _removeDeterminantsHandler(), garbageCollectMlog(), and sendMsg().
Definition at line 125 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), and addBufferedDeterminant().
Definition at line 148 of file ckcausalmlog.C.
Referenced by _messageLoggingExit(), _messageLoggingInit(), sendMsg(), and sendRemoteMsg().
Definition at line 149 of file ckcausalmlog.C.
Referenced by _messageLoggingExit(), _messageLoggingInit(), sendMsg(), and sendRemoteMsg().
Definition at line 150 of file ckcausalmlog.C.
Referenced by _messageLoggingExit(), _messageLoggingInit(), sendMsg(), and sendRemoteMsg().
Definition at line 151 of file ckcausalmlog.C.
Referenced by _messageLoggingExit(), _messageLoggingInit(), sendMsg(), and sendRemoteMsg().
Definition at line 154 of file ckcausalmlog.C.
Referenced by _messageLoggingExit(), _messageLoggingInit(), and sendMsg().
Definition at line 155 of file ckcausalmlog.C.
Referenced by _messageLoggingExit(), _messageLoggingInit(), addBufferedDeterminant(), and populateDeterminantTable().
Definition at line 156 of file ckcausalmlog.C.
Referenced by _messageLoggingExit(), _messageLoggingInit(), and _storeDeterminantsHandler().
Definition at line 159 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), _startCheckpointHandler(), sendMsg(), sendRemoteMsg(), and startMlogCheckpoint().
Definition at line 160 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), _startCheckpointHandler(), addBufferedDeterminant(), and startMlogCheckpoint().
Definition at line 161 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), _startCheckpointHandler(), _storeDeterminantsHandler(), and startMlogCheckpoint().
float MLOGFT_totalLogSize = 0.0 |
Definition at line 165 of file ckcausalmlog.C.
Referenced by _messageLoggingExit(), sendMsg(), and sendRemoteMsg().
float MLOGFT_totalMessages = 0.0 |
Definition at line 166 of file ckcausalmlog.C.
Referenced by _messageLoggingExit(), sendMsg(), and sendRemoteMsg().
double adjustChkptPeriod = 0.0 [static] |
Definition at line 171 of file ckcausalmlog.C.
Referenced by _recvCheckpointHandler(), and _recvRestartCheckpointHandler().
double nextCheckpointTime = 0.0 [static] |
Definition at line 172 of file ckcausalmlog.C.
CkHashtableT<CkHashtableAdaptorT<CkObjID>,CkHashtableT<CkHashtableAdaptorT<CkObjID>,SNToTicket *> *> detTable(1000, 0.3) [static] |
Referenced by populateDeterminantTable(), and removeProcessedLogs().
char objString[100] |
Definition at line 177 of file ckcausalmlog.C.
Referenced by buildProcessedTicketLog(), createObjIDList(), and processReceivedTN().
Definition at line 178 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), and checkpointAlarm().
Definition at line 179 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), _startCheckpointHandler(), and startMlogCheckpoint().
Definition at line 180 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), and _storeCheckpointHandler().
Definition at line 181 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), and CkMlogRestart().
Definition at line 182 of file ckcausalmlog.C.
Referenced by _getCheckpointHandler(), _messageLoggingInit(), and sendCheckpointData().
Definition at line 183 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), and sendRemoveLogRequests().
Definition at line 185 of file ckcausalmlog.C.
Referenced by _getCheckpointHandler(), _getRestartCheckpointHandler(), and _messageLoggingInit().
Definition at line 186 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), and _verifyAckRequestHandler().
Definition at line 187 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), sendDummyMigration(), and sendDummyMigrationCounts().
Definition at line 190 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), and getGlobalStep().
Definition at line 191 of file ckcausalmlog.C.
Referenced by _getGlobalStepHandler(), and _messageLoggingInit().
Definition at line 194 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), and _updateHomeRequestHandler().
Definition at line 195 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), _recvGlobalStepHandler(), _recvRestartCheckpointHandler(), _sendDetsReplyHandler(), and _updateHomeAckHandler().
Definition at line 196 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), and _recvCheckpointHandler().
Definition at line 197 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), and _sendDetsHandler().
Definition at line 198 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), and _sendDetsReplyHandler().
Definition at line 199 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), and _sendDetsReplyHandler().
Definition at line 200 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), and ElementDistributor::addLocation().
Definition at line 201 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), and sendBackImmigrantRecObjs().
Definition at line 203 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), and _storeDeterminantsHandler().
Definition at line 206 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), and CkMlogRestart().
Definition at line 207 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), and _restartHandler().
Definition at line 208 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), and sendCheckpointData().
Definition at line 212 of file ckcausalmlog.C.
Referenced by _getCheckpointHandler(), _getRestartCheckpointHandler(), and _verifyAckHandler().
Definition at line 213 of file ckcausalmlog.C.
Referenced by _getCheckpointHandler(), _getRestartCheckpointHandler(), and _verifyAckHandler().
Definition at line 215 of file ckcausalmlog.C.
Referenced by _dummyMigrationHandler(), and _verifyAckRequestHandler().
Definition at line 217 of file ckcausalmlog.C.
int _falseRestart = 0 |
Definition at line 219 of file ckcausalmlog.C.
For testing on clusters we might carry out restarts on a processor without actually starting it: 1 -> false restart, 0 -> restart after an actual crash.
Definition at line 227 of file ckcausalmlog.C.
Referenced by _checkpointAckHandler(), initMlogLBStep(), and startMlogCheckpoint().
void* centralLb |
Definition at line 228 of file ckcausalmlog.C.
Referenced by _checkpointBarrierAckHandler(), _sendBackLocationHandler(), restoreParallelRecovery(), and startLoadBalancingMlog().
void(* resumeLbFnPtr)(void *) |
Referenced by restoreParallelRecovery(), and startLoadBalancingMlog().
Definition at line 230 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), and _receiveMigrationNoticeAckHandler().
Definition at line 231 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), _updateHomeRequestHandler(), and sendMlogLocation().
Definition at line 232 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), and _receiveMigrationNoticeHandler().
Definition at line 233 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), and finishedCheckpointLoadBalancing().
Definition at line 234 of file ckcausalmlog.C.
Referenced by _checkpointBarrierHandler(), and _messageLoggingInit().
std::vector<MigrationRecord> migratedNoticeList |
Definition at line 236 of file ckcausalmlog.C.
Referenced by _getCheckpointHandler(), _getRestartCheckpointHandler(), _receiveMigrationNoticeHandler(), _storeCheckpointHandler(), _verifyAckHandler(), clearUpMigratedRetainedLists(), and sendCheckpointData().
std::vector<RetainedMigratedObject *> retainedObjectList |
Definition at line 237 of file ckcausalmlog.C.
Referenced by _updateHomeRequestHandler(), clearUpMigratedRetainedLists(), and sendMlogLocation().
Definition at line 239 of file ckcausalmlog.C.
Referenced by _receiveMigrationNoticeAckHandler(), initMlogLBStep(), and startLoadBalancingMlog().
int countLBToMigrate = 0 |
Definition at line 240 of file ckcausalmlog.C.
Referenced by _receiveMigrationNoticeAckHandler(), initMlogLBStep(), sendMlogLocation(), and startLoadBalancingMlog().
Definition at line 241 of file ckcausalmlog.C.
Referenced by _receiveMigrationNoticeAckHandler(), initMlogLBStep(), and startLoadBalancingMlog().
Definition at line 244 of file ckcausalmlog.C.
int restartDecisionNumber = -1 |
Definition at line 245 of file ckcausalmlog.C.
Referenced by _recvGlobalStepHandler(), _recvRestartCheckpointHandler(), _sendDetsReplyHandler(), and _updateHomeAckHandler().
double lastCompletedAlarm = 0 |
Definition at line 247 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), checkpointAlarm(), and startMlogCheckpoint().
double lastRestart = 0 |
Definition at line 248 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), _resendMessagesHandler(), and _sendDetsHandler().
Definition at line 251 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), and informLocationHome().
int heartBeatHandlerIdx [static] |
Definition at line 254 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), and heartBeatPartner().
int heartBeatCheckHandlerIdx [static] |
int partnerFailureHandlerIdx [static] |
Definition at line 256 of file ckcausalmlog.C.
Referenced by _messageLoggingInit(), and heartBeatCheckHandler().
double lastPingTime = -1 [static] |
Definition at line 257 of file ckcausalmlog.C.
Referenced by heartBeatCheckHandler(), heartBeatHandler(), and CkMemCheckPT::isMaster().
int inCkptFlag = 0 |
Definition at line 267 of file ckcausalmlog.C.
Referenced by _checkpointAckHandler(), _checkpointBarrierAckHandler(), _startCheckpointHandler(), heartBeatCheckHandler(), and startMlogCheckpoint().
This method is used to retry the ticket requests that had been queued up earlier.
Definition at line 1341 of file ckcausalmlog.C.
std::vector<TProcessedLog> processedTicketLog |
Definition at line 1354 of file ckcausalmlog.C.
Referenced by sendRemoveLogRequests(), and startMlogCheckpoint().
double totalSearchRestoredTime = 0 |
Definition at line 3903 of file ckcausalmlog.C.
double totalSearchRestoredCount = 0 |
Definition at line 3904 of file ckcausalmlog.C.