PPL Logo

ck-core/ckmessagelogging.C File Reference

Go to the source code of this file.

Data Structures

class  ElementPacker
class  ElementDistributor
class  ElementCaller

Functions

const char * idx2str (const CkArrayIndex &ind)
const char * idx2str (const ArrayElement *el)
void getGlobalStep (CkGroupID gID)
bool fault_aware (CkObjID &recver)
void createObjIDList (void *data, ChareMlogData *mlogData)
bool isLocal (int destPE)
 Determines if the message is local or not.
bool isTeamLocal (int destPE)
 Determines if the message is group local or not.
void printLog (CkObjID *log)
void readKillFile ()
 CpvDeclare (Chare *, _currentObj)
 CpvDeclare (StoredCheckpoint *, _storedCheckpointData)
 CpvDeclare (char *, _incarnation)
 CpvDeclare (int, _numEmigrantRecObjs)
 CpvDeclare (int, _numImmigrantRecObjs)
 CpvDeclare (std::vector< CkLocation * > *, _immigrantRecObjs)
void setTeamRecovery (void *data, ChareMlogData *mlogData)
void unsetTeamRecovery (void *data, ChareMlogData *mlogData)
void mpi_restart_crashed (int pe, int rank)
int find_spare_mpirank (int pe, int partition)
void heartBeatPartner ()
 Pings buddy to let it know this PE is alive.
void heartBeatHandler (void *msg)
 Registers last time it knew about the PE that checkpoints on it.
void heartBeatCheckHandler ()
 Checks whether the PE that checkpoints on it is still alive.
void partnerFailureHandler (char *msg)
 Receives the notification of a failure and updates pe-to-rank mapping.
int getReverseCheckPointPE ()
 Getting the pe that checkpoints on this pe.
static void * doNothingMsg (int *size, void *data, void **remote, int count)
void _messageLoggingInit ()
 Initialize message logging data structures and register handlers.
void killLocal (void *_dummy, double curWallTime)
void readFaultFile ()
 Reads the PE that will be failing throughout the execution and the mean time between failures.
void CkDieNow ()
void sendGroupMsg (envelope *env, int destPE, int _infoIdx)
 Sends a group message that might be a broadcast.
void sendNodeGroupMsg (envelope *env, int destNode, int _infoIdx)
 Sends a nodegroup message that might be a broadcast.
void sendArrayMsg (envelope *env, int destPE, int _infoIdx)
 Sends a message to an array element.
void sendChareMsg (envelope *env, int destPE, int _infoIdx, const CkChareID *pCid)
 Sends a message to a singleton chare.
void sendCommonMsg (CkObjID &recver, envelope *_env, int destPE, int _infoIdx)
 A method to generate the actual ticket requests for groups, nodegroups or arrays.
void sendRemoteMsg (CkObjID &sender, CkObjID &recver, int destPE, MlogEntry *entry, MCount SN, int resend)
 Method that does the actual send by creating a ticket request filling it up and sending it.
void sendLocalMsg (envelope *env, int _infoIdx)
 Function to send a local message.
int preProcessReceivedMessage (envelope *env, Chare **objPointer, MlogEntry **logEntryPointer)
void postProcessReceivedMessage (Chare *obj, CkObjID &sender, MCount SN, MlogEntry *entry)
 Updates a few variables once a message has been processed.
void generalCldEnqueue (int destPE, envelope *env, int _infoIdx)
void _pingHandler (CkPingMsg *msg)
void buildProcessedTicketLog (void *data, ChareMlogData *mlogData)
void clearUpMigratedRetainedLists (int PE)
void checkpointAlarm (void *_dummy, double curWallTime)
void _checkpointRequestHandler (CheckpointRequest *request)
void CkStartMlogCheckpoint (CkCallback &cb)
 Starts checkpoint phase at PE 0.
void _startCheckpointHandler (CheckpointBarrierMsg *startMsg)
 Starts checkpoint: send its checkpoint to its partner.
void _endCheckpointHandler (char *msg)
 Finishes checkpoint process by making the callback.
void startMlogCheckpoint (void *_dummy, double curWallTime)
 Starts the checkpoint phase after migration.
void pupArrayElementsSkip (PUP::er &p, bool create, MigrationRecord *listToSkip, int listsize)
 Pups all the array elements in this processor.
void readCheckpointFromDisk (int size, char *data)
 Reads a checkpoint from disk.
void writeCheckpointToDisk (int size, char *data)
 Writes a checkpoint to disk.
void _storeCheckpointHandler (char *msg)
void _checkpointAckHandler (CheckPointAck *ackMsg)
void CkMlogRestart (const char *dummy, CkArgMsg *dummyMsg)
 Function for restarting the crashed processor.
void CkMlogRestartDouble (void *, double)
void _getCheckpointHandler (RestartRequest *restartMsg)
 Gets the stored checkpoint for its buddy processor.
void _recvCheckpointHandler (char *_restartData)
 Receives the checkpoint data from its buddy, restores the state of all the objects and asks everyone else to update its home.
void initializeRestart (void *data, ChareMlogData *mlogData)
 Initializes variables and flags for restarting procedure.
void updateHomePE (void *data, ChareMlogData *mlogData)
 Updates the homePe of chare array elements.
void printLog (CkObjID &recver)
 Prints a processed log.
void printMsg (envelope *env, const char *par)
 Prints information about a message.
void resendMessageForChare (void *data, ChareMlogData *mlogData)
 Resends all the logged messages to a particular chare list.
void _resendMessagesHandler (char *msg)
 Resends messages since last checkpoint to the list of objects included in the request.
void distributeRestartedObjects ()
 Distributes objects to accelerate recovery after a failure.
void _sendBackLocationHandler (char *receivedMsg)
 Handler to receive back a location.
void _distributedLocationHandler (char *receivedMsg)
 Handler to update information about an object just received.
void sendDummyMigration (int restartPE, CkGroupID lbID, CkGroupID locMgrID, CkArrayIndexMax &idx, int locationPE)
 This method is used to send messages to a restarted processor to tell it that a particular expected object is not going to get to it.
void sendDummyMigrationCounts (int *dummyCounts)
 This method is used by a restarted processor to tell other processors that they are not going to receive this many objects.
void _dummyMigrationHandler (DummyMigrationMsg *msg)
 This handler is used to process a dummy migration msg.
void forAllCharesDo (MlogFn fnPointer, void *data)
 Maps the function pointed to by fnPointer over all the chares living on this processor.
void initMlogLBStep (CkGroupID gid)
 This is the first time Converse is called after AtSync method has been called by every local object.
void pupLocation (CkLocation *loc, CkLocMgr *locMgr, PUP::er &p)
 Pups a location.
void sendBackImmigrantRecObjs ()
 Sends back the immigrant recovering objects to their origin PE.
void restoreParallelRecovery (void(*_fnPtr)(void *), void *_centralLb)
 Restores objects after parallel recovery, either by sending back the immigrant objects or by waiting for all emigrant objects to be back.
void startLoadBalancingMlog (void(*_fnPtr)(void *), void *_centralLb)
 Load Balancing.
void finishedCheckpointLoadBalancing ()
void _receiveMlogLocationHandler (void *buf)
void _checkpointBarrierHandler (CheckpointBarrierMsg *barrierMsg)
 Processor 0 receives a contribution from every other processor after checkpoint.
void _checkpointBarrierAckHandler (CheckpointBarrierMsg *msg)
void garbageCollectMlogForChare (void *data, ChareMlogData *mlogData)
 Function to remove all messages in the message log of a particular chare.
void garbageCollectMlog ()
 Garbage collects the message log and other data structures.
void informLocationHome (CkGroupID locMgrID, CkArrayIndexMax idx, int homePE, int currentPE)
 Method that informs an array element's home processor of its current location. It is a Converse method to bypass the Charm++ message logging framework.
void _receiveLocationHandler (CurrentLocationMsg *data)
void _getGlobalStepHandler (LBStepMsg *msg)
void _recvGlobalStepHandler (LBStepMsg *msg)
 Receives the global step handler from PE 0.
void _messageLoggingExit ()
 Function to wrap up performance information.
int getCheckPointPE ()
 Getting the pe number of the current processor's buddy.
envelope * copyEnvelope (envelope *env)

Variables

int _restartFlag = false
int _numRestartResponses = 0
char * checkpointDirectory = "."
int unAckedCheckpoint = 0
int countUpdateHomeAcks = 0
int teamSize
int chkptPeriod
bool fastRecovery
int parallelRecovery
char * killFile
char * faultFile
int killFlag = 0
int faultFlag = 0
int restartingMlogFlag = 0
double killTime = 0.0
double faultMean
int checkpointCount = 0
int diskCkptFlag = 0
static char fName [100]
int * numMsgsTarget
int * sizeMsgsTarget
int totalMsgsTarget
float totalMsgsSize
int msgLogSize
int bufferedDetsSize
int storedDetsSize
int _pingHandlerIdx
int _checkpointRequestHandlerIdx
int _storeCheckpointHandlerIdx
int _checkpointAckHandlerIdx
int _getCheckpointHandlerIdx
int _recvCheckpointHandlerIdx
int _dummyMigrationHandlerIdx
int _getGlobalStepHandlerIdx
int _recvGlobalStepHandlerIdx
int _updateHomeRequestHandlerIdx
int _updateHomeAckHandlerIdx
int _resendMessagesHandlerIdx
int _receivedDetDataHandlerIdx
int _distributedLocationHandlerIdx
int _sendBackLocationHandlerIdx
int _falseRestart = 0
int onGoingLoadBalancing = 0
 For testing on clusters we might carry out restarts on a processor without actually starting it: 1 -> false restart, 0 -> restart after an actual crash.
void * centralLb
void(* resumeLbFnPtr )(void *)
int _receiveMlogLocationHandlerIdx
int _checkpointBarrierHandlerIdx
int _checkpointBarrierAckHandlerIdx
int _startCheckpointIdx
int _endCheckpointIdx
int donotCountMigration = 0
int countLBMigratedAway = 0
int countLBToMigrate = 0
int migrationDoneCalled = 0
int checkpointBarrierCount = 0
int globalResumeCount = 0
CkGroupID globalLBID
int restartDecisionNumber = -1
double lastCompletedAlarm = 0
double lastRestart = 0
CkCallback ckptCallback
int _receiveLocationHandlerIdx
static int heartBeatHandlerIdx
static int heartBeatCheckHandlerIdx
static int partnerFailureHandlerIdx
static double lastPingTime = -1
int inCkptFlag = 0


Function Documentation

const char* idx2str ( const CkArrayIndex &  ind  ) 

const char* idx2str ( const ArrayElement *  el  ) 

void getGlobalStep ( CkGroupID  gID  ) 

bool fault_aware ( CkObjID &  recver  ) 

void createObjIDList ( void *  data,
ChareMlogData *  mlogData 
)

bool isLocal ( int  destPE  )  [inline]

Determines if the message is local or not.

A message is local if: 1) Both the destination and origin are the same PE.

bool isTeamLocal ( int  destPE  )  [inline]

Determines if the message is group local or not.

A message is group local if: 1) They belong to the same team in the team-based message logging.

void printLog ( CkObjID *  log  ) 

void readKillFile (  ) 

CpvDeclare ( Chare *  ,
_currentObj   
)

CpvDeclare ( StoredCheckpoint *  ,
_storedCheckpointData   
)

CpvDeclare ( char *  ,
_incarnation   
)

CpvDeclare ( int  ,
_numEmigrantRecObjs   
)

CpvDeclare ( int  ,
_numImmigrantRecObjs   
)

CpvDeclare ( std::vector< CkLocation * > *  ,
_immigrantRecObjs   
)

void setTeamRecovery ( void *  data,
ChareMlogData *  mlogData 
)

void unsetTeamRecovery ( void *  data,
ChareMlogData *  mlogData 
)

void mpi_restart_crashed ( int  pe,
int  rank 
)

int find_spare_mpirank ( int  pe,
int  partition 
)

void heartBeatPartner (  ) 

Pings buddy to let it know this PE is alive.

Used for failure detection.

void heartBeatHandler ( void *  msg  ) 

Registers last time it knew about the PE that checkpoints on it.

void heartBeatCheckHandler (  ) 

Checks whether the PE that checkpoints on it is still alive.

void partnerFailureHandler ( char *  msg  ) 

Receives the notification of a failure and updates pe-to-rank mapping.

int getReverseCheckPointPE (  ) 

Getting the pe that checkpoints on this pe.

static void* doNothingMsg ( int *  size,
void *  data,
void **  remote,
int  count 
) [static]

Definition at line 172 of file ckmessagelogging.C.

void _messageLoggingInit (  ) 

Initialize message logging data structures and register handlers.

Definition at line 181 of file ckmessagelogging.C.

References _checkpointAckHandler(), _checkpointAckHandlerIdx, _checkpointBarrierAckHandler(), _checkpointBarrierAckHandlerIdx, _checkpointBarrierHandler(), _checkpointBarrierHandlerIdx, _checkpointRequestHandler(), _checkpointRequestHandlerIdx, _distributedLocationHandler(), _distributedLocationHandlerIdx, _dummyMigrationHandler(), _dummyMigrationHandlerIdx, _endCheckpointHandler(), _endCheckpointIdx, _getCheckpointHandler(), _getCheckpointHandlerIdx, _getGlobalStepHandler(), _getGlobalStepHandlerIdx, _getRestartCheckpointHandler(), _getRestartCheckpointHandlerIdx, _indexBufferedDets, _maxBufferedDets, _numBufferedDets, _phaseBufferedDets, _pingHandler(), _pingHandlerIdx, _receivedDetDataHandler(), _receivedDetDataHandlerIdx, _receivedTNDataHandler(), _receivedTNDataHandlerIdx, _receiveLocationHandler(), _receiveLocationHandlerIdx, _receiveMigrationNoticeAckHandler(), _receiveMigrationNoticeAckHandlerIdx, _receiveMigrationNoticeHandler(), _receiveMigrationNoticeHandlerIdx, _receiveMlogLocationHandler(), _receiveMlogLocationHandlerIdx, _recvCheckpointHandler(), _recvCheckpointHandlerIdx, _recvGlobalStepHandler(), _recvGlobalStepHandlerIdx, _recvRestartCheckpointHandler(), _recvRestartCheckpointHandlerIdx, _removeDeterminantsHandler(), _removeDeterminantsHandlerIdx, _removeProcessedLogHandler(), _removeProcessedLogHandlerIdx, _resendMessagesHandler(), _resendMessagesHandlerIdx, _restartHandler(), _restartHandlerIdx, _sendBackLocationHandler(), _sendBackLocationHandlerIdx, _sendDetsHandler(), _sendDetsHandlerIdx, _sendDetsReplyHandler(), _sendDetsReplyHandlerIdx, _startCheckpointHandler(), _startCheckpointIdx, _storeCheckpointHandler(), _storeCheckpointHandlerIdx, _storeDeterminantsHandler(), _storeDeterminantsHandlerIdx, _updateHomeAckHandler(), _updateHomeAckHandlerIdx, _updateHomeRequestHandler(), _updateHomeRequestHandlerIdx, _verifyAckHandler(), _verifyAckHandlerIdx, _verifyAckRequestHandler(), _verifyAckRequestHandlerIdx, bufferedDetsSize, CcdCallOnCondition(), 
Converse::CkMyPe(), Converse::CkNumPes(), CmiAlloc(), CmiWallTimer(), CqsCreate(), diskCkptFlag, fName, heartBeatCheckHandler(), heartBeatCheckHandlerIdx, heartBeatHandler(), heartBeatHandlerIdx, heartBeatPartner(), int, lastCompletedAlarm, lastRestart, msgLogSize, numDets, numDupDets, numMsgsTarget, numPiggyDets, partnerFailureHandler(), partnerFailureHandlerIdx, sizeMsgsTarget, storedDetsSize, totalMsgsSize, totalMsgsTarget, and traceRegisterUserEvent().

Here is the call graph for this function:

void killLocal ( void *  _dummy,
double  curWallTime 
)

void readFaultFile (  ) 

Reads the PE that will be failing throughout the execution and the mean time between failures.

We assume an exponential distribution for the mean-time-between-failures.

Definition at line 395 of file ckmessagelogging.C.

References CcdCallFnAfter(), Converse::CkMyPe(), faultFile, faultMean, and killLocal().

Here is the call graph for this function:

void CkDieNow (  ) 

Definition at line 425 of file ckmessagelogging.C.

References CcdCallFnAfter(), CmiMyPe(), CmiPrintf(), CmiWallTimer(), killLocal(), and killTime.

Here is the call graph for this function:

void sendGroupMsg ( envelope *  env,
int  destPE,
int  _infoIdx 
)

Sends a group message that might be a broadcast.

Definition at line 442 of file ckmessagelogging.C.

References CkCopyMsg(), Converse::CkMyPe(), CmiMyPe(), CkObjID::data, EnvToUsr(), envelope::getGroupNum(), _ObjectID::group, _ObjectID::id, _ObjectID::onPE, sendCommonMsg(), sendGroupMsg(), CkObjID::type, TypeGroup, TypeInvalid, and UsrToEnv().

Here is the call graph for this function:

void sendNodeGroupMsg ( envelope *  env,
int  destNode,
int  _infoIdx 
)

Sends a nodegroup message that might be a broadcast.

Definition at line 473 of file ckmessagelogging.C.

References CkCopyMsg(), Converse::CkMyPe(), CkObjID::data, EnvToUsr(), envelope::getGroupNum(), _ObjectID::group, _ObjectID::id, _ObjectID::onPE, sendCommonMsg(), sendNodeGroupMsg(), CkObjID::type, TypeInvalid, TypeNodeGroup, and UsrToEnv().

Here is the call graph for this function:

void sendArrayMsg ( envelope *  env,
int  destPE,
int  _infoIdx 
)

void sendChareMsg ( envelope *  env,
int  destPE,
int  _infoIdx,
const CkChareID *  pCid 
)

Sends a message to a singleton chare.

Definition at line 524 of file ckmessagelogging.C.

References _ObjectID::chare, Converse::CkMyPe(), CkObjID::data, _ObjectID::id, sendCommonMsg(), CkObjID::toString(), CkObjID::type, TypeArray, and TypeChare.

Here is the call graph for this function:

void sendCommonMsg ( CkObjID &  recver,
envelope *  _env,
int  destPE,
int  _infoIdx 
)

void sendRemoteMsg ( CkObjID &  sender,
CkObjID &  recver,
int  destPE,
MlogEntry *  entry,
MCount  SN,
int  resend 
)

Method that does the actual send by creating a ticket request filling it up and sending it.

Definition at line 624 of file ckmessagelogging.C.

References MlogEntry::_infoIdx, ChareMlogData::addLogEntry(), Converse::CkMyPe(), CmiMemoryCheck(), MlogEntry::env, float, generalCldEnqueue(), envelope::getTotalsize(), Chare::mlogData, MLOGFT_totalLogSize, MLOGFT_totalMessages, msgLogSize, numMsgsTarget, sizeMsgsTarget, totalMsgsSize, and totalMsgsTarget.

Referenced by sendCommonMsg().

Here is the call graph for this function:

Here is the caller graph for this function:

void sendLocalMsg ( envelope *  env,
int  _infoIdx 
)

Function to send a local message.

It first gets a ticket and then enqueues the message. If we are recovering, then the message is enqueued in a delay queue.

Definition at line 667 of file ckmessagelogging.C.

References _skipCldEnqueue(), CmiMemoryCheck(), and CmiMyPe().

Here is the call graph for this function:

int preProcessReceivedMessage ( envelope *  env,
Chare **  objPointer,
MlogEntry **  logEntryPointer 
)

void postProcessReceivedMessage ( Chare *  obj,
CkObjID &  sender,
MCount  SN,
MlogEntry *  entry 
)

Updates a few variables once a message has been processed.

Definition at line 770 of file ckmessagelogging.C.

References Converse::CkMyPe(), CmiMemoryCheck(), MlogEntry::env, CkObjID::guessPE(), Chare::mlogData, and ChareMlogData::tProcessed.

Here is the call graph for this function:

void generalCldEnqueue ( int  destPE,
envelope *  env,
int  _infoIdx 
)

Definition at line 777 of file ckmessagelogging.C.

References _noCldNodeEnqueue(), _skipCldEnqueue(), and TypeNodeGroup.

Here is the call graph for this function:

void _pingHandler ( CkPingMsg *  msg  ) 

Definition at line 793 of file ckmessagelogging.C.

References Converse::CkMyPe(), CmiFree(), and RestartRequest::PE.

Here is the call graph for this function:

void buildProcessedTicketLog ( void *  data,
ChareMlogData *  mlogData 
)

void clearUpMigratedRetainedLists ( int  PE  ) 

void checkpointAlarm ( void *  _dummy,
double  curWallTime 
)

void _checkpointRequestHandler ( CheckpointRequest *  request  ) 

Definition at line 825 of file ckmessagelogging.C.

References CmiWallTimer(), and startMlogCheckpoint().

Here is the call graph for this function:

void CkStartMlogCheckpoint ( CkCallback &  cb  ) 

Starts checkpoint phase at PE 0.

Definition at line 832 of file ckmessagelogging.C.

References _startCheckpointIdx, CmiAlloc(), and Converse::CmiSyncBroadcastAllAndFree().

Here is the call graph for this function:

void _startCheckpointHandler ( CheckpointBarrierMsg *  startMsg  ) 

Starts checkpoint: send its checkpoint to its partner.

This checkpointing strategy is NOT connected to the load balancer, hence onGoingLoadBalancing==0.

Definition at line 847 of file ckmessagelogging.C.

References _storeCheckpointHandlerIdx, buf, bufferedDetsSize, checkpointCount, Converse::CkMyPe(), CkPupGroupData(), CkPupNodeGroupData(), CkPupROData(), CmiAlloc(), CmiFree(), CmiMemoryCheck(), CmiMyPe(), Converse::CmiSyncSendAndFree(), CmiTimer(), CmiWallTimer(), CheckPointDataMsg::dataSize, dataSize, garbageCollectMlog(), getCheckPointPE(), _ckGroupID::idx, inCkptFlag, msgLogSize, CheckPointDataMsg::PE, pupArrayElementsSkip(), PUP::sizer::size(), storedDetsSize, and unAckedCheckpoint.

Referenced by _messageLoggingInit().

Here is the call graph for this function:

Here is the caller graph for this function:

void _endCheckpointHandler ( char *  msg  ) 

Finishes checkpoint process by making the callback.

Definition at line 926 of file ckmessagelogging.C.

References CmiFree(), and CkCallback::send().

Referenced by _messageLoggingInit().

Here is the call graph for this function:

Here is the caller graph for this function:

void startMlogCheckpoint ( void *  _dummy,
double  curWallTime 
)

void pupArrayElementsSkip ( PUP::er &  p,
bool  create,
MigrationRecord *  listToSkip,
int  listsize 
)

Pups all the array elements in this processor.

Definition at line 1030 of file ckmessagelogging.C.

References CkCountArrayElements(), Converse::CkMyPe(), CmiMyPe(), flag, CkLocMgr::homePe(), _ckGroupID::idx, idx, MigrationRecord::idx, idx2str(), informLocationHome(), PUP::er::isUnpacking(), CkLocMgr::numLocalElements(), and CkLocMgr::resume().

Here is the call graph for this function:

void readCheckpointFromDisk ( int  size,
char *  data 
)

Reads a checkpoint from disk.

Assumes variable fName contains the name of the file.

Definition at line 1089 of file ckmessagelogging.C.

References fName.

void writeCheckpointToDisk ( int  size,
char *  data 
)

Writes a checkpoint to disk.

Assumes variable fName contains the name of the file.

Definition at line 1098 of file ckmessagelogging.C.

References fName.

void _storeCheckpointHandler ( char *  msg  ) 

void _checkpointAckHandler ( CheckPointAck *  ackMsg  ) 

void CkMlogRestart ( const char *  dummy,
CkArgMsg *  dummyMsg 
)

Function for restarting the crashed processor.

It sets the restart flag and contacts the buddy processor to get the latest checkpoint.

Definition at line 1172 of file ckmessagelogging.C.

References _getCheckpointHandlerIdx, _numRestartResponses, _recoveryFlag, _restartFlag, _restartHandlerIdx, Converse::CkMyPe(), Converse::CkNumPes(), CmiInitMsgHeader(), Converse::CmiSyncSend(), CmiWallTimer(), getCheckPointPE(), RestartRequest::header, msg, RestartRequest::PE, and teamSize.

Here is the call graph for this function:

void CkMlogRestartDouble ( void *  ,
double   
)

Definition at line 1188 of file ckmessagelogging.C.

References CkMlogRestart().

Here is the call graph for this function:

void _getCheckpointHandler ( RestartRequest *  restartMsg  ) 

void _recvCheckpointHandler ( char *  _restartData  ) 

void initializeRestart ( void *  data,
ChareMlogData *  mlogData 
)

Initializes variables and flags for restarting procedure.

Definition at line 1283 of file ckmessagelogging.C.

References ChareMlogData::receivedTNs, ChareMlogData::resendReplyRecvd, and ChareMlogData::restartFlag.

void updateHomePE ( void *  data,
ChareMlogData *  mlogData 
)

void printLog ( CkObjID &  recver  ) 

Prints a processed log.

Definition at line 1314 of file ckmessagelogging.C.

References Converse::CkMyPe(), and CkObjID::toString().

Here is the call graph for this function:

void printMsg ( envelope *  env,
const char *  par 
)

Prints information about a message.

Definition at line 1322 of file ckmessagelogging.C.

References Converse::CkMyPe().

Here is the call graph for this function:

void resendMessageForChare ( void *  data,
ChareMlogData *  mlogData 
)

Resends all the logged messages to a particular chare list.

Parameters:
data is of type ResendData which contains the array of objects on the restartedProcessor.
mlogData a particular chare living in this processor.

Definition at line 1333 of file ckmessagelogging.C.

References Converse::CkMyPe(), Converse::CmiSyncSend(), copyEnvelope(), count, CqsEnqueueGeneral(), MlogEntry::env, ChareMlogData::getMlog(), envelope::getPriobits(), envelope::getPrioPtr(), envelope::getQueueing(), envelope::getTotalsize(), CkQ< T >::length(), ResendData::listObjects, ResendData::numberObjects, ChareMlogData::objID, ResendData::PE, printMsg(), CkObjID::toString(), TypeInvalid, and TypeNodeGroup.

Here is the call graph for this function:

void _resendMessagesHandler ( char *  msg  ) 

Resends messages since last checkpoint to the list of objects included in the request.

It also sends stored remote determinants to the particular failed PE.

Definition at line 1387 of file ckmessagelogging.C.

References Converse::CkMyPe(), CmiFree(), CmiMemoryCheck(), CmiResetGlobalReduceSeqID(), CmiWallTimer(), PUP::d, fillTicketForChare(), forAllCharesDo(), isTeamLocal(), lastRestart, ResendData::listObjects, ResendRequest::numberObjects, ResendData::numberObjects, ResendRequest::PE, ResendData::PE, and resendMessageForChare().

Here is the call graph for this function:

void distributeRestartedObjects (  ) 

Distributes objects to accelerate recovery after a failure.

Definition at line 1492 of file ckmessagelogging.C.

References Converse::CkMyPe().

Here is the call graph for this function:

void _sendBackLocationHandler ( char *  receivedMsg  ) 

void _distributedLocationHandler ( char *  receivedMsg  ) 

void sendDummyMigration ( int  restartPE,
CkGroupID  lbID,
CkGroupID  locMgrID,
CkArrayIndexMax &  idx,
int  locationPE 
)

This method is used to send messages to a restarted processor to tell it that a particular expected object is not going to get to it.

Definition at line 1585 of file ckmessagelogging.C.

References _dummyMigrationHandlerIdx, buf, CmiInitMsgHeader(), Converse::CmiSyncSend(), DummyMigrationMsg::flag, DummyMigrationMsg::header, DummyMigrationMsg::idx, DummyMigrationMsg::lbID, DummyMigrationMsg::locationPE, and DummyMigrationMsg::mgrID.

Here is the call graph for this function:

void _dummyMigrationHandler ( DummyMigrationMsg *  msg  ) 

This handler is used to process a dummy migration msg.

It looks up the load balancer and calls Migrated() on it.

Definition at line 1620 of file ckmessagelogging.C.

References CmiFree(), CmiMyPe(), CmiPrintf(), DummyMigrationMsg::count, DummyMigrationMsg::flag, h, DummyMigrationMsg::idx, _ckGroupID::idx, idx2str(), lb, DummyMigrationMsg::lbID, DummyMigrationMsg::locationPE, DummyMigrationMsg::mgrID, CentralLB::Migrated(), and verifyAckedRequests.

Here is the call graph for this function:

void forAllCharesDo ( MlogFn  fnPointer,
void *  data 
)

Maps the function pointed to by fnPointer over all the chares living on this processor.

Definition at line 1669 of file ckmessagelogging.C.

References caller, and Chare::mlogData.

void pupLocation ( CkLocation *  loc,
CkLocMgr *  locMgr,
PUP::er &  p 
)

Pups a location.

Definition at line 1713 of file ckmessagelogging.C.

References IrrGroup::ckGetGroupID(), CkLocation::getIndex(), and idx.

Here is the call graph for this function:

void sendBackImmigrantRecObjs (  ) 

void restoreParallelRecovery ( void(*)(void *)  _fnPtr,
void *  _centralLb 
)

Restores objects after parallel recovery, either by sending back the immigrant objects or by waiting for all emigrant objects to be back.

Definition at line 1792 of file ckmessagelogging.C.

References centralLb, resumeLbFnPtr, and sendBackImmigrantRecObjs().

Here is the call graph for this function:

void startLoadBalancingMlog ( void(*)(void *)  _fnPtr,
void *  _centralLb 
)

Load Balancing.

Definition at line 1809 of file ckmessagelogging.C.

References centralLb, CmiMyPe(), CmiWallTimer(), countLBMigratedAway, countLBToMigrate, migrationDoneCalled, resumeLbFnPtr, and startMlogCheckpoint().

Here is the call graph for this function:

void finishedCheckpointLoadBalancing (  ) 

Definition at line 1822 of file ckmessagelogging.C.

References _checkpointBarrierHandlerIdx, CmiAlloc(), CmiMyPe(), CmiReduce(), and doNothingMsg().

Here is the call graph for this function:

void _receiveMlogLocationHandler ( void *  buf  ) 

void _checkpointBarrierHandler ( CheckpointBarrierMsg *  barrierMsg  ) 

Processor 0 receives a contribution from every other processor after checkpoint.

Definition at line 1843 of file ckmessagelogging.C.

References _checkpointBarrierAckHandlerIdx, CmiAlloc(), CmiFree(), and Converse::CmiSyncBroadcastAllAndFree().

Here is the call graph for this function:

void _checkpointBarrierAckHandler ( CheckpointBarrierMsg *  msg  ) 

Definition at line 1854 of file ckmessagelogging.C.

References centralLb, Converse::CkMyPe(), CmiFree(), CmiMyPe(), CmiPrintf(), inCkptFlag, and sendRemoveLogRequests().

Here is the call graph for this function:

void garbageCollectMlogForChare ( void *  data,
ChareMlogData *  mlogData 
)

Function to remove all messages in the message log of a particular chare.

Definition at line 1872 of file ckmessagelogging.C.

References CkQ< T >::deq(), ChareMlogData::getMlog(), and CkQ< T >::length().

Here is the call graph for this function:

void garbageCollectMlog (  ) 

Garbage collects the message log and other data structures.

In case of synchronized checkpoint, we use an optimization to avoid causal message logging protocol to communicate all determinants to the rest of the processors.

Definition at line 1889 of file ckmessagelogging.C.

References _indexBufferedDets, _numBufferedDets, _phaseBufferedDets, Converse::CkMyPe(), forAllCharesDo(), garbageCollectMlogForChare(), CkHashtableIterator::hasNext(), and CkHashtableIterator::next().

Here is the call graph for this function:

void informLocationHome ( CkGroupID  locMgrID,
CkArrayIndexMax  idx,
int  homePE,
int  currentPE 
)

Method that informs an array element's home processor of its current location. It is a Converse method to bypass the Charm++ message logging framework.

Definition at line 1901 of file ckmessagelogging.C.

References _receiveLocationHandlerIdx, Converse::CkMyPe(), CmiInitMsgHeader(), CmiMyPe(), CmiPrintf(), Converse::CmiSyncSend(), CmiWallTimer(), CurrentLocationMsg::fromPE, CurrentLocationMsg::header, _ckGroupID::idx, CurrentLocationMsg::idx, idx2str(), CurrentLocationMsg::locationPE, CurrentLocationMsg::mgrID, and traceUserBracketEvent().

Here is the call graph for this function:

void _receiveLocationHandler ( CurrentLocationMsg *  data  ) 

void _getGlobalStepHandler ( LBStepMsg *  msg  ) 

Definition at line 1959 of file ckmessagelogging.C.

References _recvGlobalStepHandlerIdx, CmiMyPe(), CmiPrintf(), Converse::CmiSyncSend(), LBStepMsg::fromPE, _ckGroupID::idx, lb, LBStepMsg::lbID, and LBStepMsg::step.

Here is the call graph for this function:

void _recvGlobalStepHandler ( LBStepMsg *  msg  ) 

int getCheckPointPE (  ) 

Getting the pe number of the current processor's buddy.

In the team-based approach each processor might checkpoint in the next team, but currently teams are only meant to reduce memory overhead. Note: function getReverseCheckPointPE performs the reverse map. It must be changed accordingly.

Definition at line 2187 of file ckmessagelogging.C.

References CmiMyPe().

Here is the call graph for this function:

envelope* copyEnvelope ( envelope *  env  ) 

Definition at line 2199 of file ckmessagelogging.C.

References CmiAlloc(), and envelope::getTotalsize().

Here is the call graph for this function:


Variable Documentation

Definition at line 55 of file ckmessagelogging.C.

char* checkpointDirectory = "."

Definition at line 57 of file ckmessagelogging.C.

Definition at line 58 of file ckmessagelogging.C.

Definition at line 59 of file ckmessagelogging.C.

char* faultFile

Definition at line 67 of file ckmessagelogging.C.

Definition at line 69 of file ckmessagelogging.C.

Definition at line 70 of file ckmessagelogging.C.

double killTime = 0.0

Definition at line 72 of file ckmessagelogging.C.

double faultMean

Definition at line 73 of file ckmessagelogging.C.

Definition at line 74 of file ckmessagelogging.C.

char fName[100] [static]

Definition at line 76 of file ckmessagelogging.C.

Definition at line 94 of file ckmessagelogging.C.

Definition at line 95 of file ckmessagelogging.C.

Definition at line 96 of file ckmessagelogging.C.

Definition at line 97 of file ckmessagelogging.C.

Definition at line 100 of file ckmessagelogging.C.

Definition at line 101 of file ckmessagelogging.C.

Definition at line 102 of file ckmessagelogging.C.

Definition at line 106 of file ckmessagelogging.C.

Definition at line 107 of file ckmessagelogging.C.

Definition at line 108 of file ckmessagelogging.C.

Definition at line 109 of file ckmessagelogging.C.

Definition at line 110 of file ckmessagelogging.C.

Definition at line 111 of file ckmessagelogging.C.

Definition at line 112 of file ckmessagelogging.C.

Definition at line 113 of file ckmessagelogging.C.

Definition at line 114 of file ckmessagelogging.C.

Definition at line 115 of file ckmessagelogging.C.

Definition at line 116 of file ckmessagelogging.C.

Definition at line 117 of file ckmessagelogging.C.

Definition at line 118 of file ckmessagelogging.C.

Definition at line 119 of file ckmessagelogging.C.

Definition at line 120 of file ckmessagelogging.C.

Definition at line 124 of file ckmessagelogging.C.

For testing on clusters we might carry out restarts on a processor without actually starting it: 1 -> false restart, 0 -> restart after an actual crash.

Definition at line 132 of file ckmessagelogging.C.

void* centralLb

Definition at line 133 of file ckmessagelogging.C.

void(* resumeLbFnPtr)(void *)

Definition at line 135 of file ckmessagelogging.C.

Definition at line 136 of file ckmessagelogging.C.

Definition at line 137 of file ckmessagelogging.C.

Definition at line 138 of file ckmessagelogging.C.

Referenced by _messageLoggingInit(), and CkStartMlogCheckpoint().

Definition at line 139 of file ckmessagelogging.C.

Referenced by _checkpointAckHandler(), and _messageLoggingInit().

Definition at line 141 of file ckmessagelogging.C.

Definition at line 142 of file ckmessagelogging.C.

Definition at line 143 of file ckmessagelogging.C.

Definition at line 144 of file ckmessagelogging.C.

Definition at line 146 of file ckmessagelogging.C.

Definition at line 147 of file ckmessagelogging.C.

double lastCompletedAlarm = 0

Definition at line 148 of file ckmessagelogging.C.

double lastRestart = 0

Definition at line 149 of file ckmessagelogging.C.

Definition at line 150 of file ckmessagelogging.C.

Definition at line 153 of file ckmessagelogging.C.

Definition at line 156 of file ckmessagelogging.C.

Definition at line 157 of file ckmessagelogging.C.

Definition at line 158 of file ckmessagelogging.C.

double lastPingTime = -1 [static]

Definition at line 159 of file ckmessagelogging.C.

Definition at line 169 of file ckmessagelogging.C.


Generated on Mon Sep 21 07:58:05 2020 for Charm++ by  doxygen 1.5.5