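/* Pooled message-buffer allocator for PPC machine layers (PAMI, Blue
   Gene/Q, POWER8 + NVLink), built on PPC atomic queues.  Buffers come
   in three size classes (small, medium, large); freed buffers are
   recycled through a per-rank queue instead of going back to the
   system allocator.  On CMK_POWER8_NVL builds the buffers are
   allocated as CUDA pinned host memory. */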
#include <converse.h>
#include "PPCAtomicQueue.h" /* PPCAtomicQueue, PPCAtomicState, CMI_PPCQ_EAGAIN */

#ifdef CMK_POWER8_NVL
/* Pinned host memory (cudaMallocHost/cudaFreeHost) lets the GPU DMA
   message buffers directly; locate the CUDA headers via the include
   path rather than a hard-coded /usr/local/cuda prefix. */
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#endif

#define ALIGNMENT 32

/* Pool configuration: buffer size and queue capacity for the small,
   medium, and large size classes. */
#ifdef CMK_POWER8_NVL
#define SMSG_SIZE    4096
#define N_SMSG_ELEM  4096
#define MMSG_SIZE    131072
#define N_MMSG_ELEM  1024
#define LLMSG_SIZE   4194304
#define N_LLMSG_ELEM 128
#else
#define SMSG_SIZE    4096
#define N_SMSG_ELEM  4096
#define MMSG_SIZE    16384
#define N_MMSG_ELEM  2048
#define LLMSG_SIZE   65536
#define N_LLMSG_ELEM 1024
#endif

#if CMK_BLUEGENEQ
#include <spi/include/kernel/location.h>
#endif

/* One recycling queue per rank (plus comm threads) for each size class. */
PPCAtomicQueue *sPPCMemallocVec;
PPCAtomicQueue *mPPCMemallocVec;
PPCAtomicQueue *llPPCMemallocVec;

typedef struct CmiMemAllocHdr_ppcq_t {
  int rank;   /* queue index of the allocating rank */
  int size;   /* size class of this buffer, or the raw size if unpooled */
  /* Pad so the header plus the trailing CmiChunkHeader fills exactly
     ALIGNMENT bytes (checked in CmiMemAllocInit_ppcq). */
  char dummy[ALIGNMENT - sizeof(CmiChunkHeader) - 2*sizeof(int)];
} CmiMemAllocHdr_ppcq;

static int _nodeStart;
extern int Cmi_nodestart;

#if CMK_ENABLE_ASYNC_PROGRESS
extern CMK_THREADLOCAL int32_t _comm_thread_id;
#endif

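/* CmiAlloc_ppcq: return a message buffer of at least `size` bytes.
   The smallest size class that fits is tried first: reuse a buffer
   from the calling rank's recycling queue, or fall back to a fresh
   allocation of the full class size.  Requests above LLMSG_SIZE
   bypass the pools. */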
void *CmiAlloc_ppcq (int size) {
  CmiMemAllocHdr_ppcq *hdr = NULL;
  char *buf;
#if CMK_TRACE_PAMI_ENABLED
  double start = CmiWallTimer();
#endif

#if CMK_BLUEGENEQ
  /* Kernel_ProcessorID() is the node-wide hardware thread id. */
  int myrank = Kernel_ProcessorID() - _nodeStart;
#else
  int myrank = CmiMyRank();
#if CMK_ENABLE_ASYNC_PROGRESS
  /* Comm threads use the queue slots past the worker ranks. */
  if (CmiInCommThread())
    myrank = CmiMyNodeSize() + _comm_thread_id;
#endif
#endif

  if (size <= SMSG_SIZE) {
    hdr = (CmiMemAllocHdr_ppcq *) PPCAtomicDequeue (&sPPCMemallocVec[myrank]);
    if (hdr == NULL) {
#ifdef CMK_POWER8_NVL
      cudaMallocHost ((void **) &hdr, SMSG_SIZE + sizeof(CmiMemAllocHdr_ppcq));
#else
      hdr = (CmiMemAllocHdr_ppcq *)
        malloc_nomigrate(SMSG_SIZE + sizeof(CmiMemAllocHdr_ppcq));
#endif
    }
    hdr->size = SMSG_SIZE;
  }
  else if (size <= MMSG_SIZE) {
    hdr = (CmiMemAllocHdr_ppcq *) PPCAtomicDequeue (&mPPCMemallocVec[myrank]);
    if (hdr == NULL) {
#ifdef CMK_POWER8_NVL
      cudaMallocHost ((void **) &hdr, MMSG_SIZE + sizeof(CmiMemAllocHdr_ppcq));
#else
      hdr = (CmiMemAllocHdr_ppcq *)
        malloc_nomigrate(MMSG_SIZE + sizeof(CmiMemAllocHdr_ppcq));
#endif
    }
    hdr->size = MMSG_SIZE;
  }
  else if (size <= LLMSG_SIZE) {
    hdr = (CmiMemAllocHdr_ppcq *) PPCAtomicDequeue (&llPPCMemallocVec[myrank]);
    if (hdr == NULL) {
#ifdef CMK_POWER8_NVL
      cudaMallocHost ((void **) &hdr, LLMSG_SIZE + sizeof(CmiMemAllocHdr_ppcq));
#else
      hdr = (CmiMemAllocHdr_ppcq *)
        malloc_nomigrate(LLMSG_SIZE + sizeof(CmiMemAllocHdr_ppcq));
#endif
    }
    hdr->size = LLMSG_SIZE;
  }
  else {
    /* Oversized request: allocate exactly, never pooled (CmiFree_ppcq
       releases it back to the system allocator). */
#ifdef CMK_POWER8_NVL
    cudaMallocHost ((void **) &hdr, size + sizeof(CmiMemAllocHdr_ppcq));
#else
    hdr = (CmiMemAllocHdr_ppcq *)
      malloc_nomigrate(size + sizeof(CmiMemAllocHdr_ppcq));
#endif
    hdr->size = size;
  }

  hdr->rank = myrank;
  buf = (char*)hdr + sizeof(CmiMemAllocHdr_ppcq);

#if CMK_TRACE_PAMI_ENABLED
  traceUserBracketEvent(30001, start, CmiWallTimer());
#endif

  return buf;
}

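/* CmiFree_ppcq: push a pooled buffer back onto the recycling queue of
   the rank that allocated it.  Oversized buffers, and pooled buffers
   whose queue is full (CMI_PPCQ_EAGAIN), are released for real. */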
void CmiFree_ppcq (void *buf) {
  CmiMemAllocHdr_ppcq *hdr = (CmiMemAllocHdr_ppcq *)((char*)buf - sizeof(CmiMemAllocHdr_ppcq));
  int rc = CMI_PPCQ_EAGAIN;

#if CMK_TRACE_PAMI_ENABLED
  double start = CmiWallTimer();
#endif

  if (hdr->size == SMSG_SIZE)
    rc = PPCAtomicEnqueue (&sPPCMemallocVec[hdr->rank], hdr);
  else if (hdr->size == MMSG_SIZE)
    rc = PPCAtomicEnqueue (&mPPCMemallocVec[hdr->rank], hdr);
  else if (hdr->size == LLMSG_SIZE)
    rc = PPCAtomicEnqueue (&llPPCMemallocVec[hdr->rank], hdr);

  if (rc == CMI_PPCQ_EAGAIN) {
    /* Oversized buffer, or the recycling queue was full: free it. */
#ifdef CMK_POWER8_NVL
    cudaFreeHost (hdr);
#else
    free_nomigrate(hdr);
#endif
  }

#if CMK_TRACE_PAMI_ENABLED
  traceUserBracketEvent(30002, start, CmiWallTimer());
#endif
}

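/* CmiMemAllocInit_ppcq: carve `atomic_mem` (a node-shared region of at
   least 3 * node_size PPCAtomicState structures) into the
   small/medium/large recycling queues, one triple per rank. */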
void CmiMemAllocInit_ppcq (void   * atomic_mem,
                           size_t   atomic_memsize)
{
  int i = 0;
#if CMK_BLUEGENEQ
  /* A Blue Gene/Q node has 64 hardware threads split across processes. */
  int node_size = 64/Kernel_ProcessCount();
  _nodeStart = node_size * Kernel_MyTcoord();
#else
  /* Twice the node size leaves queue slots for communication threads. */
  int node_size = 2 * CmiMyNodeSize();
  _nodeStart = Cmi_nodestart;
#endif

  CmiAssert(sizeof(CmiMemAllocHdr_ppcq)+sizeof(CmiChunkHeader) == ALIGNMENT);
  CmiAssert (atomic_memsize >= 3 * node_size * sizeof(PPCAtomicState));

  sPPCMemallocVec  = (PPCAtomicQueue *)malloc_nomigrate(sizeof(PPCAtomicQueue)*node_size);
  mPPCMemallocVec  = (PPCAtomicQueue *)malloc_nomigrate(sizeof(PPCAtomicQueue)*node_size);
  llPPCMemallocVec = (PPCAtomicQueue *)malloc_nomigrate(sizeof(PPCAtomicQueue)*node_size);

  for (i = 0; i < node_size; ++i) {
    PPCAtomicQueueInit ((char *)atomic_mem + 3*i*sizeof(PPCAtomicState),
                        sizeof(PPCAtomicState),
                        &sPPCMemallocVec[i],
                        0, /* no overflow queue */
                        N_SMSG_ELEM);

    PPCAtomicQueueInit ((char *)atomic_mem + (3*i+1)*sizeof(PPCAtomicState),
                        sizeof(PPCAtomicState),
                        &mPPCMemallocVec[i],
                        0, /* no overflow queue */
                        N_MMSG_ELEM);

    PPCAtomicQueueInit ((char *)atomic_mem + (3*i+2)*sizeof(PPCAtomicState),
                        sizeof(PPCAtomicState),
                        &llPPCMemallocVec[i],
                        0, /* no overflow queue */
                        N_LLMSG_ELEM);
  }
}
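
/* Usage sketch (illustrative only; `qmem`, `qmem_size`, and `node_size`
   are assumptions about the machine-layer caller, not part of this
   file).  The machine layer reserves a node-shared region for the
   queue state, initializes the pools once, and then allocates and
   frees message buffers from any rank:

     size_t qmem_size = 3 * node_size * sizeof(PPCAtomicState);
     CmiMemAllocInit_ppcq (qmem, qmem_size);

     void *msg = CmiAlloc_ppcq (1024);   // served from the small pool
     ...
     CmiFree_ppcq (msg);                 // recycled onto this rank's queue
*/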