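/* Pooled message-buffer allocator for PPC machine layers (PAMI, Blue
   Gene/Q, POWER8 + NVLink), built on PPC atomic queues.  Buffers come
   in three size classes (small, medium, large); freed buffers are
   recycled through a per-rank queue instead of going back to the
   system allocator.  On CMK_POWER8_NVL builds the buffers are
   allocated as CUDA pinned host memory. */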
#include <converse.h>
#include "PPCAtomicQueue.h" /* PPCAtomicQueue, PPCAtomicState, CMI_PPCQ_EAGAIN */

#ifdef CMK_POWER8_NVL
/* Pinned host memory (cudaMallocHost/cudaFreeHost) lets the GPU DMA
   message buffers directly; locate the CUDA headers via the include
   path rather than a hard-coded /usr/local/cuda prefix. */
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#endif

#define ALIGNMENT 32

/* Pool configuration: buffer size and queue capacity for the small,
   medium, and large size classes. */
#ifdef CMK_POWER8_NVL
#define SMSG_SIZE    4096
#define N_SMSG_ELEM  4096
#define MMSG_SIZE    131072
#define N_MMSG_ELEM  1024
#define LLMSG_SIZE   4194304
#define N_LLMSG_ELEM 128
#else
#define SMSG_SIZE    4096
#define N_SMSG_ELEM  4096
#define MMSG_SIZE    16384
#define N_MMSG_ELEM  2048
#define LLMSG_SIZE   65536
#define N_LLMSG_ELEM 1024
#endif

#if CMK_BLUEGENEQ
#include <spi/include/kernel/location.h>
#endif

/* One recycling queue per rank (plus comm threads) for each size class. */
PPCAtomicQueue *sPPCMemallocVec;
PPCAtomicQueue *mPPCMemallocVec;
PPCAtomicQueue *llPPCMemallocVec;

typedef struct CmiMemAllocHdr_ppcq_t {
  int rank;   /* queue index of the allocating rank */
  int size;   /* size class of this buffer, or the raw size if unpooled */
  /* Pad so the header plus the trailing CmiChunkHeader fills exactly
     ALIGNMENT bytes (checked in CmiMemAllocInit_ppcq). */
  char dummy[ALIGNMENT - sizeof(CmiChunkHeader) - 2*sizeof(int)];
} CmiMemAllocHdr_ppcq;

static int _nodeStart;
extern int Cmi_nodestart;

#if CMK_ENABLE_ASYNC_PROGRESS
extern CMK_THREADLOCAL int32_t _comm_thread_id;
#endif

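/* CmiAlloc_ppcq: return a message buffer of at least `size` bytes.
   The smallest size class that fits is tried first: reuse a buffer
   from the calling rank's recycling queue, or fall back to a fresh
   allocation of the full class size.  Requests above LLMSG_SIZE
   bypass the pools. */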
void *CmiAlloc_ppcq (int size) {
  CmiMemAllocHdr_ppcq *hdr = NULL;
  char *buf;
#if CMK_TRACE_PAMI_ENABLED
  double start = CmiWallTimer();
#endif

#if CMK_BLUEGENEQ
  /* Kernel_ProcessorID() is the node-wide hardware thread id. */
  int myrank = Kernel_ProcessorID() - _nodeStart;
#else
  int myrank = CmiMyRank();
#if CMK_ENABLE_ASYNC_PROGRESS
  /* Comm threads use the queue slots past the worker ranks. */
  if (CmiInCommThread())
    myrank = CmiMyNodeSize() + _comm_thread_id;
#endif
#endif

  if (size <= SMSG_SIZE) {
    hdr = (CmiMemAllocHdr_ppcq *) PPCAtomicDequeue (&sPPCMemallocVec[myrank]);
    if (hdr == NULL) {
#ifdef CMK_POWER8_NVL
      cudaMallocHost ((void **) &hdr, SMSG_SIZE + sizeof(CmiMemAllocHdr_ppcq));
#else
      hdr = (CmiMemAllocHdr_ppcq *)
        malloc_nomigrate(SMSG_SIZE + sizeof(CmiMemAllocHdr_ppcq));
#endif
    }
    hdr->size = SMSG_SIZE;
  }
  else if (size <= MMSG_SIZE) {
    hdr = (CmiMemAllocHdr_ppcq *) PPCAtomicDequeue (&mPPCMemallocVec[myrank]);
    if (hdr == NULL) {
#ifdef CMK_POWER8_NVL
      cudaMallocHost ((void **) &hdr, MMSG_SIZE + sizeof(CmiMemAllocHdr_ppcq));
#else
      hdr = (CmiMemAllocHdr_ppcq *)
        malloc_nomigrate(MMSG_SIZE + sizeof(CmiMemAllocHdr_ppcq));
#endif
    }
    hdr->size = MMSG_SIZE;
  }
  else if (size <= LLMSG_SIZE) {
    hdr = (CmiMemAllocHdr_ppcq *) PPCAtomicDequeue (&llPPCMemallocVec[myrank]);
    if (hdr == NULL) {
#ifdef CMK_POWER8_NVL
      cudaMallocHost ((void **) &hdr, LLMSG_SIZE + sizeof(CmiMemAllocHdr_ppcq));
#else
      hdr = (CmiMemAllocHdr_ppcq *)
        malloc_nomigrate(LLMSG_SIZE + sizeof(CmiMemAllocHdr_ppcq));
#endif
    }
    hdr->size = LLMSG_SIZE;
  }
  else {
    /* Oversized request: allocate exactly, never pooled (CmiFree_ppcq
       releases it back to the system allocator). */
#ifdef CMK_POWER8_NVL
    cudaMallocHost ((void **) &hdr, size + sizeof(CmiMemAllocHdr_ppcq));
#else
    hdr = (CmiMemAllocHdr_ppcq *)
      malloc_nomigrate(size + sizeof(CmiMemAllocHdr_ppcq));
#endif
    hdr->size = size;
  }

  hdr->rank = myrank;
  buf = (char*)hdr + sizeof(CmiMemAllocHdr_ppcq);

#if CMK_TRACE_PAMI_ENABLED
  traceUserBracketEvent(30001, start, CmiWallTimer());
#endif

  return buf;
}

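/* CmiFree_ppcq: push a pooled buffer back onto the recycling queue of
   the rank that allocated it.  Oversized buffers, and pooled buffers
   whose queue is full (CMI_PPCQ_EAGAIN), are released for real. */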
void CmiFree_ppcq (void *buf) {
  CmiMemAllocHdr_ppcq *hdr = (CmiMemAllocHdr_ppcq *)((char*)buf - sizeof(CmiMemAllocHdr_ppcq));
  int rc = CMI_PPCQ_EAGAIN;

#if CMK_TRACE_PAMI_ENABLED
  double start = CmiWallTimer();
#endif

  if (hdr->size == SMSG_SIZE)
    rc = PPCAtomicEnqueue (&sPPCMemallocVec[hdr->rank], hdr);
  else if (hdr->size == MMSG_SIZE)
    rc = PPCAtomicEnqueue (&mPPCMemallocVec[hdr->rank], hdr);
  else if (hdr->size == LLMSG_SIZE)
    rc = PPCAtomicEnqueue (&llPPCMemallocVec[hdr->rank], hdr);

  if (rc == CMI_PPCQ_EAGAIN) {
    /* Oversized buffer, or the recycling queue was full: free it. */
#ifdef CMK_POWER8_NVL
    cudaFreeHost (hdr);
#else
    free_nomigrate(hdr);
#endif
  }

#if CMK_TRACE_PAMI_ENABLED
  traceUserBracketEvent(30002, start, CmiWallTimer());
#endif
}

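/* CmiMemAllocInit_ppcq: carve `atomic_mem` (a node-shared region of at
   least 3 * node_size PPCAtomicState structures) into the
   small/medium/large recycling queues, one triple per rank. */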
void CmiMemAllocInit_ppcq (void   * atomic_mem,
                           size_t   atomic_memsize)
{
  int i = 0;
#if CMK_BLUEGENEQ
  /* A Blue Gene/Q node has 64 hardware threads split across processes. */
  int node_size = 64/Kernel_ProcessCount();
  _nodeStart = node_size * Kernel_MyTcoord();
#else
  /* Twice the node size leaves queue slots for communication threads. */
  int node_size = 2 * CmiMyNodeSize();
  _nodeStart = Cmi_nodestart;
#endif

  CmiAssert(sizeof(CmiMemAllocHdr_ppcq)+sizeof(CmiChunkHeader) == ALIGNMENT);
  CmiAssert (atomic_memsize >= 3 * node_size * sizeof(PPCAtomicState));

  sPPCMemallocVec  = (PPCAtomicQueue *)malloc_nomigrate(sizeof(PPCAtomicQueue)*node_size);
  mPPCMemallocVec  = (PPCAtomicQueue *)malloc_nomigrate(sizeof(PPCAtomicQueue)*node_size);
  llPPCMemallocVec = (PPCAtomicQueue *)malloc_nomigrate(sizeof(PPCAtomicQueue)*node_size);

  for (i = 0; i < node_size; ++i) {
    PPCAtomicQueueInit ((char *)atomic_mem + 3*i*sizeof(PPCAtomicState),
                        sizeof(PPCAtomicState),
                        &sPPCMemallocVec[i],
                        0, /* no overflow queue */
                        N_SMSG_ELEM);

    PPCAtomicQueueInit ((char *)atomic_mem + (3*i+1)*sizeof(PPCAtomicState),
                        sizeof(PPCAtomicState),
                        &mPPCMemallocVec[i],
                        0, /* no overflow queue */
                        N_MMSG_ELEM);

    PPCAtomicQueueInit ((char *)atomic_mem + (3*i+2)*sizeof(PPCAtomicState),
                        sizeof(PPCAtomicState),
                        &llPPCMemallocVec[i],
                        0, /* no overflow queue */
                        N_LLMSG_ELEM);
  }
}
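
/* Usage sketch (illustrative only; `qmem`, `qmem_size`, and `node_size`
   are assumptions about the machine-layer caller, not part of this
   file).  The machine layer reserves a node-shared region for the
   queue state, initializes the pools once, and then allocates and
   frees message buffers from any rank:

     size_t qmem_size = 3 * node_size * sizeof(PPCAtomicState);
     CmiMemAllocInit_ppcq (qmem, qmem_size);

     void *msg = CmiAlloc_ppcq (1024);   // served from the small pool
     ...
     CmiFree_ppcq (msg);                 // recycled onto this rank's queue
*/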