00001
00002
00003
// Threshold compared against each chip's average temperature in work(): a chip
// whose average exceeds this value is stepped to a lower frequency, otherwise
// it is stepped back up. Presumably degrees Celsius -- TODO confirm the unit of
// the pe_temp values this is compared with.
00004 #define MAX_TEMP 49
00005
00016
00017 #include "TempAwareRefineLB.h"
00018 #include "ckgraph.h"
00019 #include <algorithm>
00020
00021 extern int quietModeRequested;
00022
00023 CreateLBFunc_Def(TempAwareRefineLB, "always assign the heaviest obj onto lightest loaded processor.")
00024
00025 #ifdef TEMP_LDB
00026
00027
00028 static int cpufreq_sysfs_write (
00029 const char *setting,int proc
00030 )
00031 {
00032 char path[100];
00033 sprintf(path,"/sys/devices/system/cpu/cpu%d/cpufreq/scaling_setspeed",proc);
00034 FILE *fd = fopen (path, "w");
00035
00036 if (!fd) {
00037 printf("PROC#%d ooooooo666 FILE OPEN ERROR file=%s\n",CkMyPe(),path);
00038 return -1;
00039 }
00040
00041
00042 fseek ( fd , 0 , SEEK_SET );
00043 int numw=fprintf (fd, setting);
00044 if (numw <= 0) {
00045
00046 fclose (fd);
00047 printf("FILE WRITING ERROR\n");
00048 return 0;
00049 }
00050
00051 fclose(fd);
00052 return 1;
00053 }
00054
00055 float TempAwareRefineLB::getTemp(int cpu)
00056 {
00057 char val[10];
00058 FILE *f;
00059 char path[100];
00060 sprintf(path,"/sys/devices/platform/coretemp.%d/temp1_input",cpu);
00061 f=fopen(path,"r");
00062 if (!f) {
00063 printf("777 FILE OPEN ERROR file=%s\n",path);
00064 exit(0);
00065 }
00066
00067 if(f==NULL) {printf("ddddddddddddddddddddddddddd\n");exit(0);}
00068 fgets(val,10,f);
00069 fclose(f);
00070 return atof(val)/1000;
00071 }
00072
/**
 * Read the integer stored in a CPU's cpufreq `scaling_setspeed` sysfs file.
 *
 * @param proc  CPU index substituted into the sysfs path.
 * @return The parsed integer, or 0 if the file cannot be opened, cannot be
 *         read, or does not start with a number.
 */
static int cpufreq_sysfs_read (int proc)
{
  char path[100];
  snprintf(path, sizeof(path),
           "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_setspeed", proc);

  FILE *fd = fopen(path, "r");
  if (!fd) {
    printf("22 FILE OPEN ERROR file=%s\n",path);
    return 0;
  }

  // Start from an empty string so a failed read parses as 0 instead of
  // handing atoi() an uninitialised buffer.
  char val[32] = "";
  if (!fgets(val, sizeof(val), fd)) {
    printf("22 FILE READ ERROR file=%s\n", path);
  }
  fclose(fd);

  return atoi(val);
}
00093
00094 void printCurrentTemperature(void *LB, double curWallTime)
00095 {
00096 TempAwareRefineLB *taalb = static_cast<TempAwareRefineLB *>(LB);
00097 int pe = CkMyPe();
00098 float temp = taalb->getTemp(pe % taalb->physicalCoresPerNode);
00099 int freq = cpufreq_sysfs_read (pe % taalb->logicalCoresPerNode);
00100 fprintf(taalb->logFD, "%f, %d, %f, %d\n", curWallTime, pe, temp, freq);
00101 }
00102
/**
 * Find the position of a frequency value in a frequency table.
 *
 * @param freqs     Table to search.
 * @param numAvail  Number of entries in `freqs`.
 * @param freq      Value to look for.
 * @return Index of the first entry equal to `freq`, or -1 if there is none
 *         (previously control fell off the end of the function, which is
 *         undefined behaviour).
 */
int getProcFreqPtr(int *freqs,int numAvail,int freq)
{
  for (int i = 0; i < numAvail; i++) {
    if (freqs[i] == freq) return i;
  }
  return -1;
}
00107 #endif
// Stream for the "migInfo" file; when TEMP_LDB is defined it is opened in the
// constructor and work() writes one "<elapsed time> <migration count>" line
// to it per invocation.
00108 FILE *migFile;
// CmiWallTimer() reading taken in the constructor (TEMP_LDB builds); later
// readings are reported relative to it.
00109 double starting;
00110 TempAwareRefineLB::TempAwareRefineLB(const CkLBOptions &opt): CBase_TempAwareRefineLB(opt)
00111 {
00112 #ifdef TEMP_LDB
00113 starting=CmiWallTimer();
00114
00115 migFile=fopen("migInfo","w");
00116 numAvailFreqs = 11;
00117
00118
00119 freqs=new int[numAvailFreqs];
00120 freqsEffect=new int[numAvailFreqs];
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133 freqs[0] = 2395000;
00134 freqs[1] = 2394000;
00135 freqs[2] = 2261000;
00136 freqs[3] = 2128000;
00137 freqs[4] = 1995000;
00138 freqs[5] = 1862000;
00139 freqs[6] = 1729000;
00140 freqs[7] = 1596000;
00141 freqs[8] = 1463000;
00142 freqs[9] = 1330000;
00143 freqs[10] = 1197000;
00144
00145 freqsEffect[0] = 1979886;
00146 freqsEffect[1] = 1943017;
00147 freqsEffect[2] = 1910989;
00148 freqsEffect[3] = 1876619;
00149 freqsEffect[4] = 1824126;
00150 freqsEffect[5] = 1763990;
00151 freqsEffect[6] = 1666773;
00152 freqsEffect[7] = 1560224;
00153 freqsEffect[8] = 1443154;
00154 freqsEffect[9] = 1317009;
00155 freqsEffect[10] = 1200000;
00156
00157
00158
00159
00160
00161
00162
00163
00164
00165
00166
00167
00168
00169
00170
00171
00172
00173
00174
00175
00176 procFreqPtr = new int[CkNumPes()];
00177
00178 for(int i=0;i<CkNumPes();i++)
00179 {
00180 char newfreq[10];
00181 sprintf(newfreq,"%d",freqs[0]);
00182 cpufreq_sysfs_write(newfreq,i%physicalCoresPerNode);
00183 procFreqPtr[i]=0;
00184 }
00185
00186 procFreq=NULL;
00187 procTemp=NULL;
00188 procFreqNew=NULL;
00189 procFreqNewEffect = NULL;
00190 avgChipTemp=NULL;
00191 lbname = "TempAwareRefineLB";
00192 if (CkMyPe()==0 && !quietModeRequested)
00193 CkPrintf("CharmLB> TempAwareRefineLB created.\n");
00194
00195 char logFile[100];
00196 snprintf(logFile, sizeof(logFile), "temp_freq.log.%d", CkMyPe());
00197 if ((logFD = fopen(logFile, "a"))) {
00198 fprintf(logFD, "Time, PE, Temperature, Frequency\n");
00199 } else {
00200 CkAbort("Couldn't open temperature/frequency log file");
00201 }
00202
00203
00204 CcdCallOnConditionKeep(CcdPERIODIC_1second, &printCurrentTemperature, this);
00205 #else
00206 CmiAbort("TEMPLB ERROR: not supported without TEMP_LDB flag.\n");
00207 #endif
00208
00209 }
00210
00211 void TempAwareRefineLB::populateEffectiveFreq(int numProcs)
00212 {
00213 #ifdef TEMP_LDB
00214 for(int i=0;i<numProcs;i++)
00215 {
00216 for(int j=0;j<numAvailFreqs;j++)
00217 {
00218 if(freqs[j] == procFreqNew[i])
00219 {
00220 procFreqNewEffect[i] = freqsEffect[j];
00221
00222 }
00223 if(freqs[j] == procFreq[i])
00224 {
00225 procFreqEffect[i] = freqsEffect[j];
00226
00227 }
00228 }
00229 }
00230 #endif
00231 }
00232
00233 bool TempAwareRefineLB::QueryBalanceNow(int _step)
00234 {
00235
00236 return true;
00237 }
00238
00239 void TempAwareRefineLB::changeFreq(int nFreq)
00240 {
00241 #ifdef TEMP_LDB
00242
00243
00244 {
00245
00246 {
00247 char newfreq[10];
00248 sprintf(newfreq,"%d",nFreq);
00249 cpufreq_sysfs_write(newfreq,CkMyPe()%physicalCoresPerNode);
00250
00251 }
00252 }
00253 #endif
00254 }
00255
00256 #ifdef TEMP_LDB
00257 int getTaskIdForMigration(ObjGraph *ogr,int pe,int start)
00258 {
00259 for(int vert = start; vert < ogr->vertices.size(); vert++)
00260 {
00261 if(ogr->vertices[vert].getCurrentPe()==pe && ogr->vertices[vert].getNewPe()==-1) return vert;
00262 }
00263 CkPrintf("THERE IS A PROBLEM IN TEMPREFINELB 222 start=%d pe=%d objArraySize=%d!!!!!\n",start,pe,ogr->vertices.size());
00264 CkExit();
00265 }
00266
00267 int getNumTasks(ObjGraph *ogr,int pe)
00268 {
00269 int c=0;
00270 for(int vert = 0; vert < ogr->vertices.size(); vert++)
00271 {
00272 if(ogr->vertices[vert].getCurrentPe()==pe && ogr->vertices[vert].getNewPe()==-1) c++;
00273 }
00274 return c;
00275 }
00276
00277 int getTaskIdForMigration(ObjGraph *ogr,int pe,std::vector<int> assTasks)
00278 {
00279 for(int vert = 0; vert < ogr->vertices.size(); vert++)
00280 {
00281 if(ogr->vertices[vert].getCurrentPe()==pe && ogr->vertices[vert].getNewPe()==-1)
00282 {
00283
00284
00285
00286
00287
00288
00289
00290
00291
00292
00293
00294
00295
00296
00297 return vert;
00298 }
00299 }
00300 return -1;
00301
00302
00303
00304 }
00305
00306 bool saneFreqNormLds(double *loads, int numProcs)
00307 {
00308 double tot=0.0;
00309 for(int i=0;i<numProcs;i++)
00310 {
00311 tot+=loads[i];
00312 }
00313 double r=numProcs-tot;
00314 if(r>0.01 || r<-0.01)
00315 {
00316 CkPrintf("THere is a problem with LOADs!!! r=%f procs=%d loadSum=%f\n",r,numProcs,tot);
00317 return false;
00318 }
00319 else return true;
00320 }
00321 #endif
00322 void TempAwareRefineLB::work(LDStats* stats)
00323 {
00324 #ifdef TEMP_LDB
00326 numProcs=stats->nprocs();
00327 numChips=numProcs/logicalCoresPerChip;
00328 avgChipTemp=new float[numChips];
00329 if(procFreq!=NULL) delete [] procFreq;
00330 if(procFreqEffect!=NULL) delete [] procFreqEffect;
00331
00332 if(procTemp!=NULL) delete [] procTemp;
00333 if(procFreqNew!=NULL) delete [] procFreqNew;
00334 if(procFreqNewEffect!=NULL) delete [] procFreqNewEffect;
00335 if(avgChipTemp!=NULL) delete [] avgChipTemp;
00336
00337 procFreq = new int[numProcs];
00338 procFreqEffect = new int[numProcs];
00339
00340 procTemp = new float[numProcs];
00341 procFreqNew = new int[numProcs];
00342 procFreqNewEffect = new int[numProcs];
00343 avgChipTemp = new float[numChips];
00344
00345 for(int i=0;i<numChips;i++) avgChipTemp[i]=0;
00346
00347 for(int i=0;i<numProcs;i++)
00348 {
00349 procFreq[i] = stats->procs[i].pe_speed;
00350 procTemp[i] = stats->procs[i].pe_temp;
00351
00352 avgChipTemp[i/logicalCoresPerChip] += procTemp[i];
00353 }
00354
00355 for(int i=0;i<numChips;i++)
00356 {
00357 avgChipTemp[i]/=logicalCoresPerChip;
00358
00359 }
00360 for(int i=0;i<numChips;i++)
00361 {
00362 int over=0,under=0;
00363 if(avgChipTemp[i] > MAX_TEMP)
00364 {
00365 over=1;
00366 if(procFreqPtr[i*logicalCoresPerChip]==numAvailFreqs-1)
00367 {
00368 for(int j=i*logicalCoresPerChip;j<i*logicalCoresPerChip+logicalCoresPerChip;j++) procFreqNew[j] = freqs[procFreqPtr[j]];
00369 CkPrintf("CHIP#%d RUNNING HOT EVEN WITH MIN FREQUENCY!!\n",i);
00370 }
00371 else
00372 {
00373 for(int j=i*logicalCoresPerChip;j<i*logicalCoresPerChip+logicalCoresPerChip;j++)
00374 {
00375 if(procFreqPtr[j]<numAvailFreqs-1) procFreqPtr[j]++;
00376 #ifdef MAX_MIN
00378 if(i==0) {procFreqPtr[j] = numAvailFreqs-1;}
00379
00380 else procFreqPtr[j]=0;
00382 #endif
00383 procFreqNew[j] = freqs[procFreqPtr[j]];
00384 }
00385 #ifndef ORG_VERSION
00386 CkPrintf("!!!!! Chip#%d running HOT shifting from %d to %d temp=%f\n",i,procFreq[i*logicalCoresPerChip],procFreqNew[i*logicalCoresPerChip],avgChipTemp[i]);
00387 #endif
00388 }
00389 }
00390 else
00391
00392 {
00393 under=1;
00394 if(procFreqPtr[i*logicalCoresPerChip]>0)
00395 {
00396 for(int j=i*logicalCoresPerChip;j<i*logicalCoresPerChip+logicalCoresPerChip;j++)
00397 {
00398 if(procFreqPtr[j]>0)
00399 procFreqPtr[j]--;
00400 #ifdef MAX_MIN
00402 if(i==0) procFreqPtr[j] = numAvailFreqs-1;
00403
00404 else procFreqPtr[j]=0;
00406 #endif
00407 procFreqNew[j] = freqs[procFreqPtr[j]];
00408 }
00409 #ifndef ORG_VERSION
00410 CkPrintf("!!!!! Chip#%d running COLD shifting from %d to %d temp=%f\n",i,procFreq[i*logicalCoresPerChip],procFreqNew[i*logicalCoresPerChip],avgChipTemp[i]);
00411 #endif
00412 }
00413 else
00414 {
00415 for(int j=i*logicalCoresPerChip;j<i*logicalCoresPerChip+logicalCoresPerChip;j++) procFreqNew[j] = freqs[procFreqPtr[j]];
00416 }
00417 }
00418
00419
00420
00421
00422
00423
00424
00425
00426 #ifdef ORG_VERSION
00427 for(int j=i*logicalCoresPerChip;j<i*logicalCoresPerChip+logicalCoresPerChip;j++) procFreqNew[j] = freqs[0];
00428 #endif
00429
00430 }
00431
00432
00434
00435 #ifndef NO_TEMP_LB
00436 int obj;
00437 int n_pes = stats->nprocs();
00438
00439
00440
00441
00442
00443
00444 int* from_procs = RefinerTemp::AllocProcs(n_pes, stats);
00445 for(obj=0;obj<stats->n_objs;obj++) {
00446 int pe = stats->from_proc[obj];
00447 from_procs[obj] = pe;
00448 }
00449
00450 populateEffectiveFreq(numProcs);
00451 int* to_procs = RefinerTemp::AllocProcs(n_pes, stats);
00452
00453 RefinerTemp refiner(1.03,procFreq,procFreqNew,n_pes);
00454 refiner.Refine(n_pes, stats, from_procs, to_procs);
00455
00456 int migs=0;
00457 int *numMigs = new int[numProcs];
00458 int totE = 0;
00459 for(int mm=0;mm<numProcs;mm++) numMigs[mm] = 0;
00460 for(obj=0;obj<stats->n_objs;obj++) {
00461 int pe = stats->from_proc[obj];
00462 numMigs[to_procs[obj]]++;
00463
00464 LDObjData &odata = stats->objData[obj];
00465 computeInfo *c1 = new computeInfo();
00466 c1->id = odata.objID();
00467
00468 if (to_procs[obj] != pe) {
00469 migs++;
00470
00471 {
00472
00473
00474 }
00475 stats->to_proc[obj] = to_procs[obj];
00476 }
00477 }
00478
00479 for(int mm=0;mm<numProcs;mm++)
00480 {
00481
00482 }
00483 CkPrintf("TEMPLB INFO: Total Objs:%d migrations:%d time:%f \n",stats->n_objs,migs,CmiWallTimer()-starting);
00484 fprintf(migFile,"%f %d\n",CmiWallTimer()-starting,migs);
00485
00486 RefinerTemp::FreeProcs(from_procs);
00487 RefinerTemp::FreeProcs(to_procs);
00488
00489 #endif
00490
00491
00492
00493
00494
00495
00496
00497
00498
00499
00500
00501 for(int x=0;x<numProcs;x++)
00502 {
00503
00504 if(procFreq[x]!=procFreqNew[x]) thisProxy[x].changeFreq(procFreqNew[x]);
00505 }
00506 #endif // TEMP_LDB endif
00507 }
00508 #include "TempAwareRefineLB.def.h"
00509