00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047 #include "GridCommRefineLB.decl.h"
00048
00049 #include "GridCommRefineLB.h"
00050 #include "manager.h"
00051
// Flag defined outside this file; the options constructor below skips its
// "created" banner when this is non-zero.
00052 extern int quietModeRequested;
00053
// CreateLBFunc_Def is a macro defined outside this file.
// NOTE(review): presumably it generates the creation/registration hook for this
// balancer and uses the string as its human-readable description -- confirm
// against the Charm++ load-balancing headers.
00054 CreateLBFunc_Def (GridCommRefineLB, "Grid communication load balancer (refines object mapping within each cluster)")
00055
00056
00057
00058
00059
00060
00061 GridCommRefineLB::GridCommRefineLB (const CkLBOptions &opt) : CBase_GridCommRefineLB (opt)
00062 {
00063 char *value;
00064
00065
00066 lbname = (char *) "GridCommRefineLB";
00067
00068 if (CkMyPe() == 0 && !quietModeRequested) {
00069 CkPrintf ("CharmLB> GridCommRefineLB created.\n");
00070 }
00071
00072 if ((value = getenv ("CK_LDB_GRIDCOMMREFINELB_TOLERANCE"))) {
00073 CK_LDB_GridCommRefineLB_Tolerance = atof (value);
00074 } else {
00075 CK_LDB_GridCommRefineLB_Tolerance = CK_LDB_GRIDCOMMREFINELB_TOLERANCE;
00076 }
00077
00078 manager_init ();
00079 }
00080
00081
00082
00083
00084
00085
00086 GridCommRefineLB::GridCommRefineLB (CkMigrateMessage *msg) : CBase_GridCommRefineLB (msg)
00087 {
00088 char *value;
00089
00090
00091 lbname = (char *) "GridCommRefineLB";
00092
00093 if ((value = getenv ("CK_LDB_GRIDCOMMREFINELB_TOLERANCE"))) {
00094 CK_LDB_GridCommRefineLB_Tolerance = atof (value);
00095 } else {
00096 CK_LDB_GridCommRefineLB_Tolerance = CK_LDB_GRIDCOMMREFINELB_TOLERANCE;
00097 }
00098
00099 manager_init ();
00100 }
00101
00102
00103
00104
00105
00106
00107
00108 bool GridCommRefineLB::QueryBalanceNow (int step)
00109 {
00110 if (_lb_args.debug() > 2) {
00111 CkPrintf ("[%d] GridCommRefineLB is balancing on step %d.\n", CkMyPe(), step);
00112 }
00113
00114 return (true);
00115 }
00116
00117
00118
00119
00120
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130 int GridCommRefineLB::Get_Cluster (int pe)
00131 {
00132 return (0);
00133 }
00134
00135
00136
00137
00138
00139
00140
00141
00142
00143
00144
00145
00146 void GridCommRefineLB::Initialize_PE_Data (CentralLB::LDStats *stats)
00147 {
00148 int min_speed;
00149 int i;
00150
00151
00152 PE_Data = new PE_Data_T[Num_PEs];
00153
00154 min_speed = MAXINT;
00155 for (i = 0; i < Num_PEs; i++) {
00156 (&PE_Data[i])->available = stats->procs[i].available;
00157 (&PE_Data[i])->cluster = Get_Cluster (i);
00158 (&PE_Data[i])->num_objs = 0;
00159 (&PE_Data[i])->num_lan_objs = 0;
00160 (&PE_Data[i])->num_lan_msgs = 0;
00161 (&PE_Data[i])->num_wan_objs = 0;
00162 (&PE_Data[i])->num_wan_msgs = 0;
00163 (&PE_Data[i])->relative_speed = 0.0;
00164 (&PE_Data[i])->scaled_load = 0.0;
00165
00166 if (stats->procs[i].pe_speed < min_speed) {
00167 min_speed = stats->procs[i].pe_speed;
00168 }
00169 }
00170
00171
00172
00173 for (i = 0; i < Num_PEs; i++) {
00174 (&PE_Data[i])->relative_speed = (double) (stats->procs[i].pe_speed / min_speed);
00175 (&PE_Data[i])->scaled_load += stats->procs[i].bg_walltime;
00176 }
00177 }
00178
00179
00180
00181
00182
00183
00184 int GridCommRefineLB::Available_PE_Count ()
00185 {
00186 int available_pe_count;
00187 int i;
00188
00189
00190 available_pe_count = 0;
00191 for (i = 0; i < Num_PEs; i++) {
00192 if ((&PE_Data[i])->available) {
00193 available_pe_count += 1;
00194 }
00195 }
00196 return (available_pe_count);
00197 }
00198
00199
00200
00201
00202
00203
00204 int GridCommRefineLB::Compute_Number_Of_Clusters ()
00205 {
00206 int max_cluster;
00207 int i;
00208
00209
00210 max_cluster = 0;
00211 for (i = 0; i < Num_PEs; i++) {
00212 if ((&PE_Data[i])->cluster < 0) {
00213 return (-1);
00214 }
00215
00216 if ((&PE_Data[i])->cluster > max_cluster) {
00217 max_cluster = (&PE_Data[i])->cluster;
00218 }
00219 }
00220 return (max_cluster + 1);
00221 }
00222
00223
00224
00225
00226
00227
00228 void GridCommRefineLB::Initialize_Object_Data (CentralLB::LDStats *stats)
00229 {
00230 int i;
00231
00232
00233 Object_Data = new Object_Data_T[Num_Objects];
00234
00235 for (i = 0; i < Num_Objects; i++) {
00236 (&Object_Data[i])->migratable = (&stats->objData[i])->migratable;
00237 (&Object_Data[i])->cluster = Get_Cluster (stats->from_proc[i]);
00238 (&Object_Data[i])->from_pe = stats->from_proc[i];
00239 (&Object_Data[i])->to_pe = stats->from_proc[i];
00240 (&Object_Data[i])->num_lan_msgs = 0;
00241 (&Object_Data[i])->num_wan_msgs = 0;
00242 (&Object_Data[i])->load = (&stats->objData[i])->wallTime;
00243
00244
00245
00246 }
00247 }
00248
00249
00250
00251
00252
00253
00254 void GridCommRefineLB::Examine_InterObject_Messages (CentralLB::LDStats *stats)
00255 {
00256 int i;
00257 int j;
00258 LDCommData *com_data;
00259 int send_object;
00260 int send_pe;
00261 int send_cluster;
00262 int recv_object;
00263 int recv_pe;
00264 int recv_cluster;
00265 const LDObjKey *recv_objects;
00266 int num_objects;
00267
00268
00269 for (i = 0; i < stats->n_comm; i++) {
00270 com_data = &(stats->commData[i]);
00271 if ((!com_data->from_proc()) && (com_data->recv_type() == LD_OBJ_MSG)) {
00272 send_object = stats->getHash (com_data->sender);
00273 recv_object = stats->getHash (com_data->receiver.get_destObj());
00274
00275 if ((send_object < 0) || (send_object > Num_Objects) || (recv_object < 0) || (recv_object > Num_Objects)) {
00276 continue;
00277 }
00278
00279 send_pe = (&Object_Data[send_object])->from_pe;
00280 recv_pe = (&Object_Data[recv_object])->from_pe;
00281
00282 send_cluster = Get_Cluster (send_pe);
00283 recv_cluster = Get_Cluster (recv_pe);
00284
00285 if (send_cluster == recv_cluster) {
00286 (&Object_Data[send_object])->num_lan_msgs += com_data->messages;
00287 } else {
00288 (&Object_Data[send_object])->num_wan_msgs += com_data->messages;
00289 }
00290 } else if (com_data->receiver.get_type() == LD_OBJLIST_MSG) {
00291 send_object = stats->getHash (com_data->sender);
00292
00293 if ((send_object < 0) || (send_object > Num_Objects)) {
00294 continue;
00295 }
00296
00297 send_pe = (&Object_Data[send_object])->from_pe;
00298 send_cluster = Get_Cluster (send_pe);
00299
00300 recv_objects = com_data->receiver.get_destObjs (num_objects);
00301
00302 for (j = 0; j < num_objects; j++) {
00303 recv_object = stats->getHash (recv_objects[j]);
00304
00305 if ((recv_object < 0) || (recv_object > Num_Objects)) {
00306 continue;
00307 }
00308
00309 recv_pe = (&Object_Data[recv_object])->from_pe;
00310 recv_cluster = Get_Cluster (recv_pe);
00311
00312 if (send_cluster == recv_cluster) {
00313 (&Object_Data[send_object])->num_lan_msgs += com_data->messages;
00314 } else {
00315 (&Object_Data[send_object])->num_wan_msgs += com_data->messages;
00316 }
00317 }
00318 }
00319 }
00320 }
00321
00322
00323
00324
00325
00326
00327 void GridCommRefineLB::Place_Objects_On_PEs ()
00328 {
00329 int i;
00330
00331
00332 for (i = 0; i < Num_Objects; i++) {
00333 Assign_Object_To_PE (i, (&Object_Data[i])->from_pe);
00334 }
00335 }
00336
00337
00338
00339
00340
00341
00342 void GridCommRefineLB::Remap_Objects_To_PEs (int cluster)
00343 {
00344 int num_cluster_pes;
00345 int num_wan_msgs;
00346 int avg_wan_msgs;
00347 int target_object;
00348 int target_pe;
00349 int i;
00350
00351
00352
00353 num_cluster_pes = 0;
00354 num_wan_msgs = 0;
00355 for (i = 0; i < Num_PEs; i++) {
00356 if (cluster == (&PE_Data[i])->cluster) {
00357 num_cluster_pes += 1;
00358 num_wan_msgs += (&PE_Data[i])->num_wan_msgs;
00359 }
00360 }
00361 avg_wan_msgs = num_wan_msgs / num_cluster_pes;
00362
00363
00364 for (i = 0; i < Num_PEs; i++) {
00365 if (cluster == (&PE_Data[i])->cluster) {
00366 while ((&PE_Data[i])->num_wan_msgs > (avg_wan_msgs * CK_LDB_GridCommRefineLB_Tolerance)) {
00367 target_object = Find_Maximum_WAN_Object (i);
00368 target_pe = Find_Minimum_WAN_PE (cluster);
00369
00370 if ((target_object == -1) || (target_pe == -1)) {
00371 break;
00372 }
00373
00374 Remove_Object_From_PE (target_object, i);
00375 Assign_Object_To_PE (target_object, target_pe);
00376 }
00377 }
00378 }
00379
00380
00381
00382
00383
00384
00385
00386
00387
00388
00389
00390
00391
00392
00393
00394
00395
00396
00397
00398
00399
00400
00401
00402
00403
00404
00405
00406
00407
00408
00409 }
00410
00411
00412
00413
00414
00415
00416
00417
00418
00419
00420 int GridCommRefineLB::Find_Maximum_WAN_Object (int pe)
00421 {
00422 int i;
00423 int max_index;
00424 int max_wan_msgs;
00425
00426
00427 max_index = -1;
00428 max_wan_msgs = -1;
00429
00430 for (i = 0; i < Num_Objects; i++) {
00431 if ((&Object_Data[i])->from_pe == pe) {
00432 if ((&Object_Data[i])->migratable) {
00433 if ((&Object_Data[i])->num_wan_msgs > max_wan_msgs) {
00434 max_index = i;
00435 max_wan_msgs = (&Object_Data[i])->num_wan_msgs;
00436 }
00437 }
00438 }
00439 }
00440
00441 return (max_index);
00442 }
00443
00444
00445
00446
00447
00448
00449
00450
00451
00452
00453
00454
00455
00456
00457
00458
00459
00460 int GridCommRefineLB::Find_Minimum_WAN_PE (int cluster)
00461 {
00462 int i;
00463 int min_index;
00464 int min_wan_msgs;
00465
00466
00467 min_index = -1;
00468 min_wan_msgs = MAXINT;
00469
00470 for (i = 0; i < Num_PEs; i++) {
00471 if (((&PE_Data[i])->available) && ((&PE_Data[i])->cluster == cluster)) {
00472 if ((&PE_Data[i])->num_wan_msgs < min_wan_msgs) {
00473 min_index = i;
00474 min_wan_msgs = (&PE_Data[i])->num_wan_msgs;
00475 } else if (((&PE_Data[i])->num_wan_msgs == min_wan_msgs) &&
00476 ((&PE_Data[i])->scaled_load < (&PE_Data[min_index])->scaled_load)) {
00477 min_index = i;
00478 min_wan_msgs = (&PE_Data[i])->num_wan_msgs;
00479 } else if (((&PE_Data[i])->num_wan_msgs == min_wan_msgs) &&
00480 ((&PE_Data[i])->scaled_load == (&PE_Data[min_index])->scaled_load) &&
00481 ((&PE_Data[i])->num_objs < (&PE_Data[min_index])->num_objs)) {
00482 min_index = i;
00483 min_wan_msgs = (&PE_Data[i])->num_wan_msgs;
00484 }
00485 }
00486 }
00487
00488 return (min_index);
00489
00490
00491
00492
00493
00494
00495
00496
00497
00498
00499
00500
00501
00502
00503
00504
00505
00506
00507
00508
00509
00510
00511
00512
00513
00514
00515
00516
00517
00518
00519 }
00520
00521
00522
00523
00524
00525
00526
00527
00528 void GridCommRefineLB::Remove_Object_From_PE (int target_object, int target_pe)
00529 {
00530 (&Object_Data[target_object])->to_pe = -1;
00531
00532 (&PE_Data[target_pe])->num_objs -= 1;
00533
00534 if ((&Object_Data[target_object])->num_lan_msgs > 0) {
00535 (&PE_Data[target_pe])->num_lan_objs -= 1;
00536 (&PE_Data[target_pe])->num_lan_msgs -= (&Object_Data[target_object])->num_lan_msgs;
00537 }
00538
00539 if ((&Object_Data[target_object])->num_wan_msgs > 0) {
00540 (&PE_Data[target_pe])->num_wan_objs -= 1;
00541 (&PE_Data[target_pe])->num_wan_msgs -= (&Object_Data[target_object])->num_wan_msgs;
00542 }
00543
00544 (&PE_Data[target_pe])->scaled_load -= (&Object_Data[target_object])->load / (&PE_Data[target_pe])->relative_speed;
00545 }
00546
00547
00548
00549
00550
00551
00552
00553
00554 void GridCommRefineLB::Assign_Object_To_PE (int target_object, int target_pe)
00555 {
00556 (&Object_Data[target_object])->to_pe = target_pe;
00557
00558 (&PE_Data[target_pe])->num_objs += 1;
00559
00560 if ((&Object_Data[target_object])->num_lan_msgs > 0) {
00561 (&PE_Data[target_pe])->num_lan_objs += 1;
00562 (&PE_Data[target_pe])->num_lan_msgs += (&Object_Data[target_object])->num_lan_msgs;
00563 }
00564
00565 if ((&Object_Data[target_object])->num_wan_msgs > 0) {
00566 (&PE_Data[target_pe])->num_wan_objs += 1;
00567 (&PE_Data[target_pe])->num_wan_msgs += (&Object_Data[target_object])->num_wan_msgs;
00568 }
00569
00570 (&PE_Data[target_pe])->scaled_load += (&Object_Data[target_object])->load / (&PE_Data[target_pe])->relative_speed;
00571 }
00572
00573
00574
00575
00576
00577
00578
00579 void GridCommRefineLB::work (LDStats *stats)
00580 {
00581 int i;
00582
00583
00584
00585
00586
00587
00588
00589
00590
00591
00592
00593
00594
00595
00596 if (_lb_args.debug() > 0) {
00597 CkPrintf ("[%d] GridCommRefineLB is working.\n", CkMyPe());
00598 }
00599
00600
00601 stats->makeCommHash ();
00602
00603
00604 Num_PEs = stats->nprocs();
00605 Num_Objects = stats->n_objs;
00606
00607 if (_lb_args.debug() > 0) {
00608 CkPrintf ("[%d] GridCommRefineLB is examining %d PEs and %d objects.\n", CkMyPe(), Num_PEs, Num_Objects);
00609 }
00610
00611
00612 Initialize_PE_Data (stats);
00613
00614
00615 if (Available_PE_Count() < 1) {
00616 if (_lb_args.debug() > 0) {
00617 CkPrintf ("[%d] GridCommRefineLB finds no available PEs -- no balancing done.\n", CkMyPe());
00618 }
00619
00620 delete [] PE_Data;
00621
00622 return;
00623 }
00624
00625
00626
00627 Num_Clusters = Compute_Number_Of_Clusters ();
00628 if (Num_Clusters < 1) {
00629 if (_lb_args.debug() > 0) {
00630 CkPrintf ("[%d] GridCommRefineLB finds incomplete PE cluster map -- no balancing done.\n", CkMyPe());
00631 }
00632
00633 delete [] PE_Data;
00634
00635 return;
00636 }
00637
00638 if (_lb_args.debug() > 0) {
00639 CkPrintf ("[%d] GridCommRefineLB finds %d clusters.\n", CkMyPe(), Num_Clusters);
00640 }
00641
00642
00643 Initialize_Object_Data (stats);
00644
00645
00646 Examine_InterObject_Messages (stats);
00647
00648
00649 Place_Objects_On_PEs ();
00650
00651
00652 for (i = 0; i < Num_Clusters; i++) {
00653 Remap_Objects_To_PEs (i);
00654 }
00655
00656
00657 for (i = 0; i < Num_Objects; i++) {
00658 stats->to_proc[i] = (&Object_Data[i])->to_pe;
00659
00660 if (_lb_args.debug() > 2) {
00661 CkPrintf ("[%d] GridCommRefineLB migrates object %d from PE %d to PE %d.\n", CkMyPe(), i, stats->from_proc[i], stats->to_proc[i]);
00662 } else if (_lb_args.debug() > 1) {
00663 if (stats->to_proc[i] != stats->from_proc[i]) {
00664 CkPrintf ("[%d] GridCommRefineLB migrates object %d from PE %d to PE %d.\n", CkMyPe(), i, stats->from_proc[i], stats->to_proc[i]);
00665 }
00666 }
00667 }
00668
00669
00670 delete [] Object_Data;
00671 delete [] PE_Data;
00672 }
00673
00674 #include "GridCommRefineLB.def.h"