00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047 #include "GridCommRefineLB.decl.h"
00048
00049 #include "GridCommRefineLB.h"
00050 #include "manager.h"
00051
00052 CreateLBFunc_Def (GridCommRefineLB, "Grid communication load balancer (refines object mapping within each cluster)")
00053
00054
00055
00056
00057
00058
00059 GridCommRefineLB::GridCommRefineLB (const CkLBOptions &opt) : CentralLB (opt)
00060 {
00061 char *value;
00062
00063
00064 lbname = (char *) "GridCommRefineLB";
00065
00066 if (CkMyPe() == 0) {
00067 CkPrintf ("[%d] GridCommRefineLB created.\n", CkMyPe());
00068 }
00069
00070 if (value = getenv ("CK_LDB_GRIDCOMMREFINELB_TOLERANCE")) {
00071 CK_LDB_GridCommRefineLB_Tolerance = atof (value);
00072 } else {
00073 CK_LDB_GridCommRefineLB_Tolerance = CK_LDB_GRIDCOMMREFINELB_TOLERANCE;
00074 }
00075
00076 manager_init ();
00077 }
00078
00079
00080
00081
00082
00083
00084 GridCommRefineLB::GridCommRefineLB (CkMigrateMessage *msg) : CentralLB (msg)
00085 {
00086 char *value;
00087
00088
00089 lbname = (char *) "GridCommRefineLB";
00090
00091 if (value = getenv ("CK_LDB_GRIDCOMMREFINELB_TOLERANCE")) {
00092 CK_LDB_GridCommRefineLB_Tolerance = atof (value);
00093 } else {
00094 CK_LDB_GridCommRefineLB_Tolerance = CK_LDB_GRIDCOMMREFINELB_TOLERANCE;
00095 }
00096
00097 manager_init ();
00098 }
00099
00100
00101
00102
00103
00104
00105
00106 CmiBool GridCommRefineLB::QueryBalanceNow (int step)
00107 {
00108 if (_lb_args.debug() > 2) {
00109 CkPrintf ("[%d] GridCommRefineLB is balancing on step %d.\n", CkMyPe(), step);
00110 }
00111
00112 return (CmiTrue);
00113 }
00114
00115
00116
00117
00118
00119
00120
00121
00122
00123
00124
00125
00126
00127
00128 int GridCommRefineLB::Get_Cluster (int pe)
00129 {
00130 #if CONVERSE_VERSION_VMI
00131 return (CmiGetCluster (pe));
00132 #else
00133 return (0);
00134 #endif
00135 }
00136
00137
00138
00139
00140
00141
00142
00143
00144
00145
00146
00147
00148 void GridCommRefineLB::Initialize_PE_Data (CentralLB::LDStats *stats)
00149 {
00150 int min_speed;
00151 int i;
00152
00153
00154 PE_Data = new PE_Data_T[Num_PEs];
00155
00156 min_speed = MAXINT;
00157 for (i = 0; i < Num_PEs; i++) {
00158 (&PE_Data[i])->available = stats->procs[i].available;
00159 (&PE_Data[i])->cluster = Get_Cluster (i);
00160 (&PE_Data[i])->num_objs = 0;
00161 (&PE_Data[i])->num_lan_objs = 0;
00162 (&PE_Data[i])->num_lan_msgs = 0;
00163 (&PE_Data[i])->num_wan_objs = 0;
00164 (&PE_Data[i])->num_wan_msgs = 0;
00165 (&PE_Data[i])->relative_speed = 0.0;
00166 (&PE_Data[i])->scaled_load = 0.0;
00167
00168 if (stats->procs[i].pe_speed < min_speed) {
00169 min_speed = stats->procs[i].pe_speed;
00170 }
00171 }
00172
00173
00174
00175 for (i = 0; i < Num_PEs; i++) {
00176 (&PE_Data[i])->relative_speed = (double) (stats->procs[i].pe_speed / min_speed);
00177 (&PE_Data[i])->scaled_load += stats->procs[i].bg_walltime;
00178 }
00179 }
00180
00181
00182
00183
00184
00185
00186 int GridCommRefineLB::Available_PE_Count ()
00187 {
00188 int available_pe_count;
00189 int i;
00190
00191
00192 available_pe_count = 0;
00193 for (i = 0; i < Num_PEs; i++) {
00194 if ((&PE_Data[i])->available) {
00195 available_pe_count += 1;
00196 }
00197 }
00198 return (available_pe_count);
00199 }
00200
00201
00202
00203
00204
00205
00206 int GridCommRefineLB::Compute_Number_Of_Clusters ()
00207 {
00208 int max_cluster;
00209 int i;
00210
00211
00212 max_cluster = 0;
00213 for (i = 0; i < Num_PEs; i++) {
00214 if ((&PE_Data[i])->cluster < 0) {
00215 return (-1);
00216 }
00217
00218 if ((&PE_Data[i])->cluster > max_cluster) {
00219 max_cluster = (&PE_Data[i])->cluster;
00220 }
00221 }
00222 return (max_cluster + 1);
00223 }
00224
00225
00226
00227
00228
00229
00230 void GridCommRefineLB::Initialize_Object_Data (CentralLB::LDStats *stats)
00231 {
00232 int i;
00233
00234
00235 Object_Data = new Object_Data_T[Num_Objects];
00236
00237 for (i = 0; i < Num_Objects; i++) {
00238 (&Object_Data[i])->migratable = (&stats->objData[i])->migratable;
00239 (&Object_Data[i])->cluster = Get_Cluster (stats->from_proc[i]);
00240 (&Object_Data[i])->from_pe = stats->from_proc[i];
00241 (&Object_Data[i])->to_pe = stats->from_proc[i];
00242 (&Object_Data[i])->num_lan_msgs = 0;
00243 (&Object_Data[i])->num_wan_msgs = 0;
00244 (&Object_Data[i])->load = (&stats->objData[i])->wallTime;
00245
00246
00247
00248 }
00249 }
00250
00251
00252
00253
00254
00255
00256 void GridCommRefineLB::Examine_InterObject_Messages (CentralLB::LDStats *stats)
00257 {
00258 int i;
00259 int j;
00260 LDCommData *com_data;
00261 int send_object;
00262 int send_pe;
00263 int send_cluster;
00264 int recv_object;
00265 int recv_pe;
00266 int recv_cluster;
00267 LDObjKey *recv_objects;
00268 int num_objects;
00269
00270
00271 for (i = 0; i < stats->n_comm; i++) {
00272 com_data = &(stats->commData[i]);
00273 if ((!com_data->from_proc()) && (com_data->recv_type() == LD_OBJ_MSG)) {
00274 send_object = stats->getHash (com_data->sender);
00275 recv_object = stats->getHash (com_data->receiver.get_destObj());
00276
00277 if ((send_object < 0) || (send_object > Num_Objects) || (recv_object < 0) || (recv_object > Num_Objects)) {
00278 continue;
00279 }
00280
00281 send_pe = (&Object_Data[send_object])->from_pe;
00282 recv_pe = (&Object_Data[recv_object])->from_pe;
00283
00284 send_cluster = Get_Cluster (send_pe);
00285 recv_cluster = Get_Cluster (recv_pe);
00286
00287 if (send_cluster == recv_cluster) {
00288 (&Object_Data[send_object])->num_lan_msgs += com_data->messages;
00289 } else {
00290 (&Object_Data[send_object])->num_wan_msgs += com_data->messages;
00291 }
00292 } else if (com_data->receiver.get_type() == LD_OBJLIST_MSG) {
00293 send_object = stats->getHash (com_data->sender);
00294
00295 if ((send_object < 0) || (send_object > Num_Objects)) {
00296 continue;
00297 }
00298
00299 send_pe = (&Object_Data[send_object])->from_pe;
00300 send_cluster = Get_Cluster (send_pe);
00301
00302 recv_objects = com_data->receiver.get_destObjs (num_objects);
00303
00304 for (j = 0; j < num_objects; j++) {
00305 recv_object = stats->getHash (recv_objects[j]);
00306
00307 if ((recv_object < 0) || (recv_object > Num_Objects)) {
00308 continue;
00309 }
00310
00311 recv_pe = (&Object_Data[recv_object])->from_pe;
00312 recv_cluster = Get_Cluster (recv_pe);
00313
00314 if (send_cluster == recv_cluster) {
00315 (&Object_Data[send_object])->num_lan_msgs += com_data->messages;
00316 } else {
00317 (&Object_Data[send_object])->num_wan_msgs += com_data->messages;
00318 }
00319 }
00320 }
00321 }
00322 }
00323
00324
00325
00326
00327
00328
00329 void GridCommRefineLB::Place_Objects_On_PEs ()
00330 {
00331 int i;
00332
00333
00334 for (i = 0; i < Num_Objects; i++) {
00335 Assign_Object_To_PE (i, (&Object_Data[i])->from_pe);
00336 }
00337 }
00338
00339
00340
00341
00342
00343
00344 void GridCommRefineLB::Remap_Objects_To_PEs (int cluster)
00345 {
00346 int num_cluster_pes;
00347 int num_wan_msgs;
00348 int avg_wan_msgs;
00349 int target_object;
00350 int target_pe;
00351 int i;
00352
00353
00354
00355 num_cluster_pes = 0;
00356 num_wan_msgs = 0;
00357 for (i = 0; i < Num_PEs; i++) {
00358 if (cluster == (&PE_Data[i])->cluster) {
00359 num_cluster_pes += 1;
00360 num_wan_msgs += (&PE_Data[i])->num_wan_msgs;
00361 }
00362 }
00363 avg_wan_msgs = num_wan_msgs / num_cluster_pes;
00364
00365
00366 for (i = 0; i < Num_PEs; i++) {
00367 if (cluster == (&PE_Data[i])->cluster) {
00368 while ((&PE_Data[i])->num_wan_msgs > (avg_wan_msgs * CK_LDB_GridCommRefineLB_Tolerance)) {
00369 target_object = Find_Maximum_WAN_Object (i);
00370 target_pe = Find_Minimum_WAN_PE (cluster);
00371
00372 if ((target_object == -1) || (target_pe == -1)) {
00373 break;
00374 }
00375
00376 Remove_Object_From_PE (target_object, i);
00377 Assign_Object_To_PE (target_object, target_pe);
00378 }
00379 }
00380 }
00381
00382
00383
00384
00385
00386
00387
00388
00389
00390
00391
00392
00393
00394
00395
00396
00397
00398
00399
00400
00401
00402
00403
00404
00405
00406
00407
00408
00409
00410
00411 }
00412
00413
00414
00415
00416
00417
00418
00419
00420
00421
00422 int GridCommRefineLB::Find_Maximum_WAN_Object (int pe)
00423 {
00424 int i;
00425 int max_index;
00426 int max_wan_msgs;
00427
00428
00429 max_index = -1;
00430 max_wan_msgs = -1;
00431
00432 for (i = 0; i < Num_Objects; i++) {
00433 if ((&Object_Data[i])->from_pe == pe) {
00434 if ((&Object_Data[i])->migratable) {
00435 if ((&Object_Data[i])->num_wan_msgs > max_wan_msgs) {
00436 max_index = i;
00437 max_wan_msgs = (&Object_Data[i])->num_wan_msgs;
00438 }
00439 }
00440 }
00441 }
00442
00443 return (max_index);
00444 }
00445
00446
00447
00448
00449
00450
00451
00452
00453
00454
00455
00456
00457
00458
00459
00460
00461
00462 int GridCommRefineLB::Find_Minimum_WAN_PE (int cluster)
00463 {
00464 int i;
00465 int min_index;
00466 int min_wan_msgs;
00467
00468
00469 min_index = -1;
00470 min_wan_msgs = MAXINT;
00471
00472 for (i = 0; i < Num_PEs; i++) {
00473 if (((&PE_Data[i])->available) && ((&PE_Data[i])->cluster == cluster)) {
00474 if ((&PE_Data[i])->num_wan_msgs < min_wan_msgs) {
00475 min_index = i;
00476 min_wan_msgs = (&PE_Data[i])->num_wan_msgs;
00477 } else if (((&PE_Data[i])->num_wan_msgs == min_wan_msgs) &&
00478 ((&PE_Data[i])->scaled_load < (&PE_Data[min_index])->scaled_load)) {
00479 min_index = i;
00480 min_wan_msgs = (&PE_Data[i])->num_wan_msgs;
00481 } else if (((&PE_Data[i])->num_wan_msgs == min_wan_msgs) &&
00482 ((&PE_Data[i])->scaled_load == (&PE_Data[min_index])->scaled_load) &&
00483 ((&PE_Data[i])->num_objs < (&PE_Data[min_index])->num_objs)) {
00484 min_index = i;
00485 min_wan_msgs = (&PE_Data[i])->num_wan_msgs;
00486 }
00487 }
00488 }
00489
00490 return (min_index);
00491
00492
00493
00494
00495
00496
00497
00498
00499
00500
00501
00502
00503
00504
00505
00506
00507
00508
00509
00510
00511
00512
00513
00514
00515
00516
00517
00518
00519
00520
00521 }
00522
00523
00524
00525
00526
00527
00528
00529
00530 void GridCommRefineLB::Remove_Object_From_PE (int target_object, int target_pe)
00531 {
00532 (&Object_Data[target_object])->to_pe = -1;
00533
00534 (&PE_Data[target_pe])->num_objs -= 1;
00535
00536 if ((&Object_Data[target_object])->num_lan_msgs > 0) {
00537 (&PE_Data[target_pe])->num_lan_objs -= 1;
00538 (&PE_Data[target_pe])->num_lan_msgs -= (&Object_Data[target_object])->num_lan_msgs;
00539 }
00540
00541 if ((&Object_Data[target_object])->num_wan_msgs > 0) {
00542 (&PE_Data[target_pe])->num_wan_objs -= 1;
00543 (&PE_Data[target_pe])->num_wan_msgs -= (&Object_Data[target_object])->num_wan_msgs;
00544 }
00545
00546 (&PE_Data[target_pe])->scaled_load -= (&Object_Data[target_object])->load / (&PE_Data[target_pe])->relative_speed;
00547 }
00548
00549
00550
00551
00552
00553
00554
00555
00556 void GridCommRefineLB::Assign_Object_To_PE (int target_object, int target_pe)
00557 {
00558 (&Object_Data[target_object])->to_pe = target_pe;
00559
00560 (&PE_Data[target_pe])->num_objs += 1;
00561
00562 if ((&Object_Data[target_object])->num_lan_msgs > 0) {
00563 (&PE_Data[target_pe])->num_lan_objs += 1;
00564 (&PE_Data[target_pe])->num_lan_msgs += (&Object_Data[target_object])->num_lan_msgs;
00565 }
00566
00567 if ((&Object_Data[target_object])->num_wan_msgs > 0) {
00568 (&PE_Data[target_pe])->num_wan_objs += 1;
00569 (&PE_Data[target_pe])->num_wan_msgs += (&Object_Data[target_object])->num_wan_msgs;
00570 }
00571
00572 (&PE_Data[target_pe])->scaled_load += (&Object_Data[target_object])->load / (&PE_Data[target_pe])->relative_speed;
00573 }
00574
00575
00576
00577
00578
00579
00580
00581 void GridCommRefineLB::work (LDStats *stats)
00582 {
00583 int i;
00584
00585
00586
00587
00588
00589
00590
00591
00592
00593
00594
00595
00596
00597
00598 if (_lb_args.debug() > 0) {
00599 CkPrintf ("[%d] GridCommRefineLB is working.\n", CkMyPe());
00600 }
00601
00602
00603 stats->makeCommHash ();
00604
00605
00606 Num_PEs = stats->nprocs();
00607 Num_Objects = stats->n_objs;
00608
00609 if (_lb_args.debug() > 0) {
00610 CkPrintf ("[%d] GridCommRefineLB is examining %d PEs and %d objects.\n", CkMyPe(), Num_PEs, Num_Objects);
00611 }
00612
00613
00614 Initialize_PE_Data (stats);
00615
00616
00617 if (Available_PE_Count() < 1) {
00618 if (_lb_args.debug() > 0) {
00619 CkPrintf ("[%d] GridCommRefineLB finds no available PEs -- no balancing done.\n", CkMyPe());
00620 }
00621
00622 delete [] PE_Data;
00623
00624 return;
00625 }
00626
00627
00628
00629 Num_Clusters = Compute_Number_Of_Clusters ();
00630 if (Num_Clusters < 1) {
00631 if (_lb_args.debug() > 0) {
00632 CkPrintf ("[%d] GridCommRefineLB finds incomplete PE cluster map -- no balancing done.\n", CkMyPe());
00633 }
00634
00635 delete [] PE_Data;
00636
00637 return;
00638 }
00639
00640 if (_lb_args.debug() > 0) {
00641 CkPrintf ("[%d] GridCommRefineLB finds %d clusters.\n", CkMyPe(), Num_Clusters);
00642 }
00643
00644
00645 Initialize_Object_Data (stats);
00646
00647
00648 Examine_InterObject_Messages (stats);
00649
00650
00651 Place_Objects_On_PEs ();
00652
00653
00654 for (i = 0; i < Num_Clusters; i++) {
00655 Remap_Objects_To_PEs (i);
00656 }
00657
00658
00659 for (i = 0; i < Num_Objects; i++) {
00660 stats->to_proc[i] = (&Object_Data[i])->to_pe;
00661
00662 if (_lb_args.debug() > 2) {
00663 CkPrintf ("[%d] GridCommRefineLB migrates object %d from PE %d to PE %d.\n", CkMyPe(), i, stats->from_proc[i], stats->to_proc[i]);
00664 } else if (_lb_args.debug() > 1) {
00665 if (stats->to_proc[i] != stats->from_proc[i]) {
00666 CkPrintf ("[%d] GridCommRefineLB migrates object %d from PE %d to PE %d.\n", CkMyPe(), i, stats->from_proc[i], stats->to_proc[i]);
00667 }
00668 }
00669 }
00670
00671
00672 delete [] Object_Data;
00673 delete [] PE_Data;
00674 }
00675
00676 #include "GridCommRefineLB.def.h"