00001
00002
00003
00004
00005
00006
00007
00008
00009 #include "GridHybridLB.decl.h"
00010
00011 #include "GridHybridLB.h"
00012 #include "manager.h"
00013
// Invokes the CreateLBFunc_Def macro with the balancer's class name and a
// human-readable description string.
// NOTE(review): the macro is defined outside this file; presumably it emits
// the creation/registration function for this balancer -- confirm against
// its definition.
00014 CreateLBFunc_Def (GridHybridLB, "Grid load balancer that uses hybrid technique to optimize communication graph")
00015
00016
00017
00018
00019
00020
00021 GridHybridLB::GridHybridLB (const CkLBOptions &opt) : CentralLB (opt)
00022 {
00023 char *value;
00024
00025
00026 lbname = (char *) "GridHybridLB";
00027
00028 if (CkMyPe() == 0) {
00029 CkPrintf ("[%d] GridHybridLB created.\n", CkMyPe());
00030 }
00031
00032 if (value = getenv ("CK_LDB_GRIDHYBRIDLB_MODE")) {
00033 CK_LDB_GridHybridLB_Mode = atoi (value);
00034 } else {
00035 CK_LDB_GridHybridLB_Mode = CK_LDB_GRIDHYBRIDLB_MODE;
00036 }
00037
00038 if (value = getenv ("CK_LDB_GRIDHYBRIDLB_BACKGROUND_LOAD")) {
00039 CK_LDB_GridHybridLB_Background_Load = atoi (value);
00040 } else {
00041 CK_LDB_GridHybridLB_Background_Load = CK_LDB_GRIDHYBRIDLB_BACKGROUND_LOAD;
00042 }
00043
00044 if (value = getenv ("CK_LDB_GRIDHYBRIDLB_LOAD_TOLERANCE")) {
00045 CK_LDB_GridHybridLB_Load_Tolerance = atof (value);
00046 } else {
00047 CK_LDB_GridHybridLB_Load_Tolerance = CK_LDB_GRIDHYBRIDLB_LOAD_TOLERANCE;
00048 }
00049
00050 manager_init ();
00051 }
00052
00053
00054
00055
00056
00057
00058 GridHybridLB::GridHybridLB (CkMigrateMessage *msg) : CentralLB (msg)
00059 {
00060 char *value;
00061
00062
00063 lbname = (char *) "GridHybridLB";
00064
00065 if (value = getenv ("CK_LDB_GRIDHYBRIDLB_MODE")) {
00066 CK_LDB_GridHybridLB_Mode = atoi (value);
00067 } else {
00068 CK_LDB_GridHybridLB_Mode = CK_LDB_GRIDHYBRIDLB_MODE;
00069 }
00070
00071 if (value = getenv ("CK_LDB_GRIDHYBRIDLB_BACKGROUND_LOAD")) {
00072 CK_LDB_GridHybridLB_Background_Load = atoi (value);
00073 } else {
00074 CK_LDB_GridHybridLB_Background_Load = CK_LDB_GRIDHYBRIDLB_BACKGROUND_LOAD;
00075 }
00076
00077 if (value = getenv ("CK_LDB_GRIDHYBRIDLB_LOAD_TOLERANCE")) {
00078 CK_LDB_GridHybridLB_Load_Tolerance = atof (value);
00079 } else {
00080 CK_LDB_GridHybridLB_Load_Tolerance = CK_LDB_GRIDHYBRIDLB_LOAD_TOLERANCE;
00081 }
00082
00083 manager_init ();
00084 }
00085
00086
00087
00088
00089
00090
00091
00092 CmiBool GridHybridLB::QueryBalanceNow (int step)
00093 {
00094 if (_lb_args.debug() > 2) {
00095 CkPrintf ("[%d] GridHybridLB is balancing on step %d.\n", CkMyPe(), step);
00096 }
00097
00098 return (CmiTrue);
00099 }
00100
00101
00102
00103
00104
00105
00106
00107
00108
00109
00110
00111
00112
00113
00114 int GridHybridLB::Get_Cluster (int pe)
00115 {
00116 #if CONVERSE_VERSION_VMI
00117 return (CmiGetCluster (pe));
00118 #else
00119 return (0);
00120 #endif
00121 }
00122
00123
00124
00125
00126
00127
00128 void GridHybridLB::Initialize_PE_Data (CentralLB::LDStats *stats)
00129 {
00130 int min_speed;
00131 int i;
00132
00133
00134 PE_Data = new PE_Data_T[Num_PEs];
00135
00136 min_speed = MAXINT;
00137 for (i = 0; i < Num_PEs; i++) {
00138 (&PE_Data[i])->available = stats->procs[i].available;
00139 (&PE_Data[i])->cluster = Get_Cluster (i);
00140 (&PE_Data[i])->num_objs = 0;
00141 (&PE_Data[i])->num_lan_objs = 0;
00142 (&PE_Data[i])->num_lan_msgs = 0;
00143 (&PE_Data[i])->num_wan_objs = 0;
00144 (&PE_Data[i])->num_wan_msgs = 0;
00145 (&PE_Data[i])->relative_speed = 0.0;
00146 (&PE_Data[i])->scaled_load = 0.0;
00147
00148 if (stats->procs[i].pe_speed < min_speed) {
00149 min_speed = stats->procs[i].pe_speed;
00150 }
00151 }
00152
00153
00154
00155 for (i = 0; i < Num_PEs; i++) {
00156 (&PE_Data[i])->relative_speed = (double) (stats->procs[i].pe_speed / min_speed);
00157 if (CK_LDB_GridHybridLB_Background_Load) {
00158 (&PE_Data[i])->scaled_load += stats->procs[i].bg_walltime;
00159 }
00160 }
00161 }
00162
00163
00164
00165
00166
00167
00168 int GridHybridLB::Available_PE_Count ()
00169 {
00170 int available_pe_count;
00171 int i;
00172
00173
00174 available_pe_count = 0;
00175 for (i = 0; i < Num_PEs; i++) {
00176 if ((&PE_Data[i])->available) {
00177 available_pe_count += 1;
00178 }
00179 }
00180 return (available_pe_count);
00181 }
00182
00183
00184
00185
00186
00187
00188 int GridHybridLB::Compute_Number_Of_Clusters ()
00189 {
00190 int max_cluster;
00191 int i;
00192
00193
00194 max_cluster = 0;
00195 for (i = 0; i < Num_PEs; i++) {
00196 if ((&PE_Data[i])->cluster < 0) {
00197 return (-1);
00198 }
00199
00200 if ((&PE_Data[i])->cluster > max_cluster) {
00201 max_cluster = (&PE_Data[i])->cluster;
00202 }
00203 }
00204 return (max_cluster + 1);
00205 }
00206
00207
00208
00209
00210
00211
00212 void GridHybridLB::Initialize_Object_Data (CentralLB::LDStats *stats)
00213 {
00214 int i;
00215
00216
00217 Object_Data = new Object_Data_T[Num_Objects];
00218
00219 for (i = 0; i < Num_Objects; i++) {
00220 (&Object_Data[i])->migratable = (&stats->objData[i])->migratable;
00221 (&Object_Data[i])->from_pe = stats->from_proc[i];
00222 (&Object_Data[i])->num_lan_msgs = 0;
00223 (&Object_Data[i])->num_wan_msgs = 0;
00224 (&Object_Data[i])->load = (&stats->objData[i])->wallTime;
00225
00226 if ((&Object_Data[i])->migratable) {
00227 (&Object_Data[i])->to_pe = -1;
00228 (&Object_Data[i])->cluster = -1;
00229 } else {
00230 (&Object_Data[i])->to_pe = (&Object_Data[i])->from_pe;
00231 (&Object_Data[i])->cluster = Get_Cluster ((&Object_Data[i])->from_pe);
00232 if (_lb_args.debug() > 1) {
00233 CkPrintf ("[%d] GridHybridLB identifies object %d as non-migratable.\n", CkMyPe(), i);
00234 }
00235 }
00236 }
00237 }
00238
00239
00240
00241
00242
00243
00244 void GridHybridLB::Initialize_Cluster_Data ()
00245 {
00246 int cluster;
00247 double min_total_cpu_power;
00248 int i;
00249
00250
00251 Cluster_Data = new Cluster_Data_T[Num_Clusters];
00252
00253 for (i = 0; i < Num_Clusters; i++) {
00254 (&Cluster_Data[i])->num_pes = 0;
00255 (&Cluster_Data[i])->total_cpu_power = 0.0;
00256 (&Cluster_Data[i])->scaled_cpu_power = 0.0;
00257 }
00258
00259
00260 for (i = 0; i < Num_PEs; i++) {
00261 cluster = (&PE_Data[i])->cluster;
00262
00263 (&Cluster_Data[cluster])->num_pes += 1;
00264 (&Cluster_Data[cluster])->total_cpu_power += (&PE_Data[i])->relative_speed;
00265 }
00266
00267 min_total_cpu_power = MAXDOUBLE;
00268 for (i = 0; i < Num_Clusters; i++) {
00269 if ((&Cluster_Data[i])->total_cpu_power < min_total_cpu_power) {
00270 min_total_cpu_power = (&Cluster_Data[i])->total_cpu_power;
00271 }
00272 }
00273
00274 for (i = 0; i < Num_Clusters; i++) {
00275 (&Cluster_Data[i])->scaled_cpu_power = (double) ((&Cluster_Data[i])->total_cpu_power / min_total_cpu_power);
00276 }
00277 }
00278
00279
00280
00281
00282
00283
00284 void GridHybridLB::Partition_Objects_Into_Clusters (CentralLB::LDStats *stats)
00285 {
00286 int num_migratable_objects;
00287 int *migratable_objects;
00288 int index;
00289 int num_partitions;
00290 int *partition_to_cluster_map;
00291 int cluster;
00292 int partition;
00293 int partition_count;
00294 int *vertex_weights;
00295 int vertex;
00296 int **communication_matrix;
00297 LDCommData *com_data;
00298 int send_object;
00299 int recv_object;
00300 int send_index;
00301 int recv_index;
00302 LDObjKey *recv_objects;
00303 int num_objects;
00304 int *xadj;
00305 int num_edges;
00306 int *adjncy;
00307 int *edge_weights;
00308 int count;
00309 int weight_flag;
00310 int numbering_flag;
00311 int options[5];
00312 int edgecut;
00313 int *newmap;
00314 int i;
00315 int j;
00316
00317
00318 if (Num_Clusters == 1) {
00319 for (i = 0; i < Num_Objects; i++) {
00320 (&Object_Data[i])->cluster = 0;
00321 }
00322
00323 return;
00324 }
00325
00326 for (i = 0; i < Num_Objects; i++) {
00327 (&Object_Data[i])->secondary_index = -1;
00328 }
00329
00330
00331
00332
00333 num_migratable_objects = 0;
00334 for (i = 0; i < Num_Objects; i++) {
00335 if ((&Object_Data[i])->migratable) {
00336 num_migratable_objects += 1;
00337 }
00338 }
00339
00340 migratable_objects = new int[num_migratable_objects];
00341
00342 index = 0;
00343 for (i = 0; i < Num_Objects; i++) {
00344 if ((&Object_Data[i])->migratable) {
00345 (&Object_Data[i])->secondary_index = index;
00346 migratable_objects[index] = i;
00347 index += 1;
00348 }
00349 }
00350
00351
00352
00353 num_partitions = 0;
00354 for (i = 0; i < Num_Clusters; i++) {
00355 num_partitions += (int) ceil ((&Cluster_Data[i])->scaled_cpu_power);
00356 }
00357
00358 partition_to_cluster_map = new int[num_partitions];
00359
00360 cluster = 0;
00361 partition = 0;
00362 while (partition < num_partitions) {
00363 partition_count = (int) ceil ((&Cluster_Data[cluster])->scaled_cpu_power);
00364
00365 for (i = partition; i < (partition + partition_count); i++) {
00366 partition_to_cluster_map[i] = cluster;
00367 }
00368
00369 partition += partition_count;
00370 cluster += 1;
00371 }
00372
00373 if ((CK_LDB_GridHybridLB_Mode == 1) || (CK_LDB_GridHybridLB_Mode == 3)) {
00374 vertex_weights = new int[num_migratable_objects];
00375 vertex = 0;
00376 for (i = 0; i < Num_Objects; i++) {
00377 if ((&Object_Data[i])->migratable) {
00378 vertex_weights[vertex] = (int) ceil ((&Object_Data[i])->load * 10000);
00379 vertex += 1;
00380 }
00381 }
00382 }
00383
00384
00385 communication_matrix = new int *[num_migratable_objects];
00386 for (i = 0; i < num_migratable_objects; i++) {
00387 communication_matrix[i] = new int[num_migratable_objects];
00388 for (j = 0; j < num_migratable_objects; j++) {
00389 communication_matrix[i][j] = 0;
00390 }
00391 }
00392
00393 for (i = 0; i < stats->n_comm; i++) {
00394 com_data = &(stats->commData[i]);
00395 if ((!com_data->from_proc()) && (com_data->recv_type() == LD_OBJ_MSG)) {
00396 send_object = stats->getHash (com_data->sender);
00397 recv_object = stats->getHash (com_data->receiver.get_destObj());
00398
00399
00400 if ((send_object < 0) || (send_object > Num_Objects) || (recv_object < 0) || (recv_object > Num_Objects)) {
00401 continue;
00402 }
00403
00404 if ((!(&Object_Data[send_object])->migratable) || (!(&Object_Data[recv_object])->migratable)) {
00405 continue;
00406 }
00407
00408 send_index = (&Object_Data[send_object])->secondary_index;
00409 recv_index = (&Object_Data[recv_object])->secondary_index;
00410
00411 communication_matrix[send_index][recv_index] += com_data->messages;
00412 communication_matrix[recv_index][send_index] += com_data->messages;
00413 } else if (com_data->receiver.get_type() == LD_OBJLIST_MSG) {
00414 send_object = stats->getHash (com_data->sender);
00415
00416 if ((send_object < 0) || (send_object > Num_Objects)) {
00417 continue;
00418 }
00419
00420 if (!(&Object_Data[send_object])->migratable) {
00421 continue;
00422 }
00423
00424 recv_objects = com_data->receiver.get_destObjs (num_objects);
00425
00426 for (j = 0; j < num_objects; j++) {
00427 recv_object = stats->getHash (recv_objects[j]);
00428
00429
00430 if ((recv_object < 0) || (recv_object > Num_Objects)) {
00431 continue;
00432 }
00433
00434 if (!(&Object_Data[recv_object])->migratable) {
00435 continue;
00436 }
00437
00438 send_index = (&Object_Data[send_object])->secondary_index;
00439 recv_index = (&Object_Data[recv_object])->secondary_index;
00440
00441 communication_matrix[send_index][recv_index] += com_data->messages;
00442 communication_matrix[recv_index][send_index] += com_data->messages;
00443 }
00444 }
00445 }
00446
00447 for (i = 0; i < num_migratable_objects; i++) {
00448 communication_matrix[i][i] = 0;
00449 }
00450
00451
00452 xadj = new int[num_migratable_objects + 1];
00453 num_edges = 0;
00454 for (i = 0; i < num_migratable_objects; i++) {
00455 for (j = 0; j < num_migratable_objects; j++) {
00456 if (communication_matrix[i][j] > 0) {
00457 num_edges += 1;
00458 }
00459 }
00460 }
00461 adjncy = new int[num_edges];
00462 edge_weights = new int[num_edges];
00463 count = 0;
00464 xadj[0] = 0;
00465 for (i = 0; i < num_migratable_objects; i++) {
00466 for (j = 0; j < num_migratable_objects; j++) {
00467 if (communication_matrix[i][j] > 0) {
00468 adjncy[count] = j;
00469 edge_weights[count] = communication_matrix[i][j];
00470 count += 1;
00471 }
00472 }
00473 xadj[i+1] = count;
00474 }
00475
00476 if ((CK_LDB_GridHybridLB_Mode == 0) || (CK_LDB_GridHybridLB_Mode == 2)) {
00477
00478 weight_flag = 1;
00479 numbering_flag = 0;
00480 options[0] = 0;
00481 newmap = new int[num_migratable_objects];
00482
00483 METIS_PartGraphRecursive (&num_migratable_objects, xadj, adjncy, NULL, edge_weights, &weight_flag, &numbering_flag, &num_partitions, options, &edgecut, newmap);
00484 } else if ((CK_LDB_GridHybridLB_Mode == 1) || (CK_LDB_GridHybridLB_Mode == 3)) {
00485
00486 weight_flag = 3;
00487 numbering_flag = 0;
00488 options[0] = 0;
00489 newmap = new int[num_migratable_objects];
00490
00491 METIS_PartGraphRecursive (&num_migratable_objects, xadj, adjncy, vertex_weights, edge_weights, &weight_flag, &numbering_flag, &num_partitions, options, &edgecut, newmap);
00492 } else {
00493 if (_lb_args.debug() > 0) {
00494 CkPrintf ("[%d] GridHybridLB was told to use bad mode (%d).\n", CkMyPe(), CK_LDB_GridHybridLB_Mode);
00495 }
00496 }
00497
00498
00499 for (i = 0; i < num_migratable_objects; i++) {
00500 partition = newmap[i];
00501 cluster = partition_to_cluster_map[partition];
00502
00503 index = migratable_objects[i];
00504
00505 (&Object_Data[index])->cluster = cluster;
00506 }
00507
00508
00509 delete [] newmap;
00510 delete [] edge_weights;
00511 delete [] adjncy;
00512 delete [] xadj;
00513 for (i = 0; i < num_migratable_objects; i++) {
00514 delete [] communication_matrix[i];
00515 }
00516 delete [] communication_matrix;
00517 if ((CK_LDB_GridHybridLB_Mode == 1) || (CK_LDB_GridHybridLB_Mode == 3)) {
00518 delete [] vertex_weights;
00519 }
00520 delete [] partition_to_cluster_map;
00521 delete [] migratable_objects;
00522 }
00523
00524
00525
00526
00527
00528
00529 void GridHybridLB::Examine_InterObject_Messages (CentralLB::LDStats *stats)
00530 {
00531 int i;
00532 int j;
00533 LDCommData *com_data;
00534 int send_object;
00535 int send_cluster;
00536 int recv_object;
00537 int recv_cluster;
00538 LDObjKey *recv_objects;
00539 int num_objects;
00540
00541
00542 for (i = 0; i < stats->n_comm; i++) {
00543 com_data = &(stats->commData[i]);
00544 if ((!com_data->from_proc()) && (com_data->recv_type() == LD_OBJ_MSG)) {
00545 send_object = stats->getHash (com_data->sender);
00546 recv_object = stats->getHash (com_data->receiver.get_destObj());
00547
00548 if ((send_object < 0) || (send_object > Num_Objects) || (recv_object < 0) || (recv_object > Num_Objects)) {
00549 continue;
00550 }
00551
00552 send_cluster = (&Object_Data[send_object])->cluster;
00553 recv_cluster = (&Object_Data[recv_object])->cluster;
00554
00555 if (send_cluster == recv_cluster) {
00556 (&Object_Data[send_object])->num_lan_msgs += com_data->messages;
00557 } else {
00558 (&Object_Data[send_object])->num_wan_msgs += com_data->messages;
00559 }
00560 } else if (com_data->receiver.get_type() == LD_OBJLIST_MSG) {
00561 send_object = stats->getHash (com_data->sender);
00562
00563 if ((send_object < 0) || (send_object > Num_Objects)) {
00564 continue;
00565 }
00566
00567 send_cluster = (&Object_Data[send_object])->cluster;
00568
00569 recv_objects = com_data->receiver.get_destObjs (num_objects);
00570
00571 for (j = 0; j < num_objects; j++) {
00572 recv_object = stats->getHash (recv_objects[j]);
00573
00574 if ((recv_object < 0) || (recv_object > Num_Objects)) {
00575 continue;
00576 }
00577
00578 recv_cluster = (&Object_Data[recv_object])->cluster;
00579
00580 if (send_cluster == recv_cluster) {
00581 (&Object_Data[send_object])->num_lan_msgs += com_data->messages;
00582 } else {
00583 (&Object_Data[send_object])->num_wan_msgs += com_data->messages;
00584 }
00585 }
00586 }
00587 }
00588 }
00589
00590
00591
00592
00593
00594
00595 void GridHybridLB::Map_NonMigratable_Objects_To_PEs ()
00596 {
00597 int i;
00598
00599
00600 for (i = 0; i < Num_Objects; i++) {
00601 if (!((&Object_Data[i])->migratable)) {
00602 if (_lb_args.debug() > 1) {
00603 CkPrintf ("[%d] GridHybridLB identifies object %d as non-migratable.\n", CkMyPe(), i);
00604 }
00605
00606 Assign_Object_To_PE (i, (&Object_Data[i])->from_pe);
00607 }
00608 }
00609 }
00610
00611
00612
00613
00614
00615
00616 void GridHybridLB::Map_Migratable_Objects_To_PEs (int cluster)
00617 {
00618 int target_object;
00619 int target_pe;
00620
00621
00622 while (1) {
00623 target_object = Find_Maximum_Object (cluster);
00624 target_pe = Find_Minimum_PE (cluster);
00625
00626 if ((target_object == -1) || (target_pe == -1)) {
00627 break;
00628 }
00629
00630 Assign_Object_To_PE (target_object, target_pe);
00631 }
00632 }
00633
00634
00635
00636
00637
00638
00639
00640
00641
00642
00643
00644 int GridHybridLB::Find_Maximum_Object (int cluster)
00645 {
00646 int max_index;
00647 int max_load_index;
00648 double max_load;
00649 int max_wan_msgs_index;
00650 int max_wan_msgs;
00651 double load_tolerance;
00652 int i;
00653
00654
00655 max_index = -1;
00656
00657 max_load_index = -1;
00658 max_load = -1.0;
00659
00660 max_wan_msgs_index = -1;
00661 max_wan_msgs = -1;
00662
00663 for (i = 0; i < Num_Objects; i++) {
00664 if (((&Object_Data[i])->cluster == cluster) && ((&Object_Data[i])->to_pe == -1)) {
00665 if ((&Object_Data[i])->load > max_load) {
00666 max_load_index = i;
00667 max_load = (&Object_Data[i])->load;
00668 }
00669 if ((&Object_Data[i])->num_wan_msgs > max_wan_msgs) {
00670 max_wan_msgs_index = i;
00671 max_wan_msgs = (&Object_Data[i])->num_wan_msgs;
00672 }
00673 }
00674 }
00675
00676 if (max_load_index < 0) {
00677 return (max_load_index);
00678 }
00679
00680 if ((&Object_Data[max_load_index])->num_wan_msgs >= (&Object_Data[max_wan_msgs_index])->num_wan_msgs) {
00681 return (max_load_index);
00682 }
00683
00684 load_tolerance = (&Object_Data[max_load_index])->load * CK_LDB_GridHybridLB_Load_Tolerance;
00685
00686 max_index = max_load_index;
00687
00688 for (i = 0; i < Num_Objects; i++) {
00689 if (((&Object_Data[i])->cluster == cluster) && ((&Object_Data[i])->to_pe == -1)) {
00690 if (i != max_load_index) {
00691 if (fabs ((&Object_Data[max_load_index])->load - (&Object_Data[i])->load) <= load_tolerance) {
00692 if ((&Object_Data[i])->num_wan_msgs > (&Object_Data[max_index])->num_wan_msgs) {
00693 max_index = i;
00694 }
00695 }
00696 }
00697 }
00698 }
00699
00700 return (max_index);
00701 }
00702
00703
00704
00705
00706
00707
00708
00709
00710
00711
00712
00713
00714
00715
00716
00717
00718
00719 int GridHybridLB::Find_Minimum_PE (int cluster)
00720 {
00721 if ((CK_LDB_GridHybridLB_Mode == 0) || (CK_LDB_GridHybridLB_Mode == 1)) {
00722 int min_index;
00723 int min_objs;
00724 int i;
00725
00726
00727 min_index = -1;
00728 min_objs = MAXINT;
00729
00730 for (i = 0; i < Num_PEs; i++) {
00731 if (((&PE_Data[i])->available) && ((&PE_Data[i])->cluster == cluster)) {
00732 if ((&PE_Data[i])->num_objs < min_objs) {
00733 min_index = i;
00734 min_objs = (&PE_Data[i])->num_objs;
00735 } else if (((&PE_Data[i])->num_objs == min_objs) &&
00736 ((&PE_Data[i])->num_wan_objs < (&PE_Data[min_index])->num_wan_objs)) {
00737 min_index = i;
00738 } else if (((&PE_Data[i])->num_objs == min_objs) &&
00739 ((&PE_Data[i])->num_wan_objs == (&PE_Data[min_index])->num_wan_objs) &&
00740 ((&PE_Data[i])->num_wan_msgs < (&PE_Data[min_index])->num_wan_msgs)) {
00741 min_index = i;
00742 } else if (((&PE_Data[i])->num_objs == min_objs) &&
00743 ((&PE_Data[i])->num_wan_objs == (&PE_Data[min_index])->num_wan_objs) &&
00744 ((&PE_Data[i])->num_wan_msgs == (&PE_Data[min_index])->num_wan_msgs) &&
00745 ((&PE_Data[i])->scaled_load < (&PE_Data[min_index])->scaled_load)) {
00746 min_index = i;
00747 }
00748 }
00749 }
00750
00751 return (min_index);
00752 } else if ((CK_LDB_GridHybridLB_Mode == 2) || (CK_LDB_GridHybridLB_Mode == 3)) {
00753 int min_index;
00754 int min_load_index;
00755 double min_scaled_load;
00756 int min_wan_msgs_index;
00757 int min_wan_msgs;
00758 double load_tolerance;
00759 int i;
00760
00761
00762 min_index = -1;
00763
00764 min_load_index = -1;
00765 min_scaled_load = MAXDOUBLE;
00766
00767 min_wan_msgs_index = -1;
00768 min_wan_msgs = MAXINT;
00769
00770 for (i = 0; i < Num_PEs; i++) {
00771 if (((&PE_Data[i])->available) && ((&PE_Data[i])->cluster == cluster)) {
00772 if ((&PE_Data[i])->scaled_load < min_scaled_load) {
00773 min_load_index = i;
00774 min_scaled_load = (&PE_Data[i])->scaled_load;
00775 }
00776 if ((&PE_Data[i])->num_wan_msgs < min_wan_msgs) {
00777 min_wan_msgs_index = i;
00778 min_wan_msgs = (&PE_Data[i])->num_wan_msgs;
00779 }
00780 }
00781 }
00782
00783
00784 if (min_load_index < 0) {
00785 return (min_load_index);
00786 }
00787
00788
00789
00790
00791 if ((&PE_Data[min_load_index])->num_wan_msgs <= (&PE_Data[min_wan_msgs_index])->num_wan_msgs) {
00792 return (min_load_index);
00793 }
00794
00795
00796
00797
00798
00799 load_tolerance = (&PE_Data[min_load_index])->scaled_load * CK_LDB_GridHybridLB_Load_Tolerance;
00800
00801 min_index = min_load_index;
00802
00803 for (i = 0; i < Num_PEs; i++) {
00804 if (((&PE_Data[i])->available) && ((&PE_Data[i])->cluster == cluster)) {
00805 if (i != min_load_index) {
00806 if (fabs ((&PE_Data[i])->scaled_load - (&PE_Data[min_load_index])->scaled_load) <= load_tolerance) {
00807 if ((&PE_Data[i])->num_wan_msgs < (&PE_Data[min_index])->num_wan_msgs) {
00808 min_index = i;
00809 }
00810 }
00811 }
00812 }
00813 }
00814
00815 return (min_index);
00816 } else {
00817 if (_lb_args.debug() > 0) {
00818 CkPrintf ("[%d] GridHybridLB was told to use bad mode (%d).\n", CkMyPe(), CK_LDB_GridHybridLB_Mode);
00819 }
00820 return (-1);
00821 }
00822 }
00823
00824
00825
00826
00827
00828
00829
00830
00831
00832 void GridHybridLB::Assign_Object_To_PE (int target_object, int target_pe)
00833 {
00834 (&Object_Data[target_object])->to_pe = target_pe;
00835
00836 (&PE_Data[target_pe])->num_objs += 1;
00837
00838 if ((&Object_Data[target_object])->num_lan_msgs > 0) {
00839 (&PE_Data[target_pe])->num_lan_objs += 1;
00840 (&PE_Data[target_pe])->num_lan_msgs += (&Object_Data[target_object])->num_lan_msgs;
00841 }
00842
00843 if ((&Object_Data[target_object])->num_wan_msgs > 0) {
00844 (&PE_Data[target_pe])->num_wan_objs += 1;
00845 (&PE_Data[target_pe])->num_wan_msgs += (&Object_Data[target_object])->num_wan_msgs;
00846 }
00847
00848 (&PE_Data[target_pe])->scaled_load += (&Object_Data[target_object])->load / (&PE_Data[target_pe])->relative_speed;
00849 }
00850
00851
00852
00853
00854
00855
00856
00857 void GridHybridLB::work (LDStats *stats)
00858 {
00859 int i;
00860
00861
00862 if (_lb_args.debug() > 0) {
00863 CkPrintf ("[%d] GridHybridLB is working (mode=%d, background load=%d, load tolerance=%f).\n", CkMyPe(), CK_LDB_GridHybridLB_Mode, CK_LDB_GridHybridLB_Background_Load, CK_LDB_GridHybridLB_Load_Tolerance);
00864 }
00865
00866
00867 stats->makeCommHash ();
00868
00869
00870 Num_PEs = stats->nprocs();
00871 Num_Objects = stats->n_objs;
00872
00873 if (_lb_args.debug() > 0) {
00874 CkPrintf ("[%d] GridHybridLB is examining %d PEs and %d objects.\n", CkMyPe(), Num_PEs, Num_Objects);
00875 }
00876
00877
00878 Initialize_PE_Data (stats);
00879
00880
00881 if (Available_PE_Count() < 1) {
00882 if (_lb_args.debug() > 0) {
00883 CkPrintf ("[%d] GridHybridLB finds no available PEs -- no balancing done.\n", CkMyPe());
00884 }
00885
00886 delete [] PE_Data;
00887
00888 return;
00889 }
00890
00891
00892
00893 Num_Clusters = Compute_Number_Of_Clusters ();
00894 if (Num_Clusters < 1) {
00895 if (_lb_args.debug() > 0) {
00896 CkPrintf ("[%d] GridHybridLB finds incomplete PE cluster map -- no balancing done.\n", CkMyPe());
00897 }
00898
00899 delete [] PE_Data;
00900
00901 return;
00902 }
00903
00904 if (_lb_args.debug() > 0) {
00905 CkPrintf ("[%d] GridHybridLB finds %d clusters.\n", CkMyPe(), Num_Clusters);
00906 }
00907
00908
00909 Initialize_Object_Data (stats);
00910
00911
00912 Initialize_Cluster_Data ();
00913
00914
00915 Partition_Objects_Into_Clusters (stats);
00916
00917
00918 Examine_InterObject_Messages (stats);
00919
00920
00921 Map_NonMigratable_Objects_To_PEs ();
00922
00923
00924 for (i = 0; i < Num_Clusters; i++) {
00925 Map_Migratable_Objects_To_PEs (i);
00926 }
00927
00928
00929 for (i = 0; i < Num_Objects; i++) {
00930 stats->to_proc[i] = (&Object_Data[i])->to_pe;
00931
00932 if (_lb_args.debug() > 2) {
00933 CkPrintf ("[%d] GridHybridLB migrates object %d from PE %d to PE %d.\n", CkMyPe(), i, stats->from_proc[i], stats->to_proc[i]);
00934 } else if (_lb_args.debug() > 1) {
00935 if (stats->to_proc[i] != stats->from_proc[i]) {
00936 CkPrintf ("[%d] GridHybridLB migrates object %d from PE %d to PE %d.\n", CkMyPe(), i, stats->from_proc[i], stats->to_proc[i]);
00937 }
00938 }
00939 }
00940
00941
00942 delete [] Cluster_Data;
00943 delete [] Object_Data;
00944 delete [] PE_Data;
00945 }
00946
00947 #include "GridHybridLB.def.h"