-
Notifications
You must be signed in to change notification settings - Fork 29
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add SFC based Multistep load balancer. (#175)
* Add SFC based Multistep load balancer. * Ignore empty treepieces in the load balancers. Also some white space cleanup. * Refactored printing debug information among all the geometry based load balancers. * SFC load balancer sfcPartition(): stop when we run out of TreePieces. * SFC load balance: add background loads for a more even distribution of chares. Also a little cleanup of MultistepLB_notopo.
- Loading branch information
Showing
11 changed files
with
546 additions
and
281 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
module MultistepLB_SFC { | ||
|
||
extern module CentralLB; | ||
initnode void lbinit(void); | ||
|
||
group [migratable] MultistepLB_SFC : CentralLB { | ||
entry void MultistepLB_SFC(const CkLBOptions &); | ||
}; | ||
|
||
}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,285 @@ | ||
#include <charm++.h> | ||
#include "MultistepLB_SFC.h" | ||
#include "ParallelGravity.h" | ||
#include "Vector3D.h" | ||
#include "formatted_string.h" | ||
|
||
CkpvExtern(int, _lb_obj_index); | ||
using namespace std; | ||
|
||
#if CHARM_VERSION > 61002 | ||
static void lbinit() | ||
{ | ||
LBRegisterBalancer<MultistepLB_SFC>("MultistepLB_SFC", | ||
"Works best with multistepped runs; uses SFC distribution"); | ||
} | ||
#else | ||
CreateLBFunc_Def(MultistepLB_SFC, | ||
"Works best with multistepped runs; uses SFC distribution"); | ||
#endif | ||
|
||
void MultistepLB_SFC::init() { | ||
lbname = "MultistepLB_SFC"; | ||
if (CkpvAccess(_lb_obj_index) == -1) | ||
CkpvAccess(_lb_obj_index) = LBRegisterObjUserData(sizeof(TaggedVector3D)); | ||
} | ||
|
||
|
||
MultistepLB_SFC::MultistepLB_SFC(const CkLBOptions &opt): CBase_MultistepLB_SFC(opt) | ||
{ | ||
init(); | ||
if (CkMyPe() == 0){ | ||
CkPrintf("[%d] MultistepLB_SFC created\n",CkMyPe()); | ||
} | ||
} | ||
|
||
bool MultistepLB_SFC::QueryBalanceNow(int step){ | ||
if(CkMyPe() == 0) CkPrintf("LB_SFC: Step %d\n", step); | ||
return true; | ||
} | ||
|
||
/// @brief Implement load balancing: store loads and determine active | ||
/// processors and objects, sort by SFC, then divide up among processors. | ||
/// @param stats The Load Balancer statistics object. | ||
void MultistepLB_SFC::work(BaseLB::LDStats* stats) | ||
{ | ||
#if CMK_LBDB_ON | ||
// find active objects - mark the inactive ones as non-migratable | ||
const auto num_objs = stats->objData.size(); | ||
|
||
if(_lb_args.debug() >= 2 && step() > 0) { | ||
// Write out "particle file" of measured load balance information | ||
auto achFileName = make_formatted_string("lb_a.%d.sim", step()-1); | ||
write_LB_particles(stats, achFileName.c_str(), true); | ||
} | ||
|
||
int numActiveObjects = 0; | ||
int numInactiveObjects = 0; | ||
int minActiveProc = INT_MAX; | ||
int maxActiveProc = 0; | ||
|
||
for(int i = 0; i < num_objs; i++){ | ||
stats->to_proc[i] = stats->from_proc[i]; | ||
} | ||
|
||
for(int i = 0; i < num_objs; i++){ | ||
if (!stats->objData[i].migratable) continue; | ||
|
||
LDObjData &odata = stats->objData[i]; | ||
TaggedVector3D* udata = (TaggedVector3D *)odata.getUserData(CkpvAccess(_lb_obj_index)); | ||
|
||
if(udata->myNumParticles == 0){ // ignore pieces with no particles | ||
stats->objData[i].migratable = 0; | ||
stats->n_migrateobjs--; | ||
continue; | ||
} | ||
if(udata->numActiveParticles == 0){ | ||
numInactiveObjects++; | ||
} | ||
else{ | ||
numActiveObjects++; | ||
if(minActiveProc > stats->from_proc[i]) | ||
minActiveProc = stats->from_proc[i]; | ||
if(maxActiveProc < stats->from_proc[i]) | ||
maxActiveProc = stats->from_proc[i]; | ||
} | ||
} | ||
CkPrintf("numActiveObjects: %d, numInactiveObjects: %d\n", numActiveObjects, | ||
numInactiveObjects); | ||
CkPrintf("active PROC range: %d to %d\n", minActiveProc, maxActiveProc); | ||
if(numActiveObjects < 0.1*numInactiveObjects) { | ||
// only a small number of active objects, only migrate them | ||
for(int i = 0; i < stats->objData.size(); i++){ | ||
if (!stats->objData[i].migratable) continue; | ||
|
||
LDObjData &odata = stats->objData[i]; | ||
TaggedVector3D* udata = | ||
(TaggedVector3D *)odata.getUserData(CkpvAccess(_lb_obj_index)); | ||
if(udata->numActiveParticles == 0) { | ||
stats->objData[i].migratable = 0; | ||
stats->n_migrateobjs--; | ||
} | ||
} | ||
} | ||
else { | ||
CkPrintf("Migrating all: numActiveObjects: %d, numInactiveObjects: %d\n", | ||
numActiveObjects, numInactiveObjects); | ||
} | ||
|
||
// let the strategy take over on this modified instrumented data and processor information | ||
work2(stats); | ||
#endif //CMK_LDB_ON | ||
} | ||
|
||
/// @brief SFC load balance. | ||
void MultistepLB_SFC::work2(BaseLB::LDStats *stats){ | ||
const int numobjs = stats->objData.size(); | ||
const int nmig = stats->n_migrateobjs; | ||
|
||
// this data structure is used by the SFC strategy | ||
// to balance objects. it is NOT indexed by tree piece index | ||
// there are as many entries in it as there are | ||
// migratable (active) tree pieces | ||
vector<SFCObject> tp_array; | ||
tp_array.resize(nmig); | ||
|
||
if (_lb_args.debug()>=2) { | ||
CkPrintf("[work2] ready tp_array data structure\n"); | ||
} | ||
|
||
int numProcessed = 0; | ||
|
||
double dBgLoad = 0.0; | ||
for(int i = 0; i < stats->nprocs(); i++){ | ||
dBgLoad += stats->procs[i].bg_walltime; | ||
} | ||
dBgLoad /= numobjs; | ||
|
||
dTotalLoad = 0.0; | ||
for(int i = 0; i < numobjs; i++){ | ||
if(!stats->objData[i].migratable) continue; | ||
|
||
float load; | ||
LDObjData &odata = stats->objData[i]; | ||
TaggedVector3D* udata = (TaggedVector3D *)odata.getUserData(CkpvAccess(_lb_obj_index)); | ||
if(step() == 0){ // no load information, balance by particle numbers | ||
load = udata->myNumParticles; | ||
} | ||
else{ | ||
// give each piece a portion of the background load | ||
load = stats->objData[i].wallTime + dBgLoad; | ||
} | ||
|
||
tp_array[numProcessed] = SFCObject(i, load); | ||
tp_array[numProcessed].centroid = udata->vec; | ||
numProcessed++; | ||
dTotalLoad += load; | ||
} | ||
|
||
if(verbosity > 0) | ||
CkPrintf("Avg active load %g; Avg bg load %g\n", dTotalLoad/numobjs, | ||
dBgLoad); | ||
|
||
CkAssert(numProcessed==nmig); | ||
|
||
sfcPrepare(tp_array, nmig, stats); | ||
sfcPartition(stats->nprocs(),tp_array, stats); | ||
|
||
// refine(stats, numobjs); | ||
Orb_PrintLBStats(stats, numobjs); | ||
|
||
if(_lb_args.debug() >= 2) { | ||
// Write out "particle file" of load balance information | ||
auto achFileName = make_formatted_string("lb.%d.sim", step()); | ||
write_LB_particles(stats, achFileName.c_str(), false); | ||
} | ||
} | ||
|
||
/// @brief Prepare structures for the ORB partition. | ||
/// @param tp_array Reference to Vector of Objects representing TreePieces. | ||
/// @param nObjs Number of tree pieces to partition. | ||
/// @param stats Data from the load balancing framework. | ||
/// @param node_partition Are we partitioning on nodes. | ||
void MultistepLB_SFC::sfcPrepare(vector<SFCObject> &tp_array, | ||
int nObjs, | ||
BaseLB::LDStats *stats, | ||
bool node_partition){ | ||
|
||
OrientedBox<float> boundingBox; | ||
int nmig = stats->n_migrateobjs; | ||
if(dMaxBalance < 1.0) | ||
dMaxBalance = 1.0; | ||
|
||
// If using node based orb partition, then the maxPieceProc is total | ||
// migratable objs / total number of node. | ||
if (node_partition) { | ||
maxPieceProc = dMaxBalance * nmig / CkNumNodes(); | ||
} else { | ||
maxPieceProc = dMaxBalance*nmig/stats->nprocs(); | ||
} | ||
|
||
if(maxPieceProc < 1.0) | ||
maxPieceProc = 1.01; | ||
|
||
CkAssert(tp_array.size() == nObjs); | ||
|
||
mapping = &stats->to_proc; | ||
from = &stats->from_proc; | ||
|
||
CkPrintf("[LB_SFC] sorting\n"); | ||
for(int i = 0; i < nObjs; i++) | ||
boundingBox.grow(tp_array[i].centroid); | ||
|
||
// N.B. code below from TreePiece::assignKeys(). | ||
// Refactoring is a possibility. | ||
// get longest axis | ||
Vector3D<float> bsize = boundingBox.size(); | ||
float max = (bsize.x > bsize.y) ? bsize.x : bsize.y; | ||
max = (max > bsize.z) ? max : bsize.z; | ||
// | ||
// Make the bounding box cubical. | ||
// | ||
Vector3D<float> bcenter = boundingBox.center(); | ||
// The magic number below is approximately 2^(-19) | ||
const float fEps = 1.0 + 1.91e-6; // slop to ensure keys fall | ||
// between 0 and 1. | ||
bsize = Vector3D<float>(fEps*0.5*max); | ||
boundingBox = OrientedBox<float>(bcenter-bsize, bcenter+bsize); | ||
if(verbosity > 1) | ||
ckout << "TreePiece: Bounding box now: " << boundingBox << endl; | ||
|
||
for(unsigned int i = 0; i < nObjs; ++i) { | ||
tp_array[i].key = SFC::generateKey(tp_array[i].centroid, boundingBox); | ||
} | ||
sort(tp_array.begin(),tp_array.end()); | ||
} | ||
|
||
/// @brief Partition treepieces among processors by | ||
/// dividing the SFC as evenly as possible. | ||
/// @param nprocs Number of processors over which to partition the | ||
/// pieces. N.B. if node_partition is true, then this is the number of nodes. | ||
/// @param tp Vector of TreePiece data. | ||
/// @param stats Load balance data | ||
void MultistepLB_SFC::sfcPartition(int nProcs, vector<SFCObject> & tp, | ||
BaseLB::LDStats *stats, | ||
bool node_partition){ | ||
|
||
double loadPrev = 0.0; // load on all lower processors | ||
int iCurrPiece = 0; // Piece under consideration | ||
const int nPieces = tp.size(); | ||
for (int iProc = 0; iProc < nProcs && iCurrPiece < nPieces; iProc++) { | ||
if (!stats->procs[iProc].available) | ||
continue; | ||
// always assign one piece to a processor | ||
SFCObject &oSFC = tp[iCurrPiece]; | ||
(*mapping)[oSFC.lbindex] = iProc; | ||
double loadCurr = oSFC.load; | ||
iCurrPiece++; | ||
int nCurrPiece = 1; // number of pieces on this processor | ||
double loadTarget = (iProc+1)*dTotalLoad/nProcs; | ||
double dLoadError = fabs(loadPrev + loadCurr - loadTarget); | ||
|
||
while ((nCurrPiece < maxPieceProc) | ||
&& fabs(tp[iCurrPiece].load + loadPrev + loadCurr - loadTarget) | ||
< dLoadError | ||
&& iCurrPiece < nPieces) { // add pieces to this | ||
// processor to get the | ||
// closest to the target | ||
// load | ||
oSFC = tp[iCurrPiece]; | ||
loadCurr += oSFC.load; | ||
(*mapping)[oSFC.lbindex] = iProc; | ||
dLoadError = fabs(loadPrev + loadCurr - loadTarget); | ||
iCurrPiece++; | ||
nCurrPiece++; | ||
} | ||
loadPrev += loadCurr; | ||
} | ||
CkAssert(iCurrPiece == tp.size()); | ||
} | ||
|
||
void MultistepLB_SFC::pup(PUP::er &p){ | ||
CBase_MultistepLB_SFC::pup(p); | ||
} | ||
|
||
#include "MultistepLB_SFC.def.h" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
#ifndef _MULTISTEPLB_SFC_H_ | ||
#define _MULTISTEPLB_SFC_H_ | ||
|
||
#include "MultistepLB_SFC.decl.h" | ||
#include "Vector3D.h" | ||
#include "cosmoType.h" | ||
#include "SFC.h" | ||
#include "CentralLB.h" | ||
|
||
void Orb_PrintLBStats(BaseLB::LDStats *stats, int numobjs); | ||
void write_LB_particles(BaseLB::LDStats* stats, const char *achFileName, bool bFrom); | ||
|
||
/// @brief Multistep load balancer using Space Filling Curve | ||
/// | ||
/// This balancer recognizes different "phases" (called rungs in other | ||
/// parts of the code), and uses loads based on measurements of the | ||
/// previous calculation at the same phase. For large phases, (i.e., | ||
/// when many particles are active, the TreePieces are divided among | ||
/// the processors using a Space Filling Curve based on the centroids | ||
/// of the TreePieces. | ||
/// | ||
|
||
class MultistepLB_SFC : public CBase_MultistepLB_SFC { | ||
private: | ||
void init(); | ||
bool QueryBalanceNow(int step); | ||
|
||
decltype(BaseLB::LDStats::to_proc) *mapping; | ||
decltype(BaseLB::LDStats::from_proc) *from; | ||
/// total computational cost to be balanced | ||
double dTotalLoad; | ||
/// Maximum number of pieces per processor | ||
double maxPieceProc; | ||
|
||
public: | ||
MultistepLB_SFC(const CkLBOptions &); | ||
MultistepLB_SFC(CkMigrateMessage *m) : CBase_MultistepLB_SFC(m) { | ||
init(); | ||
} | ||
|
||
class SFCObject | ||
{ | ||
public: | ||
/// index into LB stats->objData | ||
int lbindex; | ||
/// Spacial location of TreePiece | ||
Vector3D<cosmoType> centroid; | ||
SFC::Key key; | ||
/// computational cost of this object | ||
double load; | ||
|
||
SFCObject() : lbindex(-1), load(0) {} | ||
SFCObject(int _lbindex, double _load) : | ||
lbindex(_lbindex), | ||
load(_load) | ||
{ | ||
} | ||
bool operator<(const SFCObject &o) const{ | ||
return key < o.key; | ||
} | ||
}; | ||
|
||
void work(BaseLB::LDStats* stats); | ||
void work2(BaseLB::LDStats* stats); | ||
void sfcPrepare(std::vector<SFCObject> &tp_array, | ||
int nObjs, BaseLB::LDStats * stats, | ||
bool node_partition=false); | ||
void sfcPartition(int nProcs, std::vector<SFCObject> & tp, | ||
BaseLB::LDStats *stats, bool node_partition=false); | ||
void pup(PUP::er &p); | ||
}; | ||
|
||
|
||
#endif /* _MultistepLB_notopo */ |
Oops, something went wrong.