Add SFC based Multistep load balancer. (#175)
* Add SFC based Multistep load balancer.

* Ignore empty treepieces in the load balancers.

Also some white space cleanup.

* Refactored printing of debug information among all the geometry-based load balancers.

* SFC load balancer sfcPartition(): stop when we run out of TreePieces.

* SFC load balance: add background loads for a more even distribution of chares.

Also a little cleanup of MultistepLB_notopo.
trquinn authored Jul 26, 2024
1 parent 702eec5 commit a423605
Showing 11 changed files with 546 additions and 281 deletions.
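In outline, the new balancer works as follows: each migratable TreePiece contributes its centroid and a measured load; centroids are converted to space-filling-curve keys, the pieces are sorted by key, and the sorted list is cut into contiguous chunks of roughly equal load, one chunk per processor. The sketch below shows the same pipeline in a self-contained form; ToyPiece, mortonKey, and partitionByKey are illustrative stand-ins, a plain Morton key replaces ChaNGa's SFC::generateKey(), and a simple cumulative-load cut stands in for the error-minimizing cut in sfcPartition().

#include <algorithm>
#include <cstdint>
#include <vector>

struct ToyPiece {
    double x, y, z;    // centroid, assumed already scaled to [0, 1)
    double load;       // measured cost of this piece
    uint64_t key;      // space-filling-curve key
    int proc;          // assigned processor
};

// Interleave the top 21 bits of each scaled coordinate into a Morton key.
static uint64_t mortonKey(double x, double y, double z) {
    uint64_t key = 0;
    uint32_t ix = (uint32_t)(x * (1u << 21));
    uint32_t iy = (uint32_t)(y * (1u << 21));
    uint32_t iz = (uint32_t)(z * (1u << 21));
    for (int b = 20; b >= 0; b--) {
        key = (key << 3)
            | ((uint64_t)((ix >> b) & 1) << 2)
            | ((uint64_t)((iy >> b) & 1) << 1)
            |  (uint64_t)((iz >> b) & 1);
    }
    return key;
}

// Sort pieces along the curve, then cut the sorted list into nProcs
// contiguous chunks of roughly equal total load.
void partitionByKey(std::vector<ToyPiece> &pieces, int nProcs) {
    double total = 0.0;
    for (ToyPiece &p : pieces) {
        p.key = mortonKey(p.x, p.y, p.z);
        total += p.load;
    }
    std::sort(pieces.begin(), pieces.end(),
              [](const ToyPiece &a, const ToyPiece &b) { return a.key < b.key; });

    double seen = 0.0;
    int proc = 0;
    for (ToyPiece &p : pieces) {
        // advance to the next processor once its share of the load is full
        while (proc < nProcs - 1 && seen >= (proc + 1) * total / nProcs)
            proc++;
        p.proc = proc;
        seen += p.load;
    }
}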
2 changes: 1 addition & 1 deletion Makefile.in
@@ -111,7 +111,7 @@ cache_lib_path := @CHARM_PATH@/tmp/libs/ck-libs/cache
threadsafe_ht_path := $(cache_lib_path)/threadsafe_hashtable

# ------- Modules to build ----------------------------------------------------
-changa_modules := $(strip MultistepLB MultistepLB_notopo \
+changa_modules := $(strip MultistepLB MultistepLB_SFC MultistepLB_notopo \
MultistepNodeLB_notopo Orb3dLB Orb3dLB_notopo HierarchOrbLB)

charm_modules := $(strip CkCache CkIO CkMulticast RefineLB \
10 changes: 10 additions & 0 deletions MultistepLB_SFC.ci
@@ -0,0 +1,10 @@
module MultistepLB_SFC {

    extern module CentralLB;
    initnode void lbinit(void);

    group [migratable] MultistepLB_SFC : CentralLB {
        entry void MultistepLB_SFC(const CkLBOptions &);
    };

};
285 changes: 285 additions & 0 deletions MultistepLB_SFC.cpp
@@ -0,0 +1,285 @@
#include <charm++.h>
#include "MultistepLB_SFC.h"
#include "ParallelGravity.h"
#include "Vector3D.h"
#include "formatted_string.h"

CkpvExtern(int, _lb_obj_index);
using namespace std;

#if CHARM_VERSION > 61002
static void lbinit()
{
    LBRegisterBalancer<MultistepLB_SFC>("MultistepLB_SFC",
        "Works best with multistepped runs; uses SFC distribution");
}
#else
CreateLBFunc_Def(MultistepLB_SFC,
                 "Works best with multistepped runs; uses SFC distribution");
#endif

void MultistepLB_SFC::init() {
    lbname = "MultistepLB_SFC";
    if (CkpvAccess(_lb_obj_index) == -1)
        CkpvAccess(_lb_obj_index) = LBRegisterObjUserData(sizeof(TaggedVector3D));
}


MultistepLB_SFC::MultistepLB_SFC(const CkLBOptions &opt): CBase_MultistepLB_SFC(opt)
{
    init();
    if (CkMyPe() == 0){
        CkPrintf("[%d] MultistepLB_SFC created\n",CkMyPe());
    }
}

bool MultistepLB_SFC::QueryBalanceNow(int step){
    if(CkMyPe() == 0) CkPrintf("LB_SFC: Step %d\n", step);
    return true;
}

/// @brief Implement load balancing: store loads and determine active
/// processors and objects, sort by SFC, then divide up among processors.
/// @param stats The Load Balancer statistics object.
void MultistepLB_SFC::work(BaseLB::LDStats* stats)
{
#if CMK_LBDB_ON
    // find active objects - mark the inactive ones as non-migratable
    const auto num_objs = stats->objData.size();

    if(_lb_args.debug() >= 2 && step() > 0) {
        // Write out "particle file" of measured load balance information
        auto achFileName = make_formatted_string("lb_a.%d.sim", step()-1);
        write_LB_particles(stats, achFileName.c_str(), true);
    }

    int numActiveObjects = 0;
    int numInactiveObjects = 0;
    int minActiveProc = INT_MAX;
    int maxActiveProc = 0;

    for(int i = 0; i < num_objs; i++){
        stats->to_proc[i] = stats->from_proc[i];
    }

    for(int i = 0; i < num_objs; i++){
        if (!stats->objData[i].migratable) continue;

        LDObjData &odata = stats->objData[i];
        TaggedVector3D* udata = (TaggedVector3D *)odata.getUserData(CkpvAccess(_lb_obj_index));

        if(udata->myNumParticles == 0){ // ignore pieces with no particles
            stats->objData[i].migratable = 0;
            stats->n_migrateobjs--;
            continue;
        }
        if(udata->numActiveParticles == 0){
            numInactiveObjects++;
        }
        else{
            numActiveObjects++;
            if(minActiveProc > stats->from_proc[i])
                minActiveProc = stats->from_proc[i];
            if(maxActiveProc < stats->from_proc[i])
                maxActiveProc = stats->from_proc[i];
        }
    }
    CkPrintf("numActiveObjects: %d, numInactiveObjects: %d\n", numActiveObjects,
             numInactiveObjects);
    CkPrintf("active PROC range: %d to %d\n", minActiveProc, maxActiveProc);
    if(numActiveObjects < 0.1*numInactiveObjects) {
        // only a small number of active objects, only migrate them
        for(int i = 0; i < stats->objData.size(); i++){
            if (!stats->objData[i].migratable) continue;

            LDObjData &odata = stats->objData[i];
            TaggedVector3D* udata =
                (TaggedVector3D *)odata.getUserData(CkpvAccess(_lb_obj_index));
            if(udata->numActiveParticles == 0) {
                stats->objData[i].migratable = 0;
                stats->n_migrateobjs--;
            }
        }
    }
    else {
        CkPrintf("Migrating all: numActiveObjects: %d, numInactiveObjects: %d\n",
                 numActiveObjects, numInactiveObjects);
    }

    // let the strategy take over on this modified instrumented data and processor information
    work2(stats);
#endif //CMK_LBDB_ON
}
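As a minimal stand-alone sketch of the same 10% heuristic used above, detached from the Charm++ LDStats and TaggedVector3D types (Piece and freezeInactivePieces are hypothetical names, not ChaNGa code): when the active pieces number fewer than a tenth of the inactive ones, the inactive pieces are pinned in place and only the active work is considered for migration.

#include <vector>

// Hypothetical stand-in for one tree piece's load-balancing record.
struct Piece {
    int numActiveParticles;  // particles active on this rung
    bool migratable;         // may the balancer move this piece?
};

// Freeze inactive pieces when active ones are sparse (< 10% of inactive),
// mirroring the heuristic in MultistepLB_SFC::work().
void freezeInactivePieces(std::vector<Piece> &pieces) {
    int nActive = 0, nInactive = 0;
    for (const Piece &p : pieces) {
        if (p.numActiveParticles > 0) nActive++;
        else nInactive++;
    }
    if (nActive < 0.1 * nInactive) {
        for (Piece &p : pieces)
            if (p.numActiveParticles == 0)
                p.migratable = false;
    }
}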

/// @brief SFC load balance.
void MultistepLB_SFC::work2(BaseLB::LDStats *stats){
    const int numobjs = stats->objData.size();
    const int nmig = stats->n_migrateobjs;

    // This data structure is used by the SFC strategy to balance objects.
    // It is NOT indexed by tree piece index; there are as many entries in
    // it as there are migratable (active) tree pieces.
    vector<SFCObject> tp_array;
    tp_array.resize(nmig);

    if (_lb_args.debug()>=2) {
        CkPrintf("[work2] ready tp_array data structure\n");
    }

    int numProcessed = 0;

    double dBgLoad = 0.0;
    for(int i = 0; i < stats->nprocs(); i++){
        dBgLoad += stats->procs[i].bg_walltime;
    }
    dBgLoad /= numobjs;
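    // The total background walltime measured across all processors is spread
    // evenly over the objects: with illustrative numbers, 4 processors each
    // reporting 2 s of background time and 100 objects gives
    // dBgLoad = 8 s / 100 = 0.08 s, which is added to each migratable
    // piece's measured wallTime below.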

    dTotalLoad = 0.0;
    for(int i = 0; i < numobjs; i++){
        if(!stats->objData[i].migratable) continue;

        float load;
        LDObjData &odata = stats->objData[i];
        TaggedVector3D* udata = (TaggedVector3D *)odata.getUserData(CkpvAccess(_lb_obj_index));
        if(step() == 0){ // no load information, balance by particle numbers
            load = udata->myNumParticles;
        }
        else{
            // give each piece a portion of the background load
            load = stats->objData[i].wallTime + dBgLoad;
        }

        tp_array[numProcessed] = SFCObject(i, load);
        tp_array[numProcessed].centroid = udata->vec;
        numProcessed++;
        dTotalLoad += load;
    }

    if(verbosity > 0)
        CkPrintf("Avg active load %g; Avg bg load %g\n", dTotalLoad/numobjs,
                 dBgLoad);

    CkAssert(numProcessed==nmig);

    sfcPrepare(tp_array, nmig, stats);
    sfcPartition(stats->nprocs(), tp_array, stats);

    // refine(stats, numobjs);
    Orb_PrintLBStats(stats, numobjs);

    if(_lb_args.debug() >= 2) {
        // Write out "particle file" of load balance information
        auto achFileName = make_formatted_string("lb.%d.sim", step());
        write_LB_particles(stats, achFileName.c_str(), false);
    }
}

/// @brief Prepare structures for the SFC partition.
/// @param tp_array Reference to Vector of Objects representing TreePieces.
/// @param nObjs Number of tree pieces to partition.
/// @param stats Data from the load balancing framework.
/// @param node_partition Are we partitioning on nodes.
void MultistepLB_SFC::sfcPrepare(vector<SFCObject> &tp_array,
                                 int nObjs,
                                 BaseLB::LDStats *stats,
                                 bool node_partition){

    OrientedBox<float> boundingBox;
    int nmig = stats->n_migrateobjs;
    if(dMaxBalance < 1.0)
        dMaxBalance = 1.0;

    // If using the node-based partition, then maxPieceProc is the total
    // number of migratable objects / the total number of nodes.
    if (node_partition) {
        maxPieceProc = dMaxBalance * nmig / CkNumNodes();
    } else {
        maxPieceProc = dMaxBalance*nmig/stats->nprocs();
    }

    if(maxPieceProc < 1.0)
        maxPieceProc = 1.01;

    CkAssert(tp_array.size() == nObjs);

    mapping = &stats->to_proc;
    from = &stats->from_proc;

    CkPrintf("[LB_SFC] sorting\n");
    for(int i = 0; i < nObjs; i++)
        boundingBox.grow(tp_array[i].centroid);

    // N.B. code below from TreePiece::assignKeys().
    // Refactoring is a possibility.
    // get longest axis
    Vector3D<float> bsize = boundingBox.size();
    float max = (bsize.x > bsize.y) ? bsize.x : bsize.y;
    max = (max > bsize.z) ? max : bsize.z;
    //
    // Make the bounding box cubical.
    //
    Vector3D<float> bcenter = boundingBox.center();
    // The magic number below is approximately 2^(-19)
    const float fEps = 1.0 + 1.91e-6; // slop to ensure keys fall
                                      // between 0 and 1.
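    // Concretely, 2^(-19) = 1/524288 ~ 1.907e-6, which matches the 1.91e-6
    // slop above: the cubical box is padded by about two parts per million
    // relative to the particle extent, so every centroid maps strictly
    // inside the unit cube when SFC::generateKey() normalizes positions
    // against boundingBox.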
    bsize = Vector3D<float>(fEps*0.5*max);
    boundingBox = OrientedBox<float>(bcenter-bsize, bcenter+bsize);
    if(verbosity > 1)
        ckout << "TreePiece: Bounding box now: " << boundingBox << endl;

    for(unsigned int i = 0; i < nObjs; ++i) {
        tp_array[i].key = SFC::generateKey(tp_array[i].centroid, boundingBox);
    }
    sort(tp_array.begin(),tp_array.end());
}

/// @brief Partition treepieces among processors by
/// dividing the SFC as evenly as possible.
/// @param nprocs Number of processors over which to partition the
/// pieces. N.B. if node_partition is true, then this is the number of nodes.
/// @param tp Vector of TreePiece data.
/// @param stats Load balance data
void MultistepLB_SFC::sfcPartition(int nProcs, vector<SFCObject> & tp,
                                   BaseLB::LDStats *stats,
                                   bool node_partition){

    double loadPrev = 0.0;  // load on all lower processors
    int iCurrPiece = 0;     // Piece under consideration
    const int nPieces = tp.size();
    for (int iProc = 0; iProc < nProcs && iCurrPiece < nPieces; iProc++) {
        if (!stats->procs[iProc].available)
            continue;
        // always assign one piece to a processor
        SFCObject &oSFC = tp[iCurrPiece];
        (*mapping)[oSFC.lbindex] = iProc;
        double loadCurr = oSFC.load;
        iCurrPiece++;
        int nCurrPiece = 1; // number of pieces on this processor
        double loadTarget = (iProc+1)*dTotalLoad/nProcs;
        double dLoadError = fabs(loadPrev + loadCurr - loadTarget);

        while ((nCurrPiece < maxPieceProc)
               && iCurrPiece < nPieces  // check bounds before indexing tp
               && fabs(tp[iCurrPiece].load + loadPrev + loadCurr - loadTarget)
                  < dLoadError) {  // add pieces to this
                                   // processor to get the
                                   // closest to the target
                                   // load
            oSFC = tp[iCurrPiece];
            loadCurr += oSFC.load;
            (*mapping)[oSFC.lbindex] = iProc;
            dLoadError = fabs(loadPrev + loadCurr - loadTarget);
            iCurrPiece++;
            nCurrPiece++;
        }
        loadPrev += loadCurr;
    }
    CkAssert(iCurrPiece == tp.size());
}
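To make the cut rule above concrete, here is a compact, self-contained sketch of the same greedy criterion on plain numbers; greedyCuts and its toy loads are illustrative only, and the maxPieceProc cap and the processor-availability check are omitted for brevity. Each processor takes at least one piece, then keeps taking the next piece on the curve while doing so brings the cumulative load closer to its target share (iProc+1)*dTotalLoad/nProcs.

#include <cmath>
#include <cstdio>
#include <vector>

// Assign consecutive (SFC-sorted) loads to nProcs bins with the same greedy
// rule as sfcPartition(): take the next piece while it reduces the distance
// to the running target (p+1)*total/nProcs.
std::vector<int> greedyCuts(const std::vector<double> &loads, int nProcs) {
    double total = 0.0;
    for (double l : loads) total += l;

    std::vector<int> owner(loads.size(), nProcs - 1);
    double loadPrev = 0.0;
    size_t i = 0;
    for (int p = 0; p < nProcs && i < loads.size(); p++) {
        double target = (p + 1) * total / nProcs;
        double curr = loads[i];      // always take at least one piece
        owner[i++] = p;
        double err = std::fabs(loadPrev + curr - target);
        while (i < loads.size()
               && std::fabs(loads[i] + loadPrev + curr - target) < err) {
            curr += loads[i];
            owner[i++] = p;
            err = std::fabs(loadPrev + curr - target);
        }
        loadPrev += curr;
    }
    return owner;
}

int main() {
    // Loads 3,1,1,1,2,2 over 2 processors: the targets are 5 and 10, so the
    // first processor gets {3,1,1} (load 5) and the second gets {1,2,2}.
    std::vector<double> loads = {3, 1, 1, 1, 2, 2};
    for (int o : greedyCuts(loads, 2)) std::printf("%d ", o);
    std::printf("\n");   // prints: 0 0 0 1 1 1
}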

void MultistepLB_SFC::pup(PUP::er &p){
    CBase_MultistepLB_SFC::pup(p);
}

#include "MultistepLB_SFC.def.h"
74 changes: 74 additions & 0 deletions MultistepLB_SFC.h
@@ -0,0 +1,74 @@
#ifndef _MULTISTEPLB_SFC_H_
#define _MULTISTEPLB_SFC_H_

#include "MultistepLB_SFC.decl.h"
#include "Vector3D.h"
#include "cosmoType.h"
#include "SFC.h"
#include "CentralLB.h"

void Orb_PrintLBStats(BaseLB::LDStats *stats, int numobjs);
void write_LB_particles(BaseLB::LDStats* stats, const char *achFileName, bool bFrom);

/// @brief Multistep load balancer using Space Filling Curve
///
/// This balancer recognizes different "phases" (called rungs in other
/// parts of the code), and uses loads based on measurements of the
/// previous calculation at the same phase. For large phases (i.e.,
/// when many particles are active), the TreePieces are divided among
/// the processors using a Space Filling Curve based on the centroids
/// of the TreePieces.
///

class MultistepLB_SFC : public CBase_MultistepLB_SFC {
private:
    void init();
    bool QueryBalanceNow(int step);

    decltype(BaseLB::LDStats::to_proc) *mapping;
    decltype(BaseLB::LDStats::from_proc) *from;
    /// total computational cost to be balanced
    double dTotalLoad;
    /// Maximum number of pieces per processor
    double maxPieceProc;

public:
    MultistepLB_SFC(const CkLBOptions &);
    MultistepLB_SFC(CkMigrateMessage *m) : CBase_MultistepLB_SFC(m) {
        init();
    }

    class SFCObject
    {
    public:
        /// index into LB stats->objData
        int lbindex;
        /// Spatial location of TreePiece
        Vector3D<cosmoType> centroid;
        SFC::Key key;
        /// computational cost of this object
        double load;

        SFCObject() : lbindex(-1), load(0) {}
        SFCObject(int _lbindex, double _load) :
            lbindex(_lbindex),
            load(_load)
        {
        }
        bool operator<(const SFCObject &o) const{
            return key < o.key;
        }
    };

    void work(BaseLB::LDStats* stats);
    void work2(BaseLB::LDStats* stats);
    void sfcPrepare(std::vector<SFCObject> &tp_array,
                    int nObjs, BaseLB::LDStats * stats,
                    bool node_partition=false);
    void sfcPartition(int nProcs, std::vector<SFCObject> & tp,
                      BaseLB::LDStats *stats, bool node_partition=false);
    void pup(PUP::er &p);
};


#endif /* _MULTISTEPLB_SFC_H_ */