Add SFC based Multistep load balancer. (#175)
* Add SFC based Multistep load balancer.

* Ignore empty treepieces in the load balancers.

Also some white space cleanup.

* Refactored printing of debug information among all the geometry-based load balancers.

* SFC load balancer sfcPartition(): stop when we run out of TreePieces.

* SFC load balance: add background loads for a more even distribution of chares.

Also a little cleanup of MultistepLB_notopo.
trquinn authored Jul 26, 2024
1 parent 702eec5 commit a423605
Showing 11 changed files with 546 additions and 281 deletions.
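In outline, the new balancer works as follows: each migratable TreePiece contributes its centroid and a measured load; centroids are converted to space-filling-curve keys, the pieces are sorted by key, and the sorted list is cut into contiguous chunks of roughly equal load, one chunk per processor. The sketch below shows the same pipeline in a self-contained form; ToyPiece, mortonKey, and partitionByKey are illustrative stand-ins, a plain Morton key replaces ChaNGa's SFC::generateKey(), and a simple cumulative-load cut stands in for the error-minimizing cut in sfcPartition().

#include <algorithm>
#include <cstdint>
#include <vector>

struct ToyPiece {
    double x, y, z;    // centroid, assumed already scaled to [0, 1)
    double load;       // measured cost of this piece
    uint64_t key;      // space-filling-curve key
    int proc;          // assigned processor
};

// Interleave the top 21 bits of each scaled coordinate into a Morton key.
static uint64_t mortonKey(double x, double y, double z) {
    uint64_t key = 0;
    uint32_t ix = (uint32_t)(x * (1u << 21));
    uint32_t iy = (uint32_t)(y * (1u << 21));
    uint32_t iz = (uint32_t)(z * (1u << 21));
    for (int b = 20; b >= 0; b--) {
        key = (key << 3)
            | ((uint64_t)((ix >> b) & 1) << 2)
            | ((uint64_t)((iy >> b) & 1) << 1)
            |  (uint64_t)((iz >> b) & 1);
    }
    return key;
}

// Sort pieces along the curve, then cut the sorted list into nProcs
// contiguous chunks of roughly equal total load.
void partitionByKey(std::vector<ToyPiece> &pieces, int nProcs) {
    double total = 0.0;
    for (ToyPiece &p : pieces) {
        p.key = mortonKey(p.x, p.y, p.z);
        total += p.load;
    }
    std::sort(pieces.begin(), pieces.end(),
              [](const ToyPiece &a, const ToyPiece &b) { return a.key < b.key; });

    double seen = 0.0;
    int proc = 0;
    for (ToyPiece &p : pieces) {
        // advance to the next processor once its share of the load is full
        while (proc < nProcs - 1 && seen >= (proc + 1) * total / nProcs)
            proc++;
        p.proc = proc;
        seen += p.load;
    }
}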
2 changes: 1 addition & 1 deletion Makefile.in
@@ -111,7 +111,7 @@ cache_lib_path := @CHARM_PATH@/tmp/libs/ck-libs/cache
threadsafe_ht_path := $(cache_lib_path)/threadsafe_hashtable

# ------- Modules to build ----------------------------------------------------
-changa_modules := $(strip MultistepLB MultistepLB_notopo \
+changa_modules := $(strip MultistepLB MultistepLB_SFC MultistepLB_notopo \
MultistepNodeLB_notopo Orb3dLB Orb3dLB_notopo HierarchOrbLB)

charm_modules := $(strip CkCache CkIO CkMulticast RefineLB \
10 changes: 10 additions & 0 deletions MultistepLB_SFC.ci
@@ -0,0 +1,10 @@
module MultistepLB_SFC {

    extern module CentralLB;
    initnode void lbinit(void);

    group [migratable] MultistepLB_SFC : CentralLB {
        entry void MultistepLB_SFC(const CkLBOptions &);
    };

};
285 changes: 285 additions & 0 deletions MultistepLB_SFC.cpp
@@ -0,0 +1,285 @@
#include <charm++.h>
#include "MultistepLB_SFC.h"
#include "ParallelGravity.h"
#include "Vector3D.h"
#include "formatted_string.h"

CkpvExtern(int, _lb_obj_index);
using namespace std;

#if CHARM_VERSION > 61002
static void lbinit()
{
    LBRegisterBalancer<MultistepLB_SFC>("MultistepLB_SFC",
        "Works best with multistepped runs; uses SFC distribution");
}
#else
CreateLBFunc_Def(MultistepLB_SFC,
                 "Works best with multistepped runs; uses SFC distribution");
#endif

void MultistepLB_SFC::init() {
    lbname = "MultistepLB_SFC";
    if (CkpvAccess(_lb_obj_index) == -1)
        CkpvAccess(_lb_obj_index) = LBRegisterObjUserData(sizeof(TaggedVector3D));
}


MultistepLB_SFC::MultistepLB_SFC(const CkLBOptions &opt): CBase_MultistepLB_SFC(opt)
{
    init();
    if (CkMyPe() == 0){
        CkPrintf("[%d] MultistepLB_SFC created\n",CkMyPe());
    }
}

bool MultistepLB_SFC::QueryBalanceNow(int step){
    if(CkMyPe() == 0) CkPrintf("LB_SFC: Step %d\n", step);
    return true;
}

/// @brief Implement load balancing: store loads and determine active
/// processors and objects, sort by SFC, then divide up among processors.
/// @param stats The Load Balancer statistics object.
void MultistepLB_SFC::work(BaseLB::LDStats* stats)
{
#if CMK_LBDB_ON
    // find active objects - mark the inactive ones as non-migratable
    const auto num_objs = stats->objData.size();

    if(_lb_args.debug() >= 2 && step() > 0) {
        // Write out "particle file" of measured load balance information
        auto achFileName = make_formatted_string("lb_a.%d.sim", step()-1);
        write_LB_particles(stats, achFileName.c_str(), true);
    }

    int numActiveObjects = 0;
    int numInactiveObjects = 0;
    int minActiveProc = INT_MAX;
    int maxActiveProc = 0;

    for(int i = 0; i < num_objs; i++){
        stats->to_proc[i] = stats->from_proc[i];
    }

    for(int i = 0; i < num_objs; i++){
        if (!stats->objData[i].migratable) continue;

        LDObjData &odata = stats->objData[i];
        TaggedVector3D* udata = (TaggedVector3D *)odata.getUserData(CkpvAccess(_lb_obj_index));

        if(udata->myNumParticles == 0){ // ignore pieces with no particles
            stats->objData[i].migratable = 0;
            stats->n_migrateobjs--;
            continue;
        }
        if(udata->numActiveParticles == 0){
            numInactiveObjects++;
        }
        else{
            numActiveObjects++;
            if(minActiveProc > stats->from_proc[i])
                minActiveProc = stats->from_proc[i];
            if(maxActiveProc < stats->from_proc[i])
                maxActiveProc = stats->from_proc[i];
        }
    }
    CkPrintf("numActiveObjects: %d, numInactiveObjects: %d\n", numActiveObjects,
             numInactiveObjects);
    CkPrintf("active PROC range: %d to %d\n", minActiveProc, maxActiveProc);
    if(numActiveObjects < 0.1*numInactiveObjects) {
        // only a small number of active objects, only migrate them
        for(int i = 0; i < stats->objData.size(); i++){
            if (!stats->objData[i].migratable) continue;

            LDObjData &odata = stats->objData[i];
            TaggedVector3D* udata =
                (TaggedVector3D *)odata.getUserData(CkpvAccess(_lb_obj_index));
            if(udata->numActiveParticles == 0) {
                stats->objData[i].migratable = 0;
                stats->n_migrateobjs--;
            }
        }
    }
    else {
        CkPrintf("Migrating all: numActiveObjects: %d, numInactiveObjects: %d\n",
                 numActiveObjects, numInactiveObjects);
    }

    // let the strategy take over on this modified instrumented data and processor information
    work2(stats);
#endif //CMK_LBDB_ON
}
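As a minimal stand-alone sketch of the same 10% heuristic used above, detached from the Charm++ LDStats and TaggedVector3D types (Piece and freezeInactivePieces are hypothetical names, not ChaNGa code): when the active pieces number fewer than a tenth of the inactive ones, the inactive pieces are pinned in place and only the active work is considered for migration.

#include <vector>

// Hypothetical stand-in for one tree piece's load-balancing record.
struct Piece {
    int numActiveParticles;  // particles active on this rung
    bool migratable;         // may the balancer move this piece?
};

// Freeze inactive pieces when active ones are sparse (< 10% of inactive),
// mirroring the heuristic in MultistepLB_SFC::work().
void freezeInactivePieces(std::vector<Piece> &pieces) {
    int nActive = 0, nInactive = 0;
    for (const Piece &p : pieces) {
        if (p.numActiveParticles > 0) nActive++;
        else nInactive++;
    }
    if (nActive < 0.1 * nInactive) {
        for (Piece &p : pieces)
            if (p.numActiveParticles == 0)
                p.migratable = false;
    }
}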

/// @brief SFC load balance.
void MultistepLB_SFC::work2(BaseLB::LDStats *stats){
    const int numobjs = stats->objData.size();
    const int nmig = stats->n_migrateobjs;

    // This data structure is used by the SFC strategy to balance objects.
    // It is NOT indexed by tree piece index; there are as many entries in
    // it as there are migratable (active) tree pieces.
    vector<SFCObject> tp_array;
    tp_array.resize(nmig);

    if (_lb_args.debug()>=2) {
        CkPrintf("[work2] ready tp_array data structure\n");
    }

    int numProcessed = 0;

    double dBgLoad = 0.0;
    for(int i = 0; i < stats->nprocs(); i++){
        dBgLoad += stats->procs[i].bg_walltime;
    }
    dBgLoad /= numobjs;
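    // The total background walltime measured across all processors is spread
    // evenly over the objects: with illustrative numbers, 4 processors each
    // reporting 2 s of background time and 100 objects gives
    // dBgLoad = 8 s / 100 = 0.08 s, which is added to each migratable
    // piece's measured wallTime below.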

    dTotalLoad = 0.0;
    for(int i = 0; i < numobjs; i++){
        if(!stats->objData[i].migratable) continue;

        float load;
        LDObjData &odata = stats->objData[i];
        TaggedVector3D* udata = (TaggedVector3D *)odata.getUserData(CkpvAccess(_lb_obj_index));
        if(step() == 0){ // no load information, balance by particle numbers
            load = udata->myNumParticles;
        }
        else{
            // give each piece a portion of the background load
            load = stats->objData[i].wallTime + dBgLoad;
        }

        tp_array[numProcessed] = SFCObject(i, load);
        tp_array[numProcessed].centroid = udata->vec;
        numProcessed++;
        dTotalLoad += load;
    }

    if(verbosity > 0)
        CkPrintf("Avg active load %g; Avg bg load %g\n", dTotalLoad/numobjs,
                 dBgLoad);

    CkAssert(numProcessed==nmig);

    sfcPrepare(tp_array, nmig, stats);
    sfcPartition(stats->nprocs(), tp_array, stats);

    // refine(stats, numobjs);
    Orb_PrintLBStats(stats, numobjs);

    if(_lb_args.debug() >= 2) {
        // Write out "particle file" of load balance information
        auto achFileName = make_formatted_string("lb.%d.sim", step());
        write_LB_particles(stats, achFileName.c_str(), false);
    }
}

/// @brief Prepare structures for the SFC partition.
/// @param tp_array Reference to Vector of Objects representing TreePieces.
/// @param nObjs Number of tree pieces to partition.
/// @param stats Data from the load balancing framework.
/// @param node_partition Are we partitioning on nodes.
void MultistepLB_SFC::sfcPrepare(vector<SFCObject> &tp_array,
                                 int nObjs,
                                 BaseLB::LDStats *stats,
                                 bool node_partition){

    OrientedBox<float> boundingBox;
    int nmig = stats->n_migrateobjs;
    if(dMaxBalance < 1.0)
        dMaxBalance = 1.0;

    // If using the node-based partition, then maxPieceProc is the total
    // number of migratable objects / the total number of nodes.
    if (node_partition) {
        maxPieceProc = dMaxBalance * nmig / CkNumNodes();
    } else {
        maxPieceProc = dMaxBalance*nmig/stats->nprocs();
    }

    if(maxPieceProc < 1.0)
        maxPieceProc = 1.01;

    CkAssert(tp_array.size() == nObjs);

    mapping = &stats->to_proc;
    from = &stats->from_proc;

    CkPrintf("[LB_SFC] sorting\n");
    for(int i = 0; i < nObjs; i++)
        boundingBox.grow(tp_array[i].centroid);

    // N.B. code below from TreePiece::assignKeys().
    // Refactoring is a possibility.
    // get longest axis
    Vector3D<float> bsize = boundingBox.size();
    float max = (bsize.x > bsize.y) ? bsize.x : bsize.y;
    max = (max > bsize.z) ? max : bsize.z;
    //
    // Make the bounding box cubical.
    //
    Vector3D<float> bcenter = boundingBox.center();
    // The magic number below is approximately 2^(-19)
    const float fEps = 1.0 + 1.91e-6; // slop to ensure keys fall
                                      // between 0 and 1.
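    // Concretely, 2^(-19) = 1/524288 ~ 1.907e-6, which matches the 1.91e-6
    // slop above: the cubical box is padded by about two parts per million
    // relative to the particle extent, so every centroid maps strictly
    // inside the unit cube when SFC::generateKey() normalizes positions
    // against boundingBox.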
    bsize = Vector3D<float>(fEps*0.5*max);
    boundingBox = OrientedBox<float>(bcenter-bsize, bcenter+bsize);
    if(verbosity > 1)
        ckout << "TreePiece: Bounding box now: " << boundingBox << endl;

    for(unsigned int i = 0; i < nObjs; ++i) {
        tp_array[i].key = SFC::generateKey(tp_array[i].centroid, boundingBox);
    }
    sort(tp_array.begin(),tp_array.end());
}

/// @brief Partition treepieces among processors by
/// dividing the SFC as evenly as possible.
/// @param nprocs Number of processors over which to partition the
/// pieces. N.B. if node_partition is true, then this is the number of nodes.
/// @param tp Vector of TreePiece data.
/// @param stats Load balance data
void MultistepLB_SFC::sfcPartition(int nProcs, vector<SFCObject> & tp,
                                   BaseLB::LDStats *stats,
                                   bool node_partition){

    double loadPrev = 0.0;  // load on all lower processors
    int iCurrPiece = 0;     // Piece under consideration
    const int nPieces = tp.size();
    for (int iProc = 0; iProc < nProcs && iCurrPiece < nPieces; iProc++) {
        if (!stats->procs[iProc].available)
            continue;
        // always assign one piece to a processor
        SFCObject &oSFC = tp[iCurrPiece];
        (*mapping)[oSFC.lbindex] = iProc;
        double loadCurr = oSFC.load;
        iCurrPiece++;
        int nCurrPiece = 1; // number of pieces on this processor
        double loadTarget = (iProc+1)*dTotalLoad/nProcs;
        double dLoadError = fabs(loadPrev + loadCurr - loadTarget);

        while ((nCurrPiece < maxPieceProc)
               && iCurrPiece < nPieces  // check bounds before indexing tp
               && fabs(tp[iCurrPiece].load + loadPrev + loadCurr - loadTarget)
                  < dLoadError) {  // add pieces to this
                                   // processor to get the
                                   // closest to the target
                                   // load
            oSFC = tp[iCurrPiece];
            loadCurr += oSFC.load;
            (*mapping)[oSFC.lbindex] = iProc;
            dLoadError = fabs(loadPrev + loadCurr - loadTarget);
            iCurrPiece++;
            nCurrPiece++;
        }
        loadPrev += loadCurr;
    }
    CkAssert(iCurrPiece == tp.size());
}
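To make the cut rule above concrete, here is a compact, self-contained sketch of the same greedy criterion on plain numbers; greedyCuts and its toy loads are illustrative only, and the maxPieceProc cap and the processor-availability check are omitted for brevity. Each processor takes at least one piece, then keeps taking the next piece on the curve while doing so brings the cumulative load closer to its target share (iProc+1)*dTotalLoad/nProcs.

#include <cmath>
#include <cstdio>
#include <vector>

// Assign consecutive (SFC-sorted) loads to nProcs bins with the same greedy
// rule as sfcPartition(): take the next piece while it reduces the distance
// to the running target (p+1)*total/nProcs.
std::vector<int> greedyCuts(const std::vector<double> &loads, int nProcs) {
    double total = 0.0;
    for (double l : loads) total += l;

    std::vector<int> owner(loads.size(), nProcs - 1);
    double loadPrev = 0.0;
    size_t i = 0;
    for (int p = 0; p < nProcs && i < loads.size(); p++) {
        double target = (p + 1) * total / nProcs;
        double curr = loads[i];      // always take at least one piece
        owner[i++] = p;
        double err = std::fabs(loadPrev + curr - target);
        while (i < loads.size()
               && std::fabs(loads[i] + loadPrev + curr - target) < err) {
            curr += loads[i];
            owner[i++] = p;
            err = std::fabs(loadPrev + curr - target);
        }
        loadPrev += curr;
    }
    return owner;
}

int main() {
    // Loads 3,1,1,1,2,2 over 2 processors: the targets are 5 and 10, so the
    // first processor gets {3,1,1} (load 5) and the second gets {1,2,2}.
    std::vector<double> loads = {3, 1, 1, 1, 2, 2};
    for (int o : greedyCuts(loads, 2)) std::printf("%d ", o);
    std::printf("\n");   // prints: 0 0 0 1 1 1
}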

void MultistepLB_SFC::pup(PUP::er &p){
    CBase_MultistepLB_SFC::pup(p);
}

#include "MultistepLB_SFC.def.h"
74 changes: 74 additions & 0 deletions MultistepLB_SFC.h
@@ -0,0 +1,74 @@
#ifndef _MULTISTEPLB_SFC_H_
#define _MULTISTEPLB_SFC_H_

#include "MultistepLB_SFC.decl.h"
#include "Vector3D.h"
#include "cosmoType.h"
#include "SFC.h"
#include "CentralLB.h"

void Orb_PrintLBStats(BaseLB::LDStats *stats, int numobjs);
void write_LB_particles(BaseLB::LDStats* stats, const char *achFileName, bool bFrom);

/// @brief Multistep load balancer using Space Filling Curve
///
/// This balancer recognizes different "phases" (called rungs in other
/// parts of the code), and uses loads based on measurements of the
/// previous calculation at the same phase. For large phases (i.e.,
/// when many particles are active), the TreePieces are divided among
/// the processors using a Space Filling Curve based on the centroids
/// of the TreePieces.
///

class MultistepLB_SFC : public CBase_MultistepLB_SFC {
private:
    void init();
    bool QueryBalanceNow(int step);

    decltype(BaseLB::LDStats::to_proc) *mapping;
    decltype(BaseLB::LDStats::from_proc) *from;
    /// total computational cost to be balanced
    double dTotalLoad;
    /// Maximum number of pieces per processor
    double maxPieceProc;

public:
    MultistepLB_SFC(const CkLBOptions &);
    MultistepLB_SFC(CkMigrateMessage *m) : CBase_MultistepLB_SFC(m) {
        init();
    }

    class SFCObject
    {
    public:
        /// index into LB stats->objData
        int lbindex;
        /// Spatial location of TreePiece
        Vector3D<cosmoType> centroid;
        SFC::Key key;
        /// computational cost of this object
        double load;

        SFCObject() : lbindex(-1), load(0) {}
        SFCObject(int _lbindex, double _load) :
            lbindex(_lbindex),
            load(_load)
        {
        }
        bool operator<(const SFCObject &o) const{
            return key < o.key;
        }
    };

    void work(BaseLB::LDStats* stats);
    void work2(BaseLB::LDStats* stats);
    void sfcPrepare(std::vector<SFCObject> &tp_array,
                    int nObjs, BaseLB::LDStats * stats,
                    bool node_partition=false);
    void sfcPartition(int nProcs, std::vector<SFCObject> & tp,
                      BaseLB::LDStats *stats, bool node_partition=false);
    void pup(PUP::er &p);
};


#endif /* _MULTISTEPLB_SFC_H_ */