diff --git a/.gitmodules b/.gitmodules index 940e1e27..c5be00cf 100644 --- a/.gitmodules +++ b/.gitmodules @@ -46,3 +46,9 @@ [submodule "discreture"] path = discreture url = https://github.com/mraggi/discreture/ +[submodule "diskmat"] + path = diskmat + url = https://github.com/dnbaker/diskmat +[submodule "cpp-btree"] + path = cpp-btree + url = https://github.com/Kronuz/cpp-btree diff --git a/.travis.yml b/.travis.yml index 6f56f655..3aee190e 100755 --- a/.travis.yml +++ b/.travis.yml @@ -31,6 +31,7 @@ script: - ./jsdhashdbg - ./fgcinctestdbg - ./geomedtestdbg + - ./sparsepriortestdbg notifications: slack: jhu-genomics:BbHYSks7DhOolq80IYf6m9oe#libsketch rooms: diff --git a/Makefile b/Makefile index 12f7e20f..cbad73b3 100644 --- a/Makefile +++ b/Makefile @@ -32,7 +32,7 @@ WARNINGS+=-Wall -Wextra -Wpointer-arith -Wformat -Wunused-variable -Wno-attribut OPT?=O3 LDFLAGS+=$(LIBS) -lz $(LINKS) EXTRA?= -DEFINES+= -DBLAZE_RANDOM_NUMBER_GENERATOR='wy::WyHash' +DEFINES+= #-DBLAZE_RANDOM_NUMBER_GENERATOR='wy::WyHash' CXXFLAGS+=-$(OPT) -std=$(STD) -march=native $(WARNINGS) $(INCLUDE) $(DEFINES) $(BLAS_LINKING_FLAGS) \ -DBOOST_NO_AUTO_PTR @@ -63,7 +63,7 @@ LINKS += -ltbb endif TESTS=tbmdbg coreset_testdbg bztestdbg btestdbg osm2dimacsdbg dmlsearchdbg diskmattestdbg graphtestdbg jvtestdbg kmpptestdbg tbasdbg \ - jsdtestdbg jsdkmeanstestdbg jsdhashdbg fgcinctestdbg geomedtestdbg oracle_thorup_ddbg + jsdtestdbg jsdkmeanstestdbg jsdhashdbg fgcinctestdbg geomedtestdbg oracle_thorup_ddbg sparsepriortestdbg clust: kzclustexpdbg kzclustexp kzclustexpf @@ -78,7 +78,9 @@ CXXFLAGS += $(EXTRA) CXXFLAGS += $(LDFLAGS) -%dbg: src/%.cpp $(wildcard include/minocore/*.h) +HEADERS=$(shell find include -name '*.h') + +%dbg: src/%.cpp $(HEADERS) $(CXX) $(CXXFLAGS) $< -o $@ -pthread printlibs: @@ -91,7 +93,7 @@ graphrun: src/graphtest.cpp $(wildcard include/minocore/*.h) dmlrun: src/dmlsearch.cpp $(wildcard include/minocore/*.h) $(CXX) $(CXXFLAGS) $< -o $@ -DNDEBUG $(OMP_STR) -%: src/%.cpp $(wildcard include/minocore/*.h) +%: src/%.cpp $(HEADERS) $(CXX) $(CXXFLAGS) $< -o $@ -DNDEBUG $(OMP_STR) -O3 alphaest: src/alphaest.cpp $(wildcard include/minocore/*.h) diff --git a/cpp-btree b/cpp-btree new file mode 160000 index 00000000..405ecf43 --- /dev/null +++ b/cpp-btree @@ -0,0 +1 @@ +Subproject commit 405ecf43729be4b65d35962f244ed94028edd585 diff --git a/diskmat b/diskmat new file mode 160000 index 00000000..5e30f828 --- /dev/null +++ b/diskmat @@ -0,0 +1 @@ +Subproject commit 5e30f828ff2cdae5304eff712ff9b316d1109a15 diff --git a/exp/generate_bregman_data.py b/exp/generate_bregman_data.py new file mode 100644 index 00000000..4820e988 --- /dev/null +++ b/exp/generate_bregman_data.py @@ -0,0 +1,56 @@ +import numpy as np +import sys +import argparse + +try: + from cytoolz import frequencies as Counter +except ImportError: + from collections import Counter +np.random.seed(0) + +ap = argparse.ArgumentParser() +ap.add_argument("--num-clusters", type=int, help="Number of clusters.", default=10) +ap.add_argument("--num-rows", type=int, help="Number of rows.", default=5000) +ap.add_argument("--num-dim", type=int, help="Number of dimensions.", default=50) +ap.add_argument("--set-noise", type=float, default=1.) +ap.add_argument("--set-data-variance", type=float, default=5.) 
+ap.add_argument("--outfile", type=str, default="randombregman.out") +ap.add_argument("--sample-coverage", type=int, default=1000) +ap = ap.parse_args() + +num_clusters = ap.num_clusters +num_dim = ap.num_dim +num_rows = ap.num_rows + +assert num_rows % num_clusters == 0, "num rows must be divisible by number of clusters" + +# Normalize +centers = np.abs(np.random.standard_cauchy(size=(num_clusters, num_dim)) * ap.set_data_variance) + +centers = (1. / np.sum(centers, axis=1))[:,np.newaxis] * centers + +datapoints = [] +for i in range(num_clusters): + for j in range(num_rows // num_clusters): + # Generate a number of samples, and then sample them. + nsamp = np.random.poisson(ap.sample_coverage) + row = centers[i] + np.random.standard_normal(size=(num_dim,)) + row = np.abs(row) + row /= np.sum(row) + selections = Counter(np.random.choice(len(row), p=row, size=(nsamp,))[:]) + samples = np.zeros((num_dim,)) + for k, v in selections.items(): + samples[k] = v + datapoints.append(samples) + +datapoints = np.vstack(datapoints) + +ordering = np.arange(0, num_rows, dtype=np.uint32) +np.random.shuffle(ordering) +with open(ap.outfile, "w") as ofp: + ofp.write("%d/%d/%d\n" % (num_rows, num_dim, num_clusters)) + for index in ordering: + ofp.write(" ".join(map(str, datapoints[index,:])) + "\n") +with open(ap.outfile + ".labels.txt", "w") as f: + f.write("\n".join(str(ordering[i] // (num_rows // num_clusters)) for i in range(num_rows))) + f.write("\n") diff --git a/exp/generate_kmeans_data.py b/exp/generate_kmeans_data.py new file mode 100644 index 00000000..d5d9d21f --- /dev/null +++ b/exp/generate_kmeans_data.py @@ -0,0 +1,30 @@ +import numpy as np +import sys +np.random.seed(0) + +num_clusters = 10 +num_dim = 50 +num_rows = 5000 +assert num_rows % num_clusters == 0 + +centers = np.abs(np.random.standard_normal(size=(num_clusters, num_dim)) * 5.) 
+ +points = np.vstack([np.random.standard_normal(size=(num_rows // num_clusters, num_dim)) * 1 + centers[i,:][np.newaxis, :] + for i in range(num_clusters)]) + +ordering = np.arange(0, num_rows, dtype=np.uint32) +np.random.shuffle(ordering) +if sys.argv[1:]: + ofp = open(sys.argv[1], "w") + labels = sys.argv[1] + ".labels.txt" +else: + ofp = open("random.out", "w") + labels = "random.out.labels.txt" +ofp.write("%d/%d/%d\n" % (num_rows, num_dim, num_clusters)) +for index in ordering: + ofp.write(" ".join(map(str, points[index,:])) + "\n") +with open(labels, "w") as f: + f.write("\n".join(str(ordering[i] // (num_rows // num_clusters)) for i in range(num_rows))) + f.write("\n") + +if ofp != sys.stdout: ofp.close() diff --git a/include/minocore/clustering.h b/include/minocore/clustering.h new file mode 100644 index 00000000..cdc68d59 --- /dev/null +++ b/include/minocore/clustering.h @@ -0,0 +1,8 @@ +#ifndef MINOCORE_CLUSTERING_HEADERS_H__ +#define MINOCORE_CLUSTERING_HEADERS_H__ + +#include "minocore/clustering/dispatch.h" +#include "minocore/clustering/traits.h" +#include "minocore/clustering/sampling.h" + +#endif /* MINOCORE_CLUSTERING_HEADERS_H__ */ diff --git a/include/minocore/clustering/centroid.h b/include/minocore/clustering/centroid.h new file mode 100644 index 00000000..b0e66183 --- /dev/null +++ b/include/minocore/clustering/centroid.h @@ -0,0 +1,161 @@ +#ifndef MINOCORE_CLUSTERING_CENTROID_H__ +#define MINOCORE_CLUSTERING_CENTROID_H__ +#include "minocore/dist.h" +#include "minocore/util/blaze_adaptor.h" +#include "minocore/optim/kmedian.h" + +namespace minocore { namespace clustering { + +struct CentroidPolicy { + template + static void perform_average(blaze::DenseVector &ret, const Range &r, const RowSums &rs, + const VT2 *wc = static_cast(nullptr), + dist::DissimilarityMeasure measure=static_cast(-1)) + { + using FT = blz::ElementType_t; + PREC_REQ(measure != static_cast(-1), "Must define dissimilarity measure"); + if(measure == dist::TOTAL_VARIATION_DISTANCE) { + PRETTY_SAY << "TVD: performing " << (wc ? static_cast("weighted"): static_cast("unweighted")) << "L1 median on *normalized* categorical distributions.\n"; + if(wc) + coresets::l1_median(r, ret, wc->data()); + else + coresets::l1_median(r, ret); + } + else if(measure == dist::L1) { + std::conditional_t, + blz::CompressedMatrix >, + blz::DynamicMatrix > + > cm = r % blz::expand(trans(rs), r.columns()); + PRETTY_SAY << "L1: performing " << (wc ? static_cast("weighted"): static_cast("unweighted")) << "L1 median on *unnormalized* categorical distributions, IE absolute count data.\n"; + if(wc) + coresets::l1_median(cm, ret, wc->data()); + else + coresets::l1_median(cm, ret); + } else if(measure == dist::LLR || measure == dist::UWLLR || measure == dist::OLLR) { + PRETTY_SAY << "LLR test\n"; + FT total_sum_inv; + if(wc) { + total_sum_inv = 1. / blz::dot(rs, *wc); + ~ret = blaze::sum(r % blz::expand(trans(*wc * rs), r.columns())) * total_sum_inv; + } else { + total_sum_inv = 1. / blaze::sum(rs); + ~ret = blaze::sum(r % blz::expand(trans(rs), r.columns())) * total_sum_inv; + } + } else if(wc) { + PRETTY_SAY << "Weighted, anything but L1 or LLR" << dist::detail::prob2str(measure) << '\n'; + assert((~(*wc)).size() == r.rows()); + assert(blz::expand(~(*wc), r.columns()).rows() == r.rows()); + assert(blz::expand(~(*wc), r.columns()).columns() == r.columns()); + auto wsuminv = 1. 
/ blaze::sum(*wc); + if(!dist::detail::is_probability(measure)) { // e.g., take mean of unscaled values + auto mat2schur = blz::expand(~(*wc) * rs, r.columns()); + PRETTY_SAY << "NOTPROB r dims: " << r.rows() << "/" << r.columns() << '\n'; + PRETTY_SAY << "NOTPROB mat2schur dims: " << mat2schur.rows() << "/" << mat2schur.columns() << '\n'; + ~ret = blaze::sum(r % blz::expand(~(*wc) * rs, r.columns())) * wsuminv; + } else { // Else take mean of scaled values + auto mat2schur = blz::expand(~(*wc), r.columns()); + PRETTY_SAY << "PROB r dims: " << r.rows() << "/" << r.columns() << '\n'; + PRETTY_SAY << "PROB mat2schur dims: " << mat2schur.rows() << "/" << mat2schur.columns() << '\n'; + ~ret = blaze::sum(r % blz::expand(~(*wc), r.columns())) * wsuminv; + assert(blaze::max(~ret) < 1. || !std::fprintf(stderr, "max in ret: %g for a probability distribution.", blaze::max(~ret))); + } + } else { + PRETTY_SAY << "Unweighted, anything but L1 or LLR" << dist::detail::prob2str(measure) << '\n'; + if(dist::detail::is_probability(measure)) { + // Weighted average for all +#ifndef NDEBUG + auto expansion = blz::expand(trans(rs), r.columns()); + PRETTY_SAY << "PROB r dims: " << r.rows() << "/" << r.columns() << '\n'; + PRETTY_SAY << "NOTPROB expansion dims: " << expansion.rows() << "/" << expansion.columns() << '\n'; +#endif + ~ret = blaze::sum(r % blz::expand(trans(rs), r.columns())) * (1. / (blaze::sum(rs) * r.rows())); + } else ~ret = blz::mean(r % blz::expand(trans(rs), r.columns())); + } + } + template + static void __perform_increment(FT neww, FT cw, Row &ret, const Src &dat, FT row_sum, dist::DissimilarityMeasure measure) + { + if(measure == dist::L1 || measure == dist::TOTAL_VARIATION_DISTANCE) + throw std::invalid_argument("__perform_increment is only for linearly-calculated means, not l1 median"); + if(cw == 0.) 
{ + if(dist::detail::is_probability(measure)) + ret = dat; + else + ret = dat * row_sum; + } else { + auto div = neww / (neww + cw); + if(dist::detail::is_probability(measure)) { + ret += (dat - ret) * div; + } else if(measure == dist::LLR || measure == dist::UWLLR) { + ret += (dat * row_sum) * neww; + // Add up total sum and subtract later + // since there are three weighting factors here: + // First, partial assignment + // Then point-wise weights (both of which are in neww) + // Then, for LLR/UWLLR, there's weighting by the row-sums + } else { + // Maintain running mean for full vector value + ret += (dat * row_sum - ret) * div; + } + } + } + + template> > + static void perform_soft_assignment(const blz::DenseMatrix &assignments, + const RowSums &rs, + OMP_ONLY(std::mutex *mutptr,) + const MatType &data, CenterCon &newcon, + const VT2 *wc = static_cast(nullptr), + dist::DissimilarityMeasure measure=static_cast(-1)) + { + using FT = blz::ElementType_t; + PREC_REQ(measure != static_cast(-1), "Must define dissimilarity measure"); + if(measure == dist::L1 || measure == dist::TOTAL_VARIATION_DISTANCE) { + OMP_PFOR + for(unsigned j = 0; j < newcon.size(); ++j) { + blz::DynamicVector newweights; + { + auto col = trans(column(assignments, j)); + if(wc) newweights = col * *wc; + else newweights = col; + } + if(measure == dist::L1) { + std::conditional_t, + blz::DynamicMatrix, blz::CompressedMatrix> + scaled_data = data % blz::expand(rs, data.columns()); + coresets::l1_median(scaled_data, newcon[j], newweights.data()); + } else { // TVD + coresets::l1_median(data, newcon[j], newweights.data()); + } + } + } else { + blz::DynamicVector summed_contribs(newcon.size(), 0.); + OMP_PFOR + for(size_t i = 0; i < data.rows(); ++i) { + auto item_weight = wc ? wc->operator[](i): static_cast(1.); + const auto row_sum = rs[i]; + auto asn(row(assignments, i, blz::unchecked)); + for(size_t j = 0; j < newcon.size(); ++j) { + auto &cw = summed_contribs[j]; + if(auto asnw = asn[j]; asnw > 0.) { + auto neww = item_weight * asnw; + OMP_ONLY(if(mutptr) mutptr[j].lock();) + __perform_increment(neww, cw, newcon[j], row(data, i, blz::unchecked), row_sum, measure); + OMP_ONLY(if(mutptr) mutptr[j].unlock();) + OMP_ATOMIC + cw += neww; + } + } + } + if(measure == dist::LLR || measure == dist::UWLLR || measure == dist::OLLR) { + OMP_PFOR + for(auto i = 0u; i < newcon.size(); ++i) + newcon[i] *= 1. 
/ blz::dot(column(assignments, i), rs); + } + } + } +}; // CentroidPolicy + +} } // namespace minocore::clustering + +#endif /* MINOCORE_CLUSTERING_CENTROID_H__ */ diff --git a/include/minocore/clustering/dispatch.h b/include/minocore/clustering/dispatch.h new file mode 100644 index 00000000..ef00c42b --- /dev/null +++ b/include/minocore/clustering/dispatch.h @@ -0,0 +1,574 @@ +#ifndef FGC_CLUSTERING_DISPATCH_H__ +#define FGC_CLUSTERING_DISPATCH_H__ +#include "minocore/dist.h" +#include "minocore/optim/jv_solver.h" +#include "minocore/optim/lsearch.h" +#include "minocore/optim/oracle_thorup.h" +#include "minocore/util/exception.h" +#include "minocore/clustering/traits.h" +#include "minocore/clustering/sampling.h" +#include "minocore/clustering/centroid.h" + +#include "boost/iterator/zip_iterator.hpp" +#include "diskmat/diskmat.h" + +namespace minocore { + +namespace clustering { + +using dist::DissimilarityMeasure; +using blaze::ElementType_t; +using diskmat::PolymorphicMat; +using boost::make_zip_iterator; + +template +bool use_packed_distmat(const T &app) { + if constexpr(jsd::is_dissimilarity_applicator_v) { + return dist::detail::is_symmetric(app.get_measure()); + } + return true; +} + +template +auto perform_cluster_metric_kmedian(const OracleType &app, size_t np, Traits traits) +{ + MetricSelectionResult ret; + + std::unique_ptr> distmatp; + std::unique_ptr> full_distmatp; + if(traits.compute_full) { + if(use_packed_distmat(app)) { + distmatp.reset(new dm::DistanceMatrix(np)); + for(size_t i = 0; i < np; ++i) { + auto [ptr, extent] = distmatp->row_span(i); + const auto offset = i + 1; + OMP_PFOR_DYN + for(size_t j = 0; j < extent; ++j) + ptr[j] = app(i, j + offset); + } + } else { + full_distmatp.reset(new PolymorphicMat(np, np)); + for(size_t i = 0; i < np; ++i) { + auto r = row(full_distmatp->operator~(), i, blaze::unchecked); + OMP_PFOR_DYN + for(size_t j = 0; j < np; ++j) { + r[j] = app(i, j); + } + } + } + } + auto fill_distance_mat = [&](const auto &lu) { + auto &retdm = std::get<3>(ret); + retdm.resize(std::get<0>(ret).size(), np); + for(size_t i = 0; i < std::get<0>(ret).size(); ++i) { + const auto cid = std::get<0>(ret)[i]; + auto rowptr = row(retdm, i); + OMP_PFOR + for(size_t j = 0; j < np; ++j) { + rowptr[j] = (unlikely(j == cid) ? 
static_cast(0.): FT(lu(cid, j))); + } + } + }; + if(traits.sampling == THORUP_SAMPLING) { + auto sample_and_fill = [&](const auto &x) { + std::tie(std::get<0>(ret), std::get<1>(ret), std::get<2>(ret)) + = iterated_oracle_thorup_d( + x, np, traits.k, traits.thorup_iter, traits.thorup_sub_iter, traits.weights, traits.thorup_npermult, 3, 0.5, traits.seed); + fill_distance_mat(x); + }; + if(distmatp) { + sample_and_fill(*distmatp); + } else if(full_distmatp) { + sample_and_fill(~*full_distmatp); + } else { + auto caching_app = make_row_caching_oracle_wrapper< + shared::flat_hash_map, /*is_symmetric=*/ true, /*is_threadsafe=*/true + >(app, np); + sample_and_fill(caching_app); + } + } else switch(traits.sampling) { + case D2_SAMPLING: { + ret = select_d2(app, np, traits); + break; + } + case UNIFORM_SAMPLING: { + ret = select_uniform_random(app, np, traits); + break; + } + case GREEDY_SAMPLING: { + ret = select_greedy(app, np, traits); + break; + } + case DEFAULT_SAMPLING: default: { + char buf[128]; + auto l = std::sprintf(buf, "Unrecognized sampling: %d\n", (int)DEFAULT_SAMPLING); + throw std::invalid_argument(std::string(buf, l)); + } + fill_distance_mat(app); + } + auto &costmat = ret.facility_cost_matrix(); + std::vector center_sol; + switch(traits.metric_solver) { + case JAIN_VAZIRANI_FL: case JV_PLUS_LOCAL_SEARCH: { + auto jvs = jv::make_jv_solver(costmat); + auto [c_centers, c_assignments] = jvs.kmedian(traits.k, traits.max_jv_rounds); + if(traits.metric_solver == JAIN_VAZIRANI_FL) { + center_sol = std::move(c_centers); + break; + } + // JV_PLUS_LOCAL_SEARCH + auto lsearcher = minocore::make_kmed_lsearcher(costmat, traits.k, traits.eps, traits.seed); + lsearcher.lazy_eval_ = 2; + lsearcher.assign_centers(c_centers.begin(), c_centers.end()); + lsearcher.run(); + center_sol.assign(lsearcher.sol_.begin(), lsearcher.sol_.end()); + break; + } + case LOCAL_SEARCH: { + auto lsearcher = minocore::make_kmed_lsearcher(costmat, traits.k, traits.eps, traits.seed); + lsearcher.lazy_eval_ = 2; + lsearcher.run(); + center_sol.assign(lsearcher.sol_.begin(), lsearcher.sol_.end()); + break; + } + default: throw std::invalid_argument("Unrecognized metric solver strategy"); + } + std::transform(center_sol.begin(), center_sol.end(), center_sol.begin(), + [&sel=ret.selected()](auto x) {return sel[x];}); + blaze::DynamicVector asn(np, center_sol.front()); + blaze::DynamicVector costs = trans(row(costmat, center_sol.front())); + for(unsigned ci = 1; ci < center_sol.size(); ++ci) { + auto r = row(costmat, ci); + OMP_PFOR + for(size_t i = 0; i < np; ++i) { + if(auto newv = r[i]; newv < costs[i]) + costs[i] = newv, asn[i] = center_sol[ci]; + } + } + shared::sort(center_sol.begin(), center_sol.end()); + return std::make_tuple(center_sol, asn, costs); +} + +enum LloydLoopResult { + FINISHED, + REACHED_MAX_ROUNDS, + UNFINISHED +}; + +template, + typename CostType> +LloydLoopResult perform_lloyd_loop(CentersType ¢ers, Assignments &assignments, + const jsd::DissimilarityApplicator &app, + unsigned k, CostType &retcost, uint64_t seed=0, const WFT *weights=static_cast(nullptr), + size_t max_iter=100, double eps=1e-4) +{ + if constexpr(asn_method == HARD) { + if(retcost.size() != app.size()) retcost.resize(app.size()); + } else { + // asn_method == SOFT || asn_method == SOFT_HARMONIC_MEAN + retcost.resize(app.size(), k); + } + assert(retcost.size() == app.size() || !std::fprintf(stderr, "retcost size: %zu. 
app size: %zu\n", retcost.size(), app.size())); + if(co != EXTRINSIC) throw std::invalid_argument("Must be extrinsic for Lloyd's"); + using FT = ElementType_t; + auto &mat = app.data(); + const size_t npoints = app.size(); + CentersType centers_cpy(centers), centers_cache; + MINOCORE_REQUIRE(centers.size() == k, "Must have the correct number of centers"); + const auto measure = app.get_measure(); + if(dist::detail::needs_logs(measure) || dist::detail::needs_sqrt(measure)) + centers_cache.resize(k); + FT current_cost = std::numeric_limits::max(), first_cost = current_cost; + //PRETTY_SAY << "Beginning\n"; + LloydLoopResult ret = UNFINISHED; + wy::WyRand rng(seed); + size_t iternum = 0; + // HEY DANIEL WHEN YOU GET BACK HERE + // You are removing the center_distance + // and instead calculating the objective function + // and terminating when the change in objective function is less than eps * first cost. + using cv_t = blaze::CustomVector; + std::unique_ptr weight_cv; + if(weights) { + weight_cv.reset(new cv_t(const_cast(weights), npoints)); + } + auto getcache = [&] (size_t j) { + decltype(¢ers_cache[j]) ret = nullptr; + if(centers_cache.size()) ret = ¢ers_cache[j]; + return ret; + }; + assert(centers_cache.empty() || getcache(0) == nullptr); + auto getcost = [&]() { + if constexpr(asn_method == HARD) { + return weight_cv ? blz::sum(retcost * *weight_cv): blz::sum(retcost); + } else { +#ifndef NDEBUG + // Ensure that the assignments are as expected. + for(size_t i = 0; i < assignments.rows(); ++i) { + auto r(row(assignments, i)); + auto cr(row(retcost, i)); + auto maxi = std::max_element(r.begin(), r.end()) - r.begin(); + auto mini = std::min_element(cr.begin(), cr.end()) - cr.begin(); + //std::cerr << "mini: " << mini << '\n'; + //std::cerr << "maxi: " << maxi << '\n'; + assert(std::abs(blaze::sum(r) - 1.) < 1e-4); + assert(maxi == mini || r[maxi] == r[mini] || cr[mini] == cr[maxi] + || &(std::cerr << r << '\n' << cr << '\n') == nullptr); + } +#endif + if(weight_cv) { + auto ew = blaze::expand(*weight_cv, app.data().columns()); + std::fprintf(stderr, "expanded weight shape: %zu/%zu. asn: %zu/%zu\n", ew.rows(), ew.columns(), assignments.rows(), assignments.columns()); + return blaze::sum(assignments % retcost % ew); + } else { + return blaze::sum(assignments % retcost); + } + } + }; + auto check = [&]() { + ++iternum; + if(first_cost == std::numeric_limits::max()) first_cost = getcost(); + else { + FT itercost = getcost(); + if(current_cost == std::numeric_limits::max()) { + current_cost = itercost; + assert(current_cost != std::numeric_limits::max()); + } else { + if(std::abs(itercost - current_cost) < eps * first_cost) { // consider taking sign into account here + PRETTY_SAY << "Itercost: " << itercost << " vs current " << current_cost << " with diff " << std::abs(itercost - current_cost) + << "compared to first cost of " << first_cost << " with eps = " << eps << ".\n"; + return FINISHED; + } + if(iternum == max_iter) + return REACHED_MAX_ROUNDS; + } + current_cost = itercost; + } + PRETTY_SAY << "iternum: " << iternum << '\n'; + return UNFINISHED; + }; + auto soft_assignments = [&]() { + if constexpr(asn_method != HARD) { + OMP_PFOR + for(size_t i = 0; i < npoints; ++i) { + auto row = blaze::row(retcost, i BLAZE_CHECK_DEBUG); + for(unsigned j = 0; j < centers.size(); ++j) { + row[j] = app(i, centers[j], getcache(j), measure); + } + auto asnrow = blaze::row(assignments, i BLAZE_CHECK_DEBUG); + if constexpr(asn_method == SOFT_HARMONIC_MEAN) { + asnrow = 1. 
/ row; + } else { + auto mv = blaze::min(row); + assert(mv >= 0.); + asnrow = blaze::exp(-row + mv); + assert(blaze::min(asnrow) >= 0.); + } + asnrow *= 1. / blaze::sum(asnrow); + assert(blaze::min(asnrow) >= 0.); + PRETTY_SAY << "row " << row << " yields " << asnrow << " with max " << blz::max(asnrow) << ", min " << blz::min(asnrow) <<'\n'; + } + } + }; + if constexpr(asn_method == HARD) { + std::vector<std::vector<size_t>> assigned(k); + OMP_ONLY(std::unique_ptr<std::mutex[]> mutexes(new std::mutex[k]);) + for(;;) { + // Do it forever + if(centers_cache.size()) { + PRETTY_SAY << "Setting centers cache for measure " << dist::detail::prob2str(measure) << '\n'; + for(unsigned i = 0; i < k; ++i) + dist::detail::set_cache(centers[i], centers_cache[i], measure); + } + for(auto &i: assigned) i.clear(); + OMP_PFOR + for(size_t i = 0; i < npoints; ++i) { + auto dist = app(i, centers[0], getcache(0), measure); + unsigned asn = 0; + for(unsigned j = 1; j < k; ++j) { + auto newdist = app(i, centers[j], getcache(j), measure); + if(newdist < dist) { + asn = j; + dist = newdist; + } + } + retcost[i] = dist; + assignments[i] = asn; + { + OMP_ONLY(std::unique_lock lock(mutexes[asn]);) + assigned[asn].push_back(i); + } + } + // Check termination condition + if(auto rc = check(); rc != UNFINISHED) { + ret = rc; + goto end; + } + blaze::SmallArray centers_to_restart; + for(unsigned i = 0; i < k; ++i) + if(assigned[i].empty()) + centers_to_restart.pushBack(i); + if(auto restartn = centers_to_restart.size()) { + // Use D^2 sampling to start a new cluster + // And then restart the loop + assert(retcost.size() == npoints); + retcost = std::numeric_limits<FT>::max(); + OMP_PFOR + for(size_t i = 0; i < npoints; ++i) { + for(size_t j = 0; j < k; ++j) { + if(assigned[j].empty()) continue; + auto fc = app(i, centers[j], getcache(j), measure); + if(fc < retcost[i]) retcost[i] = fc; + } + } + blaze::DynamicVector csum(npoints); + std::uniform_real_distribution<FT> urd; + for(size_t i = 0; i < restartn;) { + std::partial_sum(retcost.data(), retcost.data() + retcost.size(), csum.data()); + auto newp = std::lower_bound(csum.data(), csum.data() + csum.size(), urd(rng) * csum[csum.size() - 1]) + - csum.data(); + centers[centers_to_restart[i]] = row(app.data(), newp, blaze::unchecked); + if(++i != restartn) { + OMP_PFOR + for(size_t i = 0; i < npoints; ++i) + retcost[i] = std::min(retcost[i], app(i, newp)); + } + } + continue; // Reassign, re-center, and re-compute + } + // Make centers + for(size_t i = 0; i < centers_cpy.size(); ++i) { + auto &cref = centers_cpy[i]; + auto &assigned_ids = assigned[i]; + shared::sort(assigned_ids.begin(), assigned_ids.end()); // Better access pattern + auto aidptr = assigned_ids.data(); + const size_t nid = assigned_ids.size(); + auto rowsel = rows(mat, aidptr, nid); + auto sumsel = blaze::elements(app.row_sums(), aidptr, nid); + if(weight_cv) { + auto wsel = blaze::elements(*weight_cv, aidptr, nid); + CentroidPolicy::perform_average(cref, rowsel, sumsel, &wsel, measure); + } else { + CentroidPolicy::perform_average(cref, rowsel, sumsel, + static_cast(nullptr), measure + ); + //PRETTY_SAY << "Center " << i << " is " << cref << '\n'; + PRETTY_SAY << "Difference between previous center and new center is " << blz::sqrL2Dist(cref, centers[i]) << '\n'; + } + } + // Set the returned values to be the last iteration's.
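+ // (centers_cpy holds the centroids just recomputed from this round's hard assignments via CentroidPolicy::perform_average.)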
+ centers = centers_cpy; + } + } else { + if(assignments.rows() != npoints || assignments.columns() != centers.size()) { + assignments.resize(npoints, centers.size()); + } + std::unique_ptr mutexes; + OMP_ONLY(mutexes.reset(new std::mutex[centers.size()]);) + for(;;) { + if(centers_cache.size()) { + for(size_t i = 0; i < centers.size(); ++i) + dist::detail::set_cache(centers[i], centers_cache[i], measure); + } + for(auto &c: centers_cpy) c = static_cast(0); + soft_assignments(); + assert(blz::sum(assignments) - assignments.rows() < 1e-3 * assignments.rows()); + for(size_t i = 0; i < centers.size(); ++i) + if(blaze::sum(blaze::column(assignments, i)) == 0.) + throw TODOError("TODO: reassignment for support goes to 0"); + // Check termination condition + if(auto rc = check(); rc != UNFINISHED) { + ret = rc; + goto end; + } + // Now points have been assigned, and we now perform center assignment + CentroidPolicy::perform_soft_assignment( + assignments, app.row_sums(), + OMP_ONLY(mutexes.get(),) + app.data(), centers_cpy, weight_cv.get(), measure + ); + } + std::swap(centers_cpy, centers); + } + end: { + if(centers_cache.size()) { + for(size_t i = 0; i < centers.size(); ++i) + dist::detail::set_cache(centers[i], centers_cache[i], measure); + } + soft_assignments(); + } + DBG_ONLY(if(ret == FINISHED) PRETTY_SAY << "Completed Lloyd's loop in " << iternum << " iterations\n";) + return ret; +} + + + +template +void update_defaults_with_measure(ClusteringTraits &ct, dist::DissimilarityMeasure measure) { + if(ct.opt == DEFAULT_OPT) { + switch(measure) { + case dist::L2: + case dist::SQRL2: + case dist::L1: case dist::TVD: + case dist::COSINE_DISTANCE: + case dist::PROBABILITY_COSINE_DISTANCE: + case dist::LLR: case dist::UWLLR: + case dist::HELLINGER: case dist::BHATTACHARYYA_DISTANCE: case dist::BHATTACHARYYA_METRIC: + ct.opt = EXPECTATION_MAXIMIZATION; break; + /* + * Bregman Divergences, LLR, cosine distance use the (weighted) mean of each + * point, in either soft or hard clustering. + * TVD and L1 use the feature-wise median. + * Scores are either calculated with softmax distance or harmonic softmax + */ + case dist::ORACLE_METRIC: case dist::ORACLE_PSEUDOMETRIC: case dist::WASSERSTEIN: + /* otherwise, use metric kmedian */ + ct.opt = METRIC_KMEDIAN; break; + default: + if(dist::detail::is_bregman(measure)) { + ct.opt = EXPECTATION_MAXIMIZATION; + break; + } + } + } + if(ct.approx == DEFAULT_APPROX) { + if(ct.opt == EXPECTATION_MAXIMIZATION) ct.approx = BICRITERIA; + else ct.approx = CONSTANT_FACTOR; + } + if(ct.sampling == DEFAULT_SAMPLING) { + ct.sampling = ct.opt == EXPECTATION_MAXIMIZATION + ? 
D2_SAMPLING: THORUP_SAMPLING; + } +} + + +template +auto perform_clustering(const jsd::DissimilarityApplicator &app, size_t npoints, unsigned k, + const ElementType_t *weights=nullptr, + CenterSamplingType csample=DEFAULT_SAMPLING, + OptimizationMethod opt=DEFAULT_OPT, + ApproximateSolutionType approx=DEFAULT_APPROX, + uint64_t seed=0, + size_t max_iter=100, double eps=1e-4) +{ + MINOCORE_REQUIRE(npoints == app.size(), "assumption"); + using FT = typename MatrixType::ElementType; + + // Setup clustering traits + auto ct = make_clustering_traits(npoints, k, + csample, opt, approx, weights, seed, max_iter, eps); + using ct_t = decltype(ct); + auto measure = app.get_measure(); + update_defaults_with_measure(ct, measure); + + // and helpers + typename ct_t::centers_t centers; + centers.reserve(k); + typename ct_t::assignments_t assignments; + typename ct_t::costs_t costs; + if constexpr(asn_method == HARD) { + assignments.resize(app.size()); + } else { + assignments.resize(app.size(), k); + } + PRETTY_SAY << "Assignments sized.\n"; + + + auto set_metric_return_values = [&](const auto &ret) { + MINOCORE_REQUIRE(asn_method == HARD, "Not supported: soft extrinsic clustering"); + auto &[cc, asn, retcosts] = ret; + centers.resize(cc.size()); + if constexpr(co == EXTRINSIC) { + OMP_PFOR + for(size_t i = 0; i < cc.size(); ++i) { + centers[i] = row(app.data(), cc[i], blaze::unchecked); + } + } else std::copy(cc.begin(), cc.end(), centers.begin()); // INTRINSIC + if constexpr(asn_method == HARD) { + assignments.resize(asn.size()); + std::copy(asn.begin(), asn.end(), assignments.begin()); + costs.resize(retcosts.size()); + std::copy(retcosts.begin(), retcosts.end(), costs.begin()); + } + }; + + // Delegate to solvers and set-up return values + if(dist::detail::satisfies_d2(measure) || measure == dist::L1 || measure == dist::TOTAL_VARIATION_DISTANCE || co == EXTRINSIC) { + auto [initcenters, initasn, initcosts] = jsd::make_kmeanspp(app, ct.k, ct.seed, ct.weights); + assert(initcenters.size() == k); + if(co == INTRINSIC || opt == METRIC_KMEDIAN) { + PRETTY_SAY << "Performing metric clustering\n"; + // Do graph metric calculation + MINOCORE_REQUIRE(asn_method == HARD, "Can't do soft metric k-median"); + auto metric_ret = perform_cluster_metric_kmedian(detail::make_aa(app), app.size(), ct); + set_metric_return_values(metric_ret); + } else { + PRETTY_SAY << "Setting centers with D2\n"; + for(const auto id: initcenters) + centers.emplace_back(row(app.data(), id)); + assert(centers.size() == k); + PRETTY_SAY << "Beginning lloyd loop\n"; + // Perform EM + if(auto ret = perform_lloyd_loop(centers, assignments, app, k, costs, ct.seed, ct.weights, max_iter, eps)) + std::fprintf(stderr, "lloyd loop ret: %s\n", ret == REACHED_MAX_ROUNDS ? 
"max rounds": "unfinished"); + } + } else if(dist::detail::satisfies_metric(measure) || dist::detail::satisfies_rho_metric(measure)) { + MINOCORE_REQUIRE(asn_method == HARD, "Can't do soft metric k-median"); + auto metric_ret = perform_cluster_metric_kmedian(detail::make_aa(app), app.size(), ct); + set_metric_return_values(metric_ret); + } else { + throw NotImplementedError("Unsupported: asymmetric measures not supporting D2 sampling"); + } + return std::make_tuple(std::move(centers), std::move(assignments), std::move(costs)); +} // perform_clustering + +// Make # points optional +template +auto perform_clustering(const jsd::DissimilarityApplicator &app, unsigned k, + const ElementType_t *weights=nullptr, + CenterSamplingType csample=DEFAULT_SAMPLING, + OptimizationMethod opt=DEFAULT_OPT, + ApproximateSolutionType approx=DEFAULT_APPROX, + uint64_t seed=0, + size_t max_iter=100, double eps=1e-4) +{ + return perform_clustering(app, app.size(), k, weights, csample, opt, approx, seed, max_iter, eps); +} + +template +auto perform_clustering(const OracleType &app, size_t npoints, unsigned k, + const FT *weights=nullptr, + CenterSamplingType csample=DEFAULT_SAMPLING, + OptimizationMethod opt=DEFAULT_OPT, + ApproximateSolutionType approx=DEFAULT_APPROX, + uint64_t seed=0, + size_t max_iter=100, double eps=ClusteringTraits::DEFAULT_EPS) +{ + // Setup + if(opt == DEFAULT_OPT) opt = METRIC_KMEDIAN; + if(approx == DEFAULT_APPROX) approx = CONSTANT_FACTOR; + if(csample == DEFAULT_SAMPLING) csample = THORUP_SAMPLING; + MINOCORE_REQUIRE(opt == METRIC_KMEDIAN, "No other method supported for metric clustering"); + auto clustering_traits = make_clustering_traits(npoints, k, + csample, opt, approx, weights, seed, max_iter, eps); + using ct_t = decltype(clustering_traits); + + // Cluster + auto [cc, asn, retcosts] = perform_cluster_metric_kmedian(app, npoints, clustering_traits); + + // Return + typename ct_t::centers_t centers(cc.size()); + typename ct_t::assignments_t assignments(asn.size()); + typename ct_t::costs_t costs(retcosts.size()); + std::copy(cc.begin(), cc.end(), centers.begin()); + std::copy(asn.begin(), asn.end(), assignments.begin()); + std::copy(retcosts.begin(), retcosts.end(), costs.begin()); + return std::make_tuple(std::move(centers), std::move(assignments), std::move(costs)); +} + + +} // namespace clustering + +} // namespace minocore + +#endif /* FGC_CLUSTERING_DISPATCH_H__ */ diff --git a/include/minocore/clustering/sampling.h b/include/minocore/clustering/sampling.h new file mode 100644 index 00000000..5a4c8c6a --- /dev/null +++ b/include/minocore/clustering/sampling.h @@ -0,0 +1,140 @@ +#ifndef CLUSTERING_SAMPLING_H__ +#define CLUSTERING_SAMPLING_H__ +#include "minocore/clustering/traits.h" +#include "minocore/optim/oracle_thorup.h" + +namespace minocore { + +namespace clustering { + +template +struct MetricSelectionResult: public std::tuple, blz::DV, std::vector, blz::DM > { + auto &selected() {return std::get<0>(*this);} + const auto &selected() const {return std::get<0>(*this);} + auto &costs() {return std::get<1>(*this);} + const auto &costs() const {return std::get<1>(*this);} + auto &assignments() {return std::get<2>(*this);} + const auto &assignments() const {return std::get<2>(*this);} + auto &facility_cost_matrix() {return std::get<3>(*this);} + const auto &facility_cost_matrix() const {return std::get<3>(*this);} +}; + + + +template +MetricSelectionResult +select_uniform_random(const OracleType &oracle, size_t np, ClusteringTraits opts) +{ + assert(opts.k != (unsigned)-1); 
+ MetricSelectionResult ret; + size_t nsamp = std::min(size_t(std::ceil(opts.k * opts.approx_mul)), np); + std::vector selected; + std::mt19937_64 rng(opts.seed); + schism::Schismatic modder(np); + blz::DV costs(np, std::numeric_limits::max()); + std::vector assignments(np); + shared::flat_hash_set sel; + do { + IT next; + do next = modder.mod(rng()); + while(sel.find(next) != sel.end()); + OMP_PFOR + for(size_t i = 0; i < np; ++i) { + if(costs[i] == 0.) continue; + auto c = oracle(next, i); + if(c < costs[i]) + costs[i] = c, assignments[i] = selected.size(); + } + sel.insert(next); + selected.push_back(next); + } while(selected.size() < nsamp); + std::get<0>(ret) = std::move(selected); + std::get<1>(ret) = std::move(costs); + std::get<2>(ret) = std::move(assignments); + return ret; +} + +template +MetricSelectionResult +select_greedy(const OracleType &oracle, size_t np, ClusteringTraits opts) +{ + assert(opts.k != (unsigned)-1); + MetricSelectionResult ret; + size_t nsamp = std::min(size_t(std::ceil(opts.k * opts.approx_mul)), np); + blz::DV costs(np); + IT next = std::mt19937_64(opts.seed)() % np; + std::vector selected{next}, assignments(np, next); + costs[next] = 0.; + OMP_PFOR + for(size_t i = 0; i < np; ++i) { + if(unlikely(i == next)) continue; + costs[i] = oracle(i, next); + } + + while(selected.size() < nsamp) { + next = std::max_element(costs.data(), costs.data() + costs.size()) - costs.data(); + costs[next] = 0.; + assignments[next] = next; + OMP_PFOR + for(size_t i = 0; i < np; ++i) { + if(unlikely(i == next) || costs[i] == 0.) continue; + if(auto newcost = oracle(i, next); newcost < costs[i]) { + costs[i] = newcost; + assignments[i] = next; + } + } + selected.push_back(next); + } + std::get<0>(ret) = std::move(selected); + std::get<1>(ret) = std::move(costs); + std::get<2>(ret) = std::move(assignments); + return ret; +} + +template +MetricSelectionResult +select_d2(const OracleType &oracle, size_t np, ClusteringTraits opts) { + MetricSelectionResult ret; + size_t nsamp = std::min(size_t(std::ceil(opts.k * opts.approx_mul)), np); + blz::DV costs(np); + std::mt19937_64 mt(opts.seed); + IT next = mt() % np; + std::vector selected{next}, assignments(np, next); + costs[next] = 0.; + OMP_PFOR + for(size_t i = 0; i < np; ++i) { + if(unlikely(i == next)) continue; + costs[i] = oracle(i, next); + } + auto cdf = std::make_unique(np); + FT *const cdfbeg = cdf.get(), *const cdfend = cdfbeg + np; + do { + std::partial_sum(costs.data(), costs.data() + costs.size(), cdfbeg); + std::uniform_real_distribution dist; + IT id; + do { + id = std::lower_bound(cdfbeg, cdfend, cdfend[-1] * dist(mt)) - cdfbeg; + } while(std::find(selected.begin(), selected.end(), id) != selected.end()); + selected.push_back(id); + costs[id] = 0.; + assignments[id] = id; + OMP_PFOR + for(IT i = 0; i < np; ++i) { + if(costs[i] == 0.) 
continue; + if(auto newcost = oracle(i, id); newcost < costs[i]) { + costs[i] = newcost; + assignments[i] = id; + } + } + } while(selected.size() < nsamp); + std::get<0>(ret) = std::move(selected); + std::get<1>(ret) = std::move(costs); + std::get<2>(ret) = std::move(assignments); + return ret; +} + +} // namespace clustering + +} // namespace minocore + +#endif diff --git a/include/minocore/clustering/traits.h b/include/minocore/clustering/traits.h new file mode 100644 index 00000000..74c1ec35 --- /dev/null +++ b/include/minocore/clustering/traits.h @@ -0,0 +1,184 @@ +#ifndef FGC_CLUSTERING_TRAITS_H__ +#define FGC_CLUSTERING_TRAITS_H__ + +namespace minocore { +namespace clustering { + +using ClusteringEnumType = std::size_t; +using ce_t = ClusteringEnumType; + +/* + * + * Problem classification. These are: + * 1. Assignment + * IE, is the assignment to cluster centers hard or soft? + * 2. Center Origination + * Are cluster centers Intrinsic or Extrinsic? (IE, are centers selected from input points or not?) + * 3. Type of approximate solution. + * If using coresets, which algorithm is used for an approximate solution? + * BICRITERIA (for alpha-beta approximations, where a constant factor approximation with more than alpha centers is allowed) + * CONSTANT_FACTOR (for the exact number of centers but with a constant factor approximation) + * HEURISTIC (for a good enough solution, which we will treat as one of the above even though it isn't) + * 4. Center sampling type. + * When selecting centers to use as candidates in search, use: + * Thorup sampling + * Uniform sampling + * D2/cost sampling + * 5. Optimization technique + * Metric k-median: use metric clustering techniques, such as Jain-Vazirani or local search + * Expectation Maximization: Lloyd's algorithm or a variant + * Gradient descent (will require autograd or similar) + * Exhaustive search: combinatorial approximation + * Black box: plugging into CPLEX or Gurobi + */ + +static constexpr ce_t UNSET = ce_t(-1); + +enum Assignment: ce_t { + HARD = 0, + /* Assignment(x) = argmin_{c \in C}[d(c, x)] + * + */ + SOFT = 1, + // Assignment(X, c) = \frac{c}{\sum_{c' \in C}[d(c', x)]} + SOFT_HARMONIC_MEAN = 2, + /* Assignment(X, c) = \frac{e^{d(c, x)}}{sum_{c' \in C}[e^d(c', x)]} + * = softmax(d(C, x)) + */ +}; +enum CenterOrigination: ce_t { + INTRINSIC = 0, + EXTRINSIC = 1 +}; + +enum ApproximateSolutionType: ce_t { + BICRITERIA = 0, + CONSTANT_FACTOR = 1, + HEURISTIC = 2, + RSVD = 3, + DEFAULT_APPROX = UNSET +}; +enum CenterSamplingType: ce_t { + THORUP_SAMPLING, + D2_SAMPLING, + UNIFORM_SAMPLING, + GREEDY_SAMPLING, + DEFAULT_SAMPLING = UNSET, + COST_SAMPLING = D2_SAMPLING, +}; +enum OptimizationMethod: ce_t { + METRIC_KMEDIAN, + EXPECTATION_MAXIMIZATION, + BLACK_BOX, + GRADIENT_DESCENT, + EXHAUSTIVE_SEARCH, + DEFAULT_OPT = UNSET +}; + +enum MetricKMedianSolverMethod: ce_t { + JAIN_VAZIRANI_FL, + LOCAL_SEARCH, + JV_PLUS_LOCAL_SEARCH, + DEFAULT_SOLVER = UNSET +}; + + + +template +using assignment_fmt_t = std::conditional_t, + blaze::DynamicMatrix + >; + +template +struct ClusteringTraits { + static constexpr Assignment asn_method = asn; + static constexpr CenterOrigination center_origin = co; + ApproximateSolutionType approx = static_cast(UNSET); + CenterSamplingType sampling = static_cast(UNSET); + OptimizationMethod opt = static_cast(UNSET); + MetricKMedianSolverMethod metric_solver = JV_PLUS_LOCAL_SEARCH; + + static constexpr FT DEFAULT_EPS = 1e-6; + +// Settings + FT thorup_npermult = 7; + FT approx_mul = 50; + FT eps = DEFAULT_EPS; + 
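// Thorup-sampling knobs: thorup_iter/thorup_sub_iter bound the outer and inner rounds of iterated_oracle_thorup_d, while approx_mul (above) scales how many candidate centers the D2/uniform/greedy samplers draw, i.e. ceil(k * approx_mul). + // Hypothetical usage sketch (template-parameter order <FT, IT, asn, co> assumed, not confirmed by this diff): ClusteringTraits<float, uint32_t, HARD, EXTRINSIC> ct; ct.k = 25; ct.sampling = THORUP_SAMPLING; +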
unsigned thorup_iter = 4; + unsigned thorup_sub_iter = 10; + unsigned max_jv_rounds = 100; + unsigned max_lloyd_iter = 1000; + unsigned k = -1; + size_t npoints = 0; + + bool compute_full = true; + uint64_t seed = 13; + + const FT *weights = nullptr; + + static_assert(std::is_floating_point_v<FT>, "FT must be floating"); + static_assert(std::is_integral_v<IT>, "IT must be integral and support required index ranges"); + using cost_t = FT; + using index_t = IT; + + // If hard, one cost per point + // If soft, one cost per point per center + // Assignment fractions are generated as-needed (for the case of softmax) + // For this reason, matrix forms are stored as + // row = point, column = center + using costs_t = std::conditional_t, + blz::DynamicMatrix>; + // If hard assignment, then assignments are managed + using assignments_t = assignment_fmt_t; + using centers_t = std::conditional_t, + std::vector> + >; + // Thorup +}; + + +template +ClusteringTraits make_clustering_traits( + size_t npoints, unsigned k, + CenterSamplingType csample=DEFAULT_SAMPLING, OptimizationMethod opt=DEFAULT_OPT, + ApproximateSolutionType approx=DEFAULT_APPROX, const FT *weights=nullptr, uint64_t seed=0, + size_t max_iter=100, double eps=ClusteringTraits::DEFAULT_EPS) { + ClusteringTraits ret; + ret.k = k; + ret.seed = seed; + ret.max_jv_rounds = ret.max_lloyd_iter = max_iter; + ret.eps = eps; + ret.opt = opt; + ret.sampling = csample; + ret.approx = approx; + ret.weights = weights; + ret.npoints = npoints; + return ret; +} + + +namespace detail { + +template +struct ApplicatorAdaptor { + const jsd::DissimilarityApplicator &mat_; + ApplicatorAdaptor(const jsd::DissimilarityApplicator &mat): mat_(mat) {} + decltype(auto) operator()(size_t i, size_t j) const { + return mat_(i, j); + } + auto get_measure() const {return mat_.get_measure();} +}; +template +auto make_aa(const jsd::DissimilarityApplicator &mat) { + return ApplicatorAdaptor(mat); +} + +} // namespace detail + +} // clustering +} // minocore + +#endif /* FGC_CLUSTERING_TRAITS_H__ */ diff --git a/include/minocore/coreset/coreset.h b/include/minocore/coreset/coreset.h index ffe183b4..4031c018 100644 --- a/include/minocore/coreset/coreset.h +++ b/include/minocore/coreset/coreset.h @@ -3,9 +3,11 @@ #define FGC_CORESETS_H__ #include #include +#include #include "alias_sampler/alias_sampler.h" -#include "minocore/util/blaze_adaptor.h" #include "minocore/util/shared.h" +#include "blaze/math/CustomVector.h" +#include "blaze/math/DynamicVector.h" #include #ifdef _OPENMP # include @@ -188,8 +190,8 @@ struct CoresetSampler { using CoresetType = IndexCoreset; std::unique_ptr sampler_; std::unique_ptr probs_; - std::unique_ptr> weights_; - std::unique_ptr> fl_bicriteria_points_; // Used only by FL + std::unique_ptr> weights_; + std::unique_ptr> fl_bicriteria_points_; // Used only by FL std::unique_ptr fl_asn_; size_t np_; size_t k_; @@ -272,7 +274,7 @@ struct CoresetSampler { gzread(fp, &weights_present, sizeof(weights_present)); if(weights_present) { assert(weights_present == 137); - weights_.reset(new blz::DV(n)); + weights_.reset(new blaze::DynamicVector(n)); gzread(fp, weights_->data(), sizeof(FT) * n); } sampler_.reset(new Sampler(probs_.get(), probs_.get() + n, seed_)); @@ -288,7 +290,7 @@ struct CoresetSampler { ::read(fd, &weights_present, sizeof(weights_present)); if(weights_present) { assert(weights_present == 137); - weights_.reset(new blz::DV(n)); + weights_.reset(new blaze::DynamicVector(n)); ::read(fd, weights_->data(), sizeof(FT) * n); } sampler_.reset(new
Sampler(probs_.get(), probs_.get() + n, seed_)); @@ -364,7 +366,7 @@ struct CoresetSampler { if(!k) k = ncenters; k_ = k; if(weights) { - weights_.reset(new blz::DV(np_)); + weights_.reset(new blaze::DynamicVector(np_)); std::memcpy(weights_->data(), weights, sizeof(FT) * np_); } else weights_.release(); if(sens == LUCIC_FAULKNER_KRAUSE_FELDMAN) { @@ -394,7 +396,7 @@ struct CoresetSampler { weights_ ? blaze::dot(*weights_, cv) : blaze::sum(cv); probs_.reset(new FT[np_]); - blz::CustomVector sensitivies(probs_.get(), np_); + blaze::CustomVector sensitivies(probs_.get(), np_); std::vector center_counts(ncenters); OMP_PFOR for(size_t i = 0; i < np_; ++i) { @@ -407,7 +409,7 @@ struct CoresetSampler { sensitivies = cv * (1. / total_cost); } // sensitivities = weights * costs / total_cost - blz::DV ccinv(ncenters); + blaze::DynamicVector ccinv(ncenters); for(unsigned i = 0; i < ncenters; ++i) ccinv[i] = 1. / center_counts[i]; OMP_PFOR @@ -430,7 +432,7 @@ struct CoresetSampler { blaze::CustomVector(fl_asn_.get(), np_) = blaze::CustomVector(asn, np_); if(bicriteria_centers) { - if(!fl_bicriteria_points_) fl_bicriteria_points_.reset(new blz::DV(b_)); + if(!fl_bicriteria_points_) fl_bicriteria_points_.reset(new blaze::DynamicVector(b_)); else fl_bicriteria_points_->resize(b_); *fl_bicriteria_points_ = blaze::CustomVector(bicriteria_centers, b_); } @@ -447,8 +449,8 @@ struct CoresetSampler { probs_[i] = getweight(i) * (costs[i]) * total_cost_inv; } } else { - blaze::CustomVector probv(const_cast(probs_.get()), np_); - probv = blz::ceil(CFT(np_) * total_cost_inv * cv) + 1.; + blaze::CustomVector probv(const_cast(probs_.get()), np_); + probv = blaze::ceil(FT(np_) * total_cost_inv * cv) + 1.; } sampler_.reset(new Sampler(probs_.get(), probs_.get() + np_, seed)); } @@ -460,8 +462,8 @@ struct CoresetSampler { const double alpha = 16 * std::log(k_) + 32., alpha2 = 2. * alpha; //auto center_counts = std::make_unique(ncenters); - blz::DV weight_sums(ncenters, FT(0)); - blz::DV cost_sums(ncenters, FT(0)); + blaze::DynamicVector weight_sums(ncenters, FT(0)); + blaze::DynamicVector cost_sums(ncenters, FT(0)); double total_costs(0.); OMP_PRAGMA("omp parallel for reduction(+:total_costs)") @@ -480,10 +482,10 @@ struct CoresetSampler { cost_sums[asn] += pointcost; total_costs += w * costs[i]; } - double weight_sum = blz::sum(weight_sums); + double weight_sum = blaze::sum(weight_sums); total_costs /= weight_sum; const double tcinv = alpha / total_costs; - blz::DV sens(np_); + blaze::DynamicVector sens(np_); for(size_t i = 0; i < ncenters; ++i) { cost_sums[i] = alpha2 * cost_sums[i] / (weight_sums[i] * total_costs) + 4 * weight_sum / weight_sums[i]; } @@ -535,6 +537,45 @@ struct CoresetSampler { auto getweight(size_t ind) const { return weights_ ? weights_->operator[](ind): static_cast(1.); } + struct importance_compare { + bool operator()(const std::pair lh, const std::pair rh) const { + return lh.second > rh.second; + } + }; + struct importance_queue: public std::priority_queue, + std::vector>, + importance_compare> + { + auto &getc() {return this->c;} + const auto &getc() const {return this->c;} + }; + IndexCoreset top_outliers(const size_t n) { + importance_queue topk; + std::pair cpoint; + for(size_t i = 0; i < size(); ++i) { + FT pi = probs_[i]; + if(topk.size() < n) { + cpoint = {IT(i), pi}; + topk.push(cpoint); + continue; + } + if(topk.top().second < pi) { + topk.pop(); + cpoint = {IT(i), pi}; + topk.push(cpoint); + } + } + auto container = std::move(topk.getc()); + // Put the most expensive items in front. 
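+ // importance_compare orders by descending probability, so after this sort container.front() holds the highest-sensitivity (most expensive) point.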
+ shared::sort(container.begin(), container.end(), importance_compare()); + IndexCoreset ret(n); + const double dn = n; + for(unsigned i = 0; i < n; ++i) { + auto ind = container[i].first; + ret.indices_[i] = ind; + ret.weights_[i] = getweight(ind) / (dn * container[i].second); + } + return ret; + } IndexCoreset sample(const size_t n, uint64_t seed=0, double eps=0.1) { if(unlikely(!sampler_.get())) throw std::runtime_error("Sampler not constructed"); if(seed) sampler_->seed(seed); diff --git a/include/minocore/coreset/gmm.h b/include/minocore/coreset/gmm.h index a460f1e0..54b3deed 100644 --- a/include/minocore/coreset/gmm.h +++ b/include/minocore/coreset/gmm.h @@ -9,9 +9,9 @@ template struct GMM { // Related: Laplacian unsigned k_; - blz::DynamicMatrix mu_; - blz::DynamicMatrix pi_; - blz::DynamicMatrix pm_; // precision matrix + blaze::DynamicMatrix mu_; + blaze::DynamicMatrix pi_; + blaze::DynamicMatrix pm_; // precision matrix std::vector cached_det_; static constexpr double m_pi = 3.14159265358979323846; diff --git a/include/minocore/coreset/kcenter.h b/include/minocore/coreset/kcenter.h new file mode 100644 index 00000000..0557bfba --- /dev/null +++ b/include/minocore/coreset/kcenter.h @@ -0,0 +1,308 @@ +#ifndef FGC_KCENTER_CORESET_H__ +#define FGC_KCENTER_CORESET_H__ +#include "minocore/optim/kcenter.h" + +namespace minocore { +namespace coresets { +namespace outliers { + +/* +// All algorithms in this namespace are from: +// Greedy Strategy Works for k-Center Clustering with Outliers and Coreset Construction +// Hu Ding, Haikuo Yu, Zixiu Wang +*/ + +template<typename FT, typename IT, typename Container=std::vector<std::pair<FT, IT>>, + typename Cmp=std::greater<>> +struct fpq: public std::priority_queue<std::pair<FT, IT>, Container, Cmp> { + // priority queue providing access to the underlying container with getc(), + // a reserve function, and a comparator defaulting to std::greater<> for farthest points. + using super = std::priority_queue<std::pair<FT, IT>, Container, Cmp>; + using value_type = std::pair<FT, IT>; + + IT size_; + fpq(IT size=0): size_(size) {reserve(size);} + fpq(const fpq &o) = default; + void reserve(size_t n) {this->c.reserve(n);} + auto &getc() {return this->c;} + const auto &getc() const {return this->c;} + void update(const fpq &o) { + for(const auto v: o.getc()) + add(v); + } + void add(const value_type v) { + if(this->size() < size_) this->push(v); + else if(v > this->top()) { + this->pop(); + this->push(v); + } + } + void add(FT val, IT index) { + if(this->size() < size_) { + this->push(value_type(val, index)); + } else if(val > this->top().first) { + this->pop(); + this->push(value_type(val, index)); + } + } +}; + + + +template +struct bicriteria_result_t: public std::tuple, IVec, std::vector>, double> { + using super = std::tuple, IVec, std::vector>, double>; + template<typename... Args> + bicriteria_result_t(Args &&...args): super(std::forward<Args>(args)...)
{} + auto ¢ers() {return std::get<0>(*this);} + auto &assignments() {return std::get<1>(*this);} + // alias + auto &labels() {return assignments();} + auto &outliers() {return std::get<2>(*this);} + double outlier_threshold() const {return std::get<3>(*this);} + size_t num_centers() const {return centers().size();} +}; + +/* +// Algorithm 1 from the above DYW paper +// Z = # outliers +// \mu = quality of coreset +// size of coreset: 2z + O((2/\mu)^p k) +// \gamma = z / n +*/ + +template, + typename IT=std::uint32_t, typename RNG, typename Norm=sqrL2Norm> +bicriteria_result_t +kcenter_bicriteria(Iter first, Iter end, RNG &rng, size_t, double eps, + double gamma=0.001, size_t t = 100, double eta=0.01, + const Norm &norm=Norm()) +{ + auto dm = make_index_dm(first, norm); + // Step 1: constants + assert(end > first); + size_t np = end - first; + const size_t z = std::ceil(gamma * np); + std::fprintf(stderr, "z: %zu\n", z); + size_t farthestchunksize = std::ceil((1 + eps) * z), + samplechunksize = std::ceil(std::log(1./eta) / (1 - gamma)); + IVec ret; + IVec labels(np); + ret.reserve(samplechunksize); + std::vector distances(np); + // randomly select 'log(1/eta) / (1 - eps)' vertices from X and add them to E. + while(ret.size() < samplechunksize) { + // Assuming that this is relatively small and we can take bad asymptotic complexity + auto newv = rng() % np; + if(std::find(ret.begin(), ret.end(), newv) == ret.end()) + push_back(ret, newv); + } + assert(flat_hash_set(ret.begin(), ret.end()).size() == ret.size()); + if(samplechunksize > 100) { + std::fprintf(stderr, "Warning: with samplechunksize %zu, it may end up taking a decent amount of time. Consider swapping this in for a hash set.", samplechunksize); + } + if(samplechunksize > farthestchunksize) { + std::fprintf(stderr, "samplecc is %zu (> fcs %zu). changing gcs to scc + z (%zu)\n", samplechunksize, farthestchunksize, samplechunksize + z); + farthestchunksize = samplechunksize + z; + } + fpq pq(farthestchunksize); + const auto fv = ret[0]; + labels[fv] = fv; + distances[fv] = 0.; + // Fill the priority queue from the first set +#ifdef _OPENMP + #pragma omp declare reduction (merge : fpq : omp_out.update(omp_in)) initializer(omp_priv(omp_orig)) + #pragma omp parallel for reduction(merge: pq) +#endif + for(IT i = 0; i < np; ++i) { + double dist = dm(fv, i); + double newdist; + IT label = 0; // This label is an index into the ret vector, rather than the actual index + for(size_t j = 1, e = ret.size(); j < e; ++j) { + if((newdist = dm(i, ret[j])) < dist) { + label = j; + dist = newdist; + } + } + distances[i] = dist; + labels[i] = ret[label]; + pq.add(dist, i); + } + IVec random_samples(samplechunksize); + // modulo without a div/mod instruction, much faster + schism::Schismatic div(farthestchunksize); // pq size + assert(samplechunksize >= 1.); + for(size_t j = 0;j < t;++j) { + //std::fprintf(stderr, "j: %zu/%zu\n", j, t); + // Sample 'samplechunksize' points from pq into random_samples. 
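+ // (The queue currently holds the ceil((1 + eps) * z) farthest points; the indices drawn below are positions in that queue, remapped to dataset ids by the std::transform that follows.)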
+ // Sample them + size_t rsi = 0; + IT *rsp = random_samples.data(); + do { + IT index = div.mod(rng()); + // (Without replacement) + if(std::find(rsp, rsp + rsi, index) == rsp + rsi) + rsp[rsi++] = index; + } while(rsi < samplechunksize); + // random_samples now contains indexes *into pq* + assert(pq.getc().data()); + std::transform(rsp, rsp + rsi, rsp, + [pqi=pq.getc().data()](auto x) { + return pqi[x].second; + }); + for(size_t i = 0; i < rsi; ++i) + assert(rsp[i] < np); + // random_samples now contains indexes *into original dataset* + + // Insert into solution + for(auto it = rsp, e = rsp + rsi; it < e; ++it) { + if(std::find(ret.begin(), ret.end(), *it) != ret.end()) continue; + distances[*it] = 0.; + labels[*it] = *it; + ret.pushBack(*it); + } + + // compare each point against all of the new points + pq.getc().clear(); // empty priority queue + // Fill priority queue +#ifdef _OPENMP + #pragma omp declare reduction (merge : fpq : omp_out.update(omp_in)) initializer(omp_priv(omp_orig)) + #pragma omp parallel for reduction(merge: pq) +#endif + for(size_t i = 0; i < np; ++i) { + double dist = distances[i]; + if(dist == 0.) continue; + double newdist; + IT label = labels[i]; + for(size_t j = 0; j < rsi; ++j) { + if((newdist = dm(i, rsp[j])) < dist) + dist = newdist, label = rsp[j]; + } + distances[i] = dist; + labels[i] = label; + pq.add(dist, i); + } + } + const double minmaxdist = pq.top().first; + bicriteria_result_t bicret; + assert(flat_hash_set(ret.begin(), ret.end()).size() == ret.size()); + bicret.centers() = std::move(ret); + bicret.labels() = std::move(labels); + bicret.outliers() = std::move(pq.getc()); + std::fprintf(stderr, "outliers size: %zu\n", bicret.outliers().size()); + std::get<3>(bicret) = minmaxdist; + return bicret; + // center ids, label assignments for all points besides outliers, outliers, and the distance of the closest excluded point +} // kcenter_bicriteria + +/* +// Algorithm 2 from the above DYW paper +// Z = # outliers +// \gamma = z / n +*/ + +template, + typename IT=std::uint32_t, typename RNG, typename Norm=L2Norm> +std::vector<IT> +kcenter_greedy_2approx_outliers(Iter first, Iter end, RNG &rng, size_t k, double eps, + double gamma=0.001, + const Norm &norm=Norm()) +{ + auto dm = make_index_dm(first, norm); + const size_t np = end - first; + const size_t z = std::ceil(gamma * np); + size_t farthestchunksize = std::ceil((1. + eps) * z); + fpq pq(farthestchunksize); + //pq.reserve(farthestchunksize + 1); + std::vector<IT> ret; + std::vector<FT> distances(np, std::numeric_limits<FT>::max()); + ret.reserve(k); + auto newc = rng() % np; + ret.push_back(newc); + do { + //const auto &newel = first[newc]; + // Fill pq +#ifdef _OPENMP + #pragma omp declare reduction (merge : fpq : omp_out.update(omp_in)) initializer(omp_priv(omp_orig)) + #pragma omp parallel for reduction(merge: pq) +#endif + for(IT i = 0; i < np; ++i) { + double dist = distances[i]; + if(dist == 0.)
continue; + double newdist; + if((newdist = dm(i, newc)) < dist) + dist = newdist; + distances[i] = dist; + pq.add(dist, i); + } + + // Sample point + newc = pq.getc()[rng() % farthestchunksize].second; + assert(newc < np); + ret.push_back(newc); + pq.getc().clear(); + } while(ret.size() < k); + return ret; +}// kcenter_greedy_2approx_outliers + +// Algorithm 3 (coreset construction) +template, + typename IT=std::uint32_t, typename RNG, typename Norm=L2Norm> +coresets::IndexCoreset +kcenter_coreset_outliers(Iter first, Iter end, RNG &rng, size_t k, double eps=0.1, double mu=.5, + double rho=1.5, + double gamma=0.001, double eta=0.01, const Norm &norm=Norm()) { + // rho is 'D' for R^D (http://www.wisdom.weizmann.ac.il/~robi/teaching/2014b-SeminarGeometryAlgorithms/lecture1.pdf) + // in Euclidean space, as worst-case, but usually better in real data with structure. + assert(mu > 0. && mu <= 1.); + const size_t np = end - first; + size_t L = std::ceil(std::pow(2. / mu, rho) * k); + size_t nrounds = std::ceil((L + std::sqrt(L)) / (1. - eta)); + auto bic = kcenter_bicriteria(first, end, rng, k, eps, + gamma, nrounds, eta, norm); + double rtilde = bic.outlier_threshold(); + std::fprintf(stderr, "outlier threshold: %f\n", rtilde); + auto ¢ers = bic.centers(); + auto &labels = bic.labels(); + auto &outliers = bic.outliers(); +#ifndef NDEBUG + for(const auto c: centers) + assert(c < np); + for(const auto label: labels) + assert(labels[label] == label); +#endif + //std::vector counts(centers.size()); + coresets::flat_hash_map counts; + counts.reserve(centers.size()); + size_t i = 0; + SK_UNROLL_8 + do ++counts[labels[i++]]; while(i < np); + coresets::IndexCoreset ret(centers.size() + outliers.size()); + std::fprintf(stderr, "ret size: %zu. centers size: %zu. counts size %zu. 
outliers size: %zu\n", ret.size(), centers.size(), counts.size(), outliers.size()); + for(i = 0; i < outliers.size(); ++i) { + assert(outliers[i].second < np); + ret.indices_[i] = outliers[i].second; + ret.weights_[i] = 1.; + } + for(const auto &pair: counts) { + assert(pair.first < np); + ret.weights_[i] = pair.second; + ret.indices_[i] = pair.first; + ++i; + } + assert(i == ret.size()); + for(size_t i = 0; i < ret.indices_.size(); ++i) { + assert(ret.indices_[i] < np); + } + return ret; +} +} // namespace outliers +using outliers::kcenter_coreset_outliers; +using outliers::kcenter_greedy_2approx_outliers; +} // namespace coresets +using coresets::outliers::kcenter_greedy_2approx_outliers; + +} // namespace minocore + +#endif /* FGC_KCENTER_CORESET_H__ */ + diff --git a/include/minocore/coreset/matrix_coreset.h b/include/minocore/coreset/matrix_coreset.h index 05d5040b..f023033e 100644 --- a/include/minocore/coreset/matrix_coreset.h +++ b/include/minocore/coreset/matrix_coreset.h @@ -7,7 +7,7 @@ namespace coresets { template struct MatrixCoreset { MatrixType mat_; - blz::DynamicVector weights_; + blaze::DynamicVector weights_; bool rowwise_; MatrixCoreset &merge(const MatrixCoreset &o) { if(rowwise_ != o.rowwise_) throw std::runtime_error("Can't merge coresets of differing rowwiseness"); diff --git a/include/minocore/dist.h b/include/minocore/dist.h index dd68ee93..c402983a 100644 --- a/include/minocore/dist.h +++ b/include/minocore/dist.h @@ -2,4 +2,5 @@ #define FGC_DISTANCE_HEADERS_ #include #include +#include #endif diff --git a/include/minocore/dist/applicator.h b/include/minocore/dist/applicator.h index 1c5a5203..64bc332c 100644 --- a/include/minocore/dist/applicator.h +++ b/include/minocore/dist/applicator.h @@ -1,5 +1,6 @@ #ifndef FGC_JSD_H__ #define FGC_JSD_H__ +#include "minocore/util/exception.h" #include "minocore/coreset.h" #include "minocore/dist/distance.h" #include "distmat/distmat.h" @@ -16,11 +17,13 @@ namespace jsd { using namespace blz; using namespace blz::distance; + template -class ProbDivApplicator { +class DissimilarityApplicator { //using opposite_type = typename base_type::OppositeType; MatrixType &data_; using VecT = blaze::DynamicVector ? 
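+ // (The cache vector's transpose flag is chosen to match the matrix's storage order.)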
blaze::rowVector: blaze::columnVector>; + using matrix_type = MatrixType; VecT row_sums_; std::unique_ptr logdata_; std::unique_ptr sqrdata_; @@ -34,20 +37,22 @@ class ProbDivApplicator { public: using FT = typename MatrixType::ElementType; using MT = MatrixType; - using This = ProbDivApplicator; - using ConstThis = const ProbDivApplicator; + using This = DissimilarityApplicator; + using ConstThis = const DissimilarityApplicator; - const ProbDivType measure_; + const DissimilarityMeasure measure_; const MatrixType &data() const {return data_;} + const VecT &row_sums() const {return row_sums_;} size_t size() const {return data_.rows();} template> - ProbDivApplicator(MatrixType &ref, - ProbDivType measure=JSM, + DissimilarityApplicator(MatrixType &ref, + DissimilarityMeasure measure=JSM, Prior prior=NONE, const PriorContainer *c=nullptr): data_(ref), logdata_(nullptr), measure_(measure) { prep(prior, c); + MINOCORE_REQUIRE(dist::detail::is_valid_measure(measure_), "measure_ must be valid"); } /* * Sets distance matrix, under measure_ (if not provided) @@ -56,13 +61,13 @@ class ProbDivApplicator { template void set_distance_matrix(MatType &m, bool symmetrize=false) const {set_distance_matrix(m, measure_, symmetrize);} - template + template void set_distance_matrix(MatType &m, bool symmetrize=false) const { using blaze::sqrt; const size_t nr = m.rows(); assert(nr == m.columns()); assert(nr == data_.rows()); - static constexpr ProbDivType actual_measure = + static constexpr DissimilarityMeasure actual_measure = measure == JSM ? JSD : measure == COSINE_DISTANCE ? COSINE_SIMILARITY : measure == PROBABILITY_COSINE_DISTANCE ? PROBABILITY_COSINE_SIMILARITY @@ -81,19 +86,19 @@ class ProbDivApplicator { } if constexpr(measure == JSM) { if constexpr(blaze::IsDenseMatrix_v || blaze::IsSparseMatrix_v) { - m = blz::sqrt(m); + m = blaze::sqrt(m); } else if constexpr(dm::is_distance_matrix_v) { blaze::CustomVector cv(const_cast(m.data()), m.size()); - cv = blz::sqrt(cv); + cv = blaze::sqrt(cv); } else { std::transform(m.begin(), m.end(), m.begin(), [](auto x) {return std::sqrt(x);}); } } else if constexpr(measure == COSINE_DISTANCE || measure == PROBABILITY_COSINE_DISTANCE) { if constexpr(blaze::IsDenseMatrix_v || blaze::IsSparseMatrix_v) { - m = blz::acos(m) * PI_INV; + m = blaze::acos(m) * PI_INV; } else if constexpr(dm::is_distance_matrix_v) { blaze::CustomVector cv(const_cast(m.data()), m.size()); - cv = blz::acos(cv) * PI_INV; + cv = blaze::acos(cv) * PI_INV; } else { std::transform(m.begin(), m.end(), m.begin(), [](auto x) {return std::acos(x) * PI_INV;}); } @@ -128,7 +133,7 @@ class ProbDivApplicator { } } // set_distance_matrix template - void set_distance_matrix(MatType &m, ProbDivType measure, bool symmetrize=false) const { + void set_distance_matrix(MatType &m, DissimilarityMeasure measure, bool symmetrize=false) const { switch(measure) { case TOTAL_VARIATION_DISTANCE: set_distance_matrix(m, symmetrize); break; case L1: set_distance_matrix(m, symmetrize); break; @@ -156,7 +161,8 @@ class ProbDivApplicator { case COSINE_SIMILARITY: set_distance_matrix(m, symmetrize); break; case PROBABILITY_COSINE_SIMILARITY: set_distance_matrix(m, symmetrize); break; - default: throw std::invalid_argument(std::string("unknown dissimilarity measure: ") + std::to_string(int(measure)) + blz::detail::prob2str(measure)); + case ORACLE_METRIC: case ORACLE_PSEUDOMETRIC: std::fprintf(stderr, "These are placeholders and should not be called."); throw std::invalid_argument("Placeholders"); + default: throw 
std::invalid_argument(std::string("unknown dissimilarity measure: ") + std::to_string(int(measure)) + dist::detail::prob2str(measure)); } } template @@ -164,42 +170,59 @@ class ProbDivApplicator { return make_distance_matrix(measure_, symmetrize); } template - blaze::DynamicMatrix make_distance_matrix(ProbDivType measure, bool symmetrize=false) const { + blaze::DynamicMatrix make_distance_matrix(DissimilarityMeasure measure, bool symmetrize=false) const { blaze::DynamicMatrix ret(data_.rows(), data_.rows()); set_distance_matrix(ret, measure, symmetrize); return ret; } auto cosine_similarity(size_t i, size_t j) const { - return blz::dot(weighted_row(i), weighted_row(j)) * l2norm_cache_->operator[](i) * l2norm_cache_->operator[](j); + return blaze::dot(weighted_row(i), weighted_row(j)) * l2norm_cache_->operator[](i) * l2norm_cache_->operator[](j); + } + template>> + auto cosine_similarity(size_t j, const OT &o) const { + return blaze::dot(o, weighted_row(j)) / blaze::l2Norm(o) * l2norm_cache_->operator[](j); + } + template>> + auto cosine_similarity(const OT &o, size_t j) const { + return blaze::dot(o, weighted_row(j)) / blaze::l2Norm(o) * l2norm_cache_->operator[](j); } auto pcosine_similarity(size_t i, size_t j) const { - return blz::dot(row(i), row(j)) * pl2norm_cache_->operator[](i) * pl2norm_cache_->operator[](j); + return blaze::dot(row(i), row(j)) * pl2norm_cache_->operator[](i) * pl2norm_cache_->operator[](j); + } + template>> + auto pcosine_similarity(size_t j, const OT &o) const { + return blaze::dot(o, row(j)) / blaze::l2Norm(o) * pl2norm_cache_->operator[](j); + } + template>> + auto pcosine_similarity(const OT &o, size_t j) const { + return blaze::dot(o, row(j)) / blaze::l2Norm(o) * pl2norm_cache_->operator[](j); } static constexpr FT PI_INV = 1. 
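+ // PI_INV = 1/pi rescales acos() outputs so cosine distances land in [0, 1].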
/ 3.14159265358979323846264338327950288; - auto cosine_distance(size_t i, size_t j) const { - return std::acos(cosine_similarity(i, j)) * PI_INV; + template + auto cosine_distance(Args &&...args) const { + return std::acos(cosine_similarity(std::forward(args)...)) * PI_INV; } - auto pcosine_distance(size_t i, size_t j) const { - return std::acos(cosine_similarity(i, j)) * PI_INV; + template + auto pcosine_distance(Args &&...args) const { + return std::acos(pcosine_similarity(std::forward(args)...)) * PI_INV; } auto dotproduct_distance(size_t i, size_t j) const { - return blz::dot(weighted_row(i), weighted_row(j)) * l2norm_cache_->operator[](i) * l2norm_cache_->operator[](j); + return blaze::dot(weighted_row(i), weighted_row(j)) * l2norm_cache_->operator[](i) * l2norm_cache_->operator[](j); } auto pdotproduct_distance(size_t i, size_t j) const { - return blz::dot(row(i), row(j)) * pl2norm_cache_->operator[](i) * pl2norm_cache_->operator[](j); + return blaze::dot(row(i), row(j)) * pl2norm_cache_->operator[](i) * pl2norm_cache_->operator[](j); } // Accessors decltype(auto) weighted_row(size_t ind) const { - return blz::row(data_, ind BLAZE_CHECK_DEBUG) * row_sums_[ind]; + return blaze::row(data_, ind BLAZE_CHECK_DEBUG) * row_sums_[ind]; } - auto row(size_t ind) const {return blz::row(data_, ind BLAZE_CHECK_DEBUG);} - auto logrow(size_t ind) const {return blz::row(*logdata_, ind BLAZE_CHECK_DEBUG);} - auto sqrtrow(size_t ind) const {return blz::row(*sqrdata_, ind BLAZE_CHECK_DEBUG);} - + auto row(size_t ind) const {return blaze::row(data_, ind BLAZE_CHECK_DEBUG);} + auto logrow(size_t ind) const {return blaze::row(*logdata_, ind BLAZE_CHECK_DEBUG);} + auto sqrtrow(size_t ind) const {return blaze::row(*sqrdata_, ind BLAZE_CHECK_DEBUG);} /* * Distances @@ -207,7 +230,152 @@ class ProbDivApplicator { INLINE auto operator()(size_t i, size_t j) const { return this->operator()(i, j, measure_); } - template + template && !std::is_integral_v>> + INLINE FT operator()(size_t i, OT &o, CacheT *cp=static_cast(nullptr)) const { + return this->call(i, o, cp); + } + template && !std::is_integral_v>> + INLINE FT operator()(OT &o, size_t i, CacheT *cp=static_cast(nullptr)) const { + return this->call(o, i, cp); + } + template && !std::is_integral_v>> + INLINE FT call(OT &o, size_t i, CacheT *cp=static_cast(nullptr)) const { + FT ret; + if constexpr(constexpr_measure == TOTAL_VARIATION_DISTANCE) { + ret = discrete_total_variation_distance(o, row(i)); + } else if constexpr(constexpr_measure == L1) { + ret = l1Norm(weighted_row(i) - o); + } else if constexpr(constexpr_measure == L2) { + ret = l2Norm(weighted_row(i) - o); + } else if constexpr(constexpr_measure == SQRL2) { + ret = blaze::sqrNorm(weighted_row(i) - o); + } else if constexpr(constexpr_measure == JSD) { + if(cp) { + ret = jsd(i, o, *cp); + } else ret = jsd(i, o); + } else if constexpr(constexpr_measure == JSM) { + if(cp) { + ret = jsm(i, o, *cp); + } else ret = jsm(i, o); + } else if constexpr(constexpr_measure == REVERSE_MKL) { + ret = cp ? mkl(i, o, *cp): mkl(i, o); + } else if constexpr(constexpr_measure == MKL) { + ret = cp ? mkl(o, i, *cp): mkl(o, i); + } else if constexpr(constexpr_measure == EMD) { + ret = p_wasserstein(row(i), o); + } else if constexpr(constexpr_measure == WEMD) { + ret = p_wasserstein(weighted_row(i), o); + } else if constexpr(constexpr_measure == REVERSE_POISSON) { + ret = cp ? pkl(i, o, *cp): pkl(i, o); + } else if constexpr(constexpr_measure == POISSON) { + ret = cp ? 
pkl(o, i, *cp): pkl(o, i); + } else if constexpr(constexpr_measure == HELLINGER) { + ret = cp ? blaze::sqrNorm(sqrtrow(i) - *cp) + : blaze::sqrNorm(sqrtrow(i) - blaze::sqrt(o)); + } else if constexpr(constexpr_measure == BHATTACHARYYA_METRIC) { + ret = bhattacharyya_metric(i, o); + } else if constexpr(constexpr_measure == BHATTACHARYYA_DISTANCE) { + ret = bhattacharyya_distance(i, o); + } else if constexpr(constexpr_measure == LLR) { + ret = cp ? llr(i, o, *cp): llr(i, o); + } else if constexpr(constexpr_measure == UWLLR) { + ret = cp ? uwllr(i, o, *cp): uwllr(i, o); + } else if constexpr(constexpr_measure == OLLR) { + throw 1; // Not implemented + } else if constexpr(constexpr_measure == ITAKURA_SAITO) { + ret = itakura_saito(o, i); + } else if constexpr(constexpr_measure == REVERSE_ITAKURA_SAITO) { + ret = itakura_saito(i, o); + } else if constexpr(constexpr_measure == COSINE_DISTANCE) { + ret = cosine_distance(i, o); + } else if constexpr(constexpr_measure == PROBABILITY_COSINE_DISTANCE) { + ret = pcosine_distance(i, o); + } else if constexpr(constexpr_measure == COSINE_SIMILARITY) { + ret = cosine_similarity(i, o); + } else if constexpr(constexpr_measure == PROBABILITY_COSINE_SIMILARITY) { + ret = pcosine_similarity(i, o); + } else { + throw std::runtime_error(std::string("Unknown measure: ") + std::to_string(int(constexpr_measure))); + } + return ret; + } + template && !std::is_integral_v>> + INLINE FT call(size_t i, OT &o, [[maybe_unused]] CacheT *cp=static_cast(nullptr)) const { + FT ret; + if constexpr(constexpr_measure == TOTAL_VARIATION_DISTANCE) { + ret = discrete_total_variation_distance(row(i), o); + } else if constexpr(constexpr_measure == L1) { + ret = l1Norm(weighted_row(i) - o); + } else if constexpr(constexpr_measure == L2) { + ret = l2Norm(weighted_row(i) - o); + } else if constexpr(constexpr_measure == SQRL2) { + assert(i < this->data().rows()); + ret = blaze::sqrNorm(weighted_row(i) - o); + } else if constexpr(constexpr_measure == JSD) { + if(cp) { + ret = jsd(i, o, *cp); + } else ret = jsd(i, o); + } else if constexpr(constexpr_measure == JSM) { + if(cp) { + ret = jsm(i, o, *cp); + } else ret = jsm(i, o); + } else if constexpr(constexpr_measure == REVERSE_MKL) { + if(cp) { + ret = mkl(o, i, *cp); + } else ret = mkl(o, i); + } else if constexpr(constexpr_measure == MKL) { + if(cp) { + ret = mkl(i, o, *cp); + } else ret = mkl(i, o); + } else if constexpr(constexpr_measure == EMD) { + ret = p_wasserstein(row(i), o); + } else if constexpr(constexpr_measure == WEMD) { + ret = p_wasserstein(weighted_row(i), o); + } else if constexpr(constexpr_measure == REVERSE_POISSON) { + ret = cp ? pkl(o, i, *cp): pkl(o, i); + } else if constexpr(constexpr_measure == POISSON) { + ret = cp ? pkl(i, o, *cp): pkl(i, o); + } else if constexpr(constexpr_measure == HELLINGER) { + if(cp) { + ret = blaze::sqrNorm(sqrtrow(i) - *cp); + } else { + ret = blaze::sqrNorm(sqrtrow(i) - blaze::sqrt(o)); + } + } else if constexpr(constexpr_measure == BHATTACHARYYA_METRIC) { + ret = cp ? bhattacharyya_metric(i, o, *cp) + : bhattacharyya_metric(i, o); + } else if constexpr(constexpr_measure == BHATTACHARYYA_DISTANCE) { + ret = cp ? bhattacharyya_distance(i, o, *cp) + : bhattacharyya_distance(i, o); + } else if constexpr(constexpr_measure == LLR) { + ret = cp ? llr(i, o, *cp): llr(i, o); + } else if constexpr(constexpr_measure == UWLLR) { + ret = cp ? uwllr(i, o, *cp): uwllr(i, o); + } else if constexpr(constexpr_measure == OLLR) { + ret = cp ? 
llr(i, o, *cp): llr(i, o);
+            std::cerr << "Note: computing LLR, not OLLR, for this case\n";
+        } else if constexpr(constexpr_measure == ITAKURA_SAITO) {
+            ret = itakura_saito(i, o);
+        } else if constexpr(constexpr_measure == REVERSE_ITAKURA_SAITO) {
+            ret = itakura_saito(o, i);
+        } else if constexpr(constexpr_measure == COSINE_DISTANCE) {
+            ret = cosine_distance(i, o);
+        } else if constexpr(constexpr_measure == PROBABILITY_COSINE_DISTANCE) {
+            ret = pcosine_distance(i, o);
+        } else if constexpr(constexpr_measure == COSINE_SIMILARITY) {
+            ret = cosine_similarity(i, o);
+        } else if constexpr(constexpr_measure == PROBABILITY_COSINE_SIMILARITY) {
+            ret = pcosine_similarity(i, o);
+        } else {
+            throw std::runtime_error(std::string("Unknown measure: ") + std::to_string(int(constexpr_measure)));
+        }
+        return ret;
+    }
    template<DissimilarityMeasure constexpr_measure> INLINE FT call(size_t i, size_t j) const { FT ret; if constexpr(constexpr_measure == TOTAL_VARIATION_DISTANCE) {
@@ -263,7 +431,104 @@ class ProbDivApplicator {
        } return ret; }
-    INLINE FT operator()(size_t i, size_t j, ProbDivType measure) const {
+    template<typename OT, typename CacheT=VecT, typename=std::enable_if_t<!std::is_same_v<OT, size_t> > >
+    INLINE FT operator()(const OT &o, size_t i, const CacheT *cache=static_cast<const CacheT *>(nullptr)) const noexcept {
+        return this->operator()(o, i, cache, measure_);
+    }
+    template<typename OT, typename CacheT=VecT, typename=std::enable_if_t<!std::is_same_v<OT, size_t> > >
+    INLINE FT operator()(const OT &o, size_t i, const CacheT *cache, DissimilarityMeasure measure) const noexcept {
+#ifndef NDEBUG
+        if(unlikely(i >= data_.rows())) {
+            std::cerr << (std::string("Invalid rows selection: ") + std::to_string(i) + '\n');
+            std::exit(1);
+        }
+#endif
+        if(unlikely(measure == static_cast<DissimilarityMeasure>(-1))) {
+            std::cerr << "Unset measure\n";
+            std::exit(1);
+        }
+        //PRETTY_SAY << "Performing with " << (void *)&o << " and row " << i << '\n';
+        FT ret;
+        switch(measure) {
+            case TOTAL_VARIATION_DISTANCE: ret = call<TOTAL_VARIATION_DISTANCE>(o, i); break;
+            case L1: ret = call<L1>(o, i); break;
+            case L2: ret = call<L2>(o, i); break;
+            case SQRL2: ret = call<SQRL2>(o, i); break;
+            case JSD: ret = call<JSD>(o, i); break;
+            case JSM: ret = call<JSM>(o, i); break;
+            case REVERSE_MKL: ret = call<REVERSE_MKL>(o, i, cache); break;
+            case MKL: ret = call<MKL>(o, i, cache); break;
+            case EMD: ret = call<EMD>(o, i); break;
+            case WEMD: ret = call<WEMD>(o, i); break;
+            case REVERSE_POISSON: ret = call<REVERSE_POISSON>(o, i, cache); break;
+            case POISSON: ret = call<POISSON>(o, i, cache); break;
+            case HELLINGER: ret = call<HELLINGER>(o, i, cache); break;
+            case BHATTACHARYYA_METRIC: ret = call<BHATTACHARYYA_METRIC>(o, i); break;
+            case BHATTACHARYYA_DISTANCE: ret = call<BHATTACHARYYA_DISTANCE>(o, i); break;
+            case LLR: ret = call<LLR>(o, i, cache); break;
+            case UWLLR: ret = call<UWLLR>(o, i, cache); break;
+            case OLLR: ret = call<OLLR>(o, i, cache); break;
+            case ITAKURA_SAITO: ret = call<ITAKURA_SAITO>(o, i, cache); break;
+            case REVERSE_ITAKURA_SAITO: ret = call<REVERSE_ITAKURA_SAITO>(o, i, cache); break;
+            case COSINE_DISTANCE: ret = call<COSINE_DISTANCE>(o, i); break;
+            case PROBABILITY_COSINE_DISTANCE: ret = call<PROBABILITY_COSINE_DISTANCE>(o, i); break;
+            case COSINE_SIMILARITY: ret = call<COSINE_SIMILARITY>(o, i); break;
+            case PROBABILITY_COSINE_SIMILARITY: ret = call<PROBABILITY_COSINE_SIMILARITY>(o, i); break;
+            case ORACLE_METRIC: case ORACLE_PSEUDOMETRIC: std::fprintf(stderr, "These are placeholders and should not be called."); return 0.;
+            default: __builtin_unreachable();
+        }
+        return ret;
+    }
+    template<typename OT, typename CacheT=VecT, typename=std::enable_if_t<!std::is_same_v<OT, size_t> > >
+    INLINE FT operator()(size_t i, const OT &o, const CacheT *cache=static_cast<const CacheT *>(nullptr)) const {
+        return this->operator()(i, o, cache, measure_);
+    }
+    template<typename OT, typename CacheT=VecT, typename=std::enable_if_t<!std::is_same_v<OT, size_t> > >
+    INLINE FT operator()(size_t i, const OT &o, const CacheT *cache, DissimilarityMeasure measure) const noexcept {
+        if(unlikely(i >= data_.rows())) {
+            std::cerr << (std::string("Invalid rows selection: ") + std::to_string(i) + '\n');
+            std::exit(1);
+        }
+        if(unlikely(measure == static_cast<DissimilarityMeasure>(-1))) {
+            std::cerr << "Unset measure\n";
+            std::exit(1);
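+            // measure == -1 denotes an unset measure; valid values are enforced at construction via is_valid_measure.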
+        }
+#if 0
+        PRETTY_SAY << "Computing i vs outside o with cache and " << detail::prob2str(measure) << "\n";
+        PRETTY_SAY << "Performing with "
+                   << " row " << i << " and "
+                   << (void *)&o
+                   << '\n';
+#endif
+        FT ret;
+        switch(measure) {
+            case TOTAL_VARIATION_DISTANCE: ret = call<TOTAL_VARIATION_DISTANCE>(i, o); break;
+            case L1: ret = call<L1>(i, o); break;
+            case L2: ret = call<L2>(i, o); break;
+            case SQRL2: ret = call<SQRL2>(i, o); break;
+            case JSD: ret = call<JSD>(i, o); break;
+            case JSM: ret = call<JSM>(i, o); break;
+            case REVERSE_MKL: ret = call<REVERSE_MKL>(i, o, cache); break;
+            case MKL: ret = call<MKL>(i, o, cache); break;
+            case EMD: ret = call<EMD>(i, o); break;
+            case WEMD: ret = call<WEMD>(i, o); break;
+            case REVERSE_POISSON: ret = call<REVERSE_POISSON>(i, o, cache); break;
+            case POISSON: ret = call<POISSON>(i, o, cache); break;
+            case HELLINGER: ret = call<HELLINGER>(i, o, cache); break;
+            case BHATTACHARYYA_METRIC: ret = call<BHATTACHARYYA_METRIC>(i, o); break;
+            case BHATTACHARYYA_DISTANCE: ret = call<BHATTACHARYYA_DISTANCE>(i, o); break;
+            case LLR: ret = call<LLR>(i, o, cache); break;
+            case UWLLR: ret = call<UWLLR>(i, o, cache); break;
+            case OLLR: ret = call<OLLR>(i, o, cache); break;
+            case ITAKURA_SAITO: ret = call<ITAKURA_SAITO>(i, o, cache); break;
+            case REVERSE_ITAKURA_SAITO: ret = call<REVERSE_ITAKURA_SAITO>(i, o, cache); break;
+            case COSINE_DISTANCE: ret = call<COSINE_DISTANCE>(i, o); break;
+            case PROBABILITY_COSINE_DISTANCE: ret = call<PROBABILITY_COSINE_DISTANCE>(i, o); break;
+            case COSINE_SIMILARITY: ret = call<COSINE_SIMILARITY>(i, o); break;
+            case PROBABILITY_COSINE_SIMILARITY: ret = call<PROBABILITY_COSINE_SIMILARITY>(i, o); break;
+            case ORACLE_METRIC: case ORACLE_PSEUDOMETRIC: std::fprintf(stderr, "These are placeholders and should not be called."); return 0.;
+            default: __builtin_unreachable();
+        }
+        return ret;
+    }
+    INLINE FT operator()(size_t i, size_t j, DissimilarityMeasure measure) const noexcept {
        if(unlikely(i >= data_.rows() || j >= data_.rows())) { std::cerr << (std::string("Invalid rows selection: ") + std::to_string(i) + ", " + std::to_string(j) + '\n'); std::exit(1);
@@ -293,12 +558,13 @@ class ProbDivApplicator {
            case PROBABILITY_COSINE_DISTANCE: ret = call<PROBABILITY_COSINE_DISTANCE>(i, j); break;
            case COSINE_SIMILARITY: ret = call<COSINE_SIMILARITY>(i, j); break;
            case PROBABILITY_COSINE_SIMILARITY: ret = call<PROBABILITY_COSINE_SIMILARITY>(i, j); break;
+            case ORACLE_METRIC: case ORACLE_PSEUDOMETRIC: std::fprintf(stderr, "These are placeholders and should not be called."); return 0.;
            default: __builtin_unreachable(); } return ret; }
    template<typename MatType>
-    void operator()(MatType &mat, ProbDivType measure, bool symmetrize=false) {
+    void operator()(MatType &mat, DissimilarityMeasure measure, bool symmetrize=false) {
        set_distance_matrix(mat, measure, symmetrize); }
    template
@@ -319,17 +585,51 @@ class ProbDivApplicator {
            throw std::runtime_error(buf); }
            ret = -std::numeric_limits<FT>::max();
-            throw shared::TODOError("TODO: complete special fast version of this supporting priors at no runtime cost.");
+            throw TODOError("TODO: complete special fast version of this supporting priors at no runtime cost.");
        } else { auto div = row(i) / row(j); ret = blaze::sum(div - blaze::log(div)) - row(i).size(); } return ret; }
+    template<typename OT, typename=std::enable_if_t<!std::is_integral_v<OT>> >
+    auto itakura_saito(size_t i, const OT &o) const {
+        FT ret;
+        if constexpr(IS_SPARSE) {
+            if(!prior_data_) {
+                char
buf[128]; + std::sprintf(buf, "warning: Itakura-Saito cannot be computed to sparse vectors/matrices at %zu/%p\n", i, (void *)&o); + throw std::runtime_error(buf); + } + ret = -std::numeric_limits::max(); + throw TODOError("TODO: complete special fast version of this supporting priors at no runtime cost."); + } else { + auto div = o / row(i); + ret = blaze::sum(div - blaze::log(div)) - o.size(); + } + return ret; + } auto hellinger(size_t i, size_t j) const { return sqrdata_ ? blaze::sqrNorm(sqrtrow(i) - sqrtrow(j)) - : blaze::sqrNorm(blz::sqrt(row(i)) - blz::sqrt(row(j))); + : blaze::sqrNorm(blaze::sqrt(row(i)) - blaze::sqrt(row(j))); } FT jsd(size_t i, size_t j) const { if(!IsSparseMatrix_v || !prior_data_) { @@ -338,98 +638,250 @@ class ProbDivApplicator { FT ret; auto ri = row(i), rj = row(j); //constexpr FT logp5 = -0.693147180559945; // std::log(0.5) - auto s = ri + rj; - ret = jsd_cache_->operator[](i) + jsd_cache_->operator[](j) - blz::dot(s, blaze::neginf2zero(blaze::log(s * 0.5))); -#ifndef NDEBUG - static constexpr typename MatrixType::ElementType threshold - = std::is_same_v - ? 0.: -1e-5; - assert(ret >= threshold || !std::fprintf(stderr, "ret: %g (numerical stability issues)\n", ret)); -#endif - return std::max(ret, static_cast(0.)); - } else { - throw shared::TODOError("TODO: complete special fast version of this supporting priors at no runtime cost."); - return FT(0); + auto s = evaluate(ri + rj); + ret = get_jsdcache(i) + get_jsdcache(j) - blaze::dot(s, blaze::neginf2zero(blaze::log(s * 0.5))); + return std::max(.5 * ret, static_cast(0.)); + } else if constexpr(IS_SPARSE) { + FT ret = get_jsdcache(i) + get_jsdcache(j); + const size_t dim = row(i).size(); + auto lhr = row(i), rhr = row(j); + auto lhit = lhr.begin(), rhit = rhr.begin(); + const auto lhe = lhr.end(), rhe = rhr.end(); + auto lhrsi = 1. / row_sums_[i]; + auto rhrsi = 1. / row_sums_[j]; + if(prior_data_->size() == 1) { + const auto lhrsimul = lhrsi * prior_data_->operator[](0); + const auto rhrsimul = rhrsi * prior_data_->operator[](0); + if(lhit == lhe || rhit == rhe) return static_cast(0); + auto dox = [&](auto x) {ret -= x * std::log(.5 * x);}; + while(lhit != lhe && rhit != rhe) { + if(lhit->index() == rhit->index()) { + dox(lhit->value() + rhit->value()); + ++lhit; ++rhit; + } else if(lhit->index() < rhit->index()) { + dox(lhit->value() + rhrsimul); + ++lhit; + } else { + dox(rhit->value() + lhrsimul); + ++rhit; + } + } + //std::fprintf(stderr, "Finished loop. lhit is end? %d rhit is ind? %d\n", lhit == lhe, rhit == rhe); + for(;lhit != lhe;++lhit) + dox(lhit->value() + rhrsimul); + for(;rhit != rhe;++rhit) + dox(rhit->value() + lhrsimul); + //std::fprintf(stderr, "Handled all lhit\n"); + const FT sump = (lhrsimul + rhrsimul); + ret -= blz::number_shared_zeros(lhr, rhr) * (sump * std::log(.5 * (sump))); + } else { + std::fprintf(stderr, "Fanciest\n"); + // This could later be accelerated, but that kind of caching is more complicated. + auto &pd = *prior_data_; + auto dox = [&](auto x, auto y) {ret -= (x + y) * std::log(.5 * (x + y));}; + auto doxy = [&](auto x) {ret -= x * std::log(.5 * x);}; + size_t first_index = lhit != lhe ? (rhit != rhe ? std::min(lhit->index(), rhit->index()): lhit->index()): rhit != rhe ? 
rhit->index(): dim; + for(size_t i = 0; i < first_index; ++i) + doxy(pd[i] * (lhrsi + rhrsi)); + while(lhit != lhe && rhit != rhe) { + if(lhit->index() == rhit->index()) { + dox(lhit->value(), rhit->value()); + if(++lhit == lhe) break; + if(++rhit == rhe) break; + } else if(lhit->index() < rhit->index()) { + dox(lhit->value(), pd[lhit->index()] * rhrsi); + for(size_t i = lhit->index() + 1; i < rhit->index(); ++i) + dox(pd[i] * lhrsi, pd[i] * rhrsi); + if(++lhit == lhe) break; + } else { + dox(rhit->value(), pd[rhit->index()] * lhrsi); + for(size_t i = rhit->index() + 1; i < lhit->index(); ++i) + doxy(pd[i] * (lhrsi + rhrsi)); + if(++rhit == rhe) break; + } + } + // Remaining entries + while(lhit != lhe) { + dox(lhit->value(), pd[lhit->index()] * rhrsi); + size_t i = lhit->index() + 1; + size_t nextind = (++lhit == lhe) ? dim: lhit->index(); + for(; i < nextind; ++i) + doxy(pd[i] * (lhrsi + rhrsi)); + } + while(rhit != rhe) { + dox(rhit->value(), lhrsi * pd[rhit->index()]); + size_t i = rhit->index() + 1; + size_t nextind = (++rhit == rhe) ? dim: rhit->index(); + for(; i < nextind; ++i) + doxy(pd[i] * (lhrsi + rhrsi)); + } + } + return std::max(ret * .5, static_cast(0.)); } + __builtin_unreachable(); } template>, typename OT2> auto jsd(size_t i, const OT &o, const OT2 &olog) const { - if(IS_SPARSE && prior_data_) throw shared::TODOError("TODO: complete special fast version of this supporting priors at no runtime cost."); + if(IS_SPARSE && blaze::IsSparseVector_v && prior_data_) throw TODOError("TODO: complete special fast version of this supporting priors at no runtime cost."); auto mnlog = evaluate(log(0.5 * (row(i) + o))); - return (blz::dot(row(i), logrow(i) - mnlog) + blz::dot(o, olog - mnlog)); + return (blaze::dot(row(i), logrow(i) - mnlog) + blaze::dot(o, olog - mnlog)); } template>> auto jsd(size_t i, const OT &o) const { - if(IS_SPARSE && prior_data_) throw shared::TODOError("TODO: complete special fast version of this supporting priors at no runtime cost."); - auto olog = evaluate(blaze::neginf2zero(blz::log(o))); + if(IS_SPARSE && blaze::IsSparseVector_v && prior_data_) throw TODOError("TODO: complete special fast version of this supporting priors at no runtime cost."); + auto olog = evaluate(blaze::neginf2zero(blaze::log(o))); return jsd(i, o, olog); } auto mkl(size_t i, size_t j) const { - // Multinomial KL - if(IS_SPARSE && prior_data_) throw shared::TODOError("TODO: complete special fast version of this supporting priors at no runtime cost."); - return get_jsdcache(i) - blz::dot(row(i), logrow(j)); + if constexpr(IS_SPARSE) { + if(prior_data_) { + const auto &pd(*prior_data_); + const bool single_value = pd.size() == 1; + auto lhr = row(i); + const size_t dim = lhr.size(); + auto rhr = row(j); + auto lhit = lhr.begin(), rhit = rhr.begin(); + const auto lhe = lhr.end(), rhe = rhr.end(); + const auto lhrsi = 1. / row_sums_[i]; + const auto rhrsi = 1. 
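+                    // With a sparse matrix and a prior, entries absent from both rows still carry prior mass; the merge loop below accounts for them in bulk.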
/ row_sums_[j]; + FT ret = 0.; + if(single_value) { + size_t i = 0; + const FT inc = pd[0]; + const FT lhinc = inc * lhrsi; + const FT rhinc = inc * rhrsi; + const FT rhincl = std::log(rhinc); + const FT empty_contrib = -lhinc * rhincl; + size_t nz = 0; + for(;;) { + if(lhit != lhe && rhit != rhe) { + size_t cind = std::min(lhit->index(), rhit->index()); + nz += cind - i; + i = cind + 1; + const size_t lhi = lhit->index(); + const size_t rhi = rhit->index(); + if(lhi == rhi) { + ret -= lhit->value() * std::log(rhit->value()); + ++lhit; + ++rhit; + } else if(lhi < rhi) { + ret -= lhit->value() * rhincl; + ++lhit; + } else { + ret -= lhinc * std::log(rhit->value()); + ++rhit; + } + } else if(lhit == lhe) { + if(rhit == rhe) { + nz += dim - i; + i = dim; + break; + } else { + for(;rhit != rhe;++rhit) { + nz += rhit->index() - i; + ret -= lhinc * std::log(rhit->value()); + i = rhit->index() + 1; + } + } + } else if(rhit == rhe) { + for(;lhit != lhe;++lhit) { + nz += lhit->index() - i; + ret -= lhit->value() * rhincl; + i = lhit->index() + 1; + } + } + } + ret += empty_contrib * nz; + } else { // if(single_value) / else + for(;;) { + if(lhit != lhe && rhit != rhe) { + size_t cind = std::min(lhit->index(), rhit->index()); + for(;i < cind;++i) + ret -= lhrsi * pd[i] * std::log(rhrsi * pd[i]); + const size_t lhi = lhit->index(); + const size_t rhi = rhit->index(); + if(lhi == rhi) { + ret -= lhit->value() * std::log(rhit->value()); + ++lhit; + ++rhit; + } else if(lhi < rhi) { + ret -= lhit->value() * std::log(rhrsi * pd[i]); + ++lhit; + } else { + // lh contrib is prior over row sum + ret -= pd[i] * lhrsi * std::log(rhit->value()); + ++rhit; + } + ++i; + } else if(lhit == lhe) { + if(rhit == rhe) { + for(;i < dim;++i) + ret -= lhrsi * pd[i] * std::log(rhrsi * pd[i]); + break; + } else { + for(;rhit != rhe;++rhit) { + for(;i < rhit->index(); ++i) + ret -= lhrsi * pd[i] * std::log(rhrsi * pd[i]); + ret -= lhrsi * pd[i] * std::log(rhit->value()); + ++i; + } + } + } else if(rhit == rhe) { + for(;lhit != lhe;++lhit) { + for(;i < lhit->index(); ++i) + ret -= lhrsi * pd[i] * std::log(rhrsi * pd[i]); + ret -= lhit->value() * std::log(rhrsi * pd[i]); + ++i; + } + } + } + } + return ret + get_jsdcache(i); + } + } + return FT(get_jsdcache(i) - blz::dot(row(i), logrow(j))); } template>> auto mkl(size_t i, const OT &o) const { - // Multinomial KL - if(IS_SPARSE && prior_data_) throw shared::TODOError("TODO: complete special fast version of this supporting priors at no runtime cost."); - return get_jsdcache(i) - blz::dot(row(i), blaze::neginf2zero(blz::log(o))); - } - template>, typename OT2> - auto mkl(size_t i, const OT &, const OT2 &olog) const { - if(IS_SPARSE && prior_data_) throw shared::TODOError("TODO: complete special fast version of this supporting priors at no runtime cost."); - // Multinomial KL - return blz::dot(row(i), logrow(i) - olog); - } - auto pkl(size_t i, size_t j) const { - if(IS_SPARSE && prior_data_) throw shared::TODOError("TODO: complete special fast version of this supporting priors at no runtime cost."); - // Poission KL - return get_jsdcache(i) - blz::dot(row(i), logrow(j)) + blz::sum(row(j) - row(i)); + if(IS_SPARSE && blaze::IsSparseVector_v && prior_data_) throw TODOError("TODO: complete special fast version of this supporting priors at no runtime cost."); + return get_jsdcache(i) - blaze::dot(row(i), blaze::neginf2zero(blaze::log(o))); } template>, typename OT2> - auto pkl(size_t i, const OT &o, const OT2 &olog) const { - if(IS_SPARSE && prior_data_) throw 
shared::TODOError("TODO: complete special fast version of this supporting priors at no runtime cost."); - // Poission KL - return get_jsdcache(i) - blz::dot(row(i), olog) + blz::sum(row(i) - o); + auto mkl(const OT &o, size_t i, const OT2 &olog) const { + if(IS_SPARSE && blaze::IsSparseVector_v && prior_data_) throw TODOError("TODO: complete special fast version of this supporting priors at no runtime cost."); + return blaze::dot(o, olog - logrow(i)); } template>> - auto pkl(size_t i, const OT &o) const { - if(IS_SPARSE && prior_data_) throw shared::TODOError("TODO: complete special fast version of this supporting priors at no runtime cost."); - return pkl(i, o, neginf2zero(blz::log(o))); - } - auto psd(size_t i, size_t j) const { - if(IS_SPARSE && prior_data_) throw shared::TODOError("TODO: complete special fast version of this supporting priors at no runtime cost."); - // Poission JSD - auto mnlog = evaluate(log(.5 * (row(i) + row(j)))); - return (blz::dot(row(i), logrow(i) - mnlog) + blz::dot(row(j), logrow(j) - mnlog)); + auto mkl(const OT &o, size_t i) const { + if(IS_SPARSE && prior_data_) throw TODOError("TODO: complete special fast version of this supporting priors at no runtime cost."); + return blaze::dot(o, blaze::neginf2zero(blaze::log(o)) - logrow(i)); } template>, typename OT2> - auto psd(size_t i, const OT &o, const OT2 &olog) const { - if(IS_SPARSE && prior_data_) throw shared::TODOError("TODO: complete special fast version of this supporting priors at no runtime cost."); - // Poission JSD - auto mnlog = evaluate(log(.5 * (row(i) + o))); - return (blz::dot(row(i), logrow(i) - mnlog) + blz::dot(o, olog - mnlog)); - } - template>> - auto psd(size_t i, const OT &o) const { - if(IS_SPARSE && prior_data_) throw shared::TODOError("TODO: complete special fast version of this supporting priors at no runtime cost."); - return psd(i, o, neginf2zero(blz::log(o))); + auto mkl(size_t i, const OT &, const OT2 &olog) const { + if(IS_SPARSE && prior_data_) throw TODOError("TODO: complete special fast version of this supporting priors at no runtime cost."); + return blaze::dot(row(i), logrow(i) - olog); } + template + auto pkl(Args &&...args) const { return mkl(std::forward(args)...);} + template + auto psd(Args &&...args) const { return jsd(std::forward(args)...);} + template + auto psm(Args &&...args) const { return jsm(std::forward(args)...);} auto bhattacharyya_sim(size_t i, size_t j) const { - if(IS_SPARSE && prior_data_) throw shared::TODOError("TODO: complete special fast version of this supporting priors at no runtime cost."); - return sqrdata_ ? blz::dot(sqrtrow(i), sqrtrow(j)) - : blz::sum(blz::sqrt(row(i) * row(j))); + if(IS_SPARSE && prior_data_) throw TODOError("TODO: complete special fast version of this supporting priors at no runtime cost."); + return sqrdata_ ? blaze::dot(sqrtrow(i), sqrtrow(j)) + : blaze::sum(blaze::sqrt(row(i) * row(j))); } template>, typename OT2> auto bhattacharyya_sim(size_t i, const OT &o, const OT2 &osqrt) const { if(IS_SPARSE && prior_data_) throw std::runtime_error("Failed to calculate. TODO: complete special fast version of this supporting priors at no runtime cost."); - return sqrdata_ ? blz::dot(sqrtrow(i), osqrt) - : blz::sum(blz::sqrt(row(i) * o)); + return sqrdata_ ? blaze::dot(sqrtrow(i), osqrt) + : blaze::sum(blaze::sqrt(row(i) * o)); } template>> auto bhattacharyya_sim(size_t i, const OT &o) const { if(IS_SPARSE && prior_data_) throw std::runtime_error("Failed to calculate. 
TODO: complete special fast version of this supporting priors at no runtime cost."); - return bhattacharyya_sim(i, o, blz::sqrt(o)); + return bhattacharyya_sim(i, o, blaze::sqrt(o)); } template auto bhattacharyya_distance(Args &&...args) const { @@ -438,13 +890,11 @@ class ProbDivApplicator { } template auto bhattacharyya_metric(Args &&...args) const { - throw std::runtime_error("Failed to calculate. TODO: complete special fast version of this supporting priors at no runtime cost."); + if(IS_SPARSE && prior_data_) throw std::runtime_error("Failed to calculate. TODO: complete special fast version of this supporting priors at no runtime cost."); return std::sqrt(1 - bhattacharyya_sim(std::forward(args)...)); } - template - auto psm(Args &&...args) const {return std::sqrt(std::forward(args)...);} auto llr(size_t i, size_t j) const { - if(IS_SPARSE && prior_data_) throw shared::TODOError("TODO: complete special fast version of this supporting priors at no runtime cost."); + if(IS_SPARSE && prior_data_) throw TODOError("TODO: complete special fast version of this supporting priors at no runtime cost."); //blaze::dot(row(i), logrow(i)) * row_sums_[i] //+ //blaze::dot(row(j), logrow(j)) * row_sums_[j] @@ -455,39 +905,51 @@ class ProbDivApplicator { const auto lambda = lhn / (lhn + rhn), m1l = 1. - lambda; auto ret = lhn * get_jsdcache(i) + rhn * get_jsdcache(j) - - blz::dot(weighted_row(i) + weighted_row(j), - neginf2zero(blz::log(lambda * row(i) + m1l * row(j))) + blaze::dot(weighted_row(i) + weighted_row(j), + neginf2zero(blaze::log(lambda * row(i) + m1l * row(j))) ); assert(ret >= -1e-2 * (row_sums_[i] + row_sums_[j]) || !std::fprintf(stderr, "ret: %g\n", ret)); return std::max(ret, 0.); } auto ollr(size_t i, size_t j) const { - if(IS_SPARSE && prior_data_) throw shared::TODOError("TODO: complete special fast version of this supporting priors at no runtime cost."); + if(IS_SPARSE && prior_data_) throw TODOError("TODO: complete special fast version of this supporting priors at no runtime cost."); auto ret = get_jsdcache(i) * row_sums_[i] + get_jsdcache(j) * row_sums_[j] - - blz::dot(weighted_row(i) + weighted_row(j), neginf2zero(blz::log((row(i) + row(j)) * .5))); + - blaze::dot(weighted_row(i) + weighted_row(j), neginf2zero(blaze::log((row(i) + row(j)) * .5))); return std::max(ret, 0.); } auto uwllr(size_t i, size_t j) const { - if(IS_SPARSE && prior_data_) throw shared::TODOError("TODO: complete special fast version of this supporting priors at no runtime cost."); - const auto lhn = row_sums_[i], rhn = row_sums_[j]; - const auto lambda = lhn / (lhn + rhn), m1l = 1. - lambda; - return - std::max( - lambda * get_jsdcache(i) + - m1l * get_jsdcache(j) - - blz::dot(lambda * row(i) + m1l * row(j), - neginf2zero(blz::log( - lambda * row(i) + m1l * row(j)))), - 0.); + if(IS_SPARSE && prior_data_) throw TODOError("TODO: complete special fast version of this supporting priors at no runtime cost."); + else { + const auto lhn = row_sums_[i], rhn = row_sums_[j]; + const auto lambda = lhn / (lhn + rhn), m1l = 1. 
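+            // lambda weights the two rows by their relative total counts; UWLLR drops LLR's overall count scaling.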
- lambda; + return + std::max( + lambda * get_jsdcache(i) + + m1l * get_jsdcache(j) - + blaze::dot(lambda * row(i) + m1l * row(j), + neginf2zero(blaze::log( + lambda * row(i) + m1l * row(j)))), + 0.); + } } template>> auto llr(size_t, const OT &) const { - throw shared::TODOError("llr is not implemented for this."); + throw TODOError("llr is not implemented for this."); return 0.; } template>, typename OT2> auto llr(size_t, const OT &, const OT2 &) const { - throw shared::TODOError("llr is not implemented for this."); + throw TODOError("llr is not implemented for this."); + return 0.; + } + template>> + auto uwllr(size_t, const OT &) const { + throw TODOError("llr is not implemented for this."); + return 0.; + } + template>, typename OT2> + auto uwllr(size_t, const OT &, const OT2 &) const { + throw TODOError("llr is not implemented for this."); return 0.; } template @@ -499,9 +961,11 @@ class ProbDivApplicator { throw std::invalid_argument(std::string("Param for lambda ") + std::to_string(param) + " is out of range."); lambda_ = param; } + auto get_measure() const {return measure_;} private: template> void prep(Prior prior, const Container *c=nullptr) { + std::fprintf(stderr, "beginning prep.\n"); switch(prior) { case NONE: break; @@ -509,15 +973,13 @@ class ProbDivApplicator { if constexpr(!IsSparseMatrix_v) { data_ += static_cast(1); } else { - prior_data_.reset(new VecT(data_.columns())); - (*prior_data_)[0] = static_cast(1); + prior_data_.reset(new VecT({FT(1)})); } break; case GAMMA_BETA: if(c == nullptr) throw std::invalid_argument("Can't do gamma_beta with null pointer"); if constexpr(IsSparseMatrix_v) { - prior_data_.reset(new VecT(data_.columns())); - (*prior_data_)[0] = (*c)[0]; + prior_data_.reset(new VecT({(*c)[0]})); } else if constexpr(IsDenseMatrix_v) { data_ += (*c)[0]; } @@ -525,7 +987,7 @@ class ProbDivApplicator { case FEATURE_SPECIFIC_PRIOR: if(c == nullptr) throw std::invalid_argument("Can't do feature-specific with null pointer"); if constexpr(IsDenseMatrix_v) { - data_ += blz::expand(*c, data_.rows()); + data_ += blaze::expand(*c, data_.rows()); } else if constexpr(IsSparseMatrix_v) { assert(c->size() == data_.columns()); prior_data_.reset(new VecT(data_.columns())); @@ -535,46 +997,96 @@ class ProbDivApplicator { } row_sums_.resize(data_.rows()); { - auto rowsumit = row_sums_.data(); - for(auto r: blz::rowiterator(data_)) { - if constexpr(blz::IsDenseMatrix_v) { + for(size_t i = 0; i < data_.rows(); ++i) { + auto r(row(i)); + FT countsum = blaze::sum(r); + if constexpr(blaze::IsDenseMatrix_v) { if(prior == NONE) { r += 1e-50; - assert(blz::min(r) > 0.); +#ifndef NDEBUG + if(dist::detail::expects_nonnegative(measure_) && blaze::min(r) < 0.) + throw std::invalid_argument(std::string("Measure ") + dist::detail::prob2str(measure_) + " expects nonnegative data"); +#endif + } + } else if constexpr(blaze::IsSparseMatrix_v) { + if(prior_data_) { + bool single_value = prior_data_->size() == 1; + if(prior == DIRICHLET) { + countsum += r.size(); + } else { + MINOCORE_VALIDATE(prior_data_ != nullptr); + countsum += single_value ? r.size() * *prior_data_->begin() + : blaze::sum(*prior_data_); + } + for(auto &item: r) + item.value() += + (*prior_data_)[single_value ? 
size_t(0): item.index()]; } } - const auto countsum = blz::sum(r); r /= countsum; - *rowsumit++ = countsum; + row_sums_[i] = countsum; } } - if(blz::detail::needs_logs(measure_)) { + if(dist::detail::needs_logs(measure_)) { logdata_.reset(new MatrixType(neginf2zero(log(data_)))); } - if(blz::detail::needs_sqrt(measure_)) { - sqrdata_.reset(new MatrixType(blz::sqrt(data_))); + if(dist::detail::needs_sqrt(measure_)) { + sqrdata_.reset(new MatrixType(blaze::sqrt(data_))); } - if(blz::detail::needs_l2_cache(measure_)) { + if(dist::detail::needs_l2_cache(measure_)) { l2norm_cache_.reset(new VecT(data_.rows())); OMP_PFOR for(size_t i = 0; i < data_.rows(); ++i) { - l2norm_cache_->operator[](i) = 1. / blz::l2Norm(weighted_row(i)); + l2norm_cache_->operator[](i) = 1. / blaze::l2Norm(weighted_row(i)); } } - if(blz::detail::needs_probability_l2_cache(measure_)) { + if(dist::detail::needs_probability_l2_cache(measure_)) { pl2norm_cache_.reset(new VecT(data_.rows())); OMP_PFOR for(size_t i = 0; i < data_.rows(); ++i) { - pl2norm_cache_->operator[](i) = 1. / blz::l2Norm(row(i)); + pl2norm_cache_->operator[](i) = 1. / blaze::l2Norm(row(i)); } } if(logdata_) { jsd_cache_.reset(new VecT(data_.rows())); auto &jc = *jsd_cache_; - for(size_t i = 0; i < jc.size(); ++i) { - jc[i] = dot(row(i), logrow(i)); + if constexpr(IS_SPARSE) { + if(prior_data_) { + // Handle sparse priors + MINOCORE_VALIDATE(prior_data_->size() == 1 || prior_data_->size() == data_.columns()); + auto &pd = *prior_data_; + const bool single_value = pd.size() == 1; + for(size_t i = 0; i < data_.rows(); ++i) { + const auto rs = row_sums_[i]; + auto r = row(i); + double contrib = 0.; + auto upcontrib = [&](auto x) {contrib += x * std::log(x);}; + if(single_value) { + FT invp = pd[0] / rs; + size_t number_zero = r.size() - nonZeros(r); + contrib += number_zero * (invp * std::log(invp)); // Empty + for(auto &pair: r) upcontrib(pair.value()); // Non-empty + } else { + size_t i = 0; + auto it = r.begin(); + auto contribute_range = [&](size_t end) { + while(i < end) upcontrib(pd[i++] / rs); + }; + while(it != r.end() && i < r.size()) { + contribute_range(it->index()); + upcontrib(it->value()); + if(++it == r.end()) + contribute_range(r.size()); + } + } + jc[i] = contrib; + } + } } + if(!(IS_SPARSE && prior_data_)) + for(size_t i = 0; i < jc.size(); ++i) + jc[i] = dot(row(i), logrow(i)); } } FT get_jsdcache(size_t index) const { @@ -586,13 +1098,20 @@ class ProbDivApplicator { return get_jsdcache(index) * row_sums_->operator[](index); return (*jsd_cache_)[index] * row_sums_->operator[](index); } -}; // ProbDivApplicator +}; // DissimilarityApplicator + +template +struct is_dissimilarity_applicator: std::false_type {}; +template +struct is_dissimilarity_applicator>: std::true_type {}; +template +static constexpr bool is_dissimilarity_applicator_v = is_dissimilarity_applicator::value; template -struct PairProbDivApplicator { - ProbDivApplicator &pda_; - ProbDivApplicator &pdb_; - PairProbDivApplicator(ProbDivApplicator &lhs, ProbDivApplicator &rhs): pda_(lhs), pdb_(rhs) { +struct PairDissimilarityApplicator { + DissimilarityApplicator &pda_; + DissimilarityApplicator &pdb_; + PairDissimilarityApplicator(DissimilarityApplicator &lhs, DissimilarityApplicator &rhs): pda_(lhs), pdb_(rhs) { if(lhs.measure_ != rhs.measure_) throw std::runtime_error("measures must be the same (for preprocessing reasons)."); } decltype(auto) operator()(size_t i, size_t j) const { @@ -601,22 +1120,22 @@ struct PairProbDivApplicator { }; template -class 
MultinomialJSDApplicator: public ProbDivApplicator { - using super = ProbDivApplicator; +class MultinomialJSDApplicator: public DissimilarityApplicator { + using super = DissimilarityApplicator; template> MultinomialJSDApplicator(MatrixType &ref, Prior prior=NONE, const PriorContainer *c=nullptr): - ProbDivApplicator(ref, JSD, prior, c) {} + DissimilarityApplicator(ref, JSD, prior, c) {} }; template -class MultinomialLLRApplicator: public ProbDivApplicator { - using super = ProbDivApplicator; +class MultinomialLLRApplicator: public DissimilarityApplicator { + using super = DissimilarityApplicator; template> MultinomialLLRApplicator(MatrixType &ref, Prior prior=NONE, const PriorContainer *c=nullptr): - ProbDivApplicator(ref, LLR, prior, c) {} + DissimilarityApplicator(ref, LLR, prior, c) {} }; template @@ -625,13 +1144,13 @@ struct BaseOperand { }; template> -auto make_probdiv_applicator(MatrixType &data, ProbDivType type=JSM, Prior prior=NONE, const PriorContainer *pc=nullptr) { +auto make_probdiv_applicator(MatrixType &data, DissimilarityMeasure type=JSM, Prior prior=NONE, const PriorContainer *pc=nullptr) { #if VERBOSE_AF std::fprintf(stderr, "[%s:%s:%d] Making probdiv applicator with %d/%s as measure, %d/%s as prior, and %s for prior container.\n", - __PRETTY_FUNCTION__, __FILE__, __LINE__, int(type), blz::detail::prob2str(type), int(prior), prior == NONE ? "No prior": prior == DIRICHLET ? "Dirichlet" : prior == GAMMA_BETA ? "Gamma/Beta": "Feature-specific prior", + __PRETTY_FUNCTION__, __FILE__, __LINE__, int(type), dist::detail::prob2str(type), int(prior), prior == NONE ? "No prior": prior == DIRICHLET ? "Dirichlet" : prior == GAMMA_BETA ? "Gamma/Beta": "Feature-specific prior", pc == nullptr ? "No prior container": (std::string("Container of size ") + std::to_string(pc->size())).data()); #endif - return ProbDivApplicator(data, type, prior, pc); + return DissimilarityApplicator(data, type, prior, pc); } template> auto make_jsm_applicator(MatrixType &data, Prior prior=NONE, const PriorContainer *pc=nullptr) { @@ -640,19 +1159,19 @@ auto make_jsm_applicator(MatrixType &data, Prior prior=NONE, const PriorContaine template -auto make_kmc2(const ProbDivApplicator &app, unsigned k, size_t m=2000, uint64_t seed=13) { +auto make_kmc2(const DissimilarityApplicator &app, unsigned k, size_t m=2000, uint64_t seed=13) { wy::WyRand gen(seed); return coresets::kmc2(app, gen, app.size(), k, m); } -template -auto make_kmeanspp(const ProbDivApplicator &app, unsigned k, uint64_t seed=13) { +template> +auto make_kmeanspp(const DissimilarityApplicator &app, unsigned k, uint64_t seed=13, const WFT *weights=nullptr) { wy::WyRand gen(seed); - return coresets::kmeanspp(app, gen, app.size(), k); + return coresets::kmeanspp(app, gen, app.size(), k, weights); } template -auto make_d2_coreset_sampler(const ProbDivApplicator &app, unsigned k, uint64_t seed=13, const WFT *weights=nullptr, coresets::SensitivityMethod sens=cs::LBK) { +auto make_d2_coreset_sampler(const DissimilarityApplicator &app, unsigned k, uint64_t seed=13, const WFT *weights=nullptr, coresets::SensitivityMethod sens=cs::LBK) { auto [centers, asn, costs] = make_kmeanspp(app, k, seed); coresets::CoresetSampler cs; cs.make_sampler(app.size(), centers.size(), costs.data(), asn.data(), weights, @@ -661,7 +1180,7 @@ auto make_d2_coreset_sampler(const ProbDivApplicator &app, unsigned } } // jsd -using jsd::ProbDivApplicator; +using jsd::DissimilarityApplicator; using jsd::make_d2_coreset_sampler; using jsd::make_kmc2; using jsd::make_kmeanspp; diff 
--git a/include/minocore/dist/distance.h b/include/minocore/dist/distance.h index f70ba0d4..99e93a84 100644 --- a/include/minocore/dist/distance.h +++ b/include/minocore/dist/distance.h @@ -11,14 +11,16 @@ #define BOOST_NO_AUTO_PTR 1 #endif -#include "network_simplex/network_simplex_simple.h" #include "boost/iterator/transform_iterator.hpp" namespace blz { inline namespace distance { -enum ProbDivType { + + + +enum DissimilarityMeasure { L1, L2, SQRL2, @@ -49,6 +51,8 @@ enum ProbDivType { PROBABILITY_DOT_PRODUCT_SIMILARITY, EMD, WEMD, // Weighted Earth-mover's distance + ORACLE_METRIC, + ORACLE_PSEUDOMETRIC, WLLR = LLR, // Weighted Log-likelihood Ratio, now equivalent to the LLR TVD = TOTAL_VARIATION_DISTANCE, WASSERSTEIN=EMD, @@ -90,7 +94,8 @@ namespace detail { * For all other distance measures, Jain-Vazirani and/or local search should be run. * */ -static constexpr INLINE bool is_bregman(ProbDivType d) { + +static constexpr INLINE bool is_bregman(DissimilarityMeasure d) { switch(d) { case JSD: case MKL: case POISSON: case ITAKURA_SAITO: case REVERSE_MKL: case REVERSE_POISSON: case REVERSE_ITAKURA_SAITO: return true; @@ -98,10 +103,10 @@ static constexpr INLINE bool is_bregman(ProbDivType d) { } return false; } -static constexpr INLINE bool satisfies_d2(ProbDivType d) { +static constexpr INLINE bool satisfies_d2(DissimilarityMeasure d) { return d == LLR || is_bregman(d) || d == SQRL2; } -static constexpr INLINE bool satisfies_metric(ProbDivType d) { +static constexpr INLINE bool satisfies_metric(DissimilarityMeasure d) { switch(d) { case L1: case L2: @@ -109,16 +114,18 @@ static constexpr INLINE bool satisfies_metric(ProbDivType d) { case BHATTACHARYYA_METRIC: case TOTAL_VARIATION_DISTANCE: case HELLINGER: + case ORACLE_METRIC: return true; default: ; } return false; } -static constexpr INLINE bool satisfies_rho_metric(ProbDivType d) { +static constexpr INLINE bool satisfies_rho_metric(DissimilarityMeasure d) { if(satisfies_metric(d)) return true; switch(d) { case SQRL2: // rho = 2 // These three don't, technically, but using a prior can force it to follow it on real data + case ORACLE_PSEUDOMETRIC: case LLR: case UWLLR: case OLLR: return true; default:; @@ -126,7 +133,7 @@ static constexpr INLINE bool satisfies_rho_metric(ProbDivType d) { return false; } -static constexpr INLINE bool needs_logs(ProbDivType d) { +static constexpr INLINE bool needs_logs(DissimilarityMeasure d) { switch(d) { case JSM: case JSD: case MKL: case POISSON: case LLR: case OLLR: case ITAKURA_SAITO: case REVERSE_MKL: case REVERSE_POISSON: case UWLLR: case REVERSE_ITAKURA_SAITO: return true; @@ -135,19 +142,60 @@ static constexpr INLINE bool needs_logs(ProbDivType d) { return false; } -static constexpr INLINE bool needs_l2_cache(ProbDivType d) { +static constexpr INLINE bool is_probability(DissimilarityMeasure d) { + switch(d) { + case TOTAL_VARIATION_DISTANCE: case BHATTACHARYYA_METRIC: case BHATTACHARYYA_DISTANCE: + case MKL: case POISSON: case REVERSE_MKL: case REVERSE_POISSON: + case PROBABILITY_COSINE_DISTANCE: case PROBABILITY_DOT_PRODUCT_SIMILARITY: + case ITAKURA_SAITO: case REVERSE_ITAKURA_SAITO: + return true; + default: break; + } + return false; +} + +static constexpr INLINE bool needs_l2_cache(DissimilarityMeasure d) { return d == COSINE_DISTANCE; } -static constexpr INLINE bool needs_probability_l2_cache(ProbDivType d) { +static constexpr bool expects_nonnegative(DissimilarityMeasure measure) { + switch(measure) { + case L1: case L2: case SQRL2: + case COSINE_DISTANCE: case COSINE_SIMILARITY: + case 
PROBABILITY_COSINE_DISTANCE: case PROBABILITY_COSINE_SIMILARITY:
+        case DOT_PRODUCT_SIMILARITY:
+        case WEMD: case EMD: case ORACLE_METRIC: case ORACLE_PSEUDOMETRIC: return false;
+
+        default: // Unexpected, but will assume it's required.
+        case JSM: case JSD: case MKL: case POISSON: case HELLINGER: case BHATTACHARYYA_METRIC:
+        case BHATTACHARYYA_DISTANCE: case TOTAL_VARIATION_DISTANCE: case LLR:
+        case REVERSE_MKL: case REVERSE_POISSON: case ITAKURA_SAITO: case REVERSE_ITAKURA_SAITO:
+        case PROBABILITY_DOT_PRODUCT_SIMILARITY:
+            return true;
+
+    }
+}
+
+static constexpr INLINE bool is_dissimilarity(DissimilarityMeasure d) {
+    switch(d) {
+        case DOT_PRODUCT_SIMILARITY: case PROBABILITY_DOT_PRODUCT_SIMILARITY:
+        case COSINE_SIMILARITY: case PROBABILITY_COSINE_SIMILARITY:
+            return false;
+        default: ;
+    }
+    return true;
+}
+
+
+static constexpr INLINE bool needs_probability_l2_cache(DissimilarityMeasure d) {
    return d == PROBABILITY_COSINE_DISTANCE; }
-static constexpr INLINE bool needs_sqrt(ProbDivType d) {
+static constexpr INLINE bool needs_sqrt(DissimilarityMeasure d) {
    return d == HELLINGER || d == BHATTACHARYYA_METRIC || d == BHATTACHARYYA_DISTANCE; }
-static constexpr INLINE bool is_symmetric(ProbDivType d) {
+static constexpr INLINE bool is_symmetric(DissimilarityMeasure d) {
    switch(d) { case L1: case L2: case EMD: case HELLINGER: case BHATTACHARYYA_DISTANCE: case BHATTACHARYYA_METRIC: case JSD: case JSM: case LLR: case UWLLR: case SQRL2: case TOTAL_VARIATION_DISTANCE: case OLLR:
@@ -159,7 +207,22 @@ static constexpr INLINE bool is_symmetric(ProbDivType d) {
    return false; }
-static constexpr INLINE const char *prob2str(ProbDivType d) {
+template
+void set_cache(const blz::Vector &src, blz::Vector &dest, DissimilarityMeasure d) {
+    if(needs_logs(d)) {
+        if(is_probability(d))
+            ~dest = neginf2zero(log(~src));
+        else
+            ~dest = neginf2zero(log(~src / blaze::sum(~src)));
+        return;
+    }
+    if(needs_sqrt(d)) {
+        ~dest = sqrt(~src);
+        return;
+    }
+}
+
+static constexpr INLINE const char *prob2str(DissimilarityMeasure d) {
    switch(d) { case BHATTACHARYYA_DISTANCE: return "BHATTACHARYYA_DISTANCE"; case BHATTACHARYYA_METRIC: return "BHATTACHARYYA_METRIC";
@@ -184,10 +247,12 @@
        case PROBABILITY_COSINE_DISTANCE: return "PROBABILITY_COSINE_DISTANCE"; case COSINE_SIMILARITY: return "COSINE_SIMILARITY"; case PROBABILITY_COSINE_SIMILARITY: return "PROBABILITY_COSINE_SIMILARITY";
+        case ORACLE_METRIC: return "ORACLE_METRIC";
+        case ORACLE_PSEUDOMETRIC: return "ORACLE_PSEUDOMETRIC";
        default: return "INVALID TYPE"; } }
-static constexpr INLINE const char *prob2desc(ProbDivType d) {
+static constexpr INLINE const char *prob2desc(DissimilarityMeasure d) {
    switch(d) { case BHATTACHARYYA_DISTANCE: return "Bhattacharyya distance: -log(dot(sqrt(x) * sqrt(y)))"; case BHATTACHARYYA_METRIC: return "Bhattacharyya metric: sqrt(1 - BhattacharyyaSimilarity(x, y))";
@@ -212,11 +277,13 @@
        case PROBABILITY_COSINE_DISTANCE: return "Cosine distance of the probability vectors: arccos(\\frac{A \\cdot B}{|A|_2 |B|_2}) / pi"; case COSINE_SIMILARITY: return "Cosine similarity: \\frac{A \\cdot B}{|A|_2 |B|_2}"; case PROBABILITY_COSINE_SIMILARITY: return "Cosine similarity of the probability vectors: \\frac{A \\cdot B}{|A|_2 |B|_2}";
+        case ORACLE_METRIC: return "Placeholder for oracle metrics, allowing us to use DissimilarityMeasure in other situations";
+        case ORACLE_PSEUDOMETRIC: return "Placeholder for oracle
pseudometrics"; default: return "INVALID TYPE"; } } static void print_measures() { - std::set measures { + std::set measures { L1, L2, SQRL2, @@ -230,7 +297,7 @@ static void print_measures() { TOTAL_VARIATION_DISTANCE, LLR, OLLR, - EMD, + //EMD, REVERSE_MKL, REVERSE_POISSON, UWLLR, @@ -249,6 +316,20 @@ static void print_measures() { std::fprintf(stderr, "Code: %d. Description: '%s'. Short name: '%s'\n", measure, prob2desc(measure), prob2str(measure)); } } +static constexpr bool is_valid_measure(DissimilarityMeasure measure) { + switch(measure) { + case L1: case L2: case SQRL2: case JSM: case JSD: case MKL: + case POISSON: case HELLINGER: case BHATTACHARYYA_METRIC: + case BHATTACHARYYA_DISTANCE: case TOTAL_VARIATION_DISTANCE: + case LLR: case REVERSE_MKL: case REVERSE_POISSON: case REVERSE_ITAKURA_SAITO: + case ITAKURA_SAITO: case COSINE_DISTANCE: case PROBABILITY_COSINE_DISTANCE: + case DOT_PRODUCT_SIMILARITY: case PROBABILITY_DOT_PRODUCT_SIMILARITY: + case EMD: case WEMD: case ORACLE_METRIC: case ORACLE_PSEUDOMETRIC: + return true; + default: ; + } + return false; +} } // detail @@ -480,117 +561,6 @@ inline auto s2jsd(const blz::Vector &lhs, const blaze::Vector & } -template -CommonType_t, ElementType_t> -network_p_wasserstein(const blz::Vector &x, const blz::Vector &y, double p=1.) -{ - std::fprintf(stderr, "Warning: network_p_wasserstein seems to have a bug. Do not use.\n"); - auto &xref = ~x; - auto &yref = ~y; - const size_t sz = xref.size(); - size_t nl = nonZeros(xref), nr = nonZeros(~y); - using FT = CommonType_t, ElementType_t>; - - using namespace lemon; - using Digraph = lemon::FullBipartiteDigraph; - Digraph di(nl, nr); - NetworkSimplexSimple net(di, true, nl + nr, nl * nr); - DV weights(nl + nr); - DV indices(nl + nr); - size_t i = 0; - for(size_t ii = 0; ii < sz; ++ii) { - if(xref[ii] > 0) - weights[i] = xref[ii], indices[i] = xref[ii], ++i; - } - for(size_t ii = 0; ii < sz; ++ii) { - if(yref[ii] > 0) - weights[i] = -yref[ii], indices[i] = yref[ii], ++i; - } - auto func = [p](auto x, auto y) { - auto ret = x - y; - if(p == 1) ret = std::abs(ret); - else if(p == 2.) ret = ret * ret; - else ret = std::pow(ret, p); - return ret; - }; - net.supplyMap(weights.data(), nl, weights.data() + nl, nr); - { - const auto jptr = &weights[nl]; - for(unsigned i = 0; i < nl; ++i) { - auto arcid = i * nl; - for(unsigned j = 0; j < nl; ++j) { - net.setCost(di.arcFromId(arcid++), func(weights[i], jptr[j])); - } - } - } - int rc = net.run(); - if(rc != (int)net.OPTIMAL) { - std::fprintf(stderr, "[%s:%s:%d] Warning: something went wrong in network simplex. Error code: [%s]\n", __PRETTY_FUNCTION__, __FILE__, __LINE__, - rc == (int)net.INFEASIBLE ? "infeasible" : (int)net.UNBOUNDED ? 
"unbounded" : "unknown"); - } - - FT ret(0); - //OMP_PRAGMA("omp parallel for reduction(+:ret)") - for(size_t i = 0; i < nl; ++i) { - for(size_t j = 0; j < nr; ++j) - ret += net.flow(i * nr + j) * func(weights[i], weights[sz + j]); - } - return ret; -} - -#if 0 -template -CommonType_t, ElementType_t> -network_p_wasserstein(const blz::SparseVector &x, const blz::SparseVector &y, double p=1., size_t maxiter=100) -{ - auto &xref = ~x; - const size_t sz = xref.size(); - size_t nl = nonZeros(xref), nr = nonZeros(~y); - using FT = CommonType_t, ElementType_t>; - - using namespace lemon; - typedef lemon::FullBipartiteDigraph Digraph; - Digraph di(nl, nr); - NetworkSimplexSimple net(di, true, nl + nr, nl * nr, maxiter); - DV weights(nl + nr); - DV indices(nl + nr); - size_t i = 0; - for(const auto &pair: xref) - weights[i] = pair.value(), indices[i] = pair.index(), ++i; - for(const auto &pair: ~y) - weights[i] = -pair.value(), indices[i] = pair.index(), ++i; // negative weight - auto func = [p](auto x, auto y) { - auto ret = x - y; - if(p == 1) ret = std::abs(ret); - else if(p == 2.) ret = ret * ret; - else ret = std::pow(ret, p); - return ret; - }; - net.supplyMap(weights.data(), nl, weights.data() + nl, nr); - { - const auto jptr = &weights[nl]; - for(unsigned i = 0; i < nl; ++i) { - auto arcid = i * nl; - for(unsigned j = 0; j < nl; ++j) { - net.setCost(di.arcFromId(arcid++), func(weights[i], jptr[j])); - } - } - } - int rc = net.run(); - if(rc != (int)net.OPTIMAL) { - std::fprintf(stderr, "[%s:%s:%d] Warning: something went wrong in network simplex. Error code: [%s]\n", __PRETTY_FUNCTION__, __FILE__, __LINE__, - rc == (int)net.INFEASIBLE ? "infeasible" : (int)net.UNBOUNDED ? "unbounded" : "unknown"); - } - FT ret(0); - //OMP_PRAGMA("omp parallel for reduction(+:ret)") - for(size_t i = 0; i < nl; ++i) { - for(size_t j = 0; j < nr; ++j) - ret += net.flow(i * nr + j) * func(weights[i], weights[sz + j]); - } - return ret; -} -#endif - template, ElementType_t>> CT scipy_p_wasserstein(const blz::SparseVector &x, const blz::SparseVector &y, double p=1.) 
{ auto &xr = ~x; @@ -723,4 +693,6 @@ auto witten_poisson_dissimilarity(const blz::Vector &lhs, const blz::Vec } // namespace blz
+namespace dist = blz::distance;
+
 #endif // FGC_DISTANCE_AND_MEANING_H__
diff --git a/include/minocore/dist/knngraph.h b/include/minocore/dist/knngraph.h
new file mode 100644
index 00000000..8d912a56
--- /dev/null
+++ b/include/minocore/dist/knngraph.h
@@ -0,0 +1,286 @@
+#include "minocore/graph.h"
+#include "minocore/util/packed.h"
+#include "minocore/dist/applicator.h"
+#include "minocore/hash/hash.h"
+#include <mutex>
+
+namespace minocore {
+
+template<typename MatrixType, typename IT=uint32_t>
+std::vector<packed::pair<blaze::ElementType_t<MatrixType>, IT>> make_knns(const jsd::DissimilarityApplicator<MatrixType> &app, unsigned k) {
+    using FT = blaze::ElementType_t<MatrixType>;
+    static_assert(std::is_integral_v<IT>, "Sanity");
+    static_assert(std::is_floating_point_v<FT>, "Sanity");
+
+    MINOCORE_REQUIRE(std::numeric_limits<IT>::max() > app.size(), "sanity check");
+    if(k > app.size()) {
+        std::fprintf(stderr, "Note: make_knn_graph was provided k (%u) > # points (%zu).\n", k, app.size());
+        k = app.size();
+    }
+    const size_t np = app.size();
+    const jsd::DissimilarityMeasure measure = app.get_measure();
+    std::vector<packed::pair<FT, IT>> ret(k * np);
+    std::vector<unsigned> in_set(np);
+    const bool measure_is_sym = blz::detail::is_symmetric(measure);
+    const bool measure_is_dist = blz::detail::is_dissimilarity(measure);
+    std::unique_ptr<std::mutex[]> locks;
+    OMP_ONLY(locks.reset(new std::mutex[np]);)
+
+    // Helper functions
+    // Update
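+    // Neighbor lists are stored flat: row i owns ret[i * k, (i + 1) * k).
+    // A row fills linearly; once it holds k candidates it becomes a heap keyed
+    // on distance (max-heap for distance-like measures, min-heap for
+    // similarities), so the current worst neighbor sits at the front and can
+    // be evicted in O(log k).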
+    auto update_fwd = [&](FT d, size_t i, size_t j) {
+        if(in_set[i] < k) {
+            OMP_ONLY(std::lock_guard<std::mutex> lock(locks[i]);)
+            ret[i * k + in_set[i]] = packed::pair<FT, IT>{d, IT(j)};
+            if(++in_set[i] == k) {
+                if(measure_is_dist)
+                    std::make_heap(ret.data() + i * k, ret.data() + (i + 1) * k, std::less<>());
+                else
+                    std::make_heap(ret.data() + i * k, ret.data() + (i + 1) * k, std::greater<>());
+            }
+        } else {
+            auto cmp = [&](auto d) {return measure_is_dist ? (ret[i * k].first > d) : (ret[i * k].first < d);};
+            auto pushpop = [&](auto d) {
+                auto startp = &ret[i * k];
+                auto stopp = startp + k;
+                if(measure_is_dist) std::pop_heap(startp, stopp, std::less<>());
+                else                std::pop_heap(startp, stopp, std::greater<>());
+                ret[(i + 1) * k - 1] = packed::pair<FT, IT>{d, IT(j)};
+                if(measure_is_dist) std::push_heap(startp, stopp, std::less<>());
+                else                std::push_heap(startp, stopp, std::greater<>());
+            };
+            if(cmp(d)) {
+                OMP_ONLY(std::lock_guard<std::mutex> lock(locks[i]);)
+                {
+                    OMP_ONLY(if(cmp(d)))
+                        pushpop(d);
+                }
+            }
+        }
+    };
+
+    // Sort
+    auto perform_sort = [&](auto ptr) {
+        auto end = ptr + k;
+        if(measure_is_dist)
+            shared::sort(ptr, end, std::less<>());
+        else
+            shared::sort(ptr, end, std::greater<>());
+    };
+    if(measure_is_sym) {
+        OMP_PFOR
+        for(size_t i = 0; i < np; ++i) {
+            for(size_t j = i + 1; j < np; ++j) {
+                auto d = app(i, j);
+                update_fwd(d, i, j);
+                update_fwd(d, j, i);
+            }
+            std::fprintf(stderr, "[Symmetric:%s] Completed %zu/%zu\n", blz::detail::prob2str(measure), i + 1, np);
+        }
+        // Row i can still receive updates from other iterations running concurrently,
+        // so sorting happens in a second pass once all pairs have been processed.
+        OMP_PFOR
+        for(size_t i = 0; i < np; ++i)
+            perform_sort(ret.data() + i * k);
+    } else {
+        OMP_PFOR
+        for(size_t i = 0; i < np; ++i) {
+            for(size_t j = 0; j < np; ++j) {
+                update_fwd(app(i, j), i, j);
+            }
+            perform_sort(ret.data() + i * k);
+            std::fprintf(stderr, "[Asymmetric:%s] Completed %zu/%zu\n", blz::detail::prob2str(measure), i + 1, np);
+        }
+    }
+    std::fprintf(stderr, "Created knn graph for k = %u and %zu points\n", k, np);
+    return ret;
+}
+
+template<typename MatrixType, typename Hasher, typename IT=uint32_t>
+std::vector<packed::pair<blaze::ElementType_t<MatrixType>, IT>>
+make_knns_by_lsh(const jsd::DissimilarityApplicator<MatrixType> &app, hash::LSHTable<Hasher, IT> &table, unsigned k, unsigned maxlshcmp=0)
+{
+    if(!maxlshcmp) maxlshcmp = 10 * k;
+    using FT = blaze::ElementType_t<MatrixType>;
+    static_assert(std::is_integral_v<IT>, "Sanity");
+    static_assert(std::is_floating_point_v<FT>, "Sanity");
+
+    MINOCORE_REQUIRE(std::numeric_limits<IT>::max() > app.size(), "sanity check");
+    if(k > app.size()) {
+        std::fprintf(stderr, "Note: make_knn_graph was provided k (%u) > # points (%zu).\n", k, app.size());
+        k = app.size();
+    }
+    const size_t np = app.size();
+    const jsd::DissimilarityMeasure measure = app.get_measure();
+    std::vector<packed::pair<FT, IT>> ret(k * np);
+    std::vector<unsigned> in_set(np);
+    const bool measure_is_sym = blz::detail::is_symmetric(measure);
+    const bool measure_is_dist = blz::detail::is_dissimilarity(measure);
+    std::unique_ptr<std::mutex[]> locks;
+    OMP_ONLY(locks.reset(new std::mutex[np]);)
+    table.add(app.data());
+    table.sort();
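+    // Candidate generation: each point's most frequent colliders in the LSH
+    // table are scored exactly; any point left with fewer than k neighbors
+    // falls back to exhaustive comparison below.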
+
+    auto update_fwd = [&](FT d, size_t i, size_t j) {
+        if(in_set[i] < k) {
+            OMP_ONLY(std::lock_guard<std::mutex> lock(locks[i]);)
+            ret[i * k + in_set[i]] = packed::pair<FT, IT>{d, IT(j)};
+            if(++in_set[i] == k) {
+                if(measure_is_dist)
+                    std::make_heap(ret.data() + i * k, ret.data() + (i + 1) * k, std::less<>());
+                else
+                    std::make_heap(ret.data() + i * k, ret.data() + (i + 1) * k, std::greater<>());
+            }
+        } else {
+            auto cmp = [&](auto d) {return measure_is_dist ? (ret[i * k].first > d) : (ret[i * k].first < d);};
+            auto pushpop = [&](auto d) {
+                auto startp = &ret[i * k];
+                auto stopp = startp + k;
+                if(measure_is_dist) std::pop_heap(startp, stopp, std::less<>());
+                else                std::pop_heap(startp, stopp, std::greater<>());
+                ret[(i + 1) * k - 1] = packed::pair<FT, IT>{d, IT(j)};
+                if(measure_is_dist) std::push_heap(startp, stopp, std::less<>());
+                else                std::push_heap(startp, stopp, std::greater<>());
+            };
+            if(cmp(d)) {
+                OMP_ONLY(std::lock_guard<std::mutex> lock(locks[i]);)
+                {
+                    OMP_ONLY(if(cmp(d)))
+                        pushpop(d);
+                }
+            }
+        }
+    };
+
+    MINOCORE_VALIDATE(maxlshcmp >= k); // at least k candidates must be gathered per point
+    OMP_PFOR
+    for(size_t i = 0; i < np; ++i) {
+        auto tk = table.topk(row(app.data(), i, blaze::unchecked), maxlshcmp);
+        for(const auto &pair: tk) {
+            if(pair.first != i) {
+                auto d = app(i, pair.first);
+                update_fwd(d, i, pair.first);
+                update_fwd(d, pair.first, i);
+            }
+        }
+    }
+    size_t number_exhaustive = 0;
+    for(size_t i = 0; i < np; ++i) {
+        if(in_set[i] >= k) continue;
+        ++number_exhaustive;
+        std::fprintf(stderr, "Warning: LSH table returned < k (%u) neighbors (only %u compared). Performing exhaustive comparisons for item %zu\n",
+                     k, in_set[i], i);
+        OMP_PFOR
+        for(size_t j = 0; j < np; ++j) {
+            // Scan all j, not just j > i: a row j < i that is already full will never revisit the pair (i, j).
+            if(unlikely(j == i)) continue;
+            auto d = app(i, j);
+            update_fwd(d, i, j);
+            update_fwd(d, j, i);
+        }
+    }
+    // Sort
+    OMP_PFOR
+    for(size_t i = 0; i < np; ++i) {
+        auto ptr = ret.data() + i * k;
+        if(measure_is_dist) shared::sort(ptr, ptr + k, std::less<>());
+        else                shared::sort(ptr, ptr + k, std::greater<>());
+    }
+    if(number_exhaustive)
+        std::fprintf(stderr, "Performed quadratic distance comparisons with %zu/%zu items\n",
+                     number_exhaustive, np);
+    std::fprintf(stderr, "Created knn graph for k = %u and %zu points\n", k, np);
+    return ret;
+}
+
+template<typename FT, typename IT>
+auto knns2graph(const std::vector<packed::pair<FT, IT>> &knns, size_t np, bool mutual=true, bool symmetric=true) {
+    MINOCORE_REQUIRE(knns.size() % np == 0, "sanity");
+    MINOCORE_REQUIRE(knns.size(), "nonempty");
+    unsigned k = knns.size() / np;
+    graph::Graph<boost::undirectedS, FT> ret(np);
+    for(size_t i = 0; i < np; ++i) {
+        auto p = &knns[i * k];
+        SK_UNROLL_8
+        for(unsigned j = 0; j < k; ++j) {
+            if(mutual) {
+                if(symmetric) {
+                    if(p[j].first > knns[(p[j].second + 1) * k - 1].first) // j's k-th (worst) neighbor distance
+                        continue;
+                } else {
+                    // More expensive (O(k) vs O(1)), but does not require the assumption of symmetry.
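+                    // i.e., keep (i, j) only if i also appears somewhere in j's own neighbor list: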
+ auto start = knns.data() + p[j].second * k, stop = start + k; + if(std::find_if(start, stop, [i](auto x) {return x.second == i;})== stop) + continue; + } + } + boost::add_edge(i, static_cast(p[j].second), p[j].first, ret); + } + } + return ret; +} + +template +auto make_knn_graph(const jsd::DissimilarityApplicator &app, unsigned k, bool mutual=true) { + return knns2graph(make_knns(app, k), app.size(), mutual, blz::detail::is_symmetric(app.get_measure())); +} + +template +auto knng2mst(const Graph &gr) { + std::vector::edge_descriptor> ret; + ret.reserve(boost::num_vertices(gr) * 1.5); + boost::kruskal_minimum_spanning_tree(gr, std::back_inserter(ret)); + return ret; +} + +#if 0 +template +auto perform_rcc(const jsd::DissimilarityApplicator &app, unsigned k, bool mutual=true, size_t niter=100) { + using FT = blaze::ElementType_t; + auto graph = make_knn_graph(app, k); + auto mst = knng2mst(graph); + using eh_t = std::conditional_t >; + shared::flat_hash_set edges; + auto add_edge = [&](auto edge) { + if constexpr(sizeof(IT) <= 4) { + uint64_t encoded = (uint64_t(boost::source(edge, graph)) << 32) | boost::target(edge, graph); + edges.insert(encoded); + } else { + edges.insert(eh_t(boost::source(edge, graph), boost::target(edge, graph))); + } + }; + for(const auto edge: mst) { + add_edge(edge); + } + for(const auto edge: graph.edges()) { + add_edge(edge); + } + size_t nedges = edges.size(); + std::unique_ptr lhp(new IT[nedges]), rhp(new IT[nedges]); + size_t i = 0; + for(const auto &e: edges) { + lhp[i] = e.first; + rhp[i] = e.second; + ++i; + } + // Free unneeded memory + { shared::flat_hash_set tmp(std::move(edges)); } + const double xi = blaze::norm(app.data()); + blaze::DynamicMatrix U = app.data(); + blaze::DynamicVector lpq(nedges, 1.); + blaze::DynamicVector epsilons = blaze::generate(nedges, [&](auto x) { + return app(lhp[x], rhp[x]); + }); + shared::sort(epsilons.data(), epsilons.data() + nedges); + const int top_samples = std::minimum(250, int(std::ceil(nedges*0.01))); + double delta = blaze::mean(blaze::subvector(epsilons, 0, top_samples)); + double eps = blaze::mean(blaze::subvector(epsilons, 0, int(std::ceil(nedge * 0.01))); + const double mu = 3.0 * std::pow(epsilons[nedges - 1], 2.); + auto calculate_objective = [&]() { + auto dat = .5 * blaze::sum(blaze::pow(app.data() - U), 2.); + return dat; + }; + std::vector obj; + for(size_t iternum = 0; iternum < niter; ++iternum) { + OMP_PFOR + for(size_t i = 0; i < app.data().columns(); ++i) { + lpq[i] = mu / (mu + app(lhp[i], rhp[i])); + } + obj.push_back(calculate_objective()); + } +} +#endif + + +} // minocore diff --git a/include/minocore/graph/graphdist.h b/include/minocore/graph/graphdist.h index 539e5860..0fd95067 100644 --- a/include/minocore/graph/graphdist.h +++ b/include/minocore/graph/graphdist.h @@ -2,10 +2,11 @@ #ifndef FGC_GRAPH_DIST_H__ #define FGC_GRAPH_DIST_H__ #include "minocore/graph/graph.h" -#include "minocore/util/diskmat.h" +#include "diskmat/diskmat.h" #include namespace minocore { +using diskmat::DiskMat; namespace graph { template::vertex_descriptor>> @@ -36,14 +37,10 @@ void fill_graph_distmat(const Graph &x, MatType &mat, const VType *sources=nullp } #endif blaze::DynamicMatrix working_space(nt, boost::num_vertices(x)); -#ifndef USE_BOOST_PARALLEL OMP_PFOR -#endif for(size_t i = 0; i < nrows; ++i) { unsigned rowid = 0; -#if !defined(USE_BOOST_PARALLEL) OMP_ONLY(rowid = omp_get_thread_num();) -#endif auto vtx = all_sources ? 
vertices[i]: (*sources)[i]; auto wrow(row(working_space, rowid BLAZE_CHECK_DEBUG)); boost::dijkstra_shortest_paths(x, vtx, boost::distance_map(&wrow[0])); @@ -55,14 +52,7 @@ void fill_graph_distmat(const Graph &x, MatType &mat, const VType *sources=nullp } } else { assert(ncol == boost::num_vertices(x)); -#ifndef NDEBUG - if(all_sources) { - assert(boost::num_vertices(x) == nrows); - } -#endif -#ifndef USE_BOOST_PARALLEL OMP_PFOR -#endif for(size_t i = 0; i < nrows; ++i) { auto mr = row(~mat, i BLAZE_CHECK_DEBUG); auto vtx = all_sources || sources == nullptr ? vertices[i]: (*sources)[i]; @@ -91,14 +81,14 @@ graph2diskmat(const Graph &x, std::string path, const VType *sources=nullptr, bo template::vertex_descriptor>> -blz::DynamicMatrix +blaze::DynamicMatrix graph2rammat(const Graph &x, std::string, const VType *sources=nullptr, bool only_sources_as_dests=false, bool all_sources=false) { static_assert(std::is_arithmetic::value, "This should be floating point, or at least arithmetic"); using FT = typename Graph::edge_property_type::value_type; size_t nv = sources && only_sources_as_dests ? sources->size(): boost::num_vertices(x); size_t nrows = all_sources || !sources ? boost::num_vertices(x): sources->size(); std::fprintf(stderr, "all sources: %d. nrows: %zu\n", all_sources, nrows); - blz::DynamicMatrix ret(nrows, nv); + blaze::DynamicMatrix ret(nrows, nv); fill_graph_distmat(x, ret, sources, only_sources_as_dests, all_sources); return ret; } diff --git a/include/minocore/hash/hash.h b/include/minocore/hash/hash.h index bf5c644e..4470b99a 100644 --- a/include/minocore/hash/hash.h +++ b/include/minocore/hash/hash.h @@ -77,7 +77,7 @@ struct cms_distribution { } }; -template +template class JSDLSHasher { // See https://papers.nips.cc/paper/9195-locality-sensitive-hashing-for-f-divergences-mutual-information-loss-and-beyond.pdf // for the function. @@ -85,8 +85,8 @@ class JSDLSHasher { // // This relies on a U/H-approximation of the JSD (capacitory discrimination in Topsoe, 2000) // by the Hellinger distance, and uses an LSH for the Hellinger as-is. - blz::DM randproj_; - blz::DV boffsets_; + blaze::DynamicMatrix randproj_; + blaze::DynamicVector boffsets_; LSHasherSettings settings_; public: using ElementType = FT; @@ -96,33 +96,33 @@ class JSDLSHasher { if(seed == 0) seed = nd * nh + r; std::mt19937_64 mt(seed); std::normal_distribution gen; - randproj_ = blz::generate(nh, nd, [&](size_t x, size_t y){ + randproj_ = blaze::generate(nh, nd, [&](size_t x, size_t y){ std::mt19937_64 mt(seed + x + seed * y); return gen(mt); }); randproj_ /= r; - boffsets_ = blz::generate(nh, [&](size_t){return FT(mt()) / mt.max();}); + boffsets_ = blaze::generate(nh, [&](size_t){return FT(mt()) / mt.max();}); assert(settings_.k_ * settings_.l_ == randproj_.rows()); // In case of overflow, I suppose } template - decltype(auto) hash(const blz::Vector &input) const { + decltype(auto) hash(const blaze::Vector &input) const { //std::fprintf(stderr, "Regular input size: %zu. my rows/col:%zu/%zu\n", (~input).size(), randproj_.rows(), randproj_.columns()); - return blz::ceil(randproj_ * blz::sqrt(~input) + boffsets_); + return blaze::ceil(randproj_ * blaze::sqrt(~input) + boffsets_); } template - decltype(auto) hash(const blz::Vector &input) const { + decltype(auto) hash(const blaze::Vector &input) const { //std::fprintf(stderr, "Reversed input size: %zu. 
my rows/col:%zu/%zu\n", (~input).size(), randproj_.rows(), randproj_.columns()); - return blz::ceil(randproj_ * trans(blz::sqrt(~input)) + boffsets_); + return blaze::ceil(randproj_ * trans(blaze::sqrt(~input)) + boffsets_); } template - decltype(auto) hash(const blz::Matrix &input) const { + decltype(auto) hash(const blaze::Matrix &input) const { //std::fprintf(stderr, "Regular input rows/col: %zu/%zu. my rows/col:%zu/%zu\n", (~input).rows(), (~input).columns(), randproj_.rows(), randproj_.columns()); - return trans(blz::ceil(randproj_ * trans(blz::sqrt(~input)) + blz::expand(boffsets_, (~input).rows()))); + return trans(blaze::ceil(randproj_ * trans(blaze::sqrt(~input)) + blaze::expand(boffsets_, (~input).rows()))); } template - decltype(auto) hash(const blz::Matrix &input) const { + decltype(auto) hash(const blaze::Matrix &input) const { //std::fprintf(stderr, "Reversed SO input rows/col: %zu/%zu. my rows/col:%zu/%zu\n", (~input).rows(), (~input).columns(), randproj_.rows(), randproj_.columns()); - return trans(blz::ceil(randproj_ * blz::sqrt(~input) + blz::expand(boffsets_, (~input).columns()))); + return trans(blaze::ceil(randproj_ * blaze::sqrt(~input) + blaze::expand(boffsets_, (~input).columns()))); } const auto &matrix() const {return randproj_;} auto dim() const {return randproj_.columns();} @@ -131,7 +131,8 @@ class JSDLSHasher { auto l() const {return settings_.l_;} const auto &settings() const {return settings_;} }; -template + +template class HellingerLSHasher: public JSDLSHasher { public: template @@ -141,12 +142,8 @@ class HellingerLSHasher: public JSDLSHasher { template class Distribution, typename FT, bool SO, bool use_offsets, typename...Args> class PStableLSHasher { - // See S2JSD-LSH: A Locality-Sensitive Hashing Schema for Probability Distributions - // https://aaai.org/ocs/index.php/AAAI/AAAI17/paper/view/14692 - // for the derivation - // Note that this is an LSH for the JS Metric, not the JSD. - blz::DM randproj_; - blz::DV boffsets_; + blaze::DynamicMatrix randproj_; + blaze::DynamicVector boffsets_; LSHasherSettings settings_; double w_; public: @@ -161,34 +158,34 @@ class PStableLSHasher { auto nd = settings.dim_; if(seed == 0) seed = nd * nh + w + 1. / w; std::mt19937_64 mt(seed); - randproj_ = blz::abs(blz::generate(nh, nd, [&](size_t, size_t){return gen(mt);}) * (1. / w)); + randproj_ = blaze::abs(blaze::generate(nh, nd, [&](size_t, size_t){return gen(mt);}) * (1. / w)); if constexpr(use_offsets) - boffsets_ = blz::generate(nh, [&](size_t){return FT(mt() / 2) / mt.max();}) - 0.5; + boffsets_ = blaze::generate(nh, [&](size_t){return FT(mt() / 2) / mt.max();}) - 0.5; assert(settings_.k_ * settings_.l_ == randproj_.rows()); // In case of overflow, I suppose } template - decltype(auto) hash(const blz::Vector &input) const { - if constexpr(use_offsets) return blz::floor(randproj_ * (~input) + 1.) + boffsets_; - else return blz::floor(randproj_ * (~input)); + decltype(auto) hash(const blaze::Vector &input) const { + if constexpr(use_offsets) return blaze::floor(randproj_ * (~input) + 1. + boffsets_); + else return blaze::floor(randproj_ * (~input)); } template - decltype(auto) hash(const blz::Vector &input) const { - if constexpr(use_offsets) return blz::floor(randproj_ * trans(~input) + 1.) + boffsets_; - else return blz::floor(randproj_ * trans(~input)); + decltype(auto) hash(const blaze::Vector &input) const { + if constexpr(use_offsets) return blaze::floor(randproj_ * trans(~input) + 1. 
+ boffsets_); + else return blaze::floor(randproj_ * trans(~input)); } template - decltype(auto) hash(const blz::Matrix &input) const { + decltype(auto) hash(const blaze::Matrix &input) const { if constexpr(use_offsets) - return trans(blz::floor(randproj_ * trans(~input)) + blz::expand(boffsets_, (~input).rows())); + return trans(blaze::floor(randproj_ * trans(~input) + blaze::expand(boffsets_, (~input).rows()))); else - return trans(blz::floor(randproj_ * trans(~input))); + return trans(blaze::floor(randproj_ * trans(~input))); } template - decltype(auto) hash(const blz::Matrix &input) const { + decltype(auto) hash(const blaze::Matrix &input) const { if constexpr(use_offsets) - return trans(blz::floor(randproj_ * trans(~input)) + blz::expand(boffsets_, (~input).columns())); + return trans(blaze::floor(randproj_ * trans(~input) + blaze::expand(boffsets_, (~input).columns()))); else - return trans(blz::floor(randproj_ * trans(~input))); + return trans(blaze::floor(randproj_ * trans(~input))); } const auto &matrix() const {return randproj_;} auto dim() const {return settings_.dim_;} @@ -198,7 +195,7 @@ class PStableLSHasher { const auto &settings() const {return settings_;} }; -template +template class L2LSHasher: public PStableLSHasher { public: using super = PStableLSHasher; @@ -208,7 +205,7 @@ class L2LSHasher: public PStableLSHasher +template class L1LSHasher: public PStableLSHasher { public: using super = PStableLSHasher; @@ -217,7 +214,7 @@ class L1LSHasher: public PStableLSHasher +template class LpLSHasher: public PStableLSHasher { public: using super = PStableLSHasher; @@ -227,7 +224,7 @@ class LpLSHasher: public PStableLSHasher } }; -template +template class ClippedL1LSHasher: public PStableLSHasher { public: using super = PStableLSHasher; @@ -235,7 +232,7 @@ class ClippedL1LSHasher: public PStableLSHasher(args)...) {} }; -template +template class TVDLSHasher: public L1LSHasher { public: using super = L1LSHasher; @@ -244,14 +241,14 @@ class TVDLSHasher: public L1LSHasher { }; -template +template class S2JSDLSHasher { // See S2JSD-LSH: A Locality-Sensitive Hashing Schema for Probability Distributions // https://aaai.org/ocs/index.php/AAAI/AAAI17/paper/view/14692 // for the derivation // Note that this is an LSH for the JS Metric, not the JSD. - blz::DM randproj_; - blz::DV boffsets_; + blaze::DynamicMatrix randproj_; + blaze::DynamicVector boffsets_; LSHasherSettings settings_; double w_; public: @@ -263,25 +260,25 @@ class S2JSDLSHasher { if(seed == 0) seed = nd * nh + w + 1. / w; std::mt19937_64 mt(seed); std::normal_distribution gen; - randproj_ = blz::abs(blz::generate(nh, nd, [&](size_t, size_t){return gen(mt);}) * (4. / (w * w))); - boffsets_ = blz::generate(nh, [&](size_t){return FT(mt() / 2) / mt.max();}) - 0.5; + randproj_ = blaze::abs(blaze::generate(nh, nd, [&](size_t, size_t){return gen(mt);}) * (4. / (w * w))); + boffsets_ = blaze::generate(nh, [&](size_t){return FT(mt() / 2) / mt.max();}) - 0.5; assert(settings_.k_ * settings_.l_ == randproj_.rows()); // In case of overflow, I suppose } template - decltype(auto) hash(const blz::Vector &input) const { - return blz::floor(blz::sqrt(randproj_ * (~input) + 1.) + boffsets_); + decltype(auto) hash(const blaze::Vector &input) const { + return blaze::floor(blaze::sqrt(randproj_ * (~input) + 1.) + boffsets_); } template - decltype(auto) hash(const blz::Vector &input) const { - return blz::floor(blz::sqrt(randproj_ * trans(~input) + 1.) 
+ boffsets_); + decltype(auto) hash(const blaze::Vector &input) const { + return blaze::floor(blaze::sqrt(randproj_ * trans(~input) + 1.) + boffsets_); } template - decltype(auto) hash(const blz::Matrix &input) const { - return trans(blz::floor(blz::sqrt(randproj_ * trans(~input) + 1.) + blz::expand(boffsets_, (~input).rows()))); + decltype(auto) hash(const blaze::Matrix &input) const { + return trans(blaze::floor(blaze::sqrt(randproj_ * trans(~input) + 1.) + blaze::expand(boffsets_, (~input).rows()))); } template - decltype(auto) hash(const blz::Matrix &input) const { - return trans(blz::floor(blz::sqrt(randproj_ * (trans(~input)) + 1.) + blz::expand(boffsets_, (~input).columns()))); + decltype(auto) hash(const blaze::Matrix &input) const { + return trans(blaze::floor(blaze::sqrt(randproj_ * (trans(~input)) + 1.) + blaze::expand(boffsets_, (~input).columns()))); } const auto &matrix() const {return randproj_;} auto dim() const {return settings_.dim_;} @@ -326,6 +323,7 @@ struct LSHTable { const unsigned nh_; XXHasher xxhasher_; OMP_ONLY(std::unique_ptr mutexes;) + size_t ids_used_ = 0; static constexpr bool SO = Hasher::StorageOrder; @@ -370,8 +368,8 @@ struct LSHTable { return hasher_.hash(q); } template - void add(const blz::Vector &input, IT id) { - auto hv = blz::evaluate(hash(input)); + void add(const blaze::Vector &input, IT id) { + auto hv = blaze::evaluate(hash(input)); if(unlikely(nh_ != hv.size())) { std::fprintf(stderr, "[%s] nh_: %u. hv.size: %zu\n", __PRETTY_FUNCTION__, nh_, hv.size()); std::exit(1); @@ -381,10 +379,11 @@ struct LSHTable { auto hh = xxhasher_(&hv[i * st.k_], sizeof(ElementType) * st.k_); insert(i, hh, id); } + ++ids_used_; } template - void add(const blz::Matrix &input, IT idoffset=0) { - auto hv = blz::evaluate(hash(input)); + void add(const blaze::Matrix &input, IT idoffset=0) { + auto hv = blaze::evaluate(hash(input)); std::fprintf(stderr, "hv shape: %zu/%zu.\n", hv.rows(), hv.columns()); if(nh_ != hv.columns()) { std::fprintf(stderr, "[%s] nh_: %u. 
hv.columns: %zu\n", __PRETTY_FUNCTION__, nh_, hv.columns());
@@ -396,15 +395,41 @@
         }
         const size_t nr = (~input).rows();
         const auto _l = l(), _k = k();
+        OMP_PFOR
         for(unsigned i = 0; i < nr; ++i) {
-            auto r = row(hv, i, blz::unchecked);
+            auto r = row(hv, i, blaze::unchecked);
             for(unsigned j = 0; j < _l; ++j) {
                 insert(j, xxhasher_(&r[j * _k], sizeof(ElementType) * _k), idoffset + i);
             }
         }
+        ids_used_ += nr;
+    }
+    template<typename VT, bool OSO>
+    std::vector<std::pair<IT, unsigned>> topk(const blaze::Vector<VT, OSO> &query, unsigned maxgather=0) const {
+        // TODO: build with a heap
+        if(!maxgather) maxgather = ids_used_;
+        std::vector<std::pair<IT, unsigned>> ret; // (id, number of tables in which it collides with the query)
+        auto hv = evaluate(hash(query));
+        for(unsigned i = 0; i < l(); ++i) {
+            if(auto it = tables_[i].find(xxhasher_(&hv[i * k()], sizeof(ElementType) * k()));
+               it != tables_[i].end())
+            {
+                for(const auto v: it->second) {
+                    auto rit = std::find_if(ret.begin(), ret.end(), [v](auto x) {return x.first == v;});
+                    if(rit == ret.end()) ret.push_back({v, 1u});
+                    else ++rit->second;
+                }
+            }
+        }
+        shared::sort(ret.begin(), ret.end(), [](auto x, auto y) {return x.second > y.second;});
+        if(maxgather < ret.size()) ret.resize(maxgather);
+        return ret;
     }
     template
-    shared::flat_hash_map query(const blz::Vector &query) const {
+    shared::flat_hash_map query(const blaze::Vector &query) const {
         auto hv = evaluate(hash(query));
         shared::flat_hash_map ret;
         for(unsigned i = 0; i < l(); ++i) {
@@ -422,7 +447,7 @@
     }
     template
     std::vector>
-    query(const blz::Matrix &query) const {
+    query(const blaze::Matrix &query) const {
         auto hv = evaluate(hash(query));
         //std::fprintf(stderr, "hv rows: %zu. columns: %zu. nh: %u. input num rows: %zu. input col: %zu\n", hv.rows(), hv.columns(), nh_, (~query).rows(), (~query).columns());
         if(hv.columns() != nh_) throw std::runtime_error("Wrong number of columns");
diff --git a/include/minocore/minocore.h b/include/minocore/minocore.h
index 2d3824f1..ffbbe033 100644
--- a/include/minocore/minocore.h
+++ b/include/minocore/minocore.h
@@ -11,6 +11,8 @@
 #include
+#include
+
 #include
 #endif
diff --git a/include/minocore/optim/graph_thorup.h b/include/minocore/optim/graph_thorup.h
index e4c63392..20553e3a 100644
--- a/include/minocore/optim/graph_thorup.h
+++ b/include/minocore/optim/graph_thorup.h
@@ -265,7 +265,7 @@ std::vector::vertex_descriptor>
 }
 template
-std::pair(), std::declval()))>>,
+std::pair(), std::declval()))>>,
           std::vector>
 get_costs(Graph &x, const Container &container) {
     using edge_cost = std::decay_t()))>;
@@ -274,7 +274,7 @@ get_costs(Graph &x, const Container &container) {
     util::ScopedSyntheticVertex vx(x);
     std::vector assignments(boost::num_vertices(x));
-    blz::DV costs(boost::num_vertices(x));
+    blaze::DynamicVector costs(boost::num_vertices(x));
     std::vector p(boost::num_vertices(x));
     auto synthetic_vertex = vx.get();
@@ -360,9 +360,9 @@ thorup_sample_mincost(Graph &x, unsigned k, uint64_t seed, unsigned num_iter,
 }
 template
-blz::DV histogram_assignments(const Con &c, unsigned ncenters, const VertexContainer &vtces) {
+blaze::DynamicVector histogram_assignments(const Con &c, unsigned ncenters, const VertexContainer &vtces) {
     const size_t n = std::size(vtces);
-    blz::DV ret(ncenters, static_cast(0));
+    blaze::DynamicVector ret(ncenters, static_cast(0));
     OMP_PFOR
     for(size_t i = 0; i < n; ++i) {
         OMP_ATOMIC
diff --git a/include/minocore/optim/jv_solver.h b/include/minocore/optim/jv_solver.h
index f5d9c0c3..337265f8 100644
--- a/include/minocore/optim/jv_solver.h
+++
b/include/minocore/optim/jv_solver.h @@ -6,6 +6,7 @@ #include #include #include +#include "cpp-btree/btree/set.h" namespace minocore { @@ -44,12 +45,12 @@ struct edgetup: public packed::triple { } }; -} +} // namespace jvutil namespace jv { - -template, typename IT=uint32_t> +template, typename IT=uint32_t, + template class SortedSet=btree::set> struct JVSolver { static_assert(std::is_floating_point::value, "FT must be floating-point"); @@ -65,7 +66,7 @@ struct JVSolver { return lhs.first < rhs.first || lhs.second < rhs.second; } }; - struct payment_queue: public std::set { + struct payment_queue: public SortedSet { void push(payment_t payment) { this->insert(payment); } @@ -74,7 +75,7 @@ struct JVSolver { this->insert(start, end); } auto top() const { - if(this->empty()) throw std::runtime_error("Attempting to access an empty structure"); + if(unlikely(this->empty())) throw std::runtime_error("Attempting to access an empty structure"); return *this->begin(); } void pop_top() { @@ -284,7 +285,7 @@ struct JVSolver { if(early_terminate && early_terminate->load()) return; // Assign all unassigned if(open_facilities.empty()) { - blz::DV fac_costs = blaze::sum(distmat); + blaze::DynamicVector fac_costs = blaze::sum(distmat); open_facilities.push_back(std::min_element(fac_costs.begin(), fac_costs.end()) - fac_costs.begin()); } for(const IT cid: unassigned_clients) { @@ -428,7 +429,7 @@ struct JVSolver { JVSolver(const MatrixType &mat, const CostType &cost): JVSolver() { setup(mat, cost); } - JVSolver(const MatrixType &mat): JVSolver(mat, blz::max(mat)) { + JVSolver(const MatrixType &mat): JVSolver(mat, blaze::max(mat)) { } template @@ -587,7 +588,7 @@ struct JVSolver { //DBG_ONLY(std::fprintf(stderr, "Trying to update by removing the next facility. Current in next_paid_ %zu\n", next_paid_.size());) n_open_clients_ = update_facilities(next_fac.second, working_open_facilities_[next_fac.second], time); time = next_fac.first; - if(current_n == next_paid_.size()) // If it wasn't removed + if(current_n == static_cast(next_paid_.size())) // If it wasn't removed next_paid_.pop_top(); //DBG_ONLY(std::fprintf(stderr, "n open: %zu. time: %0.12g. 
Now facilities left to pay: %zu\n", size_t(n_open_clients_), time, next_paid_.size());)
         } else {
@@ -890,7 +891,7 @@
         return std::make_pair(final_open_facilities_, final_open_facility_assignments_);
     }
     IT local_best_to_add() const {
-        blz::DV current_costs = blz::min(blz::rows(*distmatp_, final_open_facilities_.data(), final_open_facilities_.size()));
+        blaze::DynamicVector<FT> current_costs = blaze::min(blaze::rows(*distmatp_, final_open_facilities_.data(), final_open_facilities_.size()));
         FT max_improvement = -std::numeric_limits<FT>::max();
         IT bestind = -1;
         for(size_t i = 0; i < nfac_; ++i) {
@@ -908,7 +909,7 @@
         return bestind;
     }
     IT local_best_to_rm() const {
-        blz::DV current_costs = blz::min(blz::rows(*distmatp_, final_open_facilities_.data(), final_open_facilities_.size()));
+        blaze::DynamicVector<FT> current_costs = blaze::min(blaze::rows(*distmatp_, final_open_facilities_.data(), final_open_facilities_.size()));
         FT min_loss = std::numeric_limits<FT>::max();
         IT bestind = -1;
         std::unique_ptr<IT[]> min_counters(new IT[ncities_]());
@@ -933,6 +934,12 @@
     }
 };
+template<typename MT, typename FT=blaze::ElementType_t<MT>, typename IT=uint32_t>
+auto make_jv_solver(const MT &mat) {
+    return JVSolver<MT, FT, IT>(mat);
+}
+
+
 } // namespace jv
 } // namespace minocore
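A hedged usage sketch (not part of the patch) of the factory above. The cost-matrix shape and the `kmedian`-style entry point are assumptions for illustration, not committed API; only `make_jv_solver` and the facility-cost default visible in this header are taken from the code:

    #include "minocore/optim/jv_solver.h"
    // rows = facilities, columns = cities/clients
    blaze::DynamicMatrix<float> costs(200, 5000);
    randomize(costs); // blaze's built-in randomize, for demonstration data
    auto solver = minocore::jv::make_jv_solver(costs); // payment queues backed by btree::set
    // Facility cost defaults to blaze::max(costs); a k-median-style solve would then
    // look roughly like: auto [facs, assignments] = solver.kmedian(/*k=*/25);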
diff --git a/include/minocore/optim/kcenter.h b/include/minocore/optim/kcenter.h
index 7214e7db..54057899 100644
--- a/include/minocore/optim/kcenter.h
+++ b/include/minocore/optim/kcenter.h
@@ -1,4 +1,5 @@
-#pragma once
+#ifndef FGC_OPTIM_KCENTER_H__
+#define FGC_OPTIM_KCENTER_H__
 #include "minocore/coreset/matrix_coreset.h"
 #include "minocore/util/div.h"
 #include "minocore/util/blaze_adaptor.h"
@@ -29,341 +30,57 @@ kcenter_greedy_2approx(Iter first, Iter end, RNG &rng, size_t k, const Norm &nor
     if(maxdest == 0) maxdest = np;
     std::vector<IT> centers(k);
     std::vector<FT> distances(np, 0.);
+    static constexpr FT startval = std::is_floating_point<FT>::value ? -std::numeric_limits<FT>::max(): std::numeric_limits<FT>::min();
+    std::pair<FT, IT> maxdist(startval, 0);
     VERBOSE_ONLY(std::fprintf(stderr, "[%s] Starting kcenter_greedy_2approx\n", __PRETTY_FUNCTION__);)
-    {
-        auto fc = rng() % maxdest;
-        centers[0] = fc;
-        distances[fc] = 0.;
-        OMP_ELSE(OMP_PFOR, SK_UNROLL_8)
-        for(size_t i = 0; i < maxdest; ++i) {
-            if(unlikely(i == fc)) continue;
-            distances[i] = dm(fc, i);
+    auto newc = rng() % maxdest;
+    centers[0] = newc;
+    distances[newc] = 0.;
+#ifdef _OPENMP
+    #pragma omp declare reduction (max : std::pair<FT, IT> : omp_out = std::max(omp_in, omp_out)) initializer(omp_priv = omp_orig)
+    #pragma omp parallel for reduction(max: maxdist)
+#else
+    SK_UNROLL_8
+#endif
+    for(IT i = 0; i < maxdest; ++i) {
+        if(likely(i != newc)) {
+            auto v = dm(newc, i);
+            distances[i] = v;
+            maxdist = std::max(maxdist, std::make_pair(v, i));
         }
-        assert(distances[fc] == 0.);
     }
-
-    for(size_t ci = 1; ci < k; ++ci) {
-        auto it = std::max_element(distances.begin(), distances.end());
-        VERBOSE_ONLY(std::fprintf(stderr, "maxelement is %zd from start\n", std::distance(distances.begin(), it));)
-        uint64_t newc = it - distances.begin();
-        centers[ci] = newc;
-        distances[newc] = 0.;
-        OMP_PFOR
+    assert(distances[newc] == 0.);
+    if(k == 1) return centers;
+    centers[1] = newc = maxdist.second;
+    distances[newc] = 0.;
+
+    for(size_t ci = 2; ci < k; ++ci) {
+        //auto it = std::max_element(distances.begin(), distances.end());
+        //VERBOSE_ONLY(std::fprintf(stderr, "maxelement is %zd from start\n", std::distance(distances.begin(), it));)
+        maxdist = std::pair<FT, IT>(startval, 0);
+#ifdef _OPENMP
+        #pragma omp declare reduction (max : std::pair<FT, IT> : omp_out = std::max(omp_in, omp_out)) initializer(omp_priv = omp_orig)
+        #pragma omp parallel for reduction(max: maxdist)
+#else
+        SK_UNROLL_8
+#endif
         for(IT i = 0; i < maxdest; ++i) {
             if(unlikely(i == newc)) continue;
             auto &ldist = distances[i];
+            if(!ldist) continue; // already a center (or a duplicate of one)
             const auto dist = dm(newc, i);
             if(dist < ldist) {
                 ldist = dist;
             }
+            maxdist = std::max(maxdist, std::make_pair(ldist, i));
         }
-        assert(std::find_if(distances.begin(), distances.end(), [](auto x) {return std::isnan(x) || std::isinf(x);})
-               == distances.end());
+        centers[ci] = newc = maxdist.second;
+        distances[newc] = 0.;
     }
     return centers;
 } // kcenter_greedy_2approx

-namespace outliers {
-
-/*
-// All algorithms in this namespace are from:
-// Greedy Strategy Works for k-Center Clustering with Outliers and Coreset Construction
-// Hu Ding, Haikuo Yu, Zixiu Wang
-*/
-
-namespace detail {
-template>,
-         typename Cmp=std::greater<>>
-struct fpq: public std::priority_queue, Container, Cmp> {
-    // priority queue providing access to underlying constainer with getc()
-    // , a reserve function and that defaults to std::greater<> for farthest points.
-    using super = std::priority_queue, Container, Cmp>;
-    template
-    fpq(Args &&...args): super(std::forward(args)...) {}
-    void reserve(size_t n) {this->c.reserve(n);}
-    auto &getc() {return this->c;}
-    const auto &getc() const {return this->c;}
-};
-} // detail
-
-
-
-template
-struct bicriteria_result_t: public std::tuple, IVec, std::vector>, double> {
-    using super = std::tuple, IVec, std::vector>, double>;
-    template
-    bicriteria_result_t(Args &&...args): super(std::forward(args)...)
{} - auto ¢ers() {return std::get<0>(*this);} - auto &assignments() {return std::get<1>(*this);} - // alias - auto &labels() {return assignments();} - auto &outliers() {return std::get<2>(*this);} - double outlier_threshold() const {return std::get<3>(*this);} - size_t num_centers() const {return centers().size();} -}; - -/* -// Algorithm 1 from the above DYW paper -// Z = # outliers -// \mu = quality of coreset -// size of coreset: 2z + O((2/\mu)^p k) -// \gamma = z / n -*/ - -template, - typename IT=std::uint32_t, typename RNG, typename Norm=sqrL2Norm> -bicriteria_result_t -kcenter_bicriteria(Iter first, Iter end, RNG &rng, size_t k, double eps, - double gamma=0.001, size_t t = 100, double eta=0.01, - const Norm &norm=Norm()) -{ - std::fprintf(stderr, "Note: the value k (%zu) is not used in this function or the algorithm\n", k); - auto dm = make_index_dm(first, norm); - // Step 1: constants - assert(end > first); - size_t np = end - first; - const size_t z = std::ceil(gamma * np); - std::fprintf(stderr, "z: %zu\n", z); - size_t farthestchunksize = std::ceil((1 + eps) * z), - samplechunksize = std::ceil(std::log(1./eta) / (1 - gamma)); - IVec ret; - IVec labels(np); - ret.reserve(samplechunksize); - std::vector distances(np); - // randomly select 'log(1/eta) / (1 - eps)' vertices from X and add them to E. - while(ret.size() < samplechunksize) { - // Assuming that this is relatively small and we can take bad asymptotic complexity - auto newv = rng() % np; - if(std::find(ret.begin(), ret.end(), newv) == ret.end()) - push_back(ret, newv); - } - assert(flat_hash_set(ret.begin(), ret.end()).size() == ret.size()); - if(samplechunksize > 100) { - std::fprintf(stderr, "Warning: with samplechunksize %zu, it may end up taking a decent amount of time. Consider swapping this in for a hash set.", samplechunksize); - } - if(samplechunksize > farthestchunksize) { - std::fprintf(stderr, "samplecc is %zu (> fcs %zu). changing gcs to scc + z (%zu)\n", samplechunksize, farthestchunksize, samplechunksize + z); - farthestchunksize = samplechunksize + z; - } - detail::fpq pq; - pq.reserve(farthestchunksize + 1); - const auto fv = ret[0]; - labels[fv] = fv; - distances[fv] = 0.; - // Fill the priority queue from the first set - OMP_PFOR - for(size_t i = 0; i < np; ++i) { - double dist = dm(fv, i); - double newdist; - IT label = 0; // This label is an index into the ret vector, rather than the actual index - for(size_t j = 1, e = ret.size(); j < e; ++j) { - if((newdist = dm(i, ret[j])) < dist) { - label = j; - dist = newdist; - } - } - distances[i] = dist; - labels[i] = ret[label]; - if(pq.empty() || dist > pq.top().first) { - const auto p = std::make_pair(dist, i); - OMP_CRITICAL - { - // Check again after getting the lock - if(pq.empty() || dist > pq.top().first) { - pq.push(p); - if(pq.size() > farthestchunksize) - pq.pop(); - } - } - } - } - IVec random_samples(samplechunksize); - // modulo without a div/mod instruction, much faster - schism::Schismatic div(farthestchunksize); // pq size - assert(samplechunksize >= 1.); - for(size_t j = 0;j < t;++j) { - //std::fprintf(stderr, "j: %zu/%zu\n", j, t); - // Sample 'samplechunksize' points from pq into random_samples. 
- // Sample them - size_t rsi = 0; - IT *rsp = random_samples.data(); - do { - IT index = div.mod(rng()); - // (Without replacement) - if(std::find(rsp, rsp + rsi, index)) - rsp[rsi++] = index; - } while(rsi < samplechunksize); - // random_samples now contains indexes *into pq* - assert(pq.getc().data()); - std::transform(rsp, rsp + rsi, rsp, - [pqi=pq.getc().data()](auto x) { - return pqi[x].second; - }); - for(size_t i = 0; i < rsi; ++i) - assert(rsp[i] < np); - // random_samples now contains indexes *into original dataset* - - // Insert into solution -#if 0 - ret.insert(ret.end(), rsp, rsp + rsi); -#else - for(auto it = rsp, e = rsp + rsi; it < e;++it) { - if(std::find(ret.begin(), ret.end(), *it) != ret.end()) continue; - distances[*it] = 0.; - labels[*it] = *it; - ret.pushBack(*it); - } -#endif - - // compare each point against all of the new points - pq.getc().clear(); // empty priority queue - // Fill priority queue - OMP_PFOR - for(size_t i = 0; i < np; ++i) { - double dist = distances[i]; - if(dist == 0.) continue; - double newdist; - IT label = labels[i]; - for(size_t j = 0; j < rsi; ++j) { - if((newdist = dm(i, rsp[j])) < dist) - dist = newdist, label = rsp[j]; - } - distances[i] = dist; - labels[i] = label; - if(pq.empty() || dist > pq.top().first) { - const auto p = std::make_pair(dist, i); - OMP_CRITICAL - { - // Check again after getting the lock in case it's changed - if(pq.empty() || dist > pq.top().first) { - pq.push(p); - if(pq.size() > farthestchunksize) - // TODO: avoid filling it all the way by checking size but it's probably not worth it - pq.pop(); - } - } - } - } - } - const double minmaxdist = pq.top().first; - bicriteria_result_t bicret; - assert(flat_hash_set(ret.begin(), ret.end()).size() == ret.size()); - bicret.centers() = std::move(ret); - bicret.labels() = std::move(labels); - bicret.outliers() = std::move(pq.getc()); - std::fprintf(stderr, "outliers size: %zu\n", bicret.outliers().size()); - std::get<3>(bicret) = minmaxdist; - return bicret; - // center ids, label assignments for all points besides outliers, outliers, and the distance of the closest excluded point -} // kcenter_bicriteria - -/* -// Algorithm 2 from the above DYW paper -// Z = # outliers -// \gamma = z / n -*/ - -template, - typename IT=std::uint32_t, typename RNG, typename Norm=L2Norm> -std::vector -kcenter_greedy_2approx_outliers(Iter first, Iter end, RNG &rng, size_t k, double eps, - double gamma=0.001, - const Norm &norm=Norm()) -{ - auto dm = make_index_dm(first, norm); - const size_t np = end - first; - const size_t z = std::ceil(gamma * np); - size_t farthestchunksize = std::ceil((1. + eps) * z); - detail::fpq pq; - pq.reserve(farthestchunksize + 1); - std::vector ret; - std::vector distances(np, std::numeric_limits::max()); - ret.reserve(k); - auto newc = rng() % np; - ret.push_back(newc); - do { - const auto &newel = first[newc]; - // Fill pq - OMP_PFOR - for(size_t i = 0; i < np; ++i) { - double dist = distances[i]; - if(dist == 0.) 
continue; - double newdist; - if((newdist = dm(i, newc)) < dist) - dist = newdist; - distances[i] = dist; - if(pq.empty() || dist > pq.top().first) { - const auto p = std::make_pair(dist, i); - OMP_CRITICAL - { - if(pq.empty() || dist > pq.top().first) { - pq.push(p); - if(pq.size() > farthestchunksize) pq.pop(); - } - } - } - } - - // Sample point - newc = pq.getc()[rng() % farthestchunksize].second; - assert(newc < np); - ret.push_back(newc); - pq.getc().clear(); - } while(ret.size() < k); - return ret; -}// kcenter_greedy_2approx_outliers -// Algorithm 3 (coreset construction) -template, - typename IT=std::uint32_t, typename RNG, typename Norm=L2Norm> -coresets::IndexCoreset -kcenter_coreset(Iter first, Iter end, RNG &rng, size_t k, double eps=0.1, double mu=.5, - double rho=1.5, - double gamma=0.001, double eta=0.01, const Norm &norm=Norm()) { - // rho is 'D' for R^D (http://www.wisdom.weizmann.ac.il/~robi/teaching/2014b-SeminarGeometryAlgorithms/lecture1.pdf) - // in Euclidean space, as worst-case, but usually better in real data with structure. - assert(mu > 0. && mu <= 1.); - const size_t np = end - first; - size_t L = std::ceil(std::pow(2. / mu, rho) * k); - size_t nrounds = std::ceil((L + std::sqrt(L)) / (1. - eta)); - auto bic = kcenter_bicriteria(first, end, rng, k, eps, - gamma, nrounds, eta, norm); - double rtilde = bic.outlier_threshold(); - std::fprintf(stderr, "outlier threshold: %f\n", rtilde); - auto ¢ers = bic.centers(); - auto &labels = bic.labels(); - auto &outliers = bic.outliers(); -#ifndef NDEBUG - for(const auto c: centers) - assert(c < np); - for(const auto label: labels) - assert(labels[label] == label); -#endif - //std::vector counts(centers.size()); - coresets::flat_hash_map counts; - counts.reserve(centers.size()); - size_t i = 0; - SK_UNROLL_8 - do ++counts[labels[i++]]; while(i < np); - coresets::IndexCoreset ret(centers.size() + outliers.size()); - std::fprintf(stderr, "ret size: %zu. centers size: %zu. counts size %zu. 
outliers size: %zu\n", ret.size(), centers.size(), counts.size(), outliers.size()); - for(i = 0; i < outliers.size(); ++i) { - assert(outliers[i].second < np); - ret.indices_[i] = outliers[i].second; - ret.weights_[i] = 1.; - } - for(const auto &pair: counts) { - assert(pair.first < np); - ret.weights_[i] = pair.second; - ret.indices_[i] = pair.first; - ++i; - } - assert(i == ret.size()); - for(size_t i = 0; i < ret.indices_.size(); ++i) { - assert(ret.indices_[i] < np); - } - return ret; -} -}// namespace outliers - } // coresets } // minocore + +#endif /* FGC_OPTIM_KCENTER_H__ */ diff --git a/include/minocore/optim/kmeans.h b/include/minocore/optim/kmeans.h index 19aec709..a5adc7aa 100644 --- a/include/minocore/optim/kmeans.h +++ b/include/minocore/optim/kmeans.h @@ -8,6 +8,7 @@ #include "minocore/util/oracle.h" #include "minocore/util/timer.h" #include "minocore/util/div.h" +#include "minocore/util/blaze_adaptor.h" namespace minocore { @@ -104,10 +105,10 @@ kmeanspp(Iter first, Iter end, RNG &rng, size_t k, const Norm &norm=Norm(), WFT } template -std::pair, blz::DV> get_oracle_costs(const Oracle &oracle, size_t np, const Sol &sol) +std::pair, blaze::DynamicVector> get_oracle_costs(const Oracle &oracle, size_t np, const Sol &sol) { - blz::DV assignments(np); - blz::DV costs(np, std::numeric_limits::max()); + blaze::DynamicVector assignments(np); + blaze::DynamicVector costs(np, std::numeric_limits::max()); util::Timer t("get oracle costs"); OMP_PFOR for(size_t i = 0; i < np; ++i) { @@ -175,7 +176,6 @@ kmc2(const Oracle &oracle, RNG &rng, size_t np, size_t k, size_t m = 2000) } } } - std::fprintf(stderr, "[kmc2]: %zu/%zu\n", centers.size(), size_t(k)); centers.insert(x); } return std::vector(centers.begin(), centers.end()); diff --git a/include/minocore/optim/kmedian.h b/include/minocore/optim/kmedian.h index 9b5ef3b3..077c3c19 100644 --- a/include/minocore/optim/kmedian.h +++ b/include/minocore/optim/kmedian.h @@ -24,7 +24,7 @@ auto &geomedian(const blz::DenseMatrix &mat, blz::DenseVector & const auto &_mat = ~mat; ~dv = blz::mean(_mat); FT prevcost = std::numeric_limits::max(); - blz::DV costs(_mat.rows(), FT(0)); + blaze::DynamicVector costs(_mat.rows(), FT(0)); size_t iternum = 0; const size_t nr = _mat.rows(); assert((~dv).size() == (~mat).columns()); @@ -47,13 +47,14 @@ auto &geomedian(const blz::DenseMatrix &mat, blz::DenseVector & } template -void l1_unweighted_median(const blz::DenseMatrix &data, blz::DenseVector &ret, bool approx_med=false) { +void l1_unweighted_median(const blz::DenseMatrix &data, blz::DenseVector &ret) { assert((~ret).size() == (~data).columns()); auto &rr(~ret); const auto &dr(~data); const bool odd = dr.rows() % 2; const size_t hlf = dr.rows() / 2; - if(approx_med) { + if(0) { +#if 0 std::fprintf(stderr, "note: Boost approximate median takes more time and is less accurate than exact calculation via sorting.\nNot recommended.\n"); //using acc_tag = boost::accumulators::stats; using acc_tag = boost::accumulators::stats; @@ -63,6 +64,7 @@ void l1_unweighted_median(const blz::DenseMatrix &data, blz::DenseVector for(auto v: column(dr, i)) acc(v); (~ret)[i] = boost::accumulators::median(acc); } +#endif } else { for(size_t i = 0; i < dr.columns(); ++i) { blaze::DynamicVector, blaze::columnVector> tmpind = column(data, i); // Should do fast copying. 
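For reference, a minimal standalone sketch of the exact column-median step that `l1_unweighted_median` performs above (sort a copy of each column and take the middle element); the even-row averaging rule and the concrete types here are illustrative:

    #include <algorithm>
    #include <blaze/Math.h>
    blaze::DynamicMatrix<double> X(7, 3); // rows = points, columns = features
    blaze::randomize(X);
    blaze::DynamicVector<double, blaze::rowVector> med(X.columns());
    const bool odd = X.rows() % 2;
    const size_t hlf = X.rows() / 2;
    for(size_t i = 0; i < X.columns(); ++i) {
        blaze::DynamicVector<double> tmp = column(X, i); // copy, then sort
        std::sort(tmp.begin(), tmp.end());
        med[i] = odd ? tmp[hlf]: (tmp[hlf - 1] + tmp[hlf]) * .5;
    }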
@@ -75,13 +77,14 @@ void l1_unweighted_median(const blz::DenseMatrix &data, blz::DenseVector template, ElementType_t>, typename IT=uint32_t> -static inline void weighted_median(const blz::Matrix &data, blz::DenseVector &ret, const FT *weights, bool approx_med=false) { +static inline void weighted_median(const blz::Matrix &data, blz::DenseVector &ret, const FT *weights) { assert(weights); const size_t nc = (~data).columns(); if((~ret).size() != nc) { (~ret).resize(nc); } - if(approx_med) { + if(0) { +#if 0 //OMP_PFOR for(size_t i = 0; i < nc; ++i) { auto &mat = ~data; @@ -93,6 +96,7 @@ static inline void weighted_median(const blz::Matrix &data, blz::DenseVe } (~ret)[i] = boost::accumulators::median(acc); } +#endif } else { if(unlikely((~data).columns() > ((uint64_t(1) << (sizeof(IT) * CHAR_BIT)) - 1))) throw std::runtime_error("Use a different index type, there are more features than fit in IT"); @@ -129,11 +133,11 @@ static inline void weighted_median(const blz::Matrix &data, blz::DenseVe template, ElementType_t>> -void l1_median(const blz::DenseMatrix &data, blz::DenseVector &ret, const VT3 *weights=static_cast(nullptr), bool approx_med=false) { +void l1_median(const blz::DenseMatrix &data, blz::DenseVector &ret, const VT3 *weights=static_cast(nullptr)) { if(weights) - weighted_median(data, ret, weights, approx_med); + weighted_median(data, ret, weights); else - l1_unweighted_median(data, ret, approx_med); + l1_unweighted_median(data, ret); } diff --git a/include/minocore/optim/lsearch.h b/include/minocore/optim/lsearch.h index 062ec82c..abdbd676 100644 --- a/include/minocore/optim/lsearch.h +++ b/include/minocore/optim/lsearch.h @@ -1,7 +1,7 @@ #pragma once #ifndef FGC_LOCAL_SEARCH_H__ #define FGC_LOCAL_SEARCH_H__ -#include "minocore/util/diskmat.h" +#include "diskmat/diskmat.h" #include "minocore/util/oracle.h" #include "minocore/optim/kcenter.h" #include "pdqsort/pdqsort.h" @@ -33,7 +33,7 @@ struct ExhaustiveSearcher { const size_t nr = mat_.rows(); size_t nchecked = 0; for(auto &&comb: discreture::combinations(nr, k_)) { - const double cost = blz::sum(blz::min(rows(mat_, comb.data(), comb.size()))); + const double cost = blaze::sum(blaze::min(rows(mat_, comb.data(), comb.size()))); ++nchecked; if((nchecked & (nchecked - 1)) == 0) std::fprintf(stderr, "iteration %zu completed\n", nchecked); @@ -59,18 +59,19 @@ struct LocalKMedSearcher { const MatType &mat_; shared::flat_hash_set sol_; - blz::DV assignments_; - blz::DV current_costs_; + blaze::DynamicVector assignments_; + blaze::DynamicVector current_costs_; double current_cost_; double eps_, initial_cost_, init_cost_div_; IType k_; const size_t nr_, nc_; double diffthresh_; - blz::DV ordering_; + blaze::DynamicVector ordering_; uint32_t shuffle_:1; - uint32_t lazy_eval_:2; // Set to 0 to avoid lazy search, 1 to only do local search, and 2 to do lazy search and then use exhaustive + uint32_t lazy_eval_:15; uint32_t max_swap_n_:16; + // if(max_swap_n_ > 1), after exhaustive single-swap optimization, enables multiswap search. // TODO: enable searches for multiswaps. 
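+    // lazy_eval_ == 0: exhaustive search only; == 1: lazy search followed by an
+    // exhaustive pass; > 1: lazy search only (the default; see run()).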
// Constructors @@ -91,7 +92,7 @@ struct LocalKMedSearcher { current_cost_(std::numeric_limits::max()), eps_(eps), k_(k), nr_(mat.rows()), nc_(mat.columns()), - ordering_(mat.rows()), shuffle_(true), lazy_eval_(false), max_swap_n_(1) + ordering_(mat.rows()), shuffle_(true), lazy_eval_(2), max_swap_n_(1) { std::iota(ordering_.begin(), ordering_.end(), 0); static_assert(std::is_integral_voperator[](0))>>, "index container must contain integral values"); @@ -137,7 +138,7 @@ struct LocalKMedSearcher { //std::fprintf(stderr, "subm rows: %zu\n", subm.rows()); std::vector approx{uint32_t(rng() % subm.rows())}; auto first = approx.front(); - blz::DV mincosts = row(subm, first); + blaze::DynamicVector mincosts = row(subm, first); std::vector remaining(subm.rows()); std::iota(remaining.begin(), remaining.end(), 0u); while(approx.size() < std::min(subm.rows(), size_t(k_))) { @@ -217,40 +218,45 @@ struct LocalKMedSearcher { } double evaluate_swap(IType newcenter, IType oldcenter, bool single_threaded=false) const { - blz::SmallArray as(sol_.begin(), sol_.end()); + blaze::SmallArray as(sol_.begin(), sol_.end()); *std::find(as.begin(), as.end(), oldcenter) = newcenter; double cost; if(single_threaded) { - cost = blaze::serial(blz::sum(blz::serial(blz::min(rows(mat_, as))))); - } else cost = blz::sum(blz::min(rows(mat_, as))); + cost = blaze::serial(blaze::sum(blaze::serial(blaze::min(rows(mat_, as))))); + } else cost = blaze::sum(blaze::min(rows(mat_, as))); return current_cost_ - cost; } - template - double evaluate_multiswap(const IType *newcenter, const IType *oldcenter, bool single_threaded=false) const { - blz::SmallArray as(sol_.begin(), sol_.end()); + template + double evaluate_multiswap(const IndexType *newcenter, const IndexType *oldcenter, bool single_threaded=false) const { + blaze::SmallArray as(sol_.begin(), sol_.end()); + shared::sort(as.begin(), as.end()); for(size_t i = 0; i < N; ++i) { *std::find(as.begin(), as.end(), oldcenter[i]) = newcenter[i]; } double cost; if(single_threaded) { - cost = blaze::serial(blz::sum(blz::serial(blz::min(rows(mat_, as))))); + cost = blaze::serial(blaze::sum(blaze::serial(blaze::min(rows(mat_, as))))); } else - cost = blz::sum(blz::min(rows(mat_, as))); + cost = blaze::sum(blaze::min(rows(mat_, as))); return current_cost_ - cost; } template double evaluate_multiswap_rt(const IndexType *newcenter, const IndexType *oldcenter, size_t N, bool single_threaded=false) const { - blz::SmallArray as(sol_.begin(), sol_.end()); + switch(N) { + case 2: return evaluate_multiswap<2>(newcenter, oldcenter, single_threaded); + case 3: return evaluate_multiswap<3>(newcenter, oldcenter, single_threaded); + } + blaze::SmallArray as(sol_.begin(), sol_.end()); for(size_t i = 0; i < N; ++i) { *std::find(as.begin(), as.end(), oldcenter[i]) = newcenter[i]; } shared::sort(as.begin(), as.end()); double cost; if(single_threaded) { - cost = blaze::serial(blz::sum(blz::serial(blz::min(rows(mat_, as))))); + cost = blaze::serial(blaze::sum(blaze::serial(blaze::min(rows(mat_, as))))); } else - cost = blz::sum(blz::min(rows(mat_, as))); + cost = blaze::sum(blaze::min(rows(mat_, as))); return current_cost_ - cost; } @@ -270,11 +276,11 @@ struct LocalKMedSearcher { current_costs_ = row(mat_, *it BLAZE_CHECK_DEBUG); } while(++it != sol_.end()) { - current_costs_ = blz::min(current_costs_, row(mat_, *it BLAZE_CHECK_DEBUG)); + current_costs_ = blaze::min(current_costs_, row(mat_, *it BLAZE_CHECK_DEBUG)); } } - blz::DV newptr = blz::min(rows(mat_, newcenters, N)); - blz::DV oldptr = 
blz::min(rows(mat_, oldcenters, N)); + blaze::DynamicVector newptr = blaze::min(rows(mat_, newcenters, N)); + blaze::DynamicVector oldptr = blaze::min(rows(mat_, oldcenters, N)); double diff = 0.; #ifdef _OPENMP _Pragma("omp parallel for reduction(+:diff)") @@ -285,7 +291,7 @@ struct LocalKMedSearcher { auto sub = ccost - newptr[i]; diff += sub; } else if(ccost == oldptr[i]) { - auto oldbest = blz::min(blz::elements(blz::column(mat_, i), tmp.data(), tmp.size())); + auto oldbest = blaze::min(blaze::elements(blaze::column(mat_, i), tmp.data(), tmp.size())); auto sub = ccost - std::min(oldbest, newptr[i]); diff += sub; } @@ -309,6 +315,7 @@ struct LocalKMedSearcher { std::vector newindices(sol_.begin(), sol_.end()); next: for(const auto oldcenter: sol_) { + newindices.assign(sol_.begin(), sol_.end()); std::swap(*std::find(newindices.begin(), newindices.end(), oldcenter), newindices.back()); if(shuffle_) { wy::WyRand rng(total); @@ -317,7 +324,7 @@ struct LocalKMedSearcher { // Make a vector with the original solution, but replace the old value with the new value for(size_t pi = 0; pi < nr_; ++pi) { auto potential_index = ordering_[pi]; - if(sol_.find(potential_index) != sol_.end()) continue; + if(sol_.find(potential_index) != sol_.end() || potential_index == oldcenter) continue; newindices.back() = potential_index; assert(std::find(newindices.begin(), newindices.end(), oldcenter) == newindices.end()); double val = 0.; @@ -331,26 +338,22 @@ struct LocalKMedSearcher { auto diff = oldcost - newptr[i]; val += diff; } else if(assignments_[i] == oldcenter) { - auto mincost = blz::min(blz::elements(blz::column(mat_, i), newindices.data(), newindices.size())); + auto mincost = blaze::min(blaze::elements(blaze::column(mat_, i), newindices.data(), newindices.size())); auto diff = oldcost - mincost; val += diff; } } -#ifndef NDEBUG - auto v = evaluate_swap(potential_index, oldcenter); - //assert(std::abs(v - val) <= .5 * std::abs(std::max(v, val)) || !std::fprintf(stderr, "Manual: %g. Lazy: %g\n", v, val)); assert(sol_.size() == k_); -#endif // Only calculate exhaustively if the lazy form returns yes. 
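+                // (The inner parentheses matter: `val = evaluate_swap(...)` must bind
+                // before the comparison, or val is assigned the boolean result of the
+                // comparison rather than the swap's improvement.)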
- if(val > diffthresh_ && (val = evaluate_swap(potential_index, oldcenter) > diffthresh_)) { + if(val > diffthresh_ && (val = evaluate_swap(potential_index, oldcenter)) > diffthresh_) { assert(sol_.size() == k_); sol_.erase(oldcenter); sol_.insert(potential_index); assert(sol_.size() == k_); assign(); - //current_cost_ = blz::sum(current_costs_); + //current_cost_ = blaze::sum(current_costs_); ++total; - std::fprintf(stderr, "Swap number %zu updated with delta %g to new cost with cost %0.12g\n", total, val, current_cost_); + std::fprintf(stderr, "Swap number %zu updated with delta %.12g to new cost with cost %0.12g\n", total, val, current_cost_); goto next; } } @@ -371,10 +374,10 @@ struct LocalKMedSearcher { diffthresh_ = diffthresh; next: { - blz::DV csol(sol_.size()); + blaze::DynamicVector csol(sol_.size()); std::copy(sol_.begin(), sol_.end(), csol.data()); - blz::DV swap_in(nc_ - sol_.size()); - blz::DV inargs(nswap), outargs(nswap); + blaze::DynamicVector swap_in(nc_ - sol_.size()); + blaze::DynamicVector inargs(nswap), outargs(nswap); for(auto &&swap_out_comb: discreture::combinations(csol.size(), nswap)) { for(auto &&swap_in_comb: discreture::combinations(swap_in.size(), nswap)) { auto v = evaluate_multiswap_rt(swap_in_comb.data(), swap_out_comb.data(), nswap); @@ -395,9 +398,8 @@ struct LocalKMedSearcher { if(mat_.rows() <= k_) return; if(lazy_eval_) { run_lazy(); - if(lazy_eval_ == 2) + if(lazy_eval_ > 1) return; - // Otherwise, running exhaustive local search after to be sure. } //const double diffthresh = 0.; std::fprintf(stderr, "diffthresh: %f\n", diffthresh); @@ -448,7 +450,7 @@ struct LocalKMedSearcher { for(size_t ci = 0; ci < nr_; ++ci) { if(std::find(wsol.begin(), wsol.end(), ci) != wsol.end()) continue; wsol[si] = ci; - const double cost = blz::sum(blz::min(rows(mat_, wsol))); + const double cost = blaze::sum(blaze::min(rows(mat_, wsol))); if(cost < ccost) { std::fprintf(stderr, "Found a better one: %g vs %g (%g)\n", cost, ccost, ccost - cost); ccost = cost; diff --git a/include/minocore/optim/oracle_thorup.h b/include/minocore/optim/oracle_thorup.h index eb05a266..8c024235 100644 --- a/include/minocore/optim/oracle_thorup.h +++ b/include/minocore/optim/oracle_thorup.h @@ -6,6 +6,7 @@ #include #include "fastiota/fastiota_ho.h" #include "minocore/util/oracle.h" +#include "boost/iterator/transform_iterator.hpp" namespace minocore { @@ -24,10 +25,10 @@ template -std::tuple, blz::DV, std::vector> +std::tuple, blaze::DynamicVector, std::vector> oracle_thorup_d(const Oracle &oracle, size_t npoints, unsigned k, const WFT *weights=static_cast(nullptr), double npermult=21, double nroundmult=3, double eps=0.5, uint64_t seed=1337) { - const FT total_weight = weights ? static_cast(blz::sum(blz::CustomVector((WFT *)weights, npoints))) + const FT total_weight = weights ? 
static_cast(blaze::sum(blaze::CustomVector((WFT *)weights, npoints))) : static_cast(npoints); size_t nperround = npermult * k * std::log(total_weight) / eps; #if VERBOSE_AF @@ -36,7 +37,7 @@ oracle_thorup_d(const Oracle &oracle, size_t npoints, unsigned k, const WFT *wei #endif wy::WyRand rng(seed); - blz::DV mincosts(npoints, std::numeric_limits::max()); // minimum costs per point + blaze::DynamicVector mincosts(npoints, std::numeric_limits::max()); // minimum costs per point std::vector minindices(npoints, IT(-1)); // indices to which points are assigned size_t nr = npoints; // Manually managing count std::unique_ptr R(new IT[npoints]); @@ -56,6 +57,10 @@ oracle_thorup_d(const Oracle &oracle, size_t npoints, unsigned k, const WFT *wei if(!weights && nr <= nperround) { //std::fprintf(stderr, "Adding all\n"); F.insert(F.end(), R.get(), R.get() + nr); + prep_range(R.get(), R.get() + nr, oracle); + // This instructs caching oracles to prepare these rows + // and results in greater efficiency for cases + // where distance computations are expensive. for(auto it = R.get(), eit = R.get() + nr; it < eit; ++it) { auto v = *it; //std::fprintf(stderr, "Adding index %zd/value %u\n", it - R.get(), v); @@ -113,10 +118,11 @@ oracle_thorup_d(const Oracle &oracle, size_t npoints, unsigned k, const WFT *wei } // Update F, R, and mincosts/minindices current_batch.assign(tmp.begin(), tmp.end()); - tmp.clear(); - for(const auto item: current_batch) - F.push_back(R[item]); - shared::sort(current_batch.begin(), current_batch.end(), std::greater<>()); + auto func = [&R](auto x) {return R[x];}; + auto clb = boost::make_transform_iterator(current_batch.begin(), func), + cle = boost::make_transform_iterator(current_batch.end(), func); + F.insert(F.end(), clb, cle); + prep_range(clb, cle, oracle); for(const auto v: current_batch) { auto actual_index = R[v]; minindices[actual_index] = actual_index; @@ -178,7 +184,7 @@ template -std::tuple, blz::DV, std::vector> +std::tuple, blaze::DynamicVector, std::vector> iterated_oracle_thorup_d(const Oracle &oracle, size_t npoints, unsigned k, unsigned num_iter=3, unsigned num_sub_iter=8, const WFT *weights=static_cast(nullptr), double npermult=21, double nroundmult=3, double eps=0.5, uint64_t seed=1337) { @@ -186,23 +192,23 @@ iterated_oracle_thorup_d(const Oracle &oracle, size_t npoints, unsigned k, unsig return weights ? weights[index]: static_cast(1.); }; #if !NDEBUG - const FT total_weight = weights ? blz::sum(blz::CustomVector((WFT *)weights, npoints)) + const FT total_weight = weights ? blaze::sum(blaze::CustomVector((WFT *)weights, npoints)) : WFT(npoints); #endif wy::WyHash rng(seed); - std::tuple, blz::DV, std::vector> ret; + std::tuple, blaze::DynamicVector, std::vector> ret; auto &[centers, costs, bestindices] = ret; // Unpack for named access FT best_cost; // For convenience: a custom vector // which is empty if weights is null and full otherwise. { - std::unique_ptr> wview; - if(weights) wview.reset(new blz::CustomVector(weights, npoints)); + std::unique_ptr> wview; + if(weights) wview.reset(new blaze::CustomVector(weights, npoints)); auto do_thorup_sample = [&]() { return oracle_thorup_d(oracle, npoints, k, weights, npermult, nroundmult, eps, rng()); }; auto get_cost = [&](const auto &x) { - return wview ? blz::dot(x, *wview): blz::sum(x); + return wview ? 
blaze::dot(x, *wview): blaze::sum(x); }; // gather first set of sampled points @@ -233,7 +239,7 @@ iterated_oracle_thorup_d(const Oracle &oracle, size_t npoints, unsigned k, unsig } // Calculate weights for center points - blz::DV center_weights(centers.size(), FT(0)); + blaze::DynamicVector center_weights(centers.size(), FT(0)); shared::flat_hash_map asn2id; asn2id.reserve(centers.size()); for(size_t i = 0; i < centers.size(); asn2id[centers[i]] = i, ++i); OMP_PRAGMA("omp parallel for") @@ -252,8 +258,8 @@ iterated_oracle_thorup_d(const Oracle &oracle, size_t npoints, unsigned k, unsig nofails = false; } } - assert(std::abs(blz::sum(center_weights) - total_weight) < 1e-4 || - !std::fprintf(stderr, "Expected sum %g, found %g\n", total_weight, blz::sum(center_weights))); + assert(std::abs(blaze::sum(center_weights) - total_weight) < 1e-4 || + !std::fprintf(stderr, "Expected sum %g, found %g\n", total_weight, blaze::sum(center_weights))); assert(nofails); #endif shared::flat_hash_map sub_asn2id; @@ -264,7 +270,7 @@ iterated_oracle_thorup_d(const Oracle &oracle, size_t npoints, unsigned k, unsig return oracle_thorup_d(wrapped_oracle, centers.size(), k, center_weights.data(), npermult, nroundmult, eps, rng()); }; auto get_cost = [&](const auto &x) { // Calculates the cost of a set of centers. - return blz::dot(x, center_weights); + return blaze::dot(x, center_weights); // Can this be easily done using the distance from the full without performing all recalculations? }; @@ -296,7 +302,7 @@ iterated_oracle_thorup_d(const Oracle &oracle, size_t npoints, unsigned k, unsig assert(sub_bestindices.size() == center_weights.size()); sub_asn2id.clear(); for(size_t i = 0; i < sub_centers.size(); sub_asn2id[sub_centers[i]] = i, ++i); - blz::DV sub_center_weights(sub_centers.size(), FT(0)); + blaze::DynamicVector sub_center_weights(sub_centers.size(), FT(0)); OMP_PFOR for(size_t i = 0; i < sub_bestindices.size(); ++i) { assert(sub_asn2id.find(sub_bestindices[i]) != sub_asn2id.end()); @@ -307,7 +313,7 @@ iterated_oracle_thorup_d(const Oracle &oracle, size_t npoints, unsigned k, unsig } DBG_ONLY(for(const auto w: sub_center_weights) assert(w > 0.);) - assert(std::abs(blz::sum(sub_center_weights) - total_weight) <= 1.e-4); + assert(std::abs(blaze::sum(sub_center_weights) - total_weight) <= 1.e-4); // Convert back to original coordinates auto transform_func = [&wrapped_oracle](auto x) {return wrapped_oracle.lookup(x);}; diff --git a/include/minocore/util/blaze_adaptor.h b/include/minocore/util/blaze_adaptor.h index 83729f0a..51d455ca 100644 --- a/include/minocore/util/blaze_adaptor.h +++ b/include/minocore/util/blaze_adaptor.h @@ -141,10 +141,25 @@ struct DynamicMatrix: public blaze::DynamicMatrix { struct const_row_iterator: public row_iterator_t {}; struct column_iterator: public column_iterator_t {}; struct const_column_iterator: public column_iterator_t {}; + decltype(auto) operator[](size_t i) const { + if constexpr(SO == blaze::rowMajor) { + return row(*this, i, blaze::unchecked); + } else { + return column(*this, i, blaze::unchecked); + } + } + decltype(auto) operator[](size_t i) { + if constexpr(SO == blaze::rowMajor) { + return row(*this, i, blaze::unchecked); + } else { + return column(*this, i, blaze::unchecked); + } + } template this_type &operator=(Args &&...args) { ((super &)*this).operator=(std::forward(args)...); return *this; } + size_t size() const {return SO == blaze::rowMajor ? 
this->rows(): this->columns();} auto rowiterator() {return RowViewer(*this);} auto rowiterator() const {return ConstRowViewer(*this);} auto columniterator() {return ColumnViewer(*this);} @@ -166,6 +181,21 @@ struct CustomMatrix: public blaze::CustomMatrix { ((super &)*this).operator=(std::forward(args)...); return *this; } + decltype(auto) operator[](size_t i) const { + if constexpr(SO == blaze::rowMajor) { + return row(*this, i, blaze::unchecked); + } else { + return column(*this, i, blaze::unchecked); + } + } + decltype(auto) operator[](size_t i) { + if constexpr(SO == blaze::rowMajor) { + return row(*this, i, blaze::unchecked); + } else { + return column(*this, i, blaze::unchecked); + } + } + size_t size() const {return SO == blaze::rowMajor ? this->rows(): this->columns();} auto rowiterator() {return RowViewer(*this);} auto rowiterator() const {return ConstRowViewer(*this);} auto columniterator() {return ColumnViewer(*this);} @@ -267,6 +297,38 @@ INLINE auto sum(const std::vector &vec) { template INLINE decltype(auto) sum(const OT &x) {return blaze::sum(x);} +template +size_t number_shared_zeros(const blaze::SparseVector &_lhs, const blaze::SparseVector &_rhs) { + auto &lhs = ~_lhs; + auto &rhs = ~_rhs; + assert(lhs.size() == rhs.size()); + //const size_t sz = lhs.size(); + auto lhit = lhs.begin(); + auto rhit = rhs.begin(); + auto lhe = lhs.end(); + auto rhe = rhs.end(); + if(lhit == lhe) return nonZeros(rhs); + if(rhit == rhe) return nonZeros(lhs); + auto getnextindex = [&]() { + size_t r1 = lhit == lhe ? size_t(-1): lhit->index(); + size_t r2 = rhit == rhe ? size_t(-1): rhit->index(); + if(r1 == r2) { + ++lhit; + ++rhit; + } else if(r1 < r2) ++lhit; + else ++rhit; + return std::min(r1, r2); + }; + size_t current_index = getnextindex(); + size_t ret = current_index; + for(size_t nv; (nv = getnextindex()) != size_t(-1);) { + if(nv == current_index) continue; + assert(nv > current_index); + ret += nv - current_index - 1; + current_index = nv; + } + return ret; + } template void fill_helper(blaze::Matrix &mat) { @@ -277,8 +339,8 @@ void fill_helper(blaze::Matrix &mat) { } } -template -void fill_helper(dm::DistanceMatrix &) { +template +void fill_helper(dm::DistanceMatrix &) { std::fprintf(stderr, "[%s] Warning: trying to fill_symmetric_upper_triangular on an unsupported type. 
Doing nothing.\n", __PRETTY_FUNCTION__); } diff --git a/include/minocore/util/csc.h b/include/minocore/util/csc.h index 0f1bf4e2..4742f376 100644 --- a/include/minocore/util/csc.h +++ b/include/minocore/util/csc.h @@ -8,12 +8,14 @@ namespace minocore { +template struct CSCMatrixView { - const uint64_t *const indptr_, *const indices_; - const uint32_t *const data_; + const IndPtrType *const indptr_; + const IndicesType *const indices_; + const DataType *const data_; const uint64_t nnz_; const uint32_t nf_, n_; - CSCMatrixView(const uint64_t *indptr, const uint64_t *indices, const uint32_t *data, + CSCMatrixView(const IndPtrType *indptr, const IndicesType *indices, const DataType *data, uint64_t nnz, uint32_t nfeat, uint32_t nitems): indptr_(indptr), indices_(indices), @@ -34,14 +36,14 @@ struct CSCMatrixView { } }; -template -blz::SM csc2sparse(const CSCMatrixView &mat, bool skip_empty=false) { +template +blz::SM csc2sparse(const CSCMatrixView &mat, bool skip_empty=false) { blz::SM ret(mat.n_, mat.nf_); ret.reserve(mat.nnz_); size_t used_rows = 0, i; for(i = 0; i < mat.n_; ++i) { auto col = mat.column(i); - if(mat.n_ > 1000000 && i % 1000000 == 0) std::fprintf(stderr, "%zu/%u\r", i, mat.n_); + if(mat.n_ > 100000 && i % 10000 == 0) std::fprintf(stderr, "%zu/%u\r", i, mat.n_); if(skip_empty && 0u == col.nnz()) continue; for(auto s = col.start_; s < col.stop_; ++s) { ret.append(used_rows, mat.indices_[s], mat.data_[s]); @@ -52,7 +54,7 @@ blz::SM csc2sparse(const CSCMatrixView &mat, bool skip_empt return ret; } -template +template blz::SM csc2sparse(std::string prefix, bool skip_empty=false) { util::Timer t("csc2sparse load time"); std::string indptrn = prefix + "indptr.file"; @@ -67,9 +69,13 @@ blz::SM csc2sparse(std::string prefix, bool skip_empty=fals std::fclose(ifp); using mmapper = mio::mmap_source; mmapper indptr(indptrn), indices(indicesn), data(datan); - CSCMatrixView matview((const uint64_t *)indptr.data(), (const uint64_t *)indices.data(), - (const uint32_t *)data.data(), indices.size() / (sizeof(uint64_t) / sizeof(indices[0])), - nfeat, nsamples); + CSCMatrixView + matview((const IndPtrType *)indptr.data(), (const IndicesType *)indices.data(), + (const DataType *)data.data(), indices.size() / sizeof(IndicesType), + nfeat, nsamples); + std::fprintf(stderr, "indptr size: %zu\n", indptr.size() / sizeof(IndPtrType)); + std::fprintf(stderr, "indices size: %zu\n", indices.size() / sizeof(IndicesType)); + std::fprintf(stderr, "data size: %zu\n", data.size() / sizeof(DataType)); #ifndef MADV_REMOVE # define MADV_FLAGS (MADV_DONTNEED | MADV_FREE) #else @@ -79,7 +85,7 @@ blz::SM csc2sparse(std::string prefix, bool skip_empty=fals ::madvise((void *)indices.data(), indices.size(), MADV_FLAGS); ::madvise((void *)data.data(), data.size(), MADV_FLAGS); #undef MADV_FLAGS - return csc2sparse(matview, skip_empty); + return csc2sparse(matview, skip_empty); } template diff --git a/include/minocore/util/diskmat.h b/include/minocore/util/diskmat.h deleted file mode 100644 index d517b44e..00000000 --- a/include/minocore/util/diskmat.h +++ /dev/null @@ -1,157 +0,0 @@ -#pragma once -#ifndef DISK_MAT_H__ -#define DISK_MAT_H__ -#include -#include "mio/single_include/mio/mio.hpp" -#include -#include -#include "blaze_adaptor.h" - -namespace minocore { - -template -struct DiskMat { - using This = DiskMat; - size_t nr_, nc_; - using mmapper = mio::mmap_sink; - std::unique_ptr ms_; - std::FILE *fp_; - bool delete_file_; - - - // TODO: - // alignment -- if offset is 0, it's already aligned. 
- // -- otherwise, allocate enough extra so that it is - - static constexpr blaze::AlignmentFlag AF = isAligned ? blaze::aligned: blaze::unaligned; - static constexpr blaze::PaddingFlag PF = isPadded ? blaze::padded: blaze::unpadded; - using MatType = blaze::CustomMatrix; - MatType mat_; - std::string path_; - - DiskMat(const DiskMat &o): DiskMat(o.nr_, o.nc_, nullptr) { - std::memcpy(ms_->data(), o.ms_->data(), o.ms_->size()); - } - DiskMat(DiskMat &&o): path_(o.path_) { - uint8_t *ptr = reinterpret_cast(this), *optr = reinterpret_cast(std::addressof(o)); - std::memset(ptr, 0, sizeof(*this)); - std::swap_ranges(ptr, ptr + sizeof(*this), optr); - std::fprintf(stderr, "[%s at %p] moved diskmat has path %s\n", __PRETTY_FUNCTION__, (void *)this, path_.data() ? path_.data(): "tmpfile"); - } - static constexpr size_t SIMDSIZE = blaze::SIMDTrait::size; - DiskMat(const DiskMat &o, const char *s, size_t offset=0, int delete_file=-1): - DiskMat(o.rows(), o.columns(), s, offset, delete_file >= 0 ? delete_file: o.delete_file_) - { - std::memcpy(ms_->data(), o.ms_->data(), sizeof(VT) * (~*this).spacing() * nr_); -#if VERBOSE_AF - std::fprintf(stderr, "Copied to %s\n", path_.size() ? path_.data(): "tmpfile"); -#endif - } - operator MatType &() {return ~*this;} - operator const MatType &() const {return ~*this;} - DiskMat(size_t nr, size_t nc, const char *s=nullptr, size_t offset=0, bool delete_file=true): - nr_(nr), nc_(nc), - delete_file_(delete_file), - path_(s ? s: "") - { -#if VERBOSE_AF - std::fprintf(stderr, "Opened file at %s to make matrix of size %zu, %zu\n", s ? s: "tmpfile", nr_, nc_); -#endif - if(isAligned && offset % (SIMDSIZE * sizeof(VT))) { - throw std::invalid_argument("offset is not aligned; invalid storage."); - } - const size_t nperrow = isPadded ? size_t(blaze::nextMultiple(nc_, SIMDSIZE)): nc_; - const size_t nb = nr_ * nperrow * sizeof(VT), total_nb = nb + offset; - if((fp_ = s ? std::fopen(s, "a+"): std::tmpfile()) == nullptr) { - char buf[256]; - std::sprintf(buf, "Failed to open file for writing. %s/%d (%s)", ::strerror(errno), errno, s ? s: "tmpfil"); - throw std::system_error(0, std::system_category(), buf); - } - const int fd = ::fileno(fp_); - struct stat st; - int rc; - if((rc = ::fstat(fd, &st))) { - char buf[256]; - std::sprintf(buf, "Failed to fstat fd/fp/path %d/%p/%s", fd, (void *)fp_, path_.data()); - std::fclose(fp_); - fp_ = nullptr; - throw std::system_error(rc, std::system_category(), buf); - } - size_t filesize = st.st_size; - if(filesize < total_nb) { - if((rc = ::ftruncate(fd, total_nb))) throw std::system_error(rc, std::system_category(), "Failed to resize (ftruncate)"); - ::fstat(fd, &st); - } - assert(size_t(st.st_size) >= total_nb); - ms_.reset(new mmapper(fd, offset, nb)); - mat_ = MatType((VT *)ms_->data(), nr, nc, nperrow); - assert(s ? 
(path_.data() && std::strcmp(path_.data(), s)) == 0: path_.empty()); - std::fprintf(stderr, "Spacing: %zu\n", (~*this).spacing()); - } - DiskMat(size_t nr, size_t nc, std::string path, size_t offset=0, bool delete_file=false): DiskMat(nr, nc, path.data(), offset, delete_file) {} - auto operator()(size_t i, size_t j) const {return (~*this)(i, j);} - auto &operator()(size_t i, size_t j) {return (~*this)(i, j);} - ~DiskMat() { - if(fp_) std::fclose(fp_); - if(delete_file_ && path_.size()) { -#if VERBOSE_AF - std::fprintf(stderr, "[%s at %p]path: %s/%p\n", __PRETTY_FUNCTION__, (void *)this, path_.data(), (void *)path_.data()); -#endif - auto rc = std::system((std::string("rm ") + path_).data()); - if(rc) { - std::fprintf(stderr, "Note: file deletion failed with exit status %d and stopsig %d\n", - WEXITSTATUS(rc), WSTOPSIG(rc)); - } - } - } - auto data() const {return mat_.data();} - auto data() {return mat_.data();} - auto spacing() const {return mat_.spacing();} - auto rows() const {return mat_.rows();} - auto columns() const {return mat_.columns();} - MatType &operator~() {return mat_;} - const MatType &operator~() const {return mat_;} -}; // DiskMat - -template -auto row(DiskMat &mat, size_t i, blaze::Check check=blaze::Check()) { - return blaze::row(~mat, i, check); -} -template -auto column(DiskMat &mat, size_t i, blaze::Check check=blaze::Check()) { - return blaze::column(~mat, i, check); -} - -#ifndef DEFAULT_MAX_NRAMBYTES -#define DEFAULT_MAX_NRAMBYTES static_cast(16ull << 30) -#endif - -template -class PolymorphicMat { - using CMType = blaze::CustomMatrix; - using DiskType = DiskMat; - std::unique_ptr diskmat_; - std::unique_ptr> rammat_; - CMType cm_; -public: - static constexpr size_t MAX_BYTES_RAM = max_nbytes; - PolymorphicMat(size_t nr, size_t nc, size_t maxmem=MAX_BYTES_RAM, const char *s=nullptr) { - size_t spacing = blaze::nextMultiple(nc, blaze::SIMDTrait::size); - size_t total_bytes = nr * spacing * sizeof(VT); - VT *ptr; - if(total_bytes > maxmem) { - diskmat_.reset(new DiskType(nr, nc, s)); - ptr = diskmat_->data(); - } else { - rammat_.reset(new blaze::DynamicMatrix(nr, nc)); - ptr = rammat_->data(); - } - cm_ = CMType(ptr, nr, nc, spacing); - } - CMType &operator~() {return cm_;} - const CMType &operator~() const {return cm_;} -}; - -} // minocore - -#endif diff --git a/include/minocore/util/exception.h b/include/minocore/util/exception.h new file mode 100644 index 00000000..b8797dbd --- /dev/null +++ b/include/minocore/util/exception.h @@ -0,0 +1,103 @@ +#ifndef FGC_EXCEPTION_H__ +#define FGC_EXCEPTION_H__ +#include +#include + +namespace minocore { + +inline namespace exception { + +struct TODOError: public std::runtime_error { + template + TODOError(A &&...a): std::runtime_error(std::forward(a)...) {} +}; + +class NotImplementedError: public std::runtime_error { +public: + template + NotImplementedError(Args &&...args): std::runtime_error(std::forward(args)...) 
{} + + NotImplementedError(): std::runtime_error("NotImplemented.") {} +}; + +class UnsatisfiedPreconditionError: public std::runtime_error { +public: + UnsatisfiedPreconditionError(std::string msg): std::runtime_error(std::string("Unsatisfied precondition: ") + msg) {} + + UnsatisfiedPreconditionError(): std::runtime_error("Unsatisfied precondition.") {} +}; + +static int require(bool condition, std::string s, int ec=0) { + if(!condition) { + if(ec) throw std::runtime_error(s + " Error code: " + std::to_string(ec)); + else throw std::runtime_error(s); + } + return ec; +} + +static int validate(bool condition, std::string s, int ec=0) { + if(!condition) { + if(ec) throw std::invalid_argument(s + " Error code: " + std::to_string(ec)); + else throw std::invalid_argument(s); + } + return ec; +} + + +static int precondition_require(bool condition, std::string s, int ec=0) { + if(!condition) { + if(ec) throw UnsatisfiedPreconditionError(s + " Error code: " + std::to_string(ec)); + else throw UnsatisfiedPreconditionError(s); + } + return ec; +} + +class UnsatisfiedPostconditionError: public std::runtime_error { +public: + UnsatisfiedPostconditionError(std::string msg): std::runtime_error(std::string("Unsatisfied postcondition: ") + msg) {} + + UnsatisfiedPostconditionError(): std::runtime_error("Unsatisfied postcondition.") {} +}; + +static int postcondition_require(bool condition, std::string s, int ec=0) { + if(!condition) { + if(ec) throw UnsatisfiedPostconditionError(s + " Error code: " + std::to_string(ec)); + else throw UnsatisfiedPostconditionError(s); + } + return ec; +} + +#ifndef PREC_REQ_EC +#define PREC_REQ_EC(condition, s, ec) \ + ::minocore::exception::precondition_require(condition, std::string(s) + '[' + __FILE__ + '|' + __PRETTY_FUNCTION__ + "|#L" + std::to_string(__LINE__) + "] Failing condition: \"" + #condition + '"', ec) +#endif + +#ifndef PREC_REQ +#define PREC_REQ(condition, s) PREC_REQ_EC(condition, s, 0) +#endif + +#ifndef POST_REQ_EC +#define POST_REQ_EC(condition, s, ec) \ + ::minocore::exception::postcondition_require(condition, std::string(s) + '[' + __FILE__ + '|' + __PRETTY_FUNCTION__ + "|#L" + std::to_string(__LINE__) + "] Failing condition: \"" + #condition + '"', ec) +#endif + +#ifndef POST_REQ +#define POST_REQ(condition, s) POST_REQ_EC(condition, s, 0) +#endif + + +#ifndef MINOCORE_REQUIRE +#define MINOCORE_REQUIRE(condition, s) \ + ::minocore::exception::require(condition, std::string(s) + '[' + __FILE__ + '|' + __PRETTY_FUNCTION__ + "|#L" + std::to_string(__LINE__) + "] Failing condition: \"" + #condition + '"') +#endif + +#ifndef MINOCORE_VALIDATE +#define MINOCORE_VALIDATE(condition) \ + ::minocore::exception::validate(condition, std::string("[") + __FILE__ + '|' + __PRETTY_FUNCTION__ + "|#L" + std::to_string(__LINE__) + "] Failing condition: \"" + #condition + '"') +#endif + +} // inline namespace exception + +} // namespace minocore + +#endif /* FGC_EXCEPTION_H__ */ diff --git a/include/minocore/util/macros.h b/include/minocore/util/macros.h index bc7d0e4d..1f475a61 100644 --- a/include/minocore/util/macros.h +++ b/include/minocore/util/macros.h @@ -135,8 +135,10 @@ #if !NDEBUG # define DBG_ONLY(...) __VA_ARGS__ +# define DBG_ELSE(x, y) x #else # define DBG_ONLY(...)
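// A hypothetical call site for the DBG_ELSE pair defined in this hunk: choose
// a checked expression in debug builds and a cheaper one otherwise, e.g.
//   int second = DBG_ELSE(vec.at(1), vec[1]); // bounds-checked only when !NDEBUG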
+# define DBG_ELSE(x, y) y #endif #if VERBOSE_AF @@ -195,4 +197,17 @@ # endif #endif + +#ifndef NDEBUG +# include +# define PRETTY_SAY std::cerr << '[' << __PRETTY_FUNCTION__ << ':' << __FILE__ << ':' << __LINE__ << ']' +#else + struct CHEVRONEATER { + template + const CHEVRONEATER &operator<<(const T &) const {return *this;} + }; +# define PRETTY_SAY ::CHEVRONEATER{} + +#endif + #endif /* SKETCH_MACROS_H__ */ diff --git a/include/minocore/util/oracle.h b/include/minocore/util/oracle.h index 4af6ef8d..fd5e7426 100644 --- a/include/minocore/util/oracle.h +++ b/include/minocore/util/oracle.h @@ -65,14 +65,14 @@ struct PairKeyType { } static auto rh(Type v) { if constexpr(sizeof(IT) == 4) { - static constexpr Type bitmask = static_cast((uint64_t(1) << 32) - 1); - return v & bitmask; + return v & 0xFFFFFFFFu; } else { return v.second; } } }; + template class Map=std::unordered_map, bool symmetric=true, bool threadsafe=false, typename IT=std::uint32_t> struct CachingOracleWrapper { using output_type = std::decay_t()(0,0))>; @@ -220,6 +220,85 @@ auto make_matrix_m(const Mat &mat) { } +template class Map=std::unordered_map, bool symmetric=true, bool threadsafe=false, typename IT=std::uint32_t, typename FT=float, + bool use_row_vector=true> +struct RowCachingOracleWrapper { + using output_type = std::decay_t()(0,0))>; + using VType = blaze::DynamicVector; + using map_type = Map; + const Oracle &oracle_; + mutable map_type map_; + size_t np_; +private: + mutable std::shared_mutex mut_; + using map_iterator = typename map_type::iterator; + // TODO: use two kinds of locks +public: + RowCachingOracleWrapper(const Oracle &oracle, size_t np, size_t rsvsz=0): oracle_(oracle), np_(np) { + map_.reserve(rsvsz ? rsvsz: np); + } + template + void cache_range(It start, It end) const { + unsigned n = std::distance(start, end); + for(auto i = 0u; i < n; ++i) { + VType tmp(np_); + auto lhi = start[i]; + if(map_.find(lhi) != map_.end()) continue; + OMP_PFOR + for(size_t j = 0; j < np_; ++j) { + auto it = map_.find(j); + tmp[j] = (it == map_.end()) ? 
oracle_(lhi, j): it->second[lhi]; + } + map_.emplace(lhi, std::move(tmp)); + } + } + output_type operator()(IT lh, IT rh) const { + std::shared_lock slock(mut_); + map_iterator it; + if((it = map_.find(lh)) != map_.end()) + return it->second[rh]; + if constexpr(symmetric) { + if((it = map_.find(rh)) != map_.end()) + return it->second[lh]; + } + VType tmp(np_); +#ifdef _OPENMP +# pragma omp parallel for +#endif + for(size_t i = 0; i < np_; ++i) + tmp[i] = oracle_(lh, i); + output_type ret = tmp[rh]; +#ifndef NDEBUG + size_t oldsize = map_.size(); +#endif + if constexpr(threadsafe) { + slock.unlock(); + std::unique_lock ulock(mut_); + if(map_.find(lh) != map_.end()) return ret; + map_.emplace(lh, std::move(tmp)); + } else { + map_.emplace(lh, std::move(tmp)); + } + DBG_ONLY(if(oldsize != map_.size()) std::fprintf(stderr, "New size: %zu\n", map_.size());) + if constexpr(threadsafe) slock.unlock(); + return ret; + } +}; + +template +void prep_range(It, It2, const T &) {} + +template class Map, bool sym, bool ts, typename IT, typename FT, bool use_row_vector> +void prep_range(It start, It2 end, const RowCachingOracleWrapper &x) { + x.cache_range(start, end); +} + +template class Map=std::unordered_map, bool symmetric=true, bool threadsafe=false, typename IT=std::uint32_t, typename FT=float, typename Oracle> +auto make_row_caching_oracle_wrapper(const Oracle &oracle, size_t np, size_t rsvsz=0) { + return RowCachingOracleWrapper(oracle, np, rsvsz); +} + + } // namespace minocore #endif /* FGC_ORACLE_H__ */ diff --git a/include/minocore/util/shared.h b/include/minocore/util/shared.h index 8f932b10..36a83ad7 100644 --- a/include/minocore/util/shared.h +++ b/include/minocore/util/shared.h @@ -49,10 +49,6 @@ INLINE auto checked_posix_write(int fd, const void *buf, ssize_t count) { return ret; } -struct TODOError: public std::runtime_error { - template - TODOError(A &&...a): std::runtime_error(std::forward(a)...) {} -}; struct Deleter { void operator()(const void *x) const { diff --git a/include/minocore/util/sorted.h b/include/minocore/util/sorted.h new file mode 100644 index 00000000..12651aa6 --- /dev/null +++ b/include/minocore/util/sorted.h @@ -0,0 +1,71 @@ +#ifndef SORTED_DQ_H__ +#define SORTED_DQ_H__ +#include +#include +#include +#include +#include + + +namespace sorted { + +// Sorted deque +template class Container, typename T, typename All, typename Cmp=std::less<>, bool upper_insert=true, typename...Args> +class container { + Container data_; + Cmp cmp_; +public: + template + container(CArgs &&...args): data_(std::forward(args)...) { + sort(data_.begin(), data_.end(), cmp_); + } + template + auto lower_bound(const U &item) const { + return std::lower_bound(data_.begin(), data_.end(), item, cmp_); + } + template + auto upper_bound(const U &item) const { + return std::upper_bound(data_.begin(), data_.end(), item, cmp_); + } + auto find(const T &x) const { + return lower_bound(x); + } + auto &con() {return data_;} + auto &con() const {return data_;} + template + auto emplace(EArgs &&...args) { + T x(std::forward(args)...); + auto it = upper_insert ? 
upper_bound(x): lower_bound(x); + data_.insert(it, std::move(x)); + assert(std::is_sorted(data_.begin(), data_.end(), cmp_)); + } + auto erase(const T &x) { + if(auto it = find(x); it != end()) + this->erase(it); + } + T &operator[](size_t i) {return data_[i];} + const T &operator[](size_t i) const {return data_[i];} + auto begin() {return data_.begin();} + auto end() {return data_.end();} + auto begin() const {return data_.begin();} + auto end() const {return data_.end();} + auto cbegin() {return data_.cbegin();} + auto cend() {return data_.cend();} + auto size() const {return data_.size();} + auto pop() {auto ret = std::move(data_.back()); data_.pop_back(); return ret;} + using iterator = typename Container::iterator; + using const_iterator = typename Container::const_iterator; + using value_type = typename Container::value_type; + using pointer = typename Container::pointer; + using const_pointer = typename Container::const_pointer; + using reference = typename Container::reference; + using const_reference = typename Container::const_reference; +}; + +template, typename All=std::allocator> +using vector = container; +template, typename All=std::allocator> +using deque = container; + +} // sorted +#endif diff --git a/include/minocore/util/timer.h b/include/minocore/util/timer.h index 1bc0da53..59919c1c 100644 --- a/include/minocore/util/timer.h +++ b/include/minocore/util/timer.h @@ -11,7 +11,7 @@ namespace util { using hrc = std::chrono::high_resolution_clock; template -static inline uint32_t timediff2ms(std::chrono::time_point start, std::chrono::time_point stop) { +static inline double timediff2ms(std::chrono::time_point start, std::chrono::time_point stop) { if(stop < start) std::swap(stop, start); return std::chrono::duration(stop - start).count(); } diff --git a/include/minocore/utility.h b/include/minocore/utility.h index 9ede33f4..f22abeba 100644 --- a/include/minocore/utility.h +++ b/include/minocore/utility.h @@ -1,13 +1,13 @@ #ifndef FGC_UTILITY_H__ #define FGC_UTILITY_H__ +#include "minocore/util/exception.h" #include "minocore/util/macros.h" #include "minocore/util/shared.h" #include "minocore/util/blaze_adaptor.h" #include "minocore/util/Inf2Zero.h" #include "minocore/util/csc.h" -#include "minocore/util/diskmat.h" #include "minocore/util/div.h" #include "minocore/util/packed.h" diff --git a/include/minocore/wip.h b/include/minocore/wip.h index 84a12a6a..020a67fd 100644 --- a/include/minocore/wip.h +++ b/include/minocore/wip.h @@ -3,7 +3,6 @@ #include "./wip/caratheodory.h" #include "./wip/streaming.h" -#include "./wip/clustering.h" #include "./wip/gen_kmedian.h" #endif diff --git a/include/minocore/wip/clustering.h b/include/minocore/wip/clustering.h deleted file mode 100644 index 9454ab43..00000000 --- a/include/minocore/wip/clustering.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef FGC_CLUSTERING_H__ -#define FGC_CLUSTERING_H__ -#include "minocore/dist.h" - -namespace minocore { - -namespace clustering { - -#if 0 -enum ClusteringAssignmentType: size_t { - HARD - SOFT -}; -#endif - -} - -} // namespace minocore - -#endif /* FGC_CLUSTERING_H__ */ diff --git a/include/minocore/wip/old_clustering.h b/include/minocore/wip/old_clustering.h new file mode 100644 index 00000000..930f9cee --- /dev/null +++ b/include/minocore/wip/old_clustering.h @@ -0,0 +1,652 @@ +#ifndef FGC_CLUSTERING_H__ +#define FGC_CLUSTERING_H__ +#include "minocore/dist.h" +#include "minocore/optim/kmedian.h" +#include "minocore/util/exception.h" +#include "minocore/wip/clustering_traits.h" +#include + +namespace 
minocore { + +namespace clustering { + +using blz::DissimilarityMeasure; + +struct CentroidPolicy { + template *> + static void perform_average(blz::DenseVector &ret, const Range &r, const RowSums &rs, + const blz::Vector *wc = static_cast *>(nullptr), + DissimilarityMeasure measure=static_cast(-1)) + { + using FT = blz::ElementType_t; + if(measure==static_cast(-1)) { + std::fprintf(stderr, "Die\n"); + std::exit(1); + } + if(measure == blz::TOTAL_VARIATION_DISTANCE) { + coresets::l1_median(r, ret, wc); + } + else if(measure == blz::L1) { + using cmtype = + std::conditional_t, + blz::CompressedMatrix >, + blz::DynamicMatrix > + >; + cmtype cm = r * blz::expand(rs, r.columns()); + coresets::l1_median(cm, ret, wc); + } else if(measure == blz::LLR || measure == blz::UWLLR) { + FT total_sum_inv; + if(wc) total_sum_inv = 1. / blz::dot(rs, *wc); + else total_sum_inv = 1. / blz::sum(rs); + if(wc) { + ~ret = blz::sum(r % blz::expand(rs, r.columns())) * total_sum_inv; + } else { + ~ret = blz::sum(r % blz::expand(*wc * rs, r.columns())) * total_sum_inv; + } + } else if(wc) { + assert((~(*wc)).size() == r.rows()); + assert(blz::expand(~(*wc), r.columns()).rows() == r.rows()); + assert(blz::expand(~(*wc), r.columns()).columns() == r.columns()); + auto wsuminv = 1. / blz::sum(*wc); + if(blz::detail::is_probability(measure) { + ~ret = blz::sum(r % blz::expand(~(*wc), r.columns())) * (wsuminv / r.rows()); + } else { + // Otherwise + ~ret = blz::mean(r % blz::expand(~(*wc) * rs, r.columns()) * wsuminv; + } + } else { + if(blz::detail::is_probability(measure) + ~ret = blz::mean(r % blz::expand(rs, r.columns())); + else + ~ret = blz::mean(r); + } + } + template + static void do_inc(FT neww, FT cw, Row &ret, const Src &dat, FT row_sum, DissimilarityMeasure measure) + { + if(measure == blz::L1 || measure == blz::TOTAL_VARIATION_DISTANCE) + throw std::invalid_argument("do_inc is only for linearly-calculated means, not l1 median"); + if(cw == 0.) { + if(blz::detail::is_probability(measure)) + ret = dat; + else + ret = dat * row_sum; + } else { + auto div = neww / (neww + cw); + if(blz::detail::is_probability(measure)) { + ret += (dat - ret) * div; + } else if(measure == blz::LLR || measure == blz::UWLLR) { + ret += (dat * row_sum) * neww; + // Add up total sum and subtract later + // since there are three weighting factors here: + // First, partial assignment + // Then point-wise weights (both of which are in neww) + // Then, for LLR/UWLLR, there's weighting by the row-sums + } else { + // Maintain running mean for full vector value + ret += (dat * row_sum - ret) * div; + } + } + } + + template + static void perform_soft_assignment(const blz::DynamicMatrix &assignments, + const RowSums &rs, + std::mutex *mutptr, + const MatType &data, CenterCon &newcon, + const blz::Vector *wc = static_cast *>(nullptr), + DissimilarityMeasure measure=static_cast(-1)) + { + using FT = ElementType_t; + if(measure==static_cast(-1)) { + std::fprintf(stderr, "Die\n"); + std::exit(1); + } + if(measure == blz::L1 || measure == blz::TOTAL_VARIATION_DISTANCE) { + throw TODOError(); + } else { + blz::DV summed_contribs(newcon.size(), 0.); + for(size_t i = 0; i < data.rows(); ++i) { + auto item_weight = wc ? wc[i]: static_cast(1.); + const auto row_sum = rs[i]; + for(size_t j = 0; j < newcon.size(); ++j) { + auto &cw = summed_contribs[j]; + if(auto asnw = asn[j]; asnw > 0.) 
{ + auto neww = item_weight * asnw; + +#ifdef _OPENMP + if(mutptr) mutptr->lock(); +#endif + do_inc(neww, cw, newcon[j], row(data, i, blz::unchecked), row_sum, measure); +#ifdef _OPENMP + if(mutptr) mutptr->unlock(); +#endif + OMP_ATOMIC + summed_contribs[j] += neww; + } + } + } + if(measure == blz::LLR || measure == blz::UWLLR) { + OMP_PFOR + for(auto i = 0u; i < newcon.size(); ++i) + newcon[i] *= 1. / blz::dot(column(assignments, i), rs); + } + } + } +}; + +template > +void perform_cluster_metric_kmedian(const jsd::DissimilarityApplicator &app, unsigned k, uint64_t seed=0, const WFT *weights=static_cast(nullptr)) +{ + throw NotImplementedError(); +} + +enum LloydLoopResult { + FINISHED, + REACHED_MAX_ROUNDS, + UNFINISHED +}; + +template +LloydLoopResult perform_lloyd_loop(CentersType ¢ers, Assignments &assignments, &const jsd::DissimilarityApplicator &app, unsigned k, uint64_t seed=0, const WFT *weights=static_cast(nullptr), + size_t max_iter=100, double eps=1e-4, LloydLoopResult &ret) +{ + if(co != EXTRINSIC) throw std::invalid_argument("Must be extrinsic for Lloyd's"); + using FT = ElementType_t; + auto &mat = app.data(); + CentersType centers_cpy(centers), centers_cache; + if(blz::detail::needs_logs(app.measure_) || blz::detail::needs_sqrt(app.measure_)) + centers_cache.resize(centers.size()); + double last_distance = std::numeric_limits::max(), first_distance = last_distance, + center_distance; + LloydLoopResult ret = UNFINISHED; + auto get_center_change_distance = [&]() { + center_distance = std::accumulate(centers_cpy.begin(), centers_cpy.end(), 0., + [&](double value, auto ¢er) { + auto ind = std::distance(*centers_cpy.begin(), ¢ers_); + return value + blz::sum(blz::abs(center - centers_[ind])); + } + ); + std::swap(centers_cpy, centers); + if(last_distance == std::numeric_limits::max()) { + last_distance = first_distance = center_distance; + iternum = 1; + } else { + last_distance = center_distance; + if(center_distance / first_distance < eps) + ret = LloydLoopResult::FINISHED; + if(++iternum > max_iter) + ret = LloydLoopResult::REACHED_MAX_ROUNDS; + ret = UNFINISHED; + } + }; + // Next: make a set of std::vectors, then use blaze to compute averages under the policy + // Everything but L1 and TVD use element-wise mean + auto getcache = [&] (size_t j) { + return centers_cache.size() ? 
¢ers_cache[j]: static_cast(nullptr); + }; + if constexpr(asn_method = HARD) { + std::vector> assigned(centers.size()); + OMP_ONLY(std::unique_ptr mutexes(centers.size());) + size_t iternum = 0; + for(;;) { + // Do it forever + if(centers_cache.size()) { + for(size_t i = 0; i < centers.size(); ++i) + set_cache(centers[i], centers_cache[i], app.measure_); + } + for(auto &i: assigned) i.clear(); + OMP_PFOR + for(size_t i = 0; i < app.size(); ++i) { + auto dist = app(i, centers[0], getcache(0)); + unsigned asn = 0; + for(size_t j = 1; j < centers.size(); ++j) { + auto newdist = app(i, centers[j], getcache(j)); + if(newdist < dist) { + asn = j; + dist = newdist; + } + } + assignments[i] = asn; + { + OMP_ONLY(std::unique_lock lock(mutexes[asn]);) + assigned[asn].push_back(i); + } + } + // Make assignments + for(size_t i = 0; i < centers_cpy.size(); ++i) { + auto &cref = centers_cpy[i]; + auto &assigned_ids = assigned[i]; + shared::sort(assigned_ids.begin(), assigned_ids.end()); // Better access pattern + CentroidPolicy::perform_average( + cref, + rows(mat, assigned_ids.data(), assigned_ids.size()), + elements(app.row_sums(), assigned_ids.data(), assigned_ids.size()), + weights, app.measure_ + ); + } + get_center_change_distance(); + if(ret != UNFINISHED) return ret; + } + // Set the returned values to be the last iteration's. + } else { + size_t iternum = 0; + const size_t nc = centers.size(), nr = app.size(); + if(assignments.rows() != app.size() || assignments.columns() != centers.size()) { + assignments.resize(app.size(), centers.size()); + } + std::unique_ptr mutexes; + OMP_ONLY(mutexes.reset(new std::mutex[centers.size()]);) + for(;;) { + if(centers_cache.size()) { + for(size_t i = 0; i < centers.size(); ++i) + set_cache(centers[i], centers_cache[i], app.measure_); + } + for(auto &c: centers_cpy) c = static_cast(0); + OMP_PFOR + for(size_t i = 0; i < nr; ++i) { + auto row = row(assignments, i, BLAZE_CHECK_DEBUG); + for(unsigned j = 0; j < nc; ++j) { + row[j] = app(i, centers[j], getcache(j)); + } + if constexpr(asn_method == SOFT_HARMONIC_MEAN) { + row = 1. / row; + } else { + auto mv = blz::min(row); + row = blz::exp(-row + mv) - mv; + } + row *= 1. / blz::sum(row); + // And then compute its contribution to the mean of the points. + // Use stable running mean calculation + } + // Now points have been assigned, and we now perform center assignment + CentroidPolicy::perform_soft_assignment( + assignments, app.row_sums(), mutexes.get(), app.data(), centers_cpy, weights, app.measure_ + ); + } + get_center_change_distance(); + if(ret != UNFINISHED) return ret; + throw NotImplementedError("Not yet finished"); + } +} + + +template +auto perform_clustering(const jsd::DissimilarityApplicator &app, unsigned k, CenterSamplingType csample=DEFAULT_SAMPLING, + const blz::ElementType_t *weights=nullptr, uint64_t seed=0, OptimizationMethod opt=DEFAULT_OPT) +{ + using FT = typename MatrixType::ElementType; + ClusteringTraits clustering_traits; + clustering_traits.sampling = csample; + typename ClusteringTraits::centers_t centers; + typename ClusteringTraits::assignments_t assigments; + auto measure = app.measure_; + if(opt == DEFAULT_OPT) { + switch(measure) { + case L2: + case SQRL2: + case L1: case TVD: + case COSINE_DISTANCE: + case PROBABILITY_COSINE_DISTANCE: + case LLR: case UWLLR: + case HELLINGER: case BHATTACHARYYA_DISTANCE: + opt = EXPECTATION_MAXIMIZATION; break; + /* + * Bregman Divergences, LLR, cosine distance use the (weighted) mean of each + * point, in either soft or hard clustering. 
+ * TVD and L1 use the feature-wise median. + * Scores are either calculated with softmax distance or harmonic softmax + */ + case ORACLE_METRIC: case ORACLE_PSEUDOMETRIC: case BHATTACHARYYA_METRIC: case WASSERSTEIN: + /* otherwise, use metric kmedian */ + opt = METRIC_KMEDIAN; break; + default: + if(blz::detail::is_bregman(opt)) { + opt = EXPECTATION_MAXIMIZATION; + break; + } + } + } + + + if(blz::detail::satisfies_d2(measure) || measure == blz::L1 || measure == blz::TOTAL_VARIATION_DISTANCE) { + auto [initcenters, initasn, initcosts] = jsd::make_kmeanspp(app, k, seed, weights); + + if constexpr(co == INTRINSIC) { + throw std::invalid_argument("Shouldn't happen"); + } + centers.reserve(k); + std::copy(initasn.begin(), initasn.end(), std::back_inserter(assignments)); + for(const auto id: initcenters) { + centers.emplace_back(row(app.data(), id)); + } + if(co == INTRINSIC || opt == METRIC_KMEDIAN) { + // Do graph metric calculation + perform_cluster_metric_kmedian(app, k, seed, weights); + } else { + // Do Lloyd's loop (``kmeans'' algorithm) + perform_lloyd_loop(centers, assignments, app, k, seed, weights); + } + } else if(blz::detail::is_symmetric(measure)) { + throw std::runtime_error("Not implemented: symmetric measure clustering. This method should perform sampling (governed by the csample variable)" + ", followed by facility location, and finished by local search."); + perform_cluster_metric_kmedian(app, k, seed, weights); + } else { + throw NotImplementedError("Unsupported: asymmetric measures not supporting D2 sampling"); + } +} + +#if 0 +namespace helpers { + +template +class LookupMatrixOracle { + const Mat &mat_; +public: + LookupMatrixOracle(const Mat &mat): mat_(mat) {} + size_t size() const {return mat_.rows();} + template + auto compute_distance(const Sol &x, size_t center_index, size_t point_index) const { + assert(center_index < mat_.rows()); + assert(point_index < mat_.columns()); + return mat_(x[center_index], point_index); + } + void operator[](size_t ) const { + throw std::runtime_error("This should never be called"); + } + auto compute_distance(nullptr_t, size_t center_index, size_t point_index) const { + assert(point_index < mat_.rows()); + assert(center_index < mat_.rows()); + return mat_(center_index, point_index); + } +}; + +template +auto make_lookup_data_oracle(const Mat &mat) { + return LookupMatrixOracle(mat); +} + +template +class ExtrinsicFunctorOracle { + const Mat &mat_; + const Functor &func_; +public: + ExtrinsicFunctorOracle(const Mat &mat, const Functor &func): mat_(mat), func_(func) {} + size_t size() const {return mat_.rows();} + template + auto compute_distance(const Sol &x, size_t center_index, size_t point_index) const { + assert(point_index < mat_.rows()); + assert(center_index < x.size()); + return func_(x[center_index], mat_[point_index]); + } + decltype(auto) operator[](size_t ind) {return mat_[ind];} + decltype(auto) operator[](size_t ind) const {return mat_[ind];} + // This function computes a distance between two points + auto compute_distance(nullptr_t, size_t center_index, size_t point_index) const { + return compute_distance(center_index, point_index); + } + auto compute_distance(size_t center_index, size_t point_index) const { + assert(point_index < size()); + assert(center_index < size()); + return func_(mat_[center_index], mat_[point_index]); + } +}; + +template +auto make_exfunc_oracle(const Mat &mat, const Func &func) { + return ExtrinsicFunctorOracle(mat, func); +} + +} // helpers +using helpers::make_exfunc_oracle; +using 
helpers::make_lookup_data_oracle; + + +template +struct ClusteringSolverBase: public MyClusteringTraits { + + using centers_t = typename MyClusteringTraits::centers_t; + using costs_t = typename MyClusteringTraits::costs_t; + using assignments_t = typename MyClusteringTraits::assignments_t; + using cost_t = typename MyClusteringTraits::cost_t; + using index_t = typename MyClusteringTraits::index_t; + using MyClusteringTraits::asn_method; + using MyClusteringTraits::center_origin; + using MyClusteringTraits::approx; + //using MyClusteringTraits::sampling_method; + using MyClusteringTraits::opt; + + using FT = typename MyClusteringTraits::cost_t; +private: + const DataOracle &data_oracle_; + /* + * DataOracle is the key for interfacing with the data. + * It must provide: + * 1. size() const method listing the number of points. + * 2. compute_distance(const centers_t ¢ers, unsigned center_index, unsigned point_index) + * + * For pre-computed matrices (e.g., metric distance matrix) with rows corresponding to centers, + * and columns corresponding to data points, + * DataOracle might have a mat_ field for the matrix and return + * `mat_(center_index, point_index)`. + * LookupMatrixOracle satisfies this, for instance. + * + * For distance-oracle functions, + * use the ExtrinsicFunctorOracle class. + * + * For instance, if `dm` is a dense matrix of points in row-major format: + * auto oracle = clustering::make_exfunc_oracle(dm, blz::sqrL2Norm()) + * clustering::ClusteringSolverBase solver(oracle, dm.rows(), k); + * + * + * For Applicator-supported functions, this might be + * `applicator_(point_index, centers_[center_index])` + * or have an alternate form that caches logs or sqrts. + */ + size_t np_; + uint32_t k_; + uint32_t points_to_sample_; + DissimilarityMeasure measure_; // What measure of dissimilarity. 
+ // Use ORACLE_METRIC or ORACLE_PSEUDOMETRIC as placeholders for measures + // Not supported by the applicator + + std::unique_ptr c_sol_; + std::unique_ptr c_assignments_; + std::unique_ptr c_costs_; + std::unique_ptr pointwise_costs_; + const FT *weights_; + SensitivityMethod sens_; // Which coreset construction method + + void validate_parameters() { + assert(sens_ != static_cast(-1)); + if(opt == METRIC_KMEDIAN) { + validate(blz::detail::satisfies_metric(measure_) || blz::detail::satisfies_rho_metric(measure_)); + } +#if 0 + if(sampling_method == THORUP_SAMPLING) { + validate(blz::detail::satisfies_metric(measure_) || blz::detail::satisfies_rho_metric(measure_)); + } + if(sampling_method == D2_SAMPLING) { + validate(blz::detail::satisfies_d2(measure_)); + } +#endif + } + + void set_sensitivity_method(SensitivityMethod val=static_cast(-1)) { + bool unset = val == static_cast(-1); + if(unset) { + if(blz::detail::is_bregman(val)) sens_ = LBK; + else if(approx == BICRITERIA) { + sens_ = BFL; + } else if(approx == CONSTANT_FACTOR) { + if(blz::detail::is_bregman(val)) std::fprintf(stderr, "Warning: Bregman approximations are O(log(k)) approximate, not constant.\n"); + sens_ = VX; + } else /*approx == HEURISTIC */ { + sens_ = BFL; + } + } else { + if(val == VX) { + MINOCORE_VALIDATE(approx == CONSTANT_FACTOR || approx == HEURISTIC); + if(blz::detail::is_bregman(val)) std::fprintf(stderr, "Warning: Bregman solutions are O(log(k)) approximate, not constant.\n"); + } else if (val == LUCIC_FAULKNER_KRAUSE_FELDMAN) { + throw NotImplementedError("Not supported currently: GMM coreset sampling"); + } + sens_ = val; + } + } + +public: + void set_assignments_and_costs() { + PREC_REQ(c_sol_.get(), "Complete sol must already have been computed."); + if constexpr(asn_method == HARD) { + if(!c_assignments_) + c_assignments_.reset(new assignments_t(data_oracle_.size())); + else if(c_assignments_->size() != data_oracle_.size()) + c_assignments_->resize(data_oracle_.size()); + if(!c_costs_) + c_costs_.reset(new costs_t(data_oracle_.size())); + else if(c_costs_->size() != data_oracle_.size()) + c_costs_->resize(data_oracle_.size()); + OMP_PFOR + for(size_t i = 0; i < data_oracle_.size(); ++i) { + auto mincost = data_oracle_.compute_distance(*c_sol_, 0, i); + unsigned bestind = 0; + for(size_t j = 1; j < c_sol_->size(); ++j) { + if(auto newcost = data_oracle_.compute_distance(*c_sol_, j, i); newcost < mincost) + mincost = newcost, bestind = j; + } + c_assignments_->operator[](i) = bestind; + c_costs_->operator[](i) = mincost; + } + } else { // Soft or softmax assignments + assert(c_sol_->size() == k_); + if(!c_costs_) { + c_costs_.reset(new costs_t(np_, k_)); + } else if(c_costs_->rows() != np_ || c_costs_->columns() != k_) { + c_costs_->resize(np_, k_); + } + if(!c_assignments_) c_assignments_.reset(new assignments_t(*c_costs_)); + if(c_assignments_->size() != data_oracle_.size()) + c_assignments_->resize(data_oracle_.size()); + OMP_PFOR + for(size_t i = 0; i < data_oracle_.size(); ++i) { + // Compute costs + auto cost_row = row(*c_costs_, i, blaze::unchecked); + cost_row[0] = data_oracle_.compute_distance(*c_sol_, 0, i); + for(size_t j = 1; j < c_sol_->size(); ++j) { + cost_row[j] = data_oracle_.compute_distance(*c_sol_, j, i); + } + // Use costs to make fractional assignments + auto asn_row = row(*c_assignments_, i, blaze::unchecked); + if(asn_method == SOFT) + asn_row = blz::exp(-cost_row + blz::min(cost_row)); + else // SOFT_HARMONIC_MEAN, actually harmonic mean + asn_row = 1. 
/ cost_row; + asn_row /= blz::sum(asn_row); + } + } + } + void approx_sol(uint64_t seed=0) { + if constexpr(opt == BLACK_BOX || opt == GRADIENT_DESCENT || opt == EXHAUSTIVE_SEARCH) + throw NotImplementedError("Optimization under black box, gd or exhaustive search not yet supported"); + if constexpr(asn_method != HARD) + throw NotImplementedError("Not completed yet: SOFT or SOFT_HARMONIC_MEAN clustering"); + else + { + // One optimization technique each for metric (JV + local search) + // and expectation maximization. + if(blz::detail::satisfies_d2(measure_)) { + auto func = [&](size_t i, size_t j) { + return data_oracle_.compute_distance(i, j); + }; + wy::WyRand rng(seed); + auto [initcenters, initasn, initcosts] = coresets::kmeanspp(func, rng, np_, k_); + std::vector> centers; + centers.reserve(k_); + for(const auto id: initcenters) { + centers.emplace_back(data_oracle_[id]); + } + set_centers(std::move(centers)); + set_assignments_and_costs(); + } else { + throw NotImplementedError("Metric K-median needs to have optimizers plugged in."); + } + } + } + auto make_coreset_sampler(uint64_t seed=0) { + PREC_REQ(this->c_costs_.get(), "Current costs must be calculated"); + const cost_t *ptr; + if constexpr(asn_method == HARD) { + // Use the c_costs->data() method. + if(!weights_) + ptr = c_costs_->data(); + else if(pointwise_costs_.get()) ptr = pointwise_costs_.get(); + else { + pointwise_costs_.reset(new cost_t[np_]); + blaze::CustomVector + pv(c_costs_->data(), np_), pc(pointwise_costs_.get(), np_); + const blaze::CustomVector wv(const_cast(weights_), np_); + pc = pv * wv; + ptr = pointwise_costs_.get(); + } + } else { + if(pointwise_costs_.get()) { + ptr = pointwise_costs_.get(); + } else { + pointwise_costs_.reset(new cost_t[np_]); + OMP_PFOR + for(size_t i = 0; i < np_; ++i) + pointwise_costs_[i] = blz::dot(row(*c_assignments_, i, blz::unchecked), + row(*c_costs_, i, blz::unchecked)) * getw(i); + if(weights_) { + blaze::CustomVector + pv(pointwise_costs_.get(), np_); + const blaze::CustomVector wv(const_cast(weights_), np_); + pv *= wv; + } + } + } + coresets::CoresetSampler sampler; + if constexpr(asn_method == HARD) throw NotImplementedError("Coreset sampler supporting fractional assignment not yet available."); + else { + sampler.make_sampler(np_, points_to_sample_, ptr, c_assignments_->data(), weights_, seed, sens_); + } + } + template + void set_centers(const OT ¢ers) { + this->c_sol_.reset(new centers_t(centers.size())); + std::copy(centers.begin(), centers.end(), this->c_sol_->begin()); + } + void set_centers(centers_t &&newcenters) { + this->c_sol_.reset(new centers_t(std::move(newcenters))); + } + ClusteringSolverBase(const DataOracle &data, size_t npoints, unsigned k, + DissimilarityMeasure measure=ORACLE_PSEUDOMETRIC, + blz::distance::SensitivityMethod sens=static_cast(-1), + unsigned points_to_sample=0, const FT *weights=nullptr): + data_oracle_(data), np_(npoints), k_(k), + points_to_sample_(points_to_sample ? 
points_to_sample: k_), + measure_(measure), + weights_(weights) + { + if(points_to_sample_ != k_) std::fprintf(stderr, "note: sampling different number of points"); + set_sensitivity_method(sens); + validate_parameters(); + } + double calculate_cost(const centers_t ¢ers) { + throw NotImplementedError(); + } + const assignments_t &get_assignments(bool recalc=true) { + if(!c_assignments_ || recalc) set_assignments_and_costs(); + return *c_assignments_; + } +}; +#endif + + + +} // namespace clustering + +} // namespace minocore + +#endif /* FGC_CLUSTERING_H__ */ diff --git a/network_simplex/full_bipartitegraph.h b/network_simplex/full_bipartitegraph.h deleted file mode 100644 index 8a3e5242..00000000 --- a/network_simplex/full_bipartitegraph.h +++ /dev/null @@ -1,238 +0,0 @@ -/* -*- mode: C++; indent-tabs-mode: nil; -*- - * - * This file has been adapted by Nicolas Bonneel (2013), - * from full_graph.h from LEMON, a generic C++ optimization library, - * to implement a lightweight fully connected bipartite graph. A previous - * version of this file is used as part of the Displacement Interpolation - * project, - * Web: http://www.cs.ubc.ca/labs/imager/tr/2011/DisplacementInterpolation/ - * - * - **** Original file Copyright Notice : - * Copyright (C) 2003-2010 - * Egervary Jeno Kombinatorikus Optimalizalasi Kutatocsoport - * (Egervary Research Group on Combinatorial Optimization, EGRES). - * - * Permission to use, modify and distribute this software is granted - * provided that this copyright notice appears in all copies. For - * precise terms see the accompanying LICENSE file. - * - * This software is provided "AS IS" with no warranty of any kind, - * express or implied, and with no claim as to its suitability for any - * purpose. - * - */ - -#ifndef LEMON_FULL_BIPARTITE_GRAPH_H -#define LEMON_FULL_BIPARTITE_GRAPH_H - -#include - -///\ingroup graphs -///\file -///\brief FullBipartiteDigraph and FullBipartiteGraph classes. - - -namespace lemon { - - ///This \c \#define creates convenient type definitions for the following - ///types of \c Digraph: \c Node, \c NodeIt, \c Arc, \c ArcIt, \c InArcIt, - ///\c OutArcIt, \c BoolNodeMap, \c IntNodeMap, \c DoubleNodeMap, - ///\c BoolArcMap, \c IntArcMap, \c DoubleArcMap. - /// - ///\note If the graph type is a dependent type, ie. the graph type depend - ///on a template parameter, then use \c TEMPLATE_DIGRAPH_TYPEDEFS() - ///macro. -#define DIGRAPH_TYPEDEFS(Digraph) \ - typedef Digraph::Node Node; \ - typedef Digraph::Arc Arc; \ - - - ///Create convenience typedefs for the digraph types and iterators - - ///\see DIGRAPH_TYPEDEFS - /// - ///\note Use this macro, if the graph type is a dependent type, - ///ie. the graph type depend on a template parameter. 
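// The FullBipartiteDigraphBase removed below packs the arc from source s
// (s < _n1) to target t (t >= _n1) into one integer, s*_n2 + (t - _n1), and
// recovers the endpoints with division and modulus. A self-contained check
// that source()/target() invert arc() under that encoding, on toy sizes:
#include <cassert>
#include <cstdint>
inline void arc_roundtrip_demo() {
    const int n1 = 3, n2 = 4;                 // left/right partition sizes
    for(int s = 0; s < n1; ++s)
        for(int t = n1; t < n1 + n2; ++t) {
            const int64_t arc = int64_t(s) * n2 + (t - n1);
            assert(arc / n2 == s);            // mirrors source(arc)
            assert(arc % n2 + n1 == t);       // mirrors target(arc)
        }
}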
-#define TEMPLATE_DIGRAPH_TYPEDEFS(Digraph) \ - typedef typename Digraph::Node Node; \ - typedef typename Digraph::Arc Arc; \ - - - class FullBipartiteDigraphBase { - public: - - typedef FullBipartiteDigraphBase Digraph; - - //class Node; - typedef int Node; - //class Arc; - typedef int64_t Arc; - - protected: - - int _node_num; - int64_t _arc_num; - - FullBipartiteDigraphBase() {} - - void construct(int n1, int n2) { _node_num = n1+n2; _arc_num = (int64_t)n1 * (int64_t)n2; _n1=n1; _n2=n2;} - - public: - - int _n1, _n2; - - - Node operator()(int ix) const { return Node(ix); } - static int index(const Node& node) { return node; } - - Arc arc(const Node& s, const Node& t) const { - if (s<_n1 && t>=_n1) - return Arc((int64_t)s * (int64_t)_n2 + (int64_t)(t-_n1) ); - else - return Arc(-1); - } - - int nodeNum() const { return _node_num; } - int64_t arcNum() const { return _arc_num; } - - int maxNodeId() const { return _node_num - 1; } - int64_t maxArcId() const { return _arc_num - 1; } - - Node source(Arc arc) const { return arc / _n2; } - Node target(Arc arc) const { return (arc % _n2) + _n1; } - - static int id(Node node) { return node; } - static int64_t id(Arc arc) { return arc; } - - static Node nodeFromId(int id) { return Node(id);} - static Arc arcFromId(int64_t id) { return Arc(id);} - - - Arc findArc(Node s, Node t, Arc prev = -1) const { - return prev == -1 ? arc(s, t) : -1; - } - - void first(Node& node) const { - node = _node_num - 1; - } - - static void next(Node& node) { - --node; - } - - void first(Arc& arc) const { - arc = _arc_num - 1; - } - - static void next(Arc& arc) { - --arc; - } - - void firstOut(Arc& arc, const Node& node) const { - if (node>=_n1) - arc = -1; - else - arc = (node + 1) * _n2 - 1; - } - - void nextOut(Arc& arc) const { - if (arc % _n2 == 0) arc = 0; - --arc; - } - - void firstIn(Arc& arc, const Node& node) const { - if (node<_n1) - arc = -1; - else - arc = _arc_num + node - _node_num; - } - - void nextIn(Arc& arc) const { - arc -= _n2; - if (arc < 0) arc = -1; - } - - }; - - /// \ingroup graphs - /// - /// \brief A directed full graph class. - /// - /// FullBipartiteDigraph is a simple and fast implmenetation of directed full - /// (complete) graphs. It contains an arc from each node to each node - /// (including a loop for each node), therefore the number of arcs - /// is the square of the number of nodes. - /// This class is completely static and it needs constant memory space. - /// Thus you can neither add nor delete nodes or arcs, however - /// the structure can be resized using resize(). - /// - /// This type fully conforms to the \ref concepts::Digraph "Digraph concept". - /// Most of its member functions and nested classes are documented - /// only in the concept class. - /// - /// This class provides constant time counting for nodes and arcs. - /// - /// \note FullBipartiteDigraph and FullBipartiteGraph classes are very similar, - /// but there are two differences. While this class conforms only - /// to the \ref concepts::Digraph "Digraph" concept, FullBipartiteGraph - /// conforms to the \ref concepts::Graph "Graph" concept, - /// moreover FullBipartiteGraph does not contain a loop for each - /// node as this class does. - /// - /// \sa FullBipartiteGraph - class FullBipartiteDigraph : public FullBipartiteDigraphBase { - typedef FullBipartiteDigraphBase Parent; - - public: - - /// \brief Default constructor. - /// - /// Default constructor. The number of nodes and arcs will be zero. 
- FullBipartiteDigraph() { construct(0,0); } - - /// \brief Constructor - /// - /// Constructor. - /// \param n The number of the nodes. - FullBipartiteDigraph(int n1, int n2) { construct(n1, n2); } - - - /// \brief Returns the node with the given index. - /// - /// Returns the node with the given index. Since this structure is - /// completely static, the nodes can be indexed with integers from - /// the range [0..nodeNum()-1]. - /// The index of a node is the same as its ID. - /// \sa index() - Node operator()(int ix) const { return Parent::operator()(ix); } - - /// \brief Returns the index of the given node. - /// - /// Returns the index of the given node. Since this structure is - /// completely static, the nodes can be indexed with integers from - /// the range [0..nodeNum()-1]. - /// The index of a node is the same as its ID. - /// \sa operator()() - static int index(const Node& node) { return Parent::index(node); } - - /// \brief Returns the arc connecting the given nodes. - /// - /// Returns the arc connecting the given nodes. - /*Arc arc(Node u, Node v) const { - return Parent::arc(u, v); - }*/ - - /// \brief Number of nodes. - int nodeNum() const { return Parent::nodeNum(); } - /// \brief Number of arcs. - int64_t arcNum() const { return Parent::arcNum(); } - }; - - - - -} //namespace lemon - - -#endif //LEMON_FULL_GRAPH_H diff --git a/network_simplex/network_simplex_simple.h b/network_simplex/network_simplex_simple.h deleted file mode 100644 index e1c5d996..00000000 --- a/network_simplex/network_simplex_simple.h +++ /dev/null @@ -1,1580 +0,0 @@ -/* -*- mode: C++; indent-tabs-mode: nil; -*- -* -* -* This file has been adapted by Nicolas Bonneel (2013), -* from network_simplex.h from LEMON, a generic C++ optimization library, -* to implement a lightweight network simplex for mass transport, more -* memory efficient than the original file. A previous version of this file -* is used as part of the Displacement Interpolation project, -* Web: http://www.cs.ubc.ca/labs/imager/tr/2011/DisplacementInterpolation/ -* -* Revisions: -* March 2015: added OpenMP parallelization -* March 2017: included Antoine Rolet's trick to make it more robust -* April 2018: IMPORTANT bug fix + uses 64bit integers (slightly slower but less risks of overflows), updated to a newer version of the algo by LEMON, sparse flow by default + minor edits. -* -* -**** Original file Copyright Notice : -* -* Copyright (C) 2003-2010 -* Egervary Jeno Kombinatorikus Optimalizalasi Kutatocsoport -* (Egervary Research Group on Combinatorial Optimization, EGRES). -* -* Permission to use, modify and distribute this software is granted -* provided that this copyright notice appears in all copies. For -* precise terms see the accompanying LICENSE file. -* -* This software is provided "AS IS" with no warranty of any kind, -* express or implied, and with no claim as to its suitability for any -* purpose. -* -*/ - -#ifndef LEMON_NETWORK_SIMPLEX_SIMPLE_H -#define LEMON_NETWORK_SIMPLEX_SIMPLE_H - - -/// \ingroup min_cost_flow_algs -/// -/// \file -/// \brief Network Simplex algorithm for finding a minimum cost flow. - -// if your compiler has troubles with unorderedmaps, just comment the following line to use a slower std::map instead -#define HASHMAP // now handled with unorderedmaps instead of stdext::hash_map. Should be better supported. 
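A minimal sketch of the fallback this toggle describes, simplified from the DEFAULT_SPARSE_MAP selection defined just below (the `sparse_map` alias is illustrative, not from the original header):

    #include <map>
    #include <unordered_map>

    #define HASHMAP // comment out if <unordered_map> gives your compiler trouble

    #ifdef HASHMAP
    template <typename K, typename V> using sparse_map = std::unordered_map<K, V>;
    #else
    template <typename K, typename V> using sparse_map = std::map<K, V>; // slower, ordered fallback
    #endif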
- -#define SPARSE_FLOW // a sparse flow vector will be 10-15% slower for small problems but uses less memory and becomes faster for large problems (40k total nodes) - -#include -#include -#include -#ifdef HASHMAP -#include -#else -#include -#endif -#ifdef _OPENMP -#include -#endif -#include - - -#include "full_bipartitegraph.h" - -#define INVALIDNODE -1 -#define INVALID (-1) - - -namespace lemon { -#ifndef DEFAULT_SPARSE_MAP - #ifdef HASHMAP -#define DEFAULT_SPARSE_MAP std::unordered_map - #else -#define DEFAULT_SPARSE_MAP std::map - #endif -#endif - - template class Map=DEFAULT_SPARSE_MAP> - class ProxyObject; - - template class Map=DEFAULT_SPARSE_MAP> - class SparseValueVector - { - public: - template - SparseValueVector(Args &&...) // parameter n for compatibility with standard vectors - { - } - - template - void resize(Args &&...) {/* does nothing */} - T operator[](const size_t id) const - { - auto it = data.find(id); - if (it == data.end()) - return 0; - else - return it->second; - } - - ProxyObject operator[](const size_t id) - { - return ProxyObject(this, id); - } - - //private: - Map data; - }; - - template class Map> - class ProxyObject { - public: - ProxyObject(SparseValueVector *v, size_t idx) { _v = v; _idx = idx; }; - ProxyObject & operator=(const T &v) { - // If we get here, we know that operator[] was called to perform a write access, - // so we can insert an item in the vector if needed - if (v != 0) - _v->data[_idx] = v; - return *this; - } - - operator T() { - // If we get here, we know that operator[] was called to perform a read access, - // so we can simply return the existing object - auto it = _v->data.find(_idx); - if (it == _v->data.end()) - return 0; - else - return it->second; - } - - void operator+=(T val) - { - if (val == 0) return; - auto it = _v->data.find(_idx); - if (it == _v->data.end()) - _v->data[_idx] = val; - else - { - T sum = it->second + val; - if (sum == 0) - _v->data.erase(it); - else - it->second = sum; - } - } - void operator-=(T val) - { - if (val == 0) return; - auto it = _v->data.find(_idx); - if (it == _v->data.end()) - _v->data[_idx] = -val; - else - { - T sum = it->second - val; - if (sum == 0) - _v->data.erase(it); - else - it->second = sum; - } - } - - SparseValueVector *_v; - size_t _idx; - }; - - - - /// \addtogroup min_cost_flow_algs - /// @{ - - /// \brief Implementation of the primal Network Simplex algorithm - /// for finding a \ref min_cost_flow "minimum cost flow". - /// - /// \ref NetworkSimplexSimple implements the primal Network Simplex algorithm - /// for finding a \ref min_cost_flow "minimum cost flow" - /// \ref amo93networkflows, \ref dantzig63linearprog, - /// \ref kellyoneill91netsimplex. - /// This algorithm is a highly efficient specialized version of the - /// linear programming simplex method directly for the minimum cost - /// flow problem. - /// - /// In general, %NetworkSimplexSimple is the fastest implementation available - /// in LEMON for this problem. - /// Moreover, it supports both directions of the supply/demand inequality - /// constraints. For more information, see \ref SupplyType. - /// - /// Most of the parameters of the problem (except for the digraph) - /// can be given using separate functions, and the algorithm can be - /// executed using the \ref run() function. If some parameters are not - /// specified, then default values will be used. - /// - /// \tparam GR The digraph type the algorithm runs on. 
- /// \tparam V The number type used for flow amounts, capacity bounds - /// and supply values in the algorithm. By default, it is \c int. - /// \tparam C The number type used for costs and potentials in the - /// algorithm. By default, it is the same as \c V. - /// - /// \warning Both number types must be signed and all input data must - /// be integer. - /// - /// \note %NetworkSimplexSimple provides five different pivot rule - /// implementations, from which the most efficient one is used - /// by default. For more information, see \ref PivotRule. - template class Map=DEFAULT_SPARSE_MAP> - class NetworkSimplexSimple - { - public: - - /// \brief Constructor. - /// - /// The constructor of the class. - /// - /// \param graph The digraph the algorithm runs on. - /// \param arc_mixing Indicate if the arcs have to be stored in a - /// mixed order in the internal data structure. - /// In special cases, it could lead to better overall performance, - /// but it is usually slower. Therefore it is disabled by default. - NetworkSimplexSimple(const GR& graph, bool arc_mixing, int nbnodes, ArcsType nb_arcs, size_t maxiters = 0) : - _graph(graph), //_arc_id(graph), - _arc_mixing(arc_mixing), _init_nb_nodes(nbnodes), _init_nb_arcs(nb_arcs) - { - // Reset data structures - reset(); - max_iter = maxiters; - } - - /// The type of the flow amounts, capacity bounds and supply values - typedef V Value; - /// The type of the arc costs - typedef C Cost; - - public: - - /// \brief Problem type constants for the \c run() function. - /// - /// Enum type containing the problem type constants that can be - /// returned by the \ref run() function of the algorithm. - enum ProblemType { - /// The problem has no feasible solution (flow). - INFEASIBLE, - /// The problem has optimal solution (i.e. it is feasible and - /// bounded), and the algorithm has found optimal flow and node - /// potentials (primal and dual solutions). - OPTIMAL, - /// The objective function of the problem is unbounded, i.e. - /// there is a directed cycle having negative total cost and - /// infinite upper bound. - UNBOUNDED - }; - - /// \brief Constants for selecting the type of the supply constraints. - /// - /// Enum type containing constants for selecting the supply type, - /// i.e. the direction of the inequalities in the supply/demand - /// constraints of the \ref min_cost_flow "minimum cost flow problem". - /// - /// The default supply type is \c GEQ, the \c LEQ type can be - /// selected using \ref supplyType(). - /// The equality form is a special case of both supply types. - enum SupplyType { - /// This option means that there are "greater or equal" - /// supply/demand constraints in the definition of the problem. - GEQ, - /// This option means that there are "less or equal" - /// supply/demand constraints in the definition of the problem. 
- LEQ - }; - - - - private: - size_t max_iter; - TEMPLATE_DIGRAPH_TYPEDEFS(GR); - - typedef std::vector IntVector; - typedef std::vector ArcVector; - typedef std::vector ValueVector; - typedef std::vector CostVector; - // typedef SparseValueVector CostVector; - typedef std::vector BoolVector; - // Note: vector is used instead of vector for efficiency reasons - - // State constants for arcs - enum ArcState { - STATE_UPPER = -1, - STATE_TREE = 0, - STATE_LOWER = 1 - }; - - typedef std::vector StateVector; - // Note: vector is used instead of vector for - // efficiency reasons - - private: - - // Data related to the underlying digraph - const GR &_graph; - int _node_num; - ArcsType _arc_num; - ArcsType _all_arc_num; - ArcsType _search_arc_num; - - // Parameters of the problem - SupplyType _stype; - Value _sum_supply; - - inline int _node_id(int n) const { return _node_num - n - 1; }; - - //IntArcMap _arc_id; - IntVector _source; // keep nodes as integers - IntVector _target; - bool _arc_mixing; - - // Node and arc data - CostVector _cost; - ValueVector _supply; -#ifdef SPARSE_FLOW - SparseValueVector _flow; -#else - ValueVector _flow; -#endif - - CostVector _pi; - - // Data for storing the spanning tree structure - IntVector _parent; - ArcVector _pred; - IntVector _thread; - IntVector _rev_thread; - IntVector _succ_num; - IntVector _last_succ; - IntVector _dirty_revs; - BoolVector _forward; - StateVector _state; - ArcsType _root; - - // Temporary data used in the current pivot iteration - ArcsType in_arc, join, u_in, v_in, u_out, v_out; - ArcsType first, second, right, last; - ArcsType stem, par_stem, new_stem; - Value delta; - - static constexpr Value MAX_VAL = std::numeric_limits::max(); - - ArcsType mixingCoeff; - - public: - - /// \brief Constant for infinite upper bounds (capacities). - /// - /// Constant for infinite upper bounds (capacities). - /// It is \c std::numeric_limits::infinity() if available, - /// \c std::numeric_limits::max() otherwise. - static constexpr Value INF = std::numeric_limits::has_infinity ? std::numeric_limits::infinity() : MAX_VAL; - - private: - - // thank you to DVK and MizardX from StackOverflow for this function! 
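    // sequence(k) maps position k of the original arc enumeration to its slot
    // in the mixed order: arcs are split into subsequences of length
    // subsequence_length (the first num_big_subsequences blocks) or one less,
    // and entries of a subsequence are laid out mixingCoeff apart, so a linear
    // scan of the mixed array visits arcs spread across the whole cost matrix.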
- inline ArcsType sequence(ArcsType k) const { - ArcsType smallv = (k > num_total_big_subsequence_numbers) & 1; - - k -= num_total_big_subsequence_numbers * smallv; - ArcsType subsequence_length2 = subsequence_length - smallv; - ArcsType subsequence_num = (k / subsequence_length2) + num_big_subsequences * smallv; - ArcsType subsequence_offset = (k % subsequence_length2) * mixingCoeff; - - return subsequence_offset + subsequence_num; - } - ArcsType subsequence_length; - ArcsType num_big_subsequences; - ArcsType num_total_big_subsequence_numbers; - - inline ArcsType getArcID(const Arc &arc) const - { - //int n = _arc_num-arc._id-1; - ArcsType n = _arc_num - GR::id(arc) - 1; - - //ArcsType a = mixingCoeff*(n%mixingCoeff) + n/mixingCoeff; - //ArcsType b = _arc_id[arc]; - if (_arc_mixing) - return sequence(n); - else - return n; - } - - // finally unused because too slow - inline ArcsType getSource(const ArcsType arc) const - { - //ArcsType a = _source[arc]; - //return a; - - ArcsType n = _arc_num - arc - 1; - if (_arc_mixing) - n = mixingCoeff*(n%mixingCoeff) + n / mixingCoeff; - - ArcsType b; - if (n >= 0) - b = _node_id(_graph.source(GR::arcFromId(n))); - else - { - n = arc + 1 - _arc_num; - if (n <= _node_num) - b = _node_num; - else - if (n >= _graph._n1) - b = _graph._n1; - else - b = _graph._n1 - n; - } - - return b; - } - - - - // Implementation of the Block Search pivot rule - class BlockSearchPivotRule - { - private: - - // References to the NetworkSimplexSimple class - const IntVector &_source; - const IntVector &_target; - const CostVector &_cost; - const StateVector &_state; - const CostVector &_pi; - ArcsType &_in_arc; - ArcsType _search_arc_num; - - // Pivot rule data - ArcsType _block_size; - ArcsType _next_arc; - NetworkSimplexSimple &_ns; - - public: - - // Constructor - BlockSearchPivotRule(NetworkSimplexSimple &ns) : - _source(ns._source), _target(ns._target), - _cost(ns._cost), _state(ns._state), _pi(ns._pi), - _in_arc(ns.in_arc), _search_arc_num(ns._search_arc_num), - _next_arc(0), _ns(ns) - { - // The main parameters of the pivot rule - const double BLOCK_SIZE_FACTOR = 1; - const ArcsType MIN_BLOCK_SIZE = 10; - - _block_size = std::max(ArcsType(BLOCK_SIZE_FACTOR * std::sqrt(double(_search_arc_num))), MIN_BLOCK_SIZE); - } - - // Find next entering arc - bool findEnteringArc() { - Cost min_val = 0; - -#ifdef _OPENMP - ArcsType N = omp_get_max_threads(); - std::vector minArray(N, 0); - std::vector arcId(N); - ArcsType bs = (ArcsType)ceil(_block_size / (double)N); -#else - static constexpr ArcsType N = 1; - std::array minArray{Cost(0)}; - std::array arcId{0}; -#endif - - for (ArcsType i = 0; i < _search_arc_num; i += _block_size) { - - ArcsType e; - ArcsType j; -#ifdef _OPENMP -#pragma omp parallel - { - int t = omp_get_thread_num(); - -#pragma omp for schedule(static, bs) lastprivate(e) - for (j = 0; j < std::min(i + _block_size, _search_arc_num) - i; j++) { - e = (_next_arc + i + j); if (e >= _search_arc_num) e -= _search_arc_num; - Cost c = _state[e] * (_cost[e] + _pi[_source[e]] - _pi[_target[e]]); - if (c < minArray[t]) { - minArray[t] = c; - arcId[t] = e; - } - } - } - for (int j = 0; j < N; j++) { - if (minArray[j] < min_val) { - min_val = minArray[j]; - _in_arc = arcId[j]; - } - } -#else - { - - for (j = 0; j < std::min(i + _block_size, _search_arc_num) - i; j++) { - e = (_next_arc + i + j); if (e >= _search_arc_num) e -= _search_arc_num; - Cost c = _state[e] * (_cost[e] + _pi[_source[e]] - _pi[_target[e]]); - if (c < minArray[0]) { - minArray[0] = c; - arcId[0] = e; 
- } - } - } - min_val = minArray[0]; - _in_arc = arcId[0]; -#endif - Cost a = std::abs(_pi[_source[_in_arc]]) > std::abs(_pi[_target[_in_arc]]) ? std::abs(_pi[_source[_in_arc]]) : std::abs(_pi[_target[_in_arc]]); - a = a > std::abs(_cost[_in_arc]) ? a : std::abs(_cost[_in_arc]); - if (min_val < -std::numeric_limits::epsilon()*a) { - _next_arc = e; - return true; - } - } - - Cost a = fabs(_pi[_source[_in_arc]]) > fabs(_pi[_target[_in_arc]]) ? fabs(_pi[_source[_in_arc]]) : fabs(_pi[_target[_in_arc]]); - a = a > fabs(_cost[_in_arc]) ? a : fabs(_cost[_in_arc]); - if (min_val >= -std::numeric_limits::epsilon()*a) return false; - - return true; - } - - - // Find next entering arc - /*bool findEnteringArc() { - Cost min_val = 0; - int N = omp_get_max_threads(); - std::vector minArray(N); - std::vector arcId(N); - - ArcsType bs = (ArcsType)ceil(_block_size / (double)N); - for (ArcsType i = 0; i < _search_arc_num; i += _block_size) { - - ArcsType maxJ = std::min(i + _block_size, _search_arc_num) - i; - ArcsType j; -#pragma omp parallel - { - int t = omp_get_thread_num(); - Cost minV = 0; - ArcsType arcStart = _next_arc + i; - ArcsType arc = -1; -#pragma omp for schedule(static, bs) - for (j = 0; j < maxJ; j++) { - ArcsType e = arcStart + j; if (e >= _search_arc_num) e -= _search_arc_num; - Cost c = _state[e] * (_cost[e] + _pi[_source[e]] - _pi[_target[e]]); - if (c < minV) { - minV = c; - arc = e; - } - } - - minArray[t] = minV; - arcId[t] = arc; - } - for (int j = 0; j < N; j++) { - if (minArray[j] < min_val) { - min_val = minArray[j]; - _in_arc = arcId[j]; - } - } - - //FIX by Antoine Rolet to avoid precision issues - Cost a = std::max(std::abs(_cost[_in_arc]), std::max(std::abs(_pi[_source[_in_arc]]), std::abs(_pi[_target[_in_arc]]))); - if (min_val <-std::numeric_limits::epsilon()*a) { - _next_arc = _next_arc + i + maxJ - 1; - if (_next_arc >= _search_arc_num) _next_arc -= _search_arc_num; - return true; - } - } - - if (min_val >= 0) { - return false; - } - - return true; - }*/ - - - /*bool findEnteringArc() { - Cost c, min = 0; - int cnt = _block_size; - int e, min_arc = _next_arc; - for (e = _next_arc; e < _search_arc_num; ++e) { - c = _state[e] * (_cost[e] + _pi[_source[e]] - _pi[_target[e]]); - if (c < min) { - min = c; - min_arc = e; - - } - if (--cnt == 0) { - if (min < 0) break; - cnt = _block_size; - - } - - } - if (min == 0 || cnt > 0) { - for (e = 0; e < _next_arc; ++e) { - c = _state[e] * (_cost[e] + _pi[_source[e]] - _pi[_target[e]]); - if (c < min) { - min = c; - min_arc = e; - - } - if (--cnt == 0) { - if (min < 0) break; - cnt = _block_size; - - } - - } - - } - if (min >= 0) return false; - _in_arc = min_arc; - _next_arc = e; - return true; - }*/ - - - - }; //class BlockSearchPivotRule - - - - public: - - - - int _init_nb_nodes; - ArcsType _init_nb_arcs; - - /// \name Parameters - /// The parameters of the algorithm can be specified using these - /// functions. - - /// @{ - - - /// \brief Set the costs of the arcs. - /// - /// This function sets the costs of the arcs. - /// If it is not used before calling \ref run(), the costs - /// will be set to \c 1 on all arcs. - /// - /// \param map An arc map storing the costs. - /// Its \c Value type must be convertible to the \c Cost type - /// of the algorithm. - /// - /// \return (*this) - template - NetworkSimplexSimple& costMap(const CostMap& map) { - Arc a; _graph.first(a); - for (; a != INVALID; _graph.next(a)) { - _cost[getArcID(a)] = map[a]; - } - return *this; - } - - - /// \brief Set the costs of one arc. 
- /// - /// This function sets the costs of one arcs. - /// Done for memory reasons - /// - /// \param arc An arc. - /// \param arc A cost - /// - /// \return (*this) - template - NetworkSimplexSimple& setCost(const Arc& arc, const Value cost) { - _cost[getArcID(arc)] = cost; - return *this; - } - - - /// \brief Set the supply values of the nodes. - /// - /// This function sets the supply values of the nodes. - /// If neither this function nor \ref stSupply() is used before - /// calling \ref run(), the supply of each node will be set to zero. - /// - /// \param map A node map storing the supply values. - /// Its \c Value type must be convertible to the \c Value type - /// of the algorithm. - /// - /// \return (*this) - template - NetworkSimplexSimple& supplyMap(const SupplyMap& map) { - Node n; _graph.first(n); - for (; n != INVALIDNODE; _graph.next(n)) { - _supply[_node_id(n)] = map[n]; - } - return *this; - } - template - NetworkSimplexSimple& supplyMap(const SupplyMap* map1, int n1, const SupplyMap* map2, int) { - Node n; _graph.first(n); - for (; n != INVALIDNODE; _graph.next(n)) { - if (n - NetworkSimplexSimple& supplyMapAll(SupplyMap val1, int n1, SupplyMap val2, int) { - Node n; _graph.first(n); - for (; n != INVALIDNODE; _graph.next(n)) { - if (n(*this) - NetworkSimplexSimple& stSupply(const Node& s, const Node& t, Value k) { - for (int i = 0; i != _node_num; ++i) { - _supply[i] = 0; - } - _supply[_node_id(s)] = k; - _supply[_node_id(t)] = -k; - return *this; - } - - /// \brief Set the type of the supply constraints. - /// - /// This function sets the type of the supply/demand constraints. - /// If it is not used before calling \ref run(), the \ref GEQ supply - /// type will be used. - /// - /// For more information, see \ref SupplyType. - /// - /// \return (*this) - NetworkSimplexSimple& supplyType(SupplyType supply_type) { - _stype = supply_type; - return *this; - } - - /// @} - - /// \name Execution Control - /// The algorithm can be executed using \ref run(). - - /// @{ - - /// \brief Run the algorithm. - /// - /// This function runs the algorithm. - /// The paramters can be specified using functions \ref lowerMap(), - /// \ref upperMap(), \ref costMap(), \ref supplyMap(), \ref stSupply(), - /// \ref supplyType(). - /// For example, - /// \code - /// NetworkSimplexSimple ns(graph); - /// ns.lowerMap(lower).upperMap(upper).costMap(cost) - /// .supplyMap(sup).run(); - /// \endcode - /// - /// This function can be called more than once. All the given parameters - /// are kept for the next call, unless \ref resetParams() or \ref reset() - /// is used, thus only the modified parameters have to be set again. - /// If the underlying digraph was also modified after the construction - /// of the class (or the last \ref reset() call), then the \ref reset() - /// function must be called. - /// - /// \param pivot_rule The pivot rule that will be used during the - /// algorithm. For more information, see \ref PivotRule. - /// - /// \return \c INFEASIBLE if no feasible flow exists, - /// \n \c OPTIMAL if the problem has optimal solution - /// (i.e. it is feasible and bounded), and the algorithm has found - /// optimal flow and node potentials (primal and dual solutions), - /// \n \c UNBOUNDED if the objective function of the problem is - /// unbounded, i.e. there is a directed cycle having negative total - /// cost and infinite upper bound. 
- /// - /// \see ProblemType, PivotRule - /// \see resetParams(), reset() - ProblemType run() { - if (!init()) return INFEASIBLE; - return start(); - } - - /// \brief Reset all the parameters that have been given before. - /// - /// This function resets all the paramaters that have been given - /// before using functions \ref lowerMap(), \ref upperMap(), - /// \ref costMap(), \ref supplyMap(), \ref stSupply(), \ref supplyType(). - /// - /// It is useful for multiple \ref run() calls. Basically, all the given - /// parameters are kept for the next \ref run() call, unless - /// \ref resetParams() or \ref reset() is used. - /// If the underlying digraph was also modified after the construction - /// of the class or the last \ref reset() call, then the \ref reset() - /// function must be used, otherwise \ref resetParams() is sufficient. - /// - /// For example, - /// \code - /// NetworkSimplexSimple ns(graph); - /// - /// // First run - /// ns.lowerMap(lower).upperMap(upper).costMap(cost) - /// .supplyMap(sup).run(); - /// - /// // Run again with modified cost map (resetParams() is not called, - /// // so only the cost map have to be set again) - /// cost[e] += 100; - /// ns.costMap(cost).run(); - /// - /// // Run again from scratch using resetParams() - /// // (the lower bounds will be set to zero on all arcs) - /// ns.resetParams(); - /// ns.upperMap(capacity).costMap(cost) - /// .supplyMap(sup).run(); - /// \endcode - /// - /// \return (*this) - /// - /// \see reset(), run() - NetworkSimplexSimple& resetParams() { - for (int i = 0; i != _node_num; ++i) { - _supply[i] = 0; - } - for (ArcsType i = 0; i != _arc_num; ++i) { - _cost[i] = 1; - } - _stype = GEQ; - return *this; - } - - - /// \brief Reset the internal data structures and all the parameters - /// that have been given before. - /// - /// This function resets the internal data structures and all the - /// paramaters that have been given before using functions \ref lowerMap(), - /// \ref upperMap(), \ref costMap(), \ref supplyMap(), \ref stSupply(), - /// \ref supplyType(). - /// - /// It is useful for multiple \ref run() calls. Basically, all the given - /// parameters are kept for the next \ref run() call, unless - /// \ref resetParams() or \ref reset() is used. - /// If the underlying digraph was also modified after the construction - /// of the class or the last \ref reset() call, then the \ref reset() - /// function must be used, otherwise \ref resetParams() is sufficient. - /// - /// See \ref resetParams() for examples. 
- /// - /// \return (*this) - /// - /// \see resetParams(), run() - NetworkSimplexSimple& reset() { - // Resize vectors - _node_num = _init_nb_nodes; - _arc_num = _init_nb_arcs; - int all_node_num = _node_num + 1; - ArcsType max_arc_num = _arc_num + 2 * _node_num; - - _source.resize(max_arc_num); - _target.resize(max_arc_num); - - _cost.resize(max_arc_num); - _supply.resize(all_node_num); - _flow.resize(max_arc_num); - _pi.resize(all_node_num); - - _parent.resize(all_node_num); - _pred.resize(all_node_num); - _forward.resize(all_node_num); - _thread.resize(all_node_num); - _rev_thread.resize(all_node_num); - _succ_num.resize(all_node_num); - _last_succ.resize(all_node_num); - _state.resize(max_arc_num); - - - //_arc_mixing=false; - if (_arc_mixing && _node_num > 1) { - // Store the arcs in a mixed order - //ArcsType k = std::max(ArcsType(std::sqrt(double(_arc_num))), ArcsType(10)); - const ArcsType k = std::max(ArcsType(_arc_num / _node_num), ArcsType(3)); - mixingCoeff = k; - subsequence_length = _arc_num / mixingCoeff + 1; - num_big_subsequences = _arc_num % mixingCoeff; - num_total_big_subsequence_numbers = subsequence_length * num_big_subsequences; - -#ifdef _OPENMP -#pragma omp parallel for schedule(static) -#endif - for (Arc a = 0; a <= _graph.maxArcId(); a++) { // --a <=> _graph.next(a) , -1 == INVALID - ArcsType i = sequence(_graph.maxArcId()-a); - _source[i] = _node_id(_graph.source(a)); - _target[i] = _node_id(_graph.target(a)); - } - } else { - // Store the arcs in the original order - ArcsType i = 0; - Arc a; _graph.first(a); - for (; a != INVALID; _graph.next(a), ++i) { - _source[i] = _node_id(_graph.source(a)); - _target[i] = _node_id(_graph.target(a)); - //_arc_id[a] = i; - } - } - - // Reset parameters - resetParams(); - return *this; - } - - /// @} - - /// \name Query Functions - /// The results of the algorithm can be obtained using these - /// functions.\n - /// The \ref run() function must be called before using them. - - /// @{ - - /// \brief Return the total cost of the found flow. - /// - /// This function returns the total cost of the found flow. - /// Its complexity is O(e). - /// - /// \note The return type of the function can be specified as a - /// template parameter. For example, - /// \code - /// ns.totalCost(); - /// \endcode - /// It is useful if the total cost cannot be stored in the \c Cost - /// type of the algorithm, which is the default return type of the - /// function. - /// - /// \pre \ref run() must be called before using this function. - /*template - Number totalCost() const { - Number c = 0; - for (ArcIt a(_graph); a != INVALID; ++a) { - int i = getArcID(a); - c += Number(_flow[i]) * Number(_cost[i]); - } - return c; - }*/ - - template - Number totalCost() const { - Number c = 0; - -#ifdef SPARSE_FLOW - for (auto it = _flow.data.begin(); it!=_flow.data.end(); ++it) - c += Number(it->second) * Number(_cost[it->first]); - return c; -#else - for (ArcsType i = 0; i<_flow.size(); i++) - c += _flow[i] * Number(_cost[i]); - return c; -#endif - } - -#ifndef DOXYGEN - Cost totalCost() const { - return totalCost(); - } -#endif - - /// \brief Return the flow on the given arc. - /// - /// This function returns the flow on the given arc. - /// - /// \pre \ref run() must be called before using this function. - Value flow(const Arc& a) const { - return _flow[getArcID(a)]; - } - - /// \brief Return the flow map (the primal solution). - /// - /// This function copies the flow value on each arc into the given - /// map. 
The \c Value type of the algorithm must be convertible to - /// the \c Value type of the map. - /// - /// \pre \ref run() must be called before using this function. - template - void flowMap(FlowMap &map) const { - Arc a; _graph.first(a); - for (; a != INVALID; _graph.next(a)) { - map.set(a, _flow[getArcID(a)]); - } - } - - /// \brief Return the potential (dual value) of the given node. - /// - /// This function returns the potential (dual value) of the - /// given node. - /// - /// \pre \ref run() must be called before using this function. - Cost potential(const Node& n) const { - return _pi[_node_id(n)]; - } - - /// \brief Return the potential map (the dual solution). - /// - /// This function copies the potential (dual value) of each node - /// into the given map. - /// The \c Cost type of the algorithm must be convertible to the - /// \c Value type of the map. - /// - /// \pre \ref run() must be called before using this function. - template - void potentialMap(PotentialMap &map) const { - Node n; _graph.first(n); - for (; n != INVALID; _graph.next(n)) { - map.set(n, _pi[_node_id(n)]); - } - } - - /// @} - - private: - - // Initialize internal data structures - bool init() { - if (_node_num == 0) return false; - - // Check the sum of supply values - _sum_supply = 0; - for (int i = 0; i != _node_num; ++i) { - _sum_supply += _supply[i]; - } - /*if (!((_stype == GEQ && _sum_supply <= 0) || - (_stype == LEQ && _sum_supply >= 0))) return false;*/ - - - // Initialize artifical cost - Cost ART_COST; - if (std::numeric_limits::is_exact) { - ART_COST = std::numeric_limits::max() / 2 + 1; - } else { - ART_COST = 0; - for (ArcsType i = 0; i != _arc_num; ++i) { - if (_cost[i] > ART_COST) ART_COST = _cost[i]; - } - ART_COST = (ART_COST + 1) * _node_num; - } - - // Initialize arc maps - for (ArcsType i = 0; i != _arc_num; ++i) { -#ifndef SPARSE_FLOW - _flow[i] = 0; //by default, the sparse matrix is empty -#endif - _state[i] = STATE_LOWER; - } -#ifdef SPARSE_FLOW - _flow = SparseValueVector(); -#endif - - // Set data for the artificial root node - _root = _node_num; - _parent[_root] = -1; - _pred[_root] = -1; - _thread[_root] = 0; - _rev_thread[0] = _root; - _succ_num[_root] = _node_num + 1; - _last_succ[_root] = _root - 1; - _supply[_root] = -_sum_supply; - _pi[_root] = 0; - - // Add artificial arcs and initialize the spanning tree data structure - if (_sum_supply == 0) { - // EQ supply constraints - _search_arc_num = _arc_num; - _all_arc_num = _arc_num + _node_num; - for (ArcsType u = 0, e = _arc_num; u != static_cast(_node_num); ++u, ++e) { - _parent[u] = _root; - _pred[u] = e; - _thread[u] = u + 1; - _rev_thread[u + 1] = u; - _succ_num[u] = 1; - _last_succ[u] = u; - _state[e] = STATE_TREE; - if (_supply[u] >= 0) { - _forward[u] = true; - _pi[u] = 0; - _source[e] = u; - _target[e] = _root; - _flow[e] = _supply[u]; - _cost[e] = 0; - } else { - _forward[u] = false; - _pi[u] = ART_COST; - _source[e] = _root; - _target[e] = u; - _flow[e] = -_supply[u]; - _cost[e] = ART_COST; - } - } - } else if (_sum_supply > 0) { - // LEQ supply constraints - _search_arc_num = _arc_num + _node_num; - ArcsType f = _arc_num + _node_num; - for (ArcsType u = 0, e = _arc_num; u != static_cast(_node_num); ++u, ++e) { - _parent[u] = _root; - _thread[u] = u + 1; - _rev_thread[u + 1] = u; - _succ_num[u] = 1; - _last_succ[u] = u; - if (_supply[u] >= 0) { - _forward[u] = true; - _pi[u] = 0; - _pred[u] = e; - _source[e] = u; - _target[e] = _root; - _flow[e] = _supply[u]; - _cost[e] = 0; - _state[e] = STATE_TREE; - } else { - 
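                // Negative supply: route the deficit through an artificial root
                // arc f (cost ART_COST, placed on the spanning tree), and leave
                // the plain arc e at its lower bound with zero flow.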
_forward[u] = false; - _pi[u] = ART_COST; - _pred[u] = f; - _source[f] = _root; - _target[f] = u; - _flow[f] = -_supply[u]; - _cost[f] = ART_COST; - _state[f] = STATE_TREE; - _source[e] = u; - _target[e] = _root; - //_flow[e] = 0; //by default, the sparse matrix is empty - _cost[e] = 0; - _state[e] = STATE_LOWER; - ++f; - } - } - _all_arc_num = f; - } else { - // GEQ supply constraints - _search_arc_num = _arc_num + _node_num; - ArcsType f = _arc_num + _node_num; - for (ArcsType u = 0, e = _arc_num; u != static_cast(_node_num); ++u, ++e) { - _parent[u] = _root; - _thread[u] = u + 1; - _rev_thread[u + 1] = u; - _succ_num[u] = 1; - _last_succ[u] = u; - if (_supply[u] <= 0) { - _forward[u] = false; - _pi[u] = 0; - _pred[u] = e; - _source[e] = _root; - _target[e] = u; - _flow[e] = -_supply[u]; - _cost[e] = 0; - _state[e] = STATE_TREE; - } else { - _forward[u] = true; - _pi[u] = -ART_COST; - _pred[u] = f; - _source[f] = u; - _target[f] = _root; - _flow[f] = _supply[u]; - _state[f] = STATE_TREE; - _cost[f] = ART_COST; - _source[e] = _root; - _target[e] = u; - //_flow[e] = 0; //by default, the sparse matrix is empty - _cost[e] = 0; - _state[e] = STATE_LOWER; - ++f; - } - } - _all_arc_num = f; - } - - return true; - } - - // Find the join node - void findJoinNode() { - int u = _source[in_arc]; - int v = _target[in_arc]; - while (u != v) { - if (_succ_num[u] < _succ_num[v]) { - u = _parent[u]; - } else { - v = _parent[v]; - } - } - join = u; - } - - // Find the leaving arc of the cycle and returns true if the - // leaving arc is not the same as the entering arc - bool findLeavingArc() { - // Initialize first and second nodes according to the direction - // of the cycle - if (_state[in_arc] == STATE_LOWER) { - first = _source[in_arc]; - second = _target[in_arc]; - } else { - first = _target[in_arc]; - second = _source[in_arc]; - } - delta = INF; - char result = 0; - Value d; - ArcsType e; - - // Search the cycle along the path form the first node to the root - for (auto u = first; u != join; u = _parent[u]) { - e = _pred[u]; - d = _forward[u] ? _flow[e] : INF; - if (d < delta) { - delta = d; - u_out = u; - result = 1; - } - } - // Search the cycle along the path form the second node to the root - for (int u = second; u != join; u = _parent[u]) { - e = _pred[u]; - d = _forward[u] ? INF : _flow[e]; - if (d <= delta) { - delta = d; - u_out = u; - result = 2; - } - } - - if (result == 1) { - u_in = first; - v_in = second; - } else { - u_in = second; - v_in = first; - } - return result != 0; - } - - // Change _flow and _state vectors - void changeFlow(bool change) { - // Augment along the cycle - if (delta > 0) { - Value val = _state[in_arc] * delta; - _flow[in_arc] += val; - for (auto u = _source[in_arc]; u != join; u = _parent[u]) { - _flow[_pred[u]] += _forward[u] ? -val : val; - } - for (auto u = _target[in_arc]; u != join; u = _parent[u]) { - _flow[_pred[u]] += _forward[u] ? val : -val; - } - } - // Update the state of the entering and leaving arcs - if (change) { - _state[in_arc] = STATE_TREE; - _state[_pred[u_out]] = - (_flow[_pred[u_out]] == 0) ? 
STATE_LOWER : STATE_UPPER; - } else { - _state[in_arc] = -_state[in_arc]; - } - } - - // Update the tree structure - void updateTreeStructure() { - int old_rev_thread = _rev_thread[u_out]; - int old_succ_num = _succ_num[u_out]; - int old_last_succ = _last_succ[u_out]; - v_out = _parent[u_out]; - - // Check if u_in and u_out coincide - if (u_in == u_out) { - // Update _parent, _pred, _pred_dir - _parent[u_in] = v_in; - _pred[u_in] = in_arc; - _forward[u_in] = (u_in == _source[in_arc]); - - // Update _thread and _rev_thread - if (_thread[v_in] != u_out) { - ArcsType after = _thread[old_last_succ]; - _thread[old_rev_thread] = after; - _rev_thread[after] = old_rev_thread; - after = _thread[v_in]; - _thread[v_in] = u_out; - _rev_thread[u_out] = v_in; - _thread[old_last_succ] = after; - _rev_thread[after] = old_last_succ; - } - } else { - // Handle the case when old_rev_thread equals to v_in - // (it also means that join and v_out coincide) - int thread_continue = old_rev_thread == static_cast(v_in) ? - _thread[old_last_succ] : _thread[v_in]; - - // Update _thread and _parent along the stem nodes (i.e. the nodes - // between u_in and u_out, whose parent have to be changed) - int stem = u_in; // the current stem node - int par_stem = v_in; // the new parent of stem - int next_stem; // the next stem node - int last = _last_succ[u_in]; // the last successor of stem - int before, after = _thread[last]; - _thread[v_in] = u_in; - _dirty_revs.clear(); - _dirty_revs.push_back(v_in); - while (stem != static_cast(u_out)) { - // Insert the next stem node into the thread list - next_stem = _parent[stem]; - _thread[last] = next_stem; - _dirty_revs.push_back(last); - - // Remove the subtree of stem from the thread list - before = _rev_thread[stem]; - _thread[before] = after; - _rev_thread[after] = before; - - // Change the parent node and shift stem nodes - _parent[stem] = par_stem; - par_stem = stem; - stem = next_stem; - - // Update last and after - last = _last_succ[stem] == _last_succ[par_stem] ? - _rev_thread[par_stem] : _last_succ[stem]; - after = _thread[last]; - } - _parent[u_out] = par_stem; - _thread[last] = thread_continue; - _rev_thread[thread_continue] = last; - _last_succ[u_out] = last; - - // Remove the subtree of u_out from the thread list except for - // the case when old_rev_thread equals to v_in - if (old_rev_thread != v_in) { - _thread[old_rev_thread] = after; - _rev_thread[after] = old_rev_thread; - } - - // Update _rev_thread using the new _thread values - for (int i = 0; i != int(_dirty_revs.size()); ++i) { - int u = _dirty_revs[i]; - _rev_thread[_thread[u]] = u; - } - - // Update _pred, _pred_dir, _last_succ and _succ_num for the - // stem nodes from u_out to u_in - int tmp_sc = 0, tmp_ls = _last_succ[u_out]; - for (int u = u_out, p = _parent[u]; u != static_cast(u_in); u = p, p = _parent[u]) { - _pred[u] = _pred[p]; - _forward[u] = !_forward[p]; - tmp_sc += _succ_num[u] - _succ_num[p]; - _succ_num[u] = tmp_sc; - _last_succ[p] = tmp_ls; - } - _pred[u_in] = in_arc; - _forward[u_in] = (static_cast(u_in) == _source[in_arc]); - _succ_num[u_in] = old_succ_num; - } - - // Update _last_succ from v_in towards the root - int up_limit_out = static_cast(_last_succ[join]) == v_in ? 
join : -1; - int last_succ_out = _last_succ[u_out]; - for (int u = v_in; u != -1 && _last_succ[u] == v_in; u = _parent[u]) { - _last_succ[u] = last_succ_out; - } - - // Update _last_succ from v_out towards the root - if (static_cast(join) != old_rev_thread && static_cast(v_in) != old_rev_thread) { - for (int u = v_out; u != up_limit_out && _last_succ[u] == old_last_succ; - u = _parent[u]) { - _last_succ[u] = old_rev_thread; - } - } else if (last_succ_out != old_last_succ) { - for (int u = v_out; u != up_limit_out && _last_succ[u] == old_last_succ; - u = _parent[u]) { - _last_succ[u] = last_succ_out; - } - } - - // Update _succ_num from v_in to join - for (int u = v_in; u != static_cast(join); u = _parent[u]) { - _succ_num[u] += old_succ_num; - } - // Update _succ_num from v_out to join - for (int u = v_out; u != static_cast(join); u = _parent[u]) { - _succ_num[u] -= old_succ_num; - } - } - - void updatePotential() { - Cost sigma = _pi[v_in] - _pi[u_in] - - ((_forward[u_in])?_cost[in_arc]:(-_cost[in_arc])); - int end = _thread[_last_succ[u_in]]; - for (int u = u_in; u != end; u = _thread[u]) { - _pi[u] += sigma; - } - } - - - // Heuristic initial pivots - bool initialPivots() { - Value curr, total = 0; - std::vector supply_nodes, demand_nodes; - Node u; _graph.first(u); - for (; u != INVALIDNODE; _graph.next(u)) { - curr = _supply[_node_id(u)]; - if (curr > 0) { - total += curr; - supply_nodes.push_back(u); - } else if (curr < 0) { - demand_nodes.push_back(u); - } - } - if (_sum_supply > 0) total -= _sum_supply; - if (total <= 0) return true; - - ArcVector arc_vector; - if (_sum_supply >= 0) { - if (supply_nodes.size() == 1 && demand_nodes.size() == 1) { - // Perform a reverse graph search from the sink to the source - //typename GR::template NodeMap reached(_graph, false); - BoolVector reached(_node_num, false); - Node s = supply_nodes[0], t = demand_nodes[0]; - std::vector stack; - reached[t] = true; - stack.push_back(t); - while (!stack.empty()) { - Node u, v = stack.back(); - stack.pop_back(); - if (v == s) break; - Arc a; _graph.firstIn(a, v); - for (; a != INVALID; _graph.nextIn(a)) { - if (reached[u = _graph.source(a)]) continue; - ArcsType j = getArcID(a); - arc_vector.push_back(j); - reached[u] = true; - stack.push_back(u); - } - } - } else { - arc_vector.resize(demand_nodes.size()); - // Find the min. cost incomming arc for each demand node -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (ArcsType i = 0; i < ArcsType(demand_nodes.size()); ++i) { - Node v = demand_nodes[i]; - Cost min_cost = std::numeric_limits::max(); - Arc min_arc = INVALID; - Arc a; _graph.firstIn(a, v); - for (; a != INVALID; _graph.nextIn(a)) { - Cost c = _cost[getArcID(a)]; - if (c < min_cost) { - min_cost = c; - min_arc = a; - } - } - arc_vector[i] = getArcID(min_arc); - } - arc_vector.erase(std::remove(arc_vector.begin(), arc_vector.end(), INVALID), arc_vector.end()); - } - } else { - arc_vector.resize(supply_nodes.size()); - // Find the min. 
cost outgoing arc for each supply node -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (int i = 0; i < int(supply_nodes.size()); ++i) { - Node u = supply_nodes[i]; - Cost min_cost = std::numeric_limits::max(); - Arc min_arc = INVALID; - Arc a; _graph.firstOut(a, u); - for (; a != INVALID; _graph.nextOut(a)) { - Cost c = _cost[getArcID(a)]; - if (c < min_cost) { - min_cost = c; - min_arc = a; - } - } - arc_vector[i] = getArcID(min_arc); - } - arc_vector.erase(std::remove(arc_vector.begin(), arc_vector.end(), INVALID), arc_vector.end()); - } - - // Perform heuristic initial pivots - for (ArcsType i = 0; i != ArcsType(arc_vector.size()); ++i) { - in_arc = arc_vector[i]; - if (_state[in_arc] * (_cost[in_arc] + _pi[_source[in_arc]] - - _pi[_target[in_arc]]) >= 0) continue; - findJoinNode(); - bool change = findLeavingArc(); - if (delta >= MAX_VAL) return false; - changeFlow(change); - if (change) { - updateTreeStructure(); - updatePotential(); - } - } - return true; - } - - // Execute the algorithm - ProblemType start() { - return start(); - } - - template - ProblemType start() { - PivotRuleImpl pivot(*this); - - // Perform heuristic initial pivots - if (!initialPivots()) return UNBOUNDED; - - size_t iter_number = 0; - // Execute the Network Simplex algorithm - while (pivot.findEnteringArc()) { - if ((iter_number <= max_iter&&max_iter > 0) || max_iter<=0) { - iter_number++; - findJoinNode(); - bool change = findLeavingArc(); - if (delta >= MAX_VAL) return UNBOUNDED; - changeFlow(change); - if (change) { - updateTreeStructure(); - updatePotential(); - } - } else break; - } - - // Check feasibility - for (ArcsType e = _search_arc_num; e != _all_arc_num; ++e) { - if (_flow[e] != 0) return INFEASIBLE; - } - - // Shift potentials to meet the requirements of the GEQ/LEQ type - // optimality conditions - if (_sum_supply == 0) { - if (_stype == GEQ) { - Cost max_pot = -std::numeric_limits::max(); - for (ArcsType i = 0; i != static_cast(_node_num); ++i) { - if (_pi[i] > max_pot) max_pot = _pi[i]; - } - if (max_pot > 0) { - for (ArcsType i = 0; i != static_cast(_node_num); ++i) - _pi[i] -= max_pot; - } - } else { - Cost min_pot = std::numeric_limits::max(); - for (ArcsType i = 0; i != static_cast(_node_num); ++i) { - if (_pi[i] < min_pot) min_pot = _pi[i]; - } - if (min_pot < 0) { - for (ArcsType i = 0; i != static_cast(_node_num); ++i) - _pi[i] -= min_pot; - } - } - } - - return OPTIMAL; - } - - }; //class NetworkSimplexSimple - - ///@} - -} //namespace lemon - -#endif //LEMON_NETWORK_SIMPLEX_H diff --git a/python/pycs.cpp b/python/pycs.cpp index 928a853f..3d903986 100644 --- a/python/pycs.cpp +++ b/python/pycs.cpp @@ -1,5 +1,6 @@ #include "pyfgc.h" -#include "minocore/matrix_coreset.h" +#include "minocore/coreset/matrix_coreset.h" +#include "pybind11/numpy.h" using CSType = coresets::CoresetSampler; using FNA = py::array_t; @@ -10,13 +11,21 @@ void init_coreset(py::module &m) { py::class_(m, "CoresetSampler") .def(py::init<>()) .def("make_sampler", []( - CSType &cs, size_t ncenters, FNA costs, INA assignments, py::object weights, uint64_t seed) + CSType &cs, size_t ncenters, py::array costs, INA assignments, py::object weights, uint64_t seed, minocore::coresets::SensitivityMethod sens) { py::buffer_info buf1 = costs.request(), asb = assignments.request(); if(buf1.ndim != 1) throw std::runtime_error("buffer must have one dimension (reshape if necessary)"); float *wp = nullptr; if(auto p(pybind11::cast(weights)); p) wp = static_cast(p.request().ptr); - cs.make_sampler(ncenters, costs.shape(0), 
(float *)buf1.ptr, (uint32_t *)asb.ptr, wp, seed); - }); + if(py::isinstance>(costs)) { + cs.make_sampler(ncenters, costs.shape(0), (float *)buf1.ptr, (uint32_t *)asb.ptr, wp, seed, sens); + } else { + cs.make_sampler(ncenters, costs.shape(0), (double *)buf1.ptr, (uint32_t *)asb.ptr, wp, seed, sens); + } + }, + "Generates a coreset sampler given a set of costs, assignments, and, optionally, weights. This can be used to generate an index coreset", + py::arg("ncenters"), py::arg("costs"), py::arg("assignments"), + py::arg("weights") = py::cast(Py_None), py::arg("seed") = 13, py::arg("sens")=minocore::coresets::BFL); + } diff --git a/python/pyfgc.cpp b/python/pyfgc.cpp index 970e6e37..e63f505d 100644 --- a/python/pyfgc.cpp +++ b/python/pyfgc.cpp @@ -3,6 +3,8 @@ PYBIND11_MODULE(pyfgc, m) { init_ex1(m); + init_coreset(m); m.doc() = "Python bindings for FGC, which allows for calling coreset/clustering code from numpy and converting results back to numpy arrays"; } - +void init_ex1(py::module &) { +} diff --git a/python/pyfgc.h b/python/pyfgc.h index 58059232..ccb0f2f6 100644 --- a/python/pyfgc.h +++ b/python/pyfgc.h @@ -2,7 +2,7 @@ #include "pybind11/pybind11.h" #include "pybind11/numpy.h" #include "aesctr/wy.h" -#include "minocore/matrix_coreset.h" +#include "minocore/minocore.h" using namespace minocore; namespace py = pybind11; void init_ex1(py::module &); diff --git a/python/setup.py b/python/setup.py index 34f9f0a3..023a7c8f 100644 --- a/python/setup.py +++ b/python/setup.py @@ -26,7 +26,7 @@ def __str__(self): extra_compile_args = ['-march=native', '-Wno-char-subscripts', '-Wno-unused-function', '-Wno-strict-aliasing', '-Wno-ignored-attributes', '-fno-wrapv', - '-lz', '-fopenmp', '-lgomp'] + '-lz', '-fopenmp', "-lgomp"] if 'BOOST_DIR' in environ: extra_compile_args.append("-I%s" % environ['BOOST_DIR']) @@ -83,7 +83,7 @@ def cpp_flag(compiler): 'is needed!') -extra_link_opts = ["-lgomp", "-lz"] +extra_link_opts = ["-fopenmp", "-lgomp", "-lz"] class BuildExt(build_ext): """A custom build extension for adding compiler-specific options.""" @@ -125,7 +125,7 @@ def build_extensions(self): author='Daniel Baker', author_email='dnb@cs.jhu.edu', url='https://github.com/dnbaker/pyfgc', - description='A python module for constructing and comparing HyperLogLogs', + description='A python module for stuff', long_description='', ext_modules=ext_modules, install_requires=['pybind11>=2.4'], diff --git a/scripts/hdf2file.py b/scripts/hdf2file.py new file mode 100644 index 00000000..ce1179eb --- /dev/null +++ b/scripts/hdf2file.py @@ -0,0 +1,14 @@ +import h5py +import numpy as np +import sys + +infname = sys.argv[1] +key = sys.argv[2] if sys.argv[2:] else "matrix" +prefix = "data" if not sys.argv[3:] else sys.argv[3] + +f = h5py.File(infname, "r") +print(f.keys()) +group = f[key] +for comp in ["shape", "indices", "indptr", "data"]: + with open(prefix + '.' 
+ comp, "w") as f: + np.array(group[comp]).tofile(f) diff --git a/src/clustertest.cpp b/src/clustertest.cpp new file mode 100644 index 00000000..4914c2bd --- /dev/null +++ b/src/clustertest.cpp @@ -0,0 +1,78 @@ +#include "blaze/math/DynamicMatrix.h" +#include "aesctr/wy.h" +#include "minocore/clustering.h" + +using namespace minocore; + +template +blaze::DynamicMatrix parse_file(std::string path, unsigned *num_clusters) { + std::ifstream ifs(path); + std::string line; + if(!std::getline(ifs, line)) throw 1; + size_t nr = std::atoi(line.data()); + size_t nc = std::atoi(std::strchr(line.data(), '/') + 1); + *num_clusters = std::atoi(std::strchr(std::strchr(line.data(), '/') + 1, '/') + 1); + blaze::DynamicMatrix ret(nr, nc); + size_t row_index = 0; + while(std::getline(ifs, line)) { + auto r = row(ret, row_index++); + char *ptr = line.data(); + for(size_t col_index = 0;col_index < nc;r[col_index++] = std::strtod(ptr, &ptr)); + } + assert(row_index == nr); + return ret; +} + + +template +int metamain(int argc, char **argv) { + int ret = 0; + unsigned k; + std::string inpath = "random.out"; + if(argc > 1) inpath = argv[1]; + auto pointmat = parse_file(inpath, &k); + + std::cerr << "Parsed matrix of " << pointmat.rows() << " rows and " + << pointmat.columns() << " columns, with k = " << k << " clusters\n"; + auto jsdapp = make_probdiv_applicator(pointmat, blz::SQRL2); + std::cerr << "Made probdiv applicator\n"; + auto clusterdata = clustering::perform_clustering< + is_hard ? clustering::HARD: clustering::SOFT, clustering::EXTRINSIC>(jsdapp, k); + shared::flat_hash_map> labels, clabels; + std::ifstream ifs(inpath + ".labels.txt"); + size_t lno = 0; + for(std::string l; std::getline(ifs, l);) { + labels[std::atoi(l.data())].insert(lno++); + } + auto &asn = std::get<1>(clusterdata); + if constexpr(is_hard) { + for(size_t i = 0; i < asn.size(); ++i) { + clabels[asn[i]].insert(i); + } + shared::flat_hash_set sizes, csizes; + for(const auto &l: labels) sizes.insert(l.second.size()); + for(const auto &l: clabels) csizes.insert(l.second.size()); + std::cerr << "sizes size: " << sizes.size() << '\n'; + std::cerr << "csizes size: " << csizes.size() << '\n'; + assert(sizes.size() == csizes.size() && *sizes.begin() == *csizes.begin()); + // TODO: ensure that items are correctly clustered + } + return ret; +} + +int main(int argc, char **argv) { + int rc; + if((rc = metamain(argc, argv))) return rc; + if((rc = metamain(argc, argv))) return rc; + // Next: test Bregman clustering (HARD) + // Next: test Bregman clustering (SOFT) + // Next: test LLR (HARD) + // Next: test LLR (SOFT) + // Next: test TVD (HARD) + // Next: test TVD (SOFT) + // Next: test L1 (HARD) + // Next: test L1 (SOFT) + // Next: test metric k-median (universal dispatch) + throw std::runtime_error("Not completed!"); + return rc; +} diff --git a/src/csctest.cpp b/src/csctest.cpp index 06f431ae..0db0985c 100644 --- a/src/csctest.cpp +++ b/src/csctest.cpp @@ -1,9 +1,70 @@ #include "minocore/util/csc.h" +template +void dothing(std::string path) { + auto read = minocore::csc2sparse(path); + std::fprintf(stderr, "nr: %zu. nc: %zu. 
nnz: %zu\n", read.rows(), read.columns(), read.nonZeros()); +} + +enum VT { + U32, + U64, + F32, + F64 +}; + +VT c2v(std::string key) { + if(key == "u32") return U32; + if(key == "u64") return U64; + if(key == "f32") return F32; + if(key == "f64") return F64; + throw 1; + return F64; +} + using namespace minocore; int main(int argc, char *argv[]) { std::string inpath; - if(argc > 1) inpath = argv[1]; - auto read = csc2sparse(inpath); - std::fprintf(stderr, "nr: %zu. nc: %zu. nnz: %zu\n", read.rows(), read.columns(), read.nonZeros()); + VT ip = U64; + VT id = U64; + VT dt = F32; + for(int c;(c = getopt(argc, argv, "p:i:d:h")) >= 0;) { + switch(c) { + case 'p': ip = c2v(optarg); break; + case 'i': id = c2v(optarg); break; + case 'd': dt = c2v(optarg); break; + } + } + // Use as ./csctest -pu32 -iu32 -df32 cao_atlas_ + if(optind != argc) inpath = argv[optind]; + if(dt != U32 && dt != F32) throw std::runtime_error("Not supported: datatype other than f32 or u32"); + if(ip == U64) { + if(id == U64) { + if(dt == U32) { + dothing(inpath); + } else if(dt == F32) { + dothing(inpath); + } + } else { + if(dt == U32) { + dothing(inpath); + } else if(dt == F32) { + dothing(inpath); + } + } + } else { + if(id == U64) { + if(dt == U32) { + dothing(inpath); + } else if(dt == F32) { + dothing(inpath); + } + } else { + if(dt == U32) { + dothing(inpath); + } else if(dt == F32) { + dothing(inpath); + } + } + } } diff --git a/src/diskmattest.cpp b/src/diskmattest.cpp index d3924f6f..f7109234 100644 --- a/src/diskmattest.cpp +++ b/src/diskmattest.cpp @@ -1,8 +1,9 @@ #include "minocore/dist.h" -#include "minocore/util/diskmat.h" +#include "diskmat/diskmat.h" using namespace minocore; using namespace blz; +using diskmat::DiskMat; int main() { std::srand(0); @@ -58,7 +59,6 @@ int main() { std::cout << r1; std::cout << r0; std::fprintf(stderr, "Wasserstein distance between rows 1 and 2: %g\n", distance::p_wasserstein(r1, r0)); - std::fprintf(stderr, "Wasserstein distance between rows 1 and 2: %g\n", distance::network_p_wasserstein(r1, r0)); #if 0 std::fprintf(stderr, "multinomial jsd: %f\n", distance::multinomial_jsd(r1, r0)); std::fprintf(stderr, "multinomial jsd: %f\n", distance::multinomial_jsd(c1, c0)); diff --git a/src/dmlsearch.cpp b/src/dmlsearch.cpp index 761eaecc..8b13e92b 100644 --- a/src/dmlsearch.cpp +++ b/src/dmlsearch.cpp @@ -1,7 +1,7 @@ //#define VERBOSE_AF 1 #include "minocore/graph/graphdist.h" #include "minocore/optim/lsearch.h" -#include "minocore/util/diskmat.h" +#include "diskmat/diskmat.h" #include using namespace minocore; diff --git a/src/geomedtest.cpp b/src/geomedtest.cpp index 6e6f6f2a..835a8098 100644 --- a/src/geomedtest.cpp +++ b/src/geomedtest.cpp @@ -66,7 +66,7 @@ int main(int c, char **a) { manstop = t(); std::fprintf(stderr, "Manual l1 distances time: %zu/%g. reduction-based: %zu/%g\n", size_t((stop - start).count() / 1000), cwmed, size_t((manstop - manstart).count() / 1000), cwmed2); auto l1_approx_start = t(); - minocore::coresets::l1_median(m, v3, static_cast(nullptr), true); + minocore::coresets::l1_median(m, v3, static_cast(nullptr)); auto l1_approx_stop = t(); std::fprintf(stderr, "Time to compute exact l1 median: %gms. 
Approx: %gms.\n", (l1_stop - l1_start).count() * 1.e-6, (l1_approx_stop - l1_approx_start).count() * 1.e-6); std::cout << "L1 dist under geomedian: " << l1dist(m, v) << '\n'; diff --git a/src/hdf2dm.cpp b/src/hdf2dm.cpp index 3fe5e032..e3ecda2f 100644 --- a/src/hdf2dm.cpp +++ b/src/hdf2dm.cpp @@ -27,9 +27,11 @@ int main(int argc, char *argv[]) { // TODO: extract to binary file, then iterate over the file. std::string inpath = "5k_pbmc_protein_v3_raw_feature_bc_matrix.h5"; std::string outpref = ""; + std::string key = "matrix"; if(argc > 1) inpath = argv[1]; + if(argc > 2) key = argv[2]; H5::H5File file(inpath.data(), H5F_ACC_RDONLY ); - auto group = H5::Group(file.openGroup("matrix")); + auto group = H5::Group(file.openGroup(key.data())); auto shape = group.openDataSet("shape"); assert(shape.getIntType().getSize() == 4); uint32_t shape_out[2]; diff --git a/src/jsdtest.cpp b/src/jsdtest.cpp index 04cbed43..5201c982 100644 --- a/src/jsdtest.cpp +++ b/src/jsdtest.cpp @@ -8,6 +8,16 @@ using namespace blz; #define FLOAT_TYPE double #endif +#ifndef INDICESTYPE +#define INDICESTYPE uint64_t +#endif +#ifndef INDPTRTYPE +#define INDPTRTYPE uint64_t +#endif +#ifndef DATATYPE +#define DATATYPE uint32_t +#endif + int main(int argc, char *argv[]) { if(std::find_if(argv, argv + argc, [](auto x) {return std::strcmp(x, "-h") == 0 || std::strcmp(x, "--help") == 0;}) != argv + argc) { @@ -21,7 +31,7 @@ int main(int argc, char *argv[]) { input = argv[3]; std::ofstream ofs("output.txt"); auto sparsemat = input.size() ? minocore::mtx2sparse(input) - : minocore::csc2sparse("", true); + : minocore::csc2sparse("", true); std::vector nonemptyrows; size_t i = 0; while(nonemptyrows.size() < 25) { @@ -30,7 +40,7 @@ int main(int argc, char *argv[]) { ++i; } blz::SM first25 = rows(sparsemat, nonemptyrows.data(), nonemptyrows.size()); - auto jsd = minocore::jsd::make_jsm_applicator(first25); + auto jsd = minocore::jsd::make_probdiv_applicator(first25, jsd::JSM, jsd::DIRICHLET); //auto jsddistmat = jsd.make_distance_matrix(); dm::DistanceMatrix utdm(first25.rows()); jsd.set_distance_matrix(utdm); @@ -77,12 +87,10 @@ int main(int argc, char *argv[]) { timer.restart("1ksparsekl"); jsd2.set_distance_matrix(jsd_bnj, minocore::jsd::MKL, true); timer.report(); - std::cout << "Multinomial KL\n" << '\n'; //std::cout << jsd_bnj << '\n'; timer.restart("1ksparseL1"); - jsd2.set_distance_matrix(jsd_bnj, minocore::jsd::EMD, true); + jsd2.set_distance_matrix(jsd_bnj, minocore::jsd::L1, true); timer.report(); - std::cout << "EMD: " << jsd_bnj << '\n'; #if 0 timer.restart("1ldensejsd"); blz::DM densefirst25 = first25; @@ -93,8 +101,8 @@ int main(int argc, char *argv[]) { //ofs << jsd_bnj << '\n'; ofs.flush(); std::fprintf(stderr, "Starting jsm\n"); - timer.restart("1ksparsejsm"); - jsd2.set_distance_matrix(jsd_bnj, minocore::jsd::L1); + timer.restart("1ksparsetvd"); + jsd2.set_distance_matrix(jsd_bnj, minocore::jsd::TVD); timer.report(); timer.reset(); ofs << "JS Metric: \n"; diff --git a/src/kmpptest.cpp b/src/kmpptest.cpp index f3a5f04c..01c0cad4 100644 --- a/src/kmpptest.cpp +++ b/src/kmpptest.cpp @@ -1,5 +1,6 @@ #include "minocore/optim/kmeans.h" #include "minocore/optim/kcenter.h" +#include "minocore/coreset/kcenter.h" #include "minocore/dist/applicator.h" #include #include @@ -8,7 +9,7 @@ #include #endif -#define t std::chrono::high_resolution_clock::now +auto t() {return std::chrono::high_resolution_clock::now();} #ifndef FLOAT_TYPE #define FLOAT_TYPE double @@ -22,19 +23,19 @@ template void test_kccs(Mat &mat, RNG &rng, size_t 
diff --git a/src/kmpptest.cpp b/src/kmpptest.cpp
index f3a5f04c..01c0cad4 100644
--- a/src/kmpptest.cpp
+++ b/src/kmpptest.cpp
@@ -1,5 +1,6 @@
 #include "minocore/optim/kmeans.h"
 #include "minocore/optim/kcenter.h"
+#include "minocore/coreset/kcenter.h"
 #include "minocore/dist/applicator.h"
 #include
 #include
@@ -8,7 +9,7 @@
 #include
 #endif
 
-#define t std::chrono::high_resolution_clock::now
+auto t() {return std::chrono::high_resolution_clock::now();}
 
 #ifndef FLOAT_TYPE
 #define FLOAT_TYPE double
 #endif
@@ -22,19 +23,19 @@
 template<typename Mat, typename RNG> void test_kccs(Mat &mat, RNG &rng, size_t npoints, double eps) {
     auto matrowit = blz::rowiterator(mat);
     auto start = t();
-    double gamma = 100. / mat.rows();
+    double gamma = 500. / mat.rows();
     if(gamma >= 0.5) gamma = 0.05;
-    auto cs = outliers::kcenter_coreset(matrowit.begin(), matrowit.end(), rng, npoints, eps,
+    auto cs = kcenter_coreset_outliers(matrowit.begin(), matrowit.end(), rng, npoints, eps,
                                         /*mu=*/0.5, 1.5, gamma);
     auto maxv = *std::max_element(cs.indices_.begin(), cs.indices_.end());
     std::fprintf(stderr, "max index: %u\n", unsigned(maxv));
     auto stop = t();
-    std::fprintf(stderr, "kcenter coreset took %0.12gs\n", double((stop - start).count()) / 1e9);
+    std::fprintf(stderr, "kcenter coreset took %0.12gms\n", util::timediff2ms(stop, start));
     start = t();
     auto csmat = index2matrix(cs, mat);
     stop = t();
-    std::fprintf(stderr, "kcenter compacting to coreset took %0.12gs\n", double((stop - start).count()) / 1e9);
+    std::fprintf(stderr, "kcenter compacting to coreset took %0.12gms\n", util::timediff2ms(stop, start));
 }
 
 int main(int argc, char *argv[]) {
@@ -48,7 +49,6 @@ int main(int argc, char *argv[]) {
         nt = std::atoi(env);
     }
     OMP_ONLY(omp_set_num_threads(nt);)
-    std::fprintf(stderr, "%d threads used\n", nt);
 #endif
     std::srand(0);
     size_t n = argc == 1 ? 250000: std::atoi(argv[1]);
@@ -107,7 +107,8 @@ int main(int argc, char *argv[]) {
     assert(min(sqmat) > 0.);
     {
         auto greedy_metric = kcenter_greedy_2approx(blz::rowiterator(sqmat).begin(), blz::rowiterator(sqmat).end(),
-                                                    gen, /*k=*/3, MatrixLookup{});
+                                                    gen, /*k=*/npoints, MatrixLookup{});
+        kcenter_greedy_2approx_outliers(blz::rowiterator(sqmat).begin(), blz::rowiterator(sqmat).end(), gen, /*k=*/npoints, eps, .001, MatrixLookup{});
     }
     auto kmpp_asn = std::move(std::get<1>(centers));
     std::vector<uint32_t> counts(npoints);
diff --git a/src/knntest.cpp b/src/knntest.cpp
new file mode 100644
index 00000000..f03e3146
--- /dev/null
+++ b/src/knntest.cpp
@@ -0,0 +1,12 @@
+#include "include/minocore/dist/knngraph.h"
+
+int main() {
+    blaze::DynamicMatrix<float> mat = blaze::generate(1000, 50, [](auto x, auto y) {
+        return float(std::rand()) / RAND_MAX + (x * y) / 1000. / 50.;
+    });
+    auto app = minocore::jsd::make_probdiv_applicator(mat, blz::distance::L1);
+    auto knns = minocore::make_knns(app, 10);
+    auto graph = minocore::knns2graph(knns, app.size(), true);
+    auto mst = minocore::knng2mst(graph);
+    std::fprintf(stderr, "mst size: %zu edges vs %zu nodes\n", mst.size(), app.size());
+}
diff --git a/src/mtxparse.cpp b/src/mtxparse.cpp
index 7ae8a20e..451dbdab 100644
--- a/src/mtxparse.cpp
+++ b/src/mtxparse.cpp
@@ -1,4 +1,4 @@
-#include "minocore/csc.h"
+#include "minocore/util/csc.h"
 #include
 #include "blaze/util/Serialization.h"
 #include
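The new sparsepriortest below pins JSD values computed under the DIRICHLET prior on sparse count rows. For intuition, a standalone sketch of Jensen-Shannon divergence with additive (Dirichlet-style) smoothing follows; the pseudocount convention minocore uses is not visible in this diff, so a unit pseudocount is assumed here, and the test's constants should not be expected to reproduce exactly:

#include <cmath>
#include <cstdio>
#include <vector>

// JSD(P, Q) = 0.5 * KL(P||M) + 0.5 * KL(Q||M) with M = (P + Q) / 2,
// computed on counts smoothed by an additive pseudocount before normalizing.
double smoothed_jsd(const std::vector<double> &x, const std::vector<double> &y, double pseudo = 1.) {
    double xs = 0., ys = 0.;
    for(size_t i = 0; i < x.size(); ++i) xs += x[i] + pseudo, ys += y[i] + pseudo;
    double ret = 0.;
    for(size_t i = 0; i < x.size(); ++i) {
        const double p = (x[i] + pseudo) / xs, q = (y[i] + pseudo) / ys, m = .5 * (p + q);
        ret += .5 * (p * std::log(p / m) + q * std::log(q / m));
    }
    return ret;
}

int main() {
    // The two rows from sparsepriortest's first matrix.
    std::vector<double> a{1., 5., 0., 3., 1., 1., 1., 3., 1., 1.};
    std::vector<double> b{1., 1., 3., 2., 2., 0., 21., 1., 7., 1.};
    std::fprintf(stderr, "smoothed JSD: %g\n", smoothed_jsd(a, b));
}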
diff --git a/src/sparsepriortest.cpp b/src/sparsepriortest.cpp
new file mode 100644
index 00000000..970ebb2b
--- /dev/null
+++ b/src/sparsepriortest.cpp
@@ -0,0 +1,20 @@
+#include "minocore/dist/applicator.h"
+
+int main() {
+    blaze::CompressedMatrix<double> cm{{1., 5., 0., 3., 1., 1., 1., 3., 1., 1}, {1., 1., 3., 2., 2., 0., 21., 1., 7., 1.}};
+    std::cerr << cm << '\n';
+    auto app = minocore::make_probdiv_applicator(cm, blz::JSD, minocore::jsd::DIRICHLET);
+    assert(std::abs(app(0, 1) - 0.16066042325849725) < 1e-10 || !std::fprintf(stderr, "got %g vs %g\n", app(0, 1), 0.16066042325849725));
+    blaze::CompressedMatrix<double> cm2{
+        {0, 7, 6, 0, 6, 6, 0, 0, 7, 9, 4, 0, 0, 0, 6, 6, 0, 0, 0, 7},
+        {6, 7, 0, 0, 0, 5, 6, 9, 0, 0, 0, 0, 0, 9, 0, 6, 5, 6, 0, 0}
+    };
+    auto r1 = row(cm2, 0);
+    auto r2 = row(cm2, 1);
+    assert(blz::number_shared_zeros(r1, r2) == 4);
+    auto app2 = minocore::make_probdiv_applicator(cm2, blz::JSD, minocore::jsd::DIRICHLET);
+    double v2 = app2(0, 1);
+    static constexpr double correct2 = 0.2307775339934756;
+    assert(std::abs(correct2 - v2) < 1e-10);
+    //std::fprintf(stderr, "difference: %0.12g\n", correct2 - v2);
+}
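On the number_shared_zeros assertion: the coordinates that are zero in both rows of cm2 are {3, 11, 12, 18}, hence 4, and on sparse vectors this can be counted without touching a single zero, since shared zeros = dimension minus the size of the union of the two nonzero index sets. A small self-contained check of that identity (hypothetical helper, independent of how blz::number_shared_zeros is actually implemented):

#include <cassert>
#include <cstddef>
#include <set>
#include <vector>

// Shared zeros of two d-dimensional sparse vectors given their nonzero
// index sets: d minus the size of the union of those sets.
size_t shared_zeros(const std::vector<size_t> &nz1, const std::vector<size_t> &nz2, size_t d) {
    std::set<size_t> u(nz1.begin(), nz1.end());
    u.insert(nz2.begin(), nz2.end());
    return d - u.size();
}

int main() {
    // Nonzero indices of cm2's two rows in sparsepriortest.cpp.
    std::vector<size_t> nz1{1, 2, 4, 5, 8, 9, 10, 14, 15, 19};
    std::vector<size_t> nz2{0, 1, 5, 6, 7, 13, 15, 16, 17};
    assert(shared_zeros(nz1, nz2, 20) == 4); // matches the test's assertion
}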