diff --git a/data/aminoacid-cpals-dense.json b/data/aminoacid-cpals-dense.json index 15462124a0..67a078404d 100644 --- a/data/aminoacid-cpals-dense.json +++ b/data/aminoacid-cpals-dense.json @@ -9,7 +9,7 @@ }, "k-tensor": { - "rank": 16, + "rank": 15, "initial-guess": "rand", "distributed-guess": "serial", "seed": 12345, @@ -35,7 +35,7 @@ }, "iterations": { - "value": 8, + "value": 5, "absolute-tolerance": 1 } } diff --git a/data/aminoacid-cpals.json b/data/aminoacid-cpals.json index 2249e109b0..ddaec3d61b 100644 --- a/data/aminoacid-cpals.json +++ b/data/aminoacid-cpals.json @@ -8,7 +8,7 @@ }, "k-tensor": { - "rank": 16, + "rank": 15, "output-file": "aminoacid.ktn", "initial-guess": "rand", "distributed-guess": "serial", @@ -33,7 +33,7 @@ }, "iterations": { - "value": 8, + "value": 5, "absolute-tolerance": 1 } } diff --git a/data/aminoacid-cpopt-lbfgsb.json b/data/aminoacid-cpopt-lbfgsb.json index 8d0aec3a77..3b9af43454 100644 --- a/data/aminoacid-cpopt-lbfgsb.json +++ b/data/aminoacid-cpopt-lbfgsb.json @@ -8,7 +8,7 @@ }, "k-tensor": { - "rank": 16, + "rank": 15, "output-file": "aminoacid.ktn", "initial-guess": "rand", "distributed-guess": "serial", @@ -30,7 +30,7 @@ }, "iterations": { - "value": 39, + "value": 47, "absolute-tolerance": 2 } } diff --git a/data/aminoacid-cpopt-rol-hess.json b/data/aminoacid-cpopt-rol-hess.json index 26eb9da357..05c5f4d39d 100644 --- a/data/aminoacid-cpopt-rol-hess.json +++ b/data/aminoacid-cpopt-rol-hess.json @@ -8,7 +8,7 @@ }, "k-tensor": { - "rank": 16, + "rank": 15, "output-file": "aminoacid.ktn", "initial-guess": "rand", "distributed-guess": "serial", @@ -30,7 +30,7 @@ }, "iterations": { - "value": 19, + "value": 20, "absolute-tolerance": 3 } } diff --git a/data/aminoacid-cpopt-rol.json b/data/aminoacid-cpopt-rol.json index cb21daf372..b407e27a04 100644 --- a/data/aminoacid-cpopt-rol.json +++ b/data/aminoacid-cpopt-rol.json @@ -8,7 +8,7 @@ }, "k-tensor": { - "rank": 16, + "rank": 15, "output-file": "aminoacid.ktn", "initial-guess": "rand", "distributed-guess": "serial", @@ -30,7 +30,7 @@ }, "iterations": { - "value": 63, + "value": 68, "absolute-tolerance": 2 } } diff --git a/data/aminoacid-gcpfed.json b/data/aminoacid-gcpfed.json index 4782d8c6f9..b3e36b9329 100644 --- a/data/aminoacid-gcpfed.json +++ b/data/aminoacid-gcpfed.json @@ -7,7 +7,7 @@ }, "k-tensor": { - "rank": 16, + "rank": 15, "output-file": "aminoacid.ktn", "initial-guess": "rand", "distributed-guess": "parallel-drew", diff --git a/data/aminoacid-gcpopt-lbfgsb.json b/data/aminoacid-gcpopt-lbfgsb.json index da2014ef35..7997006093 100644 --- a/data/aminoacid-gcpopt-lbfgsb.json +++ b/data/aminoacid-gcpopt-lbfgsb.json @@ -8,7 +8,7 @@ }, "k-tensor": { - "rank": 16, + "rank": 15, "output-file": "aminoacid.ktn", "initial-guess": "rand", "distributed-guess": "serial", diff --git a/data/aminoacid-gcpopt-rol.json b/data/aminoacid-gcpopt-rol.json index 6acda4f9f2..6105822244 100644 --- a/data/aminoacid-gcpopt-rol.json +++ b/data/aminoacid-gcpopt-rol.json @@ -8,7 +8,7 @@ }, "k-tensor": { - "rank": 16, + "rank": 15, "output-file": "aminoacid.ktn", "initial-guess": "rand", "distributed-guess": "serial", @@ -32,7 +32,7 @@ }, "iterations": { - "value": 80, + "value": 68, "absolute-tolerance": 2 } } diff --git a/data/aminoacid-gcpsgd-dense.json b/data/aminoacid-gcpsgd-dense.json index 2af2fb34d6..0cb971b39c 100644 --- a/data/aminoacid-gcpsgd-dense.json +++ b/data/aminoacid-gcpsgd-dense.json @@ -9,7 +9,7 @@ }, "k-tensor": { - "rank": 16, + "rank": 15, "initial-guess": "rand", "distributed-guess": "parallel-drew", "seed": 12345, @@ -44,7 +44,7 @@ }, "iterations": { - "value": 73, + "value": 75, "absolute-tolerance": 20 } } diff --git a/data/aminoacid-gcpsgd.json b/data/aminoacid-gcpsgd.json index 3ca0c4c61e..883ab3daeb 100644 --- a/data/aminoacid-gcpsgd.json +++ b/data/aminoacid-gcpsgd.json @@ -8,7 +8,7 @@ }, "k-tensor": { - "rank": 16, + "rank": 15, "output-file": "aminoacid.ktn", "initial-guess": "rand", "distributed-guess": "parallel-drew", @@ -29,7 +29,7 @@ "seed": 31415, "fuse": false, "hash": true, - "gnzs": 100, + "gnzs": 150, "gzs": 0, "fnzs": 10000, "fzs": 0, @@ -44,8 +44,8 @@ }, "iterations": { - "value": 73, - "absolute-tolerance": 20 + "value": 60, + "absolute-tolerance": 30 } } } diff --git a/src/Genten_DistKtensorUpdate.cpp b/src/Genten_DistKtensorUpdate.cpp index ee6825a090..9b6121040f 100644 --- a/src/Genten_DistKtensorUpdate.cpp +++ b/src/Genten_DistKtensorUpdate.cpp @@ -52,11 +52,13 @@ KtensorOneSidedUpdate(const DistTensor& X, parallel = pmap != nullptr && pmap->gridSize() > 1; if (parallel) { const unsigned nd = u.ndims(); + padded.resize(nd); sizes.resize(nd); sizes_r.resize(nd); offsets.resize(nd); offsets_r.resize(nd); for (unsigned n=0; nsubCommSize(n); // Get number of rows on each processor @@ -170,7 +172,9 @@ createOverlapKtensor(const KtensorT& u) const for (unsigned n=0; n mat(nrows, nc); + // Create factor matrix to have same padding as u[n] for consistent + // dimensions in import/export + FacMatrixT mat(nrows, nc, nullptr, true, padded[n]); u_overlapped.set_factor(n, mat); } u_overlapped.setProcessorMap(u.getProcessorMap()); @@ -785,17 +789,18 @@ KtensorTwoSidedUpdate:: KtensorTwoSidedUpdate(const DistTensor& X, const KtensorT& u, const AlgParams& a) : - pmap(u.getProcessorMap()), algParams(a), nc(u.ncomponents()) + pmap(u.getProcessorMap()), algParams(a), nd(u.ndims()), nc(u.ncomponents()) { parallel = pmap != nullptr && pmap->gridSize() > 1; if (parallel) { - const unsigned nd = u.ndims(); + padded.resize(nd); sizes.resize(nd); sizes_r.resize(nd); offsets.resize(nd); offsets_r.resize(nd); offsets_dev.resize(nd); for (unsigned n=0; nsubCommSize(n); // Get number of rows on each processor @@ -839,7 +844,6 @@ extractRowRecvsHost() { GENTEN_START_TIMER("extract row recvs host"); const ttb_indx nnz = X_sparse.nnz(); - const unsigned nd = X_sparse.ndims(); if (maps.empty()) { maps.resize(nd); row_recvs_for_proc.resize(nd); @@ -953,7 +957,6 @@ extractRowRecvsDevice() GENTEN_START_TIMER("extract row recvs device"); const ttb_indx nnz = X_sparse.nnz(); - const unsigned nd = X_sparse.ndims(); const unsigned nc_ = nc; if (num_row_recvs_dev.empty()) { num_row_recvs_dev.resize(nd); @@ -1073,7 +1076,6 @@ updateTensor(const DistTensor& X) if (sparse && parallel) { GENTEN_START_TIMER("initialize"); X_sparse = X.getSptensor(); - const unsigned nd = X_sparse.ndims(); if (num_row_sends.empty()) { num_row_sends.resize(nd); num_row_recvs.resize(nd); @@ -1186,16 +1188,16 @@ createOverlapKtensor(const KtensorT& u) const if (!parallel) return u; - const unsigned nd = u.ndims(); - const unsigned nc = u.ncomponents(); KtensorT u_overlapped = KtensorT(nc, nd); for (unsigned n=0; n mat(nrows, nc); + // Create factor matrix to have same padding as u[n] for consistent + // dimensions in import/export + FacMatrixT mat(nrows, nc, nullptr, true, padded[n]); u_overlapped.set_factor(n, mat); } - u_overlapped.setProcessorMap(u.getProcessorMap()); + u_overlapped.setProcessorMap(pmap); return u_overlapped; } @@ -1206,7 +1208,6 @@ initOverlapKtensor(KtensorT& u) const { GENTEN_TIME_MONITOR("k-tensor init"); if (parallel && sparse) { - const unsigned nd = u.ndims(); const unsigned nc_ = nc; for (unsigned n=0; nsubCommSize(n); @@ -1324,7 +1325,6 @@ KtensorTwoSidedUpdate:: doImportSparse(const KtensorT& u_overlapped, const KtensorT& u) const { - const unsigned nd = u.ndims(); for (unsigned n=0; n:: doImportDense(const KtensorT& u_overlapped, const KtensorT& u) const { - const unsigned nd = u.ndims(); for (unsigned n=0; n:: doExportSparse(const KtensorT& u, const KtensorT& u_overlapped) const { - const unsigned nd = u.ndims(); for (unsigned n=0; n:: doExportDense(const KtensorT& u, const KtensorT& u_overlapped) const { - const unsigned nd = u.ndims(); for (unsigned n=0; n { std::vector< std::vector > offsets_r; std::vector< std::vector > sizes_r; + std::vector padded; + public: KtensorAllGatherReduceUpdate(const KtensorT& u) : pmap(u.getProcessorMap()) { const unsigned nd = u.ndims(); + padded.resize(nd); sizes.resize(nd); sizes_r.resize(nd); offsets.resize(nd); offsets_r.resize(nd); for (unsigned n=0; nsubCommSize(n); // Get number of rows on each processor @@ -401,7 +405,9 @@ class KtensorAllGatherReduceUpdate : public DistKtensorUpdate { for (unsigned n=0; n mat(nrows, nc); + // Create factor matrix to have same padding as u[n] for consistent + // dimensions in import/export + FacMatrixT mat(nrows, nc, nullptr, true, padded[n]); u_overlapped.set_factor(n, mat); } u_overlapped.setProcessorMap(u.getProcessorMap()); @@ -523,6 +529,7 @@ class KtensorOneSidedUpdate : bool sparse; SptensorT X_sparse; + std::vector padded; public: using unordered_map_type = @@ -639,7 +646,9 @@ class KtensorTwoSidedUpdate : bool sparse; SptensorT X_sparse; + unsigned nd; unsigned nc; + std::vector padded; using offsets_type = Kokkos::View; // Set the host execution space to be ExecSpace if it is a host execution diff --git a/src/Genten_FacMatrix.cpp b/src/Genten_FacMatrix.cpp index 871a83c40a..c5e8d6e49f 100644 --- a/src/Genten_FacMatrix.cpp +++ b/src/Genten_FacMatrix.cpp @@ -48,6 +48,7 @@ #include "Genten_FacMatArray.hpp" #include "Genten_MathLibs_Wpr.hpp" #include "Genten_portability.hpp" +#include "Kokkos_Random.hpp" #include "CMakeInclude.h" #include // for std::max with MSVC compiler #include @@ -59,13 +60,13 @@ template Genten::FacMatrixT:: FacMatrixT(ttb_indx m, ttb_indx n, const ProcessorMap::FacMap* pmap_, - const bool zero) : + const bool zero, const bool pad) : pmap(pmap_) { // Don't use padding if Cuda, HIP or SYCL is the default execution space, so factor // matrices allocated on the host have the same shape. We really need a // better way to do this. - if (Genten::is_gpu_space::value) { + if (!pad || Genten::is_gpu_space::value) { if (zero) data = view_type("Genten::FacMatrix::data",m,n); else @@ -86,12 +87,12 @@ FacMatrixT(ttb_indx m, ttb_indx n, const ProcessorMap::FacMap* pmap_, template Genten::FacMatrixT:: FacMatrixT(ttb_indx m, ttb_indx n, const ttb_real * cvec, - const ProcessorMap::FacMap* pmap_) : pmap(pmap_) + const ProcessorMap::FacMap* pmap_, const bool pad) : pmap(pmap_) { // Don't use padding if Cuda, HIP or SYCL is the default execution space, so factor // matrices allocated on the host have the same shape. We really need a // better way to do this. - if (Genten::is_gpu_space::value) + if (!pad || Genten::is_gpu_space::value) data = view_type(Kokkos::view_alloc("Genten::FacMatrix::data", Kokkos::WithoutInitializing),m,n); else @@ -112,8 +113,13 @@ template void Genten::FacMatrixT:: rand() const { - auto data_1d = make_data_1d(); - data_1d.rand(); + // Don't fill as 1-D to get consistent values with and without padding + + //auto data_1d = make_data_1d(); + //data_1d.rand(); + + RandomMT cRNG(0); + scatter(false, false, cRNG); } template @@ -122,8 +128,36 @@ scatter (const bool bUseMatlabRNG, const bool bUseParallelRNG, RandomMT & cRMT) const { - auto data_1d = make_data_1d(); - data_1d.scatter (bUseMatlabRNG, bUseParallelRNG, cRMT); + // Don't fill as 1-D to get consistent values with and without padding + + //auto data_1d = make_data_1d(); + //data_1d.scatter (bUseMatlabRNG, bUseParallelRNG, cRMT); + + if (bUseParallelRNG) + { + const ttb_indx seed = cRMT.genrnd_int32(); + Kokkos::Random_XorShift64_Pool rand_pool(seed); + ttb_real min_val = 0.0; + ttb_real max_val = 1.0; + Kokkos::fill_random(data, rand_pool, min_val, max_val); + } + else + { + const ttb_indx nrows = data.extent(0); + const ttb_indx ncols = data.extent(1); + auto d = create_mirror_view(data); + for (ttb_indx i=0; i diff --git a/src/Genten_FacMatrix.hpp b/src/Genten_FacMatrix.hpp index 4a800eff93..f986ba420a 100644 --- a/src/Genten_FacMatrix.hpp +++ b/src/Genten_FacMatrix.hpp @@ -104,7 +104,7 @@ class FacMatrixT * @param[in] n Number of columns, should equal the number of components * in the Ktensor. */ - FacMatrixT(ttb_indx m, ttb_indx n, const ProcessorMap::FacMap* pmap_ = nullptr, const bool zero = true); + FacMatrixT(ttb_indx m, ttb_indx n, const ProcessorMap::FacMap* pmap_ = nullptr, const bool zero = true, const bool pad = true); //! Constructor to create a Factor Matrix of Size M x N using the // given view @@ -117,7 +117,8 @@ class FacMatrixT // given data vector CVEC which is assumed to be stored *columnwise*. // Not currently used; therefore not part of doxygen API. FacMatrixT(ttb_indx m, ttb_indx n, const ttb_real * cvec, - const ProcessorMap::FacMap* pmap_ = nullptr); + const ProcessorMap::FacMap* pmap_ = nullptr, + const bool pad = true); //! @brief Create matrix from supplied view KOKKOS_INLINE_FUNCTION @@ -245,6 +246,12 @@ class FacMatrixT void setProcessorMap(const ProcessorMap::FacMap* pmap_) { pmap = pmap_; } const ProcessorMap::FacMap* getProcessorMap() const { return pmap; } + KOKKOS_INLINE_FUNCTION + bool isPadded() const + { + return data.extent(0)*data.extent(1) != data.span(); + } + /** @} */ /** ---------------------------------------------------------------- diff --git a/src/Genten_GCP_SGD.cpp b/src/Genten_GCP_SGD.cpp index 5a435ab343..d8fe42f5e0 100644 --- a/src/Genten_GCP_SGD.cpp +++ b/src/Genten_GCP_SGD.cpp @@ -176,9 +176,24 @@ namespace Genten { const ttb_indx printIter = print_itn ? algParams.printitn : 0; const bool compute_fit = algParams.compute_fit; - // Create sampler + // Create iterator + Impl::GCP_SGD_Iter *itp = + Impl::createIter( + X, u0, hist, penalty, mode_beg, mode_end, algParams); + Impl::GCP_SGD_Iter& it = *itp; + + // Get vector/Ktensor for current solution (this is a view of the data) + VectorType u = it.getSolution(); + KtensorT ut = u.getKtensor(); + ut.setProcessorMap(pmap); + + // Copy Ktensor for restoring previous solution + VectorType u_prev = u.clone(); + u_prev.set(u); + + // Create sampler -- must use u from iterator to get the right padding Sampler *sampler = - createSampler(X, u0, algParams); + createSampler(X, ut, algParams); // Create annealer auto annealer = getAnnealer(algParams); @@ -221,21 +236,6 @@ namespace Genten { // Start timer for total execution time of the algorithm. timer.start(timer_sgd); - // Create iterator - Impl::GCP_SGD_Iter *itp = - Impl::createIter( - X, u0, hist, penalty, mode_beg, mode_end, algParams); - Impl::GCP_SGD_Iter& it = *itp; - - // Get vector/Ktensor for current solution (this is a view of the data) - VectorType u = it.getSolution(); - KtensorT ut = u.getKtensor(); - ut.setProcessorMap(pmap); - - // Copy Ktensor for restoring previous solution - VectorType u_prev = u.clone(); - u_prev.set(u); - // Initialize sampler (sorting, hashing, ...) timer.start(timer_sort); Kokkos::Random_XorShift64_Pool rand_pool(seed); diff --git a/src/rol/Genten_CP_RolObjective.hpp b/src/rol/Genten_CP_RolObjective.hpp index b261ffa5a4..2436c167ca 100644 --- a/src/rol/Genten_CP_RolObjective.hpp +++ b/src/rol/Genten_CP_RolObjective.hpp @@ -121,8 +121,12 @@ namespace Genten { CP_RolObjective(const tensor_type& x, const ktensor_type& m, const AlgParams& algParams, - PerfHistory& h) : M(m), cp_model(x, m, algParams), history(h), - timer(1) + PerfHistory& h) : + M(m), + // create and pass a ktensor to cp_model through the KokkosVector so to + // ensure the same padding as vectors that will be used in the algorithm + cp_model(x, vector_type(m).getKtensor(), algParams), history(h), + timer(1) { #if COPY_KTENSOR const ttb_indx nc = M.ncomponents(); diff --git a/src/rol/Genten_GCP_Opt_Rol.cpp b/src/rol/Genten_GCP_Opt_Rol.cpp index 0ccc5dba28..8bdf4dd24c 100644 --- a/src/rol/Genten_GCP_Opt_Rol.cpp +++ b/src/rol/Genten_GCP_Opt_Rol.cpp @@ -170,7 +170,7 @@ void gcp_opt_rol(const TensorT& x, KtensorT& u, if (algParams.printitn > 0) { if (algParams.compute_fit) { stream << "Final fit = " << std::setprecision(3) << std::scientific - << objective->computeFit(u) << std::endl; + << objective->computeFit(z->getKtensor()) << std::endl; } stream << "Total time = " << std::setprecision(2) << std::scientific << history.lastEntry().cum_time << std::endl diff --git a/src/rol/Genten_GCP_RolObjective.cpp b/src/rol/Genten_GCP_RolObjective.cpp index 1ee57b9cac..8a67127a9a 100644 --- a/src/rol/Genten_GCP_RolObjective.cpp +++ b/src/rol/Genten_GCP_RolObjective.cpp @@ -119,7 +119,11 @@ namespace Genten { const loss_function_type& func, const AlgParams& algParams, PerfHistory& h) : - M(m), gcp_model(x, m, func, algParams), history(h), timer(1), + M(m), + // create and pass a ktensor to gcp_model through the KokkosVector so to + // ensure the same padding as vectors that will be used in the algorithm + gcp_model(x, vector_type(m).getKtensor(), func, algParams), + history(h), timer(1), compute_fit(algParams.compute_fit) { #if COPY_KTENSOR @@ -142,6 +146,11 @@ namespace Genten { { GENTEN_TIME_MONITOR("GCP_RolObjective::update"); + const vector_type& x = dynamic_cast(xx); + + // Convert input vector to a Ktensor + M = x.getKtensor(); + gcp_model.update(M); // If the step was accepted, record the time and add a new working entry diff --git a/src/rol/Genten_RolKokkosVector.hpp b/src/rol/Genten_RolKokkosVector.hpp index b37489763e..62e8bfbd0d 100644 --- a/src/rol/Genten_RolKokkosVector.hpp +++ b/src/rol/Genten_RolKokkosVector.hpp @@ -63,15 +63,16 @@ namespace Genten { typedef typename kokkos_vector::Ktensor_type Ktensor_type; // This just copies the shape of V, not the values - RolKokkosVector(const Ktensor_type& V, const bool make_view, - const DistKtensorUpdate *dku) : kv(V, dku) + RolKokkosVector(const Ktensor_type& V, const bool make_view = false, + const DistKtensorUpdate *dku = nullptr) : + kv(V, dku) { gt_assert(make_view == false); } RolKokkosVector(const unsigned nc, const unsigned nd, const IndxArray& sz, const ProcessorMap* pmap, - const DistKtensorUpdate *dku) : + const DistKtensorUpdate *dku = nullptr) : kv(nc,nd,sz,pmap,dku) { }