From a90b8b11e3b1ee694c9f2072c7d7f1fcc12a7509 Mon Sep 17 00:00:00 2001 From: Fritz Goebel Date: Tue, 17 Oct 2023 15:17:56 -0400 Subject: [PATCH 1/3] update Ginkgo interface to accept data on GPU through mem_space --- src/LinAlg/hiopLinSolverSparseGinkgo.cpp | 51 +++++++++++++++++++----- src/LinAlg/hiopLinSolverSparseGinkgo.hpp | 2 + 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/src/LinAlg/hiopLinSolverSparseGinkgo.cpp b/src/LinAlg/hiopLinSolverSparseGinkgo.cpp index 971ef699a..3096f7b3d 100644 --- a/src/LinAlg/hiopLinSolverSparseGinkgo.cpp +++ b/src/LinAlg/hiopLinSolverSparseGinkgo.cpp @@ -283,12 +283,21 @@ std::shared_ptr setup_solver_factory(std::shared_ptroptions->GetString("mem_space") == "device") { + M_host_ = LinearAlgebraFactory::create_matrix_sparse("default", n, n, nnz); + } + } hiopLinSolverSymSparseGinkgo::~hiopLinSolverSymSparseGinkgo() { delete [] index_covert_CSR2Triplet_; delete [] index_covert_extra_Diag2CSR_; + + // If memory space is device, delete allocated host mirrors + if(nlp_->options->GetString("mem_space") == "device") { + delete M_host_; + } } void hiopLinSolverSymSparseGinkgo::firstCall() @@ -304,7 +313,19 @@ std::shared_ptr setup_solver_factory(std::shared_ptroptions->GetInteger("ir_inner_restart"); iterative_refinement_ = gmres_iter > 0; - host_mtx_ = transferTripletToCSR(exec_->get_master(), n_, M_, &index_covert_CSR2Triplet_, &index_covert_extra_Diag2CSR_); + // If the matrix is on device, copy it to the host mirror + std::string mem_space = nlp_->options->GetString("mem_space"); + if(mem_space == "device") { + auto host = exec_->get_master(); + auto nnz = M_->numberOfNonzeros(); + host->copy_from(exec_.get(), nnz, M_->M(), M_host_->M()); + host->copy_from(exec_.get(), nnz, M_->i_row(), M_host_->i_row()); + host->copy_from(exec_.get(), nnz, M_->j_col(), M_host_->j_col()); + } else { + M_host_ = M_; + } + + host_mtx_ = transferTripletToCSR(exec_->get_master(), n_, M_host_, &index_covert_CSR2Triplet_, &index_covert_extra_Diag2CSR_); mtx_ = exec_ == (exec_->get_master()) ? host_mtx_ : gko::clone(exec_, host_mtx_); nnz_ = mtx_->get_num_stored_elements(); @@ -321,7 +342,15 @@ std::shared_ptr setup_solver_factory(std::shared_ptrfirstCall(); } else { - update_matrix(M_, mtx_, host_mtx_, index_covert_CSR2Triplet_, index_covert_extra_Diag2CSR_); + std::string mem_space = nlp_->options->GetString("mem_space"); + if(mem_space == "device") { + auto host = exec_->get_master(); + auto nnz = M_->numberOfNonzeros(); + host->copy_from(exec_.get(), nnz, M_->M(), M_host_->M()); + } else { + M_host_ = M_; + } + update_matrix(M_host_, mtx_, host_mtx_, index_covert_CSR2Triplet_, index_covert_extra_Diag2CSR_); } gko_solver_ = gko::share(reusable_factory_->generate(mtx_)); @@ -353,20 +382,22 @@ std::shared_ptr setup_solver_factory(std::shared_ptr(&x_); assert(x != NULL); hiopVectorPar* rhs = dynamic_cast(x->new_copy()); + + std::string mem_space = nlp_->options->GetString("mem_space"); + auto exec = host; + if(mem_space == "device") { + exec = exec_; + } + double* dx = x->local_data(); double* drhs = rhs->local_data(); const auto size = gko::dim<2>{(long unsigned int)n_, 1}; - auto dense_x_host = vec::create(host, size, arr::view(host, n_, dx), 1); - auto dense_x = vec::create(exec_, size); - dense_x->copy_from(dense_x_host.get()); - auto dense_b_host = vec::create(host, size, arr::view(host, n_, drhs), 1); - auto dense_b = vec::create(exec_, size); - dense_b->copy_from(dense_b_host.get()); + auto dense_x = vec::create(exec, size, arr::view(exec, n_, dx), 1); + auto dense_b = vec::create(exec, size, arr::view(exec, n_, drhs), 1); gko_solver_->apply(dense_b.get(), dense_x.get()); nlp_->runStats.linsolv.tmTriuSolves.stop(); - dense_x_host->copy_from(dense_x.get()); delete rhs; rhs=nullptr; return 1; } diff --git a/src/LinAlg/hiopLinSolverSparseGinkgo.hpp b/src/LinAlg/hiopLinSolverSparseGinkgo.hpp index 058c606a5..198322595 100644 --- a/src/LinAlg/hiopLinSolverSparseGinkgo.hpp +++ b/src/LinAlg/hiopLinSolverSparseGinkgo.hpp @@ -96,6 +96,8 @@ class hiopLinSolverSymSparseGinkgo: public hiopLinSolverSymSparse static const std::map alg_map_; + hiopMatrixSparse* M_host_{ nullptr }; ///< Host mirror for the KKT matrix + public: /** called the very first time a matrix is factored. Allocates space From 23336498b1555617c3e676cc188dadd56a18d29a Mon Sep 17 00:00:00 2001 From: Fritz Goebel Date: Mon, 30 Oct 2023 17:37:08 -0400 Subject: [PATCH 2/3] Add Ginkgo as option for hybrid and GPU compute mode --- src/Drivers/Sparse/NlpSparseEx1Driver.cpp | 4 ++ src/LinAlg/hiopLinSolverSparseGinkgo.cpp | 50 +++++++++++++---------- src/LinAlg/hiopLinSolverSparseGinkgo.hpp | 1 + src/Optimization/hiopKKTLinSysSparse.cpp | 28 +++++++++++-- 4 files changed, 59 insertions(+), 24 deletions(-) diff --git a/src/Drivers/Sparse/NlpSparseEx1Driver.cpp b/src/Drivers/Sparse/NlpSparseEx1Driver.cpp index 3131fc024..25cc08e06 100644 --- a/src/Drivers/Sparse/NlpSparseEx1Driver.cpp +++ b/src/Drivers/Sparse/NlpSparseEx1Driver.cpp @@ -239,8 +239,12 @@ int main(int argc, char **argv) nlp.options->SetStringValue("fact_acceptor", "inertia_free"); nlp.options->SetIntegerValue("ir_outer_maxit", 0); if (use_ginkgo_cuda) { + nlp.options->SetStringValue("mem_space", "device"); + nlp.options->SetStringValue("compute_mode", "gpu"); nlp.options->SetStringValue("ginkgo_exec", "cuda"); } else if (use_ginkgo_hip) { + nlp.options->SetStringValue("mem_space", "device"); + nlp.options->SetStringValue("compute_mode", "gpu"); nlp.options->SetStringValue("ginkgo_exec", "hip"); } else { nlp.options->SetStringValue("ginkgo_exec", "reference"); diff --git a/src/LinAlg/hiopLinSolverSparseGinkgo.cpp b/src/LinAlg/hiopLinSolverSparseGinkgo.cpp index 3096f7b3d..b957747f5 100644 --- a/src/LinAlg/hiopLinSolverSparseGinkgo.cpp +++ b/src/LinAlg/hiopLinSolverSparseGinkgo.cpp @@ -284,6 +284,7 @@ std::shared_ptr setup_solver_factory(std::shared_ptroptions->GetString("mem_space") == "device") { M_host_ = LinearAlgebraFactory::create_matrix_sparse("default", n, n, nnz); } @@ -291,13 +292,13 @@ std::shared_ptr setup_solver_factory(std::shared_ptroptions->GetString("mem_space") == "device") { delete M_host_; } + + delete [] index_covert_CSR2Triplet_; + delete [] index_covert_extra_Diag2CSR_; } void hiopLinSolverSymSparseGinkgo::firstCall() @@ -315,21 +316,33 @@ std::shared_ptr setup_solver_factory(std::shared_ptroptions->GetString("mem_space"); + std::cout << nlp_->options->GetString("ginkgo_exec") << " " << mem_space << std::endl; + auto M = M_; if(mem_space == "device") { auto host = exec_->get_master(); auto nnz = M_->numberOfNonzeros(); - host->copy_from(exec_.get(), nnz, M_->M(), M_host_->M()); - host->copy_from(exec_.get(), nnz, M_->i_row(), M_host_->i_row()); - host->copy_from(exec_.get(), nnz, M_->j_col(), M_host_->j_col()); - } else { - M_host_ = M_; + //host->copy_from(exec_.get(), nnz, M_->M(), M_host_->M()); + auto dv = gko::make_const_array_view(exec_, nnz, M_->M()); + auto hv = gko::make_array_view(host, nnz, M_host_->M()); + host->copy_from(exec_.get(), nnz, dv.get_const_data(), hv.get_data()); + auto di = gko::make_const_array_view(exec_, nnz, M_->i_row()); + auto hi = gko::make_array_view(host, nnz, M_host_->i_row()); + host->copy_from(exec_.get(), nnz, di.get_const_data(), hi.get_data()); + auto dj = gko::make_const_array_view(exec_, nnz, M_->j_col()); + auto hj = gko::make_array_view(host, nnz, M_host_->j_col()); + host->copy_from(exec_.get(), nnz, dj.get_const_data(), hj.get_data()); + //host->copy_from(exec_.get(), nnz, M_->i_row(), M_host_->i_row()); + //host->copy_from(exec_.get(), nnz, M_->j_col(), M_host_->j_col()); + M = M_host_; } - host_mtx_ = transferTripletToCSR(exec_->get_master(), n_, M_host_, &index_covert_CSR2Triplet_, &index_covert_extra_Diag2CSR_); + host_mtx_ = transferTripletToCSR(exec_->get_master(), n_, M, &index_covert_CSR2Triplet_, &index_covert_extra_Diag2CSR_); mtx_ = exec_ == (exec_->get_master()) ? host_mtx_ : gko::clone(exec_, host_mtx_); nnz_ = mtx_->get_num_stored_elements(); reusable_factory_ = setup_solver_factory(exec_, mtx_, alg, gmres_iter, gmres_tol, gmres_restart); + + dense_b_ = gko::matrix::Dense::create(exec_, gko::dim<2>{n_, 1}); } int hiopLinSolverSymSparseGinkgo::matrixChanged() @@ -343,14 +356,14 @@ std::shared_ptr setup_solver_factory(std::shared_ptrfirstCall(); } else { std::string mem_space = nlp_->options->GetString("mem_space"); + auto M = M_; if(mem_space == "device") { auto host = exec_->get_master(); auto nnz = M_->numberOfNonzeros(); host->copy_from(exec_.get(), nnz, M_->M(), M_host_->M()); - } else { - M_host_ = M_; + M = M_host_; } - update_matrix(M_host_, mtx_, host_mtx_, index_covert_CSR2Triplet_, index_covert_extra_Diag2CSR_); + update_matrix(M, mtx_, host_mtx_, index_covert_CSR2Triplet_, index_covert_extra_Diag2CSR_); } gko_solver_ = gko::share(reusable_factory_->generate(mtx_)); @@ -370,6 +383,7 @@ std::shared_ptr setup_solver_factory(std::shared_ptr; using arr = gko::array; auto host = exec_->get_master(); @@ -379,26 +393,20 @@ std::shared_ptr setup_solver_factory(std::shared_ptrrunStats.linsolv.tmTriuSolves.start(); - hiopVectorPar* x = dynamic_cast(&x_); - assert(x != NULL); - hiopVectorPar* rhs = dynamic_cast(x->new_copy()); - std::string mem_space = nlp_->options->GetString("mem_space"); auto exec = host; if(mem_space == "device") { exec = exec_; } - double* dx = x->local_data(); - double* drhs = rhs->local_data(); + double* dx = x_.local_data(); const auto size = gko::dim<2>{(long unsigned int)n_, 1}; auto dense_x = vec::create(exec, size, arr::view(exec, n_, dx), 1); - auto dense_b = vec::create(exec, size, arr::view(exec, n_, drhs), 1); + dense_b_->copy_from(dense_x.get()); - gko_solver_->apply(dense_b.get(), dense_x.get()); + gko_solver_->apply(dense_b_.get(), dense_x.get()); nlp_->runStats.linsolv.tmTriuSolves.stop(); - delete rhs; rhs=nullptr; return 1; } diff --git a/src/LinAlg/hiopLinSolverSparseGinkgo.hpp b/src/LinAlg/hiopLinSolverSparseGinkgo.hpp index 198322595..d1c7a0419 100644 --- a/src/LinAlg/hiopLinSolverSparseGinkgo.hpp +++ b/src/LinAlg/hiopLinSolverSparseGinkgo.hpp @@ -90,6 +90,7 @@ class hiopLinSolverSymSparseGinkgo: public hiopLinSolverSymSparse std::shared_ptr exec_; std::shared_ptr> mtx_; std::shared_ptr> host_mtx_; + std::shared_ptr> dense_b_; std::shared_ptr reusable_factory_; std::shared_ptr gko_solver_; bool iterative_refinement_; diff --git a/src/Optimization/hiopKKTLinSysSparse.cpp b/src/Optimization/hiopKKTLinSysSparse.cpp index c41a8579a..3cee4380f 100644 --- a/src/Optimization/hiopKKTLinSysSparse.cpp +++ b/src/Optimization/hiopKKTLinSysSparse.cpp @@ -317,9 +317,7 @@ namespace hiop if( (nullptr == linSys_ && linear_solver == "auto") || linear_solver == "ginkgo") { //ma57, pardiso and strumpack are not available or user requested ginkgo #ifdef HIOP_USE_GINKGO - nlp_->log->printf(hovScalars, - "KKT_SPARSE_XYcYd linsys: alloc GINKGO with matrix size %d (%d cons)\n", - n, neq+nineq); + linsol_actual = "GINKGO"; linSys_ = new hiopLinSolverSymSparseGinkgo(n, nnz, nlp_); #endif // HIOP_USE_GINKGO } @@ -376,6 +374,14 @@ namespace hiop linSys_ = new hiopLinSolverSymSparsePARDISO(n, nnz, nlp_); #endif // HIOP_USE_PARDISO } + + if( (nullptr == linSys_ && linear_solver == "auto") || linear_solver == "ginkgo") { + //ma57, pardiso and strumpack are not available or user requested ginkgo +#ifdef HIOP_USE_GINKGO + linsol_actual = "GINKGO"; + linSys_ = new hiopLinSolverSymSparseGinkgo(n, nnz, nlp_); +#endif // HIOP_USE_GINKGO + } if(linSys_) { nlp_->log->printf(hovScalars, @@ -747,6 +753,14 @@ namespace hiop #endif // HIOP_USE_PARDISO } + if(nullptr == linSys_ && linear_solver == "ginkgo") { + //ma57, pardiso and strumpack are not available or user requested ginkgo +#ifdef HIOP_USE_GINKGO + linSys_ = new hiopLinSolverSymSparseGinkgo(n, nnz, nlp_); + actual_lin_solver = "GINKGO"; +#endif // HIOP_USE_GINKGO + } + if(linSys_) { nlp_->log->printf(hovScalars, "KKT_SPARSE_XDYcYd linsys: alloc [%s] size %d (%d cons) (hybrid)\n", @@ -781,6 +795,14 @@ namespace hiop } #endif } //end resolve + + if(nullptr == linSys_ && linear_solver == "ginkgo") { + //ma57, pardiso and strumpack are not available or user requested ginkgo +#ifdef HIOP_USE_GINKGO + linSys_ = new hiopLinSolverSymSparseGinkgo(n, nnz, nlp_); + actual_lin_solver = "GINKGO"; +#endif // HIOP_USE_GINKGO + } } // end of compute mode gpu } assert(linSys_&& "KKT_SPARSE_XDYcYd linsys: cannot instantiate backend linear solver"); From 7630d4b3f72f3f75b7bf7f07f01c60be395a10ec Mon Sep 17 00:00:00 2001 From: Fritz Goebel Date: Thu, 2 Nov 2023 15:07:19 -0400 Subject: [PATCH 3/3] Remove device reset for Ginkgo Executor, add Ginkgo as option for NlpSparseRajaEx2 --- src/Drivers/Sparse/CMakeLists.txt | 3 +-- src/Drivers/Sparse/NlpSparseEx1Driver.cpp | 2 -- src/Drivers/Sparse/NlpSparseEx2Driver.cpp | 2 ++ src/Drivers/Sparse/NlpSparseEx4Driver.cpp | 2 ++ src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp | 14 +++++++++++++- src/LinAlg/hiopLinSolverSparseGinkgo.cpp | 9 ++------- 6 files changed, 20 insertions(+), 12 deletions(-) diff --git a/src/Drivers/Sparse/CMakeLists.txt b/src/Drivers/Sparse/CMakeLists.txt index 91efb88a7..1c7ed6d8c 100644 --- a/src/Drivers/Sparse/CMakeLists.txt +++ b/src/Drivers/Sparse/CMakeLists.txt @@ -18,11 +18,10 @@ add_executable(NlpSparseEx4.exe NlpSparseEx4.cpp NlpSparseEx4Driver.cpp) target_link_libraries(NlpSparseEx4.exe HiOp::HiOp) if(HIOP_USE_RAJA) - if(HIOP_USE_GPU AND HIOP_USE_CUDA) + if(HIOP_USE_GPU) set_source_files_properties( NlpSparseRajaEx2.cpp NlpSparseRajaEx2Driver.cpp - PROPERTIES LANGUAGE CUDA ) add_executable(NlpSparseRajaEx2.exe NlpSparseRajaEx2Driver.cpp NlpSparseRajaEx2.cpp) diff --git a/src/Drivers/Sparse/NlpSparseEx1Driver.cpp b/src/Drivers/Sparse/NlpSparseEx1Driver.cpp index 25cc08e06..3bc259bae 100644 --- a/src/Drivers/Sparse/NlpSparseEx1Driver.cpp +++ b/src/Drivers/Sparse/NlpSparseEx1Driver.cpp @@ -239,11 +239,9 @@ int main(int argc, char **argv) nlp.options->SetStringValue("fact_acceptor", "inertia_free"); nlp.options->SetIntegerValue("ir_outer_maxit", 0); if (use_ginkgo_cuda) { - nlp.options->SetStringValue("mem_space", "device"); nlp.options->SetStringValue("compute_mode", "gpu"); nlp.options->SetStringValue("ginkgo_exec", "cuda"); } else if (use_ginkgo_hip) { - nlp.options->SetStringValue("mem_space", "device"); nlp.options->SetStringValue("compute_mode", "gpu"); nlp.options->SetStringValue("ginkgo_exec", "hip"); } else { diff --git a/src/Drivers/Sparse/NlpSparseEx2Driver.cpp b/src/Drivers/Sparse/NlpSparseEx2Driver.cpp index e61b866a0..27e3048d0 100644 --- a/src/Drivers/Sparse/NlpSparseEx2Driver.cpp +++ b/src/Drivers/Sparse/NlpSparseEx2Driver.cpp @@ -248,8 +248,10 @@ int main(int argc, char **argv) nlp.options->SetStringValue("linsol_mode", "speculative"); nlp.options->SetStringValue("linear_solver_sparse", "ginkgo"); if (use_ginkgo_cuda) { + nlp.options->SetStringValue("compute_mode", "gpu"); nlp.options->SetStringValue("ginkgo_exec", "cuda"); } else if (use_ginkgo_hip) { + nlp.options->SetStringValue("compute_mode", "gpu"); nlp.options->SetStringValue("ginkgo_exec", "hip"); } else { nlp.options->SetStringValue("ginkgo_exec", "reference"); diff --git a/src/Drivers/Sparse/NlpSparseEx4Driver.cpp b/src/Drivers/Sparse/NlpSparseEx4Driver.cpp index 0200284f6..878a605e9 100644 --- a/src/Drivers/Sparse/NlpSparseEx4Driver.cpp +++ b/src/Drivers/Sparse/NlpSparseEx4Driver.cpp @@ -238,8 +238,10 @@ int main(int argc, char **argv) nlp.options->SetStringValue("fact_acceptor", "inertia_free"); nlp.options->SetIntegerValue("ir_outer_maxit", 0); if (use_ginkgo_cuda) { + nlp.options->SetStringValue("compute_mode", "gpu"); nlp.options->SetStringValue("ginkgo_exec", "cuda"); } else if (use_ginkgo_hip) { + nlp.options->SetStringValue("compute_mode", "gpu"); nlp.options->SetStringValue("ginkgo_exec", "hip"); } else { nlp.options->SetStringValue("ginkgo_exec", "reference"); diff --git a/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp b/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp index a2455ebe1..d079ed46e 100644 --- a/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp +++ b/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp @@ -256,12 +256,24 @@ int main(int argc, char **argv) // only support cusolverLU right now, 2023.02.28 //lsq initialization of the duals fails for this example since the Jacobian is rank deficient //use zero initialization - nlp.options->SetStringValue("linear_solver_sparse", "resolve"); if(use_resolve_cuda_rf) { + nlp.options->SetStringValue("linear_solver_sparse", "resolve"); nlp.options->SetStringValue("resolve_refactorization", "rf"); nlp.options->SetIntegerValue("ir_inner_maxit", 20); nlp.options->SetIntegerValue("ir_outer_maxit", 0); } + if (use_ginkgo) { + nlp.options->SetStringValue("linear_solver_sparse", "ginkgo"); + nlp.options->SetIntegerValue("ir_outer_maxit", 0); + if (use_ginkgo_cuda) { + nlp.options->SetStringValue("ginkgo_exec", "cuda"); + } else if (use_ginkgo_hip) { + nlp.options->SetStringValue("ginkgo_exec", "hip"); + } else { + nlp.options->SetStringValue("ginkgo_exec", "reference"); + nlp.options->SetStringValue("compute_mode", "cpu"); + } + } nlp.options->SetStringValue("duals_init", "zero"); nlp.options->SetStringValue("mem_space", "device"); nlp.options->SetStringValue("fact_acceptor", "inertia_free"); diff --git a/src/LinAlg/hiopLinSolverSparseGinkgo.cpp b/src/LinAlg/hiopLinSolverSparseGinkgo.cpp index b957747f5..e463500b2 100644 --- a/src/LinAlg/hiopLinSolverSparseGinkgo.cpp +++ b/src/LinAlg/hiopLinSolverSparseGinkgo.cpp @@ -212,13 +212,11 @@ std::shared_ptr create_exec(std::string executor_string) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::ReferenceExecutor::create(), - true); + return gko::CudaExecutor::create(0, gko::ReferenceExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::ReferenceExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::ReferenceExecutor::create()); }}, {"dpcpp", [] { @@ -284,7 +282,6 @@ std::shared_ptr setup_solver_factory(std::shared_ptroptions->GetString("mem_space") == "device") { M_host_ = LinearAlgebraFactory::create_matrix_sparse("default", n, n, nnz); } @@ -316,7 +313,6 @@ std::shared_ptr setup_solver_factory(std::shared_ptroptions->GetString("mem_space"); - std::cout << nlp_->options->GetString("ginkgo_exec") << " " << mem_space << std::endl; auto M = M_; if(mem_space == "device") { auto host = exec_->get_master(); @@ -383,7 +379,6 @@ std::shared_ptr setup_solver_factory(std::shared_ptr; using arr = gko::array; auto host = exec_->get_master();