Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,15 @@

#include <cuda/std/span>

namespace cuopt {
// Forward decl: solver_settings_t exposes a pointer to omp_atomic_t<int> (concurrent_halt) for
// inter-thread coordination across the solver pipeline. The full definition lives in
// utilities/omp_helpers.hpp and is included only by implementation files that read or write
// that flag — so external consumers of this public header don't need OpenMP.
template <typename T>
class omp_atomic_t;
} // namespace cuopt
Comment thread
akifcorduk marked this conversation as resolved.

namespace cuopt::linear_programming {

// Forward declare solver_settings_t for friend class
Expand Down Expand Up @@ -309,8 +318,9 @@ class pdlp_solver_settings_t {
int num_gpus{1};
method_t method{method_t::Concurrent};
bool inside_mip{false};
// For concurrent termination
std::atomic<int>* concurrent_halt{nullptr};
// For concurrent termination. Owned by the caller; nullptr disables cooperative halt. Driven
// through omp_atomic_t<int> so all reads/writes go through `#pragma omp atomic`.
cuopt::omp_atomic_t<int>* concurrent_halt{nullptr};
// Shared strong branching solved flags for cooperative DS + PDLP
cuda::std::span<std::atomic<int>> shared_sb_solved;
static constexpr f_t minimal_absolute_tolerance = 1.0e-12;
Expand Down
18 changes: 11 additions & 7 deletions cpp/src/barrier/sparse_cholesky.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -280,13 +280,17 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {

#if CUDSS_VERSION_MAJOR >= 0 && CUDSS_VERSION_MINOR >= 7
if (settings_.concurrent_halt != nullptr) {
CUDSS_CALL_AND_CHECK_EXIT(cudssDataSet(handle,
solverData,
CUDSS_DATA_USER_HOST_INTERRUPT,
(void*)settings_.concurrent_halt,
sizeof(int)),
status,
"cudssDataSet for interrupt");
// cuDSS polls the int directly via this raw pointer; pass the address of the underlying
// int rather than the omp_atomic_t<int> wrapper. omp_atomic_t<int> is standard-layout
// around a single int, so this is the same address — explicit for clarity.
CUDSS_CALL_AND_CHECK_EXIT(
cudssDataSet(handle,
solverData,
CUDSS_DATA_USER_HOST_INTERRUPT,
static_cast<void*>(&settings_.concurrent_halt->underlying()),
sizeof(int)),
status,
"cudssDataSet for interrupt");
}

if (settings_.cudss_deterministic) {
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/branch_and_bound/branch_and_bound.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ class branch_and_bound_t {

f_t get_lower_bound();
bool enable_concurrent_lp_root_solve() const { return enable_concurrent_lp_root_solve_; }
std::atomic<int>* get_root_concurrent_halt() { return &root_concurrent_halt_; }
omp_atomic_t<int>* get_root_concurrent_halt() { return &root_concurrent_halt_; }
void set_root_concurrent_halt(int value) { root_concurrent_halt_ = value; }
lp_status_t solve_root_relaxation(simplex_solver_settings_t<i_t, f_t> const& lp_settings,
lp_solution_t<i_t, f_t>& root_relax_soln,
Expand Down Expand Up @@ -227,8 +227,8 @@ class branch_and_bound_t {
omp_atomic_t<f_t> root_lp_current_lower_bound_;
omp_atomic_t<bool> solving_root_relaxation_{false};
bool enable_concurrent_lp_root_solve_{false};
std::atomic<int> root_concurrent_halt_{0};
std::atomic<int> node_concurrent_halt_{0};
omp_atomic_t<int> root_concurrent_halt_{0};
omp_atomic_t<int> node_concurrent_halt_{0};
bool is_root_solution_set{false};

// Pseudocosts
Expand Down
8 changes: 4 additions & 4 deletions cpp/src/branch_and_bound/pseudo_costs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -696,7 +696,7 @@ static void batch_pdlp_strong_branching_task(
const simplex_solver_settings_t<i_t, f_t>& settings,
i_t effective_batch_pdlp,
f_t start_time,
std::atomic<int>& concurrent_halt,
omp_atomic_t<int>& concurrent_halt,
const lp_problem_t<i_t, f_t>& original_lp,
const std::vector<i_t>& new_slacks,
const std::vector<f_t>& root_soln,
Expand Down Expand Up @@ -891,7 +891,7 @@ static void batch_pdlp_reliability_branching_task(
i_t rb_mode,
i_t num_candidates,
f_t start_time,
std::atomic<int>& concurrent_halt,
omp_atomic_t<int>& concurrent_halt,
const lp_problem_t<i_t, f_t>& original_lp,
const std::vector<i_t>& new_slacks,
const std::vector<f_t>& solution,
Expand Down Expand Up @@ -1051,7 +1051,7 @@ void strong_branching(const lp_problem_t<i_t, f_t>& original_lp,
shared_strong_branching_context_t<i_t, f_t> shared_ctx(2 * fractional.size());
shared_strong_branching_context_view_t<i_t, f_t> sb_view(shared_ctx.solved);

std::atomic<int> concurrent_halt{0};
omp_atomic_t<int> concurrent_halt{0};

std::vector<f_t> pdlp_obj_down(fractional.size(), std::numeric_limits<f_t>::quiet_NaN());
std::vector<f_t> pdlp_obj_up(fractional.size(), std::numeric_limits<f_t>::quiet_NaN());
Expand Down Expand Up @@ -1604,7 +1604,7 @@ i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
std::vector<f_t> pdlp_obj_down(num_candidates, std::numeric_limits<f_t>::quiet_NaN());
std::vector<f_t> pdlp_obj_up(num_candidates, std::numeric_limits<f_t>::quiet_NaN());

std::atomic<int> concurrent_halt{0};
omp_atomic_t<int> concurrent_halt{0};

if (use_pdlp) {
#pragma omp task default(shared)
Expand Down
5 changes: 3 additions & 2 deletions cpp/src/dual_simplex/simplex_solver_settings.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

#include <dual_simplex/logger.hpp>
#include <dual_simplex/types.hpp>
#include <utilities/omp_helpers.hpp>

#include <omp.h>
#include <algorithm>
Expand Down Expand Up @@ -224,8 +225,8 @@ struct simplex_solver_settings_t {
std::function<void(std::vector<f_t>&, std::vector<f_t>&, f_t)> set_simplex_solution_callback;
std::function<void(f_t)> dual_simplex_objective_callback; // Called with current dual obj
mutable logger_t log;
std::atomic<int>* concurrent_halt; // if nullptr ignored, if !nullptr, 0 if solver should
// continue, 1 if solver should halt
cuopt::omp_atomic_t<int>* concurrent_halt; // if nullptr ignored, if !nullptr, 0 if solver should
// continue, 1 if solver should halt
};

} // namespace cuopt::linear_programming::dual_simplex
3 changes: 2 additions & 1 deletion cpp/src/mip_heuristics/diversity/diversity_manager.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include <mip_heuristics/local_search/local_search.cuh>
#include <mip_heuristics/solution/solution.cuh>
#include <mip_heuristics/solver.cuh>
#include <utilities/omp_helpers.hpp>
#include <utilities/timer.hpp>

#include <cstdint>
Expand Down Expand Up @@ -100,7 +101,7 @@ class diversity_manager_t {
// mutex for the simplex solution update
std::mutex relaxed_solution_mutex;
// atomic for signalling pdlp to stop
std::atomic<int> global_concurrent_halt{0};
cuopt::omp_atomic_t<int> global_concurrent_halt{0};

rins_t<i_t, f_t> rins;

Expand Down
46 changes: 20 additions & 26 deletions cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cu
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,7 @@ fj_t<i_t, f_t>::fj_t(mip_solver_context_t<i_t, f_t>& context_, fj_settings_t in_
template <typename i_t, typename f_t>
void fj_t<i_t, f_t>::reset_cuda_graph()
{
if (graph_created) cudaGraphExecDestroy(graph_instance);
graph_created = false;
step_graph_.reset();
}

template <typename i_t, typename f_t>
Expand Down Expand Up @@ -682,18 +681,23 @@ void fj_t<i_t, f_t>::run_step_device(const rmm::cuda_stream_view& climber_stream
// Load-balanced codepath not updated yet to handle rounding mode
if (settings.mode == fj_mode_t::ROUNDING) { use_load_balancing = false; }

cudaGraph_t graph;
void* kernel_args[] = {&v};
bool force_reset = false;
void* reset_moves_args[] = {&v, &force_reset};
bool ignore_load_balancing = false;
void* update_assignment_args[] = {&v, &ignore_load_balancing};
if (!graph_created || !use_graph) {
// CUB temp storage initialization
size_t compaction_temp_storage_bytes = 0;
auto valid_move_iterator = thrust::make_transform_iterator(
thrust::counting_iterator<i_t>(0),
cuda::proclaim_return_type<i_t>([v] __device__(i_t i) -> i_t { return v.admits_move(i); }));

// CUB temp storage probe + resize is intentionally done OUTSIDE the
// captured region: the resize would allocate, which is forbidden during
// capture, and the probe itself is a pure size calculation. We only need
// to (re)compute it on first capture for graph mode, and every time for
// eager mode -- the temp-storage size depends on n_variables only and is
// stable across iterations otherwise.
size_t compaction_temp_storage_bytes = 0;
auto valid_move_iterator = thrust::make_transform_iterator(
thrust::counting_iterator<i_t>(0),
cuda::proclaim_return_type<i_t>([v] __device__(i_t i) -> i_t { return v.admits_move(i); }));
if (!step_graph_.is_initialized() || !use_graph) {
cub::DeviceSelect::Flagged((void*)nullptr,
compaction_temp_storage_bytes,
thrust::counting_iterator<i_t>(0),
Expand All @@ -705,10 +709,9 @@ void fj_t<i_t, f_t>::run_step_device(const rmm::cuda_stream_view& climber_stream
if (compaction_temp_storage_bytes > data.cub_storage_bytes.size()) {
data.cub_storage_bytes.resize(compaction_temp_storage_bytes, climber_stream);
}
}

if (use_graph) {
RAFT_CUDA_TRY(cudaStreamBeginCapture(climber_stream, cudaStreamCaptureModeThreadLocal));
}
auto step_body = [&]() {
for (i_t i = 0; i < (use_graph ? iterations_per_graph : 1); ++i) {
{
// related varialbe array has to be dynamically computed each iteration
Expand Down Expand Up @@ -806,22 +809,13 @@ void fj_t<i_t, f_t>::run_step_device(const rmm::cuda_stream_view& climber_stream
0,
climber_stream));
}
};

if (use_graph) {
RAFT_CUDA_TRY(cudaStreamEndCapture(climber_stream, &graph));
try {
RAFT_CUDA_TRY(cudaGraphInstantiate(&graph_instance, graph));
} catch (...) {
RAFT_CUDA_TRY(cudaGraphDestroy(graph));
throw;
}
RAFT_CHECK_CUDA(climber_stream);
RAFT_CUDA_TRY(cudaGraphDestroy(graph));
graph_created = true;
}
if (use_graph) {
step_graph_.run(climber_stream, step_body);
} else {
step_body();
}

if (use_graph) RAFT_CUDA_TRY(cudaGraphLaunch(graph_instance, climber_stream));
}

template <typename i_t, typename f_t>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <mip_heuristics/utils.cuh>

#include <utilities/event_handler.cuh>
#include <utilities/manual_cuda_graph.cuh>

#include <functional>

Expand Down Expand Up @@ -267,8 +268,7 @@ class fj_t {
rmm::device_uvector<fj_load_balancing_workid_mapping_t> work_id_to_nonbin_var_idx;
rmm::device_uvector<i_t> work_ids_for_related_vars;

cudaGraphExec_t graph_instance;
bool graph_created = false;
cuopt::manual_cuda_graph_t step_graph_;

// kernel launch dimensions, computed once inside the constructor
std::pair<dim3, dim3> setval_launch_dims;
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/mip_heuristics/mip_constants.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,6 @@
#define CUOPT_MIP_RINS_REQUIRED_THREAD_COUNT 4
#define CUOPT_MIP_BATCH_PDLP_REQUIRED_THREAD_COUNT 3
#define CUOPT_MIP_CLIQUE_CUTS_REQUIRED_THREAD_COUNT 3
// MIP-only gate: skip the concurrent barrier when fewer threads are available than this
// (1 PDLP + 1 dual simplex + 1 barrier). Stand-alone LP always runs all three.
#define CUOPT_CONCURRENT_LP_BARRIER_REQUIRED_THREAD_COUNT 3
17 changes: 9 additions & 8 deletions cpp/src/mip_heuristics/relaxed_lp/relaxed_lp.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,20 @@
#include <mip_heuristics/presolve/bounds_presolve.cuh>
#include <mip_heuristics/problem/problem.cuh>
#include <mip_heuristics/solution/solution.cuh>
#include <utilities/omp_helpers.hpp>
#include "lp_state.cuh"

namespace cuopt::linear_programming::detail {

struct relaxed_lp_settings_t {
double tolerance = 1e-4;
double time_limit = 1.0;
bool check_infeasibility = true;
bool return_first_feasible = false;
bool save_state = true;
bool per_constraint_residual = true;
bool has_initial_primal = true;
std::atomic<int>* concurrent_halt = nullptr;
double tolerance = 1e-4;
double time_limit = 1.0;
bool check_infeasibility = true;
bool return_first_feasible = false;
bool save_state = true;
bool per_constraint_residual = true;
bool has_initial_primal = true;
cuopt::omp_atomic_t<int>* concurrent_halt = nullptr;
};

template <typename i_t, typename f_t>
Expand Down
Loading
Loading