NVIDIA · akifcorduk · May 19, 2026 · May 20, 2026 · May 20, 2026 · May 20, 2026
@@ -21,6 +21,15 @@
 
 #include <cuda/std/span>
 
+namespace cuopt {
+// Forward decl: solver_settings_t exposes a pointer to omp_atomic_t<int> (concurrent_halt) for
+// inter-thread coordination across the solver pipeline. The full definition lives in
+// utilities/omp_helpers.hpp and is included only by implementation files that read or write
+// that flag — so external consumers of this public header don't need OpenMP.
+template <typename T>
+class omp_atomic_t;
+}  // namespace cuopt
+
 namespace cuopt::linear_programming {
 
 // Forward declare solver_settings_t for friend class
@@ -309,8 +318,9 @@ class pdlp_solver_settings_t {
   int num_gpus{1};
   method_t method{method_t::Concurrent};
   bool inside_mip{false};
-  // For concurrent termination
-  std::atomic<int>* concurrent_halt{nullptr};
+  // For concurrent termination. Owned by the caller; nullptr disables cooperative halt. Driven
+  // through omp_atomic_t<int> so all reads/writes go through `#pragma omp atomic`.
+  cuopt::omp_atomic_t<int>* concurrent_halt{nullptr};
   // Shared strong branching solved flags for cooperative DS + PDLP
   cuda::std::span<std::atomic<int>> shared_sb_solved;
   static constexpr f_t minimal_absolute_tolerance = 1.0e-12;

@@ -280,13 +280,17 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
 
 #if CUDSS_VERSION_MAJOR >= 0 && CUDSS_VERSION_MINOR >= 7
     if (settings_.concurrent_halt != nullptr) {
-      CUDSS_CALL_AND_CHECK_EXIT(cudssDataSet(handle,
-                                             solverData,
-                                             CUDSS_DATA_USER_HOST_INTERRUPT,
-                                             (void*)settings_.concurrent_halt,
-                                             sizeof(int)),
-                                status,
-                                "cudssDataSet for interrupt");
+      // cuDSS polls the int directly via this raw pointer; pass the address of the underlying
+      // int rather than the omp_atomic_t<int> wrapper. omp_atomic_t<int> is standard-layout
+      // around a single int, so this is the same address — explicit for clarity.
+      CUDSS_CALL_AND_CHECK_EXIT(
+        cudssDataSet(handle,
+                     solverData,
+                     CUDSS_DATA_USER_HOST_INTERRUPT,
+                     static_cast<void*>(&settings_.concurrent_halt->underlying()),
+                     sizeof(int)),
+        status,
+        "cudssDataSet for interrupt");
     }
 
     if (settings_.cudss_deterministic) {

@@ -135,7 +135,7 @@ class branch_and_bound_t {
 
   f_t get_lower_bound();
   bool enable_concurrent_lp_root_solve() const { return enable_concurrent_lp_root_solve_; }
-  std::atomic<int>* get_root_concurrent_halt() { return &root_concurrent_halt_; }
+  omp_atomic_t<int>* get_root_concurrent_halt() { return &root_concurrent_halt_; }
   void set_root_concurrent_halt(int value) { root_concurrent_halt_ = value; }
   lp_status_t solve_root_relaxation(simplex_solver_settings_t<i_t, f_t> const& lp_settings,
                                     lp_solution_t<i_t, f_t>& root_relax_soln,
@@ -227,8 +227,8 @@ class branch_and_bound_t {
   omp_atomic_t<f_t> root_lp_current_lower_bound_;
   omp_atomic_t<bool> solving_root_relaxation_{false};
   bool enable_concurrent_lp_root_solve_{false};
-  std::atomic<int> root_concurrent_halt_{0};
-  std::atomic<int> node_concurrent_halt_{0};
+  omp_atomic_t<int> root_concurrent_halt_{0};
+  omp_atomic_t<int> node_concurrent_halt_{0};
   bool is_root_solution_set{false};
 
   // Pseudocosts

@@ -696,7 +696,7 @@ static void batch_pdlp_strong_branching_task(
   const simplex_solver_settings_t<i_t, f_t>& settings,
   i_t effective_batch_pdlp,
   f_t start_time,
-  std::atomic<int>& concurrent_halt,
+  omp_atomic_t<int>& concurrent_halt,
   const lp_problem_t<i_t, f_t>& original_lp,
   const std::vector<i_t>& new_slacks,
   const std::vector<f_t>& root_soln,
@@ -891,7 +891,7 @@ static void batch_pdlp_reliability_branching_task(
   i_t rb_mode,
   i_t num_candidates,
   f_t start_time,
-  std::atomic<int>& concurrent_halt,
+  omp_atomic_t<int>& concurrent_halt,
   const lp_problem_t<i_t, f_t>& original_lp,
   const std::vector<i_t>& new_slacks,
   const std::vector<f_t>& solution,
@@ -1051,7 +1051,7 @@ void strong_branching(const lp_problem_t<i_t, f_t>& original_lp,
   shared_strong_branching_context_t<i_t, f_t> shared_ctx(2 * fractional.size());
   shared_strong_branching_context_view_t<i_t, f_t> sb_view(shared_ctx.solved);
 
-  std::atomic<int> concurrent_halt{0};
+  omp_atomic_t<int> concurrent_halt{0};
 
   std::vector<f_t> pdlp_obj_down(fractional.size(), std::numeric_limits<f_t>::quiet_NaN());
   std::vector<f_t> pdlp_obj_up(fractional.size(), std::numeric_limits<f_t>::quiet_NaN());
@@ -1604,7 +1604,7 @@ i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
   std::vector<f_t> pdlp_obj_down(num_candidates, std::numeric_limits<f_t>::quiet_NaN());
   std::vector<f_t> pdlp_obj_up(num_candidates, std::numeric_limits<f_t>::quiet_NaN());
 
-  std::atomic<int> concurrent_halt{0};
+  omp_atomic_t<int> concurrent_halt{0};
 
   if (use_pdlp) {
 #pragma omp task default(shared)

@@ -9,6 +9,7 @@
 
 #include <dual_simplex/logger.hpp>
 #include <dual_simplex/types.hpp>
+#include <utilities/omp_helpers.hpp>
 
 #include <omp.h>
 #include <algorithm>
@@ -224,8 +225,8 @@ struct simplex_solver_settings_t {
   std::function<void(std::vector<f_t>&, std::vector<f_t>&, f_t)> set_simplex_solution_callback;
   std::function<void(f_t)> dual_simplex_objective_callback;  // Called with current dual obj
   mutable logger_t log;
-  std::atomic<int>* concurrent_halt;  // if nullptr ignored, if !nullptr, 0 if solver should
-                                      // continue, 1 if solver should halt
+  cuopt::omp_atomic_t<int>* concurrent_halt;  // if nullptr ignored, if !nullptr, 0 if solver should
+                                              // continue, 1 if solver should halt
 };
 
 }  // namespace cuopt::linear_programming::dual_simplex
@@ -25,6 +25,7 @@
 #include <mip_heuristics/local_search/local_search.cuh>
 #include <mip_heuristics/solution/solution.cuh>
 #include <mip_heuristics/solver.cuh>
+#include <utilities/omp_helpers.hpp>
 #include <utilities/timer.hpp>
 
 #include <cstdint>
@@ -100,7 +101,7 @@ class diversity_manager_t {
   // mutex for the simplex solution update
   std::mutex relaxed_solution_mutex;
   // atomic for signalling pdlp to stop
-  std::atomic<int> global_concurrent_halt{0};
+  cuopt::omp_atomic_t<int> global_concurrent_halt{0};
 
   rins_t<i_t, f_t> rins;
 

@@ -108,8 +108,7 @@ fj_t<i_t, f_t>::fj_t(mip_solver_context_t<i_t, f_t>& context_, fj_settings_t in_
 template <typename i_t, typename f_t>
 void fj_t<i_t, f_t>::reset_cuda_graph()
 {
-  if (graph_created) cudaGraphExecDestroy(graph_instance);
-  graph_created = false;
+  step_graph_.reset();
 }
 
 template <typename i_t, typename f_t>
@@ -682,18 +681,23 @@ void fj_t<i_t, f_t>::run_step_device(const rmm::cuda_stream_view& climber_stream
   // Load-balanced codepath not updated yet to handle rounding mode
   if (settings.mode == fj_mode_t::ROUNDING) { use_load_balancing = false; }
 
-  cudaGraph_t graph;
   void* kernel_args[]            = {&v};
   bool force_reset               = false;
   void* reset_moves_args[]       = {&v, &force_reset};
   bool ignore_load_balancing     = false;
   void* update_assignment_args[] = {&v, &ignore_load_balancing};
-  if (!graph_created || !use_graph) {
-    // CUB temp storage initialization
-    size_t compaction_temp_storage_bytes = 0;
-    auto valid_move_iterator             = thrust::make_transform_iterator(
-      thrust::counting_iterator<i_t>(0),
-      cuda::proclaim_return_type<i_t>([v] __device__(i_t i) -> i_t { return v.admits_move(i); }));
+
+  // CUB temp storage probe + resize is intentionally done OUTSIDE the
+  // captured region: the resize would allocate, which is forbidden during
+  // capture, and the probe itself is a pure size calculation. We only need
+  // to (re)compute it on first capture for graph mode, and every time for
+  // eager mode -- the temp-storage size depends on n_variables only and is
+  // stable across iterations otherwise.
+  size_t compaction_temp_storage_bytes = 0;
+  auto valid_move_iterator             = thrust::make_transform_iterator(
+    thrust::counting_iterator<i_t>(0),
+    cuda::proclaim_return_type<i_t>([v] __device__(i_t i) -> i_t { return v.admits_move(i); }));
+  if (!step_graph_.is_initialized() || !use_graph) {
     cub::DeviceSelect::Flagged((void*)nullptr,
                                compaction_temp_storage_bytes,
                                thrust::counting_iterator<i_t>(0),
@@ -705,10 +709,9 @@ void fj_t<i_t, f_t>::run_step_device(const rmm::cuda_stream_view& climber_stream
     if (compaction_temp_storage_bytes > data.cub_storage_bytes.size()) {
       data.cub_storage_bytes.resize(compaction_temp_storage_bytes, climber_stream);
     }
+  }
 
-    if (use_graph) {
-      RAFT_CUDA_TRY(cudaStreamBeginCapture(climber_stream, cudaStreamCaptureModeThreadLocal));
-    }
+  auto step_body = [&]() {
     for (i_t i = 0; i < (use_graph ? iterations_per_graph : 1); ++i) {
       {
         // related varialbe array has to be dynamically computed each iteration
@@ -806,22 +809,13 @@ void fj_t<i_t, f_t>::run_step_device(const rmm::cuda_stream_view& climber_stream
                                      0,
                                      climber_stream));
     }
+  };
 
-    if (use_graph) {
-      RAFT_CUDA_TRY(cudaStreamEndCapture(climber_stream, &graph));
-      try {
-        RAFT_CUDA_TRY(cudaGraphInstantiate(&graph_instance, graph));
-      } catch (...) {
-        RAFT_CUDA_TRY(cudaGraphDestroy(graph));
-        throw;
-      }
-      RAFT_CHECK_CUDA(climber_stream);
-      RAFT_CUDA_TRY(cudaGraphDestroy(graph));
-      graph_created = true;
-    }
+  if (use_graph) {
+    step_graph_.run(climber_stream, step_body);
+  } else {
+    step_body();
   }
-
-  if (use_graph) RAFT_CUDA_TRY(cudaGraphLaunch(graph_instance, climber_stream));
 }
 
 template <typename i_t, typename f_t>

@@ -18,6 +18,7 @@
 #include <mip_heuristics/utils.cuh>
 
 #include <utilities/event_handler.cuh>
+#include <utilities/manual_cuda_graph.cuh>
 
 #include <functional>
 
@@ -267,8 +268,7 @@ class fj_t {
   rmm::device_uvector<fj_load_balancing_workid_mapping_t> work_id_to_nonbin_var_idx;
   rmm::device_uvector<i_t> work_ids_for_related_vars;
 
-  cudaGraphExec_t graph_instance;
-  bool graph_created = false;
+  cuopt::manual_cuda_graph_t step_graph_;
 
   // kernel launch dimensions, computed once inside the constructor
   std::pair<dim3, dim3> setval_launch_dims;

@@ -21,3 +21,6 @@
 #define CUOPT_MIP_RINS_REQUIRED_THREAD_COUNT        4
 #define CUOPT_MIP_BATCH_PDLP_REQUIRED_THREAD_COUNT  3
 #define CUOPT_MIP_CLIQUE_CUTS_REQUIRED_THREAD_COUNT 3
+// MIP-only gate: skip the concurrent barrier when fewer threads are available than this
+// (1 PDLP + 1 dual simplex + 1 barrier). Stand-alone LP always runs all three.
+#define CUOPT_CONCURRENT_LP_BARRIER_REQUIRED_THREAD_COUNT 3
@@ -12,19 +12,20 @@
 #include <mip_heuristics/presolve/bounds_presolve.cuh>
 #include <mip_heuristics/problem/problem.cuh>
 #include <mip_heuristics/solution/solution.cuh>
+#include <utilities/omp_helpers.hpp>
 #include "lp_state.cuh"
 
 namespace cuopt::linear_programming::detail {
 
 struct relaxed_lp_settings_t {
-  double tolerance                  = 1e-4;
-  double time_limit                 = 1.0;
-  bool check_infeasibility          = true;
-  bool return_first_feasible        = false;
-  bool save_state                   = true;
-  bool per_constraint_residual      = true;
-  bool has_initial_primal           = true;
-  std::atomic<int>* concurrent_halt = nullptr;
+  double tolerance                          = 1e-4;
+  double time_limit                         = 1.0;
+  bool check_infeasibility                  = true;
+  bool return_first_feasible                = false;
+  bool save_state                           = true;
+  bool per_constraint_residual              = true;
+  bool has_initial_primal                   = true;
+  cuopt::omp_atomic_t<int>* concurrent_halt = nullptr;
 };
 
 template <typename i_t, typename f_t>