acts-project
diff --git a/‎tests/benchmarks/include/detray/benchmarks/cpu/propagation_benchmark.hpp
+29-3 b/‎tests/benchmarks/include/detray/benchmarks/cpu/propagation_benchmark.hpp
+29-3
diff --git a/‎tests/benchmarks/include/detray/benchmarks/propagation_benchmark_utils.hpp
+27-12 b/‎tests/benchmarks/include/detray/benchmarks/propagation_benchmark_utils.hpp
+27-12
diff --git a/‎tests/include/detray/test/utils/simulation/event_generator/random_numbers.hpp
+3-1 b/‎tests/include/detray/test/utils/simulation/event_generator/random_numbers.hpp
+3-1
diff --git a/‎tests/tools/python/impl/__init__.py
+1 b/‎tests/tools/python/impl/__init__.py
+1
@@ -19,6 +19,11 @@
 // Benchmark include
 #include <benchmark/benchmark.h>
 
+#ifdef _OPENMP
+// openMP include
+#include <omp.h>
+#endif
+
 // System include(s)
 #include <algorithm>
 #include <cassert>
@@ -57,7 +62,10 @@ struct host_propagation_bm : public benchmark_base {
         const dvector<free_track_parameters<algebra_t>> *tracks,
         const typename propagator_t::detector_type *det, const bfield_t *bfield,
         const typename propagator_t::actor_chain_type::state_tuple
-            *input_actor_states) const {
+            *input_actor_states,
+        [[maybe_unused]] const int n_threads,
+        [[maybe_unused]] const int max_chunk_size,
+        [[maybe_unused]] const int thread_schedule) const {
         using actor_chain_t = typename propagator_t::actor_chain_type;
         using actor_states_t = typename actor_chain_t::state_tuple;
 
@@ -71,6 +79,22 @@ struct host_propagation_bm : public benchmark_base {
 
         assert(static_cast<std::size_t>(n_samples) <= tracks->size());
 
+#ifdef _OPENMP
+        // Set the number of threads for the openMP parallel regions
+        omp_set_num_threads(n_threads);
+        // Clamp chunk size to [1, max_chunk_size]
+        int chunk_size{
+            math::min(static_cast<int>(n_samples / n_threads), max_chunk_size)};
+        chunk_size = math::max(chunk_size, 1);
+        omp_set_schedule(static_cast<omp_sched_t>(thread_schedule), chunk_size);
+#ifndef NDEBUG
+        std::cout << "No. tracks " << n_samples << std::endl;
+        std::cout << "No. threads " << n_threads << std::endl;
+        std::cout << "Schedule type " << thread_schedule << std::endl;
+        std::cout << "Chunk size " << chunk_size << std::endl;
+#endif
+#endif
+
         // Create propagator
         propagator_t p{m_cfg.propagation()};
 
@@ -99,6 +123,7 @@ struct host_propagation_bm : public benchmark_base {
                 ::benchmark::DoNotOptimize(
                     p.propagate_sync(p_state, actor_state_refs));
             }
+            assert(p.propagate_is_complete(p_state));
         };
 
         // Warm-up
@@ -108,7 +133,7 @@ struct host_propagation_bm : public benchmark_base {
             stride = (stride == 0) ? 10 : stride;
             assert(stride > 0);
 
-#pragma omp parallel for schedule(dynamic)
+#pragma omp parallel for
             for (int i = 0; i < n_samples; i += stride) {
                 // The track gets copied into the stepper state, so that the
                 // original track sample vector remains unchanged
@@ -126,12 +151,13 @@ struct host_propagation_bm : public benchmark_base {
         // https://github.com/google/benchmark/blob/main/docs/user_guide.md#custom-counters
         std::size_t total_tracks = 0u;
         for (auto _ : state) {
-#pragma omp parallel for schedule(dynamic)
+#pragma omp parallel for
             for (int i = 0; i < n_samples; ++i) {
                 run_propagation((*tracks)[static_cast<std::size_t>(i)]);
             }
             total_tracks += static_cast<std::size_t>(n_samples);
         }
+
         // Report throughput
         state.counters["TracksPropagated"] = benchmark::Counter(
             static_cast<double>(total_tracks), benchmark::Counter::kIsRate);
 
@@ -22,6 +22,7 @@
 #include <benchmark/benchmark.h>
 
 // System include(s)
+#include <thread>
 #include <type_traits>
 #include <utility>
 #include <vector>
@@ -145,17 +146,26 @@ inline void register_benchmark(
         dvector<free_track_parameters<typename detector_t::algebra_type>>>
         &track_samples,
     const std::vector<int> &n_samples = {10000},
-    vecmem::memory_resource *dev_mr = nullptr) {
+    vecmem::memory_resource *dev_mr = nullptr,
+    const std::vector<int> &n_host_threads = {static_cast<int>(
+        std::thread::hardware_concurrency())},
+    int max_chunk_size = 1, int openmp_sched = 2) {
 
     using algebra_t = typename detector_t::algebra_type;
     using propagation_benchmark_t =
         benchmark_t<propagator_t, bfield_bknd_t, kOPT>;
 
     assert(track_samples.size() == n_samples.size());
 
-    for (const auto [i, n] : detray::views::enumerate(n_samples)) {
+    const std::size_t bench_range{
+        math::max(n_samples.size(), n_host_threads.size())};
+    for (std::size_t i = 0u; i < bench_range; ++i) {
 
-        auto &tracks = track_samples[i];
+        auto &tracks =
+            track_samples.size() == 1u ? track_samples[0] : track_samples[i];
+        int host_threads{n_host_threads.size() == 1u ? n_host_threads[0]
+                                                     : n_host_threads[i]};
+        const int n{n_samples.size() == 1u ? n_samples[0] : n_samples[i]};
         assert(static_cast<std::size_t>(n) <= tracks.size());
 
         bench_cfg.n_samples(n);
@@ -176,18 +186,19 @@ inline void register_benchmark(
                           dvector<free_track_parameters<algebra_t>> *,
                           const detector_t *, const bfield_bknd_t *,
                           typename propagator_t::actor_chain_type::state_tuple
-                              *>) {
+                              *,
+                          int, int, int>) {
             // Cpu benchmark
-            ::benchmark::RegisterBenchmark(bench_name.c_str(), prop_benchmark,
-                                           &tracks, &det, &bfield,
-                                           actor_states);
-            //->MeasureProcessCPUTime();
+            ::benchmark::RegisterBenchmark(
+                bench_name.c_str(), prop_benchmark, &tracks, &det, &bfield,
+                actor_states, host_threads, max_chunk_size, openmp_sched)
+                ->UseRealTime();
         } else {
             // Device benchmark
             ::benchmark::RegisterBenchmark(bench_name.c_str(), prop_benchmark,
                                            dev_mr, &tracks, &det, &bfield,
-                                           actor_states);
-            //->MeasureProcessCPUTime();
+                                           actor_states)
+                ->UseRealTime();
         }
     }
 }
@@ -210,13 +221,17 @@ inline void register_benchmark(
     std::vector<
         dvector<free_track_parameters<typename detector_t::algebra_type>>>
         &tracks,
-    const std::vector<int> &n_samples = {10000}) {
+    const std::vector<int> &n_samples = {10000},
+    const std::vector<int> &n_host_threads = {static_cast<int>(
+        std::thread::hardware_concurrency())},
+    int max_chunk_size = 1, int openmp_sched = 2) {
 
     using propagator_t =
         propagator<stepper_t, navigator<detector_t>, actor_chain_t>;
     register_benchmark<benchmark_t, propagator_t, detector_t, bfield_bknd_t,
                        kOPT>(name, bench_cfg, prop_cfg, det, bfield,
-                             actor_states, tracks, n_samples, nullptr);
+                             actor_states, tracks, n_samples, nullptr,
+                             n_host_threads, max_chunk_size, openmp_sched);
 }
 
 }  // namespace detray::benchmarks
@@ -75,7 +75,9 @@ struct random_numbers {
 
     /// Explicit normal distribution around a @param mean and @param stddev
     DETRAY_HOST auto normal(const scalar_t mean, const scalar_t stddev) {
-        return std::normal_distribution<scalar_t>(mean, stddev)(m_engine);
+        return stddev == scalar_t{0}
+                   ? mean
+                   : std::normal_distribution<scalar_t>(mean, stddev)(m_engine);
     }
 
     /// 50:50 coin toss
 
@@ -3,6 +3,7 @@
     prepare_benchmark_data,
     plot_benchmark_case,
     plot_benchmark_data,
+    plot_scaling_data,
 )
 from .plot_navigation_validation import (
     read_scan_data,
Original file line number	Diff line number	Diff line change
`@@ -75,7 +75,9 @@ struct random_numbers {`
`75`	`75`
`76`	`76`	`/// Explicit normal distribution around a @param mean and @param stddev`
`77`	`77`	`DETRAY_HOST auto normal(const scalar_t mean, const scalar_t stddev) {`
`78`		`- return std::normal_distribution<scalar_t>(mean, stddev)(m_engine);`
	`78`	`+ return stddev == scalar_t{0}`
	`79`	`+ ? mean`
	`80`	`+ : std::normal_distribution<scalar_t>(mean, stddev)(m_engine);`
`79`	`81`	`}`
`80`	`82`
`81`	`83`	`/// 50:50 coin toss`
Original file line number	Diff line number	Diff line change
`@@ -3,6 +3,7 @@`
`3`	`3`	`prepare_benchmark_data,`
`4`	`4`	`plot_benchmark_case,`
`5`	`5`	`plot_benchmark_data,`
	`6`	`+ plot_scaling_data,`
`6`	`7`	`)`
`7`	`8`	`from .plot_navigation_validation import (`
`8`	`9`	`read_scan_data,`