Skip to content

Commit dcf26b8

Browse files
authored
feat: plot cpu scaling (#919)
* remove print statements * Correct the time counting in google benchmarks so that the throughput is calculated correctly * Add plots
1 parent 14eac2c commit dcf26b8

File tree

11 files changed

+606
-117
lines changed

11 files changed

+606
-117
lines changed

tests/benchmarks/include/detray/benchmarks/cpu/propagation_benchmark.hpp

+29-3
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,11 @@
1919
// Benchmark include
2020
#include <benchmark/benchmark.h>
2121

22+
#ifdef _OPENMP
23+
// openMP include
24+
#include <omp.h>
25+
#endif
26+
2227
// System include(s)
2328
#include <algorithm>
2429
#include <cassert>
@@ -57,7 +62,10 @@ struct host_propagation_bm : public benchmark_base {
5762
const dvector<free_track_parameters<algebra_t>> *tracks,
5863
const typename propagator_t::detector_type *det, const bfield_t *bfield,
5964
const typename propagator_t::actor_chain_type::state_tuple
60-
*input_actor_states) const {
65+
*input_actor_states,
66+
[[maybe_unused]] const int n_threads,
67+
[[maybe_unused]] const int max_chunk_size,
68+
[[maybe_unused]] const int thread_schedule) const {
6169
using actor_chain_t = typename propagator_t::actor_chain_type;
6270
using actor_states_t = typename actor_chain_t::state_tuple;
6371

@@ -71,6 +79,22 @@ struct host_propagation_bm : public benchmark_base {
7179

7280
assert(static_cast<std::size_t>(n_samples) <= tracks->size());
7381

82+
#ifdef _OPENMP
83+
// Set the number of threads for the openMP parallel regions
84+
omp_set_num_threads(n_threads);
85+
// Clamp chunk size to [1, max_chunk_size]
86+
int chunk_size{
87+
math::min(static_cast<int>(n_samples / n_threads), max_chunk_size)};
88+
chunk_size = math::max(chunk_size, 1);
89+
omp_set_schedule(static_cast<omp_sched_t>(thread_schedule), chunk_size);
90+
#ifndef NDEBUG
91+
std::cout << "No. tracks " << n_samples << std::endl;
92+
std::cout << "No. threads " << n_threads << std::endl;
93+
std::cout << "Schedule type " << thread_schedule << std::endl;
94+
std::cout << "Chunk size " << chunk_size << std::endl;
95+
#endif
96+
#endif
97+
7498
// Create propagator
7599
propagator_t p{m_cfg.propagation()};
76100

@@ -99,6 +123,7 @@ struct host_propagation_bm : public benchmark_base {
99123
::benchmark::DoNotOptimize(
100124
p.propagate_sync(p_state, actor_state_refs));
101125
}
126+
assert(p.propagate_is_complete(p_state));
102127
};
103128

104129
// Warm-up
@@ -108,7 +133,7 @@ struct host_propagation_bm : public benchmark_base {
108133
stride = (stride == 0) ? 10 : stride;
109134
assert(stride > 0);
110135

111-
#pragma omp parallel for schedule(dynamic)
136+
#pragma omp parallel for
112137
for (int i = 0; i < n_samples; i += stride) {
113138
// The track gets copied into the stepper state, so that the
114139
// original track sample vector remains unchanged
@@ -126,12 +151,13 @@ struct host_propagation_bm : public benchmark_base {
126151
// https://github.com/google/benchmark/blob/main/docs/user_guide.md#custom-counters
127152
std::size_t total_tracks = 0u;
128153
for (auto _ : state) {
129-
#pragma omp parallel for schedule(dynamic)
154+
#pragma omp parallel for
130155
for (int i = 0; i < n_samples; ++i) {
131156
run_propagation((*tracks)[static_cast<std::size_t>(i)]);
132157
}
133158
total_tracks += static_cast<std::size_t>(n_samples);
134159
}
160+
135161
// Report throughput
136162
state.counters["TracksPropagated"] = benchmark::Counter(
137163
static_cast<double>(total_tracks), benchmark::Counter::kIsRate);

tests/benchmarks/include/detray/benchmarks/propagation_benchmark_utils.hpp

+27-12
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <benchmark/benchmark.h>
2323

2424
// System include(s)
25+
#include <thread>
2526
#include <type_traits>
2627
#include <utility>
2728
#include <vector>
@@ -145,17 +146,26 @@ inline void register_benchmark(
145146
dvector<free_track_parameters<typename detector_t::algebra_type>>>
146147
&track_samples,
147148
const std::vector<int> &n_samples = {10000},
148-
vecmem::memory_resource *dev_mr = nullptr) {
149+
vecmem::memory_resource *dev_mr = nullptr,
150+
const std::vector<int> &n_host_threads = {static_cast<int>(
151+
std::thread::hardware_concurrency())},
152+
int max_chunk_size = 1, int openmp_sched = 2) {
149153

150154
using algebra_t = typename detector_t::algebra_type;
151155
using propagation_benchmark_t =
152156
benchmark_t<propagator_t, bfield_bknd_t, kOPT>;
153157

154158
assert(track_samples.size() == n_samples.size());
155159

156-
for (const auto [i, n] : detray::views::enumerate(n_samples)) {
160+
const std::size_t bench_range{
161+
math::max(n_samples.size(), n_host_threads.size())};
162+
for (std::size_t i = 0u; i < bench_range; ++i) {
157163

158-
auto &tracks = track_samples[i];
164+
auto &tracks =
165+
track_samples.size() == 1u ? track_samples[0] : track_samples[i];
166+
int host_threads{n_host_threads.size() == 1u ? n_host_threads[0]
167+
: n_host_threads[i]};
168+
const int n{n_samples.size() == 1u ? n_samples[0] : n_samples[i]};
159169
assert(static_cast<std::size_t>(n) <= tracks.size());
160170

161171
bench_cfg.n_samples(n);
@@ -176,18 +186,19 @@ inline void register_benchmark(
176186
dvector<free_track_parameters<algebra_t>> *,
177187
const detector_t *, const bfield_bknd_t *,
178188
typename propagator_t::actor_chain_type::state_tuple
179-
*>) {
189+
*,
190+
int, int, int>) {
180191
// Cpu benchmark
181-
::benchmark::RegisterBenchmark(bench_name.c_str(), prop_benchmark,
182-
&tracks, &det, &bfield,
183-
actor_states);
184-
//->MeasureProcessCPUTime();
192+
::benchmark::RegisterBenchmark(
193+
bench_name.c_str(), prop_benchmark, &tracks, &det, &bfield,
194+
actor_states, host_threads, max_chunk_size, openmp_sched)
195+
->UseRealTime();
185196
} else {
186197
// Device benchmark
187198
::benchmark::RegisterBenchmark(bench_name.c_str(), prop_benchmark,
188199
dev_mr, &tracks, &det, &bfield,
189-
actor_states);
190-
//->MeasureProcessCPUTime();
200+
actor_states)
201+
->UseRealTime();
191202
}
192203
}
193204
}
@@ -210,13 +221,17 @@ inline void register_benchmark(
210221
std::vector<
211222
dvector<free_track_parameters<typename detector_t::algebra_type>>>
212223
&tracks,
213-
const std::vector<int> &n_samples = {10000}) {
224+
const std::vector<int> &n_samples = {10000},
225+
const std::vector<int> &n_host_threads = {static_cast<int>(
226+
std::thread::hardware_concurrency())},
227+
int max_chunk_size = 1, int openmp_sched = 2) {
214228

215229
using propagator_t =
216230
propagator<stepper_t, navigator<detector_t>, actor_chain_t>;
217231
register_benchmark<benchmark_t, propagator_t, detector_t, bfield_bknd_t,
218232
kOPT>(name, bench_cfg, prop_cfg, det, bfield,
219-
actor_states, tracks, n_samples, nullptr);
233+
actor_states, tracks, n_samples, nullptr,
234+
n_host_threads, max_chunk_size, openmp_sched);
220235
}
221236

222237
} // namespace detray::benchmarks

tests/include/detray/test/utils/simulation/event_generator/random_numbers.hpp

+3-1
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,9 @@ struct random_numbers {
7575

7676
/// Explicit normal distribution around a @param mean and @param stddev
7777
DETRAY_HOST auto normal(const scalar_t mean, const scalar_t stddev) {
78-
return std::normal_distribution<scalar_t>(mean, stddev)(m_engine);
78+
return stddev == scalar_t{0}
79+
? mean
80+
: std::normal_distribution<scalar_t>(mean, stddev)(m_engine);
7981
}
8082

8183
/// 50:50 coin toss

tests/tools/python/impl/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
prepare_benchmark_data,
44
plot_benchmark_case,
55
plot_benchmark_data,
6+
plot_scaling_data,
67
)
78
from .plot_navigation_validation import (
89
read_scan_data,

0 commit comments

Comments
 (0)