22
22
#include < benchmark/benchmark.h>
23
23
24
24
// System include(s)
25
+ #include < thread>
25
26
#include < type_traits>
26
27
#include < utility>
27
28
#include < vector>
@@ -145,17 +146,26 @@ inline void register_benchmark(
145
146
dvector<free_track_parameters<typename detector_t ::algebra_type>>>
146
147
&track_samples,
147
148
const std::vector<int > &n_samples = {10000 },
148
- vecmem::memory_resource *dev_mr = nullptr ) {
149
+ vecmem::memory_resource *dev_mr = nullptr ,
150
+ const std::vector<int > &n_host_threads = {static_cast <int >(
151
+ std::thread::hardware_concurrency ())},
152
+ int max_chunk_size = 1 , int openmp_sched = 2 ) {
149
153
150
154
using algebra_t = typename detector_t ::algebra_type;
151
155
using propagation_benchmark_t =
152
156
benchmark_t <propagator_t , bfield_bknd_t , kOPT >;
153
157
154
158
assert (track_samples.size () == n_samples.size ());
155
159
156
- for (const auto [i, n] : detray::views::enumerate (n_samples)) {
160
+ const std::size_t bench_range{
161
+ math::max (n_samples.size (), n_host_threads.size ())};
162
+ for (std::size_t i = 0u ; i < bench_range; ++i) {
157
163
158
- auto &tracks = track_samples[i];
164
+ auto &tracks =
165
+ track_samples.size () == 1u ? track_samples[0 ] : track_samples[i];
166
+ int host_threads{n_host_threads.size () == 1u ? n_host_threads[0 ]
167
+ : n_host_threads[i]};
168
+ const int n{n_samples.size () == 1u ? n_samples[0 ] : n_samples[i]};
159
169
assert (static_cast <std::size_t >(n) <= tracks.size ());
160
170
161
171
bench_cfg.n_samples (n);
@@ -176,18 +186,19 @@ inline void register_benchmark(
176
186
dvector<free_track_parameters<algebra_t >> *,
177
187
const detector_t *, const bfield_bknd_t *,
178
188
typename propagator_t ::actor_chain_type::state_tuple
179
- *>) {
189
+ *,
190
+ int , int , int >) {
180
191
// Cpu benchmark
181
- ::benchmark::RegisterBenchmark (bench_name.c_str(), prop_benchmark,
182
- &tracks, &det, &bfield,
183
- actor_states);
184
- // ->MeasureProcessCPUTime ();
192
+ ::benchmark::RegisterBenchmark (
193
+ bench_name.c_str(), prop_benchmark, &tracks, &det, &bfield,
194
+ actor_states, host_threads, max_chunk_size, openmp_sched)
195
+ ->UseRealTime ();
185
196
} else {
186
197
// Device benchmark
187
198
::benchmark::RegisterBenchmark (bench_name.c_str(), prop_benchmark,
188
199
dev_mr, &tracks, &det, &bfield,
189
- actor_states);
190
- // ->MeasureProcessCPUTime ();
200
+ actor_states)
201
+ ->UseRealTime ();
191
202
}
192
203
}
193
204
}
@@ -210,13 +221,17 @@ inline void register_benchmark(
210
221
std::vector<
211
222
dvector<free_track_parameters<typename detector_t ::algebra_type>>>
212
223
&tracks,
213
- const std::vector<int > &n_samples = {10000 }) {
224
+ const std::vector<int > &n_samples = {10000 },
225
+ const std::vector<int > &n_host_threads = {static_cast <int >(
226
+ std::thread::hardware_concurrency ())},
227
+ int max_chunk_size = 1 , int openmp_sched = 2 ) {
214
228
215
229
using propagator_t =
216
230
propagator<stepper_t , navigator<detector_t >, actor_chain_t >;
217
231
register_benchmark<benchmark_t , propagator_t , detector_t , bfield_bknd_t ,
218
232
kOPT >(name, bench_cfg, prop_cfg, det, bfield,
219
- actor_states, tracks, n_samples, nullptr );
233
+ actor_states, tracks, n_samples, nullptr ,
234
+ n_host_threads, max_chunk_size, openmp_sched);
220
235
}
221
236
222
237
} // namespace detray::benchmarks
0 commit comments