Commit 66cbfbf: [WIP] moar reduce
Parent: b240e36

24 files changed, +389 / -208 lines

.vscode/launch.json
Lines changed: 11 additions & 0 deletions

@@ -47,6 +47,17 @@
       "initCommands": ["settings set target.disable-aslr false"],
       "args": "${input:CXX_PROGRAM_ARGS}",
     },
+    {
+      "name": "CUDA Current Target (cuda-gdb)",
+      "type": "cuda-gdb",
+      "request": "launch",
+      "stopAtEntry": false,
+      "breakOnLaunch": false,
+      "internalConsoleOptions": "neverOpen",
+      "program": "${command:cmake.launchTargetPath}",
+      "cwd": "${command:cmake.launchTargetDirectory}",
+      "args": "${input:CXX_PROGRAM_ARGS}",
+    },
   ],
   "inputs": [
     {

examples/nvexec/maxwell/cuda.cuh
Lines changed: 3 additions & 3 deletions

@@ -48,13 +48,13 @@ void run_cuda(
   cudaStream_t stream{};
   cudaStreamCreate(&stream);

-  kernel<block_threads><<<grid_blocks, block_threads, 0, stream>>>(cells, initializer);
+  ::kernel<block_threads><<<grid_blocks, block_threads, 0, stream>>>(cells, initializer);
   STDEXEC_DBG_ERR(cudaStreamSynchronize(stream));

   report_performance(grid.cells, n_iterations, method, [&]() {
     for (std::size_t compute_step = 0; compute_step < n_iterations; compute_step++) {
-      kernel<block_threads><<<grid_blocks, block_threads, 0, stream>>>(cells, h_updater);
-      kernel<block_threads><<<grid_blocks, block_threads, 0, stream>>>(cells, e_updater);
+      ::kernel<block_threads><<<grid_blocks, block_threads, 0, stream>>>(cells, h_updater);
+      ::kernel<block_threads><<<grid_blocks, block_threads, 0, stream>>>(cells, e_updater);
     }
     writer(false);
     STDEXEC_DBG_ERR(cudaStreamSynchronize(stream));
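
Note: the example's free-standing kernel launches are now qualified with `::` so that unqualified name lookup cannot pick up (or clash with) one of the `kernel` templates that nvexec's detail namespaces also define. A minimal sketch of the kind of ambiguity this avoids, using toy names rather than the real nvexec ones:

// Toy illustration (not the real nvexec names): with a using-directive in
// scope, an unqualified call to `kernel` can become ambiguous between the
// example's global-namespace kernel and a library kernel template. Writing
// ::kernel pins the lookup to the global namespace.
#include <cstdio>

template <int BlockThreads>
void kernel(int n) {                 // the example's own kernel
  std::printf("global kernel, n=%d\n", n);
}

namespace lib {                      // stands in for a library detail namespace
  template <int BlockThreads>
  void kernel(int) {
    std::printf("library kernel\n");
  }
}

using namespace lib;

int main() {
  // kernel<256>(1);                 // error: ambiguous between ::kernel and lib::kernel
  ::kernel<256>(1);                  // OK: explicitly the global-namespace kernel
}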

examples/nvexec/reduce.cpp
Lines changed: 17 additions & 11 deletions

@@ -22,20 +22,26 @@
 #include <cstdio>
 #include <span>

-namespace ex = stdexec;
-
 int main() {
-  // const int n = 2 * 1024;
-  // thrust::device_vector<float> input(n, 1.0f);
-  // float* first = thrust::raw_pointer_cast(input.data());
-  // float* last = thrust::raw_pointer_cast(input.data()) + input.size();
+  const int n = 2 * 1024;
+  thrust::device_vector<float> input(n, 1.0f);
+  float* first = thrust::raw_pointer_cast(input.data());
+  float* last = thrust::raw_pointer_cast(input.data()) + input.size();
+
+  nvexec::stream_context stream_ctx{};
+  stdexec::scheduler auto stream_sched = stream_ctx.get_scheduler();

-  // nvexec::stream_context stream_ctx{};
+  auto snd = //
+    stdexec::v2::on(
+      stream_sched,
+      stdexec::just(std::span{first, last}) | nvexec::reduce(42.0f));

-  // auto snd = ex::transfer_just(stream_ctx.get_scheduler(), std::span{first, last})
-  //   | nvexec::reduce(42.0f);
+  // // BUGBUG this doesn't work:
+  // auto snd = //
+  //   stdexec::just(std::span{first, last})
+  //   | stdexec::v2::continue_on(stream_sched, nvexec::reduce(42.0f));

-  // auto [result] = stdexec::sync_wait(std::move(snd)).value();
+  auto [result] = stdexec::sync_wait(std::move(snd)).value();

-  // std::cout << "result: " << result << std::endl;
+  std::cout << "result: " << result << std::endl;
 }
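
For context, here is a self-contained sketch of what the re-enabled example does end to end. The include paths and the stdexec::v2::on spelling are taken from this WIP commit and may have changed since; treat them as assumptions rather than the library's stable API.

// Sketch only: headers and the v2::on spelling are assumptions based on this
// WIP commit; the diff above is the authoritative version of the example.
#include <stdexec/execution.hpp>        // assumed: stdexec::just, sync_wait, v2::on
#include <nvexec/stream_context.cuh>    // assumed: nvexec::stream_context, nvexec::reduce

#include <thrust/device_vector.h>

#include <cstdio>
#include <span>

int main() {
  // 2K floats on the device, all 1.0f, reduced with an initial value of 42.0f.
  const int n = 2 * 1024;
  thrust::device_vector<float> input(n, 1.0f);
  float* first = thrust::raw_pointer_cast(input.data());
  float* last = first + input.size();

  nvexec::stream_context stream_ctx{};
  stdexec::scheduler auto stream_sched = stream_ctx.get_scheduler();

  // Run just(span) | reduce(42.0f) on the stream scheduler and wait for it.
  auto snd = stdexec::v2::on(
    stream_sched, stdexec::just(std::span{first, last}) | nvexec::reduce(42.0f));

  auto [result] = stdexec::sync_wait(std::move(snd)).value();
  std::printf("result: %f\n", result);   // expect 42 + 2048 = 2090
}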

include/exec/inline_scheduler.hpp
Lines changed: 1 addition & 48 deletions

@@ -22,52 +22,5 @@
 namespace exec {
   // A simple scheduler that executes its continuation inline, on the
   // thread of the caller of start().
-  struct inline_scheduler {
-    template <class R_>
-    struct __op {
-      using R = stdexec::__t<R_>;
-      STDEXEC_NO_UNIQUE_ADDRESS R rec_;
-
-      friend void tag_invoke(stdexec::start_t, __op& op) noexcept {
-        stdexec::set_value((R&&) op.rec_);
-      }
-    };
-
-    struct __sender {
-      using is_sender = void;
-      using completion_signatures = stdexec::completion_signatures<stdexec::set_value_t()>;
-
-      template <class R>
-      friend auto tag_invoke(stdexec::connect_t, __sender, R&& rec) //
-        noexcept(stdexec::__nothrow_constructible_from<stdexec::__decay_t<R>, R>)
-        -> __op<stdexec::__x<stdexec::__decay_t<R>>> {
-        return {(R&&) rec};
-      }
-
-      struct __env {
-        friend inline_scheduler
-          tag_invoke(stdexec::get_completion_scheduler_t<stdexec::set_value_t>, const __env&) //
-          noexcept {
-          return {};
-        }
-      };
-
-      friend __env tag_invoke(stdexec::get_env_t, const __sender&) noexcept {
-        return {};
-      }
-    };
-
-    STDEXEC_DETAIL_CUDACC_HOST_DEVICE //
-    friend __sender
-      tag_invoke(stdexec::schedule_t, const inline_scheduler&) noexcept {
-      return {};
-    }
-
-    friend stdexec::forward_progress_guarantee
-      tag_invoke(stdexec::get_forward_progress_guarantee_t, const inline_scheduler&) noexcept {
-      return stdexec::forward_progress_guarantee::weakly_parallel;
-    }
-
-    bool operator==(const inline_scheduler&) const noexcept = default;
-  };
+  using inline_scheduler = stdexec::__inln::__scheduler;
 }
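
exec::inline_scheduler keeps its name but is now just an alias for stdexec's internal inline scheduler, so existing call sites should compile unchanged. A minimal usage sketch (headers assumed to be the usual stdexec/exec ones):

// Minimal usage sketch; the alias target (stdexec::__inln::__scheduler) is an
// internal detail, callers keep writing exec::inline_scheduler as before.
#include <exec/inline_scheduler.hpp>
#include <stdexec/execution.hpp>

#include <cassert>

int main() {
  exec::inline_scheduler sched;

  // schedule() on the inline scheduler completes immediately on this thread,
  // so the whole pipeline runs inline inside sync_wait().
  auto snd = stdexec::schedule(sched) | stdexec::then([] { return 42; });
  auto [v] = stdexec::sync_wait(std::move(snd)).value();
  assert(v == 42);
}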

include/nvexec/multi_gpu_context.cuh
Lines changed: 3 additions & 2 deletions

@@ -66,9 +66,10 @@ namespace nvexec {
         }
       } else {
         if (op.status_ == cudaSuccess) {
-          continuation_kernel<<<1, 1, 0, op.stream_>>>(std::move(op.rec_), stdexec::set_value);
+          STDEXEC_STREAM_DETAIL_NS::continuation_kernel<<<1, 1, 0, op.stream_>>>(
+            std::move(op.rec_), stdexec::set_value);
         } else {
-          continuation_kernel<<<1, 1, 0, op.stream_>>>(
+          STDEXEC_STREAM_DETAIL_NS::continuation_kernel<<<1, 1, 0, op.stream_>>>(
             std::move(op.rec_), stdexec::set_error, std::move(op.status_));
         }
       }

include/nvexec/stream/algorithm_base.cuh
Lines changed: 3 additions & 3 deletions

@@ -31,10 +31,10 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS::__algo_range_init_fun {
     using binary_invoke_result_t = ::cuda::std::decay_t<
       ::cuda::std::invoke_result_t<Fun, stdexec::ranges::range_reference_t<Range>, InitT>>;

-  template <class SenderId, class ReceiverId, class InitT, class Fun, class DerivedReceiver>
+  template <class CvrefSenderId, class ReceiverId, class InitT, class Fun, class DerivedReceiver>
   struct receiver_t {
     struct __t : public stream_receiver_base {
-      using Sender = stdexec::__t<SenderId>;
+      using CvrefSender = stdexec::__cvref_t<CvrefSenderId>;
       using Receiver = stdexec::__t<ReceiverId>;

       template <class... Range>

@@ -54,7 +54,7 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS::__algo_range_init_fun {
       static constexpr ::std::size_t value = //
         __v< __gather_completions_for<
           set_value_t,
-          Sender,
+          CvrefSender,
           env_of_t<Receiver>,
           __q<result_size_for_t>,
           __q<max_in_pack>>>;
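
The rename from SenderId to CvrefSenderId (and from __t to __cvref_t) means the receiver now remembers the sender together with its cv- and reference-qualification, since completion signatures can differ for S&, const S&, and S&&. A toy, stdexec-free illustration of the kind of cvref propagation involved:

// Toy illustration only: a "copy the cv/ref qualifiers of From onto To" trait,
// which is roughly what a __cvref_t-style utility provides.
#include <type_traits>

template <class From, class To>
struct copy_cvref {
  using U = std::remove_reference_t<From>;
  using C = std::conditional_t<std::is_const_v<U>, const To, To>;
  using type = std::conditional_t<
    std::is_lvalue_reference_v<From>, C&,
    std::conditional_t<std::is_rvalue_reference_v<From>, C&&, C>>;
};

template <class From, class To>
using copy_cvref_t = typename copy_cvref<From, To>::type;

static_assert(std::is_same_v<copy_cvref_t<const int&, float>, const float&>);
static_assert(std::is_same_v<copy_cvref_t<int&&, float>, float&&>);
static_assert(std::is_same_v<copy_cvref_t<int, float>, float>);

int main() {}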

include/nvexec/stream/bulk.cuh
Lines changed: 15 additions & 3 deletions

@@ -58,7 +58,7 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS {
         constexpr int block_threads = 256;
         const int grid_blocks = (static_cast<int>(self.shape_) + block_threads - 1)
                               / block_threads;
-        kernel<block_threads, As&...>
+        _bulk::kernel<block_threads, As&...>
           <<<grid_blocks, block_threads, 0, stream>>>(self.shape_, std::move(self.f_), as...);
       }

@@ -203,7 +203,7 @@
           if (begin < end) {
             cudaSetDevice(dev);
             cudaStreamWaitEvent(stream, op_state.ready_to_launch_, 0);
-            kernel<block_threads, As&...>
+            multi_gpu_bulk::kernel<block_threads, As&...>
               <<<grid_blocks, block_threads, 0, stream>>>(begin, end, self.f_, as...);
             cudaEventRecord(op_state.ready_to_complete_[dev], op_state.streams_[dev]);
           }

@@ -218,7 +218,7 @@
           const int grid_blocks = (shape + block_threads - 1) / block_threads;

           if (begin < end) {
-            kernel<block_threads, As&...>
+            multi_gpu_bulk::kernel<block_threads, As&...>
               <<<grid_blocks, block_threads, 0, baseline_stream>>>(begin, end, self.f_, as...);
           }
         }

@@ -371,3 +371,15 @@
     };
   };
 }
+
+namespace stdexec::__detail {
+  template <class SenderId, class Shape, class Fun>
+  inline constexpr __mconst<
+    nvexec::STDEXEC_STREAM_DETAIL_NS::bulk_sender_t<__name_of<__t<SenderId>>, Shape, Fun>>
+    __name_of_v<nvexec::STDEXEC_STREAM_DETAIL_NS::bulk_sender_t<SenderId, Shape, Fun>>{};
+
+  template <class SenderId, class Shape, class Fun>
+  inline constexpr __mconst<
+    nvexec::STDEXEC_STREAM_DETAIL_NS::multi_gpu_bulk_sender_t<__name_of<__t<SenderId>>, Shape, Fun>>
+    __name_of_v<nvexec::STDEXEC_STREAM_DETAIL_NS::multi_gpu_bulk_sender_t<SenderId, Shape, Fun>>{};
+}
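
The new stdexec::__detail block appears to specialize stdexec's internal __name_of_v so that diagnostics involving a (multi-GPU) bulk sender print the "name" of the sender it wraps rather than its raw type. A toy version of the pattern, using a class template instead of stdexec's variable-template machinery:

// Toy illustration only (not stdexec's machinery): a wrapper type opts into a
// "name" trait by mapping its wrapped type through the same trait.
#include <type_traits>

template <class T>
struct name_of { using type = T; };             // default: a type names itself

template <class T>
using name_of_t = typename name_of<T>::type;

template <class Sender, class Shape, class Fun>
struct bulk_sender {};                          // toy stand-in for bulk_sender_t

// The "name" of bulk_sender<S, ...> is bulk_sender<name_of_t<S>, ...>.
template <class Sender, class Shape, class Fun>
struct name_of<bulk_sender<Sender, Shape, Fun>> {
  using type = bulk_sender<name_of_t<Sender>, Shape, Fun>;
};

static_assert(
  std::is_same_v<name_of_t<bulk_sender<int, long, void>>, bulk_sender<int, long, void>>);

int main() {}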

include/nvexec/stream/common.cuh
Lines changed: 40 additions & 12 deletions

@@ -165,6 +165,8 @@ namespace nvexec {
     }
   };

+  struct stream_scheduler;
+
   struct context_state_t {
     std::pmr::memory_resource* pinned_resource_{nullptr};
     std::pmr::memory_resource* managed_resource_{nullptr};

@@ -195,9 +197,9 @@
     void return_stream(cudaStream_t stream) {
       stream_pools_->return_stream(stream, priority_);
     }
-  };

-  struct stream_scheduler;
+    stream_scheduler make_stream_scheduler() const noexcept;
+  };

   struct stream_sender_base {
     using is_sender = void;

@@ -265,6 +267,10 @@
     stream_provider_t* operator()(const Env& env) const noexcept {
      return tag_invoke(get_stream_provider_t{}, env);
     }
+
+    friend constexpr bool tag_invoke(forwarding_query_t, const get_stream_provider_t&) noexcept {
+      return true;
+    }
   };

   template <class... Ts>

@@ -308,7 +314,10 @@
   using variant_storage_t = //
     __minvoke< __minvoke<
       __mfold_right<
-        __mbind_front_q<stream_storage_impl::variant, ::cuda::std::tuple<set_noop>>,
+        __mbind_front_q<
+          stream_storage_impl::variant,
+          ::cuda::std::tuple<set_noop>,
+          ::cuda::std::tuple<set_error_t, cudaError_t>>,
         __mbind_front_q<stream_storage_impl::__bind_completions_t, _Sender, _Env>>,
       set_value_t,
       set_error_t,

@@ -330,7 +339,21 @@

   template <class BaseEnv>
   auto make_stream_env(BaseEnv&& base_env, stream_provider_t* stream_provider) noexcept {
-    return __join_env(__mkprop(get_stream_provider, stream_provider), (BaseEnv&&) base_env);
+    return __join_env(
+      __env::__env_fn{
+        [stream_provider]<__one_of<get_stream_provider_t, get_scheduler_t, get_domain_t> Tag>(
+          Tag) noexcept {
+          __mfront<stream_provider_t, Tag>* str_provider = stream_provider;
+          if constexpr (same_as<Tag, get_stream_provider_t>) {
+            return str_provider;
+          } else if constexpr (same_as<Tag, get_scheduler_t>) {
+            return str_provider->context_.make_stream_scheduler();
+          } else {
+            return get_domain(str_provider->context_.make_stream_scheduler());
+          }
+          STDEXEC_UNREACHABLE();
+        }},
+      (BaseEnv&&) base_env);
   }

   template <class BaseEnv>
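
The rewritten make_stream_env replaces the single get_stream_provider property with an environment whose templated lambda answers three queries (get_stream_provider, get_scheduler, get_domain) by branching on the query tag with if constexpr. A stdexec-free toy of that dispatch shape (toy tag types, an int standing in for the provider):

// Toy illustration only: one constrained, templated lambda serving several
// query tags, each branch returning a different type.
#include <concepts>
#include <cstdio>

struct get_stream_provider_t {};
struct get_scheduler_t {};
struct get_domain_t {};

template <class T, class... Ts>
concept one_of = (std::same_as<T, Ts> || ...);

int main() {
  int provider = 42;  // stands in for the stream_provider_t*

  auto env_fn = [&]<one_of<get_stream_provider_t, get_scheduler_t, get_domain_t> Tag>(Tag) {
    if constexpr (std::same_as<Tag, get_stream_provider_t>) {
      return &provider;              // the provider itself
    } else if constexpr (std::same_as<Tag, get_scheduler_t>) {
      return provider;               // stands in for make_stream_scheduler()
    } else {
      return 0.0;                    // stands in for get_domain(...)
    }
  };

  std::printf("%d %d %g\n",
              *env_fn(get_stream_provider_t{}),
              env_fn(get_scheduler_t{}),
              env_fn(get_domain_t{}));
}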
@@ -370,6 +393,10 @@
       stream_sender_base,
       __decay_t<transform_sender_result_t<__env_domain_of_t<E>, S, E>>);

+  struct stream_scheduler;
+  template <class = stream_scheduler>
+  struct stream_domain;
+
   template <class R>
   concept stream_receiver = //
     receiver<R> && //

@@ -427,8 +454,8 @@
     };
   };

-  template <class Receiver, class... As, class Tag>
-  __launch_bounds__(1) __global__ void continuation_kernel(Receiver rcvr, Tag, As... as) {
+  template <class Receiver, class Tag, class... As>
+  __launch_bounds__(1) __global__ void continuation_kernel(Receiver rcvr, As... as) {
     static_assert(trivially_copyable<Receiver, Tag, As...>);
     Tag()(::cuda::std::move(rcvr), static_cast<As&&>(as)...);
   }

@@ -552,7 +579,7 @@
       if (cudaCpuDeviceId == device_id) {
         ptr->~T();
       } else {
-        destructor_kernel<<<1, 1, 0, stream>>>(ptr);
+        STDEXEC_STREAM_DETAIL_NS::destructor_kernel<<<1, 1, 0, stream>>>(ptr);

         // TODO Bury all the memory associated with the stream provider and then
         // deallocate the memory

@@ -573,9 +600,9 @@
       if constexpr (stream_receiver<outer_receiver_t>) {
         set_error((outer_receiver_t&&) rcvr_, (cudaError_t&&) status);
       } else {
-        // pass a cudaError_t by value:
-        continuation_kernel<outer_receiver_t, Error>
-          <<<1, 1, 0, get_stream()>>>((outer_receiver_t&&) rcvr_, set_error_t(), status);
+        STDEXEC_STREAM_DETAIL_NS::
+          continuation_kernel<outer_receiver_t, set_error_t, cudaError_t> // by value
+          <<<1, 1, 0, get_stream()>>>((outer_receiver_t&&) rcvr_, status);
       }
     }

@@ -584,8 +611,9 @@
       if constexpr (stream_receiver<outer_receiver_t>) {
         Tag()((outer_receiver_t&&) rcvr_, (As&&) as...);
       } else {
-        continuation_kernel<outer_receiver_t, As&&...> // by reference
-          <<<1, 1, 0, get_stream()>>>((outer_receiver_t&&) rcvr_, Tag(), (As&&) as...);
+        STDEXEC_STREAM_DETAIL_NS::
+          continuation_kernel<outer_receiver_t, Tag, As&&...> // by reference
+          <<<1, 1, 0, get_stream()>>>((outer_receiver_t&&) rcvr_, (As&&) as...);
       }
     }
   };
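
The continuation_kernel change moves the completion tag from a trailing by-value kernel argument to an explicit template parameter; since the tag types are empty, the device can simply default-construct Tag instead of having it shipped across the launch. A small self-contained CUDA sketch of the new shape (toy receiver and tag, not nvexec's):

// Toy CUDA sketch of the new <Receiver, Tag, As...> continuation_kernel shape:
// the tag is a template parameter and is default-constructed on the device.
#include <cstdio>

struct set_value_t {
  template <class Rcvr, class... As>
  __device__ void operator()(Rcvr rcvr, As... as) const {
    rcvr.set_value(as...);
  }
};

struct printing_receiver {
  __device__ void set_value(int i) {
    printf("set_value(%d)\n", i);
  }
};

template <class Receiver, class Tag, class... As>
__global__ void continuation_kernel(Receiver rcvr, As... as) {
  Tag()(rcvr, as...);   // empty tag: no need to pass it as a kernel argument
}

int main() {
  // Tag is spelled explicitly at the launch site; As is deduced from the args.
  continuation_kernel<printing_receiver, set_value_t><<<1, 1>>>(printing_receiver{}, 42);
  cudaDeviceSynchronize();
}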

include/nvexec/stream/ensure_started.cuh
Lines changed: 2 additions & 1 deletion

@@ -66,7 +66,8 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS {
         cudaStream_t stream = state.stream_provider_.own_stream_.value();
         using tuple_t = decayed_tuple<Tag, As...>;
         state.index_ = SharedState::variant_t::template index_of<tuple_t>::value;
-        copy_kernel<Tag, As&&...><<<1, 1, 0, stream>>>(state.data_, (As&&) as...);
+        _ensure_started::copy_kernel<Tag, As&&...>
+          <<<1, 1, 0, stream>>>(state.data_, (As&&) as...);
         state.stream_provider_.status_ = STDEXEC_DBG_ERR(cudaEventRecord(state.event_, stream));
       } else {
         using tuple_t = decayed_tuple<Tag, As...>;
