Skip to content

Commit f2a377d

Browse files
committed
Implement RustToCuda for Arc<[T]>
1 parent 25735d0 commit f2a377d

File tree

4 files changed

+237
-7
lines changed

4 files changed

+237
-7
lines changed

src/lend/impls/arc.rs

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -62,14 +62,12 @@ unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCuda for Arc<T> {
6262
DeviceAccessible<Self::CudaRepresentation>,
6363
CombinedCudaAlloc<Self::CudaAllocation, A>,
6464
)> {
65-
let inner = ManuallyDrop::new(_ArcInner {
66-
strong: AtomicUsize::new(1),
67-
weak: AtomicUsize::new(1),
68-
data: std::ptr::read(&**self),
69-
});
65+
let data_ptr: *const T = std::ptr::from_ref(&**self);
66+
let offset = std::mem::offset_of!(_ArcInner<T>, data);
67+
let arc_ptr: *const _ArcInner<T> = data_ptr.byte_sub(offset).cast();
7068

7169
let mut device_box = CudaDropWrapper::from(DeviceBox::new(
72-
DeviceCopyWithPortableBitSemantics::from_ref(&*inner),
70+
DeviceCopyWithPortableBitSemantics::from_ref(&*arc_ptr),
7371
)?);
7472

7573
Ok((

src/lend/impls/arced_slice.rs

Lines changed: 231 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,231 @@
1+
use core::sync::atomic::AtomicUsize;
2+
#[cfg(feature = "host")]
3+
use std::mem::{ManuallyDrop, MaybeUninit};
4+
5+
use const_type_layout::{TypeGraphLayout, TypeLayout};
6+
7+
#[cfg(feature = "host")]
8+
use rustacuda::{
9+
error::CudaResult,
10+
memory::LockedBuffer,
11+
memory::{DeviceBox, DeviceBuffer},
12+
};
13+
use rustacuda_core::DeviceCopy;
14+
15+
use crate::{
16+
deps::alloc::sync::Arc,
17+
lend::{CudaAsRust, RustToCuda, RustToCudaAsync},
18+
safety::PortableBitSemantics,
19+
utils::ffi::DeviceOwnedPointer,
20+
};
21+
22+
#[cfg(any(feature = "host", feature = "device"))]
23+
use crate::utils::ffi::DeviceAccessible;
24+
25+
#[cfg(feature = "host")]
26+
use crate::{
27+
alloc::{CombinedCudaAlloc, CudaAlloc},
28+
host::CudaDropWrapper,
29+
utils::adapter::DeviceCopyWithPortableBitSemantics,
30+
utils::r#async::Async,
31+
utils::r#async::CompletionFnMut,
32+
utils::r#async::NoCompletion,
33+
};
34+
35+
/// Device-side representation of an [`Arc<[T]>`]: a pointer to the device
/// copy of the combined reference-count header + slice allocation, plus the
/// slice length (the fat-pointer metadata cannot cross the FFI boundary and
/// is therefore carried separately).
#[doc(hidden)]
#[allow(clippy::module_name_repetitions)]
#[derive(TypeLayout)]
#[repr(C)]
pub struct ArcedSliceCudaRepresentation<T: PortableBitSemantics + TypeGraphLayout> {
    // Points at the device-side `_ArcInner`; typed over a single `T` (not
    // `[T]`) because the unsized inner type cannot be named here — the
    // `data` field of the pointee is the first element of the slice.
    data: DeviceOwnedPointer<_ArcInner<T>>,
    // Number of `T` elements in the slice.
    len: usize,
}
43+
44+
// Mirror of the standard library's private `alloc::sync::ArcInner` layout:
// the strong and weak reference counters followed by the payload.
// must be kept in sync (hehe) with the standard library's internal
// definition — the pointer arithmetic in this file (recovering the
// allocation start from a data pointer via `offset_of!`) relies on it.
#[doc(hidden)]
#[derive(TypeLayout)]
#[repr(C)]
pub struct _ArcInner<T: ?Sized> {
    strong: AtomicUsize,
    weak: AtomicUsize,
    data: T,
}
53+
54+
// Just the reference-count header of an `_ArcInner`, without the payload;
// used to copy the counters to the device independently of the (unsized)
// slice data.
#[repr(C)]
struct _ArcInnerHeader {
    strong: AtomicUsize,
    weak: AtomicUsize,
}

// SAFETY: the header consists of two atomic machine words and contains no
// pointers or non-trivial drop glue, so a bitwise copy to the device is valid.
unsafe impl DeviceCopy for _ArcInnerHeader {}
61+
62+
unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCuda for Arc<[T]> {
    // On the host, the allocation is the single device buffer holding the
    // `_ArcInner` header bytes followed by the slice elements.
    #[cfg(all(feature = "host", not(doc)))]
    type CudaAllocation = CudaDropWrapper<DeviceBuffer<DeviceCopyWithPortableBitSemantics<T>>>;
    #[cfg(any(not(feature = "host"), doc))]
    type CudaAllocation = crate::alloc::SomeCudaAlloc;
    type CudaRepresentation = ArcedSliceCudaRepresentation<T>;

    /// Lends the `Arc<[T]>` to the device by recreating the whole `ArcInner`
    /// allocation (reference-count header + elements) inside one
    /// `DeviceBuffer` of `T`-sized slots.
    #[cfg(feature = "host")]
    #[allow(clippy::type_complexity)]
    unsafe fn borrow<A: CudaAlloc>(
        &self,
        alloc: A,
    ) -> CudaResult<(
        DeviceAccessible<Self::CudaRepresentation>,
        CombinedCudaAlloc<Self::CudaAllocation, A>,
    )> {
        use rustacuda::memory::{CopyDestination, DeviceSlice};
        use rustacuda_core::DevicePointer;

        // Recover the address of the host `_ArcInner` from the slice data
        // pointer: the payload starts `offset` bytes after the allocation.
        let data_ptr: *const T = std::ptr::from_ref(&**self).as_ptr();
        // `offset_of!` needs a sized type; the array length (42) is
        // arbitrary, since the offset of `data` depends only on the two
        // counter fields that precede it.
        let offset = std::mem::offset_of!(_ArcInner<[T; 42]>, data);
        let arc_ptr: *const _ArcInner<[T; 42]> = data_ptr.byte_sub(offset).cast();

        // Number of leading `T`-sized slots reserved for the header:
        // ceil(offset / align_of::<T>()). Since size_of >= align_of for
        // non-zero-sized types, these slots span at least `offset` bytes.
        // NOTE(review): for a zero-sized `T` the header region would be
        // 0 bytes and the `header.len() - offset` split below would
        // underflow — confirm ZST element types are excluded by the bounds.
        let header_len = (offset + (std::mem::align_of::<T>() - 1)) / std::mem::align_of::<T>();

        // One device allocation for header + all elements.
        let mut device_buffer = CudaDropWrapper::from(DeviceBuffer::<
            DeviceCopyWithPortableBitSemantics<T>,
        >::uninitialized(
            header_len + self.len()
        )?);
        // Front: header slots; back: the slice elements proper.
        let (header, buffer): (&mut DeviceSlice<_>, &mut DeviceSlice<_>) =
            device_buffer.split_at_mut(header_len);
        buffer.copy_from(std::slice::from_raw_parts(self.as_ptr().cast(), self.len()))?;
        // Reinterpret the header slots as raw bytes ...
        let header = DeviceSlice::from_raw_parts_mut(
            DevicePointer::wrap(header.as_mut_ptr().cast::<u8>()),
            header.len() * std::mem::size_of::<T>(),
        );
        // ... keep only the *last* `offset` bytes, so that an `_ArcInner`
        // placed there has its `data` field coincide exactly with the first
        // element slot of `buffer` ...
        let (_, header) = header.split_at_mut(header.len() - offset);
        // ... and only the leading strong/weak counters of that header.
        let (header, _) = header.split_at_mut(std::mem::size_of::<_ArcInnerHeader>());
        // View the header bytes as a `DeviceBox`; wrapped in `ManuallyDrop`
        // because this memory is owned by `device_buffer` and must not be
        // freed a second time. The `byte` start of the header is only
        // `T`-aligned, hence the allowed `cast_ptr_alignment` lint.
        #[allow(clippy::cast_ptr_alignment)]
        let mut header: ManuallyDrop<DeviceBox<_ArcInnerHeader>> = ManuallyDrop::new(
            DeviceBox::from_raw(header.as_mut_ptr().cast::<_ArcInnerHeader>()),
        );
        // Copy the host `Arc`'s current strong/weak counts to the device.
        header.copy_from(&*arc_ptr.cast::<_ArcInnerHeader>())?;

        Ok((
            DeviceAccessible::from(ArcedSliceCudaRepresentation {
                // Device address of the embedded `_ArcInner` header.
                data: DeviceOwnedPointer(header.as_device_ptr().as_raw_mut().cast()),
                len: self.len(),
            }),
            CombinedCudaAlloc::new(device_buffer, alloc),
        ))
    }

    /// The borrow is read-only: nothing is copied back to the host; the
    /// device buffer is freed when the front allocation is dropped.
    #[cfg(feature = "host")]
    unsafe fn restore<A: CudaAlloc>(
        &mut self,
        alloc: CombinedCudaAlloc<Self::CudaAllocation, A>,
    ) -> CudaResult<A> {
        let (_alloc_front, alloc_tail) = alloc.split();
        Ok(alloc_tail)
    }
}
125+
126+
unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for Arc<[T]> {
    // Async lending needs two allocations: a page-locked (pinned) host
    // staging buffer, so the host-to-device copy can proceed
    // asynchronously on the stream, plus the device buffer itself.
    // `ManuallyDrop<T>` marks the elements as bitwise copies that the
    // buffers must never drop — the host `Arc` retains ownership.
    #[cfg(all(feature = "host", not(doc)))]
    type CudaAllocationAsync = CombinedCudaAlloc<
        CudaDropWrapper<LockedBuffer<DeviceCopyWithPortableBitSemantics<ManuallyDrop<T>>>>,
        CudaDropWrapper<DeviceBuffer<DeviceCopyWithPortableBitSemantics<ManuallyDrop<T>>>>,
    >;
    #[cfg(any(not(feature = "host"), doc))]
    type CudaAllocationAsync = crate::alloc::SomeCudaAlloc;

    /// Asynchronously lends the `Arc<[T]>` to the device: stages the
    /// `ArcInner` header + elements in a pinned host buffer, then issues a
    /// single async copy into a matching device buffer on `stream`.
    #[cfg(feature = "host")]
    unsafe fn borrow_async<'stream, A: CudaAlloc>(
        &self,
        alloc: A,
        stream: crate::host::Stream<'stream>,
    ) -> rustacuda::error::CudaResult<(
        Async<'_, 'stream, DeviceAccessible<Self::CudaRepresentation>>,
        CombinedCudaAlloc<Self::CudaAllocationAsync, A>,
    )> {
        use rustacuda::memory::AsyncCopyDestination;

        // Same header/offset arithmetic as the synchronous `borrow` above:
        // recover the host `_ArcInner` address from the slice data pointer.
        let data_ptr: *const T = std::ptr::from_ref(&**self).as_ptr();
        // `offset_of!` needs a sized type; the array length (42) is
        // arbitrary, since the offset of `data` depends only on the header.
        let offset = std::mem::offset_of!(_ArcInner<[T; 42]>, data);
        let arc_ptr: *const _ArcInner<[T; 42]> = data_ptr.byte_sub(offset).cast();

        // `T`-sized slots needed to cover the `offset` header bytes
        // (see the NOTE(review) about zero-sized `T` in `borrow`).
        let header_len = (offset + (std::mem::align_of::<T>() - 1)) / std::mem::align_of::<T>();

        // Stage header + elements contiguously in pinned host memory.
        let locked_buffer = unsafe {
            let mut locked_buffer =
                CudaDropWrapper::from(LockedBuffer::<
                    DeviceCopyWithPortableBitSemantics<ManuallyDrop<T>>,
                >::uninitialized(header_len + self.len())?);
            // View the uninitialized staging memory element-wise.
            let locked_buffer_slice: &mut [MaybeUninit<T>] = std::slice::from_raw_parts_mut(
                locked_buffer.as_mut_slice().as_mut_ptr().cast(),
                locked_buffer.as_slice().len(),
            );
            // Front: header slots; back: the slice elements proper.
            let (header, buffer) = locked_buffer_slice.split_at_mut(header_len);
            std::ptr::copy_nonoverlapping(self.as_ptr().cast(), buffer.as_mut_ptr(), self.len());
            // Reinterpret the header slots as raw bytes, keep the *last*
            // `offset` bytes so the staged `_ArcInner`'s `data` field lands
            // exactly on the first element, then write the strong/weak
            // counters copied from the host `Arc` into its front.
            let header = std::slice::from_raw_parts_mut(
                header.as_mut_ptr().cast::<MaybeUninit<u8>>(),
                header.len() * std::mem::size_of::<T>(),
            );
            let (_, header) = header.split_at_mut(header.len() - offset);
            let (header, _) = header.split_at_mut(std::mem::size_of::<_ArcInnerHeader>());
            let header: *mut MaybeUninit<_ArcInnerHeader> = header.as_mut_ptr().cast();
            std::ptr::copy_nonoverlapping(
                &*arc_ptr.cast::<MaybeUninit<_ArcInnerHeader>>(),
                header,
                1,
            );

            locked_buffer
        };

        // Mirror device buffer; the copy is enqueued on `stream` and may
        // still be in flight when this function returns — both buffers are
        // kept alive inside the returned allocation until `restore_async`.
        let mut device_buffer =
            CudaDropWrapper::from(DeviceBuffer::<
                DeviceCopyWithPortableBitSemantics<ManuallyDrop<T>>,
            >::uninitialized(locked_buffer.len())?);
        device_buffer.async_copy_from(&*locked_buffer, &stream)?;

        Ok((
            Async::pending(
                DeviceAccessible::from(ArcedSliceCudaRepresentation {
                    data: DeviceOwnedPointer(
                        // Device address of the staged `_ArcInner`: the
                        // header region spans `header_len * size_of::<T>()`
                        // bytes, of which the trailing `offset` bytes are
                        // the header itself.
                        device_buffer
                            .as_device_ptr()
                            .as_raw_mut()
                            .byte_add(header_len * std::mem::size_of::<T>() - offset)
                            .cast(),
                    ),
                    len: self.len(),
                }),
                stream,
                NoCompletion,
            )?,
            CombinedCudaAlloc::new(CombinedCudaAlloc::new(locked_buffer, device_buffer), alloc),
        ))
    }

    /// The borrow is read-only: nothing is copied back, so the restore
    /// completes immediately (`Async::ready`) and only releases the
    /// staging and device buffers.
    #[cfg(feature = "host")]
    unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>(
        this: owning_ref::BoxRefMut<'a, O, Self>,
        alloc: CombinedCudaAlloc<Self::CudaAllocationAsync, A>,
        stream: crate::host::Stream<'stream>,
    ) -> CudaResult<(
        Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>,
        A,
    )> {
        let (_alloc_front, alloc_tail) = alloc.split();
        let r#async = Async::ready(this, stream);
        Ok((r#async, alloc_tail))
    }
}
218+
219+
unsafe impl<T: PortableBitSemantics + TypeGraphLayout> CudaAsRust
    for ArcedSliceCudaRepresentation<T>
{
    type RustRepresentation = Arc<[T]>;

    /// Reconstitutes the `Arc<[T]>` on the device: builds a fat `*const [T]`
    /// from the address of the embedded `_ArcInner`'s `data` field and the
    /// stored `len`, then hands it to `Arc::from_raw`. This is sound only
    /// because the host-side borrow laid out a `_ArcInner`-compatible
    /// header (with the copied strong/weak counts) directly before the
    /// element data in the device allocation.
    #[cfg(feature = "device")]
    unsafe fn as_rust(this: &DeviceAccessible<Self>) -> Self::RustRepresentation {
        crate::deps::alloc::sync::Arc::from_raw(core::ptr::slice_from_raw_parts(
            core::ptr::addr_of!((*(this.data.0)).data),
            this.len,
        ))
    }
}

src/lend/impls/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
mod arc;
2+
mod arced_slice;
23
mod r#box;
34
mod boxed_slice;
45
#[cfg(feature = "final")]

src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@
5151
#![feature(never_type)]
5252
#![feature(layout_for_ptr)]
5353
#![feature(cfg_version)]
54-
#![cfg_attr(feature = "device", feature(slice_ptr_get))]
54+
#![cfg_attr(any(feature = "host", feature = "device"), feature(slice_ptr_get))]
5555
#![allow(incomplete_features)]
5656
#![feature(generic_const_exprs)]
5757
#![allow(internal_features)]

0 commit comments

Comments
 (0)