Skip to content

Commit a2f5635

Browse files
authored
Merge branch 'main' into readwrite
2 parents 580d3f5 + bd23a2b commit a2f5635

21 files changed

+697
-278
lines changed

cuda_core/cuda/core/experimental/_device.pyx

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ from cuda.core.experimental._utils.cuda_utils import (
2828
from cuda.core.experimental._stream cimport default_stream
2929

3030

31+
3132
# TODO: I prefer to type these as "cdef object" and avoid accessing them from within Python,
3233
# but it seems it is very convenient to expose them for testing purposes...
3334
_tls = threading.local()
@@ -1273,7 +1274,7 @@ class Device:
12731274
"""
12741275
self._check_context_initialized()
12751276
ctx = self._get_current_context()
1276-
return Event._init(self._id, ctx, options)
1277+
return Event._init(self._id, ctx, options, True)
12771278

12781279
def allocate(self, size, stream: Optional[Stream] = None) -> Buffer:
12791280
"""Allocate device memory from a specified stream.

cuda_core/cuda/core/experimental/_event.pxd

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ cdef class Event:
1111
cydriver.CUevent _handle
1212
bint _timing_disabled
1313
bint _busy_waited
14+
bint _ipc_enabled
15+
object _ipc_descriptor
1416
int _device_id
1517
object _ctx_handle
1618

cuda_core/cuda/core/experimental/_event.pyx

Lines changed: 89 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@
44

55
from __future__ import annotations
66

7+
cimport cpython
78
from libc.stdint cimport uintptr_t
9+
from libc.string cimport memcpy
810

911
from cuda.bindings cimport cydriver
1012

@@ -14,6 +16,7 @@ from cuda.core.experimental._utils.cuda_utils cimport (
1416
)
1517

1618
from dataclasses import dataclass
19+
import multiprocessing
1720
from typing import TYPE_CHECKING, Optional
1821

1922
from cuda.core.experimental._context import Context
@@ -40,15 +43,15 @@ cdef class EventOptions:
4043
has actually been completed.
4144
Otherwise, the CPU thread will busy-wait until the event has
4245
been completed. (Default to False)
43-
support_ipc : bool, optional
46+
ipc_enabled : bool, optional
4447
Event will be suitable for interprocess use.
4548
Note that enable_timing must be False. (Default to False)
4649
4750
"""
4851

4952
enable_timing: Optional[bool] = False
5053
busy_waited_sync: Optional[bool] = False
51-
support_ipc: Optional[bool] = False
54+
ipc_enabled: Optional[bool] = False
5255

5356

5457
cdef class Event:
@@ -86,24 +89,35 @@ cdef class Event:
8689
raise RuntimeError("Event objects cannot be instantiated directly. Please use Stream APIs (record).")
8790

8891
@classmethod
89-
def _init(cls, device_id: int, ctx_handle: Context, options=None):
92+
def _init(cls, device_id: int, ctx_handle: Context, options=None, is_free=False):
9093
cdef Event self = Event.__new__(cls)
9194
cdef EventOptions opts = check_or_create_options(EventOptions, options, "Event options")
9295
cdef unsigned int flags = 0x0
9396
self._timing_disabled = False
9497
self._busy_waited = False
98+
self._ipc_enabled = False
99+
self._ipc_descriptor = None
95100
if not opts.enable_timing:
96101
flags |= cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING
97102
self._timing_disabled = True
98103
if opts.busy_waited_sync:
99104
flags |= cydriver.CUevent_flags.CU_EVENT_BLOCKING_SYNC
100105
self._busy_waited = True
101-
if opts.support_ipc:
102-
raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/103")
106+
if opts.ipc_enabled:
107+
if is_free:
108+
raise TypeError(
109+
"IPC-enabled events must be bound; use Stream.record for creation."
110+
)
111+
flags |= cydriver.CUevent_flags.CU_EVENT_INTERPROCESS
112+
self._ipc_enabled = True
113+
if not self._timing_disabled:
114+
raise TypeError("IPC-enabled events cannot use timing.")
103115
with nogil:
104116
HANDLE_RETURN(cydriver.cuEventCreate(&self._handle, flags))
105117
self._device_id = device_id
106118
self._ctx_handle = ctx_handle
119+
if opts.ipc_enabled:
120+
self.get_ipc_descriptor()
107121
return self
108122

109123
cpdef close(self):
@@ -151,6 +165,40 @@ cdef class Event:
151165
raise CUDAError(err)
152166
raise RuntimeError(explanation)
153167

168+
def get_ipc_descriptor(self) -> IPCEventDescriptor:
    """Return the descriptor used to share this event with another process.

    The descriptor is created on the first call and cached on the event, so
    later calls return the same object.

    Raises
    ------
    RuntimeError
        If no descriptor is cached and the event is not IPC-enabled.
    """
    cdef cydriver.CUipcEventHandle ipc_handle
    cdef bytes payload
    # A cached descriptor is returned before anything else is checked.
    if self._ipc_descriptor is not None:
        return self._ipc_descriptor
    if not self.is_ipc_enabled:
        raise RuntimeError("Event is not IPC-enabled")
    with nogil:
        HANDLE_RETURN(cydriver.cuIpcGetEventHandle(&ipc_handle, <cydriver.CUevent>(self._handle)))
    # Copy the opaque driver handle into a Python bytes object owned by the descriptor.
    payload = cpython.PyBytes_FromStringAndSize(<char*>(ipc_handle.reserved), sizeof(ipc_handle.reserved))
    self._ipc_descriptor = IPCEventDescriptor._init(payload, self._busy_waited)
    return self._ipc_descriptor
180+
181+
@classmethod
def from_ipc_descriptor(cls, ipc_descriptor: IPCEventDescriptor) -> Event:
    """Import an event that was exported from another process.

    Parameters
    ----------
    ipc_descriptor : IPCEventDescriptor
        Descriptor obtained from ``Event.get_ipc_descriptor`` in the
        exporting process.

    Returns
    -------
    Event
        An IPC-enabled event with timing disabled. Its device id is set to
        ``-1`` and its context handle to ``None`` because neither is carried
        by the descriptor.

    Raises
    ------
    ValueError
        If the descriptor payload does not have the size of a driver IPC
        event handle.
    """
    cdef cydriver.CUipcEventHandle data
    cdef bytes reserved = ipc_descriptor._reserved
    cdef Event self
    # Descriptors can be rebuilt from arbitrary bytes (e.g. when unpickled),
    # so check the payload size before copying a fixed number of bytes from it.
    if reserved is None or <size_t>len(reserved) != sizeof(data.reserved):
        raise ValueError("IPC event descriptor payload has an unexpected size")
    memcpy(data.reserved, <const void*><const char*>(reserved), sizeof(data.reserved))
    self = Event.__new__(cls)
    with nogil:
        HANDLE_RETURN(cydriver.cuIpcOpenEventHandle(&self._handle, data))
    self._timing_disabled = True
    self._busy_waited = ipc_descriptor._busy_waited
    self._ipc_enabled = True
    self._ipc_descriptor = ipc_descriptor
    self._device_id = -1  # ??
    self._ctx_handle = None  # ??
    return self
196+
197+
@property
def is_ipc_enabled(self) -> bool:
    """Return True if the event can be shared across process boundaries, otherwise False.

    The flag is set when the event is created with IPC enabled or imported
    from an IPC descriptor.
    """
    return self._ipc_enabled
201+
154202
@property
155203
def is_timing_disabled(self) -> bool:
156204
"""Return True if the event does not record timing data, otherwise False."""
@@ -161,11 +209,6 @@ cdef class Event:
161209
"""Return True if the event synchronization would keep the CPU busy-waiting, otherwise False."""
162210
return self._busy_waited
163211

164-
@property
165-
def is_ipc_supported(self) -> bool:
166-
"""Return True if this event can be used as an interprocess event, otherwise False."""
167-
raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/103")
168-
169212
def sync(self):
170213
"""Synchronize until the event completes.
171214
@@ -212,12 +255,43 @@ cdef class Event:
212255
context is set current after an event is created.
213256

214257
"""
215-
216-
from cuda.core.experimental._device import Device # avoid circular import
217-
218-
return Device(self._device_id)
258+
if self._device_id >= 0:
259+
from ._device import Device # avoid circular import
260+
return Device(self._device_id)
219261

220262
@property
221263
def context(self) -> Context:
222264
"""Return the :obj:`~_context.Context` associated with this event."""
223-
return Context._from_ctx(self._ctx_handle, self._device_id)
265+
if self._ctx_handle is not None and self._device_id >= 0:
266+
return Context._from_ctx(self._ctx_handle, self._device_id)
267+
268+
269+
cdef class IPCEventDescriptor:
    """Serializable object describing an event that can be shared between processes.

    Instances are created through ``_init`` (used by the Event APIs and by
    pickling); calling the constructor directly raises ``RuntimeError``.
    """

    cdef:
        # Raw bytes of the driver's IPC event handle.
        bytes _reserved
        # Busy-wait setting of the exporting event, carried to the importer.
        bint _busy_waited

    def __init__(self, *arg, **kwargs):
        raise RuntimeError("IPCEventDescriptor objects cannot be instantiated directly. Please use Event APIs.")

    @classmethod
    def _init(cls, reserved: bytes, busy_waited: bint):
        """Build a descriptor from the handle bytes and the busy-wait flag."""
        cdef IPCEventDescriptor self = IPCEventDescriptor.__new__(cls)
        self._reserved = reserved
        self._busy_waited = busy_waited
        return self

    def __eq__(self, other):
        # Returning NotImplemented for foreign types (including None) lets
        # Python fall back to its default comparison instead of failing.
        if not isinstance(other, IPCEventDescriptor):
            return NotImplemented
        # No need to check self._busy_waited.
        return self._reserved == (<IPCEventDescriptor>other)._reserved

    def __hash__(self):
        # Keep hashing consistent with __eq__, which compares only the handle bytes.
        return hash(self._reserved)

    def __reduce__(self):
        # Pickle as a call to _init with the two stored fields.
        return self._init, (self._reserved, self._busy_waited)
292+
293+
294+
def _reduce_event(event):
    """Reduce an Event to a constructor call taking its IPC descriptor."""
    descriptor = event.get_ipc_descriptor()
    return event.from_ipc_descriptor, (descriptor,)


# Register the reducer so multiprocessing serializes Event objects through
# their IPC descriptor.
multiprocessing.reduction.register(Event, _reduce_event)

cuda_core/cuda/core/experimental/_memory.pyx

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -226,11 +226,11 @@ cdef class Buffer(_cyBuffer, MemoryResourceAttributes):
226226
if stream is None:
227227
# Note: match this behavior to DeviceMemoryResource.allocate()
228228
stream = default_stream()
229-
cdef cydriver.CUmemPoolPtrExportData share_data
230-
memcpy(share_data.reserved, <const void*><const char*>(ipc_buffer._reserved), sizeof(share_data.reserved))
229+
cdef cydriver.CUmemPoolPtrExportData data
230+
memcpy(data.reserved, <const void*><const char*>(ipc_buffer._reserved), sizeof(data.reserved))
231231
cdef cydriver.CUdeviceptr ptr
232232
with nogil:
233-
HANDLE_RETURN(cydriver.cuMemPoolImportPointer(&ptr, mr._mempool_handle, &share_data))
233+
HANDLE_RETURN(cydriver.cuMemPoolImportPointer(&ptr, mr._mempool_handle, &data))
234234
return Buffer._init(<intptr_t>ptr, ipc_buffer.size, mr, stream)
235235

236236
def copy_to(self, dst: Buffer = None, *, stream: Stream) -> Buffer:
@@ -511,7 +511,7 @@ cdef class DeviceMemoryResourceOptions:
511511
(Default to 0)
512512
"""
513513
ipc_enabled : cython.bint = False
514-
max_size : cython.int = 0
514+
max_size : cython.size_t = 0
515515

516516

517517
# TODO: cythonize this?

cuda_core/cuda/core/experimental/_stream.pyx

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,13 @@ cdef class Stream:
260260
# and CU_EVENT_RECORD_EXTERNAL, can be set in EventOptions.
261261
if event is None:
262262
self._get_device_and_context()
263-
event = Event._init(<int>(self._device_id), <uintptr_t>(self._ctx_handle), options)
263+
event = Event._init(<int>(self._device_id), <uintptr_t>(self._ctx_handle), options, False)
264+
elif event.is_ipc_enabled:
265+
raise TypeError(
266+
"IPC-enabled events should not be re-recorded, instead create a "
267+
"new event by supplying options."
268+
)
269+
264270
cdef cydriver.CUevent e = (<cyEvent?>(event))._handle
265271
with nogil:
266272
HANDLE_RETURN(cydriver.cuEventRecord(e, self._handle))

cuda_core/tests/helpers.py renamed to cuda_core/tests/helpers/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
import os
55
import pathlib
6+
import platform
67
import sys
78

89
CUDA_PATH = os.environ.get("CUDA_PATH")
@@ -22,12 +23,13 @@
2223
import cuda_python_test_helpers
2324
except ImportError:
2425
# Import shared platform helpers for tests across repos
25-
sys.path.insert(0, str(pathlib.Path(__file__).resolve().parents[2] / "cuda_python_test_helpers"))
26+
sys.path.insert(0, str(pathlib.Path(__file__).resolve().parents[3] / "cuda_python_test_helpers"))
2627
import cuda_python_test_helpers
2728

2829

2930
IS_WSL = cuda_python_test_helpers.IS_WSL
3031
supports_ipc_mempool = cuda_python_test_helpers.supports_ipc_mempool
32+
IS_WINDOWS = platform.system() == "Windows"
3133

3234

3335
del cuda_python_test_helpers

cuda_core/tests/helpers/buffers.py

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
import ctypes
5+
import sys
6+
7+
from cuda.core.experimental import Buffer, MemoryResource
8+
from cuda.core.experimental._utils.cuda_utils import driver, handle_return
9+
10+
# Load the platform C runtime; its memcmp is used to compare raw buffers.
if sys.platform.startswith("win"):
    libc = ctypes.CDLL("msvcrt.dll")
else:
    libc = ctypes.CDLL("libc.so.6")

# Declare the memcmp prototype explicitly. Without it ctypes passes Python
# integers as C ``int``, which truncates sizes that do not fit in 32 bits.
libc.memcmp.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t]
libc.memcmp.restype = ctypes.c_int
14+
15+
16+
__all__ = ["DummyUnifiedMemoryResource", "PatternGen", "make_scratch_buffer", "compare_equal_buffers"]
17+
18+
19+
class DummyUnifiedMemoryResource(MemoryResource):
    """Minimal memory resource backed by CUDA managed memory, for use in tests.

    Parameters
    ----------
    device
        The device this resource is associated with. Either a device object
        or a plain device ordinal.
    """

    def __init__(self, device):
        self.device = device

    def allocate(self, size, stream=None) -> Buffer:
        """Allocate ``size`` bytes of managed memory; ``stream`` is accepted but unused."""
        ptr = handle_return(driver.cuMemAllocManaged(size, driver.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL.value))
        return Buffer.from_handle(ptr=ptr, size=size, mr=self)

    def deallocate(self, ptr, size, stream=None):
        """Free ``ptr``; ``size`` and ``stream`` are accepted but unused."""
        handle_return(driver.cuMemFree(ptr))

    @property
    def is_device_accessible(self) -> bool:
        return True

    @property
    def is_host_accessible(self) -> bool:
        return True

    @property
    def device_id(self) -> int:
        # Callers in this module pass a device object rather than an ordinal.
        # Report its ``device_id`` when it has one so the annotated ``int``
        # contract holds; otherwise return the stored value unchanged.
        return getattr(self.device, "device_id", self.device)
41+
42+
43+
class PatternGen:
    """
    Provides methods to fill a target buffer with known test patterns and
    verify the expected values.

    If a stream is provided, operations are synchronized with respect to that
    stream. Otherwise, they are synchronized over the device.

    The test pattern is either a fixed value or a cyclic pattern generated from
    an 8-bit seed. Only one of `value` or `seed` should be supplied.

    Distinct test patterns are stored in private buffers called pattern
    buffers. Calls to `fill_buffer` copy from a pattern buffer to the target
    buffer. Calls to `verify_buffer` copy from the target buffer to a scratch
    buffer and then perform a comparison.
    """

    def __init__(self, device, size, stream=None):
        self.device = device
        self.size = size
        # Without a caller-supplied stream, copies run on a private stream and
        # verification synchronizes the whole device.
        self.stream = stream if stream is not None else device.create_stream()
        self.sync_target = stream if stream is not None else device
        # Cache of pattern buffers keyed by (seed, value).
        self.pattern_buffers = {}

    def fill_buffer(self, buffer, seed=None, value=None):
        """Fill a buffer with the cyclic pattern for `seed` or the fixed byte `value`."""
        assert buffer.size == self.size
        pattern_buffer = self._get_pattern_buffer(seed, value)
        buffer.copy_from(pattern_buffer, stream=self.stream)

    def verify_buffer(self, buffer, seed=None, value=None):
        """Assert that the buffer holds the pattern for `seed` or the fixed byte `value`."""
        assert buffer.size == self.size
        scratch_buffer = DummyUnifiedMemoryResource(self.device).allocate(self.size)
        ptr_test = self._ptr(scratch_buffer)
        pattern_buffer = self._get_pattern_buffer(seed, value)
        ptr_expected = self._ptr(pattern_buffer)
        scratch_buffer.copy_from(buffer, stream=self.stream)
        # Wait for the copy to finish before reading the scratch buffer on the host.
        self.sync_target.sync()
        # Pass the size as size_t explicitly: a bare Python int would be
        # converted to a C int and truncated for very large buffers.
        assert libc.memcmp(ptr_test, ptr_expected, ctypes.c_size_t(self.size)) == 0

    @staticmethod
    def _ptr(buffer):
        """Get a pointer to the specified buffer."""
        return ctypes.cast(int(buffer.handle), ctypes.POINTER(ctypes.c_ubyte))

    @staticmethod
    def _pattern_bytes(seed, size):
        """Return `size` bytes where byte i equals (seed + i) modulo 256."""
        # The pattern repeats every 256 bytes, so build one period and tile it.
        period = bytes((seed + i) & 0xFF for i in range(256))
        repeats, remainder = divmod(size, 256)
        return period * repeats + period[:remainder]

    def _get_pattern_buffer(self, seed, value):
        """Get a buffer holding the specified test pattern."""
        assert seed is None or value is None
        if value is None:
            seed = (0 if seed is None else seed) & 0xFF
        else:
            # Only the low 8 bits are ever written, so normalize the cache key too.
            value &= 0xFF
        key = seed, value
        pattern_buffer = self.pattern_buffers.get(key, None)
        if pattern_buffer is None:
            if value is not None:
                pattern_buffer = make_scratch_buffer(self.device, value, self.size)
            else:
                pattern_buffer = DummyUnifiedMemoryResource(self.device).allocate(self.size)
                # One bulk copy instead of a per-byte Python loop.
                ctypes.memmove(self._ptr(pattern_buffer), self._pattern_bytes(seed, self.size), self.size)
            self.pattern_buffers[key] = pattern_buffer
        return pattern_buffer
106+
107+
108+
def make_scratch_buffer(device, value, nbytes):
    """Allocate `nbytes` of unified memory and set every byte to the low 8 bits of `value`."""
    resource = DummyUnifiedMemoryResource(device)
    scratch = resource.allocate(nbytes)
    base = ctypes.cast(int(scratch.handle), ctypes.POINTER(ctypes.c_byte))
    fill_byte = value & 0xFF
    ctypes.memset(base, fill_byte, nbytes)
    return scratch
114+
115+
116+
def compare_equal_buffers(buffer1, buffer2):
    """Compare the contents of two host-accessible buffers for bitwise equality.

    Returns False without reading memory when the sizes differ; otherwise
    returns True only if all bytes match.
    """
    if buffer1.size != buffer2.size:
        return False
    ptr1 = ctypes.cast(int(buffer1.handle), ctypes.POINTER(ctypes.c_ubyte))
    ptr2 = ctypes.cast(int(buffer2.handle), ctypes.POINTER(ctypes.c_ubyte))
    # Pass the size as size_t explicitly: a bare Python int would be converted
    # to a C int and truncated for very large buffers.
    return libc.memcmp(ptr1, ptr2, ctypes.c_size_t(buffer1.size)) == 0

0 commit comments

Comments
 (0)