Merged

28 commits
a0f25af
initial
brandon-b-miller Aug 15, 2025
5322eef
tests
brandon-b-miller Aug 16, 2025
251f4e9
refactor
brandon-b-miller Aug 16, 2025
505cd4d
small changes
brandon-b-miller Aug 18, 2025
b861723
__cuda_stream__
brandon-b-miller Aug 18, 2025
b53f9ca
Merge branch 'main' into cuda-core-streams
brandon-b-miller Aug 20, 2025
2181748
accomodate ctypes bindings
brandon-b-miller Aug 20, 2025
46863d3
clean
brandon-b-miller Aug 20, 2025
2082063
more pacifying ctypes bindings
brandon-b-miller Aug 20, 2025
ec5841c
fix
brandon-b-miller Aug 20, 2025
2e45f6d
Merge branch 'main' into cuda-core-streams
brandon-b-miller Aug 25, 2025
4fcf9d1
renaming
brandon-b-miller Aug 25, 2025
220c2e3
address reviews
brandon-b-miller Aug 25, 2025
f3b07c0
Update numba_cuda/numba/cuda/cudadrv/driver.py
brandon-b-miller Aug 25, 2025
387ba84
merge/resolve
brandon-b-miller Oct 7, 2025
20440ab
address some reviews
brandon-b-miller Oct 7, 2025
1a00d67
Merge branch 'main' into cuda-core-streams
brandon-b-miller Oct 13, 2025
f0ff9d5
fix ctypes tests
brandon-b-miller Oct 13, 2025
1b59b5c
addressing old comments
brandon-b-miller Oct 13, 2025
6f8ddb3
merge/resolve
brandon-b-miller Oct 14, 2025
9ab36e7
merge/resolve
brandon-b-miller Oct 15, 2025
d1ad577
small fix
brandon-b-miller Oct 15, 2025
b7b56eb
small fix
brandon-b-miller Oct 15, 2025
c3e10af
Merge branch 'main' into cuda-core-streams
brandon-b-miller Oct 24, 2025
9b301a8
Merge branch 'main' into cuda-core-streams
brandon-b-miller Oct 27, 2025
324a48a
USE_NV_BINDING
brandon-b-miller Oct 27, 2025
7df62ce
events
brandon-b-miller Oct 27, 2025
f859466
skip event tests on sim
brandon-b-miller Oct 27, 2025
124 changes: 63 additions & 61 deletions numba_cuda/numba/cuda/cudadrv/driver.py
@@ -58,11 +58,7 @@

if USE_NV_BINDING:
from cuda.bindings import driver as binding
from cuda.core.experimental import (
Linker,
LinkerOptions,
ObjectCode,
)
from cuda.core import experimental

# There is no definition of the default stream in the Nvidia bindings (nor
# is there at the C/C++ level), so we define it here so we don't need to
@@ -1507,7 +1503,7 @@ def create_module_ptx(self, ptx):
if isinstance(ptx, str):
ptx = ptx.encode("utf8")
if USE_NV_BINDING:
image = ObjectCode.from_ptx(ptx)
image = experimental.ObjectCode.from_ptx(ptx)
else:
image = c_char_p(ptx)
return self.create_module_image(image)
@@ -2337,6 +2333,11 @@ def __int__(self):
# The default stream's handle.value is 0, which gives `None`
return self.handle.value or drvapi.CU_STREAM_DEFAULT

def __cuda_stream__(self):
if not self.handle.value:
return (0, drvapi.CU_STREAM_DEFAULT)
return 0, self.handle.value if USE_NV_BINDING else self.handle

def __repr__(self):
default_streams = {
drvapi.CU_STREAM_DEFAULT: "<Default CUDA stream on %s>",
@@ -2975,7 +2976,7 @@ def __init__(
self.lto = lto
self.additional_flags = additional_flags

self.options = LinkerOptions(
self.options = experimental.LinkerOptions(
max_register_count=self.max_registers,
lineinfo=lineinfo,
arch=arch,
@@ -3002,7 +3003,7 @@ def error_log(self):
raise RuntimeError("Link not yet complete.")

def add_ptx(self, ptx, name="<cudapy-ptx>"):
obj = ObjectCode.from_ptx(ptx, name=name)
obj = experimental.ObjectCode.from_ptx(ptx, name=name)
self._object_codes.append(obj)

def add_cu(self, cu, name="<cudapy-cu>"):
@@ -3018,23 +3019,23 @@ def add_cu(self, cu, name="<cudapy-cu>"):
self._object_codes.append(obj)

def add_cubin(self, cubin, name="<cudapy-cubin>"):
obj = ObjectCode.from_cubin(cubin, name=name)
obj = experimental.ObjectCode.from_cubin(cubin, name=name)
self._object_codes.append(obj)

def add_ltoir(self, ltoir, name="<cudapy-ltoir>"):
obj = ObjectCode.from_ltoir(ltoir, name=name)
obj = experimental.ObjectCode.from_ltoir(ltoir, name=name)
self._object_codes.append(obj)

def add_fatbin(self, fatbin, name="<cudapy-fatbin>"):
obj = ObjectCode.from_fatbin(fatbin, name=name)
obj = experimental.ObjectCode.from_fatbin(fatbin, name=name)
self._object_codes.append(obj)

def add_object(self, obj, name="<cudapy-object>"):
obj = ObjectCode.from_object(obj, name=name)
obj = experimental.ObjectCode.from_object(obj, name=name)
self._object_codes.append(obj)

def add_library(self, lib, name="<cudapy-lib>"):
obj = ObjectCode.from_library(lib, name=name)
obj = experimental.ObjectCode.from_library(lib, name=name)
self._object_codes.append(obj)

def add_file(self, path, kind):
@@ -3068,15 +3069,15 @@ def add_data(self, data, kind, name):
fn(data, name)

def get_linked_ptx(self):
options = LinkerOptions(
options = experimental.LinkerOptions(
max_register_count=self.max_registers,
lineinfo=self.lineinfo,
arch=self.arch,
link_time_optimization=True,
ptx=True,
)

self.linker = Linker(*self._object_codes, options=options)
self.linker = experimental.Linker(*self._object_codes, options=options)

result = self.linker.link("ptx")
self.close()
@@ -3089,7 +3090,9 @@ def close(self):
self.linker.close()

def complete(self):
self.linker = Linker(*self._object_codes, options=self.options)
self.linker = experimental.Linker(
*self._object_codes, options=self.options
)
result = self.linker.link("cubin")
self.close()
self._complete = True
@@ -3496,20 +3499,15 @@ def host_to_device(dst, src, size, stream=0):
it should not be changed until the operation which can be asynchronous
completes.
"""
varargs = []
fn = driver.cuMemcpyHtoD
args = (device_pointer(dst), host_pointer(src, readonly=True), size)

if stream:
assert isinstance(stream, Stream)
assert isinstance(stream, (Stream, experimental.Stream))
fn = driver.cuMemcpyHtoDAsync
if USE_NV_BINDING:
handle = stream.handle.value
else:
handle = stream.handle
varargs.append(handle)
else:
fn = driver.cuMemcpyHtoD
args += (_stream_handle(stream),)

fn(device_pointer(dst), host_pointer(src, readonly=True), size, *varargs)
fn(*args)


def device_to_host(dst, src, size, stream=0):
Member: As mentioned below (or above), the stream semantics are changed, which probably has a bigger impact on this method, because the copy is now asynchronous and a stream synchronization is needed to access src on the host.
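
A minimal sketch of the synchronization point this comment calls out, assuming the device_to_host signature from this diff and numba-cuda's Stream.synchronize(); the buffer names are placeholders:

    from ctypes import c_int, sizeof
    from numba import cuda
    from numba.cuda.cudadrv.driver import device_to_host

    host_buf = (c_int * 100)()
    dev_buf = cuda.current_context().memalloc(sizeof(host_buf))
    stream = cuda.stream()

    # Passing a stream object takes the asynchronous branch, so the copy is
    # only enqueued here ...
    device_to_host(host_buf, dev_buf, sizeof(host_buf), stream=stream)

    # ... and the host buffer must not be read until the stream has drained.
    stream.synchronize()
    value = host_buf[0]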

@@ -3518,67 +3516,54 @@ def device_to_host(dst, src, size, stream=0):
it should not be changed until the operation which can be asynchronous
completes.
"""
varargs = []
fn = driver.cuMemcpyDtoH
args = (host_pointer(dst), device_pointer(src), size)

if stream:
assert isinstance(stream, Stream)
assert isinstance(stream, (Stream, experimental.Stream))
fn = driver.cuMemcpyDtoHAsync
if USE_NV_BINDING:
handle = stream.handle.value
else:
handle = stream.handle
varargs.append(handle)
else:
fn = driver.cuMemcpyDtoH
args += (_stream_handle(stream),)

fn(host_pointer(dst), device_pointer(src), size, *varargs)
fn(*args)


def device_to_device(dst, src, size, stream=0):
"""
NOTE: The underlying data pointer from the host data buffer is used and
NOTE: The underlying data pointer from the device buffer is used and
it should not be changed until the operation which can be asynchronous
completes.
"""
varargs = []
fn = driver.cuMemcpyDtoD
args = (device_pointer(dst), device_pointer(src), size)

if stream:
assert isinstance(stream, Stream)
assert isinstance(stream, (Stream, experimental.Stream))
fn = driver.cuMemcpyDtoDAsync
if USE_NV_BINDING:
handle = stream.handle.value
else:
handle = stream.handle
varargs.append(handle)
else:
fn = driver.cuMemcpyDtoD
args += (_stream_handle(stream),)

fn(device_pointer(dst), device_pointer(src), size, *varargs)
fn(*args)


def device_memset(dst, val, size, stream=0):
"""Memset on the device.
If stream is not zero, asynchronous mode is used.
"""
Memset on the device.
If stream is 0, the call is synchronous.
If stream is a Stream object, asynchronous mode is used.
Member: There is a bug (or change of behavior) here and elsewhere. stream can be a Stream object from either numba-cuda or cuda.core but still hold 0 (the default stream) under the hood. However, the call now becomes asynchronous (with respect to the host) instead of synchronous. Just wanted to call it out in case it was not the intention.

Contributor Author: This is a really good catch. As a follow-up, is the output here as expected, where dev is a cuda.core.experimental.Device on which set_current() has been called? Should it not be (0, 0)?

>>> dev.default_stream.__cuda_stream__()
(0, 1)

I ask hoping there's a reliable way of detecting this situation based on the passed object.
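
For reference, a sketch of how a consumer might normalize the protocol's return value, along the lines of the _stream_handle helper added in this diff; raw_stream_handle is a hypothetical name, and it assumes the handle is either a plain int or a cuda.bindings CUstream that int() accepts:

    def raw_stream_handle(stream_like):
        # __cuda_stream__ returns a (version, handle) pair; the handle may be
        # a plain int or a cuda.bindings CUstream, depending on the producer.
        version, handle = stream_like.__cuda_stream__()
        return int(handle)

Called on dev.default_stream, this returns the integer handle shown in the tuple above.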

Contributor Author: After searching around the codebase for a while, I concluded this was at least the original intention, though these are really only used by the deprecated device array API:

        If a CUDA ``stream`` is given, then the transfer will be made
        asynchronously as part as the given stream.  Otherwise, the transfer is
        synchronous: the function returns after the copy is finished.

So AFAICT this PR maintains the above behavior, just with a new stream object. Ultimately, though, I'm not sure we should spend too much time thinking about it, as these will be removed; users performing these types of memory transfers should use either CuPy for a nice array API or cuda.bindings for full control over things like synchronization behavior.
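
For illustration, the quoted semantics with the (deprecated) device array API look roughly like the following sketch, which is not part of this diff:

    import numpy as np
    from numba import cuda

    arr = np.arange(100, dtype=np.int32)

    # Without a stream the transfer is synchronous: the copy has finished
    # when the call returns.
    d_arr = cuda.to_device(arr)

    # With a stream the transfer is asynchronous with respect to the host,
    # so synchronize before touching the buffers involved.
    s = cuda.stream()
    d_arr = cuda.to_device(arr, stream=s)
    result = d_arr.copy_to_host(stream=s)
    s.synchronize()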


dst: device memory
val: byte value to be written
size: number of byte to be written
stream: a CUDA stream
size: number of bytes to be written
stream: 0 (synchronous) or a CUDA stream
"""
varargs = []
fn = driver.cuMemsetD8
args = (device_pointer(dst), val, size)

if stream:
assert isinstance(stream, Stream)
assert isinstance(stream, (Stream, experimental.Stream))
fn = driver.cuMemsetD8Async
if USE_NV_BINDING:
handle = stream.handle.value
else:
handle = stream.handle
varargs.append(handle)
else:
fn = driver.cuMemsetD8
args += (_stream_handle(stream),)

fn(device_pointer(dst), val, size, *varargs)
fn(*args)


def profile_start():
@@ -3639,3 +3624,20 @@ def inspect_obj_content(objpath: str):
code_types.add(match.group(1))

return code_types


def _stream_handle(stream):
if stream == 0:
return stream
elif hasattr(stream, "__cuda_stream__"):
_, ptr = stream.__cuda_stream__()
if isinstance(ptr, binding.CUstream):
return int(ptr)
else:
return ptr
else:
allowed = (Stream, experimental.Stream) if USE_NV_BINDING else Stream
if not isinstance(stream, allowed):
raise TypeError(
"Expected a Stream object or 0, got %s" % type(stream).__name__
)
6 changes: 1 addition & 5 deletions numba_cuda/numba/cuda/dispatcher.py
@@ -468,11 +468,7 @@ def launch(self, args, griddim, blockdim, stream=0, sharedmem=0):
for t, v in zip(self.argument_types, args):
self._prepare_args(t, v, stream, retr, kernelargs)

if driver.USE_NV_BINDING:
stream_handle = stream and stream.handle.value or 0
else:
zero_stream = None
stream_handle = stream and stream.handle or zero_stream
stream_handle = driver._stream_handle(stream)

# Invoke kernel
driver.launch_kernel(
42 changes: 42 additions & 0 deletions numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py
@@ -9,7 +9,10 @@
from numba.cuda.cudadrv import devices, drvapi, driver as _driver
from numba.cuda.testing import unittest, CUDATestCase
from numba.cuda.testing import skip_on_cudasim
import contextlib

if _driver.USE_NV_BINDING:
from cuda.core import experimental

ptx1 = """
.version 1.4
@@ -151,6 +154,45 @@ def test_cuda_driver_stream_operations(self):
for i, v in enumerate(array):
self.assertEqual(i, v)

@unittest.skipIf(not _driver.USE_NV_BINDING, "NV binding not enabled")
def test_cuda_core_stream_operations(self):
module = self.context.create_module_ptx(self.ptx)
function = module.get_function("_Z10helloworldPi")
array = (c_int * 100)()
dev = experimental.Device()
dev.set_current()
stream = dev.create_stream()

@contextlib.contextmanager
def auto_synchronize(stream):
try:
yield stream
finally:
stream.sync()

with auto_synchronize(stream):
memory = self.context.memalloc(sizeof(array))
host_to_device(memory, array, sizeof(array), stream=stream)

ptr = memory.device_ctypes_pointer

launch_kernel(
function.handle, # Kernel
1,
1,
1, # gx, gy, gz
100,
1,
1, # bx, by, bz
0, # dynamic shared mem
stream.handle, # stream
[ptr],
)

device_to_host(array, memory, sizeof(array), stream=stream)
for i, v in enumerate(array):
self.assertEqual(i, v)

def test_cuda_driver_default_stream(self):
# Test properties of the default stream
ds = self.context.get_default_stream()