-
Couldn't load subscription status.
- Fork 42
Handle cuda.core.Stream in driver operations
#401
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 26 commits
a0f25af
5322eef
251f4e9
505cd4d
b861723
b53f9ca
2181748
46863d3
2082063
ec5841c
2e45f6d
4fcf9d1
220c2e3
f3b07c0
387ba84
20440ab
1a00d67
f0ff9d5
1b59b5c
6f8ddb3
9ab36e7
d1ad577
b7b56eb
c3e10af
9b301a8
324a48a
7df62ce
f859466
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -64,6 +64,12 @@ | |
| ObjectCode, | ||
| ) | ||
|
|
||
| from cuda.bindings.utils import get_cuda_native_handle | ||
| from cuda.core.experimental import ( | ||
| Stream as ExperimentalStream, | ||
| ) | ||
|
|
||
|
|
||
| # There is no definition of the default stream in the Nvidia bindings (nor | ||
| # is there at the C/C++ level), so we define it here so we don't need to | ||
| # use a magic number 0 in places where we want the default stream. | ||
|
|
@@ -2064,6 +2070,11 @@ def __int__(self): | |
| # The default stream's handle.value is 0, which gives `None` | ||
| return self.handle.value or drvapi.CU_STREAM_DEFAULT | ||
|
|
||
| def __cuda_stream__(self): | ||
| if not self.handle.value: | ||
| return (0, drvapi.CU_STREAM_DEFAULT) | ||
| return (0, self.handle.value) | ||
|
|
||
| def __repr__(self): | ||
| default_streams = { | ||
| drvapi.CU_STREAM_DEFAULT: "<Default CUDA stream on %s>", | ||
|
|
@@ -3080,17 +3091,14 @@ def host_to_device(dst, src, size, stream=0): | |
| it should not be changed until the operation which can be asynchronous | ||
| completes. | ||
| """ | ||
| varargs = [] | ||
| fn = driver.cuMemcpyHtoD | ||
| args = (device_pointer(dst), host_pointer(src, readonly=True), size) | ||
|
|
||
| if stream: | ||
| assert isinstance(stream, Stream) | ||
| fn = driver.cuMemcpyHtoDAsync | ||
| handle = stream.handle.value | ||
| varargs.append(handle) | ||
| else: | ||
| fn = driver.cuMemcpyHtoD | ||
| args += (_stream_handle(stream),) | ||
|
|
||
| fn(device_pointer(dst), host_pointer(src, readonly=True), size, *varargs) | ||
| fn(*args) | ||
|
|
||
|
|
||
| def device_to_host(dst, src, size, stream=0): | ||
|
|
@@ -3099,61 +3107,52 @@ def device_to_host(dst, src, size, stream=0): | |
| it should not be changed until the operation which can be asynchronous | ||
| completes. | ||
| """ | ||
| varargs = [] | ||
| fn = driver.cuMemcpyDtoH | ||
| args = (host_pointer(dst), device_pointer(src), size) | ||
|
|
||
| if stream: | ||
| assert isinstance(stream, Stream) | ||
| fn = driver.cuMemcpyDtoHAsync | ||
| handle = stream.handle.value | ||
| varargs.append(handle) | ||
| else: | ||
| fn = driver.cuMemcpyDtoH | ||
| args += (_stream_handle(stream),) | ||
|
|
||
| fn(host_pointer(dst), device_pointer(src), size, *varargs) | ||
| fn(*args) | ||
|
|
||
|
|
||
| def device_to_device(dst, src, size, stream=0): | ||
| """ | ||
| NOTE: The underlying data pointer from the host data buffer is used and | ||
| NOTE: The underlying data pointer from the device buffer is used and | ||
| it should not be changed until the operation which can be asynchronous | ||
| completes. | ||
| """ | ||
| varargs = [] | ||
| fn = driver.cuMemcpyDtoD | ||
| args = (device_pointer(dst), device_pointer(src), size) | ||
|
|
||
| if stream: | ||
| assert isinstance(stream, Stream) | ||
| fn = driver.cuMemcpyDtoDAsync | ||
| handle = stream.handle.value | ||
| varargs.append(handle) | ||
| else: | ||
| fn = driver.cuMemcpyDtoD | ||
| args += (_stream_handle(stream),) | ||
|
|
||
| fn(device_pointer(dst), device_pointer(src), size, *varargs) | ||
| fn(*args) | ||
|
|
||
|
|
||
| def device_memset(dst, val, size, stream=0): | ||
| """Memset on the device. | ||
| If stream is not zero, asynchronous mode is used. | ||
| """ | ||
| Memset on the device. | ||
| If stream is 0, the call is synchronous. | ||
| If stream is a Stream object, asynchronous mode is used. | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is a bug (or change or behavior) here and elsewhere. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a really good catch. As a follow up to this, is the output here as expected, where I ask hoping there's a reliable way of detecting this situation based on the passed object. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. After a while searching around the codebase I concluded this was at least the original intention, though these are really only used for the deprecated device array API: So AFAICT this PR maintains the above behavior just with a new stream object. Ultimately though I'm not sure we should spend too much time thinking about it as these will be removed and users performing these types of memory transfers should use either cupy for a nice array API or |
||
|
|
||
| dst: device memory | ||
| val: byte value to be written | ||
| size: number of byte to be written | ||
| stream: a CUDA stream | ||
| size: number of bytes to be written | ||
| stream: 0 (synchronous) or a CUDA stream | ||
| """ | ||
| ptr = device_pointer(dst) | ||
|
|
||
| varargs = [] | ||
| fn = driver.cuMemsetD8 | ||
| args = (device_pointer(dst), val, size) | ||
|
|
||
| if stream: | ||
| assert isinstance(stream, Stream) | ||
| fn = driver.cuMemsetD8Async | ||
| handle = stream.handle.value | ||
| varargs.append(handle) | ||
| else: | ||
| fn = driver.cuMemsetD8 | ||
| args += (_stream_handle(stream),) | ||
|
|
||
| try: | ||
| fn(ptr, val, size, *varargs) | ||
| fn(*args) | ||
| except CudaAPIError as e: | ||
| invalid = binding.CUresult.CUDA_ERROR_INVALID_VALUE | ||
| if ( | ||
|
|
@@ -3226,3 +3225,28 @@ def inspect_obj_content(objpath: str): | |
| code_types.add(match.group(1)) | ||
|
|
||
| return code_types | ||
|
|
||
|
|
||
| def _stream_handle(stream): | ||
| """ | ||
| Obtain the appropriate handle for various types of | ||
| acceptable stream objects. Acceptable types are | ||
| int (0 for default stream), Stream, ExperimentalStream | ||
|
Comment on lines
+3233
to
+3234
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is the docstring outdated? int is currently not allowed There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Only for the special value 0 I believe. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we consider deprecating allowing passing It would be great if we could introduce a deprecation warning in some form to passing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. From the user perspective we're deprecating the apis fully in #546, so those should be gone entirely. But we should do a sweep and make sure we're being explicit with all our usages of streams internally. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Outside of the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. launching is tested as part of this PR, events added in 7df62ce though. |
||
| """ | ||
|
|
||
| if stream == 0: | ||
brandon-b-miller marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| return stream | ||
| allowed = (Stream, ExperimentalStream) | ||
| if not isinstance(stream, allowed): | ||
| raise TypeError( | ||
| "Expected a Stream object or 0, got %s" % type(stream).__name__ | ||
| ) | ||
| elif hasattr(stream, "__cuda_stream__"): | ||
| ver, ptr = stream.__cuda_stream__() | ||
| assert ver == 0 | ||
| if isinstance(ptr, binding.CUstream): | ||
| return get_cuda_native_handle(ptr) | ||
brandon-b-miller marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| else: | ||
| return ptr | ||
| else: | ||
| raise TypeError("Invalid Stream") | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As mentioned below (or above),
streamsemantics is changed which probably has a bigger impact to this method, because the copy is now asynchronous and to accesssrcon host a stream synchronization is needed.