-
Couldn't load subscription status.
- Fork 42
Handle cuda.core.Stream in driver operations
#401
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 10 commits
a0f25af
5322eef
251f4e9
505cd4d
b861723
b53f9ca
2181748
46863d3
2082063
ec5841c
2e45f6d
4fcf9d1
220c2e3
f3b07c0
387ba84
20440ab
1a00d67
f0ff9d5
1b59b5c
6f8ddb3
9ab36e7
d1ad577
b7b56eb
c3e10af
9b301a8
324a48a
7df62ce
f859466
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -58,11 +58,7 @@ | |
|
|
||
| if USE_NV_BINDING: | ||
| from cuda.bindings import driver as binding | ||
| from cuda.core.experimental import ( | ||
| Linker, | ||
| LinkerOptions, | ||
| ObjectCode, | ||
| ) | ||
| from cuda.core import experimental | ||
|
|
||
| # There is no definition of the default stream in the Nvidia bindings (nor | ||
| # is there at the C/C++ level), so we define it here so we don't need to | ||
|
|
@@ -1507,7 +1503,7 @@ def create_module_ptx(self, ptx): | |
| if isinstance(ptx, str): | ||
| ptx = ptx.encode("utf8") | ||
| if USE_NV_BINDING: | ||
| image = ObjectCode.from_ptx(ptx) | ||
| image = experimental.ObjectCode.from_ptx(ptx) | ||
| else: | ||
| image = c_char_p(ptx) | ||
| return self.create_module_image(image) | ||
|
|
@@ -2337,6 +2333,11 @@ def __int__(self): | |
| # The default stream's handle.value is 0, which gives `None` | ||
| return self.handle.value or drvapi.CU_STREAM_DEFAULT | ||
|
|
||
| def __cuda_stream__(self): | ||
| if not self.handle.value: | ||
| return (0, drvapi.CU_STREAM_DEFAULT) | ||
| return 0, self.handle.value if USE_NV_BINDING else self.handle | ||
brandon-b-miller marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| def __repr__(self): | ||
| default_streams = { | ||
| drvapi.CU_STREAM_DEFAULT: "<Default CUDA stream on %s>", | ||
|
|
@@ -2975,7 +2976,7 @@ def __init__( | |
| self.lto = lto | ||
| self.additional_flags = additional_flags | ||
|
|
||
| self.options = LinkerOptions( | ||
| self.options = experimental.LinkerOptions( | ||
| max_register_count=self.max_registers, | ||
| lineinfo=lineinfo, | ||
| arch=arch, | ||
|
|
@@ -3002,7 +3003,7 @@ def error_log(self): | |
| raise RuntimeError("Link not yet complete.") | ||
|
|
||
| def add_ptx(self, ptx, name="<cudapy-ptx>"): | ||
| obj = ObjectCode.from_ptx(ptx, name=name) | ||
| obj = experimental.ObjectCode.from_ptx(ptx, name=name) | ||
| self._object_codes.append(obj) | ||
|
|
||
| def add_cu(self, cu, name="<cudapy-cu>"): | ||
|
|
@@ -3018,23 +3019,23 @@ def add_cu(self, cu, name="<cudapy-cu>"): | |
| self._object_codes.append(obj) | ||
|
|
||
| def add_cubin(self, cubin, name="<cudapy-cubin>"): | ||
| obj = ObjectCode.from_cubin(cubin, name=name) | ||
| obj = experimental.ObjectCode.from_cubin(cubin, name=name) | ||
| self._object_codes.append(obj) | ||
|
|
||
| def add_ltoir(self, ltoir, name="<cudapy-ltoir>"): | ||
| obj = ObjectCode.from_ltoir(ltoir, name=name) | ||
| obj = experimental.ObjectCode.from_ltoir(ltoir, name=name) | ||
| self._object_codes.append(obj) | ||
|
|
||
| def add_fatbin(self, fatbin, name="<cudapy-fatbin>"): | ||
| obj = ObjectCode.from_fatbin(fatbin, name=name) | ||
| obj = experimental.ObjectCode.from_fatbin(fatbin, name=name) | ||
| self._object_codes.append(obj) | ||
|
|
||
| def add_object(self, obj, name="<cudapy-object>"): | ||
| obj = ObjectCode.from_object(obj, name=name) | ||
| obj = experimental.ObjectCode.from_object(obj, name=name) | ||
| self._object_codes.append(obj) | ||
|
|
||
| def add_library(self, lib, name="<cudapy-lib>"): | ||
| obj = ObjectCode.from_library(lib, name=name) | ||
| obj = experimental.ObjectCode.from_library(lib, name=name) | ||
| self._object_codes.append(obj) | ||
|
|
||
| def add_file(self, path, kind): | ||
|
|
@@ -3068,15 +3069,15 @@ def add_data(self, data, kind, name): | |
| fn(data, name) | ||
|
|
||
| def get_linked_ptx(self): | ||
| options = LinkerOptions( | ||
| options = experimental.LinkerOptions( | ||
| max_register_count=self.max_registers, | ||
| lineinfo=self.lineinfo, | ||
| arch=self.arch, | ||
| link_time_optimization=True, | ||
| ptx=True, | ||
| ) | ||
|
|
||
| self.linker = Linker(*self._object_codes, options=options) | ||
| self.linker = experimental.Linker(*self._object_codes, options=options) | ||
|
|
||
| result = self.linker.link("ptx") | ||
| self.close() | ||
|
|
@@ -3089,7 +3090,9 @@ def close(self): | |
| self.linker.close() | ||
|
|
||
| def complete(self): | ||
| self.linker = Linker(*self._object_codes, options=self.options) | ||
| self.linker = experimental.Linker( | ||
| *self._object_codes, options=self.options | ||
| ) | ||
| result = self.linker.link("cubin") | ||
| self.close() | ||
| self._complete = True | ||
|
|
@@ -3496,20 +3499,15 @@ def host_to_device(dst, src, size, stream=0): | |
| it should not be changed until the operation which can be asynchronous | ||
| completes. | ||
| """ | ||
| varargs = [] | ||
| fn = driver.cuMemcpyHtoD | ||
| args = (device_pointer(dst), host_pointer(src, readonly=True), size) | ||
|
|
||
| if stream: | ||
| assert isinstance(stream, Stream) | ||
| assert isinstance(stream, (Stream, experimental.Stream)) | ||
| fn = driver.cuMemcpyHtoDAsync | ||
| if USE_NV_BINDING: | ||
| handle = stream.handle.value | ||
| else: | ||
| handle = stream.handle | ||
| varargs.append(handle) | ||
| else: | ||
| fn = driver.cuMemcpyHtoD | ||
| args += (_stream_handle(stream),) | ||
|
|
||
| fn(device_pointer(dst), host_pointer(src, readonly=True), size, *varargs) | ||
| fn(*args) | ||
|
|
||
|
|
||
| def device_to_host(dst, src, size, stream=0): | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. As mentioned below (or above), |
||
|
|
@@ -3518,67 +3516,54 @@ def device_to_host(dst, src, size, stream=0): | |
| it should not be changed until the operation which can be asynchronous | ||
| completes. | ||
| """ | ||
| varargs = [] | ||
| fn = driver.cuMemcpyDtoH | ||
| args = (host_pointer(dst), device_pointer(src), size) | ||
|
|
||
| if stream: | ||
| assert isinstance(stream, Stream) | ||
| assert isinstance(stream, (Stream, experimental.Stream)) | ||
| fn = driver.cuMemcpyDtoHAsync | ||
| if USE_NV_BINDING: | ||
| handle = stream.handle.value | ||
| else: | ||
| handle = stream.handle | ||
| varargs.append(handle) | ||
| else: | ||
| fn = driver.cuMemcpyDtoH | ||
| args += (_stream_handle(stream),) | ||
|
|
||
| fn(host_pointer(dst), device_pointer(src), size, *varargs) | ||
| fn(*args) | ||
|
|
||
|
|
||
| def device_to_device(dst, src, size, stream=0): | ||
| """ | ||
| NOTE: The underlying data pointer from the host data buffer is used and | ||
| NOTE: The underlying data pointer from the device buffer is used and | ||
| it should not be changed until the operation which can be asynchronous | ||
| completes. | ||
| """ | ||
| varargs = [] | ||
| fn = driver.cuMemcpyDtoD | ||
| args = (device_pointer(dst), device_pointer(src), size) | ||
|
|
||
| if stream: | ||
| assert isinstance(stream, Stream) | ||
| assert isinstance(stream, (Stream, experimental.Stream)) | ||
| fn = driver.cuMemcpyDtoDAsync | ||
| if USE_NV_BINDING: | ||
| handle = stream.handle.value | ||
| else: | ||
| handle = stream.handle | ||
| varargs.append(handle) | ||
| else: | ||
| fn = driver.cuMemcpyDtoD | ||
| args += (_stream_handle(stream),) | ||
|
|
||
| fn(device_pointer(dst), device_pointer(src), size, *varargs) | ||
| fn(*args) | ||
|
|
||
|
|
||
| def device_memset(dst, val, size, stream=0): | ||
| """Memset on the device. | ||
| If stream is not zero, asynchronous mode is used. | ||
| """ | ||
| Memset on the device. | ||
| If stream is 0, the call is synchronous. | ||
| If stream is a Stream object, asynchronous mode is used. | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There is a bug (or change of behavior) here and elsewhere. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is a really good catch. As a follow-up to this, is the output here as expected, where I ask hoping there's a reliable way of detecting this situation based on the passed object. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. After a while searching around the codebase, I concluded this was at least the original intention, though these are really only used for the deprecated device array API: So AFAICT this PR maintains the above behavior, just with a new stream object. Ultimately, though, I'm not sure we should spend too much time thinking about it, as these will be removed and users performing these types of memory transfers should use either cupy for a nice array API or |
||
|
|
||
| dst: device memory | ||
| val: byte value to be written | ||
| size: number of byte to be written | ||
| stream: a CUDA stream | ||
| size: number of bytes to be written | ||
| stream: 0 (synchronous) or a CUDA stream | ||
| """ | ||
| varargs = [] | ||
| fn = driver.cuMemsetD8 | ||
| args = (device_pointer(dst), val, size) | ||
|
|
||
| if stream: | ||
| assert isinstance(stream, Stream) | ||
| assert isinstance(stream, (Stream, experimental.Stream)) | ||
| fn = driver.cuMemsetD8Async | ||
| if USE_NV_BINDING: | ||
| handle = stream.handle.value | ||
| else: | ||
| handle = stream.handle | ||
| varargs.append(handle) | ||
| else: | ||
| fn = driver.cuMemsetD8 | ||
| args += (_stream_handle(stream),) | ||
|
|
||
| fn(device_pointer(dst), val, size, *varargs) | ||
| fn(*args) | ||
|
|
||
|
|
||
| def profile_start(): | ||
|
|
@@ -3639,3 +3624,20 @@ def inspect_obj_content(objpath: str): | |
| code_types.add(match.group(1)) | ||
|
|
||
| return code_types | ||
|
|
||
|
|
||
| def _stream_handle(stream): | ||
| if stream == 0: | ||
brandon-b-miller marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| return stream | ||
| elif hasattr(stream, "__cuda_stream__"): | ||
| _, ptr = stream.__cuda_stream__() | ||
brandon-b-miller marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| if isinstance(ptr, binding.CUstream): | ||
| return int(ptr) | ||
brandon-b-miller marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| else: | ||
| return ptr | ||
| else: | ||
| allowed = (Stream, experimental.Stream) if USE_NV_BINDING else Stream | ||
| if not isinstance(stream, allowed): | ||
| raise TypeError( | ||
| "Expected a Stream object or 0, got %s" % type(stream).__name__ | ||
| ) | ||
Uh oh!
There was an error while loading. Please reload this page.