Description
One known source of overhead in numba-cuda is kernel launch. We should profile it carefully and come up with a comprehensive improvement plan, but one obvious source of latency is preparing the kernel arguments, which is currently done using ctypes. We should switch to using cuda.core.launch() (a rough sketch follows the snippet below).
numba-cuda/numba_cuda/numba/cuda/dispatcher.py
Lines 454 to 487 in 97ce4b1
```python
def launch(self, args, griddim, blockdim, stream=0, sharedmem=0):
    # Prepare kernel
    cufunc = self._codelibrary.get_cufunc()

    if self.debug:
        excname = cufunc.name + "__errcode__"
        excmem, excsz = cufunc.module.get_global_symbol(excname)
        assert excsz == ctypes.sizeof(ctypes.c_int)
        excval = ctypes.c_int()
        excmem.memset(0, stream=stream)

    # Prepare arguments
    retr = []  # hold functors for writeback
    kernelargs = []
    for t, v in zip(self.argument_types, args):
        self._prepare_args(t, v, stream, retr, kernelargs)

    if driver.USE_NV_BINDING:
        stream_handle = stream and stream.handle.value or 0
    else:
        zero_stream = None
        stream_handle = stream and stream.handle or zero_stream

    # Invoke kernel
    driver.launch_kernel(
        cufunc.handle,
        *griddim,
        *blockdim,
        sharedmem,
        stream_handle,
        kernelargs,
        cooperative=self.cooperative,
    )
```