Skip to content

Commit 9ed0173

Browse files
committed
reduce overhead
1 parent ceaa7f2 commit 9ed0173

File tree

2 files changed

+32 additions, -20 deletions (lines changed)

2 files changed

+32 additions, -20 deletions (lines changed)

cuda_core/cuda/core/experimental/_memory.pyx

Lines changed: 31 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ from typing import Tuple, TypeVar, Union
1515

1616
from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule
1717
from cuda.core.experimental._stream import Stream, default_stream
18-
from cuda.core.experimental._utils.cuda_utils import driver, handle_return
18+
from cuda.core.experimental._utils.cuda_utils import driver
1919

2020
# TODO: define a memory property mixin class and make Buffer and
2121
# MemoryResource both inherit from it
@@ -42,6 +42,7 @@ cdef class Buffer:
4242
uintptr_t _ptr
4343
size_t _size
4444
object _mr
45+
object _ptr_obj
4546

4647
def __init__(self, *args, **kwargs):
4748
raise RuntimeError("Buffer objects cannot be instantiated directly. Please use MemoryResource APIs.")
@@ -50,6 +51,7 @@ cdef class Buffer:
5051
def _init(cls, ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None):
5152
cdef Buffer self = Buffer.__new__(cls)
5253
self._ptr = <uintptr_t>(int(ptr))
54+
self._ptr_obj = ptr
5355
self._size = size
5456
self._mr = mr
5557
return self
@@ -73,6 +75,7 @@ cdef class Buffer:
7375
self._mr.deallocate(self._ptr, self._size, stream)
7476
self._ptr = 0
7577
self._mr = None
78+
self._ptr_obj = None
7679

7780
@property
7881
def handle(self) -> DevicePointerT:
@@ -83,7 +86,7 @@ cdef class Buffer:
8386
This handle is a Python object. To get the memory address of the underlying C
8487
handle, call ``int(Buffer.handle)``.
8588
"""
86-
return self._ptr
89+
return self._ptr_obj
8790

8891
@property
8992
def size(self) -> int:
@@ -147,7 +150,8 @@ cdef class Buffer:
147150
raise ValueError(
148151
f"buffer sizes mismatch between src and dst (sizes are: src={src_size}, dst={dst_size})"
149152
)
150-
handle_return(driver.cuMemcpyAsync(dst._ptr, self._ptr, src_size, stream.handle))
153+
err, = driver.cuMemcpyAsync(dst._ptr, self._ptr, src_size, stream.handle)
154+
raise_if_driver_error(err)
151155
return dst
152156

153157
def copy_from(self, src: Buffer, *, stream: Stream):
@@ -172,7 +176,8 @@ cdef class Buffer:
172176
raise ValueError(
173177
f"buffer sizes mismatch between src and dst (sizes are: src={src_size}, dst={dst_size})"
174178
)
175-
handle_return(driver.cuMemcpyAsync(self._ptr, src._ptr, dst_size, stream.handle))
179+
err, = driver.cuMemcpyAsync(self._ptr, src._ptr, dst_size, stream.handle)
180+
raise_if_driver_error(err)
176181

177182
def __dlpack__(
178183
self,
@@ -332,25 +337,26 @@ class DeviceMemoryResource(MemoryResource):
332337
__slots__ = ("_dev_id",)
333338

334339
def __init__(self, device_id: int):
335-
self._handle = handle_return(driver.cuDeviceGetMemPool(device_id))
340+
err, self._handle = driver.cuDeviceGetMemPool(device_id)
341+
raise_if_driver_error(err)
336342
self._dev_id = device_id
337343

338344
# Set a higher release threshold to improve performance when there are no active allocations.
339345
# By default, the release threshold is 0, which means memory is immediately released back
340346
# to the OS when there are no active suballocations, causing performance issues.
341347
# Check current release threshold
342-
current_threshold = handle_return(
343-
driver.cuMemPoolGetAttribute(self._handle, driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD)
348+
err, current_threshold = driver.cuMemPoolGetAttribute(
349+
self._handle, driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD
344350
)
351+
raise_if_driver_error(err)
345352
# If threshold is 0 (default), set it to maximum to retain memory in the pool
346353
if int(current_threshold) == 0:
347-
handle_return(
348-
driver.cuMemPoolSetAttribute(
349-
self._handle,
350-
driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
351-
driver.cuuint64_t(0xFFFFFFFFFFFFFFFF),
352-
)
354+
err, = driver.cuMemPoolSetAttribute(
355+
self._handle,
356+
driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
357+
driver.cuuint64_t(0xFFFFFFFFFFFFFFFF),
353358
)
359+
raise_if_driver_error(err)
354360

355361
def allocate(self, size: int, stream: Stream = None) -> Buffer:
356362
"""Allocate a buffer of the requested size.
@@ -371,7 +377,8 @@ class DeviceMemoryResource(MemoryResource):
371377
"""
372378
if stream is None:
373379
stream = default_stream()
374-
ptr = handle_return(driver.cuMemAllocFromPoolAsync(size, self._handle, stream.handle))
380+
err, ptr = driver.cuMemAllocFromPoolAsync(size, self._handle, stream.handle)
381+
raise_if_driver_error(err)
375382
return Buffer._init(ptr, size, self)
376383

377384
def deallocate(self, ptr: DevicePointerT, size: int, stream: Stream = None):
@@ -389,7 +396,8 @@ class DeviceMemoryResource(MemoryResource):
389396
"""
390397
if stream is None:
391398
stream = default_stream()
392-
handle_return(driver.cuMemFreeAsync(ptr, stream.handle))
399+
err, = driver.cuMemFreeAsync(ptr, stream.handle)
400+
raise_if_driver_error(err)
393401

394402
@property
395403
def is_device_accessible(self) -> bool:
@@ -431,7 +439,8 @@ class LegacyPinnedMemoryResource(MemoryResource):
431439
Buffer
432440
The allocated buffer object, which is accessible on both host and device.
433441
"""
434-
ptr = handle_return(driver.cuMemAllocHost(size))
442+
err, ptr = driver.cuMemAllocHost(size)
443+
raise_if_driver_error(err)
435444
return Buffer._init(ptr, size, self)
436445

437446
def deallocate(self, ptr: DevicePointerT, size: int, stream: Stream = None):
@@ -449,7 +458,8 @@ class LegacyPinnedMemoryResource(MemoryResource):
449458
"""
450459
if stream:
451460
stream.sync()
452-
handle_return(driver.cuMemFreeHost(ptr))
461+
err, = driver.cuMemFreeHost(ptr)
462+
raise_if_driver_error(err)
453463

454464
@property
455465
def is_device_accessible(self) -> bool:
@@ -475,14 +485,16 @@ class _SynchronousMemoryResource(MemoryResource):
475485
self._dev_id = device_id
476486

477487
def allocate(self, size, stream=None) -> Buffer:
478-
ptr = handle_return(driver.cuMemAlloc(size))
488+
err, ptr = driver.cuMemAlloc(size)
489+
raise_if_driver_error(err)
479490
return Buffer._init(ptr, size, self)
480491

481492
def deallocate(self, ptr, size, stream=None):
482493
if stream is None:
483494
stream = default_stream()
484495
stream.sync()
485-
handle_return(driver.cuMemFree(ptr))
496+
err, = driver.cuMemFree(ptr)
497+
raise_if_driver_error(err)
486498

487499
@property
488500
def is_device_accessible(self) -> bool:

cuda_core/tests/test_memory.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -203,7 +203,7 @@ def test_buffer_copy_from():
203203
def buffer_close(dummy_mr: MemoryResource):
204204
buffer = dummy_mr.allocate(size=1024)
205205
buffer.close()
206-
assert buffer.handle == 0
206+
assert buffer.handle is None
207207
assert buffer.memory_resource is None
208208

209209

0 commit comments

Comments
 (0)