Skip to content
Merged
17 changes: 17 additions & 0 deletions cuda_core/cuda/core/experimental/_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,23 @@ def __init__(self, device_id: int):
self._handle = handle_return(driver.cuDeviceGetMemPool(device_id))
self._dev_id = device_id

# Raise the release threshold so the pool retains freed memory for reuse by later allocations.
# By default, the release threshold is 0, which means the pool tries to release all unused
# memory back to the OS at the next synchronization point, making subsequent allocations slow.
# Check current release threshold
current_threshold = handle_return(
driver.cuMemPoolGetAttribute(self._handle, driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD)
)
# If threshold is 0 (default), set it to maximum to retain memory in the pool
if int(current_threshold) == 0:
handle_return(
driver.cuMemPoolSetAttribute(
self._handle,
driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
driver.cuuint64_t(0xFFFFFFFFFFFFFFFF),
)
)

def allocate(self, size: int, stream: Stream = None) -> Buffer:
"""Allocate a buffer of the requested size.

Expand Down
1 change: 1 addition & 0 deletions cuda_core/docs/source/release/0.X.Y-notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ None.
Fixes and enhancements
----------------------

- Improve :class:`DeviceMemoryResource` allocation performance when there are no active allocations (addresses issue #771).
- Fix :class:`LaunchConfig` grid unit conversion when cluster is set (addresses issue #867).
27 changes: 26 additions & 1 deletion cuda_core/tests/test_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

import pytest

from cuda.core.experimental import Buffer, Device, MemoryResource
from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, MemoryResource
from cuda.core.experimental._memory import DLDeviceType
from cuda.core.experimental._utils.cuda_utils import handle_return

Expand Down Expand Up @@ -257,3 +257,28 @@ def test_buffer_dunder_dlpack_device_failure():
buffer = dummy_mr.allocate(size=1024)
with pytest.raises(BufferError, match=r"^buffer is neither device-accessible nor host-accessible$"):
buffer.__dlpack_device__()


@pytest.mark.skipif(not Device().properties.memory_pools_supported, reason="memory pools not supported")
def test_device_memory_resource_initialization():
    """Test that DeviceMemoryResource can be constructed and used.

    Constructing the resource runs its ``__init__``, so any memory pool
    configuration performed there must succeed for this test to pass. The
    test then checks the reported properties and a small allocation
    round-trip. It does not inspect the pool's release threshold itself.
    """
    device = Device()
    device.set_current()

    # Construction must succeed; pool setup done by __init__ happens here.
    mr = DeviceMemoryResource(device.device_id)

    # Verify basic properties
    assert mr.device_id == device.device_id
    assert mr.is_device_accessible is True
    assert mr.is_host_accessible is False

    # Test allocation/deallocation works
    buffer = mr.allocate(1024)
    try:
        assert buffer.size == 1024
        assert buffer.device_id == device.device_id
    finally:
        # Always give the allocation back, even if an assertion above fails.
        buffer.close()