From 658f3641ba6ddc3248e5119a29f07414d488accf Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 10 Jul 2025 08:33:39 -0700 Subject: [PATCH 1/4] Add `cuda_host_memory_resource` --- .../mr/device/cuda_host_memory_resource.hpp | 101 ++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 cpp/include/rmm/mr/device/cuda_host_memory_resource.hpp diff --git a/cpp/include/rmm/mr/device/cuda_host_memory_resource.hpp b/cpp/include/rmm/mr/device/cuda_host_memory_resource.hpp new file mode 100644 index 000000000..924435f85 --- /dev/null +++ b/cpp/include/rmm/mr/device/cuda_host_memory_resource.hpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include + +namespace RMM_NAMESPACE { +namespace mr { +/** + * @addtogroup device_memory_resources + * @{ + * @file + */ +/** + * @brief `device_memory_resource` derived class that uses cudaMallocHost/cudaFreeHost for + * allocation/deallocation of pinned host memory. + */ +class cuda_host_memory_resource final : public device_memory_resource { + public: + cuda_host_memory_resource() = default; + ~cuda_host_memory_resource() override = default; + cuda_host_memory_resource(cuda_host_memory_resource const&) = + default; ///< @default_copy_constructor + cuda_host_memory_resource(cuda_host_memory_resource&&) = default; ///< @default_move_constructor + cuda_host_memory_resource& operator=(cuda_host_memory_resource const&) = + default; ///< @default_copy_assignment{cuda_host_memory_resource} + cuda_host_memory_resource& operator=(cuda_host_memory_resource&&) = + default; ///< @default_move_assignment{cuda_host_memory_resource} + + private: + /** + * @brief Allocates pinned host memory of size at least \p bytes. + * + * The returned pointer will have at minimum 256 byte alignment. + * + * The stream argument is ignored. + * + * @param bytes The size of the allocation + * @param stream This argument is ignored + * @return void* Pointer to the newly allocated memory + */ + void* do_allocate(std::size_t bytes, [[maybe_unused]] cuda_stream_view stream) override + { + void* ptr{nullptr}; + RMM_CUDA_TRY_ALLOC(cudaMallocHost(&ptr, bytes), bytes); + return ptr; + } + + /** + * @brief Deallocate pinned host memory pointed to by \p ptr. + * + * The stream argument is ignored. + * + * @param ptr Pointer to be deallocated + * @param bytes The size in bytes of the allocation. This must be equal to the + * value of `bytes` that was passed to the `allocate` call that returned `ptr`. + * @param stream This argument is ignored. + */ + void do_deallocate(void* ptr, + [[maybe_unused]] std::size_t bytes, + [[maybe_unused]] cuda_stream_view stream) override + { + RMM_ASSERT_CUDA_SUCCESS(cudaFreeHost(ptr)); + } + + /** + * @brief Compare this resource to another. + * + * Two cuda_host_memory_resources always compare equal, because they can each + * deallocate memory allocated by the other. + * + * @param other The other resource to compare to + * @return true If the two resources are equivalent + * @return false If the two resources are not equal + */ + [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override + { + return dynamic_cast(&other) != nullptr; + } +}; +/** @} */ // end of group +} // namespace mr +} // namespace RMM_NAMESPACE From 9f4ab6001638ffb04a559b02557141c13ec22d24 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 10 Jul 2025 08:50:11 -0700 Subject: [PATCH 2/4] Add tests --- cpp/tests/CMakeLists.txt | 3 + .../device/cuda_host_memory_resource_tests.cu | 291 ++++++++++++++++++ 2 files changed, 294 insertions(+) create mode 100644 cpp/tests/mr/device/cuda_host_memory_resource_tests.cu diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 89d76fd53..f9191754b 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -189,6 +189,9 @@ ConfigureTest(PINNED_POOL_MR_TEST mr/host/pinned_pool_mr_tests.cpp) # cuda stream tests ConfigureTest(CUDA_STREAM_TEST cuda_stream_tests.cpp cuda_stream_pool_tests.cpp) +# cuda host memory resource tests +ConfigureTest(CUDA_HOST_MR_TEST mr/device/cuda_host_memory_resource_tests.cu GPUS 1 PERCENT 100) + # device buffer tests ConfigureTest(DEVICE_BUFFER_TEST device_buffer_tests.cu) diff --git a/cpp/tests/mr/device/cuda_host_memory_resource_tests.cu b/cpp/tests/mr/device/cuda_host_memory_resource_tests.cu new file mode 100644 index 000000000..cbc71d721 --- /dev/null +++ b/cpp/tests/mr/device/cuda_host_memory_resource_tests.cu @@ -0,0 +1,291 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../../byte_literals.hpp" + +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include + +namespace rmm::test { +namespace { + +std::size_t constexpr size_kb{1_KiB}; +std::size_t constexpr size_mb{1_MiB}; + +__global__ void touch_memory_kernel(char* data, std::size_t size) +{ + auto const tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < size) { data[tid] = static_cast(tid); } +} + +void touch_on_gpu(void* ptr, std::size_t size) +{ + dim3 blockSize(256); + dim3 gridSize((size + blockSize.x - 1) / blockSize.x); + touch_memory_kernel<<>>(static_cast(ptr), size); + cudaDeviceSynchronize(); +} + +void touch_on_cpu(void* ptr, std::size_t size) +{ + auto* data = static_cast(ptr); + for (std::size_t i = 0; i < size; ++i) { + data[i] = static_cast(i); + } +} + +class CudaHostMemoryResourceTest : public ::testing::Test { + protected: + void SetUp() override + { + // Ensure CUDA is initialized + cudaFree(nullptr); + } +}; + +TEST_F(CudaHostMemoryResourceTest, BasicAllocation) +{ + rmm::mr::cuda_host_memory_resource mr; + + // Test basic allocation + void* ptr = mr.allocate(size_kb); + EXPECT_NE(nullptr, ptr); + + // Verify it's pinned host memory + cudaPointerAttributes attributes{}; + EXPECT_EQ(cudaSuccess, cudaPointerGetAttributes(&attributes, ptr)); + EXPECT_NE(nullptr, attributes.hostPointer); + + mr.deallocate(ptr, size_kb); +} + +TEST_F(CudaHostMemoryResourceTest, ZeroSizeAllocation) +{ + rmm::mr::cuda_host_memory_resource mr; + + // Test zero size allocation + void* ptr = mr.allocate(0); + EXPECT_EQ(nullptr, ptr); + + mr.deallocate(ptr, 0); +} + +TEST_F(CudaHostMemoryResourceTest, LargeAllocation) +{ + rmm::mr::cuda_host_memory_resource mr; + + // Test large allocation + void* ptr = mr.allocate(size_mb); + EXPECT_NE(nullptr, ptr); + + // Verify it's pinned host memory + cudaPointerAttributes attributes{}; + EXPECT_EQ(cudaSuccess, cudaPointerGetAttributes(&attributes, ptr)); + EXPECT_NE(nullptr, attributes.hostPointer); + + mr.deallocate(ptr, size_mb); +} + +TEST_F(CudaHostMemoryResourceTest, MultipleAllocations) +{ + rmm::mr::cuda_host_memory_resource mr; + + std::vector ptrs; + std::vector sizes = {size_kb, size_kb * 2, size_kb * 4, size_kb * 8}; + + // Allocate multiple blocks + for (auto size : sizes) { + void* ptr = mr.allocate(size); + EXPECT_NE(nullptr, ptr); + ptrs.push_back(ptr); + } + + // Verify all are pinned host memory + for (auto ptr : ptrs) { + cudaPointerAttributes attributes{}; + EXPECT_EQ(cudaSuccess, cudaPointerGetAttributes(&attributes, ptr)); + EXPECT_NE(nullptr, attributes.hostPointer); + } + + // Deallocate all + for (std::size_t i = 0; i < ptrs.size(); ++i) { + mr.deallocate(ptrs[i], sizes[i]); + } +} + +TEST_F(CudaHostMemoryResourceTest, AsyncAllocation) +{ + rmm::mr::cuda_host_memory_resource mr; + cudaStream_t stream; + cudaStreamCreate(&stream); + + // Test async allocation + void* ptr = mr.allocate_async(size_kb, stream); + EXPECT_NE(nullptr, ptr); + + // Verify it's pinned host memory + cudaPointerAttributes attributes{}; + EXPECT_EQ(cudaSuccess, cudaPointerGetAttributes(&attributes, ptr)); + EXPECT_NE(nullptr, attributes.hostPointer); + + mr.deallocate_async(ptr, size_kb, stream); + + cudaStreamDestroy(stream); +} + +TEST_F(CudaHostMemoryResourceTest, CpuAccess) +{ + rmm::mr::cuda_host_memory_resource mr; + + void* ptr = mr.allocate(size_kb); + EXPECT_NE(nullptr, ptr); + + // Test CPU access + touch_on_cpu(ptr, size_kb); + + // Verify the data was written + auto* data = static_cast(ptr); + for (std::size_t i = 0; i < size_kb; ++i) { + EXPECT_EQ(static_cast(i), data[i]); + } + + mr.deallocate(ptr, size_kb); +} + +TEST_F(CudaHostMemoryResourceTest, GpuAccess) +{ + rmm::mr::cuda_host_memory_resource mr; + + void* ptr = mr.allocate(size_kb); + EXPECT_NE(nullptr, ptr); + + // Test GPU access + touch_on_gpu(ptr, size_kb); + + // Verify the data was written by GPU + auto* data = static_cast(ptr); + for (std::size_t i = 0; i < size_kb; ++i) { + EXPECT_EQ(static_cast(i), data[i]); + } + + mr.deallocate(ptr, size_kb); +} + +TEST_F(CudaHostMemoryResourceTest, CpuGpuRoundTrip) +{ + rmm::mr::cuda_host_memory_resource mr; + + void* ptr = mr.allocate(size_kb); + EXPECT_NE(nullptr, ptr); + + // Write from CPU + touch_on_cpu(ptr, size_kb); + + // Verify the data was written by CPU + auto* data = static_cast(ptr); + for (std::size_t i = 0; i < size_kb; ++i) { + EXPECT_EQ(static_cast(i), data[i]); + } + + // Read/write from GPU + touch_on_gpu(ptr, size_kb); + + // Verify final state + data = static_cast(ptr); + for (std::size_t i = 0; i < size_kb; ++i) { + EXPECT_EQ(static_cast(i), data[i]); + } + + mr.deallocate(ptr, size_kb); +} + +TEST_F(CudaHostMemoryResourceTest, Equality) +{ + rmm::mr::cuda_host_memory_resource mr1; + rmm::mr::cuda_host_memory_resource mr2; + + // Two instances should be equal + EXPECT_TRUE(mr1.is_equal(mr2)); + EXPECT_TRUE(mr2.is_equal(mr1)); + + // Self equality + EXPECT_TRUE(mr1.is_equal(mr1)); +} + +TEST_F(CudaHostMemoryResourceTest, InequalityWithOtherTypes) +{ + rmm::mr::cuda_host_memory_resource host_mr; + rmm::mr::cuda_memory_resource device_mr; + + // Should not be equal to device memory resource + EXPECT_FALSE(host_mr.is_equal(device_mr)); + EXPECT_FALSE(device_mr.is_equal(host_mr)); +} + +TEST_F(CudaHostMemoryResourceTest, MemoryAlignment) +{ + rmm::mr::cuda_host_memory_resource mr; + + // Test various allocation sizes to check alignment + std::vector sizes = {1, 8, 16, 32, 64, 128, 256, 512, 1024}; + + for (auto size : sizes) { + void* ptr = mr.allocate(size); + EXPECT_NE(nullptr, ptr); + + // Check that pointer is properly aligned + EXPECT_EQ(0, reinterpret_cast(ptr) % 256); + + mr.deallocate(ptr, size); + } +} + +TEST_F(CudaHostMemoryResourceTest, StressTest) +{ + rmm::mr::cuda_host_memory_resource mr; + + constexpr std::size_t num_iterations = 1000; + constexpr std::size_t max_size = size_kb; + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution size_dist(1, max_size); + + for (std::size_t iteration = 0; iteration < num_iterations; ++iteration) { + std::size_t size = size_dist(gen); + void* ptr = mr.allocate(size); + EXPECT_NE(nullptr, ptr); + + // Touch the memory + std::memset(ptr, static_cast(iteration & 0xFF), size); + + mr.deallocate(ptr, size); + } +} + +} // namespace +} // namespace rmm::test From 6c109f4e7ca94a5498dfe296628fccaa6b746b58 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 10 Jul 2025 09:04:25 -0700 Subject: [PATCH 3/4] Add Python bindings --- python/rmm/rmm/librmm/memory_resource.pxd | 5 +++++ python/rmm/rmm/mr.py | 2 ++ python/rmm/rmm/pylibrmm/memory_resource.pyx | 15 +++++++++++++++ 3 files changed, 22 insertions(+) diff --git a/python/rmm/rmm/librmm/memory_resource.pxd b/python/rmm/rmm/librmm/memory_resource.pxd index 0194e604e..51f493aa0 100644 --- a/python/rmm/rmm/librmm/memory_resource.pxd +++ b/python/rmm/rmm/librmm/memory_resource.pxd @@ -88,6 +88,11 @@ cdef extern from "rmm/mr/device/cuda_memory_resource.hpp" \ cdef cppclass cuda_memory_resource(device_memory_resource): cuda_memory_resource() except + +cdef extern from "rmm/mr/device/cuda_host_memory_resource.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass cuda_host_memory_resource(device_memory_resource): + cuda_host_memory_resource() except + + cdef extern from "rmm/mr/device/managed_memory_resource.hpp" \ namespace "rmm::mr" nogil: cdef cppclass managed_memory_resource(device_memory_resource): diff --git a/python/rmm/rmm/mr.py b/python/rmm/rmm/mr.py index eada9b21b..756b6a885 100644 --- a/python/rmm/rmm/mr.py +++ b/python/rmm/rmm/mr.py @@ -17,6 +17,7 @@ CallbackMemoryResource, CudaAsyncMemoryResource, CudaAsyncViewMemoryResource, + CudaHostMemoryResource, CudaMemoryResource, DeviceMemoryResource, FailureCallbackResourceAdaptor, @@ -52,6 +53,7 @@ "CallbackMemoryResource", "CudaAsyncMemoryResource", "CudaAsyncViewMemoryResource", + "CudaHostMemoryResource", "CudaMemoryResource", "DeviceMemoryResource", "FailureCallbackResourceAdaptor", diff --git a/python/rmm/rmm/pylibrmm/memory_resource.pyx b/python/rmm/rmm/pylibrmm/memory_resource.pyx index 7e02bfa62..eb03f7f17 100644 --- a/python/rmm/rmm/pylibrmm/memory_resource.pyx +++ b/python/rmm/rmm/pylibrmm/memory_resource.pyx @@ -56,6 +56,7 @@ from rmm.librmm.memory_resource cimport ( callback_memory_resource, cuda_async_memory_resource, cuda_async_view_memory_resource, + cuda_host_memory_resource, cuda_memory_resource, deallocate_callback_t, device_memory_resource, @@ -149,6 +150,20 @@ cdef class CudaMemoryResource(DeviceMemoryResource): pass +cdef class CudaHostMemoryResource(DeviceMemoryResource): + def __cinit__(self): + self.c_obj.reset( + new cuda_host_memory_resource() + ) + + def __init__(self): + """ + Memory resource that uses ``cudaMallocHost``/``cudaFreeHost`` for + allocation/deallocation of pinned host memory. + """ + pass + + cdef class CudaAsyncMemoryResource(DeviceMemoryResource): """ Memory resource that uses ``cudaMallocAsync``/``cudaFreeAsync`` for From 273bab2a74d982b0c6132a51a284f6496b15771e Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 10 Jul 2025 09:05:34 -0700 Subject: [PATCH 4/4] Add Python tests --- python/rmm/rmm/tests/test_rmm.py | 66 ++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/python/rmm/rmm/tests/test_rmm.py b/python/rmm/rmm/tests/test_rmm.py index 287858487..824df4a73 100644 --- a/python/rmm/rmm/tests/test_rmm.py +++ b/python/rmm/rmm/tests/test_rmm.py @@ -741,6 +741,72 @@ def test_cuda_async_memory_resource_threshold(nelem, alloc): array_tester("u1", 2 * nelem, alloc) # should trigger release +def test_cuda_host_memory_resource(): + """Test the CudaHostMemoryResource functionality.""" + import ctypes + + import numpy as np + + from rmm.mr import CudaHostMemoryResource + + mr = CudaHostMemoryResource() + + # Test basic allocation + ptr = mr.allocate(1024) + assert ptr != 0 + + # Test that we can write to the memory from CPU + # Create a ctypes array view of the allocated memory + arr_ptr = ctypes.cast(ptr, ctypes.POINTER(ctypes.c_uint8)) + arr = np.ctypeslib.as_array(arr_ptr, shape=(1024,)) + arr[:] = np.arange(1024, dtype=np.uint8) + + # Test that we can read back the data + assert np.array_equal(arr[:10], np.arange(10, dtype=np.uint8)) + + # Test deallocation + mr.deallocate(ptr, 1024) + + # Test zero size allocation + ptr = mr.allocate(0) + assert ptr == 0 + mr.deallocate(ptr, 0) + + +def test_cuda_host_memory_resource_with_stream(): + """Test the CudaHostMemoryResource with CUDA streams.""" + import ctypes + + import numpy as np + + from rmm.mr import CudaHostMemoryResource + from rmm.pylibrmm.stream import Stream + + mr = CudaHostMemoryResource() + stream = Stream() + + # Test async allocation + ptr = mr.allocate(1024, stream) + assert ptr != 0 + + # Test that we can write to the memory from CPU + # Create a ctypes array view of the allocated memory + arr_ptr = ctypes.cast(ptr, ctypes.POINTER(ctypes.c_uint8)) + arr = np.ctypeslib.as_array(arr_ptr, shape=(1024,)) + arr[:] = np.arange(1024, dtype=np.uint8) + + # Test that we can read back the data + assert np.array_equal(arr[:10], np.arange(10, dtype=np.uint8)) + + # Test async deallocation + mr.deallocate(ptr, 1024, stream) + + # Test zero size async allocation + ptr = mr.allocate(0, stream) + assert ptr == 0 + mr.deallocate(ptr, 0, stream) + + @pytest.mark.parametrize( "mr", [