Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -482,7 +482,7 @@ rmm::mr::polymorphic_allocator<int> stream_alloc;
// Constructs an adaptor that forwards all (de)allocations to `stream_alloc` on `stream`.
auto adapted = rmm::mr::stream_allocator_adaptor(stream_alloc, stream);

// Allocates 100 bytes using `stream_alloc` on `stream`
// Allocates storage for 100 ints using `stream_alloc` on `stream`
auto p = adapted.allocate(100);
...
// Deallocates using `stream_alloc` on `stream`
Expand Down
7 changes: 7 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ option(BUILD_BENCHMARKS "Configure CMake to build (google) benchmarks" OFF)
# This is mostly so that dependent libraries are configured in shared mode for downstream dependents
# of RMM that get their common dependencies transitively.
option(BUILD_SHARED_LIBS "Build RMM shared libraries" ON)
option(RMM_ENABLE_LEGACY_MR_INTERFACE "Enable legacy memory resource interface" ON)
set(RMM_LOGGING_LEVEL
"INFO"
CACHE STRING "Choose the logging level.")
Expand All @@ -54,6 +55,7 @@ message(VERBOSE "RMM: Build with NVTX support: ${RMM_NVTX}")
# Set logging level. Must go before including gtests and benchmarks. Set the possible values of
# build type for cmake-gui.
message(STATUS "RMM: RMM_LOGGING_LEVEL = '${RMM_LOGGING_LEVEL}'")
message(STATUS "RMM: Legacy MR interface enabled: ${RMM_ENABLE_LEGACY_MR_INTERFACE}")

# cudart can be linked statically or dynamically
option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF)
Expand Down Expand Up @@ -132,6 +134,11 @@ if(RMM_NVTX)
target_compile_definitions(rmm PUBLIC RMM_NVTX)
endif()

# Control legacy MR interface visibility
if(RMM_ENABLE_LEGACY_MR_INTERFACE)
target_compile_definitions(rmm PUBLIC RMM_ENABLE_LEGACY_MR_INTERFACE)
endif()

# ##################################################################################################
# * tests and benchmarks ---------------------------------------------------------------------------

Expand Down
4 changes: 2 additions & 2 deletions cpp/benchmarks/async_priming/async_priming_bench.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ void BM_AsyncPrimingImpact(benchmark::State& state, MRFactoryFunc factory)

// Deallocate all
for (auto* ptr : allocations) {
mr->deallocate(ptr, allocation_size);
mr->deallocate_sync(ptr, allocation_size);
}
allocations.clear();

Expand Down Expand Up @@ -118,7 +118,7 @@ void BM_AsyncPrimingImpact(benchmark::State& state, MRFactoryFunc factory)

// Clean up for next iteration
for (auto* ptr : allocations) {
mr->deallocate(ptr, allocation_size);
mr->deallocate_sync(ptr, allocation_size);
}
allocations.clear();
}
Expand Down
6 changes: 3 additions & 3 deletions cpp/benchmarks/random_allocations/random_allocations.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -94,7 +94,7 @@ void random_allocation_free(rmm::mr::device_memory_resource& mr,
void* ptr = nullptr;
if (do_alloc) { // try to allocate
try {
ptr = mr.allocate(size, stream);
ptr = mr.allocate(stream, size);
} catch (rmm::bad_alloc const&) {
do_alloc = false;
#if VERBOSE
Expand All @@ -118,7 +118,7 @@ void random_allocation_free(rmm::mr::device_memory_resource& mr,
std::size_t index = index_distribution(generator) % active_allocations;
active_allocations--;
allocation to_free = remove_at(allocations, index);
mr.deallocate(to_free.ptr, to_free.size, stream);
mr.deallocate(stream, to_free.ptr, to_free.size);
allocation_size -= to_free.size;

#if VERBOSE
Expand Down
1 change: 0 additions & 1 deletion cpp/include/rmm/aligned.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@

#include <cassert>
#include <cstddef>
#include <cstdint>

namespace RMM_EXPORT rmm {

Expand Down
68 changes: 4 additions & 64 deletions cpp/include/rmm/detail/cccl_adaptors.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ class cccl_resource_ref : public ResourceType {

cccl_resource_ref(base&& other) : base(std::move(other)) {}

#ifdef RMM_ENABLE_LEGACY_MR_INTERFACE
void* allocate(std::size_t bytes) { return this->allocate_sync(bytes); }

void* allocate(std::size_t bytes, std::size_t alignment)
Expand All @@ -50,8 +51,8 @@ class cccl_resource_ref : public ResourceType {
{
return this->deallocate_sync(ptr, bytes, alignment);
}
#endif // RMM_ENABLE_LEGACY_MR_INTERFACE

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK so the idea is that the cccl_resource_ref models our old setup where allocate -> synchronous and allocate_async -> stream-ordered.

Whereas cccl_async_resource_ref is allocate_sync -> synchronous and allocate -> stream-ordered.

Since currently we assume allocate is synchronous we need to adapt everyone to that first. And the way to do that is to migrate everyone using allocate to use allocate_sync. Then we can move them onto the cccl_async_resource_ref concept, and then we can move sync allocations that could be async back to allocate?

Copy link
Contributor Author

@bdice bdice Oct 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Generally that's the right direction.

The migration here is actually a little easier than stated above, because the signature for allocate changed too (in addition to changing sync-to-async). Disabling the "legacy" interface will cause a compile error anywhere the old interface was being used, allowing us to migrate to the new API names and new parameter order in each RAPIDS repository. I am starting that migration now. :)

Once that migration is complete, I will deprecate the "legacy" interface (at which point RAPIDS should not be using the legacy interface at all), then remove it in the subsequent release.

#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
void* allocate_sync(std::size_t bytes) { return base::allocate_sync(bytes); }

void* allocate_sync(std::size_t bytes, std::size_t alignment)
Expand All @@ -68,24 +69,6 @@ class cccl_resource_ref : public ResourceType {
{
return base::deallocate_sync(ptr, bytes, alignment);
}
#else
void* allocate_sync(std::size_t bytes) { return base::allocate(bytes); }

void* allocate_sync(std::size_t bytes, std::size_t alignment)
{
return base::allocate(bytes, alignment);
}

void deallocate_sync(void* ptr, std::size_t bytes) noexcept
{
return base::deallocate(ptr, bytes);
}

void deallocate_sync(void* ptr, std::size_t bytes, std::size_t alignment) noexcept
{
return base::deallocate(ptr, bytes, alignment);
}
#endif
};

template <typename ResourceType>
Expand All @@ -98,8 +81,7 @@ class cccl_async_resource_ref : public ResourceType {
cccl_async_resource_ref(base const& other) : base(other) {}
cccl_async_resource_ref(base&& other) : base(std::move(other)) {}

// BEGINNING OF LEGACY MR METHODS

#ifdef RMM_ENABLE_LEGACY_MR_INTERFACE
void* allocate(std::size_t bytes) { return this->allocate_sync(bytes); }

void* allocate(std::size_t bytes, std::size_t alignment)
Expand Down Expand Up @@ -140,9 +122,8 @@ class cccl_async_resource_ref : public ResourceType {
return this->deallocate(stream, ptr, bytes, alignment);
}

// END OF LEGACY MR METHODS
#endif // RMM_ENABLE_LEGACY_MR_INTERFACE

#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
void* allocate_sync(std::size_t bytes) { return base::allocate_sync(bytes); }

void* allocate_sync(std::size_t bytes, std::size_t alignment)
Expand Down Expand Up @@ -182,47 +163,6 @@ class cccl_async_resource_ref : public ResourceType {
{
return base::deallocate(stream, ptr, bytes, alignment);
}
#else
void* allocate_sync(std::size_t bytes) { return base::allocate(bytes); }

void* allocate_sync(std::size_t bytes, std::size_t alignment)
{
return base::allocate(bytes, alignment);
}

void deallocate_sync(void* ptr, std::size_t bytes) noexcept
{
return base::deallocate(ptr, bytes);
}

void deallocate_sync(void* ptr, std::size_t bytes, std::size_t alignment) noexcept
{
return base::deallocate(ptr, bytes, alignment);
}

void* allocate(cuda_stream_view stream, std::size_t bytes)
{
return base::allocate_async(bytes, stream);
}

void* allocate(cuda_stream_view stream, std::size_t bytes, std::size_t alignment)
{
return base::allocate_async(bytes, alignment, stream);
}

void deallocate(cuda_stream_view stream, void* ptr, std::size_t bytes) noexcept
{
return base::deallocate_async(ptr, bytes, stream);
}

void deallocate(cuda_stream_view stream,
void* ptr,
std::size_t bytes,
std::size_t alignment) noexcept
{
return base::deallocate_async(ptr, bytes, alignment, stream);
}
#endif
};

} // namespace detail
Expand Down
13 changes: 1 addition & 12 deletions cpp/include/rmm/detail/cuda_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#ifndef LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE
#error \
"RMM requires LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE to be defined. Please add -DLIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE to the compiler flags (this is done automatically when using RMM via CMake)."
#endif
#endif // LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE

#include <rmm/detail/export.hpp>

Expand All @@ -28,7 +28,6 @@ namespace RMM_NAMESPACE {
namespace detail {
namespace polyfill {

#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
template <class Resource>
inline constexpr bool resource = cuda::mr::synchronous_resource<Resource>;
template <class Resource, class... Properties>
Expand All @@ -37,16 +36,6 @@ template <class Resource>
inline constexpr bool async_resource = cuda::mr::resource<Resource>;
template <class Resource, class... Properties>
inline constexpr bool async_resource_with = cuda::mr::resource_with<Resource, Properties...>;
#else // ^^^ CCCL >= 3.1 ^^^ / vvv CCCL < 3.1 vvv
template <class Resource>
inline constexpr bool resource = cuda::mr::resource<Resource>;
template <class Resource, class... Properties>
inline constexpr bool resource_with = cuda::mr::resource_with<Resource, Properties...>;
template <class Resource>
inline constexpr bool async_resource = cuda::mr::async_resource<Resource>;
template <class Resource, class... Properties>
inline constexpr bool async_resource_with = cuda::mr::async_resource_with<Resource, Properties...>;
#endif // CCCL < 3.1

} // namespace polyfill
} // namespace detail
Expand Down
8 changes: 4 additions & 4 deletions cpp/include/rmm/mr/device/aligned_resource_adaptor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,10 +143,10 @@ class aligned_resource_adaptor final : public device_memory_resource {
void* do_allocate(std::size_t bytes, cuda_stream_view stream) override
{
if (alignment_ == rmm::CUDA_ALLOCATION_ALIGNMENT || bytes < alignment_threshold_) {
return get_upstream_resource().allocate_async(bytes, 1, stream);
return get_upstream_resource().allocate(stream, bytes, 1);
}
auto const size = upstream_allocation_size(bytes);
void* pointer = get_upstream_resource().allocate_async(size, 1, stream);
void* pointer = get_upstream_resource().allocate(stream, size, 1);
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
auto const address = reinterpret_cast<std::size_t>(pointer);
auto const aligned_address = rmm::align_up(address, alignment_);
Expand All @@ -169,7 +169,7 @@ class aligned_resource_adaptor final : public device_memory_resource {
void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) noexcept override
{
if (alignment_ == rmm::CUDA_ALLOCATION_ALIGNMENT || bytes < alignment_threshold_) {
get_upstream_resource().deallocate_async(ptr, bytes, 1, stream);
get_upstream_resource().deallocate(stream, ptr, bytes, 1);
} else {
{
lock_guard lock(mtx_);
Expand All @@ -179,7 +179,7 @@ class aligned_resource_adaptor final : public device_memory_resource {
pointers_.erase(iter);
}
}
get_upstream_resource().deallocate_async(ptr, upstream_allocation_size(bytes), 1, stream);
get_upstream_resource().deallocate(stream, ptr, upstream_allocation_size(bytes), 1);
}
}

Expand Down
22 changes: 11 additions & 11 deletions cpp/include/rmm/mr/device/arena_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,14 +156,14 @@ class arena_memory_resource final : public device_memory_resource {

{
std::shared_lock lock(mtx_);
void* pointer = arena.allocate(bytes);
void* pointer = arena.allocate_sync(bytes);
if (pointer != nullptr) { return pointer; }
}

{
std::unique_lock lock(mtx_);
defragment();
void* pointer = arena.allocate(bytes);
void* pointer = arena.allocate_sync(bytes);
if (pointer == nullptr) {
if (dump_log_on_failure_) { dump_memory_log(bytes); }
auto const msg = std::string("Maximum pool size exceeded (failed to allocate ") +
Expand Down Expand Up @@ -209,7 +209,7 @@ class arena_memory_resource final : public device_memory_resource {
{
std::shared_lock lock(mtx_);
// If the memory being freed does not belong to the arena, the following will return false.
if (arena.deallocate(ptr, bytes, stream)) { return; }
if (arena.deallocate(stream, ptr, bytes)) { return; }
}

{
Expand All @@ -218,31 +218,31 @@ class arena_memory_resource final : public device_memory_resource {
stream.synchronize_no_throw();

std::unique_lock lock(mtx_);
deallocate_from_other_arena(ptr, bytes, stream);
deallocate_from_other_arena(stream, ptr, bytes);
}
}

/**
* @brief Deallocate memory pointed to by `ptr` that was allocated in a different arena.
*
* @param stream Stream on which to perform deallocation.
* @param ptr Pointer to be deallocated.
* @param bytes The size in bytes of the allocation. This must be equal to the
* value of `bytes` that was passed to the `allocate` call that returned `ptr`.
* @param stream Stream on which to perform deallocation.
*/
void deallocate_from_other_arena(void* ptr, std::size_t bytes, cuda_stream_view stream)
void deallocate_from_other_arena(cuda_stream_view stream, void* ptr, std::size_t bytes)
{
if (use_per_thread_arena(stream)) {
for (auto const& thread_arena : thread_arenas_) {
if (thread_arena.second->deallocate(ptr, bytes)) { return; }
if (thread_arena.second->deallocate_sync(ptr, bytes)) { return; }
}
} else {
for (auto& stream_arena : stream_arenas_) {
if (stream_arena.second.deallocate(ptr, bytes)) { return; }
if (stream_arena.second.deallocate_sync(ptr, bytes)) { return; }
}
}

if (!global_arena_.deallocate(ptr, bytes)) {
if (!global_arena_.deallocate_sync(ptr, bytes)) {
// It's possible to use per thread default streams along with another pool of streams.
// This means that it's possible for an allocation to move from a thread or stream arena
// back into the global arena during a defragmentation and then move down into another arena
Expand All @@ -253,11 +253,11 @@ class arena_memory_resource final : public device_memory_resource {
// arenas all the time.
if (use_per_thread_arena(stream)) {
for (auto& stream_arena : stream_arenas_) {
if (stream_arena.second.deallocate(ptr, bytes)) { return; }
if (stream_arena.second.deallocate_sync(ptr, bytes)) { return; }
}
} else {
for (auto const& thread_arena : thread_arenas_) {
if (thread_arena.second->deallocate(ptr, bytes)) { return; }
if (thread_arena.second->deallocate_sync(ptr, bytes)) { return; }
}
}
RMM_FAIL("allocation not found");
Expand Down
4 changes: 2 additions & 2 deletions cpp/include/rmm/mr/device/binning_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ class binning_memory_resource final : public device_memory_resource {
void* do_allocate(std::size_t bytes, cuda_stream_view stream) override
{
if (bytes <= 0) { return nullptr; }
return get_resource_ref(bytes).allocate_async(bytes, stream);
return get_resource_ref(bytes).allocate(stream, bytes);
}

/**
Expand All @@ -209,7 +209,7 @@ class binning_memory_resource final : public device_memory_resource {
*/
void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) noexcept override
{
get_resource_ref(bytes).deallocate_async(ptr, bytes, stream);
get_resource_ref(bytes).deallocate(stream, ptr, bytes);
}

device_async_resource_ref
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ class cuda_async_managed_memory_resource final : public device_memory_resource {
*/
void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override
{
return pool_.allocate(bytes, stream);
return pool_.allocate(stream, bytes);
}

/**
Expand All @@ -112,7 +112,7 @@ class cuda_async_managed_memory_resource final : public device_memory_resource {
*/
void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) noexcept override
{
pool_.deallocate(ptr, bytes, stream);
pool_.deallocate(stream, ptr, bytes);
}

/**
Expand Down
4 changes: 2 additions & 2 deletions cpp/include/rmm/mr/device/cuda_async_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ class cuda_async_memory_resource final : public device_memory_resource {
void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override
{
void* ptr{nullptr};
ptr = pool_.allocate(bytes, stream);
ptr = pool_.allocate(stream, bytes);
return ptr;
}

Expand All @@ -194,7 +194,7 @@ class cuda_async_memory_resource final : public device_memory_resource {
*/
void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) noexcept override
{
pool_.deallocate(ptr, bytes, stream);
pool_.deallocate(stream, ptr, bytes);
}

/**
Expand Down
Loading
Loading