@@ -15,24 +15,33 @@ int main()
constexpr std::size_t CUDA_BLOCK_SIZE{????};
double* a{nullptr};
double* b{nullptr};
double* a_h{nullptr};
double* b_h{nullptr};

auto& rm = umpire::ResourceManager::getInstance();
// TODO: allocate with device unified memory

// TODO: create 2 allocators, one with device memory and one with host memory
auto allocator = rm.getAllocator("??");
auto host_allocator = rm.getAllocator("??");

a = static_cast<double*>(allocator.allocate(N*sizeof(double)));
b = static_cast<double*>(allocator.allocate(N*sizeof(double)));
a_h = static_cast<double*>(host_allocator.allocate(N*sizeof(double)));
b_h = static_cast<double*>(host_allocator.allocate(N*sizeof(double)));

//TODO: fill in the forall statement with the CUDA execution policy
//TODO: and its block size argument. Then be sure to use RAJA_DEVICE
RAJA::forall<????? <?????> >(
RAJA::TypedRangeSegment<int>(0, N), [=] ?????? (int i) {
//TODO: fill in the forall statement with the sequential exec policy.
//TODO: Alternatively, you could use the memset operator to do this instead...
RAJA::forall< ????? >(
RAJA::TypedRangeSegment<int>(0, N), [=] (int i) {
a[i] = 1.0;
b[i] = 1.0;
}
);

// TODO: copy data from a_h to a, and b_h to b (i.e. from the host to the device vars)

double dot{0.0};

//TODO: create a RAJA::ReduceSum with cuda_reduce called "cudot" for the GPU

//TODO: fill in the forall statement with the CUDA execution policy
@@ -48,6 +57,9 @@ int main()

allocator.deallocate(a);
allocator.deallocate(b);
host_allocator.deallocate(a_h);
host_allocator.deallocate(b_h);

#endif
return 0;
}
@@ -0,0 +1,6 @@
if (ENABLE_CUDA)
blt_add_executable(
NAME 06_07_raja_umpire_host_device
SOURCES 06_07_raja_umpire_host_device.cpp
DEPENDS_ON RAJA umpire cuda)
endif()
89 changes: 89 additions & 0 deletions Intro_Tutorial/lessons/06_07_raja_umpire_host_device/README.md
@@ -0,0 +1,89 @@
# Lessons 6 and 7

## Part 1: Lesson 6

For lesson 6, you will learn about Umpire's different memory resources, in
particular those used to allocate memory on a GPU.

Each computer system has a number of distinct places in which it allows you to
allocate memory. In Umpire's world, these are memory resources. A memory
resource can correspond to a hardware resource, but it can also be used to
identify memory with a particular characteristic, like `pinned` memory on a
GPU system.

Umpire creates predefined allocators for each of the available resources, and
they can be accessed using the `ResourceManager::getAllocator` method.

The predefined names can include:

- "HOST": CPU memory, like `malloc`.
- "DEVICE": device memory, and a "::<N>" suffix can be added to request memory on a specific device.
- "UM": unified memory that can be accessed by both the CPU and GPU.
- "PINNED": CPU memory that is pinned and will be accessible by the GPU.

Other memory resources include:

- "DEVICE_CONST": constant, read-only GPU memory
- "FILE": mmapped file memory that is accessible by the CPU.
- "SHARED": Includes POSIX shared memory which can be accessible by the CPU or GPU depending
on what your system accommodates and the MPI3 shared memory that is accessible on the CPU.
- "UNKNOWN": If an incorrect name is used or if the allocator was not set up correctly.

## Part 2: Lesson 7

For lesson 7, you will learn how to use Umpire's operations to copy data
between CPU and GPU memory in a portable way, using the memory resources you learned
about in lesson 6.

In `06_07_raja_umpire_host_device.cpp`, we create an allocator for the GPU with:
```
auto allocator = rm.getAllocator("DEVICE");
```

and a separate allocator on the CPU with:

```
auto host_allocator = rm.getAllocator("HOST");
```

We will initialize the data on the CPU, but we want to do the computation on
the GPU. Therefore, we have to take advantage of some of Umpire's operations.
In lesson 3, we learned how to use Umpire's `memset` operation; this lesson
builds on that to show other available operations.

Umpire provides a number of operations implemented as methods on the
`ResourceManager`. These typically take pointer and size arguments, but you do
not need to tell Umpire which Allocator each pointer came from. Umpire keeps
track of this and will call the appropriate underlying vendor function.
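
For example, the `memset` operation from lesson 3 is invoked through the
`ResourceManager` like this (a sketch; `a_h` is a pointer returned by one of
the allocators above, holding `N` doubles):

```
// Zero out N doubles; Umpire dispatches to the memset implementation that
// matches the resource (host, device, ...) owning a_h.
rm.memset(a_h, 0, N * sizeof(double));
```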

The copy method has the following signature:

```
void umpire::ResourceManager::copy(void* dst_ptr, void* src_ptr, std::size_t size = 0)
```

*Note:* The destination is the first argument.
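
For this lesson, copying the freshly initialized host arrays into the device
arrays might look like the following sketch (assuming `a` and `b` were
allocated with the "DEVICE" allocator and `a_h` and `b_h` with the "HOST"
allocator, each holding `N` doubles):

```
// Destination first, then source. Umpire knows which allocator owns each
// pointer and calls the appropriate vendor copy routine underneath.
rm.copy(a, a_h, N * sizeof(double));
rm.copy(b, b_h, N * sizeof(double));
```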

In the file `06_07_raja_umpire_host_device.cpp`, there is a `TODO` comment where
you should insert two copy calls to move data from the HOST memory to the
DEVICE memory.

You will also find that we are adjusting the `RAJA::forall` that computes the
dot product so that it runs on the GPU. In order for this to happen, we need a
few extra things. First, we create a `CUDA_BLOCK_SIZE` constant to tell RAJA
how big we want our CUDA thread blocks to be. Since a warp contains 32 threads,
multiples of 32 work well; 256 tends to be a good block size, but other values
such as 128 or 512 also work, depending on your GPU.

Additionally, the `RAJA::forall` needs the CUDA execution policy. More on GPU
execution policies can be found here: https://raja.readthedocs.io/en/develop/sphinx/user_guide/feature/policies.html#gpu-policies-for-cuda-and-hip

The `cuda_exec` policy takes the CUDA block size we created before as a
template parameter. Finally, as we fill in the lambda portion of the
`RAJA::forall`, we need to mark the lambda so that it can execute on the GPU.
This can be done directly with CUDA's `__device__` qualifier or, portably,
with the `RAJA_DEVICE` macro.
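
Putting these pieces together, the GPU portion of the dot product has roughly
the following shape (a sketch that mirrors the lesson's solution; `a`, `b`,
`N`, and `CUDA_BLOCK_SIZE` are the variables defined earlier in the file):

```
// Sum-reduction that can safely accumulate from many GPU threads.
RAJA::ReduceSum<RAJA::cuda_reduce, double> cudot(0.0);

RAJA::forall<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(
  RAJA::TypedRangeSegment<int>(0, N), [=] RAJA_DEVICE (int i) {
    cudot += a[i] * b[i];
  });

dot = cudot.get();  // retrieve the reduced value on the host
```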

When you are done editing the file, compile and run it:

```
$ make 06_07_raja_umpire_host_device
$ ./bin/06_07_raja_umpire_host_device
```
@@ -6,13 +6,15 @@
int main()
{
constexpr int N{10000};
//TODO: Set up a block size value
constexpr std::size_t CUDA_BLOCK_SIZE{256};
double* a{nullptr};
double* b{nullptr};
double* a_h{nullptr};
double* b_h{nullptr};

auto& rm = umpire::ResourceManager::getInstance();
// TODO: create 2 allocators, one with device memory and one with host memory
auto allocator = rm.getAllocator("DEVICE");
auto host_allocator = rm.getAllocator("HOST");

@@ -21,6 +23,8 @@ int main()
a_h = static_cast<double*>(host_allocator.allocate(N*sizeof(double)));
b_h = static_cast<double*>(host_allocator.allocate(N*sizeof(double)));

//TODO: fill in the forall statement with the sequential exec policy.
//TODO: Alternatively, you could use the memset operator to do this instead...
RAJA::forall< RAJA::seq_exec >(
RAJA::TypedRangeSegment<int>(0, N), [=] (int i) {
a_h[i] = 1.0;
@@ -33,8 +37,11 @@ int main()
rm.copy(b, b_h, N*sizeof(double));

double dot{0.0};
//TODO: create a RAJA::ReduceSum with cuda_reduce called "cudot" for the GPU
RAJA::ReduceSum<RAJA::cuda_reduce, double> cudot(0.0);

//TODO: fill in the forall statement with the CUDA execution policy
//TODO: and its block size argument. Then be sure to use RAJA_DEVICE
RAJA::forall<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(RAJA::TypedRangeSegment<int>(0, N),
[=] RAJA_DEVICE (int i) {
cudot += a[i] * b[i];
7 changes: 0 additions & 7 deletions Intro_Tutorial/lessons/06_raja_umpire_uvm/CMakeLists.txt

This file was deleted.

54 changes: 0 additions & 54 deletions Intro_Tutorial/lessons/06_raja_umpire_uvm/README.md

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

41 changes: 0 additions & 41 deletions Intro_Tutorial/lessons/07_raja_umpire_host_device/README.md

This file was deleted.

3 changes: 1 addition & 2 deletions Intro_Tutorial/lessons/CMakeLists.txt
@@ -3,8 +3,7 @@ add_subdirectory(02_raja_umpire)
add_subdirectory(03_umpire_allocator)
add_subdirectory(04_raja_forall)
add_subdirectory(05_raja_reduce)
add_subdirectory(06_raja_umpire_uvm)
add_subdirectory(07_raja_umpire_host_device)
add_subdirectory(06_07_raja_umpire_host_device)
add_subdirectory(08_raja_umpire_quick_pool)
add_subdirectory(09_raja_view)
add_subdirectory(10_raja_kernel)