Merged
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -6,6 +6,7 @@ set (ENABLE_TESTS Off CACHE BOOL "")
set (ENABLE_EXAMPLES Off CACHE BOOL "")
set (ENABLE_REPRODUCERS Off CACHE BOOL "")
set (ENABLE_EXERCISES Off CACHE BOOL "")
set (RAJA_ENABLE_EXERCISES Off CACHE BOOL "")
set (ENABLE_DOCUMENTATION Off CACHE BOOL "")
set (ENABLE_BENCHMARKS Off CACHE BOOL "")

39 changes: 36 additions & 3 deletions Intro_Tutorial/lessons/04_raja_forall/04_raja_forall.cpp
@@ -1,26 +1,59 @@
#include <iostream>

#include "RAJA/RAJA.hpp"
#include "RAJA/util/Timer.hpp"

#include "umpire/Umpire.hpp"

int main()
{
double* data{nullptr};
double* data1{nullptr};

auto timer = RAJA::Timer();

constexpr int N = 100;
constexpr int N = 5000000;

auto& rm = umpire::ResourceManager::getInstance();
auto allocator = rm.getAllocator("HOST");

data = static_cast<double*>(allocator.allocate(N*sizeof(double)));
data1 = static_cast<double*>(allocator.allocate(N*sizeof(double)));

std::cout << "Address of data: " << data << std::endl;
std::cout << "Address of data1: " << data1 << std::endl;

// TODO: write a RAJA forall loop to set each element of the array 'data' to
// the value of the loop index
// Sequential kernel that sets each element of array 'data' to its index
timer.start();
RAJA::forall<RAJA::seq_exec>(RAJA::TypedRangeSegment<int>(0, N), [=] (int i) {
data[i] = i;
});
timer.stop();

RAJA::Timer::ElapsedType elapsed = timer.elapsed();

std::cout << "\nSequential loop exec time = " << elapsed << std::endl;
std::cout << "data[50] = " << data[50] << std::endl;
std::cout << "data[100] = " << data[100] << std::endl;
std::cout << "data[1000] = " << data[1000] << std::endl;
std::cout << "data[5000] = " << data[5000] << std::endl;

timer.reset();

timer.start();
// TODO: write a parallel RAJA forall loop using OpenMP to set each element of the
// array 'data1' to its index
timer.stop();

elapsed = timer.elapsed();

std::cout << "\nOpenMP loop exec time = " << elapsed << std::endl;
std::cout << "data1[50] = " << data1[50] << std::endl;
std::cout << "data1[100] = " << data1[100] << std::endl;
std::cout << "data1[1000] = " << data1[1000] << std::endl;
std::cout << "data1[5000] = " << data1[5000] << std::endl;

allocator.deallocate(data);
allocator.deallocate(data1);
return 0;
}
86 changes: 69 additions & 17 deletions Intro_Tutorial/lessons/04_raja_forall/README.md
@@ -1,24 +1,62 @@
# Lesson Four

In this lesson, you will learn to write a loop using the `RAJA::forall` statement.
Data parallel kernels are common in many parallel HPC applications. In a data
parallel loop kernel, the processing of data that occurs at each iterate **is
independent** of the processing of data at all other iterates. This is
sometimes referred to as "embarrassingly parallel" because it is
straightforward to parallelize a kernel when there is no chance that the
computation done in one thread or process can impact the computation done in
another.

The `RAJA::forall` loop execution method is a template that takes an execution
policy type template parameter. A `RAJA::forall` method takes two arguments: an
iteration space object, such as a contiguous range of loop indices as shown
here, and a single lambda expression representing the loop kernel body:
A simple example of a data parallel loop kernel that is parallelized using
OpenMP is:

```
double* data = new double[N];

#pragma omp parallel for
for (int i = 0; i < N; ++i) {
data[i] = i;
}
```

Each loop iterate sets the array element at the iterate index to the index
value. Clearly, each iterate is independent of the others. If this OpenMP
kernel were run with M threads, then depending on how the loop work is
scheduled, iterates may be partitioned into chunks of size N/M with each
thread executing one chunk of iterates. This is illustrated in the figure.

<figure>
<img src="./images/parchunk.png">
</figure>

If the loop takes T time units to run on one process/thread, then ideally it
would run in T/M time units in parallel when using M processors/threads
(M <= N). However, parallel overheads often prevent one from observing this
optimal speedup. Indeed, depending on the kernel, the number of iterates, the
number of threads, etc., a kernel may run slower in parallel than it does
sequentially.

In this lesson, you will learn about the `RAJA::forall` loop kernel execution
method to parallelize this kernel.

The `RAJA::forall` template method is specialized on an execution policy type
parameter that specifies how the kernel will be compiled to run. A
`RAJA::forall` method takes two arguments: an iteration space object,
such as a contiguous range of loop indices as shown in this lesson, and a
C++ lambda expression that represents the loop kernel body:

```
RAJA::forall<EXEC_POLICY>( ITERATION SPACE, LAMBDA);
```

We can create a `RAJA::TypedRangeSegment` to describe an iteration space
that is a contiguous sequence of integers `[0, N)`.
To describe an iteration space that is a contiguous sequence of integers
`[0, N)`, we create a `RAJA::TypedRangeSegment` as follows:

```
RAJA::TypedRangeSegment<int>(0, N)
```

The lambda expression needs to take one argument, the loop index:
The lambda expression takes one argument, the loop iterate index:

```
[=](int i) { /* loop body */ }
```

@@ -27,21 +65,35 @@ The lambda expression needs to take one argument, the loop index:
The `[=]` syntax tells the lambda to capture variables by value (i.e., each
captured variable is copied into the lambda rather than referenced).

The `EXEC_POLICY` template argument controls how the loop will be executed. In
this example, we will use the `RAJA::seq_exec` policy to execute this loop on
the CPU. In later lessons, we will learn about other policies that allow us to
run code on a GPU.
The code for this lesson resides in the file `04_raja_forall.cpp`. It provides
a RAJA implementation of a kernel that sets each element of an array `data` to
the value of its array index using the `RAJA::seq_exec` policy. With this
policy, the loop will execute sequentially on a CPU. The code will record the
time of the loop execution and print it out along with a few values of the
array to show that the array entries are set as expected.

In the file 04_raja_forall.cpp, you will see a `TODO` comment where you can add a
`RAJA::forall` loop to initialize the array you allocated in the previous
lesson.
Following that, you will see a `TODO` comment where you can add a similar
`RAJA::forall` kernel to set the elements of the array `data1` in the same way
as the sequential kernel. However, you will use an OpenMP execution policy
`RAJA::omp_parallel_for_exec` to run the loop in parallel on a CPU. Again, the
code will record and print the kernel execution time and array values for
comparison to the previous case and verification that they are set as you
expect.

When you have made your changes, compile and run the code in the same way as the
other lessons:

```
$ make 04_raja_forall
$ ./bin/04_raja_forall
Address of data:
data[50] = 50
```

If you need help, you can compare your version of the code to the solution
code using the command `diff 04_raja_forall.cpp solution/04_raja_forall_solution.cpp`.

Are the array elements that are printed out the same in each case? How do the
execution times compare? Which kernel ran faster?

For more information about `RAJA::forall` usage, execution policies, etc.,
please see [RAJA Basic Loop Execution](https://raja.readthedocs.io/en/develop/sphinx/user_guide/tutorial/add_vectors.html).

Intro_Tutorial/lessons/04_raja_forall/images/parchunk.png (binary image, not displayed)

Intro_Tutorial/lessons/04_raja_forall/solution/04_raja_forall_solution.cpp
@@ -1,30 +1,62 @@
#include <iostream>

#include "RAJA/RAJA.hpp"
#include "RAJA/util/Timer.hpp"

#include "umpire/Umpire.hpp"

int main()
{
double* data{nullptr};
double* data1{nullptr};

auto timer = RAJA::Timer();

constexpr int N = 100;
constexpr int N = 5000000;

auto& rm = umpire::ResourceManager::getInstance();
auto allocator = rm.getAllocator("HOST");

data = static_cast<double*>(allocator.allocate(N*sizeof(double)));
data1 = static_cast<double*>(allocator.allocate(N*sizeof(double)));

std::cout << "Address of data: " << data << std::endl;
std::cout << "Address of data1: " << data1 << std::endl;

// TODO: write a RAJA forall loop to set each element of the array 'data' to
// the value of the loop index

// Sequential kernel that sets each element of array 'data' to its index
timer.start();
RAJA::forall<RAJA::seq_exec>(RAJA::TypedRangeSegment<int>(0, N), [=] (int i) {
data[i] = i;
});
timer.stop();

RAJA::Timer::ElapsedType elapsed = timer.elapsed();

std::cout << "\nSequential loop exec time = " << elapsed << std::endl;
std::cout << "data[50] = " << data[50] << std::endl;
std::cout << "data[100] = " << data[100] << std::endl;
std::cout << "data[1000] = " << data[1000] << std::endl;
std::cout << "data[5000] = " << data[5000] << std::endl;

timer.reset();

timer.start();
// TODO: write a parallel RAJA forall loop using OpenMP to set each element of the
// array 'data1' to its index
RAJA::forall<RAJA::omp_parallel_for_exec>(RAJA::TypedRangeSegment<int>(0, N), [=] (int i) {
data1[i] = i;
});
timer.stop();

elapsed = timer.elapsed();

std::cout << "\nOpenMP loop exec time = " << elapsed << std::endl;
std::cout << "data1[50] = " << data1[50] << std::endl;
std::cout << "data1[100] = " << data1[100] << std::endl;
std::cout << "data1[1000] = " << data1[1000] << std::endl;
std::cout << "data1[5000] = " << data1[5000] << std::endl;

allocator.deallocate(data);
allocator.deallocate(data1);
return 0;
}