78 changes: 39 additions & 39 deletions Intro_Tutorial/lessons/05_raja_reduce/05_raja_reduce.cpp
@@ -1,48 +1,48 @@
#include <cstdlib>
#include <iostream>
#include <iomanip>

#include "RAJA/RAJA.hpp"
#include "umpire/Umpire.hpp"

//TODO: uncomment the following line in order to build!
//#define COMPILE

int main()
{
constexpr int N{1000};
double* a{nullptr};
double* b{nullptr};

auto& rm = umpire::ResourceManager::getInstance();
auto allocator = rm.getAllocator("HOST");

a = static_cast<double*>(allocator.allocate(N*sizeof(double)));
b = static_cast<double*>(allocator.allocate(N*sizeof(double)));

std::srand(4793);

// Initialize data arrays to random positive and negative values
RAJA::forall< RAJA::seq_exec >(
RAJA::TypedRangeSegment<int>(0, N), [=] (int i) {
double signfact = static_cast<double>(std::rand()) / RAND_MAX;
signfact = ( signfact < 0.5 ? -1.0 : 1.0 );

a[i] = signfact * (i + 1.1)/(i + 1.12);
b[i] = (i + 1.1)/(i + 1.12);
}
);

// TODO: Change this dot variable to instead use a RAJA OpenMP parallel
// reduction
double dot{0.0};

// TODO: Calculate and output the dot product of a and b using a RAJA::forall
RAJA::forall< RAJA::omp_parallel_for_exec >(
RAJA::TypedRangeSegment<int>(0, N), [=] (int i) {
}
);

std::cout << "dot product is "<< dot << std::endl;

allocator.deallocate(a);
allocator.deallocate(b);
constexpr int N{1000000};

constexpr double dx{1.0 / N};

// Sequential kernel that approximates pi
{
RAJA::ReduceSum<RAJA::seq_reduce, double> pi(0.0);

RAJA::forall<RAJA::seq_exec>(RAJA::TypedRangeSegment<int>(0, N), [=] (int i) {
double x = (double(i) + 0.5) * dx;
pi += dx / (1.0 + x * x);
});
double tpi = 4.0 * pi.get();

std::cout << "Sequential pi approximation " << " = "
<< std::setprecision(20) << tpi << std::endl;
}

#if defined(COMPILE)
// OpenMP kernel that approximates pi
{
// TODO: write a parallel RAJA forall loop using OpenMP to approximate pi.
// First, you will have to create a RAJA::ReduceSum object that will
// work in an OpenMP kernel.

RAJA::forall<RAJA::omp_parallel_for_exec>( ???

);
double tpi = 4.0 * pi.get();

std::cout << "OpenMP pi approximation " << " = "
<< std::setprecision(20) << tpi << std::endl;
}
#endif // if defined(COMPILE)

return 0;
}
110 changes: 87 additions & 23 deletions Intro_Tutorial/lessons/05_raja_reduce/README.md
@@ -1,41 +1,105 @@
# Lesson 5

In this lesson, you will learn how to use a `RAJA::ReduceSum` object to perform
a safe parallel reduction in a vector dot product. Different programming models
support parallel reduction operations differently, so RAJA provides portable
reduction types that make it easy to perform reduction operations in kernels.
For example, we can create a sum reduction variable `dot` for a data type
`TYPE` with the initial reduction value of `0.0`:
In lesson 4, we looked at a data parallel loop kernel in which each loop
iterate was independent of the others. In this lesson, we consider a kernel
that is **not data parallel** because there is a data dependence between
iterates: multiple iterates update the same memory location. When multiple
threads (or tasks) could write to that location at the same time, we have a
**race condition**, which can cause an algorithm to produce
**non-deterministic**, order-dependent results.

Consider an attempt to write an OpenMP parallel kernel to compute the sum of
the elements in an array:

```
double sum = 0;

#pragma omp parallel for
for (int i = 0; i < N; ++i) {
sum += data[i];
}
```

This kernel has a race condition since multiple loop iterates could write
to the `sum` variable at the same time. As a result, the computed `sum` value
may occasionally be correct, but most likely it will not be. OpenMP provides a
reduction clause that allows a user to write a kernel that computes a reduction
without a race condition: each thread accumulates into its own private copy of
the reduction variable, and the private copies are combined safely when the
loop completes. For example:

```
RAJA::ReduceSum<REDUCTION_POLICY, TYPE> dot(0.0);
double sum = 0;

#pragma omp parallel for reduction(+:sum)
for (int i = 0; i < N; ++i) {
sum += data[i];
}
```

Although in this lesson, we only use `RAJA::ReduceSum`, RAJA provides
other reduction types, such as `RAJA::ReduceMin` and `RAJA::ReduceMax`.
More information on RAJA reductions can be found at:
https://raja.readthedocs.io/en/develop/sphinx/user_guide/feature/reduction.html#
This kernel implementation does not have a race condition. However, the results
could still be non-deterministic due to the order in which the values are
accumulated into the sum in parallel.
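
This non-determinism stems from floating-point round-off: addition of `double`
values is not associative, so accumulating the same values in a different order
can give a slightly different result. A small standalone example (not part of
the lesson code) illustrates the effect on IEEE doubles:

```
#include <iomanip>
#include <iostream>

int main()
{
  double a = 0.1, b = 0.2, c = 0.3;

  // Floating-point addition is not associative: the grouping changes the
  // rounded result in the last digits.
  std::cout << std::setprecision(17)
            << "(a + b) + c = " << ((a + b) + c) << "\n"    // 0.60000000000000009
            << "a + (b + c) = " << (a + (b + c)) << "\n";   // 0.59999999999999998
  return 0;
}
```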

A reduction object takes two template parameters: a `REDUCTION_POLICY` and a
`TYPE`, which were mentioned above. The `REDUCTION_POLICY` controls how the
reduction is performed and must be compatible with the execution policy of
the kernel where the reduction is used.
It is important to note that not all parallel programming models provide a
mechanism to write a parallel reduction. For example, CUDA and HIP do not.
RAJA provides a reduction construct enabling users to write parallel reductions
in each of its supported programming model back-ends in a manner that is
portable across those back-ends. This lesson uses the `RAJA::ReduceSum` type
to compute a sum reduction. More information on RAJA reductions, including
other reduction operations that RAJA supports, can be found in the
[RAJA Reduction Operations](https://raja.readthedocs.io/en/develop/sphinx/user_guide/feature/reduction.html) section of the RAJA User Guide.
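
Other reduction types, such as `RAJA::ReduceMin` and `RAJA::ReduceMax`, follow
the same pattern as the sum reduction. The snippet below is a minimal sketch
(not part of the lesson code) assuming a host array `data` of length `N` and
`#include <limits>`; the `min()`/`max()` calls fold a candidate value into the
reduction:

```
RAJA::ReduceMin<RAJA::seq_reduce, double> vmin(std::numeric_limits<double>::max());
RAJA::ReduceMax<RAJA::seq_reduce, double> vmax(std::numeric_limits<double>::lowest());

RAJA::forall<RAJA::seq_exec>(RAJA::TypedRangeSegment<int>(0, N), [=] (int i) {
  vmin.min(data[i]);   // fold data[i] into the running minimum
  vmax.max(data[i]);   // fold data[i] into the running maximum
});

std::cout << "min = " << vmin.get() << ", max = " << vmax.get() << std::endl;
```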

For example, if my `RAJA::forall` has an execution policy of `RAJA::omp_parallel_for_exec`,
then the reduction policy must be `RAJA::omp_reduce`. More documentation on
compatible reduction and execution policies can be found here:
https://raja.readthedocs.io/en/develop/sphinx/user_guide/feature/policies.html#reducepolicy-label
In this lesson, we will use a `RAJA::ReduceSum` object to approximate $\pi$,
the ratio of a circle's circumference to its diameter, using a Riemann sum
approximation of the integral
```math
\frac{ \pi }{4} = \tan^{-1}(1) = \int_0^1 \frac{1}{1+x^2}\,dx \approx \sum_{i=0}^{N-1} \frac{1}{1 + ( (i+0.5) \Delta x )^{2}} \Delta x
```
where $\Delta x = \frac{1}{N}$.
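
As a quick sanity check of the formula (not part of the lesson code), even a
very coarse sum with only $N = 2$ midpoints ($x = 0.25$ and $x = 0.75$,
$\Delta x = 0.5$) lands in the right neighborhood:
```math
4 \, \Delta x \left( \frac{1}{1 + 0.25^2} + \frac{1}{1 + 0.75^2} \right) = 4 \cdot 0.5 \cdot ( 0.9412\ldots + 0.64 ) \approx 3.162
```
The lesson code uses $N = 1000000$, which yields many more correct digits.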

The second template parameter, `TYPE`, is just the data type of the reduction
variable, such as `int`.
We create a sum reduction variable `pi` as follows:

In the file `05_raja_reduce.cpp`, follow the instruction in the `TODO` comment to create
a RAJA Reduction using `seq_exec`.
```
RAJA::ReduceSum<REDUCTION_POLICY, TYPE> pi(0.0);
```

The `REDUCTION_POLICY` type specializes the reduction object for use in a
kernel with a compatible execution policy. For example, if a `RAJA::forall`
method uses an execution policy `RAJA::omp_parallel_for_exec`, then the
reduction policy must be `RAJA::omp_reduce`. The same pairing applies for the CUDA, HIP, and other back-ends.
More information about compatible reduction and execution policies can be
found in [RAJA Reduction Policies](https://raja.readthedocs.io/en/develop/sphinx/user_guide/feature/policies.html#reducepolicy-label).
The second parameter `TYPE` is the data type of the reduced quantity,
such as `int` or `double`. The `0.0` passed to the reduction variable
constructor initializes the sum to zero.
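
As a concrete illustration (a minimal sketch, not code from this lesson), here
is the same sum reduction written with the two compatible policy pairs used in
this tutorial; the host array `data` and its length `N` are assumed:

```
// Sequential pairing: RAJA::seq_exec kernel with a RAJA::seq_reduce reduction.
RAJA::ReduceSum<RAJA::seq_reduce, double> seq_sum(0.0);
RAJA::forall<RAJA::seq_exec>(RAJA::TypedRangeSegment<int>(0, N), [=] (int i) {
  seq_sum += data[i];
});

// OpenMP pairing: RAJA::omp_parallel_for_exec kernel with a RAJA::omp_reduce reduction.
RAJA::ReduceSum<RAJA::omp_reduce, double> omp_sum(0.0);
RAJA::forall<RAJA::omp_parallel_for_exec>(RAJA::TypedRangeSegment<int>(0, N), [=] (int i) {
  omp_sum += data[i];
});

// get() retrieves the final reduced value once the kernel has completed.
std::cout << seq_sum.get() << " " << omp_sum.get() << std::endl;
```

Pairing an incompatible combination (for example, a `RAJA::seq_reduce`
reduction inside a `RAJA::omp_parallel_for_exec` kernel) will not produce a
correct parallel reduction.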

The code for this lesson resides in the file `05_raja_reduce.cpp`. It provides
a RAJA implementation of a kernel that approximates the value of $\pi$ using
the `RAJA::seq_exec` policy. With this policy, the loop executes sequentially
on a CPU. The code prints the result, which should look familiar: `3.14159...`,
as expected.

After that, you will find a `TODO` asking you to implement an OpenMP version of
the kernel. You will use `RAJA::omp_parallel_for_exec` for the execution policy
to specialize the `RAJA::forall` template and `RAJA::omp_reduce` for the reduce
policy to specialize the `RAJA::ReduceSum` object.

Once you have filled in the correct reduction statement, compile and run:

```
$ make 05_raja_reduce
$ ./bin/05_raja_reduce
```

Are the results printed for the sequential and OpenMP kernels the same?
Why or why not?

Try running the code multiple times by repeatedly executing:

```
$ ./bin/05_raja_reduce
```

What do you observe about the result printed for the sequential kernel for
each run of the code? What about the results printed for the OpenMP kernel for
each run of the code? Can you explain what is happening?
@@ -1,49 +1,50 @@
#include <cstdlib>
#include <iostream>
#include <iomanip>

#include "RAJA/RAJA.hpp"
#include "umpire/Umpire.hpp"

//TODO: uncomment the following line in order to build!
//#define COMPILE

int main()
{
constexpr int N{10000};
double* a{nullptr};
double* b{nullptr};

auto& rm = umpire::ResourceManager::getInstance();
auto allocator = rm.getAllocator("HOST");

a = static_cast<double*>(allocator.allocate(N*sizeof(double)));
b = static_cast<double*>(allocator.allocate(N*sizeof(double)));

std::srand(4793);

// Initialize data arrays to random positive and negative values
RAJA::forall< RAJA::seq_exec >(
RAJA::TypedRangeSegment<int>(0, N), [=] (int i) {
double signfact = static_cast<double>(std::rand()) / RAND_MAX;
signfact = ( signfact < 0.5 ? -1.0 : 1.0 );

a[i] = signfact * (i + 1.1)/(i + 1.12);
b[i] = (i + 1.1)/(i + 1.12);
}
);

// TODO: Change this dot variable to instead use a RAJA OpenMP parallel
// reduction
RAJA::ReduceSum< RAJA::omp_reduce, double > dot(0.0);

// TODO: Calculate and output the dot product of a and b using a RAJA::forall
RAJA::forall< RAJA::omp_parallel_for_exec >(
RAJA::TypedRangeSegment<int>(0, N), [=] (int i) {
dot += a[i] * b[i];
}
);

std::cout << "dot product is "<< dot.get() << std::endl;

allocator.deallocate(a);
allocator.deallocate(b);
constexpr int N{1000000};

constexpr double dx{1.0 / N};

// Sequential kernel that approximates pi
{
RAJA::ReduceSum<RAJA::seq_reduce, double> pi(0.0);

RAJA::forall<RAJA::seq_exec>(RAJA::TypedRangeSegment<int>(0, N), [=] (int i) {
double x = (double(i) + 0.5) * dx;
pi += dx / (1.0 + x * x);
});
double tpi = 4.0 * pi.get();

std::cout << "Sequential pi approximation " << " = "
<< std::setprecision(20) << tpi << std::endl;
}

#if defined(COMPILE)
// OpenMP kernel that approximates pi
{
// TODO: write a parallel RAJA forall loop using OpenMP to approximate pi.
// First, you will have to create a RAJA::ReduceSum object that will
// work in an OpenMP kernel.
RAJA::ReduceSum<RAJA::omp_reduce, double> pi(0.0);

RAJA::forall<RAJA::omp_parallel_for_exec>(RAJA::TypedRangeSegment<int>(0, N), [=] (int i) {
double x = (double(i) + 0.5) * dx;
pi += dx / (1.0 + x * x);
});
double tpi = 4.0 * pi.get();

std::cout << "OpenMP pi approximation " << " = "
<< std::setprecision(20) << tpi << std::endl;
}
#endif // if defined(COMPILE)

return 0;
}