diff --git a/Intro_Tutorial/lessons/05_raja_reduce/05_raja_reduce.cpp b/Intro_Tutorial/lessons/05_raja_reduce/05_raja_reduce.cpp
index 79db58c..d73997d 100644
--- a/Intro_Tutorial/lessons/05_raja_reduce/05_raja_reduce.cpp
+++ b/Intro_Tutorial/lessons/05_raja_reduce/05_raja_reduce.cpp
@@ -1,48 +1,48 @@
-#include <iostream>
 #include <cstdlib>
+#include <iostream>
+#include <iomanip>
 #include "RAJA/RAJA.hpp"
-#include "umpire/Umpire.hpp"
+
+//TODO: uncomment this in order to build!
+//#define COMPILE
 
 int main()
 {
-  constexpr int N{1000};
-  double* a{nullptr};
-  double* b{nullptr};
-
-  auto& rm = umpire::ResourceManager::getInstance();
-  auto allocator = rm.getAllocator("HOST");
-
-  a = static_cast<double*>(allocator.allocate(N*sizeof(double)));
-  b = static_cast<double*>(allocator.allocate(N*sizeof(double)));
-
-  std::srand(4793);
-
-  // Initialize data arrays to random positive and negative values
-  RAJA::forall< RAJA::seq_exec >(
-    RAJA::TypedRangeSegment<int>(0, N), [=] (int i) {
-      double signfact = static_cast<double>(std::rand()/RAND_MAX);
-      signfact = ( signfact < 0.5 ? -1.0 : 1.0 );
-
-      a[i] = signfact * (i + 1.1)/(i + 1.12);
-      b[i] = (i + 1.1)/(i + 1.12);
-    }
-  );
-
-  // TODO: Change this dot variable to instead use a RAJA OpenMP parallel
-  // reduction
-  double dot{0.0};
-
-  // TODO: Calculate and output the dot product of a and b using a RAJA::forall
-  RAJA::forall< RAJA::omp_parallel_for_exec >(
-    RAJA::TypedRangeSegment<int>(0, N), [=] (int i) {
-    }
-  );
-
-  std::cout << "dot product is "<< dot << std::endl;
-
-  allocator.deallocate(a);
-  allocator.deallocate(b);
+  constexpr int N{1000000};
+
+  constexpr double dx{1.0 / N};
+
+  // Sequential kernel that approximates pi
+  {
+    RAJA::ReduceSum<RAJA::seq_reduce, double> pi(0.0);
+
+    RAJA::forall<RAJA::seq_exec>(RAJA::TypedRangeSegment<int>(0, N), [=] (int i) {
+      double x = (double(i) + 0.5) * dx;
+      pi += dx / (1.0 + x * x);
+    });
+    double tpi = 4.0 * pi.get();
+
+    std::cout << "Sequential pi approximation = "
+              << std::setprecision(20) << tpi << std::endl;
+  }
+
+#if defined(COMPILE)
+  // OpenMP kernel that approximates pi
+  {
+    // TODO: write a parallel RAJA forall loop using OpenMP to approximate pi.
+    // First, you will have to create a RAJA::ReduceSum object that will
+    // work in an OpenMP kernel.
+
+    RAJA::forall( ???
+
+    );
+    double tpi = 4.0 * pi.get();
+
+    std::cout << "OpenMP pi approximation = "
+              << std::setprecision(20) << tpi << std::endl;
+  }
+#endif // if defined(COMPILE)
 
   return 0;
 }
diff --git a/Intro_Tutorial/lessons/05_raja_reduce/README.md b/Intro_Tutorial/lessons/05_raja_reduce/README.md
index 03f247f..488d298 100644
--- a/Intro_Tutorial/lessons/05_raja_reduce/README.md
+++ b/Intro_Tutorial/lessons/05_raja_reduce/README.md
@@ -1,37 +1,88 @@
 # Lesson 5
 
-In this lesson, you will learn how to use a `RAJA::ReduceSum` object to perform
-a safe parallel reduction in a vector dot product. Different programming models
-support parallel reduction operations differently, so RAJA provides portable
-reduction types that make it easy to perform reduction operations in kernels.
-For example, we can create a sum reduction variable `dot` for a data type
-`TYPE` with the initial reduction value of `0.0`:
+In lesson 4, we looked at a data parallel loop kernel in which each loop
+iterate was independent of the others. In this lesson, we consider a kernel
+that is **not data parallel** because there is a data dependence between
+iterates. A data dependence occurs when multiple threads (or tasks) could
+write to the same memory location at the same time.
+This is often called a **race condition** and can cause an algorithm to
+produce **non-deterministic**, order-dependent results.
+
+Consider an attempt to write an OpenMP parallel kernel to compute the sum of
+the elements in an array:
+
+```
+double sum = 0;
+
+#pragma omp parallel for
+for (int i = 0; i < N; ++i) {
+  sum += data[i];
+}
+```
+
+This kernel has a race condition since multiple loop iterates could write to
+the `sum` variable at the same time. As a result, the computed `sum` value may
+occasionally be correct, but in general it will not be. OpenMP provides a
+reduction clause that allows a user to write a kernel that computes a
+reduction without a race condition. That is, only one thread will write to the
+reduction variable at a time. For example:
 
 ```
-RAJA::ReduceSum<REDUCTION_POLICY, TYPE> dot(0.0);
-```
+double sum = 0;
+
+#pragma omp parallel for reduction(+:sum)
+for (int i = 0; i < N; ++i) {
+  sum += data[i];
+}
+```
 
-Although in this lesson, we only use `RAJA::ReduceSum`, RAJA provides
-other reduction types, such as `RAJA::ReduceMin` and `RAJA::ReduceMax`.
-More information on RAJA reductions can be found at:
-https://raja.readthedocs.io/en/develop/sphinx/user_guide/feature/reduction.html#
+This kernel implementation does not have a race condition. However, the
+results could still be non-deterministic due to the order in which the values
+are accumulated into the sum in parallel.
 
-A reduction object takes two template parameters. The `REDUCTION_POLICY` and a
-`TYPE`, which was mentioned above. The `REDUCTION_POLICY` controls how the
-reduction is performed, and must be compatible with the execution policy of
-the kernel where the reduction is used.
+It is important to note that not all parallel programming models provide a
+mechanism for writing a parallel reduction. For example, CUDA and HIP do not.
+RAJA provides a reduction construct that enables users to write parallel
+reductions in each of its supported programming model back-ends in a manner
+that is portable across those back-ends. This lesson uses the `RAJA::ReduceSum`
+type to compute a sum reduction. More information on RAJA reductions, including
+the other reduction operations that RAJA supports, can be found in the
+[RAJA Reduction Operations](https://raja.readthedocs.io/en/develop/sphinx/user_guide/feature/reduction.html)
+documentation.
 
-For example, if my `RAJA::forall` has an execution policy of `RAJA::omp_parallel_for_exec`,
-then the reduction policy must be `RAJA::omp_reduce`. More documentation on
-compatible reduction and execution policies can be found here:
-https://raja.readthedocs.io/en/develop/sphinx/user_guide/feature/policies.html#reducepolicy-label
+In this lesson, we will use a `RAJA::ReduceSum` object to approximate $\pi$,
+the ratio of a circle's circumference to its diameter, using a midpoint
+Riemann sum approximation of the integral
+```math
+\frac{ \pi }{4} = \tan^{-1}(1) = \int_0^1 \frac{1}{1+x^2}\,dx \approx \sum_{i=0}^{N-1} \frac{1}{1 + ( (i+0.5) \Delta x )^{2}} \Delta x
+```
+where $\Delta x = \frac{1}{N}$.
 
-The second parameter, the `TYPE` parameter, is just the data type of the
-variable, such as `int`.
+We create a sum reduction variable `pi` as follows:
 
-In the file `05_raja_reduce.cpp`, follow the instruction in the `TODO` comment to create
-a RAJA Reduction using `seq_exec`.
+```
+RAJA::ReduceSum<REDUCTION_POLICY, TYPE> pi(0.0);
+```
+
+The `REDUCTION_POLICY` type specializes the reduction object for use in a
+kernel with a compatible execution policy. For example, if a `RAJA::forall`
+method uses the execution policy `RAJA::omp_parallel_for_exec`, then the
+reduction policy must be `RAJA::omp_reduce`. Similar pairings apply for the
+CUDA, HIP, and other back-ends. More information about compatible reduction
+and execution policies can be found in
+[RAJA Reduction Policies](https://raja.readthedocs.io/en/develop/sphinx/user_guide/feature/policies.html#reducepolicy-label).
+The second parameter `TYPE` is the data type of the reduced quantity, such as
+`int` or `double`. The `0.0` passed to the reduction variable constructor
+initializes the sum to zero.
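+
+Although this lesson only uses `RAJA::ReduceSum`, other reduction types, such
+as `RAJA::ReduceMin` and `RAJA::ReduceMax`, follow the same pattern. As a
+minimal sketch (not part of the lesson code, and assuming an array `data` of
+length `N` plus the `<limits>` header), an OpenMP min reduction pairs
+`RAJA::omp_reduce` with `RAJA::omp_parallel_for_exec` like this:
+
+```
+RAJA::ReduceMin<RAJA::omp_reduce, double> vmin(std::numeric_limits<double>::max());
+
+RAJA::forall<RAJA::omp_parallel_for_exec>(
+  RAJA::TypedRangeSegment<int>(0, N), [=] (int i) {
+    vmin.min(data[i]);  // each thread folds in its values; RAJA combines them safely
+});
+
+double smallest = vmin.get();
+```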
+The code for this lesson resides in the file `05_raja_reduce.cpp`. It provides
+a RAJA implementation of a kernel that approximates the value of $\pi$ using
+the `RAJA::seq_exec` policy. With this policy, the loop will execute
+sequentially on a CPU. The code will print the result, which should look
+familiar: `3.141...`, as expected.
+
+After that, you will find a `TODO` asking you to implement an OpenMP version of
+the kernel. You will use `RAJA::omp_parallel_for_exec` for the execution policy
+to specialize the `RAJA::forall` template and `RAJA::omp_reduce` for the
+reduction policy to specialize the `RAJA::ReduceSum` object.
 
 Once you have filled in the correct reduction statement, compile and run:
@@ -39,3 +90,16 @@ Once you have filled in the correct reduction statement, compile and run:
 
 ```
 $ make 05_raja_reduce
 $ ./bin/05_raja_reduce
 ```
+
+Are the results printed for the sequential and OpenMP kernels the same?
+Why or why not?
+
+Try running the code multiple times by repeatedly executing:
+
+```
+$ ./bin/05_raja_reduce
+```
+
+What do you observe about the result printed for the sequential kernel for
+each run of the code? What about the results printed for the OpenMP kernel for
+each run of the code? Can you explain what is happening?
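+
+As a hint, floating-point addition is not associative, so the order in which
+partial sums are combined can change the last digits of the result. The
+following standalone sketch (independent of RAJA and of the lesson files)
+illustrates this on typical IEEE-754 doubles:
+
+```
+#include <cstdio>
+
+int main()
+{
+  double a = 1.0e16, b = -1.0e16, c = 1.0;
+  std::printf("(a + b) + c = %g\n", (a + b) + c);  // prints 1
+  std::printf("a + (b + c) = %g\n", a + (b + c));  // prints 0: c is absorbed by b
+  return 0;
+}
+```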
diff --git a/Intro_Tutorial/lessons/05_raja_reduce/solution/05_raja_reduce_solution.cpp b/Intro_Tutorial/lessons/05_raja_reduce/solution/05_raja_reduce_solution.cpp
index 4b76a48..e1a28c8 100644
--- a/Intro_Tutorial/lessons/05_raja_reduce/solution/05_raja_reduce_solution.cpp
+++ b/Intro_Tutorial/lessons/05_raja_reduce/solution/05_raja_reduce_solution.cpp
@@ -1,49 +1,50 @@
-#include <iostream>
 #include <cstdlib>
+#include <iostream>
+#include <iomanip>
 #include "RAJA/RAJA.hpp"
-#include "umpire/Umpire.hpp"
+
+//TODO: uncomment this in order to build!
+//#define COMPILE
 
 int main()
 {
-  constexpr int N{10000};
-  double* a{nullptr};
-  double* b{nullptr};
-
-  auto& rm = umpire::ResourceManager::getInstance();
-  auto allocator = rm.getAllocator("HOST");
-
-  a = static_cast<double*>(allocator.allocate(N*sizeof(double)));
-  b = static_cast<double*>(allocator.allocate(N*sizeof(double)));
-
-  std::srand(4793);
-
-  // Initialize data arrays to random positive and negative values
-  RAJA::forall< RAJA::seq_exec >(
-    RAJA::TypedRangeSegment<int>(0, N), [=] (int i) {
-      double signfact = static_cast<double>(std::rand()/RAND_MAX);
-      signfact = ( signfact < 0.5 ? -1.0 : 1.0 );
-
-      a[i] = signfact * (i + 1.1)/(i + 1.12);
-      b[i] = (i + 1.1)/(i + 1.12);
-    }
-  );
-
-  // TODO: Change this dot variable to instead use a RAJA OpenMP parallel
-  // reduction
-  RAJA::ReduceSum< RAJA::omp_reduce, double > dot(0.0);
-
-  // TODO: Calculate and output the dot product of a and b using a RAJA::forall
-  RAJA::forall< RAJA::omp_parallel_for_exec >(
-    RAJA::TypedRangeSegment<int>(0, N), [=] (int i) {
-      dot += a[i] * b[i];
-    }
-  );
-
-  std::cout << "dot product is "<< dot.get() << std::endl;
-
-  allocator.deallocate(a);
-  allocator.deallocate(b);
+  constexpr int N{1000000};
+
+  constexpr double dx{1.0 / N};
+
+  // Sequential kernel that approximates pi
+  {
+    RAJA::ReduceSum<RAJA::seq_reduce, double> pi(0.0);
+
+    RAJA::forall<RAJA::seq_exec>(RAJA::TypedRangeSegment<int>(0, N), [=] (int i) {
+      double x = (double(i) + 0.5) * dx;
+      pi += dx / (1.0 + x * x);
+    });
+    double tpi = 4.0 * pi.get();
+
+    std::cout << "Sequential pi approximation = "
+              << std::setprecision(20) << tpi << std::endl;
+  }
+
+#if defined(COMPILE)
+  // OpenMP kernel that approximates pi
+  {
+    // TODO: write a parallel RAJA forall loop using OpenMP to approximate pi.
+    // First, you will have to create a RAJA::ReduceSum object that will
+    // work in an OpenMP kernel.
+    RAJA::ReduceSum<RAJA::omp_reduce, double> pi(0.0);
+
+    RAJA::forall<RAJA::omp_parallel_for_exec>(RAJA::TypedRangeSegment<int>(0, N), [=] (int i) {
+      double x = (double(i) + 0.5) * dx;
+      pi += dx / (1.0 + x * x);
+    });
+    double tpi = 4.0 * pi.get();
+
+    std::cout << "OpenMP pi approximation = "
+              << std::setprecision(20) << tpi << std::endl;
+  }
+#endif // if defined(COMPILE)
 
   return 0;
 }