-
Notifications
You must be signed in to change notification settings - Fork 6
added forall_with_streams and updated BenchmarkForall.cpp #232
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from 6 commits
c8b629c
d7f25c9
26fbe1b
291fd65
c31c924
c42abc2
533e0a4
5d3a07b
131dc67
0459805
4915fb4
bea5259
1f159f1
ada1508
aa90113
65ce8b0
25b8885
f761e5f
e28bfb5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,89 @@ | ||
| ////////////////////////////////////////////////////////////////////////////////////// | ||
| // Copyright 2020 Lawrence Livermore National Security, LLC and other CARE developers. | ||
| // See the top-level LICENSE file for details. | ||
| // | ||
| // SPDX-License-Identifier: BSD-3-Clause | ||
| ////////////////////////////////////////////////////////////////////////////////////// | ||
|
|
||
| // CARE headers | ||
| #include "care/DefaultMacros.h" | ||
| #include "care/host_device_ptr.h" | ||
| #include "care/forall.h" | ||
| #include "care/policies.h" | ||
| #include "RAJA/RAJA.hpp" | ||
|
|
||
| // Other library headers | ||
| #include <benchmark/benchmark.h> | ||
| #include <omp.h> | ||
|
|
||
| // Std library headers | ||
| #include <climits> | ||
| #include <cmath> | ||
|
|
||
| #define NUM_KERNELS 4 | ||
|
|
||
| using namespace care; | ||
|
|
||
| #if defined(CARE_GPUCC) | ||
| //each kernel has a separate stream | ||
| static void benchmark_gpu_loop_separate_streams(benchmark::State& state) { | ||
| int N = state.range(0); | ||
| const char * fileName = "test"; | ||
adayton1 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| RAJA::resources::Cuda res_arr[NUM_KERNELS]; | ||
| RAJA::resources::Event event_arr[NUM_KERNELS]; | ||
| for(int i = 0; i < NUM_KERNELS; i++) | ||
| { | ||
| RAJA::resources::Cuda res; | ||
| res_arr[i] = res; | ||
| RAJA::resources::Event e = res.get_event(); | ||
| event_arr[i] = e; | ||
| } | ||
|
|
||
| care::host_device_ptr<int> arr(N, "arr"); | ||
| for (auto _ : state) { | ||
adayton1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| //run num kernels | ||
| for(int j = 0; j < NUM_KERNELS; j++) | ||
| { | ||
| CARE_STREAMED_LOOP(i, res_arr[j], 0 , N) { | ||
| arr[i] = i; | ||
| } CARE_STREAMED_LOOP_END | ||
|
|
||
| if(j > 0) res_arr[j].wait_for(&event_arr[j - 1]); | ||
| } | ||
| } | ||
| arr.free(); | ||
| } | ||
|
|
||
| // Register the function as a benchmark | ||
| BENCHMARK(benchmark_gpu_loop_separate_streams)->Range(1, INT_MAX); | ||
|
|
||
| //all kernels on one stream | ||
| static void benchmark_gpu_loop_single_stream(benchmark::State& state) { | ||
| int N = state.range(0); | ||
| const char * fileName = "test"; | ||
| RAJA::resources::Cuda res; | ||
|
|
||
| care::host_device_ptr<int> arr(N, "arr"); | ||
|
|
||
| RAJA::resources::Event e = res.get_event(); | ||
| for (auto _ : state) { | ||
| //run num kernels | ||
| for(int j = 0; j < NUM_KERNELS; j++) | ||
| { | ||
| CARE_STREAMED_LOOP(i, res, 0 , N) { | ||
| arr[i] = i; | ||
| }CARE_STREAMED_LOOP_END | ||
| res.wait(); | ||
| } | ||
| } | ||
| arr.free(); | ||
| } | ||
|
|
||
| // Register the function as a benchmark | ||
| BENCHMARK(benchmark_gpu_loop_single_stream)->Range(1, INT_MAX); | ||
|
|
||
| #endif | ||
|
|
||
| // Run the benchmarks | ||
| BENCHMARK_MAIN(); | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -189,6 +189,49 @@ namespace care { | |
| #endif | ||
| } | ||
|
|
||
| //////////////////////////////////////////////////////////////////////////////// | ||
| /// | ||
| /// @author Neela Kausik | ||
| /// | ||
| /// @brief If GPU is available, execute on the device. Otherwise, execute on | ||
| /// the host. This specialization is needed for clang-query. | ||
| /// | ||
| /// @arg[in] gpu Used to choose this overload of forall | ||
| /// @arg[in] res Resource provided for execution | ||
| /// @arg[in] fileName The name of the file where this function is called | ||
| /// @arg[in] lineNumber The line number in the file where this function is called | ||
| /// @arg[in] start The starting index (inclusive) | ||
| /// @arg[in] end The ending index (exclusive) | ||
| /// @arg[in] body The loop body to execute at each index | ||
| /// | ||
| //////////////////////////////////////////////////////////////////////////////// | ||
|
|
||
| #if defined(CARE_GPUCC) | ||
// Dispatches the loop body onto the device (or a fallback) using the
// caller-supplied CUDA resource, so concurrent loops can run on distinct
// streams. fileName/lineNumber are forwarded only on the simulation/seq
// paths; the device paths launch via the RAJA policy + resource directly.
template <typename LB>
void forall_with_stream(gpu, RAJA::resources::Cuda res, const char * fileName, const int lineNumber,
                        const int start, const int end, LB&& body) {
#if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
   // Flip global iteration-order flag for the duration of this launch.
   s_reverseLoopOrder = true;
#endif

#if CARE_ENABLE_GPU_SIMULATION_MODE
   forall(gpu_simulation{}, res, fileName, lineNumber, start, end, std::forward<LB>(body));
#elif defined(__CUDACC__)
   // NOTE(review): this uses the RAJA-style signature (policy, resource,
   // range, body), unlike the care-style (fileName, lineNumber, start, end)
   // signature used by the sim/seq branches above/below — confirm the
   // intended overload resolves here; fileName/lineNumber are unused on
   // this path.
   forall(RAJA::cuda_exec<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>{},
          res, RAJA::RangeSegment(start, end), std::forward<LB>(body));
#elif defined(__HIPCC__)
   // NOTE(review): reuses the CUDA-named block-size/async macros for the
   // HIP policy — presumably intentional (CARE defines them generically);
   // verify.
   forall(RAJA::hip_exec<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>{},
          res, RAJA::RangeSegment(start, end), std::forward<LB>(body));
#else
   forall(RAJA::seq_exec{}, res, fileName, lineNumber, start, end, std::forward<LB>(body));
#endif

#if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
   // Restore default iteration order.
   s_reverseLoopOrder = false;
#endif
}
| #endif | ||
|
|
||
| //////////////////////////////////////////////////////////////////////////////// | ||
| /// | ||
| /// @author Alan Dayton | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.