Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,4 @@ endif()
add_subdirectory(tpl)
add_subdirectory(Intro_Tutorial)
add_subdirectory(Intermediate_Tutorial)
add_subdirectory(Profile_Demo)
6 changes: 6 additions & 0 deletions Profile_Demo/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Build the profiling demo only for CUDA builds: the demo's RAJA
# policies (cuda_exec / cuda_launch_t) require a CUDA-enabled RAJA.
if (ENABLE_CUDA)
# NOTE(review): caliper-plugin.cpp is #include'd by profile_raja.cpp,
# so it is intentionally absent from SOURCES.  Caliper headers are
# presumably provided transitively through the RAJA target (built with
# RAJA_ENABLE_CALIPER, see README) -- confirm; otherwise a caliper
# dependency is missing here.
blt_add_executable(
NAME profile_raja
SOURCES profile_raja.cpp
DEPENDS_ON RAJA umpire cuda)
endif()
9 changes: 9 additions & 0 deletions Profile_Demo/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Basic RAJA profiling with Caliper

Build Caliper with NVTX and CUPTI support enabled:

    cmake -DCMAKE_INSTALL_PREFIX=${caliper_path} -DWITH_NVTX=ON -DWITH_CUPTI=ON ../

Build RAJA with runtime plugins and Caliper support enabled, then build this demo:

    cmake -DENABLE_CUDA=ON -DRAJA_ENABLE_RUNTIME_PLUGINS=ON -DRAJA_ENABLE_CALIPER=ON -DCMAKE_CUDA_FLAGS="--expt-extended-lambda" -Dcaliper_DIR=${caliper_path} ../ && make profile_raja -j

TODO: Expand this README with instructions for running the demo and collecting Caliper profiles.
37 changes: 37 additions & 0 deletions Profile_Demo/caliper-plugin.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
// Copyright (c) 2016-25, Lawrence Livermore National Security, LLC
// and RAJA project contributors. See the RAJA/LICENSE file for details.
//
// SPDX-License-Identifier: (BSD-3-Clause)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//

#include "RAJA/util/PluginStrategy.hpp"

#include <iostream>
#include <caliper/cali.h>

// RAJA plugin that brackets every named RAJA kernel launch with a
// Caliper annotation region, so kernels show up by name in Caliper
// output (and, via NVTX, in tools such as Nsight Systems).
class CaliperPlugin : public RAJA::util::PluginStrategy
{
public:
  // Open a Caliper region named after the kernel just before launch.
  // Kernels launched without a RAJA::Name() are left unannotated.
  void preLaunch(const RAJA::util::PluginContext& ctx) override
  {
    if (!ctx.kernel_name.empty()) {
      CALI_MARK_BEGIN(ctx.kernel_name.c_str());
    }
  }

  // Close the matching Caliper region right after the launch returns.
  void postLaunch(const RAJA::util::PluginContext& ctx) override
  {
    if (!ctx.kernel_name.empty()) {
      CALI_MARK_END(ctx.kernel_name.c_str());
    }
  }
};

// Entry point used when this plugin is loaded dynamically at runtime
// by RAJA's plugin loader; ownership of the returned object passes to
// the caller.
extern "C" RAJA::util::PluginStrategy *RAJAGetPlugin()
{
return new CaliperPlugin;
}

// Registers the plugin during static initialization for builds that
// compile/link this translation unit in directly (the static path).
static RAJA::util::PluginRegistry::add<CaliperPlugin> P("Caliper", "Enables Caliper Profiling");
158 changes: 158 additions & 0 deletions Profile_Demo/profile_raja.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <stdexcept>

#include "RAJA/RAJA.hpp"
#include "umpire/Umpire.hpp"

#include "caliper-plugin.cpp"

// Launch synchronously so host code (e.g. check_matrix_multiply in
// main) can read kernel results without explicit synchronization.
const bool async = false;
// Execution policies shared by every kernel in this demo:
// CUDA forall with 1024-thread blocks, and RAJA::launch on CUDA.
using forall_pol = RAJA::cuda_exec<1024, async>;
using launch_pol = RAJA::LaunchPolicy<RAJA::cuda_launch_t<async>>;

// Initialize the m x n matrices: A and B to all ones, C to zeros.
// Runs as a named RAJA kernel so it is visible to the Caliper plugin.
//
// Fix: the original iterated over n * n elements, silently ignoring
// the m parameter; use m * n so the extent matches the documented
// m x n shape and agrees with matrix_add / matrix_scalar_mult.
// (All current callers pass m == n, so behavior is unchanged for them.)
void init(double *A, double *B, double *C, int m, int n) {

  RAJA::forall<forall_pol>(RAJA::RangeSegment(0, m * n),
    RAJA::Name("init"),
    [=] RAJA_HOST_DEVICE (RAJA::Index_type i) {
      A[i] = 1.0;
      B[i] = 1.0;
      C[i] = 0.0;
    });
}

void matrix_add(const double *A, const double *B, double *C, int m, int n) {

RAJA::forall<forall_pol>
(RAJA::RangeSegment(0, m * n), RAJA::Name("matrix_add"), [=] RAJA_HOST_DEVICE (RAJA::Index_type i) {
C[i] = A[i] + B[i];
});

}

void matrix_scalar_mult(const double *A, double *B, double scalar, int m, int n) {

RAJA::forall<forall_pol>
(RAJA::RangeSegment(0, m * n), RAJA::Name("matrix_scalar_mult"), [=] RAJA_HOST_DEVICE (RAJA::Index_type i) {
B[i] = scalar * A[i];
});
}

// Matrix multiply C = A * B, where A is m x n, B is n x p, and C is
// m x p.  Implemented with RAJA::launch so that different team/thread
// mappings can be compared under a profiler; the "#if 0" variants
// below are alternative launch configurations kept for experimentation
// (switch which one is "#if 1" to try them).
void matrix_multiply(const double *A, const double *B, double *C, int m, int n, int p) {

// A: m x n, B: n x p, C: m x p
// Row-major (layout_right) non-owning views over the raw pointers.
auto v_A = RAJA::make_permuted_view<RAJA::layout_right>(A, m, n);
auto v_B = RAJA::make_permuted_view<RAJA::layout_right>(B, n, p);
auto v_C = RAJA::make_permuted_view<RAJA::layout_right>(C, m, p);

// Active configuration: one team (CUDA block) per row of C, one
// thread per column, with "direct" policies (each index maps to
// exactly one block/thread).
// NOTE(review): this assumes p does not exceed the device's maximum
// threads per block (1024 for the cuda_exec policy above) -- confirm
// before using large p.
#if 1
const int threads = p;
const int teams = m;

RAJA::LaunchParams params{RAJA::Teams(teams), RAJA::Threads(threads)};

using loop1_pol = RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;
using loop0_pol = RAJA::LoopPolicy<RAJA::cuda_thread_x_direct>;
#endif

// Alternative: fixed 256-thread blocks with "loop" policies, which
// stride when the extent exceeds the number of blocks/threads.
#if 0
const int threads = 256;
const int teams = m;

RAJA::LaunchParams params{RAJA::Teams(teams), RAJA::Threads(threads)};

using loop1_pol = RAJA::LoopPolicy<RAJA::cuda_block_x_loop>;
using loop0_pol = RAJA::LoopPolicy<RAJA::cuda_thread_x_loop>;
#endif

// Alternative: 2D launch with 16x16 thread tiles; team counts are
// rounded up so every element of C is covered.
#if 0
const int threads = 16;
const int teams_x = (n - 1)/threads + 1;
const int teams_y = (m - 1)/threads + 1;

RAJA::LaunchParams params{RAJA::Teams(teams_x, teams_y), RAJA::Threads(threads, threads)};

using loop1_pol = RAJA::LoopPolicy<RAJA::cuda_global_thread_y>;
using loop0_pol = RAJA::LoopPolicy<RAJA::cuda_global_thread_x>;
#endif

RAJA::launch<launch_pol>
(params, RAJA::Name("matrix_multiply"),
[=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {

// Each (i, j) pair computes one dot product: row i of A with
// column j of B.
RAJA::loop<loop1_pol>(ctx, RAJA::RangeSegment(0, m), [&] (int i) {
RAJA::loop<loop0_pol> (ctx, RAJA::RangeSegment(0, p), [&] (int j) {

double dot =0.0;
for (int k = 0; k < n; k++) {
dot += v_A(i, k) * v_B(k, j);
}
v_C(i, j) = dot;
});
});
});
}


// Verify the result of matrix_multiply for the all-ones inputs used in
// main(): with A and B filled with 1.0, every entry of the n x n
// product C must equal exactly n (the sum of n products 1.0 * 1.0 is
// exact in double for any realistic n).  Returns true when all entries
// match.
//
// Improvement over the original: returns false on the first mismatch
// instead of scanning the entire matrix after a failure is known.
//
// NOTE(review): reads C on the host; this relies on the preceding
// kernels being synchronous (async == false above) and on C living in
// unified memory (the "UM" allocator in main) -- confirm if either
// changes.
bool check_matrix_multiply(const double *C, const int n)
{
  auto v_C = RAJA::make_permuted_view<RAJA::layout_right>(C, n, n);

  for (int r = 0; r < n; ++r) {
    for (int c = 0; c < n; ++c) {
      if (v_C(r, c) != n) {
        return false;  // fail fast: one bad entry is enough
      }
    }
  }
  return true;
}

// Driver: allocates three n x n matrices in CUDA unified memory, runs
// a sequence of named RAJA kernels (init, add, scalar-mult, multiply)
// to generate Caliper profile data, and verifies the multiply result.
//
// Usage: ./profile_raja N   (N is the matrix dimension, N > 0)
//
// Fixes over the original:
//  - rejects non-positive / non-numeric N (std::atoi returns 0 on
//    parse failure, so that case is caught too);
//  - computes the allocation size in std::size_t, avoiding int
//    overflow of n * n * sizeof(double) for large n;
//  - deallocates the buffers before throwing on verification failure,
//    so the error path does not leak the allocations.
int main(int argc, char* argv[])
{
  if (argc != 2) {
    throw std::runtime_error("usage ./main N -- where N is matrix size (N x N )");
  }

  const int n = std::atoi(argv[1]);
  if (n <= 0) {
    throw std::runtime_error("matrix size N must be a positive integer");
  }
  std::cout << "Using matrix size " << n << " x " << n << std::endl;

  // Unified memory ("UM") is accessible from both host and device,
  // which lets check_matrix_multiply read C on the host with no copy.
  auto& rm = umpire::ResourceManager::getInstance();
  auto allocator = rm.getAllocator("UM");

  const std::size_t bytes =
      static_cast<std::size_t>(n) * static_cast<std::size_t>(n) * sizeof(double);
  double* A = static_cast<double*>(allocator.allocate(bytes));
  double* B = static_cast<double*>(allocator.allocate(bytes));
  double* C = static_cast<double*>(allocator.allocate(bytes));

  init(A, B, C, n, n);

  matrix_add(A, B, C, n, n);

  // NOTE: overwrites the matrix_add result in C; each kernel here
  // exists to generate profile data, not to build on the previous one.
  matrix_scalar_mult(A, C, 2.0, n, n);

  matrix_multiply(A, B, C, n, n, n);

  const bool pass = check_matrix_multiply(C, n);

  // Release the buffers before reporting so the failure path below
  // does not leak them.
  allocator.deallocate(A);
  allocator.deallocate(B);
  allocator.deallocate(C);

  if (!pass) {
    throw std::runtime_error("matrix_multiply did not pass");
  }
  std::cout << "Matrix multiply passed" << std::endl;

  return 0;
}