1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -26,3 +26,4 @@ endif()
add_subdirectory(tpl)
add_subdirectory(Intro_Tutorial)
add_subdirectory(Intermediate_Tutorial)
add_subdirectory(Profile_Demo)
6 changes: 6 additions & 0 deletions Profile_Demo/CMakeLists.txt
@@ -0,0 +1,6 @@
if (ENABLE_CUDA)
blt_add_executable(
NAME profile_raja
SOURCES profile_raja.cpp
DEPENDS_ON RAJA umpire cuda)
endif()
23 changes: 23 additions & 0 deletions Profile_Demo/README.md
@@ -0,0 +1,23 @@
# Basic RAJA profiling with Caliper

In this example, we explore profiling RAJA kernels using the Caliper library developed at LLNL.
Below are example build commands you can use to configure Caliper and RAJA for profiling on NVIDIA GPUs.

Building Caliper on an NVIDIA platform:
``cmake -DCMAKE_INSTALL_PREFIX=${caliper_path} -DWITH_NVTX=ON -DWITH_CUPTI=ON ../ && make install -j``

Building RAJA:
``cmake -DENABLE_CUDA=ON -DRAJA_ENABLE_RUNTIME_PLUGINS=ON -DRAJA_ENABLE_CALIPER=ON -Dcaliper_DIR=${caliper_path}/share/cmake/caliper -DCMAKE_CUDA_FLAGS="--expt-extended-lambda" ../ && make profile_raja -j``

Once the suite is built, you can invoke the following command to profile a set of basic linear algebra kernels:

``CALI_CONFIG=runtime-report ./bin/profile_raja 1024``
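
Caliper ships with other built-in measurement configurations as well. For example, since the Caliper build above enables CUPTI, the `cuda-activity-report` config should additionally report time spent in CUDA activities:

``CALI_CONFIG=cuda-activity-report ./bin/profile_raja 1024``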

This example provides three different kernel policies for the matrix multiply, allowing you to observe runtime performance differences between them.
To switch policies, uncomment the desired `#define` at the top of `profile_raja.cpp`, as shown below.
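
For example, selecting the loop policy looks like:

```cpp
//#define DIRECT_POLICY
#define LOOP_POLICY
//#define GLOBAL_POLICY
```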

For more information on Caliper, see the following pages:

- [RAJA-Caliper Quick Start Documentation](https://raja.readthedocs.io/en/develop/sphinx/user_guide/profiling_with_caliper.html)
- [Caliper GitHub](https://github.com/LLNL/Caliper)
- [Caliper Documentation](https://software.llnl.gov/Caliper/)
37 changes: 37 additions & 0 deletions Profile_Demo/caliper-plugin.cpp
@@ -0,0 +1,37 @@
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
// Copyright (c) 2016-25, Lawrence Livermore National Security, LLC
// and RAJA project contributors. See the RAJA/LICENSE file for details.
//
// SPDX-License-Identifier: (BSD-3-Clause)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//

#include "RAJA/util/PluginStrategy.hpp"

#include <iostream>
#include <caliper/cali.h>

class CaliperPlugin : public RAJA::util::PluginStrategy
{
public:
// Open a Caliper region named after the kernel's RAJA::Name annotation.
void preLaunch(const RAJA::util::PluginContext& p) override
{
if (!p.kernel_name.empty()) CALI_MARK_BEGIN(p.kernel_name.c_str());
}

// Close the matching Caliper region after the kernel completes.
void postLaunch(const RAJA::util::PluginContext& p) override
{
if (!p.kernel_name.empty()) CALI_MARK_END(p.kernel_name.c_str());
}

};

// Entry point used when the plugin is loaded dynamically at runtime.
extern "C" RAJA::util::PluginStrategy *RAJAGetPlugin()
{
return new CaliperPlugin;
}

// Registers the plugin statically when this file is compiled into the application.
static RAJA::util::PluginRegistry::add<CaliperPlugin> P("Caliper", "Enables Caliper Profiling");
165 changes: 165 additions & 0 deletions Profile_Demo/profile_raja.cpp
@@ -0,0 +1,165 @@
#include <stdexcept>
#include <iostream>

#include "RAJA/RAJA.hpp"
#include "umpire/Umpire.hpp"

// Compile the Caliper plugin into this executable so its static
// registration with the RAJA plugin registry takes effect.
#include "caliper-plugin.cpp"

// Uncomment exactly one policy selection below.

#define DIRECT_POLICY
//#define LOOP_POLICY
//#define GLOBAL_POLICY

constexpr int max_threads = 1024;
constexpr bool async = false;
using forall_pol = RAJA::cuda_exec<max_threads, async>;
using launch_pol = RAJA::LaunchPolicy<RAJA::cuda_launch_t<async>>;

void init(double *A, double *B, double *C, int m, int n) {

RAJA::forall<forall_pol>(RAJA::RangeSegment(0, m * n),
RAJA::Name("init"),
[=] RAJA_HOST_DEVICE (RAJA::Index_type i) {
A[i] = 1.0;
B[i] = 1.0;
C[i] = 0.0;
});
}

void matrix_add(const double *A, const double *B, double *C, int m, int n) {

RAJA::forall<forall_pol>
(RAJA::RangeSegment(0, m * n), RAJA::Name("matrix_add"), [=] RAJA_HOST_DEVICE (RAJA::Index_type i) {
C[i] = A[i] + B[i];
});

}

void matrix_scalar_mult(const double *A, double *B, double scalar, int m, int n) {

RAJA::forall<forall_pol>
(RAJA::RangeSegment(0, m * n), RAJA::Name("matrix_scalar_mult"), [=] RAJA_HOST_DEVICE (RAJA::Index_type i) {
B[i] = scalar * A[i];
});
}

void matrix_multiply(const double *A, const double *B, double *C, int m, int n, int p) {

// A: m x n, B: n x p, C: m x p
auto v_A = RAJA::make_permuted_view<RAJA::layout_right>(A, m, n);
auto v_B = RAJA::make_permuted_view<RAJA::layout_right>(B, n, p);
auto v_C = RAJA::make_permuted_view<RAJA::layout_right>(C, m, p);

#if defined(DIRECT_POLICY)
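// Direct mapping: one CUDA block per row (i) and one thread per column (j).
// Requires p <= 1024, the CUDA limit on threads per block.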
const int threads = p;
const int teams = m;

RAJA::LaunchParams params{RAJA::Teams(teams), RAJA::Threads(threads)};

using loop1_pol = RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;
using loop0_pol = RAJA::LoopPolicy<RAJA::cuda_thread_x_direct>;
#endif

#if defined(LOOP_POLICY)
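// Loop mapping: 256 threads per block stride over the p columns,
// so the thread count does not need to match the loop extent.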
const int threads = 256;
const int teams = m;

RAJA::LaunchParams params{RAJA::Teams(teams), RAJA::Threads(threads)};

using loop1_pol = RAJA::LoopPolicy<RAJA::cuda_block_x_loop>;
using loop0_pol = RAJA::LoopPolicy<RAJA::cuda_thread_x_loop>;
#endif

#if defined(GLOBAL_POLICY)
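// Global-thread mapping: a 2D grid of 16x16 blocks covers the output matrix,
// with team counts rounded up so every (i, j) entry gets a thread.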
const int threads = 16;
const int teams_x = (p - 1)/threads + 1;
const int teams_y = (m - 1)/threads + 1;

RAJA::LaunchParams params{RAJA::Teams(teams_x, teams_y), RAJA::Threads(threads, threads)};

using loop1_pol = RAJA::LoopPolicy<RAJA::cuda_global_thread_y>;
using loop0_pol = RAJA::LoopPolicy<RAJA::cuda_global_thread_x>;
#endif

RAJA::launch<launch_pol>
(params, RAJA::Name("matrix_multiply"),
[=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {

RAJA::loop<loop1_pol>(ctx, RAJA::RangeSegment(0, m), [&] (int i) {
RAJA::loop<loop0_pol>(ctx, RAJA::RangeSegment(0, p), [&] (int j) {

double dot = 0.0;
for (int k = 0; k < n; k++) {
dot += v_A(i, k) * v_B(k, j);
}
v_C(i, j) = dot;
});
});
});
}


bool check_matrix_multiply(const double *C, const int n)
{
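// A and B are initialized to all ones, so every entry of C = A * B must equal n.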

bool pass = true;
auto v_C = RAJA::make_permuted_view<RAJA::layout_right>(C, n, n);

for(int r=0; r<n; ++r) {
for(int c=0; c<n; ++c) {

if(v_C(r, c) != n) {
pass = false;
}
}
}
return pass;
}

int main(int argc, char* argv[])
{

if(argc != 2) {
throw std::runtime_error("Usage: ./profile_raja N -- where N is the matrix size (N x N)");
}

int n = std::atoi(argv[1]);
std::cout<<"Using matrix size "<<n<<" x "<<n<<std::endl;

double* A{nullptr};
double* B{nullptr};
double* C{nullptr};

// Umpire's "UM" allocator returns CUDA unified memory, accessible from both host and device.
auto& rm = umpire::ResourceManager::getInstance();
auto allocator = rm.getAllocator("UM");

A = static_cast<double*>(allocator.allocate(n*n*sizeof(double)));
B = static_cast<double*>(allocator.allocate(n*n*sizeof(double)));
C = static_cast<double*>(allocator.allocate(n*n*sizeof(double)));

init(A, B, C, n, n);

matrix_add(A, B, C, n, n);

matrix_scalar_mult(A, C, 2.0, n, n);

matrix_multiply(A, B, C, n, n, n);

bool pass = check_matrix_multiply(C, n);

if(!pass) {
throw std::runtime_error("matrix_multiply did not pass");
}

std::cout<<"Matrix multiply passed"<<std::endl;

allocator.deallocate(A);
allocator.deallocate(B);
allocator.deallocate(C);


return 0;
}