Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,4 @@ endif()
add_subdirectory(tpl)
add_subdirectory(Intro_Tutorial)
add_subdirectory(Intermediate_Tutorial)
add_subdirectory(Profile_Demo)
6 changes: 6 additions & 0 deletions Profile_Demo/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Build the profiling demo only for CUDA builds: the demo's RAJA
# policies (cuda_exec / cuda_launch_t) require a CUDA-enabled RAJA.
if (ENABLE_CUDA)
# NOTE(review): caliper-plugin.cpp is #include'd by profile_raja.cpp,
# so it is intentionally absent from SOURCES.  Caliper headers are
# presumably provided transitively through the RAJA target (built with
# RAJA_ENABLE_CALIPER, see README) -- confirm; otherwise a caliper
# dependency is missing here.
blt_add_executable(
NAME profile_raja
SOURCES profile_raja.cpp
DEPENDS_ON RAJA umpire cuda)
endif()
9 changes: 9 additions & 0 deletions Profile_Demo/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Basic RAJA profiling with Caliper

Build Caliper with NVTX and CUPTI support enabled:

    cmake -DCMAKE_INSTALL_PREFIX=${caliper_path} -DWITH_NVTX=ON -DWITH_CUPTI=ON ../

Build RAJA with runtime plugins and Caliper support enabled, then build this demo:

    cmake -DENABLE_CUDA=ON -DRAJA_ENABLE_RUNTIME_PLUGINS=ON -DRAJA_ENABLE_CALIPER=ON -DCMAKE_CUDA_FLAGS="--expt-extended-lambda" -Dcaliper_DIR=${caliper_path} ../ && make profile_raja -j

TODO: Expand this README with instructions for running the demo and collecting Caliper profiles.
37 changes: 37 additions & 0 deletions Profile_Demo/caliper-plugin.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
// Copyright (c) 2016-25, Lawrence Livermore National Security, LLC
// and RAJA project contributors. See the RAJA/LICENSE file for details.
//
// SPDX-License-Identifier: (BSD-3-Clause)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//

#include "RAJA/util/PluginStrategy.hpp"

#include <iostream>
#include <caliper/cali.h>

// RAJA plugin that brackets every named RAJA kernel launch with a
// Caliper annotation region, so kernels show up by name in Caliper
// output (and, via NVTX, in tools such as Nsight Systems).
class CaliperPlugin : public RAJA::util::PluginStrategy
{
public:
  // Open a Caliper region named after the kernel just before launch.
  // Kernels launched without a RAJA::Name() are left unannotated.
  void preLaunch(const RAJA::util::PluginContext& ctx) override
  {
    if (!ctx.kernel_name.empty()) {
      CALI_MARK_BEGIN(ctx.kernel_name.c_str());
    }
  }

  // Close the matching Caliper region right after the launch returns.
  void postLaunch(const RAJA::util::PluginContext& ctx) override
  {
    if (!ctx.kernel_name.empty()) {
      CALI_MARK_END(ctx.kernel_name.c_str());
    }
  }
};

// Entry point used when this plugin is loaded dynamically at runtime
// by RAJA's plugin loader; ownership of the returned object passes to
// the caller.
extern "C" RAJA::util::PluginStrategy *RAJAGetPlugin()
{
return new CaliperPlugin;
}

// Registers the plugin during static initialization for builds that
// compile/link this translation unit in directly (the static path).
static RAJA::util::PluginRegistry::add<CaliperPlugin> P("Caliper", "Enables Caliper Profiling");
158 changes: 158 additions & 0 deletions Profile_Demo/profile_raja.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <stdexcept>

#include "RAJA/RAJA.hpp"
#include "umpire/Umpire.hpp"

#include "caliper-plugin.cpp"

// Launch synchronously so host code (e.g. check_matrix_multiply in
// main) can read kernel results without explicit synchronization.
const bool async = false;
// Execution policies shared by every kernel in this demo:
// CUDA forall with 1024-thread blocks, and RAJA::launch on CUDA.
using forall_pol = RAJA::cuda_exec<1024, async>;
using launch_pol = RAJA::LaunchPolicy<RAJA::cuda_launch_t<async>>;

// Initialize the m x n matrices: A and B to all ones, C to zeros.
// Runs as a named RAJA kernel so it is visible to the Caliper plugin.
//
// Fix: the original iterated over n * n elements, silently ignoring
// the m parameter; use m * n so the extent matches the documented
// m x n shape and agrees with matrix_add / matrix_scalar_mult.
// (All current callers pass m == n, so behavior is unchanged for them.)
void init(double *A, double *B, double *C, int m, int n) {

  RAJA::forall<forall_pol>(RAJA::RangeSegment(0, m * n),
    RAJA::Name("init"),
    [=] RAJA_HOST_DEVICE (RAJA::Index_type i) {
      A[i] = 1.0;
      B[i] = 1.0;
      C[i] = 0.0;
    });
}

void matrix_add(const double *A, const double *B, double *C, int m, int n) {

RAJA::forall<forall_pol>
(RAJA::RangeSegment(0, m * n), RAJA::Name("matrix_add"), [=] RAJA_HOST_DEVICE (RAJA::Index_type i) {
C[i] = A[i] + B[i];
});

}

void matrix_scalar_mult(const double *A, double *B, double scalar, int m, int n) {

RAJA::forall<forall_pol>
(RAJA::RangeSegment(0, m * n), RAJA::Name("matrix_scalar_mult"), [=] RAJA_HOST_DEVICE (RAJA::Index_type i) {
B[i] = scalar * A[i];
});
}

// Matrix multiply C = A * B, where A is m x n, B is n x p, and C is
// m x p.  Implemented with RAJA::launch so that different team/thread
// mappings can be compared under a profiler; the "#if 0" variants
// below are alternative launch configurations kept for experimentation
// (switch which one is "#if 1" to try them).
void matrix_multiply(const double *A, const double *B, double *C, int m, int n, int p) {

// A: m x n, B: n x p, C: m x p
// Row-major (layout_right) non-owning views over the raw pointers.
auto v_A = RAJA::make_permuted_view<RAJA::layout_right>(A, m, n);
auto v_B = RAJA::make_permuted_view<RAJA::layout_right>(B, n, p);
auto v_C = RAJA::make_permuted_view<RAJA::layout_right>(C, m, p);

// Active configuration: one team (CUDA block) per row of C, one
// thread per column, with "direct" policies (each index maps to
// exactly one block/thread).
// NOTE(review): this assumes p does not exceed the device's maximum
// threads per block (1024 for the cuda_exec policy above) -- confirm
// before using large p.
#if 1
const int threads = p;
const int teams = m;

RAJA::LaunchParams params{RAJA::Teams(teams), RAJA::Threads(threads)};

using loop1_pol = RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;
using loop0_pol = RAJA::LoopPolicy<RAJA::cuda_thread_x_direct>;
#endif

// Alternative: fixed 256-thread blocks with "loop" policies, which
// stride when the extent exceeds the number of blocks/threads.
#if 0
const int threads = 256;
const int teams = m;

RAJA::LaunchParams params{RAJA::Teams(teams), RAJA::Threads(threads)};

using loop1_pol = RAJA::LoopPolicy<RAJA::cuda_block_x_loop>;
using loop0_pol = RAJA::LoopPolicy<RAJA::cuda_thread_x_loop>;
#endif

// Alternative: 2D launch with 16x16 thread tiles; team counts are
// rounded up so every element of C is covered.
#if 0
const int threads = 16;
const int teams_x = (n - 1)/threads + 1;
const int teams_y = (m - 1)/threads + 1;

RAJA::LaunchParams params{RAJA::Teams(teams_x, teams_y), RAJA::Threads(threads, threads)};

using loop1_pol = RAJA::LoopPolicy<RAJA::cuda_global_thread_y>;
using loop0_pol = RAJA::LoopPolicy<RAJA::cuda_global_thread_x>;
#endif

RAJA::launch<launch_pol>
(params, RAJA::Name("matrix_multiply"),
[=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {

// Each (i, j) pair computes one dot product: row i of A with
// column j of B.
RAJA::loop<loop1_pol>(ctx, RAJA::RangeSegment(0, m), [&] (int i) {
RAJA::loop<loop0_pol> (ctx, RAJA::RangeSegment(0, p), [&] (int j) {

double dot =0.0;
for (int k = 0; k < n; k++) {
dot += v_A(i, k) * v_B(k, j);
}
v_C(i, j) = dot;
});
});
});
}


// Verify the result of matrix_multiply for the all-ones inputs used in
// main(): with A and B filled with 1.0, every entry of the n x n
// product C must equal exactly n (the sum of n products 1.0 * 1.0 is
// exact in double for any realistic n).  Returns true when all entries
// match.
//
// Improvement over the original: returns false on the first mismatch
// instead of scanning the entire matrix after a failure is known.
//
// NOTE(review): reads C on the host; this relies on the preceding
// kernels being synchronous (async == false above) and on C living in
// unified memory (the "UM" allocator in main) -- confirm if either
// changes.
bool check_matrix_multiply(const double *C, const int n)
{
  auto v_C = RAJA::make_permuted_view<RAJA::layout_right>(C, n, n);

  for (int r = 0; r < n; ++r) {
    for (int c = 0; c < n; ++c) {
      if (v_C(r, c) != n) {
        return false;  // fail fast: one bad entry is enough
      }
    }
  }
  return true;
}

// Driver: allocates three n x n matrices in CUDA unified memory, runs
// a sequence of named RAJA kernels (init, add, scalar-mult, multiply)
// to generate Caliper profile data, and verifies the multiply result.
//
// Usage: ./profile_raja N   (N is the matrix dimension, N > 0)
//
// Fixes over the original:
//  - rejects non-positive / non-numeric N (std::atoi returns 0 on
//    parse failure, so that case is caught too);
//  - computes the allocation size in std::size_t, avoiding int
//    overflow of n * n * sizeof(double) for large n;
//  - deallocates the buffers before throwing on verification failure,
//    so the error path does not leak the allocations.
int main(int argc, char* argv[])
{
  if (argc != 2) {
    throw std::runtime_error("usage ./main N -- where N is matrix size (N x N )");
  }

  const int n = std::atoi(argv[1]);
  if (n <= 0) {
    throw std::runtime_error("matrix size N must be a positive integer");
  }
  std::cout << "Using matrix size " << n << " x " << n << std::endl;

  // Unified memory ("UM") is accessible from both host and device,
  // which lets check_matrix_multiply read C on the host with no copy.
  auto& rm = umpire::ResourceManager::getInstance();
  auto allocator = rm.getAllocator("UM");

  const std::size_t bytes =
      static_cast<std::size_t>(n) * static_cast<std::size_t>(n) * sizeof(double);
  double* A = static_cast<double*>(allocator.allocate(bytes));
  double* B = static_cast<double*>(allocator.allocate(bytes));
  double* C = static_cast<double*>(allocator.allocate(bytes));

  init(A, B, C, n, n);

  matrix_add(A, B, C, n, n);

  // NOTE: overwrites the matrix_add result in C; each kernel here
  // exists to generate profile data, not to build on the previous one.
  matrix_scalar_mult(A, C, 2.0, n, n);

  matrix_multiply(A, B, C, n, n, n);

  const bool pass = check_matrix_multiply(C, n);

  // Release the buffers before reporting so the failure path below
  // does not leak them.
  allocator.deallocate(A);
  allocator.deallocate(B);
  allocator.deallocate(C);

  if (!pass) {
    throw std::runtime_error("matrix_multiply did not pass");
  }
  std::cout << "Matrix multiply passed" << std::endl;

  return 0;
}