
Commit 2c4a60a

Merge pull request #42 from LLNL/bowen/launch-kernel
Lesson 10: Launch - Matrix transpose
2 parents: ccd5eb0 + 866411d

16 files changed

Lines changed: 251 additions & 452 deletions

Intro_Tutorial/lessons/10_raja_kernel/10_raja_kernel.cpp

Lines changed: 0 additions & 54 deletions
This file was deleted.

Intro_Tutorial/lessons/10_raja_kernel/CMakeLists.txt

Lines changed: 0 additions & 4 deletions
This file was deleted.

Intro_Tutorial/lessons/10_raja_kernel/README.md

Lines changed: 0 additions & 73 deletions
This file was deleted.

Intro_Tutorial/lessons/10_raja_kernel/solution/10_raja_kernel.cpp

Lines changed: 0 additions & 67 deletions
This file was deleted.
Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
#include <iostream>

#include "RAJA/RAJA.hpp"
#include "umpire/Umpire.hpp"
#include "umpire/strategy/QuickPool.hpp"

template<typename U, typename V>
void check_solution(U &A, V &A_t, const int M, const int N);

//TODO: uncomment this in order to build!
//#define COMPILE

int main()
{
#if defined(COMPILE)
  constexpr int N{10000};
  constexpr int M{7000};
  double* h_a{nullptr};
  double* h_a_t{nullptr};
  double* d_a{nullptr};
  double* d_a_t{nullptr};

  auto& rm = umpire::ResourceManager::getInstance();

  auto device_allocator = rm.getAllocator("DEVICE");
  auto host_allocator = rm.getAllocator("HOST");

  d_a = static_cast<double *>(device_allocator.allocate(N*M*sizeof(double)));
  d_a_t = static_cast<double *>(device_allocator.allocate(N*M*sizeof(double)));
  h_a = static_cast<double *>(host_allocator.allocate(N*M*sizeof(double)));
  h_a_t = static_cast<double *>(host_allocator.allocate(N*M*sizeof(double)));

  // Row-major (layout_right) views: A is M x N, its transpose A_t is N x M.
  auto h_A = RAJA::make_permuted_view<RAJA::layout_right>(h_a, M, N);
  auto h_A_t = RAJA::make_permuted_view<RAJA::layout_right>(h_a_t, N, M);

  // Initialize data
  for(int row = 0; row < M; ++row) {
    for(int col = 0; col < N; ++col) {
      h_A(row, col) = col + N * row;
    }
  }

  rm.copy(d_a, h_a, N*M*sizeof(double));
  rm.copy(d_a_t, h_a_t, N*M*sizeof(double));

  auto d_A = RAJA::make_permuted_view<RAJA::layout_right>(d_a, M, N);
  auto d_A_t = RAJA::make_permuted_view<RAJA::layout_right>(d_a_t, N, M);

  // Ceiling division: enough team_size x team_size teams to cover M x N.
  constexpr int team_size = 16;
  const int teams_x = (M - 1) / team_size + 1;
  const int teams_y = (N - 1) / team_size + 1;

  const bool async = false;
  using EXEC_POL =
    RAJA::LaunchPolicy<RAJA::cuda_launch_t<async>>;
  using outer_loop = RAJA::LoopPolicy</*Construct the CUDA x global index policy*/>;
  using inner_loop = RAJA::LoopPolicy</*Construct the CUDA y global index policy*/>;

  RAJA::launch<EXEC_POL>(
    RAJA::LaunchParams(RAJA::Teams(teams_x, teams_y), RAJA::Threads(team_size, team_size)),
    [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
      RAJA::loop<outer_loop>(ctx, RAJA::TypedRangeSegment<int>(0,M), [&] (int row) {
        RAJA::loop<inner_loop>(ctx, RAJA::TypedRangeSegment<int>(0,N), [&] (int col) {
          d_A_t(col, row) = d_A(row, col);
        });
      });
    });

  rm.copy(h_a, d_a, N*M*sizeof(double));
  rm.copy(h_a_t, d_a_t, N*M*sizeof(double));

  check_solution(h_A, h_A_t, M, N);

  device_allocator.deallocate(d_a);
  device_allocator.deallocate(d_a_t);
  host_allocator.deallocate(h_a);
  host_allocator.deallocate(h_a_t);

#endif //COMPILE

  return 0;
}

template<typename U, typename V>
void check_solution(U &A, V &A_t, const int M, const int N)
{
  bool pass = true;

  for(int row = 0; row < M; ++row) {
    for(int col = 0; col < N; ++col) {
      if(A(row, col) != A_t(col, row)) {
        pass = false;
      }
    }
  }

  if(pass) {
    std::cout << "SUCCESS! Matrix transpose passed" << std::endl;
  } else {
    std::cout << "Error! Matrix transpose did not pass" << std::endl;
  }
}
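
For reference, one way to complete the two LoopPolicy TODOs is with RAJA's CUDA global-index policies, which give each loop iterate a unique global thread index computed across teams. This is a sketch rather than the lesson's shipped solution (the solution file is not shown in this diff), and the unqualified policy names assume a RAJA release where the launch API lives at namespace scope; older releases spell them RAJA::expt::cuda_global_thread_x and similar.

// Hypothetical completion of the policy TODOs (a sketch; assumes the
// RAJA::cuda_global_thread_x/_y policies are available unqualified).
using outer_loop = RAJA::LoopPolicy<RAJA::cuda_global_thread_x>; // rows: x spans teams_x * team_size >= M
using inner_loop = RAJA::LoopPolicy<RAJA::cuda_global_thread_y>; // cols: y spans teams_y * team_size >= N

Because RAJA::LaunchPolicy also accepts a host policy (for example RAJA::seq_launch_t, paired with RAJA::seq_exec loop policies), the same launch region can be retargeted to the CPU without changing the loop body.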
