Skip to content

Commit 58d263c

Browse files
committed
#10: wip: add dpotrf benchmark and refactor to run sensors before and after each benchmark
1 parent e3efcaf commit 58d263c

File tree

3 files changed

+234
-84
lines changed

3 files changed

+234
-84
lines changed

src/benchmarks.cc

Lines changed: 195 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,7 @@
11
#include <mkl.h>
2+
#include <mpi.h>
23
#include <Kokkos_Random.hpp>
34

4-
enum benchmarks {
5-
level1,
6-
level2,
7-
level3,
8-
dpotrf,
9-
num_benchmarks
10-
};
11-
12-
template <typename T>
13-
std::string typeToString();
145

156
template <>
167
std::string typeToString<double>() {
@@ -22,6 +13,16 @@ std::string typeToString<std::complex<double>>() {
2213
return "complex";
2314
}
2415

16+
std::string benchmarkToString(const benchmarks& b) {
17+
switch (b) {
18+
case level1: return "level1";
19+
case level2: return "level2";
20+
case level3: return "level3";
21+
case dpotrf: return "dpotrf";
22+
default: return "unknown";
23+
}
24+
}
25+
2526
template <typename T>
2627
std::tuple<std::vector<double>, double> runBenchmarkLevel1(int N, int iters) {
2728
std::cout << "-- level 1 benchmark [" << typeToString<T>() << "] -- " << std::endl;
@@ -139,12 +140,127 @@ std::tuple<std::vector<double>, double> runBenchmarkLevel3() {
139140
N, // shared inner dimension (columns of A, rows of B)
140141
1.0, // alpha
141142
A_ptr, // matrix A pointer
142-
N, // leading dimension of A (because A is M×N)
143+
N, // leading dimension of A (A is M×N)
143144
B_ptr, // matrix B pointer
144-
K, // leading dimension of B (because B is N×K)
145+
K, // leading dimension of B (B is N×K)
145146
0.0, // beta
146147
C_ptr, // matrix C pointer
147-
K); // leading dimension of C (because C is M×K)
148+
K); // leading dimension of C (C is M×K)
149+
Kokkos::fence();
150+
151+
// Skip the first iteration
152+
if (i > 0) {
153+
double time = timer.seconds();
154+
total_time += time;
155+
iter_timings.push_back(time);
156+
}
157+
}
158+
159+
int rank = -1;
160+
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
161+
162+
std::cout << "rank: " << rank << ", total_time=" << total_time << std::endl;
163+
164+
return std::make_tuple(iter_timings, total_time);
165+
}
166+
167+
template <typename T>
168+
std::tuple<std::vector<double>, double> runBenchmarkLevel3() {
169+
std::cout << "-- level 3 benchmark [" << typeToString<T>() << "] -- " << std::endl;
170+
Kokkos::View<T**> A("A", M, N);
171+
Kokkos::View<T**> B("B", N, K);
172+
Kokkos::View<T**> C("C", M, K);
173+
174+
T* A_ptr = A.data();
175+
T* B_ptr = B.data();
176+
T* C_ptr = C.data();
177+
178+
Kokkos::Random_XorShift64_Pool pool(123);
179+
Kokkos::fill_random(A, pool, 10.0);
180+
Kokkos::fill_random(B, pool, 10.0);
181+
182+
std::vector<double> iter_timings;
183+
184+
double total_time = 0.0;
185+
186+
MPI_Barrier(MPI_COMM_WORLD);
187+
188+
for (int i = 0; i < iters; i++) {
189+
Kokkos::Timer timer;
190+
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
191+
M, // number of rows in C (and A)
192+
K, // number of columns in C (and B)
193+
N, // shared inner dimension (columns of A, rows of B)
194+
1.0, // alpha
195+
A_ptr, // matrix A pointer
196+
N, // leading dimension of A (A is M×N)
197+
B_ptr, // matrix B pointer
198+
K, // leading dimension of B (B is N×K)
199+
0.0, // beta
200+
C_ptr, // matrix C pointer
201+
K); // leading dimension of C (C is M×K)
202+
Kokkos::fence();
203+
204+
// Skip the first iteration
205+
if (i > 0) {
206+
double time = timer.seconds();
207+
total_time += time;
208+
iter_timings.push_back(time);
209+
}
210+
}
211+
212+
int rank = -1;
213+
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
214+
215+
std::cout << "rank: " << rank << ", total_time=" << total_time << std::endl;
216+
217+
return std::make_tuple(iter_timings, total_time);
218+
}
219+
Assistant
220+
221+
Certainly! Below is an implementation of a benchmark function that performs the MKL DPOTRF operation, which is used for Cholesky factorization of a symmetric positive definite matrix. This benchmark follows the same style as your provided runBenchmarkLevel3 function.
222+
223+
#include <iostream>
224+
#include <vector>
225+
#include <tuple>
226+
#include <mkl.h>
227+
#include <Kokkos_Core.hpp>
228+
#include <mpi.h>
229+
230+
template <typename T>
231+
std::tuple<std::vector<double>, double> runBenchmarkDPOTRF(int N, int iters) {
232+
std::cout << "-- dpotrf benchmark [" << typeToString<T>() << "] -- " << std::endl;
233+
234+
// Define matrix size
235+
Kokkos::View<T**> A("A", N, N);
236+
237+
// Fill matrix A with random values
238+
Kokkos::Random_XorShift64_Pool pool(123);
239+
Kokkos::fill_random(A, pool, 10.0);
240+
241+
// Make A symmetric positive definite
242+
Kokkos::parallel_for("MakeSPD", N, KOKKOS_LAMBDA(int i) {
243+
for (int j = 0; j < N; j++) {
244+
A(i, j) = A(i, j) + A(j, i); // Symmetrize
245+
}
246+
});
247+
Kokkos::fence();
248+
249+
// Prepare for MKL DPOTRF
250+
T* A_ptr = A.data();
251+
std::vector<double> iter_timings;
252+
double total_time = 0.0;
253+
254+
MPI_Barrier(MPI_COMM_WORLD);
255+
256+
// Perform the benchmark iterations
257+
for (int i = 0; i < iters; i++) {
258+
Kokkos::Timer timer;
259+
260+
// Call MKL DPOTRF
261+
int info;
262+
dpotrf_("L", &N, A_ptr, &N, &info); // 'L' for lower triangular
263+
148264
Kokkos::fence();
149265

150266
// Skip the first iteration
@@ -167,14 +283,77 @@ template <typename T>
167283
std::tuple<std::vector<double>, double> runBenchmark(benchmarks type, int M, int N, int K, int iters) {
168284
switch (type) {
169285
case level1:
170-
return runBenchmarkLevel1<T>(M, N, K, iters);
286+
return runBenchmarkLevel1<T>(N, iters);
171287
case level2:
172-
return runBenchmarkLevel2<T>(M, N, K, iters);
288+
return runBenchmarkLevel2<T>(M, N, iters);
173289
case level3:
174290
return runBenchmarkLevel3<T>(M, N, K, iters);
175291
case dpotrf:
176-
return runBenchmarkDPOTRF<T>(M, N, K, iters);
292+
return runBenchmarkDPOTRF<T>(N, iters);
177293
default:
178294
throw std::invalid_argument("Unsupported benchmark type");
179295
}
180296
}
297+
298+
void reduceAndPrintBenchmarkOutput(
299+
std::vector<double> iter_timings,
300+
double total_time,
301+
std::string benchmark)
302+
{
303+
int rank = -1;
304+
int num_ranks = 0;
305+
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
306+
MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);
307+
308+
char processor_name[MPI_MAX_PROCESSOR_NAME];
309+
int name_len;
310+
MPI_Get_processor_name(processor_name, &name_len);
311+
312+
std::vector<double> all_times;
313+
all_times.resize(num_ranks);
314+
315+
std::vector<double> all_iter_times;
316+
all_iter_times.resize(num_ranks * iters);
317+
318+
std::vector<char> all_processor_names;
319+
all_processor_names.resize(num_ranks * MPI_MAX_PROCESSOR_NAME);
320+
321+
if (rank == 0) {
322+
std::cout << "num_ranks: " << num_ranks << std::endl;
323+
}
324+
325+
MPI_Gather(
326+
&total_time, 1, MPI_DOUBLE,
327+
&all_times[0], 1, MPI_DOUBLE, 0,
328+
MPI_COMM_WORLD
329+
);
330+
331+
MPI_Gather(
332+
&iter_timings[0], iters, MPI_DOUBLE,
333+
&all_iter_times[0], iters, MPI_DOUBLE, 0,
334+
MPI_COMM_WORLD
335+
);
336+
337+
MPI_Gather(
338+
&processor_name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR,
339+
&all_processor_names[0], MPI_MAX_PROCESSOR_NAME, MPI_CHAR, 0,
340+
MPI_COMM_WORLD
341+
);
342+
343+
if (rank == 0) {
344+
int cur_rank = 0;
345+
int cur = 0;
346+
std::cout << "=== " << benchmark << " ===" << std::endl;
347+
for (auto&& time : all_times) {
348+
std::cout << "gather: " << cur_rank << " ("
349+
<< std::string(&all_processor_names[cur_rank * MPI_MAX_PROCESSOR_NAME])
350+
<< "): " << time << ": breakdown: ";
351+
for (int i = cur; i < iters + cur; i++) {
352+
std::cout << all_iter_times[i] << " ";
353+
}
354+
std::cout << std::endl;
355+
cur += iters;
356+
cur_rank++;
357+
}
358+
}
359+
}

src/benchmarks.h

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,35 @@
1+
#include <string>
2+
3+
enum benchmarks {
4+
level1,
5+
level2,
6+
level3,
7+
dpotrf,
8+
num_benchmarks
9+
};
10+
11+
template <typename T>
12+
std::string typeToString();
13+
14+
std::string benchmarkToString(const benchmarks& b);
115

216
template <typename T>
3-
std::tuple<std::vector<double>, double> runBenchmarkLevel3();
17+
std::tuple<std::vector<double>, double> runBenchmarkLevel1(int N, int iters);
18+
19+
template <typename T>
20+
std::tuple<std::vector<double>, double> runBenchmarkLevel2(int M, int N, int iters);
21+
22+
template <typename T>
23+
std::tuple<std::vector<double>, double> runBenchmarkLevel3(int M, int N, int K, int iters);
24+
25+
template <typename T>
26+
std::tuple<std::vector<double>, double> runBenchmarkDPOTRF(int N, int iters);
27+
28+
template <typename T>
29+
std::tuple<std::vector<double>, double> runBenchmark(
30+
benchmarks type, int M, int N, int K, int iters);
31+
32+
void reduceAndPrintBenchmarkOutput(
33+
std::vector<double> iter_timings,
34+
double total_time,
35+
std::string benchmark);

src/slow_node.cc

Lines changed: 6 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
#include "sensors.h"
33
#include "benchmarks.h"
44

5-
#include <mkl.h>
65
#include <Kokkos_Random.hpp>
76

87
#include <iostream>
@@ -22,74 +21,14 @@ int main(int argc, char** argv) {
2221
MPI_Init(&argc, &argv);
2322
Kokkos::initialize(argc, argv);
2423

25-
int rank = -1;
26-
int num_ranks = 0;
27-
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
28-
MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);
29-
30-
char processor_name[MPI_MAX_PROCESSOR_NAME];
31-
int name_len;
32-
MPI_Get_processor_name(processor_name, &name_len);
33-
34-
sensors::runSensorsAndReduceOutput(processor_name, "pre");
35-
36-
// Simple for loop just for now
24+
// Loop through all available benchmarks
3725
for (int i=0; i < benchmarks::num_benchmarks; i++) {
3826
auto benchmark_type = static_cast<benchmarks>(i);
39-
try {
40-
auto const& [iter_timings, total_time] results = runBenchmark<T>(benchmark_type, M, N, K, iters);
41-
} catch (const std::invalid_argument& e) {
42-
std::cerr << "Error running benchmark type " << i << ": " << e.what() << std::endl;
43-
}
44-
}
45-
// auto const& [iter_timings, total_time] = runBenchmark<double>(level3, M, N, K, iters);
46-
sensors::runSensorsAndReduceOutput(processor_name, "post");
47-
48-
std::vector<double> all_times;
49-
all_times.resize(num_ranks);
50-
51-
std::vector<double> all_iter_times;
52-
all_iter_times.resize(num_ranks * iters);
53-
54-
std::vector<char> all_processor_names;
55-
all_processor_names.resize(num_ranks * MPI_MAX_PROCESSOR_NAME);
56-
57-
if (rank == 0) {
58-
std::cout << "num_ranks: " << num_ranks << std::endl;
59-
}
60-
61-
MPI_Gather(
62-
&total_time, 1, MPI_DOUBLE,
63-
&all_times[0], 1, MPI_DOUBLE, 0,
64-
MPI_COMM_WORLD
65-
);
66-
67-
MPI_Gather(
68-
&iter_timings[0], iters, MPI_DOUBLE,
69-
&all_iter_times[0], iters, MPI_DOUBLE, 0,
70-
MPI_COMM_WORLD
71-
);
72-
73-
MPI_Gather(
74-
&processor_name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR,
75-
&all_processor_names[0], MPI_MAX_PROCESSOR_NAME, MPI_CHAR, 0,
76-
MPI_COMM_WORLD
77-
);
78-
79-
if (rank == 0) {
80-
int cur_rank = 0;
81-
int cur = 0;
82-
for (auto&& time : all_times) {
83-
std::cout << "gather: " << cur_rank << " ("
84-
<< std::string(&all_processor_names[cur_rank * MPI_MAX_PROCESSOR_NAME])
85-
<< "): " << time << ": breakdown: ";
86-
for (int i = cur; i < iters + cur; i++) {
87-
std::cout << all_iter_times[i] << " ";
88-
}
89-
std::cout << std::endl;
90-
cur += iters;
91-
cur_rank++;
92-
}
27+
std::string benchmark_str = benchmarkToString(benchmark_type) + "_double";
28+
sensors::runSensorsAndReduceOutput(processor_name, "pre_" + benchmark_str);
29+
auto const& [iter_timings, total_time] = runBenchmark<double>(benchmark_type, M, N, K, iters);
30+
sensors::runSensorsAndReduceOutput(processor_name, "post_" + benchmark_str);
31+
reduceAndPrintBenchmarkOutput(iter_timings, total_time, benchmark_str);
9332
}
9433

9534
Kokkos::finalize();

0 commit comments

Comments
 (0)