11#include < mkl.h>
2+ #include < mpi.h>
23#include < Kokkos_Random.hpp>
34
// Benchmark kinds selectable at runtime.
enum benchmarks {
  level1,         // BLAS level-1 (vector-vector) benchmark
  level2,         // BLAS level-2 (matrix-vector) benchmark
  level3,         // BLAS level-3 (matrix-matrix GEMM) benchmark
  dpotrf,         // LAPACK Cholesky factorization benchmark
  num_benchmarks  // sentinel: number of benchmark kinds, keep last
};
11-
// Returns a short printable name for the scalar type T.
// Only the explicit specializations below (double, std::complex<double>)
// are defined; instantiating for any other T is a link error by design.
template <typename T>
std::string typeToString();
145
156template <>
167std::string typeToString<double >() {
@@ -22,6 +13,16 @@ std::string typeToString<std::complex<double>>() {
2213 return " complex" ;
2314}
2415
16+ std::string benchmarkToString (const benchmarks& b) {
17+ switch (b) {
18+ case level1: return " level1" ;
19+ case level2: return " level2" ;
20+ case level3: return " level3" ;
21+ case dpotrf: return " dpotrf" ;
22+ default : return " unknown" ;
23+ }
24+ }
25+
2526template <typename T>
2627std::tuple<std::vector<double >, double > runBenchmarkLevel1 (int N, int iters) {
2728 std::cout << " -- level 1 benchmark [" << typeToString<T>() << " ] -- " << std::endl;
@@ -139,12 +140,127 @@ std::tuple<std::vector<double>, double> runBenchmarkLevel3() {
139140 N, // shared inner dimension (columns of A, rows of B)
140141 1.0 , // alpha
141142 A_ptr, // matrix A pointer
142- N, // leading dimension of A (because A is M×N)
143+ N, // leading dimension of A (A is M×N)
143144 B_ptr, // matrix B pointer
144- K, // leading dimension of B (because B is N×K)
145+ K, // leading dimension of B (B is N×K)
145146 0.0 , // beta
146147 C_ptr, // matrix C pointer
147- K); // leading dimension of C (because C is M×K)
148+ K); // leading dimension of C (C is M×K)
149+ Kokkos::fence ();
150+
151+ // Skip the first iteration
152+ if (i > 0 ) {
153+ double time = timer.seconds ();
154+ total_time += time;
155+ iter_timings.push_back (time);
156+ }
157+ }
158+
159+ int rank = -1 ;
160+ MPI_Comm_rank (MPI_COMM_WORLD, &rank);
161+
162+ std::cout << " rank: " << rank << " , total_time=" << total_time << std::endl;
163+
164+ return std::make_tuple (iter_timings, total_time);
165+ }
166+
167+ template <typename T>
168+ std::tuple<std::vector<double >, double > runBenchmarkLevel3 () {
169+ std::cout << " -- level 3 benchmark [" << typeToString<T>() << " ] -- " << std::endl;
170+ Kokkos::View<T**> A (" A" , M, N);
171+ Kokkos::View<T**> B (" B" , N, K);
172+ Kokkos::View<T**> C (" C" , M, K);
173+
174+ T* A_ptr = A.data ();
175+ T* B_ptr = B.data ();
176+ T* C_ptr = C.data ();
177+
178+ Kokkos::Random_XorShift64_Pool pool (123 );
179+ Kokkos::fill_random (A, pool, 10.0 );
180+ Kokkos::fill_random (B, pool, 10.0 );
181+
182+ std::vector<double > iter_timings;
183+
184+ double total_time = 0.0 ;
185+
186+ MPI_Barrier (MPI_COMM_WORLD);
187+
188+ for (int i = 0 ; i < iters; i++) {
189+ Kokkos::Timer timer;
190+ cblas_dgemm (CblasRowMajor, CblasNoTrans, CblasNoTrans,
191+ M, // number of rows in C (and A)
192+ K, // number of columns in C (and B)
193+ N, // shared inner dimension (columns of A, rows of B)
194+ 1.0 , // alpha
195+ A_ptr, // matrix A pointer
196+ N, // leading dimension of A (A is M×N)
197+ B_ptr, // matrix B pointer
198+ K, // leading dimension of B (B is N×K)
199+ 0.0 , // beta
200+ C_ptr, // matrix C pointer
201+ K); // leading dimension of C (C is M×K)
202+ Kokkos::fence ();
203+
204+ // Skip the first iteration
205+ if (i > 0 ) {
206+ double time = timer.seconds ();
207+ total_time += time;
208+ iter_timings.push_back (time);
209+ }
210+ }
211+
212+ int rank = -1 ;
213+ MPI_Comm_rank (MPI_COMM_WORLD, &rank);
214+
215+ std::cout << " rank: " << rank << " , total_time=" << total_time << std::endl;
216+
217+ return std::make_tuple (iter_timings, total_time);
218+ }
// ---------------------------------------------------------------------------
// MKL DPOTRF benchmark: Cholesky factorization of a symmetric positive
// definite matrix, written in the same style as runBenchmarkLevel3.
// ---------------------------------------------------------------------------
223+ #include < iostream>
224+ #include < vector>
225+ #include < tuple>
226+ #include < mkl.h>
227+ #include < Kokkos_Core.hpp>
228+ #include < mpi.h>
229+
230+ template <typename T>
231+ std::tuple<std::vector<double >, double > runBenchmarkDPOTRF (int N, int iters) {
232+ std::cout << " -- dpotrf benchmark [" << typeToString<T>() << " ] -- " << std::endl;
233+
234+ // Define matrix size
235+ Kokkos::View<T**> A (" A" , N, N);
236+
237+ // Fill matrix A with random values
238+ Kokkos::Random_XorShift64_Pool pool (123 );
239+ Kokkos::fill_random (A, pool, 10.0 );
240+
241+ // Make A symmetric positive definite
242+ Kokkos::parallel_for (" MakeSPD" , N, KOKKOS_LAMBDA (int i) {
243+ for (int j = 0 ; j < N; j++) {
244+ A (i, j) = A (i, j) + A (j, i); // Symmetrize
245+ }
246+ });
247+ Kokkos::fence ();
248+
249+ // Prepare for MKL DPOTRF
250+ T* A_ptr = A.data ();
251+ std::vector<double > iter_timings;
252+ double total_time = 0.0 ;
253+
254+ MPI_Barrier (MPI_COMM_WORLD);
255+
256+ // Perform the benchmark iterations
257+ for (int i = 0 ; i < iters; i++) {
258+ Kokkos::Timer timer;
259+
260+ // Call MKL DPOTRF
261+ int info;
262+ dpotrf_ (" L" , &N, A_ptr, &N, &info); // 'L' for lower triangular
263+
148264 Kokkos::fence ();
149265
150266 // Skip the first iteration
@@ -167,14 +283,77 @@ template <typename T>
167283std::tuple<std::vector<double >, double > runBenchmark (benchmarks type, int M, int N, int K, int iters) {
168284 switch (type) {
169285 case level1:
170- return runBenchmarkLevel1<T>(M, N, K , iters);
286+ return runBenchmarkLevel1<T>(N , iters);
171287 case level2:
172- return runBenchmarkLevel2<T>(M, N, K, iters);
288+ return runBenchmarkLevel2<T>(M, N, iters);
173289 case level3:
174290 return runBenchmarkLevel3<T>(M, N, K, iters);
175291 case dpotrf:
176- return runBenchmarkDPOTRF<T>(M, N, K , iters);
292+ return runBenchmarkDPOTRF<T>(N , iters);
177293 default :
178294 throw std::invalid_argument (" Unsupported benchmark type" );
179295 }
180296}
297+
298+ void reduceAndPrintBenchmarkOutput (
299+ std::vector<double > iter_timings,
300+ double total_time,
301+ std::string benchmark)
302+ {
303+ int rank = -1 ;
304+ int num_ranks = 0 ;
305+ MPI_Comm_rank (MPI_COMM_WORLD, &rank);
306+ MPI_Comm_size (MPI_COMM_WORLD, &num_ranks);
307+
308+ char processor_name[MPI_MAX_PROCESSOR_NAME];
309+ int name_len;
310+ MPI_Get_processor_name (processor_name, &name_len);
311+
312+ std::vector<double > all_times;
313+ all_times.resize (num_ranks);
314+
315+ std::vector<double > all_iter_times;
316+ all_iter_times.resize (num_ranks * iters);
317+
318+ std::vector<char > all_processor_names;
319+ all_processor_names.resize (num_ranks * MPI_MAX_PROCESSOR_NAME);
320+
321+ if (rank == 0 ) {
322+ std::cout << " num_ranks: " << num_ranks << std::endl;
323+ }
324+
325+ MPI_Gather (
326+ &total_time, 1 , MPI_DOUBLE,
327+ &all_times[0 ], 1 , MPI_DOUBLE, 0 ,
328+ MPI_COMM_WORLD
329+ );
330+
331+ MPI_Gather (
332+ &iter_timings[0 ], iters, MPI_DOUBLE,
333+ &all_iter_times[0 ], iters, MPI_DOUBLE, 0 ,
334+ MPI_COMM_WORLD
335+ );
336+
337+ MPI_Gather (
338+ &processor_name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR,
339+ &all_processor_names[0 ], MPI_MAX_PROCESSOR_NAME, MPI_CHAR, 0 ,
340+ MPI_COMM_WORLD
341+ );
342+
343+ if (rank == 0 ) {
344+ int cur_rank = 0 ;
345+ int cur = 0 ;
346+ std::cout << " === " << benchmark << " ===" << std::endl;
347+ for (auto && time : all_times) {
348+ std::cout << " gather: " << cur_rank << " ("
349+ << std::string (&all_processor_names[cur_rank * MPI_MAX_PROCESSOR_NAME])
350+ << " ): " << time << " : breakdown: " ;
351+ for (int i = cur; i < iters + cur; i++) {
352+ std::cout << all_iter_times[i] << " " ;
353+ }
354+ std::cout << std::endl;
355+ cur += iters;
356+ cur_rank++;
357+ }
358+ }
359+ }