Skip to content

Commit 37bddca

Browse files
committed
update block size
1 parent eb1372f commit 37bddca

File tree

1 file changed

+7
-5
lines changed

1 file changed

+7
-5
lines changed

src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/MaxFlops_gmma_common.h

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -154,9 +154,12 @@ float run_wgmma_maxflops_test_typed() {
154 154
gpuErrchk(cudaMalloc(&stopClk_g, sizeof(uint32_t)));
155 155
gpuErrchk(cudaMalloc(&checksum_g, sizeof(uint32_t)));
156 156

157-
// Launch kernel with 1024 threads
158-
dim3 grid(1);
159-
dim3 block(1024);
157+
// Launch kernel with 256 threads
158+
config.BLOCKS_NUM = 1;
159+
config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM;
160+
int TOTAL_WARPS = TOTAL_THREADS / 32;
161+
dim3 grid(config.BLOCKS_NUM);
162+
dim3 block(config.THREADS_PER_BLOCK);
160 163
wgmma_max_flops_kernel<ElementA, ElementB, ElementC, TileShape_MNK><<<grid, block>>>(startClk_g, stopClk_g, checksum_g);
161 164

162 165
gpuErrchk(cudaPeekAtLastError());
@@ -169,8 +172,7 @@ float run_wgmma_maxflops_test_typed() {
169 172
gpuErrchk(cudaMemcpy(&checksum, checksum_g, sizeof(uint32_t), cudaMemcpyDeviceToHost));
170 173

171 174
// Calculate max instruction throughput
172-
// Each warp group has 4 warps, so we need to multiply by 4
173-
float inst_throughput = ((float)(REPEAT_TIMES) * 4) / ((float)(stopClk - startClk));
175+
float inst_throughput = ((float)(REPEAT_TIMES) * TOTAL_WARPS) / ((float)(stopClk - startClk));
174 176

175 177
// Cleanup
176 178
cudaFree(startClk_g);

0 commit comments

Comments
 (0)