Changes from all commits (28 commits, all by William-An):
bea007b  Add tma unittest (Jul 13, 2025)
e645411  add regular load to TMA benchmark (Oct 7, 2025)
73bfa8f  make the regular load to have same access pattern as TMA load (Oct 7, 2025)
646517d  avoid compiler optimization (Oct 7, 2025)
9daed3a  move cuda mempcy to be before kernel launch (Oct 8, 2025)
8d7937e  add iteration count for tma ubench (Oct 15, 2025)
31a54b6  minor formatting (Oct 17, 2025)
3b365af  move tma to ubench folder (Oct 17, 2025)
3525929  make setup script works with zsh (Oct 17, 2025)
bda12c2  fix the issue that ubench all return 1 even without issue (Oct 17, 2025)
a7d5b0e  add a sample test kernel for mbarrier PTX mapping to SASS (Oct 27, 2025)
b7f2552  update gitignore (Nov 1, 2025)
390ff6e  add gmma kernels for latency measurement (Nov 1, 2025)
433324c  increase iter to 1024 (Nov 1, 2025)
86163a7  add missed kernels (Nov 1, 2025)
eb1372f  add maxflops for gmma (Nov 1, 2025)
efe7108  update block size (Nov 1, 2025)
8f96178  update prints for MaxFlops_gmma (Nov 2, 2025)
efb18e5  fix a bug (Nov 2, 2025)
3d7ad80  fix include after updating it (Nov 3, 2025)
f933e19  fix for cpp and c source (Nov 3, 2025)
d48d603  fix compile (Nov 3, 2025)
df168f3  fix for pattern matching (Nov 3, 2025)
76b7948  fix compilation for mbarrier (Nov 3, 2025)
ba70262  Fix makefile for tma app (Nov 3, 2025)
6bc9197  generate SASS and PTX for TMA and GMMA workloads (Nov 3, 2025)
e188546  update makefile to force PTX to be embedded in final fat bin (Nov 3, 2025)
07b7aba  change naming (Nov 3, 2025)
11 changes: 10 additions & 1 deletion .gitignore
@@ -15,4 +15,13 @@ src/cuda/rodinia/3.1/cuda/particlefilter/particlefilter_naive
 src/cuda/rodinia/3.1/cuda/pathfinder/pathfinder
 4.2
 .venv/
-__pycache__/
+__pycache__/
+compile_commands.json
+.cache/
+tmp/
+
+# Ignoring files without extension (but keep Makefile and files with extensions)
+src/cuda/GPU_Microbenchmark/ubench/**/*
+!src/cuda/GPU_Microbenchmark/ubench/**/*/
+!src/cuda/GPU_Microbenchmark/ubench/**/*.*
+!src/cuda/GPU_Microbenchmark/ubench/**/Makefile
10 changes: 9 additions & 1 deletion src/Makefile
@@ -520,6 +520,11 @@ huggingface:
 	chmod u+x $(BINDIR)/$(BINSUBDIR)/huggingface/helloworld
 	chmod u+x $(BINDIR)/$(BINSUBDIR)/huggingface/*.py
 
+tma:
+	mkdir -p $(BINDIR)/$(BINSUBDIR)/tma
+	cp -r cuda/tma $(BINDIR)/$(BINSUBDIR)
+	cd $(BINDIR)/$(BINSUBDIR)/tma && make all
+
 clean_heterosync:
 	rm -rf cuda/heterosync
 
@@ -696,4 +701,7 @@ clean_cuda_samples:
 	make clean -C ./cuda/cuda-samples/build
 
 clean_huggingface:
-	rm -rf $(BINDIR)/$(BINSUBDIR)/huggingface
+	rm -rf $(BINDIR)/$(BINSUBDIR)/huggingface
+
+clean_tma:
+	rm -rf $(BINDIR)/$(BINSUBDIR)/tma
1 change: 1 addition & 0 deletions src/cuda/GPU_Microbenchmark/.gitignore
@@ -1,3 +1,4 @@
 bin/
 *.o
 *.out
+*.a
38 changes: 32 additions & 6 deletions src/cuda/GPU_Microbenchmark/common/common.mk
@@ -9,16 +9,42 @@ CC := nvcc

 LIB :=
 
-release:
-	$(CC) $(NVCC_FLAGS) $(CUOPTS) $(SRC) -o $(EXE) -I$(INCLUDE) -L$(LIB) -lcudart
+# Generate object file list from SRC (for parallel compilation)
+CUDA_SRC_FILES := $(filter %.cu, $(SRC))
+CPP_SRC_FILES := $(filter %.cpp, $(SRC))
+C_SRC_FILES := $(filter %.c, $(SRC))
+
+# To preserve PTX in multi-step compilation, the CUDA source files have to be compiled into .a files
+CUDA_LIB_FILES := $(CUDA_SRC_FILES:.cu=.a)
+
+# Host-side source files
+CPP_OBJECT_FILES := $(CPP_SRC_FILES:.cpp=.o)
+C_OBJECT_FILES := $(C_SRC_FILES:.c=.o)
+OBJECT_FILES := $(CPP_OBJECT_FILES) $(C_OBJECT_FILES)
+
+# If multiple source files are provided, compile them separately and link
+# To preserve PTX in the final binary, first create a static library, then link it into the executable
+# This avoids nvlink stripping PTX during device linking
+release: $(CUDA_LIB_FILES) $(OBJECT_FILES)
+	$(CC) $(NVCC_FLAGS) $^ -o $(EXE) -L$(LIB) -lcudart
 	cp $(EXE) $(BIN_DIR)
 
+# Pattern rule for compiling individual .cu files into .a static libraries
+%.a: %.cu
+	$(CC) $(NVCC_FLAGS) $(INCLUDE) $(CUOPTS) --lib $< -o $@
+
+%.o: %.cpp
+	$(CC) $(NVCC_FLAGS) $(INCLUDE) $(CUOPTS) -dc $< -o $@
+
+%.o: %.c
+	$(CC) $(NVCC_FLAGS) $(INCLUDE) $(CUOPTS) -dc $< -o $@
+
 tuner:
-	$(CC) $(NVCC_FLAGS) $(CUOPTS) -DTUNER $(SRC) -o $(EXE) -I$(INCLUDE) -L$(LIB) -lcudart
+	$(CC) $(NVCC_FLAGS) $(CUOPTS) -DTUNER $(SRC) -o $(EXE) $(INCLUDE) -L$(LIB) -lcudart
 	cp $(EXE) $(BIN_DIR)
 
 clean:
-	rm -f *.o; rm -f $(EXE)
+	rm -f *.o $(OBJECT_FILES); rm -f $(EXE) $(CUDA_LIB_FILES)
 
 run:
 	./$(EXE)
@@ -36,7 +62,7 @@ nvsight:
 	nv-nsight-cu-cli --metrics gpc__cycles_elapsed.avg,sm__cycles_elapsed.sum,smsp__inst_executed.sum,sm__warps_active.avg.pct_of_peak_sustained_active,l1tex__t_sectors_pipe_lsu_mem_global_op_ld_lookup_hit.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_st_lookup_hit.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum,lts__t_sectors_srcunit_tex_op_read.sum,lts__t_sectors_srcunit_tex_op_write.sum,lts__t_sectors_srcunit_tex_op_read_lookup_hit.sum,lts__t_sectors_srcunit_tex_op_write_lookup_hit.sum,lts__t_sector_op_read_hit_rate.pct,lts__t_sector_op_write_hit_rate.pct,lts__t_sectors_srcunit_tex_op_read.sum.per_second,dram__sectors_read.sum,dram__sectors_write.sum,dram__bytes_read.sum --csv --page raw ./$(EXE) | tee nsight.csv
 
 ptx:
-	cuobjdump -ptx ./$(EXE) tee ptx.txt
+	cuobjdump -ptx ./$(EXE) | tee ptx.txt
 
 sass:
-	cuobjdump -sass ./$(EXE) tee sass.txt
+	cuobjdump -sass ./$(EXE) | tee sass.txt
12 changes: 6 additions & 6 deletions src/cuda/GPU_Microbenchmark/hw_def/common/common.h
@@ -22,7 +22,7 @@ enum dram_model { GDDR5 = 1, GDDR5X = 2, GDDR6 = 3, HBM = 4 };

 // source:
 // https://stackoverflow.com/questions/466204/rounding-up-to-next-power-of-2
-unsigned round_up_2n(unsigned v) {
+inline unsigned round_up_2n(unsigned v) {
   v--;
   v |= v >> 1;
   v |= v >> 2;
@@ -34,9 +34,9 @@ unsigned round_up_2n(unsigned v) {
   return v;
 }
 
-unsigned round_up_2n(float n) { return round_up_2n((unsigned)ceil(n)); }
+inline unsigned round_up_2n(float n) { return round_up_2n((unsigned)ceil(n)); }
 
-bool isPowerOfTwo(int n) {
+inline bool isPowerOfTwo(int n) {
   if (n == 0)
     return false;
 
@@ -51,17 +51,17 @@ static const unsigned dram_model_burst_length[] = {0, 8, 8, 16, 2};
 static const unsigned dram_model_freq_ratio[] = {0, 4, 4, 4, 2};
 // atom size =
 // dram_model_channel_width*dram_model_mem_per_ctrlr*dram_model_burst_length
-unsigned get_atom_size_inByte(enum dram_model model) {
+inline unsigned get_atom_size_inByte(enum dram_model model) {
   return (dram_model_bus_width[model] / 8) * dram_model_mem_per_ctrlr[model] *
          dram_model_burst_length[model];
 }
 // CCD = dram_model_burst_length/dram_model_freq_ratio
-unsigned get_adjusted_CCD(enum dram_model model) {
+inline unsigned get_adjusted_CCD(enum dram_model model) {
   assert(dram_model_burst_length[model] % dram_model_freq_ratio[model] == 0);
   return dram_model_burst_length[model] / dram_model_freq_ratio[model];
 }
 
-unsigned get_num_channels(unsigned total_memory_width, enum dram_model model) {
+inline unsigned get_num_channels(unsigned total_memory_width, enum dram_model model) {
   unsigned channel_width =
       dram_model_bus_width[model] * dram_model_mem_per_ctrlr[model];
   assert(total_memory_width % channel_width == 0);
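Background on the inline additions above (context, not part of the diff itself): these helpers are defined directly in common.h, and the new split build in common.mk compiles several source files that include it, so each object file would otherwise carry its own out-of-line definition and the link would fail with multiple-definition errors; marking them inline permits identical definitions across translation units. A minimal sketch of the failure mode, using hypothetical file names:

// util.h -- a shared header included by the two sources below
#pragma once
// Without "inline", a.o and b.o would each define helper() and the
// final link would fail with "multiple definition of helper()".
inline unsigned helper(unsigned v) { return v + 1; }

// a.cpp
#include "util.h"
unsigned use_a() { return helper(1); }

// b.cpp
#include "util.h"
unsigned use_b() { return helper(2); }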
6 changes: 3 additions & 3 deletions src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h
@@ -33,7 +33,7 @@ struct GpuConfig
   unsigned BLOCKS_NUM = 640;       // Total blocks launched
   unsigned TOTAL_THREADS = 163840; // Total threads launched
 };
-GpuConfig config;
+inline GpuConfig config;
 // Parses short flags like --sm 80 into a GpuConfig object
 inline void parseGpuConfigArgs(int argc, char *argv[])
 {
@@ -144,9 +144,9 @@ inline void gpuAssert(cudaError_t code, const char *file, int line,
   }
 }
 
-cudaDeviceProp deviceProp;
+inline cudaDeviceProp deviceProp;
 
-unsigned intilizeDeviceProp(unsigned deviceID, int argc, char *argv[])
+inline unsigned intilizeDeviceProp(unsigned deviceID, int argc, char *argv[])
 {
 #ifdef TUNER
 
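The same one-definition-rule reasoning applies here, but to global variables rather than functions: `inline GpuConfig config;` and `inline cudaDeviceProp deviceProp;` rely on C++17 inline variables (consistent with the -std=c++17 flag added in the MaxFlops_gmma Makefile later in this diff), so every translation unit that includes gpuConfig.h shares one object instead of defining its own. A minimal sketch with hypothetical names:

// settings.h -- hypothetical header shared by several translation units
#pragma once
struct Settings { unsigned sm_count = 80; };
// C++17 inline variable: all including translation units share this one
// "settings" object. Without "inline" each object file would define its
// own copy and the link would fail; with "static" each translation unit
// would get a private, silently diverging copy.
inline Settings settings;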
@@ -111,5 +111,5 @@ int main(int argc, char *argv[])
printf("Atomic int32 bandwidth = %f (byte/clk)\n", bw);
printf("Total Clk number = %ld \n", total_time);

return 1;
return 0;
}
@@ -111,5 +111,5 @@ int main(int argc, char *argv[])
printf("Atomic int32 bandwidth = %f (byte/clk)\n", bw);
printf("Total Clk number = %u \n", total_time);

return 1;
return 0;
}
@@ -97,5 +97,5 @@ int main(int argc, char *argv[])
printf("Atomic int32 latency = %f (clk)\n", latency);
printf("Total Clk number = %u \n", stopClk[0] - startClk[0]);

return 1;
return 0;
}
@@ -7,5 +7,5 @@ int main(int argc, char *argv[])

   dpu_max_flops();
 
-  return 1;
+  return 0;
 }
@@ -7,5 +7,5 @@ int main(int argc, char *argv[])

   fpu_max_flops();
 
-  return 1;
+  return 0;
 }
20 changes: 20 additions & 0 deletions src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/Makefile
@@ -0,0 +1,20 @@
+# Source files split for parallel compilation
+# Use wildcard to automatically include all size-specific breakdown files
+SRC = MaxFlops_gmma.cu $(wildcard kernels/MaxFlops_gmma_*.cu)
+
+EXE = MaxFlops_gmma
+
+# Add include path for CUTLASS
+INCLUDE += -I$(GPUAPPS_ROOT)/src/cuda/cutlass-bench/include -I./
+
+# GMMA is only supported on sm_90a
+ARCH ?= sm_90a
+# Unset CUDA_CPPFLAGS, which is otherwise set based on the CUDA version
+CUDA_CPPFLAGS =
+# Generate code for both sm_XXX and compute_XXX (SASS and PTX)
+HOPPER_CUDA_CPPFLAGS = $(foreach arch,$(ARCH),-gencode=arch=compute_$(subst sm_,,$(arch)),code=$(arch) -gencode=arch=compute_$(subst sm_,,$(arch)),code=compute_$(subst sm_,,$(arch)))
+
+# The CUTLASS cute library requires C++17
+NVCC_FLAGS := $(HOPPER_CUDA_CPPFLAGS) -std=c++17
+
+include ../../../common/common.mk
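Note the "a" suffix in sm_90a: it enables architecture-specific instructions (such as wgmma) that are not part of the baseline sm_90 target, which is why this Makefile hard-codes the arch rather than inheriting the repo-wide flags. Kernels built this way are usually fenced with an arch guard; the following is a minimal illustrative sketch of that common pattern (an assumption about typical usage, not code from this PR; my understanding is that __CUDA_ARCH_FEAT_SM90_ALL is the macro nvcc defines when targeting sm_90a/compute_90a):

// guard_sketch.cu -- compile with: nvcc -arch=sm_90a guard_sketch.cu
__global__ void gmma_or_fallback() {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && \
    defined(__CUDA_ARCH_FEAT_SM90_ALL)
  // Hopper-only path: wgmma.* inline PTX would go here.
#else
  // Empty fallback for any target other than sm_90a/compute_90a.
#endif
}

int main() {
  gmma_or_fallback<<<1, 1>>>();
  return cudaDeviceSynchronize() == cudaSuccess ? 0 : 1;
}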
@@ -0,0 +1,13 @@
+#include <cuda.h>
+#include "MaxFlops_gmma.h"
+#include "../../../hw_def/hw_def.h"
+
+int main(int argc, char *argv[])
+{
+  intilizeDeviceProp(0, argc, argv);
+
+  // Run comprehensive sweep over all valid MMA operations
+  run_all_wgmma_maxflops_tests();
+
+  return 0;
+}