Changes from all commits (28 commits, all by William-An):
bea007b  Add tma unittest (Jul 13, 2025)
e645411  add regular load to TMA benchmark (Oct 7, 2025)
73bfa8f  make the regular load to have same access pattern as TMA load (Oct 7, 2025)
646517d  avoid compiler optimization (Oct 7, 2025)
9daed3a  move cuda mempcy to be before kernel launch (Oct 8, 2025)
8d7937e  add iteration count for tma ubench (Oct 15, 2025)
31a54b6  minor formatting (Oct 17, 2025)
3b365af  move tma to ubench folder (Oct 17, 2025)
3525929  make setup script works with zsh (Oct 17, 2025)
bda12c2  fix the issue that ubench all return 1 even without issue (Oct 17, 2025)
a7d5b0e  add a sample test kernel for mbarrier PTX mapping to SASS (Oct 27, 2025)
b7f2552  update gitignore (Nov 1, 2025)
390ff6e  add gmma kernels for latency measurement (Nov 1, 2025)
433324c  increase iter to 1024 (Nov 1, 2025)
86163a7  add missed kernels (Nov 1, 2025)
eb1372f  add maxflops for gmma (Nov 1, 2025)
efe7108  update block size (Nov 1, 2025)
8f96178  update prints for MaxFlops_gmma (Nov 2, 2025)
efb18e5  fix a bug (Nov 2, 2025)
3d7ad80  fix include after updating it (Nov 3, 2025)
f933e19  fix for cpp and c source (Nov 3, 2025)
d48d603  fix compile (Nov 3, 2025)
df168f3  fix for pattern matching (Nov 3, 2025)
76b7948  fix compilation for mbarrier (Nov 3, 2025)
ba70262  Fix makefile for tma app (Nov 3, 2025)
6bc9197  generate SASS and PTX for TMA and GMMA workloads (Nov 3, 2025)
e188546  update makefile to force PTX to be embedded in final fat bin (Nov 3, 2025)
07b7aba  change naming (Nov 3, 2025)
11 changes: 10 additions & 1 deletion .gitignore
@@ -15,4 +15,13 @@ src/cuda/rodinia/3.1/cuda/particlefilter/particlefilter_naive
 src/cuda/rodinia/3.1/cuda/pathfinder/pathfinder
 4.2
 .venv/
-__pycache__/
+__pycache__/
+compile_commands.json
+.cache/
+tmp/
+
+# Ignoring files without extension (but keep Makefile and files with extensions)
+src/cuda/GPU_Microbenchmark/ubench/**/*
+!src/cuda/GPU_Microbenchmark/ubench/**/*/
+!src/cuda/GPU_Microbenchmark/ubench/**/*.*
+!src/cuda/GPU_Microbenchmark/ubench/**/Makefile
10 changes: 9 additions & 1 deletion src/Makefile
@@ -520,6 +520,11 @@ huggingface:
 	chmod u+x $(BINDIR)/$(BINSUBDIR)/huggingface/helloworld
 	chmod u+x $(BINDIR)/$(BINSUBDIR)/huggingface/*.py
 
+tma:
+	mkdir -p $(BINDIR)/$(BINSUBDIR)/tma
+	cp -r cuda/tma $(BINDIR)/$(BINSUBDIR)
+	cd $(BINDIR)/$(BINSUBDIR)/tma && make all
+
 clean_heterosync:
 	rm -rf cuda/heterosync
 
@@ -696,4 +701,7 @@ clean_cuda_samples:
 	make clean -C ./cuda/cuda-samples/build
 
 clean_huggingface:
-	rm -rf $(BINDIR)/$(BINSUBDIR)/huggingface
+	rm -rf $(BINDIR)/$(BINSUBDIR)/huggingface
+
+clean_tma:
+	rm -rf $(BINDIR)/$(BINSUBDIR)/tma
1 change: 1 addition & 0 deletions src/cuda/GPU_Microbenchmark/.gitignore
@@ -1,3 +1,4 @@
 bin/
 *.o
 *.out
+*.a
38 changes: 32 additions & 6 deletions src/cuda/GPU_Microbenchmark/common/common.mk
@@ -9,16 +9,42 @@ CC := nvcc

 LIB :=
 
-release:
-	$(CC) $(NVCC_FLAGS) $(CUOPTS) $(SRC) -o $(EXE) -I$(INCLUDE) -L$(LIB) -lcudart
+# Generate object file list from SRC (for parallel compilation)
+CUDA_SRC_FILES := $(filter %.cu, $(SRC))
+CPP_SRC_FILES := $(filter %.cpp, $(SRC))
+C_SRC_FILES := $(filter %.c, $(SRC))
+
+# To preserve PTX in multi-step compilation, the CUDA source files have to be compiled into .a files
+CUDA_LIB_FILES := $(CUDA_SRC_FILES:.cu=.a)
+
+# Host-side source files
+CPP_OBJECT_FILES := $(CPP_SRC_FILES:.cpp=.o)
+C_OBJECT_FILES := $(C_SRC_FILES:.c=.o)
+OBJECT_FILES := $(CPP_OBJECT_FILES) $(C_OBJECT_FILES)
+
+# If multiple source files are provided, compile them separately and link
+# To preserve PTX in the final binary, first create a static library, then link it into the executable
+# This avoids nvlink stripping PTX during device linking
+release: $(CUDA_LIB_FILES) $(OBJECT_FILES)
+	$(CC) $(NVCC_FLAGS) $^ -o $(EXE) -L$(LIB) -lcudart
 	cp $(EXE) $(BIN_DIR)
 
+# Pattern rule for compiling individual .cu files into .a static libraries
+%.a: %.cu
+	$(CC) $(NVCC_FLAGS) $(INCLUDE) $(CUOPTS) --lib $< -o $@
+
+%.o: %.cpp
+	$(CC) $(NVCC_FLAGS) $(INCLUDE) $(CUOPTS) -dc $< -o $@
+
+%.o: %.c
+	$(CC) $(NVCC_FLAGS) $(INCLUDE) $(CUOPTS) -dc $< -o $@
+
 tuner:
-	$(CC) $(NVCC_FLAGS) $(CUOPTS) -DTUNER $(SRC) -o $(EXE) -I$(INCLUDE) -L$(LIB) -lcudart
+	$(CC) $(NVCC_FLAGS) $(CUOPTS) -DTUNER $(SRC) -o $(EXE) $(INCLUDE) -L$(LIB) -lcudart
 	cp $(EXE) $(BIN_DIR)
 
 clean:
-	rm -f *.o; rm -f $(EXE)
+	rm -f *.o $(OBJECT_FILES); rm -f $(EXE) $(CUDA_LIB_FILES)
 
 run:
 	./$(EXE)
@@ -36,7 +62,7 @@ nvsight:
 	nv-nsight-cu-cli --metrics gpc__cycles_elapsed.avg,sm__cycles_elapsed.sum,smsp__inst_executed.sum,sm__warps_active.avg.pct_of_peak_sustained_active,l1tex__t_sectors_pipe_lsu_mem_global_op_ld_lookup_hit.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_st_lookup_hit.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum,lts__t_sectors_srcunit_tex_op_read.sum,lts__t_sectors_srcunit_tex_op_write.sum,lts__t_sectors_srcunit_tex_op_read_lookup_hit.sum,lts__t_sectors_srcunit_tex_op_write_lookup_hit.sum,lts__t_sector_op_read_hit_rate.pct,lts__t_sector_op_write_hit_rate.pct,lts__t_sectors_srcunit_tex_op_read.sum.per_second,dram__sectors_read.sum,dram__sectors_write.sum,dram__bytes_read.sum --csv --page raw ./$(EXE) | tee nsight.csv
 
 ptx:
-	cuobjdump -ptx ./$(EXE) tee ptx.txt
+	cuobjdump -ptx ./$(EXE) | tee ptx.txt
 
 sass:
-	cuobjdump -sass ./$(EXE) tee sass.txt
+	cuobjdump -sass ./$(EXE) | tee sass.txt
12 changes: 6 additions & 6 deletions src/cuda/GPU_Microbenchmark/hw_def/common/common.h
@@ -22,7 +22,7 @@ enum dram_model { GDDR5 = 1, GDDR5X = 2, GDDR6 = 3, HBM = 4 };

 // source:
 // https://stackoverflow.com/questions/466204/rounding-up-to-next-power-of-2
-unsigned round_up_2n(unsigned v) {
+inline unsigned round_up_2n(unsigned v) {
   v--;
   v |= v >> 1;
   v |= v >> 2;
@@ -34,9 +34,9 @@ unsigned round_up_2n(unsigned v) {
   return v;
 }
 
-unsigned round_up_2n(float n) { return round_up_2n((unsigned)ceil(n)); }
+inline unsigned round_up_2n(float n) { return round_up_2n((unsigned)ceil(n)); }
 
-bool isPowerOfTwo(int n) {
+inline bool isPowerOfTwo(int n) {
   if (n == 0)
     return false;
 
@@ -51,17 +51,17 @@ static const unsigned dram_model_burst_length[] = {0, 8, 8, 16, 2};
 static const unsigned dram_model_freq_ratio[] = {0, 4, 4, 4, 2};
 // atom size =
 // dram_model_channel_width*dram_model_mem_per_ctrlr*dram_model_burst_length
-unsigned get_atom_size_inByte(enum dram_model model) {
+inline unsigned get_atom_size_inByte(enum dram_model model) {
   return (dram_model_bus_width[model] / 8) * dram_model_mem_per_ctrlr[model] *
          dram_model_burst_length[model];
 }
 // CCD = dram_model_burst_length/dram_model_freq_ratio
-unsigned get_adjusted_CCD(enum dram_model model) {
+inline unsigned get_adjusted_CCD(enum dram_model model) {
   assert(dram_model_burst_length[model] % dram_model_freq_ratio[model] == 0);
   return dram_model_burst_length[model] / dram_model_freq_ratio[model];
 }
 
-unsigned get_num_channels(unsigned total_memory_width, enum dram_model model) {
+inline unsigned get_num_channels(unsigned total_memory_width, enum dram_model model) {
   unsigned channel_width =
       dram_model_bus_width[model] * dram_model_mem_per_ctrlr[model];
   assert(total_memory_width % channel_width == 0);
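Background on the inline additions above (context, not part of the diff itself): these helpers are defined directly in common.h, and the new split build in common.mk compiles several source files that include it, so each object file would otherwise carry its own out-of-line definition and the link would fail with multiple-definition errors; marking them inline permits identical definitions across translation units. A minimal sketch of the failure mode, using hypothetical file names:

// util.h -- a shared header included by the two sources below
#pragma once
// Without "inline", a.o and b.o would each define helper() and the
// final link would fail with "multiple definition of helper()".
inline unsigned helper(unsigned v) { return v + 1; }

// a.cpp
#include "util.h"
unsigned use_a() { return helper(1); }

// b.cpp
#include "util.h"
unsigned use_b() { return helper(2); }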
6 changes: 3 additions & 3 deletions src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h
@@ -33,7 +33,7 @@ struct GpuConfig
   unsigned BLOCKS_NUM = 640;       // Total blocks launched
   unsigned TOTAL_THREADS = 163840; // Total threads launched
 };
-GpuConfig config;
+inline GpuConfig config;
 // Parses short flags like --sm 80 into a GpuConfig object
 inline void parseGpuConfigArgs(int argc, char *argv[])
 {
@@ -144,9 +144,9 @@ inline void gpuAssert(cudaError_t code, const char *file, int line,
   }
 }
 
-cudaDeviceProp deviceProp;
+inline cudaDeviceProp deviceProp;
 
-unsigned intilizeDeviceProp(unsigned deviceID, int argc, char *argv[])
+inline unsigned intilizeDeviceProp(unsigned deviceID, int argc, char *argv[])
 {
 #ifdef TUNER
 
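The same one-definition-rule reasoning applies here, but to global variables rather than functions: `inline GpuConfig config;` and `inline cudaDeviceProp deviceProp;` rely on C++17 inline variables (consistent with the -std=c++17 flag added in the MaxFlops_gmma Makefile later in this diff), so every translation unit that includes gpuConfig.h shares one object instead of defining its own. A minimal sketch with hypothetical names:

// settings.h -- hypothetical header shared by several translation units
#pragma once
struct Settings { unsigned sm_count = 80; };
// C++17 inline variable: all including translation units share this one
// "settings" object. Without "inline" each object file would define its
// own copy and the link would fail; with "static" each translation unit
// would get a private, silently diverging copy.
inline Settings settings;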
@@ -111,5 +111,5 @@ int main(int argc, char *argv[])
printf("Atomic int32 bandwidth = %f (byte/clk)\n", bw);
printf("Total Clk number = %ld \n", total_time);

return 1;
return 0;
}
@@ -111,5 +111,5 @@ int main(int argc, char *argv[])
printf("Atomic int32 bandwidth = %f (byte/clk)\n", bw);
printf("Total Clk number = %u \n", total_time);

return 1;
return 0;
}
@@ -97,5 +97,5 @@ int main(int argc, char *argv[])
printf("Atomic int32 latency = %f (clk)\n", latency);
printf("Total Clk number = %u \n", stopClk[0] - startClk[0]);

return 1;
return 0;
}
@@ -7,5 +7,5 @@ int main(int argc, char *argv[])

   dpu_max_flops();
 
-  return 1;
+  return 0;
 }
@@ -7,5 +7,5 @@ int main(int argc, char *argv[])

   fpu_max_flops();
 
-  return 1;
+  return 0;
 }
20 changes: 20 additions & 0 deletions src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/Makefile
@@ -0,0 +1,20 @@
+# Source files split for parallel compilation
+# Use wildcard to automatically include all size-specific breakdown files
+SRC = MaxFlops_gmma.cu $(wildcard kernels/MaxFlops_gmma_*.cu)
+
+EXE = MaxFlops_gmma
+
+# Add include path for CUTLASS
+INCLUDE += -I$(GPUAPPS_ROOT)/src/cuda/cutlass-bench/include -I./
+
+# GMMA is only supported on sm_90a
+ARCH ?= sm_90a
+# Unset CUDA_CPPFLAGS, which is otherwise set based on the CUDA version
+CUDA_CPPFLAGS =
+# Generate code for both sm_XXX and compute_XXX (SASS and PTX)
+HOPPER_CUDA_CPPFLAGS = $(foreach arch,$(ARCH),-gencode=arch=compute_$(subst sm_,,$(arch)),code=$(arch) -gencode=arch=compute_$(subst sm_,,$(arch)),code=compute_$(subst sm_,,$(arch)))
+
+# The CUTLASS cute library requires C++17
+NVCC_FLAGS := $(HOPPER_CUDA_CPPFLAGS) -std=c++17
+
+include ../../../common/common.mk
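Note the "a" suffix in sm_90a: it enables architecture-specific instructions (such as wgmma) that are not part of the baseline sm_90 target, which is why this Makefile hard-codes the arch rather than inheriting the repo-wide flags. Kernels built this way are usually fenced with an arch guard; the following is a minimal illustrative sketch of that common pattern (an assumption about typical usage, not code from this PR; my understanding is that __CUDA_ARCH_FEAT_SM90_ALL is the macro nvcc defines when targeting sm_90a/compute_90a):

// guard_sketch.cu -- compile with: nvcc -arch=sm_90a guard_sketch.cu
__global__ void gmma_or_fallback() {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && \
    defined(__CUDA_ARCH_FEAT_SM90_ALL)
  // Hopper-only path: wgmma.* inline PTX would go here.
#else
  // Empty fallback for any target other than sm_90a/compute_90a.
#endif
}

int main() {
  gmma_or_fallback<<<1, 1>>>();
  return cudaDeviceSynchronize() == cudaSuccess ? 0 : 1;
}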
@@ -0,0 +1,13 @@
+#include <cuda.h>
+#include "MaxFlops_gmma.h"
+#include "../../../hw_def/hw_def.h"
+
+int main(int argc, char *argv[])
+{
+  intilizeDeviceProp(0, argc, argv);
+
+  // Run comprehensive sweep over all valid MMA operations
+  run_all_wgmma_maxflops_tests();
+
+  return 0;
+}