Commit c2afeb8

Merge pull request #51 from purdue-aalp/micro_bench_merge
merge microbenchmarks with tuner ubench
2 parents: 568a86a + b47fcd1

216 files changed: +8133, -713 lines


.github/workflows/test-build.yml

Lines changed: 6 additions & 2 deletions
@@ -13,19 +13,23 @@ on:
 
 # A workflow run is made up of one or more jobs that can run sequentially or in parallel
 jobs:
-  test-12-6:
+  test-12-8:
     runs-on: ubuntu-latest
     container:
       image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8
 
     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:
       # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
-      - uses: actions/checkout@v4
+      - name: Checkout repository with submodules
+        uses: actions/checkout@v4
+        with:
+          submodules: false # do not fetch all submodules here; the needed one is initialized in the build step
 
       - name: Build Apps
         run: |
           git config --global --add safe.directory /__w/gpu-app-collection/gpu-app-collection
+          git submodule update --init -- src/cuda/cuda-samples
           /bin/bash test-build.sh
 
       - name: Print Successful Apps

src/cuda/GPU_Microbenchmark/Atomic_ubench/Atomic_add/Atomic_add_bw/Makefile

Lines changed: 0 additions & 31 deletions
This file was deleted.

src/cuda/GPU_Microbenchmark/Atomic_ubench/Atomic_add/Atomic_add_bw_conflict/Makefile

Lines changed: 0 additions & 31 deletions
This file was deleted.

src/cuda/GPU_Microbenchmark/Atomic_ubench/Atomic_add/Atomic_add_bw_diverge/Makefile

Lines changed: 0 additions & 31 deletions
This file was deleted.

src/cuda/GPU_Microbenchmark/Atomic_ubench/Atomic_add/Atomic_add_bw_profile/Makefile

Lines changed: 0 additions & 31 deletions
This file was deleted.

src/cuda/GPU_Microbenchmark/Atomic_ubench/Atomic_add/Atomic_add_lat/Makefile

Lines changed: 0 additions & 31 deletions
This file was deleted.

src/cuda/GPU_Microbenchmark/Makefile

File mode changed from 100755 to 100644
Lines changed: 16 additions & 27 deletions

@@ -1,32 +1,21 @@
-
 BASE_DIR := $(shell pwd)
 BIN_DIR := $(BASE_DIR)/bin
+SUB_DIRS = $(wildcard ubench/*/*/)
+SUB_DIRS_ALL = $(SUB_DIRS:%=all-%)
+SUB_DIRS_CLEAN = $(SUB_DIRS:%=clean-%)
 
-all:
-	mkdir -p $(BIN_DIR)
-	cd l1_bw_32f && make && cp l1_bw_32f $(BIN_DIR)
-	cd l1_bw_64f && make && cp l1_bw_64f $(BIN_DIR)
-	cd l1_bw_128 && make && cp l1_bw_128 $(BIN_DIR)
-	cd l1_lat && make && cp l1_lat $(BIN_DIR)
-	cd l2_bw_32f && make && cp l2_bw_32f $(BIN_DIR)
-	cd l2_bw_64f && make && cp l2_bw_64f $(BIN_DIR)
-	cd l2_bw_128 && make && cp l2_bw_128 $(BIN_DIR)
-	cd l2_lat && make && cp l2_lat $(BIN_DIR)
-	cd mem_bw && make && cp mem_bw $(BIN_DIR)
-	cd mem_lat && make && cp mem_lat $(BIN_DIR)
-	cd shared_bw && make && cp shared_bw $(BIN_DIR)
-	cd shared_lat && make && cp shared_lat $(BIN_DIR)
-	cd MaxFlops && make && cp MaxFlops $(BIN_DIR)
-	cd l1_shared_bw && make && cp l1_shared_bw $(BIN_DIR)
-	cd shared_bank_conflicts && make && cp shared_bank_conflicts $(BIN_DIR)
-	cd l1_bw_32f_unroll && make && cp l1_bw_32f_unroll $(BIN_DIR)
-	cd l1_bw_32f_unroll_large && make && cp l1_bw_32f_unroll_large $(BIN_DIR)
-	cd Atomic_ubench/Atomic_add/Atomic_add_bw && make && cp atomic_add_bw $(BIN_DIR)
-	cd Atomic_ubench/Atomic_add/Atomic_add_bw_conflict && make && cp atomic_add_bw_conflict $(BIN_DIR)
-	cd Atomic_ubench/Atomic_add/Atomic_add_bw_profile && make && cp atomic_add_bw_profile $(BIN_DIR)
-	cd Atomic_ubench/Atomic_add/Atomic_add_bw_diverge && make && cp atomic_add_bw_diverge $(BIN_DIR)
-	cd Atomic_ubench/Atomic_add/Atomic_add_lat && make && cp atomic_add_lat $(BIN_DIR)
+all: create_dir $(SUB_DIRS_ALL)
+
+clean: delete_dir $(SUB_DIRS_CLEAN)
 
+$(SUB_DIRS_ALL):
+	$(MAKE) $(MAKE_FLAGS) -C $(@:all-%=%)
+
+$(SUB_DIRS_CLEAN):
+	$(MAKE) $(MAKE_FLAGS) -C $(@:clean-%=%) clean
+
+create_dir:
+	mkdir -p $(BIN_DIR)
 
-clean:
-	cd $(BIN_DIR) && rm -f *
+delete_dir:
+	cd $(BIN_DIR); rm -f *
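
The rewritten top-level Makefile no longer hard-codes each benchmark directory; it discovers them with wildcard and recurses through pattern-prefixed targets. A minimal sketch of the same all-%/clean-% dispatch idiom is shown below (the demo/ layout is hypothetical, not part of this commit):

# Minimal sketch of the all-%/clean-% dispatch pattern (hypothetical demo/ layout).
SUB_DIRS     = $(wildcard demo/*/)      # e.g. demo/foo/ demo/bar/
SUB_DIRS_ALL = $(SUB_DIRS:%=all-%)      # e.g. all-demo/foo/ all-demo/bar/

all: $(SUB_DIRS_ALL)

# $(@:all-%=%) strips the "all-" prefix, recovering the directory to recurse into.
$(SUB_DIRS_ALL):
	$(MAKE) -C $(@:all-%=%)

.PHONY: all $(SUB_DIRS_ALL)

Because every sub-directory gets its own target, a single benchmark can be rebuilt in isolation with something like make all-ubench/<group>/<bench>/, and adding a new microbenchmark only requires dropping a directory with its own Makefile under ubench/.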

src/cuda/GPU_Microbenchmark/MaxFlops/Makefile

Lines changed: 0 additions & 31 deletions
This file was deleted.
New file

Lines changed: 48 additions & 0 deletions

@@ -0,0 +1,48 @@
+BASE_DIR := $(shell pwd)
+BIN_DIR := $(BASE_DIR)/../../../bin/
+
+GENCODE_SM30 ?= -gencode=arch=compute_30,code=\"sm_30,compute_30\"
+GENCODE_SM35 ?= -gencode=arch=compute_35,code=\"sm_35,compute_35\"
+GENCODE_SM50 ?= -gencode=arch=compute_50,code=\"sm_50,compute_50\"
+GENCODE_SM60 ?= -gencode=arch=compute_60,code=\"sm_60,compute_60\"
+GENCODE_SM62 ?= -gencode=arch=compute_62,code=\"sm_62,compute_62\"
+GENCODE_SM70 ?= -gencode=arch=compute_70,code=\"sm_70,compute_70\"
+GENCODE_SM75 ?= -gencode=arch=compute_75,code=\"sm_75,compute_75\"
+GENCODE_SM80 ?= -gencode=arch=compute_80,code=\"sm_80,compute_80\"
+GENCODE_SM86 ?= -gencode=arch=compute_86,code=\"sm_86,compute_86\"
+
+CUOPTS = $(GENCODE_ARCH) $(GENCODE_SM50) $(GENCODE_SM60) $(GENCODE_SM62) $(GENCODE_SM70) $(GENCODE_SM75) $(GENCODE_SM80)
+
+CC := nvcc
+
+# CUDA_PATH ?= /usr/local/cuda-10.1/
+INCLUDE := $(BASE_DIR)/../../../../cuda-samples/Common/
+LIB :=
+
+release:
+	$(CC) $(NVCC_FLAGS) $(CUOPTS) $(SRC) -o $(EXE) -I$(INCLUDE) -L$(LIB) -lcudart
+	cp $(EXE) $(BIN_DIR)
+
+clean:
+	rm -f *.o; rm -f $(EXE)
+
+run:
+	./$(EXE)
+
+profile:
+	nvprof ./$(EXE)
+
+events:
+	nvprof --events elapsed_cycles_sm ./$(EXE)
+
+profileall:
+	nvprof --concurrent-kernels off --print-gpu-trace -u us --metrics all --demangling off --csv --log-file data.csv ./$(EXE)
+
+nvsight:
+	nv-nsight-cu-cli --metrics gpc__cycles_elapsed.avg,sm__cycles_elapsed.sum,smsp__inst_executed.sum,sm__warps_active.avg.pct_of_peak_sustained_active,l1tex__t_sectors_pipe_lsu_mem_global_op_ld_lookup_hit.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_st_lookup_hit.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum,lts__t_sectors_srcunit_tex_op_read.sum,lts__t_sectors_srcunit_tex_op_write.sum,lts__t_sectors_srcunit_tex_op_read_lookup_hit.sum,lts__t_sectors_srcunit_tex_op_write_lookup_hit.sum,lts__t_sector_op_read_hit_rate.pct,lts__t_sector_op_write_hit_rate.pct,lts__t_sectors_srcunit_tex_op_read.sum.per_second,dram__sectors_read.sum,dram__sectors_write.sum,dram__bytes_read.sum --csv --page raw ./$(EXE) | tee nsight.csv
+
+ptx:
+	cuobjdump -ptx ./$(EXE) | tee ptx.txt
+
+sass:
+	cuobjdump -sass ./$(EXE) | tee sass.txt
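
The release rule above references $(SRC) and $(EXE) without defining them, so they are evidently meant to be supplied by each benchmark's own Makefile. A hypothetical per-benchmark Makefile that defines them and pulls in a shared fragment like the one above might look as follows (the source name and include path are assumptions for illustration, not taken from this commit):

# Hypothetical per-benchmark Makefile; names and the include path are illustrative only.
# Source file and binary name for this microbenchmark:
SRC = l1_bw_32f.cu
EXE = l1_bw_32f

# Default goal delegates to the shared "release" rule, which builds $(EXE)
# from $(SRC) and copies it into the common bin/ directory.
all: release

include ../../common/common.mk

Since the GENCODE_SM* flags use ?=, they can be overridden from the environment, and $(GENCODE_ARCH), which is referenced in CUOPTS but never assigned, appears to give callers a hook for adding architectures that CUOPTS does not list by default.
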
New file

Lines changed: 33 additions & 0 deletions

@@ -0,0 +1,33 @@
+// These are the configuration parameters that can be found publicly
+// Sources:
+// https://www.nvidia.com/content/dam/en-zz/Solutions/geforce/ampere/pdf/NVIDIA-ampere-GA102-GPU-Architecture-Whitepaper-V1.pdf
+// https://en.wikipedia.org/wiki/GeForce_30_series
+// https://en.wikipedia.org/wiki/CUDA
+
+#ifndef AMPERE_RTX3070_DEF_H
+#define AMPERE_RTX3070_DEF_H
+
+#include "./common/common.h"
+#include "./common/deviceQuery.h"
+
+#define L1_SIZE (192 * 1024) // Max L1 size in bytes
+
+#define CLK_FREQUENCY 1410 // frequency in MHz
+
+#define ISSUE_MODEL issue_model::single // single issue core or dual issue
+#define CORE_MODEL core_model::subcore  // subcore model or shared model
+#define DRAM_MODEL dram_model::HBM      // memory type
+#define WARP_SCHEDS_PER_SM 4            // number of warp schedulers per SM
+
+// number of SASS HMMA per 16x16 PTX WMMA for the FP16 - FP32 accumulate operation
+// see slide 22 at
+// https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf
+#define SASS_hmma_per_PTX_wmma 2
+
+// These vars are almost constant across HW generations
+// see slide 24 from Nvidia at
+// https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf
+#define L2_BANKS_PER_MEM_CHANNEL 2
+#define L2_BANK_WIDTH_in_BYTE 32
+
+#endif
