Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
161 commits
Select commit Hold shift + click to select a range
52c11fb
Merge pull request #4 from mattsinc/release
mkhairy Apr 30, 2021
abc0ba2
Updating Compilation flow for AccelWattch Validation Benchmarks
VijayKandiah Jul 27, 2021
1b0db94
Adding validation set binaries for AccelWattch HW power measurements
VijayKandiah Jul 28, 2021
ecff05d
Adding Accelwattch makefile config
VijayKandiah Jul 28, 2021
eaba928
Fixing accelwattch config bugs
VijayKandiah Jul 28, 2021
5e90823
Fixing pathfinder
Jul 28, 2021
6ae9a1e
Fixing kmeans cudasetdevice and kernel names
Jul 29, 2021
ce2c8e9
Updating cutlass makefile
VijayKandiah Jul 30, 2021
552ebd6
Updating makefile for Accelwattch
Jul 30, 2021
a4a333f
Adding Accelwattch-ubench
VijayKandiah Jul 30, 2021
1957503
Fixing accelwatch_ubench
Jul 31, 2021
3aba658
Adding License for accelwattch_ubench
VijayKandiah Jul 31, 2021
72e5b6d
Updating ubench LICENSES
VijayKandiah Aug 1, 2021
6e6548b
Update README.md
VijayKandiah Aug 1, 2021
257d310
Update README.md
VijayKandiah Aug 1, 2021
bc492b5
Update README.md
VijayKandiah Aug 1, 2021
85db8fe
Updating get_data.sh
VijayKandiah Aug 20, 2021
8e78b21
Fixing naming issues with Parboil and btree_k2
VijayKandiah Aug 21, 2021
b53114e
Removing dct.tgz
VijayKandiah Oct 9, 2021
822146b
Merge pull request #5 from VijayKandiah/release
VijayKandiah Oct 9, 2021
1861f34
dwt2d: fix memory leak in filename malloc
Nov 2, 2021
e2f523f
Merge pull request #6 from mattsinc/release
tgrogers Nov 2, 2021
9072f14
Merge branch 'dev' into release-accelwattch
JRPan Feb 16, 2022
32282bb
Merge pull request #7 from accel-sim/release-accelwattch
JRPan Feb 18, 2022
9f93567
MLPerf support RC
cesar-avalos3 Feb 20, 2022
2309942
Benchmarks should return to original expected folders after running
cesar-avalos3 Feb 21, 2022
afe1a3d
fix 4.2 in setup_environment
JRPan Feb 28, 2022
5f5d114
make cuda compute sdk
JRPan Mar 1, 2022
df234ca
Several fixes
cesar-avalos3 Mar 1, 2022
6293679
use absolute path
JRPan Mar 1, 2022
283c049
forgot to use absolute path when sdk is detected
JRPan Mar 1, 2022
1ca7809
Merge pull request #8 from cesar-avalos3/mlperf_first_integration
JRPan Mar 1, 2022
cbc1db3
Merge pull request #9 from JRPan/dev-fix-4.2
JRPan Mar 1, 2022
5eb8a49
Fix if statement to stop bash warnings
cesar-avalos3 Mar 7, 2022
67c2a98
Microbenchmarks for profiling, correlating atomic performance on QV100
abhaumick Mar 31, 2022
7f3fd71
added diverged atomic_add_bw test
abhaumick Apr 4, 2022
56a472b
Removed executable atomic_add_bw_profile from repo
abhaumick Apr 5, 2022
cf0ed9e
Restored incorrectly commented out code
abhaumick Apr 5, 2022
963cf38
Delete atomic_add_bw_conflict_profile.cu
abhaumick Apr 5, 2022
d3220a5
Merge pull request #11 from abhaumick/dev
barnes88 Apr 13, 2022
9151d37
add lineinfo flag
barnes88 Jun 16, 2022
4dd2ba9
Merge pull request #12 from barnes88/line_info
barnes88 Jun 8, 2023
00ed39b
shared memory bank conflicts bench
barnes88 Jun 12, 2023
5c5adea
Fix missing cutlass_perf_test binary
cesar-avalos3 Jul 10, 2023
eebbfbb
Merge pull request #17 from cesar-avalos3/fix-cutlass
cesar-avalos3 Jul 10, 2023
bff309d
Cutlass Version update
Shreya-gaur Jul 10, 2023
fa65ff6
Merge with the dev branch
Shreya-gaur Jul 10, 2023
b7d0ed3
Merge branch 'dev' into temp
Shreya-gaur Jul 20, 2023
b95cecc
Removing the .github folder from cutlass-bench
Shreya-gaur Jul 20, 2023
50f4eae
Merge branch 'temp' of https://github.com/Shreya-gaur/gpu-app-collect…
Shreya-gaur Jul 20, 2023
a92aaa6
update Makefile
barnes88 Aug 15, 2023
151b129
Merge pull request #16 from barnes88/bank-conflict
barnes88 Aug 15, 2023
24ab434
fix Makefile space error
barnes88 Aug 15, 2023
a8d28a2
Merge pull request #20 from barnes88/fix-spacing
barnes88 Aug 15, 2023
cb6f866
Merge pull request #10 from cesar-avalos3/mlperf_first_integration
JRPan Sep 15, 2023
e855843
Merge branch 'dev' into temp
Shreya-gaur Nov 29, 2023
7db9564
Updating cutlass to submodule format
Shreya-gaur Feb 7, 2024
e00d15b
Merge branch 'temp' of https://github.com/Shreya-gaur/gpu-app-collect…
Shreya-gaur Feb 7, 2024
60e781e
changes to the gitmodule
Shreya-gaur Feb 7, 2024
f662582
retesting
Shreya-gaur Feb 7, 2024
0c9dc3a
cutlass-bench readded
Shreya-gaur Feb 7, 2024
63f97ac
Merge pull request #18 from Shreya-gaur/temp
William-An May 7, 2024
554a552
update cutlass to build with name used in accelwattch, delete unused …
barnes88 Aug 12, 2024
051a445
Merge pull request #22 from barnes88/cutlass-build
barnes88 Aug 12, 2024
ac1a1a1
getting rid of kmeans that still relies on text API that seems to no …
tgrogers Jan 22, 2025
8ee7e7c
trying to create an automated tester
tgrogers Jan 22, 2025
90f60fe
renaming
tgrogers Jan 22, 2025
d81cc1c
does not like decimals
tgrogers Jan 22, 2025
ec83b39
setting env
tgrogers Jan 22, 2025
83ffc07
some errors in the simple script
tgrogers Jan 22, 2025
0fb7088
more apps using legacy texture stuff that cuda no longer supports aft…
tgrogers Jan 22, 2025
bfe30e7
changing the scope of the 4.2 build and doing it after all the right …
tgrogers Jan 22, 2025
40d060a
do post list
tgrogers Jan 22, 2025
ed96973
better post processing list
tgrogers Jan 22, 2025
46b927e
trying another always run block
tgrogers Jan 22, 2025
594dcea
update
tgrogers Jan 22, 2025
36558b3
dragon wants scons
tgrogers Jan 22, 2025
0ec1810
a few more packages we needed
tgrogers Jan 22, 2025
96aa197
make everything sequentially so errors are a lot easier to parse.
tgrogers Jan 22, 2025
04beffb
adding a 12.6
tgrogers Jan 22, 2025
a517e6a
scons has gotten picky about spaces...
tgrogers Jan 22, 2025
4418d11
trying another install add
tgrogers Jan 22, 2025
af9787f
some updates to the install packages needed
tgrogers Jan 22, 2025
d7a8231
forgot -y
tgrogers Jan 22, 2025
d89c54e
Installing boost, and setting the path, also fixing some odd whitespa…
tgrogers Jan 22, 2025
e7ccf04
more deps
tgrogers Jan 22, 2025
0eb6031
the CDP apps have not proved to be especially useful. Let's not worry…
tgrogers Jan 22, 2025
a2bd01b
adding CUDA 12 target
tgrogers Jan 23, 2025
f90093c
Fixing some errors introduced in newer versions of CUDA. Some thrust …
tgrogers Jan 23, 2025
f83ec7f
mummer need tex. CUDA 12 does not support the interface anymore
tgrogers Jan 23, 2025
dca88b7
Apparently parboil needs python to run?
tgrogers Jan 23, 2025
5ba1627
python just needs an alias
tgrogers Jan 23, 2025
a463b98
Who cares about 11. Let's prioritize 12.6 and use the nvidia golden t…
tgrogers Jan 23, 2025
9c6877d
We have a lot of apps in here that are not super-important and no lon…
tgrogers Jan 23, 2025
b423057
Deepbench seems to have dies on the vine. Things have moved very fast…
tgrogers Jan 23, 2025
a6ee258
misspelling
tgrogers Jan 23, 2025
8b1252e
adding in mlperf
tgrogers Jan 23, 2025
fec0850
Merge pull request #23 from accel-sim/regress
tgrogers Jan 23, 2025
3631c3b
Drop the mnist_cudnn and install cmake for cutlass
Zhaoyu-Jin Jan 23, 2025
11335df
add -y flag for apt-get install
Zhaoyu-Jin Jan 23, 2025
86fc21c
install git
Zhaoyu-Jin Jan 23, 2025
599736d
Debugging Fetal: Not a git repo error
Zhaoyu-Jin Jan 23, 2025
c38d0a4
Add pytorch example submodule
cesar-avalos3 Jan 24, 2025
1d127aa
Fix to lonestargpu
Zhaoyu-Jin Jan 24, 2025
6f78171
Install git before checkout
Zhaoyu-Jin Jan 24, 2025
2c5c61f
Mark the directory as safe
Zhaoyu-Jin Jan 24, 2025
4cd18f2
clear typo
Zhaoyu-Jin Jan 24, 2025
49355af
removing the mlperf temporarily to test the build for other applications
Zhaoyu-Jin Jan 24, 2025
81af207
Merge pull request #27 from cesar-avalos3/pytorch_examples
tgrogers Jan 25, 2025
6de1a5d
Drop some apps in custom apps that failed due to texture, update the …
Zhaoyu-Jin Jan 26, 2025
ea79c02
heterosync: switch to the latest version
Zhaoyu-Jin Jan 26, 2025
cbc8965
switch to custom docker image
Zhaoyu-Jin Jan 27, 2025
aefadf4
Integrated CUDA samples as submodule
Connie120 Jan 27, 2025
1c3f6d6
heterosync: checkout a specific commit
Zhaoyu-Jin Feb 1, 2025
ba4ec42
Merge pull request #28 from accel-sim/dev_regression_error_fix
tgrogers Feb 1, 2025
a8373f6
Merge branch 'dev' into dev
tgrogers Feb 1, 2025
36012cb
Fixed makefile
Connie120 Feb 3, 2025
0973782
Updated Makefile for cuda-samples
Connie120 Feb 4, 2025
096bf87
Merge pull request #29 from Connie120/dev
tgrogers Feb 11, 2025
e4621e2
Update test-build.sh
tgrogers Feb 16, 2025
d669e27
MLperf Inference (bert) Integration
Zhaoyu-Jin Feb 16, 2025
f12d76d
Add the README
Zhaoyu-Jin Feb 17, 2025
16ee360
Remove old submodule
cesar-avalos3 Feb 18, 2025
bf3c754
Add accel-sim inference-only pytorch examples
cesar-avalos3 Feb 18, 2025
897fee1
Create runnable scripts for pytorch_examples
cesar-avalos3 Feb 18, 2025
58c0d8b
Update to absolute path
cesar-avalos3 Feb 18, 2025
dd45bf5
Updated cuda samples submodule to tag v12.8
LAhmos Feb 18, 2025
3174886
update Makefile for cuda-samples
LAhmos Feb 18, 2025
a5a64d2
update the image to 12.8
LAhmos Feb 18, 2025
67138f5
update the print succesful app to 12.8
LAhmos Feb 18, 2025
d8f02c4
remove -j
LAhmos Feb 18, 2025
3b95e19
Merge pull request #35 from LAhmos/cuda-samples-cmake
tgrogers Feb 18, 2025
c714a8a
Merge pull request #34 from cesar-avalos3/pytorch_integration
tgrogers Feb 18, 2025
c0241b3
Merge pull request #32 from Zhaoyu-Jin/dev
tgrogers Feb 18, 2025
af0746e
Merge pull request #31 from accel-sim/tgrogers-patch-1
tgrogers Feb 18, 2025
4d26841
change make to use j for samples
LAhmos Feb 19, 2025
c56bef5
Merge pull request #36 from LAhmos/cuda-samples-cmake
tgrogers Feb 19, 2025
fc018a2
Update README.md
Zhaoyu-Jin Feb 20, 2025
259ab0b
clean up on data get
tgrogers Feb 23, 2025
a3c552f
Update cutlass to latest
tgrogers Feb 23, 2025
9c158d8
getting latest cutlass to compile
tgrogers Feb 23, 2025
947807c
make and copy the examples
tgrogers Feb 24, 2025
510ffd3
Merge pull request #37 from tgrogers/dev
tgrogers Feb 25, 2025
96dba5c
fix mlperf rerun
LAhmos Feb 27, 2025
3a2ba22
another fix
LAhmos Feb 27, 2025
ddb7cd8
Merge pull request #38 from LAhmos/mlperf_rerun
tgrogers Feb 28, 2025
7ea5ed1
fix mlperf path
LAhmos Mar 4, 2025
47b234d
Merge pull request #39 from LAhmos/mlperf_rerun
tgrogers Mar 4, 2025
49f6b21
update readme
Zhaoyu-Jin Mar 6, 2025
4c2dec4
Merge branch 'dev' of github.com:Zhaoyu-Jin/gpu-app-collection into dev
Zhaoyu-Jin Mar 6, 2025
0489d20
Merge pull request #40 from purdue-aalp/dev
tgrogers Mar 13, 2025
0ced3ba
Removed CUDA_VERSION definition from bert inference test script
Anunalla Apr 22, 2025
f0ec614
Removed CUDA_VERSION definition from bert inference test script
Anunalla Apr 22, 2025
ab3d85a
Removed CUDA_VERSION def from other scripts too
Anunalla Apr 22, 2025
2bf213c
Merge pull request #41 from purdue-aalp/fix_mlperf_cudaver_bug
LAhmos Apr 22, 2025
eb0ff5f
speed up building
LAhmos Apr 23, 2025
62f311d
add j8
LAhmos Apr 23, 2025
7de9187
add j4
LAhmos Apr 23, 2025
c97c60a
add j4
LAhmos Apr 23, 2025
f35a464
another way
LAhmos Apr 23, 2025
1997928
another way
LAhmos Apr 23, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
35 changes: 35 additions & 0 deletions .github/workflows/test-build.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
name: Test Build

# Controls when the workflow will run
on:
  # Triggers the workflow on push or pull request events but only for the mydev branch
  push:
    branches-ignore:
      - "gh-readonly-queue**"
  pull_request:

  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
  test-12-6:
    runs-on: ubuntu-latest
    container:
      image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8

    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
      - uses: actions/checkout@v4

      - name: Build Apps
        run: |
          git config --global --add safe.directory /__w/gpu-app-collection/gpu-app-collection
          /bin/bash test-build.sh

      - name: Print Successful Apps
        if: always()
        run: |
          # `wc -l` counts entries; bare `wc` would print a lines/words/bytes triple
          echo "Built `ls bin/12.8/release | wc -l` Apps:"
          ls bin/12.8/release
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ src/cuda/rodinia/3.1/cuda/nn/nn
src/cuda/rodinia/3.1/cuda/particlefilter/particlefilter_float
src/cuda/rodinia/3.1/cuda/particlefilter/particlefilter_naive
src/cuda/rodinia/3.1/cuda/pathfinder/pathfinder
4.2
10 changes: 10 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
[submodule "src/cuda/cutlass-bench/tools/external/googletest"]
path = src/cuda/cutlass-bench/tools/external/googletest
url = https://github.com/google/googletest.git
[submodule "src/cuda/cutlass-bench"]
path = src/cuda/cutlass-bench
url = https://github.com/NVIDIA/cutlass.git
[submodule "src/cuda/cuda-samples"]
path = src/cuda/cuda-samples
url = https://github.com/NVIDIA/cuda-samples.git
[submodule "src/cuda/pytorch_examples"]
path = src/cuda/pytorch_examples
url = https://github.com/accel-sim/pytorch_examples.git
branch = inference_accelsim_v2
24 changes: 24 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,27 @@ will grab this data, as well as:
```
make data
```

#### AccelWattch

The source code for the AccelWattch microbenchmarks is located at:
```
src/cuda/accelwattch-ubench
```

To compile AccelWattch Microbenchmarks:
```
make accelwattch_ubench -C ./src
```
To compile AccelWattch validation set benchmarks for simulator runs:
```
make accelwattch_validation -C ./src
```
To compile AccelWattch validation set benchmarks for power profiling of individual kernels:
```
make accelwattch_hw_power -C ./src
```
To compile everything above for AccelWattch:
```
make accelwattch -C ./src
```
4 changes: 3 additions & 1 deletion get_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@ DATA_SUBDIR="/data_dirs/"
DATA_ROOT=$BASH_ROOT$DATA_SUBDIR

# Download and unpack the benchmark data bundle only if it has not already
# been extracted into $DATA_ROOT.
if [ ! -d $DATA_ROOT ]; then
    # Skip the download when a previously fetched tarball is still present.
    # NOTE(review): the existence check looks for the tarball under $BASH_ROOT,
    # but wget saves into the current directory and tar/rm use a relative
    # path -- confirm the script always runs with cwd == $BASH_ROOT, otherwise
    # these three paths disagree and the cached tarball is never reused.
    if [ ! -f $BASH_ROOT/all.gpgpu-sim-app-data.tgz ]; then
        wget https://engineering.purdue.edu/tgrogers/gpgpu-sim/benchmark_data/all.gpgpu-sim-app-data.tgz
    fi
    tar xzvf all.gpgpu-sim-app-data.tgz -C $BASH_ROOT
    rm all.gpgpu-sim-app-data.tgz
fi
307 changes: 262 additions & 45 deletions src/Makefile

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ int main(){
gpuErrchk( cudaMemcpy(startClk, startClk_g, TOTAL_THREADS*sizeof(uint32_t), cudaMemcpyDeviceToHost) );
gpuErrchk( cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS*sizeof(uint32_t), cudaMemcpyDeviceToHost) );
gpuErrchk( cudaMemcpy(res, res_g, TOTAL_THREADS*sizeof(int32_t), cudaMemcpyDeviceToHost) );
printf("Found GPU Data Value = %d %d %d %d\n", data1[0], data1[1], data1[2], data1[3]);
printf("Found GPU Result Value = %d %d %d %d\n", res[0], res[1], res[2], res[3]);

float bw;
uint32_t total_time = *std::max_element(&stopClk[0],&stopClk[TOTAL_THREADS-1])-*std::min_element(&startClk[0],&startClk[TOTAL_THREADS-1]);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ int main(){
uint32_t *startClk = (uint32_t*) malloc(TOTAL_THREADS*sizeof(uint32_t));
uint32_t *stopClk = (uint32_t*) malloc(TOTAL_THREADS*sizeof(uint32_t));
int32_t *data1 = (int32_t*) malloc(TOTAL_THREADS*sizeof(int32_t));
//int32_t *data2 = (int32_t*) malloc(TOTAL_THREADS*sizeof(int32_t));
int32_t *data2 = (int32_t*) malloc(TOTAL_THREADS*sizeof(int32_t));
int32_t *res = (int32_t*) malloc(TOTAL_THREADS*sizeof(int32_t));

uint32_t *startClk_g;
Expand Down Expand Up @@ -84,6 +84,9 @@ int main(){
gpuErrchk( cudaMemcpy(startClk, startClk_g, TOTAL_THREADS*sizeof(uint32_t), cudaMemcpyDeviceToHost) );
gpuErrchk( cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS*sizeof(uint32_t), cudaMemcpyDeviceToHost) );
gpuErrchk( cudaMemcpy(res, res_g, TOTAL_THREADS*sizeof(int32_t), cudaMemcpyDeviceToHost) );
gpuErrchk( cudaMemcpy(data2, data1_g, TOTAL_THREADS*sizeof(int32_t), cudaMemcpyDeviceToHost) );
printf("Found GPU Data Value = %d %d %d %d\n", data2[0], data2[1], data2[2], data2[3]);
printf("Found GPU Result Value = %d %d %d %d\n", res[0], res[1], res[2], res[3]);

float bw;
uint32_t total_time = *std::max_element(&stopClk[0],&stopClk[TOTAL_THREADS-1])-*std::min_element(&startClk[0],&startClk[TOTAL_THREADS-1]);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Build SASS + embedded PTX for each target architecture so the binary can
# run on Maxwell (SM50), Pascal (SM61), and Volta (SM70) GPUs.
GENCODE_SM50 := -gencode=arch=compute_50,code=\"sm_50,compute_50\"
GENCODE_SM61 := -gencode=arch=compute_61,code=\"sm_61,compute_61\"
GENCODE_SM70 := -gencode=arch=compute_70,code=\"sm_70,compute_70\"
GENCODE_SM75 := -gencode=arch=compute_75,code=\"sm_75,compute_75\"

CUOPTS = $(GENCODE_SM50) $(GENCODE_SM61) $(GENCODE_SM70)

CC := nvcc

# Optional extra search paths. They are only passed to nvcc when non-empty:
# with the previous unconditional "-I$(INCLUDE) -L$(LIB)" an empty variable
# produced a bare -I / -L that consumed the next command-line argument.
INCLUDE :=
LIB :=

SRC = atomic_add_bw_diverge.cu

EXE = atomic_add_bw_diverge

release:
	$(CC) $(CUOPTS) $(SRC) -o $(EXE) $(if $(INCLUDE),-I$(INCLUDE)) $(if $(LIB),-L$(LIB)) -lcudart

clean:
	rm -f *.o; rm -f $(EXE)

run:
	./$(EXE)

profile:
	nvprof ./$(EXE)

events:
	nvprof --events elapsed_cycles_sm ./$(EXE)
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <iostream>
#include <algorithm>

#define THREADS_PER_BLOCK 1024
#define THREADS_PER_SM 2048
#define BLOCKS_NUM 160
#define TOTAL_THREADS (THREADS_PER_BLOCK*BLOCKS_NUM)
#define WARP_SIZE 32
#define REPEAT_TIMES 1

#define CONFLICT_COUNT 1 // Must be between 1 to 16

// GPU error check: wrap every CUDA runtime call with gpuErrchk(...) so a
// failure reports the CUDA error string plus the source location, and
// (by default) terminates the process with the error code.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){
    if (code == cudaSuccess)
        return;
    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}


// Contended-atomic timing kernel. Within each 32-lane warp, only the first
// ConflictCount lanes (gid % 32 < ConflictCount) issue atomicAdd, and every
// issuing lane targets the SAME address (&data1[atomic_loc], atomic_loc == 0),
// so the atomics conflict and serialize.
// Expects a 1-D launch; startClk/stopClk/res must each hold one element per
// launched thread.
// NOTE(review): the name "max_flops" looks inherited from an FLOP
// microbenchmark template -- this kernel actually measures atomic-add
// conflict latency, not FLOPs.
template <class T>
__global__ void max_flops(uint32_t *startClk, uint32_t *stopClk, T *data1, T *res, uint32_t ConflictCount) {
    int gid = blockIdx.x*blockDim.x + threadIdx.x;
    //register T s1 = data1[gid];
    //register T s2 = data2[gid];
    //register T result = 0;

    // All issuing lanes hit index 0, forcing address conflicts.
    register int atomic_loc = 0;

    // synchronize all threads
    asm volatile ("bar.sync 0;");

    // start timing -- %clock is the per-SM cycle counter
    uint32_t start = 0;
    asm volatile ("mov.u32 %0, %%clock;" : "=r"(start) :: "memory");

    // Only the first ConflictCount lanes of each warp perform atomics;
    // the remaining lanes fall through to the barrier below.
    if ((gid % 32) < ConflictCount) {
        for (int j=0 ; j<REPEAT_TIMES ; ++j) {
            atomicAdd(&data1[atomic_loc], 10);
        }

    }
    // synchronize all threads
    asm volatile("bar.sync 0;");

    // stop timing
    uint32_t stop = 0;
    asm volatile("mov.u32 %0, %%clock;" : "=r"(stop) :: "memory");

    // write time and data back to memory; res records the contended value
    // so the atomic traffic cannot be optimized away
    startClk[gid] = start;
    stopClk[gid] = stop;
    res[gid] = data1[0];
}

// Host driver: parses the conflict count, launches the timing kernel, and
// reports the observed atomic-add bandwidth in bytes per clock.
int main(int argc, char ** argv){
    uint32_t *startClk = (uint32_t*) malloc(TOTAL_THREADS*sizeof(uint32_t));
    uint32_t *stopClk = (uint32_t*) malloc(TOTAL_THREADS*sizeof(uint32_t));
    int32_t *data1 = (int32_t*) malloc(TOTAL_THREADS*sizeof(int32_t));
    // Host buffer for copying the device-side data1 back after the kernel.
    int32_t *data2 = (int32_t*) malloc(TOTAL_THREADS*sizeof(int32_t));
    int32_t *res = (int32_t*) malloc(TOTAL_THREADS*sizeof(int32_t));

    uint32_t *startClk_g;
    uint32_t *stopClk_g;
    int32_t *data1_g;
    int32_t *res_g;

    // Extract and validate cmdline args.
    uint32_t ConflictCount = 0;
    if (argc < 2) {
        printf("Usage : atomics_add_bw_profile [# Conflict Atomics] \n");
        printf(" [# Diverged Atomics] must be between 1 and 16 \n");
        return -1;
    }
    ConflictCount = atoi(argv[1]);
    // Enforce the documented 1..16 range; previously any value (including
    // negative, which wraps to a huge unsigned) was silently accepted and
    // skewed the measurement.
    if (ConflictCount < 1 || ConflictCount > 16) {
        printf(" [# Diverged Atomics] must be between 1 and 16 \n");
        return -1;
    }
    printf(" Atomic : %d, Diverged %d \n", ConflictCount, 32 - ConflictCount);

    for (uint32_t i=0; i<TOTAL_THREADS; i++) {
        data1[i] = (int32_t)i;
    }

    gpuErrchk( cudaMalloc(&startClk_g, TOTAL_THREADS*sizeof(uint32_t)) );
    gpuErrchk( cudaMalloc(&stopClk_g, TOTAL_THREADS*sizeof(uint32_t)) );
    gpuErrchk( cudaMalloc(&data1_g, TOTAL_THREADS*sizeof(int32_t)) );
    gpuErrchk( cudaMalloc(&res_g, TOTAL_THREADS*sizeof(int32_t)) );

    gpuErrchk( cudaMemcpy(data1_g, data1, TOTAL_THREADS*sizeof(int32_t), cudaMemcpyHostToDevice) );

    max_flops<int32_t><<<BLOCKS_NUM,THREADS_PER_BLOCK>>>(startClk_g, stopClk_g, data1_g, res_g, ConflictCount);
    gpuErrchk( cudaPeekAtLastError() );

    // Blocking copies below also synchronize with the kernel.
    gpuErrchk( cudaMemcpy(startClk, startClk_g, TOTAL_THREADS*sizeof(uint32_t), cudaMemcpyDeviceToHost) );
    gpuErrchk( cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS*sizeof(uint32_t), cudaMemcpyDeviceToHost) );
    gpuErrchk( cudaMemcpy(res, res_g, TOTAL_THREADS*sizeof(int32_t), cudaMemcpyDeviceToHost) );
    gpuErrchk( cudaMemcpy(data2, data1_g, TOTAL_THREADS*sizeof(int32_t), cudaMemcpyDeviceToHost) );
    printf("Found GPU Data Value = %d %d %d %d\n", data2[0], data2[1], data2[2], data2[3]);
    printf("Found GPU Result Value = %d %d %d %d\n", res[0], res[1], res[2], res[3]);

    // Bug fix: the previous end iterators (&arr[TOTAL_THREADS-1]) excluded the
    // last thread's timestamp from both the max and min scans.
    float bw;
    uint32_t total_time = *std::max_element(stopClk, stopClk + TOTAL_THREADS)
                        - *std::min_element(startClk, startClk + TOTAL_THREADS);
    // NOTE(review): this counts 4 bytes for every launched thread even though
    // only ConflictCount lanes per warp issue atomics, and it assumes clocks
    // are comparable across SMs -- confirm the intended bandwidth definition.
    bw = ((float)(REPEAT_TIMES*TOTAL_THREADS*4)/(float)(total_time));
    printf("int32 bandwidth = %f (byte/clk)\n", bw);
    printf("Total Clk number = %u \n", total_time);

    // Release device and host allocations (previously leaked).
    cudaFree(startClk_g);
    cudaFree(stopClk_g);
    cudaFree(data1_g);
    cudaFree(res_g);
    free(startClk);
    free(stopClk);
    free(data1);
    free(data2);
    free(res);

    return 0;
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Build SASS + embedded PTX for each target architecture so the binary can
# run on Maxwell (SM50), Pascal (SM61), and Volta (SM70) GPUs.
GENCODE_SM50 := -gencode=arch=compute_50,code=\"sm_50,compute_50\"
GENCODE_SM61 := -gencode=arch=compute_61,code=\"sm_61,compute_61\"
GENCODE_SM70 := -gencode=arch=compute_70,code=\"sm_70,compute_70\"
GENCODE_SM75 := -gencode=arch=compute_75,code=\"sm_75,compute_75\"

CUOPTS = $(GENCODE_SM50) $(GENCODE_SM61) $(GENCODE_SM70)

CC := nvcc

# Optional extra search paths. They are only passed to nvcc when non-empty:
# with the previous unconditional "-I$(INCLUDE) -L$(LIB)" an empty variable
# produced a bare -I / -L that consumed the next command-line argument.
INCLUDE :=
LIB :=

SRC = atomic_add_bw_profile.cu

EXE = atomic_add_bw_profile

release:
	$(CC) $(CUOPTS) $(SRC) -o $(EXE) $(if $(INCLUDE),-I$(INCLUDE)) $(if $(LIB),-L$(LIB)) -lcudart

clean:
	rm -f *.o; rm -f $(EXE)

run:
	./$(EXE)

profile:
	nvprof ./$(EXE)

events:
	nvprof --events elapsed_cycles_sm ./$(EXE)
Loading