Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
161 commits
Select commit Hold shift + click to select a range
52c11fb
Merge pull request #4 from mattsinc/release
mkhairy Apr 30, 2021
abc0ba2
Updating Compilation flow for AccelWattch Validation Benchmarks
VijayKandiah Jul 27, 2021
1b0db94
Adding validation set binaries for AccelWattch HW power measurements
VijayKandiah Jul 28, 2021
ecff05d
Adding Accelwattch makefile config
VijayKandiah Jul 28, 2021
eaba928
Fixing accelwattch config bugs
VijayKandiah Jul 28, 2021
5e90823
Fixing pathfinder
Jul 28, 2021
6ae9a1e
Fixing kmeans cudasetdevice and kernel names
Jul 29, 2021
ce2c8e9
Updating cutlass makefile
VijayKandiah Jul 30, 2021
552ebd6
Updating makefile for Accelwattch
Jul 30, 2021
a4a333f
Adding Accelwattch-ubench
VijayKandiah Jul 30, 2021
1957503
Fixing accelwatch_ubench
Jul 31, 2021
3aba658
Adding License for accelwattch_ubench
VijayKandiah Jul 31, 2021
72e5b6d
Updating ubench LICENSES
VijayKandiah Aug 1, 2021
6e6548b
Update README.md
VijayKandiah Aug 1, 2021
257d310
Update README.md
VijayKandiah Aug 1, 2021
bc492b5
Update README.md
VijayKandiah Aug 1, 2021
85db8fe
Updating get_data.sh
VijayKandiah Aug 20, 2021
8e78b21
Fixing naming issues with Parboil and btree_k2
VijayKandiah Aug 21, 2021
b53114e
Removing dct.tgz
VijayKandiah Oct 9, 2021
822146b
Merge pull request #5 from VijayKandiah/release
VijayKandiah Oct 9, 2021
1861f34
dwt2d: fix memory leak in filename malloc
Nov 2, 2021
e2f523f
Merge pull request #6 from mattsinc/release
tgrogers Nov 2, 2021
9072f14
Merge branch 'dev' into release-accelwattch
JRPan Feb 16, 2022
32282bb
Merge pull request #7 from accel-sim/release-accelwattch
JRPan Feb 18, 2022
9f93567
MLPerf support RC
cesar-avalos3 Feb 20, 2022
2309942
Benchmarks should return to original expected folders after running
cesar-avalos3 Feb 21, 2022
afe1a3d
fix 4.2 in setup_environment
JRPan Feb 28, 2022
5f5d114
make cuda compute sdk
JRPan Mar 1, 2022
df234ca
Several fixes
cesar-avalos3 Mar 1, 2022
6293679
use absolute path
JRPan Mar 1, 2022
283c049
forgot to use absolute path when sdk is detected
JRPan Mar 1, 2022
1ca7809
Merge pull request #8 from cesar-avalos3/mlperf_first_integration
JRPan Mar 1, 2022
cbc1db3
Merge pull request #9 from JRPan/dev-fix-4.2
JRPan Mar 1, 2022
5eb8a49
Fix if statement to stop bash warnings
cesar-avalos3 Mar 7, 2022
67c2a98
Microbenchmarks for profiling, correlating atomic performance on QV100
abhaumick Mar 31, 2022
7f3fd71
added diverged atomic_add_bw test
abhaumick Apr 4, 2022
56a472b
Removed executable atomic_add_bw_profile from repo
abhaumick Apr 5, 2022
cf0ed9e
Restored incorrectly commented out code
abhaumick Apr 5, 2022
963cf38
Delete atomic_add_bw_conflict_profile.cu
abhaumick Apr 5, 2022
d3220a5
Merge pull request #11 from abhaumick/dev
barnes88 Apr 13, 2022
9151d37
add lineinfo flag
barnes88 Jun 16, 2022
4dd2ba9
Merge pull request #12 from barnes88/line_info
barnes88 Jun 8, 2023
00ed39b
shared memory bank conflicts bench
barnes88 Jun 12, 2023
5c5adea
Fix missing cutlass_perf_test binary
cesar-avalos3 Jul 10, 2023
eebbfbb
Merge pull request #17 from cesar-avalos3/fix-cutlass
cesar-avalos3 Jul 10, 2023
bff309d
Cutlass Version update
Shreya-gaur Jul 10, 2023
fa65ff6
Merge with the dev branch
Shreya-gaur Jul 10, 2023
b7d0ed3
Merge branch 'dev' into temp
Shreya-gaur Jul 20, 2023
b95cecc
Removing the .github folder from cutlass-bench
Shreya-gaur Jul 20, 2023
50f4eae
Merge branch 'temp' of https://github.com/Shreya-gaur/gpu-app-collect…
Shreya-gaur Jul 20, 2023
a92aaa6
update Makefile
barnes88 Aug 15, 2023
151b129
Merge pull request #16 from barnes88/bank-conflict
barnes88 Aug 15, 2023
24ab434
fix Makefile space error
barnes88 Aug 15, 2023
a8d28a2
Merge pull request #20 from barnes88/fix-spacing
barnes88 Aug 15, 2023
cb6f866
Merge pull request #10 from cesar-avalos3/mlperf_first_integration
JRPan Sep 15, 2023
e855843
Merge branch 'dev' into temp
Shreya-gaur Nov 29, 2023
7db9564
Updating cutlass to submodule format
Shreya-gaur Feb 7, 2024
e00d15b
Merge branch 'temp' of https://github.com/Shreya-gaur/gpu-app-collect…
Shreya-gaur Feb 7, 2024
60e781e
changes to the gitmodule
Shreya-gaur Feb 7, 2024
f662582
retesting
Shreya-gaur Feb 7, 2024
0c9dc3a
cutlass-bench readded
Shreya-gaur Feb 7, 2024
63f97ac
Merge pull request #18 from Shreya-gaur/temp
William-An May 7, 2024
554a552
update cutlass to build with name used in accelwattch, delete unused …
barnes88 Aug 12, 2024
051a445
Merge pull request #22 from barnes88/cutlass-build
barnes88 Aug 12, 2024
ac1a1a1
getting rid of kmeans that still relies on text API that seems to no …
tgrogers Jan 22, 2025
8ee7e7c
trying to create an automated tester
tgrogers Jan 22, 2025
90f60fe
renaming
tgrogers Jan 22, 2025
d81cc1c
does not like decimals
tgrogers Jan 22, 2025
ec83b39
setting env
tgrogers Jan 22, 2025
83ffc07
some errors in the simple script
tgrogers Jan 22, 2025
0fb7088
more apps using legacy texture stuff that cuda no longer supports aft…
tgrogers Jan 22, 2025
bfe30e7
changing the scope of the 4.2 build and doing it after all the right …
tgrogers Jan 22, 2025
40d060a
do post list
tgrogers Jan 22, 2025
ed96973
better post processing list
tgrogers Jan 22, 2025
46b927e
trying another always run block
tgrogers Jan 22, 2025
594dcea
update
tgrogers Jan 22, 2025
36558b3
dragon wants scons
tgrogers Jan 22, 2025
0ec1810
a few more packages we needed
tgrogers Jan 22, 2025
96aa197
make everything sequentially so errors are a lot easier to parse.
tgrogers Jan 22, 2025
04beffb
adding a 12.6
tgrogers Jan 22, 2025
a517e6a
scons has gotten picky about spaces...
tgrogers Jan 22, 2025
4418d11
trying another install add
tgrogers Jan 22, 2025
af9787f
some updates to the install packages needed
tgrogers Jan 22, 2025
d7a8231
forgot -y
tgrogers Jan 22, 2025
d89c54e
Installing boost, and setting the path, also fixing some odd whitespa…
tgrogers Jan 22, 2025
e7ccf04
more deps
tgrogers Jan 22, 2025
0eb6031
the CDP apps have not proved to be especially useful. Let's not worry…
tgrogers Jan 22, 2025
a2bd01b
adding CUDA 12 target
tgrogers Jan 23, 2025
f90093c
Fixing some errors introduced in newer versions of CUDA. Some thrust …
tgrogers Jan 23, 2025
f83ec7f
mummer need tex. CUDA 12 does not support the interface anymore
tgrogers Jan 23, 2025
dca88b7
Apparently parboil needs python to run?
tgrogers Jan 23, 2025
5ba1627
python just needs an alias
tgrogers Jan 23, 2025
a463b98
Who cares about 11. Let's prioritize 12.6 and use the nvidia golden t…
tgrogers Jan 23, 2025
9c6877d
We have a lot of apps in here that are not super-important and no lon…
tgrogers Jan 23, 2025
b423057
Deepbench seems to have dies on the vine. Things have moved very fast…
tgrogers Jan 23, 2025
a6ee258
misspelling
tgrogers Jan 23, 2025
8b1252e
adding in mlperf
tgrogers Jan 23, 2025
fec0850
Merge pull request #23 from accel-sim/regress
tgrogers Jan 23, 2025
3631c3b
Drop the mnist_cudnn and install cmake for cutlass
Zhaoyu-Jin Jan 23, 2025
11335df
add -y flag for apt-get install
Zhaoyu-Jin Jan 23, 2025
86fc21c
install git
Zhaoyu-Jin Jan 23, 2025
599736d
Debugging Fetal: Not a git repo error
Zhaoyu-Jin Jan 23, 2025
c38d0a4
Add pytorch example submodule
cesar-avalos3 Jan 24, 2025
1d127aa
Fix to lonestargpu
Zhaoyu-Jin Jan 24, 2025
6f78171
Install git before checkout
Zhaoyu-Jin Jan 24, 2025
2c5c61f
Mark the directory as safe
Zhaoyu-Jin Jan 24, 2025
4cd18f2
clear typo
Zhaoyu-Jin Jan 24, 2025
49355af
removing the mlperf temporarily to test the build for other applications
Zhaoyu-Jin Jan 24, 2025
81af207
Merge pull request #27 from cesar-avalos3/pytorch_examples
tgrogers Jan 25, 2025
6de1a5d
Drop some apps in custom apps that failed due to texture, update the …
Zhaoyu-Jin Jan 26, 2025
ea79c02
heterosync: switch to the latest version
Zhaoyu-Jin Jan 26, 2025
cbc8965
switch to custom docker image
Zhaoyu-Jin Jan 27, 2025
aefadf4
Integrated CUDA samples as submodule
Connie120 Jan 27, 2025
1c3f6d6
heterosync: checkout a specific commit
Zhaoyu-Jin Feb 1, 2025
ba4ec42
Merge pull request #28 from accel-sim/dev_regression_error_fix
tgrogers Feb 1, 2025
a8373f6
Merge branch 'dev' into dev
tgrogers Feb 1, 2025
36012cb
Fixed makefile
Connie120 Feb 3, 2025
0973782
Updated Makefile for cuda-samples
Connie120 Feb 4, 2025
096bf87
Merge pull request #29 from Connie120/dev
tgrogers Feb 11, 2025
e4621e2
Update test-build.sh
tgrogers Feb 16, 2025
d669e27
MLperf Inference (bert) Integration
Zhaoyu-Jin Feb 16, 2025
f12d76d
Add the README
Zhaoyu-Jin Feb 17, 2025
16ee360
Remove old submodule
cesar-avalos3 Feb 18, 2025
bf3c754
Add accel-sim inference-only pytorch examples
cesar-avalos3 Feb 18, 2025
897fee1
Create runnable scripts for pytorch_examples
cesar-avalos3 Feb 18, 2025
58c0d8b
Update to absolute path
cesar-avalos3 Feb 18, 2025
dd45bf5
Updated cuda samples submodule to tag v12.8
LAhmos Feb 18, 2025
3174886
update Makefile for cuda-samples
LAhmos Feb 18, 2025
a5a64d2
update the image to 12.8
LAhmos Feb 18, 2025
67138f5
update the print succesful app to 12.8
LAhmos Feb 18, 2025
d8f02c4
remove -j
LAhmos Feb 18, 2025
3b95e19
Merge pull request #35 from LAhmos/cuda-samples-cmake
tgrogers Feb 18, 2025
c714a8a
Merge pull request #34 from cesar-avalos3/pytorch_integration
tgrogers Feb 18, 2025
c0241b3
Merge pull request #32 from Zhaoyu-Jin/dev
tgrogers Feb 18, 2025
af0746e
Merge pull request #31 from accel-sim/tgrogers-patch-1
tgrogers Feb 18, 2025
4d26841
change make to use j for samples
LAhmos Feb 19, 2025
c56bef5
Merge pull request #36 from LAhmos/cuda-samples-cmake
tgrogers Feb 19, 2025
fc018a2
Update README.md
Zhaoyu-Jin Feb 20, 2025
259ab0b
clean up on data get
tgrogers Feb 23, 2025
a3c552f
Update cutlass to latest
tgrogers Feb 23, 2025
9c158d8
getting latest cutlass to compile
tgrogers Feb 23, 2025
947807c
make and copy the examples
tgrogers Feb 24, 2025
510ffd3
Merge pull request #37 from tgrogers/dev
tgrogers Feb 25, 2025
96dba5c
fix mlperf rerun
LAhmos Feb 27, 2025
3a2ba22
another fix
LAhmos Feb 27, 2025
ddb7cd8
Merge pull request #38 from LAhmos/mlperf_rerun
tgrogers Feb 28, 2025
7ea5ed1
fix mlperf path
LAhmos Mar 4, 2025
47b234d
Merge pull request #39 from LAhmos/mlperf_rerun
tgrogers Mar 4, 2025
49f6b21
update readme
Zhaoyu-Jin Mar 6, 2025
4c2dec4
Merge branch 'dev' of github.com:Zhaoyu-Jin/gpu-app-collection into dev
Zhaoyu-Jin Mar 6, 2025
0489d20
Merge pull request #40 from purdue-aalp/dev
tgrogers Mar 13, 2025
0ced3ba
Removed CUDA_VERSION definition from bert inference test script
Anunalla Apr 22, 2025
f0ec614
Removed CUDA_VERSION definition from bert inference test script
Anunalla Apr 22, 2025
ab3d85a
Removed CUDA_VERSION def from other scripts too
Anunalla Apr 22, 2025
2bf213c
Merge pull request #41 from purdue-aalp/fix_mlperf_cudaver_bug
LAhmos Apr 22, 2025
eb0ff5f
speed up building
LAhmos Apr 23, 2025
62f311d
add j8
LAhmos Apr 23, 2025
7de9187
add j4
LAhmos Apr 23, 2025
c97c60a
add j4
LAhmos Apr 23, 2025
f35a464
another way
LAhmos Apr 23, 2025
1997928
another way
LAhmos Apr 23, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
35 changes: 35 additions & 0 deletions .github/workflows/test-build.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
name: Test Build

# Controls when the workflow will run
on:
  # Triggers the workflow on push or pull request events but only for the mydev branch
  push:
    branches-ignore:
      - "gh-readonly-queue**"
  pull_request:

  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
  test-12-6:
    runs-on: ubuntu-latest
    container:
      image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8

    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
      - uses: actions/checkout@v4

      - name: Build Apps
        run: |
          git config --global --add safe.directory /__w/gpu-app-collection/gpu-app-collection
          /bin/bash test-build.sh

      - name: Print Successful Apps
        if: always()
        run: |
          # `wc -l` counts entries; bare `wc` would print a lines/words/bytes triple
          echo "Built `ls bin/12.8/release | wc -l` Apps:"
          ls bin/12.8/release
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ src/cuda/rodinia/3.1/cuda/nn/nn
src/cuda/rodinia/3.1/cuda/particlefilter/particlefilter_float
src/cuda/rodinia/3.1/cuda/particlefilter/particlefilter_naive
src/cuda/rodinia/3.1/cuda/pathfinder/pathfinder
4.2
10 changes: 10 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
[submodule "src/cuda/cutlass-bench/tools/external/googletest"]
path = src/cuda/cutlass-bench/tools/external/googletest
url = https://github.com/google/googletest.git
[submodule "src/cuda/cutlass-bench"]
path = src/cuda/cutlass-bench
url = https://github.com/NVIDIA/cutlass.git
[submodule "src/cuda/cuda-samples"]
path = src/cuda/cuda-samples
url = https://github.com/NVIDIA/cuda-samples.git
[submodule "src/cuda/pytorch_examples"]
path = src/cuda/pytorch_examples
url = https://github.com/accel-sim/pytorch_examples.git
branch = inference_accelsim_v2
24 changes: 24 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,27 @@ will grab this data, as well as:
```
make data
```

#### AccelWattch

The source code for the AccelWattch microbenchmarks is located at:
```
src/cuda/accelwattch-ubench
```

To compile AccelWattch Microbenchmarks:
```
make accelwattch_ubench -C ./src
```
To compile AccelWattch validation set benchmarks for simulator runs:
```
make accelwattch_validation -C ./src
```
To compile AccelWattch validation set benchmarks for power profiling of individual kernels:
```
make accelwattch_hw_power -C ./src
```
To compile everything above for AccelWattch:
```
make accelwattch -C ./src
```
4 changes: 3 additions & 1 deletion get_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@ DATA_SUBDIR="/data_dirs/"
DATA_ROOT=$BASH_ROOT$DATA_SUBDIR

# Download and unpack the benchmark data bundle only if it has not already
# been extracted into $DATA_ROOT.
if [ ! -d $DATA_ROOT ]; then
    # Skip the download when a previously fetched tarball is still present.
    # NOTE(review): the existence check looks for the tarball under $BASH_ROOT,
    # but wget saves into the current directory and tar/rm use a relative
    # path -- confirm the script always runs with cwd == $BASH_ROOT, otherwise
    # these three paths disagree and the cached tarball is never reused.
    if [ ! -f $BASH_ROOT/all.gpgpu-sim-app-data.tgz ]; then
        wget https://engineering.purdue.edu/tgrogers/gpgpu-sim/benchmark_data/all.gpgpu-sim-app-data.tgz
    fi
    tar xzvf all.gpgpu-sim-app-data.tgz -C $BASH_ROOT
    rm all.gpgpu-sim-app-data.tgz
fi
307 changes: 262 additions & 45 deletions src/Makefile

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ int main(){
gpuErrchk( cudaMemcpy(startClk, startClk_g, TOTAL_THREADS*sizeof(uint32_t), cudaMemcpyDeviceToHost) );
gpuErrchk( cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS*sizeof(uint32_t), cudaMemcpyDeviceToHost) );
gpuErrchk( cudaMemcpy(res, res_g, TOTAL_THREADS*sizeof(int32_t), cudaMemcpyDeviceToHost) );
printf("Found GPU Data Value = %d %d %d %d\n", data1[0], data1[1], data1[2], data1[3]);
printf("Found GPU Result Value = %d %d %d %d\n", res[0], res[1], res[2], res[3]);

float bw;
uint32_t total_time = *std::max_element(&stopClk[0],&stopClk[TOTAL_THREADS-1])-*std::min_element(&startClk[0],&startClk[TOTAL_THREADS-1]);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ int main(){
uint32_t *startClk = (uint32_t*) malloc(TOTAL_THREADS*sizeof(uint32_t));
uint32_t *stopClk = (uint32_t*) malloc(TOTAL_THREADS*sizeof(uint32_t));
int32_t *data1 = (int32_t*) malloc(TOTAL_THREADS*sizeof(int32_t));
//int32_t *data2 = (int32_t*) malloc(TOTAL_THREADS*sizeof(int32_t));
int32_t *data2 = (int32_t*) malloc(TOTAL_THREADS*sizeof(int32_t));
int32_t *res = (int32_t*) malloc(TOTAL_THREADS*sizeof(int32_t));

uint32_t *startClk_g;
Expand Down Expand Up @@ -84,6 +84,9 @@ int main(){
gpuErrchk( cudaMemcpy(startClk, startClk_g, TOTAL_THREADS*sizeof(uint32_t), cudaMemcpyDeviceToHost) );
gpuErrchk( cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS*sizeof(uint32_t), cudaMemcpyDeviceToHost) );
gpuErrchk( cudaMemcpy(res, res_g, TOTAL_THREADS*sizeof(int32_t), cudaMemcpyDeviceToHost) );
gpuErrchk( cudaMemcpy(data2, data1_g, TOTAL_THREADS*sizeof(int32_t), cudaMemcpyDeviceToHost) );
printf("Found GPU Data Value = %d %d %d %d\n", data2[0], data2[1], data2[2], data2[3]);
printf("Found GPU Result Value = %d %d %d %d\n", res[0], res[1], res[2], res[3]);

float bw;
uint32_t total_time = *std::max_element(&stopClk[0],&stopClk[TOTAL_THREADS-1])-*std::min_element(&startClk[0],&startClk[TOTAL_THREADS-1]);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Build SASS + embedded PTX for each target architecture so the binary can
# run on Maxwell (SM50), Pascal (SM61), and Volta (SM70) GPUs.
GENCODE_SM50 := -gencode=arch=compute_50,code=\"sm_50,compute_50\"
GENCODE_SM61 := -gencode=arch=compute_61,code=\"sm_61,compute_61\"
GENCODE_SM70 := -gencode=arch=compute_70,code=\"sm_70,compute_70\"
GENCODE_SM75 := -gencode=arch=compute_75,code=\"sm_75,compute_75\"

CUOPTS = $(GENCODE_SM50) $(GENCODE_SM61) $(GENCODE_SM70)

CC := nvcc

# Optional extra search paths. They are only passed to nvcc when non-empty:
# with the previous unconditional "-I$(INCLUDE) -L$(LIB)" an empty variable
# produced a bare -I / -L that consumed the next command-line argument.
INCLUDE :=
LIB :=

SRC = atomic_add_bw_diverge.cu

EXE = atomic_add_bw_diverge

release:
	$(CC) $(CUOPTS) $(SRC) -o $(EXE) $(if $(INCLUDE),-I$(INCLUDE)) $(if $(LIB),-L$(LIB)) -lcudart

clean:
	rm -f *.o; rm -f $(EXE)

run:
	./$(EXE)

profile:
	nvprof ./$(EXE)

events:
	nvprof --events elapsed_cycles_sm ./$(EXE)
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <iostream>
#include <algorithm>

#define THREADS_PER_BLOCK 1024
#define THREADS_PER_SM 2048
#define BLOCKS_NUM 160
#define TOTAL_THREADS (THREADS_PER_BLOCK*BLOCKS_NUM)
#define WARP_SIZE 32
#define REPEAT_TIMES 1

#define CONFLICT_COUNT 1 // Must be between 1 to 16

// GPU error check: wrap every CUDA runtime call with gpuErrchk(...) so a
// failure reports the CUDA error string plus the source location, and
// (by default) terminates the process with the error code.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){
    if (code == cudaSuccess)
        return;
    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}


// Contended-atomic timing kernel. Within each 32-lane warp, only the first
// ConflictCount lanes (gid % 32 < ConflictCount) issue atomicAdd, and every
// issuing lane targets the SAME address (&data1[atomic_loc], atomic_loc == 0),
// so the atomics conflict and serialize.
// Expects a 1-D launch; startClk/stopClk/res must each hold one element per
// launched thread.
// NOTE(review): the name "max_flops" looks inherited from an FLOP
// microbenchmark template -- this kernel actually measures atomic-add
// conflict latency, not FLOPs.
template <class T>
__global__ void max_flops(uint32_t *startClk, uint32_t *stopClk, T *data1, T *res, uint32_t ConflictCount) {
    int gid = blockIdx.x*blockDim.x + threadIdx.x;
    //register T s1 = data1[gid];
    //register T s2 = data2[gid];
    //register T result = 0;

    // All issuing lanes hit index 0, forcing address conflicts.
    register int atomic_loc = 0;

    // synchronize all threads
    asm volatile ("bar.sync 0;");

    // start timing -- %clock is the per-SM cycle counter
    uint32_t start = 0;
    asm volatile ("mov.u32 %0, %%clock;" : "=r"(start) :: "memory");

    // Only the first ConflictCount lanes of each warp perform atomics;
    // the remaining lanes fall through to the barrier below.
    if ((gid % 32) < ConflictCount) {
        for (int j=0 ; j<REPEAT_TIMES ; ++j) {
            atomicAdd(&data1[atomic_loc], 10);
        }

    }
    // synchronize all threads
    asm volatile("bar.sync 0;");

    // stop timing
    uint32_t stop = 0;
    asm volatile("mov.u32 %0, %%clock;" : "=r"(stop) :: "memory");

    // write time and data back to memory; res records the contended value
    // so the atomic traffic cannot be optimized away
    startClk[gid] = start;
    stopClk[gid] = stop;
    res[gid] = data1[0];
}

// Host driver: parses the conflict count, launches the timing kernel, and
// reports the observed atomic-add bandwidth in bytes per clock.
int main(int argc, char ** argv){
    uint32_t *startClk = (uint32_t*) malloc(TOTAL_THREADS*sizeof(uint32_t));
    uint32_t *stopClk = (uint32_t*) malloc(TOTAL_THREADS*sizeof(uint32_t));
    int32_t *data1 = (int32_t*) malloc(TOTAL_THREADS*sizeof(int32_t));
    // Host buffer for copying the device-side data1 back after the kernel.
    int32_t *data2 = (int32_t*) malloc(TOTAL_THREADS*sizeof(int32_t));
    int32_t *res = (int32_t*) malloc(TOTAL_THREADS*sizeof(int32_t));

    uint32_t *startClk_g;
    uint32_t *stopClk_g;
    int32_t *data1_g;
    int32_t *res_g;

    // Extract and validate cmdline args.
    uint32_t ConflictCount = 0;
    if (argc < 2) {
        printf("Usage : atomics_add_bw_profile [# Conflict Atomics] \n");
        printf(" [# Diverged Atomics] must be between 1 and 16 \n");
        return -1;
    }
    ConflictCount = atoi(argv[1]);
    // Enforce the documented 1..16 range; previously any value (including
    // negative, which wraps to a huge unsigned) was silently accepted and
    // skewed the measurement.
    if (ConflictCount < 1 || ConflictCount > 16) {
        printf(" [# Diverged Atomics] must be between 1 and 16 \n");
        return -1;
    }
    printf(" Atomic : %d, Diverged %d \n", ConflictCount, 32 - ConflictCount);

    for (uint32_t i=0; i<TOTAL_THREADS; i++) {
        data1[i] = (int32_t)i;
    }

    gpuErrchk( cudaMalloc(&startClk_g, TOTAL_THREADS*sizeof(uint32_t)) );
    gpuErrchk( cudaMalloc(&stopClk_g, TOTAL_THREADS*sizeof(uint32_t)) );
    gpuErrchk( cudaMalloc(&data1_g, TOTAL_THREADS*sizeof(int32_t)) );
    gpuErrchk( cudaMalloc(&res_g, TOTAL_THREADS*sizeof(int32_t)) );

    gpuErrchk( cudaMemcpy(data1_g, data1, TOTAL_THREADS*sizeof(int32_t), cudaMemcpyHostToDevice) );

    max_flops<int32_t><<<BLOCKS_NUM,THREADS_PER_BLOCK>>>(startClk_g, stopClk_g, data1_g, res_g, ConflictCount);
    gpuErrchk( cudaPeekAtLastError() );

    // Blocking copies below also synchronize with the kernel.
    gpuErrchk( cudaMemcpy(startClk, startClk_g, TOTAL_THREADS*sizeof(uint32_t), cudaMemcpyDeviceToHost) );
    gpuErrchk( cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS*sizeof(uint32_t), cudaMemcpyDeviceToHost) );
    gpuErrchk( cudaMemcpy(res, res_g, TOTAL_THREADS*sizeof(int32_t), cudaMemcpyDeviceToHost) );
    gpuErrchk( cudaMemcpy(data2, data1_g, TOTAL_THREADS*sizeof(int32_t), cudaMemcpyDeviceToHost) );
    printf("Found GPU Data Value = %d %d %d %d\n", data2[0], data2[1], data2[2], data2[3]);
    printf("Found GPU Result Value = %d %d %d %d\n", res[0], res[1], res[2], res[3]);

    // Bug fix: the previous end iterators (&arr[TOTAL_THREADS-1]) excluded the
    // last thread's timestamp from both the max and min scans.
    float bw;
    uint32_t total_time = *std::max_element(stopClk, stopClk + TOTAL_THREADS)
                        - *std::min_element(startClk, startClk + TOTAL_THREADS);
    // NOTE(review): this counts 4 bytes for every launched thread even though
    // only ConflictCount lanes per warp issue atomics, and it assumes clocks
    // are comparable across SMs -- confirm the intended bandwidth definition.
    bw = ((float)(REPEAT_TIMES*TOTAL_THREADS*4)/(float)(total_time));
    printf("int32 bandwidth = %f (byte/clk)\n", bw);
    printf("Total Clk number = %u \n", total_time);

    // Release device and host allocations (previously leaked).
    cudaFree(startClk_g);
    cudaFree(stopClk_g);
    cudaFree(data1_g);
    cudaFree(res_g);
    free(startClk);
    free(stopClk);
    free(data1);
    free(data2);
    free(res);

    return 0;
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Build SASS + embedded PTX for each target architecture so the binary can
# run on Maxwell (SM50), Pascal (SM61), and Volta (SM70) GPUs.
GENCODE_SM50 := -gencode=arch=compute_50,code=\"sm_50,compute_50\"
GENCODE_SM61 := -gencode=arch=compute_61,code=\"sm_61,compute_61\"
GENCODE_SM70 := -gencode=arch=compute_70,code=\"sm_70,compute_70\"
GENCODE_SM75 := -gencode=arch=compute_75,code=\"sm_75,compute_75\"

CUOPTS = $(GENCODE_SM50) $(GENCODE_SM61) $(GENCODE_SM70)

CC := nvcc

# Optional extra search paths. They are only passed to nvcc when non-empty:
# with the previous unconditional "-I$(INCLUDE) -L$(LIB)" an empty variable
# produced a bare -I / -L that consumed the next command-line argument.
INCLUDE :=
LIB :=

SRC = atomic_add_bw_profile.cu

EXE = atomic_add_bw_profile

release:
	$(CC) $(CUOPTS) $(SRC) -o $(EXE) $(if $(INCLUDE),-I$(INCLUDE)) $(if $(LIB),-L$(LIB)) -lcudart

clean:
	rm -f *.o; rm -f $(EXE)

run:
	./$(EXE)

profile:
	nvprof ./$(EXE)

events:
	nvprof --events elapsed_cycles_sm ./$(EXE)
Loading