Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
dedoogong authored May 1, 2018
1 parent aa31653 commit 112e89b
Show file tree
Hide file tree
Showing 16 changed files with 1,411 additions and 0 deletions.
11 changes: 11 additions & 0 deletions cuda-cpp/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Build rules for the CUDA sample programs in this directory.
# Every name in CU_APPS is built from <name>.cu and every name in C_APPS
# from <name>.c by the pattern rules below.
CU_APPS=coalescing finite-difference optimize-data-transfers-bandwidthtest optimize-data-transfers-profile overlap-data-transfers-async shared-memory transpose
C_APPS=

# `all` and `clean` name actions, not files. Declaring them phony keeps a
# stray file called "all" or "clean" from masking the target.
.PHONY: all clean

all: ${C_APPS} ${CU_APPS}

# CUDA programs: emit code for sm_52 and sm_61, print ptxas resource usage
# (--ptxas-options=-v), and build with relocatable device code, linking the
# device runtime library (-lcudadevrt).
%: %.cu
	nvcc -O2 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_61,code=sm_61 -o $@ $< -lcudadevrt --ptxas-options=-v --relocatable-device-code true

# Plain C programs (C_APPS is currently empty).
%: %.c
	gcc -O2 -std=c99 -o $@ $<

# Remove every built executable.
clean:
	rm -f ${CU_APPS} ${C_APPS}
11 changes: 11 additions & 0 deletions cuda-cpp/Makefile~
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Build rules for the CUDA sample programs in this directory.
# Every name in CU_APPS is built from <name>.cu and every name in C_APPS
# from <name>.c by the pattern rules below.
CU_APPS=coalescing finite-difference optimize-data-transfers-bandwidthtest optimize-data-transfers-profile overlap-data-transfers-async shared-memory transpose
C_APPS=

# `all` and `clean` name actions, not files. Declaring them phony keeps a
# stray file called "all" or "clean" from masking the target.
.PHONY: all clean

all: ${C_APPS} ${CU_APPS}

# CUDA programs. `-arch` accepts a single architecture, so the former
# "-arch=sm_61 sm_52" handed nvcc a stray "sm_52" argument that it treats as
# an input file. Targeting both sm_52 and sm_61 needs one -gencode pair per
# architecture.
%: %.cu
	nvcc -O2 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_61,code=sm_61 -o $@ $< -lcudadevrt --ptxas-options=-v --relocatable-device-code true

# Plain C programs (C_APPS is currently empty).
%: %.c
	gcc -O2 -std=c99 -o $@ $<

# Remove every built executable.
clean:
	rm -f ${CU_APPS} ${C_APPS}
Binary file added cuda-cpp/coalescing
Binary file not shown.
136 changes: 136 additions & 0 deletions cuda-cpp/coalescing.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
/* Copyright (c) 1993-2015, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Pass-through wrapper for the status code of any CUDA runtime API call.
// The status is always returned to the caller unchanged. Only when DEBUG or
// _DEBUG is defined is it inspected: a failing status is printed to stderr
// and then trips an assert. Without those macros nothing is checked.
inline cudaError_t checkCuda(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
    // Fast path: nothing to report.
    if (result == cudaSuccess) {
        return result;
    }
    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    assert(result == cudaSuccess);
#endif
    return result;
}

// Kernel: each thread adds 1 to a single element of `a`. The element index is
// the thread's flat 1D global index shifted by `s`.
// There is no bounds guard: the caller must make sure `a` holds at least
// (total launched threads + s) elements.
template <typename T>
__global__ void offset(T* a, int s)
{
    const int idx = blockDim.x * blockIdx.x + threadIdx.x + s;
    const T bumped = a[idx] + 1;
    a[idx] = bumped;
}

// Kernel: each thread adds 1 to a single element of `a`. The element index is
// the thread's flat 1D global index multiplied by `s`, so neighbouring
// threads touch elements `s` apart.
// There is no bounds guard: the caller must make sure `a` holds at least
// (total launched threads * s) elements.
template <typename T>
__global__ void stride(T* a, int s)
{
    const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
    const int idx = tid * s;
    const T bumped = a[idx] + 1;
    a[idx] = bumped;
}

// Times the offset and stride kernels on element type T and prints one
// "<parameter>, <bandwidth>" line per timed launch.
//
//   deviceId  kept for interface compatibility; this function never reads it.
//   nMB       working-set size in MB. Each launch covers
//             n = nMB * 1024 * 1024 / sizeof(T) elements using n / 256 blocks
//             of 256 threads.
//
// The offset sweep runs offsets 0..32, the stride sweep strides 1..32. The
// device buffer holds 33 * n elements so that the largest stride stays in
// bounds (the kernels have no bounds guard). Every runtime call and every
// kernel launch is routed through checkCuda.
template <typename T>
void runTest(int deviceId, int nMB)
{
    (void)deviceId;  // unused; see header comment

    const int blockSize = 256;
    float ms;

    T *d_a;
    cudaEvent_t startEvent, stopEvent;

    // Widen to size_t before multiplying: nMB*1024*1024 evaluated in int
    // overflows once nMB reaches 2048.
    const size_t bytes = static_cast<size_t>(nMB) * 1024 * 1024;
    const int n = static_cast<int>(bytes / sizeof(T));
    const int gridSize = n / blockSize;

    // NB: d_a(33*nMB) for stride case. The element count is widened first so
    // that n * 33 is not computed in int.
    checkCuda( cudaMalloc(&d_a, static_cast<size_t>(n) * 33 * sizeof(T)) );

    checkCuda( cudaEventCreate(&startEvent) );
    checkCuda( cudaEventCreate(&stopEvent) );

    printf("Offset, Bandwidth (GB/s):\n");

    offset<<<gridSize, blockSize>>>(d_a, 0); // warm up
    checkCuda( cudaGetLastError() );         // surface launch-configuration errors

    for (int i = 0; i <= 32; i++) {
        checkCuda( cudaMemset(d_a, 0, n * sizeof(T)) );

        checkCuda( cudaEventRecord(startEvent,0) );
        offset<<<gridSize, blockSize>>>(d_a, i);
        checkCuda( cudaEventRecord(stopEvent,0) );
        // Queried only after the stop event is enqueued so the check adds no
        // host work between the two event records.
        checkCuda( cudaGetLastError() );
        checkCuda( cudaEventSynchronize(stopEvent) );

        checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
        // One read plus one write of nMB megabytes per launch.
        printf("%d, %f\n", i, 2*nMB/ms);
    }

    printf("\n");
    printf("Stride, Bandwidth (GB/s):\n");

    stride<<<gridSize, blockSize>>>(d_a, 1); // warm up
    checkCuda( cudaGetLastError() );
    for (int i = 1; i <= 32; i++) {
        checkCuda( cudaMemset(d_a, 0, n * sizeof(T)) );

        checkCuda( cudaEventRecord(startEvent,0) );
        stride<<<gridSize, blockSize>>>(d_a, i);
        checkCuda( cudaEventRecord(stopEvent,0) );
        checkCuda( cudaGetLastError() );
        checkCuda( cudaEventSynchronize(stopEvent) );

        checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
        printf("%d, %f\n", i, 2*nMB/ms);
    }

    checkCuda( cudaEventDestroy(startEvent) );
    checkCuda( cudaEventDestroy(stopEvent) );
    checkCuda( cudaFree(d_a) );
}

// Entry point for the coalescing benchmark.
//
// Command-line arguments (any order, all optional):
//   dev=<id>  CUDA device ordinal to run on (default 0)
//   fp64      run the test with double instead of float elements
//
// Prints the device name, transfer size and precision, then runs runTest.
// Returns EXIT_FAILURE if the requested device cannot be selected or its
// properties cannot be queried, EXIT_SUCCESS otherwise.
int main(int argc, char **argv)
{
    int nMB = 4;
    int deviceId = 0;
    bool bFp64 = false;

    for (int i = 1; i < argc; i++) {
        if (!strncmp(argv[i], "dev=", 4))
            deviceId = atoi(&argv[i][4]);
        else if (!strcmp(argv[i], "fp64"))
            bFp64 = true;
    }

    cudaDeviceProp prop;

    // checkCuda only reacts in debug builds, so test the status here as well.
    // Otherwise a bad device id would lead to printing an uninitialized
    // prop.name and running the benchmark on an unusable device.
    cudaError_t status = checkCuda( cudaSetDevice(deviceId) );
    if (status != cudaSuccess) {
        fprintf(stderr, "Cannot select device %d: %s\n",
                deviceId, cudaGetErrorString(status));
        return EXIT_FAILURE;
    }

    status = checkCuda( cudaGetDeviceProperties(&prop, deviceId) );
    if (status != cudaSuccess) {
        fprintf(stderr, "Cannot query device %d: %s\n",
                deviceId, cudaGetErrorString(status));
        return EXIT_FAILURE;
    }

    printf("Device: %s\n", prop.name);
    printf("Transfer size (MB): %d\n", nMB);

    printf("%s Precision\n", bFp64 ? "Double" : "Single");

    if (bFp64) runTest<double>(deviceId, nMB);
    else runTest<float>(deviceId, nMB);

    return EXIT_SUCCESS;
}
Binary file added cuda-cpp/finite-difference
Binary file not shown.
Loading

0 comments on commit 112e89b

Please sign in to comment.