-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
61 changed files
with
8,911 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# Executables to build; each one is produced from the .cu file of the same name.
APPS=hello

# all and clean are commands, not files: keep make from being confused by a
# stray file named "all" or "clean" in this directory.
.PHONY: all clean

all: ${APPS}

# Pattern rule: compile <name>.cu into the executable <name> with nvcc,
# generating code for the sm_52 and sm_61 targets.
%: %.cu
	nvcc -O2 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_61,code=sm_61 -o $@ $<

# Remove the built executables.
clean:
	rm -f ${APPS}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
#include "../common/common.h" | ||
#include <stdio.h> | ||
|
||
// Kernel: every thread of the launch emits one greeting line through
// device-side printf. Takes no arguments and touches no memory.
__global__ void helloOnGPU()
{
    printf("Hello World on GPU!\n");
}
|
||
/*
 * Host entry point: prints a greeting from the CPU, then launches helloOnGPU
 * with one block of ten threads so each of those threads prints its greeting.
 * Returns 0 on success; CUDA failures are reported through CHECK.
 */
int main(int argc, char **argv)
{
    printf("Hello World on CPU!\n");

    helloOnGPU<<<1, 10>>>();

    // A kernel launch returns no status; an invalid launch configuration is
    // only visible through cudaGetLastError().
    CHECK(cudaGetLastError());

    // Wait for the kernel so that execution errors are reported here and the
    // device-side printf output is flushed before the device is torn down.
    CHECK(cudaDeviceSynchronize());

    CHECK(cudaDeviceReset());
    return 0;
}
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
# CUDA executables, each built from the .cu file of the same name.
CU_APPS=checkDeviceInfor checkThreadIndex sumArraysOnGPU-timer \
        sumMatrixOnGPU-1D-grid-1D-block sumMatrixOnGPU-2D-grid-2D-block \
        checkDimension defineGridBlock sumArraysOnGPU-small-case \
        sumMatrixOnGPU-2D-grid-1D-block sumMatrixOnGPU
# Plain C executables, each built from the .c file of the same name.
C_APPS=sumArraysOnHost

# all and clean are commands, not files: keep make from being confused by a
# stray file named "all" or "clean" in this directory.
.PHONY: all clean

all: ${C_APPS} ${CU_APPS}

# Pattern rule: compile <name>.cu into the executable <name> with nvcc,
# generating code for the sm_52 and sm_61 targets.
%: %.cu
	nvcc -O2 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_61,code=sm_61 -o $@ $<

# Pattern rule: compile <name>.c into the executable <name> with gcc as C99.
%: %.c
	gcc -O2 -std=c99 -o $@ $<

# Remove every built executable.
clean:
	rm -f ${CU_APPS} ${C_APPS}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
#include "../common/common.h" | ||
#include <cuda_runtime.h> | ||
#include <stdio.h> | ||
|
||
|
||
/*
 * Queries the CUDA runtime for the number of devices and prints a summary of
 * the properties of device 0: driver/runtime versions, compute capability,
 * memory sizes and clocks, texture limits, and launch limits.
 *
 * Exits with EXIT_SUCCESS after printing the report. Exits with EXIT_FAILURE
 * when no device is reported; other CUDA failures are handled by CHECK.
 */
int main(int argc, char **argv)
{
    printf("%s Starting...\n", argv[0]);

    int deviceCount = 0;
    CHECK(cudaGetDeviceCount(&deviceCount));

    if (deviceCount == 0)
    {
        printf("There are no available device(s) that support CUDA\n");
        // Nothing to query: stop here instead of falling through to
        // cudaSetDevice(0), which cannot succeed without a device.
        exit(EXIT_FAILURE);
    }

    printf("Detected %d CUDA Capable device(s)\n", deviceCount);

    int dev = 0, driverVersion = 0, runtimeVersion = 0;
    CHECK(cudaSetDevice(dev));
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Device %d: \"%s\"\n", dev, deviceProp.name);

    CHECK(cudaDriverGetVersion(&driverVersion));
    CHECK(cudaRuntimeGetVersion(&runtimeVersion));
    // Versions are printed as major.minor, decoded from the single integer
    // each query returns.
    printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n",
           driverVersion / 1000, (driverVersion % 100) / 10,
           runtimeVersion / 1000, (runtimeVersion % 100) / 10);
    printf(" CUDA Capability Major/Minor version number: %d.%d\n",
           deviceProp.major, deviceProp.minor);
    printf(" Total amount of global memory: %.2f GBytes (%llu "
           "bytes)\n", (float)deviceProp.totalGlobalMem / pow(1024.0, 3),
           (unsigned long long)deviceProp.totalGlobalMem);
    printf(" GPU Clock rate: %.0f MHz (%0.2f "
           "GHz)\n", deviceProp.clockRate * 1e-3f,
           deviceProp.clockRate * 1e-6f);
    printf(" Memory Clock rate: %.0f Mhz\n",
           deviceProp.memoryClockRate * 1e-3f);
    printf(" Memory Bus Width: %d-bit\n",
           deviceProp.memoryBusWidth);

    // The L2 line is omitted when the reported cache size is zero.
    if (deviceProp.l2CacheSize)
    {
        printf(" L2 Cache Size: %d bytes\n",
               deviceProp.l2CacheSize);
    }

    printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), "
           "2D=(%d,%d), 3D=(%d,%d,%d)\n", deviceProp.maxTexture1D,
           deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
           deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1],
           deviceProp.maxTexture3D[2]);
    printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, "
           "2D=(%d,%d) x %d\n", deviceProp.maxTexture1DLayered[0],
           deviceProp.maxTexture1DLayered[1], deviceProp.maxTexture2DLayered[0],
           deviceProp.maxTexture2DLayered[1],
           deviceProp.maxTexture2DLayered[2]);
    // totalConstMem, sharedMemPerBlock and memPitch are size_t, so they are
    // printed with %zu; %lu is the wrong width where long is 32 bits.
    printf(" Total amount of constant memory: %zu bytes\n",
           deviceProp.totalConstMem);
    printf(" Total amount of shared memory per block: %zu bytes\n",
           deviceProp.sharedMemPerBlock);
    printf(" Total number of registers available per block: %d\n",
           deviceProp.regsPerBlock);
    printf(" Warp size: %d\n",
           deviceProp.warpSize);
    printf(" Maximum number of threads per multiprocessor: %d\n",
           deviceProp.maxThreadsPerMultiProcessor);
    printf(" Maximum number of threads per block: %d\n",
           deviceProp.maxThreadsPerBlock);
    printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n",
           deviceProp.maxThreadsDim[0],
           deviceProp.maxThreadsDim[1],
           deviceProp.maxThreadsDim[2]);
    printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n",
           deviceProp.maxGridSize[0],
           deviceProp.maxGridSize[1],
           deviceProp.maxGridSize[2]);
    printf(" Maximum memory pitch: %zu bytes\n",
           deviceProp.memPitch);

    exit(EXIT_SUCCESS);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
#include "../common/common.h" | ||
#include <cuda_runtime.h> | ||
#include <stdio.h> | ||
|
||
|
||
// Kernel: each thread prints four lines -- its thread index, its block index,
// the block dimensions, and the grid dimensions of the current launch.
__global__ void checkIndex(void)
{
    // Snapshot the built-in launch coordinates into locals before printing.
    const uint3 tid = threadIdx;
    const uint3 bid = blockIdx;
    const dim3 bdim = blockDim;
    const dim3 gdim = gridDim;

    printf("threadIdx:(%d, %d, %d)\n", tid.x, tid.y, tid.z);
    printf("blockIdx:(%d, %d, %d)\n", bid.x, bid.y, bid.z);

    printf("blockDim:(%d, %d, %d)\n", bdim.x, bdim.y, bdim.z);
    printf("gridDim:(%d, %d, %d)\n", gdim.x, gdim.y, gdim.z);
}
|
||
/*
 * Builds a 1D launch configuration for nElem = 6 elements with 3 threads per
 * block, prints the grid and block dimensions on the host, then launches
 * checkIndex so every thread prints the same information from the device.
 * Returns 0 on success; CUDA failures are reported through CHECK.
 */
int main(int argc, char **argv)
{
    int nElem = 6;

    dim3 block(3);
    // Ceiling division: enough blocks to cover nElem even when it is not a
    // multiple of the block size.
    dim3 grid((nElem + block.x - 1) / block.x);

    printf("grid.x %d grid.y %d grid.z %d\n", grid.x, grid.y, grid.z);
    printf("block.x %d block.y %d block.z %d\n", block.x, block.y, block.z);

    checkIndex<<<grid, block>>>();

    // A kernel launch returns no status; an invalid launch configuration is
    // only visible through cudaGetLastError().
    CHECK(cudaGetLastError());

    // Wait for the kernel so that execution errors are reported here and the
    // device-side printf output is flushed before the device is torn down.
    CHECK(cudaDeviceSynchronize());

    CHECK(cudaDeviceReset());

    return(0);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
#include "../common/common.h" | ||
#include <cuda_runtime.h> | ||
#include <stdio.h> | ||
|
||
|
||
/*
 * Prints an integer matrix stored row by row in C to stdout.
 *
 * C  - pointer to nx * ny contiguous ints
 * nx - number of values printed per output line
 * ny - number of output lines
 *
 * Output is a header line with both dimensions, then ny lines of nx values
 * (each formatted with width 3), then one blank line.
 */
void printMatrix(int *C, const int nx, const int ny)
{
    printf("\nMatrix: (%d.%d)\n", nx, ny);

    for (int row = 0; row < ny; ++row)
    {
        // Start of the current row inside the flat buffer.
        const int *rowStart = C + row * nx;

        for (int col = 0; col < nx; ++col)
        {
            printf("%3d", rowStart[col]);
        }

        printf("\n");
    }

    printf("\n");
}
|
||
/*
 * Kernel: each thread maps its 2D thread/block coordinates to a matrix
 * coordinate (ix, iy) and to the flat index iy * nx + ix, then prints those
 * values together with the element of A stored at that index.
 *
 * A  - device pointer to at least nx * ny ints
 * nx - extent of the matrix along x (also the row stride of A)
 * ny - extent of the matrix along y
 *
 * Expects a 2D grid of 2D blocks. Threads whose coordinate falls outside the
 * nx-by-ny matrix do nothing.
 */
__global__ void printThreadIndex(int *A, const int nx, const int ny)
{
    int ix = threadIdx.x + blockIdx.x * blockDim.x;
    int iy = threadIdx.y + blockIdx.y * blockDim.y;

    // The grid is sized by rounding up, so when nx or ny is not a multiple of
    // the block dimensions some threads land past the matrix edge; without
    // this guard they would read A out of bounds.
    if (ix >= nx || iy >= ny)
    {
        return;
    }

    unsigned int idx = iy * nx + ix;

    printf("thread_id (%d,%d) block_id (%d,%d) coordinate (%d,%d) global index"
           " %2d ival %2d\n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y,
           ix, iy, idx, A[idx]);
}
|
||
/*
 * Fills an 8 x 6 integer matrix with 0..47 on the host, prints it, copies it
 * to the device, and launches printThreadIndex with 4 x 2 thread blocks so
 * that every thread reports the element it maps to.
 * Returns 0 on success and 1 if the host buffer cannot be allocated; CUDA
 * failures are reported through CHECK.
 */
int main(int argc, char **argv)
{
    printf("%s Starting...\n", argv[0]);

    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Using Device %d: %s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    int nx = 8;
    int ny = 6;
    int nxy = nx * ny;
    // The buffers hold ints, so size them with sizeof(int); size_t matches
    // what malloc, cudaMalloc and cudaMemcpy take.
    size_t nBytes = nxy * sizeof(int);

    int *h_A = (int *)malloc(nBytes);

    if (h_A == NULL)
    {
        fprintf(stderr, "Failed to allocate host memory for %d elements\n",
                nxy);
        return (1);
    }

    for (int i = 0; i < nxy; i++)
    {
        h_A[i] = i;
    }
    printMatrix(h_A, nx, ny);

    int *d_MatA;
    CHECK(cudaMalloc((void **)&d_MatA, nBytes));

    CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));

    dim3 block(4, 2);
    // Ceiling division in both dimensions so the grid covers the whole matrix.
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);

    printThreadIndex<<<grid, block>>>(d_MatA, nx, ny);
    CHECK(cudaGetLastError());

    // Wait for the kernel before releasing the buffer it reads, so that any
    // execution error is reported at this point.
    CHECK(cudaDeviceSynchronize());

    CHECK(cudaFree(d_MatA));
    free(h_A);

    CHECK(cudaDeviceReset());

    return (0);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
#include "../common/common.h" | ||
#include <cuda_runtime.h> | ||
#include <stdio.h> | ||
|
||
/*
 * Shows how the grid size follows from the block size for a fixed problem of
 * 1024 elements: for block sizes 1024, 512, 256 and 128 it prints the number
 * of blocks needed to cover all elements. No kernel is launched.
 */
int main(int argc, char **argv)
{
    const int nElem = 1024;

    // Block sizes to report, in the order they are printed.
    const unsigned int blockSizes[] = {1024, 512, 256, 128};
    const int numSizes = (int)(sizeof(blockSizes) / sizeof(blockSizes[0]));

    for (int k = 0; k < numSizes; ++k)
    {
        dim3 block(blockSizes[k]);
        // Ceiling division: blocks required to cover nElem elements.
        dim3 grid((nElem + block.x - 1) / block.x);
        printf("grid.x %d block.x %d \n", grid.x, block.x);
    }

    CHECK(cudaDeviceReset());

    return(0);
}
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
#include "../common/common.h"
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
|
||
/*
 * Compares two float arrays element by element and reports the outcome on
 * stdout.
 *
 * hostRef - reference values
 * gpuRef  - values to verify against hostRef
 * N       - number of elements to compare
 *
 * Prints "Arrays match." when every pair differs by at most 1.0E-8; otherwise
 * prints the first mismatching pair with its index and stops comparing.
 */
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    const double epsilon = 1.0E-8;

    for (int i = 0; i < N; i++)
    {
        // fabsf keeps the comparison in floating point; a plain abs() can
        // resolve to the integer overload and truncate any difference below
        // 1.0 to zero, hiding real mismatches.
        if (fabsf(hostRef[i] - gpuRef[i]) > epsilon)
        {
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i],
                   gpuRef[i], i);
            return;
        }
    }

    printf("Arrays match.\n\n");
}
|
||
|
||
/*
 * Fills ip[0 .. size-1] with pseudo-random values in the range [0.0, 25.5],
 * in steps of 0.1 (the low 8 bits of rand() divided by 10).
 *
 * ip   - destination buffer with room for at least size floats
 * size - number of elements to fill; nothing is written when size <= 0
 *
 * The generator is seeded from the wall clock on the first call only.
 */
void initialData(float *ip, int size)
{
    // Seed once: reseeding with time() on every call restarts the sequence,
    // so two calls within the same second would fill identical arrays.
    static bool seeded = false;

    if (!seeded)
    {
        srand((unsigned) time(NULL));
        seeded = true;
    }

    for (int i = 0; i < size; i++)
    {
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }
}
|
||
|
||
/*
 * Host reference implementation of element-wise vector addition:
 * C[i] = A[i] + B[i] for every i in [0, N).
 *
 * A, B - input arrays with at least N elements each
 * C    - output array with room for at least N elements
 * N    - number of elements to add; nothing is written when N <= 0
 */
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    int pos = 0;

    while (pos < N)
    {
        C[pos] = A[pos] + B[pos];
        ++pos;
    }
}
|
||
/*
 * Kernel: element-wise vector addition, C[i] = A[i] + B[i], with one element
 * handled per thread.
 *
 * A, B - device input arrays with at least N elements each
 * C    - device output array with room for at least N elements
 * N    - number of elements
 *
 * Expects a 1D launch with at least N threads in total; threads whose global
 * index is N or larger do nothing.
 */
__global__ void sumArraysOnGPU(float *A, float *B, float *C, const int N)
{
    // Global index across the whole grid. With a single block this equals
    // threadIdx.x; with several blocks each block now handles its own slice
    // instead of every block redoing elements 0..blockDim.x-1.
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < N) C[i] = A[i] + B[i];
}
|
||
|
||
/*
 * Adds two 32-element float vectors on the GPU with a single block of 32
 * threads, recomputes the sum on the host, and compares the two results.
 * Returns 0 on success and 1 if a host buffer cannot be allocated; CUDA
 * failures are reported through CHECK.
 */
int main(int argc, char **argv)
{
    printf("%s Starting...\n", argv[0]);

    int dev = 0;
    CHECK(cudaSetDevice(dev));

    int nElem = 1 << 5;
    printf("Vector size %d\n", nElem);

    size_t nBytes = nElem * sizeof(float);

    // Host buffers: two inputs, the host-computed reference, and the copy of
    // the device result.
    float *h_A, *h_B, *hostRef, *gpuRef;
    h_A = (float *)malloc(nBytes);
    h_B = (float *)malloc(nBytes);
    hostRef = (float *)malloc(nBytes);
    gpuRef = (float *)malloc(nBytes);

    if (h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL)
    {
        fprintf(stderr, "Failed to allocate host memory for %d elements\n",
                nElem);
        // free(NULL) is a no-op, so releasing all four is safe.
        free(h_A);
        free(h_B);
        free(hostRef);
        free(gpuRef);
        return(1);
    }

    initialData(h_A, nElem);
    initialData(h_B, nElem);

    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    float *d_A, *d_B, *d_C;
    CHECK(cudaMalloc((float**)&d_A, nBytes));
    CHECK(cudaMalloc((float**)&d_B, nBytes));
    CHECK(cudaMalloc((float**)&d_C, nBytes));

    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));
    // Start the device output from the zeroed host buffer.
    CHECK(cudaMemcpy(d_C, gpuRef, nBytes, cudaMemcpyHostToDevice));

    // One block holding one thread per element.
    dim3 block (nElem);
    dim3 grid (1);

    sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C, nElem);
    // A kernel launch returns no status; an invalid launch configuration is
    // only visible through cudaGetLastError().
    CHECK(cudaGetLastError());
    printf("Execution configure <<<%d, %d>>>\n", grid.x, block.x);

    // Blocking copy: returns once the kernel has finished and the result is
    // in host memory.
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));

    sumArraysOnHost(h_A, h_B, hostRef, nElem);

    checkResult(hostRef, gpuRef, nElem);

    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_B));
    CHECK(cudaFree(d_C));

    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    CHECK(cudaDeviceReset());
    return(0);
}
Oops, something went wrong.