Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
dedoogong authored May 1, 2018
1 parent 17618a9 commit aa31653
Show file tree
Hide file tree
Showing 61 changed files with 8,911 additions and 0 deletions.
8 changes: 8 additions & 0 deletions chapter01/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Builds the chapter 1 CUDA sample(s) with nvcc.
APPS=hello

# Neither target names a real file, so always run them when requested.
.PHONY: all clean

all: ${APPS}

# Compile each .cu source into an executable of the same name, generating
# device code for the sm_52 and sm_61 architectures.
# NOTE: recipe lines must begin with a TAB character.
%: %.cu
	nvcc -O2 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_61,code=sm_61 -o $@ $<

# Remove the built executables.
clean:
	rm -f ${APPS}
18 changes: 18 additions & 0 deletions chapter01/hello.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#include "../common/common.h"
#include <stdio.h>

/*
 * Kernel: each launched thread writes one greeting line through the
 * device-side printf. Takes no arguments and touches no memory.
 */
__global__ void helloOnGPU() {
    printf("Hello World on GPU!\n");
}

/*
 * Prints a greeting from the host, then launches helloOnGPU with one block
 * of ten threads so that each thread prints a greeting from the device.
 *
 * Returns 0 on success; CUDA failures are handled by the CHECK macro.
 */
int main(int argc, char **argv)
{
    printf("Hello World on CPU!\n");

    helloOnGPU<<<1, 10>>>();

    // A kernel launch returns no status itself: configuration errors are
    // reported through cudaGetLastError().
    CHECK(cudaGetLastError());

    // Wait for the kernel to finish so that execution errors surface here
    // and the device printf output is delivered before the device is reset.
    CHECK(cudaDeviceSynchronize());

    CHECK(cudaDeviceReset());
    return 0;
}


14 changes: 14 additions & 0 deletions chapter02/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Builds the chapter 2 samples: CUDA programs with nvcc, plain C with gcc.
CU_APPS=checkDeviceInfor checkThreadIndex sumArraysOnGPU-timer \
        sumMatrixOnGPU-1D-grid-1D-block sumMatrixOnGPU-2D-grid-2D-block \
        checkDimension defineGridBlock sumArraysOnGPU-small-case \
        sumMatrixOnGPU-2D-grid-1D-block sumMatrixOnGPU
C_APPS=sumArraysOnHost

# Neither target names a real file, so always run them when requested.
.PHONY: all clean

all: ${C_APPS} ${CU_APPS}

# CUDA sources: generate device code for the sm_52 and sm_61 architectures.
# NOTE: recipe lines must begin with a TAB character.
%: %.cu
	nvcc -O2 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_61,code=sm_61 -o $@ $<

# Host-only C sources are compiled as C99.
%: %.c
	gcc -O2 -std=c99 -o $@ $<

# Remove every built executable.
clean:
	rm -f ${CU_APPS} ${C_APPS}
86 changes: 86 additions & 0 deletions chapter02/checkDeviceInfor.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#include "../common/common.h"
#include <cuda_runtime.h>
#include <stdio.h>


/*
 * Queries the CUDA runtime for the number of devices and prints the
 * properties of device 0 (versions, memory sizes, clock rates, texture
 * limits, and launch limits).
 *
 * Exits with EXIT_FAILURE when the device count cannot be obtained or no
 * device is present; otherwise exits with EXIT_SUCCESS. Other CUDA failures
 * are handled by the CHECK macro.
 */
int main(int argc, char **argv)
{
    printf("%s Starting...\n", argv[0]);

    int deviceCount = 0;
    // Handled by hand rather than with CHECK so that a missing device or
    // driver produces a readable message.
    cudaError_t countStatus = cudaGetDeviceCount(&deviceCount);

    if (countStatus != cudaSuccess)
    {
        printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)countStatus,
               cudaGetErrorString(countStatus));
        exit(EXIT_FAILURE);
    }

    if (deviceCount == 0)
    {
        printf("There are no available device(s) that support CUDA\n");
        // Nothing can be queried without a device; stop here instead of
        // failing later inside cudaSetDevice().
        exit(EXIT_FAILURE);
    }

    printf("Detected %d CUDA Capable device(s)\n", deviceCount);

    int dev = 0, driverVersion = 0, runtimeVersion = 0;
    CHECK(cudaSetDevice(dev));
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Device %d: \"%s\"\n", dev, deviceProp.name);

    CHECK(cudaDriverGetVersion(&driverVersion));
    CHECK(cudaRuntimeGetVersion(&runtimeVersion));
    // Versions are encoded as 1000 * major + 10 * minor.
    printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n",
           driverVersion / 1000, (driverVersion % 100) / 10,
           runtimeVersion / 1000, (runtimeVersion % 100) / 10);
    printf(" CUDA Capability Major/Minor version number: %d.%d\n",
           deviceProp.major, deviceProp.minor);
    printf(" Total amount of global memory: %.2f GBytes (%llu "
           "bytes)\n", (float)deviceProp.totalGlobalMem / pow(1024.0, 3),
           (unsigned long long)deviceProp.totalGlobalMem);
    // The scale factors 1e-3 and 1e-6 convert the reported rate to MHz and
    // GHz, which implies the field is in kHz.
    printf(" GPU Clock rate: %.0f MHz (%0.2f "
           "GHz)\n", deviceProp.clockRate * 1e-3f,
           deviceProp.clockRate * 1e-6f);
    printf(" Memory Clock rate: %.0f Mhz\n",
           deviceProp.memoryClockRate * 1e-3f);
    printf(" Memory Bus Width: %d-bit\n",
           deviceProp.memoryBusWidth);

    // Only reported when the device states a non-zero L2 cache size.
    if (deviceProp.l2CacheSize)
    {
        printf(" L2 Cache Size: %d bytes\n",
               deviceProp.l2CacheSize);
    }

    printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), "
           "2D=(%d,%d), 3D=(%d,%d,%d)\n", deviceProp.maxTexture1D,
           deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
           deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1],
           deviceProp.maxTexture3D[2]);
    printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, "
           "2D=(%d,%d) x %d\n", deviceProp.maxTexture1DLayered[0],
           deviceProp.maxTexture1DLayered[1], deviceProp.maxTexture2DLayered[0],
           deviceProp.maxTexture2DLayered[1],
           deviceProp.maxTexture2DLayered[2]);
    // The following size_t fields are printed with %zu; %lu is only correct
    // on platforms where size_t happens to be unsigned long.
    printf(" Total amount of constant memory: %zu bytes\n",
           deviceProp.totalConstMem);
    printf(" Total amount of shared memory per block: %zu bytes\n",
           deviceProp.sharedMemPerBlock);
    printf(" Total number of registers available per block: %d\n",
           deviceProp.regsPerBlock);
    printf(" Warp size: %d\n",
           deviceProp.warpSize);
    printf(" Maximum number of threads per multiprocessor: %d\n",
           deviceProp.maxThreadsPerMultiProcessor);
    printf(" Maximum number of threads per block: %d\n",
           deviceProp.maxThreadsPerBlock);
    printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n",
           deviceProp.maxThreadsDim[0],
           deviceProp.maxThreadsDim[1],
           deviceProp.maxThreadsDim[2]);
    printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n",
           deviceProp.maxGridSize[0],
           deviceProp.maxGridSize[1],
           deviceProp.maxGridSize[2]);
    printf(" Maximum memory pitch: %zu bytes\n",
           deviceProp.memPitch);

    exit(EXIT_SUCCESS);
}
31 changes: 31 additions & 0 deletions chapter02/checkDimension.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#include "../common/common.h"
#include <cuda_runtime.h>
#include <stdio.h>


/*
 * Kernel: every thread prints its own thread and block coordinates followed
 * by the block and grid dimensions of the launch it belongs to.
 */
__global__ void checkIndex(void) {
    // Position of this thread inside its block, and of the block in the grid.
    printf("threadIdx:(%d, %d, %d)\n", threadIdx.x, threadIdx.y, threadIdx.z);
    printf("blockIdx:(%d, %d, %d)\n", blockIdx.x, blockIdx.y, blockIdx.z);

    // Launch geometry, identical for every thread of the same launch.
    printf("blockDim:(%d, %d, %d)\n", blockDim.x, blockDim.y, blockDim.z);
    printf("gridDim:(%d, %d, %d)\n", gridDim.x, gridDim.y, gridDim.z);
}

/*
 * Derives a 1D launch configuration for nElem elements (blocks of three
 * threads, grid size rounded up), prints it from the host, then launches
 * checkIndex so the same values are printed from the device.
 *
 * Returns 0 on success; CUDA failures are handled by the CHECK macro.
 */
int main(int argc, char **argv)
{
    int nElem = 6;

    dim3 block(3);
    // Ceiling division: enough blocks to cover all nElem elements.
    dim3 grid((nElem + block.x - 1) / block.x);

    printf("grid.x %d grid.y %d grid.z %d\n", grid.x, grid.y, grid.z);
    printf("block.x %d block.y %d block.z %d\n", block.x, block.y, block.z);

    checkIndex<<<grid, block>>>();

    // Launch-configuration errors are only visible via cudaGetLastError().
    CHECK(cudaGetLastError());

    // Wait for the kernel so that execution errors surface here and the
    // device printf output is delivered before the device is reset.
    CHECK(cudaDeviceSynchronize());

    CHECK(cudaDeviceReset());

    return(0);
}
79 changes: 79 additions & 0 deletions chapter02/checkThreadIndex.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#include "../common/common.h"
#include <cuda_runtime.h>
#include <stdio.h>


/*
 * Prints a header followed by the contents of C as ny lines of nx integers,
 * each formatted with width 3.
 *
 * C  - elements stored row after row, nx elements per row
 * nx - number of elements printed per line
 * ny - number of lines printed
 */
void printMatrix(int *C, const int nx, const int ny)
{
    printf("\nMatrix: (%d.%d)\n", nx, ny);

    for (int row = 0; row < ny; ++row)
    {
        // First element of the current row.
        const int *rowStart = C + row * nx;

        for (int col = 0; col < nx; ++col)
        {
            printf("%3d", rowStart[col]);
        }

        printf("\n");
    }

    printf("\n");
}

/*
 * Kernel: each thread computes its 2D coordinate (ix, iy) from its block and
 * thread indices, maps it to the linear index iy * nx + ix, and prints its
 * indices together with the element of A stored at that position.
 *
 * A  - device array holding at least nx * ny ints
 * nx - extent of the x dimension (row length of A)
 * ny - extent of the y dimension (number of rows of A)
 *
 * Threads whose coordinate falls outside nx x ny return without reading A,
 * so the grid may be larger than the matrix.
 */
__global__ void printThreadIndex(int *A, const int nx, const int ny)
{
    int ix = threadIdx.x + blockIdx.x * blockDim.x;
    int iy = threadIdx.y + blockIdx.y * blockDim.y;

    // A rounded-up grid can contain threads past the matrix edge; they must
    // not dereference A.
    if (ix >= nx || iy >= ny) return;

    unsigned int idx = iy * nx + ix;

    printf("thread_id (%d,%d) block_id (%d,%d) coordinate (%d,%d) global index"
           " %2d ival %2d\n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y,
           ix, iy, idx, A[idx]);
}

/*
 * Fills an 8 x 6 host matrix with the values 0..47, prints it, copies it to
 * the device, and launches printThreadIndex on a 2D grid of 4 x 2 blocks so
 * that every thread reports which element it maps to.
 *
 * Returns 0 on success and 1 if the host allocation fails; CUDA failures are
 * handled by the CHECK macro.
 */
int main(int argc, char **argv)
{
    printf("%s Starting...\n", argv[0]);

    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Using Device %d: %s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    int nx = 8;
    int ny = 6;
    int nxy = nx * ny;
    // The buffers hold ints, so size them with the element type in use.
    size_t nBytes = nxy * sizeof(int);

    int *h_A;
    h_A = (int *)malloc(nBytes);

    if (h_A == NULL)
    {
        fprintf(stderr, "Failed to allocate %zu bytes of host memory\n",
                nBytes);
        return (1);
    }

    for (int i = 0; i < nxy; i++)
    {
        h_A[i] = i;
    }
    printMatrix(h_A, nx, ny);

    int *d_MatA;
    CHECK(cudaMalloc((void **)&d_MatA, nBytes));

    CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));

    dim3 block(4, 2);
    // Ceiling division in both dimensions so the grid covers the matrix.
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);

    printThreadIndex<<<grid, block>>>(d_MatA, nx, ny);
    CHECK(cudaGetLastError());

    // Wait for the kernel so that execution errors are reported here, before
    // the buffer it reads is released.
    CHECK(cudaDeviceSynchronize());

    CHECK(cudaFree(d_MatA));
    free(h_A);

    CHECK(cudaDeviceReset());

    return (0);
}
29 changes: 29 additions & 0 deletions chapter02/defineGridBlock.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#include "../common/common.h"
#include <cuda_runtime.h>
#include <stdio.h>

/*
 * Shows how the 1D grid size changes with the block size for a fixed number
 * of elements: for block sizes 1024, 512, 256 and 128 it prints the number
 * of blocks needed to cover nElem elements. No kernel is launched.
 *
 * Returns 0; a failing cudaDeviceReset is handled by the CHECK macro.
 */
int main(int argc, char **argv)
{
    const int nElem = 1024;
    const unsigned int blockSizes[] = {1024, 512, 256, 128};
    const int numConfigs = (int)(sizeof(blockSizes) / sizeof(blockSizes[0]));

    dim3 block (blockSizes[0]);
    dim3 grid ((nElem + block.x - 1) / block.x);

    for (int cfg = 0; cfg < numConfigs; ++cfg)
    {
        block.x = blockSizes[cfg];
        // Ceiling division: blocks required to cover every element.
        grid.x = (nElem + block.x - 1) / block.x;
        printf("grid.x %d block.x %d \n", grid.x, block.x);
    }

    CHECK(cudaDeviceReset());

    return(0);
}

112 changes: 112 additions & 0 deletions chapter02/sumArraysOnGPU-small-case.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
#include "../common/common.h"
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>

/*
 * Compares two float arrays element by element and prints the outcome.
 *
 * hostRef - reference values
 * gpuRef  - values to verify against the reference
 * N       - number of elements to compare
 *
 * Stops at the first pair whose absolute difference exceeds 1.0E-8 and
 * prints both values and the index; prints "Arrays match." when no such
 * pair exists.
 */
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = true;

    for (int i = 0; i < N; i++)
    {
        // fabsf is used instead of an unqualified abs(): depending on which
        // headers are visible, abs() may bind to the integer overload and
        // truncate sub-unit differences to zero.
        if (fabsf(hostRef[i] - gpuRef[i]) > epsilon)
        {
            match = false;
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i],
                   gpuRef[i], i);
            break;
        }
    }

    if (match) printf("Arrays match.\n\n");

    return;
}


/*
 * Fills ip[0 .. size-1] with pseudo-random values in the range 0.0 to 25.5,
 * in steps of 0.1.
 *
 * ip   - destination array with room for at least size floats
 * size - number of elements to write
 *
 * The generator is seeded from the current time on the first call only.
 * Reseeding on every call would give identical contents to arrays that are
 * initialised within the same second.
 */
void initialData(float *ip, int size)
{
    static bool seeded = false;

    if (!seeded)
    {
        srand((unsigned) time(NULL));
        seeded = true;
    }

    for (int i = 0; i < size; i++)
    {
        // Keep the low 8 bits (0..255) and scale to 0.0..25.5.
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }

    return;
}


/*
 * Host reference implementation of element-wise addition:
 * C[i] = A[i] + B[i] for every i in [0, N).
 *
 * A, B - input arrays with at least N elements each
 * C    - output array with room for at least N elements
 * N    - number of elements to process; nothing is written when N <= 0
 */
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    const float *lhs = A;
    const float *rhs = B;
    float *out = C;

    // Walk the three arrays in lock-step, front to back.
    for (int remaining = N; remaining > 0; --remaining)
    {
        *out++ = *lhs++ + *rhs++;
    }
}

/*
 * Kernel: element-wise addition, C[i] = A[i] + B[i], one element per thread.
 *
 * A, B - device input arrays with at least N elements each
 * C    - device output array with room for at least N elements
 * N    - number of elements to process
 *
 * Expects a 1D launch. The element index is derived from both the block and
 * the thread index, so launches with more than one block cover distinct
 * elements; with a single block the index equals threadIdx.x. Threads whose
 * index is N or greater do nothing.
 */
__global__ void sumArraysOnGPU(float *A, float *B, float *C, const int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < N) C[i] = A[i] + B[i];
}


/*
 * Adds two 32-element float vectors on the GPU with a single block and
 * verifies the result against the host implementation.
 *
 * Steps: allocate and initialise the host data, copy it to the device,
 * launch sumArraysOnGPU, copy the result back, compute the reference with
 * sumArraysOnHost, and compare the two with checkResult.
 *
 * Returns 0 on success and 1 if a host allocation fails; CUDA failures are
 * handled by the CHECK macro.
 */
int main(int argc, char **argv)
{
    printf("%s Starting...\n", argv[0]);

    int dev = 0;
    CHECK(cudaSetDevice(dev));

    int nElem = 1 << 5;
    printf("Vector size %d\n", nElem);

    size_t nBytes = nElem * sizeof(float);

    float *h_A, *h_B, *hostRef, *gpuRef;
    h_A = (float *)malloc(nBytes);
    h_B = (float *)malloc(nBytes);
    hostRef = (float *)malloc(nBytes);
    gpuRef = (float *)malloc(nBytes);

    if (h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL)
    {
        fprintf(stderr, "Failed to allocate host memory\n");
        // free(NULL) is a no-op, so releasing all four pointers is safe.
        free(h_A);
        free(h_B);
        free(hostRef);
        free(gpuRef);
        return(1);
    }

    initialData(h_A, nElem);
    initialData(h_B, nElem);

    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    float *d_A, *d_B, *d_C;
    CHECK(cudaMalloc((float**)&d_A, nBytes));
    CHECK(cudaMalloc((float**)&d_B, nBytes));
    CHECK(cudaMalloc((float**)&d_C, nBytes));

    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));
    // gpuRef was zeroed above, so this clears the device output buffer.
    CHECK(cudaMemcpy(d_C, gpuRef, nBytes, cudaMemcpyHostToDevice));

    // One block with one thread per element.
    dim3 block (nElem);
    dim3 grid (1);

    sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C, nElem);
    // Launch-configuration errors are only visible via cudaGetLastError().
    CHECK(cudaGetLastError());
    printf("Execution configure <<<%d, %d>>>\n", grid.x, block.x);

    // Copy the device result back to the host.
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));

    sumArraysOnHost(h_A, h_B, hostRef, nElem);

    checkResult(hostRef, gpuRef, nElem);

    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_B));
    CHECK(cudaFree(d_C));

    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    CHECK(cudaDeviceReset());
    return(0);
}
Loading

0 comments on commit aa31653

Please sign in to comment.