Add files via upload

dedoogong · May 1, 2018 · aa31653 · aa31653
1 parent 17618a9
commit aa31653
Show file tree

Hide file tree

Showing 61 changed files with 8,911 additions and 0 deletions.
diff --git a/chapter01/Makefile b/chapter01/Makefile
@@ -0,0 +1,8 @@
+# CUDA samples built by this directory; each name needs a matching <name>.cu.
+APPS=hello
+
+# `all` and `clean` name actions, not files. Declaring them phony keeps a
+# stray file called "all" or "clean" from silently disabling the rule.
+.PHONY: all clean
+
+all: ${APPS}
+
+# Pattern rule: build <name> from <name>.cu with code for sm_52 and sm_61.
+%: %.cu
+	nvcc -O2 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_61,code=sm_61 -o $@ $<
+
+# Remove the built executables.
+clean:
+	rm -f ${APPS}
diff --git a/chapter01/hello.cu b/chapter01/hello.cu
@@ -0,0 +1,18 @@
+#include "../common/common.h"
+#include <stdio.h>
+
+// Kernel: every launched thread prints one greeting line via device printf.
+__global__ void helloOnGPU(void)
+{
+    printf("Hello World on GPU!\n");
+}
+
+/*
+ * Host entry point: prints a greeting on the CPU, then launches helloOnGPU
+ * with one block of 10 threads so that each thread prints its own greeting.
+ * CUDA calls are wrapped in CHECK (from ../common/common.h, not shown here).
+ * Returns 0.
+ */
+int main(int argc, char **argv)
+{
+    printf("Hello World on CPU!\n");
+
+    helloOnGPU<<<1, 10>>>();
+    // A kernel launch returns no status of its own; ask for it explicitly so
+    // a bad launch configuration is reported instead of silently ignored.
+    CHECK(cudaGetLastError());
+    // Wait for the kernel so that execution errors surface here and the
+    // device-side printf output is flushed before the device is reset.
+    CHECK(cudaDeviceSynchronize());
+
+    CHECK(cudaDeviceReset());
+    return 0;
+}
+
+
diff --git a/chapter02/Makefile b/chapter02/Makefile
@@ -0,0 +1,14 @@
+# CUDA samples (built with nvcc) and plain C samples (built with gcc).
+CU_APPS=checkDeviceInfor checkThreadIndex sumArraysOnGPU-timer \
+        sumMatrixOnGPU-1D-grid-1D-block sumMatrixOnGPU-2D-grid-2D-block \
+        checkDimension defineGridBlock sumArraysOnGPU-small-case \
+        sumMatrixOnGPU-2D-grid-1D-block sumMatrixOnGPU
+C_APPS=sumArraysOnHost
+
+# `all` and `clean` name actions, not files. Declaring them phony keeps a
+# stray file called "all" or "clean" from silently disabling the rule.
+.PHONY: all clean
+
+all: ${C_APPS} ${CU_APPS}
+
+# Pattern rule: build <name> from <name>.cu with code for sm_52 and sm_61.
+%: %.cu
+	nvcc -O2 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_61,code=sm_61 -o $@ $<
+
+# Pattern rule: build <name> from <name>.c as C99.
+%: %.c
+	gcc -O2 -std=c99 -o $@ $<
+
+# Remove every built executable.
+clean:
+	rm -f ${CU_APPS} ${C_APPS}
diff --git a/chapter02/checkDeviceInfor.cu b/chapter02/checkDeviceInfor.cu
@@ -0,0 +1,86 @@
+#include "../common/common.h"
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+
+/*
+ * Queries device 0 through the CUDA runtime and prints its name, driver and
+ * runtime versions, memory sizes, clock rates and launch limits.
+ *
+ * Exits with EXIT_FAILURE when the device count cannot be queried, and with
+ * EXIT_SUCCESS either right after reporting that no device is present or
+ * once the full report has been printed. The remaining CUDA calls go through
+ * CHECK (from ../common/common.h, not shown here).
+ */
+int main(int argc, char **argv)
+{
+    printf("%s Starting...\n", argv[0]);
+
+    int deviceCount = 0;
+    const cudaError_t countStatus = cudaGetDeviceCount(&deviceCount);
+
+    if (countStatus != cudaSuccess)
+    {
+        // Report the failure here rather than continuing with a count of 0.
+        printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)countStatus,
+               cudaGetErrorString(countStatus));
+        exit(EXIT_FAILURE);
+    }
+
+    if (deviceCount == 0)
+    {
+        printf("There are no available device(s) that support CUDA\n");
+        // Nothing to describe: stop before cudaSetDevice(0) would fail.
+        exit(EXIT_SUCCESS);
+    }
+
+    printf("Detected %d CUDA Capable device(s)\n", deviceCount);
+
+    int dev = 0, driverVersion = 0, runtimeVersion = 0;
+    CHECK(cudaSetDevice(dev));
+    cudaDeviceProp deviceProp;
+    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
+    printf("Device %d: \"%s\"\n", dev, deviceProp.name);
+
+    CHECK(cudaDriverGetVersion(&driverVersion));
+    CHECK(cudaRuntimeGetVersion(&runtimeVersion));
+    // Each version integer is split as major = v / 1000, minor = (v % 100) / 10.
+    printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n",
+           driverVersion / 1000, (driverVersion % 100) / 10,
+           runtimeVersion / 1000, (runtimeVersion % 100) / 10);
+    printf("  CUDA Capability Major/Minor version number:    %d.%d\n",
+           deviceProp.major, deviceProp.minor);
+    // Convert in double: a float cast cannot hold a multi-gigabyte byte
+    // count exactly, and the explicit product avoids depending on pow().
+    printf("  Total amount of global memory:                 %.2f GBytes (%llu "
+           "bytes)\n",
+           (double)deviceProp.totalGlobalMem / (1024.0 * 1024.0 * 1024.0),
+           (unsigned long long)deviceProp.totalGlobalMem);
+    printf("  GPU Clock rate:                                %.0f MHz (%0.2f "
+           "GHz)\n", deviceProp.clockRate * 1e-3f,
+           deviceProp.clockRate * 1e-6f);
+    printf("  Memory Clock rate:                             %.0f Mhz\n",
+           deviceProp.memoryClockRate * 1e-3f);
+    printf("  Memory Bus Width:                              %d-bit\n",
+           deviceProp.memoryBusWidth);
+
+    // The L2 line is only printed when the device reports a non-zero size.
+    if (deviceProp.l2CacheSize)
+    {
+        printf("  L2 Cache Size:                                 %d bytes\n",
+               deviceProp.l2CacheSize);
+    }
+
+    printf("  Max Texture Dimension Size (x,y,z)             1D=(%d), "
+           "2D=(%d,%d), 3D=(%d,%d,%d)\n", deviceProp.maxTexture1D,
+           deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
+           deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1],
+           deviceProp.maxTexture3D[2]);
+    printf("  Max Layered Texture Size (dim) x layers        1D=(%d) x %d, "
+           "2D=(%d,%d) x %d\n", deviceProp.maxTexture1DLayered[0],
+           deviceProp.maxTexture1DLayered[1], deviceProp.maxTexture2DLayered[0],
+           deviceProp.maxTexture2DLayered[1],
+           deviceProp.maxTexture2DLayered[2]);
+    // The size_t fields below are printed with %zu; %lu is only correct
+    // where unsigned long happens to have the same width as size_t.
+    printf("  Total amount of constant memory:               %zu bytes\n",
+           deviceProp.totalConstMem);
+    printf("  Total amount of shared memory per block:       %zu bytes\n",
+           deviceProp.sharedMemPerBlock);
+    printf("  Total number of registers available per block: %d\n",
+           deviceProp.regsPerBlock);
+    printf("  Warp size:                                     %d\n",
+           deviceProp.warpSize);
+    printf("  Maximum number of threads per multiprocessor:  %d\n",
+           deviceProp.maxThreadsPerMultiProcessor);
+    printf("  Maximum number of threads per block:           %d\n",
+           deviceProp.maxThreadsPerBlock);
+    printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n",
+           deviceProp.maxThreadsDim[0],
+           deviceProp.maxThreadsDim[1],
+           deviceProp.maxThreadsDim[2]);
+    printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n",
+           deviceProp.maxGridSize[0],
+           deviceProp.maxGridSize[1],
+           deviceProp.maxGridSize[2]);
+    printf("  Maximum memory pitch:                          %zu bytes\n",
+           deviceProp.memPitch);
+
+    exit(EXIT_SUCCESS);
+}
diff --git a/chapter02/checkDimension.cu b/chapter02/checkDimension.cu
@@ -0,0 +1,31 @@
+#include "../common/common.h"
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+
+// Kernel: each thread prints its thread index, its block index, and the
+// block and grid dimensions of the launch, one line per quantity.
+__global__ void checkIndex(void)
+{
+    const uint3 tid  = threadIdx;
+    const uint3 bid  = blockIdx;
+    const dim3  bdim = blockDim;
+    const dim3  gdim = gridDim;
+
+    printf("threadIdx:(%d, %d, %d)\n", tid.x, tid.y, tid.z);
+    printf("blockIdx:(%d, %d, %d)\n", bid.x, bid.y, bid.z);
+
+    printf("blockDim:(%d, %d, %d)\n", bdim.x, bdim.y, bdim.z);
+    printf("gridDim:(%d, %d, %d)\n", gdim.x, gdim.y, gdim.z);
+}
+
+/*
+ * Builds a 1D launch configuration for nElem = 6 elements with 3 threads per
+ * block, prints the grid and block dimensions as seen on the host, then
+ * launches checkIndex so each thread prints the same values from the device.
+ * CUDA calls are wrapped in CHECK (from ../common/common.h, not shown here).
+ * Returns 0.
+ */
+int main(int argc, char **argv)
+{
+    int nElem = 6;
+
+    dim3 block(3);
+    // Ceiling division: enough blocks to cover nElem.
+    dim3 grid((nElem + block.x - 1) / block.x);
+
+    printf("grid.x %d grid.y %d grid.z %d\n", grid.x, grid.y, grid.z);
+    printf("block.x %d block.y %d block.z %d\n", block.x, block.y, block.z);
+
+    checkIndex<<<grid, block>>>();
+    // A kernel launch returns no status of its own; query it explicitly.
+    CHECK(cudaGetLastError());
+    // Wait for the kernel so execution errors surface here and the device
+    // printf output is flushed before the device is reset.
+    CHECK(cudaDeviceSynchronize());
+
+    CHECK(cudaDeviceReset());
+
+    return(0);
+}
diff --git a/chapter02/checkThreadIndex.cu b/chapter02/checkThreadIndex.cu
@@ -0,0 +1,79 @@
+#include "../common/common.h"
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+
+/*
+ * Prints the nx-by-ny integer matrix stored row by row at C: a header line,
+ * then one text line per row with each value in a 3-character field, then a
+ * trailing blank line.
+ */
+void printMatrix(int *C, const int nx, const int ny)
+{
+    printf("\nMatrix: (%d.%d)\n", nx, ny);
+
+    const int *row = C;
+
+    for (int y = 0; y < ny; ++y, row += nx)
+    {
+        for (const int *cell = row; cell < row + nx; ++cell)
+        {
+            printf("%3d", *cell);
+        }
+
+        printf("\n");
+    }
+
+    printf("\n");
+}
+
+/*
+ * Kernel: each thread derives its 2D coordinate (ix, iy) from its thread and
+ * block indices, maps it to the row-major offset iy * nx + ix, and prints
+ * its indices together with the value of A at that offset.
+ *
+ * A  - device pointer to at least nx * ny ints
+ * nx - number of columns
+ * ny - number of rows
+ *
+ * Threads whose coordinate falls outside nx-by-ny return without touching A,
+ * so the grid may overshoot the matrix in either dimension.
+ */
+__global__ void printThreadIndex(int *A, const int nx, const int ny)
+{
+    int ix = threadIdx.x + blockIdx.x * blockDim.x;
+    int iy = threadIdx.y + blockIdx.y * blockDim.y;
+
+    // The launch grid is rounded up to whole blocks; without this guard the
+    // surplus threads would read past the end of A.
+    if (ix >= nx || iy >= ny) return;
+
+    unsigned int idx = iy * nx + ix;
+
+    printf("thread_id (%d,%d) block_id (%d,%d) coordinate (%d,%d) global index"
+           " %2d ival %2d\n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y,
+           ix, iy, idx, A[idx]);
+}
+
+/*
+ * Fills an 8x6 host matrix with 0..47, prints it, copies it to the device and
+ * launches printThreadIndex with 4x2 blocks so every thread prints the
+ * element it maps to. CUDA calls are wrapped in CHECK (from
+ * ../common/common.h, not shown here).
+ * Returns 0 on success, 1 if the host allocation fails.
+ */
+int main(int argc, char **argv)
+{
+    printf("%s Starting...\n", argv[0]);
+
+    int dev = 0;
+    cudaDeviceProp deviceProp;
+    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
+    printf("Using Device %d: %s\n", dev, deviceProp.name);
+    CHECK(cudaSetDevice(dev));
+
+    int nx = 8;
+    int ny = 6;
+    int nxy = nx * ny;
+    // The buffers hold ints, so size them with sizeof(int), not sizeof(float).
+    const size_t nBytes = nxy * sizeof(int);
+
+    int *h_A;
+    h_A = (int *)malloc(nBytes);
+
+    if (h_A == NULL)
+    {
+        fprintf(stderr, "Host allocation of %zu bytes failed\n", nBytes);
+        return 1;
+    }
+
+    for (int i = 0; i < nxy; i++)
+    {
+        h_A[i] = i;
+    }
+    printMatrix(h_A, nx, ny);
+
+    int *d_MatA;
+    CHECK(cudaMalloc((void **)&d_MatA, nBytes));
+
+    CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));
+
+    dim3 block(4, 2);
+    // Ceiling division in each dimension so the grid covers the whole matrix.
+    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
+
+    printThreadIndex<<<grid, block>>>(d_MatA, nx, ny);
+    // Launch-configuration errors are reported by cudaGetLastError ...
+    CHECK(cudaGetLastError());
+    // ... while errors raised during execution only surface once the host
+    // synchronizes with the kernel.
+    CHECK(cudaDeviceSynchronize());
+
+    CHECK(cudaFree(d_MatA));
+    free(h_A);
+
+    CHECK(cudaDeviceReset());
+
+    return (0);
+}
diff --git a/chapter02/defineGridBlock.cu b/chapter02/defineGridBlock.cu
@@ -0,0 +1,29 @@
+#include "../common/common.h"
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+/*
+ * Prints the 1D grid size needed to cover 1024 elements for block sizes
+ * 1024, 512, 256 and 128, one line per configuration, then resets the device.
+ * Returns 0.
+ */
+int main(int argc, char **argv)
+{
+    const int nElem = 1024;
+    const unsigned int blockSizes[] = {1024, 512, 256, 128};
+    const int numConfigs = sizeof(blockSizes) / sizeof(blockSizes[0]);
+
+    for (int c = 0; c < numConfigs; ++c)
+    {
+        dim3 block(blockSizes[c]);
+        // Ceiling division: number of blocks needed to cover nElem.
+        dim3 grid((nElem + block.x - 1) / block.x);
+        printf("grid.x %d block.x %d \n", grid.x, block.x);
+    }
+
+    CHECK(cudaDeviceReset());
+
+    return(0);
+}
+
diff --git a/chapter02/sumArraysOnGPU-small-case.cu b/chapter02/sumArraysOnGPU-small-case.cu
@@ -0,0 +1,112 @@
+#include "../common/common.h"
+#include <cuda_runtime.h>
+#include <stdio.h> 
+
+/*
+ * Compares the first N elements of hostRef and gpuRef and prints the verdict.
+ *
+ * At the first index where the two values differ by more than 1.0E-8 (or
+ * where the difference is NaN) it prints "Arrays do not match!" followed by
+ * both values and the index, then stops. If no such index exists it prints
+ * "Arrays match.".
+ */
+void checkResult(float *hostRef, float *gpuRef, const int N)
+{
+    const double epsilon = 1.0E-8;
+    bool match = true;
+
+    for (int i = 0; i < N; i++)
+    {
+        const float diff = hostRef[i] - gpuRef[i];
+
+        // Written as a negated "within tolerance" test for two reasons: it
+        // needs no abs() overload for floats (integer abs() would truncate
+        // the difference), and a NaN difference fails it and is reported
+        // rather than being counted as a match.
+        if (!(diff <= epsilon && diff >= -epsilon))
+        {
+            match = false;
+            printf("Arrays do not match!\n");
+            printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i],
+                   gpuRef[i], i);
+            break;
+        }
+    }
+
+    if (match) printf("Arrays match.\n\n");
+}
+
+
+/*
+ * Fills ip[0 .. size-1] with pseudo-random values (rand() & 0xFF) / 10.0f,
+ * i.e. multiples of 0.1 between 0.0 and 25.5.
+ *
+ * The generator is seeded from the clock on the first call only. Reseeding
+ * on every call would give two arrays filled within the same second the
+ * same seed, and therefore identical contents.
+ */
+void initialData(float *ip, int size)
+{
+    static bool seeded = false;
+
+    if (!seeded)
+    {
+        srand((unsigned) time(NULL));
+        seeded = true;
+    }
+
+    for (int i = 0; i < size; i++)
+    {
+        ip[i] = (float)(rand() & 0xFF) / 10.0f;
+    }
+}
+
+
+/*
+ * Host reference: writes A[k] + B[k] into C[k] for k = 0 .. N-1, in
+ * ascending order. Does nothing when N <= 0.
+ */
+void sumArraysOnHost(float *A, float *B, float *C, const int N)
+{
+    const float *lhs = A;
+    const float *rhs = B;
+    float *out = C;
+
+    for (int left = N; left > 0; --left)
+    {
+        *out++ = *lhs++ + *rhs++;
+    }
+}
+
+/*
+ * Kernel: element-wise sum C[i] = A[i] + B[i] for i < N.
+ *
+ * Each thread handles the element at its global 1D index
+ * blockIdx.x * blockDim.x + threadIdx.x. With a single block this equals
+ * threadIdx.x, so a one-block launch behaves as before. With several blocks
+ * each block now covers its own slice instead of every block recomputing
+ * the first blockDim.x elements. Threads at or beyond N do nothing.
+ */
+__global__ void sumArraysOnGPU(float *A, float *B, float *C, const int N)
+{
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (i < N) C[i] = A[i] + B[i];
+}
+
+
+/*
+ * Adds two 32-element float vectors on the GPU with a single block of 32
+ * threads, recomputes the sum on the host, and prints whether the two
+ * results agree. CUDA calls are wrapped in CHECK (from ../common/common.h,
+ * not shown here).
+ * Returns 0 on success, 1 if a host allocation fails.
+ */
+int main(int argc, char **argv)
+{
+    printf("%s Starting...\n", argv[0]);
+
+    int dev = 0;
+    CHECK(cudaSetDevice(dev));
+
+    int nElem = 1 << 5;
+    printf("Vector size %d\n", nElem);
+
+    size_t nBytes = nElem * sizeof(float);
+
+    float *h_A, *h_B, *hostRef, *gpuRef;
+    h_A     = (float *)malloc(nBytes);
+    h_B     = (float *)malloc(nBytes);
+    hostRef = (float *)malloc(nBytes);
+    gpuRef  = (float *)malloc(nBytes);
+
+    if (h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL)
+    {
+        fprintf(stderr, "Host allocation of %zu bytes failed\n", nBytes);
+        // free(NULL) is a no-op, so releasing all four is safe.
+        free(h_A);
+        free(h_B);
+        free(hostRef);
+        free(gpuRef);
+        return 1;
+    }
+
+    initialData(h_A, nElem);
+    initialData(h_B, nElem);
+
+    memset(hostRef, 0, nBytes);
+    memset(gpuRef,  0, nBytes);
+
+    float *d_A, *d_B, *d_C;
+    CHECK(cudaMalloc((float**)&d_A, nBytes));
+    CHECK(cudaMalloc((float**)&d_B, nBytes));
+    CHECK(cudaMalloc((float**)&d_C, nBytes));
+
+    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
+    CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));
+    // Start the output buffer from the zeroed host copy.
+    CHECK(cudaMemcpy(d_C, gpuRef, nBytes, cudaMemcpyHostToDevice));
+
+    // One block holding one thread per element.
+    dim3 block (nElem);
+    dim3 grid  (1);
+
+    sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C, nElem);
+    // A kernel launch returns no status of its own; query it explicitly so a
+    // rejected launch is reported here rather than as a result mismatch.
+    CHECK(cudaGetLastError());
+    printf("Execution configure <<<%d, %d>>>\n", grid.x, block.x);
+
+    // Blocking copy: also waits for the kernel to finish.
+    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
+
+    sumArraysOnHost(h_A, h_B, hostRef, nElem);
+
+    checkResult(hostRef, gpuRef, nElem);
+
+    CHECK(cudaFree(d_A));
+    CHECK(cudaFree(d_B));
+    CHECK(cudaFree(d_C));
+
+    free(h_A);
+    free(h_B);
+    free(hostRef);
+    free(gpuRef);
+
+    CHECK(cudaDeviceReset());
+    return(0);
+}