Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
dedoogong authored Nov 4, 2018
1 parent 1998157 commit f4fe2e9
Show file tree
Hide file tree
Showing 2 changed files with 352 additions and 0 deletions.
105 changes: 105 additions & 0 deletions 2D_conv_basic.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
/*
Two Dimensional (2D) Image Convolution : A Basic Approach
Image Convolution is a very basic operation in the field of Image Processing.
It is required in many algorithms in Image Processing. Also it is very compute intensive task as
it involves operation with pixels.
It's a transformation involving a Mask and an Image, in which the mask performs the operation on the image.
General steps of Convolution:
The center point of Mask is first placed on to the Image Pixel.
Each pixel of Mask is multiplied to corresponding pixel of Image.
A complete sum (Cumulative sum) of all multiplications performed between Mask and Image pixels are then
put in the related Image pixel value as a result of Convolution.
Following is the sample code of Image convolution. The Mask_width, Mask_height are set to 3 as
it's a 3 X 3 2D array with all values set to 1. The width and height are also set to 3,
as I considered image of size 3 X 3 only for the sake of the example.
You may change the values as per your need.
*/


#include<stdio.h>
#include<stdlib.h>
#define Mask_width 3
#define Mask_height 3
#define width 3
#define height 3
float convolution_2D_OnHost(float * N,float * M,int i,int j);

/*
 * Demo driver: fills a width x height image and a Mask_width x Mask_height
 * mask with ones, convolves the image on the host, and prints the input,
 * the mask, and the convolved output.
 * Returns 0 on success, 1 when a host allocation fails.
 */
int main()
{
    float *input;   /* width x height image, row-major */
    float *Mask;    /* Mask_width x Mask_height mask, row-major */
    float *output;  /* convolution result, same shape as input */

    input = (float *) malloc(sizeof(float) * width * height);
    Mask = (float *) malloc(sizeof(float) * Mask_width * Mask_height);
    output = (float *) malloc(sizeof(float) * width * height);
    /* Fix: the original never checked whether the allocations succeeded. */
    if (input == NULL || Mask == NULL || output == NULL) {
        printf("Memory allocation failed\n");
        free(input);
        free(Mask);
        free(output);
        return 1;
    }

    for (int i = 0; i < width * height; i++) {
        input[i] = 1.0f;
    }
    for (int i = 0; i < Mask_width * Mask_height; i++) {
        Mask[i] = 1.0f;
    }

    printf("\nInput Array:\n");
    for (int i = 0; i < width * height; i++) {
        if (i > 0 && (i % width == 0))
            printf("\n");
        printf("%0.2f ", input[i]);
    }
    printf("\n");

    printf("\nMask:\n");
    for (int i = 0; i < Mask_width * Mask_height; i++) {
        if (i > 0 && (i % Mask_width == 0))
            printf("\n");
        printf("%0.2f ", Mask[i]);
    }
    printf("\n");

    /* Convolve every pixel. Rows run over height and columns over width
       (the original iterated rows over width and columns over height,
       which only worked because the image is square). */
    for (int i = 0; i < height; i++) {
        for (int j = 0; j < width; j++) {
            output[(i * width) + j] = convolution_2D_OnHost(input, Mask, i, j);
        }
    }

    printf("\nOutput:\n");
    for (int i = 0; i < width * height; i++) {
        if (i > 0 && (i % width == 0))
            printf("\n");
        printf("%d = %0.2f \t", i, *(output + i));
    }
    printf("\n"); /* fix: the original left the last output line unterminated */

    free(input);
    free(Mask);
    free(output);
    return 0;
}

/*
 * Computes one output pixel of the 2D convolution of image N with mask M,
 * centered at row i, column j.
 * N: width x height image, row-major. M: Mask_height x Mask_width mask,
 * row-major. Neighborhood positions outside the image contribute nothing
 * (zero padding at the borders).
 * Returns the accumulated sum for output pixel (i, j).
 */
float convolution_2D_OnHost(float * N,float * M,int i,int j)
{
    float Pvalue = 0.0f;
    /* Top-left corner of the neighborhood covered by the mask.
       Fix: the row offset now derives from Mask_height and is bounded by
       height, the column offset from Mask_width bounded by width; the
       original used Mask_width/width for both axes, which only worked
       because the mask and the image are square. */
    int N_start_point_i = i - (Mask_height / 2); /* first image row touched */
    int N_start_point_j = j - (Mask_width / 2);  /* first image column touched */

    for (int k = 0; k < Mask_height; k++)        /* mask row */
    {
        for (int l = 0; l < Mask_width; l++)     /* mask column */
        {
            int row = N_start_point_i + k;
            int col = N_start_point_j + l;
            if (row >= 0 && row < height && col >= 0 && col < width)
            {
                Pvalue += N[row * width + col] * M[(k * Mask_width) + l];
            }
        }
    }
    return Pvalue;
}
247 changes: 247 additions & 0 deletions 2D_conv_opt.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,247 @@
/*
Two Dimensional (2D) Image Convolution in CUDA by Shared & Constant Memory: An Optimized way
After learning the concept of two dimension (2D) Convolution and its implementation in C language;
the next step is to learn to optimize it. As Convolution is one of the most compute-intensive tasks in
Image Processing, it is always better to save time required for it. So, today I am going to share a technique
to optimize the Convolution process by using CUDA. Here we will use Shared Memory and Constant Memory resources
available in CUDA to get fastest implementation of Convolution.
Here is the strategy,
We will store convolution mask in Constant Memory as all threads will need the same Mask. Constant Memory is
the off-chip memory present on GPU card which is read-only memory and faster than Global memory. The Mask is
passed to kernel with the keyword "__restrict__". Check for it in the kernel definition.
Instead of accessing complete image and performing the convolution operation on it; we will fix TILE size.
The image pixels are copied to this TILE first and then convolution is performed accordingly.
This TILE will be stored in the Shared Memory of GPU card which is on-chip memory and faster than
the Global Memory.
Following is the sample code of the implementation. You can also get it from my GitHub repository.
*/
/*
* Title: 2D Image Convolution on GPU by using Shared Memory and Constant Memory.
*
* Image Size: 2048 X 2048
* Mask Size: 64 X 64
* TILE_WIDTH 32
*
*
* */
#include<stdio.h>
#include<cuda_runtime_api.h>
#include<cuda.h>
#include<stdlib.h>
#include<assert.h>

/* wbCheck: wraps a CUDA runtime call; on failure prints the source line and
 * the CUDA error string, then returns -1 from the enclosing function. */
#define wbCheck(stmt) do { \
    cudaError_t err = stmt; \
    if (err != cudaSuccess) { \
        printf( "Failed to run stmt %d ", __LINE__); \
        printf( "Got CUDA error ... %s ", cudaGetErrorString(err)); \
        return -1; \
    } \
} while(0)


/* Mask and tile geometry. */
#define Mask_width 64
#define Mask_height 64
/* Fix: parenthesized so the radius behaves as a single value inside larger
 * expressions; the original `Mask_width/2` expansion was precedence-unsafe
 * (e.g. `a % Mask_radius_x` would expand to `a % Mask_width / 2`). */
#define Mask_radius_x (Mask_width/2)
#define Mask_radius_y (Mask_height/2)
#define TILE_WIDTH 32 /* 32 X 32 thread tile (comment fixed: previously said "16 X 16") */
/* Shared-memory tile side lengths: output tile plus the mask halo. */
#define w_x (TILE_WIDTH + Mask_width - 1)
#define w_y (TILE_WIDTH + Mask_height - 1)
#define clamp(x) (min(max((x), 0.0), 1.0))

//@@ INSERT CODE HERE
// 2D convolution of image I with mask M into P, one channel at a time.
// Expected launch: blockDim = (TILE_WIDTH, TILE_WIDTH, 1), one thread per
// output pixel of a TILE_WIDTH x TILE_WIDTH tile; gridDim tiles the image.
// Static shared memory: N_ds, w_y * w_x floats (output tile plus mask halo).
// M is read through the read-only data cache (const __restrict__).
// I and P are row-major images of width x height with interleaved channels.
__global__ void convolution(float *I, const float* __restrict__ M, float *P,int channels, int width, int height)
{
__shared__ float N_ds[w_y][w_x];
int k;
// Channels are processed sequentially; the shared tile is reused per channel.
for (k = 0; k < channels; k++)
{
// Phase 1: cooperatively load the w_y x w_x tile (with halo) into shared
// memory. There are more tile elements than threads, so each thread loads
// several elements in batches. For each batch:
//   dest         - this thread's flat index into the shared tile
//   destY, destX - row/column of that index inside N_ds
//   srcY, srcX   - corresponding image coordinates (shifted back by the
//                  mask radius so the halo around the tile is included)
//   src          - flat index into I for channel k
// Positions falling outside the image are zero-filled (zero padding).
// NOTE(review): srcY subtracts Mask_radius_x and srcX subtracts
// Mask_radius_y - the x/y names look swapped, but both radii are equal here
// because the mask is square (64 x 64); confirm before using a non-square mask.

// First batch: thread (ty, tx) loads flat tile element ty*TILE_WIDTH + tx.
int dest = threadIdx.y * TILE_WIDTH + threadIdx.x,
destY = dest / w_x, destX = dest % w_x,
srcY = blockIdx.y * TILE_WIDTH + destY - Mask_radius_x,
srcX = blockIdx.x * TILE_WIDTH + destX - Mask_radius_y,
src = (srcY * width + srcX) * channels + k;
if (srcY >= 0 && srcY < height && srcX >= 0 && srcX < width)
N_ds[destY][destX] = I[src];
else
N_ds[destY][destX] = 0.0;

// Remaining batches: advance the flat index by one whole block of threads
// (TILE_WIDTH*TILE_WIDTH) per iteration until the tile is covered; the
// destY/destX guard discards the overshoot of the final batch.
for (int iter=1; iter <= (w_x * w_y) / (TILE_WIDTH*TILE_WIDTH); iter++)
{
dest = threadIdx.y * TILE_WIDTH + threadIdx.x + iter*(TILE_WIDTH * TILE_WIDTH);
destY = dest / w_x, destX = dest % w_x;
srcY = blockIdx.y * TILE_WIDTH + destY - Mask_radius_x;
srcX = blockIdx.x * TILE_WIDTH + destX - Mask_radius_y;
src = (srcY * width + srcX) * channels + k;
if (destY < w_y && destX < w_x)
{
if (srcY >= 0 && srcY < height && srcX >= 0 && srcX < width)
N_ds[destY][destX] = I[src];
else
N_ds[destY][destX] = 0.0;
}
}
// Barrier: the whole tile must be loaded before any thread reads from it.
__syncthreads();

// Phase 2: each thread accumulates the mask-weighted sum over its shared-
// memory neighborhood and writes one output pixel.
// NOTE(review): both loops run to Mask_width; the y direction is covered
// correctly only because Mask_width == Mask_height - confirm if the mask
// ever becomes non-square.
float accum = 0;
int y, x;
for (y = 0; y < Mask_width; y++)
for (x = 0; x < Mask_width; x++)
accum += N_ds[threadIdx.y + y][threadIdx.x + x] * M[y * Mask_width + x];
// y, x are reused below as this thread's global output coordinates.
y = blockIdx.y * TILE_WIDTH + threadIdx.y;
x = blockIdx.x * TILE_WIDTH + threadIdx.x;
if (y < height && x < width)
//P[(y * width + x) * channels + k] = clamp(accum);
P[(y * width + x) * channels + k] = accum;
// Barrier: all reads of N_ds must finish before the next channel's load
// overwrites the tile.
__syncthreads();
}
}

float convolution_2D_OnHost(float * N,float * M,int width, int height,int i,int j);

/*
 * Host driver: builds an all-ones 2048 x 2048 image and 64 x 64 mask, runs
 * the shared-memory convolution kernel on the GPU, recomputes the
 * convolution on the CPU, and reports any element mismatches between the
 * two results.
 * Returns 0 on success, -1 on an allocation or CUDA API failure (wbCheck).
 */
int main() {

    int maskRows = Mask_height;     // mask geometry comes from the #defines above
    int maskColumns = Mask_width;

    int imageChannels = 1;          // kernel supports interleaved channels; 1 here
    int imageWidth = 2048;
    int imageHeight = 2048;

    float *hostInputImageData;
    float *hostOutputImageData;     // device result copied back to the host
    float *hostMaskData;
    float *deviceInputImageData;
    float *deviceOutputImageData;
    float *deviceMaskData;
    float *outputImageOnHost;       // CPU reference result

    // All-ones mask: each output pixel equals the count of in-bounds neighbors.
    hostMaskData = (float *) malloc(sizeof(float) * maskRows * maskColumns);
    hostInputImageData = (float *) malloc(sizeof(float) * imageWidth * imageHeight);
    hostOutputImageData = (float *) malloc(sizeof(float) * imageWidth * imageHeight);
    outputImageOnHost = (float *) malloc(sizeof(float) * imageWidth * imageHeight);
    // Fix: the original never checked the host allocations.
    if (hostMaskData == NULL || hostInputImageData == NULL ||
        hostOutputImageData == NULL || outputImageOnHost == NULL) {
        printf("Host memory allocation failed\n");
        return -1;
    }
    for (int i = 0; i < maskRows * maskColumns; i++) {
        hostMaskData[i] = 1.0f;
    }
    for (int i = 0; i < imageWidth * imageHeight; i++) {  // all-ones image
        hostInputImageData[i] = 1.0f;
    }

    wbCheck(cudaMalloc((void **) &deviceInputImageData, imageWidth * imageHeight * imageChannels * sizeof(float)));
    wbCheck(cudaMalloc((void **) &deviceOutputImageData, imageWidth * imageHeight * imageChannels * sizeof(float)));
    wbCheck(cudaMalloc((void **) &deviceMaskData, maskRows * maskColumns * sizeof(float)));

    wbCheck(cudaMemcpy(deviceInputImageData,
                       hostInputImageData,
                       imageWidth * imageHeight * imageChannels * sizeof(float),
                       cudaMemcpyHostToDevice));
    wbCheck(cudaMemcpy(deviceMaskData,
                       hostMaskData,
                       maskRows * maskColumns * sizeof(float),
                       cudaMemcpyHostToDevice));

    // One thread per output pixel; ceil-div so partial edge tiles are covered.
    dim3 dimGrid(((imageWidth - 1) / TILE_WIDTH) + 1, ((imageHeight - 1) / TILE_WIDTH) + 1, 1);
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
    convolution<<<dimGrid, dimBlock>>>(deviceInputImageData, deviceMaskData, deviceOutputImageData,
                                       imageChannels, imageWidth, imageHeight);

    // Fix: check the launch and the asynchronous execution instead of
    // unconditionally printing "Got CUDA error ..." even on success, and
    // stop ignoring the cudaDeviceSynchronize result.
    wbCheck(cudaGetLastError());
    wbCheck(cudaDeviceSynchronize());

    wbCheck(cudaMemcpy(hostOutputImageData,
                       deviceOutputImageData,
                       imageWidth * imageHeight * imageChannels * sizeof(float),
                       cudaMemcpyDeviceToHost));

#if 0 // set to 1 to dump the device result
    printf("\n Output from Device:\n");
    for (int i = 0; i < imageWidth * imageHeight; i++) {
        if (i > 0 && (i % imageWidth == 0))
            printf("\n");
        printf("%0.2f \t", *(hostOutputImageData + i));
    }
#endif

    // CPU reference convolution over every pixel (row i, column j).
    for (int i = 0; i < imageWidth; i++) {
        for (int j = 0; j < imageHeight; j++) {
            outputImageOnHost[(i * imageWidth) + j] = convolution_2D_OnHost(hostInputImageData, hostMaskData, imageWidth, imageHeight, i, j);
        }
    }

#if 0 // set to 1 to dump the host result
    printf("\n Output from Host:\n");
    for (int i = 0; i < imageWidth * imageHeight; i++) {
        if (i > 0 && (i % imageWidth == 0))
            printf("\n");
        printf("%0.2f \t", *(outputImageOnHost + i));
    }
#endif

    // Exact float comparison is valid here only because every output value
    // is a small integer (all-ones image and mask), which float represents
    // exactly; use a tolerance if the inputs ever become non-integral.
    for (int i = 0; i < imageWidth * imageHeight; i++) {
        if (outputImageOnHost[i] != hostOutputImageData[i]) {
            // Fix: the column is i % imageWidth (the original used imageHeight,
            // harmless only because the image is square).
            printf("\nMismatch at (Row,Col) = [%d][%d], hostComputed[]: %0.0f And device[]: %0.0f", i / imageWidth, i % imageWidth, outputImageOnHost[i], hostOutputImageData[i]);
        }
    }

    cudaFree(deviceInputImageData);
    cudaFree(deviceOutputImageData);
    cudaFree(deviceMaskData);

    free(hostMaskData);
    free(hostInputImageData);
    free(hostOutputImageData);
    free(outputImageOnHost);  // fix: this buffer was leaked in the original

    printf("\n Number of Threads Per Block created in code: %d", TILE_WIDTH * TILE_WIDTH);
    printf("\n Number of Blocks Created :%d", (((imageWidth - 1) / TILE_WIDTH) + 1) * (((imageWidth - 1) / TILE_WIDTH) + 1));
    printf("\nNo Error\n");  // fix: was fused onto the previous line
    return 0;
}

/*
 * CPU reference: computes one output pixel of the 2D convolution of image N
 * with mask M, centered at row i, column j.
 * N: width x height image, row-major. M: Mask_height x Mask_width mask,
 * row-major. Neighborhood positions outside the image contribute nothing
 * (zero padding), matching the GPU kernel's border handling.
 * Returns the accumulated sum for output pixel (i, j).
 */
float convolution_2D_OnHost(float * N,float * M,int width, int height,int i,int j)
{
    float Pvalue = 0.0f;
    /* Fix: the row offset now derives from Mask_height and is bounded by
       height, the column offset from Mask_width bounded by width; the
       original mixed the axes, which was harmless only because the mask
       and the image are square. */
    int N_start_point_i = i - (Mask_height / 2); /* first image row touched */
    int N_start_point_j = j - (Mask_width / 2);  /* first image column touched */

    for (int k = 0; k < Mask_height; k++)        /* mask row */
    {
        for (int l = 0; l < Mask_width; l++)     /* mask column */
        {
            int row = N_start_point_i + k;
            int col = N_start_point_j + l;
            if (row >= 0 && row < height && col >= 0 && col < width)
            {
                Pvalue += N[row * width + col] * M[(k * Mask_width) + l];
            }
        }
    }

    // return(clamp(Pvalue));
    return Pvalue;
}

0 comments on commit f4fe2e9

Please sign in to comment.