-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
352 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
/*
Two Dimensional (2D) Image Convolution : A Basic Approach
Image convolution is a very basic operation in the field of image processing.
It is required by many algorithms in image processing. It is also a very compute-intensive task, as
it involves operations on pixels.
It is a transformation which involves a mask and an image on which that mask performs the operation.
General steps of convolution:
The center point of the mask is first placed onto an image pixel.
Each pixel of the mask is multiplied by the corresponding pixel of the image.
The complete (cumulative) sum of all multiplications performed between mask and image pixels is then
stored in the related image pixel value as the result of the convolution.
Following is sample code for image convolution. Mask_width and Mask_height are set to 3 as
it is a 3 x 3 2D array with all values set to 1. Width and height are also set to 3,
as I considered an image of size 3 x 3 only for the sake of the example.
You may change the values as per your need.
*/
|
||
|
||
#include<stdio.h> | ||
#include<stdlib.h> | ||
#define Mask_width 3 | ||
#define Mask_height 3 | ||
#define width 3 | ||
#define height 3 | ||
float convolution_2D_OnHost(float * N,float * M,int i,int j); | ||
|
||
int main() | ||
{ | ||
float * input; | ||
float * Mask; | ||
float * output; | ||
|
||
input=(float *)malloc(sizeof(float)*width*height); | ||
Mask=(float *)malloc(sizeof(float)*Mask_width*Mask_height); | ||
output=(float *)malloc(sizeof(float)*width*height); | ||
|
||
for(int i=0;i<width*height;i++) | ||
{ | ||
input[i]=1.0; | ||
} | ||
for(int i=0;i<Mask_width*Mask_height;i++) | ||
{ | ||
Mask[i]=1.0; | ||
} | ||
|
||
printf("\nInput Array:\n"); | ||
for(int i=0;i<width*height;i++) | ||
{ | ||
if(i>0 && (i%width==0)) | ||
printf("\n"); | ||
printf("%0.2f ",input[i]); | ||
|
||
}printf("\n"); | ||
|
||
printf("\nMask:\n"); | ||
for(int i=0;i<Mask_width*Mask_height;i++) | ||
{ | ||
if(i>0 && (i%Mask_width==0)) | ||
printf("\n"); | ||
printf("%0.2f ",Mask[i]); | ||
|
||
}printf("\n"); | ||
|
||
for(int i=0;i<width;i++) | ||
{ | ||
for(int j=0;j<height;j++) | ||
{ | ||
output[(i*width)+j]=convolution_2D_OnHost(input,Mask,i,j); | ||
} | ||
} | ||
|
||
printf("\nOutput:\n"); | ||
for(int i=0;i<width*height;i++) | ||
{ | ||
if(i>0 && (i%width==0)) | ||
printf("\n"); | ||
printf("%d = %0.2f \t",i,*(output+i)); | ||
} | ||
|
||
free(input); | ||
free(Mask); | ||
free(output); | ||
return 0; | ||
} | ||
|
||
float convolution_2D_OnHost(float * N,float * M,int i,int j) | ||
{ | ||
float Pvalue=0.0; | ||
int N_start_point_i=i-(Mask_width/2); | ||
int N_start_point_j=j-(Mask_height/2); | ||
|
||
for(int k=0;k<Mask_width;k++) | ||
{ | ||
for(int l=0;l<Mask_height;l++) | ||
{ | ||
if(((N_start_point_i+k)>=0) && ((N_start_point_i+k)<width)&&((N_start_point_j+l)>=0)&&((N_start_point_j+l)<height)) | ||
{ | ||
Pvalue+=N[(N_start_point_i+k)*width+(N_start_point_j+l)]*M[(k*Mask_width)+l]; | ||
} | ||
} | ||
} | ||
return(Pvalue); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,247 @@ | ||
/*
Two Dimensional (2D) Image Convolution in CUDA using Shared & Constant Memory: An Optimized Way
After learning the concept of two-dimensional (2D) convolution and its implementation in the C language,
the next step is to learn to optimize it. As convolution is one of the most compute-intensive tasks in
image processing, it is always better to save the time required for it. So, today I am going to share a technique
to optimize the convolution process by using CUDA. Here we will use the Shared Memory and Constant Memory resources
available in CUDA to get the fastest implementation of convolution.
Here is the strategy:
We will store the convolution mask in constant memory, as all threads will need the same mask. Constant memory is
off-chip memory present on the GPU card which is read-only and faster than global memory. The mask is
passed to the kernel with the keyword "__restrict__". Check for it in the kernel definition.
Instead of accessing the complete image and performing the convolution operation on it, we will fix a TILE size.
The image pixels are copied to this TILE first, and then convolution is performed accordingly.
This TILE will be stored in the shared memory of the GPU card, which is on-chip memory and faster than
the global memory.
Following is the sample code of the implementation. You can also get it from my GitHub repository.
*/
/* | ||
* Title: 2D Image Convolution on GPU by using Shared Memory and Constant Memory. | ||
* | ||
* Image Size: 2048 X 2048 | ||
* Mask Size: 64 X 64 | ||
* TILE_WIDTH 32 | ||
* | ||
* | ||
* */ | ||
#include<stdio.h> | ||
#include<cuda_runtime_api.h> | ||
#include<cuda.h> | ||
#include<stdlib.h> | ||
#include<assert.h> | ||
|
||
/* Error-checking wrapper for CUDA runtime calls.
 * Evaluates `stmt`; on failure prints the failing line number and the CUDA
 * error string to stderr, then returns -1 from the enclosing function
 * (so it may only be used inside functions returning int, e.g. main). */
#define wbCheck(stmt) do {                                                    \
        cudaError_t err = stmt;                                               \
        if (err != cudaSuccess) {                                             \
            fprintf(stderr, "Failed to run stmt at line %d\n", __LINE__);     \
            fprintf(stderr, "Got CUDA error ... %s\n",                        \
                    cudaGetErrorString(err));                                 \
            return -1;                                                        \
        }                                                                     \
    } while(0)
|
||
|
||
#define Mask_width 64
#define Mask_height 64
/* Parenthesised so the radii expand safely inside larger expressions
 * (they are used in arithmetic like `destY - Mask_radius_x`). */
#define Mask_radius_x (Mask_width / 2)
#define Mask_radius_y (Mask_height / 2)
#define TILE_WIDTH 32 //32 X 32 TILE (the original comment said 16 X 16)
/* Shared-memory tile edge: output tile plus the mask halo on both sides. */
#define w_x (TILE_WIDTH + Mask_width - 1) //Shared Memory Elements needed to load as per Mask Size
#define w_y (TILE_WIDTH + Mask_height - 1)
#define clamp(x) (min(max((x), 0.0), 1.0))
|
||
//@@ INSERT CODE HERE
/*
 * Tiled 2D convolution kernel.
 *
 * I      : input image, width x height with `channels` interleaved channels
 * M      : convolution mask (read-only; __restrict__ allows the read-only
 *          data cache to serve it)
 * P      : output image, same layout as I
 * Launch : blockDim = (TILE_WIDTH, TILE_WIDTH); the grid covers the image
 *          in TILE_WIDTH x TILE_WIDTH output tiles.
 *
 * Each block stages a w_y x w_x input region -- its output tile plus the
 * mask halo -- into shared memory, then every thread computes one output
 * pixel from the staged data. Channels are processed one at a time.
 */
__global__ void convolution(float *I, const float* __restrict__ M, float *P,int channels, int width, int height)
{
    __shared__ float N_ds[w_y][w_x];   // staged input tile incl. halo
    int k;
    for (k = 0; k < channels; k++)
    {
        //1. Phase to Load Data into Shared Memory. Each Thread loads multiple elements indexed by each Batch loading
        //1.dest: RMO ID 2. destY & destX: Row and Column of Shared Memory
        //3. srcY & srcX: Indexes to fetch data from input Image
        //4. src: RMO index of Input Image
        // NOTE(review): Mask_radius_x offsets srcY (a row) and Mask_radius_y
        // offsets srcX (a column); the names look swapped, but both radii are
        // equal while the mask is square, so the result is unaffected.

        // First batch loading: each of the TILE_WIDTH*TILE_WIDTH threads
        // loads one tile element; out-of-image positions become 0 (padding).
        int dest = threadIdx.y * TILE_WIDTH + threadIdx.x,
            destY = dest / w_x, destX = dest % w_x,
            srcY = blockIdx.y * TILE_WIDTH + destY - Mask_radius_x,
            srcX = blockIdx.x * TILE_WIDTH + destX - Mask_radius_y,
            src = (srcY * width + srcX) * channels + k;
        if (srcY >= 0 && srcY < height && srcX >= 0 && srcX < width)
            N_ds[destY][destX] = I[src];
        else
            N_ds[destY][destX] = 0.0;

        // Remaining batches: the same threads sweep forward in strides of
        // TILE_WIDTH*TILE_WIDTH flat indices until the whole w_y x w_x tile
        // is filled.
        for (int iter=1; iter <= (w_x * w_y) / (TILE_WIDTH*TILE_WIDTH); iter++)
        {
            // Second batch loading
            dest = threadIdx.y * TILE_WIDTH + threadIdx.x + iter*(TILE_WIDTH * TILE_WIDTH);
            destY = dest / w_x, destX = dest % w_x;
            srcY = blockIdx.y * TILE_WIDTH + destY - Mask_radius_x;
            srcX = blockIdx.x * TILE_WIDTH + destX - Mask_radius_y;
            src = (srcY * width + srcX) * channels + k;
            if (destY < w_y && destX < w_x)   // guard the partial final batch
            {
                if (srcY >= 0 && srcY < height && srcX >= 0 && srcX < width)
                    N_ds[destY][destX] = I[src];
                else
                    N_ds[destY][destX] = 0.0;
            }
        }
        __syncthreads();   // tile fully staged before any thread reads it

        // 2. Convolution phase: accumulate mask x staged-tile products.
        // NOTE(review): Mask_width bounds BOTH loop dimensions; correct only
        // while the mask is square (Mask_width == Mask_height).
        float accum = 0;
        int y, x;
        for (y = 0; y < Mask_width; y++)
            for (x = 0; x < Mask_width; x++)
                accum += N_ds[threadIdx.y + y][threadIdx.x + x] * M[y * Mask_width + x];
        // Global coordinates of this thread's output pixel.
        y = blockIdx.y * TILE_WIDTH + threadIdx.y;
        x = blockIdx.x * TILE_WIDTH + threadIdx.x;
        if (y < height && x < width)
            //P[(y * width + x) * channels + k] = clamp(accum);
            P[(y * width + x) * channels + k] = accum;
        __syncthreads();   // don't refill the tile while others still read it
    }
}
|
||
float convolution_2D_OnHost(float * N,float * M,int width, int height,int i,int j); | ||
|
||
/*
 * Driver: convolves a 2048 x 2048 single-channel image (all 1.0) with a
 * 64 x 64 all-ones mask on the GPU (tiled shared-memory kernel) and on
 * the host, then compares the two results element by element.
 * Returns 0 on success, -1 on any CUDA error, 1 on allocation failure.
 */
int main() {

    int maskRows = Mask_height;
    int maskColumns = Mask_width;

    int imageChannels = 1;
    int imageWidth = 2048;
    int imageHeight = 2048;

    float * hostInputImageData;
    float * hostOutputImageData;
    float * hostMaskData;
    float * deviceInputImageData;
    float * deviceOutputImageData;
    float * deviceMaskData;
    float * outputImageOnHost;

    hostMaskData = (float *) malloc(sizeof(float) * maskRows * maskColumns);
    hostInputImageData = (float *) malloc(sizeof(float) * imageWidth * imageHeight);
    hostOutputImageData = (float *) malloc(sizeof(float) * imageWidth * imageHeight);
    outputImageOnHost = (float *) malloc(sizeof(float) * imageWidth * imageHeight);
    /* The original never checked the host allocations. */
    if (hostMaskData == NULL || hostInputImageData == NULL ||
        hostOutputImageData == NULL || outputImageOnHost == NULL)
    {
        fprintf(stderr, "malloc failed\n");
        return 1;
    }

    for (int i = 0; i < maskRows * maskColumns; i++)   /* all-ones mask */
    {
        hostMaskData[i] = 1.0;
    }
    for (int i = 0; i < imageWidth * imageHeight; i++) /* all-ones image */
    {
        hostInputImageData[i] = 1.0;
    }

    wbCheck(cudaMalloc((void **) &deviceInputImageData, imageWidth * imageHeight * imageChannels * sizeof(float)));
    wbCheck(cudaMalloc((void **) &deviceOutputImageData, imageWidth * imageHeight * imageChannels * sizeof(float)));
    wbCheck(cudaMalloc((void **) &deviceMaskData, maskRows * maskColumns * sizeof(float)));

    wbCheck(cudaMemcpy(deviceInputImageData,
                       hostInputImageData,
                       imageWidth * imageHeight * imageChannels * sizeof(float),
                       cudaMemcpyHostToDevice));
    wbCheck(cudaMemcpy(deviceMaskData,
                       hostMaskData,
                       maskRows * maskColumns * sizeof(float),
                       cudaMemcpyHostToDevice));

    /* One TILE_WIDTH x TILE_WIDTH block per output tile (ceiling division
     * so a partial tile at the edges is still covered). */
    dim3 dimGrid(((imageWidth - 1) / TILE_WIDTH) + 1, ((imageHeight - 1) / TILE_WIDTH) + 1, 1);
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
    convolution<<<dimGrid, dimBlock>>>(deviceInputImageData, deviceMaskData, deviceOutputImageData,
                                       imageChannels, imageWidth, imageHeight);

    /* The original printed the CUDA error string unconditionally, even on
     * success. Check launch-configuration errors first, then synchronize to
     * surface asynchronous execution errors. */
    wbCheck(cudaGetLastError());
    wbCheck(cudaDeviceSynchronize());

    wbCheck(cudaMemcpy(hostOutputImageData,
                       deviceOutputImageData,
                       imageWidth * imageHeight * imageChannels * sizeof(float),
                       cudaMemcpyDeviceToHost));

    //printf("\n Output from Device:\n");
#if 0 //enable to dump the device result
    for (int i = 0; i < imageWidth * imageHeight; i++)
    {
        if (i > 0 && (i % imageWidth == 0))
            printf("\n");
        printf("%0.2f \t", *(hostOutputImageData + i));
    }
#endif

    //Convolution on Host (CPU reference result)
    for (int i = 0; i < imageWidth; i++)
    {
        for (int j = 0; j < imageHeight; j++)
        {
            outputImageOnHost[(i * imageWidth) + j] = convolution_2D_OnHost(hostInputImageData, hostMaskData, imageWidth, imageHeight, i, j);
        }
    }

    //printf("\n Output from Host:\n");
#if 0 //enable to dump the host result
    for (int i = 0; i < imageWidth * imageHeight; i++)
    {
        if (i > 0 && (i % imageWidth == 0))
            printf("\n");
        printf("%0.2f \t", *(outputImageOnHost + i));
    }
#endif

    /* Compare with a small tolerance: host and device may round float sums
     * differently, so the original exact `!=` comparison is fragile. */
    for (int i = 0; i < imageWidth * imageHeight; i++)
    {
        float diff = outputImageOnHost[i] - hostOutputImageData[i];
        if (diff > 1e-3f || diff < -1e-3f)
        {
            /* Column is i % imageWidth; the original printed i % imageHeight,
             * which was only harmless because the image is square. */
            printf("\nMismatch at (Row,Col) = [%d][%d], hostComputed[]: %0.0f And device[]: %0.0f", i / imageWidth, i % imageWidth, outputImageOnHost[i], hostOutputImageData[i]);
        }
    }

    cudaFree(deviceInputImageData);
    cudaFree(deviceOutputImageData);
    cudaFree(deviceMaskData);

    free(hostMaskData);
    free(hostInputImageData);
    free(hostOutputImageData);
    free(outputImageOnHost);   /* was leaked in the original */

    printf("\n Number of Threads Per Block created in code: %d", TILE_WIDTH * TILE_WIDTH);
    printf("\n Number of Blocks Created :%d", (((imageWidth - 1) / TILE_WIDTH) + 1) * (((imageWidth - 1) / TILE_WIDTH) + 1));
    printf("\nNo Error\n");
    return 0;
}
|
||
/*
 * CPU reference implementation: computes one output pixel of the 2D
 * convolution, for comparison against the GPU kernel.
 *
 * N             : input image, width x height, single channel, row-major
 * M             : mask, Mask_height rows x Mask_width columns, row-major
 * width, height : image dimensions
 * i, j          : row / column of the output pixel to compute
 *
 * Image pixels outside the boundary contribute zero (zero padding),
 * matching the GPU kernel. No clamping is applied (the GPU kernel's
 * clamp is likewise disabled).
 */
float convolution_2D_OnHost(float * N,float * M,int width, int height,int i,int j)
{
    float Pvalue = 0.0f;
    /* Top-left corner of the mask footprint in image coordinates.
     * Rows are spanned by Mask_height, columns by Mask_width (the
     * original used Mask_width for both, which only worked because the
     * mask is square). */
    int N_start_point_i = i - (Mask_height / 2);
    int N_start_point_j = j - (Mask_width / 2);

    for (int k = 0; k < Mask_height; k++)        /* mask row */
    {
        for (int l = 0; l < Mask_width; l++)     /* mask column */
        {
            int row = N_start_point_i + k;
            int col = N_start_point_j + l;
            /* Skip contributions outside the image (zero padding). */
            if (row >= 0 && row < height && col >= 0 && col < width)
            {
                Pvalue += N[row * width + col] * M[(k * Mask_width) + l];
            }
        }
    }

    // return(clamp(Pvalue));  /* clamping intentionally disabled */
    return Pvalue;
}