// src/cuda.cu
// Parallel 2D "valid" convolution (cross-correlation) of one kernel matrix
// against a batch of target matrices, computed on the GPU.
//
// Input (stdin):
//   rowKernel colKernel
//   rowKernel*colKernel kernel entries
//   num_targets rowTarget colTarget
//   num_targets blocks of rowTarget*colTarget entries
// Output (stdout): the num_targets result matrices, printed row by row.

#include <stdio.h>
#include <stdlib.h>

// Abort with a diagnostic if a CUDA runtime call fails. Kernel launches do not
// return an error directly, so the launch site also checks cudaGetLastError().
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Print a row-major row x col matrix, one row per line.
__host__ __device__ void print_matrix(int *m, int row, int col)
{
    for (int i = 0; i < row; i++)
    {
        for (int j = 0; j < col; j++)
        {
            printf("%d ", m[i * col + j]);
        }
        printf("\n");
    }
}

// Sum of the element-wise product of `kernel` with the window of `target`
// whose top-left corner is (iterationRow, iterationCol).
// Precondition: the window lies fully inside the target, i.e.
// iterationRow + rowKernel <= rowTarget and iterationCol + colKernel <= colTarget.
__device__ int supression_op(const int *kernel, const int *target,
                             int rowKernel, int colKernel,
                             int rowTarget, int colTarget,
                             int iterationRow, int iterationCol)
{
    (void)rowTarget; // kept for signature symmetry; only colTarget is needed to index
    int sum = 0;
    for (int i = 0; i < rowKernel; i++)
    {
        for (int j = 0; j < colKernel; j++)
        {
            sum += kernel[i * colKernel + j] *
                   target[(iterationRow + i) * colTarget + (iterationCol + j)];
        }
    }
    return sum;
}

// Convolve `kernel` against each of `index` target matrices.
// Work distribution: blocks stride over matrices (step gridDim.x) and threads
// stride over output elements (step blockDim.x), so any <<<blocks, threads>>>
// configuration — including <<<1, 1>>> for debugging — computes the full result.
// `output` must hold index * (rowTarget-rowKernel+1) * (colTarget-colKernel+1) ints.
__global__ void convolution(int *output, const int *kernel, const int *target,
                            int rowKernel, int colKernel,
                            int rowTarget, int colTarget, int index)
{
    int rowOut = rowTarget - rowKernel + 1;
    int colOut = colTarget - colKernel + 1;
    int elemsPerMatrix = rowOut * colOut;

    for (int k = blockIdx.x; k < index; k += gridDim.x)
    {
        const int *t = &target[k * rowTarget * colTarget];
        int *out = &output[k * elemsPerMatrix];
        for (int j = threadIdx.x; j < elemsPerMatrix; j += blockDim.x)
        {
            // Integer division/modulo recover the 2D coordinate directly;
            // both operands are non-negative ints, so no floorf is needed.
            int row = j / colOut;
            int col = j % colOut;
            out[j] = supression_op(kernel, t, rowKernel, colKernel,
                                   rowTarget, colTarget, row, col);
        }
    }
}

// Host wrapper: copy the kernel and the `index` target matrices to the device,
// launch `convolution`, and copy the index * rowC * colC results into `output`.
void convolutionWithCUDA(int *output, int *kernel, int *target,
                         int rowKernel, int colKernel,
                         int rowTarget, int colTarget, int index)
{
    int rowC = rowTarget - rowKernel + 1;
    int colC = colTarget - colKernel + 1;

    int *d_kernel = nullptr;
    int *d_target = nullptr;
    int *d_output = nullptr;

    size_t kernelBytes = (size_t)rowKernel * colKernel * sizeof(int);
    size_t targetBytes = (size_t)index * rowTarget * colTarget * sizeof(int);
    size_t outputBytes = (size_t)index * rowC * colC * sizeof(int);

    CUDA_CHECK(cudaMalloc((void **)&d_kernel, kernelBytes));
    CUDA_CHECK(cudaMalloc((void **)&d_target, targetBytes));
    CUDA_CHECK(cudaMalloc((void **)&d_output, outputBytes));

    CUDA_CHECK(cudaMemcpy(d_kernel, kernel, kernelBytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_target, target, targetBytes, cudaMemcpyHostToDevice));

    // One block per matrix (capped) with a warp-multiple thread count; the
    // kernel's stride loops keep correctness independent of this choice.
    int threads = 256;
    int blocks = (index < 64) ? index : 64;
    if (blocks < 1) blocks = 1;
    convolution<<<blocks, threads>>>(d_output, d_kernel, d_target,
                                     rowKernel, colKernel,
                                     rowTarget, colTarget, index);
    CUDA_CHECK(cudaGetLastError());      // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize()); // execution errors

    // Blocking copy: results are valid on the host when this returns.
    CUDA_CHECK(cudaMemcpy(output, d_output, outputBytes, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(d_output));
    CUDA_CHECK(cudaFree(d_target));
    CUDA_CHECK(cudaFree(d_kernel));
}

int main()
{
    // Kernel matrix dimensions and entries.
    int rowA, colA;
    if (scanf("%d %d", &rowA, &colA) != 2 || rowA <= 0 || colA <= 0)
    {
        fprintf(stderr, "invalid kernel dimensions\n");
        return EXIT_FAILURE;
    }
    int *a = (int *)malloc((size_t)rowA * colA * sizeof(int));
    if (a == NULL)
    {
        fprintf(stderr, "out of memory\n");
        return EXIT_FAILURE;
    }
    for (int i = 0; i < rowA * colA; i++)
    {
        if (scanf("%d", &a[i]) != 1)
        {
            fprintf(stderr, "invalid kernel entry\n");
            return EXIT_FAILURE;
        }
    }

    // Batch size and target matrix dimensions; targets must be at least as
    // large as the kernel so the output dimensions stay positive.
    int num_targets, rowB, colB;
    if (scanf("%d %d %d", &num_targets, &rowB, &colB) != 3 ||
        num_targets <= 0 || rowB < rowA || colB < colA)
    {
        fprintf(stderr, "invalid target dimensions\n");
        return EXIT_FAILURE;
    }
    int *b = (int *)malloc((size_t)num_targets * rowB * colB * sizeof(int));
    if (b == NULL)
    {
        fprintf(stderr, "out of memory\n");
        return EXIT_FAILURE;
    }
    for (int i = 0; i < num_targets * rowB * colB; i++)
    {
        if (scanf("%d", &b[i]) != 1)
        {
            fprintf(stderr, "invalid target entry\n");
            return EXIT_FAILURE;
        }
    }

    int rowC = rowB - rowA + 1;
    int colC = colB - colA + 1;
    int *c = (int *)calloc((size_t)num_targets * rowC * colC, sizeof(int));
    if (c == NULL)
    {
        fprintf(stderr, "out of memory\n");
        return EXIT_FAILURE;
    }

    convolutionWithCUDA(c, a, b, rowA, colA, rowB, colB, num_targets);

    // Print every result matrix (the original hard-coded exactly three).
    for (int k = 0; k < num_targets; k++)
    {
        print_matrix(&c[k * rowC * colC], rowC, colC);
    }

    free(c);
    free(b);
    free(a);
    CUDA_CHECK(cudaDeviceReset());
    return 0;
}