From 084f1fe810c6cc4d2381dad5247a7c38626a63fd Mon Sep 17 00:00:00 2001
From: Zenovore <agustinus.alexander17@gmail.com>
Date: Wed, 16 Mar 2022 06:59:40 +0700
Subject: [PATCH] add parallel convolution
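
Parallelise the 2D convolution on the GPU: thread blocks stride over the
target matrices, and the threads within each block stride over the elements
of that matrix's output, with each element computed by supression_op.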

Co-authored-by: riisuki <riisuki@users.noreply.github.com>
Co-authored-by: girvinjunod <girvinjunod@users.noreply.github.com>
---
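Notes (not part of the commit): a minimal way to build and try the program,
assuming nvcc is on the PATH; the input file name and the sample matrices
below are only illustrative.

    nvcc src/cuda.cu -o conv
    ./conv < input.txt

input.txt holds the kernel dimensions and the kernel, then the number of
target matrices, their dimensions, and the matrices themselves, e.g.:

    2 2
    1 0
    0 -1
    3 3 3
    1 1 0
    1 0 1
    11 -1 1
    2 12 2
    2 2 -2
    2 21 2
    30 -1 3
    3 30 -3
    -31 10 -9
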
 src/cuda.cu | 194 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 194 insertions(+)

diff --git a/src/cuda.cu b/src/cuda.cu
index e69de29..6341ed0 100644
--- a/src/cuda.cu
+++ b/src/cuda.cu
@@ -0,0 +1,194 @@
+#include <stdlib.h>
+#include <stdio.h>
+
+#define NMAX 100
+#define DATAMAX 1000
+#define DATAMIN -1000
+
+// Print a row-major matrix stored in a flat array of size row * col.
+__host__ __device__ void print_matrix(int *m, int row, int col)
+{
+  for (int i = 0; i < row; i++)
+  {
+    for (int j = 0; j < col; j++)
+    {
+      printf("%d ", m[i * col + j]);
+    }
+    printf("\n");
+  }
+}
+
+// Compute one output element: the dot product of the kernel with the
+// (iterationRow, iterationCol) window of the target matrix.
+// (rowTarget and iterator are currently unused, but kept to match the call sites.)
+__device__ int supression_op(int *kernel, int *target, int rowKernel, int colKernel, int rowTarget, int colTarget, int iterationRow, int iterationCol, int iterator)
+{
+  int intermediate_sum = 0;
+  for (int i = 0; i < rowKernel; i++)
+  {
+    for (int j = 0; j < colKernel; j++)
+    {
+      intermediate_sum += kernel[i * colKernel + j] * target[(iterationRow + i) * colTarget + iterationCol + j];
+    }
+  }
+
+  return intermediate_sum;
+}
+
+// Grid-stride style loops: blocks stride over the target matrices, and the
+// threads within a block stride over the output elements of each matrix.
+__global__ void convolution(int *output, int *kernel, int *target, int rowKernel, int colKernel, int rowTarget, int colTarget, int index)
+{
+  int rowC = rowTarget - rowKernel + 1;
+  int colC = colTarget - colKernel + 1;
+
+  for (int k = blockIdx.x; k < index; k += gridDim.x)
+  {
+    for (int j = threadIdx.x; j < rowC * colC; j += blockDim.x)
+    {
+      int row = j / colC;
+      int col = j % colC;
+      output[k * rowC * colC + row * colC + col] =
+          supression_op(kernel, &target[k * rowTarget * colTarget], rowKernel, colKernel, rowTarget, colTarget, row, col, k);
+    }
+  }
+}
+
+// Copy the kernel and the target matrices to the device, run the
+// convolution kernel, and copy the results back into `output`.
+void convolutionWithCUDA(int *output, int *kernel, int *target, int rowKernel, int colKernel, int rowTarget, int colTarget, int index)
+{
+  int *dev_a = nullptr; // kernel
+  int *dev_b = nullptr; // target matrices, stored back to back
+  int *dev_c = nullptr; // output matrices, stored back to back
+
+  int rowC = rowTarget - rowKernel + 1;
+  int colC = colTarget - colKernel + 1;
+
+  // Allocate GPU buffers for the kernel, the targets and the outputs.
+  cudaMalloc((void **)&dev_c, index * rowC * colC * sizeof(int));
+  cudaMalloc((void **)&dev_a, rowKernel * colKernel * sizeof(int));
+  cudaMalloc((void **)&dev_b, index * rowTarget * colTarget * sizeof(int));
+
+  cudaMemcpy(dev_a, kernel, rowKernel * colKernel * sizeof(int), cudaMemcpyHostToDevice);
+  cudaMemcpy(dev_b, target, index * rowTarget * colTarget * sizeof(int), cudaMemcpyHostToDevice);
+
+  // Fixed launch configuration: 2 blocks of 2 threads each.
+  convolution<<<2, 2>>>(dev_c, dev_a, dev_b, rowKernel, colKernel, rowTarget, colTarget, index);
+  cudaDeviceSynchronize();
+
+  // Copy the output matrices from the GPU buffer to host memory.
+  cudaMemcpy(output, dev_c, index * rowC * colC * sizeof(int), cudaMemcpyDeviceToHost);
+
+  cudaFree(dev_c);
+  cudaFree(dev_a);
+  cudaFree(dev_b);
+}
+
+int main()
+{
+  int num_targets, rowA, colA, rowB, colB;
+
+  // Read the kernel dimensions and the kernel matrix.
+  scanf("%d %d", &rowA, &colA);
+  int *a = (int *)malloc(rowA * colA * sizeof(int));
+  for (int i = 0; i < rowA; i++)
+  {
+    for (int j = 0; j < colA; j++)
+    {
+      scanf("%d", &a[i * colA + j]);
+    }
+  }
+
+  // Read the number of target matrices, their dimensions and the matrices.
+  scanf("%d %d %d", &num_targets, &rowB, &colB);
+  int *b = (int *)malloc(num_targets * rowB * colB * sizeof(int));
+  for (int k = 0; k < num_targets; k++)
+  {
+    for (int i = 0; i < rowB; i++)
+    {
+      for (int j = 0; j < colB; j++)
+      {
+        scanf("%d", &b[k * rowB * colB + i * colB + j]);
+      }
+    }
+  }
+
+  // Each output matrix is (rowB - rowA + 1) x (colB - colA + 1).
+  int rowC = rowB - rowA + 1;
+  int colC = colB - colA + 1;
+  int *c = (int *)calloc(num_targets * rowC * colC, sizeof(int));
+
+  convolutionWithCUDA(c, a, b, rowA, colA, rowB, colB, num_targets);
+
+  for (int k = 0; k < num_targets; k++)
+  {
+    print_matrix(&c[k * rowC * colC], rowC, colC);
+  }
+
+  free(a);
+  free(b);
+  free(c);
+  cudaDeviceReset();
+  return 0;
+}
\ No newline at end of file
-- 
GitLab