diff --git a/src/cuda.cu b/src/cuda.cu
index 85eeff05f99318b6d2f327c5714e70c1d5c5859d..a12e7c82fee29b4c41afa8513fb6d78a9e923e3c 100644
--- a/src/cuda.cu
+++ b/src/cuda.cu
@@ -19,7 +19,7 @@ __host__ __device__ void print_matrix(int *m,int row, int col)
 }
 
 
-__device__ int supression_op(int *kernel, int *target, int rowKernel,int colKernel,int rowTarget,int colTarget, int iterationRow, int iterationCol,int iterator)
+__device__ int supression_op(int *kernel, int *target, int rowKernel,int colKernel,int rowTarget,int colTarget, int iterationRow, int iterationCol)
 {
   int intermediate_sum = 0;
   for (int i = 0; i < rowKernel; i++)
@@ -33,12 +33,10 @@ __device__ int supression_op(int *kernel, int *target, int rowKernel,int colKern
   return intermediate_sum;
 }
 
-__global__ void convolution(int *output,int *kernel, int *target,int rowKernel,int colKernel,int rowTarget,int colTarget,int index)
+__global__ void convolution(int *output,int *kernel, int *target, int rowKernel, int colKernel, int rowTarget, int colTarget, int index)
 {
   int rowC = rowTarget-rowKernel + 1;
   int colC = colTarget-colKernel + 1;
-  int row;
-  int col;
 
    // BLOCK
   int block = blockIdx.x;
@@ -47,52 +45,89 @@ __global__ void convolution(int *output,int *kernel, int *target,int rowKernel,i
   // int k = block;
   // printf("BLOCK %d \n",blockIdx.x);
   // THREAD
+
   int tx= threadIdx.x;
-  // printf("BLOCK DMENSION X %d \n",blockDim.x);
-  // printf("BLOCK DIMENSION Y %d \n",blockDim.y);
-  int T= tx + blockIdx.x * blockDim.x; // ALL THREAD INDEX
-  // printf("GRID DIMENSION : %d \n",);
-  // print_matrix(&target[0],3,3);
-  // print_matrix(&target[1],3,3);
-  // print_matrix(&target[2],3,3);
-  // printf("THREAD : %d \n",tx);
-  for (int k = block; k < index ;k+= numberOfBlock){
-    for (int i = tx; i < blockDim.x; i+=blockDim.x)
-    {
-      // printf("MASUK THREAD : %d \n",i);
-      // printf("THIS IS i : %d \n",i);
-      for (int j = i; j < rowC*colC ; j+= blockDim.x)
-      {
-        // printf("MASUK THREAD J : %d \n",j);
-        // printf("THIS IS j : %d \n",j);
-        row = floorf(j/colC);
-        col = j%colC;
-        if (i == 0 && k == 0 && (row*colC + col) < 100){
-          printf("THREAD %d FOR ROW %d AND COL %d \n",i, row , col);
-          printf("J IS %d \n",j);
-          printf("FILLING FOR MATRIX %d with index %d \n",k, row*colC + col);
-          printf("VALUE OF SUPRESION : %d \n",supression_op(kernel, (&target[k*rowTarget*colTarget]), rowKernel, colKernel, rowTarget, colTarget, row, col,k));
-        
-        }
-        // printf("ITEM IS THIS %d \n",(&target[k])[j]);
-        // print_matrix(&target[k*rowTarget*colTarget],3,3);
-        // print_matrix(&target[9],3,3);
-        (&output[k*rowC*colC])[row*colC + col] = supression_op(kernel, (&target[k*rowTarget*colTarget]), rowKernel, colKernel, rowTarget, colTarget, row, col,k);
+  int ty= threadIdx.y;
+
+  // int T = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x; // ALL THREAD INDEX
+  
+  for (int k = block; k < index; k+= numberOfBlock){
+    for (int i = tx; i < rowC; i+= blockDim.x){
+      for (int j = ty; j < colC; j += blockDim.y){
+        // printf("BLOCK %d ITERATION k : %d THREAD %d FOR i : %d FOR j : %d \n",block,T,k,i,j);
+        (&output[k*rowC*colC])[i*colC + j] = supression_op(kernel, (&target[k*rowTarget*colTarget]), rowKernel, colKernel, rowTarget, colTarget, i, j);
       }
     }
-    // print_matrix(&output[4],3,3);
-      // printf("ITEM IS THIS %d \n",(&output[0])[0]);
-      // printf("ITEM IS THIS %d \n",(&output[1])[0]);
-      // printf("ITEM IS THIS %d \n",(&output[1])[1]);
-      // printf("ITEM IS THIS %d \n",(&output[0])[3]);
   }
 }
 
-void convolutionWithCUDA(int* output, int* kernel, int* target, int rowKernel, int colKernel, int rowTarget, int colTarget,int index){
+__device__ int getDataRange(int *source, int rowA, int colA){ // Returns max - min over a rowA x colA row-major matrix.
+  int max = DATAMIN; // DATAMIN/DATAMAX are defined outside this hunk -- presumably INT-range sentinels; TODO confirm
+  int min = DATAMAX;
+  for (int i = 0; i < rowA; i++)
+  {
+    for (int j = 0; j < colA; j++)
+    {
+      int el = source[i * colA + j]; // row-major element (i, j)
+      if (el > max)
+        max = el;
+      if (el < min)
+        min = el;
+    }
+  }
+
+  return max - min; // data range of the whole matrix
+}
+__global__ void getDataRangeArray(int *output, int *source, int index,int rowSource, int colSource){ // Kernel: output[i] = data range of the i-th rowSource x colSource matrix in source. Launched <<<1,thread_num>>> by the host, so blockIdx is ignored and only one block's threads stride over the work.
+  int tx = threadIdx.x;
+  int threadXinBlock = blockDim.x; // stride = threads per block
+  // printf("output : %d, i : %d \n",getDataRange(&source[-1*rowSource*colSource],rowSource,colSource),0);
+  // printf("output : %d, i : %d \n",getDataRange(&source[0*rowSource*colSource],rowSource,colSource),0);
+  // printf("output : %d, i : %d \n",getDataRange(&source[1*rowSource*colSource],rowSource,colSource),1);
+  // printf("output : %d, i : %d \n",getDataRange(&source[2*rowSource*colSource],rowSource,colSource),2);
+  for( int i = tx ; i < index ; i+=threadXinBlock ){ // block-stride loop over the index matrices
+    output[i] = getDataRange(&source[i*rowSource*colSource], rowSource, colSource);
+    // printf("output : %d, i : %d \n",output[i],i);
+  }
+  
+}
+__device__ void mergeArray(int *output, int *temporary_array, int left,int mid, int right){ // Merge sorted runs output[left,mid) and output[mid,right) into temporary_array[left,right); stable, ascending.
+  int iterleft = left, iterright = mid; // read cursors into the two runs
+  for(int i = left; i < right; i++){
+    if(iterleft < mid && (iterright >= right || output[iterleft] < output[iterright])){ // take from the left run while it has the smaller head (or the right run is exhausted)
+      temporary_array[i] = output[iterleft++];
+    }
+    else{
+      temporary_array[i] = output[iterright++];
+    }
+  }
+}
+
+__global__ void mergeSort(int* output, int *temporary_array, int index, int width, int arrayPerThread){ // One bottom-up merge pass: each thread merges arrayPerThread adjacent width-sized runs from output into temporary_array; the host swaps the two buffers between passes.
+  int x = blockIdx.x * blockDim.x + threadIdx.x; // global thread id
+  int left = width*x*arrayPerThread; // start of the first run this thread owns
+  int mid,right;
+  for(int i = 0; i < arrayPerThread; i++){
+    if(left >= index) break; // past the end of the array -- nothing left for this thread
+    mid = fminf(left+(width/2), index); // TODO(review): fminf is the FLOAT min (ints round-trip through float) -- translated from original note: "is this min actually defined? don't know, but TC 4 segfaults even without merge sort, so solving that first"
+    right = fminf(left+width, index); // clamp the run end to the array length
+    mergeArray(output, temporary_array, left, mid, right);
+    left += width; // advance to the next adjacent run
+  }
+}
+
+
+void convolutionWithCUDA(int* output, int* kernel, int* target, int rowKernel, int colKernel, int rowTarget, int colTarget,int index, int block_num, int thread_num){
   // B = Kernel
   int* dev_a = nullptr;
   int* dev_b = nullptr;
   int* dev_c = nullptr;
+  int* output_datarange = nullptr;
+  int* temporary_array = nullptr;
+  int* temp_swap;
+  // int* datarange_sor = nullptr;
+  // dim3* deviceBlock_num = nullptr;
+  // dim3* deviceThread_num = nullptr;
 
   // Allocate GPU buffers for three vectors (two input, one output)
   // print_matrix(&target[0],3,3);
@@ -101,29 +136,92 @@ void convolutionWithCUDA(int* output, int* kernel, int* target, int rowKernel, i
   // printf("\n %d \n", *(target[0][1]));
   int rowC = rowTarget-rowKernel + 1;
   int colC = colTarget-colKernel + 1;
+
+
+  // cudaMalloc((void**)&deviceBlock_num, sizeof(dim3));
+  // cudaMalloc((void**)&deviceThread_num, sizeof(dim3));
+  
+  cudaMalloc((void**)&output_datarange, index * sizeof(int));
+  cudaMalloc((void**)&temporary_array, index * sizeof(int));
   cudaMalloc((void**)&dev_c,index * rowC * colC * sizeof(int));
   cudaMalloc((void**)&dev_a, rowKernel * colKernel * sizeof(int));
   cudaMalloc((void**)&dev_b,index * rowTarget * colTarget * sizeof(int));
 
+
+  // cudaMemcpy(deviceBlock_num, block_num, sizeof(dim3), cudaMemcpyHostToDevice);
+  // cudaMemcpy(deviceThread_num, thread_num, sizeof(dim3), cudaMemcpyHostToDevice);
+  
   cudaMemcpy(dev_a, kernel, rowKernel * colKernel * sizeof(int), cudaMemcpyHostToDevice);
   cudaMemcpy(dev_b, target,index * rowTarget * colTarget * sizeof(int), cudaMemcpyHostToDevice);
-  convolution<<<2, 2>>>(dev_c, dev_a, dev_b, rowKernel,colKernel,rowTarget,colTarget,index);
+  convolution<<<dim3(block_num,1,1),dim3(thread_num,thread_num,1)>>>(dev_c, dev_a, dev_b, rowKernel,colKernel,rowTarget,colTarget,index);
   cudaDeviceSynchronize();
 
+  
+  getDataRangeArray<<<1,thread_num>>>(output_datarange,dev_c,index,rowC,colC);
+  cudaDeviceSynchronize();
+
+
+  long nThreads = thread_num;
+  for(int width = 2; width < index*2; width*=2){
+    int arrayPerThread = index/(nThreads*width) + 1;
+    mergeSort<<<1, dim3(thread_num, 1, 1)>>>(output_datarange, temporary_array, index, width, arrayPerThread);
+    temp_swap = output_datarange;
+    output_datarange = temporary_array;
+    temporary_array = temp_swap;
+  }
+  cudaDeviceSynchronize();
+  // printf("ITEM : %d \n",output_datarange[0]);
+  // printf("ITEM : %d \n",dev_c[0]);
+
   // Copy output vector from GPU buffer to host memory.
-  cudaMemcpy(output, dev_c, index * rowC * colC * sizeof(int), cudaMemcpyDeviceToHost);
+  cudaMemcpy(output, output_datarange, index * sizeof(int), cudaMemcpyDeviceToHost);
 
+  cudaFree(output_datarange);
+  cudaFree(temporary_array);
   cudaFree(dev_c);
   cudaFree(dev_a);
   cudaFree(dev_b);  
 }
 
-int main(){
+int get_median(int *n, int length) // Median of n[0..length); expects n sorted ascending (the caller sorts on the GPU first) -- TODO confirm. Even length returns the truncated mean of the two middle values.
+{
+  clock_t begin = clock(); // NOTE(review): unused leftover timing code -- 'begin' is never read
+  int mid = length / 2;
+  if (length & 1) // odd length: single middle element
+    return n[mid];
+
+  return (n[mid - 1] + n[mid]) / 2; // even length: integer average of the two middle values
+}
+
+long get_floored_mean(int *n, int length) // Mean of n[0..length) via integer division; long accumulator avoids int overflow of the sum.
+{
+  long sum = 0;
+  for (int i = 0; i < length; i++)
+  {
+    sum += n[i];
+  }
+
+  return sum / length; // NOTE(review): C division truncates toward zero ("floored" only for non-negative sums); length == 0 divides by zero
+}
+
+void print_array(int *n, int size) // Prints size ints separated by single spaces, then a newline.
+{
+  for (int i = 0; i < size; i++)
+    printf("%d ", n[i]);
+  printf("\n");
+}
+
+int main(int argc, char **argv){
   int num_targets = 3;
-  int rowA = 2;
-  int colA = 2;
-  int rowB = 3;
-  int colB = 3;
+  int rowA,colA;
+  int rowB,colB;
+
+  char *ptr;
+
+  int block_num = strtol(argv[1], &ptr, 10);
+  if(block_num <= 0) block_num = 1;
+  int thread_num = strtol(argv[2], &ptr, 10);
+  if(thread_num <= 0) thread_num = 16;
 
   scanf("%d %d", &rowA, &colA);
   // INISIASI ARRAY/MATRIX
@@ -139,28 +237,37 @@ int main(){
 
   scanf("%d %d %d", &num_targets, &rowB, &colB);
   // INISIASI ARRAY/MATRIX
-  int b[num_targets][rowB*colB] = {0};
-  // INPUT MATRIX KERNEL
+
+  printf("%d %d %d\n", num_targets, rowB, colB);
+  int *b = (int *)malloc(num_targets*rowB*colB * sizeof(int));
+  // INPUT MATRIX TARGET
   for (int k = 0; k < num_targets ; k++){
     for (int i = 0; i < rowB; i++)
     {
       for (int j = 0; j < colB; j++)
       {
-        scanf("%d", &b[k][i * colB + j]);
+        scanf("%d", &(b[(k*rowB*colB) + i * colB + j]));
       }
     }
   }
   
-  
-  int rowC = rowB-rowA + 1;
-  int colC = colB-colA + 1;
-  int c[num_targets][rowC*colC] = { 0 };
+  // int rowC = rowB-rowA + 1;
+  // int colC = colB-colA + 1;
+  // int c[num_targets][rowC*colC] = { 0 };
+  int c[num_targets];
 
-  convolutionWithCUDA(*c, a, *b, rowA,colA,rowB,colB,num_targets);
+  convolutionWithCUDA(c, a, b, rowA,colA,rowB,colB,num_targets,block_num,thread_num);
+  free(b);
   // convolution(c,a,b[0],rowA,colA,rowB,colB);
-  print_matrix(c[0],rowC,colC);
-  print_matrix(c[1],rowC,colC);
-  print_matrix(c[2],rowC,colC);
+  // print_matrix(c[0],rowC,colC);
+  // print_matrix(c[1],rowC,colC);
+  // print_matrix(c[2],rowC,colC);
+  print_array(c,num_targets);
+  printf("\n");
+  printf("MIN : %d \n",c[0]);
+  printf("MAX : %d \n",c[num_targets-1]);
+  printf("MEDIAN : %d \n",get_median(c,num_targets));
+  printf("AVERAGE : %ld \n",get_floored_mean(c,num_targets));
 
   cudaDeviceReset();
 }
\ No newline at end of file