diff --git a/src/cuda.cu b/src/cuda.cu index 85eeff05f99318b6d2f327c5714e70c1d5c5859d..a12e7c82fee29b4c41afa8513fb6d78a9e923e3c 100644 --- a/src/cuda.cu +++ b/src/cuda.cu @@ -19,7 +19,7 @@ __host__ __device__ void print_matrix(int *m,int row, int col) } -__device__ int supression_op(int *kernel, int *target, int rowKernel,int colKernel,int rowTarget,int colTarget, int iterationRow, int iterationCol,int iterator) +__device__ int supression_op(int *kernel, int *target, int rowKernel,int colKernel,int rowTarget,int colTarget, int iterationRow, int iterationCol) { int intermediate_sum = 0; for (int i = 0; i < rowKernel; i++) @@ -33,12 +33,10 @@ __device__ int supression_op(int *kernel, int *target, int rowKernel,int colKern return intermediate_sum; } -__global__ void convolution(int *output,int *kernel, int *target,int rowKernel,int colKernel,int rowTarget,int colTarget,int index) +__global__ void convolution(int *output,int *kernel, int *target, int rowKernel, int colKernel, int rowTarget, int colTarget, int index) { int rowC = rowTarget-rowKernel + 1; int colC = colTarget-colKernel + 1; - int row; - int col; // BLOCK int block = blockIdx.x; @@ -47,52 +45,89 @@ __global__ void convolution(int *output,int *kernel, int *target,int rowKernel,i // int k = block; // printf("BLOCK %d \n",blockIdx.x); // THREAD + int tx= threadIdx.x; - // printf("BLOCK DMENSION X %d \n",blockDim.x); - // printf("BLOCK DIMENSION Y %d \n",blockDim.y); - int T= tx + blockIdx.x * blockDim.x; // ALL THREAD INDEX - // printf("GRID DIMENSION : %d \n",); - // print_matrix(&target[0],3,3); - // print_matrix(&target[1],3,3); - // print_matrix(&target[2],3,3); - // printf("THREAD : %d \n",tx); - for (int k = block; k < index ;k+= numberOfBlock){ - for (int i = tx; i < blockDim.x; i+=blockDim.x) - { - // printf("MASUK THREAD : %d \n",i); - // printf("THIS IS i : %d \n",i); - for (int j = i; j < rowC*colC ; j+= blockDim.x) - { - // printf("MASUK THREAD J : %d \n",j); - // printf("THIS IS j : %d \n",j); - 
row = floorf(j/colC); - col = j%colC; - if (i == 0 && k == 0 && (row*colC + col) < 100){ - printf("THREAD %d FOR ROW %d AND COL %d \n",i, row , col); - printf("J IS %d \n",j); - printf("FILLING FOR MATRIX %d with index %d \n",k, row*colC + col); - printf("VALUE OF SUPRESION : %d \n",supression_op(kernel, (&target[k*rowTarget*colTarget]), rowKernel, colKernel, rowTarget, colTarget, row, col,k)); - - } - // printf("ITEM IS THIS %d \n",(&target[k])[j]); - // print_matrix(&target[k*rowTarget*colTarget],3,3); - // print_matrix(&target[9],3,3); - (&output[k*rowC*colC])[row*colC + col] = supression_op(kernel, (&target[k*rowTarget*colTarget]), rowKernel, colKernel, rowTarget, colTarget, row, col,k); + int ty= threadIdx.y; + + // int T = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x; // ALL THREAD INDEX + + for (int k = block; k < index; k+= numberOfBlock){ + for (int i = tx; i < rowC; i+= blockDim.x){ + for (int j = ty; j < colC; j += blockDim.y){ + // printf("BLOCK %d ITERATION k : %d THREAD %d FOR i : %d FOR j : %d \n",block,T,k,i,j); + (&output[k*rowC*colC])[i*colC + j] = supression_op(kernel, (&target[k*rowTarget*colTarget]), rowKernel, colKernel, rowTarget, colTarget, i, j); } } - // print_matrix(&output[4],3,3); - // printf("ITEM IS THIS %d \n",(&output[0])[0]); - // printf("ITEM IS THIS %d \n",(&output[1])[0]); - // printf("ITEM IS THIS %d \n",(&output[1])[1]); - // printf("ITEM IS THIS %d \n",(&output[0])[3]); } } -void convolutionWithCUDA(int* output, int* kernel, int* target, int rowKernel, int colKernel, int rowTarget, int colTarget,int index){ +__device__ int getDataRange(int *source, int rowA, int colA){ + int max = DATAMIN; + int min = DATAMAX; + for (int i = 0; i < rowA; i++) + { + for (int j = 0; j < colA; j++) + { + int el = source[i * colA + j]; + if (el > max) + max = el; + if (el < min) + min = el; + } + } + + return max - min; +} +__global__ void getDataRangeArray(int *output, int *source, int index,int rowSource, int 
colSource){ + int tx = threadIdx.x; + int threadXinBlock = blockDim.x; + // printf("output : %d, i : %d \n",getDataRange(&source[-1*rowSource*colSource],rowSource,colSource),0); + // printf("output : %d, i : %d \n",getDataRange(&source[0*rowSource*colSource],rowSource,colSource),0); + // printf("output : %d, i : %d \n",getDataRange(&source[1*rowSource*colSource],rowSource,colSource),1); + // printf("output : %d, i : %d \n",getDataRange(&source[2*rowSource*colSource],rowSource,colSource),2); + for( int i = tx ; i < index ; i+=threadXinBlock ){ + output[i] = getDataRange(&source[i*rowSource*colSource], rowSource, colSource); + // printf("output : %d, i : %d \n",output[i],i); + } + +} +__device__ void mergeArray(int *output, int *temporary_array, int left,int mid, int right){ + int iterleft = left, iterright = mid; + for(int i = left; i < right; i++){ + if(iterleft < mid && (iterright >= right || output[iterleft] < output[iterright])){ + temporary_array[i] = output[iterleft++]; + } + else{ + temporary_array[i] = output[iterright++]; + } + } +} + +__global__ void mergeSort(int* output, int *temporary_array, int index, int width, int arrayPerThread){ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int left = width*x*arrayPerThread; + int mid,right; + for(int i = 0; i < arrayPerThread; i++){ + if(left >= index) break; + mid = fminf(left+(width/2), index); // NOTE(review): is fminf the right min here? It returns float and is implicitly narrowed back to int — confirm; kept for now because TC 4 segfaulted before merge sort was added, solving that first + right = fminf(left+width, index); + mergeArray(output, temporary_array, left, mid, right); + left += width; + } +} + + +void convolutionWithCUDA(int* output, int* kernel, int* target, int rowKernel, int colKernel, int rowTarget, int colTarget,int index, int block_num, int thread_num){ // B = Kernel int* dev_a = nullptr; int* dev_b = nullptr; int* dev_c = nullptr; + int* output_datarange = nullptr; + int* temporary_array = nullptr; + int* temp_swap; + // int* datarange_sor = nullptr; + // dim3* deviceBlock_num = nullptr; + // dim3* deviceThread_num = nullptr; // Allocate GPU buffers for three vectors (two input, one output) // print_matrix(&target[0],3,3);
cudaDeviceSynchronize(); + + getDataRangeArray<<<1,thread_num>>>(output_datarange,dev_c,index,rowC,colC); + cudaDeviceSynchronize(); + + + long nThreads = thread_num; + for(int width = 2; width < index*2; width*=2){ + int arrayPerThread = index/(nThreads*width) + 1; + mergeSort<<<1, dim3(thread_num, 1, 1)>>>(output_datarange, temporary_array, index, width, arrayPerThread); + temp_swap = output_datarange; + output_datarange = temporary_array; + temporary_array = temp_swap; + } + cudaDeviceSynchronize(); + // printf("ITEM : %d \n",output_datarange[0]); + // printf("ITEM : %d \n",dev_c[0]); + // Copy output vector from GPU buffer to host memory. - cudaMemcpy(output, dev_c, index * rowC * colC * sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(output, output_datarange, index * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(output_datarange); + cudaFree(temporary_array); cudaFree(dev_c); cudaFree(dev_a); cudaFree(dev_b); } -int main(){ +int get_median(int *n, int length) +{ + clock_t begin = clock(); + int mid = length / 2; + if (length & 1) + return n[mid]; + + return (n[mid - 1] + n[mid]) / 2; +} + +long get_floored_mean(int *n, int length) +{ + long sum = 0; + for (int i = 0; i < length; i++) + { + sum += n[i]; + } + + return sum / length; +} + +void print_array(int *n, int size) +{ + for (int i = 0; i < size; i++) + printf("%d ", n[i]); + printf("\n"); +} + +int main(int argc, char **argv){ int num_targets = 3; - int rowA = 2; - int colA = 2; - int rowB = 3; - int colB = 3; + int rowA,colA; + int rowB,colB; + + char *ptr; + + int block_num = strtol(argv[1], &ptr, 10); + if(block_num <= 0) block_num = 1; + int thread_num = strtol(argv[2], &ptr, 10); + if(thread_num <= 0) thread_num = 16; scanf("%d %d", &rowA, &colA); // INISIASI ARRAY/MATRIX @@ -139,28 +237,37 @@ int main(){ scanf("%d %d %d", &num_targets, &rowB, &colB); // INISIASI ARRAY/MATRIX - int b[num_targets][rowB*colB] = {0}; - // INPUT MATRIX KERNEL + + printf("%d %d %d\n", num_targets, rowB, colB); + 
int *b = (int *)malloc(num_targets*rowB*colB * sizeof(int)); + // INPUT MATRIX TARGET for (int k = 0; k < num_targets ; k++){ for (int i = 0; i < rowB; i++) { for (int j = 0; j < colB; j++) { - scanf("%d", &b[k][i * colB + j]); + scanf("%d", &(b[(k*rowB*colB) + i * colB + j])); } } } - - int rowC = rowB-rowA + 1; - int colC = colB-colA + 1; - int c[num_targets][rowC*colC] = { 0 }; + // int rowC = rowB-rowA + 1; + // int colC = colB-colA + 1; + // int c[num_targets][rowC*colC] = { 0 }; + int c[num_targets]; - convolutionWithCUDA(*c, a, *b, rowA,colA,rowB,colB,num_targets); + convolutionWithCUDA(c, a, b, rowA,colA,rowB,colB,num_targets,block_num,thread_num); + free(b); // convolution(c,a,b[0],rowA,colA,rowB,colB); - print_matrix(c[0],rowC,colC); - print_matrix(c[1],rowC,colC); - print_matrix(c[2],rowC,colC); + // print_matrix(c[0],rowC,colC); + // print_matrix(c[1],rowC,colC); + // print_matrix(c[2],rowC,colC); + print_array(c,num_targets); + printf("\n"); + printf("MIN : %d \n",c[0]); + printf("MAX : %d \n",c[num_targets-1]); + printf("MEDIAN : %d \n",get_median(c,num_targets)); + printf("AVERAGE : %ld \n",get_floored_mean(c,num_targets)); cudaDeviceReset(); } \ No newline at end of file