Skip to content
Snippets Groups Projects
Commit 5603b3a4 authored by Zenovore's avatar Zenovore
Browse files

add parallel getDataRange and mergeSort and fix bug


Co-authored-by: default avatarGirvin Junod <girvinjunod@users.noreply.github.com>
Co-authored-by: default avatarMelita <riisuki@users.noreply.github.com>
parent bd860a82
No related merge requests found
......@@ -19,7 +19,7 @@ __host__ __device__ void print_matrix(int *m,int row, int col)
}
__device__ int supression_op(int *kernel, int *target, int rowKernel,int colKernel,int rowTarget,int colTarget, int iterationRow, int iterationCol,int iterator)
__device__ int supression_op(int *kernel, int *target, int rowKernel,int colKernel,int rowTarget,int colTarget, int iterationRow, int iterationCol)
{
int intermediate_sum = 0;
for (int i = 0; i < rowKernel; i++)
......@@ -33,12 +33,10 @@ __device__ int supression_op(int *kernel, int *target, int rowKernel,int colKern
return intermediate_sum;
}
__global__ void convolution(int *output,int *kernel, int *target,int rowKernel,int colKernel,int rowTarget,int colTarget,int index)
__global__ void convolution(int *output,int *kernel, int *target, int rowKernel, int colKernel, int rowTarget, int colTarget, int index)
{
int rowC = rowTarget-rowKernel + 1;
int colC = colTarget-colKernel + 1;
int row;
int col;
// BLOCK
int block = blockIdx.x;
......@@ -47,52 +45,89 @@ __global__ void convolution(int *output,int *kernel, int *target,int rowKernel,i
// int k = block;
// printf("BLOCK %d \n",blockIdx.x);
// THREAD
int tx= threadIdx.x;
// printf("BLOCK DMENSION X %d \n",blockDim.x);
// printf("BLOCK DIMENSION Y %d \n",blockDim.y);
int T= tx + blockIdx.x * blockDim.x; // ALL THREAD INDEX
// printf("GRID DIMENSION : %d \n",);
// print_matrix(&target[0],3,3);
// print_matrix(&target[1],3,3);
// print_matrix(&target[2],3,3);
// printf("THREAD : %d \n",tx);
for (int k = block; k < index ;k+= numberOfBlock){
for (int i = tx; i < blockDim.x; i+=blockDim.x)
{
// printf("MASUK THREAD : %d \n",i);
// printf("THIS IS i : %d \n",i);
for (int j = i; j < rowC*colC ; j+= blockDim.x)
{
// printf("MASUK THREAD J : %d \n",j);
// printf("THIS IS j : %d \n",j);
row = floorf(j/colC);
col = j%colC;
if (i == 0 && k == 0 && (row*colC + col) < 100){
printf("THREAD %d FOR ROW %d AND COL %d \n",i, row , col);
printf("J IS %d \n",j);
printf("FILLING FOR MATRIX %d with index %d \n",k, row*colC + col);
printf("VALUE OF SUPRESION : %d \n",supression_op(kernel, (&target[k*rowTarget*colTarget]), rowKernel, colKernel, rowTarget, colTarget, row, col,k));
}
// printf("ITEM IS THIS %d \n",(&target[k])[j]);
// print_matrix(&target[k*rowTarget*colTarget],3,3);
// print_matrix(&target[9],3,3);
(&output[k*rowC*colC])[row*colC + col] = supression_op(kernel, (&target[k*rowTarget*colTarget]), rowKernel, colKernel, rowTarget, colTarget, row, col,k);
int ty= threadIdx.y;
// int T = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x; // ALL THREAD INDEX
for (int k = block; k < index; k+= numberOfBlock){
for (int i = tx; i < rowC; i+= blockDim.x){
for (int j = ty; j < colC; j += blockDim.y){
// printf("BLOCK %d ITERATION k : %d THREAD %d FOR i : %d FOR j : %d \n",block,T,k,i,j);
(&output[k*rowC*colC])[i*colC + j] = supression_op(kernel, (&target[k*rowTarget*colTarget]), rowKernel, colKernel, rowTarget, colTarget, i, j);
}
}
// print_matrix(&output[4],3,3);
// printf("ITEM IS THIS %d \n",(&output[0])[0]);
// printf("ITEM IS THIS %d \n",(&output[1])[0]);
// printf("ITEM IS THIS %d \n",(&output[1])[1]);
// printf("ITEM IS THIS %d \n",(&output[0])[3]);
}
}
void convolutionWithCUDA(int* output, int* kernel, int* target, int rowKernel, int colKernel, int rowTarget, int colTarget,int index){
// Returns the spread (max - min) over all elements of a rowA x colA matrix,
// read as a flat row-major array. DATAMIN / DATAMAX (macros defined elsewhere
// in this file) seed the running extremes so the first element always wins.
__device__ int getDataRange(int *source, int rowA, int colA){
int largest = DATAMIN;
int smallest = DATAMAX;
int total = rowA * colA;
// Single flat scan: equivalent to the row/column double loop since the
// matrix is stored contiguously.
for (int idx = 0; idx < total; idx++)
{
int value = source[idx];
if (value > largest)
largest = value;
if (value < smallest)
smallest = value;
}
return largest - smallest;
}
// Kernel: output[i] = range (max - min) of the i-th rowSource x colSource
// matrix packed contiguously in `source`, for i in [0, index).
// Only threadIdx.x is used (blockIdx is ignored), so each thread strides
// over the matrices by the block width — intended for a single-block launch.
__global__ void getDataRangeArray(int *output, int *source, int index,int rowSource, int colSource){
const int stride = blockDim.x;
const int matrixSize = rowSource * colSource;
for (int i = threadIdx.x; i < index; i += stride){
output[i] = getDataRange(&source[i * matrixSize], rowSource, colSource);
}
}
// Merges two adjacent sorted runs of `output` — [left, mid) and [mid, right) —
// into temporary_array[left..right). Standard two-pointer merge; on a tie the
// right run's element is taken (strict < comparison on the left element).
__device__ void mergeArray(int *output, int *temporary_array, int left,int mid, int right){
int lo = left;
int hi = mid;
for (int pos = left; pos < right; pos++){
// Take from the left run while it still has elements and its head is
// strictly smaller (or the right run is exhausted).
bool takeLeft = (lo < mid) && (hi >= right || output[lo] < output[hi]);
if (takeLeft){
temporary_array[pos] = output[lo];
lo++;
}
else{
temporary_array[pos] = output[hi];
hi++;
}
}
}
// One pass of a bottom-up parallel merge sort over `output[0..index)`.
// Each thread merges up to `arrayPerThread` adjacent pairs of sorted runs
// (each half of length width/2), writing the merged runs into
// `temporary_array` — the host is expected to swap the buffers between
// passes (see the swap loop in convolutionWithCUDA).
//
// Fix: the original used fminf() to clamp mid/right. fminf takes floats, so
// the int indices were round-tripped through float — silently losing
// precision for indices above 2^24 (a plausible cause of the segfault the
// old comment mentions on a large test case). Plain integer clamping is
// exact and cheaper.
__global__ void mergeSort(int* output, int *temporary_array, int index, int width, int arrayPerThread){
int x = blockIdx.x * blockDim.x + threadIdx.x;
int left = width*x*arrayPerThread;
int mid,right;
for(int i = 0; i < arrayPerThread; i++){
if(left >= index) break;
// Clamp run boundaries to the array length with exact integer math.
mid = left + (width/2);
if (mid > index) mid = index;
right = left + width;
if (right > index) right = index;
mergeArray(output, temporary_array, left, mid, right);
left += width;
}
}
void convolutionWithCUDA(int* output, int* kernel, int* target, int rowKernel, int colKernel, int rowTarget, int colTarget,int index, int block_num, int thread_num){
// B = Kernel
int* dev_a = nullptr;
int* dev_b = nullptr;
int* dev_c = nullptr;
int* output_datarange = nullptr;
int* temporary_array = nullptr;
int* temp_swap;
// int* datarange_sor = nullptr;
// dim3* deviceBlock_num = nullptr;
// dim3* deviceThread_num = nullptr;
// Allocate GPU buffers for three vectors (two input, one output)
// print_matrix(&target[0],3,3);
......@@ -101,29 +136,92 @@ void convolutionWithCUDA(int* output, int* kernel, int* target, int rowKernel, i
// printf("\n %d \n", *(target[0][1]));
int rowC = rowTarget-rowKernel + 1;
int colC = colTarget-colKernel + 1;
// cudaMalloc((void**)&deviceBlock_num, sizeof(dim3));
// cudaMalloc((void**)&deviceThread_num, sizeof(dim3));
cudaMalloc((void**)&output_datarange, index * sizeof(int));
cudaMalloc((void**)&temporary_array, index * sizeof(int));
cudaMalloc((void**)&dev_c,index * rowC * colC * sizeof(int));
cudaMalloc((void**)&dev_a, rowKernel * colKernel * sizeof(int));
cudaMalloc((void**)&dev_b,index * rowTarget * colTarget * sizeof(int));
// cudaMemcpy(deviceBlock_num, block_num, sizeof(dim3), cudaMemcpyHostToDevice);
// cudaMemcpy(deviceThread_num, thread_num, sizeof(dim3), cudaMemcpyHostToDevice);
cudaMemcpy(dev_a, kernel, rowKernel * colKernel * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, target,index * rowTarget * colTarget * sizeof(int), cudaMemcpyHostToDevice);
convolution<<<2, 2>>>(dev_c, dev_a, dev_b, rowKernel,colKernel,rowTarget,colTarget,index);
convolution<<<dim3(block_num,1,1),dim3(thread_num,thread_num,1)>>>(dev_c, dev_a, dev_b, rowKernel,colKernel,rowTarget,colTarget,index);
cudaDeviceSynchronize();
getDataRangeArray<<<1,thread_num>>>(output_datarange,dev_c,index,rowC,colC);
cudaDeviceSynchronize();
long nThreads = thread_num;
for(int width = 2; width < index*2; width*=2){
int arrayPerThread = index/(nThreads*width) + 1;
mergeSort<<<1, dim3(thread_num, 1, 1)>>>(output_datarange, temporary_array, index, width, arrayPerThread);
temp_swap = output_datarange;
output_datarange = temporary_array;
temporary_array = temp_swap;
}
cudaDeviceSynchronize();
// printf("ITEM : %d \n",output_datarange[0]);
// printf("ITEM : %d \n",dev_c[0]);
// Copy output vector from GPU buffer to host memory.
cudaMemcpy(output, dev_c, index * rowC * colC * sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(output, output_datarange, index * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(output_datarange);
cudaFree(temporary_array);
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
}
int main(){
// Returns the median of the first `length` elements of `n`, which must
// already be sorted (callers here pass the merge-sorted range array).
// Odd length: the middle element. Even length: the floored mean of the two
// middle elements. NOTE(review): n[mid-1] + n[mid] can overflow int for
// values near INT_MAX — acceptable for this program's data ranges.
//
// Fix: removed the leftover `clock_t begin = clock();` — dead code whose
// result was never used (stray timing scaffolding from an earlier revision).
int get_median(int *n, int length)
{
int mid = length / 2;
if (length & 1)
return n[mid];
return (n[mid - 1] + n[mid]) / 2;
}
// Floored arithmetic mean of the first `length` ints in `n`.
// Accumulates in a long to delay overflow; the final integer division
// truncates. Caller must guarantee length > 0.
long get_floored_mean(int *n, int length)
{
long total = 0;
int remaining = length;
while (remaining-- > 0)
{
total += n[remaining];
}
return total / length;
}
// Prints `size` ints from `n`, each followed by a single space, then a
// trailing newline (so the line ends with "<last> \n", matching the
// original output format exactly).
void print_array(int *n, int size)
{
int i = 0;
while (i < size)
{
printf("%d ", n[i]);
i++;
}
printf("\n");
}
int main(int argc, char **argv){
int num_targets = 3;
int rowA = 2;
int colA = 2;
int rowB = 3;
int colB = 3;
int rowA,colA;
int rowB,colB;
char *ptr;
int block_num = strtol(argv[1], &ptr, 10);
if(block_num <= 0) block_num = 1;
int thread_num = strtol(argv[2], &ptr, 10);
if(thread_num <= 0) thread_num = 16;
scanf("%d %d", &rowA, &colA);
// INISIASI ARRAY/MATRIX
......@@ -139,28 +237,37 @@ int main(){
scanf("%d %d %d", &num_targets, &rowB, &colB);
// INISIASI ARRAY/MATRIX
int b[num_targets][rowB*colB] = {0};
// INPUT MATRIX KERNEL
printf("%d %d %d\n", num_targets, rowB, colB);
int *b = (int *)malloc(num_targets*rowB*colB * sizeof(int));
// INPUT MATRIX TARGET
for (int k = 0; k < num_targets ; k++){
for (int i = 0; i < rowB; i++)
{
for (int j = 0; j < colB; j++)
{
scanf("%d", &b[k][i * colB + j]);
scanf("%d", &(b[(k*rowB*colB) + i * colB + j]));
}
}
}
int rowC = rowB-rowA + 1;
int colC = colB-colA + 1;
int c[num_targets][rowC*colC] = { 0 };
// int rowC = rowB-rowA + 1;
// int colC = colB-colA + 1;
// int c[num_targets][rowC*colC] = { 0 };
int c[num_targets];
convolutionWithCUDA(*c, a, *b, rowA,colA,rowB,colB,num_targets);
convolutionWithCUDA(c, a, b, rowA,colA,rowB,colB,num_targets,block_num,thread_num);
free(b);
// convolution(c,a,b[0],rowA,colA,rowB,colB);
print_matrix(c[0],rowC,colC);
print_matrix(c[1],rowC,colC);
print_matrix(c[2],rowC,colC);
// print_matrix(c[0],rowC,colC);
// print_matrix(c[1],rowC,colC);
// print_matrix(c[2],rowC,colC);
print_array(c,num_targets);
printf("\n");
printf("MIN : %d \n",c[0]);
printf("MAX : %d \n",c[num_targets-1]);
printf("MEDIAN : %d \n",get_median(c,num_targets));
printf("AVERAGE : %ld \n",get_floored_mean(c,num_targets));
cudaDeviceReset();
}
\ No newline at end of file
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment