From bd9234298b7974cce6ec27c633cf9c7a0b87a03c Mon Sep 17 00:00:00 2001
From: andhikarei <andhikareihan349@gmail.com>
Date: Wed, 16 Mar 2022 15:19:05 +0700
Subject: [PATCH] Add dummy convolution

---
 makefile                     |  10 ++
 result/K01-03-TC1_serial.txt |   3 +-
 result/K01-03-TC2_serial.txt |   6 --
 result/K01-03-TC4_serial.txt |   3 +-
 src/serial.c                 | 173 +++++++++++++++++++++++++----------
 5 files changed, 141 insertions(+), 54 deletions(-)

diff --git a/makefile b/makefile
index 3d8f75e..6411707 100644
--- a/makefile
+++ b/makefile
@@ -55,6 +55,16 @@ TC4-serial: install-serial
 
 TC-serial: TC1-serial TC2-serial TC3-serial TC4-serial
 
+# Serial testcase, no output.
+TC1-serial-no: install-serial
+	./bin/${EXEC_SERIAL} < ./testcase/K01-03-TC1
+TC2-serial-no: install-serial
+	./bin/${EXEC_SERIAL} < ./testcase/K01-03-TC2
+TC3-serial-no: install-serial
+	./bin/${EXEC_SERIAL} < ./testcase/K01-03-TC3
+TC4-serial-no: install-serial
+	./bin/${EXEC_SERIAL} < ./testcase/K01-03-TC4
+
 # Generic parallel testcase.
 TC1-parallel: install-parallel
 	(./bin/${EXEC_PARALLEL} < ./testcase/K01-03-TC1) > ./result/K01-03-TC1_parallel.txt
diff --git a/result/K01-03-TC1_serial.txt b/result/K01-03-TC1_serial.txt
index d0f9d3b..8be6411 100644
--- a/result/K01-03-TC1_serial.txt
+++ b/result/K01-03-TC1_serial.txt
@@ -1,6 +1,7 @@
+yey
 8539213
 11916317
 10114197
 10323010
 
-Runtime: 0.016319 s
+Runtime: 0.006773 s
diff --git a/result/K01-03-TC2_serial.txt b/result/K01-03-TC2_serial.txt
index c928606..e69de29 100644
--- a/result/K01-03-TC2_serial.txt
+++ b/result/K01-03-TC2_serial.txt
@@ -1,6 +0,0 @@
-35064588
-46265294
-37739803
-38222937
-
-Runtime: 0.772688 s
diff --git a/result/K01-03-TC4_serial.txt b/result/K01-03-TC4_serial.txt
index fa4259f..b874af3 100644
--- a/result/K01-03-TC4_serial.txt
+++ b/result/K01-03-TC4_serial.txt
@@ -1,6 +1,7 @@
+yey
 41250811
 71841136
 51451884
 51774352
 
-Runtime: 9.509920 s
+Runtime: 9.726326 s
diff --git a/src/serial.c b/src/serial.c
index f4a69bd..ed2febe 100644
--- a/src/serial.c
+++ b/src/serial.c
@@ -273,6 +273,22 @@ int* map_matrix(int mat[][100], int row, int col) {
 	return map;
 }
 
+/**
+ * Function map_matrix_extended
+ *
+ * Returns a row major matrix of the input matrix.
+ */
+int* map_matrix_extended(int** mat, int row, int col) {
+	int* map = (int*) malloc(sizeof(int) * row * col);
+	for (int i = 0; i < row; i++) {
+		for (int j = 0; j < col; j++) {
+			map[index_to_row_major(i, j, row, col)] = mat[i][j];
+		}
+	}
+	return map;
+}
+
+
 /**
  * Function reverse_map_matrix
  *
@@ -305,31 +321,77 @@ Matrix rm_to_matrix_object(int* map, int row, int col) {
 	return mat;
 }
 
+/**
+ * Function rm_to_list_matrix_object
+ *
+ * Returns a list of Matrix structs built from a packed row major matrix.
+ */
+Matrix* rm_to_list_matrix_object(int* map, int num_matrix, int matrix_size, int row_inner, int col_inner) {
+	Matrix* mat = (Matrix*) malloc(sizeof(Matrix) * num_matrix);
+	for (int i = 0; i < num_matrix; i++) {
+		init_matrix(&mat[i], row_inner, col_inner);
+		int pad = i * matrix_size;
+		for (int j = 0; j < row_inner; j++) {
+			for (int k = 0; k < col_inner; k++) {
+				int index = index_to_row_major(j, k, row_inner, col_inner) + pad;
+				mat[i].mat[j][k] = map[index];
+			}
+		}
+	}
+	return mat;
+}
+/**
+ * Function list_matrix_object_to_rm
+ *
+ * Returns the packed row major matrix of a list of Matrix structs.
+ */
+int* list_matrix_object_to_rm(Matrix* mat, int num_matrix, int row_inner, int col_inner) {
+	int* map = (int*) malloc(sizeof(int) * num_matrix * row_inner * col_inner);
+	for (int i = 0; i < num_matrix; i++) {
+		int pad = i * row_inner * col_inner;
+		for (int j = 0; j < row_inner; j++) {
+			for (int k = 0; k < col_inner; k++) {
+				int index = index_to_row_major(j, k, row_inner, col_inner) + pad;
+				map[index] = mat[i].mat[j][k];
+			}
+		}
+	}
+
+	return map;
+}
+
 /**
  * Function cuda_convolution
  *
  * Returns a matrix of the convolution of the input matrix with the kernel
  */
-void cuda_convolution(int* d_out_mat, int* arr_mat_rm, int* kernel_rm, int row_eff, int col_eff, int kernel_row, int kernel_col) {
+void cuda_convolution(int* out_mat_rm, int* arr_mat_rm, int* kernel_rm, int row_eff, int col_eff, int kernel_row, int kernel_col, int curr_mat) {
 	// Calculate real row and column of input matrix.
 	int row = row_eff + kernel_row - 1;
 	int col = col_eff + kernel_col - 1;
 
+	// Calculate padding offsets into the packed target and output matrices.
+	int pad = curr_mat * row * col;
+	int pad_out = curr_mat * row_eff * col_eff;
+
 	// For each element in input matrix that is not on the boundary,
-	for (int i = 0; i < row_eff; i++) {
+	for (int i = 0 ; i < row_eff; i++) {
 		for (int j = 0; j < col_eff; j++) {
 			// Convolution of the element with the kernel.
 			// Calculate the sum of the kernel and the input matrix.
 			int intermediate_sum = 0;
 			for (int k = 0; k < kernel_row; k++) {
 				for (int l = 0; l < kernel_col; l++) {
-					int index = index_to_row_major(i + k, j + l, row, col);
+					int index = index_to_row_major(i + k, j + l, row, col) + pad;
 					int kernel_index = index_to_row_major(k, l, kernel_row, kernel_col);
 					intermediate_sum += arr_mat_rm[index] * kernel_rm[kernel_index];
+					// Debug: print all i, j, k, l.
+					// printf("i:%d, j:%d, k:%d, l:%d\n", i, j, k, l);
+
 				}
 			}
 			// Store the sum in the output matrix.
-			d_out_mat[index_to_row_major(i, j, row_eff, col_eff)] = intermediate_sum;
+			out_mat_rm[index_to_row_major(i, j, row_eff, col_eff) + pad_out] = intermediate_sum;
 		}
 	}
 }
@@ -339,6 +401,13 @@ void cuda_convolution(int* d_out_mat, int* arr_mat_rm, int* kernel_rm, int row_e
 //	int row = row_eff + kernel_row - 1;
 //	int col = col_eff + kernel_col - 1;
 //
+//	// Determine the current matrix from the block index.
+//	int curr_mat = blockIdx.y;
+//
+//	// Calculate padding offsets into the packed target and output matrices.
+//	int pad = curr_mat * row * col;
+//	int pad_out = curr_mat * row_eff * col_eff;
+//
 //	// Get i, and j from threadIdx
 //	int tid = blockIdx.x * blockDim.x + threadIdx.x;
 //	int i, j;
@@ -349,17 +418,16 @@ void cuda_convolution(int* d_out_mat, int* arr_mat_rm, int* kernel_rm, int row_e
 //			int intermediate_sum = 0;
 //			for (int k = 0; k < kernel_row; k++) {
 //				for (int l = 0; l < kernel_col; l++) {
-//					int index = index_to_row_major(i + k, j + l, row, col);
-//					int kernel_index = index_to_row_major(k, l, kernel_row, kernel_col);
+//					int index = d_index_to_row_major(i + k, j + l, row, col) + pad;
+//					int kernel_index = d_index_to_row_major(k, l, kernel_row, kernel_col);
 //					intermediate_sum += arr_mat_rm[index] * kernel_rm[kernel_index];
 //				}
 //			}
-//			d_out_mat[index_to_row_major(i, j, row_eff, col_eff)] = intermediate_sum;
+//			d_out_mat[d_index_to_row_major(i, j, row_eff, col_eff) + pad_out] = intermediate_sum;
 //		}
 // }
 
-
 // main() driver
 int main() {
 	// Time.
@@ -386,54 +454,67 @@ int main() {
 	int row_eff = target_row - kernel_row + 1;
 	int col_eff = target_col - kernel_col + 1;
 
-	// Initialize host and device input and output matrixes.
-	int ** arr_mat_rm, **h_out_mat, ** d_out_mat, *kernel_rm;
-	// Allocate input matrix.
-	arr_mat_rm = (int**)malloc(sizeof(int*) * num_targets);
-	for (int i = 0; i < num_targets; i++) {
-		arr_mat_rm[i] = (int*)malloc(sizeof(int) * target_row * target_col);
-	}
-	// Allocate output matrix.
-	h_out_mat = (int**)malloc(sizeof(int*) * num_targets);
-	for (int i = 0; i < num_targets; i++) {
-		h_out_mat[i] = (int*)malloc(sizeof(int) * row_eff * col_eff);
-	}
-	// cudaMalloc((void**)&d_out_mat, sizeof(int*) * num_targets);
-	// for (int i = 0; i < num_targets; i++) {
-	// 	cudaMalloc(&h_out_mat[i], sizeof(int) * row_eff * col_eff);
-	// }
-	// cudaMemcpy(d_out_mat, h_out_mat, sizeof(int*) * num_targets, cudaMemcpyHostToDevice);
-	d_out_mat = (int**)malloc(sizeof(int*) * num_targets);
-	for (int i = 0; i < num_targets; i++) {
-		d_out_mat[i] = (int*)malloc(sizeof(int) * row_eff * col_eff);
-	}
-	kernel_rm = (int*)malloc(sizeof(int) * kernel_col * kernel_row);
-
+	// Allocate variables.
+	// The rm suffix means the matrix is stored in row major order.
+	// Variable declaration.
+	int *arr_mat_rm, *d_arr_mat_rm, *out_mat_rm, *d_out_mat_rm, *kernel_rm, *d_kernel_rm;
+	int size_arr_mat, size_out_mat, size_kernel;
+	// cudaError err;
+	// Input matrix size on host and device.
+	size_arr_mat = num_targets * target_row * target_col;
+	// arr_mat_rm itself is allocated by list_matrix_object_to_rm below.
+	// cudaMalloc(&d_arr_mat_rm, sizeof(int) * size_arr_mat);
 
-	// Store kernel in row major form.
-	kernel_rm = map_matrix(kernel.mat, kernel_row, kernel_col);
+	// Allocate output matrix on host and device.
+	size_out_mat = num_targets * row_eff * col_eff;
+	out_mat_rm = (int*)malloc(sizeof(int) * size_out_mat);
+	// cudaMalloc(&d_out_mat_rm, sizeof(int) * size_out_mat);
 
-	// read each target matrix, and get the row major matrix from.
+	// Kernel matrix size on host.
+	size_kernel = kernel_row * kernel_col;
+	// kernel_rm itself is allocated by map_matrix below.
+	// Store kernel in row major form and allocate kernel for device.
+	kernel_rm = map_matrix(kernel.mat, kernel_row, kernel_col);
+	// cudaMalloc(&d_kernel_rm, sizeof(int) * size_kernel);
+	// err = cudaMemcpy(d_kernel_rm, kernel_rm, sizeof(int) * size_kernel, cudaMemcpyHostToDevice);
+	// if (err != cudaSuccess) {
+	// 	printf("Error copy host to device: %s\n", cudaGetErrorString(err));
+	// }
+
+
+	// Read each target matrix.
 	for (int i = 0; i < num_targets; i++) {
 		arr_mat[i] = input_matrix(target_row, target_col);
-		arr_mat_rm[i] = map_matrix(arr_mat[i].mat, target_row, target_col);
-		// cuda_convolution<<<block_size, threads_per_block>>>(d_out_mat[i], arr_mat_rm[i], kernel_rm, target_row, target_col, kernel_row, kernel_col);
-		// cudaMemcpy(h_out_mat[i], d_out_mat[i], sizeof(int) * row_eff * col_eff, cudaMemcpyDeviceToHost);
-		cuda_convolution(d_out_mat[i], arr_mat_rm[i], kernel_rm, row_eff, col_eff, kernel_row, kernel_col);
-		arr_mat[i] = rm_to_matrix_object(d_out_mat[i], row_eff, col_eff);
 	}
-
-	// // Free cuda memory
-	// for (int i = 0; i < num_targets; i++) {
-	// 	cudaFree(h_out_mat[i]);
+	// Store the target matrices in row major form and allocate them on the device.
+	arr_mat_rm = list_matrix_object_to_rm(arr_mat, num_targets, target_row, target_col);
+	// cudaMalloc(&d_arr_mat_rm, sizeof(int) * size_arr_mat);
+	// err = cudaMemcpy(d_arr_mat_rm, arr_mat_rm, sizeof(int) * size_arr_mat, cudaMemcpyHostToDevice);
+	// if (err != cudaSuccess) {
+	// 	printf("Error copy host to device: %s\n", cudaGetErrorString(err));
 	// }
-	// cudaFree(d_out_mat);
 
-	// For each target matrix, compute their convolution matrices, and compute their data ranges
+	// For each target matrix, compute its convolution matrix.
+	// dim3 grid, block;
+	// block.x = threads_per_block;
+	// grid.x = block_size;
+	// grid.y = num_targets; // d_cuda_convolution reads blockIdx.y as the matrix index.
+	// d_cuda_convolution<<<grid, block>>>(d_out_mat_rm, d_arr_mat_rm, d_kernel_rm, row_eff, col_eff, kernel_row, kernel_col);
+	// cudaMemcpy(out_mat_rm, d_out_mat_rm, sizeof(int) * size_out_mat, cudaMemcpyDeviceToHost);
+	for (int i = 0; i < num_targets; i++) {
+		cuda_convolution(out_mat_rm, arr_mat_rm, kernel_rm, row_eff, col_eff, kernel_row, kernel_col, i);
+	}
+	arr_mat = rm_to_list_matrix_object(out_mat_rm, num_targets, row_eff * col_eff, row_eff, col_eff);
+
+	// // Free CUDA memory.
+	// cudaFree(d_arr_mat_rm);
+	// cudaFree(d_out_mat_rm);
+	// cudaFree(d_kernel_rm);
+
+	// For each target matrix, compute its data range.
 	for (int i = 0; i < num_targets; i++) {
-		// arr_mat[i] = convolution(&kernel, &arr_mat[i]);
 		arr_range[i] = get_matrix_datarange(&arr_mat[i]);
 	}
--
GitLab
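
Note: index_to_row_major is used throughout this diff, but its definition is not part of the patch. The sketch below is a minimal, self-contained illustration of the indexing the new helpers rely on, assuming the conventional row major mapping i * col + j; the small main() is hypothetical and only demonstrates the per-matrix offset (pad = curr_mat * row * col) that cuda_convolution, rm_to_list_matrix_object, and list_matrix_object_to_rm all add when matrices are packed back to back.

#include <stdio.h>

/* Assumed definition: conventional row major mapping. The row argument is
 * kept for symmetry with the call sites; the formula itself does not use it. */
int index_to_row_major(int i, int j, int row, int col) {
    (void) row;
    return i * col + j;
}

int main(void) {
    /* Element (1, 2) of a 2x3 matrix lands at offset 1 * 3 + 2 = 5. */
    int mat[6] = {0, 1, 2, 3, 4, 5};
    printf("(1,2) -> %d\n", mat[index_to_row_major(1, 2, 2, 3)]);

    /* With matrices packed back to back, matrix m starts at m * row * col;
     * this is exactly the pad offset added inside cuda_convolution. */
    int row = 2, col = 3, m = 1;
    printf("matrix %d starts at offset %d\n", m, m * row * col);
    return 0;
}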
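
Note: the CUDA path itself is still commented out in this patch. Assembled into compilable form, those comments would look roughly like the sketch below. This is a sketch under assumptions, not the author's final kernel: d_index_to_row_major is assumed to mirror the host helper, the derivation of i and j from tid is filled in (the commented code declares them but the diff does not show the computation), and a bounds check on tid is added because row_eff * col_eff need not be a multiple of the block size.

__device__ int d_index_to_row_major(int i, int j, int row, int col) {
    return i * col + j;
}

__global__ void d_cuda_convolution(int* d_out_mat, const int* arr_mat_rm, const int* kernel_rm,
                                   int row_eff, int col_eff, int kernel_row, int kernel_col) {
    // Real row and column of the padded input matrix.
    int row = row_eff + kernel_row - 1;
    int col = col_eff + kernel_col - 1;

    // One grid row per target matrix: the kernel reads blockIdx.y.
    int curr_mat = blockIdx.y;
    int pad = curr_mat * row * col;
    int pad_out = curr_mat * row_eff * col_eff;

    // Recover (i, j) from the flattened 1-D thread index.
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= row_eff * col_eff) return;
    int i = tid / col_eff;
    int j = tid % col_eff;

    int intermediate_sum = 0;
    for (int k = 0; k < kernel_row; k++) {
        for (int l = 0; l < kernel_col; l++) {
            int index = d_index_to_row_major(i + k, j + l, row, col) + pad;
            int kernel_index = d_index_to_row_major(k, l, kernel_row, kernel_col);
            intermediate_sum += arr_mat_rm[index] * kernel_rm[kernel_index];
        }
    }
    d_out_mat[d_index_to_row_major(i, j, row_eff, col_eff) + pad_out] = intermediate_sum;
}

/* Launch sketch matching the (fixed) commented-out plan in main():
 *   dim3 block(threads_per_block);
 *   dim3 grid((row_eff * col_eff + threads_per_block - 1) / threads_per_block, num_targets);
 *   d_cuda_convolution<<<grid, block>>>(d_out_mat_rm, d_arr_mat_rm, d_kernel_rm,
 *                                       row_eff, col_eff, kernel_row, kernel_col);
 */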