diff --git a/makefile b/makefile index ccac332a6133a25e7afba790d0085659ba63ebda..d5535070df035d91b45f12027d5d809e0a9f714a 100644 --- a/makefile +++ b/makefile @@ -1,36 +1,25 @@ SOURCE_SERIAL=./src/serial.c SOURCE_PARALLEL=./src/parallel.cu -SOURCE_TEST=./src/test.c -SOURCE_PARALLEL_COLLAB=./src/parallel_collab.cu EXEC_SERIAL=main-serial EXEC_PARALLEL=main-parallel -EXEC_TEST=test -EXEC_PARALLEL_COLLAB=parallel-collab -COMPILER_PARALLEL_COLLAB=nvcc COMPILER_PARALLEL=nvcc COMPILER_SERIAL=gcc - - # Compile program. -compile-parallel-collab: - ${COMPILER_PARALLEL_COLLAB} -rdc=true -o ./bin/${EXEC_PARALLEL_COLLAB} ${SOURCE_PARALLEL_COLLAB} compile-serial: ${COMPILER_SERIAL} -o ./bin/${EXEC_SERIAL} ${SOURCE_SERIAL} compile-parallel: - ${COMPILER_PARALLEL} -rdc=true -o ./bin/${EXEC_PARALLEL} ${SOURCE_PARALLEL} ./src/lib/matrix.cu ./src/lib/utils.cu ./src/lib/bitonic_sort.cu + ${COMPILER_PARALLEL} -o ./bin/${EXEC_PARALLEL} ${SOURCE_PARALLEL} # Link program. -link-parallel-collab : compile-parallel-collab link-serial: compile-serial link-parallel: compile-parallel # Compile and link. install-serial: compile-serial link-serial install-parallel: compile-parallel link-parallel -install-parallel-collab: compile-parallel-collab link-parallel-collab # Clean bin folder. clean: @@ -45,11 +34,6 @@ run-parallel: install-parallel ./bin/${EXEC_PARALLEL} run-parallel-example: install-parallel ./bin/${EXEC_PARALLEL} < ./testcase/Example-TC -run-parallel-collab: install-parallel-collab - ./bin/${EXEC_PARALLEL_COLLAB} -run-parallel-collab-example: install-parallel-collab - ./bin/${EXEC_PARALLEL_COLLAB} < ./testcase/Example-TC - # Example testcase. TC-example: @@ -87,17 +71,5 @@ TC3-parallel: install-parallel TC4-parallel: install-parallel (./bin/${EXEC_PARALLEL} < ./testcase/K01-03-TC4) > ./result/K01-03-TC4_parallel.txt -# Generic parallel testcase. -TC1-parallel-collab: install-parallel-collab - (./bin/${EXEC_PARALLEL_COLLAB} < ./testcase/K01-03-TC1) > ./result/K01-03-TC1_parallel_collab.txt -TC2-parallel-collab: install-parallel-collab - (./bin/${EXEC_PARALLEL_COLLAB} < ./testcase/K01-03-TC2) > ./result/K01-03-TC2_parallel_collab.txt -TC3-parallel-collab: install-parallel-collab - (./bin/${EXEC_PARALLEL_COLLAB} < ./testcase/K01-03-TC3) > ./result/K01-03-TC3_parallel_collab.txt -TC4-parallel-collab: install-parallel-collab - (./bin/${EXEC_PARALLEL_COLLAB} < ./testcase/K01-03-TC4) > ./result/K01-03-TC4_parallel_collab.txt - TC-parallel: TC1-parallel TC2-parallel TC3-parallel TC4-parallel -TC-parallel-collab: TC1-parallel-collab TC2-parallel-collab TC3-parallel-collab TC4-parallel-collab TC-generic: TC-serial TC-parallel -TC-generic-collab: TC-serial TC-parallel TC-parallel-collab diff --git a/result/K01-03-TC1_parallel.txt b/result/K01-03-TC1_parallel.txt index 04a2a8a057522cd947d63637586f766961593a41..f47a14b1fba197f5910886390a2c72b16296df45 100644 --- a/result/K01-03-TC1_parallel.txt +++ b/result/K01-03-TC1_parallel.txt @@ -3,4 +3,4 @@ 10114197 10323010 -Runtime: 0.155827 s +Runtime: 0.167720 s diff --git a/result/K01-03-TC1_parallel_collab.txt b/result/K01-03-TC1_parallel_collab.txt deleted file mode 100644 index f91c91498972f9821048fe87fcfa55f8541102bd..0000000000000000000000000000000000000000 --- a/result/K01-03-TC1_parallel_collab.txt +++ /dev/null @@ -1,6 +0,0 @@ -8539213 -11916317 -10114197 -10323010 - -Runtime: 0.144829 s diff --git a/result/K01-03-TC1_serial.txt b/result/K01-03-TC1_serial.txt index 989f60d71fa1e0c0241fe48f4002911dc70b42a3..b22612021dc7582e8673daa8aadf33bdd2f5e7e1 100644 --- a/result/K01-03-TC1_serial.txt +++ b/result/K01-03-TC1_serial.txt @@ -3,4 +3,4 @@ 10114197 10323010 -Runtime: 0.014589 s +Runtime: 0.017340 s diff --git a/result/K01-03-TC2_parallel.txt b/result/K01-03-TC2_parallel.txt index b594a3766218ae5fd38542ac06209b35e37a0d08..45430c0a83191e51aabdbea83844377227e3c51f 100644 --- a/result/K01-03-TC2_parallel.txt +++ b/result/K01-03-TC2_parallel.txt @@ -3,4 +3,4 @@ 37739803 38222937 -Runtime: 0.833981 s +Runtime: 0.598738 s diff --git a/result/K01-03-TC2_parallel_collab.txt b/result/K01-03-TC2_parallel_collab.txt deleted file mode 100644 index aafe92c5fb785c3edd4df7804b7738e67538871e..0000000000000000000000000000000000000000 --- a/result/K01-03-TC2_parallel_collab.txt +++ /dev/null @@ -1,6 +0,0 @@ -35064588 -46265294 -37739803 -38222937 - -Runtime: 0.422364 s diff --git a/result/K01-03-TC2_serial.txt b/result/K01-03-TC2_serial.txt index d21e02c539226977d1c4530c9f3d311952f5e585..2c5f38b28e7ed24903bfb446e41026496d512a0a 100644 --- a/result/K01-03-TC2_serial.txt +++ b/result/K01-03-TC2_serial.txt @@ -3,4 +3,4 @@ 37739803 38222937 -Runtime: 0.689801 s +Runtime: 0.883971 s diff --git a/result/K01-03-TC3_parallel.txt b/result/K01-03-TC3_parallel.txt index bff74c68d7ed3ca321d3c8ab9652e9188c8e3c3b..c625ff6e84c4486eaa63b71721ea450db4696cc0 100644 --- a/result/K01-03-TC3_parallel.txt +++ b/result/K01-03-TC3_parallel.txt @@ -3,4 +3,4 @@ 23198319 23380111 -Runtime: 0.988793 s +Runtime: 0.881576 s diff --git a/result/K01-03-TC3_parallel_collab.txt b/result/K01-03-TC3_parallel_collab.txt deleted file mode 100644 index 98ed976ee323820459b37cd55f3de94c9d95a6ce..0000000000000000000000000000000000000000 --- a/result/K01-03-TC3_parallel_collab.txt +++ /dev/null @@ -1,6 +0,0 @@ -18815130 -28707695 -23198319 -23380111 - -Runtime: 0.644663 s diff --git a/result/K01-03-TC3_serial.txt b/result/K01-03-TC3_serial.txt index 7cb8cc056c333f053e33a019358c42312018d65e..d7059bc7ec7d3bb01be089534f4016aeb4b5aba0 100644 --- a/result/K01-03-TC3_serial.txt +++ b/result/K01-03-TC3_serial.txt @@ -3,4 +3,4 @@ 23198319 23380111 -Runtime: 0.815450 s +Runtime: 1.182247 s diff --git a/result/K01-03-TC4_parallel.txt b/result/K01-03-TC4_parallel.txt index b1f42bf07ab2b8af5ba8ecf6f68fa3c04dd2d74e..575c491dc91102ec205c9741d9a0460f0050e8e3 100644 --- a/result/K01-03-TC4_parallel.txt +++ b/result/K01-03-TC4_parallel.txt @@ -3,4 +3,4 @@ 51451884 51774352 -Runtime: 9.862558 s +Runtime: 4.115475 s diff --git a/result/K01-03-TC4_parallel_collab.txt b/result/K01-03-TC4_parallel_collab.txt deleted file mode 100644 index af651cae921a3cf2ce03c8c4da198cf005c0ce95..0000000000000000000000000000000000000000 --- a/result/K01-03-TC4_parallel_collab.txt +++ /dev/null @@ -1,6 +0,0 @@ -41250811 -71841136 -51451884 -51774352 - -Runtime: 4.018500 s diff --git a/result/K01-03-TC4_serial.txt b/result/K01-03-TC4_serial.txt index 789cfe32519f4f3f2f09d530e824f62572d97e15..f66713b7003c1a9713cdecd91c5144ee87f39443 100644 --- a/result/K01-03-TC4_serial.txt +++ b/result/K01-03-TC4_serial.txt @@ -3,4 +3,4 @@ 51451884 51774352 -Runtime: 8.795971 s +Runtime: 9.939554 s diff --git a/src/lib/brick_sort.cu b/src/lib/brick_sort.cu deleted file mode 100644 index 08d9b64a775db81e4d8c725e81fd223ee510ba1a..0000000000000000000000000000000000000000 --- a/src/lib/brick_sort.cu +++ /dev/null @@ -1,152 +0,0 @@ -#include <stdlib.h> -#include <stdio.h> -#include <time.h> - -#define THREADS 1024 -#define NUM_VALS 100000 - -// PROBLEM: BLOCKS GA BOLEH LEBIH DARI 1. - -void print_elapsed(clock_t start, clock_t stop) -{ - double elapsed = ((double)(stop - start)) / CLOCKS_PER_SEC; - printf("Elapsed time: %.3fs\n", elapsed); -} - -int checksum(int *arr, int size) -{ - int sum = 0; - for (int i = 0; i < size; i++) - sum += arr[i]; - return sum; -} - -int random_int() -{ - return (int)rand() / (int)RAND_MAX; -} - -void array_print(int *arr, int length) -{ - int i; - for (i = 0; i < length; ++i) - { - printf("%d ", arr[i]); - } - printf("\n"); -} - -void array_fill(int *arr, int length) -{ - srand(time(NULL)); - int i; - for (i = 0; i < length; ++i) - { - arr[i] = rand(); - } -} - - -__device__ void swap(int *arr, int i, int j) -{ - int temp = arr[i]; - arr[i] = arr[j]; - arr[j] = temp; -} - - -__global__ void brick_sort_even(int *d_arr, int length) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i >= length - 1) - { - return; - } - - if (i % 2 == 0) - { - if (d_arr[i] > d_arr[i + 1]) - { - swap(d_arr, i, i + 1); - } - } - -} - -__global__ void brick_sort_odd(int *d_arr, int length) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i >= length - 1) - { - return; - } - - if (i % 2 != 0) - { - if (d_arr[i] > d_arr[i + 1]) - { - swap(d_arr, i, i + 1); - } - } -} - -void brick_sort(int *arr, int length) { - int blocks = (NUM_VALS + THREADS - 1) / THREADS; - size_t size = length * sizeof(int); - - int *d_arr; - cudaMalloc((void **)&d_arr, size); - cudaMemcpy(d_arr, arr, size, cudaMemcpyHostToDevice); - - for (int i = 0; i < NUM_VALS / 2; ++i) - { - brick_sort_odd<<<blocks, THREADS>>>(d_arr, NUM_VALS); - brick_sort_even<<<blocks, THREADS>>>(d_arr, NUM_VALS); - } - - cudaMemcpy(arr, d_arr, size, cudaMemcpyDeviceToHost); - cudaFree(d_arr); -} - - -int driver(void) -{ - clock_t start, stop; - - int *arr = (int *)malloc(NUM_VALS * sizeof(int)); - array_fill(arr, NUM_VALS); - - int checksum1 = checksum(arr, NUM_VALS); - - start = clock(); - brick_sort(arr, NUM_VALS); - stop = clock(); - - int checksum2 = checksum(arr, NUM_VALS); - - print_elapsed(start, stop); - - bool passed_sort = true; - bool passed_checksum = true; - - for (int i = 1; i < NUM_VALS; i++) - { - if (arr[i - 1] > arr[i]) - { - passed_sort = false; - } - } - - if (checksum1 != checksum2) - { - passed_checksum = false; - printf("%d != %d\n", checksum1, checksum2); - } - - // array_print(arr, NUM_VALS); - - printf("Test %s\n", passed_sort ? "PASSED SORT" : "FAILED SORT"); - printf("Test %s\n", passed_checksum ? "PASSED CHECKSUM" : "FAILED CHECKSUM"); -} diff --git a/src/lib/brick_sort.cuh b/src/lib/brick_sort.cuh deleted file mode 100644 index bb088368e8417d8a4f1e99a50d84db97d521dd24..0000000000000000000000000000000000000000 --- a/src/lib/brick_sort.cuh +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _BRICK_SORT_H_ -#define _BRICK_SORT_H_ - -void brick_sort(int *d_arr, int length); - -#endif \ No newline at end of file diff --git a/src/parallel.cu b/src/parallel.cu index d6d0bb217003fe6d0798645722ca760a2e61db1c..541b60cf8047682333573976c80ef6dbc1e15295 100644 --- a/src/parallel.cu +++ b/src/parallel.cu @@ -1,54 +1,688 @@ -// serial.c - -#include <stdio.h> -#include <stdlib.h> -#include <time.h> - -#include "./lib/bitonic_sort.cuh" -#include "./lib/matrix.cuh" -#include "./lib/utils.cuh" - - -int main() { - // Time. - clock_t t; - t = clock(); - - int kernel_row, kernel_col, target_row, target_col, num_targets; - - // reads kernel's row and column and initalize kernel matrix from input - scanf("%d %d", &kernel_row, &kernel_col); - Matrix kernel = input_matrix(kernel_row, kernel_col); - - // reads number of target matrices and their dimensions. - // initialize array of matrices and array of data ranges (int) - scanf("%d %d %d", &num_targets, &target_row, &target_col); - Matrix* arr_mat = (Matrix*)malloc(num_targets * sizeof(Matrix)); - int *arr_range = (int*)malloc(num_targets * sizeof(int)); - - // read each target matrix, compute their convolution matrices, and compute their data ranges - for (int i = 0; i < num_targets; i++) { - arr_mat[i] = input_matrix(target_row, target_col); - arr_mat[i] = convolution(&kernel, &arr_mat[i]); - arr_range[i] = get_matrix_datarange(&arr_mat[i]); - } - - // sort the data range array - bitonic_sort(arr_range, num_targets); - - int median = get_median(arr_range, num_targets); - int floored_mean = get_floored_mean(arr_range, num_targets); - - // print the min, max, median, and floored mean of data range array - printf("%d\n%d\n%d\n%d\n", - arr_range[0], - arr_range[num_targets - 1], - median, - floored_mean); - - // Print execution time in seconds. - t = clock() - t; - printf("\nRuntime: %f s\n", ((float)t) / CLOCKS_PER_SEC); - - return 0; -} +// parallel_collab.cu + +#include <stdio.h> +#include <stdlib.h> +#include <time.h> +#include <math.h> + +#define NMAX 100 +#define DATAMAX 1000 +#define DATAMIN -1000 + +/* + * Struct Matrix + * + * Matrix representation consists of matrix data + * and effective dimensions + * */ +typedef struct Matrix { + int mat[NMAX][NMAX]; // Matrix cells + int row_eff; // Matrix effective row + int col_eff; // Matrix effective column +} Matrix; + + +/* + * Procedure init_matrix + * + * Initializing newly allocated matrix + * Setting all data to 0 and effective dimensions according + * to nrow and ncol + * */ +void init_matrix(Matrix *m, int nrow, int ncol) { + m->row_eff = nrow; + m->col_eff = ncol; + + for (int i = 0; i < m->row_eff; i++) { + for (int j = 0; j < m->col_eff; j++) { + m->mat[i][j] = 0; + } + } +} + + +/* + * Function input_matrix + * + * Returns a matrix with values from stdin input + * */ +Matrix input_matrix(int nrow, int ncol) { + Matrix input; + init_matrix(&input, nrow, ncol); + + for (int i = 0; i < nrow; i++) { + for (int j = 0; j < ncol; j++) { + scanf("%d", &input.mat[i][j]); + } + } + + return input; +} + + +/* + * Procedure print_matrix + * + * Print matrix data + * */ +void print_matrix(Matrix *m) { + for (int i = 0; i < m->row_eff; i++) { + for (int j = 0; j < m->col_eff; j++) { + printf("%d ", m->mat[i][j]); + } + printf("\n"); + } +} + + +/* + * Function get_matrix_datarange + * + * Returns the range between maximum and minimum + * element of a matrix + * */ +int get_matrix_datarange(Matrix *m) { + int max = DATAMIN; + int min = DATAMAX; + for (int i = 0; i < m->row_eff; i++) { + for (int j = 0; j < m->col_eff; j++) { + int el = m->mat[i][j]; + if (el > max) max = el; + if (el < min) min = el; + } + } + + return max - min; +} + + +/* + * Function supression_op + * + * Returns the sum of intermediate value of special multiplication + * operation where kernel[0][0] corresponds to target[row][col] + * */ +int supression_op(Matrix *kernel, Matrix *target, int row, int col) { + int intermediate_sum = 0; + for (int i = 0; i < kernel->row_eff; i++) { + for (int j = 0; j < kernel->col_eff; j++) { + intermediate_sum += kernel->mat[i][j] * target->mat[row + i][col + j]; + } + } + + return intermediate_sum; +} + + +/* + * Function convolution + * + * Return the output matrix of convolution operation + * between kernel and target + * */ +Matrix convolution(Matrix *kernel, Matrix *target) { + Matrix out; + int out_row_eff = target->row_eff - kernel->row_eff + 1; + int out_col_eff = target->col_eff - kernel->col_eff + 1; + + init_matrix(&out, out_row_eff, out_col_eff); + + for (int i = 0; i < out.row_eff; i++) { + for (int j = 0; j < out.col_eff; j++) { + out.mat[i][j] = supression_op(kernel, target, i, j); + } + } + + return out; +} + + +/** + * Swap the values of two elements in an array. + * + * @param d_arr - the array. + * @param i - the index of the first element. + * @param j - the index of the second element. + */ +__device__ void swap(int *d_arr, int i, int j) +{ + int temp = d_arr[i]; + d_arr[i] = d_arr[j]; + d_arr[j] = temp; +} + +/** + * Pad the remaining empty value in the array with the maximum value. + * + * @param arr - the array. + * @param length - the number of elements inside the array. + * @param length_buffer - the length of the buffer. + */ +void padding_array(int *arr, int length, int length_buffer) +{ + for (int i = length; i < length_buffer; i++) { + arr[i] = INT_MAX; + } +} + +/** + * Copy the array from src to dest and pad the remaining empty value with the maximum value. + * + * @param dest - the destination array. + * @param src - the source array. + * @param length - the number of elements inside the src array. + * @param length_buffer - the length of the buffer. + */ +void copy_padding(int *dest, int *src, int length, int length_buffer) +{ + for (int i = 0; i < length; i++) { + dest[i] = src[i]; + } + padding_array(dest, length, length_buffer); +} + +/** + * Get the number of minimum blocks as a power of 2. + * + * @param length - the number of elements inside the array. + * @param num_threads - the number of threads. + * @return int - the number of minimum blocks needed. + */ +int minimum_blocks(int length, int num_threads) { + int num_blocks = 1; + + // Increase the number of blocks 2 times, so it will be a power of 2. + while (num_blocks * num_threads < length) { + num_blocks *= 2; + } + + return num_blocks; +} + +/** + * Do the bitnoic sort step by step. + * + * @param d_values - array in the device to be sorted. + * @param i - major step index. + * @param j - minor step index. + */ +__global__ void bitonic_sort_step(int *d_arr, int i, int j) +{ + // The array index and its patner. + int idx, patner; + + // The thread index. + idx = threadIdx.x + blockDim.x * blockIdx.x; + + // The thread index of the patner. + patner = idx ^ j; + + // Sort the array by threads with the lowest idx. + if (idx < patner) { + if ((idx & i) == 0) { + // Sort ascending. + if (d_arr[idx] > d_arr[patner]) { + swap(d_arr, idx, patner); + } + } + if ((idx & i) != 0) { + // Sort descending. + if (d_arr[idx] < d_arr[patner]) { + swap(d_arr, idx, patner); + } + } + } +} + +/** + * Perform a bitonic sort on the array. + * + * @param h_arr The host array to sort. + * @param length The length of the array. + */ +void bitonic_sort(int *h_arr, int length) +{ + // Initialize the constants variable. + const int threads = 1024; + const int blocks = minimum_blocks(length, threads); + const int buffer_length = threads * blocks; + + // Initialize the memory size of the array. + size_t size = length * sizeof(int); + size_t buffer_size = buffer_length * sizeof(int); + + // Create the buffer array and pad with maximum value of Int. + int *h_buffer = (int *)malloc(buffer_size); + copy_padding(h_buffer, h_arr, length, buffer_length); + + // Allocate and copy array into device memory. + int *d_arr; + cudaMalloc((void**) &d_arr, buffer_size); + cudaMemcpy(d_arr, h_buffer, buffer_size, cudaMemcpyHostToDevice); + + // Sort the array using bitonic_sort_step. + int i, j; + // The major step. + for (i = 2; i <= buffer_length; i *= 2) { + // The minor step. + for (j = i / 2; j > 0; j = j / 2) { + bitonic_sort_step<<<blocks, threads>>>(d_arr, i, j); + } + } + + // Copy the values back to the host. + cudaMemcpy(h_arr, d_arr, size, cudaMemcpyDeviceToHost); + + // Free device memory. + cudaFree(d_arr); + free(h_buffer); +} + + +/* + * Procedure print_array + * + * Prints all elements of array n of size to stdout + * */ +void print_array(int *n, int size) { + for (int i = 0; i < size; i++ ) printf("%d ", n[i]); + printf("\n"); +} + + +/* + * Function get_median + * + * Returns median of array n of length + * */ +int get_median(int *n, int length) { + int mid = length / 2; + if (length & 1) return n[mid]; + + return (n[mid - 1] + n[mid]) / 2; +} + + +/* + * Function get_floored_mean + * + * Returns floored mean from an array of integers + * */ +long get_floored_mean(int *n, int length) { + long sum = 0; + for (int i = 0; i < length; i++) { + sum += n[i]; + } + + return sum / length; +} + +/** + * Function index_to_row_major + * + * Returns the index of a matrix element in row-major order + */ +int index_to_row_major(int row, int col, int row_eff, int col_eff) { + return row * col_eff + col; +} + +__device__ int d_index_to_row_major(int row, int col, int row_eff, int col_eff) { + return row * col_eff + col; +} + +/** + * Function row_major_to_index + * + * Returns the row and column of a matrix element in row-major order + */ +void row_major_to_index(int index, int row_eff, int col_eff, int *row, int *col) { + *row = index / col_eff; + *col = index % col_eff; +} + +__device__ void d_row_major_to_index(int index, int row_eff, int col_eff, int *row, int *col) { + *row = index / col_eff; + *col = index % col_eff; +} + +/** + * Function map_matrix + * + * Returns a row major matrix of the input matrix. + **/ +int* map_matrix(int mat[][100], int row, int col) { + int* map = (int*) malloc(sizeof(int) * row * col); + for (int i = 0; i < row; i++) { + for (int j = 0; j < col; j++) { + map[index_to_row_major(i, j, row, col)] = mat[i][j]; + } + } + return map; +} + +/* + * Function map_matrix_extended + * + * Returns a row major matrix of the input matrix. + **/ +int* map_matrix_extended(int** mat, int row, int col) { + int* map = (int*) malloc(sizeof(int) * row * col); + for (int i = 0; i < row; i++) { + for (int j = 0; j < col; j++) { + map[index_to_row_major(i, j, row, col)] = mat[i][j]; + } + } + return map; +} + + +/** + * Function reverse_map_matrix + * + * Returns a matrix of the input row major matrix. + */ +int** reverse_map_matrix(int* map, int row, int col) { + int** mat = (int**) malloc(sizeof(int*) * row); + for (int i = 0; i < row; i++) { + mat[i] = (int*) malloc(sizeof(int) * col); + for (int j = 0; j < col; j++) { + mat[i][j] = map[index_to_row_major(i, j, row, col)]; + } + } + return mat; +} + +/** + * Function rm_to_matrix_object + * + * Return Matrix struct of row major matrix + */ +Matrix rm_to_matrix_object(int* map, int row, int col) { + Matrix mat; + init_matrix(&mat, row, col); + for (int i = 0; i < row; i++) { + for (int j = 0; j < col; j++) { + mat.mat[i][j] = map[index_to_row_major(i, j, row, col)]; + } + } + return mat; +} + +/** + * Function rm_to_list_matrix_object + * + * Return List of Matrix Struct of row major matrix + */ +Matrix* rm_to_list_matrix_object(int* map, int row, int col, int row_inner, int col_inner) { + Matrix* mat = (Matrix*) malloc(sizeof(Matrix) * row); + for (int i = 0; i < row; i++) { + init_matrix(&mat[i], row_inner, col_inner); + int pad = i * col; + for (int j = 0; j < row_inner; j++) { + for (int k = 0; k < col_inner; k++) { + int index = index_to_row_major(j, k, row_inner, col_inner) + pad; + mat[i].mat[j][k] = map[index]; + } + } + } + return mat; +} +/** + * Function list_matrix_object_to_rm + * + * Return row major matrix of list of Matrix struct + */ +int* list_matrix_object_to_rm(Matrix* mat, int num_matrix, int row_inner, int col_inner) { + int* map = (int*) malloc(sizeof(int) * num_matrix * row_inner * col_inner); + for (int i = 0; i < num_matrix; i++) { + int pad = i * row_inner * col_inner; + for (int j = 0; j < row_inner; j++) { + for (int k = 0; k < col_inner; k++) { + int index = index_to_row_major(j, k, row_inner, col_inner) + pad; + map[index] = mat[i].mat[j][k]; + } + } + } + + return map; +} + +/** + * Function cuda_convolution + * + * Returns a matrix of the convolution of the input matrix with the kernel + */ +void cuda_convolution(int* out_mat_rm, int* arr_mat_rm, int* kernel_rm, int row_eff, int col_eff, int kernel_row, int kernel_col, int curr_mat) { + // Calculate real row and column of input matrix. + int row = row_eff + kernel_row - 1; + int col = col_eff + kernel_col - 1; + + // Calculate padding target and output matrix. + int pad = curr_mat * row * col; + int pad_out = curr_mat * row_eff * col_eff; + + // For each element in input matrix that is not on the boundary, + for (int i = 0 ; i < row_eff; i++) { + for (int j = 0; j < col_eff; j++) { + // Convolution of the element with the kernel. + // Calculate the sum of the kernel and the input matrix. + int intermediate_sum = 0; + for (int k = 0; k < kernel_row; k++) { + for (int l = 0; l < kernel_col; l++) { + int index = index_to_row_major(i + k, j + l, row, col) + pad; + int kernel_index = index_to_row_major(k, l, kernel_row, kernel_col); + intermediate_sum += arr_mat_rm[index] * kernel_rm[kernel_index]; + // Print all i,j,k,l + // printf("i:%d, j:%d, k:%d, l:%d\n", i, j, k, l); + + } + } + // Store the sum in the output matrix. + out_mat_rm[index_to_row_major(i, j, row_eff, col_eff) + pad_out ] = intermediate_sum; + } + } +} + +__global__ void d_cuda_convolution(int* d_out_mat, int* arr_mat_rm, int* kernel_rm, int row_eff, int col_eff, int kernel_row, int kernel_col) { + // Calculate real row and column of input matrix. + int row = row_eff + kernel_row - 1; + int col = col_eff + kernel_col - 1; + + // Determine current matrix from block; + int curr_mat = blockIdx.y; + // Calculate padding target and output matrix. + int pad = curr_mat * row * col; + int pad_out = curr_mat * row_eff * col_eff; + + // Get i, and j from threadIdx + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int i, j; + d_row_major_to_index(tid, row_eff, col_eff, &i, &j); + + // Calculate element in input matrix that is not on the boundary, + if (i < row_eff && j < col_eff) { + int intermediate_sum = 0; + for (int k = 0; k < kernel_row; k++) { + for (int l = 0; l < kernel_col; l++) { + int index = d_index_to_row_major(i + k, j + l, row, col) + pad; + int kernel_index = d_index_to_row_major(k, l, kernel_row, kernel_col); + intermediate_sum += arr_mat_rm[index] * kernel_rm[kernel_index]; + } + } + d_out_mat[d_index_to_row_major(i, j, row_eff, col_eff) + pad_out] = intermediate_sum; + } +} + +/* + * Function d_get_matrix_datarange + * Set range between maximum and minimum + * for every matrix in out_mat to d_arr_range + * */ +__global__ void d_get_matrix_datarange(int* out_mat, int* d_arr_range, int row_eff, int col_eff) { + // Determine current matrix from block; + int curr_mat = blockIdx.y; + // Calculate padding output matrix and arr_range matrix. + int pad = curr_mat * row_eff * col_eff; + int pad_arr_range = curr_mat; + + // Get i, and j from threadIdx + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int i, j; + d_row_major_to_index(tid, row_eff, col_eff, &i, &j); + // Get datarange in every output matrix + if (i < row_eff && j < col_eff) { + int max = DATAMIN; + int min = DATAMAX; + for (int k = 0; k < row_eff; k++) { + for (int l = 0; l < col_eff; l++) { + int index = d_index_to_row_major(i + k, j + l, row_eff, col_eff) + pad; + + int el = out_mat[index]; + if (el > max) max = el; + if (el < min) min = el; + } + } + + d_arr_range[d_index_to_row_major(i, j, row_eff, col_eff) + pad_arr_range] = max-min; + } +} + +// main() driver +int main() { + // Time. + clock_t t; + t = clock(); + + int kernel_row, kernel_col, target_row, target_col, num_targets; + + // reads kernel's row and column and initalize kernel matrix from input + scanf("%d %d", &kernel_row, &kernel_col); + Matrix kernel = input_matrix(kernel_row, kernel_col); + + // reads number of target matrices and their dimensions. + // initialize array of matrices and array of data ranges (int) + scanf("%d %d %d", &num_targets, &target_row, &target_col); + Matrix* arr_mat = (Matrix*)malloc(num_targets * sizeof(Matrix)); + + // Calculate variable for cuda computing. + int a = (target_row-kernel_row+1) * (target_col-kernel_col+1); + int b = 1024; + int block_size = a/b + (a % b != 0); // ceil(a/b) + int threads_per_block = 1024; + int row_eff = target_row - kernel_row + 1; + int col_eff = target_col - kernel_col + 1; + + + // Allocate variable. + // rm means row-major. It's indicate matrix are in row-major order. + // Variable declaration. + int * arr_mat_rm=0, * d_arr_mat_rm=0, *out_mat_rm=0, *d_out_mat_rm=0, *kernel_rm=0, *d_kernel_rm=0, *arr_range=0, *d_arr_range=0; + int size_arr_mat, size_out_mat, size_kernel, size_arr_range; + cudaError err; + + // Allocate input matrix in device and host. + size_arr_mat = num_targets * target_row * target_col; + arr_mat_rm = (int*)malloc(sizeof(int) * size_arr_mat); + cudaMalloc((void **)&d_arr_mat_rm, sizeof(int) * size_arr_mat); + if (arr_mat_rm == 0 | d_arr_mat_rm == 0) { + printf("Error: Memory allocation failed for arr_mat.\n"); + return 1; + } + + // Allocate output matrix in device and host. + size_out_mat = num_targets * row_eff * col_eff; + out_mat_rm = (int*)malloc(sizeof(int) * size_out_mat); + cudaMalloc((void **)&d_out_mat_rm, sizeof(int) * size_out_mat); + if (out_mat_rm == 0 | d_out_mat_rm == 0) { + printf("Error: Memory allocation failed for out_mat.\n"); + return 1; + } + cudaMemset(d_out_mat_rm, 0, sizeof(int) * size_out_mat); + + + // Allocate kernel matrix in host. + size_kernel = kernel_row * kernel_col; + kernel_rm = (int*)malloc(sizeof(int) * size_kernel); + // Store kernel in row major form and allocate kernel for device. + kernel_rm = map_matrix(kernel.mat, kernel_row, kernel_col); + cudaMalloc((void **)&d_kernel_rm, sizeof(int) * size_kernel); + if (kernel_rm == 0 | d_kernel_rm == 0) { + printf("Error: Memory allocation failed for kernel.\n"); + return 1; + } + err = cudaMemcpy(d_kernel_rm, kernel_rm, sizeof(int) * size_kernel, cudaMemcpyHostToDevice); + if (err != cudaSuccess) { + printf("Error copy host to device: %s\n", cudaGetErrorString(err)); + return 1; + } + + + // Read each target matrix. + for (int i = 0; i < num_targets; i++) { + arr_mat[i] = input_matrix(target_row, target_col); + } + // Store each target matrix in row major form and allocate target matrix for device. + arr_mat_rm = list_matrix_object_to_rm(arr_mat, num_targets, target_row, target_col); + err = cudaMemcpy(d_arr_mat_rm, arr_mat_rm, sizeof(int) * size_arr_mat, cudaMemcpyHostToDevice); + if (err != cudaSuccess) { + printf("Error copy host to device: %s\n", cudaGetErrorString(err)); + return 1; + } + + // For each target matrix, compute their convolution matrices. + dim3 grid, block; + grid.x = block_size; + grid.y = num_targets; + block.x = threads_per_block; + d_cuda_convolution<<<grid, block>>>(d_out_mat_rm, d_arr_mat_rm, d_kernel_rm, row_eff, col_eff, kernel_row, kernel_col); + err = cudaMemcpy(out_mat_rm, d_out_mat_rm, sizeof(int) * size_out_mat, cudaMemcpyDeviceToHost); + if (err != cudaSuccess) { + printf("Error copy device to host: %s\n", cudaGetErrorString(err)); + } + // for (int i = 0; i < num_targets; i++){ + // cuda_convolution(out_mat_rm, arr_mat_rm, kernel_rm, row_eff, col_eff, kernel_row, kernel_col, i); + // } + // arr_mat = rm_to_list_matrix_object(out_mat_rm, num_targets, row_eff*col_eff, row_eff, col_eff); + + // Allocate arr_range matrix in device and host. + size_arr_range = num_targets; + arr_range = (int*)malloc(sizeof(int) * size_arr_range); + cudaMalloc((void **)&d_arr_range, sizeof(int) * size_arr_range); + if (arr_range == 0 | d_arr_range == 0) { + printf("Error: Memory allocation failed for arr_range.\n"); + return 1; + } + cudaMemset(d_arr_range, 0, sizeof(int) * size_arr_range); + + grid.x = 1; + grid.y = num_targets; + block.x = 1; + d_get_matrix_datarange<<<grid, block>>>(d_out_mat_rm, d_arr_range, row_eff, col_eff); + err = cudaMemcpy(arr_range, d_arr_range, sizeof(int) * size_arr_range, cudaMemcpyDeviceToHost); + if (err != cudaSuccess) { + printf("Error copy device to host 2: %s\n", cudaGetErrorString(err)); + } + + // sort the data range array + bitonic_sort(arr_range, num_targets); + + int median = get_median(arr_range, num_targets); + int floored_mean = get_floored_mean(arr_range, num_targets); + + // print the min, max, median, and floored mean of data range array + printf("%d\n%d\n%d\n%d\n", + arr_range[0], + arr_range[num_targets - 1], + median, + floored_mean); + + // Print execution time in seconds. + t = clock() - t; + printf("\nRuntime: %f s\n", ((float)t) / CLOCKS_PER_SEC); + + // Free cuda Memory. + cudaFree(d_arr_mat_rm); + cudaFree(d_out_mat_rm); + cudaFree(d_kernel_rm); + return 0; +} diff --git a/src/parallel_collab.cu b/src/parallel_collab.cu deleted file mode 100644 index ffb6074f45ef824a57c0e550ccbf228cc3530311..0000000000000000000000000000000000000000 --- a/src/parallel_collab.cu +++ /dev/null @@ -1,689 +0,0 @@ -// parallel_collab.cu - -#include <stdio.h> -#include <stdlib.h> -#include <time.h> -#include <math.h> - -#define NMAX 100 -#define DATAMAX 1000 -#define DATAMIN -1000 - -/* - * Struct Matrix - * - * Matrix representation consists of matrix data - * and effective dimensions - * */ -typedef struct Matrix { - int mat[NMAX][NMAX]; // Matrix cells - int row_eff; // Matrix effective row - int col_eff; // Matrix effective column -} Matrix; - - -/* - * Procedure init_matrix - * - * Initializing newly allocated matrix - * Setting all data to 0 and effective dimensions according - * to nrow and ncol - * */ -void init_matrix(Matrix *m, int nrow, int ncol) { - m->row_eff = nrow; - m->col_eff = ncol; - - for (int i = 0; i < m->row_eff; i++) { - for (int j = 0; j < m->col_eff; j++) { - m->mat[i][j] = 0; - } - } -} - - -/* - * Function input_matrix - * - * Returns a matrix with values from stdin input - * */ -Matrix input_matrix(int nrow, int ncol) { - Matrix input; - init_matrix(&input, nrow, ncol); - - for (int i = 0; i < nrow; i++) { - for (int j = 0; j < ncol; j++) { - scanf("%d", &input.mat[i][j]); - } - } - - return input; -} - - -/* - * Procedure print_matrix - * - * Print matrix data - * */ -void print_matrix(Matrix *m) { - for (int i = 0; i < m->row_eff; i++) { - for (int j = 0; j < m->col_eff; j++) { - printf("%d ", m->mat[i][j]); - } - printf("\n"); - } -} - - -/* - * Function get_matrix_datarange - * - * Returns the range between maximum and minimum - * element of a matrix - * */ -int get_matrix_datarange(Matrix *m) { - int max = DATAMIN; - int min = DATAMAX; - for (int i = 0; i < m->row_eff; i++) { - for (int j = 0; j < m->col_eff; j++) { - int el = m->mat[i][j]; - if (el > max) max = el; - if (el < min) min = el; - } - } - - return max - min; -} - - -/* - * Function supression_op - * - * Returns the sum of intermediate value of special multiplication - * operation where kernel[0][0] corresponds to target[row][col] - * */ -int supression_op(Matrix *kernel, Matrix *target, int row, int col) { - int intermediate_sum = 0; - for (int i = 0; i < kernel->row_eff; i++) { - for (int j = 0; j < kernel->col_eff; j++) { - intermediate_sum += kernel->mat[i][j] * target->mat[row + i][col + j]; - } - } - - return intermediate_sum; -} - - -/* - * Function convolution - * - * Return the output matrix of convolution operation - * between kernel and target - * */ -Matrix convolution(Matrix *kernel, Matrix *target) { - Matrix out; - int out_row_eff = target->row_eff - kernel->row_eff + 1; - int out_col_eff = target->col_eff - kernel->col_eff + 1; - - init_matrix(&out, out_row_eff, out_col_eff); - - for (int i = 0; i < out.row_eff; i++) { - for (int j = 0; j < out.col_eff; j++) { - out.mat[i][j] = supression_op(kernel, target, i, j); - } - } - - return out; -} - - -/** - * Swap the values of two elements in an array. - * - * @param d_arr - the array. - * @param i - the index of the first element. - * @param j - the index of the second element. - */ -__device__ void swap(int *d_arr, int i, int j) -{ - int temp = d_arr[i]; - d_arr[i] = d_arr[j]; - d_arr[j] = temp; -} - -/** - * Pad the remaining empty value in the array with the maximum value. - * - * @param arr - the array. - * @param length - the number of elements inside the array. - * @param length_buffer - the length of the buffer. - */ -void padding_array(int *arr, int length, int length_buffer) -{ - for (int i = length; i < length_buffer; i++) { - arr[i] = INT_MAX; - } -} - -/** - * Copy the array from src to dest and pad the remaining empty value with the maximum value. - * - * @param dest - the destination array. - * @param src - the source array. - * @param length - the number of elements inside the src array. - * @param length_buffer - the length of the buffer. - */ -void copy_padding(int *dest, int *src, int length, int length_buffer) -{ - for (int i = 0; i < length; i++) { - dest[i] = src[i]; - } - padding_array(dest, length, length_buffer); -} - -/** - * Get the number of minimum blocks as a power of 2. - * - * @param length - the number of elements inside the array. - * @param num_threads - the number of threads. - * @return int - the number of minimum blocks needed. - */ -int minimum_blocks(int length, int num_threads) { - int num_blocks = 1; - - // Increase the number of blocks 2 times, so it will be a power of 2. - while (num_blocks * num_threads < length) { - num_blocks *= 2; - } - - return num_blocks; -} - -/** - * Do the bitnoic sort step by step. - * - * @param d_values - array in the device to be sorted. - * @param i - major step index. - * @param j - minor step index. - */ -__global__ void bitonic_sort_step(int *d_arr, int i, int j) -{ - // The array index and its patner. - int idx, patner; - - // The thread index. - idx = threadIdx.x + blockDim.x * blockIdx.x; - - // The thread index of the patner. - patner = idx ^ j; - - // Sort the array by threads with the lowest idx. - if (idx < patner) { - if ((idx & i) == 0) { - // Sort ascending. - if (d_arr[idx] > d_arr[patner]) { - swap(d_arr, idx, patner); - } - } - if ((idx & i) != 0) { - // Sort descending. - if (d_arr[idx] < d_arr[patner]) { - swap(d_arr, idx, patner); - } - } - } -} - -/** - * Perform a bitonic sort on the array. - * - * @param h_arr The host array to sort. - * @param length The length of the array. - */ -void bitonic_sort(int *h_arr, int length) -{ - // Initialize the constants variable. - const int threads = 1024; - const int blocks = minimum_blocks(length, threads); - const int buffer_length = threads * blocks; - - // Initialize the memory size of the array. - size_t size = length * sizeof(int); - size_t buffer_size = buffer_length * sizeof(int); - - // Create the buffer array and pad with maximum value of Int. - int *h_buffer = (int *)malloc(buffer_size); - copy_padding(h_buffer, h_arr, length, buffer_length); - - // Allocate and copy array into device memory. - int *d_arr; - cudaMalloc((void**) &d_arr, buffer_size); - cudaMemcpy(d_arr, h_buffer, buffer_size, cudaMemcpyHostToDevice); - - // Sort the array using bitonic_sort_step. - int i, j; - // The major step. - for (i = 2; i <= buffer_length; i *= 2) { - // The minor step. - for (j = i / 2; j > 0; j = j / 2) { - bitonic_sort_step<<<blocks, threads>>>(d_arr, i, j); - } - } - - // Copy the values back to the host. - cudaMemcpy(h_arr, d_arr, size, cudaMemcpyDeviceToHost); - - // Free device memory. - cudaFree(d_arr); - free(h_buffer); -} - - -/* - * Procedure print_array - * - * Prints all elements of array n of size to stdout - * */ -void print_array(int *n, int size) { - for (int i = 0; i < size; i++ ) printf("%d ", n[i]); - printf("\n"); -} - - -/* - * Function get_median - * - * Returns median of array n of length - * */ -int get_median(int *n, int length) { - int mid = length / 2; - if (length & 1) return n[mid]; - - return (n[mid - 1] + n[mid]) / 2; -} - - -/* - * Function get_floored_mean - * - * Returns floored mean from an array of integers - * */ -long get_floored_mean(int *n, int length) { - long sum = 0; - for (int i = 0; i < length; i++) { - sum += n[i]; - } - - return sum / length; -} - -/** - * Function index_to_row_major - * - * Returns the index of a matrix element in row-major order - */ -int index_to_row_major(int row, int col, int row_eff, int col_eff) { - return row * col_eff + col; -} - -__device__ int d_index_to_row_major(int row, int col, int row_eff, int col_eff) { - return row * col_eff + col; -} - -/** - * Function row_major_to_index - * - * Returns the row and column of a matrix element in row-major order - */ -void row_major_to_index(int index, int row_eff, int col_eff, int *row, int *col) { - *row = index / col_eff; - *col = index % col_eff; -} - -__device__ void d_row_major_to_index(int index, int row_eff, int col_eff, int *row, int *col) { - *row = index / col_eff; - *col = index % col_eff; -} - -/** - * Function map_matrix - * - * Returns a row major matrix of the input matrix. - **/ -int* map_matrix(int mat[][100], int row, int col) { - int* map = (int*) malloc(sizeof(int) * row * col); - for (int i = 0; i < row; i++) { - for (int j = 0; j < col; j++) { - map[index_to_row_major(i, j, row, col)] = mat[i][j]; - } - } - return map; -} - -/* - * Function map_matrix_extended - * - * Returns a row major matrix of the input matrix. - **/ -int* map_matrix_extended(int** mat, int row, int col) { - int* map = (int*) malloc(sizeof(int) * row * col); - for (int i = 0; i < row; i++) { - for (int j = 0; j < col; j++) { - map[index_to_row_major(i, j, row, col)] = mat[i][j]; - } - } - return map; -} - - -/** - * Function reverse_map_matrix - * - * Returns a matrix of the input row major matrix. - */ -int** reverse_map_matrix(int* map, int row, int col) { - int** mat = (int**) malloc(sizeof(int*) * row); - for (int i = 0; i < row; i++) { - mat[i] = (int*) malloc(sizeof(int) * col); - for (int j = 0; j < col; j++) { - mat[i][j] = map[index_to_row_major(i, j, row, col)]; - } - } - return mat; -} - -/** - * Function rm_to_matrix_object - * - * Return Matrix struct of row major matrix - */ -Matrix rm_to_matrix_object(int* map, int row, int col) { - Matrix mat; - init_matrix(&mat, row, col); - for (int i = 0; i < row; i++) { - for (int j = 0; j < col; j++) { - mat.mat[i][j] = map[index_to_row_major(i, j, row, col)]; - } - } - return mat; -} - -/** - * Function rm_to_list_matrix_object - * - * Return List of Matrix Struct of row major matrix - */ -Matrix* rm_to_list_matrix_object(int* map, int row, int col, int row_inner, int col_inner) { - Matrix* mat = (Matrix*) malloc(sizeof(Matrix) * row); - for (int i = 0; i < row; i++) { - init_matrix(&mat[i], row_inner, col_inner); - int pad = i * col; - for (int j = 0; j < row_inner; j++) { - for (int k = 0; k < col_inner; k++) { - int index = index_to_row_major(j, k, row_inner, col_inner) + pad; - mat[i].mat[j][k] = map[index]; - } - } - } - return mat; -} -/** - * Function list_matrix_object_to_rm - * - * Return row major matrix of list of Matrix struct - */ -int* list_matrix_object_to_rm(Matrix* mat, int num_matrix, int row_inner, int col_inner) { - int* map = (int*) malloc(sizeof(int) * num_matrix * row_inner * col_inner); - for (int i = 0; i < num_matrix; i++) { - int pad = i * row_inner * col_inner; - for (int j = 0; j < row_inner; j++) { - for (int k = 0; k < col_inner; k++) { - int index = index_to_row_major(j, k, row_inner, col_inner) + pad; - map[index] = mat[i].mat[j][k]; - } - } - } - - return map; -} - -/** - * Function cuda_convolution - * - * Returns a matrix of the convolution of the input matrix with the kernel - */ -void cuda_convolution(int* out_mat_rm, int* arr_mat_rm, int* kernel_rm, int row_eff, int col_eff, int kernel_row, int kernel_col, int curr_mat) { - // Calculate real row and column of input matrix. - int row = row_eff + kernel_row - 1; - int col = col_eff + kernel_col - 1; - - // Calculate padding target and output matrix. - int pad = curr_mat * row * col; - int pad_out = curr_mat * row_eff * col_eff; - - // For each element in input matrix that is not on the boundary, - for (int i = 0 ; i < row_eff; i++) { - for (int j = 0; j < col_eff; j++) { - // Convolution of the element with the kernel. - // Calculate the sum of the kernel and the input matrix. - int intermediate_sum = 0; - for (int k = 0; k < kernel_row; k++) { - for (int l = 0; l < kernel_col; l++) { - int index = index_to_row_major(i + k, j + l, row, col) + pad; - int kernel_index = index_to_row_major(k, l, kernel_row, kernel_col); - intermediate_sum += arr_mat_rm[index] * kernel_rm[kernel_index]; - // Print all i,j,k,l - // printf("i:%d, j:%d, k:%d, l:%d\n", i, j, k, l); - - } - } - // Store the sum in the output matrix. - out_mat_rm[index_to_row_major(i, j, row_eff, col_eff) + pad_out ] = intermediate_sum; - } - } -} - -__global__ void d_cuda_convolution(int* d_out_mat, int* arr_mat_rm, int* kernel_rm, int row_eff, int col_eff, int kernel_row, int kernel_col) { - // Calculate real row and column of input matrix. - int row = row_eff + kernel_row - 1; - int col = col_eff + kernel_col - 1; - - // Determine current matrix from block; - int curr_mat = blockIdx.y; - // Calculate padding target and output matrix. - int pad = curr_mat * row * col; - int pad_out = curr_mat * row_eff * col_eff; - - // Get i, and j from threadIdx - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int i, j; - d_row_major_to_index(tid, row_eff, col_eff, &i, &j); - - // Calculate element in input matrix that is not on the boundary, - if (i < row_eff && j < col_eff) { - int intermediate_sum = 0; - for (int k = 0; k < kernel_row; k++) { - for (int l = 0; l < kernel_col; l++) { - int index = d_index_to_row_major(i + k, j + l, row, col) + pad; - int kernel_index = d_index_to_row_major(k, l, kernel_row, kernel_col); - intermediate_sum += arr_mat_rm[index] * kernel_rm[kernel_index]; - } - } - d_out_mat[d_index_to_row_major(i, j, row_eff, col_eff) + pad_out] = intermediate_sum; - } -} - -/* - * Function d_get_matrix_datarange - * Set range between maximum and minimum - * for every matrix in out_mat to d_arr_range - * */ -__global__ void d_get_matrix_datarange(int* out_mat, int* d_arr_range, int row_eff, int col_eff) { - // Determine current matrix from block; - int curr_mat = blockIdx.y; - // Calculate padding output matrix and arr_range matrix. - int pad = curr_mat * row_eff * col_eff; - int pad_arr_range = curr_mat; - - // Get i, and j from threadIdx - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int i, j; - d_row_major_to_index(tid, row_eff, col_eff, &i, &j); - // Get datarange in every output matrix - if (i < row_eff && j < col_eff) { - int max = DATAMIN; - int min = DATAMAX; - for (int k = 0; k < row_eff; k++) { - for (int l = 0; l < col_eff; l++) { - int index = d_index_to_row_major(i + k, j + l, row_eff, col_eff) + pad; - - int el = out_mat[index]; - if (el > max) max = el; - if (el < min) min = el; - } - } - - d_arr_range[d_index_to_row_major(i, j, row_eff, col_eff) + pad_arr_range] = max-min; - } -} - -// main() driver -int main() { - // Time. - clock_t t; - t = clock(); - - int kernel_row, kernel_col, target_row, target_col, num_targets; - - // reads kernel's row and column and initalize kernel matrix from input - scanf("%d %d", &kernel_row, &kernel_col); - Matrix kernel = input_matrix(kernel_row, kernel_col); - - // reads number of target matrices and their dimensions. - // initialize array of matrices and array of data ranges (int) - scanf("%d %d %d", &num_targets, &target_row, &target_col); - Matrix* arr_mat = (Matrix*)malloc(num_targets * sizeof(Matrix)); - // int arr_range[num_targets]; - - // Calculate variable for cuda computing. - int a = (target_row-kernel_row+1) * (target_col-kernel_col+1); - int b = 1024; - int block_size = a/b + (a % b != 0); // ceil(a/b) - int threads_per_block = 1024; - int row_eff = target_row - kernel_row + 1; - int col_eff = target_col - kernel_col + 1; - - - // Allocate variable. - // rm means row-major. It's indicate matrix are in row-major order. - // Variable declaration. - int * arr_mat_rm=0, * d_arr_mat_rm=0, *out_mat_rm=0, *d_out_mat_rm=0, *kernel_rm=0, *d_kernel_rm=0, *arr_range=0, *d_arr_range=0; - int size_arr_mat, size_out_mat, size_kernel, size_arr_range; - cudaError err; - - // Allocate input matrix in device and host. - size_arr_mat = num_targets * target_row * target_col; - arr_mat_rm = (int*)malloc(sizeof(int) * size_arr_mat); - cudaMalloc((void **)&d_arr_mat_rm, sizeof(int) * size_arr_mat); - if (arr_mat_rm == 0 | d_arr_mat_rm == 0) { - printf("Error: Memory allocation failed for arr_mat.\n"); - return 1; - } - - // Allocate output matrix in device and host. - size_out_mat = num_targets * row_eff * col_eff; - out_mat_rm = (int*)malloc(sizeof(int) * size_out_mat); - cudaMalloc((void **)&d_out_mat_rm, sizeof(int) * size_out_mat); - if (out_mat_rm == 0 | d_out_mat_rm == 0) { - printf("Error: Memory allocation failed for out_mat.\n"); - return 1; - } - cudaMemset(d_out_mat_rm, 0, sizeof(int) * size_out_mat); - - - // Allocate kernel matrix in host. - size_kernel = kernel_row * kernel_col; - kernel_rm = (int*)malloc(sizeof(int) * size_kernel); - // Store kernel in row major form and allocate kernel for device. - kernel_rm = map_matrix(kernel.mat, kernel_row, kernel_col); - cudaMalloc((void **)&d_kernel_rm, sizeof(int) * size_kernel); - if (kernel_rm == 0 | d_kernel_rm == 0) { - printf("Error: Memory allocation failed for kernel.\n"); - return 1; - } - err = cudaMemcpy(d_kernel_rm, kernel_rm, sizeof(int) * size_kernel, cudaMemcpyHostToDevice); - if (err != cudaSuccess) { - printf("Error copy host to device: %s\n", cudaGetErrorString(err)); - return 1; - } - - - // Read each target matrix. - for (int i = 0; i < num_targets; i++) { - arr_mat[i] = input_matrix(target_row, target_col); - } - // Store each target matrix in row major form and allocate target matrix for device. - arr_mat_rm = list_matrix_object_to_rm(arr_mat, num_targets, target_row, target_col); - err = cudaMemcpy(d_arr_mat_rm, arr_mat_rm, sizeof(int) * size_arr_mat, cudaMemcpyHostToDevice); - if (err != cudaSuccess) { - printf("Error copy host to device: %s\n", cudaGetErrorString(err)); - return 1; - } - - // For each target matrix, compute their convolution matrices. - dim3 grid, block; - grid.x = block_size; - grid.y = num_targets; - block.x = threads_per_block; - d_cuda_convolution<<<grid, block>>>(d_out_mat_rm, d_arr_mat_rm, d_kernel_rm, row_eff, col_eff, kernel_row, kernel_col); - err = cudaMemcpy(out_mat_rm, d_out_mat_rm, sizeof(int) * size_out_mat, cudaMemcpyDeviceToHost); - if (err != cudaSuccess) { - printf("Error copy device to host: %s\n", cudaGetErrorString(err)); - } - // for (int i = 0; i < num_targets; i++){ - // cuda_convolution(out_mat_rm, arr_mat_rm, kernel_rm, row_eff, col_eff, kernel_row, kernel_col, i); - // } - // arr_mat = rm_to_list_matrix_object(out_mat_rm, num_targets, row_eff*col_eff, row_eff, col_eff); - - // Allocate arr_range matrix in device and host. - size_arr_range = num_targets; - arr_range = (int*)malloc(sizeof(int) * size_arr_range); - cudaMalloc((void **)&d_arr_range, sizeof(int) * size_arr_range); - if (arr_range == 0 | d_arr_range == 0) { - printf("Error: Memory allocation failed for arr_range.\n"); - return 1; - } - cudaMemset(d_arr_range, 0, sizeof(int) * size_arr_range); - - grid.x = 1; - grid.y = num_targets; - block.x = 1; - d_get_matrix_datarange<<<grid, block>>>(d_out_mat_rm, d_arr_range, row_eff, col_eff); - err = cudaMemcpy(arr_range, d_arr_range, sizeof(int) * size_arr_range, cudaMemcpyDeviceToHost); - if (err != cudaSuccess) { - printf("Error copy device to host 2: %s\n", cudaGetErrorString(err)); - } - - // sort the data range array - bitonic_sort(arr_range, num_targets); - - int median = get_median(arr_range, num_targets); - int floored_mean = get_floored_mean(arr_range, num_targets); - - // print the min, max, median, and floored mean of data range array - printf("%d\n%d\n%d\n%d\n", - arr_range[0], - arr_range[num_targets - 1], - median, - floored_mean); - - // Print execution time in seconds. - t = clock() - t; - printf("\nRuntime: %f s\n", ((float)t) / CLOCKS_PER_SEC); - - // Free cuda Memory. - cudaFree(d_arr_mat_rm); - cudaFree(d_out_mat_rm); - cudaFree(d_kernel_rm); - return 0; -}