diff --git a/result/K01-03-TC1_parallel.txt b/result/K01-03-TC1_parallel.txt index 621341380641e8b0258d72fc13ca768ac5a89874..adfa022b810f199d3098dcaa2f4ec2e70267c0cf 100644 --- a/result/K01-03-TC1_parallel.txt +++ b/result/K01-03-TC1_parallel.txt @@ -3,4 +3,4 @@ 10114197 10323010 -Runtime: 0.213774 s +Runtime: 0.160294 s diff --git a/result/K01-03-TC1_serial.txt b/result/K01-03-TC1_serial.txt index 8be64112235231a0e6990ea1479b9cbd3807f98b..d985c9557acdd6f1c4f0af8ffd5cd94005b0f05d 100644 --- a/result/K01-03-TC1_serial.txt +++ b/result/K01-03-TC1_serial.txt @@ -4,4 +4,4 @@ yey 10114197 10323010 -Runtime: 0.006773 s +Runtime: 0.016049 s diff --git a/result/K01-03-TC2_parallel.txt b/result/K01-03-TC2_parallel.txt index 43987c2572fa70c6a2e24590015fca773d289fa6..5bb2408e1415c74b18d449148f66d05b4ff4534c 100644 --- a/result/K01-03-TC2_parallel.txt +++ b/result/K01-03-TC2_parallel.txt @@ -3,4 +3,4 @@ 37739803 38222937 -Runtime: 0.919108 s +Runtime: 0.960191 s diff --git a/result/K01-03-TC2_serial.txt b/result/K01-03-TC2_serial.txt index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..c794bb80d78e0c28ec6fce6b2eb0d20767e84af8 100644 --- a/result/K01-03-TC2_serial.txt +++ b/result/K01-03-TC2_serial.txt @@ -0,0 +1,6 @@ +35064588 +46265294 +37739803 +38222937 + +Runtime: 0.742690 s diff --git a/result/K01-03-TC3_parallel.txt b/result/K01-03-TC3_parallel.txt index a3f236bfec62bd72f46dfd57d491c461fc04f69d..c19d3246e0be37fe7e3a1d5112e3fd2be183d2ef 100644 --- a/result/K01-03-TC3_parallel.txt +++ b/result/K01-03-TC3_parallel.txt @@ -3,4 +3,4 @@ 23198319 23380111 -Runtime: 1.026466 s +Runtime: 1.101716 s diff --git a/result/K01-03-TC3_serial.txt b/result/K01-03-TC3_serial.txt index aefabd33e10fbd433489260714e3c183ede9e3dd..8edba32113e491f17d87a3f1179d4cdb8be8df61 100644 --- a/result/K01-03-TC3_serial.txt +++ b/result/K01-03-TC3_serial.txt @@ -3,4 +3,4 @@ 23198319 23380111 -Runtime: 0.893097 s +Runtime: 0.884338 s diff --git a/result/K01-03-TC4_parallel.txt b/result/K01-03-TC4_parallel.txt 
index 0e9c566e434823660b09a9f44104a1e8a9441ff7..9ee11a5f49187c982513d57daff3b38720bf4ef1 100644 --- a/result/K01-03-TC4_parallel.txt +++ b/result/K01-03-TC4_parallel.txt @@ -3,4 +3,4 @@ 51451884 51774352 -Runtime: 10.590979 s +Runtime: 10.189295 s diff --git a/result/K01-03-TC4_serial.txt b/result/K01-03-TC4_serial.txt index b874af3d99cc848fcb357c88c52736bc43cdae2d..e228912dc4e97c068a2f9d7296b6457992d9c2e9 100644 --- a/result/K01-03-TC4_serial.txt +++ b/result/K01-03-TC4_serial.txt @@ -4,4 +4,4 @@ yey 51451884 51774352 -Runtime: 9.726326 s +Runtime: 9.711713 s diff --git a/src/lib/bitonic_sort.cu b/src/lib/bitonic_sort.cu index 90a2735b25725af46aebe01fd65ce5d8b61b916a..7b8c1816c29580c4435b1a3bfccf7e373dd731ad 100644 --- a/src/lib/bitonic_sort.cu +++ b/src/lib/bitonic_sort.cu @@ -7,7 +7,7 @@ #include <time.h> #include "bitonic_sort.cuh" -// PROBLEM: THREADS DAN BLOCKS HARUS KELIPATAN 2 +// PROBLEM: THREADS DAN BLOCKS HARUS KELIPATAN PANGKAT 2 // SOLUTION: PADDING DATA KE DALAM BUFFER DENGAN NILAI MAX_INT void array_fill(int *arr, int length) @@ -94,6 +94,24 @@ void copy_padding(int *dest, int *src, int length, int length_buffer) padding_array(dest, length, length_buffer); } +/** + * Get the number of minimum blocks as a power of 2. + * + * @param length - the number of elements inside the array. + * @param num_threads - the number of threads. + * @return int - the number of minimum blocks needed. + */ +int minimum_blocks(int length, int num_threads) { + int num_blocks = 1; + + // Increase the number of blocks 2 times, so it will be a power of 2. + while (num_blocks * num_threads < length) { + num_blocks *= 2; + } + + return num_blocks; +} + /** * Do the bitnoic sort step by step. * @@ -104,10 +122,11 @@ void copy_padding(int *dest, int *src, int length, int length_buffer) __global__ void bitonic_sort_step(int *d_arr, int i, int j) { // The array index and its patner. 
- unsigned int idx, patner; - idx = threadIdx.x + blockDim.x * blockIdx.x; + int idx, patner; - // Get the patner. + // The thread index. + idx = threadIdx.x + blockDim.x * blockIdx.x; + // The thread index of the patner. patner = idx ^ j; // Sort the array by threads with the lowest idx. @@ -137,15 +156,9 @@ void bitonic_sort(int *h_arr, int length) { // Initialize the constants variable. const int threads = 1024; - const int blocks = 8; + const int blocks = minimum_blocks(length, threads); const int buffer_length = threads * blocks; - // Check that the buffer can hold the array. - if (length > buffer_length) { - printf("[ERROR] The array length is %d, but the buffer length is %d.\n", length, buffer_length); - return; - } - // Initialize the memory size of the array. size_t size = length * sizeof(int); size_t buffer_size = buffer_length * sizeof(int); @@ -180,7 +193,7 @@ void bitonic_sort(int *h_arr, int length) int driver(void) { clock_t start, stop; - int length = 1000000; + int length = 109725; int *values = (int*) malloc( length * sizeof(int)); array_fill(values, length); diff --git a/src/lib/brick_sort.cu b/src/lib/brick_sort.cu index 8ae6b15193a49c2b84868a21e33a1b2adb075965..08d9b64a775db81e4d8c725e81fd223ee510ba1a 100644 --- a/src/lib/brick_sort.cu +++ b/src/lib/brick_sort.cu @@ -2,8 +2,8 @@ #include <stdio.h> #include <time.h> -#define THREADS 512 -#define NUM_VALS 512*5 +#define THREADS 1024 +#define NUM_VALS 100000 // PROBLEM: BLOCKS GA BOLEH LEBIH DARI 1. 
@@ -55,7 +55,7 @@ __device__ void swap(int *arr, int i, int j) } -__global__ void brick_sort(int *d_arr, int length) +__global__ void brick_sort_even(int *d_arr, int length) { int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -64,8 +64,6 @@ __global__ void brick_sort(int *d_arr, int length) return; } - __syncthreads(); - if (i % 2 == 0) { if (d_arr[i] > d_arr[i + 1]) @@ -74,7 +72,16 @@ __global__ void brick_sort(int *d_arr, int length) } } - __syncthreads(); +} + +__global__ void brick_sort_odd(int *d_arr, int length) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + + if (i >= length - 1) + { + return; + } if (i % 2 != 0) { @@ -83,42 +90,42 @@ __global__ void brick_sort(int *d_arr, int length) swap(d_arr, i, i + 1); } } - - __syncthreads(); } - -int driver(void) -{ - clock_t start, stop; - size_t size = NUM_VALS * sizeof(int); +void brick_sort(int *arr, int length) { int blocks = (NUM_VALS + THREADS - 1) / THREADS; - printf("Blocks: %d\n", blocks); - - int *arr = (int *)malloc(NUM_VALS * sizeof(int)); - array_fill(arr, NUM_VALS); - - int checksum1 = checksum(arr, NUM_VALS); - - start = clock(); + size_t size = length * sizeof(int); int *d_arr; - cudaMalloc((void **)&d_arr, size); cudaMemcpy(d_arr, arr, size, cudaMemcpyHostToDevice); for (int i = 0; i < NUM_VALS / 2; ++i) { - brick_sort<<<blocks, THREADS>>>(d_arr, NUM_VALS); + brick_sort_odd<<<blocks, THREADS>>>(d_arr, NUM_VALS); + brick_sort_even<<<blocks, THREADS>>>(d_arr, NUM_VALS); } cudaMemcpy(arr, d_arr, size, cudaMemcpyDeviceToHost); cudaFree(d_arr); +} - int checksum2 = checksum(arr, NUM_VALS); +int driver(void) +{ + clock_t start, stop; + + int *arr = (int *)malloc(NUM_VALS * sizeof(int)); + array_fill(arr, NUM_VALS); + + int checksum1 = checksum(arr, NUM_VALS); + + start = clock(); + brick_sort(arr, NUM_VALS); stop = clock(); + int checksum2 = checksum(arr, NUM_VALS); + print_elapsed(start, stop); bool passed_sort = true; diff --git a/src/lib/brick_sort.cuh b/src/lib/brick_sort.cuh index 
a923b784d8cb252e08b6d03813db1e717b65fcb7..bb088368e8417d8a4f1e99a50d84db97d521dd24 100644
--- a/src/lib/brick_sort.cuh
+++ b/src/lib/brick_sort.cuh
@@ -1,6 +1,6 @@
 #ifndef _BRICK_SORT_H_
 #define _BRICK_SORT_H_
 
-__global__ void brick_sort(int *d_arr, int length);
+void brick_sort(int *d_arr, int length);
 
 #endif
\ No newline at end of file
diff --git a/src/parallel_collab.cu b/src/parallel_collab.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f214d43ebbe9e28218fc3992e82163c6b9b4e981
--- /dev/null
+++ b/src/parallel_collab.cu
@@ -0,0 +1,549 @@
+// parallel.cu
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <math.h>
+
+#define NMAX 100
+#define DATAMAX 1000
+#define DATAMIN -1000
+
+// Threads per block for the convolution kernel launch (a multiple of 32).
+#define THREADS_PER_BLOCK 256
+
+/*
+ * Macro CUDA_CHECK
+ *
+ * Evaluates a CUDA runtime call and terminates the program with
+ * file and line context when the call does not return cudaSuccess
+ * */
+#define CUDA_CHECK(call) \
+    do { \
+        cudaError_t cuda_check_err_ = (call); \
+        if (cuda_check_err_ != cudaSuccess) { \
+            fprintf(stderr, "[ERROR] CUDA failure at %s:%d: %s\n", __FILE__, __LINE__, \
+                    cudaGetErrorString(cuda_check_err_)); \
+            exit(EXIT_FAILURE); \
+        } \
+    } while (0)
+
+
+/*
+ * Struct Matrix
+ *
+ * Matrix representation consists of matrix data
+ * and effective dimensions
+ * */
+typedef struct Matrix {
+    int mat[NMAX][NMAX]; // Matrix cells
+    int row_eff;         // Matrix effective row
+    int col_eff;         // Matrix effective column
+} Matrix;
+
+
+/*
+ * Procedure fail
+ *
+ * Prints message to stderr and terminates the program
+ * Used for unrecoverable input and allocation errors
+ * */
+static void fail(const char *message) {
+    fprintf(stderr, "[ERROR] %s\n", message);
+    exit(EXIT_FAILURE);
+}
+
+
+/*
+ * Procedure init_matrix
+ *
+ * Initializing newly allocated matrix
+ * Setting all data to 0 and effective dimensions according
+ * to nrow and ncol
+ * */
+void init_matrix(Matrix *m, int nrow, int ncol) {
+    m->row_eff = nrow;
+    m->col_eff = ncol;
+
+    for (int i = 0; i < m->row_eff; i++) {
+        for (int j = 0; j < m->col_eff; j++) {
+            m->mat[i][j] = 0;
+        }
+    }
+}
+
+
+/*
+ * Function input_matrix
+ *
+ * Returns a matrix with values from stdin input
+ * Terminates the program when stdin runs out of integers, because
+ * a partially read matrix would silently skew every later result
+ * */
+Matrix input_matrix(int nrow, int ncol) {
+    Matrix input;
+    init_matrix(&input, nrow, ncol);
+
+    for (int i = 0; i < nrow; i++) {
+        for (int j = 0; j < ncol; j++) {
+            if (scanf("%d", &input.mat[i][j]) != 1) {
+                fail("Failed to read a matrix element from stdin.");
+            }
+        }
+    }
+
+    return input;
+}
+
+
+/*
+ * Procedure print_matrix
+ *
+ * Print matrix data
+ * */
+void print_matrix(Matrix *m) {
+    for (int i = 0; i < m->row_eff; i++) {
+        for (int j = 0; j < m->col_eff; j++) {
+            printf("%d ", m->mat[i][j]);
+        }
+        printf("\n");
+    }
+}
+
+
+/*
+ * Function get_matrix_datarange
+ *
+ * Returns the range between maximum and minimum
+ * element of a matrix
+ *
+ * NOTE(review): max and min are seeded with DATAMIN and DATAMAX instead
+ * of the first element, so the value is the true range only when the
+ * largest element is >= DATAMIN and the smallest is <= DATAMAX
+ * Left unchanged on purpose; presumably the reference results rely on
+ * the same seeds - confirm before changing
+ * */
+int get_matrix_datarange(Matrix *m) {
+    int max = DATAMIN;
+    int min = DATAMAX;
+    for (int i = 0; i < m->row_eff; i++) {
+        for (int j = 0; j < m->col_eff; j++) {
+            int el = m->mat[i][j];
+            if (el > max) max = el;
+            if (el < min) min = el;
+        }
+    }
+
+    return max - min;
+}
+
+
+/*
+ * Function supression_op
+ *
+ * Returns the sum of intermediate value of special multiplication
+ * operation where kernel[0][0] corresponds to target[row][col]
+ * */
+int supression_op(Matrix *kernel, Matrix *target, int row, int col) {
+    int intermediate_sum = 0;
+    for (int i = 0; i < kernel->row_eff; i++) {
+        for (int j = 0; j < kernel->col_eff; j++) {
+            intermediate_sum += kernel->mat[i][j] * target->mat[row + i][col + j];
+        }
+    }
+
+    return intermediate_sum;
+}
+
+
+/*
+ * Function convolution
+ *
+ * Return the output matrix of convolution operation
+ * between kernel and target
+ * */
+Matrix convolution(Matrix *kernel, Matrix *target) {
+    Matrix out;
+    int out_row_eff = target->row_eff - kernel->row_eff + 1;
+    int out_col_eff = target->col_eff - kernel->col_eff + 1;
+
+    init_matrix(&out, out_row_eff, out_col_eff);
+
+    for (int i = 0; i < out.row_eff; i++) {
+        for (int j = 0; j < out.col_eff; j++) {
+            out.mat[i][j] = supression_op(kernel, target, i, j);
+        }
+    }
+
+    return out;
+}
+
+
+/*
+ * Procedure merge_array
+ *
+ * Merges two subarrays of n with n[left..mid] and n[mid+1..right]
+ * to n itself, with n now ordered ascendingly
+ * */
+void merge_array(int *n, int left, int mid, int right) {
+    int n_left = mid - left + 1;
+    int n_right = right - mid;
+    int iter_left = 0, iter_right = 0, iter_merged = left;
+
+    // Nothing to merge for an empty or inverted range.
+    if (n_left < 0 || n_right < 0 || n_left + n_right == 0) return;
+
+    // One heap block holds both halves. Variable-length arrays are not
+    // standard C++ and would overflow the stack for large ranges.
+    int *arr_left = (int*) malloc(sizeof(int) * (size_t)(n_left + n_right));
+    if (arr_left == NULL) fail("Out of memory while merging.");
+    int *arr_right = arr_left + n_left;
+
+    for (int i = 0; i < n_left; i++) {
+        arr_left[i] = n[i + left];
+    }
+
+    for (int i = 0; i < n_right; i++) {
+        arr_right[i] = n[i + mid + 1];
+    }
+
+    while (iter_left < n_left && iter_right < n_right) {
+        if (arr_left[iter_left] <= arr_right[iter_right]) {
+            n[iter_merged] = arr_left[iter_left++];
+        } else {
+            n[iter_merged] = arr_right[iter_right++];
+        }
+        iter_merged++;
+    }
+
+    while (iter_left < n_left) {
+        n[iter_merged++] = arr_left[iter_left++];
+    }
+    while (iter_right < n_right) {
+        n[iter_merged++] = arr_right[iter_right++];
+    }
+
+    free(arr_left);
+}
+
+
+/*
+ * Procedure merge_sort
+ *
+ * Sorts array n with merge sort algorithm
+ * */
+void merge_sort(int *n, int left, int right) {
+    if (left < right) {
+        int mid = left + (right - left) / 2;
+
+        merge_sort(n, left, mid);
+        merge_sort(n, mid + 1, right);
+
+        merge_array(n, left, mid, right);
+    }
+}
+
+
+/*
+ * Procedure print_array
+ *
+ * Prints all elements of array n of size to stdout
+ * */
+void print_array(int *n, int size) {
+    for (int i = 0; i < size; i++ ) printf("%d ", n[i]);
+    printf("\n");
+}
+
+
+/*
+ * Function get_median
+ *
+ * Returns median of array n of length
+ * n must already be sorted; returns 0 when length is not positive
+ * */
+int get_median(int *n, int length) {
+    if (length <= 0) return 0;
+
+    int mid = length / 2;
+    if (length & 1) return n[mid];
+
+    // Widen before adding so the sum of two large values cannot overflow.
+    return (int)(((long long)n[mid - 1] + n[mid]) / 2);
+}
+
+
+/*
+ * Function get_floored_mean
+ *
+ * Returns floored mean from an array of integers
+ * Integer division truncates toward zero, which equals the floor for
+ * non-negative sums; returns 0 when length is not positive
+ * */
+long get_floored_mean(int *n, int length) {
+    if (length <= 0) return 0;
+
+    long sum = 0;
+    for (int i = 0; i < length; i++) {
+        sum += n[i];
+    }
+
+    return sum / length;
+}
+
+/**
+ * Function index_to_row_major
+ *
+ * Returns the index of a matrix element in row-major order
+ * row_eff is unused; it is kept for a symmetric signature
+ */
+int index_to_row_major(int row, int col, int row_eff, int col_eff) {
+    return row * col_eff + col;
+}
+
+// Device-side twin of index_to_row_major.
+__device__ int d_index_to_row_major(int row, int col, int row_eff, int col_eff) {
+    return row * col_eff + col;
+}
+
+/**
+ * Function row_major_to_index
+ *
+ * Returns the row and column of a matrix element in row-major order
+ * col_eff must be non-zero; row_eff is unused
+ */
+void row_major_to_index(int index, int row_eff, int col_eff, int *row, int *col) {
+    *row = index / col_eff;
+    *col = index % col_eff;
+}
+
+// Device-side twin of row_major_to_index.
+__device__ void d_row_major_to_index(int index, int row_eff, int col_eff, int *row, int *col) {
+    *row = index / col_eff;
+    *col = index % col_eff;
+}
+
+/**
+ * Function map_matrix
+ *
+ * Returns a row major matrix of the input matrix.
+ * The returned buffer is heap allocated and owned by the caller (free()).
+ **/
+int* map_matrix(int mat[][NMAX], int row, int col) {
+    int* map = (int*) malloc(sizeof(int) * row * col);
+    if (map == NULL && row > 0 && col > 0) fail("Out of memory while flattening a matrix.");
+
+    for (int i = 0; i < row; i++) {
+        for (int j = 0; j < col; j++) {
+            map[index_to_row_major(i, j, row, col)] = mat[i][j];
+        }
+    }
+    return map;
+}
+
+/**
+ * Function reverse_map_matrix
+ *
+ * Returns a matrix of the input row major matrix.
+ * Every row and the row table are heap allocated; the caller frees them.
+ */
+int** reverse_map_matrix(int* map, int row, int col) {
+    int** mat = (int**) malloc(sizeof(int*) * row);
+    if (mat == NULL && row > 0) fail("Out of memory while rebuilding a matrix.");
+
+    for (int i = 0; i < row; i++) {
+        mat[i] = (int*) malloc(sizeof(int) * col);
+        if (mat[i] == NULL && col > 0) fail("Out of memory while rebuilding a matrix.");
+
+        for (int j = 0; j < col; j++) {
+            mat[i][j] = map[index_to_row_major(i, j, row, col)];
+        }
+    }
+    return mat;
+}
+
+/**
+ * Function rm_to_matrix_object
+ *
+ * Return Matrix struct of row major matrix
+ * map is read on the host, so it must be a host pointer
+ */
+Matrix rm_to_matrix_object(int* map, int row, int col) {
+    Matrix mat;
+    init_matrix(&mat, row, col);
+    for (int i = 0; i < row; i++) {
+        for (int j = 0; j < col; j++) {
+            mat.mat[i][j] = map[index_to_row_major(i, j, row, col)];
+        }
+    }
+    return mat;
+}
+
+/**
+ * Function cuda_convolution
+ *
+ * Host (CPU) reference of the convolution on row major buffers.
+ * Writes the row_eff x col_eff result into d_out_mat; every pointer,
+ * d_out_mat included, is dereferenced on the host.
+ */
+void cuda_convolution(int* d_out_mat, int* arr_mat_rm, int* kernel_rm, int row_eff, int col_eff, int kernel_row, int kernel_col) {
+    // Calculate real row and column of input matrix.
+    int row = row_eff + kernel_row - 1;
+    int col = col_eff + kernel_col - 1;
+
+    // For each element in input matrix that is not on the boundary,
+    for (int i = 0; i < row_eff; i++) {
+        for (int j = 0; j < col_eff; j++) {
+            // Convolution of the element with the kernel.
+            // Calculate the sum of the kernel and the input matrix.
+            int intermediate_sum = 0;
+            for (int k = 0; k < kernel_row; k++) {
+                for (int l = 0; l < kernel_col; l++) {
+                    int index = index_to_row_major(i + k, j + l, row, col);
+                    int kernel_index = index_to_row_major(k, l, kernel_row, kernel_col);
+                    intermediate_sum += arr_mat_rm[index] * kernel_rm[kernel_index];
+                }
+            }
+            // Store the sum in the output matrix.
+            d_out_mat[index_to_row_major(i, j, row_eff, col_eff)] = intermediate_sum;
+        }
+    }
+}
+
+/**
+ * Kernel d_cuda_convolution
+ *
+ * Device version of cuda_convolution: one thread per output element.
+ * Expects a 1D grid of 1D blocks with at least row_eff * col_eff threads
+ * in total; surplus threads return immediately. No shared memory is used.
+ * All three pointers must refer to device memory.
+ */
+__global__ void d_cuda_convolution(int* d_out_mat, int* arr_mat_rm, int* kernel_rm, int row_eff, int col_eff, int kernel_row, int kernel_col) {
+    // Flat output index handled by this thread.
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= row_eff * col_eff) {
+        return;
+    }
+
+    // Calculate real row and column of input matrix.
+    int row = row_eff + kernel_row - 1;
+    int col = col_eff + kernel_col - 1;
+
+    // Get i, and j from the flat index.
+    int i, j;
+    d_row_major_to_index(tid, row_eff, col_eff, &i, &j);
+
+    int intermediate_sum = 0;
+    for (int k = 0; k < kernel_row; k++) {
+        for (int l = 0; l < kernel_col; l++) {
+            int index = d_index_to_row_major(i + k, j + l, row, col);
+            int kernel_index = d_index_to_row_major(k, l, kernel_row, kernel_col);
+            intermediate_sum += arr_mat_rm[index] * kernel_rm[kernel_index];
+        }
+    }
+    d_out_mat[tid] = intermediate_sum;
+}
+
+
+
+// main() driver
+int main() {
+    // Time.
+    clock_t t;
+    t = clock();
+
+    int kernel_row, kernel_col, target_row, target_col, num_targets;
+
+    // reads kernel's row and column and initalize kernel matrix from input
+    if (scanf("%d %d", &kernel_row, &kernel_col) != 2) {
+        fail("Failed to read the kernel dimensions.");
+    }
+    // Matrix storage is a fixed NMAX x NMAX array, so reject anything larger.
+    if (kernel_row < 1 || kernel_row > NMAX || kernel_col < 1 || kernel_col > NMAX) {
+        fail("Kernel dimensions must be between 1 and NMAX.");
+    }
+    Matrix kernel = input_matrix(kernel_row, kernel_col);
+
+    // reads number of target matrices and their dimensions.
+    if (scanf("%d %d %d", &num_targets, &target_row, &target_col) != 3) {
+        fail("Failed to read the target count and dimensions.");
+    }
+    if (num_targets < 1) {
+        fail("The number of target matrices must be positive.");
+    }
+    if (target_row < kernel_row || target_row > NMAX || target_col < kernel_col || target_col > NMAX) {
+        fail("Target dimensions must be between the kernel dimensions and NMAX.");
+    }
+
+    // Calculate variable for cuda computing.
+    int row_eff = target_row - kernel_row + 1;
+    int col_eff = target_col - kernel_col + 1;
+    int out_cells = row_eff * col_eff;
+    int num_blocks = (out_cells + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; // ceil
+    size_t kernel_bytes = sizeof(int) * kernel_row * kernel_col;
+    size_t target_bytes = sizeof(int) * target_row * target_col;
+    size_t out_bytes = sizeof(int) * out_cells;
+
+    // Host buffers: one data range per target and one reusable output matrix.
+    int *arr_range = (int*) malloc(sizeof(int) * num_targets);
+    int *h_out_mat = (int*) malloc(out_bytes);
+    if (arr_range == NULL || h_out_mat == NULL) {
+        fail("Out of memory while allocating host buffers.");
+    }
+
+    // Store kernel in row major form.
+    int *kernel_rm = map_matrix(kernel.mat, kernel_row, kernel_col);
+
+    // Device buffers are allocated once and reused for every target; the
+    // kernel may only dereference device pointers.
+    int *d_kernel_rm, *d_target_rm, *d_out_mat;
+    CUDA_CHECK(cudaMalloc((void**)&d_kernel_rm, kernel_bytes));
+    CUDA_CHECK(cudaMalloc((void**)&d_target_rm, target_bytes));
+    CUDA_CHECK(cudaMalloc((void**)&d_out_mat, out_bytes));
+    CUDA_CHECK(cudaMemcpy(d_kernel_rm, kernel_rm, kernel_bytes, cudaMemcpyHostToDevice));
+    free(kernel_rm);
+
+    // read each target matrix, convolve it on the device, and compute its data range
+    for (int i = 0; i < num_targets; i++) {
+        Matrix target = input_matrix(target_row, target_col);
+        int *target_rm = map_matrix(target.mat, target_row, target_col);
+        CUDA_CHECK(cudaMemcpy(d_target_rm, target_rm, target_bytes, cudaMemcpyHostToDevice));
+        free(target_rm);
+
+        d_cuda_convolution<<<num_blocks, THREADS_PER_BLOCK>>>(d_out_mat, d_target_rm, d_kernel_rm, row_eff, col_eff, kernel_row, kernel_col);
+        // Launch-configuration errors are only reported through cudaGetLastError.
+        CUDA_CHECK(cudaGetLastError());
+
+        // Blocking copy on the default stream: it waits for the kernel and
+        // surfaces any error raised while it ran.
+        CUDA_CHECK(cudaMemcpy(h_out_mat, d_out_mat, out_bytes, cudaMemcpyDeviceToHost));
+
+        Matrix out = rm_to_matrix_object(h_out_mat, row_eff, col_eff);
+        arr_range[i] = get_matrix_datarange(&out);
+    }
+
+    // Free cuda memory
+    CUDA_CHECK(cudaFree(d_kernel_rm));
+    CUDA_CHECK(cudaFree(d_target_rm));
+    CUDA_CHECK(cudaFree(d_out_mat));
+    free(h_out_mat);
+
+    // sort the data range array
+    merge_sort(arr_range, 0, num_targets - 1);
+
+    int median = get_median(arr_range, num_targets);
+    int floored_mean = get_floored_mean(arr_range, num_targets);
+
+    // print the min, max, median, and floored mean of data range array
+    printf("%d\n%d\n%d\n%d\n",
+            arr_range[0],
+            arr_range[num_targets - 1],
+            median,
+            floored_mean);
+    free(arr_range);
+
+    // Print execution time in seconds.
+    t = clock() - t;
+    printf("\nRuntime: %f s\n", ((float)t) / CLOCKS_PER_SEC);
+
+    return 0;
+}
\ No newline at end of file