From 7d804fd83116bbfffb4b7b1b4ddc19d14778ecf1 Mon Sep 17 00:00:00 2001
From: azharfatrr <m.azhar.faturahman@gmail.com>
Date: Mon, 14 Mar 2022 13:26:04 +0700
Subject: [PATCH] feat: add sorting algorithm

---
 .gitignore                |   2 +
 algorithm/bitonic_sort.cu | 213 ++++++++++++++++++++++++++++++++++++++
 algorithm/brick_sort.cu   | 171 ++++++++++++++++++++++++++++++
 3 files changed, 386 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 algorithm/bitonic_sort.cu
 create mode 100644 algorithm/brick_sort.cu

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0c2595e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+# Binary file
+/bin
\ No newline at end of file
diff --git a/algorithm/bitonic_sort.cu b/algorithm/bitonic_sort.cu
new file mode 100644
index 0000000..4b03314
--- /dev/null
+++ b/algorithm/bitonic_sort.cu
@@ -0,0 +1,213 @@
+/*
+ * Parallel bitonic sort using CUDA.
+ */
+
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+// Bitonic sort needs a power-of-two element count. Any other length is
+// handled by padding a scratch buffer with INT_MAX up to the next power of
+// two; the padding sorts to the tail and is never copied back to the caller.
+
+/* Upper bound on threads per block; each thread owns one buffer element. */
+#define THREADS 1024
+#define NUM_VALS 1000
+
+/* Abort with a readable message when a CUDA runtime call fails. */
+#define CUDA_CHECK(call)                                                  \
+    do {                                                                  \
+        cudaError_t err_ = (call);                                        \
+        if (err_ != cudaSuccess) {                                        \
+            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
+                    cudaGetErrorString(err_));                            \
+            exit(EXIT_FAILURE);                                           \
+        }                                                                 \
+    } while (0)
+
+void print_elapsed(clock_t start, clock_t stop)
+{
+    double elapsed = ((double) (stop - start)) / CLOCKS_PER_SEC;
+    printf("Elapsed time: %.3fs\n", elapsed);
+}
+
+/* Order-independent checksum; unsigned so that wrap-around is well defined. */
+unsigned int checksum(const int *arr, int size)
+{
+    unsigned int sum = 0;
+    for (int i = 0; i < size; i++)
+        sum += (unsigned int) arr[i];
+    return sum;
+}
+
+/* Smallest power of two >= n (1 for n <= 1). Requires n <= 2^30. */
+int ceil_power_of_two(int n)
+{
+    int i = 1;
+    while (i < n) {
+        i = i << 1;
+    }
+    return i;
+}
+
+void array_print(int *arr, int length)
+{
+    int i;
+    for (i = 0; i < length; ++i) {
+        printf("%d ", arr[i]);
+    }
+    printf("\n");
+}
+
+void array_fill(int *arr, int length)
+{
+    srand(time(NULL));
+    int i;
+    for (i = 0; i < length; ++i) {
+        arr[i] = rand();
+    }
+}
+
+/* Fill arr[length .. size-1] with INT_MAX so the padding sorts last. */
+void pad_array(int *arr, int length, int size)
+{
+    int i;
+    for (i = length; i < size; i++)
+    {
+        arr[i] = INT_MAX;
+    }
+}
+
+/* Copy src[0 .. length-1] into dest and pad the rest of dest up to size. */
+void copy_and_pad(int *dest, int *src, int length, int size)
+{
+    int i;
+    for (i = 0; i < length; i++)
+    {
+        dest[i] = src[i];
+    }
+    pad_array(dest, length, size);
+}
+
+/*
+ * One compare-exchange step of the bitonic network.
+ *
+ * Expects a 1-D launch with at least n threads, n being a power of two.
+ * Element i is paired with i ^ j and the lower index of each pair does the
+ * work. Bit k of i picks the direction: clear sorts the pair ascending, set
+ * sorts it descending.
+ */
+__global__ void bitonic_sort_step(int *dev_values, int j, int k, int n)
+{
+    unsigned int i = threadIdx.x + blockDim.x * blockIdx.x;
+    unsigned int ixj = i ^ j;
+
+    /* Skip surplus threads and the upper partner of every pair. */
+    if (i >= (unsigned int) n || ixj <= i)
+        return;
+
+    int a = dev_values[i];
+    int b = dev_values[ixj];
+    bool ascending = (i & k) == 0;
+
+    if (ascending ? (a > b) : (a < b)) {
+        dev_values[i] = b;
+        dev_values[ixj] = a;
+    }
+}
+
+/* Sort values[0 .. NUM_VALS-1] ascending, in place, on the GPU. */
+void bitonic_sort(int *values)
+{
+    /* Size the launch from the data instead of a fixed BLOCKS * THREADS. */
+    int buffer_length = ceil_power_of_two(NUM_VALS);
+    int threads = buffer_length < THREADS ? buffer_length : THREADS;
+    int blocks = buffer_length / threads;
+
+    size_t size = buffer_length * sizeof(int);
+    size_t original_size = NUM_VALS * sizeof(int);
+
+    // Copy and pad the values.
+    int *h_values = (int *)malloc(size);
+    if (h_values == NULL) {
+        fprintf(stderr, "Out of host memory\n");
+        exit(EXIT_FAILURE);
+    }
+    copy_and_pad(h_values, values, NUM_VALS, buffer_length);
+
+    // Allocate device memory.
+    int *dev_values;
+    CUDA_CHECK(cudaMalloc((void**) &dev_values, size));
+    CUDA_CHECK(cudaMemcpy(dev_values, h_values, size, cudaMemcpyHostToDevice));
+
+    // Sort the values.
+    int j, k;
+    // Major step.
+    for (k = 2; k <= buffer_length; k <<= 1) {
+        // Minor step.
+        for (j = k >> 1; j > 0; j = j >> 1) {
+            bitonic_sort_step<<<blocks, threads>>>(dev_values, j, k, buffer_length);
+            CUDA_CHECK(cudaGetLastError());
+        }
+    }
+
+    // Copy the values back; the blocking copy also waits for the kernels.
+    CUDA_CHECK(cudaMemcpy(values, dev_values, original_size, cudaMemcpyDeviceToHost));
+
+    // Free device and host scratch memory.
+    CUDA_CHECK(cudaFree(dev_values));
+    free(h_values);
+}
+
+int main(void)
+{
+    clock_t start, stop;
+
+    int *values = (int*) malloc( NUM_VALS * sizeof(int));
+    if (values == NULL) {
+        fprintf(stderr, "Out of host memory\n");
+        return EXIT_FAILURE;
+    }
+    array_fill(values, NUM_VALS);
+
+    array_print(values, NUM_VALS);
+    printf("\n");
+
+    unsigned int checksum1 = checksum(values, NUM_VALS);
+
+    start = clock();
+    bitonic_sort(values); /* Inplace */
+    stop = clock();
+
+    unsigned int checksum2 = checksum(values, NUM_VALS);
+
+    array_print(values, NUM_VALS);
+    printf("\n");
+
+    print_elapsed(start, stop);
+
+    // TESTCASE.
+    bool passed_sort = true;
+    bool passed_checksum = true;
+
+    for (int i = 1; i < NUM_VALS; i++)
+    {
+        if (values[i - 1] > values[i])
+        {
+            passed_sort = false;
+        }
+    }
+
+    if (checksum1 != checksum2)
+    {
+        passed_checksum = false;
+        printf("%u != %u\n", checksum1, checksum2);
+    }
+
+    printf("Test %s\n", passed_sort ? "PASSED SORT" : "FAILED SORT");
+    printf("Test %s\n", passed_checksum ? "PASSED CHECKSUM" : "FAILED CHECKSUM");
+
+    free(values);
+    return 0;
+}
diff --git a/algorithm/brick_sort.cu b/algorithm/brick_sort.cu
new file mode 100644
index 0000000..f7295d4
--- /dev/null
+++ b/algorithm/brick_sort.cu
@@ -0,0 +1,171 @@
+/*
+ * Parallel brick (odd-even transposition) sort using CUDA.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+// __syncthreads() only synchronises one block, so a kernel that loops over
+// the phases by itself cannot sort across blocks. Every phase is therefore a
+// separate launch: launches on the same stream run in order, which provides
+// the grid-wide barrier the algorithm needs between phases.
+
+/* Threads per block; each thread owns one adjacent pair per phase. */
+#define THREADS 256
+#define NUM_VALS 2000
+
+/* Abort with a readable message when a CUDA runtime call fails. */
+#define CUDA_CHECK(call)                                                  \
+    do {                                                                  \
+        cudaError_t err_ = (call);                                        \
+        if (err_ != cudaSuccess) {                                        \
+            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
+                    cudaGetErrorString(err_));                            \
+            exit(EXIT_FAILURE);                                           \
+        }                                                                 \
+    } while (0)
+
+void print_elapsed(clock_t start, clock_t stop)
+{
+    double elapsed = ((double)(stop - start)) / CLOCKS_PER_SEC;
+    printf("Elapsed time: %.3fs\n", elapsed);
+}
+
+/* Order-independent checksum; unsigned so that wrap-around is well defined. */
+unsigned int checksum(const int *arr, int size)
+{
+    unsigned int sum = 0;
+    for (int i = 0; i < size; i++)
+        sum += (unsigned int)arr[i];
+    return sum;
+}
+
+void array_print(int *arr, int length)
+{
+    int i;
+    for (i = 0; i < length; ++i)
+    {
+        printf("%d ", arr[i]);
+    }
+    printf("\n");
+}
+
+void array_fill(int *arr, int length)
+{
+    srand(time(NULL));
+    int i;
+    for (i = 0; i < length; ++i)
+    {
+        arr[i] = rand();
+    }
+}
+
+/* Blocks needed so that every adjacent pair of `length` values has a thread. */
+int blocks_per_phase(int length)
+{
+    int pairs = length / 2;
+    return (pairs + THREADS - 1) / THREADS;
+}
+
+/*
+ * One phase of odd-even transposition sort.
+ *
+ * Expects a 1-D launch with at least length / 2 threads. Thread p orders the
+ * pair (2p + parity, 2p + parity + 1); parity 0 is the even phase, parity 1
+ * the odd phase. Pairs within a phase are disjoint, so no thread touches
+ * another thread's elements.
+ */
+__global__ void brick_sort_step(int *d_arr, int length, int parity)
+{
+    int pair = blockIdx.x * blockDim.x + threadIdx.x;
+    int left = 2 * pair + parity;
+    int right = left + 1;
+
+    /* Surplus threads and the unpaired last element have nothing to do. */
+    if (right >= length)
+        return;
+
+    int a = d_arr[left];
+    int b = d_arr[right];
+    if (a > b)
+    {
+        d_arr[left] = b;
+        d_arr[right] = a;
+    }
+}
+
+/* Sort d_arr[0 .. length-1] (device memory) ascending with `length` phases. */
+void brick_sort(int *d_arr, int length)
+{
+    int blocks = blocks_per_phase(length);
+    if (blocks == 0)
+        return; /* fewer than two values: already sorted */
+
+    for (int phase = 0; phase < length; ++phase)
+    {
+        brick_sort_step<<<blocks, THREADS>>>(d_arr, length, phase & 1);
+        CUDA_CHECK(cudaGetLastError());
+    }
+}
+
+int main(void)
+{
+    clock_t start, stop;
+    size_t size = NUM_VALS * sizeof(int);
+    printf("Blocks: %d\n", blocks_per_phase(NUM_VALS));
+
+    int *arr = (int *)malloc(NUM_VALS * sizeof(int));
+    if (arr == NULL)
+    {
+        fprintf(stderr, "Out of host memory\n");
+        return EXIT_FAILURE;
+    }
+    array_fill(arr, NUM_VALS);
+
+    unsigned int checksum1 = checksum(arr, NUM_VALS);
+
+    start = clock();
+
+    int *d_arr;
+
+    CUDA_CHECK(cudaMalloc((void **)&d_arr, size));
+    CUDA_CHECK(cudaMemcpy(d_arr, arr, size, cudaMemcpyHostToDevice));
+
+    brick_sort(d_arr, NUM_VALS);
+
+    /* The blocking copy also waits for every queued phase to finish. */
+    CUDA_CHECK(cudaMemcpy(arr, d_arr, size, cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaFree(d_arr));
+
+    unsigned int checksum2 = checksum(arr, NUM_VALS);
+
+    stop = clock();
+
+    print_elapsed(start, stop);
+
+    bool passed_sort = true;
+    bool passed_checksum = true;
+
+    for (int i = 1; i < NUM_VALS; i++)
+    {
+        if (arr[i - 1] > arr[i])
+        {
+            passed_sort = false;
+        }
+    }
+
+    if (checksum1 != checksum2)
+    {
+        passed_checksum = false;
+        printf("%u != %u\n", checksum1, checksum2);
+    }
+
+    array_print(arr, NUM_VALS);
+
+    printf("Test %s\n", passed_sort ? "PASSED SORT" : "FAILED SORT");
+    printf("Test %s\n", passed_checksum ? "PASSED CHECKSUM" : "FAILED CHECKSUM");
+
+    free(arr);
+    return 0;
+}
-- 
GitLab