diff --git a/algorithm/bitonic_sort.cu b/algorithm/bitonic_sort.cu index 4b0331477cb2df51b9aad871980eee347af5a06f..e35d1c6c26aeba6b5076dd99625f3969a72d815a 100644 --- a/algorithm/bitonic_sort.cu +++ b/algorithm/bitonic_sort.cu @@ -9,15 +9,23 @@ // PROBLEM: THREADS DAN BLOCKS HARUS KELIPATAN 2 // SOLUTION: PADDING DATA KE DALAM BUFFER DENGAN NILAI MAX_INT -/* Every thread gets exactly one value in the unsorted array. */ -#define THREADS 1024 -#define BLOCKS 16 -#define NUM_VALS 1000 -void print_elapsed(clock_t start, clock_t stop) +void array_fill(int *arr, int length) { - double elapsed = ((double) (stop - start)) / CLOCKS_PER_SEC; - printf("Elapsed time: %.3fs\n", elapsed); + srand(time(NULL)); + int i; + for (i = 0; i < length; ++i) { + arr[i] = rand(); + } +} + +void array_print(int *arr, int length) +{ + int i; + for (i = 0; i < length; ++i) { + printf("%d ", arr[i]); + } + printf("\n"); } int checksum(int *arr, int size) @@ -28,136 +36,170 @@ int checksum(int *arr, int size) return sum; } -int ceil_power_of_two(int n) +bool isSorted(int *arr, int size) { - int i = 1; - while (i < n) { - i = i << 1; - } - return i; + for (int i = 0; i < size - 1; i++) + if (arr[i] > arr[i + 1]) + return false; + return true; } -void array_print(int *arr, int length) +void print_elapsed(clock_t start, clock_t stop) { - int i; - for (i = 0; i < length; ++i) { - printf("%d ", arr[i]); - } - printf("\n"); + double elapsed = ((double) (stop - start)) / CLOCKS_PER_SEC; + printf("Elapsed time: %.3fs\n", elapsed); } -void array_fill(int *arr, int length) +/** + * Swap the values of two elements in an array. + * + * @param d_arr - the array. + * @param i - the index of the first element. + * @param j - the index of the second element. + */ +__device__ void swap(int *d_arr, int i, int j) { - srand(time(NULL)); - int i; - for (i = 0; i < length; ++i) { - arr[i] = rand(); - } + int temp = d_arr[i]; + d_arr[i] = d_arr[j]; + d_arr[j] = temp; } -void pad_array(int *arr, int length, int size) +/** + * Pad the remaining empty value in the array with the maximum value. + * + * @param arr - the array. + * @param length - the number of elements inside the array. + * @param length_buffer - the length of the buffer. + */ +void padding_array(int *arr, int length, int length_buffer) { - int i; - for (i = length; i < size; i++) - { + for (int i = length; i < length_buffer; i++) { arr[i] = INT_MAX; } } -void copy_and_pad(int *dest, int *src, int length, int size) +/** + * Copy the array from src to dest and pad the remaining empty value with the maximum value. + * + * @param dest - the destination array. + * @param src - the source array. + * @param length - the number of elements inside the src array. + * @param length_buffer - the length of the buffer. + */ +void copy_padding(int *dest, int *src, int length, int length_buffer) { - int i; - for (i = 0; i < length; i++) - { + for (int i = 0; i < length; i++) { dest[i] = src[i]; } - pad_array(dest, length, size); + padding_array(dest, length, length_buffer); } -__global__ void bitonic_sort_step(int *dev_values, int j, int k) +/** + * Do the bitnoic sort step by step. + * + * @param d_values - array in the device to be sorted. + * @param i - major step index. + * @param j - minor step index. + */ +__global__ void bitonic_sort_step(int *d_arr, int i, int j) { - unsigned int i, ixj; /* Sorting partners: i and ixj */ - i = threadIdx.x + blockDim.x * blockIdx.x; - - ixj = i^j; - - /* The threads with the lowest ids sort the array. */ - if ((ixj)>i) { - if ((i&k)==0) { - /* Sort ascending */ - if (dev_values[i]>dev_values[ixj]) { - /* exchange(i,ixj); */ - int temp = dev_values[i]; - dev_values[i] = dev_values[ixj]; - dev_values[ixj] = temp; + // The array index and its patner. + unsigned int idx, patner; + idx = threadIdx.x + blockDim.x * blockIdx.x; + + // Get the patner. + patner = idx ^ j; + + // Sort the array by threads with the lowest idx. + if (idx < patner) { + if ((idx & i) == 0) { + // Sort ascending. + if (d_arr[idx] > d_arr[patner]) { + swap(d_arr, idx, patner); } } - if ((i&k)!=0) { - /* Sort descending */ - if (dev_values[i]<dev_values[ixj]) { - /* exchange(i,ixj); */ - int temp = dev_values[i]; - dev_values[i] = dev_values[ixj]; - dev_values[ixj] = temp; + if ((idx & i) != 0) { + // Sort descending. + if (d_arr[idx] < d_arr[patner]) { + swap(d_arr, idx, patner); } } } } -void bitonic_sort(int *values) +/** + * Perform a bitonic sort on the array. + * + * @param h_arr The host array to sort. + * @param length The length of the array. + */ +void bitonic_sort(int *h_arr, int length) { - int buffer_length = BLOCKS * THREADS; - - size_t size = buffer_length * sizeof(int); - size_t original_size = NUM_VALS * sizeof(int); - - // Copy and padding the values. - int *h_values = (int *)malloc(size); - copy_and_pad(h_values, values, NUM_VALS, buffer_length); - - // Allocate device memory. - int *dev_values; - cudaMalloc((void**) &dev_values, size); - cudaMemcpy(dev_values, h_values, size, cudaMemcpyHostToDevice); - - // Sort the values. - int j, k; - // Major step. - for (k = 2; k <= buffer_length; k <<= 1) { - // Minor step. - for (j=k>>1; j>0; j=j>>1) { - bitonic_sort_step<<<BLOCKS, THREADS>>>(dev_values, j, k); + // Initialize the constants variable. + const int threads = 1024; + const int blocks = 32; + const int buffer_length = threads * blocks; + + // Check that the buffer can hold the array. + if (length > buffer_length) { + printf("[ERROR] The array length is %d, but the buffer length is %d.\n", length, buffer_length); + return; + } + + // Initialize the memory size of the array. + size_t size = length * sizeof(int); + size_t buffer_size = buffer_length * sizeof(int); + + // Create the buffer array and pad with maximum value of Int. + int *h_buffer = (int *)malloc(buffer_size); + copy_padding(h_buffer, h_arr, length, buffer_length); + + // Allocate and copy array into device memory. + int *d_arr; + cudaMalloc((void**) &d_arr, buffer_size); + cudaMemcpy(d_arr, h_buffer, buffer_size, cudaMemcpyHostToDevice); + + // Sort the array using bitonic_sort_step. + int i, j; + // The major step. + for (i = 2; i <= buffer_length; i *= 2) { + // The minor step. + for (j = i / 2; j > 0; j = j / 2) { + bitonic_sort_step<<<blocks, threads>>>(d_arr, i, j); } } // Copy the values back to the host. - cudaMemcpy(values, dev_values, original_size, cudaMemcpyDeviceToHost); + cudaMemcpy(h_arr, d_arr, size, cudaMemcpyDeviceToHost); // Free device memory. - cudaFree(dev_values); - free(h_values); + cudaFree(d_arr); + free(h_buffer); } int main(void) { clock_t start, stop; + int length = 1000000; - int *values = (int*) malloc( NUM_VALS * sizeof(int)); - array_fill(values, NUM_VALS); + int *values = (int*) malloc( length * sizeof(int)); + array_fill(values, length); - array_print(values, NUM_VALS); - printf("\n"); + // array_print(values, length); + // printf("\n"); - int checksum1 = checksum(values, NUM_VALS); + int checksum1 = checksum(values, length); start = clock(); - bitonic_sort(values); /* Inplace */ + + bitonic_sort(values, length); + stop = clock(); - int checksum2 = checksum(values, NUM_VALS); + int checksum2 = checksum(values, length); - array_print(values, NUM_VALS); - printf("\n"); + // array_print(values, length); + // printf("\n"); print_elapsed(start, stop); @@ -165,13 +207,7 @@ int main(void) bool passed_sort = true; bool passed_checksum = true; - for (int i = 1; i < NUM_VALS; i++) - { - if (values[i - 1] > values[i]) - { - passed_sort = false; - } - } + passed_sort = isSorted(values, length); if (checksum1 != checksum2) { diff --git a/algorithm/brick_sort.cu b/algorithm/brick_sort.cu index f7295d45eaa22aba2af218a9f69a8dd96c36ce79..aa2c27e2192505d2c01e1ecc1f8d755e7eab8c77 100644 --- a/algorithm/brick_sort.cu +++ b/algorithm/brick_sort.cu @@ -1,7 +1,6 @@ #include <stdlib.h> #include <stdio.h> #include <time.h> -#include <cooperative_groups.h> // PROBLEM: INI GA JALAN KALAU BLOCKS > 1 @@ -9,8 +8,6 @@ #define THREADS 16 #define NUM_VALS 2000 -using namespace cooperative_groups; - void print_elapsed(clock_t start, clock_t stop) { double elapsed = ((double)(stop - start)) / CLOCKS_PER_SEC; @@ -50,11 +47,11 @@ void array_fill(int *arr, int length) } } -__device__ void swap(int *arr, int i, int j) +__device__ void swap(int *d_arr, int i, int j) { - int temp = arr[i]; - arr[i] = arr[j]; - arr[j] = temp; + int temp = d_arr[i]; + d_arr[i] = d_arr[j]; + d_arr[j] = temp; } __global__ void brick_sort(int *d_arr, int length)