diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..059f294156c08c21ef791e5fe7e609f65d26a9a7
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,2 @@
+radix_sort: src/radix_sort_parallel.c
+	nvcc -x cu src/radix_sort_parallel.c -o radix_sort
diff --git a/logo.png b/logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..7437be5009b2dbb38fe88e0088f318d99f06c773
Binary files /dev/null and b/logo.png differ
diff --git a/src/radix_sort_parallel.c b/src/radix_sort_parallel.c
new file mode 100644
index 0000000000000000000000000000000000000000..8e3275e35726231362c15753e5ccbda554cbf1a6
--- /dev/null
+++ b/src/radix_sort_parallel.c
@@ -0,0 +1,249 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <cuda_runtime.h>
+
+/* Abort with file/line context when a CUDA runtime call fails. */
+#define CUDA_CHECK(call) \
+    do { \
+        cudaError_t err_ = (call); \
+        if (err_ != cudaSuccess) { \
+            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
+                    cudaGetErrorString(err_)); \
+            exit(EXIT_FAILURE); \
+        } \
+    } while (0)
+
+/* Kernels are defined below count_sort, so declare them before use. */
+__global__ void assign_index(uint32_t* arr, uint32_t* arr_idx, uint32_t* down_sweep_i, uint32_t* up_sweep_i, uint32_t* flags, int n);
+__global__ void assign_flagss(uint32_t* arr, int n, int idx, uint32_t* flags);
+__global__ void scatter(uint32_t* arr, uint32_t* arr_temp, uint32_t* arr_idx, int n);
+
+/*
+ * Write arr_len integers, space separated, to
+ * output/<timestamp>-<type>.txt. If the file cannot be opened (for example
+ * because output/ does not exist) an error is printed and nothing is written.
+ */
+void print_to_file(int* arr, int arr_len, const char* type) {
+    char stamp[64];
+    char filename[128];
+    time_t now = time(NULL);
+    struct tm *t = localtime(&now);
+    if (t == NULL || strftime(stamp, sizeof(stamp), "output/%Y_%m_%d_%H%M%S", t) == 0) {
+        fprintf(stderr, "print_to_file: cannot format timestamp\n");
+        return;
+    }
+    /* Separate buffers: snprintf with overlapping source and destination is undefined. */
+    snprintf(filename, sizeof(filename), "%s-%s.txt", stamp, type);
+    FILE *f = fopen(filename, "w");
+    if (f == NULL) {
+        perror(filename);
+        return;
+    }
+    for (int i = 0; i < arr_len; i++) {
+        fprintf(f, "%d ", arr[i]);
+    }
+    fclose(f);
+}
+
+/* Fill arr with n values from rand() using a fixed seed. */
+void rng(int* arr, int n) {
+    int seed = 13516095;
+    srand(seed);
+    for (int i = 0; i < n; i++) {
+        arr[i] = (int)rand();
+    }
+}
+
+/*
+ * Exclusive prefix sum: result[i] = flags[0] + ... + flags[i - 1].
+ * Returns a malloc'd array of n entries that the caller must free, or NULL
+ * when n <= 0 or the allocation fails.
+ */
+uint32_t* down_sweep(uint32_t* flags, int n) {
+    if (n <= 0) {
+        return NULL;
+    }
+    uint32_t* prefix_sum = (uint32_t*) malloc((size_t) n * sizeof(uint32_t));
+    if (prefix_sum == NULL) {
+        return NULL;
+    }
+    prefix_sum[0] = 0;
+
+    for (int i = 1; i < n; i++) {
+        prefix_sum[i] = prefix_sum[i - 1] + flags[i - 1];
+    }
+
+    return prefix_sum;
+}
+
+/*
+ * Backward scan: result[i] = (n - 1) minus the number of zero entries in
+ * flags[i + 1 .. n - 1]. Returns a malloc'd array of n entries that the
+ * caller must free, or NULL when n <= 0 or the allocation fails.
+ */
+uint32_t* up_sweep(uint32_t* flags, int n) {
+    if (n <= 0) {
+        return NULL;
+    }
+    uint32_t* prefix_sum = (uint32_t*) malloc((size_t) n * sizeof(uint32_t));
+    if (prefix_sum == NULL) {
+        return NULL;
+    }
+    prefix_sum[n - 1] = n - 1;
+
+    for (int i = n - 2; i >= 0; i--) {
+        int substract = (flags[i + 1] ? 0 : 1);
+        prefix_sum[i] = prefix_sum[i + 1] - substract;
+    }
+
+    return prefix_sum;
+}
+
+/*
+ * One stable split pass on bit idx of the n keys in device array d_arr: keys
+ * with the bit clear are packed at the front and keys with the bit set at the
+ * back, each group keeping its relative order. Keys are handled as unsigned
+ * 32-bit values. Both scans run on the host.
+ */
+void count_sort(int* d_arr, int n, int idx) {
+    if (n <= 0) {
+        return;
+    }
+    const int block_dim = 64;
+    const int grid_dim = (n + block_dim - 1) / block_dim;
+    const size_t bytes = (size_t) n * sizeof(uint32_t);
+    uint32_t* d_keys = (uint32_t*) d_arr;
+
+    uint32_t* d_flags;
+    CUDA_CHECK(cudaMalloc(&d_flags, bytes));
+    assign_flagss<<<grid_dim, block_dim>>>(d_keys, n, idx, d_flags);
+    CUDA_CHECK(cudaGetLastError());
+
+    uint32_t* flags = (uint32_t*) malloc(bytes);
+    if (flags == NULL) {
+        fprintf(stderr, "count_sort: out of host memory\n");
+        exit(EXIT_FAILURE);
+    }
+    CUDA_CHECK(cudaMemcpy(flags, d_flags, bytes, cudaMemcpyDeviceToHost));
+    uint32_t* down_sweep_i = down_sweep(flags, n);
+    uint32_t* up_sweep_i = up_sweep(flags, n);
+    free(flags);
+    if (down_sweep_i == NULL || up_sweep_i == NULL) {
+        fprintf(stderr, "count_sort: out of host memory\n");
+        exit(EXIT_FAILURE);
+    }
+
+    uint32_t *d_arr_temp, *d_arr_idx, *d_idx_down, *d_idx_up;
+    CUDA_CHECK(cudaMalloc(&d_arr_temp, bytes));
+    CUDA_CHECK(cudaMalloc(&d_arr_idx, bytes));
+    CUDA_CHECK(cudaMalloc(&d_idx_down, bytes));
+    CUDA_CHECK(cudaMalloc(&d_idx_up, bytes));
+    CUDA_CHECK(cudaMemcpy(d_idx_down, down_sweep_i, bytes, cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(d_idx_up, up_sweep_i, bytes, cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(d_arr_temp, d_keys, bytes, cudaMemcpyDeviceToDevice));
+    /* The host copies of the scans are not needed once uploaded. */
+    free(down_sweep_i);
+    free(up_sweep_i);
+
+    assign_index<<<grid_dim, block_dim>>>(d_keys, d_arr_idx, d_idx_down, d_idx_up, d_flags, n);
+    CUDA_CHECK(cudaGetLastError());
+    scatter<<<grid_dim, block_dim>>>(d_keys, d_arr_temp, d_arr_idx, n);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    CUDA_CHECK(cudaFree(d_arr_idx));
+    CUDA_CHECK(cudaFree(d_idx_down));
+    CUDA_CHECK(cudaFree(d_idx_up));
+    CUDA_CHECK(cudaFree(d_flags));
+    CUDA_CHECK(cudaFree(d_arr_temp));
+}
+
+/*
+ * arr_idx[i] = down_sweep_i[i] when flags[i] is non-zero, else up_sweep_i[i],
+ * for every i < n. Each thread strides by the total thread count of the grid.
+ */
+__global__ void assign_index(uint32_t* arr, uint32_t* arr_idx, uint32_t* down_sweep_i, uint32_t* up_sweep_i, uint32_t* flags, int n) {
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
+        arr_idx[i] = flags[i] ? down_sweep_i[i] : up_sweep_i[i];
+    }
+}
+
+/* flags[i] = 0 when bit idx of arr[i] is set, 1 when it is clear. */
+__global__ void assign_flagss(uint32_t* arr, int n, int idx, uint32_t* flags) {
+    /* Unsigned literal: 1 << 31 would overflow a signed int. */
+    const uint32_t mask = 1u << idx;
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
+        flags[i] = (arr[i] & mask) ? 0 : 1;
+    }
+}
+
+/* arr[arr_idx[i]] = arr_temp[i] for every i < n. */
+__global__ void scatter(uint32_t* arr, uint32_t* arr_temp, uint32_t* arr_idx, int n) {
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
+        arr[arr_idx[i]] = arr_temp[i];
+    }
+}
+
+/*
+ * Sort the n keys in device array d_arr by running one split pass per bit,
+ * least significant bit first. Keys are ordered as unsigned 32-bit values,
+ * which matches signed order only for non-negative input.
+ */
+void radix_sort(int* d_arr, int n) {
+    for (int idx = 0; idx < 32; idx++) {
+        count_sort(d_arr, n, idx);
+    }
+}
+
+/* Print the n values of arr on one line, space separated. */
+void print(int* arr, int n) {
+    for (int i = 0; i < n; i++)
+        printf("%d ", arr[i]);
+    printf("\n");
+}
+
+int main(int argc, char *argv[]) {
+    if (argc != 2) {
+        printf("Usage: ./radix_sort <n>\n");
+        return 1;
+    }
+
+    // Generate numbers
+    int n = atoi(argv[1]);
+    if (n <= 0) {
+        fprintf(stderr, "n must be a positive integer\n");
+        return 1;
+    }
+    int* arr = (int*) malloc(sizeof(int) * n);
+    if (arr == NULL) {
+        fprintf(stderr, "out of host memory\n");
+        return 1;
+    }
+    rng(arr, n);
+
+    // Sort numbers
+    clock_t start, end;
+    double cpu_time_used;
+
+    printf("Sorting in parallel...\n");
+    start = clock();
+
+    // Copy to the GPU, sort there, copy the result back
+    int* d_arr;
+    CUDA_CHECK(cudaMalloc(&d_arr, sizeof(int) * n));
+    CUDA_CHECK(cudaMemcpy(d_arr, arr, sizeof(int) * n, cudaMemcpyHostToDevice));
+    radix_sort(d_arr, n);
+    CUDA_CHECK(cudaMemcpy(arr, d_arr, sizeof(int) * n, cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaFree(d_arr));
+
+    end = clock();
+    cpu_time_used = ((double) (end - start)) * 1000000 / CLOCKS_PER_SEC;
+    printf("Parallel radix sort finished, time taken: %f μs\n\n", cpu_time_used);
+    print_to_file(arr, n, "output-parallel");
+
+    free(arr);
+    return 0;
+}
diff --git a/src/radix_sort_serial.c b/src/radix_sort_serial.c
new file mode 100644
index 0000000000000000000000000000000000000000..7cba8ef05a3c09d92acde9213f717a7dcf1d9797
--- /dev/null
+++ b/src/radix_sort_serial.c
@@ -0,0 +1,142 @@
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+/*
+ * Write arr_len integers, space separated, to
+ * output/<timestamp>-<type>.txt. If the file cannot be opened (for example
+ * because output/ does not exist) an error is printed and nothing is written.
+ */
+void print_to_file(int* arr, int arr_len, const char* type) {
+    char stamp[64];
+    char filename[128];
+    time_t now = time(NULL);
+    struct tm *t = localtime(&now);
+    if (t == NULL || strftime(stamp, sizeof(stamp), "output/%Y_%m_%d_%H%M%S", t) == 0) {
+        fprintf(stderr, "print_to_file: cannot format timestamp\n");
+        return;
+    }
+    /* Separate buffers: snprintf with overlapping source and destination is undefined. */
+    snprintf(filename, sizeof(filename), "%s-%s.txt", stamp, type);
+    FILE *f = fopen(filename, "w");
+    if (f == NULL) {
+        perror(filename);
+        return;
+    }
+    for (int i = 0; i < arr_len; i++) {
+        fprintf(f, "%d ", arr[i]);
+    }
+    fclose(f);
+}
+
+/* Fill arr with n values from rand() using a fixed seed. */
+void rng(int* arr, int n) {
+    int seed = 13516095;
+    srand(seed);
+    for (int i = 0; i < n; i++) {
+        arr[i] = (int)rand();
+    }
+}
+
+/* Largest value in arr[0 .. n - 1]; n must be at least 1. */
+int get_max(int* arr, int n) {
+    int mx = arr[0];
+    for (int i = 1; i < n; i++)
+        if (arr[i] > mx)
+            mx = arr[i];
+    return mx;
+}
+
+/*
+ * Stable counting sort of arr by the decimal digit selected by exp
+ * (1, 10, 100, ...). Values must be non-negative.
+ */
+void count_sort(int* arr, int n, int exp) {
+    if (n <= 0) {
+        return;
+    }
+    /* Heap buffer: a variable-length array of n ints can overflow the stack. */
+    int* output = (int*) malloc((size_t) n * sizeof(int));
+    if (output == NULL) {
+        fprintf(stderr, "count_sort: out of memory\n");
+        exit(EXIT_FAILURE);
+    }
+    int i, count[10] = {0};
+
+    for (i = 0; i < n; i++)
+        count[(arr[i] / exp) % 10]++;
+
+    for (i = 1; i < 10; i++)
+        count[i] += count[i - 1];
+
+    for (i = n - 1; i >= 0; i--)
+    {
+        output[count[(arr[i] / exp) % 10] - 1] = arr[i];
+        count[(arr[i] / exp) % 10]--;
+    }
+
+    for (i = 0; i < n; i++)
+        arr[i] = output[i];
+
+    free(output);
+}
+
+/* Sort n non-negative ints in place, one decimal digit per pass, lowest digit first. */
+void radix_sort(int* arr, int n) {
+    if (n <= 0) {
+        return;
+    }
+    int m = get_max(arr, n);
+    int exp = 1;
+    while (m / exp > 0) {
+        count_sort(arr, n, exp);
+        /* Stop before exp * 10 overflows int (m can be close to INT_MAX). */
+        if (exp > INT_MAX / 10) {
+            break;
+        }
+        exp *= 10;
+    }
+}
+
+/* Print the n values of arr on one line, space separated. */
+void print(int* arr, int n) {
+    for (int i = 0; i < n; i++)
+        printf("%d ", arr[i]);
+    printf("\n");
+}
+
+int main(int argc, char *argv[]) {
+    if (argc != 2) {
+        printf("Usage: ./serial_radix_sort <n>\n");
+        return 1;
+    }
+
+    int n = atoi(argv[1]);
+    if (n <= 0) {
+        fprintf(stderr, "n must be a positive integer\n");
+        return 1;
+    }
+    int* arr = (int*) malloc(sizeof(int) * n);
+    if (arr == NULL) {
+        fprintf(stderr, "out of memory\n");
+        return 1;
+    }
+
+    rng(arr, n);
+
+    clock_t start, end;
+    double cpu_time_used;
+
+    printf("Sorting in serial...\n");
+    start = clock();
+    radix_sort(arr, n);
+    end = clock();
+    cpu_time_used = ((double) (end - start)) * 1000000 / CLOCKS_PER_SEC;
+    printf("Serial radix sort finished, time taken: %f μs\n\n", cpu_time_used);
+
+    print_to_file(arr, n, "output-serial");
+
+    free(arr);
+    return 0;
+}