diff --git a/src/radix_sort_parallel.cu b/src/radix_sort_parallel.cu index 8e3275e35726231362c15753e5ccbda554cbf1a6..fc72fdcdb6aace9d5a12368b78c99f1622b5a505 100644 --- a/src/radix_sort_parallel.cu +++ b/src/radix_sort_parallel.cu @@ -26,66 +26,35 @@ void rng(int* arr, int n) { } u_int32_t* down_sweep(u_int32_t* flags, int n) { - u_int32_t* prefix_sum = (u_int32_t*) malloc(n * sizeof(u_int32_t)); - prefix_sum[0] = 0; + u_int32_t* prefix_sum = (u_int32_t*) malloc(n * sizeof(u_int32_t)); + prefix_sum[0] = 0; - for (int i = 1; i < n; i++) { - prefix_sum[i] = prefix_sum[i - 1] + flags[i - 1]; - } + for (int i = 1; i < n; i++) { + prefix_sum[i] = prefix_sum[i - 1] + flags[i - 1]; + } - return prefix_sum; + return prefix_sum; } u_int32_t* up_sweep(u_int32_t* flags, int n) { - u_int32_t* prefix_sum = (u_int32_t*) malloc(n * sizeof(u_int32_t)); - prefix_sum[n - 1] = n - 1; - - for (int i = n - 2; i >= 0; i--) { - int substract = (flags[i + 1] ? 0 : 1); - prefix_sum[i] = prefix_sum[i + 1] - substract; - } + u_int32_t* prefix_sum = (u_int32_t*) malloc(n * sizeof(u_int32_t)); + prefix_sum[n - 1] = n - 1; - return prefix_sum; -} + for (int i = n - 2; i >= 0; i--) { + int substract = (flags[i + 1] ? 0 : 1); + prefix_sum[i] = prefix_sum[i + 1] - substract; + } -void count_sort(int* d_arr, int n, int idx) { - int block_dim = 64; - int grid_dim = (n + block_dim - 1) / block_dim; - u_int32_t* d_flags; - cudaMalloc(&d_flags, n * sizeof(u_int32_t)); - assign_flagss<<<grid_dim, block_dim>>>(d_arr, n, idx, d_flags); - cudaDeviceSynchronize(); - u_int32_t* flags = (u_int32_t*) malloc(n * sizeof(u_int32_t)); - cudaMemcpy(flags, d_flags, n * sizeof(u_int32_t), cudaMemcpyDeviceToHost); - u_int32_t* down_sweep_i = down_sweep(flags, n); - u_int32_t* up_sweep_i = up_sweep(flags, n); - free(flags); - u_int32_t *d_arr_temp, *d_arr_idx, *d_idx_down, *d_idx_up; - cudaMalloc(&d_arr_temp, n * sizeof(u_int32_t)); - cudaMalloc(&d_arr_idx, n * sizeof(u_int32_t)); - cudaMalloc(&d_idx_down, n * sizeof(u_int32_t)); - cudaMalloc(&d_idx_up, n * sizeof(u_int32_t)); - cudaMemcpy(d_idx_down, down_sweep_i, n * sizeof(u_int32_t), cudaMemcpyHostToDevice); - cudaMemcpy(d_idx_up, up_sweep_i, n * sizeof(u_int32_t), cudaMemcpyHostToDevice); - cudaMemcpy(d_arr_temp, d_arr, n * sizeof(u_int32_t), cudaMemcpyDeviceToDevice); - assign_index<<<grid_dim, block_dim>>>(d_arr, d_arr_idx, d_idx_down, d_idx_up, d_flags, n); - cudaDeviceSynchronize(); - scatter<<<grid_dim, block_dim>>>(d_arr, d_arr_temp, d_arr_idx, n); - cudaDeviceSynchronize(); - cudaFree(d_arr_idx); - cudaFree(d_idx_down); - cudaFree(d_idx_up); - cudaFree(d_flags); - cudaFree(d_arr_temp); + return prefix_sum; } __global__ void assign_index(u_int32_t* arr, u_int32_t* arr_idx, u_int32_t* down_sweep_i, u_int32_t* up_sweep_i, u_int32_t* flags, int n) { - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) { - arr_idx[i] = flags[i] ? down_sweep_i[i] : up_sweep_i[i]; - } + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) { + arr_idx[i] = flags[i] ? down_sweep_i[i] : up_sweep_i[i]; + } } -__global__ void assign_flagss(u_int32_t* arr, int n, int idx, u_int32_t* flags) { +__global__ void assign_flags(u_int32_t* arr, int n, int idx, u_int32_t* flags) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) { flags[i] = ((arr[i] & 1 << idx) == 1 << idx) ? 0 : 1; } @@ -97,6 +66,37 @@ __global__ void scatter(u_int32_t* arr, u_int32_t* arr_temp, u_int32_t* arr_idx, } } +void count_sort(int* d_arr, int n, int idx) { + int block_dim = 64; + int grid_dim = (n + block_dim - 1) / block_dim; + u_int32_t* d_flags; + cudaMalloc(&d_flags, n * sizeof(u_int32_t)); + assign_flags<<<grid_dim, block_dim>>>(d_arr, n, idx, d_flags); + cudaDeviceSynchronize(); + u_int32_t* flags = (u_int32_t*) malloc(n * sizeof(u_int32_t)); + cudaMemcpy(flags, d_flags, n * sizeof(u_int32_t), cudaMemcpyDeviceToHost); + u_int32_t* down_sweep_i = down_sweep(flags, n); + u_int32_t* up_sweep_i = up_sweep(flags, n); + free(flags); + u_int32_t *d_arr_temp, *d_arr_idx, *d_idx_down, *d_idx_up; + cudaMalloc(&d_arr_temp, n * sizeof(u_int32_t)); + cudaMalloc(&d_arr_idx, n * sizeof(u_int32_t)); + cudaMalloc(&d_idx_down, n * sizeof(u_int32_t)); + cudaMalloc(&d_idx_up, n * sizeof(u_int32_t)); + cudaMemcpy(d_idx_down, down_sweep_i, n * sizeof(u_int32_t), cudaMemcpyHostToDevice); + cudaMemcpy(d_idx_up, up_sweep_i, n * sizeof(u_int32_t), cudaMemcpyHostToDevice); + cudaMemcpy(d_arr_temp, d_arr, n * sizeof(u_int32_t), cudaMemcpyDeviceToDevice); + assign_index<<<grid_dim, block_dim>>>(d_arr, d_arr_idx, d_idx_down, d_idx_up, d_flags, n); + cudaDeviceSynchronize(); + scatter<<<grid_dim, block_dim>>>(d_arr, d_arr_temp, d_arr_idx, n); + cudaDeviceSynchronize(); + cudaFree(d_arr_idx); + cudaFree(d_idx_down); + cudaFree(d_idx_up); + cudaFree(d_flags); + cudaFree(d_arr_temp); +} + void radix_sort(int* d_arr, int n) { for (int idx = 1; idx < 32; idx++) { count_sort<<<grid_dim, block_dim>>>(d_arr, n, idx);