diff --git a/src/radix_sort_parallel.cu b/src/radix_sort_parallel.cu index 9509a5e55081b001e193d507a1b90aecfe694006..e857a870b26041437b6372998334030e630321ab 100644 --- a/src/radix_sort_parallel.cu +++ b/src/radix_sort_parallel.cu @@ -25,8 +25,8 @@ void rng(int* arr, int n) { } } -u_int32_t* down_sweep(u_int32_t* flags, int n) { - u_int32_t* prefix_sum = (u_int32_t*) malloc(n * sizeof(u_int32_t)); +int* down_sweep(int* flags, int n) { + int* prefix_sum = (int*) malloc(n * sizeof(int)); prefix_sum[0] = 0; for (int i = 1; i < n; i++) { @@ -36,8 +36,8 @@ u_int32_t* down_sweep(u_int32_t* flags, int n) { return prefix_sum; } -u_int32_t* up_sweep(u_int32_t* flags, int n) { - u_int32_t* prefix_sum = (u_int32_t*) malloc(n * sizeof(u_int32_t)); +int* up_sweep(int* flags, int n) { + int* prefix_sum = (int*) malloc(n * sizeof(int)); prefix_sum[n - 1] = n - 1; for (int i = n - 2; i >= 0; i--) { @@ -48,19 +48,19 @@ u_int32_t* up_sweep(u_int32_t* flags, int n) { return prefix_sum; } -__global__ void assign_index(u_int32_t* arr, u_int32_t* arr_idx, u_int32_t* down_sweep_i, u_int32_t* up_sweep_i, u_int32_t* flags, int n) { +__global__ void assign_index(int* arr, int* arr_idx, int* down_sweep_i, int* up_sweep_i, int* flags, int n) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) { arr_idx[i] = flags[i] ? down_sweep_i[i] : up_sweep_i[i]; } } -__global__ void assign_flags(u_int32_t* arr, int n, int idx, u_int32_t* flags) { +__global__ void assign_flags(int* arr, int n, int idx, int* flags) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) { flags[i] = ((arr[i] & 1 << idx) == 1 << idx) ? 0 : 1; } } -__global__ void scatter(u_int32_t* arr, u_int32_t* arr_temp, u_int32_t* arr_idx, int n) { +__global__ void scatter(int* arr, int* arr_temp, int* arr_idx, int n) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) { arr[arr_idx[i]] = arr_temp[i]; } @@ -69,23 +69,23 @@ __global__ void scatter(u_int32_t* arr, u_int32_t* arr_temp, u_int32_t* arr_idx, void count_sort(int* d_arr, int n, int idx) { int block_dim = 64; int grid_dim = (n + block_dim - 1) / block_dim; - u_int32_t* d_flags; - cudaMalloc(&d_flags, n * sizeof(u_int32_t)); + int* d_flags; + cudaMalloc(&d_flags, n * sizeof(int)); assign_flags<<<grid_dim, block_dim>>>(d_arr, n, idx, d_flags); cudaDeviceSynchronize(); - u_int32_t* flags = (u_int32_t*) malloc(n * sizeof(u_int32_t)); - cudaMemcpy(flags, d_flags, n * sizeof(u_int32_t), cudaMemcpyDeviceToHost); - u_int32_t* down_sweep_i = down_sweep(flags, n); - u_int32_t* up_sweep_i = up_sweep(flags, n); + int* flags = (int*) malloc(n * sizeof(int)); + cudaMemcpy(flags, d_flags, n * sizeof(int), cudaMemcpyDeviceToHost); + int* down_sweep_i = down_sweep(flags, n); + int* up_sweep_i = up_sweep(flags, n); free(flags); - u_int32_t *d_arr_temp, *d_arr_idx, *d_idx_down, *d_idx_up; - cudaMalloc(&d_arr_temp, n * sizeof(u_int32_t)); - cudaMalloc(&d_arr_idx, n * sizeof(u_int32_t)); - cudaMalloc(&d_idx_down, n * sizeof(u_int32_t)); - cudaMalloc(&d_idx_up, n * sizeof(u_int32_t)); - cudaMemcpy(d_idx_down, down_sweep_i, n * sizeof(u_int32_t), cudaMemcpyHostToDevice); - cudaMemcpy(d_idx_up, up_sweep_i, n * sizeof(u_int32_t), cudaMemcpyHostToDevice); - cudaMemcpy(d_arr_temp, d_arr, n * sizeof(u_int32_t), cudaMemcpyDeviceToDevice); + int *d_arr_temp, *d_arr_idx, *d_idx_down, *d_idx_up; + cudaMalloc(&d_arr_temp, n * sizeof(int)); + cudaMalloc(&d_arr_idx, n * sizeof(int)); + cudaMalloc(&d_idx_down, n * sizeof(int)); + cudaMalloc(&d_idx_up, n * sizeof(int)); + cudaMemcpy(d_idx_down, down_sweep_i, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(d_idx_up, up_sweep_i, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(d_arr_temp, d_arr, n * sizeof(int), cudaMemcpyDeviceToDevice); assign_index<<<grid_dim, block_dim>>>(d_arr, d_arr_idx, d_idx_down, d_idx_up, d_flags, n); cudaDeviceSynchronize(); scatter<<<grid_dim, block_dim>>>(d_arr, d_arr_temp, d_arr_idx, n); @@ -128,11 +128,11 @@ int main(int argc, char *argv[]) { start = clock(); // Move sorting CPU - u_int32_t* d_arr; - cudaMalloc(&d_arr, sizeof(u_int32_t) * n); - cudaMemcpy(d_arr, arr, sizeof(u_int32_t) * n, cudaMemcpyHostToDevice); + int* d_arr; + cudaMalloc(&d_arr, sizeof(int) * n); + cudaMemcpy(d_arr, arr, sizeof(int) * n, cudaMemcpyHostToDevice); radix_sort(d_arr, n); - cudaMemcpy(arr, d_arr, n * sizeof(u_int32_t), cudaMemcpyDeviceToHost); + cudaMemcpy(arr, d_arr, n * sizeof(int), cudaMemcpyDeviceToHost); cudaFree(d_arr); end = clock();