Skip to content
Snippets Groups Projects
Commit 4225c481 authored by Nicholas Rianto Putra's avatar Nicholas Rianto Putra
Browse files

fix bug

parent 2a4675ec
No related merge requests found
...@@ -26,66 +26,35 @@ void rng(int* arr, int n) { ...@@ -26,66 +26,35 @@ void rng(int* arr, int n) {
} }
/*
 * Host-side exclusive prefix sum over `flags`.
 *
 * result[0] is 0 and result[i] is flags[0] + ... + flags[i - 1].
 *
 * flags: host array holding n values.
 * n:     number of elements in `flags`.
 *
 * Returns a malloc'd array of n values that the caller must free(), or NULL
 * when n <= 0 or the allocation fails.
 */
u_int32_t* down_sweep(u_int32_t* flags, int n) {
    // Guard: with n <= 0 there is no slot 0 to initialise.
    if (n <= 0) {
        return NULL;
    }
    u_int32_t* prefix_sum = (u_int32_t*) malloc((size_t) n * sizeof(u_int32_t));
    if (prefix_sum == NULL) {
        return NULL;
    }
    prefix_sum[0] = 0;
    for (int i = 1; i < n; i++) {
        prefix_sum[i] = prefix_sum[i - 1] + flags[i - 1];
    }
    return prefix_sum;
}
/*
 * Host-side backward scan over `flags`.
 *
 * result[n - 1] is n - 1, and walking towards index 0 the value drops by one
 * each time the flag of the element just to the right is zero. In other
 * words, result[i] == (n - 1) - (number of zero entries in flags[i+1..n-1]).
 *
 * flags: host array holding n values.
 * n:     number of elements in `flags`.
 *
 * Returns a malloc'd array of n values that the caller must free(), or NULL
 * when n <= 0 or the allocation fails.
 */
u_int32_t* up_sweep(u_int32_t* flags, int n) {
    // Guard: with n <= 0 the write to index n - 1 would be out of bounds.
    if (n <= 0) {
        return NULL;
    }
    u_int32_t* prefix_sum = (u_int32_t*) malloc((size_t) n * sizeof(u_int32_t));
    if (prefix_sum == NULL) {
        return NULL;
    }
    prefix_sum[n - 1] = (u_int32_t) (n - 1);
    for (int i = n - 2; i >= 0; i--) {
        u_int32_t subtract = flags[i + 1] ? 0u : 1u;
        prefix_sum[i] = prefix_sum[i + 1] - subtract;
    }
    return prefix_sum;
}
/*
 * Picks, for every element i < n, which precomputed index to store in
 * arr_idx[i]: down_sweep_i[i] when flags[i] is non-zero, up_sweep_i[i]
 * otherwise. `arr` is accepted but not read here.
 *
 * Each thread starts at its global thread id and advances by the total
 * number of launched threads until it passes n.
 */
__global__ void assign_index(u_int32_t* arr, u_int32_t* arr_idx, u_int32_t* down_sweep_i, u_int32_t* up_sweep_i, u_int32_t* flags, int n) {
    const unsigned int total_threads = blockDim.x * gridDim.x;
    int pos = blockIdx.x * blockDim.x + threadIdx.x;
    while (pos < n) {
        if (flags[pos]) {
            arr_idx[pos] = down_sweep_i[pos];
        } else {
            arr_idx[pos] = up_sweep_i[pos];
        }
        pos += total_threads;
    }
}
__global__ void assign_flagss(u_int32_t* arr, int n, int idx, u_int32_t* flags) { __global__ void assign_flags(u_int32_t* arr, int n, int idx, u_int32_t* flags) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
flags[i] = ((arr[i] & 1 << idx) == 1 << idx) ? 0 : 1; flags[i] = ((arr[i] & 1 << idx) == 1 << idx) ? 0 : 1;
} }
...@@ -97,6 +66,37 @@ __global__ void scatter(u_int32_t* arr, u_int32_t* arr_temp, u_int32_t* arr_idx, ...@@ -97,6 +66,37 @@ __global__ void scatter(u_int32_t* arr, u_int32_t* arr_temp, u_int32_t* arr_idx,
} }
} }
/*
 * Runs one partitioning pass over the device array `d_arr`, keyed on bit
 * `idx` of each element.
 *
 * Steps:
 *   1. assign_flags fills a per-element flag array on the device.
 *   2. The flags are copied to the host, where down_sweep and up_sweep turn
 *      them into two candidate index arrays.
 *   3. Both arrays go back to the device; assign_index selects one index per
 *      element according to its flag.
 *   4. scatter is launched with d_arr, a snapshot copy of d_arr, and the
 *      selected indices (presumably it writes each snapshot element to its
 *      selected position -- the kernel body is not visible here; confirm).
 *
 * d_arr: device pointer to n 32-bit elements, updated in place.
 * n:     number of elements; the call is a no-op when n <= 0.
 * idx:   bit position forwarded to assign_flags.
 *
 * NOTE(review): return codes of the CUDA runtime calls are still not
 * checked, matching the rest of this file.
 */
void count_sort(int* d_arr, int n, int idx) {
    // Nothing to do for an empty array; also avoids launching zero blocks.
    if (n <= 0) {
        return;
    }

    int block_dim = 64;
    int grid_dim = (n + block_dim - 1) / block_dim;
    size_t bytes = (size_t) n * sizeof(u_int32_t);

    // The kernels are declared with u_int32_t*, so reinterpret the int buffer
    // explicitly instead of relying on an implicit pointer conversion.
    u_int32_t* d_keys = (u_int32_t*) d_arr;

    u_int32_t* d_flags;
    cudaMalloc(&d_flags, bytes);
    assign_flags<<<grid_dim, block_dim>>>(d_keys, n, idx, d_flags);
    cudaDeviceSynchronize();

    // Build both index arrays on the host from a copy of the flags.
    u_int32_t* flags = (u_int32_t*) malloc(bytes);
    cudaMemcpy(flags, d_flags, bytes, cudaMemcpyDeviceToHost);
    u_int32_t* down_sweep_i = down_sweep(flags, n);
    u_int32_t* up_sweep_i = up_sweep(flags, n);
    free(flags);

    u_int32_t *d_arr_temp, *d_arr_idx, *d_idx_down, *d_idx_up;
    cudaMalloc(&d_arr_temp, bytes);
    cudaMalloc(&d_arr_idx, bytes);
    cudaMalloc(&d_idx_down, bytes);
    cudaMalloc(&d_idx_up, bytes);
    cudaMemcpy(d_idx_down, down_sweep_i, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_idx_up, up_sweep_i, bytes, cudaMemcpyHostToDevice);

    // The host copies are no longer needed once they are on the device;
    // releasing them here fixes a leak of two n-element arrays per call.
    free(down_sweep_i);
    free(up_sweep_i);

    // Snapshot of the input that is handed to scatter alongside d_arr.
    cudaMemcpy(d_arr_temp, d_keys, bytes, cudaMemcpyDeviceToDevice);

    assign_index<<<grid_dim, block_dim>>>(d_keys, d_arr_idx, d_idx_down, d_idx_up, d_flags, n);
    cudaDeviceSynchronize();
    scatter<<<grid_dim, block_dim>>>(d_keys, d_arr_temp, d_arr_idx, n);
    cudaDeviceSynchronize();

    cudaFree(d_arr_idx);
    cudaFree(d_idx_down);
    cudaFree(d_idx_up);
    cudaFree(d_flags);
    cudaFree(d_arr_temp);
}
void radix_sort(int* d_arr, int n) { void radix_sort(int* d_arr, int n) {
for (int idx = 1; idx < 32; idx++) { for (int idx = 1; idx < 32; idx++) {
count_sort<<<grid_dim, block_dim>>>(d_arr, n, idx); count_sort<<<grid_dim, block_dim>>>(d_arr, n, idx);
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment