Skip to content
Snippets Groups Projects
Commit 4225c481 authored by Nicholas Rianto Putra
Browse files

fix bug

parent 2a4675ec
No related merge requests found
......@@ -26,66 +26,35 @@ void rng(int* arr, int n) {
}
/*
 * Exclusive prefix sum of `flags`, computed sequentially on the host.
 *
 *   prefix_sum[0] = 0
 *   prefix_sum[i] = flags[0] + flags[1] + ... + flags[i - 1]
 *
 * flags: host array of n values to accumulate.
 * n:     number of elements in `flags`.
 *
 * Returns a malloc'd array of n elements that the caller must free(), or
 * NULL when n <= 0 or the allocation fails.
 */
u_int32_t* down_sweep(u_int32_t* flags, int n) {
    // Nothing to scan; also avoids writing prefix_sum[0] into an empty buffer.
    if (n <= 0) {
        return NULL;
    }

    u_int32_t* prefix_sum = (u_int32_t*) malloc((size_t) n * sizeof(u_int32_t));
    if (prefix_sum == NULL) {
        return NULL;
    }

    prefix_sum[0] = 0;
    for (int i = 1; i < n; i++) {
        prefix_sum[i] = prefix_sum[i - 1] + flags[i - 1];
    }

    return prefix_sum;
}
/*
 * Backward scan over `flags`, computed sequentially on the host.
 *
 *   prefix_sum[n - 1] = n - 1
 *   prefix_sum[i]     = prefix_sum[i + 1] - (flags[i + 1] ? 0 : 1)
 *
 * Equivalently, prefix_sum[i] is (n - 1) minus the number of zero flags at
 * positions i + 1 .. n - 1.
 *
 * flags: host array of n flag values (only zero / non-zero matters).
 * n:     number of elements in `flags`.
 *
 * Returns a malloc'd array of n elements that the caller must free(), or
 * NULL when n <= 0 or the allocation fails.
 */
u_int32_t* up_sweep(u_int32_t* flags, int n) {
    // Nothing to scan; also avoids writing prefix_sum[n - 1] out of bounds.
    if (n <= 0) {
        return NULL;
    }

    u_int32_t* prefix_sum = (u_int32_t*) malloc((size_t) n * sizeof(u_int32_t));
    if (prefix_sum == NULL) {
        return NULL;
    }

    prefix_sum[n - 1] = n - 1;
    for (int i = n - 2; i >= 0; i--) {
        // Step down by one slot for every zero flag seen to the right.
        u_int32_t subtract = flags[i + 1] ? 0u : 1u;
        prefix_sum[i] = prefix_sum[i + 1] - subtract;
    }

    return prefix_sum;
}
/*
 * Kernel: for each element i < n, selects an index into arr_idx[i]:
 * down_sweep_i[i] when flags[i] is non-zero, up_sweep_i[i] otherwise.
 *
 * Uses a grid-stride loop, so any 1D grid/block configuration covers all n
 * elements. `arr` is not read; it is kept so the signature stays unchanged
 * for existing callers.
 */
__global__ void assign_index(u_int32_t* arr, u_int32_t* arr_idx, u_int32_t* down_sweep_i, u_int32_t* up_sweep_i, u_int32_t* flags, int n) {
    const int stride = blockDim.x * gridDim.x;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride) {
        arr_idx[i] = flags[i] ? down_sweep_i[i] : up_sweep_i[i];
    }
}
__global__ void assign_flagss(u_int32_t* arr, int n, int idx, u_int32_t* flags) {
__global__ void assign_flags(u_int32_t* arr, int n, int idx, u_int32_t* flags) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
flags[i] = ((arr[i] & 1 << idx) == 1 << idx) ? 0 : 1;
}
......@@ -97,6 +66,37 @@ __global__ void scatter(u_int32_t* arr, u_int32_t* arr_temp, u_int32_t* arr_idx,
}
}
/*
 * One pass of the radix sort over bit `idx` of the n keys in d_arr.
 *
 * Steps:
 *   1. assign_flags writes a per-element flag for bit `idx` into d_flags.
 *   2. The flags are copied to the host, where down_sweep() and up_sweep()
 *      compute the two candidate index arrays.
 *   3. assign_index picks, per element, the down-sweep index when its flag
 *      is set and the up-sweep index otherwise.
 *   4. scatter is launched with d_arr, a copy of the input keys and the
 *      chosen indices (presumably it writes each copied key to its index
 *      in d_arr; its body is not visible here, so confirm).
 *
 * d_arr: device pointer to n 32-bit keys, reordered in place.
 * n:     number of keys; the call is a no-op when n <= 0.
 * idx:   bit position examined in this pass.
 *
 * If a host or device allocation fails, the pass is abandoned after
 * releasing whatever was allocated, and d_arr is left untouched.
 */
void count_sort(int* d_arr, int n, int idx) {
    // A grid of zero blocks is an invalid launch configuration.
    if (n <= 0) {
        return;
    }

    const int block_dim = 64;
    const int grid_dim = (n - 1) / block_dim + 1;  // ceil(n / block_dim) without overflow
    const size_t bytes = (size_t) n * sizeof(u_int32_t);

    // The kernels take u_int32_t*; the keys are the same 32-bit words.
    u_int32_t* d_keys = (u_int32_t*) d_arr;

    u_int32_t* d_flags = NULL;
    if (cudaMalloc(&d_flags, bytes) != cudaSuccess) {
        return;
    }

    assign_flags<<<grid_dim, block_dim>>>(d_keys, n, idx, d_flags);
    cudaDeviceSynchronize();

    u_int32_t* flags = (u_int32_t*) malloc(bytes);
    if (flags == NULL) {
        cudaFree(d_flags);
        return;
    }
    cudaMemcpy(flags, d_flags, bytes, cudaMemcpyDeviceToHost);

    u_int32_t* down_sweep_i = down_sweep(flags, n);
    u_int32_t* up_sweep_i = up_sweep(flags, n);
    free(flags);

    u_int32_t *d_arr_temp = NULL, *d_arr_idx = NULL, *d_idx_down = NULL, *d_idx_up = NULL;

    const bool ready =
        down_sweep_i != NULL && up_sweep_i != NULL &&
        cudaMalloc(&d_arr_temp, bytes) == cudaSuccess &&
        cudaMalloc(&d_arr_idx, bytes) == cudaSuccess &&
        cudaMalloc(&d_idx_down, bytes) == cudaSuccess &&
        cudaMalloc(&d_idx_up, bytes) == cudaSuccess;

    if (ready) {
        cudaMemcpy(d_idx_down, down_sweep_i, bytes, cudaMemcpyHostToDevice);
        cudaMemcpy(d_idx_up, up_sweep_i, bytes, cudaMemcpyHostToDevice);
        cudaMemcpy(d_arr_temp, d_keys, bytes, cudaMemcpyDeviceToDevice);

        assign_index<<<grid_dim, block_dim>>>(d_keys, d_arr_idx, d_idx_down, d_idx_up, d_flags, n);
        cudaDeviceSynchronize();

        scatter<<<grid_dim, block_dim>>>(d_keys, d_arr_temp, d_arr_idx, n);
        cudaDeviceSynchronize();
    }

    // Host index arrays are malloc'd by down_sweep/up_sweep and owned here;
    // free(NULL) and cudaFree(NULL) are no-ops, so this also covers the
    // failure path.
    free(down_sweep_i);
    free(up_sweep_i);

    cudaFree(d_arr_idx);
    cudaFree(d_idx_down);
    cudaFree(d_idx_up);
    cudaFree(d_flags);
    cudaFree(d_arr_temp);
}
void radix_sort(int* d_arr, int n) {
for (int idx = 1; idx < 32; idx++) {
count_sort<<<grid_dim, block_dim>>>(d_arr, n, idx);
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment