diff --git a/src/radix_sort_parallel.cu b/src/radix_sort_parallel.cu
index 8e3275e35726231362c15753e5ccbda554cbf1a6..fc72fdcdb6aace9d5a12368b78c99f1622b5a505 100644
--- a/src/radix_sort_parallel.cu
+++ b/src/radix_sort_parallel.cu
@@ -26,66 +26,35 @@ void rng(int* arr, int n) {
 }
 
 u_int32_t* down_sweep(u_int32_t* flags, int n) {
-		u_int32_t* prefix_sum = (u_int32_t*) malloc(n * sizeof(u_int32_t));
-		prefix_sum[0] = 0;
+	u_int32_t* prefix_sum = (u_int32_t*) malloc(n * sizeof(u_int32_t));
+	prefix_sum[0] = 0;
 
-		for (int i = 1; i < n; i++) {
-				prefix_sum[i] = prefix_sum[i - 1] + flags[i - 1];
-		}
+	for (int i = 1; i < n; i++) {
+		prefix_sum[i] = prefix_sum[i - 1] + flags[i - 1];
+	}
 
-		return prefix_sum;
+	return prefix_sum;
 }
 
 u_int32_t* up_sweep(u_int32_t* flags, int n) {
-		u_int32_t* prefix_sum = (u_int32_t*) malloc(n * sizeof(u_int32_t));
-		prefix_sum[n - 1] = n - 1;
-
-		for (int i = n - 2; i >= 0; i--) {
-				int substract = (flags[i + 1] ? 0 : 1);
-				prefix_sum[i] = prefix_sum[i + 1] - substract;
-		}
+	u_int32_t* prefix_sum = (u_int32_t*) malloc(n * sizeof(u_int32_t));
+	prefix_sum[n - 1] = n - 1;
 
-		return prefix_sum;
-}
+	for (int i = n - 2; i >= 0; i--) {
+		int substract = (flags[i + 1] ? 0 : 1);
+		prefix_sum[i] = prefix_sum[i + 1] - substract;
+	}
 
-void count_sort(int* d_arr, int n, int idx) {
-    int block_dim = 64;
-		int grid_dim = (n + block_dim - 1) / block_dim;
-		u_int32_t*  d_flags;
-		cudaMalloc(&d_flags, n * sizeof(u_int32_t));
-		assign_flagss<<<grid_dim, block_dim>>>(d_arr, n, idx, d_flags);
-		cudaDeviceSynchronize();
-		u_int32_t* flags = (u_int32_t*) malloc(n * sizeof(u_int32_t));
-		cudaMemcpy(flags, d_flags, n * sizeof(u_int32_t), cudaMemcpyDeviceToHost);
-		u_int32_t* down_sweep_i = down_sweep(flags, n);
-		u_int32_t* up_sweep_i = up_sweep(flags, n);
-		free(flags);
-		u_int32_t *d_arr_temp, *d_arr_idx, *d_idx_down, *d_idx_up;
-		cudaMalloc(&d_arr_temp, n * sizeof(u_int32_t));
-		cudaMalloc(&d_arr_idx, n * sizeof(u_int32_t));
-		cudaMalloc(&d_idx_down, n * sizeof(u_int32_t));
-		cudaMalloc(&d_idx_up, n * sizeof(u_int32_t));
-		cudaMemcpy(d_idx_down, down_sweep_i, n * sizeof(u_int32_t), cudaMemcpyHostToDevice);
-		cudaMemcpy(d_idx_up, up_sweep_i, n * sizeof(u_int32_t), cudaMemcpyHostToDevice);
-		cudaMemcpy(d_arr_temp, d_arr, n * sizeof(u_int32_t), cudaMemcpyDeviceToDevice);
-		assign_index<<<grid_dim, block_dim>>>(d_arr, d_arr_idx, d_idx_down, d_idx_up, d_flags, n);
-		cudaDeviceSynchronize();
-		scatter<<<grid_dim, block_dim>>>(d_arr, d_arr_temp, d_arr_idx, n);
-		cudaDeviceSynchronize();
-    cudaFree(d_arr_idx);
-		cudaFree(d_idx_down);
-		cudaFree(d_idx_up);
-		cudaFree(d_flags);
-		cudaFree(d_arr_temp);
+	return prefix_sum;
 }
 
 __global__ void assign_index(u_int32_t* arr, u_int32_t* arr_idx, u_int32_t* down_sweep_i, u_int32_t* up_sweep_i, u_int32_t* flags, int n) {
-		for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
-				arr_idx[i] = flags[i] ? down_sweep_i[i] : up_sweep_i[i];
-		}
+	for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
+		arr_idx[i] = flags[i] ? down_sweep_i[i] : up_sweep_i[i];
+	}
 }
 
-__global__ void assign_flagss(u_int32_t* arr, int n, int idx, u_int32_t* flags) {
+__global__ void assign_flags(u_int32_t* arr, int n, int idx, u_int32_t* flags) {
 		for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
 			flags[i] = ((arr[i] & 1 << idx) == 1 << idx) ? 0 : 1;
 		}
@@ -97,6 +66,37 @@ __global__ void scatter(u_int32_t* arr, u_int32_t* arr_temp, u_int32_t* arr_idx,
 		}
 }
 
+void count_sort(int* d_arr, int n, int idx) {
+    int block_dim = 64;
+	int grid_dim = (n + block_dim - 1) / block_dim;
+	u_int32_t*  d_flags;
+	cudaMalloc(&d_flags, n * sizeof(u_int32_t));
+	assign_flags<<<grid_dim, block_dim>>>(d_arr, n, idx, d_flags);
+	cudaDeviceSynchronize();
+	u_int32_t* flags = (u_int32_t*) malloc(n * sizeof(u_int32_t));
+	cudaMemcpy(flags, d_flags, n * sizeof(u_int32_t), cudaMemcpyDeviceToHost);
+	u_int32_t* down_sweep_i = down_sweep(flags, n);
+	u_int32_t* up_sweep_i = up_sweep(flags, n);
+	free(flags);
+	u_int32_t *d_arr_temp, *d_arr_idx, *d_idx_down, *d_idx_up;
+	cudaMalloc(&d_arr_temp, n * sizeof(u_int32_t));
+	cudaMalloc(&d_arr_idx, n * sizeof(u_int32_t));
+	cudaMalloc(&d_idx_down, n * sizeof(u_int32_t));
+	cudaMalloc(&d_idx_up, n * sizeof(u_int32_t));
+	cudaMemcpy(d_idx_down, down_sweep_i, n * sizeof(u_int32_t), cudaMemcpyHostToDevice);
+	cudaMemcpy(d_idx_up, up_sweep_i, n * sizeof(u_int32_t), cudaMemcpyHostToDevice);
+	cudaMemcpy(d_arr_temp, d_arr, n * sizeof(u_int32_t), cudaMemcpyDeviceToDevice);
+	assign_index<<<grid_dim, block_dim>>>(d_arr, d_arr_idx, d_idx_down, d_idx_up, d_flags, n);
+	cudaDeviceSynchronize();
+	scatter<<<grid_dim, block_dim>>>(d_arr, d_arr_temp, d_arr_idx, n);
+	cudaDeviceSynchronize();
+    cudaFree(d_arr_idx);
+	cudaFree(d_idx_down);
+	cudaFree(d_idx_up);
+	cudaFree(d_flags);
+	cudaFree(d_arr_temp);
+}
+
 void radix_sort(int* d_arr, int n) { 
     for (int idx = 1; idx < 32; idx++) {
         count_sort<<<grid_dim, block_dim>>>(d_arr, n, idx);