diff --git a/src/radix_sort_parallel.cu b/src/radix_sort_parallel.cu
index 950f89f4209570f22f6cbd8f1c533290a81c023f..0725ae9e019646615689308ec603ceb017dda107 100644
--- a/src/radix_sort_parallel.cu
+++ b/src/radix_sort_parallel.cu
@@ -1,142 +1,127 @@
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include "radix_sort_parallel.h"
-
-__global__ void copyArrayParallel(int *arr, int *output, int n) {
-	for (int i = 0; i < n; i++) {
-		arr[i] = output[i];
-	}
+#include <iostream>
+#include <fstream>
+#include <numeric>
+#include "cuda_runtime.h"
+#include "cuda.h"
+#include "curand_kernel.h"
+#include "device_launch_parameters.h"
+#include <time.h>
+
+#define MAX_BLOCK_SIZE 1024
+#define RNG_SEED 13516013
+
+using namespace std;
+
+void print(unsigned int * arr, int n) {
+	for (int i = 0; i < n; i++) 
+		cout << i <<" :"<<arr[i] << " "<<endl;
+	cout << endl; 
 }
 
-__global__ void getMaxParallel(int *arr, int *max, int n) {
-	int maximum = arr[0];
-	for (int i = 0; i < n; i++) {
-		if (arr[i] > maximum) {
-			maximum = arr[i];
+__global__ void getMax(unsigned int * input, int n)
+{
+	const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+	auto step_size = 1;
+	int number_of_threads = n / 2;
+	while (number_of_threads > 0)
+	{
+		if (tid < number_of_threads) // still alive?
+		{
+			const auto fst = tid * step_size * 2;
+			const auto snd = fst + step_size;
+			if(input[fst] < input[snd]){
+				input[fst] = input[snd];
+			}
 		}
+		step_size <<= 1; 
+		number_of_threads >>= 1;
 	}
-	max[0] = maximum;
+	__syncthreads();
 }
 
-int getMax(int arr[], int n) 
-{ 
-	int result;
-    int *max; 
-    int *d_arr, *d_max;
-
-    // Allocate host memory
-    max = (int*)malloc(n * sizeof(int));
-
-    // Initialize host memory
-    for (int i = 0; i < n; i++) {
-    	max[i] = 0;
-    }
-
-    // Allocate device memory
-    cudaMalloc((void**)&d_arr, n * sizeof(int));
-    cudaMalloc((void**)&d_max, n * sizeof(int));
-
-    // Transfer data from host to device memory
-    cudaMemcpy(d_arr, arr, n * sizeof(int), cudaMemcpyHostToDevice);
-
-    // Executing kernel
-    getMaxParallel<<<1,500>>>(d_arr, d_max, n);
-
-    // Transfer data back to host memory
-    cudaMemcpy(max, d_max, n * sizeof(int), cudaMemcpyDeviceToHost);
-
-    // Deallocate device memory
-    cudaFree(d_max);
-    cudaFree(d_arr);
-
-    result = max[0];
-
-    // Deallocate host memory
-    free(max);
-
-    return result;
-} 
-
-void countSort(int arr[], int n, int exp) 
-{ 
-    int *output;
-    int *d_output, *d_arr;
-    int i, count[10] = {0};
-
-    // Allocate host memory
-    output = (int*)malloc(n * sizeof(int));
-
-    for (i = 0; i < n; i++) 
-        count[ (arr[i]/exp)%10 ]++; 
-
-    for (i = 1; i < 10; i++) 
-        count[i] += count[i - 1]; 
-
-    for (i = n - 1; i >= 0; i--) 
-    { 
-        output[count[ (arr[i]/exp)%10 ] - 1] = arr[i]; 
-        count[ (arr[i]/exp)%10 ]--; 
-    } 
-
-    // Allocate device memory
-    cudaMalloc((void**)&d_arr, sizeof(n * sizeof(int)));
-    cudaMalloc((void**)&d_output, sizeof(n * sizeof(int)));
-
-    // Transfer data from host to device memory
-    cudaMemcpy(d_arr, arr, n * sizeof(int), cudaMemcpyHostToDevice);
-
-    // Executing kernel
-    copyArrayParallel<<<1,500>>>(d_arr, d_output, n);
-
-    //Transfer data back to host memory
-    cudaMemcpy(output, d_output, n * sizeof(int), cudaMemcpyDeviceToHost);
-
-    // Deallocate device memory
-    cudaFree(d_arr);
-    cudaFree(d_output);
-
-    // Deallocate host memory
-    free(output);
-} 
-
-void radix_sort(int arr[], int n) 
-{ 
-    int m = getMax(arr, n);
-
-    for (int exp = 1; m/exp > 0; exp *= 10) 
-        countSort(arr, n, exp);
-} 
-
-void print(int arr[], int n) 
-{ 
-    for (int i = 0; i < n; i++) 
-        printf("%d: %d\n",i, arr[i]);
-} 
-
-void rng(int* arr, int n) {
-    int seed = 13516013;
-    srand(seed);
-    for(long i = 0; i < n; i++) {
-        arr[i] = (int)rand();
-    }
+__global__ void storeCount(unsigned int * arr, int n, int * count, unsigned long long exp){
+	const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+	int digit;
+	if(tid < n){
+		digit = (arr[tid] / exp) % 10;
+		atomicAdd(&count[digit], 1);
+	}
 }
-
-int main(int argc, char *argv[]) {
+int main(int argc, char *argv[])
+{
+	srand(time(NULL));
+	curandGenerator_t curand_gen;
+	cudaEvent_t start, stop;
+	float time;
 	int N;
 	if (argc == 2) {
-        N = strtol(argv[1], NULL, 10);
-    } else {
-        printf("ERROR: ./radix_sort <array_length>\n");
-        return 1;
-    }
-	int arr[N];
-	rng(arr,N);
-	clock_t begin = clock();
-    radix_sort(arr, N);
-    clock_t end = clock();
-    double time = (double)(end - begin) * 1000 / CLOCKS_PER_SEC;
-	print(arr,N);
-	printf("Executed in %lf ms\n",time);
+		N = strtol(argv[1], NULL, 10);
+	} else {
+		printf("ERROR: ./radix_sort <array_length>\n");
+		return 1;
+	}
+	cudaEventCreate(&start);
+	cudaEventCreate(&stop);
+	curandCreateGenerator(&curand_gen, CURAND_RNG_PSEUDO_DEFAULT);
+	curandSetPseudoRandomGeneratorSeed(curand_gen, RNG_SEED);
+	const long count = N;
+	unsigned int h_arr[count];
+	unsigned int * d_arr;
+	int grid, block;
+	cudaMalloc(&d_arr, count * sizeof(unsigned int));
+	//rng
+	curandGenerate(curand_gen, d_arr, count);
+	cudaDeviceSynchronize();
+	cudaMemcpy(h_arr, d_arr, count * sizeof(int), cudaMemcpyDeviceToHost);
+	//find max
+	if(count > MAX_BLOCK_SIZE){
+		grid = count / (MAX_BLOCK_SIZE) + 1;
+	}
+	else{
+		grid = 1;
+	}
+	block = MAX_BLOCK_SIZE / 2;
+	getMax<<<grid, block >>>(d_arr, count);
+	unsigned int global_max;
+	cudaMemcpy(&global_max, d_arr, sizeof(int), cudaMemcpyDeviceToHost);
+	//sort
+	cudaEventRecord(start, 0);
+	cudaMemcpy(d_arr, h_arr, count * sizeof(unsigned int), cudaMemcpyHostToDevice);
+	block *= 2;
+	for (unsigned long long exp = 1; global_max / exp > 0; exp *= 10) {
+		unsigned int output[count];
+		int h_count[10] = {0};
+		int * d_count;
+		cudaMalloc(&d_count, sizeof(int) * 10);
+		cudaMemset(d_count, 0, sizeof(int) * 10);
+		storeCount<<<grid, block>>>(d_arr, count, d_count, exp);
+		cudaMemcpy(h_count, d_count, 10 * sizeof(int), cudaMemcpyDeviceToHost);
+		for (int i = 1; i < 10; i++) {
+			h_count[i] += h_count[i - 1];
+		}
+		for (int i = count - 1; i >= 0; i--) { 
+			output[h_count[ (h_arr[i] / exp) % 10 ] - 1] = h_arr[i]; 
+			h_count[ (h_arr[i] / exp) % 10 ]--;
+		}
+		cudaMemcpy(d_arr, output, sizeof(unsigned int) * count, cudaMemcpyHostToDevice);
+		memcpy(h_arr, output, sizeof(unsigned int) * count);
+		cudaFree(d_count);
+		cudaDeviceSynchronize();
+	}
+	cudaMemcpy(h_arr, d_arr, count * sizeof(unsigned int), cudaMemcpyDeviceToHost);
+	cudaEventRecord(stop, 0);
+	cudaEventSynchronize(stop);
+	cudaEventElapsedTime(&time, start, stop);
+	print(h_arr,count);
+	ofstream output("output.txt", std::ios::out | std::ios::trunc);
+	for(int k = 0; k < count; k ++){
+		output << k << ":"<<h_arr[k] << " " ;
+	}
+	output.close();
+	cout << "Executed in " << time * 1000 << " microseconds." << endl;
+	cudaFree(d_arr);
+	curandDestroyGenerator(curand_gen);
+	cudaEventDestroy(start);
+	cudaEventDestroy(stop);
 	return 0;
-}
-
+}
\ No newline at end of file