diff --git a/src/radix_sort_parallel.cu b/src/radix_sort_parallel.cu
index 77f98caec6e5c6b9a72a2aaac4a1d6739124be27..1e32101f0afc340de516d29e95c57125733c49c4 100644
--- a/src/radix_sort_parallel.cu
+++ b/src/radix_sort_parallel.cu
@@ -1,115 +1,128 @@
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include "radix_sort_parallel.h"
-
-__global__ void getMax(int *arr, int *max, int n) {
-    int mx = arr[0]; 
-
-    for (int i = 0; i < n; i++) 
-        if (arr[i] > mx) 
-            mx = arr[i];
-    max[0] = mx;
-} 
-
-__global__ void countSort(int *arr, int n, int exp) {
-    int* output = (int*)malloc(n * sizeof(int));
-    int i, count[10] = {0};
-
-    for (i = 0; i < n; i++) 
-        count[ (arr[i]/exp)%10 ]++; 
-
-    for (i = 1; i < 10; i++) 
-        count[i] += count[i - 1]; 
-
-    for (i = n - 1; i >= 0; i--) 
-    { 
-        output[count[ (arr[i]/exp)%10 ] - 1] = arr[i]; 
-        count[ (arr[i]/exp)%10 ]--; 
-    } 
-
-    for (i = 0; i < n; i++) 
-        arr[i] = output[i];
-} 
-
-void radix_sort(int arr[], int n) 
-{ 
-    int *max;
-    int *d_max, *d_arr;
-
-    // Allocate host memory
-    max = (int*)malloc(n * sizeof(int));
-
-    // Allocate device memory
-    cudaMalloc((void**)&d_max, n * sizeof(int));
-    cudaMalloc((void**)&d_arr, n * sizeof(int));
-
-    // Transfer data from host to device
-    cudaMemcpy(d_arr, arr, n * sizeof(int), cudaMemcpyHostToDevice);
-    cudaMemcpy(d_max, max, n * sizeof(int), cudaMemcpyHostToDevice);
-
-    // Executing kernel
-    getMax<<<1, 500>>>(d_arr, d_max, n);
-
-    // Transfer data back to host memory
-    cudaMemcpy(max, d_max, n * sizeof(int), cudaMemcpyDeviceToHost);
-
-    for (int exp = 1; max[0]/exp > 0; exp *= 10) {
-        countSort<<<1, 500>>>(d_arr, n, exp);
-    }
-
-    cudaMemcpy(arr, d_arr, n * sizeof(int), cudaMemcpyDeviceToHost);
-    cudaFree(d_max);
-    cudaFree(d_arr);
-    free(max);
-} 
-
-void print(int arr[], int n) 
-{ 
-    for (int i = 0; i < n; i++) 
-        printf("%d: %d\n",i, arr[i]);
-} 
-
-void rng(int *arr, int n) {
-    int seed = 13516013;
-    srand(seed);
-    for(long i = 0; i < n; i++) {
-        arr[i] = (int)rand();
-    }
+#include <iostream>
+#include <fstream>
+#include <numeric>
+#include "cuda_runtime.h"
+#include "cuda.h"
+#include "curand_kernel.h"
+#include "device_launch_parameters.h"
+#include <time.h>
+
+#define MAX_BLOCK_SIZE 1024
+#define RNG_SEED 13516013
+
+using namespace std;
+
+// Writes every element as "<index> :<value> " on its own line, followed by one empty line.
+void print(unsigned int * arr, int n) {
+	for (int idx = 0; idx < n; ++idx) { cout << idx << " :" << arr[idx] << " " << endl; }
+	cout << endl;
 }
 
-int main(int argc, char *argv[]) {
-    int N;
-    int *arr;
-    int *d_arr;
-
-    if (argc == 2) {
-        N = strtol(argv[1], NULL, 10);
-    } else {
-        printf("ERROR: ./radix_sort <array_length>\n");
-        return 1;
-    }
-
-    // Allocate host memory
-    arr = (int*)malloc(N * sizeof(int));
-
-    // Initialize host memory
-    rng(arr,N);
-
-    // Allocate device memory
-    cudaMalloc((void**)&d_arr, N * sizeof(int));
-
-    // Transfer data from host to device memory
-    cudaMemcpy(d_arr, arr, N * sizeof(int), cudaMemcpyHostToDevice);
-
-    clock_t begin = clock();
-    radix_sort(arr, N);
-    clock_t end = clock();
-    double time = (double)(end - begin) * 1000 / CLOCKS_PER_SEC;
-    print(arr,N);
-    printf("Executed in %lf ms\n",time);
-
-    cudaFree(d_arr);
-    free(arr);
-    return 0;
+// Stores the largest of input[0..n-1] in input[0]; every other element is left unchanged.
+// Works for any 1-D grid and block size; n <= 1 leaves the array untouched. Uses one static __shared__ word.
+__global__ void getMax(unsigned int * input, int n)
+{
+	__shared__ unsigned int block_max; // largest value seen by this block so far
+	if (threadIdx.x == 0) block_max = 0;
+	__syncthreads(); // make the initial value visible before any thread updates it
+	const size_t count = n > 0 ? (size_t)n : 0;
+	const size_t stride = (size_t)gridDim.x * blockDim.x;
+	unsigned int local_max = 0;
+	// Starts at index 1: input[0] is folded in by the final atomicMax, so it is only ever accessed atomically.
+	for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x + 1; i < count; i += stride)
+	{
+		if (input[i] > local_max) local_max = input[i];
+	}
+	// atomicMax with 0 cannot change an unsigned value, so threads that found nothing larger skip it.
+	if (local_max > 0) atomicMax(&block_max, local_max);
+	__syncthreads(); // outside every branch, so all threads of the block reach it
+	if (threadIdx.x == 0 && block_max > 0) atomicMax(&input[0], block_max); // one global atomic per block
 }
 
+// Adds to count[d] the number of elements arr[i], i < n, whose decimal digit (arr[i] / exp) % 10 equals d; one element per thread.
+__global__ void storeCount(unsigned int * arr, int n, int * count, unsigned long long exp){
+	const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+	if (idx >= n) return; // threads past the end of the array have no element to classify
+	const int bucket = (arr[idx] / exp) % 10;
+	// Different threads can land on the same bucket, so the increment has to be atomic.
+	atomicAdd(count + bucket, 1);
+}
+// Exits with exit code 1, naming the failed step, when a CUDA runtime call does not return cudaSuccess.
+static void checkCuda(cudaError_t err, const char * what) {
+	if (err == cudaSuccess) return;
+	cerr << "CUDA error (" << what << "): " << cudaGetErrorString(err) << endl;
+	exit(1);
+}
+// Generates <array_length> random unsigned int values on the device, sorts them with a decimal LSD radix sort
+// (digit histogram on the GPU, stable scatter on the host), prints the result, writes it to test/output.txt and
+// reports the sort time. Returns 0 on success, 1 on a usage, allocation or cuRAND error; CUDA errors exit via checkCuda.
+int main(int argc, char *argv[])
+{
+	const long count = (argc == 2) ? strtol(argv[1], NULL, 10) : 0;
+	// Rejects a missing, non-numeric or non-positive length, and one too large for the int kernel parameters.
+	if (count <= 0 || (long)(int)count != count) {
+		printf("ERROR: ./radix_sort <array_length>\n");
+		return 1;
+	}
+	const size_t bytes = (size_t)count * sizeof(unsigned int);
+	// Heap buffers: the length comes from the command line, so stack arrays of that size could overflow the stack.
+	unsigned int * h_arr = (unsigned int *)malloc(bytes);
+	unsigned int * h_out = (unsigned int *)malloc(bytes); // scatter target of each pass
+	if (h_arr == NULL || h_out == NULL) { cerr << "ERROR: out of host memory" << endl; return 1; }
+	cudaEvent_t start, stop;
+	checkCuda(cudaEventCreate(&start), "cudaEventCreate start");
+	checkCuda(cudaEventCreate(&stop), "cudaEventCreate stop");
+	unsigned int * d_arr;
+	int * d_count; // digit histogram, allocated once and reused by every pass
+	checkCuda(cudaMalloc(&d_arr, bytes), "cudaMalloc d_arr");
+	checkCuda(cudaMalloc(&d_count, 10 * sizeof(int)), "cudaMalloc d_count");
+	curandGenerator_t curand_gen; //rng
+	if (curandCreateGenerator(&curand_gen, CURAND_RNG_PSEUDO_DEFAULT) != CURAND_STATUS_SUCCESS ||
+		curandSetPseudoRandomGeneratorSeed(curand_gen, RNG_SEED) != CURAND_STATUS_SUCCESS ||
+		curandGenerate(curand_gen, d_arr, count) != CURAND_STATUS_SUCCESS) {
+		cerr << "ERROR: cuRAND could not generate the input" << endl;
+		return 1;
+	}
+	checkCuda(cudaDeviceSynchronize(), "curandGenerate");
+	checkCuda(cudaMemcpy(h_arr, d_arr, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy random input");
+	//find max: getMax works in place on d_arr, which is why d_arr is reloaded from h_arr before sorting
+	const int grid = (int)((count + MAX_BLOCK_SIZE - 1) / MAX_BLOCK_SIZE); // ceil-div: MAX_BLOCK_SIZE elements per block
+	getMax<<<grid, MAX_BLOCK_SIZE / 2>>>(d_arr, (int)count);
+	checkCuda(cudaGetLastError(), "getMax launch");
+	unsigned int global_max;
+	checkCuda(cudaMemcpy(&global_max, d_arr, sizeof(global_max), cudaMemcpyDeviceToHost), "cudaMemcpy max");
+	//sort
+	checkCuda(cudaEventRecord(start, 0), "cudaEventRecord start");
+	checkCuda(cudaMemcpy(d_arr, h_arr, bytes, cudaMemcpyHostToDevice), "cudaMemcpy restore input");
+	for (unsigned long long exp = 1; global_max / exp > 0; exp *= 10) {
+		int h_count[10];
+		checkCuda(cudaMemset(d_count, 0, sizeof(h_count)), "cudaMemset d_count");
+		storeCount<<<grid, MAX_BLOCK_SIZE>>>(d_arr, (int)count, d_count, exp);
+		checkCuda(cudaGetLastError(), "storeCount launch");
+		checkCuda(cudaMemcpy(h_count, d_count, sizeof(h_count), cudaMemcpyDeviceToHost), "cudaMemcpy counts");
+		for (int d = 1; d < 10; d++) h_count[d] += h_count[d - 1]; // prefix sum: h_count[d] = one past digit d's last output slot
+		// Backward walk keeps elements with equal digits in their previous order; later passes rely on that stability.
+		for (long i = count - 1; i >= 0; i--) h_out[--h_count[(h_arr[i] / exp) % 10]] = h_arr[i];
+		unsigned int * sorted = h_out; h_out = h_arr; h_arr = sorted; // swap buffers instead of copying the pass back
+		checkCuda(cudaMemcpy(d_arr, h_arr, bytes, cudaMemcpyHostToDevice), "cudaMemcpy pass result");
+	}
+	checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord stop");
+	checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize");
+	float elapsed_ms;
+	checkCuda(cudaEventElapsedTime(&elapsed_ms, start, stop), "cudaEventElapsedTime");
+	print(h_arr, (int)count);
+	ofstream out_file("test/output.txt", std::ios::out | std::ios::trunc); // forward slash: a backslash before 'o' is not a valid escape sequence
+	if (!out_file) cerr << "WARNING: cannot open test/output.txt" << endl;
+	for (long k = 0; k < count; k++) out_file << k << ":" << h_arr[k] << " ";
+	out_file.close();
+	cout << "Executed in " << elapsed_ms * 1000 << " microseconds." << endl;
+	cudaFree(d_count);
+	cudaFree(d_arr);
+	curandDestroyGenerator(curand_gen);
+	cudaEventDestroy(start);
+	cudaEventDestroy(stop);
+	free(h_out);
+	free(h_arr);
+	return 0;
+}
+
diff --git a/src/radix_sort_parallel.h b/src/radix_sort_parallel.h
deleted file mode 100644
index b2477e55f5cb5350db3a7bf1605b94b9c5de8cc8..0000000000000000000000000000000000000000
--- a/src/radix_sort_parallel.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <omp.h>
-#include <time.h>
-#include <math.h>
-#include <assert.h>
-#include <cuda.h>
-#include <cuda_runtime.h>
-#define t 8
-
-void rng(int* arr, int n);
-void radix_sort(int arr[], int n);
-void print(int arr[], int n) ;
\ No newline at end of file