diff --git a/src/parallel_radix.cu b/src/parallel_radix.cu
index 9e7a15963f48ecbf640e39fbe06bc4eb4a1cc247..25b778ad130f5fc990c1aed6ac270b029c3797ab 100644
--- a/src/parallel_radix.cu
+++ b/src/parallel_radix.cu
@@ -1,125 +1,131 @@
-#include <iostream>
-#include <fstream>
-#include <numeric>
-#include "cuda_runtime.h"
-#include "cuda.h"
-#include "curand_kernel.h"
-#include "device_launch_parameters.h"
-
-#define ARRAY_SIZE 100000
-#define MAX_BLOCK_SIZE 1024
-#define RNG_SEED 13516120
-#define BASE 10
-
-
-using namespace std;
-
-void printArray(unsigned int * arr, int n) {
-    for (int i = 0; i < n; i++) 
-        cout << arr[i] << " ";
-    cout << endl; 
-}
-
-__global__ void get_max(unsigned int * input, int n)
-{
-	const int tid = blockDim.x * blockIdx.x + threadIdx.x;
-	auto step_size = 1;
-	int number_of_threads = n / 2;
-	while (number_of_threads > 0)
-	{
-		if (tid < number_of_threads) // still alive?
-		{
-			const auto fst = tid * step_size * 2;
-			const auto snd = fst + step_size;
-			if(input[fst] < input[snd]){
-				input[fst] = input[snd];
-			}
-		}
-		step_size <<= 1; 
-		number_of_threads >>= 1;
-    }
-    __syncthreads();
-}
-
-__global__ void count_occurences(unsigned int * arr, int n, int * count, unsigned long long exp){
-    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
-    int digit;
-    if(tid < n){
-        digit = (arr[tid] / exp) % BASE;
-        atomicAdd(&count[digit], 1);
-    }
-}
-int main(int argc, char *argv[])
-{
-    curandGenerator_t curand_gen;
-    cudaEvent_t start, stop;
-    float time;
-    cudaEventCreate(&start);
-    cudaEventCreate(&stop);
-    curandCreateGenerator(&curand_gen, CURAND_RNG_PSEUDO_DEFAULT);
-    curandSetPseudoRandomGeneratorSeed(curand_gen, RNG_SEED);
-    const long count = ARRAY_SIZE;
-    unsigned int deviceArr[count];
-    unsigned int * darr;
-    int grid, block;
-    cudaMalloc(&darr, count * sizeof(unsigned int));
-    
-    
-    curandGenerate(curand_gen, darr, count);
-    cudaDeviceSynchronize();
-    cudaMemcpy(deviceArr, darr, count * sizeof(int), cudaMemcpyDeviceToHost);
-    //find max
-    if(count > MAX_BLOCK_SIZE){
-        grid = count / (MAX_BLOCK_SIZE);
-    }
-    else{
-        grid = 1;
-    }
-    block = MAX_BLOCK_SIZE / 2;
-	get_max <<<grid, block >>>(darr, count);
-	unsigned int global_max;
-    cudaMemcpy(&global_max, darr, sizeof(int), cudaMemcpyDeviceToHost);
-    
-    
-    
-    cudaEventRecord(start, 0);
-    cudaMemcpy(darr, deviceArr, count * sizeof(unsigned int), cudaMemcpyHostToDevice);
-    block *= 2;
-    for (unsigned long long exp = 1; global_max / exp > 0; exp *= BASE) {
-        unsigned int output[count];
-        int hcount[BASE] = {0};
-        int * dcount;
-        cudaMalloc(&dcount, sizeof(int) * BASE);
-        cudaMemset(dcount, 0, sizeof(int) * BASE);
-        count_occurences <<<grid, block>>>(darr, count, dcount, exp);
-        cudaMemcpy(hcount, dcount, BASE * sizeof(int), cudaMemcpyDeviceToHost);
-        for (int i = 1; i < BASE; i++) {
-            hcount[i] += hcount[i - 1];
-        }
-        for (int i = count - 1; i >= 0; i--) { 
-            output[hcount[ (deviceArr[i] / exp) % BASE ] - 1] = deviceArr[i]; 
-            hcount[ (deviceArr[i] / exp) % BASE ]--;
-        }
-        cudaMemcpy(darr, output, sizeof(unsigned int) * count, cudaMemcpyHostToDevice);
-        memcpy(deviceArr, output, sizeof(unsigned int) * count);
-        cudaFree(dcount);
-        cudaDeviceSynchronize();
-    }
-
-    cudaMemcpy(deviceArr, darr, count * sizeof(unsigned int), cudaMemcpyDeviceToHost);
-    cudaEventRecord(stop, 0);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&time, start, stop);
-    ofstream out("test/output", std::ios::out | std::ios::trunc);
-    for(int k = 0; k < count; k ++){
-        out << deviceArr[k] << endl ;
-    }
-
-    out.close();
-    cout << "Time =  " << time * 1000 << " ms" << endl;
-    cudaFree(darr);
-    curandDestroyGenerator(curand_gen);
-    cudaEventDestroy(start);
-    cudaEventDestroy(stop);
-	return 0;
+#include <iostream>
+#include <fstream>
+#include <numeric>
+#include "cuda_runtime.h"
+#include "cuda.h"
+#include "curand_kernel.h"
+#include "device_launch_parameters.h"
+
+#define ARRAY_SIZE 100000
+#define MAX_BLOCK_SIZE 1024
+#define RNG_SEED 13516120
+#define BASE 10
+
+
+using namespace std;
+
+
+__global__ void get_max(unsigned int * input, int n)
+{
+	const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+	auto step_size = 1;
+	int number_of_threads = n / 2;
+	while (number_of_threads > 0)
+	{
+		if (tid < number_of_threads) // still alive?
+		{
+			const auto fst = tid * step_size * 2;
+			const auto snd = fst + step_size;
+			if(input[fst] < input[snd]){
+				input[fst] = input[snd];
+			}
+		}
+		step_size <<= 1; 
+		number_of_threads >>= 1;
+    }
+    __syncthreads();
+}
+
+
+__global__ void count_occurences(unsigned int * arr, int n, int * count, unsigned long long exp){
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    int digit;
+    if(tid < n){
+        digit = (arr[tid] / exp) % BASE;
+        atomicAdd(&count[digit], 1);
+    }
+}
+
+
+int main(int argc, char *argv[])
+{
+    curandGenerator_t curand_gen;
+    cudaEvent_t start, stop;
+    float time;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+    curandCreateGenerator(&curand_gen, CURAND_RNG_PSEUDO_DEFAULT);
+    curandSetPseudoRandomGeneratorSeed(curand_gen, RNG_SEED);
+    const long count = ARRAY_SIZE;
+    unsigned int hostArr[count];
+    unsigned int * deviceArr;
+    int grid, block;
+    cudaMalloc(&deviceArr, count * sizeof(unsigned int));
+    
+    
+    //generate random number
+    curandGenerate(curand_gen, deviceArr, count);
+    cudaDeviceSynchronize();
+    cudaMemcpy(hostArr, deviceArr, count * sizeof(int), cudaMemcpyDeviceToHost);
+    
+    
+    //find max
+    if(count > MAX_BLOCK_SIZE){
+        grid = count / (MAX_BLOCK_SIZE);
+    }
+    else{
+        grid = 1;
+    }
+    block = MAX_BLOCK_SIZE / 2;
+	get_max <<<grid, block >>>(deviceArr, count);
+	unsigned int global_max;
+    cudaMemcpy(&global_max, deviceArr, sizeof(int), cudaMemcpyDeviceToHost);
+    
+    
+    //start sorting
+    cudaEventRecord(start, 0);
+    cudaMemcpy(deviceArr, hostArr, count * sizeof(unsigned int), cudaMemcpyHostToDevice);
+    block *= 2;
+    for (unsigned long long exp = 1; global_max / exp > 0; exp *= BASE) {
+        unsigned int output[count];
+        int hcount[BASE] = {0};
+        int * dcount;
+        cudaMalloc(&dcount, sizeof(int) * BASE);
+        cudaMemset(dcount, 0, sizeof(int) * BASE);
+        count_occurences <<<grid, block>>>(deviceArr, count, dcount, exp);
+        cudaMemcpy(hcount, dcount, BASE * sizeof(int), cudaMemcpyDeviceToHost);
+        for (int i = 1; i < BASE; i++) {
+            hcount[i] += hcount[i - 1];
+        }
+        for (int i = count - 1; i >= 0; i--) { 
+            output[hcount[ (hostArr[i] / exp) % BASE ] - 1] = hostArr[i]; 
+            hcount[ (hostArr[i] / exp) % BASE ]--;
+        }
+        cudaMemcpy(deviceArr, output, sizeof(unsigned int) * count, cudaMemcpyHostToDevice);
+        memcpy(hostArr, output, sizeof(unsigned int) * count);
+        cudaFree(dcount);
+        cudaDeviceSynchronize();
+    }
+    cudaMemcpy(hostArr, deviceArr, count * sizeof(unsigned int), cudaMemcpyDeviceToHost);
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&time, start, stop);
+    
+    
+    //write to file
+    ofstream out("test/output", std::ios::out | std::ios::trunc);
+    for(int k = 0; k < count; k ++){
+        out << hostArr[k] << endl ;
+    }
+    out.close();
+    
+    
+    cout << "Time =  " << time * 1000 << " ms" << endl;
+    
+    
+    cudaFree(deviceArr);
+    curandDestroyGenerator(curand_gen);
+    cudaEventDestroy(start);
+    cudaEventDestroy(stop);
+	return 0;
 }
\ No newline at end of file