diff --git a/src/radixsort_parallel.cu b/src/radixsort_parallel.cu
index 2dabb613902093ba92c1e9c866c66619525b3ac3..0181a0b79a20c581da5ebacfbbc453a318c13d51 100644
--- a/src/radixsort_parallel.cu
+++ b/src/radixsort_parallel.cu
@@ -3,14 +3,17 @@
 #include "radix_sort_parallel.h"
 
 __global__ void getMax(int *arr, int *max, int n) {
-    //int index = threadIdx.x;
-    //int stride = blockDim.x;
-    int mx = arr[0]; 
+    // Each thread scans a strided slice of the array for its local maximum.
+    int index = threadIdx.x;
+    int stride = blockDim.x;
+    if (index >= n) return;  // more threads may be launched than there are elements
+    int mx = arr[index];
 
-    for (int i = 0; i < n; i++) 
+    for (int i = index + stride; i < n; i += stride)
         if (arr[i] > mx) 
             mx = arr[i];
-    max[0] = mx;
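+    // Publish this thread's partial maximum; the host reduces these afterwards.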
+    max[index] = mx;
 } 
 
 __global__ void countSort(int *arr, int n, int exp) {
@@ -39,21 +39,32 @@ void radix_sort(int arr[], int n)
     int *d_max, *d_arr;
 
     // Allocate host memory
-    max = (int*)malloc(1 * sizeof(int));
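+    // Buffer for the per-thread partial maxima computed by getMax.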
+    max = (int*)malloc(n * sizeof(int));
 
     // Allocate device memory
-    cudaMalloc((void**)&d_max, 1 * sizeof(int));
+    cudaMalloc((void**)&d_max, n * sizeof(int));
     cudaMalloc((void**)&d_arr, n * sizeof(int));
 
     // Transfer data from host to device
     cudaMemcpy(d_arr, arr, n * sizeof(int), cudaMemcpyHostToDevice);
-    cudaMemcpy(d_max, max, sizeof(int) * 1, cudaMemcpyHostToDevice);
+    cudaMemcpy(d_max, max, n * sizeof(int), cudaMemcpyHostToDevice);
 
     // Executing kernel
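+    // One block of 500 threads; each thread reduces a strided slice of arr.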
     getMax<<<1, 500>>>(d_arr, d_max, n);
 
     // Transfer data back to host memory
-    cudaMemcpy(max, d_max, 1 * sizeof(int), cudaMemcpyDeviceToHost);
+    cudaMemcpy(max, d_max, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+    // Reduce the per-thread partial maxima on the host. Only the first
+    // min(n, 500) entries were written by the kernel's 500 threads.
+    int maks = max[0];
+    for (int i = 1; i < n && i < 500; i++) {
+        if (maks < max[i]) {
+            maks = max[i];
+        }
+    }
 
-    for (int exp = 1; max[0]/exp > 0; exp *= 10) {
+    for (int exp = 1; maks / exp > 0; exp *= 10) {
         countSort<<<1, 500>>>(d_arr, n, exp);