diff --git a/src/radix_sort_parallel.cu b/src/radix_sort_parallel.cu
index ffa62952d4481ed186b64c4aa1d796ab7970d691..7341405c9c97e0ac1521d5e88454f7850314f40d 100644
--- a/src/radix_sort_parallel.cu
+++ b/src/radix_sort_parallel.cu
@@ -3,20 +3,14 @@
 #include "radix_sort_parallel.h"
 
 __global__ void copyArrayParallel(int *arr, int *output, int n) {
-	int index = threadIdx.x;
-	int stride = blockDim.x;
-
-	for (int i = index+stride; i < n; i+=stride) {
+	for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x) { // grid-stride loop: each element of output is copied into arr by exactly one thread
 		arr[i] = output[i];
 	}
 }
 
 __global__ void getMaxParallel(int *arr, int *max, int n) {
-	int index = threadIdx.x;
-	int stride = blockDim.x;
-
-	int maximum = arr[index];
-	for (int i = index+stride; i < n; i+=stride) {
+	int maximum = arr[0];
+	for (int i = 0; i < n; i++) {
 		if (arr[i] > maximum) {
 			maximum = arr[i];
 		}
diff --git a/src/radix_sort_parallel.h b/src/radix_sort_parallel.h
index 76c36e31a7ae88ddbe6c3829a242b893aeb72d8f..b2477e55f5cb5350db3a7bf1605b94b9c5de8cc8 100644
--- a/src/radix_sort_parallel.h
+++ b/src/radix_sort_parallel.h
@@ -10,5 +10,4 @@
 
 void rng(int* arr, int n);
 void radix_sort(int arr[], int n);
-void print(int arr[], int n) ;
-void countSort(int arr[], int n, int exp);
\ No newline at end of file
+void print(int arr[], int n) ;
\ No newline at end of file
diff --git a/src/radixsort_parallel.cu b/src/radixsort_parallel.cu
index 00f142434afa356d3a17eebb506eaf92f47e9168..2dabb613902093ba92c1e9c866c66619525b3ac3 100644
--- a/src/radixsort_parallel.cu
+++ b/src/radixsort_parallel.cu
@@ -13,20 +13,10 @@ __global__ void getMax(int *arr, int *max, int n) {
     max[0] = mx;
 } 
 
-__global__ void copyArrayParallel(int *arr, int *output, int n) {
-    for (int i = 0; i < n; i++) {
-        arr[i] = output[i];
-    }
-}
-
-void countSort(int arr[], int n, int exp) {
-    int *output;
-    int *d_output, *d_arr;
+__global__ void countSort(int *arr, int n, int exp) {
+    int* output = (int*)malloc(n * sizeof(int));
     int i, count[10] = {0};
 
-    // Allocate host memory
-    output = (int*)malloc(n * sizeof(int));
-
     for (i = 0; i < n; i++) 
         count[ (arr[i]/exp)%10 ]++; 
 
@@ -39,25 +29,10 @@ void countSort(int arr[], int n, int exp) {
         count[ (arr[i]/exp)%10 ]--; 
     } 
 
-    // Allocate device memory
-    cudaMalloc((void**)&d_arr, sizeof(n * sizeof(int)));
-    cudaMalloc((void**)&d_output, sizeof(n * sizeof(int)));
-
-    // Transfer data from host to device memory
-    cudaMemcpy(d_arr, arr, n * sizeof(int), cudaMemcpyHostToDevice);
-
-    // Executing kernel
-    copyArrayParallel<<<1,500>>>(d_arr, d_output, n);
-
-    //Transfer data back to host memory
-    cudaMemcpy(output, d_output, n * sizeof(int), cudaMemcpyDeviceToHost);
-
-    // Deallocate device memory
-    cudaFree(d_arr);
-    cudaFree(d_output);
-
-    // Deallocate host memory
-    free(output);
+    for (i = 0; i < n; i++) 
+        arr[i] = output[i];
+    // Release the scratch buffer obtained with in-kernel malloc; device-heap memory is not reclaimed when the kernel exits
+    free(output);
 } 
 
 void radix_sort(int arr[], int n) 
@@ -83,7 +56,7 @@ void radix_sort(int arr[], int n)
     cudaMemcpy(max, d_max, 1 * sizeof(int), cudaMemcpyDeviceToHost);
 
     for (int exp = 1; max[0]/exp > 0; exp *= 10) {
-        countSort(d_arr, n, exp);
+        countSort<<<1, 1>>>(d_arr, n, exp); // countSort is a serial kernel: more than one thread would race on d_arr
     }
 
     cudaMemcpy(arr, d_arr, n * sizeof(int), cudaMemcpyDeviceToHost);