diff --git a/src/radix_sort_parallel.cu b/src/radix_sort_parallel.cu
index d7c7fdc0aa3d316f9be715482399df8500e51dc6..ffa62952d4481ed186b64c4aa1d796ab7970d691 100644
--- a/src/radix_sort_parallel.cu
+++ b/src/radix_sort_parallel.cu
@@ -6,7 +6,7 @@ __global__ void copyArrayParallel(int *arr, int *output, int n) {
 	int index = threadIdx.x;
 	int stride = blockDim.x;
 
-	for (int i = index; i < n; i+=stride) {
+	for (int i = index; i < n; i+=stride) {  // start at this thread's own index; index+stride would leave arr[0..stride) uncopied
 		arr[i] = output[i];
 	}
 }
diff --git a/src/radixsort_parallel.cu b/src/radixsort_parallel.cu
index 8b1eb8375aec022d7e07b796784171ffd7ef7d8e..005a9d9b4d70b5d40b6d14c0f8a1d77b63913724 100644
--- a/src/radixsort_parallel.cu
+++ b/src/radixsort_parallel.cu
@@ -3,20 +3,30 @@
 #include "radix_sort_parallel.h"
 
 __global__ void getMax(int *arr, int *max, int n) {
-    int index = threadIdx.x;
-    int stride = blockDim.x;
-    int mx = arr[index]; 
+    // Every launched thread scans all n elements on its own and stores the
+    // largest value in max[0]; arr[0] is read unconditionally, so n >= 1 is assumed.
+    int mx = arr[0]; 
 
-    for (int i = index+stride; i < n; i+=stride) 
+    for (int i = 0; i < n; i++) 
         if (arr[i] > mx) 
             mx = arr[i];
-    max[index] = mx;
+    max[0] = mx;
 } 
 
-__global__ void countSort(int *arr, int n, int exp) {
-    int* output = (int*)malloc(n * sizeof(int));
+__global__ void copyArrayParallel(int *arr, int *output, int n) {
+    // Copy output[0..n) into arr[0..n); the launched threads split the indices in a grid-stride loop.
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x)
+        arr[i] = output[i];
+}
+
+void countSort(int arr[], int n, int exp) {
+    int *output;
+    int *d_output, *d_arr;
     int i, count[10] = {0};
 
+    // Host scratch buffer; its contents are copied into arr before it is freed
+    output = (int*)malloc(n * sizeof(int));
+
     for (i = 0; i < n; i++) 
         count[ (arr[i]/exp)%10 ]++; 
 
@@ -29,8 +39,25 @@ __global__ void countSort(int *arr, int n, int exp) {
         count[ (arr[i]/exp)%10 ]--; 
     } 
 
-    for (i = 0; i < n; i++) 
-        arr[i] = output[i];
+    // Copy output back into arr via the device: upload output to d_output,
+    // let the kernel fill d_arr from it, then download d_arr into arr.
+    size_t bytes = (size_t)n * sizeof(int);
+    d_arr = d_output = NULL;
+    cudaError_t err = cudaMalloc((void**)&d_arr, bytes);
+    if (err == cudaSuccess) err = cudaMalloc((void**)&d_output, bytes);
+    if (err == cudaSuccess) err = cudaMemcpy(d_output, output, bytes, cudaMemcpyHostToDevice);
+    if (err == cudaSuccess) {
+        copyArrayParallel<<<1,500>>>(d_arr, d_output, n);
+        err = cudaGetLastError();  // a kernel launch does not return its own error
+        if (err == cudaSuccess) err = cudaMemcpy(arr, d_arr, bytes, cudaMemcpyDeviceToHost);
+    }
+    if (err != cudaSuccess)  // device path failed: do the copy on the host instead
+        for (i = 0; i < n; i++) arr[i] = output[i];
+
+    // cudaFree(NULL) is a no-op, so this is safe after a failed allocation.
+    cudaFree(d_arr);
+    cudaFree(d_output);
+    free(output);
 } 
 
 void radix_sort(int arr[], int n)