diff --git a/src/radix_sort_par.cu b/src/radix_sort_par.cu
index 371862304caf572dcc6fcb49c74ac9f51df822e6..ecc6b1814b7c678a37b24bad9fab4e610a324587 100644
--- a/src/radix_sort_par.cu
+++ b/src/radix_sort_par.cu
@@ -49,9 +49,9 @@ __device__ void countSort(int arr[], int n, int exp)
-// The main function to that sorts arr[] of size n using  
+// The main function that sorts arr[] of size n using
 // Radix Sort
 
-__device__ void radixsort(int *arr, int n) 
+__global__ void radixsort(int *arr, int n) 
 { 
-    int *d_arr;
+    //int *d_arr;
 
     // Find the maximum number to know number of digits 
     int m = getMax(arr, n); 
@@ -61,18 +61,18 @@ __device__ void radixsort(int *arr, int n)
     // where i is current digit number 
 
     // allocate device memory
-    cudaMalloc((void**)&d_arr,sizeof(int)*n);
+    //cudaMalloc((void**)&d_arr,sizeof(int)*n);
 
-    cudaMemcpy(d_arr, arr, sizeof(int)*n,cudaMemcpyHostToDevice);
+    //cudaMemcpy(d_arr, arr, sizeof(int)*n,cudaMemcpyHostToDevice);
     for (int exp = 1; m/exp > 0; exp *= 10) 
-        countSort<<<1,1024>>>(d_arr, n, exp); 
-
+        //countSort<<<1,1024>>>(d_arr, n, exp); 
+        countSort(arr,n,exp);
         //transfer data back to host memory
-        cudaMemcpy(arr, d_arr, sizeof(int)*n, cudaMemcpyDeviceToHost);
+        //cudaMemcpy(arr, d_arr, sizeof(int)*n, cudaMemcpyDeviceToHost);
         
     //deallocate device memory
-    cudaFree(d_arr);
-
+    //cudaFree(d_arr);
+    __syncthreads();
 
 } 
   
@@ -109,12 +109,18 @@ int main(int argc, char *argv[])
 { 
     timespec start, stop;
     
+    int *d_arr;
     int n;
     n= atoi(argv[1]);
     int arr[n];
     rng(arr,n);
+
+    cudaMalloc((void**)&d_arr,sizeof(int)*n);
+    cudaMemcpy(d_arr, arr, sizeof(int)*n,cudaMemcpyHostToDevice);
+
     clock_gettime(CLOCK_REALTIME, &start);
-    radixsort(arr,n);
+    radixsort<<<1,32>>>(d_arr,n);
+    cudaMemcpy(arr, d_arr, sizeof(int)*n,cudaMemcpyDeviceToHost);
     clock_gettime(CLOCK_REALTIME, &stop);
     print(arr,n);
     
@@ -123,6 +129,7 @@ int main(int argc, char *argv[])
     printf("\n%d.%09d s\n", duration.tv_sec, duration.tv_nsec);
     
-    //deallocate host memory
+    //deallocate device memory
+    cudaFree(d_arr);
     
     return 0; 
 }