diff --git a/src/radix_sort_par.cu b/src/radix_sort_par.cu
index ecc6b1814b7c678a37b24bad9fab4e610a324587..a4613c255805dc63210ca3a2a134675fe79e28dd 100644
--- a/src/radix_sort_par.cu
+++ b/src/radix_sort_par.cu
@@ -18,14 +18,14 @@ __device__ int getMax(int arr[], int n)
 // the digit represented by exp. 
 __device__ void countSort(int arr[], int n, int exp) 
 { 
-	int index = threadIdx.x;
-    int stride = blockDim.x;
+//	int index = threadIdx.x;
+//    int stride = blockDim.x;
 
     int *output= (int*)malloc(sizeof(int)*n); // output array 
     int i, count[10] = {0}; 
   
     // Store count of occurrences in count[] 
-    for (i = index; i < n; i+=stride) 
+    for (i = 0; i < n; i++) 
         count[ (arr[i]/exp)%10 ]++; 
   
     // Change count[i] so that count[i] now contains actual 
@@ -34,7 +34,7 @@ __device__ void countSort(int arr[], int n, int exp)
         count[i] += count[i - 1]; 
   
     // Build the output array 
-    for (i = n - 1; i >= index; i-=stride) 
+    for (i = n - 1; i >= 0; i--) 
     { 
         output[count[ (arr[i]/exp)%10 ] - 1] = arr[i]; 
         count[ (arr[i]/exp)%10 ]--; 
@@ -42,7 +42,7 @@ __device__ void countSort(int arr[], int n, int exp)
   
     // Copy the output array to arr[], so that arr[] now 
     // contains sorted numbers according to current digit 
-    for (i = index; i < n; i+=stride) 
+    for (i = 0; i < n; i++) 
         arr[i] = output[i]; 
 } 
   
diff --git a/src/radix_sort_par3.cu b/src/radix_sort_par3.cu
new file mode 100644
index 0000000000000000000000000000000000000000..27c479204fe96950cf7b516bf2399bbd536d2a79
--- /dev/null
+++ b/src/radix_sort_par3.cu
@@ -0,0 +1,135 @@
+// C++ implementation of Radix Sort 
+#include <iostream> 
+#include <cstdlib>
+#include <time.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+using namespace std; 
+  
+// Returns the largest value in arr[0..n-1].
+//   arr: host array holding at least n elements.
+//   n:   number of elements to inspect.
+// An empty range (n <= 0) yields 0 instead of reading arr[0], which would be
+// an out-of-bounds access for an empty array.
+int getMax(int arr[], int n)
+{
+    if (n <= 0)
+        return 0;
+
+    int mx = arr[0];
+    for (int i = 1; i < n; i++)
+        if (arr[i] > mx)
+            mx = arr[i];
+    return mx;
+}
+  
+// Kernel: for every element of arr[0..n-1], adds one to the bucket of count[]
+// selected by the decimal digit (arr[i] / exp) % 10.
+//   count: device array of 10 ints. It is accumulated into, not cleared, so
+//          the caller must zero it before the launch.
+//   arr:   device array of n ints. A negative value would produce a negative
+//          bucket index, so the values are assumed to be non-negative.
+//   n:     number of elements in arr.
+//   exp:   divisor selecting the digit (1, 10, 100, ...).
+// The loop strides by the total number of launched threads, so any 1-D
+// grid/block configuration visits every element exactly once.
+__global__ void storeCount(int *count, int *arr, int n, int exp){
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    int stride = gridDim.x * blockDim.x;
+
+    for (int i = index; i < n; i += stride) {
+        // Many threads hit the same bucket; a plain ++ is an unsynchronised
+        // read-modify-write that silently loses increments.
+        atomicAdd(&count[(arr[i] / exp) % 10], 1);
+    }
+}
+
+// Prints a diagnostic and terminates the program when a CUDA runtime call
+// did not return cudaSuccess.
+//   err:  status returned by the call.
+//   what: short description of the call, used in the message.
+static void checkCuda(cudaError_t err, const char *what)
+{
+    if (err != cudaSuccess) {
+        cerr << "CUDA error in " << what << ": "
+             << cudaGetErrorString(err) << "\n";
+        exit(EXIT_FAILURE);
+    }
+}
+
+// Performs one stable counting-sort pass over arr[0..n-1], keyed on the
+// decimal digit (arr[i] / exp) % 10. The digit histogram is computed on the
+// GPU by storeCount; the prefix sum and the scatter run on the host.
+//   arr: host array of n ints, reordered in place. Values are assumed to be
+//        non-negative (a negative value gives a negative digit index).
+//   n:   number of elements; nothing is done when n <= 0.
+//   exp: divisor selecting the digit (1, 10, 100, ...).
+// Any CUDA or allocation failure terminates the program with a message.
+void countSort(int arr[], int n, int exp)
+{
+    if (n <= 0)
+        return;
+
+    const int kBuckets = 10;
+    const size_t arrBytes = sizeof(int) * (size_t)n;
+    const size_t countBytes = sizeof(int) * kBuckets;
+
+    int *output = (int*)malloc(arrBytes); // output array
+    if (output == NULL) {
+        cerr << "countSort: out of host memory\n";
+        exit(EXIT_FAILURE);
+    }
+
+    int *d_arr = NULL;
+    int *d_count = NULL;
+    int h_count[kBuckets] = {0};
+
+    checkCuda(cudaMalloc((void**)&d_arr, arrBytes), "cudaMalloc(d_arr)");
+    // The histogram the kernel writes must be a device allocation; device
+    // code must not be handed the address of a host stack array.
+    checkCuda(cudaMalloc((void**)&d_count, countBytes), "cudaMalloc(d_count)");
+    checkCuda(cudaMemcpy(d_arr, arr, arrBytes, cudaMemcpyHostToDevice),
+              "cudaMemcpy(d_arr)");
+    checkCuda(cudaMemset(d_count, 0, countBytes), "cudaMemset(d_count)");
+
+    // Store count of occurrences in d_count[] (single block of 32 threads)
+    storeCount<<<1,32>>>(d_count, d_arr, n, exp);
+    checkCuda(cudaGetLastError(), "storeCount launch");
+
+    // Blocking copy: waits for the kernel and surfaces its execution errors.
+    // The size argument is in bytes, hence countBytes and not the bucket count.
+    checkCuda(cudaMemcpy(h_count, d_count, countBytes, cudaMemcpyDeviceToHost),
+              "cudaMemcpy(h_count)");
+
+    checkCuda(cudaFree(d_arr), "cudaFree(d_arr)");
+    checkCuda(cudaFree(d_count), "cudaFree(d_count)");
+
+    // Change h_count[i] so that h_count[i] now contains the position just
+    // past the last element with digit i in output[]
+    for (int i = 1; i < kBuckets; i++)
+        h_count[i] += h_count[i - 1];
+
+    // The buckets must account for every element; otherwise the scatter
+    // below would index output[] out of bounds.
+    if (h_count[kBuckets - 1] != n) {
+        cerr << "countSort: digit histogram does not add up to n\n";
+        free(output);
+        exit(EXIT_FAILURE);
+    }
+
+    // Build the output array; walking backwards keeps the pass stable
+    for (int i = n - 1; i >= 0; i--)
+    {
+        int digit = (arr[i] / exp) % 10;
+        output[h_count[digit] - 1] = arr[i];
+        h_count[digit]--;
+    }
+
+    // Copy the output array to arr[], so that arr[] now
+    // contains sorted numbers according to current digit
+    for (int i = 0; i < n; i++)
+        arr[i] = output[i];
+
+    free(output);
+}
+  
+// Sorts arr[0..n-1] in ascending order using least-significant-digit radix
+// sort: one countSort pass per decimal digit of the largest element.
+//   arr: host array of n ints, sorted in place.
+//   n:   number of elements; nothing is done when n <= 0.
+// When the largest element is <= 0 no pass runs and arr is left unchanged.
+void radixsort(int *arr, int n)
+{
+    if (n <= 0)
+        return;
+
+    int m = getMax(arr, n);
+
+    // Do counting sort for every digit. Note that instead
+    // of passing digit number, exp is passed. exp is 10^i
+    // where i is current digit number.
+    //
+    // exp is kept in 64 bits: for m >= 10^9 the final "exp *= 10" would
+    // exceed INT_MAX, and signed int overflow is undefined behaviour. Every
+    // value handed to countSort satisfies exp <= m, so the cast is lossless.
+    for (long long exp = 1; m / exp > 0; exp *= 10)
+        countSort(arr, n, (int)exp);
+}
+  
+// Writes the first n elements of arr to standard output, one value per line.
+void print(int arr[], int n){
+    int idx = 0;
+    while (idx < n) {
+        cout << arr[idx] << "\n";
+        ++idx;
+    }
+}
+  
+// Fills arr[0..n-1] with values drawn from rand(). The generator is reseeded
+// with the same fixed seed on every call, so each call restarts the same
+// pseudo-random sequence.
+void rng(int* arr,int n){
+    const int kSeed = 13516067;
+    srand(kSeed);
+
+    long pos = 0;
+    while (pos < n) {
+        arr[pos] = (int) rand();
+        ++pos;
+    }
+}
+
+ 
+// Returns end - start as a timespec. When the nanosecond field of end is
+// smaller than that of start, one second is borrowed from the seconds field
+// and added to the nanoseconds as 1000000000.
+timespec diff(timespec start, timespec end)
+{
+    timespec elapsed;
+    elapsed.tv_sec = end.tv_sec - start.tv_sec;
+    elapsed.tv_nsec = end.tv_nsec - start.tv_nsec;
+
+    if (elapsed.tv_nsec < 0) {
+        elapsed.tv_sec -= 1;
+        elapsed.tv_nsec += 1000000000;
+    }
+    return elapsed;
+}
+
+// Driver: reads the element count n from argv[1], fills an array of n ints
+// with rng, sorts it with radixsort, prints the sorted values and then the
+// wall-clock time spent inside radixsort as "<seconds>.<nanoseconds> s".
+// Returns 0 on success and 1 on a usage or allocation error.
+int main(int argc, char *argv[])
+{
+    if (argc < 2) {
+        cerr << "usage: <program> <number of elements>\n";
+        return 1;
+    }
+
+    int n = atoi(argv[1]);
+    if (n <= 0) {
+        cerr << "number of elements must be a positive integer\n";
+        return 1;
+    }
+
+    // Heap allocation: a variable-length array on the stack overflows the
+    // stack for large n (and is not standard C++).
+    int *arr = (int*)malloc(sizeof(int) * (size_t)n);
+    if (arr == NULL) {
+        cerr << "out of host memory\n";
+        return 1;
+    }
+    rng(arr,n);
+
+    timespec start, stop;
+    clock_gettime(CLOCK_REALTIME, &start);
+    radixsort(arr,n);
+    clock_gettime(CLOCK_REALTIME, &stop);
+    print(arr,n);
+
+    timespec duration = diff(start, stop);
+    // tv_sec and tv_nsec are not int, so "%d" mismatches the arguments;
+    // cast to long and print with %ld.
+    printf("\n%ld.%09ld s\n", (long)duration.tv_sec, (long)duration.tv_nsec);
+
+    //deallocate host memory
+    free(arr);
+
+    return 0;
+}