diff --git a/src/radix_sort_par3.cu b/src/radix_sort_par3.cu
index 27c479204fe96950cf7b516bf2399bbd536d2a79..e9e1a90630e4ad3b5af7f101b4789cc114a0eddc 100644
--- a/src/radix_sort_par3.cu
+++ b/src/radix_sort_par3.cu
@@ -4,8 +4,11 @@
 #include <time.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
+#define MAX_THREAD 32
+
 using namespace std; 
-  
+
+ 
 // A utility function to get maximum value in arr[] 
 int getMax(int arr[], int n) 
 { 
@@ -17,11 +20,37 @@ int getMax(int arr[], int n)
 } 
   
 __global__ void storeCount(int *count, int *arr, int n, int exp){
-    int index = threadIdx.x;
-    int stride = blockDim.x;
+    // Counts, for each decimal digit d, the elements with (arr[i]/exp)%10 == d
+    // and stores the totals in count[0..9], overwriting previous contents.
+    // Launch with one block; threads beyond MAX_THREAD stay idle. count and arr
+    // are dereferenced here, so both must be device pointers.
+    __shared__ int s_Count[MAX_THREAD][10];
+
+    int idx = threadIdx.x;
+    int rows = (int)blockDim.x < MAX_THREAD ? (int)blockDim.x : MAX_THREAD;
 
-    for (int i = index; i < n; i+=stride) 
-        count[ (arr[i]/exp)%10 ]++;
+    if (idx < rows) {
+        // __shared__ arrays start uninitialized: clear this thread's row.
+        for (int j = 0; j < 10; j++) {
+            s_Count[idx][j] = 0;
+        }
+        // Strided walk: each i in [0, n) is visited by exactly one thread,
+        // and no thread reads past the end of arr when n < rows.
+        for (int i = idx; i < n; i += rows) {
+            s_Count[idx][(arr[i] / exp) % 10]++;
+        }
+    }
+    // Every row must be complete before thread 0 sums them.
+    __syncthreads();
+    if (idx == 0) {
+        for (int j = 0; j < 10; j++) {
+            int total = 0;
+            for (int i = 0; i < rows; i++) {
+                total += s_Count[i][j];
+            }
+            count[j] = total;
+        }
+    }
 }
 
 // A function to do counting sort of arr[] according to 
@@ -33,13 +62,14 @@ void countSort(int arr[], int n, int exp)
     int *output= (int*)malloc(sizeof(int)*n); // output array 
     int i;
     int d_count[10] = {0}; 
-    int h_count[10] = {0};
+    int h_count[10];
     cudaMalloc((void**)&d_arr,sizeof(int)*n);
     cudaMemcpy(d_arr, arr, sizeof(int)*n,cudaMemcpyHostToDevice);
   
     // Store count of occurrences in count[] 
     storeCount<<<1,32>>>(d_count,d_arr,n,exp); 
     cudaMemcpy(h_count, d_count, 10,cudaMemcpyDeviceToHost);
+
     // Change count[i] so that count[i] now contains actual 
     //  position of this digit in output[] 
     for (i = 1; i < 10; i++)