diff --git a/src/radix_sort_par3.cu b/src/radix_sort_par3.cu
index 27c479204fe96950cf7b516bf2399bbd536d2a79..e9e1a90630e4ad3b5af7f101b4789cc114a0eddc 100644
--- a/src/radix_sort_par3.cu
+++ b/src/radix_sort_par3.cu
@@ -4,8 +4,11 @@
 #include <time.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
+#define MAX_THREAD 32
+
 using namespace std;
-
+
+
 // A utility function to get maximum value in arr[]
 int getMax(int arr[], int n)
 {
@@ -17,11 +20,44 @@ int getMax(int arr[], int n)
 }
 
+// Builds the histogram of the decimal digit selected by exp (1, 10, 100, ...)
+// over arr[0..n) and writes the ten bucket totals to count[0..9].
+//
+// Launch layout: one 1-D block (countSort launches <<<1,32>>>). At most
+// MAX_THREAD threads do the counting; extra threads only take part in the
+// barrier. Uses MAX_THREAD * 10 ints of static shared memory.
+// count and arr must be device-accessible pointers. Elements are assumed to be
+// non-negative: a negative value makes (arr[i]/exp)%10 a negative index.
 __global__ void storeCount(int *count, int *arr, int n, int exp){
-    int index = threadIdx.x;
-    int stride = blockDim.x;
+    // One private row of digit buckets per counting thread, so no atomics are needed.
+    __shared__ int s_Count[MAX_THREAD][10];
+
+    int idx = threadIdx.x;
+    // Number of threads that own a row in s_Count.
+    int workers = (int)blockDim.x;
+    if (workers > MAX_THREAD)
+        workers = MAX_THREAD;
 
-    for (int i = index; i < n; i+=stride)
-        count[ (arr[i]/exp)%10 ]++;
+    if (idx < workers) {
+        // __shared__ variables cannot have initializers; clear the row explicitly.
+        for (int j = 0; j < 10; j++)
+            s_Count[idx][j] = 0;
+
+        // Strided walk: thread idx handles elements idx, idx+workers, ... below n.
+        for (int i = idx; i < n; i += workers)
+            s_Count[idx][(arr[i]/exp)%10]++;
+    }
+
+    // Every row must be complete before thread 0 reads it.
+    __syncthreads();
+
+    if (idx == 0) {
+        for (int j = 0; j < 10; j++) {
+            int total = 0;
+            for (int w = 0; w < workers; w++)
+                total += s_Count[w][j];
+            count[j] = total;
+        }
+    }
 }
 
 // A function to do counting sort of arr[] according to
@@ -33,13 +69,14 @@ void countSort(int arr[], int n, int exp)
     int *output= (int*)malloc(sizeof(int)*n); // output array
     int i;
     int d_count[10] = {0};
-    int h_count[10] = {0};
+    int h_count[10];
     cudaMalloc((void**)&d_arr,sizeof(int)*n);
     cudaMemcpy(d_arr, arr, sizeof(int)*n,cudaMemcpyHostToDevice);
 
     // Store count of occurrences in count[]
     storeCount<<<1,32>>>(d_count,d_arr,n,exp);
     cudaMemcpy(h_count, d_count, 10,cudaMemcpyDeviceToHost);
+
     // Change count[i] so that count[i] now contains actual
     // position of this digit in output[]
     for (i = 1; i < 10; i++)