diff --git a/src/radix_sort_parallel.cu b/src/radix_sort_parallel.cu index d7c7fdc0aa3d316f9be715482399df8500e51dc6..ffa62952d4481ed186b64c4aa1d796ab7970d691 100644 --- a/src/radix_sort_parallel.cu +++ b/src/radix_sort_parallel.cu @@ -6,7 +6,7 @@ __global__ void copyArrayParallel(int *arr, int *output, int n) { int index = threadIdx.x; int stride = blockDim.x; - for (int i = index; i < n; i+=stride) { + for (int i = index+stride; i < n; i+=stride) { arr[i] = output[i]; } } diff --git a/src/radixsort_parallel.cu b/src/radixsort_parallel.cu index 8b1eb8375aec022d7e07b796784171ffd7ef7d8e..005a9d9b4d70b5d40b6d14c0f8a1d77b63913724 100644 --- a/src/radixsort_parallel.cu +++ b/src/radixsort_parallel.cu @@ -3,20 +3,30 @@ #include "radix_sort_parallel.h" __global__ void getMax(int *arr, int *max, int n) { - int index = threadIdx.x; - int stride = blockDim.x; - int mx = arr[index]; + //int index = threadIdx.x; + //int stride = blockDim.x; + int mx = arr[0]; - for (int i = index+stride; i < n; i+=stride) + for (int i = 0; i < n; i++) if (arr[i] > mx) mx = arr[i]; - max[index] = mx; + max[0] = mx; } -__global__ void countSort(int *arr, int n, int exp) { - int* output = (int*)malloc(n * sizeof(int)); +__global__ void copyArrayParallel(int *arr, int *output, int n) { + for (int i = 0; i < n; i++) { + arr[i] = output[i]; + } +} + +void countSort(int arr[], int n, int exp) { + int *output; + int *d_output, *d_arr; int i, count[10] = {0}; + // Allocate host memory + output = (int*)malloc(n * sizeof(int)); + for (i = 0; i < n; i++) count[ (arr[i]/exp)%10 ]++; @@ -29,8 +39,25 @@ __global__ void countSort(int *arr, int n, int exp) { count[ (arr[i]/exp)%10 ]--; } - for (i = 0; i < n; i++) - arr[i] = output[i]; + // Allocate device memory + cudaMalloc((void**)&d_arr, sizeof(n * sizeof(int))); + cudaMalloc((void**)&d_output, sizeof(n * sizeof(int))); + + // Transfer data from host to device memory + cudaMemcpy(d_arr, arr, n * sizeof(int), cudaMemcpyHostToDevice); + + // Executing kernel + copyArrayParallel<<<1,500>>>(d_arr, d_output, n); + + //Transfer data back to host memory + cudaMemcpy(output, d_output, n * sizeof(int), cudaMemcpyDeviceToHost); + + // Deallocate device memory + cudaFree(d_arr); + cudaFree(d_output); + + // Deallocate host memory + free(output); } void radix_sort(int arr[], int n)