diff --git a/src/radixsort_parallel.cu b/src/radixsort_parallel.cu
index 2dabb613902093ba92c1e9c866c66619525b3ac3..0181a0b79a20c581da5ebacfbbc453a318c13d51 100644
--- a/src/radixsort_parallel.cu
+++ b/src/radixsort_parallel.cu
@@ -3,14 +3,14 @@
 #include "radix_sort_parallel.h"
 
 __global__ void getMax(int *arr, int *max, int n) {
-    //int index = threadIdx.x;
-    //int stride = blockDim.x;
-    int mx = arr[0];
+    int index = threadIdx.x;
+    int stride = blockDim.x;
+    int mx = arr[index];
 
-    for (int i = 0; i < n; i++)
+    for (int i = index+stride; i < n; i+=stride)
         if (arr[i] > mx)
             mx = arr[i];
-    max[0] = mx;
+    max[index] = mx;
 }
 
 __global__ void countSort(int *arr, int n, int exp) {
@@ -39,21 +39,28 @@ void radix_sort(int arr[], int n)
     int *d_max, *d_arr;
 
     // Allocate host memory
-    max = (int*)malloc(1 * sizeof(int));
+    max = (int*)malloc(n * sizeof(int));
 
     // Allocate device memory
-    cudaMalloc((void**)&d_max, 1 * sizeof(int));
+    cudaMalloc((void**)&d_max, n * sizeof(int));
     cudaMalloc((void**)&d_arr, n * sizeof(int));
 
     // Transfer data from host to device
     cudaMemcpy(d_arr, arr, n * sizeof(int), cudaMemcpyHostToDevice);
-    cudaMemcpy(d_max, max, sizeof(int) * 1, cudaMemcpyHostToDevice);
+    cudaMemcpy(d_max, max, n * sizeof(int), cudaMemcpyHostToDevice);
 
     // Executing kernel
     getMax<<<1, 500>>>(d_arr, d_max, n);
 
     // Transfer data back to host memory
-    cudaMemcpy(max, d_max, 1 * sizeof(int), cudaMemcpyDeviceToHost);
+    cudaMemcpy(max, d_max, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+    int maks = max[0];
+    for (int i = 0; i < n; i++) {
+        if (maks < max[i]) {
+            maks = max[i];
+        }
+    }
 
     for (int exp = 1; max[0]/exp > 0; exp *= 10) {
         countSort<<<1, 500>>>(d_arr, n, exp);
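
For reference, below is a minimal standalone sketch of the pattern this hunk introduces: each thread computes a partial maximum over a strided slice of the array, and the host reduces the per-thread results (the patch's "maks" loop). The names THREADS and partialMax and the main driver are illustrative, not from this PR. Unlike the patch, the sketch sizes the partial-result buffer by thread count rather than by n and guards threads whose starting index falls past the end of the array, since max[index] is written for all 500 launched threads regardless of n. The subsequent digit loop presumably wants the reduced value (maks) rather than max[0], which after this change holds only thread 0's partial maximum.

#include <cstdio>
#include <cstdlib>
#include <climits>

#define THREADS 500   // illustrative; matches the 500-thread launch in the patch

// Each thread scans arr[index], arr[index+stride], ... and records its partial max.
__global__ void partialMax(const int *arr, int *partial, int n) {
    int index = threadIdx.x;
    int stride = blockDim.x;

    if (index >= n) {             // guard threads with no element to start from
        partial[index] = INT_MIN;
        return;
    }
    int mx = arr[index];
    for (int i = index + stride; i < n; i += stride)
        if (arr[i] > mx)
            mx = arr[i];
    partial[index] = mx;
}

int main() {
    const int n = 1000;
    int *arr = (int*)malloc(n * sizeof(int));
    for (int i = 0; i < n; i++) arr[i] = rand() % 10000;

    int *d_arr, *d_partial;
    cudaMalloc((void**)&d_arr, n * sizeof(int));
    cudaMalloc((void**)&d_partial, THREADS * sizeof(int));  // one slot per thread
    cudaMemcpy(d_arr, arr, n * sizeof(int), cudaMemcpyHostToDevice);

    partialMax<<<1, THREADS>>>(d_arr, d_partial, n);

    int partial[THREADS];
    cudaMemcpy(partial, d_partial, THREADS * sizeof(int), cudaMemcpyDeviceToHost);

    // Host-side reduction of the per-thread partial maxima.
    int maks = partial[0];
    for (int i = 1; i < THREADS; i++)
        if (partial[i] > maks) maks = partial[i];

    printf("max = %d\n", maks);

    cudaFree(d_arr);
    cudaFree(d_partial);
    free(arr);
    return 0;
}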