diff --git a/src/radix_sort_parallel.cu b/src/radix_sort_parallel.cu index 950f89f4209570f22f6cbd8f1c533290a81c023f..0725ae9e019646615689308ec603ceb017dda107 100644 --- a/src/radix_sort_parallel.cu +++ b/src/radix_sort_parallel.cu @@ -1,142 +1,127 @@ -#include <cuda.h> -#include <cuda_runtime.h> -#include "radix_sort_parallel.h" - -__global__ void copyArrayParallel(int *arr, int *output, int n) { - for (int i = 0; i < n; i++) { - arr[i] = output[i]; - } +#include <iostream> +#include <fstream> +#include <numeric> +#include "cuda_runtime.h" +#include "cuda.h" +#include "curand_kernel.h" +#include "device_launch_parameters.h" +#include <time.h> + +#define MAX_BLOCK_SIZE 1024 +#define RNG_SEED 13516013 + +using namespace std; + +void print(unsigned int * arr, int n) { + for (int i = 0; i < n; i++) + cout << i <<" :"<<arr[i] << " "<<endl; + cout << endl; } -__global__ void getMaxParallel(int *arr, int *max, int n) { - int maximum = arr[0]; - for (int i = 0; i < n; i++) { - if (arr[i] > maximum) { - maximum = arr[i]; +__global__ void getMax(unsigned int * input, int n) +{ + const int tid = blockDim.x * blockIdx.x + threadIdx.x; + auto step_size = 1; + int number_of_threads = n / 2; + while (number_of_threads > 0) + { + if (tid < number_of_threads) // still alive? + { + const auto fst = tid * step_size * 2; + const auto snd = fst + step_size; + if(input[fst] < input[snd]){ + input[fst] = input[snd]; + } } + step_size <<= 1; + number_of_threads >>= 1; } - max[0] = maximum; + __syncthreads(); } -int getMax(int arr[], int n) -{ - int result; - int *max; - int *d_arr, *d_max; - - // Allocate host memory - max = (int*)malloc(n * sizeof(int)); - - // Initialize host memory - for (int i = 0; i < n; i++) { - max[i] = 0; - } - - // Allocate device memory - cudaMalloc((void**)&d_arr, n * sizeof(int)); - cudaMalloc((void**)&d_max, n * sizeof(int)); - - // Transfer data from host to device memory - cudaMemcpy(d_arr, arr, n * sizeof(int), cudaMemcpyHostToDevice); - - // Executing kernel - getMaxParallel<<<1,500>>>(d_arr, d_max, n); - - // Transfer data back to host memory - cudaMemcpy(max, d_max, n * sizeof(int), cudaMemcpyDeviceToHost); - - // Deallocate device memory - cudaFree(d_max); - cudaFree(d_arr); - - result = max[0]; - - // Deallocate host memory - free(max); - - return result; -} - -void countSort(int arr[], int n, int exp) -{ - int *output; - int *d_output, *d_arr; - int i, count[10] = {0}; - - // Allocate host memory - output = (int*)malloc(n * sizeof(int)); - - for (i = 0; i < n; i++) - count[ (arr[i]/exp)%10 ]++; - - for (i = 1; i < 10; i++) - count[i] += count[i - 1]; - - for (i = n - 1; i >= 0; i--) - { - output[count[ (arr[i]/exp)%10 ] - 1] = arr[i]; - count[ (arr[i]/exp)%10 ]--; - } - - // Allocate device memory - cudaMalloc((void**)&d_arr, sizeof(n * sizeof(int))); - cudaMalloc((void**)&d_output, sizeof(n * sizeof(int))); - - // Transfer data from host to device memory - cudaMemcpy(d_arr, arr, n * sizeof(int), cudaMemcpyHostToDevice); - - // Executing kernel - copyArrayParallel<<<1,500>>>(d_arr, d_output, n); - - //Transfer data back to host memory - cudaMemcpy(output, d_output, n * sizeof(int), cudaMemcpyDeviceToHost); - - // Deallocate device memory - cudaFree(d_arr); - cudaFree(d_output); - - // Deallocate host memory - free(output); -} - -void radix_sort(int arr[], int n) -{ - int m = getMax(arr, n); - - for (int exp = 1; m/exp > 0; exp *= 10) - countSort(arr, n, exp); -} - -void print(int arr[], int n) -{ - for (int i = 0; i < n; i++) - printf("%d: %d\n",i, arr[i]); -} - -void rng(int* arr, int n) { - int seed = 13516013; - srand(seed); - for(long i = 0; i < n; i++) { - arr[i] = (int)rand(); - } +__global__ void storeCount(unsigned int * arr, int n, int * count, unsigned long long exp){ + const int tid = blockDim.x * blockIdx.x + threadIdx.x; + int digit; + if(tid < n){ + digit = (arr[tid] / exp) % 10; + atomicAdd(&count[digit], 1); + } } - -int main(int argc, char *argv[]) { +int main(int argc, char *argv[]) +{ + srand(time(NULL)); + curandGenerator_t curand_gen; + cudaEvent_t start, stop; + float time; int N; if (argc == 2) { - N = strtol(argv[1], NULL, 10); - } else { - printf("ERROR: ./radix_sort <array_length>\n"); - return 1; - } - int arr[N]; - rng(arr,N); - clock_t begin = clock(); - radix_sort(arr, N); - clock_t end = clock(); - double time = (double)(end - begin) * 1000 / CLOCKS_PER_SEC; - print(arr,N); - printf("Executed in %lf ms\n",time); + N = strtol(argv[1], NULL, 10); + } else { + printf("ERROR: ./radix_sort <array_length>\n"); + return 1; + } + cudaEventCreate(&start); + cudaEventCreate(&stop); + curandCreateGenerator(&curand_gen, CURAND_RNG_PSEUDO_DEFAULT); + curandSetPseudoRandomGeneratorSeed(curand_gen, RNG_SEED); + const long count = N; + unsigned int h_arr[count]; + unsigned int * d_arr; + int grid, block; + cudaMalloc(&d_arr, count * sizeof(unsigned int)); + //rng + curandGenerate(curand_gen, d_arr, count); + cudaDeviceSynchronize(); + cudaMemcpy(h_arr, d_arr, count * sizeof(int), cudaMemcpyDeviceToHost); + //find max + if(count > MAX_BLOCK_SIZE){ + grid = count / (MAX_BLOCK_SIZE) + 1; + } + else{ + grid = 1; + } + block = MAX_BLOCK_SIZE / 2; + getMax<<<grid, block >>>(d_arr, count); + unsigned int global_max; + cudaMemcpy(&global_max, d_arr, sizeof(int), cudaMemcpyDeviceToHost); + //sort + cudaEventRecord(start, 0); + cudaMemcpy(d_arr, h_arr, count * sizeof(unsigned int), cudaMemcpyHostToDevice); + block *= 2; + for (unsigned long long exp = 1; global_max / exp > 0; exp *= 10) { + unsigned int output[count]; + int h_count[10] = {0}; + int * d_count; + cudaMalloc(&d_count, sizeof(int) * 10); + cudaMemset(d_count, 0, sizeof(int) * 10); + storeCount<<<grid, block>>>(d_arr, count, d_count, exp); + cudaMemcpy(h_count, d_count, 10 * sizeof(int), cudaMemcpyDeviceToHost); + for (int i = 1; i < 10; i++) { + h_count[i] += h_count[i - 1]; + } + for (int i = count - 1; i >= 0; i--) { + output[h_count[ (h_arr[i] / exp) % 10 ] - 1] = h_arr[i]; + h_count[ (h_arr[i] / exp) % 10 ]--; + } + cudaMemcpy(d_arr, output, sizeof(unsigned int) * count, cudaMemcpyHostToDevice); + memcpy(h_arr, output, sizeof(unsigned int) * count); + cudaFree(d_count); + cudaDeviceSynchronize(); + } + cudaMemcpy(h_arr, d_arr, count * sizeof(unsigned int), cudaMemcpyDeviceToHost); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&time, start, stop); + print(h_arr,count); + ofstream output("output.txt", std::ios::out | std::ios::trunc); + for(int k = 0; k < count; k ++){ + output << k << ":"<<h_arr[k] << " " ; + } + output.close(); + cout << "Executed in " << time * 1000 << " microseconds." << endl; + cudaFree(d_arr); + curandDestroyGenerator(curand_gen); + cudaEventDestroy(start); + cudaEventDestroy(stop); return 0; -} - +} \ No newline at end of file