// Base-10 LSD radix sort of N pseudo-random 32-bit unsigned integers.
//
// Usage: ./radix_sort <array_length>
//
// The input is generated on the device with cuRAND (fixed seed, so runs are
// reproducible). For every decimal digit the GPU builds the digit histogram;
// the stable scatter that reorders the elements is done on the host. The
// sorted sequence is printed to stdout and written to test/output.txt.

#include <cerrno>
#include <climits>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <numeric>
#include <vector>
#include <time.h>

#include "cuda_runtime.h"
#include "cuda.h"
#include "curand.h"
#include "curand_kernel.h"
#include "device_launch_parameters.h"

#define MAX_BLOCK_SIZE 1024
#define MAX_GRID_SIZE 1024
#define RNG_SEED 13516013
#define RADIX 10

// Abort with a readable message when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Abort with the numeric status when a cuRAND host API call fails.
#define CURAND_CHECK(call)                                                    \
    do {                                                                      \
        curandStatus_t st_ = (call);                                          \
        if (st_ != CURAND_STATUS_SUCCESS) {                                   \
            fprintf(stderr, "cuRAND error %s:%d: status %d\n", __FILE__,      \
                    __LINE__, (int)st_);                                      \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

using namespace std;

// Print the first n elements of arr to stdout, one "index :value " per line,
// followed by a blank line.
void print(unsigned int * arr, int n) {
    for (int i = 0; i < n; i++)
        cout << i << " :" << arr[i] << " " << '\n';
    cout << endl;
}

// Leave the maximum of input[0..n-1] in input[0].
//
// Launch: 1D grid, any grid/block size (grid-stride loop). Uses 4 bytes of
// static shared memory. Only input[0] is modified; the caller must restore it
// if the original array is still needed.
//
// Every thread scans elements 1..n-1 only, so input[0] is touched exclusively
// through atomicMax and is never read non-atomically while other blocks may
// be updating it.
__global__ void getMax(unsigned int * input, int n)
{
    __shared__ unsigned int block_max;

    // n is uniform across the grid, so this early exit cannot split a block
    // around the barriers below. For n == 1, input[0] already is the maximum.
    if (n < 2)
        return;

    if (threadIdx.x == 0)
        block_max = 0u;   // 0 is the identity for an unsigned maximum
    __syncthreads();

    const size_t stride = (size_t)gridDim.x * blockDim.x;
    const size_t first = (size_t)blockIdx.x * blockDim.x + threadIdx.x + 1;
    unsigned int local_max = 0u;
    for (size_t i = first; i < (size_t)n; i += stride) {
        const unsigned int v = input[i];
        if (v > local_max)
            local_max = v;
    }

    // Combine per-thread results inside the block, then publish one value per
    // block to global memory.
    atomicMax(&block_max, local_max);
    __syncthreads();

    if (threadIdx.x == 0)
        atomicMax(&input[0], block_max);
}

// Add to count[0..9] the number of elements of arr[0..n-1] whose decimal
// digit at position exp (1, 10, 100, ...) equals the bucket index.
//
// Launch: 1D grid, any grid/block size (grid-stride loop). Uses RADIX ints of
// static shared memory. count must be zeroed by the caller before launch and
// exp must be non-zero.
__global__ void storeCount(unsigned int * arr, int n, int * count, unsigned long long exp){
    __shared__ int block_count[RADIX];

    for (int b = threadIdx.x; b < RADIX; b += blockDim.x)
        block_count[b] = 0;
    __syncthreads();

    const size_t stride = (size_t)gridDim.x * blockDim.x;
    const size_t first = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    for (size_t i = first; i < (size_t)n; i += stride) {
        const int digit = (int)((arr[i] / exp) % RADIX);
        atomicAdd(&block_count[digit], 1);
    }
    __syncthreads();

    // One global atomic per non-empty bucket per block instead of one per
    // element keeps contention on the ten global counters low.
    for (int b = threadIdx.x; b < RADIX; b += blockDim.x) {
        if (block_count[b] != 0)
            atomicAdd(&count[b], block_count[b]);
    }
}

// Entry point: parse the array length, generate the data, sort it, report the
// result and the elapsed time. Returns 0 on success, 1 on a usage error.
int main(int argc, char *argv[])
{
    if (argc != 2) {
        printf("ERROR: ./radix_sort <array_length>\n");
        return 1;
    }

    errno = 0;
    char * parse_end = NULL;
    const long parsed = strtol(argv[1], &parse_end, 10);
    if (errno != 0 || parse_end == argv[1] || parsed < 1 || parsed > INT_MAX) {
        printf("ERROR: ./radix_sort <array_length>\n");
        return 1;
    }
    const int count = (int)parsed;
    const size_t bytes = (size_t)count * sizeof(unsigned int);

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    curandGenerator_t curand_gen;
    CURAND_CHECK(curandCreateGenerator(&curand_gen, CURAND_RNG_PSEUDO_DEFAULT));
    CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_gen, RNG_SEED));

    // Heap-allocated host buffers: the array length is a run-time value and
    // may be far larger than the stack.
    vector<unsigned int> h_arr(count);
    vector<unsigned int> scattered(count);

    unsigned int * d_arr = NULL;
    int * d_count = NULL;
    CUDA_CHECK(cudaMalloc(&d_arr, bytes));
    CUDA_CHECK(cudaMalloc(&d_count, RADIX * sizeof(int)));

    // rng
    CURAND_CHECK(curandGenerate(curand_gen, d_arr, (size_t)count));
    CUDA_CHECK(cudaMemcpy(h_arr.data(), d_arr, bytes, cudaMemcpyDeviceToHost));

    // Launch configuration shared by both kernels. The kernels use
    // grid-stride loops, so the grid can be capped without losing elements.
    const int block = MAX_BLOCK_SIZE / 2;
    long long wanted_blocks = ((long long)count + block - 1) / block;
    if (wanted_blocks > MAX_GRID_SIZE)
        wanted_blocks = MAX_GRID_SIZE;
    const int grid = (int)wanted_blocks;

    // find max
    getMax<<<grid, block>>>(d_arr, count);
    CUDA_CHECK(cudaGetLastError());
    unsigned int global_max = 0;
    CUDA_CHECK(cudaMemcpy(&global_max, d_arr, sizeof(unsigned int), cudaMemcpyDeviceToHost));

    // sort
    CUDA_CHECK(cudaEventRecord(start, 0));
    // getMax overwrote d_arr[0] with the maximum; put the real element back.
    CUDA_CHECK(cudaMemcpy(d_arr, h_arr.data(), sizeof(unsigned int), cudaMemcpyHostToDevice));

    int h_count[RADIX];
    for (unsigned long long exp = 1; global_max / exp > 0; exp *= RADIX) {
        CUDA_CHECK(cudaMemset(d_count, 0, sizeof(h_count)));
        storeCount<<<grid, block>>>(d_arr, count, d_count, exp);
        CUDA_CHECK(cudaGetLastError());
        CUDA_CHECK(cudaMemcpy(h_count, d_count, sizeof(h_count), cudaMemcpyDeviceToHost));

        // Inclusive prefix sum: h_count[d] becomes one past the last output
        // slot that belongs to digit d.
        partial_sum(h_count, h_count + RADIX, h_count);

        // Walk backwards so that equal digits keep their relative order.
        for (int i = count - 1; i >= 0; i--) {
            const int digit = (int)((h_arr[i] / exp) % RADIX);
            scattered[--h_count[digit]] = h_arr[i];
        }

        CUDA_CHECK(cudaMemcpy(d_arr, scattered.data(), bytes, cudaMemcpyHostToDevice));
        h_arr.swap(scattered);
    }

    CUDA_CHECK(cudaMemcpy(h_arr.data(), d_arr, bytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    float elapsed_ms = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_ms, start, stop));

    print(h_arr.data(), count);

    // Forward slash works as a directory separator on both POSIX and Windows.
    ofstream out_file("test/output.txt", std::ios::out | std::ios::trunc);
    if (out_file.is_open()) {
        for (int k = 0; k < count; k++) {
            out_file << k << ":" << h_arr[k] << " ";
        }
        out_file.close();
    } else {
        cerr << "WARNING: could not open test/output.txt for writing" << endl;
    }

    // cudaEventElapsedTime reports milliseconds.
    cout << "Executed in " << elapsed_ms * 1000 << " microseconds." << endl;

    CUDA_CHECK(cudaFree(d_count));
    CUDA_CHECK(cudaFree(d_arr));
    CURAND_CHECK(curandDestroyGenerator(curand_gen));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    return 0;
}