diff --git a/src/parallel_radix.cu b/src/parallel_radix.cu index 9e7a15963f48ecbf640e39fbe06bc4eb4a1cc247..25b778ad130f5fc990c1aed6ac270b029c3797ab 100644 --- a/src/parallel_radix.cu +++ b/src/parallel_radix.cu @@ -1,125 +1,131 @@ -#include <iostream> -#include <fstream> -#include <numeric> -#include "cuda_runtime.h" -#include "cuda.h" -#include "curand_kernel.h" -#include "device_launch_parameters.h" - -#define ARRAY_SIZE 100000 -#define MAX_BLOCK_SIZE 1024 -#define RNG_SEED 13516120 -#define BASE 10 - - -using namespace std; - -void printArray(unsigned int * arr, int n) { - for (int i = 0; i < n; i++) - cout << arr[i] << " "; - cout << endl; -} - -__global__ void get_max(unsigned int * input, int n) -{ - const int tid = blockDim.x * blockIdx.x + threadIdx.x; - auto step_size = 1; - int number_of_threads = n / 2; - while (number_of_threads > 0) - { - if (tid < number_of_threads) // still alive? - { - const auto fst = tid * step_size * 2; - const auto snd = fst + step_size; - if(input[fst] < input[snd]){ - input[fst] = input[snd]; - } - } - step_size <<= 1; - number_of_threads >>= 1; - } - __syncthreads(); -} - -__global__ void count_occurences(unsigned int * arr, int n, int * count, unsigned long long exp){ - const int tid = blockDim.x * blockIdx.x + threadIdx.x; - int digit; - if(tid < n){ - digit = (arr[tid] / exp) % BASE; - atomicAdd(&count[digit], 1); - } -} -int main(int argc, char *argv[]) -{ - curandGenerator_t curand_gen; - cudaEvent_t start, stop; - float time; - cudaEventCreate(&start); - cudaEventCreate(&stop); - curandCreateGenerator(&curand_gen, CURAND_RNG_PSEUDO_DEFAULT); - curandSetPseudoRandomGeneratorSeed(curand_gen, RNG_SEED); - const long count = ARRAY_SIZE; - unsigned int deviceArr[count]; - unsigned int * darr; - int grid, block; - cudaMalloc(&darr, count * sizeof(unsigned int)); - - - curandGenerate(curand_gen, darr, count); - cudaDeviceSynchronize(); - cudaMemcpy(deviceArr, darr, count * sizeof(int), cudaMemcpyDeviceToHost); - //find max - if(count > MAX_BLOCK_SIZE){ - grid = count / (MAX_BLOCK_SIZE); - } - else{ - grid = 1; - } - block = MAX_BLOCK_SIZE / 2; - get_max <<<grid, block >>>(darr, count); - unsigned int global_max; - cudaMemcpy(&global_max, darr, sizeof(int), cudaMemcpyDeviceToHost); - - - - cudaEventRecord(start, 0); - cudaMemcpy(darr, deviceArr, count * sizeof(unsigned int), cudaMemcpyHostToDevice); - block *= 2; - for (unsigned long long exp = 1; global_max / exp > 0; exp *= BASE) { - unsigned int output[count]; - int hcount[BASE] = {0}; - int * dcount; - cudaMalloc(&dcount, sizeof(int) * BASE); - cudaMemset(dcount, 0, sizeof(int) * BASE); - count_occurences <<<grid, block>>>(darr, count, dcount, exp); - cudaMemcpy(hcount, dcount, BASE * sizeof(int), cudaMemcpyDeviceToHost); - for (int i = 1; i < BASE; i++) { - hcount[i] += hcount[i - 1]; - } - for (int i = count - 1; i >= 0; i--) { - output[hcount[ (deviceArr[i] / exp) % BASE ] - 1] = deviceArr[i]; - hcount[ (deviceArr[i] / exp) % BASE ]--; - } - cudaMemcpy(darr, output, sizeof(unsigned int) * count, cudaMemcpyHostToDevice); - memcpy(deviceArr, output, sizeof(unsigned int) * count); - cudaFree(dcount); - cudaDeviceSynchronize(); - } - - cudaMemcpy(deviceArr, darr, count * sizeof(unsigned int), cudaMemcpyDeviceToHost); - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&time, start, stop); - ofstream out("test/output", std::ios::out | std::ios::trunc); - for(int k = 0; k < count; k ++){ - out << deviceArr[k] << endl ; - } - - out.close(); - cout << "Time = " << time * 1000 << " ms" << endl; - cudaFree(darr); - curandDestroyGenerator(curand_gen); - cudaEventDestroy(start); - cudaEventDestroy(stop); - return 0; +#include <iostream> +#include <fstream> +#include <numeric> +#include "cuda_runtime.h" +#include "cuda.h" +#include "curand_kernel.h" +#include "device_launch_parameters.h" + +#define ARRAY_SIZE 100000 +#define MAX_BLOCK_SIZE 1024 +#define RNG_SEED 13516120 +#define BASE 10 + + +using namespace std; + + +__global__ void get_max(unsigned int * input, int n) +{ + const int tid = blockDim.x * blockIdx.x + threadIdx.x; + auto step_size = 1; + int number_of_threads = n / 2; + while (number_of_threads > 0) + { + if (tid < number_of_threads) // still alive? + { + const auto fst = tid * step_size * 2; + const auto snd = fst + step_size; + if(input[fst] < input[snd]){ + input[fst] = input[snd]; + } + } + step_size <<= 1; + number_of_threads >>= 1; + } + __syncthreads(); +} + + +__global__ void count_occurences(unsigned int * arr, int n, int * count, unsigned long long exp){ + const int tid = blockDim.x * blockIdx.x + threadIdx.x; + int digit; + if(tid < n){ + digit = (arr[tid] / exp) % BASE; + atomicAdd(&count[digit], 1); + } +} + + +int main(int argc, char *argv[]) +{ + curandGenerator_t curand_gen; + cudaEvent_t start, stop; + float time; + cudaEventCreate(&start); + cudaEventCreate(&stop); + curandCreateGenerator(&curand_gen, CURAND_RNG_PSEUDO_DEFAULT); + curandSetPseudoRandomGeneratorSeed(curand_gen, RNG_SEED); + const long count = ARRAY_SIZE; + unsigned int hostArr[count]; + unsigned int * deviceArr; + int grid, block; + cudaMalloc(&deviceArr, count * sizeof(unsigned int)); + + + //generate random number + curandGenerate(curand_gen, deviceArr, count); + cudaDeviceSynchronize(); + cudaMemcpy(hostArr, deviceArr, count * sizeof(int), cudaMemcpyDeviceToHost); + + + //find max + if(count > MAX_BLOCK_SIZE){ + grid = count / (MAX_BLOCK_SIZE); + } + else{ + grid = 1; + } + block = MAX_BLOCK_SIZE / 2; + get_max <<<grid, block >>>(deviceArr, count); + unsigned int global_max; + cudaMemcpy(&global_max, deviceArr, sizeof(int), cudaMemcpyDeviceToHost); + + + //start sorting + cudaEventRecord(start, 0); + cudaMemcpy(deviceArr, hostArr, count * sizeof(unsigned int), cudaMemcpyHostToDevice); + block *= 2; + for (unsigned long long exp = 1; global_max / exp > 0; exp *= BASE) { + unsigned int output[count]; + int hcount[BASE] = {0}; + int * dcount; + cudaMalloc(&dcount, sizeof(int) * BASE); + cudaMemset(dcount, 0, sizeof(int) * BASE); + count_occurences <<<grid, block>>>(deviceArr, count, dcount, exp); + cudaMemcpy(hcount, dcount, BASE * sizeof(int), cudaMemcpyDeviceToHost); + for (int i = 1; i < BASE; i++) { + hcount[i] += hcount[i - 1]; + } + for (int i = count - 1; i >= 0; i--) { + output[hcount[ (hostArr[i] / exp) % BASE ] - 1] = hostArr[i]; + hcount[ (hostArr[i] / exp) % BASE ]--; + } + cudaMemcpy(deviceArr, output, sizeof(unsigned int) * count, cudaMemcpyHostToDevice); + memcpy(hostArr, output, sizeof(unsigned int) * count); + cudaFree(dcount); + cudaDeviceSynchronize(); + } + cudaMemcpy(hostArr, deviceArr, count * sizeof(unsigned int), cudaMemcpyDeviceToHost); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&time, start, stop); + + + //write to file + ofstream out("test/output", std::ios::out | std::ios::trunc); + for(int k = 0; k < count; k ++){ + out << hostArr[k] << endl ; + } + out.close(); + + + cout << "Time = " << time * 1000 << " ms" << endl; + + + cudaFree(deviceArr); + curandDestroyGenerator(curand_gen); + cudaEventDestroy(start); + cudaEventDestroy(stop); + return 0; } \ No newline at end of file