Skip to content
Snippets Groups Projects
Commit ea3bfbfb authored by Jose Hosea
Browse files

Add src and makefile

parent e79f852c
No related merge requests found
Pipeline #12132 failed with stages
# Builds the CUDA radix sort: `make build` compiles $(SRC_DIR)/$(CP) into ./$(EXE).
CP = parallel_radix.cu
EXE = radix
SRC_DIR = src
# Compiler driver; override on the command line, e.g. `make NVCC=/usr/local/cuda/bin/nvcc`.
NVCC ?= nvcc
# cuRAND supplies the host-side random number generator used by the program.
LDLIBS += -lcurand

# `build` names an action, not a file: without .PHONY a file or directory
# called `build` would make the target look up to date and skip compilation.
.PHONY: build
build:
	$(NVCC) -o $(EXE) $(SRC_DIR)/$(CP) $(LDLIBS)
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <numeric>
#include <vector>
#include "cuda_runtime.h"
#include "cuda.h"
#include "curand_kernel.h"
#include "device_launch_parameters.h"
#define MAX_BLOCK_SIZE 1024
#define ARRAY_SIZE 200000
#define BASE 10
#define RNG_SEED 13516027
using namespace std;
// Writes the first n elements of arr to standard output, each followed by a
// single space, then terminates the line and flushes the stream.
void printArray(unsigned int * arr, int n) {
    int idx = 0;
    while (idx < n) {
        std::cout << arr[idx] << " ";
        ++idx;
    }
    std::cout << std::endl;
}
// Computes the maximum of input[0..n) and leaves it in input[0].
//
// Launch layout: any 1D grid of 1D blocks. Elements are visited with a
// grid-stride loop, so the result does not depend on the number of threads
// launched. Uses 4 bytes of static shared memory per block.
//
// Side effects: only input[0] is written; all other elements are left intact.
// When n <= 1 nothing is written.
//
// The previous pairwise in-place reduction had no barrier between reduction
// levels (and none is possible across blocks), dropped elements whenever a
// level had an odd number of candidates, and required exactly n / 2 threads.
__global__ void get_max(unsigned int * input, int n)
{
    // Per-block running maximum; 0 is the identity for max over unsigned values.
    __shared__ unsigned int block_max;
    if (threadIdx.x == 0) {
        block_max = 0u;
    }
    __syncthreads();

    const size_t total = n > 0 ? static_cast<size_t>(n) : 0;
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;

    // Index 0 is deliberately skipped: it is the destination of the final
    // atomicMax, which already folds in its current value, and never reading
    // it non-atomically avoids racing with other blocks' updates.
    unsigned int local_max = 0u;
    for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x + 1;
         i < total; i += stride) {
        const unsigned int value = input[i];
        if (value > local_max) {
            local_max = value;
        }
    }

    // Combine within the block first so each block issues at most one global atomic.
    if (local_max > 0u) {
        atomicMax(&block_max, local_max);
    }
    __syncthreads();

    // block_max > 0 implies n > 1, so input[0] is a valid address here.
    if (threadIdx.x == 0 && block_max > 0u) {
        atomicMax(&input[0], block_max);
    }
}
// Builds a histogram of one decimal-style digit of every element of arr.
//
// For each i in [0, n) the digit (arr[i] / exp) % BASE is computed and
// count[digit] is incremented. count must hold BASE ints and must be zeroed
// by the caller beforehand; this kernel only adds to it. exp must be non-zero.
//
// Launch layout: any 1D grid of 1D blocks. A grid-stride loop visits every
// element, so a launch with fewer than n threads still counts all of them
// (the one-thread-per-element version silently skipped the tail). Uses
// BASE * sizeof(int) bytes of static shared memory per block.
__global__ void count_occurences(unsigned int * arr, int n, int * count, unsigned long long exp){
    // Block-private histogram: keeps the heavily contended increments in
    // shared memory so each block touches count[] at most BASE times.
    __shared__ int block_hist[BASE];
    for (int bin = threadIdx.x; bin < BASE; bin += blockDim.x) {
        block_hist[bin] = 0;
    }
    __syncthreads();

    const size_t total = n > 0 ? static_cast<size_t>(n) : 0;
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         i < total; i += stride) {
        const int digit = static_cast<int>((arr[i] / exp) % BASE);
        atomicAdd(&block_hist[digit], 1);
    }
    __syncthreads();

    // Fold this block's partial counts into the global histogram.
    for (int bin = threadIdx.x; bin < BASE; bin += blockDim.x) {
        if (block_hist[bin] != 0) {
            atomicAdd(&count[bin], block_hist[bin]);
        }
    }
}
// Aborts with a diagnostic when a CUDA runtime call reports failure.
static void checkCuda(cudaError_t status, const char * what) {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error during " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Aborts with a diagnostic when a cuRAND host API call reports failure.
static void checkCurand(curandStatus_t status, const char * what) {
    if (status != CURAND_STATUS_SUCCESS) {
        std::cerr << "cuRAND error during " << what << " (status "
                  << static_cast<int>(status) << ")" << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Generates ARRAY_SIZE random unsigned integers on the device, sorts them with
// an LSD radix sort in base BASE (digit histogram on the GPU, prefix sum and
// stable scatter on the host), writes the result to "test/output" and prints
// the elapsed sorting time.
//
// Returns 0 on success and EXIT_FAILURE when the output file cannot be
// written. Any CUDA or cuRAND failure terminates the process with
// EXIT_FAILURE.
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;
    static_assert(ARRAY_SIZE > 0, "ARRAY_SIZE must be positive");

    const long count = ARRAY_SIZE;
    const size_t bytes = count * sizeof(unsigned int);
    const size_t hist_bytes = sizeof(int) * BASE;

    curandGenerator_t curand_gen;
    cudaEvent_t start, stop;
    float time = 0.0f;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    checkCurand(curandCreateGenerator(&curand_gen, CURAND_RNG_PSEUDO_DEFAULT),
                "curandCreateGenerator");
    checkCurand(curandSetPseudoRandomGeneratorSeed(curand_gen, RNG_SEED),
                "curandSetPseudoRandomGeneratorSeed");

    // Heap storage: two ARRAY_SIZE-element arrays are too large to place on
    // the stack safely.
    std::vector<unsigned int> harr(count);
    std::vector<unsigned int> output(count);

    // Device buffers are allocated once and reused by every pass.
    unsigned int * darr = nullptr;
    int * dcount = nullptr;
    checkCuda(cudaMalloc(&darr, bytes), "cudaMalloc(darr)");
    checkCuda(cudaMalloc(&dcount, hist_bytes), "cudaMalloc(dcount)");

    //rng
    checkCurand(curandGenerate(curand_gen, darr, count), "curandGenerate");
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize after curandGenerate");
    checkCuda(cudaMemcpy(harr.data(), darr, bytes, cudaMemcpyDeviceToHost),
              "copy of generated data to host");

    // Ceiling division so the tail of the array is covered when count is not
    // a multiple of the block size.
    const int block = MAX_BLOCK_SIZE;
    int grid = static_cast<int>((count + block - 1) / block);
    if (grid < 1) {
        grid = 1;
    }

    //find max
    get_max <<<grid, block / 2>>>(darr, static_cast<int>(count));
    checkCuda(cudaGetLastError(), "get_max launch");
    unsigned int global_max = 0;
    checkCuda(cudaMemcpy(&global_max, darr, sizeof(unsigned int), cudaMemcpyDeviceToHost),
              "copy of maximum to host");

    //sort
    checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    // get_max writes its result into the array, so restore the original data.
    checkCuda(cudaMemcpy(darr, harr.data(), bytes, cudaMemcpyHostToDevice),
              "restore of input data on device");
    for (unsigned long long exp = 1; global_max / exp > 0; exp *= BASE) {
        int hcount[BASE] = {0};
        checkCuda(cudaMemset(dcount, 0, hist_bytes), "cudaMemset(dcount)");
        count_occurences <<<grid, block>>>(darr, static_cast<int>(count), dcount, exp);
        checkCuda(cudaGetLastError(), "count_occurences launch");
        checkCuda(cudaMemcpy(hcount, dcount, hist_bytes, cudaMemcpyDeviceToHost),
                  "copy of digit histogram to host");

        // Inclusive prefix sum: hcount[d] becomes one past the last output
        // slot reserved for digit d.
        for (int i = 1; i < BASE; i++) {
            hcount[i] += hcount[i - 1];
        }
        // A histogram that does not account for every element would drive the
        // scatter below out of bounds, so refuse to continue.
        if (hcount[BASE - 1] != count) {
            std::cerr << "Digit histogram covers " << hcount[BASE - 1]
                      << " elements, expected " << count << std::endl;
            std::exit(EXIT_FAILURE);
        }
        // Walk backwards so elements with equal digits keep their relative
        // order, which is what makes successive passes compose correctly.
        for (long i = count - 1; i >= 0; i--) {
            const int digit = static_cast<int>((harr[i] / exp) % BASE);
            output[--hcount[digit]] = harr[i];
        }
        checkCuda(cudaMemcpy(darr, output.data(), bytes, cudaMemcpyHostToDevice),
                  "copy of partially sorted data to device");
        // output is fully rewritten next pass, so swapping avoids a host copy.
        harr.swap(output);
    }
    checkCuda(cudaMemcpy(harr.data(), darr, bytes, cudaMemcpyDeviceToHost),
              "copy of sorted data to host");
    checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    checkCuda(cudaEventElapsedTime(&time, start, stop), "cudaEventElapsedTime");

    int exit_code = 0;
    std::ofstream out("test/output", std::ios::out | std::ios::trunc);
    if (!out) {
        std::cerr << "Could not open test/output for writing" << std::endl;
        exit_code = EXIT_FAILURE;
    } else {
        for (long k = 0; k < count; k++) {
            out << harr[k] << " ";
        }
        out.close();
        if (!out) {
            std::cerr << "Failed while writing test/output" << std::endl;
            exit_code = EXIT_FAILURE;
        }
    }

    // cudaEventElapsedTime reports milliseconds; scale to microseconds.
    std::cout << "The sorting process took " << time * 1000 << " microseconds to run." << std::endl;

    checkCuda(cudaFree(dcount), "cudaFree(dcount)");
    checkCuda(cudaFree(darr), "cudaFree(darr)");
    checkCurand(curandDestroyGenerator(curand_gen), "curandDestroyGenerator");
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    return exit_code;
}
\ No newline at end of file
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment