From ea3bfbfb8aef5121f850f0bf2bc504b236e60f85 Mon Sep 17 00:00:00 2001
From: Jose <13516027@std.stei.itb.ac.id>
Date: Thu, 11 Apr 2019 08:26:28 +0700
Subject: [PATCH] Add src and makefile

---
 makefile              |   6 +++
 src/parallel_radix.cu | 191 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 197 insertions(+)
 create mode 100644 makefile
 create mode 100755 src/parallel_radix.cu

diff --git a/makefile b/makefile
new file mode 100644
index 0000000..7d35f96
--- /dev/null
+++ b/makefile
@@ -0,0 +1,6 @@
+CP = parallel_radix.cu
+EXE = radix
+SRC_DIR = src
+LDLIBS += -lcurand
+build:
+	nvcc -o $(EXE) $(SRC_DIR)/$(CP) $(LDLIBS)
diff --git a/src/parallel_radix.cu b/src/parallel_radix.cu
new file mode 100755
index 0000000..55f734e
--- /dev/null
+++ b/src/parallel_radix.cu
@@ -0,0 +1,191 @@
+#include <cstdlib>
+#include <iostream>
+#include <fstream>
+#include <numeric>
+#include <vector>
+#include "cuda_runtime.h"
+#include "cuda.h"
+#include "curand_kernel.h"
+#include "device_launch_parameters.h"
+
+#define MAX_BLOCK_SIZE 1024
+#define ARRAY_SIZE 200000
+#define BASE 10
+#define RNG_SEED 13516027
+
+// Wrap every CUDA runtime / cuRAND host call so a failure is reported at its source.
+#define CUDA_CHECK(call) check_cuda((call), #call, __LINE__)
+#define CURAND_CHECK(call) check_curand((call), #call, __LINE__)
+
+using namespace std;
+
+// Prints the failing CUDA call and terminates when err is not cudaSuccess.
+static void check_cuda(cudaError_t err, const char * call, int line) {
+    if (err != cudaSuccess) {
+        cerr << "CUDA error at line " << line << " in " << call << ": "
+             << cudaGetErrorString(err) << endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+// Prints the failing cuRAND call and terminates when status is not CURAND_STATUS_SUCCESS.
+static void check_curand(curandStatus_t status, const char * call, int line) {
+    if (status != CURAND_STATUS_SUCCESS) {
+        cerr << "cuRAND error at line " << line << " in " << call << ": status "
+             << static_cast<int>(status) << endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+// Prints the first n elements of arr on one line, separated by spaces.
+void printArray(unsigned int * arr, int n) {
+    for (int i = 0; i < n; i++)
+        cout << arr[i] << " ";
+    cout << endl;
+}
+
+/*
+ * Leaves the maximum of input[0..n) in input[0]; other elements are untouched.
+ *
+ * Works for any 1D grid/block shape: each thread scans a strided slice, the
+ * block combines its threads through one shared word, and thread 0 of every
+ * block folds that into input[0] with atomicMax. Element 0 is never read
+ * directly, because the final atomicMax already compares against it.
+ */
+__global__ void get_max(unsigned int * input, int n)
+{
+    __shared__ unsigned int block_max;
+    if (threadIdx.x == 0)
+        block_max = 0;
+    __syncthreads();
+
+    const long long stride = (long long)gridDim.x * blockDim.x;
+    long long i = (long long)blockDim.x * blockIdx.x + threadIdx.x + 1;
+    unsigned int local_max = 0;
+    for (; i < n; i += stride) {
+        if (input[i] > local_max)
+            local_max = input[i];
+    }
+
+    atomicMax(&block_max, local_max);
+    __syncthreads();    // every thread reaches this barrier: no early returns above
+
+    if (threadIdx.x == 0 && n > 0)
+        atomicMax(&input[0], block_max);
+}
+
+/*
+ * Adds to count[d] the number of elements of arr[0..n) whose digit
+ * (arr[i] / exp) % BASE equals d. count must hold BASE ints and be zeroed by
+ * the caller; exp must be non-zero.
+ *
+ * Digits are tallied in a per-block shared histogram first so that global
+ * memory sees at most BASE atomics per block instead of one per element.
+ */
+__global__ void count_occurences(unsigned int * arr, int n, int * count, unsigned long long exp){
+    __shared__ int block_count[BASE];
+    for (int d = threadIdx.x; d < BASE; d += blockDim.x)
+        block_count[d] = 0;
+    __syncthreads();
+
+    const long long stride = (long long)gridDim.x * blockDim.x;
+    long long i = (long long)blockDim.x * blockIdx.x + threadIdx.x;
+    for (; i < n; i += stride) {
+        const int digit = (arr[i] / exp) % BASE;
+        atomicAdd(&block_count[digit], 1);
+    }
+    __syncthreads();
+
+    for (int d = threadIdx.x; d < BASE; d += blockDim.x) {
+        if (block_count[d] != 0)
+            atomicAdd(&count[d], block_count[d]);
+    }
+}
+
+/*
+ * Generates ARRAY_SIZE random 32-bit keys on the GPU, sorts them with an LSD
+ * radix sort (digit counting on the device, prefix sum and scatter on the
+ * host), writes the result to test/output and prints the elapsed time.
+ */
+int main(int argc, char *argv[])
+{
+    curandGenerator_t curand_gen;
+    cudaEvent_t start, stop;
+    float time = 0.0f;
+    CUDA_CHECK(cudaEventCreate(&start));
+    CUDA_CHECK(cudaEventCreate(&stop));
+    CURAND_CHECK(curandCreateGenerator(&curand_gen, CURAND_RNG_PSEUDO_DEFAULT));
+    CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_gen, RNG_SEED));
+
+    const int count = ARRAY_SIZE;
+    const size_t bytes = count * sizeof(unsigned int);
+    // Heap storage: two ARRAY_SIZE-element buffers risk overflowing the stack.
+    vector<unsigned int> harr(count);
+    vector<unsigned int> output(count);
+    unsigned int * darr;
+    int * dcount;
+    CUDA_CHECK(cudaMalloc(&darr, bytes));
+    // Allocated once and re-zeroed per pass instead of malloc/free per digit.
+    CUDA_CHECK(cudaMalloc(&dcount, sizeof(int) * BASE));
+
+    //rng
+    CURAND_CHECK(curandGenerate(curand_gen, darr, count));
+    CUDA_CHECK(cudaDeviceSynchronize());
+    CUDA_CHECK(cudaMemcpy(harr.data(), darr, bytes, cudaMemcpyDeviceToHost));
+
+    // Round the grid up so the tail is covered when count is not a multiple
+    // of the block size (200000 / 1024 would otherwise drop 320 elements).
+    const int block = MAX_BLOCK_SIZE;
+    const int grid = (count + block - 1) / block;
+
+    //find max
+    get_max <<<grid, block>>>(darr, count);
+    CUDA_CHECK(cudaGetLastError());
+    unsigned int global_max;
+    CUDA_CHECK(cudaMemcpy(&global_max, darr, sizeof(unsigned int), cudaMemcpyDeviceToHost));
+
+    //sort
+    CUDA_CHECK(cudaEventRecord(start, 0));
+    // get_max overwrote darr[0]; restore the unsorted keys before counting.
+    CUDA_CHECK(cudaMemcpy(darr, harr.data(), bytes, cudaMemcpyHostToDevice));
+    for (unsigned long long exp = 1; global_max / exp > 0; exp *= BASE) {
+        int hcount[BASE] = {0};
+        CUDA_CHECK(cudaMemset(dcount, 0, sizeof(int) * BASE));
+        count_occurences <<<grid, block>>>(darr, count, dcount, exp);
+        CUDA_CHECK(cudaGetLastError());
+        CUDA_CHECK(cudaMemcpy(hcount, dcount, BASE * sizeof(int), cudaMemcpyDeviceToHost));
+        // Inclusive prefix sum: hcount[d] becomes one past the last slot of digit d.
+        partial_sum(hcount, hcount + BASE, hcount);
+        // Walk back to front so equal digits keep their relative order (stable).
+        for (int i = count - 1; i >= 0; i--) {
+            const int digit = (harr[i] / exp) % BASE;
+            output[--hcount[digit]] = harr[i];
+        }
+        CUDA_CHECK(cudaMemcpy(darr, output.data(), bytes, cudaMemcpyHostToDevice));
+        harr.swap(output);
+    }
+    CUDA_CHECK(cudaMemcpy(harr.data(), darr, bytes, cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaEventRecord(stop, 0));
+    CUDA_CHECK(cudaEventSynchronize(stop));
+    CUDA_CHECK(cudaEventElapsedTime(&time, start, stop));
+
+    int status = 0;
+    ofstream out("test/output", std::ios::out | std::ios::trunc);
+    if (out) {
+        for (int k = 0; k < count; k++) {
+            out << harr[k] << " " ;
+        }
+        out.close();
+    } else {
+        cerr << "Could not open test/output for writing" << endl;
+        status = 1;
+    }
+    cout << "The sorting process took " << time * 1000 << " microseconds to run." << endl;
+
+    CUDA_CHECK(cudaFree(dcount));
+    CUDA_CHECK(cudaFree(darr));
+    CURAND_CHECK(curandDestroyGenerator(curand_gen));
+    CUDA_CHECK(cudaEventDestroy(start));
+    CUDA_CHECK(cudaEventDestroy(stop));
+    return status;
+}
\ No newline at end of file
-- 
GitLab