// Provenance: contents of src/parallelRadix.cu as introduced by commit
// c27cb4a8eb5ee378f14de8dc012d1fbfdd3204a4 ("make parallel radix",
// Thu, 11 Apr 2019), reformatted and corrected.
//
// LSD radix sort (base 10) of random 32-bit unsigned integers. The GPU is used
// to find the maximum key and to build the per-digit histogram of every pass;
// the stable scatter of each pass is performed on the host.

#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <numeric>
#include <vector>

#include "cuda.h"
#include "cuda_runtime.h"
#include "curand.h"
#include "curand_kernel.h"
#include "device_launch_parameters.h"

using namespace std;

// Abort with a readable message when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            abort();                                                          \
        }                                                                     \
    } while (0)

// Abort with a readable message when a cuRAND host-API call fails.
#define CURAND_CHECK(call)                                                    \
    do {                                                                      \
        curandStatus_t st_ = (call);                                          \
        if (st_ != CURAND_STATUS_SUCCESS) {                                   \
            fprintf(stderr, "cuRAND error %s:%d: status %d\n", __FILE__,      \
                    __LINE__, static_cast<int>(st_));                         \
            abort();                                                          \
        }                                                                     \
    } while (0)

// Number of distinct digit values per radix pass (decimal digits).
constexpr int kRadix = 10;
// Threads per block used for both kernels (multiple of the warp size).
constexpr int kThreadsPerBlock = 256;

// Finds the largest element of `array` and stores it in array[0].
//
// Layout: 1-D grid of 1-D blocks; any grid/block size is valid because the
// kernel walks the input with a grid-stride loop.
// Shared memory: one statically allocated unsigned int per block.
// Side effect: only array[0] is modified (it ends up holding the maximum).
//
// Each thread scans its share of elements 1..n-1, the block combines the
// per-thread results with a shared-memory atomicMax, and one thread per block
// folds the block result into array[0] with a global atomicMax. Element 0 is
// never read directly: atomicMax already compares against its current value,
// and since that value only grows towards the true maximum the final content
// is the maximum of the whole array.
__global__ void getMax(unsigned int* array, int n) {
    // Uniform across the grid, so returning before the barriers is safe.
    if (n <= 0) {
        return;
    }

    __shared__ unsigned int blockMax;
    if (threadIdx.x == 0) {
        blockMax = 0u;
    }
    __syncthreads();

    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
    long long i = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (i == 0) {
        i += stride;  // array[0] is the accumulator; skip reading it
    }

    unsigned int localMax = 0u;  // identity element for max over unsigned
    for (; i < n; i += stride) {
        const unsigned int value = array[i];
        if (value > localMax) {
            localMax = value;
        }
    }

    atomicMax(&blockMax, localMax);
    __syncthreads();

    if (threadIdx.x == 0) {
        atomicMax(&array[0], blockMax);
    }
}

// Builds the histogram of the decimal digit selected by `exp`.
//
// For every element, digit = (arr[i] / exp) % 10 and count[digit] is
// incremented. `count` must point to 10 device ints that the caller has
// zeroed; `exp` must be non-zero (1, 10, 100, ...).
//
// Layout: 1-D grid of 1-D blocks; any grid/block size is valid (grid-stride
// loop). Shared memory: 10 statically allocated ints per block, used as a
// block-private histogram so that each block issues at most 10 global atomics.
__global__ void countOccurences(unsigned int * arr, int n, int * count, unsigned long long exp) {
    __shared__ int blockCount[kRadix];

    for (int d = threadIdx.x; d < kRadix; d += blockDim.x) {
        blockCount[d] = 0;
    }
    __syncthreads();

    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
    for (long long i = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
         i < n; i += stride) {
        const int digit = static_cast<int>((arr[i] / exp) % kRadix);
        atomicAdd(&blockCount[digit], 1);
    }
    __syncthreads();

    for (int d = threadIdx.x; d < kRadix; d += blockDim.x) {
        if (blockCount[d] != 0) {
            atomicAdd(&count[d], blockCount[d]);
        }
    }
}

// Generates 200000 pseudo-random keys on the device, sorts them with a
// base-10 LSD radix sort, writes the sorted keys (space separated) to
// "test/output" and prints the elapsed time of the sort measured with CUDA
// events. Returns 0 on success and 1 if the output file could not be opened.
int main(int argc, char *argv[]) {
    (void)argc;
    (void)argv;

    const long count = 200000;
    const size_t bytes = count * sizeof(unsigned int);
    // Ceil-div so the grid covers every element even when count is not a
    // multiple of the block size.
    const int blocks = static_cast<int>((count + kThreadsPerBlock - 1) / kThreadsPerBlock);

    curandGenerator_t curandGenerator;
    cudaEvent_t start, end;
    unsigned int * src = nullptr;   // device copy of the keys
    int * dcount = nullptr;         // device digit histogram (kRadix ints)
    float ms = 0.0f;
    unsigned int maxGlobal = 0;

    // Heap storage: two 800 KB arrays are too large to place on the stack
    // safely on every platform.
    vector<unsigned int> dst(count);
    vector<unsigned int> output(count);

    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&end));
    CURAND_CHECK(curandCreateGenerator(&curandGenerator, CURAND_RNG_PSEUDO_DEFAULT));
    CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curandGenerator, 13516042));
    CUDA_CHECK(cudaMalloc(&src, bytes));
    // Allocated once and reused by every digit pass.
    CUDA_CHECK(cudaMalloc(&dcount, sizeof(int) * kRadix));
    CURAND_CHECK(curandGenerate(curandGenerator, src, count));
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaMemcpy(dst.data(), src, bytes, cudaMemcpyDeviceToHost));

    // The largest key determines how many digit passes are required.
    getMax<<<blocks, kThreadsPerBlock>>>(src, static_cast<int>(count));
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(&maxGlobal, src, sizeof(unsigned int), cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaEventRecord(start, 0));
    // getMax overwrote src[0] with the maximum; restore the original keys.
    CUDA_CHECK(cudaMemcpy(src, dst.data(), bytes, cudaMemcpyHostToDevice));

    for (unsigned long long exp = 1; maxGlobal / exp > 0; exp *= 10) {
        int hcount[kRadix] = {0};

        CUDA_CHECK(cudaMemset(dcount, 0, sizeof(hcount)));
        countOccurences<<<blocks, kThreadsPerBlock>>>(src, static_cast<int>(count), dcount, exp);
        CUDA_CHECK(cudaGetLastError());
        // Blocking copy: also waits for the kernel to finish.
        CUDA_CHECK(cudaMemcpy(hcount, dcount, sizeof(hcount), cudaMemcpyDeviceToHost));

        // Inclusive prefix sum: hcount[d] becomes one past the last output
        // slot reserved for digit d.
        for (int i = 1; i < kRadix; i++) {
            hcount[i] += hcount[i - 1];
        }
        // A histogram that does not account for every key would make the
        // scatter below write out of bounds.
        if (hcount[kRadix - 1] != count) {
            cerr << "Digit histogram covers " << hcount[kRadix - 1]
                 << " of " << count << " keys" << endl;
            abort();
        }

        // Walk backwards so that equal digits keep their relative order.
        for (long i = count - 1; i >= 0; i--) {
            const int digit = static_cast<int>((dst[i] / exp) % kRadix);
            output[--hcount[digit]] = dst[i];
        }

        dst.swap(output);
        CUDA_CHECK(cudaMemcpy(src, dst.data(), bytes, cudaMemcpyHostToDevice));
    }

    CUDA_CHECK(cudaMemcpy(dst.data(), src, bytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaEventRecord(end, 0));
    CUDA_CHECK(cudaEventSynchronize(end));
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, end));

    int status = 0;
    ofstream out("test/output", std::ios::out | std::ios::trunc);
    if (!out) {
        cerr << "Cannot open test/output for writing" << endl;
        status = 1;
    } else {
        for (long i = 0; i < count; i++) {
            out << dst[i] << " ";
        }
        out.close();
    }

    // cudaEventElapsedTime reports milliseconds, so ms * 1000 is microseconds.
    cout << "Waktu : " << ms * 1000 << " mikrodetik." << endl;

    CUDA_CHECK(cudaFree(dcount));
    CUDA_CHECK(cudaFree(src));
    CURAND_CHECK(curandDestroyGenerator(curandGenerator));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(end));
    return status;
}