From c27cb4a8eb5ee378f14de8dc012d1fbfdd3204a4 Mon Sep 17 00:00:00 2001
From: tung <13516135@std.stei.itb.ac.id>
Date: Thu, 11 Apr 2019 22:20:19 +0700
Subject: [PATCH] make parallel radix

---
 src/parallelRadix.cu | 113 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 113 insertions(+)
 create mode 100644 src/parallelRadix.cu

diff --git a/src/parallelRadix.cu b/src/parallelRadix.cu
new file mode 100644
index 0000000..0c839d1
--- /dev/null
+++ b/src/parallelRadix.cu
@@ -0,0 +1,113 @@
+#include <iostream>
+#include <fstream>
+#include <numeric>
+#include "cuda_runtime.h"
+#include "cuda.h"
+#include "curand_kernel.h"
+#include "device_launch_parameters.h"
+
+using namespace std;
+
+// Finds the largest element of array[0..n) and leaves it in array[0].
+// Works for any 1D grid / 1D block launch: the loop strides by the total
+// thread count. array is a device pointer; array[1..n) is left unchanged.
+// n <= 1 is a no-op. No shared memory or block barrier is used.
+__global__ void getMax(unsigned int* array, int n) {
+	const long long stride = (long long)gridDim.x * blockDim.x;
+	const long long tid = (long long)blockDim.x * blockIdx.x + threadIdx.x;
+	unsigned int localMax = 0;
+	// Start at tid + 1: slot 0 is the accumulator and is only touched through
+	// atomicMax, so it is never plainly read while another thread updates it.
+	for (long long i = tid + 1; i < n; i += stride) {
+		if (array[i] > localMax) {
+			localMax = array[i];
+		}
+	}
+	// One atomic per thread merges the partial maxima across all blocks; the
+	// old pairwise tree had no barrier between steps and dropped elements when
+	// n was not a power of two. A partial maximum of 0 can never raise slot 0.
+	if (localMax > 0) atomicMax(&array[0], localMax);
+}
+
+// count[d] += number of keys in arr[0..n) whose decimal digit at place value exp is d;
+// count needs 10 slots. The loop strides by the grid size, so any 1D launch covers all n.
+__global__ void countOccurences(unsigned int * arr, int n, int * count, unsigned long long exp) {
+    const long long stride = (long long)gridDim.x * blockDim.x;
+    for (long long i = (long long)blockDim.x * blockIdx.x + threadIdx.x; i < n; i += stride) {
+        atomicAdd(&count[(arr[i] / exp) % 10], 1);
+    }
+}
+
+// Sorts `count` cuRAND-generated 32-bit keys with a decimal LSD radix sort:
+// per digit the GPU histograms the keys, the host prefix-sums the 10 bins and
+// scatters. Keys go to test/output; the time of the sort passes is printed.
+int main(int argc, char *argv[]) {
+    curandGenerator_t curandGenerator;
+    cudaEvent_t start, end;
+    const long count = 200000;
+    const size_t bytes = count * sizeof(unsigned int);
+    unsigned int * dst = new unsigned int[count]; // heap: 2 x 800 KB may overflow the stack
+    unsigned int * output = new unsigned int[count];
+    unsigned int * src;
+    int * dcount;
+    float ms;
+    unsigned int maxGlobal;
+
+    cudaEventCreate(&start);
+    cudaEventCreate(&end);
+    curandCreateGenerator(&curandGenerator, CURAND_RNG_PSEUDO_DEFAULT);
+    curandSetPseudoRandomGeneratorSeed(curandGenerator, 13516042);
+    cudaMalloc(&src, bytes);
+    cudaMalloc(&dcount, sizeof(int) * 10); // allocated once, reused by every pass
+    curandGenerate(curandGenerator, src, count);
+    cudaDeviceSynchronize();
+    cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost);
+
+    // Ceil-div so g * 1024 >= count; truncation left the array tail unprocessed.
+    const int g = (count + 1023) / 1024;
+    int b = 512; // getMax gets one thread per pair of elements
+    getMax <<<g, b >>>(src, count);
+    cudaMemcpy(&maxGlobal, src, sizeof(unsigned int), cudaMemcpyDeviceToHost);
+    cudaEventRecord(start, 0);
+    cudaMemcpy(src, dst, bytes, cudaMemcpyHostToDevice); // getMax writes into src: restore the keys
+    b *= 2; // countOccurences gets one thread per element
+
+    for (unsigned long long exp = 1; maxGlobal / exp > 0; exp *= 10) {
+        int hcount[10] = {0};
+        cudaMemset(dcount, 0, sizeof(int) * 10);
+        countOccurences <<<g, b>>>(src, count, dcount, exp);
+        cudaMemcpy(hcount, dcount, 10 * sizeof(int), cudaMemcpyDeviceToHost);
+        // Inclusive prefix sum: hcount[d] becomes one past the last slot of digit d.
+        for (int i = 1; i < 10; i++) hcount[i] += hcount[i - 1];
+        // Walk backwards so keys with equal digits keep their order (stable pass).
+        for (int i = count - 1; i >= 0; i--) {
+            const int digit = (dst[i] / exp) % 10;
+            output[--hcount[digit]] = dst[i];
+        }
+        cudaMemcpy(src, output, bytes, cudaMemcpyHostToDevice);
+        unsigned int * tmp = dst; dst = output; output = tmp; // swap host buffers, no copy
+    }
+
+    cudaEventRecord(end, 0);
+    cudaEventSynchronize(end);
+    cudaEventElapsedTime(&ms, start, end);
+    // Report the error, if any, recorded by the runtime calls and launches above.
+    const cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) cerr << "CUDA error: " << cudaGetErrorString(err) << endl;
+    ofstream out("test/output", std::ios::out | std::ios::trunc);
+    for(int i = 0; i < count; i++){
+        out << dst[i] << " " ;
+    }
+    out.close();
+    cout << "Waktu : " << ms / 1000 << " detik." << endl; // ms -> seconds ("detik")
+
+    cudaFree(dcount);
+    cudaFree(src);
+    curandDestroyGenerator(curandGenerator);
+    cudaEventDestroy(start);
+    cudaEventDestroy(end);
+    delete[] dst;
+    delete[] output;
+    return err == cudaSuccess ? 0 : 1;
+}
+
-- 
GitLab