From ea3bfbfb8aef5121f850f0bf2bc504b236e60f85 Mon Sep 17 00:00:00 2001
From: Jose <13516027@std.stei.itb.ac.id>
Date: Thu, 11 Apr 2019 08:26:28 +0700
Subject: [PATCH] Add src and makefile

---
 makefile              |   6 +++
 src/parallel_radix.cu | 119 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 125 insertions(+)
 create mode 100644 makefile
 create mode 100755 src/parallel_radix.cu

diff --git a/makefile b/makefile
new file mode 100644
index 0000000..7d35f96
--- /dev/null
+++ b/makefile
@@ -0,0 +1,6 @@
+CP = parallel_radix.cu
+EXE = radix
+SRC_DIR = src
+LDLIBS += -lcurand
+build: # compiles the CUDA source under SRC_DIR with nvcc into EXE, linking the libraries in LDLIBS (cuRAND)
+	nvcc -o $(EXE) $(SRC_DIR)/$(CP) $(LDLIBS)
diff --git a/src/parallel_radix.cu b/src/parallel_radix.cu
new file mode 100755
index 0000000..55f734e
--- /dev/null
+++ b/src/parallel_radix.cu
@@ -0,0 +1,119 @@
+#include <iostream>
+#include <fstream>
+#include <numeric>
+#include "cuda_runtime.h"
+#include "cuda.h"
+#include "curand_kernel.h"
+#include "device_launch_parameters.h"
+
+#define MAX_BLOCK_SIZE 1024
+#define ARRAY_SIZE 200000
+#define BASE 10
+#define RNG_SEED 13516027
+
+using namespace std;
+
+// Writes the first n values of arr to stdout, each followed by a space, then ends the line.
+void printArray(unsigned int * arr, int n) {
+    for (const unsigned int * cur = arr; n > 0; --n, ++cur) cout << *cur << " ";
+    cout << endl;
+}
+
+// Stores the largest of input[0..n-1] in input[0]; the other elements are left untouched.
+// For n <= 1 nothing is written.
+// Valid for any 1-D grid/block shape: each thread strides through the array by the total
+// number of launched threads and publishes its partial maximum with one atomicMax. A single
+// atomic pass is used rather than a multi-round tree because tree rounds would need a barrier
+// across blocks, which __syncthreads() (block scope only) cannot provide.
+__global__ void get_max(unsigned int * input, int n)
+{
+    const long long first = (long long)blockDim.x * blockIdx.x + threadIdx.x;
+    const long long stride = (long long)gridDim.x * blockDim.x;
+    unsigned int local_max = 0;
+    // Start at 1: input[0] is the accumulator and is only ever accessed through atomicMax.
+    for (long long i = first + 1; i < n; i += stride) {
+        if (input[i] > local_max) local_max = input[i];
+    }
+    // A zero can never raise the maximum, so threads that saw nothing larger skip the atomic.
+    if (local_max > 0) {
+        atomicMax(&input[0], local_max);
+    }
+}
+
+// Digit histogram for one radix pass: for each of the n keys in arr, atomically adds 1 to
+// count[(key / exp) % BASE]. count needs at least BASE entries and is not cleared here; exp must
+// be non-zero. Threads stride by the launch's total thread count, so any 1-D launch covers all keys.
+__global__ void count_occurences(unsigned int * arr, int n, int * count, unsigned long long exp){
+    const long long stride = (long long)gridDim.x * blockDim.x;
+    for (long long i = (long long)blockDim.x * blockIdx.x + threadIdx.x; i < n; i += stride)
+        atomicAdd(&count[(arr[i] / exp) % BASE], 1);
+}
+// Fills a device array with ARRAY_SIZE cuRAND values, radix-sorts it one base-BASE digit at a time
+// (histogram on the GPU, prefix sum and stable scatter on the host), prints the elapsed time and
+// writes the keys to test/output. Returns 1 if a CUDA/cuRAND call or the output file fails.
+int main(int argc, char *argv[])
+{
+    auto ok = [](cudaError_t err) -> bool { // reports a failed CUDA runtime call on stderr
+        if (err != cudaSuccess) cerr << "CUDA error: " << cudaGetErrorString(err) << endl;
+        return err == cudaSuccess;
+    };
+    const int count = ARRAY_SIZE;
+    const size_t bytes = count * sizeof(unsigned int);
+    // Round up so the last, partially filled block of keys is covered as well.
+    const int grid = (count + MAX_BLOCK_SIZE - 1) / MAX_BLOCK_SIZE;
+    curandGenerator_t curand_gen;
+    cudaEvent_t start, stop;
+    float elapsed_ms = 0.0f;
+    unsigned int * darr = nullptr;
+    int * dcount = nullptr; // digit histogram, allocated once and reused by every pass
+    unsigned int * harr = new unsigned int[count];   // heap: the key buffers are too big for the stack
+    unsigned int * output = new unsigned int[count];
+    if (!ok(cudaEventCreate(&start)) || !ok(cudaEventCreate(&stop))) return 1;
+    if (!ok(cudaMalloc(&darr, bytes)) || !ok(cudaMalloc(&dcount, sizeof(int) * BASE))) return 1;
+    //rng
+    if (curandCreateGenerator(&curand_gen, CURAND_RNG_PSEUDO_DEFAULT) != CURAND_STATUS_SUCCESS
+        || curandSetPseudoRandomGeneratorSeed(curand_gen, RNG_SEED) != CURAND_STATUS_SUCCESS
+        || curandGenerate(curand_gen, darr, count) != CURAND_STATUS_SUCCESS) {
+        cerr << "cuRAND setup or generation failed" << endl;
+        return 1;
+    }
+    if (!ok(cudaDeviceSynchronize()) || !ok(cudaMemcpy(harr, darr, bytes, cudaMemcpyDeviceToHost))) return 1;
+    //find max: the kernel leaves its result in darr[0]; darr is reloaded from harr before sorting
+    get_max <<<grid, MAX_BLOCK_SIZE / 2>>>(darr, count);
+    unsigned int global_max = 0;
+    if (!ok(cudaGetLastError()) || !ok(cudaMemcpy(&global_max, darr, sizeof(global_max), cudaMemcpyDeviceToHost))) return 1;
+    //sort
+    cudaEventRecord(start, 0);
+    if (!ok(cudaMemcpy(darr, harr, bytes, cudaMemcpyHostToDevice))) return 1;
+    for (unsigned long long exp = 1; global_max / exp > 0; exp *= BASE) {
+        int hcount[BASE] = {0};
+        if (!ok(cudaMemset(dcount, 0, sizeof(hcount)))) return 1;
+        count_occurences <<<grid, MAX_BLOCK_SIZE>>>(darr, count, dcount, exp);
+        if (!ok(cudaGetLastError())) return 1;
+        // The blocking copy also waits for the kernel, so no separate synchronize is needed.
+        if (!ok(cudaMemcpy(hcount, dcount, sizeof(hcount), cudaMemcpyDeviceToHost))) return 1;
+        for (int i = 1; i < BASE; i++) hcount[i] += hcount[i - 1];
+        // Walk backwards so keys with equal digits keep their relative order.
+        for (int i = count - 1; i >= 0; i--) output[--hcount[(harr[i] / exp) % BASE]] = harr[i];
+        if (!ok(cudaMemcpy(darr, output, bytes, cudaMemcpyHostToDevice))) return 1;
+        unsigned int * const previous = harr; // swap buffers: this pass's output feeds the next
+        harr = output; output = previous;
+    }
+    if (!ok(cudaMemcpy(harr, darr, bytes, cudaMemcpyDeviceToHost))) return 1;
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&elapsed_ms, start, stop);
+    ofstream out("test/output", std::ios::out | std::ios::trunc);
+    const bool file_ok = out.is_open();
+    if (!file_ok) cerr << "could not open test/output for writing" << endl;
+    for (int k = 0; k < count; k++) out << harr[k] << " ";
+    out.close();
+    cout << "The sorting process took " << elapsed_ms * 1000 << " microseconds to run." << endl;
+    delete[] harr; delete[] output;
+    cudaFree(dcount);
+    cudaFree(darr);
+    curandDestroyGenerator(curand_gen);
+    cudaEventDestroy(start);
+    cudaEventDestroy(stop);
+    return file_ok ? 0 : 1;
+}
\ No newline at end of file
-- 
GitLab