From e4d05172b37a880504d85b93eed58dfb08624548 Mon Sep 17 00:00:00 2001
From: Ayrton Cyril <13516019@std.stei.itb.ac.id>
Date: Thu, 11 Apr 2019 18:25:35 +0700
Subject: [PATCH] done

---
 Makefile                    |  9 ++--
 run_cuda.sh                 |  3 +-
 src/radixSort/radixSort.cu  | 99 +++++++++++++++++++++++--------------
 src/radixSort/radixSort.cuh |  2 +-
 src/radixsort_paralel.cu    | 60 +++-------------------
 src/radixsort_serial.c      | 60 ++++++----------------
 src/util/util.cpp           |  7 ++-
 src/util/util.hpp           |  3 +-
 8 files changed, 99 insertions(+), 144 deletions(-)

diff --git a/Makefile b/Makefile
index 42b6e0c..ffb4e92 100644
--- a/Makefile
+++ b/Makefile
@@ -5,10 +5,6 @@ LIB := -lm
 EXEC_PARALEL := radixsort_paralel
 EXEC_SERIAL := radixsort_serial
 
-serial-only:
-	@mkdir -p ${OUTPUT_DIR}
-	${CXX} src/radixsort_serial.c -o ${EXEC_SERIAL}
-
 debug:
 	@mkdir -p ${OUTPUT_DIR}
 	${CUDA} src/radixsort_paralel.cu src/cudaUtil/cudaUtil.cu src/radixSort/radixSort.cu src/util/util.cpp --device-c
@@ -17,8 +13,10 @@ debug:
 
 build:
 	@mkdir -p ${OUTPUT_DIR}
-	${CUDA} src/radixsort_paralel.cu -o ${EXEC_PARALEL}
-	${CXX} src/radixsort_serial.c -o ${EXEC_SERIAL}
+	${CUDA} src/radixsort_paralel.cu src/cudaUtil/cudaUtil.cu src/radixSort/radixSort.cu src/util/util.cpp --device-c
+	${CUDA}  radixsort_paralel.o util.o cudaUtil.o radixSort.o -o bin/radixsort_paralel.out
+# radixsort_serial.c defines its own generate_random/getTimeDiff, so util.cpp must not be linked in (duplicate symbols).
+	${CXX} src/radixsort_serial.c -o ${EXEC_SERIAL}
 
 clean:
 	@rm -r ${OUTPUT_DIR} || true
diff --git a/run_cuda.sh b/run_cuda.sh
index adc6b33..b0d0937 100644
--- a/run_cuda.sh
+++ b/run_cuda.sh
@@ -1 +1,2 @@
-./bin/radixsort_paralel.out $1
\ No newline at end of file
+./bin/radixsort_paralel.out $1
+./radixsort_serial $1
\ No newline at end of file
diff --git a/src/radixSort/radixSort.cu b/src/radixSort/radixSort.cu
index 0bc3d16..9a5182a 100644
--- a/src/radixSort/radixSort.cu
+++ b/src/radixSort/radixSort.cu
@@ -1,54 +1,81 @@
 #include "radixSort.cuh"
 #include "../cudaUtil/cudaUtil.cuh"
+#include "../util/util.hpp"
 #include <stdio.h>
 using namespace std; 
 #define WSIZE 32
-__device__ int* d_arr;
 
-__device__ void print_arr(int* arr, int n){
-	for (int i = 0; i<n; i++){
-	  printf("%d ", arr[i]);
-	}
-	printf("\n");
-  }
+// Adds one to count[(arr[i]/exp) % 10] for every i < n; count is not zeroed here.
+__global__ void createBucket(int arr[], int count[], int n, int exp){
+	const int step = gridDim.x * blockDim.x;                  // threads in the whole grid
+	const int first = blockIdx.x * blockDim.x + threadIdx.x;  // this thread's global index
+	for (int idx = first; idx < n; idx += step)
+		atomicAdd(&count[ (arr[idx]/exp)%10 ], 1);             // buckets are shared, hence atomic
+}
 
-__global__ void countSort(int arr[], int n, int exp) 
-{ 
-	__shared__ int *output; // output d_array 
-	int num_div_per_thread = n / WSIZE;
-	int i;
-	__shared__ int count[10];
-	if(threadIdx.x == 0){
-		output = (int*) malloc(sizeof(int) * n);
-		for(int j = 0; j<10;j++){
-			count[j] = 0;
-		}
-	}
+// Grid-stride copy of output[0..n) into arr[0..n); both are dereferenced on the device.
+__global__ void copyToArray(int arr[], int output[], int n){
+	int stride = blockDim.x * gridDim.x;
+	int pointer = blockDim.x * blockIdx.x + threadIdx.x;
+	// No __syncthreads() here: every element is independent, and a barrier inside a
+	// loop whose trip count differs between threads of one block is undefined behaviour.
+	for (int i = pointer; i < n; i += stride) arr[i] = output[i];
+}
+
+void countSort(int arr[], int n, int exp) { // one counting-sort pass over arr[0..n) keyed on the digit (arr[i]/exp)%10
+	dim3 gridSize(20,1,1); // both kernels below are launched with 20 blocks ...
+	dim3 blockSize(20,1,1); // ... of 20 threads each
+	int *d_output, *output = (int*) malloc(sizeof(int) * n); // output: host scatter buffer (malloc result is not checked)
+	int *d_arr, *d_count; // device copies of the keys and of the 10 bucket counters
+    int i, count[10] = {0}; // host-side bucket counters, all zero
+  
+	cudaMalloc((void **) &d_count, sizeof(int) * 10); // return codes of the CUDA calls in this function are ignored
+    cudaMalloc((void **) &d_arr, sizeof(int) * n);
+	cudaMalloc((void **) &d_output, sizeof(int) * n);
+	
+	cudaMemcpy(d_arr, arr, sizeof(int) * n, cudaMemcpyHostToDevice); // upload the keys
+    cudaMemcpy(d_count, count, sizeof(int) * 10, cudaMemcpyHostToDevice); // upload the zeroed counters
     // Store count of occurrences in count[] 
-    for (i = threadIdx.x * num_div_per_thread; i < (threadIdx.x + 1) * num_div_per_thread; i++) 
-        count[ (arr[i]/exp)%10 ]++; 
+	createBucket<<<gridSize, blockSize>>>(d_arr, d_count, n, exp); // the device fills d_count
+	cudaDeviceSynchronize(); // wait for the kernel to finish
+	cudaMemcpy(count, d_count, sizeof(int) * 10, cudaMemcpyDeviceToHost); // bucket totals back to the host
   
     // Change count[i] so that count[i] now contains actual 
-	//  position of this digit in output[] 
-	__syncthreads();
-	if(threadIdx.x == 0){
-		for (i = 1; i < 10; i++) 
-			count[i] += count[i - 1]; 
-	}
-  
-    // Build the output d_array 
+    //  position of this digit in output[] 
+    for (i = 1; i < 10; i++) // inclusive prefix sum, computed on the host
+        count[i] += count[i - 1]; 
+  
+    // Build the output array 
     for (i = n - 1; i >= 0; i--) 
     { 
-        output[count[ (d_arr[i]/exp)%10 ] - 1] = d_arr[i]; 
-        count[ (d_arr[i]/exp)%10 ]--; 
+        output[count[ (arr[i]/exp)%10 ] - 1] = arr[i]; // walking backwards keeps equal digits in their original order
+        count[ (arr[i]/exp)%10 ]--; // the next key with this digit goes one slot earlier
     } 
   
-    // Copy the output d_array to d_arr[], so that d_arr[] now 
+    // Copy the output array to arr[], so that arr[] now 
     // contains sorted numbers according to current digit 
-    for (i = threadIdx.x * num_div_per_thread; i < (threadIdx.x + 1) * num_div_per_thread; i++) 
-		arr[i] = output[i]; 
-	print_arr(arr,n);
-	__syncthreads();
+    cudaMemcpy(d_output, output, sizeof(int) * n, cudaMemcpyHostToDevice); // the scatter ran on the host, so upload its result ...
+    copyToArray<<<gridSize, blockSize>>>(d_arr, d_output, n); // ... copy it into d_arr on the device ...
+    cudaDeviceSynchronize();
+    cudaMemcpy(arr, d_arr, sizeof(int) * n, cudaMemcpyDeviceToHost); // ... and download d_arr into the caller's array
+
+    cudaFree(d_count);
+    cudaFree(d_output);
+    cudaFree(d_arr);
+    free(output);
+} 
+  
+// Base-10 LSD radix sort of arr[0..n), in place; negative keys are not supported.
+void radix_sort(int arr[], int n)
+{
+    // The largest key determines how many decimal digits have to be processed.
+    int m = get_max(arr, n);
+    // One counting-sort pass per digit; exp is 10^i for the i-th digit.
+    for (int exp = 1; m/exp > 0; exp *= 10) {
+        countSort(arr, n, exp);
+        // That was the most significant digit: leave before exp *= 10 can
+        if (m/exp < 10) break; // overflow int (happens when m >= 10^9).
+    }
 } 
   
   
diff --git a/src/radixSort/radixSort.cuh b/src/radixSort/radixSort.cuh
index e6aec1b..1b73c9c 100644
--- a/src/radixSort/radixSort.cuh
+++ b/src/radixSort/radixSort.cuh
@@ -1,7 +1,7 @@
 #ifndef RADIX_SORT_CUH
 #define RADIX_SORT_CUH
 
-__global__ void countSort(int arr[], int n, int exp); 
+void radix_sort(int arr[], int n);
 __device__ void sortMSB(int* out, int* arr, int n);
 
 #endif
\ No newline at end of file
diff --git a/src/radixsort_paralel.cu b/src/radixsort_paralel.cu
index f6872cc..8520fc3 100644
--- a/src/radixsort_paralel.cu
+++ b/src/radixsort_paralel.cu
@@ -2,44 +2,9 @@
 #include "util/util.hpp"
 #include "cudaUtil/cudaUtil.cuh"
 #include "radixSort/radixSort.cuh"
-
+#include <sys/time.h>
 #define WSIZE 32
 
-void count_sort_paralel_first(int arr[],int count[], int n, int exp) 
-{ 
-    int *output = (int*) malloc (sizeof(int) * n); // output array 
-    int i,displacement[10] = {0};
-  
-    // Store count of occurrences in count[] 
-    for (i = 0; i < n; i++) {
-        count[ (arr[i]/exp)%10 ]++; 
-        displacement[ (arr[i]/exp)%10 ]++; 
-    }
-  
-    // Change count[i] so that count[i] now contains actual 
-    //  position of this digit in output[] 
-    for (i = 1; i < 10; i++) 
-        displacement[i] += displacement[i - 1]; 
-  
-    // Build the output array 
-    for (i = n - 1; i >= 0; i--) 
-    { 
-        output[displacement[ (arr[i]/exp)%10 ] - 1] = arr[i]; 
-        displacement[ (arr[i]/exp)%10 ]--; 
-    } 
-  
-    // Copy the output array to arr[], so that arr[] now 
-    // contains sorted numbers according to current digit 
-    for (i = 0; i < n; i++) 
-        arr[i] = output[i]; 
-}
-int getMax(int* arr, int n){
-	int mx = arr[0]; 
-    for (int i = 1; i < n; i++) 
-        if (arr[i] > mx) 
-            mx = arr[i]; 
-    return mx; 
-}
 
 
 int main(int argc, char *argv[]) {
@@ -50,7 +15,7 @@ int main(int argc, char *argv[]) {
     }
 
     int n = atoi(argv[1]);
-
+	struct timeval start, end;
     int* arr = (int *) malloc(sizeof(int) * n);
     
     int* d_arr;
@@ -60,25 +25,14 @@ int main(int argc, char *argv[]) {
     // cudaMalloc((void**)&d_out, sizeof(int) * n);
     
 	generate_random(arr, n);
-
-	printArr(arr,n);
-
-	int m = getMax(arr,n);
-	cudaMemcpyToSymbol(d_arr, arr, sizeof(float) * n);   
-	for (int exp = 1; m/exp > 0; exp *= 10) 
-        countSort<<<1,WSIZE>>>(arr, n, exp); 
-	cudaDeviceSynchronize();
-	
-	
-	
-    
+    gettimeofday(&start, NULL);
+    radix_sort(arr, n);
+	gettimeofday(&end, NULL);
 	
-    cudaMemcpyFromSymbol(arr, d_arr, sizeof(float) * n);    	
-    cudaDeviceSynchronize();
-	printArr(arr,n);
+	printf("\n");
+    printf("Parallel execution time = %ld.\n", getTimeDiff(start, end));
 
 
-    cudaFree(d_arr);
     
     return 0;
 }
diff --git a/src/radixsort_serial.c b/src/radixsort_serial.c
index a95dac0..ac2d806 100644
--- a/src/radixsort_serial.c
+++ b/src/radixsort_serial.c
@@ -1,22 +1,19 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <time.h>
 
-#define SEED 13516019
-#define MAX_DIGIT 128
-#define OUTPUT_FILE "output/output_serial.txt"
 
-// A function to randomize arr with size n using seed
-void randomizeArray(int arr[], int n) {
-    int seed = SEED; // Ganti dengan NIM anda sebagai seed.
-    srand(seed);
-    for(long i = 0; i < n; i++) {
-        arr[i] = (int)rand() % 999999;
-    }
-}
+#include <stdio.h>
+#include <stdlib.h>
+#include "util/util.hpp"
+#include <sys/time.h>
 
 
+// Fills arr[0..n) with rand() values drawn after seeding with a fixed constant.
+void generate_random(int *arr, int n){
+  const int seed = 13516016;
+  srand(seed);
+  for (long idx = 0; idx < n; ++idx) {
+    arr[idx] = (int)rand();
+  }
+}
 int getMax(int arr[], int n) 
 { 
     int mx = arr[0]; 
@@ -72,32 +69,6 @@ void radix_sort(int arr[], int n)
         countSort(arr, n, exp); 
 } 
   
-// A utility function to print an array 
-void print(int arr[], int n) 
-{ 
-    for (int i = 0; i < n; i++) 
-        printf("%d ", arr[i]);
-}
-// A utility function to write array in a output file
-void printToFile(int arr[], int n, char name[50]) {
-    FILE* fOut = fopen(name, "w");
-    char temp[MAX_DIGIT];
-
-    if (fOut != NULL) {
-        sprintf(temp, "Array size: %d\n", n);
-        fputs(temp, fOut);
-        for (int i = 0; i < n; i++) {
-            sprintf(temp, "%d ", arr[i]);
-            fputs(temp, fOut);
-        }
-    } else {
-        printf("Unable to write file");
-    }
-
-    fclose(fOut);
-}
-  
-// A utility function to get different of two time in nanosecond
 long getTimeDiff(struct timeval start, struct timeval end) {
     long result = (long) end.tv_sec * 1000000 + end.tv_usec - \
                   (long) start.tv_sec * 1000000 + start.tv_usec;
@@ -105,6 +76,7 @@ long getTimeDiff(struct timeval start, struct timeval end) {
     return result;
 }
   
+  
 // Main program to implement RadixSort
 int main(int argc, char** argv) 
 { 
@@ -118,7 +90,7 @@ int main(int argc, char** argv)
     int* arr;
 
     arr = (int*) malloc(sizeof(int) * n);
-    randomizeArray(arr, n);
+    generate_random(arr, n);
 
     // Calculate the time and execute the radixsort
     gettimeofday(&start, NULL);
@@ -126,9 +98,7 @@ int main(int argc, char** argv)
     gettimeofday(&end, NULL);
 
     printf("\n");
-    printf("Execution time (microseconds) - Serial: %ld \xE6s.\n", getTimeDiff(start, end));
-
-    printToFile(arr, n, OUTPUT_FILE);
+    printf("Serial execution time = %ld.\n", getTimeDiff(start, end));
 
     return 0; 
 }
\ No newline at end of file
diff --git a/src/util/util.cpp b/src/util/util.cpp
index c1cc0d5..d43dbb1 100644
--- a/src/util/util.cpp
+++ b/src/util/util.cpp
@@ -8,12 +8,17 @@ void generate_random(int *arr, int n){
   srand(seed);
   for (long i = 0; i < n; i++)
   {
-    arr[i] = (int)rand()%100;
+    arr[i] = (int)rand();
   }
 }
 
 
+// Microseconds from start to end; each timestamp is summed before subtracting.
+long getTimeDiff(struct timeval start, struct timeval end) {
+    long start_us = (long) start.tv_sec * 1000000 + start.tv_usec;
 
+    return ((long) end.tv_sec * 1000000 + end.tv_usec) - start_us;
+}
 int get_max(int arr[], int n){
   int max=INT_MIN;
   for (int i=0; i<n; i++){
diff --git a/src/util/util.hpp b/src/util/util.hpp
index 97b27dc..db69ab8 100644
--- a/src/util/util.hpp
+++ b/src/util/util.hpp
@@ -4,5 +4,6 @@
 void generate_random(int* arr, int n);
 // void print_arr(int * array, int num_elements);
 void assert_sorted(int arr[], int n, double time_elapsed);
-
+int get_max(int arr[], int n);
+long getTimeDiff(struct timeval start, struct timeval end);
 #endif
\ No newline at end of file
-- 
GitLab