diff --git a/.DS_Store b/.DS_Store
index ea1d8fcae3082442de5386a3a8dd2fa5cbd0955a..707c39b8ba9621345fc9797d7071c7eea90abfc7 100644
Binary files a/.DS_Store and b/.DS_Store differ
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..2d663dc0d3f81bb59d4d4e4cc7b6f9d4a39ab6c3
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,6 @@
+.PHONY: all clean
+all:
+	nvcc src/radix_sort_cuda.cu -o radix_sort_cuda
+
+clean:
+	rm -f radix_sort_cuda radix_sort_cuda.exe
diff --git a/doc/README.md b/doc/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..10acabed211b8da80e076e428023014ef500f139
--- /dev/null
+++ b/doc/README.md
@@ -0,0 +1,62 @@
+## Praktikum 02
+# OpenMPI - Radix Sort
+
+# How To
+Run `make` then open `sort` using `mpirun`
+
+# Pembagian Tugas
+13516100 - Putu Gery Wahyu Nugraha
+* Counting Sort
+* Radix Sort
+* Everything related to MPI
+* Doc and README
+
+13516091 - Yasya Rusyda Aslina
+* Desain solusi paralel
+* Testing
+
+# Deskripsi Solusi Paralel
+OpenMPI merupakan sebuah message passing interface, oleh karena itu kemampuan terbesarnya adalah mengirimkan pesan antar proses.
+
+Perlu dibuat sebuah solusi radix sort yang menggunakan sistem pengiriman pesan antar proses. Salah satu cara termudahnya adalah
+
+1. Bagi data yang akan di sort kepada setiap proses secara adil (data lokal)
+2. Hitung nilai count dari setiap data yang akan di sort
+3. Sort data lokal pada tiap-tiap proses
+4. Gabungkan semua data lokal yang ada dari proses
+5. Gabungkan nilai count dari tiap proses
+6. Lakukan sort secara global
+7. Ulangi langkah 1 
+
+Dari langkah-langkah tersebut
+
+* Langkah 1 bisa kita selesaikan dengan menggunakan `MPI_Scatter`
+* Langkah 4 bisa kita selesaikan dengan menggunakan `MPI_Gather`
+* Langkah 5 bisa kita selesaikan dengan menggunakan `MPI_Reduce` dan `MPI_SUM`
+
+# Analisis Solusi
+Setelah penulis menulis kode di atas, penulis menemukan banyak hal yang dapat diefektifkan. Salah satunya adalah bagaimana program ini melakukan sorting. Jika kita melihat langkah yang telah dideskripsikan di atas, maka kita menemukan bahwa:
+* Sort dilakukan dua kali, dalam scope lokal dan scope global. Tidak efektif, kemungkinan besar bisa dilakukan sekali saja
+
+# Jumlah Thread yang Digunakan
+## 2
+Ketika melakukan pengerjaan melalui komputer pribadi, kami menemukan bahwa jika nilai thread lebih dari 2, ada kemungkinan program MPI tersebut stuck, walaupun hal ini tidak pernah kami temukan di komputer remote. Maka dari itu, untuk mendapatkan data yang konsisten, kami memutuskan untuk menggunakan 2 thread saja.
+
+# Pengukuran Kinerja
+| N | Size | 1     | 2 | 3 | avg |
+|---|------|-------|---|---|-----|
+| 1 | 5000 | 15145 | 15131 | 15273 | 15183 | 
+| 2 | 5000 | 40787 | 11316 | 5893  | 19332 |
+| 1 | 50000| 187103| 148275| 105336 | 146904 |
+| 2 | 50000| 89551 | 130345| 119153 | 113016 |
+| 1 |100000| 224312|154683 |154609 |177868 | 
+| 2 |100000| 217054|197716|150975|188581 |
+| 1 |200000|621111|629161|706030|652100 |
+| 2 |200000|304229|358362|302638|321743 |
+| 1 |400000|1142259|728879|1230207|1033781 |
+| 2 |400000|1339009|1217159|1199709|1251959 |
+
+# Analisis Pengukuran Kinerja
+Dari data kinerja di atas, sebenarnya kami lebih cenderung merasa bahwa faktor terbesar berkurangnya waktu bukanlah jumlah thread yang digunakan, tapi bagaimana kondisi cache dari prosesor. Walaupun dari data di atas dapat kita temukan perbedaan, perbedaan tersebut sifatnya random dan tidak memiliki arah yang jelas. Oleh karena itu setiap pengurangan waktu yang terjadi ketika jumlah thread ditambahkan (atau penambahan waktu) lebih cenderung disebabkan oleh kondisi cache pada saat itu.
+
+Tapi karena radix sort parallel jauh lebih unggul ketika jumlah data banyak maka mungkin saja penurunan drastis yang terjadi ketika menggunakan dua thread ketika size = 200000 disebabkan oleh penggunaan parallel programming dan bukan kondisi cache saja.
diff --git a/radix_sort_cuda b/radix_sort_cuda
new file mode 100755
index 0000000000000000000000000000000000000000..9d9a3d32e12815753cea4e6f8e471a19f8e132bb
Binary files /dev/null and b/radix_sort_cuda differ
diff --git a/src/.DS_Store b/src/.DS_Store
index 8d1c6ed5fdc8f98ecae7e03285b3580fa78ccedd..4912157825bb3ffe363aeada5f93b27d3b9d50db 100644
Binary files a/src/.DS_Store and b/src/.DS_Store differ
diff --git a/src/Makefile b/src/Makefile
deleted file mode 100644
index 544e8c26fc65071f1dd73af69516a4d916b9f3ae..0000000000000000000000000000000000000000
--- a/src/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-build:
-	gcc-8 -g -Wall -o radix_sort radix_sort.c -fopenmp -lm 
diff --git a/src/radix_sort_cuda b/src/radix_sort_cuda
new file mode 100755
index 0000000000000000000000000000000000000000..b7f9a8e77e7da0869d7203a4bdc89c10cddf3768
Binary files /dev/null and b/src/radix_sort_cuda differ
diff --git a/src/radix_par2.cu b/src/radix_sort_cuda.cu
old mode 100644
new mode 100755
similarity index 96%
rename from src/radix_par2.cu
rename to src/radix_sort_cuda.cu
index 356cdad952dde190110e139dc56d3b58dd39c58a..501fec820357cfd6d39a822dce28b22b5602cca4
--- a/src/radix_par2.cu
+++ b/src/radix_sort_cuda.cu
@@ -1,118 +1,118 @@
-// C++ implementation of Radix Sort 
-#include <iostream> 
-#include <cstdlib>
-#include <time.h>
-using namespace std; 
-  
-// A utility function to get maximum value in arr[] 
-int getMax(int arr[], int n) 
-{ 
-    int mx = arr[0]; 
-    for (int i = 1; i < n; i++) 
-        if (arr[i] > mx) 
-            mx = arr[i]; 
-    return mx; 
-} 
-  
-// A function to do counting sort of arr[] according to 
-// the digit represented by exp. 
-__global__ void countSort(int arr[], int n, int exp) 
-{ 
-    int *output= (int*)malloc(sizeof(int)*n); // output array 
-    int i, count[10] = {0}; 
-  
-    // Store count of occurrences in count[] 
-    for (i = 0; i < n; i++) 
-        count[ (arr[i]/exp)%10 ]++; 
-  
-    // Change count[i] so that count[i] now contains actual 
-    //  position of this digit in output[] 
-    for (i = 1; i < 10; i++) 
-        count[i] += count[i - 1]; 
-  
-    // Build the output array 
-    for (i = n - 1; i >= 0; i--) 
-    { 
-        output[count[ (arr[i]/exp)%10 ] - 1] = arr[i]; 
-        count[ (arr[i]/exp)%10 ]--; 
-    } 
-  
-    // Copy the output array to arr[], so that arr[] now 
-    // contains sorted numbers according to current digit 
-    for (i = 0; i < n; i++) 
-        arr[i] = output[i]; 
-} 
-  
-// The main function to that sorts arr[] of size n using  
-// Radix Sort 
-void radixsort(int arr[], int n) 
-{ 
-    int *d_arr;
-    // Find the maximum number to know number of digits 
-    int m = getMax(arr, n); 
-  
-    // Do counting sort for every digit. Note that instead 
-    // of passing digit number, exp is passed. exp is 10^i 
-    // where i is current digit number 
-
-    cudaMalloc((void**)&d_arr,sizeof(int)*n);
-    cudaMemcpy(d_arr, arr, sizeof(int)*n,cudaMemcpyHostToDevice);
- 
-    for (int exp = 1; m/exp > 0; exp *= 10) {
-        countSort<<<1,32>>>(arr, n, exp);
-        cudaMemcpy(arr, d_arr, sizeof(int)*n, cudaMemcpyDeviceToHost);
-    }
-
-    cudaFree(d_arr); 
-} 
-  
-// A utility function to print an array 
-void print(int arr[], int n) 
-{ 
-    for (int i = 0; i < n; i++) 
-        cout << arr[i] << "\n"; 
-} 
-  
-void rng(int* arr,int n){
-    int seed = 13516067;
-    srand(seed);
-    for (long i=0; i<n; i++){
-        arr[i] = (int) rand();
-    }
-}
-
- 
-timespec diff(timespec start, timespec end)
-{
-    timespec temp;
-    if ((end.tv_nsec - start.tv_nsec) < 0) {
-        temp.tv_sec = end.tv_sec - start.tv_sec - 1;
-        temp.tv_nsec = 1000000000 + end.tv_nsec - start.tv_nsec;
-    } else {
-        temp.tv_sec = end.tv_sec - start.tv_sec;
-        temp.tv_nsec = end.tv_nsec - start.tv_nsec;
-    }
-    return temp;
-}
-
-// Driver program to test above functions 
-int main() 
-{ 
-    timespec start, stop;
-    
-    int n;
-    cout<<"Masukkan nilai N\n";
-    cin>>n;
-    int arr[n];
-    rng(arr,n);
-    clock_gettime(CLOCK_REALTIME, &start);
-    radixsort(arr,n);
-    clock_gettime(CLOCK_REALTIME, &stop);
-    print(arr,n);
-    
-    timespec duration = diff(start, stop);
-    long time = duration.tv_sec * 1000000 + duration.tv_nsec/1000;
-    printf("\n%d.%09d s\n", duration.tv_sec, duration.tv_nsec);
-    
-    return 0; 
-} 
\ No newline at end of file
+// C++ implementation of Radix Sort 
+#include <iostream> 
+#include <cstdlib>
+#include <time.h>
+using namespace std; 
+  
+// Returns the largest of arr[0..n-1]; arr[0] is read unconditionally.
+int getMax(int arr[], int n)
+{
+    int best = arr[0];
+    for (int k = 1; k < n; ++k) {
+        best = (arr[k] > best) ? arr[k] : best;
+    }
+    return best;
+}
+  
+// Stable counting sort of arr[0..n-1] (device memory) keyed on the decimal
+// digit selected by exp (1, 10, 100, ...). The algorithm is sequential, so
+// only thread 0 of block 0 does the work, whatever the launch configuration.
+__global__ void countSort(int arr[], int n, int exp)
+{
+    // Letting every launched thread run the sort would race on arr[].
+    if (blockIdx.x != 0 || threadIdx.x != 0)
+        return;
+
+    int *output = (int*)malloc(sizeof(int)*n); // scratch from the device heap
+    if (output == NULL)
+        return; // device heap exhausted: leave arr[] untouched
+    int i, count[10] = {0};
+
+    // Histogram of the current digit.
+    for (i = 0; i < n; i++)
+        count[ (arr[i]/exp)%10 ]++;
+    // Prefix sums: count[d] becomes one past the last output slot for digit d.
+    for (i = 1; i < 10; i++)
+        count[i] += count[i - 1];
+    // Walk backwards so equal digits keep their relative order (stability).
+    for (i = n - 1; i >= 0; i--)
+        output[--count[ (arr[i]/exp)%10 ]] = arr[i];
+
+    for (i = 0; i < n; i++)
+        arr[i] = output[i];
+    free(output); // device-heap memory is not reclaimed when the kernel ends
+}
+  
+// Sorts arr[0..n-1] (host memory, non-negative values) in ascending order
+// with an LSD radix sort whose per-digit passes run on the GPU.
+void radixsort(int arr[], int n)
+{
+    if (n <= 0)
+        return; // nothing to sort; getMax() would read arr[0]
+
+    // The largest value bounds the number of decimal digits to process.
+    int m = getMax(arr, n);
+
+    int *d_arr = NULL;
+    if (cudaMalloc((void**)&d_arr, sizeof(int)*n) != cudaSuccess)
+        return; // no device memory: arr is left unsorted
+    cudaMemcpy(d_arr, arr, sizeof(int)*n, cudaMemcpyHostToDevice);
+
+    // exp is 10^i for digit i, held in a long long because the loop only stops
+    // once exp exceeds m and 10^10 overflows int. The kernel gets the DEVICE pointer.
+    for (long long exp = 1; m/exp > 0; exp *= 10)
+        countSort<<<1,1>>>(d_arr, n, (int)exp); // sequential kernel: one thread
+    cudaMemcpy(arr, d_arr, sizeof(int)*n, cudaMemcpyDeviceToHost); // waits for the kernels
+    cudaFree(d_arr);
+}
+  
+// Writes arr[0..n-1] to stdout, one value per line.
+void print(int arr[], int n) {
+    int k = 0;
+    while (k < n)
+        cout << arr[k++] << "\n";
+}
+  
+void rng(int* arr,int n){
+    // Fixed seed: every run sorts the same pseudo-random input.
+    srand(13516067);
+    for (long k = 0; k < n; ++k) {
+        arr[k] = rand();
+    }
+}
+
+ 
+// Returns end - start, borrowing one second when the nanosecond difference is negative.
+timespec diff(timespec start, timespec end)
+{
+    timespec delta;
+    delta.tv_sec = end.tv_sec - start.tv_sec;
+    delta.tv_nsec = end.tv_nsec - start.tv_nsec;
+    if (delta.tv_nsec < 0) {
+        delta.tv_sec -= 1;
+        delta.tv_nsec += 1000000000;
+    }
+    return delta;
+}
+
+// Driver: reads N from stdin, sorts N pseudo-random ints, prints them and the sort's wall-clock time.
+int main()
+{
+    timespec start, stop;
+    int n;
+    cout<<"Masukkan nilai N\n";
+    if (!(cin>>n) || n <= 0)
+        return 1; // unreadable or non-positive size
+    int *arr = (int*)malloc(sizeof(int)*n); // heap, not a stack VLA: a large N must not overflow the stack
+    if (arr == NULL)
+        return 1;
+    rng(arr,n);
+    clock_gettime(CLOCK_REALTIME, &start);
+    radixsort(arr,n);
+    clock_gettime(CLOCK_REALTIME, &stop);
+    print(arr,n);
+    timespec duration = diff(start, stop);
+    printf("\n%ld.%09ld s\n", (long)duration.tv_sec, (long)duration.tv_nsec); // %ld matches the casts
+    free(arr);
+    return 0;
+}
diff --git a/src/radix_sort_par b/src/radix_sort_par
deleted file mode 100755
index 039d5089f018ed7db3264feb0c143f65b61f842c..0000000000000000000000000000000000000000
Binary files a/src/radix_sort_par and /dev/null differ
diff --git a/src/radix_sort_par.cu b/src/radix_sort_par.cu
old mode 100644
new mode 100755
index d77c5073d7e151e9b24a231bb732f91775ecc221..1718b956ea1a3e16802afbbd5874b95540d76ec3
--- a/src/radix_sort_par.cu
+++ b/src/radix_sort_par.cu
@@ -1,135 +1,136 @@
-// C++ implementation of Radix Sort 
-#include <iostream> 
-#include <cstdlib>
-#include <time.h>
-using namespace std; 
-  
-// A utility function to get maximum value in arr[] 
-__device__ int getMax(int arr[], int n) 
-{ 
-    int mx = arr[0]; 
-    for (int i = 1; i < n; i++) 
-        if (arr[i] > mx) 
-            mx = arr[i]; 
-    return mx; 
-} 
-  
-// A function to do counting sort of arr[] according to 
-// the digit represented by exp. 
-__device__ void countSort(int arr[], int n, int exp) 
-{ 
-	//int index = threadIdx.x;
-    //int stride = blockDim.x;
-
-    int *output= (int*)malloc(sizeof(int)*n); // output array 
-    int i, count[10] = {0}; 
-  
-    // Store count of occurrences in count[] 
-    for (i = 0; i < n; i++) 
-        count[ (arr[i]/exp)%10 ]++; 
-  
-    // Change count[i] so that count[i] now contains actual 
-    //  position of this digit in output[] 
-    for (i = 1; i < 10; i++) 
-        count[i] += count[i - 1]; 
-  
-    // Build the output array 
-    for (i = n - 1; i >= 0; i--) 
-    { 
-        output[count[ (arr[i]/exp)%10 ] - 1] = arr[i]; 
-        count[ (arr[i]/exp)%10 ]--; 
-    } 
-  
-    // Copy the output array to arr[], so that arr[] now 
-    // contains sorted numbers according to current digit 
-    for (i = 0; i < n; i++) 
-        arr[i] = output[i]; 
-} 
-  
-// The main function to that sorts arr[] of size n using  
-// Radix Sort
-
-__global__ void radixsort(int *arr, int n) 
-{ 
-    //int *d_arr;
-
-    // Find the maximum number to know number of digits 
-    int m = getMax(arr, n); 
-  
-    // Do counting sort for every digit. Note that instead 
-    // of passing digit number, exp is passed. exp is 10^i 
-    // where i is current digit number 
-
-    // allocate device memory
-    //cudaMalloc((void**)&d_arr,sizeof(int)*n);
-
-    //cudaMemcpy(d_arr, arr, sizeof(int)*n,cudaMemcpyHostToDevice);
-    for (int exp = 1; m/exp > 0; exp *= 10) 
-        //countSort<<<1,1024>>>(d_arr, n, exp); 
-        countSort(arr,n,exp);
-        //transfer data back to host memory
-        //cudaMemcpy(arr, d_arr, sizeof(int)*n, cudaMemcpyDeviceToHost);
-        
-    //deallocate device memory
-    //cudaFree(d_arr);
-    __syncthreads();
-
-} 
-  
-// A utility function to print an array 
-void print(int arr[], int n){ 
-    for (int i = 0; i < n; i++) 
-        cout << arr[i] << "\n"; 
-} 
-  
-void rng(int* arr,int n){
-    int seed = 13516067;
-    srand(seed);
-    for (long i=0; i<n; i++){
-        arr[i] = (int) rand();
-    }
-}
-
- 
-timespec diff(timespec start, timespec end)
-{
-    timespec temp;
-    if ((end.tv_nsec - start.tv_nsec) < 0) {
-        temp.tv_sec = end.tv_sec - start.tv_sec - 1;
-        temp.tv_nsec = 1000000000 + end.tv_nsec - start.tv_nsec;
-    } else {
-        temp.tv_sec = end.tv_sec - start.tv_sec;
-        temp.tv_nsec = end.tv_nsec - start.tv_nsec;
-    }
-    return temp;
-}
-
-// Driver program to test above functions 
-int main(int argc, char *argv[]) 
-{ 
-    timespec start, stop;
-    
-    int *d_arr;
-    int n;
-    n= atoi(argv[1]);
-    int arr[n];
-    rng(arr,n);
-    
-    cudaMalloc((void**)&d_arr,sizeof(int)*n);
-    cudaMemcpy(d_arr, arr, sizeof(int)*n,cudaMemcpyHostToDevice);
-
-    clock_gettime(CLOCK_REALTIME, &start);
-    radixsort<<<1,32>>>(d_arr,n);
-    cudaMemcpy(arr, d_arr, sizeof(int)*n,cudaMemcpyDeviceToHost);
-    clock_gettime(CLOCK_REALTIME, &stop);
-    print(arr,n);
-    
-    timespec duration = diff(start, stop);
-    long time = duration.tv_sec * 1000000 + duration.tv_nsec/1000;
-    printf("\n%d.%09d s\n", duration.tv_sec, duration.tv_nsec);
-    
-    //deallocate host memory
-    cudaFree(d_arr);
-    
-    return 0; 
-}
+// C++ implementation of Radix Sort 
+#include <iostream> 
+#include <cstdlib>
+#include <time.h>
+using namespace std; 
+  
+// Device helper: returns the largest of arr[0..n-1]; arr[0] is always read.
+__device__ int getMax(int arr[], int n) {
+    int largest = arr[0];
+    int k = 0;
+    while (++k < n)
+        if (largest < arr[k])
+            largest = arr[k];
+    return largest;
+}
+  
+// Sequential, stable counting sort of arr[0..n-1] keyed on the decimal digit
+// selected by exp (1, 10, 100, ...). Runs entirely in the calling thread.
+__device__ void countSort(int arr[], int n, int exp)
+{
+    // Scratch buffer from the device heap; it must be released before
+    // returning or every digit pass leaks n ints.
+    int *output = (int*)malloc(sizeof(int)*n);
+    if (output == NULL)
+        return; // device heap exhausted: leave arr[] untouched
+    int i, count[10] = {0};
+
+    // Histogram of the current digit.
+    for (i = 0; i < n; i++)
+        count[ (arr[i]/exp)%10 ]++;
+
+    // Prefix sums: count[d] becomes one past the last output slot for digit d.
+    for (i = 1; i < 10; i++)
+        count[i] += count[i - 1];
+
+    // Walk backwards so equal digits keep their relative order (stability).
+    for (i = n - 1; i >= 0; i--)
+    {
+        output[count[ (arr[i]/exp)%10 ] - 1] = arr[i];
+        count[ (arr[i]/exp)%10 ]--;
+    }
+
+    // Copy the pass result back over the input.
+    for (i = 0; i < n; i++)
+        arr[i] = output[i];
+    free(output);
+}
+  
+// Kernel: LSD radix sort of arr[0..n-1], one counting-sort pass per decimal
+// digit.
+//
+// Parameters:
+//   arr - device pointer to the values (non-negative), sorted in place
+//   n   - number of elements; n <= 0 is a no-op
+//
+// The algorithm is sequential, so exactly one thread performs it. Any other
+// threads of the launch exit immediately: letting them all run the sort
+// would race on arr[].
+__global__ void radixsort(int *arr, int n)
+{
+    if (blockIdx.x != 0 || threadIdx.x != 0)
+        return;
+    if (n <= 0)
+        return; // getMax() would read arr[0]
+
+    // The largest value bounds the number of decimal digits to process.
+    int m = getMax(arr, n);
+
+    // exp is 10^i for digit i. It is a long long because the loop only
+    // stops once exp exceeds m, and 10^10 does not fit in an int; the
+    // value handed to countSort never exceeds m, so the cast is safe.
+    for (long long exp = 1; m/exp > 0; exp *= 10)
+        countSort(arr, n, (int)exp);
+
+    // No __syncthreads() is needed at the end: the single working thread
+    // has no other thread to wait for.
+}
+  
+// Writes arr[0..n-1] to stdout, one value per line.
+void print(int arr[], int n){
+    for (int left = n, k = 0; left > 0; --left, ++k)
+        cout << arr[k] << "\n";
+}
+  
+// Fills arr[0..n-1] with rand() output; the seed is constant, so the data repeats.
+void rng(int* arr,int n){
+    const int seed = 13516067;
+    srand(seed);
+    long k = 0;
+    while (k < n) arr[k++] = rand();
+}
+
+ 
+// Elapsed time end - start. When the nanosecond field of end is smaller than
+// that of start, one second is borrowed from the seconds field.
+timespec diff(timespec start, timespec end)
+{
+    const bool borrow = (end.tv_nsec - start.tv_nsec) < 0;
+
+    timespec out;
+    out.tv_sec = end.tv_sec - start.tv_sec - (borrow ? 1 : 0);
+    out.tv_nsec = (borrow ? 1000000000 : 0) + end.tv_nsec - start.tv_nsec;
+
+    return out;
+}
+
+// Driver: sorts argv[1] pseudo-random ints on the GPU, prints them and the wall-clock time.
+int main(int argc, char *argv[])
+{
+    timespec start, stop;
+    // The element count is mandatory and must be a positive integer.
+    int n = (argc > 1) ? atoi(argv[1]) : 0;
+    if (n <= 0)
+        return 1;
+    int *arr = (int*)malloc(sizeof(int)*n); // heap, not a stack VLA
+    if (arr == NULL)
+        return 1;
+    int *d_arr = NULL;
+    if (cudaMalloc((void**)&d_arr,sizeof(int)*n) != cudaSuccess) {
+        free(arr);
+        return 1;
+    }
+    rng(arr,n);
+    cudaMemcpy(d_arr, arr, sizeof(int)*n,cudaMemcpyHostToDevice);
+    clock_gettime(CLOCK_REALTIME, &start);
+    radixsort<<<1,1>>>(d_arr,n); // sequential kernel: one thread, no races on d_arr
+    cudaMemcpy(arr, d_arr, sizeof(int)*n,cudaMemcpyDeviceToHost); // blocks until the kernel is done
+    clock_gettime(CLOCK_REALTIME, &stop);
+    print(arr,n);
+    timespec duration = diff(start, stop);
+    printf("\n%ld.%09ld s\n", (long)duration.tv_sec, (long)duration.tv_nsec);
+    cudaFree(d_arr);
+    free(arr);
+    return 0;
+}
+
diff --git a/src/radix_sort_par3.cu b/src/radix_sort_par3.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e9e1a90630e4ad3b5af7f101b4789cc114a0eddc
--- /dev/null
+++ b/src/radix_sort_par3.cu
@@ -0,0 +1,165 @@
+// C++ implementation of Radix Sort 
+#include <iostream> 
+#include <cstdlib>
+#include <time.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#define MAX_THREAD 32
+
+using namespace std; 
+
+ 
+// Scans arr[0..n-1] once and returns its maximum; arr[0] is always read.
+int getMax(int arr[], int n) {
+    int peak = arr[0];
+    for (int k = 1; k < n; ++k) {
+        if (arr[k] <= peak) continue;
+        peak = arr[k];
+    }
+    return peak;
+}
+  
+// Kernel: histogram of the decimal digit (arr[i]/exp)%10 over arr[0..n-1].
+// Expects a single block of at most MAX_THREAD threads; count must point to
+// 10 ints in device memory, which are overwritten with the totals.
+__global__ void storeCount(int *count, int *arr, int n, int exp){
+    // One private row of 10 bins per thread, so no atomics are needed.
+    // __shared__ arrays cannot have an initialiser; each thread zeroes its row.
+    __shared__ int s_Count[MAX_THREAD][10];
+
+    int idx = threadIdx.x;
+    for (int d = 0; d < 10; d++){
+        s_Count[idx][d] = 0;
+    }
+
+    // Thread idx handles elements idx, idx + blockDim.x, ... which covers
+    // every element exactly once and never reads past n (even when n is
+    // smaller than the number of threads).
+    for (int i = idx; i < n; i += blockDim.x){
+        s_Count[idx][(arr[i]/exp)%10 ]++;
+    }
+
+    // All rows must be complete before thread 0 adds them up.
+    __syncthreads();
+
+    if(idx==0){
+        for (int d = 0; d < 10; d++){
+            int total = 0;
+            for (int t = 0; t < blockDim.x; t++){
+                total += s_Count[t][d];
+            }
+            count[d] = total;
+        }
+    }
+}
+
+// One stable counting-sort pass over arr[0..n-1] (host memory), keyed on the
+// decimal digit selected by exp. The digit histogram is computed on the GPU
+// by storeCount; the scatter into sorted order is done on the host. If any
+// allocation or CUDA call fails, arr is left unchanged.
+void countSort(int arr[], int n, int exp)
+{
+    int h_count[10] = {0};
+    int *d_arr = NULL;
+    int *d_count = NULL; // must be device memory: the kernel writes through it
+    int *output = (int*)malloc(sizeof(int)*n); // host scratch for the pass
+    int i;
+
+    bool ok = output != NULL
+        && cudaMalloc((void**)&d_arr,sizeof(int)*n) == cudaSuccess
+        && cudaMalloc((void**)&d_count,sizeof(int)*10) == cudaSuccess
+        && cudaMemcpy(d_arr, arr, sizeof(int)*n,cudaMemcpyHostToDevice) == cudaSuccess;
+    if (ok) {
+        // Histogram on the device; cudaMemcpy sizes are in bytes, not elements.
+        storeCount<<<1,MAX_THREAD>>>(d_count,d_arr,n,exp);
+        ok = cudaMemcpy(h_count, d_count, sizeof(int)*10,cudaMemcpyDeviceToHost) == cudaSuccess;
+    }
+    if (ok) {
+        // Prefix sums: h_count[d] becomes one past the last output slot for digit d.
+        for (i = 1; i < 10; i++)
+            h_count[i] += h_count[i - 1];
+        // Walk backwards so equal digits keep their relative order (stability).
+        for (i = n - 1; i >= 0; i--)
+            output[--h_count[ (arr[i]/exp)%10 ]] = arr[i];
+        for (i = 0; i < n; i++)
+            arr[i] = output[i];
+    }
+
+    // cudaFree(NULL) and free(NULL) are no-ops, so this is safe on every path.
+    cudaFree(d_count);
+    cudaFree(d_arr);
+    free(output);
+}
+  
+// Sorts arr[0..n-1] in ascending order with an LSD radix sort: one
+// countSort pass per decimal digit, least significant digit first.
+//
+//   arr - host array of non-negative values, sorted in place
+//   n   - number of elements; n <= 0 is a no-op
+void radixsort(int *arr, int n)
+{
+    if (n <= 0)
+        return; // getMax() would read arr[0]
+
+    // The largest value bounds the number of decimal digits to process.
+    int m = getMax(arr, n);
+
+    // exp is 10^i for digit i. It is a long long because the loop only
+    // stops once exp exceeds m, and 10^10 does not fit in an int.
+    for (long long exp = 1; m/exp > 0; exp *= 10)
+        countSort(arr,n,(int)exp);
+}
+  
+// Dumps the array to stdout: one element of arr[0..n-1] per line.
+void print(int arr[], int n){
+    for (int k = 0; n - k > 0; ++k)
+        cout << arr[k] << "\n";
+}
+  
+void rng(int* arr,int n){
+    srand(13516067); // constant seed keeps the input identical between runs
+    int *out = arr;
+    for (long left = n; left > 0; --left) {
+        *out++ = rand();
+    }
+}
+
+ 
+// Computes end - start, borrowing one second when the nanosecond difference is negative.
+timespec diff(timespec start, timespec end)
+{
+    timespec span = end;
+    span.tv_sec -= start.tv_sec;
+    span.tv_nsec -= start.tv_nsec;
+    if (span.tv_nsec < 0) {
+        span.tv_nsec += 1000000000;
+        span.tv_sec--;
+    }
+    return span;
+}
+
+// Driver: sorts argv[1] pseudo-random ints, prints them and the sort's wall-clock time.
+int main(int argc, char *argv[])
+{
+    timespec start, stop;
+
+    // The element count is mandatory and must be a positive integer.
+    int n = (argc > 1) ? atoi(argv[1]) : 0;
+    if (n <= 0)
+        return 1;
+    int *arr = (int*)malloc(sizeof(int)*n); // heap, not a stack VLA
+    if (arr == NULL)
+        return 1;
+    rng(arr,n);
+
+    clock_gettime(CLOCK_REALTIME, &start);
+    radixsort(arr,n);
+    clock_gettime(CLOCK_REALTIME, &stop);
+    print(arr,n);
+
+    // tv_sec/tv_nsec are not int: cast to long and print with %ld.
+    timespec duration = diff(start, stop);
+    printf("\n%ld.%09ld s\n", (long)duration.tv_sec, (long)duration.tv_nsec);
+    free(arr);
+    return 0;
+}