diff --git a/src/radix_sort_parallel.h b/src/radix_sort_parallel.h
index 75046244f0d7e60ea7fac89abf11b2ffad22b091..b2477e55f5cb5350db3a7bf1605b94b9c5de8cc8 100644
--- a/src/radix_sort_parallel.h
+++ b/src/radix_sort_parallel.h
@@ -9,7 +9,5 @@
 #define t 8
 
 void rng(int* arr, int n);
-int getMax(int arr[], int n) ;
-void countSort(int arr[], int n, int exp) ;
 void radix_sort(int arr[], int n);
 void print(int arr[], int n) ;
\ No newline at end of file
diff --git a/src/radixsort_parallel.cu b/src/radixsort_parallel.cu
index 8ba6fe75fb0b0901c0d092f37e9976d757f2c13c..b1c69dbd41f581f2ab2b08167a72a8c30a5d8d6d 100644
--- a/src/radixsort_parallel.cu
+++ b/src/radixsort_parallel.cu
@@ -1,117 +1,218 @@
-#include <stdio.h>
-#include <stdlib.h>
-// #include <omp.h>
-#include <sys/time.h>
-
-void generateArray(int arr[], int n,int seed);
-void radixsort(int arr[], int n);
-void print(int arr[], int n);
-
-int main(int argc, char *argv[]) {
-   struct timeval stop, start;
-   // int thread_count = 5;
-   int array_size = strtol(argv[1], NULL, 10);
-    //printf("NUM THREADS : %d\n", omp_get_num_threads());
-   int *arr,*d_arr;
-   arr = (int*)malloc(sizeof(int)* array_size);
-   cudaMalloc((void **)&d_arr, sizeof(int) * array_size);
-//   cudaMemcpy(d_arr, arr, sizeof(int) * array_size, cudaMemcpyHostToDevice);
-   int seed = 13516127;
-   generateArray(arr,array_size,seed);
-   cudaMemcpy(d_arr, arr, sizeof(int) * array_size, cudaMemcpyHostToDevice);
-   gettimeofday(&start, NULL);
-   radixsort(arr, array_size);
-   gettimeofday(&stop, NULL);
-     print(arr,array_size);
-   printf("Sorting selesai\n");
-   printf("took %lu microsecond for parallel radix sort\n", ((stop.tv_sec - start.tv_sec)*1000000)+(stop.tv_usec - start.tv_usec));
-  
-   cudaFree(d_arr);
-   free(arr);
-    return 0;
-}
-
-__global__
-void getMax(int arr[], int n, int max[]) {
-__shared__  int cache[1000];
-int temp = arr[0];
-    for (int i=0; i<n; i+=1000){
-        if(threadIdx.x< 1000){
-            if(temp < arr[threadIdx.x + i]){
-                temp = arr[threadIdx.x+i];
-            }
-        }
-    }
-
-    if(threadIdx.x <1000){
-        cache[threadIdx.x] =temp;
-    }
-    
-    __syncthreads();
-
-    if(threadIdx.x ==0){
-        *max = cache[0];
-        for(int i =1; i<1000; i++){
-            if(*max < cache[i]){
-                *max = cache[i];
-            }
-        }
-    
-    }
-}
-// __global__
-void generateArray(int arr[], int n,int seed){
-   int *output;
-   output = (int*)malloc(sizeof(int) * n);
-   srand(seed);
-   for(long i = 0; i < n; i++) {
-        output[i] = (int)rand();
-   }
-
-   for(long i = 0; i < n; i++) {
-      arr[i] = output[i];
-   }
-
-}
-__global__
-void countSort(int arr[], int n, int digit) {
-    int *output;
-    output = (int*)malloc(sizeof(int)*n);
-  long i;
-    int count[10] = {0};
-
-    for (i = 0; i < n; i++)
-        count[ (arr[i]/digit)%10 ]++;
-    for (i = 1; i < 10; i++)
-        count[i] += count[i - 1];
-  for (i = n - 1; i >= 0; i--) {
-         output[count[ (arr[i]/digit)%10 ] - 1] = arr[i];
-         count[(arr[i]/digit)%10 ]--;
-  }
-  for (i = 0; i < n; i++)
-     arr[i] = output[i];
-}
-
-void radixsort(int arr[], int n) {
-  int  *max,*d_max,*d_arr;
-  max = (int*)malloc(sizeof(int)* 1);
-  cudaMalloc((void **)&d_max, sizeof(int) * 1);
-  cudaMemcpy(d_max, max, sizeof(int) * 1, cudaMemcpyHostToDevice);
-  cudaMalloc((void **)&d_arr, sizeof(int) * n);
-  cudaMemcpy(d_arr,arr,sizeof(int) *n, cudaMemcpyHostToDevice);
-  getMax<<<1,1000>>>(d_arr, n,d_max);
-  cudaMemcpy(max,d_max, sizeof(int) * 1, cudaMemcpyDeviceToHost);
-  for (int digit = 1; max[0]/digit > 0; digit *= 10) {
-      countSort<<<1,1>>>(d_arr, n, digit);
-   }
-  cudaMemcpy(arr,d_arr, sizeof(int) *n, cudaMemcpyDeviceToHost);
-
-}
-
-void print(int arr[], int n) {
-    for (long i = 0; i < n; i++) {
-      printf("%d \n",arr[i]);
-
-   }
-   printf("\n");
-}
\ No newline at end of file
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "radix_sort_parallel.h"
+
+// Threads per block for both kernels. Must be a power of two (getMax halves
+// the active range on every reduction step) and sizes the shared arrays.
+#define BLOCK_THREADS 256
+
+// Upper bound on the grid used for the max reduction.
+#define MAX_REDUCE_BLOCKS 1024
+
+// Buckets per counting pass: one per decimal digit.
+#define RADIX 10
+
+// Abort with file/line and the CUDA error string when a runtime call fails.
+#define CUDA_CHECK(call)                                                  \
+    do {                                                                  \
+        cudaError_t status_ = (call);                                     \
+        if (status_ != cudaSuccess) {                                     \
+            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
+                    cudaGetErrorString(status_));                         \
+            exit(EXIT_FAILURE);                                           \
+        }                                                                 \
+    } while (0)
+
+/*
+ * Folds the maximum of arr[0..n) into *max.
+ *
+ * Launch: 1-D grid of any size, blockDim.x == BLOCK_THREADS.
+ * Precondition: *max is initialised by the caller (for example to INT_MIN),
+ * because blocks combine their results with atomicMax.
+ */
+__global__ void getMax(const int *arr, int *max, int n) {
+    __shared__ int partial[BLOCK_THREADS];
+
+    const int lane = threadIdx.x;
+    const int stride = gridDim.x * blockDim.x;
+
+    // Seed with INT_MIN instead of arr[lane]: lane can be >= n.
+    int best = INT_MIN;
+    for (int i = blockIdx.x * blockDim.x + lane; i < n; i += stride)
+        if (arr[i] > best)
+            best = arr[i];
+
+    partial[lane] = best;
+    __syncthreads();
+
+    // Shared-memory tree reduction; the barrier sits outside the branch so
+    // every thread of the block reaches it.
+    for (int half = blockDim.x / 2; half > 0; half /= 2) {
+        if (lane < half && partial[lane + half] > partial[lane])
+            partial[lane] = partial[lane + half];
+        __syncthreads();
+    }
+
+    if (lane == 0)
+        atomicMax(max, partial[0]);
+}
+
+/*
+ * One stable counting-sort pass keyed on the decimal digit selected by exp
+ * (1, 10, 100, ...). Reads in[0..n) and writes the reordered values to
+ * out[0..n); in and out must not overlap and values must be non-negative.
+ *
+ * Launch: exactly one block with blockDim.x <= BLOCK_THREADS. The block-wide
+ * barriers are the only synchronisation, so a multi-block grid would race.
+ */
+__global__ void countSort(const int *in, int *out, int n, int exp) {
+    __shared__ int offsets[RADIX * BLOCK_THREADS];
+
+    const int lane = threadIdx.x;
+    const int lanes = blockDim.x;
+
+    // Each thread owns one contiguous slice of the input. Slices are ordered
+    // by thread index, which is what keeps equal digits in input order.
+    const int slice = n / lanes + (n % lanes != 0);
+    const long long first = (long long)lane * slice;
+    const int begin = first < n ? (int)first : n;
+    const int end = (n - begin < slice) ? n : begin + slice;
+
+    int counts[RADIX] = {0};
+    for (int i = begin; i < end; ++i)
+        counts[(in[i] / exp) % RADIX]++;
+
+    // Digit-major table: every thread's count for digit 0, then digit 1, ...
+    for (int d = 0; d < RADIX; ++d)
+        offsets[d * lanes + lane] = counts[d];
+    __syncthreads();
+
+    // An exclusive prefix sum over the table turns counts into the first
+    // output index for each (digit, thread) pair.
+    if (lane == 0) {
+        int running = 0;
+        for (int k = 0; k < RADIX * lanes; ++k) {
+            const int count = offsets[k];
+            offsets[k] = running;
+            running += count;
+        }
+    }
+    __syncthreads();
+
+    int next[RADIX];
+    for (int d = 0; d < RADIX; ++d)
+        next[d] = offsets[d * lanes + lane];
+
+    for (int i = begin; i < end; ++i) {
+        const int value = in[i];
+        out[next[(value / exp) % RADIX]++] = value;
+    }
+}
+
+/*
+ * Sorts arr[0..n) in ascending order on the GPU with an LSD decimal radix
+ * sort. Values must be non-negative. Aborts the process on any CUDA error.
+ */
+void radix_sort(int arr[], int n)
+{
+    if (arr == NULL || n < 2)
+        return;  // zero or one element is already sorted
+
+    const size_t bytes = (size_t)n * sizeof(int);
+    int *d_max = NULL;
+    int *d_src = NULL;
+    int *d_dst = NULL;
+
+    CUDA_CHECK(cudaMalloc((void**)&d_max, sizeof(int)));
+    CUDA_CHECK(cudaMalloc((void**)&d_src, bytes));
+    CUDA_CHECK(cudaMalloc((void**)&d_dst, bytes));
+
+    // getMax accumulates with atomicMax, so the device cell needs a defined
+    // starting value.
+    int largest = INT_MIN;
+    CUDA_CHECK(cudaMemcpy(d_src, arr, bytes, cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(d_max, &largest, sizeof(int), cudaMemcpyHostToDevice));
+
+    int blocks = n / BLOCK_THREADS + 1;
+    if (blocks > MAX_REDUCE_BLOCKS)
+        blocks = MAX_REDUCE_BLOCKS;
+    getMax<<<blocks, BLOCK_THREADS>>>(d_src, d_max, n);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaMemcpy(&largest, d_max, sizeof(int), cudaMemcpyDeviceToHost));
+
+    // place is 64-bit: with largest >= 10^9 the next multiply by 10 would
+    // overflow an int before the loop condition could stop it.
+    for (long long place = 1; largest / place > 0; place *= 10) {
+        countSort<<<1, BLOCK_THREADS>>>(d_src, d_dst, n, (int)place);
+        CUDA_CHECK(cudaGetLastError());
+
+        // The pass output becomes the next pass input.
+        int *swapped = d_src;
+        d_src = d_dst;
+        d_dst = swapped;
+    }
+
+    // Blocking copy on the default stream: waits for the last kernel.
+    CUDA_CHECK(cudaMemcpy(arr, d_src, bytes, cudaMemcpyDeviceToHost));
+
+    CUDA_CHECK(cudaFree(d_dst));
+    CUDA_CHECK(cudaFree(d_src));
+    CUDA_CHECK(cudaFree(d_max));
+}
+
+// Prints every element as "<index>: <value>", one per line.
+void print(int arr[], int n)
+{
+    for (int i = 0; i < n; i++)
+        printf("%d: %d\n", i, arr[i]);
+}
+
+// Fills arr[0..n) with rand() values from a fixed seed, so runs are repeatable.
+void rng(int *arr, int n) {
+    const unsigned int seed = 13516013;
+    srand(seed);
+    for (int i = 0; i < n; i++)
+        arr[i] = rand();
+}
+
+// Usage: ./radix_sort <array_length>
+int main(int argc, char *argv[]) {
+    if (argc != 2) {
+        printf("ERROR: ./radix_sort <array_length>\n");
+        return 1;
+    }
+
+    char *parsedEnd = NULL;
+    const long requested = strtol(argv[1], &parsedEnd, 10);
+    if (*parsedEnd != '\0' || requested <= 0 || requested > INT_MAX) {
+        printf("ERROR: array_length must be a positive integer\n");
+        return 1;
+    }
+    const int N = (int)requested;
+
+    int *arr = (int*)malloc((size_t)N * sizeof(int));
+    if (arr == NULL) {
+        printf("ERROR: out of host memory\n");
+        return 1;
+    }
+
+    rng(arr, N);
+
+    // radix_sort allocates its own device buffers and ends with a blocking
+    // copy, so the interval covers transfers as well as the kernels.
+    clock_t begin = clock();
+    radix_sort(arr, N);
+    clock_t end = clock();
+    double elapsedMs = (double)(end - begin) * 1000 / CLOCKS_PER_SEC;
+
+    print(arr, N);
+    printf("Executed in %lf ms\n", elapsedMs);
+
+    free(arr);
+    return 0;
+}