diff --git a/radix b/radix
deleted file mode 100755
index 3df6114965f06bb640097e00399f36095d3ccfbd..0000000000000000000000000000000000000000
Binary files a/radix and /dev/null differ
diff --git a/radix.cu b/radix.cu
new file mode 100644
index 0000000000000000000000000000000000000000..db26e5eb30d5ed74a9c4372990122eb213f38898
--- /dev/null
+++ b/radix.cu
@@ -0,0 +1,152 @@
+#include <stdio.h>
+#include <iostream>
+#include <math.h>
+#include <chrono>
+
+#define THREADS 4
+using namespace std;
+
+// Integer power helper for device code: multiplies 1 by x, n times.
+// For n <= 0 the loop never runs and the result is 1.
+__device__ int pow(int x, int n) {
+  int product = 1;
+  int remaining = n;
+  while (remaining > 0) {
+    product *= x;
+    --remaining;
+  }
+  return product;
+}
+
+// Returns the decimal digit of `number` at position n, where n = 0 selects
+// the ones digit.
+__device__ int getDigitFromNumber(int number, int n) {
+  // Dividing by 10^n moves the wanted digit into the ones place...
+  const int place_value = pow(10, n);
+  const int shifted = number / place_value;
+  // ...and the remainder modulo 10 isolates it.
+  return shifted % 10;
+}
+
+// Scatters arr into temp_arr by the digit at position `length`, walking arr from back to front.
+__device__ void countingSort(int* arr, int* temp_arr, int size, int* count_arr, int length) {
+  for (int src = size; src-- > 0; ) {
+    const int value = arr[src];
+    // Pre-decrementing the digit's counter yields this element's output slot.
+    temp_arr[--count_arr[getDigitFromNumber(value, length)]] = value;
+  }
+}
+
+// Radix sort of arr[0..size) over `length` decimal digits, least significant
+// first. Only block 0 does work, with any block size. One size-element scratch
+// buffer is taken from the device malloc heap and released before returning.
+__global__ void radixSort(int* arr, int size, int length) {
+  __shared__ int count_arr[10];
+  __shared__ int* temp_arr;  // single scratch buffer shared by the block
+  if (threadIdx.x == 0) {
+    temp_arr = (blockIdx.x == 0 && size > 0) ? (int*) malloc(size * sizeof(int)) : NULL;
+  }
+  __syncthreads();
+  // Extra blocks, empty input and failed allocation all exit here, block-wide.
+  if (temp_arr == NULL) return;
+  for (int device_i = 0; device_i < length; device_i++) {
+    // Clear the histogram and publish that before any thread counts.
+    for (int j = threadIdx.x; j < 10; j += blockDim.x) {
+      count_arr[j] = 0;
+    }
+    __syncthreads();
+
+    // Block-stride loop: also counts the tail when size % blockDim.x != 0.
+    for (int j = threadIdx.x; j < size; j += blockDim.x) {
+      atomicAdd(&count_arr[getDigitFromNumber(arr[j], device_i)], 1);
+    }
+    __syncthreads();
+
+    if (threadIdx.x == 0) {
+      for (int j = 1; j < 10; j++) {
+        count_arr[j] += count_arr[j-1];
+      }
+      countingSort(arr, temp_arr, size, count_arr, device_i);
+      for (int j = 0; j < size; j++) {
+        arr[j] = temp_arr[j];
+      }
+    }
+    // Hold the other threads until arr contains this pass's result.
+    __syncthreads();
+  }
+  if (threadIdx.x == 0) free(temp_arr);
+}
+
+// Returns the largest of the first n values in arr, or 0 when n <= 0.
+int getMax(int arr[], int n) {
+	int mx = (n > 0) ? arr[0] : 0;  // empty input: never read arr[0]
+	for (int i = 1; i < n; i++)
+		if (arr[i] > mx) mx = arr[i];
+	return mx;
+}
+
+// Reseeds rand() with a fixed value, then stores n consecutive rand() results in arr.
+void rng(int* arr, int n) {
+  srand(13516117); // Fixed seed; replace with your student ID number (NIM).
+  int idx = 0;
+  while (idx < n)
+    arr[idx++] = rand();
+}
+
+// Writes each of the first `size` elements of arr to stdout as "Array[i] : value".
+void printHostArray(int* arr, int size) {
+  const int* cursor = arr;
+  for (int index = 0; index < size; ++index, ++cursor)
+    cout << "Array[" << index << "] : " << *cursor << endl;
+}
+
+// Entry point: radix-sorts argv[1] pseudo-random ints on the GPU and prints the kernel run time.
+int main(int argc, char* argv[]) {
+  if (argc < 2 || atoi(argv[1]) <= 0) {
+    cout << "Usage: radix <positive array size>" << endl;
+    return 1;
+  }
+  int size = atoi(argv[1]);
+  size_t mem_size = (size_t)size * sizeof(int);  // size_t: no int overflow
+
+  int* host = (int*) malloc(mem_size);
+  int* device = 0;
+  if (host == 0 || cudaMallocManaged((void**)&device, mem_size) != cudaSuccess) {
+    cout << "Allocating memory failed" << endl;
+    free(host);
+    return 1;
+  }
+  rng(host, size);  // only after host is known to be valid
+  // cout << "before :" << endl;
+  // printHostArray(host, size);
+
+  // Count the decimal digits of the largest value: one kernel pass per digit.
+  int length = 0;
+  for (int max_num = getMax(host, size); max_num > 0; max_num /= 10) {
+    length++;
+  }
+
+  // The kernel calls malloc() on the device; grow the device heap so that
+  // one scratch copy per thread fits (the default heap is 8 MB).
+  cudaDeviceSetLimit(cudaLimitMallocHeapSize, (THREADS + 1) * mem_size + (1 << 20));
+  cudaMemcpy(device, host, mem_size, cudaMemcpyHostToDevice);
+
+  auto start = std::chrono::high_resolution_clock::now();
+  radixSort<<<1, THREADS>>>(device, size, length);
+  // Launches are asynchronous: wait for the kernel before stopping the clock.
+  cudaError_t status = cudaDeviceSynchronize();
+  auto finish = std::chrono::high_resolution_clock::now();
+
+  if (status != cudaSuccess) {
+    cout << "Kernel failed : " << cudaGetErrorString(status) << endl;
+  } else {
+    std::chrono::duration<double> elapsed = finish - start;
+    cout << "Execution time : " << elapsed.count() * 1000000 << " microseconds" << endl;
+    cudaMemcpy(host, device, mem_size, cudaMemcpyDeviceToHost);
+    // cout << "After :" << endl;
+    // printHostArray(host, size);
+  }
+  free(host);
+  cudaFree(device);
+  return status == cudaSuccess ? 0 : 1;
+}
\ No newline at end of file