diff --git a/makefile b/makefile
new file mode 100644
index 0000000000000000000000000000000000000000..7d35f963f16c73bf9416ab2b1701f7f420f01e98
--- /dev/null
+++ b/makefile
@@ -0,0 +1,6 @@
+# Build configuration for the CUDA radix sort.
+# `make` (or `make build`) compiles $(SRC_DIR)/$(CP) into ./$(EXE) and links cuRAND.
+# The compiler can be overridden from the command line, e.g. `make NVCC=/usr/local/cuda/bin/nvcc`.
+NVCC ?= nvcc
+CP = parallel_radix.cu
+EXE = radix
+SRC_DIR = src
+LDLIBS += -lcurand
+
+# `build` names an action, not a file: without .PHONY a file or directory
+# called `build` would make this rule look up to date and skip compilation.
+.PHONY: build
+build:
+	$(NVCC) -o $(EXE) $(SRC_DIR)/$(CP) $(LDLIBS)
diff --git a/src/parallel_radix.cu b/src/parallel_radix.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9e7a15963f48ecbf640e39fbe06bc4eb4a1cc247
--- /dev/null
+++ b/src/parallel_radix.cu
@@ -0,0 +1,125 @@
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include "cuda.h"
+#include "cuda_runtime.h"
+#include "curand_kernel.h"
+#include "device_launch_parameters.h"
+
+#define ARRAY_SIZE 100000
+#define MAX_BLOCK_SIZE 1024
+#define RNG_SEED 13516120
+#define BASE 10
+
+
+using namespace std;
+
+// Writes the first n entries of arr to standard output, each one followed by
+// a single space, then ends the line. std::endl also flushes the stream.
+// Nothing but the line terminator is printed when n <= 0.
+void printArray(unsigned int * arr, int n) {
+    int idx = 0;
+    while (idx < n) {
+        std::cout << arr[idx] << " ";
+        ++idx;
+    }
+    std::cout << std::endl;
+}
+
+/**
+ * Reduces input[0..n) to its maximum and leaves the result in input[0].
+ *
+ * The kernel is destructive: input[0] is overwritten, so a caller that still
+ * needs the original data must keep its own copy. Any 1-D launch
+ * configuration is valid, because every thread walks the array with a stride
+ * of the total thread count rather than owning one fixed pair of elements.
+ *
+ * This replaces a pairwise tree reduction that was incorrect: it had no
+ * barrier between reduction steps (and blocks cannot be synchronised inside
+ * an ordinary kernel launch), it dropped elements whenever an intermediate
+ * thread count was odd (any n that is not a power of two), and it required
+ * exactly n / 2 threads to be launched.
+ *
+ * @param input device pointer to at least n unsigned integers; must be writable
+ * @param n     number of elements; nothing is read or written when n <= 0
+ */
+__global__ void get_max(unsigned int * input, int n)
+{
+	// Block-local accumulator so that each block issues a single global atomic.
+	__shared__ unsigned int block_max;
+	if (threadIdx.x == 0)
+	{
+		block_max = 0; // identity element for max over unsigned values
+	}
+	__syncthreads();
+
+	// 64-bit indices: gridDim.x * blockDim.x and the running index can exceed INT_MAX.
+	const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
+	unsigned int thread_max = 0;
+	for (long long i = static_cast<long long>(blockDim.x) * blockIdx.x + threadIdx.x; i < n; i += stride)
+	{
+		const unsigned int value = input[i];
+		if (value > thread_max)
+		{
+			thread_max = value;
+		}
+	}
+	atomicMax(&block_max, thread_max);
+	__syncthreads(); // reached by every thread of the block: no early exit above
+
+	// input[0] only ever grows and only ever holds values taken from the
+	// array, so the thread that reads element 0 while other blocks are
+	// already publishing still observes a valid element value.
+	if (threadIdx.x == 0 && n > 0)
+	{
+		atomicMax(&input[0], block_max);
+	}
+}
+
+/**
+ * Accumulates a histogram of one base-BASE digit over arr[0..n).
+ *
+ * For every element the digit (arr[i] / exp) % BASE is computed and the
+ * matching bin of count is incremented. Counts are added to whatever count
+ * already holds, so it must be zeroed beforehand when a fresh histogram is
+ * wanted.
+ *
+ * Any 1-D launch configuration is valid: threads walk the array with a stride
+ * of the total thread count, so no element is skipped when fewer than n
+ * threads are launched. Each block first counts into a shared-memory
+ * histogram and then adds its non-empty bins to count, which replaces one
+ * global atomic per element with at most BASE global atomics per block.
+ *
+ * @param arr   device pointer to n values; only read
+ * @param n     number of elements; nothing is counted when n <= 0
+ * @param count device pointer to BASE integer bins
+ * @param exp   divisor selecting the digit (1, BASE, BASE^2, ...); must be non-zero
+ */
+__global__ void count_occurences(unsigned int * arr, int n, int * count, unsigned long long exp){
+    __shared__ int block_count[BASE];
+
+    // Strided so that blocks with fewer than BASE threads still clear every bin.
+    for (int bin = threadIdx.x; bin < BASE; bin += blockDim.x) {
+        block_count[bin] = 0;
+    }
+    __syncthreads();
+
+    // 64-bit indices: gridDim.x * blockDim.x and the running index can exceed INT_MAX.
+    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
+    for (long long i = static_cast<long long>(blockDim.x) * blockIdx.x + threadIdx.x; i < n; i += stride) {
+        const int digit = static_cast<int>((arr[i] / exp) % BASE);
+        atomicAdd(&block_count[digit], 1);
+    }
+    __syncthreads(); // all shared-memory increments must land before publishing
+
+    for (int bin = threadIdx.x; bin < BASE; bin += blockDim.x) {
+        if (block_count[bin] != 0) {
+            atomicAdd(&count[bin], block_count[bin]);
+        }
+    }
+}
+// Terminates the program with a diagnostic when a CUDA runtime call fails.
+// `what` names the failing step so the message identifies it.
+static void check_cuda(cudaError_t status, const char * what)
+{
+    if (status != cudaSuccess) {
+        std::cerr << "CUDA error during " << what << ": "
+                  << cudaGetErrorString(status) << std::endl;
+        std::exit(EXIT_FAILURE);
+    }
+}
+
+// Terminates the program with a diagnostic when a cuRAND host API call fails.
+static void check_curand(curandStatus_t status, const char * what)
+{
+    if (status != CURAND_STATUS_SUCCESS) {
+        std::cerr << "cuRAND error during " << what << " (status "
+                  << static_cast<int>(status) << ")" << std::endl;
+        std::exit(EXIT_FAILURE);
+    }
+}
+
+/**
+ * Generates ARRAY_SIZE pseudo-random 32-bit values on the device, sorts them
+ * with an LSD radix sort in base BASE, writes the sorted values to
+ * "test/output" (one per line) and prints the elapsed time of the sort.
+ *
+ * Per digit pass the histogram is built on the device by count_occurences;
+ * the prefix sum and the stable scatter run on the host.
+ *
+ * @return 0 on success, EXIT_FAILURE when the output file cannot be written;
+ *         a failing CUDA or cuRAND call terminates the program immediately.
+ */
+int main(int argc, char *argv[])
+{
+    (void)argc;
+    (void)argv;
+
+    static_assert(ARRAY_SIZE > 0, "ARRAY_SIZE must be positive");
+    const int count = ARRAY_SIZE;
+    const size_t bytes = static_cast<size_t>(count) * sizeof(unsigned int);
+
+    // Heap storage: two arrays of ARRAY_SIZE words on the stack can exceed
+    // the default stack size on some platforms.
+    std::vector<unsigned int> hostArr(count);
+    std::vector<unsigned int> output(count);
+
+    cudaEvent_t start, stop;
+    check_cuda(cudaEventCreate(&start), "cudaEventCreate(start)");
+    check_cuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");
+
+    curandGenerator_t curand_gen;
+    check_curand(curandCreateGenerator(&curand_gen, CURAND_RNG_PSEUDO_DEFAULT), "curandCreateGenerator");
+    check_curand(curandSetPseudoRandomGeneratorSeed(curand_gen, RNG_SEED), "curandSetPseudoRandomGeneratorSeed");
+
+    // Both device buffers are allocated once and reused by every digit pass.
+    unsigned int * darr = nullptr;
+    int * dcount = nullptr;
+    check_cuda(cudaMalloc(&darr, bytes), "cudaMalloc(darr)");
+    check_cuda(cudaMalloc(&dcount, sizeof(int) * BASE), "cudaMalloc(dcount)");
+
+    check_curand(curandGenerate(curand_gen, darr, count), "curandGenerate");
+    check_cuda(cudaDeviceSynchronize(), "random number generation");
+    check_cuda(cudaMemcpy(hostArr.data(), darr, bytes, cudaMemcpyDeviceToHost), "copy of input to host");
+
+    // Ceiling division: rounding down would leave the tail of the array
+    // without threads whenever count is not a multiple of MAX_BLOCK_SIZE.
+    const int grid = (count + MAX_BLOCK_SIZE - 1) / MAX_BLOCK_SIZE;
+
+    // find max: grid * (MAX_BLOCK_SIZE / 2) threads is at least count / 2,
+    // i.e. one thread per pair of elements. get_max leaves the result in darr[0].
+    get_max <<<grid, MAX_BLOCK_SIZE / 2>>>(darr, count);
+    check_cuda(cudaGetLastError(), "get_max launch");
+    unsigned int global_max = 0;
+    check_cuda(cudaMemcpy(&global_max, darr, sizeof(global_max), cudaMemcpyDeviceToHost), "copy of maximum to host");
+
+    check_cuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
+    // get_max overwrites device data, so restore the unsorted input first.
+    check_cuda(cudaMemcpy(darr, hostArr.data(), bytes, cudaMemcpyHostToDevice), "restore of input on device");
+
+    // One pass per digit of the largest value, least significant digit first.
+    // exp is 64-bit so that exp *= BASE cannot wrap before the loop ends.
+    for (unsigned long long exp = 1; global_max / exp > 0; exp *= BASE) {
+        int hcount[BASE] = {0};
+        check_cuda(cudaMemset(dcount, 0, sizeof(hcount)), "cudaMemset(dcount)");
+        count_occurences <<<grid, MAX_BLOCK_SIZE>>>(darr, count, dcount, exp);
+        check_cuda(cudaGetLastError(), "count_occurences launch");
+        // Blocking copy: also waits for the kernel to finish.
+        check_cuda(cudaMemcpy(hcount, dcount, sizeof(hcount), cudaMemcpyDeviceToHost), "copy of histogram to host");
+
+        // Inclusive prefix sum: hcount[d] becomes one past the last slot of digit d.
+        std::partial_sum(hcount, hcount + BASE, hcount);
+
+        // Walk backwards so that equal digits keep their relative order (stable pass).
+        for (int i = count - 1; i >= 0; i--) {
+            const int digit = static_cast<int>((hostArr[i] / exp) % BASE);
+            output[--hcount[digit]] = hostArr[i];
+        }
+
+        hostArr.swap(output);
+        check_cuda(cudaMemcpy(darr, hostArr.data(), bytes, cudaMemcpyHostToDevice), "copy of pass result to device");
+    }
+
+    check_cuda(cudaMemcpy(hostArr.data(), darr, bytes, cudaMemcpyDeviceToHost), "copy of result to host");
+    check_cuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
+    check_cuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
+    float time = 0.0f;
+    check_cuda(cudaEventElapsedTime(&time, start, stop), "cudaEventElapsedTime");
+
+    int exit_code = 0;
+    std::ofstream out("test/output", std::ios::out | std::ios::trunc);
+    if (!out) {
+        std::cerr << "Cannot open test/output for writing" << std::endl;
+        exit_code = EXIT_FAILURE;
+    } else {
+        // '\n' instead of std::endl: same bytes, without a flush per line.
+        for (int k = 0; k < count; k++) {
+            out << hostArr[k] << '\n';
+        }
+        out.close();
+        if (!out) {
+            std::cerr << "Failed while writing test/output" << std::endl;
+            exit_code = EXIT_FAILURE;
+        }
+    }
+
+    // cudaEventElapsedTime already reports milliseconds; no scaling needed.
+    std::cout << "Time =  " << time << " ms" << std::endl;
+
+    check_cuda(cudaFree(dcount), "cudaFree(dcount)");
+    check_cuda(cudaFree(darr), "cudaFree(darr)");
+    check_curand(curandDestroyGenerator(curand_gen), "curandDestroyGenerator");
+    check_cuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
+    check_cuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
+    return exit_code;
+}
\ No newline at end of file
diff --git a/test/README.md b/test/README.md
index fcb37d95caa74bc95d1cb01dea18a5eacff5d72e..d10bbd8e7ae1b07d5b8f6414fe498845c0d42a7e 100644
--- a/test/README.md
+++ b/test/README.md
@@ -1,3 +1,15 @@
-# Announcement
+# Radix_Sort
 
-Please place your sample input and output here.
\ No newline at end of file
+Sebuah program serial dan paralel yang berfungsi untuk mengurutkan angka dengan metode Radix Sort.
+
+## Petunjuk Penggunaan Program
+
+Untuk menjalankan program, unduh (download) program terlebih dahulu,
+kemudian buka terminal dan compile program dengan perintah:
+
+    make
+
+### Pembagian Kerja
+
+- Dias Akbar Nugraha mengerjakan parallel program, makefile
+- Nuha Adinata mengerjakan parallel program, README
\ No newline at end of file