diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..059f294156c08c21ef791e5fe7e609f65d26a9a7
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,2 @@
+# Build the CUDA radix sort. The source is src/radix_sort_parallel.c, so
+# "-x cu" makes nvcc treat it as CUDA code despite the .c extension.
+radix_sort: src/radix_sort_parallel.c
+	nvcc -x cu src/radix_sort_parallel.c -o radix_sort
diff --git a/logo.png b/logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..7437be5009b2dbb38fe88e0088f318d99f06c773
Binary files /dev/null and b/logo.png differ
diff --git a/src/radix_sort_parallel.c b/src/radix_sort_parallel.c
new file mode 100644
index 0000000000000000000000000000000000000000..8e3275e35726231362c15753e5ccbda554cbf1a6
--- /dev/null
+++ b/src/radix_sort_parallel.c
@@ -0,0 +1,144 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <cuda_runtime.h>
+
+/*
+ * Dump an integer array to a timestamped text file.
+ *
+ * arr      - values to write
+ * arr_len  - number of values in arr
+ * type     - label appended to the file name
+ *
+ * The file is "output/<YYYY_MM_DD_HHMMSS>-<type>.txt" (local time) and holds
+ * the values in order, each followed by one space. The "output" directory is
+ * not created here; if the file cannot be opened, an error is printed to
+ * stderr and nothing is written.
+ */
+void print_to_file(int* arr, int arr_len, char* type) {
+    char stamp[64];
+    char filename[100];
+    time_t now = time(NULL);
+    struct tm *t = localtime(&now);
+
+    if (t == NULL || strftime(stamp, sizeof(stamp), "output/%Y_%m_%d_%H%M%S", t) == 0) {
+        fprintf(stderr, "print_to_file: could not format the timestamp\n");
+        return;
+    }
+
+    /* The timestamp lives in its own buffer: handing snprintf its own
+       destination as a source argument is undefined behaviour. */
+    snprintf(filename, sizeof(filename), "%s-%s.txt", stamp, type);
+
+    FILE *f = fopen(filename, "w");
+    if (f == NULL) {
+        perror(filename);
+        return;
+    }
+    for (int i = 0; i < arr_len; i++) {
+        fprintf(f, "%d ", arr[i]);
+    }
+    fclose(f);
+}
+
+/*
+ * Fill arr[0..n-1] with successive rand() values.
+ * The generator is re-seeded with a fixed constant on every call, so the
+ * produced sequence is the same from run to run.
+ */
+void rng(int* arr, int n) {
+    const int seed = 13516095;
+    long pos = 0;
+
+    srand(seed);
+    while (pos < n) {
+        arr[pos] = (int)rand();
+        pos++;
+    }
+}
+
+/*
+ * Exclusive prefix sum of flags, computed sequentially on the host.
+ *
+ * flags - n input values
+ * n     - number of elements
+ *
+ * Returns a newly malloc'ed array of n values where element i holds
+ * flags[0] + ... + flags[i - 1] (element 0 is 0). The caller owns the
+ * array and must free() it. Returns NULL when n <= 0 or when the
+ * allocation fails.
+ */
+u_int32_t* down_sweep(u_int32_t* flags, int n) {
+    if (n <= 0) {
+        /* Nothing to scan; writing element 0 would overrun the buffer. */
+        return NULL;
+    }
+
+    u_int32_t* prefix_sum = (u_int32_t*) malloc((size_t) n * sizeof(u_int32_t));
+    if (prefix_sum == NULL) {
+        return NULL;
+    }
+
+    prefix_sum[0] = 0;
+    for (int i = 1; i < n; i++) {
+        prefix_sum[i] = prefix_sum[i - 1] + flags[i - 1];
+    }
+
+    return prefix_sum;
+}
+
+/*
+ * Backward scan that counts zero flags from the end, computed on the host.
+ *
+ * flags - n input values (only zero / non-zero matters)
+ * n     - number of elements
+ *
+ * Returns a newly malloc'ed array of n values where element i holds
+ * (n - 1) minus the number of zero entries in flags[i + 1 .. n - 1].
+ * The caller owns the array and must free() it. Returns NULL when
+ * n <= 0 or when the allocation fails.
+ */
+u_int32_t* up_sweep(u_int32_t* flags, int n) {
+    if (n <= 0) {
+        /* Without this guard the seed write below would hit index -1. */
+        return NULL;
+    }
+
+    u_int32_t* prefix_sum = (u_int32_t*) malloc((size_t) n * sizeof(u_int32_t));
+    if (prefix_sum == NULL) {
+        return NULL;
+    }
+
+    prefix_sum[n - 1] = n - 1;
+    for (int i = n - 2; i >= 0; i--) {
+        /* Step one slot towards the front for every zero flag passed. */
+        int substract = (flags[i + 1] ? 0 : 1);
+        prefix_sum[i] = prefix_sum[i + 1] - substract;
+    }
+
+    return prefix_sum;
+}
+
+/* Kernels launched by count_sort; they are defined further down in this file. */
+__global__ void assign_flagss(u_int32_t* arr, int n, int idx, u_int32_t* flags);
+__global__ void assign_index(u_int32_t* arr, u_int32_t* arr_idx, u_int32_t* down_sweep_i, u_int32_t* up_sweep_i, u_int32_t* flags, int n);
+__global__ void scatter(u_int32_t* arr, u_int32_t* arr_temp, u_int32_t* arr_idx, int n);
+
+/*
+ * One stable split pass of the radix sort on bit idx.
+ *
+ * d_arr - device pointer to n 32-bit keys, reordered in place
+ * n     - number of keys
+ * idx   - bit position (0 = least significant) to split on
+ *
+ * Keys whose bit idx is clear end up first, followed by the keys whose bit
+ * is set; both groups keep their relative order. The flags are computed on
+ * the device, the two scans run on the host (down_sweep / up_sweep), and
+ * the final indices and the scatter run on the device again.
+ *
+ * On any allocation or CUDA failure a message is printed to stderr and the
+ * pass is abandoned. All temporary host and device buffers are released
+ * before returning.
+ */
+void count_sort(int* d_arr, int n, int idx) {
+    if (n <= 0) {
+        /* A grid of zero blocks is an invalid launch configuration. */
+        return;
+    }
+
+    const int block_dim = 64;
+    const int grid_dim = (n + block_dim - 1) / block_dim;
+    const size_t bytes = (size_t) n * sizeof(u_int32_t);
+    /* The kernels operate on the raw 32-bit patterns of the keys. */
+    u_int32_t* d_keys = (u_int32_t*) d_arr;
+
+    cudaError_t status = cudaSuccess;
+    u_int32_t* flags = NULL;
+    u_int32_t* down_sweep_i = NULL;
+    u_int32_t* up_sweep_i = NULL;
+    u_int32_t* d_flags = NULL;
+    u_int32_t* d_arr_temp = NULL;
+    u_int32_t* d_arr_idx = NULL;
+    u_int32_t* d_idx_down = NULL;
+    u_int32_t* d_idx_up = NULL;
+
+    if (cudaMalloc(&d_flags, bytes) != cudaSuccess ||
+        cudaMalloc(&d_arr_temp, bytes) != cudaSuccess ||
+        cudaMalloc(&d_arr_idx, bytes) != cudaSuccess ||
+        cudaMalloc(&d_idx_down, bytes) != cudaSuccess ||
+        cudaMalloc(&d_idx_up, bytes) != cudaSuccess) {
+        fprintf(stderr, "count_sort: device allocation failed\n");
+        goto cleanup;
+    }
+
+    flags = (u_int32_t*) malloc(bytes);
+    if (flags == NULL) {
+        fprintf(stderr, "count_sort: host allocation failed\n");
+        goto cleanup;
+    }
+
+    /* flags[i] = 1 when bit idx of key i is clear, 0 otherwise. */
+    assign_flagss<<<grid_dim, block_dim>>>(d_keys, n, idx, d_flags);
+    status = cudaGetLastError();
+    if (status == cudaSuccess) {
+        /* The blocking copy also waits for the kernel above to finish. */
+        status = cudaMemcpy(flags, d_flags, bytes, cudaMemcpyDeviceToHost);
+    }
+    if (status != cudaSuccess) {
+        fprintf(stderr, "count_sort: %s\n", cudaGetErrorString(status));
+        goto cleanup;
+    }
+
+    /* Destination slots for the "bit clear" and "bit set" groups. */
+    down_sweep_i = down_sweep(flags, n);
+    up_sweep_i = up_sweep(flags, n);
+    if (down_sweep_i == NULL || up_sweep_i == NULL) {
+        fprintf(stderr, "count_sort: host scan failed\n");
+        goto cleanup;
+    }
+
+    status = cudaMemcpy(d_idx_down, down_sweep_i, bytes, cudaMemcpyHostToDevice);
+    if (status == cudaSuccess) {
+        status = cudaMemcpy(d_idx_up, up_sweep_i, bytes, cudaMemcpyHostToDevice);
+    }
+    if (status == cudaSuccess) {
+        /* Snapshot of the keys so the scatter can write d_arr in place. */
+        status = cudaMemcpy(d_arr_temp, d_keys, bytes, cudaMemcpyDeviceToDevice);
+    }
+    if (status != cudaSuccess) {
+        fprintf(stderr, "count_sort: %s\n", cudaGetErrorString(status));
+        goto cleanup;
+    }
+
+    /* Both kernels go to the same stream, so they run in launch order. */
+    assign_index<<<grid_dim, block_dim>>>(d_keys, d_arr_idx, d_idx_down, d_idx_up, d_flags, n);
+    scatter<<<grid_dim, block_dim>>>(d_keys, d_arr_temp, d_arr_idx, n);
+    status = cudaGetLastError();
+    if (status == cudaSuccess) {
+        status = cudaDeviceSynchronize();
+    }
+    if (status != cudaSuccess) {
+        fprintf(stderr, "count_sort: %s\n", cudaGetErrorString(status));
+    }
+
+cleanup:
+    /* cudaFree(NULL) and free(NULL) are no-ops, so partial setups are fine. */
+    cudaFree(d_arr_idx);
+    cudaFree(d_idx_down);
+    cudaFree(d_idx_up);
+    cudaFree(d_flags);
+    cudaFree(d_arr_temp);
+    free(up_sweep_i);
+    free(down_sweep_i);
+    free(flags);
+}
+
+/*
+ * Pick the destination index of every element.
+ *
+ * For each i < n, arr_idx[i] is taken from down_sweep_i[i] when flags[i]
+ * is non-zero and from up_sweep_i[i] otherwise. Each thread starts at its
+ * global index and advances by the total number of launched threads, so
+ * any 1D launch configuration covers all n elements.
+ * The arr parameter is not used by this kernel.
+ */
+__global__ void assign_index(u_int32_t* arr, u_int32_t* arr_idx, u_int32_t* down_sweep_i, u_int32_t* up_sweep_i, u_int32_t* flags, int n) {
+    const unsigned int step = blockDim.x * gridDim.x;
+    int pos = blockIdx.x * blockDim.x + threadIdx.x;
+
+    while (pos < n) {
+        if (flags[pos]) {
+            arr_idx[pos] = down_sweep_i[pos];
+        } else {
+            arr_idx[pos] = up_sweep_i[pos];
+        }
+        pos += step;
+    }
+}
+
+/*
+ * Compute the split flag of every key for bit position idx.
+ *
+ * For each i < n, flags[i] is 1 when bit idx of arr[i] is clear and 0 when
+ * it is set. Each thread starts at its global index and advances by the
+ * total number of launched threads.
+ */
+__global__ void assign_flagss(u_int32_t* arr, int n, int idx, u_int32_t* flags) {
+    /* Unsigned mask: shifting a signed 1 into bit 31 is not portable. */
+    const u_int32_t mask = (u_int32_t) 1 << idx;
+
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
+        flags[i] = ((arr[i] & mask) == mask) ? 0 : 1;
+    }
+}
+
+/*
+ * Move every element to its destination slot.
+ *
+ * For each i < n, arr_temp[i] is written to arr[arr_idx[i]]. Each thread
+ * starts at its global index and advances by the total number of launched
+ * threads.
+ */
+__global__ void scatter(u_int32_t* arr, u_int32_t* arr_temp, u_int32_t* arr_idx, int n) {
+    const unsigned int step = blockDim.x * gridDim.x;
+    int src = blockIdx.x * blockDim.x + threadIdx.x;
+
+    while (src < n) {
+        const u_int32_t dst = arr_idx[src];
+        arr[dst] = arr_temp[src];
+        src += step;
+    }
+}
+
+/*
+ * Sort n 32-bit keys held in device memory, in place.
+ *
+ * d_arr - device pointer to the keys
+ * n     - number of keys
+ *
+ * Runs one count_sort pass per bit, from bit 0 (least significant) up to
+ * bit 31. count_sort is an ordinary host function that launches the
+ * kernels itself, so it is called directly rather than with <<<...>>>.
+ */
+void radix_sort(int* d_arr, int n) {
+    /* Start at bit 0: skipping it would leave the lowest bit unsorted. */
+    for (int idx = 0; idx < 32; idx++) {
+        count_sort(d_arr, n, idx);
+    }
+}
+
+/*
+ * Print the n values of arr to stdout on one line, each followed by a
+ * space, and finish the line with a newline.
+ */
+void print(int* arr, int n) {
+    int k = 0;
+
+    while (k < n) {
+        printf("%d ", arr[k]);
+        k++;
+    }
+    printf("\n");
+}
+
+/*
+ * Entry point: ./radix_sort <n>
+ *
+ * Generates n pseudo-random integers, sorts them on the GPU, reports the
+ * elapsed time and writes the sorted values through print_to_file.
+ * Returns 0 on success and 1 on a usage, allocation or CUDA error.
+ */
+int main(int argc, char *argv[]) {
+    if (argc != 2) {
+        printf("Usage: ./radix_sort <n>\n");
+        return 1;
+    }
+
+    // Generate numbers
+    int n = atoi(argv[1]);
+    if (n <= 0) {
+        fprintf(stderr, "n must be a positive integer\n");
+        return 1;
+    }
+    const size_t bytes = (size_t) n * sizeof(int);
+    int* arr = (int*) malloc(bytes);
+    if (arr == NULL) {
+        fprintf(stderr, "Could not allocate %d integers\n", n);
+        return 1;
+    }
+    rng(arr, n);
+
+    // Sort numbers
+    clock_t start, end;
+    double cpu_time_used;
+
+    printf("Sorting in parallel...\n");
+    start = clock();
+
+    // Copy the data to the GPU, sort it there, then copy it back.
+    // d_arr is declared as int* because that is what radix_sort takes.
+    int* d_arr = NULL;
+    cudaError_t status = cudaMalloc(&d_arr, bytes);
+    if (status == cudaSuccess) {
+        status = cudaMemcpy(d_arr, arr, bytes, cudaMemcpyHostToDevice);
+    }
+    if (status == cudaSuccess) {
+        radix_sort(d_arr, n);
+        status = cudaMemcpy(arr, d_arr, bytes, cudaMemcpyDeviceToHost);
+    }
+    cudaFree(d_arr);
+    if (status != cudaSuccess) {
+        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
+        free(arr);
+        return 1;
+    }
+
+    end = clock();
+    cpu_time_used = ((double) (end - start)) * 1000000 / CLOCKS_PER_SEC;
+    printf("Parallel radix sort finished, time taken: %f μs\n\n", cpu_time_used);
+    print_to_file(arr, n, "output-parallel");
+
+    free(arr);
+    return 0;
+}
diff --git a/src/radix_sort_serial.c b/src/radix_sort_serial.c
new file mode 100644
index 0000000000000000000000000000000000000000..7cba8ef05a3c09d92acde9213f717a7dcf1d9797
--- /dev/null
+++ b/src/radix_sort_serial.c
@@ -0,0 +1,91 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+/*
+ * Write an integer array to a timestamped text file.
+ *
+ * arr      - values to write
+ * arr_len  - number of values in arr
+ * type     - label that becomes part of the file name
+ *
+ * The output path is "output/<YYYY_MM_DD_HHMMSS>-<type>.txt", using local
+ * time. Values are written in order, each followed by a single space.
+ * The "output" directory must already exist; when the file cannot be
+ * opened the error is reported on stderr and the function returns.
+ */
+void print_to_file(int* arr, int arr_len, char* type) {
+    char stamp[64];
+    char filename[100];
+    time_t now = time(NULL);
+    struct tm *t = localtime(&now);
+
+    if (t == NULL || strftime(stamp, sizeof(stamp), "output/%Y_%m_%d_%H%M%S", t) == 0) {
+        fprintf(stderr, "print_to_file: could not format the timestamp\n");
+        return;
+    }
+
+    /* snprintf must not read from the buffer it is writing to, so the
+       timestamp is formatted into a separate buffer first. */
+    snprintf(filename, sizeof(filename), "%s-%s.txt", stamp, type);
+
+    FILE *f = fopen(filename, "w");
+    if (f == NULL) {
+        perror(filename);
+        return;
+    }
+    for (int i = 0; i < arr_len; i++) {
+        fprintf(f, "%d ", arr[i]);
+    }
+    fclose(f);
+}
+
+/*
+ * Populate arr[0..n-1] with pseudo-random integers from rand().
+ * srand() is called with a hard-coded seed first, so repeated runs
+ * generate the same values.
+ */
+void rng(int* arr, int n) {
+    long filled;
+
+    srand(13516095);
+    for (filled = 0; filled < n; ++filled) {
+        int value = (int)rand();
+        arr[filled] = value;
+    }
+}
+  
+/*
+ * Return the largest of the n values in arr.
+ * arr[0] is read unconditionally, so the array must hold at least one
+ * element.
+ */
+int get_max(int* arr, int n) {
+    int largest = arr[0];
+    int k = 1;
+
+    while (k < n) {
+        largest = (largest < arr[k]) ? arr[k] : largest;
+        k++;
+    }
+    return largest;
+}
+
+/*
+ * Stable counting sort of arr by one decimal digit.
+ *
+ * arr - n values, reordered in place
+ * n   - number of values
+ * exp - weight of the digit to sort on (1, 10, 100, ...)
+ *
+ * The digit of a value v is (v / exp) % 10. Values are assumed to be
+ * non-negative: a negative value would produce a negative digit and index
+ * outside the count table.
+ *
+ * The scratch array is taken from the heap rather than the stack so that
+ * large n does not overflow the stack. If it cannot be allocated, an error
+ * is printed to stderr and arr is left unchanged.
+ */
+void count_sort(int* arr, int n, int exp) {
+    if (n <= 0) {
+        return;
+    }
+
+    int* output = (int*) malloc((size_t) n * sizeof(int));
+    if (output == NULL) {
+        fprintf(stderr, "count_sort: could not allocate %d integers\n", n);
+        return;
+    }
+
+    int i, count[10] = {0};
+
+    /* Histogram of the selected digit. */
+    for (i = 0; i < n; i++)
+        count[(arr[i] / exp) % 10]++;
+
+    /* Running totals: count[d] becomes one past the last slot for digit d. */
+    for (i = 1; i < 10; i++)
+        count[i] += count[i - 1];
+
+    /* Walk backwards so equal digits keep their original order. */
+    for (i = n - 1; i >= 0; i--) {
+        int digit = (arr[i] / exp) % 10;
+        output[--count[digit]] = arr[i];
+    }
+
+    for (i = 0; i < n; i++)
+        arr[i] = output[i];
+
+    free(output);
+}
+  
+/*
+ * Sort arr[0..n-1] in ascending order with a decimal LSD radix sort.
+ *
+ * One count_sort pass is run per decimal digit of the largest value,
+ * starting with the least significant digit. Nothing is done when the
+ * array is empty or its maximum is not positive.
+ */
+void radix_sort(int* arr, int n) {
+    if (n <= 0) {
+        /* get_max reads arr[0], which does not exist for an empty array. */
+        return;
+    }
+
+    int m = get_max(arr, n);
+    for (int exp = 1; m / exp > 0; exp *= 10) {
+        count_sort(arr, n, exp);
+        /* Stop after the most significant digit. Multiplying exp again
+           would overflow int once m is at least 10^9. */
+        if (m / exp < 10) {
+            break;
+        }
+    }
+}
+
+/*
+ * Write the n values of arr to stdout, each followed by one space, then
+ * terminate the line with a newline.
+ */
+void print(int* arr, int n) {
+    int* cursor = arr;
+    int remaining;
+
+    for (remaining = n; remaining > 0; remaining--) {
+        printf("%d ", *cursor);
+        cursor++;
+    }
+    printf("\n");
+}
+  
+/*
+ * Entry point: ./serial_radix_sort <n>
+ *
+ * Generates n pseudo-random integers, sorts them with the serial radix
+ * sort, reports the elapsed CPU time and writes the sorted values through
+ * print_to_file. Returns 0 on success and 1 on a usage or allocation error.
+ */
+int main(int argc, char *argv[]) {
+    if (argc != 2) {
+        printf("Usage: ./serial_radix_sort <n>\n");
+        return 1;
+    }
+
+    int n = atoi(argv[1]);
+    if (n <= 0) {
+        fprintf(stderr, "n must be a positive integer\n");
+        return 1;
+    }
+
+    int* arr = (int*) malloc(sizeof(int) * (size_t) n);
+    if (arr == NULL) {
+        fprintf(stderr, "Could not allocate %d integers\n", n);
+        return 1;
+    }
+
+    rng(arr, n);
+
+    clock_t start, end;
+    double cpu_time_used;
+
+    printf("Sorting in serial...\n");
+    start = clock();
+    radix_sort(arr, n);
+    end = clock();
+    /* clock() counts processor time; convert the ticks to microseconds. */
+    cpu_time_used = ((double) (end - start)) * 1000000 / CLOCKS_PER_SEC;
+    printf("Serial radix sort finished, time taken: %f μs\n\n", cpu_time_used);
+
+    print_to_file(arr, n, "output-serial");
+
+    free(arr);
+    return 0;
+}