diff --git a/Makefile b/Makefile
index 44c03e06c4ff1dbda108610ed8fb5b213d93e522..b5fad74d53c1485d0db5e80c7c2f6634d7ac064b 100644
--- a/Makefile
+++ b/Makefile
@@ -1,14 +1,21 @@
-CC=nvcc
-CFLAGS=-g
-SRCFOLDER=src/
-DEPS=src/utils/matrix.h src/utils/dijkstra.h
+CC = nvcc
+# Project headers: touching one rebuilds every object, while touching a
+# single .cu file rebuilds only the object made from it.
+DEPS = src/utils/matrix.h src/utils/dijkstra.h
+OBJECTS = src/main.o src/utils/dijkstra.o src/utils/matrix.o
+
+# Neither name is a file that a recipe creates, so always run them on request.
+.PHONY: all clean
 
-%.o: %.cu $(DEPS)
-	$(CC) -c -o $@ $< $(CFLAGS)
-
-main: $(SRCFOLDER)main.o $(SRCFOLDER)utils/matrix.o $(SRCFOLDER)utils/dijkstra.o
+# Link the separately compiled objects into dist/main.
+all: $(OBJECTS)
 	mkdir -p dist
-	$(CC) $(CFLAGS) -o dist/main $(SRCFOLDER)main.cu $(SRCFOLDER)utils/matrix.o $(SRCFOLDER)utils/dijkstra.o
+	$(CC) $(OBJECTS) -o dist/main
+
+# -dc produces relocatable device code, needed because the kernel in main.cu
+# calls the __device__ dijkstra() that is defined in another .cu file.
+%.o: %.cu $(DEPS)
+	$(CC) -x cu -I. -dc $< -o $@
 
 clean:
 	rm -rf dist
diff --git a/src/main.cu b/src/main.cu
index 8cbb1c4629035cce683dda0471ab7cffe20858e6..9b8ec65032b93442db70771dd56c5d3462f12a75 100644
--- a/src/main.cu
+++ b/src/main.cu
@@ -3,7 +3,7 @@
 #include "utils/matrix.h"
 #include "utils/dijkstra.h"
 
-long int *calculate_sub_matrix(long int *matrix, int node_count);
+__global__ void calculate_sub_matrix(long int *matrix, long int *sub_dist, int node_count);
 
 int main(int argc, char *argv[])
 {
@@ -17,10 +17,31 @@ int main(int argc, char *argv[])
     }
 
     int node_count = atoi(argv[1]);
-    cudaDeviceSetLimit(cudaLimitMallocHeapSize, node_count * node_count * 2.5f * sizeof(long int));
+    // dijkstra() declares a per-thread processed[MAX_NODE] array, so a graph
+    // with more nodes than that does not fit; atoi() also yields 0 on bad input.
+    if (node_count <= 0 || node_count > MAX_NODE)
+    {
+        fprintf(stderr, "node count must be between 1 and %d\n", MAX_NODE);
+        return 1;
+    }
+    // Widen before multiplying so the byte counts are computed in size_t, not int.
+    size_t cell_count = (size_t)node_count * node_count;
+    cudaDeviceSetLimit(cudaLimitMallocHeapSize, (cell_count * 3 + node_count) * sizeof(long int));
     long int *adj_matrix = create_adj_matrix(node_count, node_count);
 
-    long int *sub_dist = calculate_sub_matrix(adj_matrix, node_count);
+    long int *sub_dist;
+    cudaMallocManaged(&sub_dist, cell_count * sizeof(long int));
+    calculate_sub_matrix<<<16,64>>>(adj_matrix, sub_dist, node_count);
+    // The launch returns immediately: check that it was accepted, then wait for
+    // the kernel to finish before the host touches sub_dist.
+    cudaError_t status = cudaGetLastError();
+    if (status == cudaSuccess)
+        status = cudaDeviceSynchronize();
+    if (status != cudaSuccess)
+    {
+        fprintf(stderr, "calculate_sub_matrix failed: %s\n", cudaGetErrorString(status));
+        return 1;
+    }
 
     char print_dist;
     printf("Print distances to stdout? [y/N] ");
@@ -36,22 +57,12 @@ int main(int argc, char *argv[])
     cudaFree(adj_matrix);
 }
 
-long int *calculate_sub_matrix(long int *matrix, int node_count)
+__global__ void calculate_sub_matrix(long int *matrix, long int *sub_dist, int node_count)
 {
-    long int *sub_dist = (long int *)malloc(node_count * node_count * sizeof(long int));
-    cudaMallocManaged(&sub_dist, node_count * node_count * sizeof(long int));
-
-    for (int i = 0; i < node_count; i++)
+    // Thread t handles sources t, t + T, t + 2T, ... where T is the launch-wide thread count.
+    const int thread_total = gridDim.x * blockDim.x;
+    for (int source = threadIdx.x + blockIdx.x * blockDim.x; source < node_count; source += thread_total)
     {
-        long int *temp_dist = dijkstra(matrix, i, node_count);
-
-        for (int j = 0; j < node_count; j++)
-        {
-            set_el(sub_dist, node_count, j, i, temp_dist[j]);
-        }
-
-        cudaFree(temp_dist);
+        dijkstra(matrix, &sub_dist[source * node_count], source, node_count);
     }
-
-    return sub_dist;
 }
diff --git a/src/utils/dijkstra.cu b/src/utils/dijkstra.cu
index 930dd5d59c5c88174c7b3f2901a696ef1211268e..0fd4428dccdf8e6d787cd32dad28904cc5c63e43 100644
--- a/src/utils/dijkstra.cu
+++ b/src/utils/dijkstra.cu
@@ -1,6 +1,6 @@
 #include "dijkstra.h"
 
-long int get_idx_min_dist(long int *dist, short *processed, int len)
+__device__ long int get_idx_min_dist(long int *dist, short *processed, int len)
 {
     long int min = LONG_MAX;
     int idx;
@@ -17,11 +17,9 @@ long int get_idx_min_dist(long int *dist, short *processed, int len)
     return idx;
 }
 
-long int *dijkstra(long int *adj_matrix, int src, int size)
+__device__ void dijkstra(long int *adj_matrix, long int *dist, int src, int size)
 {
-    long int *dist;
-    cudaMallocManaged(&dist, size * sizeof(long int));
-    short processed[size];
+    short processed[MAX_NODE];
 
     for (int i = 0; i < size; i++)
     {
@@ -50,6 +48,4 @@ long int *dijkstra(long int *adj_matrix, int src, int size)
             }
         }
     }
-
-    return dist;
 }
\ No newline at end of file
diff --git a/src/utils/dijkstra.h b/src/utils/dijkstra.h
index d143e6b3116b1596989edd62fa1013c881de35c6..6f88983b00402e4f768345dd01e6a8f2835ee35f 100644
--- a/src/utils/dijkstra.h
+++ b/src/utils/dijkstra.h
@@ -6,5 +6,6 @@
 
 #define TRUE 1
 #define FALSE 0
+#define MAX_NODE 3000 // length of the processed[] array declared inside dijkstra()
 
-long int *dijkstra(long int *adj_matrix, int src, int size);
+__device__ void dijkstra(long int *adj_matrix, long int *dist, int src, int size);
\ No newline at end of file
diff --git a/src/utils/matrix.cu b/src/utils/matrix.cu
index f7b9bb0951e6f6735b212c66ec23cc8f58a03578..2651e5cdecdfd199ad60401c5ac1e558a582e8c5 100644
--- a/src/utils/matrix.cu
+++ b/src/utils/matrix.cu
@@ -61,12 +61,12 @@ long int *create_adj_matrix(int width, int height)
     return matrix;
 }
 
-long int get_el(long int *matrix, int width, int x, int y)
+__device__  long int get_el(long int *matrix, int width, int x, int y) // device-only: returns matrix[y * width + x]
 {
     return matrix[y * width + x];
 }
 
-void set_el(long int *matrix, int width, int x, int y, long int value)
+__device__ void set_el(long int *matrix, int width, int x, int y, long int value) // device-only: stores value at matrix[y * width + x]
 {
     matrix[y * width + x] = value;
 }
diff --git a/src/utils/matrix.h b/src/utils/matrix.h
index c02cbb6bb8e3f47f9e05b3838962cb7f510ad94d..cc9fd1aa5f5fa60d8e2735d09fe308edd7a8b932 100644
--- a/src/utils/matrix.h
+++ b/src/utils/matrix.h
@@ -4,6 +4,6 @@
 void print_array(long int *array, int width);
 void print_matrix(long int *matrix, int width, int height);
 void print_matrix_to_file(long int *matrix, int width, int height, char *filename);
-long int get_el(long int *matrix, int width, int x, int y);
-void set_el(long int *matrix, int width, int x, int y, long int value);
+__device__ long int get_el(long int *matrix, int width, int x, int y);
+__device__ void set_el(long int *matrix, int width, int x, int y, long int value);
 long int *create_adj_matrix(int width, int height);
\ No newline at end of file