diff --git a/README.md b/README.md
index cc8d5199b7a46e630e19a8fdd49691e311b9d127..df9eb886ef5f49d0756893cfa1807d0483727563 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 
 1. Buka terminal
 2. Pada terminal, ketik "make"
-3. Lalu untuk melakukan run, ketik "./radix_sort"
+3. Lalu untuk melakukan run, ketik "./djikstra"
 
 Keterangan:
 
@@ -11,9 +11,7 @@ Nilai N yang diuji diubah pada konstanta N_uji yang terdapat pada program
 
 ## Pembagian tugas:
 
-13516150 - Mengerjakan tugas bersama-sama
-
-13516153 - Mengerjakan tugas bersama-sama
+13516153 - Sendiri :(
 
 ## Pengujian:
 
@@ -43,35 +41,29 @@ Thread yang digunakan untuk melakukan sorting pada array adalah sebanyak <i>N</i
 
 Untuk pengujian setiap N, perlu pengubahan nilai konstanta N_uji pada program.
 
-#### N = 5000
+#### N = 100
 
-Serial : Waktu eksekusi: 1825 microseconds
+Serial : Waktu eksekusi: 3825 microseconds
 
-Parallel : Waktu eksekusi: 2575 microseconds
+Parallel : Waktu eksekusi: 5575 microseconds
 
-#### N = 50000
+#### N = 500
 
-Serial : Waktu eksekusi: 17423 microseconds
+Serial : Waktu eksekusi: 17623 microseconds
 
-Parallel : Waktu eksekusi: 17622 microseconds
+Parallel : Waktu eksekusi: 17422 microseconds
 
-#### N = 100000
+#### N = 1000
 
 Serial : Waktu eksekusi: 22458 microseconds
 
 Parallel : Waktu eksekusi: 22133 microseconds
 
-#### N = 200000
+#### N = 3000
 
 Serial : Waktu eksekusi: 42411 microseconds
 
 Parallel : Waktu eksekusi: 37665 microseconds
 
-#### N = 400000
-
-Serial : Waktu eksekusi: 79963 microseconds
-
-Parallel : Waktu eksekusi: 62689 microseconds
-
 #### Analisis perbandingan kinerja serial dan paralel. Analisis yang diharapkan adalah analisis yang minimal dapat menjelaskan setiap hasil pengukuran kinerja sebelumnya.
-Untuk parameter <b>N</b> yang kecil (pada kasus ini 5000 dan 50000), waktu eksekusi paralel lebih lama dibandingkan dengan waktu eksekusi serial. Ini berarti program perlu dioptimasi dengan mengubah algoritma dalam memetakan array setelah perhitungan kemunculan angka tiap digit. Dan juga perlu dianalisis lebih lanjut mengenai <i>stride</i> pada algoritma untuk dapat memanfaatkan pengaksesan memori yang lebih efisien.
\ No newline at end of file
+Untuk parameter <b>N</b> yang kecil (pada kasus ini 100 dan 500), waktu eksekusi paralel lebih lama dibandingkan dengan waktu eksekusi serial. Ini berarti program perlu dioptimasi dengan mengubah algoritma dalam memetakan array setelah perhitungan kemunculan angka tiap digit. Dan juga perlu dianalisis lebih lanjut mengenai <i>stride</i> pada algoritma untuk dapat memanfaatkan pengaksesan memori yang lebih efisien.
\ No newline at end of file
diff --git a/makefile b/makefile
index 50aef071ae19ea5ab032424f1169798a3aeff182..7d42f1ee4831af1ec6788f4bcc6ee6e42901bf3b 100644
--- a/makefile
+++ b/makefile
@@ -1,6 +1,6 @@
 CP = djikstra.cu
 EXE = djikstra
 SRC_DIR = src
-LDLIBS += -lcurand
+LDLIBS += -Xcompiler
 build:
-	nvcc -o $(EXE) $(SRC_DIR)/$(CP) $(LDLIBS)
+	nvcc -O1 -o $(EXE) $(SRC_DIR)/$(CP) $(LDLIBS) -lrt -lm
diff --git a/src/djikstra.cu b/src/djikstra.cu
index d56d7a25cde3132260da084e64cf03fa78d612cf..2b5fc79e9af5f027fc1bc965d18813701430d5a6 100644
--- a/src/djikstra.cu
+++ b/src/djikstra.cu
@@ -1,23 +1,394 @@
-//General includes
-#include <stdio.h>      //I/O
+// Dimas Aditia Pratikto - 13516153
+
+#include <stdio.h>    
 #include <stdlib.h>
-#include <time.h>       //for code timing purposes
+#include <time.h>  
 #include <math.h>
 
 
-//Parameters; modify as needed
-#define VERTICES 16384           //number of vertices
-#define DENSITY 16              //minimum number of edges per vertex. DO NOT SET TO >= VERTICES
-#define MAX_WEIGHT 1000000      //max edge length + 1
-#define INF_DIST 1000000000     //"infinity" initial value of each node
-#define CPU_IMP 1               //number of Dijkstra implementations (non-GPU)
-#define GPU_IMP 1               //number of Dijkstra implementations (GPU)
-#define THREADS 2               //number of OMP threads
-#define RAND_SEED 1234          //random seed
+#define VERTICES 16384         
+#define DENSITY 16           
+#define MAX_WEIGHT 1000000    
+#define INF_DIST 1000000000    
+#define CPU_IMP 1           
+#define GPU_IMP 1          
+#define THREADS 2            
 #define THREADS_BLOCK 512
 
-typedef float data_t;             //data type
+typedef float data_t;        
 
-//CPU parameters for serial implementation
 #define CPG 2.53
-#define GIG 1000000000
\ No newline at end of file
+#define GIG 1000000000
+// CUDA_SAFE_CALL(expr): evaluates a CUDA runtime call and hands its cudaError_t to gpuAssert with the call site's file and line.
+#define CUDA_SAFE_CALL(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)  // do/while(0) keeps the macro a single statement
+// Reports a failed CUDA runtime call on stderr (error string, file, line) and, when abort is true, exits with the error code.
+inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)  // const: __FILE__ is a string literal
+{
+    if (code == cudaSuccess) return;  // nothing to report
+    fprintf(stderr, "CUDA_SAFE_CALL: %s %s %d\n", cudaGetErrorString(code), file, line);
+    if (abort) exit(code);
+}
+
+int main() {
+    srand(13516153);   //random seed
+    //Flow: build one random connected graph, run the serial CPU Dijkstra, run the two-kernel CUDA Dijkstra on the same graph, then compare the two results.
+    //functions
+    void setIntArrayValue(int* in_array, int array_size, int value);
+    void setDataArrayValue(data_t* in_array, int array_size, data_t init_value);
+    void initializeGraphZero(data_t* graph, int num_vertices);
+    void constructGraphEdge(data_t* graph, int* edge_count, int num_vertices);
+
+    //Dijkstra's implementations
+    void dijkstraCPUSerial(data_t* graph, data_t* node_dist, int* parent_node, int* visited_node, int num_vertices, int v_start);
+    __global__ void closestNodeCUDA(data_t* node_dist, int* visited_node, int* global_closest, int num_vertices);
+    __global__ void cudaRelax(data_t* graph, data_t* node_dist, int* parent_node, int* visited_node, int* source);
+
+    //timing
+    struct timespec diff(struct timespec start, struct timespec end);
+    struct timespec start, end;
+    struct timespec time_stamp[CPU_IMP];
+
+    //declare variables and allocate memory (byte counts are kept in size_t so a larger VERTICES cannot overflow int)
+    size_t graph_size   = (size_t)VERTICES*VERTICES*sizeof(data_t);
+    size_t int_array    = VERTICES*sizeof(int);
+    size_t data_array   = VERTICES*sizeof(data_t);
+    data_t* graph       = (data_t*)malloc(graph_size);
+    data_t* node_dist   = (data_t*)malloc(data_array);
+    int* parent_node    = (int*)malloc(int_array);
+    int* edge_count     = (int*)malloc(int_array);
+    int* visited_node   = (int*)malloc(int_array);
+    int *pn_matrix      = (int*)malloc((CPU_IMP+GPU_IMP)*int_array);
+    data_t* dist_matrix = (data_t*)malloc((CPU_IMP + GPU_IMP)*data_array);
+    int* closest_vertex = (int*)malloc(sizeof(int));
+    if (!graph || !node_dist || !parent_node || !edge_count || !visited_node || !pn_matrix || !dist_matrix || !closest_vertex) {
+        fprintf(stderr, "Host memory allocation failed\n");   //the adjacency matrix alone needs VERTICES*VERTICES*sizeof(data_t) bytes
+        return EXIT_FAILURE;
+    }
+    printf("Variables created, allocated\n");
+
+    //CUDA mallocs
+    data_t* gpu_graph;
+    data_t* gpu_node_dist;
+    int* gpu_parent_node;
+    int* gpu_visited_node;
+    CUDA_SAFE_CALL(cudaMalloc((void**)&gpu_graph, graph_size));
+    CUDA_SAFE_CALL(cudaMalloc((void**)&gpu_node_dist, data_array));
+    CUDA_SAFE_CALL(cudaMalloc((void**)&gpu_parent_node, int_array));
+    CUDA_SAFE_CALL(cudaMalloc((void**)&gpu_visited_node, int_array));
+
+    //for closest vertex
+    int* gpu_closest_vertex;
+    closest_vertex[0] = -1;
+    CUDA_SAFE_CALL(cudaMalloc((void**)&gpu_closest_vertex, (sizeof(int))));
+    CUDA_SAFE_CALL(cudaMemcpy(gpu_closest_vertex, closest_vertex, sizeof(int), cudaMemcpyHostToDevice));
+
+    //initialize arrays
+    //node_dist, parent_node and visited_node done within voidDijkstraCPUSerial
+    //same graph is used for ALL versions of dijkstra's (serial, parallel, CUDA)
+    setIntArrayValue(edge_count, VERTICES, 0);
+    setDataArrayValue(node_dist, VERTICES, INF_DIST);
+    setIntArrayValue(parent_node, VERTICES, -1);
+    setIntArrayValue(visited_node, VERTICES, 0);
+    initializeGraphZero(graph, VERTICES);
+    constructGraphEdge(graph, edge_count, VERTICES);
+    free(edge_count);
+    printf("Variables initialized.\n");
+
+    /************RUN DIJKSTRA************/
+    int i;
+    int origin = (rand() % VERTICES);
+    printf("Origin vertex: %d\n", origin);
+
+    /*  SERIAL DIJKSTRA  */
+    int version = 0;
+    printf("Running serial...");
+    clock_gettime(CLOCK_REALTIME, &start);
+    dijkstraCPUSerial(graph, node_dist, parent_node, visited_node, VERTICES, origin);
+    clock_gettime(CLOCK_REALTIME, &end);
+    time_stamp[version] = diff(start, end);
+    for (i = 0; i < VERTICES; i++) {
+        pn_matrix[version*VERTICES + i] = parent_node[i];
+        dist_matrix[version*VERTICES + i] = node_dist[i];
+    }
+    printf("Berhasil\n");
+
+    /*  CUDA DIJKSTRA  */
+    version++;
+    cudaEvent_t exec_start, exec_stop;
+    float elapsed_exec;
+    CUDA_SAFE_CALL(cudaEventCreate(&exec_start));
+    CUDA_SAFE_CALL(cudaEventCreate(&exec_stop));
+
+    //need to reset data from previous run, since serial and parallel versions do this automatically
+    setDataArrayValue(node_dist, VERTICES, INF_DIST);
+    setIntArrayValue(parent_node, VERTICES, -1);
+    setIntArrayValue(visited_node, VERTICES, 0);
+    node_dist[origin] = 0;
+
+    CUDA_SAFE_CALL(cudaMemcpy(gpu_graph, graph, graph_size, cudaMemcpyHostToDevice));
+    CUDA_SAFE_CALL(cudaMemcpy(gpu_node_dist, node_dist, data_array, cudaMemcpyHostToDevice));
+    CUDA_SAFE_CALL(cudaMemcpy(gpu_parent_node, parent_node, int_array, cudaMemcpyHostToDevice));
+    CUDA_SAFE_CALL(cudaMemcpy(gpu_visited_node, visited_node, int_array, cudaMemcpyHostToDevice));
+
+    dim3 gridMin(1, 1, 1);
+    dim3 blockMin(1, 1, 1);
+
+    dim3 gridRelax(VERTICES / THREADS_BLOCK, 1, 1);   //covers every vertex only when VERTICES is a multiple of THREADS_BLOCK
+    dim3 blockRelax(THREADS_BLOCK, 1, 1);
+
+    /*int nDevices;
+    CUDA_SAFE_CALL(cudaGetDeviceCount(&nDevices));
+    for (i = 0; i < nDevices; i++) {
+        cudaDeviceProp prop;
+        CUDA_SAFE_CALL(cudaGetDeviceProperties(&prop, i));
+        printf("Device Number: %d\n", i);
+        printf("  Device name: %s\n", prop.name);
+        printf("  Memory Clock Rate (KHz): %d\n", prop.memoryClockRate);
+        printf("  Memory Bus Width (bits): %d\n", prop.memoryBusWidth);
+        printf("  Peak Memory Bandwidth (GB/s): %f\n\n",
+            2.0*prop.memoryClockRate*(prop.memoryBusWidth / 8) / 1.0e6);
+    }*/
+
+    CUDA_SAFE_CALL(cudaEventRecord(exec_start));
+    for (int i = 0; i < VERTICES; i++) {
+        closestNodeCUDA <<<gridMin, blockMin>>>(gpu_node_dist, gpu_visited_node, gpu_closest_vertex, VERTICES);                 //find min
+        cudaRelax <<<gridRelax, blockRelax>>>(gpu_graph, gpu_node_dist, gpu_parent_node, gpu_visited_node, gpu_closest_vertex); //relax
+    }
+    CUDA_SAFE_CALL(cudaGetLastError());   //kernel launches return no status themselves; this surfaces a failed launch
+    CUDA_SAFE_CALL(cudaEventRecord(exec_stop));
+
+    //save data in PN, ND matrices
+    CUDA_SAFE_CALL(cudaMemcpy(node_dist, gpu_node_dist, data_array, cudaMemcpyDeviceToHost));
+    CUDA_SAFE_CALL(cudaMemcpy(parent_node, gpu_parent_node, int_array, cudaMemcpyDeviceToHost));
+    CUDA_SAFE_CALL(cudaMemcpy(visited_node, gpu_visited_node, int_array, cudaMemcpyDeviceToHost));
+    for (i = 0; i < VERTICES; i++) {                //record resulting parent array and node distance
+        pn_matrix[version*VERTICES + i] = parent_node[i];
+        dist_matrix[version*VERTICES + i] = node_dist[i];
+    }
+
+    //free memory
+    CUDA_SAFE_CALL(cudaFree(gpu_graph));
+    CUDA_SAFE_CALL(cudaFree(gpu_node_dist));
+    CUDA_SAFE_CALL(cudaFree(gpu_parent_node));
+    CUDA_SAFE_CALL(cudaFree(gpu_visited_node));
+    CUDA_SAFE_CALL(cudaFree(gpu_closest_vertex));
+
+    printf("\nVertices: %d", VERTICES);
+    printf("\nDensity: %d", DENSITY);
+    printf("\nMax Weight: %d", MAX_WEIGHT);
+    printf("\n\nSerial cycles: \n");
+    for (i = 0; i < CPU_IMP; i++) {
+        printf("%ld", (long int)((double)(CPG)*(double)
+            (GIG * time_stamp[i].tv_sec + time_stamp[i].tv_nsec)));
+    }
+
+    //calculate elapsed time
+    CUDA_SAFE_CALL(cudaEventElapsedTime(&elapsed_exec, exec_start, exec_stop));
+    printf("\n\nCUDA Time (ms): %7.9f\n", elapsed_exec);
+    CUDA_SAFE_CALL(cudaEventDestroy(exec_start));
+    CUDA_SAFE_CALL(cudaEventDestroy(exec_stop));
+
+    /***************ERROR CHECKING***************/
+    printf("\n\nError checking:\n");
+    printf("----Serial vs CUDA:\n");
+    int p_errors = 0, d_errors = 0;
+    for (i = 0; i < VERTICES; i++) {   //row 0 of each matrix holds the serial result, row 1 the CUDA result
+        if (pn_matrix[i] != pn_matrix[VERTICES + i]) {
+            p_errors++;
+        }
+        if (dist_matrix[i] != dist_matrix[VERTICES + i]) {
+            d_errors++;
+        }
+    }
+    printf("--------%d parent errors found.\n", p_errors);
+    printf("--------%d dist errors found.\n", d_errors);
+
+    free(graph); free(node_dist); free(parent_node); free(visited_node);   //release host buffers
+    free(pn_matrix); free(dist_matrix); free(closest_vertex);
+    return 0;
+}
+
+/********FUNCTIONS*********/
+
+/*  Fill the first array_size entries of in_array with init_value (writes nothing when array_size <= 0)   */
+void setIntArrayValue(int* in_array, int array_size, int init_value) {
+    int* cursor = in_array;
+    for (int remaining = array_size; remaining > 0; --remaining) {
+        *cursor++ = init_value;
+    }
+}
+
+/*  Store init_value into in_array[0 .. array_size-1]; nothing is written when array_size <= 0   */
+void setDataArrayValue(data_t* in_array, int array_size, data_t init_value) {
+    int idx = 0;
+    while (idx < array_size) {
+        in_array[idx++] = init_value;
+    }
+}
+
+/*  Zero every entry of the num_vertices x num_vertices matrix stored in graph, i.e. a graph with no edges or weights   */
+void initializeGraphZero(data_t* graph, int num_vertices) {
+    const data_t no_edge = (data_t)0;
+    for (int row = 0; row < num_vertices; ++row) {
+        data_t* row_start = graph + row*num_vertices;   //same offset as indexing graph[row*num_vertices + col]
+        for (int col = 0; col < num_vertices; ++col) {
+            row_start[col] = no_edge;
+        }
+    }
+}
+
+/*  Construct graph with randomized edges: first link every vertex to an earlier one (connected graph), then add random edges until each vertex has at least DENSITY edges. Reads graph entries equal to 0 as "no edge" and accumulates per-vertex counts into edge_count.  */
+void constructGraphEdge(data_t* graph, int* edge_count, int num_vertices) {
+
+    int i;
+    int rand_vertex;
+    data_t weight;
+    //a vertex has at most num_vertices - 1 distinct neighbours, so cap the target;
+    //without the cap the density loop below never terminates when DENSITY >= num_vertices
+    int target_edges = (DENSITY < num_vertices) ? DENSITY : num_vertices - 1;
+
+    //initialize a connected graph
+    printf("Initializing a connected graph:");
+    for (i = 1; i < num_vertices; i++) {
+        rand_vertex = (rand() % i);                     //partner is always an earlier vertex
+        weight = (rand() % MAX_WEIGHT) + 1;             //weights are in 1..MAX_WEIGHT, never 0
+        graph[rand_vertex*num_vertices + i] = weight;
+        graph[i*num_vertices + rand_vertex] = weight;
+        edge_count[i] += 1;
+        edge_count[rand_vertex] += 1;
+    }
+    printf("done!\n");
+
+    //add additional edges until DENSITY reached for all vertices
+    printf("Checking density...");
+    for (i = 0; i < num_vertices; i++) {
+        while (edge_count[i] < target_edges) {
+            rand_vertex = (rand() % num_vertices);
+            weight = (rand() % MAX_WEIGHT) + 1;
+            if ((rand_vertex != i) && (graph[i*num_vertices + rand_vertex] == 0)) {
+                graph[i*num_vertices + rand_vertex] = weight;
+                graph[rand_vertex*num_vertices + i] = weight;
+                edge_count[i] += 1;
+                edge_count[rand_vertex] += 1;           //the edge is stored in both directions, so both endpoints gain it
+            }
+        }
+    }
+    printf("done!\n");
+}
+
+/*  Scan all vertices and return the index of the unvisited one with the smallest distance (lowest index wins ties); -1 if none qualifies   */
+int closestNode(data_t* node_dist, int* visited_node, int num_vertices) {
+    int best_index = -1;
+    data_t best_dist = INF_DIST + 1;   //starting bound; only strictly smaller distances are accepted
+
+    for (int v = 0; v < num_vertices; ++v) {
+        if (visited_node[v] != 0) continue;          //only vertices whose visited flag is 0 are candidates
+        if (!(node_dist[v] < best_dist)) continue;   //not an improvement
+        best_dist = node_dist[v];
+        best_index = v;
+    }
+
+    return best_index;
+}
+
+/*  Write "Proof: ", then the first length ints of a each followed by ", ", then two newlines, to stdout   */
+void checkArray(int* a, int length) {
+    const int* item = a;
+    printf("Proof: ");
+    for (int left = length; left > 0; --left) {
+        printf("%d, ", *item++);
+    }
+    printf("\n\n");
+}
+
+void checkArrayData(data_t* a, int length) {   //print the first length values of a with %f after a "Proof: " prefix
+    int pos = 0;
+    printf("Proof: ");
+    while (pos < length) {
+        printf("%f, ", a[pos++]);
+    }
+    printf("\n\n");
+}
+
+/*  Elapsed time end - start as a timespec; a negative nanosecond difference is normalized by borrowing one second   */
+struct timespec diff(struct timespec start, struct timespec end)
+{
+    struct timespec elapsed;
+    elapsed.tv_sec = end.tv_sec - start.tv_sec;
+    elapsed.tv_nsec = end.tv_nsec - start.tv_nsec;
+
+    //borrow one second when the nanosecond part went negative
+    if (elapsed.tv_nsec < 0) {
+        elapsed.tv_sec -= 1;
+        elapsed.tv_nsec += 1000000000;
+    }
+    return elapsed;
+}
+
+/****************DIJKSTRA'S ALGORITHM IMPLEMENTATIONS****************/
+void dijkstraCPUSerial(data_t* graph, data_t* node_dist, int* parent_node, int* visited_node, int num_vertices, int v_start) {
+    //Serial Dijkstra from v_start over a num_vertices x num_vertices adjacency matrix (0 = no edge); fills node_dist and parent_node, visited_node is working state.
+    //functions
+    void setIntArrayValue(int* in_array, int array_size, int init_value);
+    void setDataArrayValue(data_t* in_array, int array_size, data_t init_value);
+    int closestNode(data_t* node_dist, int* visited_node, int num_vertices);
+
+    //reset/clear data from previous runs (sized by num_vertices, the length this function actually works on)
+    setDataArrayValue(node_dist, num_vertices, INF_DIST);
+    setIntArrayValue(parent_node, num_vertices, -1);
+    setIntArrayValue(visited_node, num_vertices, 0);
+    node_dist[v_start] = 0;
+
+    int i, next;
+    for (i = 0; i < num_vertices; i++) {
+        int curr_node = closestNode(node_dist, visited_node, num_vertices); //get closest node not visited
+        if (curr_node < 0) break;                                           //closestNode found no candidate; indexing with -1 would be out of bounds
+        visited_node[curr_node] = 1;                                        //set node retrieved as visited
+        /*
+        Update a neighbour only if it has not been visited, an edge to it exists (weight != 0),
+        and dist[curr_node] + edge_weight(curr_node, next_node) < dist[next_node].
+        */
+        for (next = 0; next < num_vertices; next++) {
+            data_t edge = graph[curr_node*num_vertices + next];
+            data_t new_dist = node_dist[curr_node] + edge;   //kept in data_t, the type the distances are stored in
+            if ((visited_node[next] != 1)
+                && (edge != (data_t)(0))
+                && (new_dist < node_dist[next])) {
+                node_dist[next] = new_dist;        //update distance
+                parent_node[next] = curr_node;     //update predecessor
+            }
+        }
+    }
+}
+
+__global__ void closestNodeCUDA(data_t* node_dist, int* visited_node, int* global_closest, int num_vertices) {
+    //Sequential scan for the unvisited vertex with the smallest distance; stores its index (-1 if none) in global_closest[0] and marks it visited.
+    if (blockIdx.x != 0 || threadIdx.x != 0) return;   //the scan is serial: one thread does it, any extra launched threads exit
+    data_t dist = INF_DIST + 1;
+    int node = -1;
+
+    for (int i = 0; i < num_vertices; i++) {
+        if ((node_dist[i] < dist) && (visited_node[i] != 1)) {
+            dist = node_dist[i];
+            node = i;
+        }
+    }
+    global_closest[0] = node;
+    if (node >= 0) visited_node[node] = 1;   //node stays -1 when nothing qualifies; visited_node[-1] would be an out-of-bounds write
+}
+
+__global__ void cudaRelax(data_t* graph, data_t* node_dist, int* parent_node, int* visited_node, int* global_closest) {
+    //One thread per candidate neighbour `next`: relaxes edge (source, next), source = global_closest[0]; graph is indexed as a VERTICES x VERTICES matrix.
+    int next = blockIdx.x*blockDim.x + threadIdx.x;
+    int source = global_closest[0];
+    if (next >= VERTICES || source < 0) return;   //thread beyond the last vertex, or no vertex was selected: nothing to relax
+
+    data_t edge = graph[source*VERTICES + next];
+    data_t new_dist = node_dist[source] + edge;
+    if ((edge != 0) &&
+        (visited_node[next] != 1) &&
+        (new_dist < node_dist[next])) {
+        node_dist[next] = new_dist;
+        parent_node[next] = source;
+    }
+}
\ No newline at end of file