diff --git a/src/dijkstra_cuda.cuh b/src/dijkstra_cuda.cuh
index ff4602e56867b2082d3f32ba31fbb0c8474c4ec4..9a4a53c1545a4a4dc2dd42f69b1c31ea46856179 100644
--- a/src/dijkstra_cuda.cuh
+++ b/src/dijkstra_cuda.cuh
@@ -4,6 +4,8 @@
 #include <stdlib.h>
 #include <limits.h>
 
+#define ID 13517122
+
 /**
  * Get vertex index with minimum distance which not yet included
  * in spt_set
@@ -11,10 +13,10 @@
  * @param  spt_set a set denoting vertices included in spt_set
  * @param n number of vertices in the graph
  * @return         index of minimum distance not yet included in spt_set
- */
- int min_distance_idx(long dist[], bool spt_set[], int n) {
+ *//*
+long min_distance_idx(long *dist, bool *spt_set, int n) {
 	// Initialize min value 
-    int min = INT_MAX, min_index; 
+    long min = LONG_MAX, min_index; 
   
     for (int i = 0; i < n; i++) {
     	if (spt_set[i] == false && dist[i] <= min) {
@@ -22,16 +24,96 @@
     		min_index = i;
     	}
     } 
-        
-  
+	spt_set[min_index] = true;
+	dist[src] = 0;
+	
     return min_index; 
 }
+*/
 
+// Reset the per-source Dijkstra state; any 1-D grid/block shape works because each thread strides over i.
+//   allResult[i]   <- graph[i*num_vertices + sourceIdx], or LONG_MAX when that entry is 0 and i != sourceIdx
+//   visitedNode[i] <- 1 for i == sourceIdx, 0 otherwise
+//   *minIndex      <- -1
+__global__ 
+void initValue(long *graph, long *allResult, int *visitedNode, int *minIndex, int sourceIdx, int num_vertices) {
+	int index = threadIdx.x + blockDim.x * blockIdx.x;
+	int stride = blockDim.x * gridDim.x;
+
+	// A single thread resets the shared scalar.
+	if (index == 0) *minIndex = -1;
+
+	for (int i=index; i < num_vertices; i += stride) {
+		// Set the source's flag here, not after the loop, where it would race with another thread's reset to 0.
+		visitedNode[i] = (i == sourceIdx) ? 1 : 0;
+		long weight = graph[(size_t)i*num_vertices + sourceIdx];	// 64-bit index: i*num_vertices can exceed int
+		allResult[i] = (weight == 0 && i != sourceIdx) ? LONG_MAX : weight;
+	}
+}
+
+// Serial scan by a single thread: stores the smallest tentative distance among unvisited vertices in *minDistance
+// (LONG_MAX if there is none) and, only when one exists, stores its index in *minIndex and marks it visited.
+__global__ 
+void findMinDistance(long *allResult, int *visitedNode, int *minIndex, long *minDistance, int num_vertices) {
+	if (threadIdx.x + blockDim.x * blockIdx.x != 0) return;	// extra threads would only race on the outputs
+	long best = LONG_MAX;
+	int bestIdx = -1;
+	for (int j=0; j<num_vertices; j++) {
+		if (visitedNode[j]==0 && allResult[j]<best) { best = allResult[j]; bestIdx = j; }
+	}
+	*minDistance = best;
+	if (bestIdx >= 0) { *minIndex = bestIdx; visitedNode[bestIdx] = 1; }	// never index with a stale or -1 value
+}
+
+// Relax each unvisited vertex i through the vertex selected in *minIndex, whose distance is *minDistance.
+__global__
+void setNewDistance(long *graph, long *allResult, int *visitedNode, int *minIndex, long *minDistance, int num_vertices) {
+	int index = threadIdx.x + blockDim.x * blockIdx.x;
+	int stride = blockDim.x * gridDim.x;
+	const int pivot = *minIndex;	// loop-invariant, so read it once
+	const long base = *minDistance;
+	if (pivot < 0 || base == LONG_MAX) return;	// nothing selected: avoid indexing with -1 and LONG_MAX overflow
+	for (int i=index; i < num_vertices; i += stride) {
+		if (visitedNode[i]) continue;	// already settled
+		long candidate = graph[(size_t)i*num_vertices + pivot] + base;
+		if (candidate < allResult[i] && candidate != 0) allResult[i] = candidate;
+	}
+}
 /**
  * generate a graph with n vertices
  * @param  n number of vertices
- * @return   2D array, graph[i][j] = graph[j][i] = distance from vertex i to j
+ * @return   1D array, graph[i*n + j] = graph[j*n + i] = distance from vertex i to j
  */
+ long* create_graph(int n) {
+    // Do the size math in size_t: the int product n * n overflows for large n.
+    size_t dim = (n > 0) ? (size_t)n : 0;
+    long *graph = (long*) malloc(dim * dim * sizeof(long));
+    if (dim != 0 && graph == NULL) {
+        fprintf(stderr,"error allocating graph");
+        abort();
+    }
+    for (size_t i=0;i<dim;i++) {
+        graph[i*dim + i] = 0;
+        for (size_t j=i+1;j<dim;j++) {
+            // one rand() draw per pair, mirrored so the matrix stays symmetric
+            graph[i*dim + j] = graph[j*dim + i] = rand();
+        }
+    }
+    return graph;
+}
+// Allocate an n x n result matrix with every entry preset to INT_MAX; the caller releases it with free().
+long* create_temp(int n) {
+    size_t dim = (n > 0) ? (size_t)n : 0;   // size_t math: the int product n * n can overflow
+    long *graph = (long*) malloc(dim * dim * sizeof(long));
+    if (dim != 0 && graph == NULL) {
+        fprintf(stderr,"error allocating result matrix");
+        abort();
+    }
+    // Fill every cell, not just the upper triangle, so no entry is ever read uninitialized.
+    for (size_t k=0;k<dim*dim;k++) graph[k] = INT_MAX;
+    return graph;
+}
+/*
  long **gen_graph(int n) {
 	// alokasi memori untuk matriks yang merepresentasikan graf
 	long **result = (long **)malloc(n * sizeof(long *)); 
@@ -40,7 +122,7 @@
     }    
   
     // isi matriks dengan bilangan random
-    srand(13517122);
+    
     
     for (int i = 0; i < n; i++) {
     	for (int j = i; j < n; j++) {
@@ -54,8 +136,11 @@
     }
      
  	return result; 
-}
-
+}*/
+// The graph is stored flat as graph[i*n + j] so it can be allocated on the device as a single 1-D array.
+//-----
+//-----
+/*
 long **gen_temp(int r, int c) {
 	// alokasi memori untuk matriks yang merepresentasikan graf
 	long **result = (long **)malloc(r * sizeof(long *)); 
@@ -73,14 +158,14 @@ long **gen_temp(int r, int c) {
     }
      
  	return result; 
-}
-
+}*/
+/*
 long *dijkstra(long **graph, int n, int src) {
 
 	// output array, contains shortest distance from src to every vertices
 	long *dist = (long *) malloc (sizeof(long) * n);
 	// spt_set[i] is true if vertex i already included in the shortest path tree
-	bool spt_set[n];
+	bool *spt_set = (bool *) malloc(sizeof(bool) * n);
 
 	// initialize dist and spt_set
 	for (int i = 0; i < n; i++) {
@@ -113,14 +198,16 @@ long *dijkstra(long **graph, int n, int src) {
 			}
 		}
 	}
+	free(spt_set);
 
 	return dist;
 }
-
+*/
 /**
 * that one kernel that do it "all"
 */
-__global__ do_it(**long graph, **long result, int num_vertices) {
+/*
+__global__ void do_it(**long graph, **long result, int num_vertices) {
 	int start_idx = threadIdx.x + blockDim.x * blockIdx.x;
 	int stride = blockDim.x * gridDim.x;
 	
@@ -134,5 +221,33 @@ __global__ do_it(**long graph, **long result, int num_vertices) {
 		}
 	}
 }
+*/
+
+// Print the n x n matrix `data` to stdout: one row per line, every value followed by a space.
+void print_graph(long *data, int n) {
+    for (int row = 0; row < n; ++row) {
+        const long *cells = data + row * n;   // first element of this row
+        for (int col = 0; col < n; ++col)
+            printf("%li ", cells[col]);
+        printf("\n");
+    }
+}
+
+// Write the n x n matrix `graph` to `filename` as text: one row per line, each value followed by a space.
+// Aborts if the file cannot be opened or if closing it reports a write error.
+void write_to_txt(int n, long *const graph, const char* filename) {
+    FILE *fout = fopen(filename,"w");
+    if (fout == NULL) {
+        fprintf(stderr,"error opening output file");
+        abort();
+    }
+    for (int i=0;i<n;i++) {
+        for (int j=0;j<n;j++) fprintf(fout,"%li ",graph[(size_t)i*n + j]);
+        fprintf(fout,"\n");
+    }
+    // Close the stream so buffered output reaches the file; fclose also reports deferred write errors.
+    if (fclose(fout) != 0) { fprintf(stderr,"error writing output file"); abort(); }
+    printf("Result has been written to %s ...\n",filename);
+}
 
 #endif
\ No newline at end of file
diff --git a/src/paralel.cu b/src/paralel.cu
index de31f9fc426b5e3d62cc4d7f0e83930ed73a4cc2..f9d82c8827c065762dd0f883baa86ffc59289589 100644
--- a/src/paralel.cu
+++ b/src/paralel.cu
@@ -2,9 +2,9 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <time.h>
-#include <util.h>
 #include "dijkstra_cuda.cuh"
 
+#define THREADS_BLOCK 256
 
 static double get_micros(void) {
     struct timespec ts;
@@ -14,53 +14,125 @@ static double get_micros(void) {
 
 int main (int argc, char const *argv[]) {
     
+    // require exactly two arguments and a positive vertex count (atoi yields 0 for non-numeric input)
+    if (argc!=3 || atoi(argv[1])<=0) {
+        fprintf(stderr,"Usage: Dijkstra_CUDA num_of_node output_filename\n");
+        return EXIT_FAILURE;
+    }
     // initialization
-    int num_vertices = atoi(argv[2]);
+    srand(ID);
+    int num_vertices = atoi(argv[1]);
     double start_time, end_time, total_time;
     total_time = 0;
 
     // allocate memory in host for the graph
-    // code goes here
-    long **graph = gen_graph(num_vertices);
+    long *graph = create_graph(num_vertices);
+    
 
     // allocate memory in the host for the result matrice
-    // code goes here
-    long **result = gen_temp(num_vertices, num_vertices);
-
-    // copy graph from host to device
-    // code goes here, this might be unnecessary
-    // cudaMemCpy()
-
-
-    // start timer
-    // code goes here
-    start_time = get_micros();
-
-    // calculate the shortest paths using device
-    // code goes here
-    do_it();
-
-    // copy result array from device to host
-    // code goes here
-
-
-    // synchronize device
-    // code goes here
-
-    // free memory in device
-    /// code goes here
-
-    // stop the timer
-    // code goes here
-
-
-    // write result matrice to a file
-    // code goes here
-
-    // free result matrice
-    // code goes here
+    long *result = create_temp(num_vertices);
+    
+    // allocate memory in the host for the result array of a single source vertex
+    long *tempResult = (long *)malloc(num_vertices * sizeof(long));
+    if (tempResult == NULL) { fprintf(stderr,"error allocating host buffer\n"); return EXIT_FAILURE; }
+    for (int i=0; i<num_vertices; i++) {
+        tempResult[i] = -1;
+    }
+
+    // device pointers
+    long *gpu_graph;
+    long *gpu_result;
+    int *gpu_visitedNode;
+    long *minDistance;
+    int *minIndex;
+
+    // device allocations: each call runs only if the previous ones succeeded; status is checked after the copy
+    // allocate memory in device for the graph (size_t math: num_vertices*num_vertices can overflow int)
+    cudaError_t status = cudaMalloc((void**)&gpu_graph,(size_t)num_vertices*num_vertices*sizeof(long));
+    // allocate memory in device for the result of dijkstra
+    if (status == cudaSuccess) status = cudaMalloc((void**)&gpu_result,num_vertices*sizeof(long));
+    // allocate memory in device for the list of visited node
+    if (status == cudaSuccess) status = cudaMalloc((void**)&gpu_visitedNode,num_vertices*sizeof(int));
+    // allocate memory in device for the minimal distance used in dijkstra
+    if (status == cudaSuccess) status = cudaMalloc((void**)&minDistance,sizeof(long));
+    // allocate memory in device for the index of minDistance
+    if (status == cudaSuccess) status = cudaMalloc((void**)&minIndex,sizeof(int));
+
+    // copy data of graph from host to device
+    if (status == cudaSuccess) status = cudaMemcpy(gpu_graph,graph,(size_t)num_vertices*num_vertices*sizeof(long),cudaMemcpyHostToDevice);
+    if (status != cudaSuccess) { fprintf(stderr,"CUDA setup failed: %s\n",cudaGetErrorString(status)); return EXIT_FAILURE; }
+    // block size and number of blocks used for the per-vertex kernels
+    int blockSize = THREADS_BLOCK;
+    int numBlocks = (num_vertices + blockSize - 1) / blockSize;
+
+    // run dijkstra once per source vertex
+    for (int i=0; i<num_vertices; i++) {
+        
+        // set timer
+        start_time = get_micros();
+        
+        // initialize value for dijkstra in device
+        initValue<<<numBlocks, blockSize>>>(
+            gpu_graph,
+            gpu_result,
+            gpu_visitedNode,
+            minIndex,
+            i,
+            num_vertices);
+
+        // one selection + relaxation step for each vertex except the source
+        for (int j=1; j<num_vertices; j++) {
+            
+            // find minimal distance
+            findMinDistance<<<1,1>>>(
+                gpu_result,
+                gpu_visitedNode,
+                minIndex,
+                minDistance,
+                num_vertices);
+            
+            // update distance for each vertex if new distance < old distance
+            setNewDistance<<<numBlocks, blockSize>>>(
+                gpu_graph,
+                gpu_result,
+                gpu_visitedNode,
+                minIndex,
+                minDistance,
+                num_vertices);
+            
+            // no sync needed here: kernels launched on the same stream execute in launch order
+        }
+        // kernel launches are asynchronous: wait for them so the timer covers the actual work
+        status = cudaDeviceSynchronize();
+        end_time = get_micros();
+        if (status == cudaSuccess) status = cudaGetLastError();   // launch-configuration errors
+        // copy the result from device to host (skipped if a kernel failed)
+        if (status == cudaSuccess) status = cudaMemcpy(tempResult,gpu_result,num_vertices*sizeof(long),cudaMemcpyDeviceToHost);
+        if (status != cudaSuccess) { fprintf(stderr,"CUDA error: %s\n",cudaGetErrorString(status)); return EXIT_FAILURE; }
+        // store the copied distances as row i of the result matrix
+        for (int k=0; k<num_vertices; k++) {
+            result[i*num_vertices + k] = tempResult[k];
+        }
+
+        total_time += end_time-start_time;
+    }
+    
+    
+    write_to_txt(num_vertices,result,argv[2]);
+    
+    printf("processing time: %0.04lf us ...\n",total_time);
 
+    // free device memory allocation
+    cudaFree(gpu_graph);
+    cudaFree(gpu_result);
+    cudaFree(gpu_visitedNode);
+    cudaFree(minDistance);
+    cudaFree(minIndex);
 
+    // free host memory allocation
+    free(tempResult);
+    free(graph);
+    free(result);
 
-    return 0;
+    return EXIT_SUCCESS;
 }
\ No newline at end of file