# binaries
bin/*
# scripts
connect
push
serial: ./src/serial.c ./src/dijkstra.* ./src/util.*
	gcc ./src/serial.c ./src/dijkstra.c ./src/util.c -o ./bin/serial
	./bin/serial $(n)

parallel: ./src/paralel.c ./src/util.* ./src/dijkstra.*
	mpicc ./src/paralel.c ./src/util.c ./src/dijkstra.c -o ./bin/parallel
	mpirun -np $(np) --hostfile mpi_hostfile ./bin/parallel $(np) $(nv)

run_par: ./bin/parallel
	mpirun -np $(np) --hostfile mpi_hostfile ./bin/parallel $(np) $(nv)

hello_omp: ./src/hello_openmp.c
	gcc -g -Wall -fopenmp -o ./bin/hello_omp ./src/hello_openmp.c
	./bin/hello_omp $(nt)

parallel_omp: ./src/paralel_openmp.c
	gcc -g -Wall -fopenmp -o ./bin/parallel_omp ./src/paralel_openmp.c ./src/util.c ./src/dijkstra.c
	./bin/parallel_omp $(nt) $(nv)

parallel_cuda: ./src/paralel.cu
	nvcc ./src/paralel.cu -o ./bin/parallel_cuda
	./bin/parallel_cuda $(nt) $(nv)
<!-- # Announcement
Some of the files that must be present in the repository include:
* The src directory, containing the source code you wrote.
* A README.md file containing:
  * Instructions for using the program.
  * Task division. Present it as a list of work items per student. For example: XXXX worked on functions YYYY, ZZZZ, and YYZZ.
  * A work report, structured according to the description in the previous section. -->
# IF3230 Dijkstra CUDA Assignment
## Usage Instructions
From the root directory, compile the programs with the Makefile:
* To compile and run the serial version of Dijkstra:
  `make serial`
  `bin/serial [number of vertices]`
* To compile and run the parallel (CUDA) version of Dijkstra:
  `make parallel_cuda nt=[number of threads] nv=[number of vertices]`
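For example, `make parallel_cuda nt=256 nv=1000` compiles `src/paralel.cu` with nvcc and then runs `./bin/parallel_cuda` with those two arguments, as wired up in the Makefile above (256 is the block size used in all of the tests reported below).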
## Task Division
* 13517035 - Hilmi Naufal Yafie : parallel Dijkstra (CUDA), report, exploration
* 13517122 - M. ALgah Fattah I. : parallel Dijkstra (CUDA), report, exploration
## Work Report
### Description of the Parallel Solution
Our parallel solution parallelizes the execution of the Dijkstra algorithm as a whole. As is well known, Dijkstra's algorithm finds the shortest distances from one node to every other node. Since the given problem asks for the distances from every node to every other node, our parallelization has each thread run Dijkstra from a different source vertex and then write the resulting distances into the corresponding row of the matrix that represents the final result.

For example, suppose there are 3 worker threads and 3 nodes in the graph being processed: the work is split so that the first thread processes node A, the second thread node B, and the third thread node C. Given a final matrix `result` that stores the end result, the first thread writes to the first row of `result`, which represents the distances from node A to the other nodes, and so on. A minimal sketch of this mapping is given below.
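A minimal sketch of the row-per-source mapping, assuming `graph`, `result`, and `n` as set up in `src/paralel_openmp.c` (which uses the `dijkstra` and `gen_temp` helpers from `src`):

```c
// One worker per source vertex; each worker owns one row of `result`,
// so no synchronization between rows is needed.
#pragma omp parallel for
for (int src = 0; src < n; src++) {
    long *dist = dijkstra(graph, n, src); // distances from src to all vertices
    for (int j = 0; j < n; j++) {
        result[src][j] = dist[j];
    }
    free(dist); // dijkstra() returns a malloc'd array
}
```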
### Solution Analysis
In the context of parallelizing Dijkstra's algorithm, our view is that when the computational load is distributed across threads executed by the GPU cores, the total time needed to compute all of the distances becomes shorter.
### Test Results
Below are the results of our tests with 100, 500, 1000, and 3000 nodes, for both serial Dijkstra and parallel Dijkstra (in microseconds):
* **Serial Dijkstra**

| N | Run 1 | Run 2 | Run 3 | Average |
| ---- | ----------: | ----------: | ----------: | ----------: |
| 100 | 20.1025 | 20.0155 | 18.46475 | 19.52758 |
| 500 | 2537.57925 | 1734.28625 | 1661.24725 | 1977.70425 |
| 1000 | 13798.4025 | 13938.675 | 15880.503 | 14539.1935 |
| 3000 | 762352.86125 | 663569.772 | 790499.578 | 738807.40375 |
* **Parallel Dijkstra (CUDA)**

| N | Run 1 | Run 2 | Run 3 | Average |
| ---- | ----------: | ----------: | ----------: | ----------: |
| 100 | 86.368 | 58.231 | 39.25525 | 61.28475 |
| 500 | 1115.20125 | 941.9165 | 943.5145 | 1000.21075 |
| 1000 | 4191.5325 | 6223.825 | 7646.94175 | 6020.76641666 |
| 3000 | 432189.653 | 438795.246 | 441382.9855 | 437455.9615 |

**For every test case, block size = 256 threads.**
### Test Analysis
Across all of the runs, the parallel program was consistently faster than the serial one. This is expected: parallelization makes better use of the available resources, and with more than one thread the work finishes sooner. For N = 100, however, the parallel timings are larger and visibly unstable. This is because the measured time also includes the latency of dispatching kernel launches to the GPU server, so when a launch takes longer, the measured time grows with it.
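One way to keep launch latency from dominating the measurement is to time with CUDA events, which are recorded on the device stream itself. A minimal sketch, assuming the kernel and device buffers from `src/paralel.cu`:

```c
// Hedged sketch: time device work with CUDA events instead of host clocks.
cudaEvent_t t0, t1;
cudaEventCreate(&t0);
cudaEventCreate(&t1);
cudaEventRecord(t0);
findAndSetNewDistance<<<numBlocks, blockSize>>>(gpu_graph, gpu_result,
        gpu_visitedNode, minIndex, minDistance, num_vertices);
cudaEventRecord(t1);
cudaEventSynchronize(t1);                  // wait until the kernel has finished
float elapsed_ms = 0.0f;
cudaEventElapsedTime(&elapsed_ms, t0, t1); // elapsed device time in milliseconds
```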
#ifndef _BOOLEAN_h
#define _BOOLEAN_h
#define bool unsigned char
#define true 1
#define false 0
#endif
#include "dijkstra.h"
/**
 * Get the index of the vertex with minimum distance that is not yet
 * included in spt_set
 * @param dist distances from the origin vertex
 * @param spt_set set denoting vertices already included in the shortest-path tree
 * @param n number of vertices in the graph
 * @return index of the minimum-distance vertex not yet in spt_set
 */
int min_distance_idx(long dist[], bool spt_set[], int n) {
    // initialize the minimum; dist holds longs, so track it as a long
    long min = LONG_MAX;
    int min_index = -1;
    for (int i = 0; i < n; i++) {
        if (spt_set[i] == false && dist[i] <= min) {
            min = dist[i];
            min_index = i;
        }
    }
    return min_index;
}
/**
 * generate a graph with n vertices
 * @param n number of vertices
 * @return 2D array, graph[i][j] = graph[j][i] = distance from vertex i to j
 */
long **gen_graph(int n) {
    // allocate memory for the matrix representing the graph
    long **result = (long **)malloc(n * sizeof(long *));
    for (int i = 0; i < n; i++) {
        result[i] = (long *)malloc(n * sizeof(long));
    }
    // fill the matrix with random numbers (fixed seed, so every run and every
    // process generates the same graph)
    srand(13517122);
    for (int i = 0; i < n; i++) {
        for (int j = i; j < n; j++) {
            if (i == j) {
                result[i][j] = 0;
            } else {
                result[i][j] = result[j][i] = rand();
            }
        }
    }
    return result;
}

long **gen_temp(int r, int c) {
    // allocate memory for an r x c matrix
    long **result = (long **)malloc(r * sizeof(long *));
    for (int i = 0; i < r; i++) {
        result[i] = (long *)malloc(c * sizeof(long));
    }
    // zero-fill the matrix
    for (int i = 0; i < r; i++) {
        for (int j = 0; j < c; j++) {
            result[i][j] = 0;
        }
    }
    return result;
}
long *dijkstra(long **graph, int n, int src) {
    // output array: shortest distance from src to every vertex
    long *dist = (long *) malloc(sizeof(long) * n);
    // spt_set[i] is true if vertex i is already in the shortest-path tree
    bool spt_set[n];
    // initialize dist and spt_set
    for (int i = 0; i < n; i++) {
        dist[i] = INT_MAX;
        spt_set[i] = false;
    }
    // the distance from the source to itself is zero
    dist[src] = 0;
    // find the shortest path for all vertices
    for (int i = 0; i < n; i++) {
        // pick the unprocessed vertex with minimum distance from src
        int processed_vertex = min_distance_idx(dist, spt_set, n);
        // mark the vertex as processed
        spt_set[processed_vertex] = true;
        for (int j = 0; j < n; j++) {
            // relax vertices adjacent to processed_vertex that are not yet processed
            if (!spt_set[j]
                    && graph[processed_vertex][j] != 0
                    && dist[processed_vertex] != INT_MAX
                    && dist[processed_vertex] + graph[processed_vertex][j] < dist[j]) {
                dist[j] = dist[processed_vertex] + graph[processed_vertex][j];
            }
        }
    }
    return dist;
}
#ifndef DIJKSTRA_H
#define DIJKSTRA_H
#include <stdlib.h>
#include <stdio.h>
#include <limits.h>
#include "boolean.h"
/**
 * Get the index of the vertex with minimum distance that is not yet
 * included in spt_set
 * @param dist distances from the origin vertex
 * @param spt_set set denoting vertices already included in the shortest-path tree
 * @param n number of vertices in the graph
 * @return index of the minimum-distance vertex not yet in spt_set
 */
int min_distance_idx(long dist[], bool spt_set[], int n);

/**
 * generate a graph with n vertices
 * @param n number of vertices
 * @return 2D array, graph[i][j] = graph[j][i] = distance from vertex i to j
 */
long **gen_graph(int n);

/**
 * generate a 2D array with dimensions r x c
 * @param r number of rows
 * @param c number of columns
 * @return 2D array, all entries filled with zero
 */
long **gen_temp(int r, int c);

/**
 * run Dijkstra's algorithm from a single source vertex
 * @param graph adjacency matrix, graph[i][j] = distance from vertex i to j
 * @param n number of vertices in the graph
 * @param src index of the source vertex
 * @return array of length n, shortest distance from src to every vertex
 */
long *dijkstra(long **graph, int n, int src);

#endif
#ifndef __DIJKSTRA_CUDA__
#define __DIJKSTRA_CUDA__
#include <stdlib.h>
#include <stdio.h>
#include <limits.h>
#define ID 13517122
// for the dijkstra algorithm: initialize the tentative distances from the
// source vertex with a grid-stride loop, one array element per thread
__global__
void initValue(long *graph, long *allResult, int *visitedNode, int *minIndex, int sourceIdx, int num_vertices) {
    int index = threadIdx.x + blockDim.x * blockIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < num_vertices; i += stride) {
        visitedNode[i] = 0;
        // vertices with no direct edge to the source start at "infinity"
        if ((graph[i*num_vertices + sourceIdx] == 0) && i != sourceIdx) {
            allResult[i] = LONG_MAX;
        } else {
            allResult[i] = graph[i*num_vertices + sourceIdx];
        }
    }
    // every thread writes the same values here, which is harmless
    *minIndex = -1;
    visitedNode[sourceIdx] = 1;
}
__global__
void findAndSetNewDistance(long *graph, long *allResult, int *visitedNode, int *minIndex, long *minDistance, int num_vertices) {
    // every thread redundantly scans for the unvisited vertex with the
    // smallest tentative distance (they all compute the same result)
    *minDistance = LONG_MAX;
    for (int j = 0; j < num_vertices; j++) {
        if (visitedNode[j] == 0 && allResult[j] < *minDistance) {
            *minDistance = allResult[j];
            *minIndex = j;
        }
    }
    __syncthreads(); // synchronize the threads within this block
    visitedNode[*minIndex] = 1;
    // grid-stride loop: relax the distance of each unvisited vertex
    int index = threadIdx.x + blockDim.x * blockIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < num_vertices; i += stride) {
        if (visitedNode[i]) {
            continue;
        } else if ((graph[i*num_vertices + *minIndex] + *minDistance < allResult[i])
                && (graph[i*num_vertices + *minIndex] + *minDistance != 0)) {
            allResult[i] = graph[i*num_vertices + *minIndex] + *minDistance;
        }
    }
}
/**
 * generate a graph with n vertices
 * @param n number of vertices
 * @return 1D array, graph[i*n + j] = graph[j*n + i] = distance from vertex i to j
 */
long* create_graph(int n) {
    // the graph is stored as a flat 1D array (graph[i*n + j]) so that it can
    // be allocated on the device with a single cudaMalloc
    int i, j;
    long *graph = (long*) malloc(n * n * sizeof(long));
    for (i = 0; i < n; i++) {
        for (j = i; j < n; j++) {
            if (i == j) {
                graph[i*n + j] = 0;
            } else {
                graph[i*n + j] = rand();
                graph[j*n + i] = graph[i*n + j];
            }
        }
    }
    return graph;
}

/**
 * generate an n x n result matrix with every entry set to INT_MAX
 */
long* create_temp(int n) {
    int i, j;
    long *graph = (long*) malloc(n * n * sizeof(long));
    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) { // fill every entry, not only the upper triangle
            graph[i*n + j] = INT_MAX;
        }
    }
    return graph;
}
void print_graph(long *data, int n) {
    int i, j;
    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) {
            printf("%li ", data[i*n + j]);
        }
        printf("\n");
    }
}

void write_to_txt(int n, long *const graph, const char* filename) {
    FILE *fout;
    int i, j;
    if (NULL == (fout = fopen(filename, "w"))) {
        fprintf(stderr, "error opening output file\n");
        abort();
    }
    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) {
            fprintf(fout, "%li ", graph[i*n + j]);
        }
        fprintf(fout, "\n");
    }
    fclose(fout); // flush and close the output file
    printf("Result has been written to %s ...\n", filename);
}

#endif
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[]) {
    int nthreads = atoi(argv[1]), t_id;
    int i;
    omp_set_num_threads(nthreads); // honor the requested thread count
    #pragma omp parallel for private(t_id)
    for (i = 0; i < 10; i++) {
        t_id = omp_get_thread_num(); // get the thread id for each thread
        printf("i : %d, by the way i'm thread %d\n", i, t_id);
    }
    return 0;
}
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(int argc, char *argv[]) {
    int numtasks, rank;
    char processor_name[MPI_MAX_PROCESSOR_NAME];
    int name_len;
    int arr_size = 3;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Get_processor_name(processor_name, &name_len);
    int *result_arr = (int *) malloc(arr_size * numtasks * sizeof(int));
    int *local_arr = (int *) malloc(arr_size * sizeof(int));
    for (int i = 0; i < arr_size; i++) {
        local_arr[i] = i * rank * 3;
    }
    if (rank == 0) {
        int *temp = (int *) malloc(arr_size * sizeof(int));
        // initiate result_arr with rank 0's own array
        for (int i = 0; i < arr_size; i++) {
            result_arr[i] = local_arr[i];
        }
        // receive an array from every other node
        for (int i = 1; i < numtasks; i++) {
            MPI_Recv(temp,
                     arr_size,        // count is in elements, not bytes
                     MPI_INT,
                     i,
                     0,
                     MPI_COMM_WORLD,
                     MPI_STATUS_IGNORE);
            // copy the received array into its slot in result_arr
            memcpy(result_arr + (arr_size * i),
                   temp,
                   arr_size * sizeof(int));
        }
        free(temp);
    } else {
        MPI_Send(local_arr,
                 arr_size,            // count is in elements, not bytes
                 MPI_INT,
                 0,
                 0,
                 MPI_COMM_WORLD);
    }
    if (rank == 0) {
        printf("here is the array you ordered\n");
        for (int i = 0; i < arr_size * numtasks; i++) {
            printf("%d ", result_arr[i]);
        }
        printf("\n");
    }
    free(local_arr);
    free(result_arr);
    MPI_Finalize();
}
#include "dijkstra.h"
#include "util.h"
#include <stdio.h>
#include <mpi.h>
#include <math.h>
#include <string.h>
int main(int argc, char *argv[])
{
int numprocs, rank;
char processor_name[MPI_MAX_PROCESSOR_NAME];
int name_len;
int np = atoi(argv[1]);
int numvertices = atoi(argv[2]);
int chunk_size = ceil(numvertices/np);
// printf("chunk_size : %d\n", chunk_size);
// printf("np : %d\n", np);
// printf("numvertices : %d\n", numvertices);
MPI_Status Stat;
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Get_processor_name(processor_name, &name_len);
double start_time, finish_time;
// generate graph
// printf("about to generate our graph\n");
long **graph = gen_graph(numvertices);
// start the timer
MPI_Barrier(MPI_COMM_WORLD);
start_time = MPI_Wtime();
// set chunk size and local src
// int my_first_src = rank * ceil(numvertices/numprocs);
// int my_last_src = my_first_src + chunk_size;
int my_first_src = rank * chunk_size;
// int my_last_src = if (my_first_src+chunk_size < numvertices)? my_first_src+chunk_size:;
if (rank == 0) {
printf("chunk_size : %d\n", chunk_size);
}
printf("i'm node %d, my_first_src: %d\n", rank, my_first_src);
// printf("i'm node %d, my_last_src: %d\n", rank, my_last_src);
// long **my_results = (long **) malloc(chunk_size * sizeof(long*));
// allocate 2D array for local result
// printf("initiate my result\n");
long **my_results = gen_temp(chunk_size, numvertices);
// printf("done initiating my result\n");
// find shortest path from each src
for (int i = 0; i+my_first_src < numvertices && i < chunk_size; i++) {
// printf("i'm node %d and currently working on row %d \n", rank, i+my_first_src);
long *dist = dijkstra(graph, numvertices, i+my_first_src);
my_results[i] = dist;
}
// printf("hi i'm node %d and i'm done searching\n", rank);
// TBD gathering data from these processes
if ( rank == 0 ) { // gather data from other nodes
long **result = gen_temp(numvertices, numvertices);
// printf("generated empty result matrix\n");
// long **temp = gen_temp(chunk_size, numvertices);
long *temp = (long*) malloc(numvertices * sizeof(long));
// printf("generated temp\n");
memcpy(result,
my_results,
chunk_size*sizeof(long*));
printf("copying temp to result\n");
for (int i = 1; i < numprocs; i++) {
printf("about to receive local result from node %d\n", i);
for (int j = 0; j < chunk_size; j++) { // loop buat nerima tiap baris dari node lain
MPI_Recv(temp,
numvertices*sizeof(long),
MPI_LONG,
i,
0,
MPI_COMM_WORLD,
MPI_STATUS_IGNORE);
// for (int i=0;i<numvertices;i++) {
// printf("%d ", temp[i]);
// }
// printf("\n");
// printf("alamat result yang mau ditulis: result+ %d\n", (i*chunk_size)+(j*numvertices));
// memcpy(result+((i*chunk_size)+(j*numvertices)),
// temp,
// chunk_size*sizeof(long));
for (int k = 0; k < numvertices; k++) {
result[i*chunk_size+j][k] = temp[k];
}
}
printf("done receiving local result from node %d\n", i);
// for (int i =0; i < chunk_size; i++) {
// for (int j= 0 ; j < numvertices; j++){
// printf("%ld ", temp[i][j]);
// }
// printf("\n");
// }
printf("copying local result from node %d to result\n", i);
}
// for (int i=0; i < numvertices; i++) {
// printf("row %d\n", i);
// for (int j = 0; j < numvertices; j++) {
// printf("%d ", result[i][j]);
// }
// printf("\n");
// }
//
char filename[20];
sprintf(filename, "./output_parallel_%d", numvertices);
printf("about to write output file\n");
write_result(result, numvertices, filename);
printf("done writing\n");
//free result (2d arr)
for(int i = 0; i < numvertices; i++){
free(result[i]);
}
printf("freeing result\n");
//free temp (1d arr)
free(temp);
printf("freeing temp\n");
} else { // send my_results to master node
printf("i'm node %d and i'm going to send my result to master\n", rank);
for(int i = 0; i < chunk_size; i++) {
MPI_Send(my_results[i],
chunk_size*sizeof(long),
MPI_LONG,
0,
0,
MPI_COMM_WORLD
);
}
printf("i'm node %d and i'm just sent my result to master\n", rank);
}
// printf("I'm process %d and the distance from vertex 1 to 2 is %d\n",
// rank,
// graph[0][1]);
//
MPI_Barrier(MPI_COMM_WORLD);
finish_time = MPI_Wtime();
if (rank == 0) {
printf("elapsed time : %.lf ms\n", (finish_time-start_time)*1000000);
}
//free my_results (2d arr)
free(my_results);
printf("freed my_results\n");
//free graph (2d arr)
free(graph);
printf("freed graph\n");
MPI_Finalize();
return 0;
}
\ No newline at end of file
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include "dijkstra_cuda.cuh"
#define THREADS_BLOCK 256

// host timer: wall-clock time in microseconds
static double get_micros(void) {
    struct timespec ts;
    timespec_get(&ts, TIME_UTC);
    return ((double)((long)ts.tv_sec * 1000000000L + ts.tv_nsec) / 1000);
}
int main(int argc, char const *argv[]) {
    // check that both arguments are present
    if (argc != 3) {
        fprintf(stderr, "Usage: Dijkstra_CUDA num_of_node output_filename\n");
        return EXIT_FAILURE;
    }
    // initialization
    srand(ID);
    int num_vertices = atoi(argv[1]);
    double start_time, end_time, total_time;
    total_time = 0;
    // allocate memory in the host for the graph
    long *graph = create_graph(num_vertices);
    // allocate memory in the host for the result matrix
    long *result = create_temp(num_vertices);
    // allocate memory in the host for the result array of a single source vertex
    long *tempResult = (long *)malloc(num_vertices * sizeof(long));
    for (int i = 0; i < num_vertices; i++) {
        tempResult[i] = -1;
    }
    // device pointers
    long *gpu_graph;
    long *gpu_result;
    int *gpu_visitedNode;
    long *minDistance;
    int *minIndex;
    // allocate memory in the device for the graph
    cudaMalloc((void**)&gpu_graph, num_vertices * num_vertices * sizeof(long));
    // allocate memory in the device for the result of dijkstra
    cudaMalloc((void**)&gpu_result, num_vertices * sizeof(long));
    // allocate memory in the device for the list of visited nodes
    cudaMalloc((void**)&gpu_visitedNode, num_vertices * sizeof(int));
    // allocate memory in the device for the minimal distance used in dijkstra
    cudaMalloc((void**)&minDistance, sizeof(long));
    // allocate memory in the device for the index of minDistance
    cudaMalloc((void**)&minIndex, sizeof(int));
    // copy the graph from host to device
    cudaMemcpy(gpu_graph, graph, num_vertices * num_vertices * sizeof(long), cudaMemcpyHostToDevice);
    // block size and number of blocks that will be used on the device
    int blockSize = THREADS_BLOCK;
    int numBlocks = (num_vertices + blockSize - 1) / blockSize;
    // run the dijkstra algorithm for each source vertex
    for (int i = 0; i < num_vertices; i++) {
        // initialize values for dijkstra on the device
        initValue<<<numBlocks, blockSize>>>(
            gpu_graph,
            gpu_result,
            gpu_visitedNode,
            minIndex,
            i,
            num_vertices);
        // start the timer
        start_time = get_micros();
        // for each vertex except the current source vertex
        for (int j = 1; j < num_vertices; j++) {
            // find the minimal distance and update each vertex whose new
            // distance is smaller than its old one (an earlier version used a
            // separate findMinDistance kernel; it was merged into this one)
            findAndSetNewDistance<<<numBlocks, blockSize>>>(
                gpu_graph,
                gpu_result,
                gpu_visitedNode,
                minIndex,
                minDistance,
                num_vertices);
        }
        // stop the timer (kernel launches are asynchronous, so launch latency
        // is part of the measured time; see the analysis in the README)
        end_time = get_micros();
        // copy the result from device to host
        cudaMemcpy(tempResult, gpu_result, num_vertices * sizeof(long), cudaMemcpyDeviceToHost);
        // copy the row into the result matrix
        for (int k = 0; k < num_vertices; k++) {
            result[i*num_vertices + k] = tempResult[k];
        }
        total_time += end_time - start_time;
    }
    write_to_txt(num_vertices, result, argv[2]);
    printf("processing time: %0.04lf us ...\n", total_time);
    // free device memory
    cudaFree(gpu_graph);
    cudaFree(gpu_result);
    cudaFree(gpu_visitedNode);
    cudaFree(minDistance);
    cudaFree(minIndex);
    // free host memory
    free(tempResult);
    free(graph);
    free(result);
    return EXIT_SUCCESS;
}
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "util.h"
#include "dijkstra.h"
#include "boolean.h"

// host timer: wall-clock time in microseconds
static double get_micros(void) {
    struct timespec ts;
    timespec_get(&ts, TIME_UTC);
    return ((double)((long)ts.tv_sec * 1000000000L + ts.tv_nsec) / 1000);
}

int main(int argc, char const *argv[])
{
    int thread_count = strtol(argv[1], NULL, 10);
    int num_vertices = atoi(argv[2]);
    // time
    double start_time, end_time, total_time;
    // generate the graph and the result matrix
    long **result = gen_temp(num_vertices, num_vertices);
    long **graph = gen_graph(num_vertices);
    // honor the requested thread count
    omp_set_num_threads(thread_count);
    // start the timer
    start_time = get_micros();
    // share the work among the threads: one source vertex per iteration
    #pragma omp parallel for
    for (int i = 0; i < num_vertices; i++) {
        // shortest paths from vertex i; each thread writes only its own rows
        // of result, so no critical section is needed
        long *dist = dijkstra(graph, num_vertices, i);
        for (int j = 0; j < num_vertices; j++) {
            result[i][j] = dist[j];
        }
        free(dist);
    }
    // stop the timer once every thread is done (the parallel for joins here)
    end_time = get_micros();
    total_time = end_time - start_time;
    char filename[32];
    sprintf(filename, "./output_parallel_%d", num_vertices);
    printf("about to write output file\n");
    write_result(result, num_vertices, filename);
    printf("done writing\n");
    printf("processing time: %0.04lf us ...\n", total_time);
    return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "dijkstra.h"
#include "util.h"

int main(int argc, char const *argv[])
{
    clock_t start, end;
    if (argc > 1) {
        int n = atoi(argv[1]);
        printf("n: %d\n", n);
        // start the timer
        start = clock();
        // generate the graph
        long **graph = gen_graph(n);
        // result matrix
        long **result = gen_temp(n, n);
        for (int i = 0; i < n; i++) {
            long *dist = dijkstra(graph, n, i);
            free(result[i]); // replace the zero-filled row from gen_temp
            result[i] = dist;
        }
        // stop the timer
        end = clock();
        // elapsed time
        float t = (float)(end - start) / CLOCKS_PER_SEC;
        printf("Elapsed time (in microseconds): %f\n", t * 1000000);
        char filename[32];
        sprintf(filename, "./output_serial_%d", n);
        write_result(result, n, filename);
        // free the result matrix and the graph
        for (int i = 0; i < n; i++) {
            free(result[i]);
            free(graph[i]);
        }
        free(result);
        free(graph);
    } else {
        printf("usage : serial [n]\n");
    }
    return 0;
}
#include "dijkstra.h"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
int main(int argc, char const *argv[])
{
long **result = gen_temp(10,5);
long **first = gen_temp(5,5);
long **second = gen_temp(5,5);
for (int i = 0; i < 5; i++) {
for (int j = 0; j < 5; j++) {
first[i][j] = 1 * i * j;
second[i][j] = 2 * i * j;
}
}
memcpy(result,
first,
5 * sizeof(int*)
);
memcpy(result+5,
second,
5 * sizeof(int*)
);
for (int i =0; i < 10; i++) {
for (int j =0; j < 5; j++) {
printf("%d ", result[i][j]);
}
printf("\n");
}
return 0;
}
\ No newline at end of file
#include "util.h"
/**
* write matrix of shortest distance from vertex i to j
* @param graph 2D array
*/
void write_result(long **m, int n, char filename[]) {
FILE *outfile;
printf("here we go, writing file again\n");
outfile = fopen(filename,"w");
printf("file %s opened\n", filename);
if (outfile == NULL) {
printf("Error!\n");
exit(1);
}
printf("Writing output...\n");
for (int i = 0; i < n; i++) {
// printf("about to write row %d\n", i);
for (int j = 0; j < n; j++) {
fprintf(outfile, "%ld ", m[i][j]);
}
// printf("just write row %d\n", i);
fprintf(outfile, "\n");
}
printf("Done.\n");
}
\ No newline at end of file
#ifndef UTIL_H
#define UTIL_H
#include <stdio.h>
#include <stdlib.h>
/**
 * write the matrix of shortest distances from vertex i to j
 * @param m 2D array, m[i][j] = m[j][i] = shortest distance from vertex i to j
 * @param n dimension of matrix m
 * @param filename path of the output file
 */
void write_result(long **m, int n, char filename[]);

#endif