diff --git a/src/dijkstra_cuda.cuh b/src/dijkstra_cuda.cuh
index ff4602e56867b2082d3f32ba31fbb0c8474c4ec4..9a4a53c1545a4a4dc2dd42f69b1c31ea46856179 100644
--- a/src/dijkstra_cuda.cuh
+++ b/src/dijkstra_cuda.cuh
@@ -4,6 +4,8 @@
 #include <stdlib.h>
 #include <limits.h>
 
+#define ID 13517122
+
 /**
  * Get vertex index with minimum distance which not yet included
  * in spt_set
@@ -11,10 +13,10 @@
  * @param spt_set a set denoting vertices included in spt_set
  * @param n number of vertices in the graph
  * @return index of minimum distance not yet included in spt_set
- */
- int min_distance_idx(long dist[], bool spt_set[], int n) {
+ *//*
+long min_distance_idx(long *dist, bool *spt_set, int n) {
     // Initialize min value
-    int min = INT_MAX, min_index;
+    long min = LONG_MAX, min_index;
 
     for (int i = 0; i < n; i++) {
         if (spt_set[i] == false && dist[i] <= min) {
@@ -22,16 +24,183 @@
             min_index = i;
         }
     }
-
-
+    spt_set[min_index] = true;
+    dist[src] = 0;
+
     return min_index;
 }
+*/
+// Device kernels for the single-source Dijkstra passes driven by main().
+
+/**
+ * Reset the per-source state before a Dijkstra pass.
+ *
+ * Every vertex i gets visitedNode[i] = 0 (1 for the source) and
+ * allResult[i] = graph[i*num_vertices + sourceIdx]; a zero entry off the
+ * diagonal is read as "no direct edge" and stored as LONG_MAX.
+ * Works with any 1D launch configuration (grid-stride loop).
+ *
+ * @param graph device pointer, num_vertices x num_vertices row-major weights
+ * @param allResult device pointer, num_vertices tentative distances (output)
+ * @param visitedNode device pointer, num_vertices settled flags (output)
+ * @param minIndex device pointer to one int, reset to -1
+ * @param sourceIdx source vertex of this pass
+ * @param num_vertices number of vertices in the graph
+ */
+__global__
+void initValue(long *graph, long *allResult, int *visitedNode, int *minIndex, int sourceIdx, int num_vertices) {
+    int index = threadIdx.x + blockDim.x * blockIdx.x;
+    int stride = blockDim.x * gridDim.x;
+
+    for (int i = index; i < num_vertices; i += stride) {
+        long weight = graph[(size_t)i * num_vertices + sourceIdx];
+
+        // The thread that owns slot i is its only writer. Marking the source
+        // after the loop (as before) raced with the thread clearing that slot.
+        visitedNode[i] = (i == sourceIdx) ? 1 : 0;
+        allResult[i] = (weight == 0 && i != sourceIdx) ? LONG_MAX : weight;
+    }
+
+    // A single thread resets the shared scalar.
+    if (index == 0) {
+        *minIndex = -1;
+    }
+}
+
+/**
+ * Pick the unvisited vertex with the smallest tentative distance.
+ *
+ * The scan is serial, so only thread 0 of block 0 does any work; launch
+ * with <<<1,1>>>. Writes the winner to *minIndex / *minDistance and marks
+ * it visited. When no unvisited vertex has a finite distance, *minIndex
+ * is set to -1 and *minDistance to LONG_MAX.
+ *
+ * @param allResult device pointer, num_vertices tentative distances
+ * @param visitedNode device pointer, num_vertices settled flags (updated)
+ * @param minIndex device pointer to one int (output)
+ * @param minDistance device pointer to one long (output)
+ * @param num_vertices number of vertices in the graph
+ */
+__global__
+void findMinDistance(long *allResult, int *visitedNode, int *minIndex, long *minDistance, int num_vertices) {
+    if (blockIdx.x != 0 || threadIdx.x != 0) {
+        return;
+    }
+
+    // Scan in registers and publish once instead of updating global memory
+    // on every comparison.
+    long best = LONG_MAX;
+    int bestIdx = -1;
+
+    for (int j = 0; j < num_vertices; j++) {
+        if (visitedNode[j] == 0 && allResult[j] < best) {
+            best = allResult[j];
+            bestIdx = j;
+        }
+    }
+
+    *minDistance = best;
+    *minIndex = bestIdx;
+
+    // Previously a stale (or -1) index was marked here when nothing was found.
+    if (bestIdx >= 0) {
+        visitedNode[bestIdx] = 1;
+    }
+}
+
+/**
+ * Relax every unvisited vertex through the vertex chosen by findMinDistance.
+ *
+ * Does nothing when *minIndex is negative. A zero weight is read as
+ * "no edge", matching initValue. Weights are assumed non-negative.
+ * Works with any 1D launch configuration (grid-stride loop).
+ *
+ * @param graph device pointer, num_vertices x num_vertices row-major weights
+ * @param allResult device pointer, num_vertices tentative distances (updated)
+ * @param visitedNode device pointer, num_vertices settled flags
+ * @param minIndex device pointer to the index written by findMinDistance
+ * @param minDistance device pointer to the distance written by findMinDistance
+ * @param num_vertices number of vertices in the graph
+ */
+__global__
+void setNewDistance(long *graph, long *allResult, int *visitedNode, int *minIndex, long *minDistance, int num_vertices) {
+    int index = threadIdx.x + blockDim.x * blockIdx.x;
+    int stride = blockDim.x * gridDim.x;
+    int pivot = *minIndex;
+    long pivotDistance = *minDistance;
+
+    if (pivot < 0) {
+        return;
+    }
+
+    for (int i = index; i < num_vertices; i += stride) {
+        if (visitedNode[i]) {
+            continue;
+        }
+
+        long weight = graph[(size_t)i * num_vertices + pivot];
+        // Skip missing edges and sums that would overflow long.
+        if (weight == 0 || weight > LONG_MAX - pivotDistance) {
+            continue;
+        }
+
+        long candidate = weight + pivotDistance;
+        if (candidate < allResult[i]) {
+            allResult[i] = candidate;
+        }
+    }
+}
 
 /**
  * generate a graph with n vertices
  * @param n number of vertices
- * @return 2D array, graph[i][j] = graph[j][i] = distance from vertex i to j
+ * @return 1D array, graph[i*n + j] = graph[j*n + i] = distance from vertex i to j,
+ *         or NULL when the allocation fails
  */
+long* create_graph(int n) {
+    int i,j;
+    long *graph = (long*) malloc((size_t)n * n * sizeof(long));
+
+    if (graph == NULL) {
+        return NULL;
+    }
+
+    for (i=0;i<n;i++) {
+        for (j=i;j<n;j++) {
+            if (i==j) {
+                graph[(size_t)i*n + j] = 0;
+            }
+            else {
+                graph[(size_t)i*n + j] = rand();
+                graph[(size_t)j*n + i] = graph[(size_t)i*n + j];
+            }
+        }
+    }
+    return graph;
+}
+//-----
+/**
+ * allocate an n x n result matrix with every entry set to INT_MAX
+ * @param n number of vertices
+ * @return 1D array of n*n entries, or NULL when the allocation fails
+ */
+long* create_temp(int n) {
+    int i,j;
+    long *graph = (long*) malloc((size_t)n * n * sizeof(long));
+
+    if (graph == NULL) {
+        return NULL;
+    }
+
+    // fill the whole matrix; starting j at i left the lower triangle uninitialized
+    for (i=0;i<n;i++) {
+        for (j=0;j<n;j++) {
+            graph[(size_t)i*n + j] = INT_MAX;
+        }
+    }
+    return graph;
+}
+/*
 long **gen_graph(int n) {
     // alokasi memori untuk matriks yang merepresentasikan graf
     long **result = (long **)malloc(n * sizeof(long *));
@@ -40,7 +209,7 @@
     }
 
     // isi matriks dengan bilangan random
-    srand(13517122);
+
 
     for (int i = 0; i < n; i++) {
         for (int j = i; j < n; j++) {
@@ -54,8 +223,11 @@
     }
 
     return result;
-}
-
+}*/
+// the graph is stored as a flat array, graph[i*n + j], so it can be copied to the device as one 1D buffer
+//-----
+//-----
+/*
 long **gen_temp(int r, int c) {
     // alokasi memori untuk matriks yang merepresentasikan graf
     long **result = (long **)malloc(r * sizeof(long *));
@@ -73,14 +245,14 @@ long **gen_temp(int r, int c) {
     }
 
     return result;
-}
-
+}*/
+/*
 long *dijkstra(long **graph, int n, int src) {
     // output array, contains shortest distance from src to every vertices
     long *dist = (long *) malloc (sizeof(long) * n);
 
     // spt_set[i] is true if vertex i already included in the shortest path tree
-    bool spt_set[n];
+    bool *spt_set = (bool *) malloc(sizeof(bool) * n);
 
     // initialize dist and spt_set
     for (int i = 0; i < n; i++) {
@@ -113,14 +285,16 @@ long *dijkstra(long **graph, int n, int src) {
             }
         }
     }
+    free(spt_set);
 
     return dist;
 }
-
+*/
 /**
  * that one kernel that do it "all"
  */
-__global__ do_it(**long graph, **long result, int num_vertices) {
+/*
+__global__ void do_it(**long graph, **long result, int num_vertices) {
     int start_idx = threadIdx.x + blockDim.x * blockIdx.x;
     int stride = blockDim.x * gridDim.x;
 
@@ -134,5 +308,51 @@ __global__ do_it(**long graph, **long result, int num_vertices) {
         }
     }
 }
+*/
+
+/**
+ * print an n x n matrix stored as a 1D row-major array to stdout
+ * @param data matrix entries, data[i*n + j]
+ * @param n number of rows and columns
+ */
+void print_graph(long *data, int n) {
+    int i,j;
+    for (i=0;i<n;i++) {
+        for (j=0;j<n;j++) {
+            printf("%li ",data[(size_t)i*n + j]);
+        }
+        printf("\n");
+    }
+}
+
+/**
+ * write an n x n matrix stored as a 1D row-major array to a text file,
+ * one row per line; aborts when the file cannot be opened or closed
+ * @param n number of rows and columns
+ * @param graph matrix entries, graph[i*n + j]
+ * @param filename path of the output file (overwritten)
+ */
+void write_to_txt(int n, long *const graph, const char* filename) {
+    FILE *fout;
+    int i,j;
+    if (NULL == (fout = fopen(filename,"w"))) {
+        fprintf(stderr,"error opening output file");
+        abort();
+    }
+
+    for (i=0;i<n;i++) {
+        for(j=0;j<n;j++) {
+            fprintf(fout,"%li ",graph[(size_t)i*n + j]);
+        }
+        fprintf(fout,"\n");
+    }
+
+    // close the stream so buffered rows reach the file and the handle is released
+    if (fclose(fout) != 0) {
+        fprintf(stderr,"error closing output file");
+        abort();
+    }
+    printf("Result has been written to %s ...\n",filename);
+}
 
 #endif
\ No newline at end of file
diff --git a/src/paralel.cu b/src/paralel.cu
index de31f9fc426b5e3d62cc4d7f0e83930ed73a4cc2..f9d82c8827c065762dd0f883baa86ffc59289589 100644
--- a/src/paralel.cu
+++ b/src/paralel.cu
@@ -2,9 +2,20 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <time.h>
-#include <util.h>
 
 #include "dijkstra_cuda.cuh"
+#define THREADS_BLOCK 256
+
+// Exit with file/line and the CUDA error string when a runtime call fails.
+#define CUDA_CHECK(call) \
+    do { \
+        cudaError_t err_ = (call); \
+        if (err_ != cudaSuccess) { \
+            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
+                    cudaGetErrorString(err_)); \
+            exit(EXIT_FAILURE); \
+        } \
+    } while (0)
 
 static double get_micros(void) {
     struct timespec ts;
@@ -14,53 +25,143 @@ static double get_micros(void) {
 
 
 int main (int argc, char const *argv[]) {
+    //check if argc == 3
+    if (argc!=3) {
+        fprintf(stderr,"Usage: Dijkstra_CUDA num_of_node output_filename\n");
+        return EXIT_FAILURE;
+    }
 
     // initialization
-    int num_vertices = atoi(argv[2]);
+    srand(ID);
+    int num_vertices = atoi(argv[1]);
+    // atoi() yields 0 for non-numeric input; a non-positive count would also
+    // produce an invalid zero-block launch configuration
+    if (num_vertices <= 0) {
+        fprintf(stderr,"num_of_node must be a positive integer\n");
+        return EXIT_FAILURE;
+    }
+    // byte sizes in size_t: num_vertices*num_vertices overflows int for large graphs
+    size_t row_bytes = (size_t)num_vertices * sizeof(long);
+    size_t graph_bytes = (size_t)num_vertices * row_bytes;
     double start_time, end_time, total_time;
     total_time = 0;
 
     // allocate memory in host for the graph
-    // code goes here
-    long **graph = gen_graph(num_vertices);
+    long *graph = create_graph(num_vertices);
+
 
     // allocate memory in the host for the result matrice
-    // code goes here
-    long **result = gen_temp(num_vertices, num_vertices);
-
-    // copy graph from host to device
-    // code goes here, this might be unnecessary
-    // cudaMemCpy()
-
-
-    // start timer
-    // code goes here
-    start_time = get_micros();
-
-    // calculate the shortest paths using device
-    // code goes here
-    do_it();
-
-    // copy result array from device to host
-    // code goes here
-
-
-    // synchronize device
-    // code goes here
-
-    // free memory in device
-    /// code goes here
-
-    // stop the timer
-    // code goes here
-
-
-    // write result matrice to a file
-    // code goes here
-
-    // free result matrice
-    // code goes here
+    long *result = create_temp(num_vertices);
+
+    // allocate memory in the host for the result row of one source vertex
+    long *tempResult = (long *)malloc(row_bytes);
+
+    if (graph == NULL || result == NULL || tempResult == NULL) {
+        fprintf(stderr,"out of host memory\n");
+        return EXIT_FAILURE;
+    }
+
+    for (int i=0; i<num_vertices; i++) {
+        tempResult[i] = -1;
+    }
+
+    //CUDA malloc initialize
+    long *gpu_graph;
+    long *gpu_result;
+    int *gpu_visitedNode;
+    long *minDistance;
+    int *minIndex;
+
+    //CUDA malloc
+    // allocate memory in device for the graph
+    CUDA_CHECK(cudaMalloc((void**)&gpu_graph,graph_bytes));
+    // allocate memory in device for the result of dijkstra
+    CUDA_CHECK(cudaMalloc((void**)&gpu_result,row_bytes));
+    // allocate memory in device for the list of visited node
+    CUDA_CHECK(cudaMalloc((void**)&gpu_visitedNode,num_vertices*sizeof(int)));
+    // allocate memory in device for the minimal distance used in dijkstra
+    CUDA_CHECK(cudaMalloc((void**)&minDistance,sizeof(long)));
+    // allocate memory in device for the index of minDistance
+    CUDA_CHECK(cudaMalloc((void**)&minIndex,sizeof(int)));
+
+    // copy data of graph from host to device
+    CUDA_CHECK(cudaMemcpy(gpu_graph,graph,graph_bytes,cudaMemcpyHostToDevice));
+
+    // block size and number of blocks used for the per-vertex kernels
+    int blockSize = THREADS_BLOCK;
+    int numBlocks = (num_vertices + blockSize - 1) / blockSize;
+
+    // dijkstra algorithm for each source vertex
+    for (int i=0; i<num_vertices; i++) {
+
+        // set timer
+        start_time = get_micros();
+
+        // initialize value for dijkstra in device
+        initValue<<<numBlocks, blockSize>>>(
+            gpu_graph,
+            gpu_result,
+            gpu_visitedNode,
+            minIndex,
+            i,
+            num_vertices);
+        CUDA_CHECK(cudaGetLastError());
+
+        // for each vertex except the current source
+        for (int j=1; j<num_vertices; j++) {
+
+            // find minimal distance
+            findMinDistance<<<1,1>>>(
+                gpu_result,
+                gpu_visitedNode,
+                minIndex,
+                minDistance,
+                num_vertices);
+
+            // update distance for each vertex if new distance < old distance
+            setNewDistance<<<numBlocks, blockSize>>>(
+                gpu_graph,
+                gpu_result,
+                gpu_visitedNode,
+                minIndex,
+                minDistance,
+                num_vertices);
+            // launches are asynchronous; this only catches bad launch configurations
+            CUDA_CHECK(cudaGetLastError());
+        }
+
+        // wait for the queued kernels, otherwise the timer only measures launch overhead
+        CUDA_CHECK(cudaDeviceSynchronize());
+
+        // end of timer
+        end_time = get_micros();
+
+        // copy the result from device to host
+        CUDA_CHECK(cudaMemcpy(tempResult,gpu_result,row_bytes,cudaMemcpyDeviceToHost));
+
+        // store the copied row in the result matrix
+        for (int k=0; k<num_vertices; k++) {
+            result[(size_t)i*num_vertices + k] = tempResult[k];
+        }
+
+        total_time += end_time-start_time;
+    }
+
+
+    write_to_txt(num_vertices,result,argv[2]);
+
+    printf("processing time: %0.04lf us ...\n",total_time);
+    // free device memory allocation
+    CUDA_CHECK(cudaFree(gpu_graph));
+    CUDA_CHECK(cudaFree(gpu_result));
+    CUDA_CHECK(cudaFree(gpu_visitedNode));
+    CUDA_CHECK(cudaFree(minDistance));
+    CUDA_CHECK(cudaFree(minIndex));
+    // free host memory allocation
+    free(tempResult);
+    free(graph);
+    free(result);
 
 
-    return 0;
+    return EXIT_SUCCESS;
 }
\ No newline at end of file