/*
 * radixsort_parallel.cu - LSD (least-significant-digit) base-10 radix sort of
 * non-negative ints on a CUDA device.
 *
 * The public host API is declared in radix_sort_parallel.h:
 *     void rng(int* arr, int n);
 *     void radix_sort(int arr[], int n);
 *     void print(int arr[], int n);
 *
 * NOTE: radix_sort_parallel.h contains `#define t 8`, so no identifier in this
 * file may be named `t`.
 */
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

#include <cuda.h>
#include <cuda_runtime.h>

#include "radix_sort_parallel.h"

enum {
    kRadix = 10,            /* decimal digits                               */
    kThreadsPerBlock = 512  /* launch width; a multiple of the warp size    */
};

/* Evaluates a CUDA runtime call; on failure prints the error and exits. */
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t status_ = (call);                                        \
        if (status_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(status_));                            \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

/*
 * Writes the largest element of arr[0..n) to max[0] (INT_MIN when n <= 0).
 *
 * Launch layout: 1-D block of any size. Only threadIdx/blockDim are used for
 * indexing, so one block is sufficient; extra blocks would redundantly
 * compute and store the same value.
 * Shared memory: one statically allocated int.
 *
 * arr - device pointer to n ints (read only)
 * max - device pointer to one int receiving the result
 * n   - number of elements in arr
 */
__global__ void getMax(int *arr, int *max, int n) {
    __shared__ int blockMax;

    if (threadIdx.x == 0)
        blockMax = INT_MIN;
    __syncthreads();

    /* Start from INT_MIN rather than arr[threadIdx.x]: that element does not
     * exist when the block is wider than the array. */
    int mx = INT_MIN;
    for (int i = threadIdx.x; i < n; i += blockDim.x)
        if (arr[i] > mx)
            mx = arr[i];

    /* Combine the per-thread maxima; a plain store to max[0] from every
     * thread would keep whichever thread happened to write last. */
    atomicMax(&blockMax, mx);
    __syncthreads();

    if (threadIdx.x == 0)
        max[0] = blockMax;
}

/*
 * One stable counting-sort pass over arr[0..n), keyed on the decimal digit
 * (arr[i] / exp) % 10. The threads of block 0 cooperate:
 *   1. thread 0 allocates scratch space once from the device heap,
 *   2. each thread histograms its own contiguous chunk of the input,
 *   3. thread 0 turns the (digit, thread) histogram into start offsets,
 *   4. each thread scatters its chunk, in order, to its reserved slots,
 *   5. the block copies the result back into arr and thread 0 frees scratch.
 * Chunks are contiguous and ordered by thread id, so equal digits keep their
 * original relative order (required for LSD radix sort).
 *
 * Launch layout: 1-D block of any size; blocks other than block 0 exit
 * immediately, so the result does not depend on the grid size.
 * Device heap: (n + 10 * blockDim.x) ints are allocated with device malloc();
 * if that fails the kernel executes __trap() so the host observes a CUDA
 * error instead of silently unsorted data.
 *
 * Preconditions: exp > 0 and every arr[i] >= 0 (a negative value would
 * produce a negative digit index).
 *
 * arr - device pointer to n ints, sorted in place by the selected digit
 * n   - number of elements
 * exp - power of ten selecting the digit (1, 10, 100, ...)
 */
__global__ void countSort(int *arr, int n, int exp) {
    __shared__ int *scratch;

    /* Both conditions are uniform across a block, so returning before the
     * barriers below is safe. */
    if (n <= 0 || blockIdx.x != 0)
        return;

    const int tid = threadIdx.x;
    const int nthreads = blockDim.x;

    if (tid == 0) {
        const size_t slots = (size_t)n + (size_t)kRadix * (size_t)nthreads;
        scratch = (int *)malloc(slots * sizeof(int));
    }
    __syncthreads();

    if (scratch == NULL)
        __trap();

    int *output = scratch;       /* n ints: the reordered array             */
    int *offsets = scratch + n;  /* kRadix * nthreads ints, index           */
                                 /* [digit * nthreads + tid]                */

    /* Contiguous chunk [first, last) owned by this thread. 64-bit
     * intermediates keep tid * chunk from overflowing for very large n. */
    const int chunk = n / nthreads + ((n % nthreads != 0) ? 1 : 0);
    const long long start = (long long)tid * chunk;
    const int first = (start < n) ? (int)start : n;
    const int last = (start + chunk < n) ? (int)(start + chunk) : n;

    int count[kRadix] = {0};
    for (int i = first; i < last; i++)
        count[(arr[i] / exp) % kRadix]++;

    for (int d = 0; d < kRadix; d++)
        offsets[d * nthreads + tid] = count[d];
    __syncthreads();

    /* Exclusive prefix sum in digit-major, thread-minor order: afterwards
     * offsets[d * nthreads + k] is where thread k's first element with
     * digit d belongs in output. */
    if (tid == 0) {
        int running = 0;
        for (int k = 0; k < kRadix * nthreads; k++) {
            const int c = offsets[k];
            offsets[k] = running;
            running += c;
        }
    }
    __syncthreads();

    for (int d = 0; d < kRadix; d++)
        count[d] = offsets[d * nthreads + tid];

    for (int i = first; i < last; i++) {
        const int value = arr[i];
        const int digit = (value / exp) % kRadix;
        output[count[digit]++] = value;
    }
    __syncthreads();

    for (int i = tid; i < n; i += nthreads)
        arr[i] = output[i];
    __syncthreads();

    if (tid == 0)
        free(scratch);
}

/*
 * Sorts arr[0..n) in ascending order on the GPU using LSD radix sort.
 *
 * All values must be non-negative; if a negative value is found a message is
 * written to stderr and the array is left untouched. Arrays with fewer than
 * two elements are returned unchanged. Any CUDA failure prints a diagnostic
 * and terminates the process (see CUDA_CHECK).
 *
 * arr - host array, sorted in place
 * n   - number of elements in arr
 */
void radix_sort(int arr[], int n)
{
    if (arr == NULL || n <= 1)
        return;

    for (int i = 0; i < n; i++) {
        if (arr[i] < 0) {
            fprintf(stderr, "radix_sort: negative values are not supported\n");
            return;
        }
    }

    const size_t bytes = (size_t)n * sizeof(int);
    int *d_arr = NULL;
    int *d_max = NULL;
    int maxValue = 0;

    /* Allocate device memory */
    CUDA_CHECK(cudaMalloc((void**)&d_arr, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_max, sizeof(int)));

    /* Transfer data from host to device */
    CUDA_CHECK(cudaMemcpy(d_arr, arr, bytes, cudaMemcpyHostToDevice));

    /* Find the largest key; it determines how many digit passes are needed */
    getMax<<<1, kThreadsPerBlock>>>(d_arr, d_max, n);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(&maxValue, d_max, sizeof(int),
                          cudaMemcpyDeviceToHost));

    /* The loop variable is 64-bit: for maxValue >= 1e9 the last `exp *= 10`
     * reaches 1e10, which does not fit in an int. Every value actually passed
     * to the kernel is <= 1e9, so the cast is safe. */
    for (long long exp = 1; maxValue / exp > 0; exp *= 10) {
        countSort<<<1, kThreadsPerBlock>>>(d_arr, n, (int)exp);
        CUDA_CHECK(cudaGetLastError());
    }

    /* Blocking copy: also surfaces any error raised while the kernels ran */
    CUDA_CHECK(cudaMemcpy(arr, d_arr, bytes, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(d_max));
    CUDA_CHECK(cudaFree(d_arr));
}

/* Prints every element of arr[0..n) to stdout as "<index>: <value>". */
void print(int arr[], int n)
{
    for (int i = 0; i < n; i++)
        printf("%d: %d\n", i, arr[i]);
}

/*
 * Fills arr[0..n) with pseudo-random ints from rand(), re-seeding with a
 * fixed seed on every call so the sequence is reproducible.
 */
void rng(int *arr, int n) {
    const unsigned int seed = 13516013;
    srand(seed);
    for (int i = 0; i < n; i++)
        arr[i] = (int)rand();
}

/*
 * Usage: ./radix_sort <array_length>
 * Generates <array_length> pseudo-random ints, sorts them on the GPU, prints
 * the sorted array and the elapsed time of the sort in milliseconds.
 */
int main(int argc, char *argv[]) {
    if (argc != 2) {
        printf("ERROR: ./radix_sort <array_length>\n");
        return 1;
    }

    char *parseEnd = NULL;
    const long requested = strtol(argv[1], &parseEnd, 10);
    if (parseEnd == argv[1] || *parseEnd != '\0' ||
        requested <= 0 || requested > INT_MAX) {
        printf("ERROR: <array_length> must be a positive integer\n");
        return 1;
    }
    const int N = (int)requested;

    /* Allocate and initialize host memory; radix_sort manages the device
     * copy itself, so no device buffer is needed here. */
    int *arr = (int*)malloc((size_t)N * sizeof(int));
    if (arr == NULL) {
        fprintf(stderr, "ERROR: could not allocate %d ints\n", N);
        return 1;
    }
    rng(arr, N);

    /* CUDA events measure elapsed time; clock() reports CPU time, which is
     * not the same thing while the host waits on the GPU. Creating the events
     * first also keeps one-time CUDA context setup out of the timed region. */
    cudaEvent_t started, finished;
    CUDA_CHECK(cudaEventCreate(&started));
    CUDA_CHECK(cudaEventCreate(&finished));

    CUDA_CHECK(cudaEventRecord(started, 0));
    radix_sort(arr, N);
    CUDA_CHECK(cudaEventRecord(finished, 0));
    CUDA_CHECK(cudaEventSynchronize(finished));

    float elapsedMs = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&elapsedMs, started, finished));

    print(arr, N);
    printf("Executed in %lf ms\n", (double)elapsedMs);

    CUDA_CHECK(cudaEventDestroy(started));
    CUDA_CHECK(cudaEventDestroy(finished));
    free(arr);
    return 0;
}