diff --git a/src/cudaUtil/cudaUtil.cu b/src/cudaUtil/cudaUtil.cu
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..430b9e20846cc74da909e16769a1ed3c0cde63af 100644
--- a/src/cudaUtil/cudaUtil.cu
+++ b/src/cudaUtil/cudaUtil.cu
@@ -0,0 +1,56 @@
+#include "cudaUtil.cuh"
+#include <stdio.h>
+
+/*
+ * Computes the bit length of the largest element of arr[0..n) for the calling block.
+ *
+ * arr       - device array of n ints; it is only read.
+ * n         - number of elements in arr.
+ * maxBitIdx - receives the number of significant bits of the maximum
+ *             (0 when n <= 0 or no element is positive). Written by thread 0 only.
+ *
+ * Every thread of the block must call this function because it contains
+ * __syncthreads(); calling it from a single thread is undefined behaviour.
+ * The caller needs its own barrier before other threads read *maxBitIdx.
+ */
+__device__ void getMax(int* arr, int n, int* maxBitIdx){
+    // Block-wide running maximum. Starting at 0 keeps the "no positive value -> 0 bits" result.
+    __shared__ int blockMax;
+    if (threadIdx.x == 0){
+        blockMax = 0;
+    }
+    __syncthreads();
+
+    // Each thread scans a strided slice, so all n elements are covered for any
+    // block size and nothing past the end of arr is read.
+    int localMax = 0;
+    for (int i = threadIdx.x; i < n; i += blockDim.x){
+        if (arr[i] > localMax){
+            localMax = arr[i];
+        }
+    }
+    atomicMax(&blockMax, localMax);
+    __syncthreads();
+
+    if (threadIdx.x == 0){
+        int currentNum = blockMax;
+        int bitIndex = 0;
+        while(currentNum > 0){
+            currentNum >>=1;
+            bitIndex +=1;
+        }
+        *maxBitIdx = bitIndex;
+    }
+}
+
+/*
+ * Debug kernel: prints arr[0..n) with device-side printf.
+ * Works for any launch configuration; output order across threads is unspecified.
+ */
+__global__ void printArr(int* arr, int n){
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    int stride = blockDim.x * gridDim.x;
+    for (int i = index; i<n; i+=stride){
+        printf("%d ", arr[i]);
+    }
+}
diff --git a/src/cudaUtil/cudaUtil.cuh b/src/cudaUtil/cudaUtil.cuh
index e7373deeb09df7c1a792df2f9c5fee33a7ebcfd6..69ce35c6f46f0c0297960ecfa0b5bd76578eb70f 100644
--- a/src/cudaUtil/cudaUtil.cuh
+++ b/src/cudaUtil/cudaUtil.cuh
@@ -1,6 +1,9 @@
 #ifndef CUDA_UTIL_CUH
 #define CUDA_UTIL_CUH
 
-
+// Block-wide helper: every thread of the block must call it (uses __syncthreads()).
+__device__ void getMax(int* arr, int n, int* maxBitIdx);
+// Debug kernel that prints arr[0..n).
+__global__ void printArr(int* arr, int n);
 
 #endif
\ No newline at end of file
diff --git a/src/main.cu b/src/main.cu
index 6a0c7b6df7a4f621ba5bd55e97314bccebb5879e..f8181a4a627a95886f52c8c70af894b5e5e34904 100644
--- a/src/main.cu
+++ b/src/main.cu
@@ -1,23 +1,70 @@
 #include <stdio.h>
 #include "util/util.hpp"
+#include <limits.h>
+#include <stdlib.h>
+#include "cudaUtil/cudaUtil.cuh"
+#include "radixSort/radixSort.cuh"
 
 __global__ void cuda_hello(){
     printf("Hello World from GPU!\n");
 }
 
-int main() {
-    int* arr;
-    int* d_arr;
+// Abort with file/line context when a CUDA runtime call fails.
+#define CUDA_CHECK(call) \
+    do { \
+        cudaError_t err_ = (call); \
+        if (err_ != cudaSuccess) { \
+            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, \
+                    __LINE__, cudaGetErrorString(err_)); \
+            exit(1); \
+        } \
+    } while (0)
+
+int main(int argc, char *argv[]) {
+
+    if (argc != 2){
+        fprintf(stderr, "Usage: <number of elements>\n");
+        exit(1);
+    }
 
-    arr = (int *) malloc(sizeof(int) * 10);
-    cudaMalloc((void**)&d_arr, sizeof(int) * 10);
+    // strtol (unlike atoi) lets us reject non-numeric and non-positive counts.
+    char* end = NULL;
+    long parsed = strtol(argv[1], &end, 10);
+    if (end == argv[1] || *end != '\0' || parsed <= 0 || parsed > INT_MAX){
+        fprintf(stderr, "Number of elements must be a positive integer\n");
+        exit(1);
+    }
+    int n = (int) parsed;
+    size_t bytes = sizeof(int) * (size_t) n;
 
-    cudaMemcpy(d_arr, arr, sizeof(float) * 10, cudaMemcpyHostToDevice);
+    int* arr = (int *) malloc(bytes);
+    if (arr == NULL){
+        fprintf(stderr, "Host allocation failed\n");
+        exit(1);
+    }
+
+    int* d_arr;
+    CUDA_CHECK(cudaMalloc((void**)&d_arr, bytes));
 
-    generate_random(arr, 10);
-    print_arr(arr, 10);
+    int* d_out;
+    CUDA_CHECK(cudaMalloc((void**)&d_out, bytes));
 
+    generate_random(arr, n);
+
+    // The buffers hold ints, so the copy uses the same byte count as the allocations.
+    // Kernels launched afterwards are ordered behind this copy; no explicit synchronize is needed.
+    CUDA_CHECK(cudaMemcpy(d_arr, arr, bytes, cudaMemcpyHostToDevice));
+
+    radixSort<<<1,256>>>(d_out, d_arr, n);
+    CUDA_CHECK(cudaGetLastError());      // launch-configuration errors
+    CUDA_CHECK(cudaDeviceSynchronize()); // errors raised while the kernel ran
+
     cuda_hello<<<1,1>>>();
     cudaDeviceSynchronize();
+
+    CUDA_CHECK(cudaFree(d_arr));
+    CUDA_CHECK(cudaFree(d_out));
+    free(arr);
+
     return 0;
 }
diff --git a/src/radixSort/radixSort.cu b/src/radixSort/radixSort.cu
index 8e8917a09d9174b14354e66184c8d5535c1ce353..b539d5dbb22b591cfe72b92c425e1e223c6a3849 100644
--- a/src/radixSort/radixSort.cu
+++ b/src/radixSort/radixSort.cu
@@ -1,5 +1,27 @@
 #include "radixSort.cuh"
+#include "../cudaUtil/cudaUtil.cuh"
+#include <stdio.h>
 
+/*
+ * Radix sort kernel. At this stage it only computes and prints the bit length of
+ * the largest element of arr[0..n); out is not written yet.
+ *
+ * Intended for a single-block launch (main uses <<<1,256>>>); with more blocks
+ * each block repeats the scan and thread 0 of each block prints the result.
+ */
+__global__ void radixSort(int* out, int* arr, int n){
+    // A block-shared slot avoids a per-thread device-heap malloc, which would
+    // also need a NULL check and a matching free.
+    __shared__ int maxBitIdx;
+
+    // getMax() contains __syncthreads(), so the whole block has to call it; calling it
+    // from thread 0 alone would put a barrier inside divergent code.
+    getMax(arr, n, &maxBitIdx);
+    __syncthreads();
+
+    if (threadIdx.x == 0){
+        printf("%d\n", maxBitIdx);
+    }
 
 }
 
diff --git a/src/radixSort/radixSort.cuh b/src/radixSort/radixSort.cuh
index 8409449db79c33ec092f081c56cde6f5cfdda0b7..2814cd8ca70a9a1b7b4a97e4275af0c27cd86123 100644
--- a/src/radixSort/radixSort.cuh
+++ b/src/radixSort/radixSort.cuh
@@ -1,6 +1,7 @@
 #ifndef RADIX_SORT_CUH
 #define RADIX_SORT_CUH
 
-void radixSort(int* out, int* arr, int n);
+__global__ void radixSort(int* out, int* arr, int n);
+__device__ void sortMSB(int* out, int* arr, int n);
 
 #endif
\ No newline at end of file
diff --git a/src/util/util.hpp b/src/util/util.hpp
index 5e23e74c43412edabe21692d1f5e871ef8967d1b..114c8ee784caec926f88d6aedaa5a8d644415847 100644
--- a/src/util/util.hpp
+++ b/src/util/util.hpp
@@ -3,7 +3,6 @@
 
 void generate_random(int* arr, int n);
 void print_arr(int * array, int num_elements);
-int get_max(int arr[], int n);
 void assert_sorted(int arr[], int n, double time_elapsed);
 
 #endif
\ No newline at end of file