diff --git a/.DS_Store b/.DS_Store index 6cac8e941360ddd700e70126ea937acf683d5fe5..1175da2ac62dc0f47eca32d97a9a2eb3d543a4a9 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/src/Makefile b/src/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..544e8c26fc65071f1dd73af69516a4d916b9f3ae --- /dev/null +++ b/src/Makefile @@ -0,0 +1,2 @@ +build: + gcc-8 -g -Wall -o radix_sort radix_sort.c -fopenmp -lm diff --git a/src/kernel.cu b/src/kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..b7ded00cc83d428ec509f8fbd76000bb0991070d --- /dev/null +++ b/src/kernel.cu @@ -0,0 +1,262 @@ +#pragma once +#ifdef __INTELLISENSE__ +void __syncthreads(); +#endif + +#include "cuda_runtime.h" +#include "device_launch_parameters.h" + +#include <stdlib.h> +#include <stdio.h> +#include <iostream> +#include <chrono> + +using namespace std; +using namespace std::chrono; + +#define WSIZE 32 +#define LOOPS 100 +#define UPPER_BIT 10 +#define LOWER_BIT 0 + +__device__ unsigned int ddata[WSIZE]; +__device__ int ddata_s[WSIZE]; + +template <typename T, unsigned S> +inline unsigned arraysize(const T(&v)[S]) +{ + return S; +} + +template<typename T> +void printArray(T &arr) +{ + for (int i = 0; i < arraysize(arr); ++i) + { + cout << "Array[" << i << "]: " << *(arr + i) << endl; + } +} + +template<typename T> +void fillArray(T &arr) +{ + srand(time(NULL)); + for (int i = 0; i < arraysize(arr); ++i) + { + arr[i] = rand() % 1024; + } +} + +void print(int arr[], int n) +{ + for (int i = 0; i < n; i++) + { + cout << arr[i] << " "; + } + cout << endl; +} + +template<typename T> +T findMax(T &arr) +{ + T max = 0; + for (int i = 0; i < arraysize(arr); ++i) + { + if (arr[i] > max) + { + max = arr[i]; + } + } + return max; +} + +__device__ int getMax(int arr[], int n) +{ + int mx = arr[0]; + for (int i = 1; i < n; i++) + if (arr[i] > mx) + mx = arr[i]; + return mx; +} + +__device__ void countSort(int arr[], int n, int exp) +{ + int output[1024]; // Output array + int i, count[10] = { 0 }; + + // Store count of occurrences in count[] + for (i = 0; i < n; i++) + count[(arr[i] / exp) % 10]++; + + // Change count[i] so that count[i] now contains actual + // position of this digit in output[] + for (i = 1; i < 10; i++) + { + count[i] += count[i - 1]; + } + + // Build the output array + for (i = n - 1; i >= 0; i--) + { + output[count[(arr[i] / exp) % 10] - 1] = arr[i]; + count[(arr[i] / exp) % 10]--; + } + + // Copy the output array to arr[], so that arr[] now + // contains sorted numbers according to current digit + for (i = 0; i < n; i++) + arr[i] = output[i]; +} + +__device__ void radixsort(int arr[], int n) +{ + // Find the maximum number to know number of digits + int m = getMax(arr, n); + + // Do counting sort for every digit. Note that instead + // of passing digit number, exp is passed. exp is 10^i + // where i is current digit number + for (int exp = 1; m / exp > 0; exp *= 10) + countSort(arr, n, exp); +} + +__global__ void serialRadix() +{ + radixsort(ddata_s, WSIZE); + __syncthreads(); +} + +__global__ void parallelRadix() +{ + // This data in shared memory + __shared__ volatile unsigned int sdata[WSIZE * 2]; + + // Load from global into shared variable + sdata[threadIdx.x] = ddata[threadIdx.x]; + + unsigned int bitmask = 1 << LOWER_BIT; + unsigned int offset = 0; + // -1, -2, -4, -8, -16, -32, -64, -128, -256,... + unsigned int thrmask = 0xFFFFFFFFU << threadIdx.x; + unsigned int mypos; + + // For each LSB to MSB + for (int i = LOWER_BIT; i <= UPPER_BIT; i++) + { + unsigned int mydata = sdata[((WSIZE - 1) - threadIdx.x) + offset]; + unsigned int mybit = mydata&bitmask; + // Get population of ones and zeroes + unsigned int ones = __ballot(mybit); + unsigned int zeroes = ~ones; + // Switch ping-pong buffers + offset ^= WSIZE; + + // Do zeroes, then ones + if (!mybit) + { + mypos = __popc(zeroes&thrmask); + } + else { // Threads with a one bit + // Get my position in ping-pong buffer + mypos = __popc(zeroes) + __popc(ones&thrmask); + } + + // Move to buffer (or use shfl for cc 3.0) + sdata[mypos - 1 + offset] = mydata; + // Repeat for next bit + bitmask <<= 1; + } + // Put results to global + ddata[threadIdx.x] = sdata[threadIdx.x + offset]; +} + +int main() { + + /* Parallel Radix Sort */ + + unsigned int hdata[WSIZE]; + float totalTime = 0; + + for (int lcount = 0; lcount < LOOPS; lcount++) + { + srand(time(NULL)); + // Array elements have value in range of 1024 + unsigned int range = 1U << UPPER_BIT; + + // Fill array with random elements + // Range = 1024 + for (int i = 0; i < WSIZE; i++) + { + hdata[i] = i; + } + + // Copy data from host to device + cudaMemcpyToSymbol(ddata, hdata, WSIZE * sizeof(unsigned int)); + + // Execution time measurement, that point starts the clock + high_resolution_clock::time_point t1 = high_resolution_clock::now(); + parallelRadix <<< 1, WSIZE >>>(); + // Make kernel function synchronous + cudaDeviceSynchronize(); + // Execution time measurement, that point stops the clock + high_resolution_clock::time_point t2 = high_resolution_clock::now(); + + // Execution time measurement, that is the result + auto duration = duration_cast<milliseconds>(t2 - t1).count(); + + // Summination of each loops' execution time + totalTime += (float)duration / 1000.00; + + // Copy data from device to host + cudaMemcpyFromSymbol(hdata, ddata, WSIZE * sizeof(unsigned int)); + } + + printf("Parallel Radix Sort:\n"); + printf("Array size = %d\n", WSIZE * LOOPS); + printf("Time elapsed = %fseconds\n", totalTime); + + /* Serial Radix Sort */ + + unsigned int hdata_s[WSIZE]; + totalTime = 0; + + for (int lcount = 0; lcount < LOOPS; lcount++) + { + srand(time(NULL)); + // Array elements have value in range of 1024 + unsigned int range = 1U << UPPER_BIT; + + // Fill array with random elements + // Range = 1024 + for (int i = 0; i < WSIZE; i++) + { + hdata_s[i] = i; + } + + // Copy data from host to device + cudaMemcpyToSymbol(ddata_s, hdata_s, WSIZE * sizeof(unsigned int)); + + // Execution time measurement, that point starts the clock + high_resolution_clock::time_point t1 = high_resolution_clock::now(); + serialRadix <<< 1, 1 >>>(); + // Make kernel function synchronous + cudaDeviceSynchronize(); + // Execution time measurement, that point stops the clock + high_resolution_clock::time_point t2 = high_resolution_clock::now(); + + // Execution time measurement, that is the result + auto duration = duration_cast<milliseconds>(t2 - t1).count(); + + // Summination of each loops' execution time + totalTime += (float)duration / 1000.00; + + // Copy data from device to host + cudaMemcpyFromSymbol(hdata_s, ddata_s, WSIZE * sizeof(unsigned int)); + } + + printf("\nSerial Radix Sort:\n"); + printf("Array size = %d\n", WSIZE * LOOPS); + printf("Time elapsed = %fseconds\n\n", totalTime); + + return 0; +}