From 7d804fd83116bbfffb4b7b1b4ddc19d14778ecf1 Mon Sep 17 00:00:00 2001
From: azharfatrr <m.azhar.faturahman@gmail.com>
Date: Mon, 14 Mar 2022 13:26:04 +0700
Subject: [PATCH] feat: add sorting algorithms

---
 .gitignore                |   2 +
 algorithm/bitonic_sort.cu | 184 ++++++++++++++++++++++++++++++++++++++
 algorithm/brick_sort.cu   | 139 ++++++++++++++++++++++++++++
 3 files changed, 325 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 algorithm/bitonic_sort.cu
 create mode 100644 algorithm/brick_sort.cu

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0c2595e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+# Binary file
+/bin
\ No newline at end of file
diff --git a/algorithm/bitonic_sort.cu b/algorithm/bitonic_sort.cu
new file mode 100644
index 0000000..4b03314
--- /dev/null
+++ b/algorithm/bitonic_sort.cu
@@ -0,0 +1,184 @@
+/*
+ * Parallel bitonic sort using CUDA.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+#include <limits.h> /* INT_MAX, used to pad the buffer */
+
+// PROBLEM: the buffer length (THREADS * BLOCKS) must be a power of two.
+// SOLUTION: pad the data into a larger buffer filled with INT_MAX.
+
+/* Every thread gets exactly one value in the unsorted array. */
+#define THREADS 1024
+#define BLOCKS 16
+#define NUM_VALS 1000
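+
+// With these values the device buffer holds BLOCKS * THREADS = 16 * 1024 =
+// 16384 slots; slots 1000..16383 are padded with INT_MAX, so they sort to
+// the tail and the first NUM_VALS entries come back fully sorted.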
+
+void print_elapsed(clock_t start, clock_t stop)
+{
+  double elapsed = ((double) (stop - start)) / CLOCKS_PER_SEC;
+  printf("Elapsed time: %.3fs\n", elapsed);
+}
+
+int checksum(int *arr, int size)
+{
+  int sum = 0;
+  for (int i = 0; i < size; i++)
+    sum += arr[i];
+  return sum;
+}
+
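+/* Round n up to the next power of two (helper; not used below). */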
+int ceil_power_of_two(int n)
+{
+  int i = 1;
+  while (i < n) {
+    i = i << 1;
+  }
+  return i;
+}
+
+void array_print(int *arr, int length)
+{
+  int i;
+  for (i = 0; i < length; ++i) {
+    printf("%d ",  arr[i]);
+  }
+  printf("\n");
+}
+
+void array_fill(int *arr, int length)
+{
+  srand(time(NULL));
+  int i;
+  for (i = 0; i < length; ++i) {
+    arr[i] = rand();
+  }
+}
+
+void pad_array(int *arr, int length, int size)
+{
+  int i;
+  for (i = length; i < size; i++)
+  {
+    arr[i] = INT_MAX;
+  }
+}
+
+void copy_and_pad(int *dest, int *src, int length, int size)
+{
+  int i;
+  for (i = 0; i < length; i++)
+  {
+    dest[i] = src[i];
+  }
+  pad_array(dest, length, size);
+}
+
+__global__ void bitonic_sort_step(int *dev_values, int j, int k)
+{
+  unsigned int i, ixj; /* Sorting partners: i and ixj */
+  i = threadIdx.x + blockDim.x * blockIdx.x;
+
+  ixj = i^j;
+
+  /* The threads with the lowest ids sort the array. */
+  if ((ixj)>i) {
+    if ((i&k)==0) {
+      /* Sort ascending */
+      if (dev_values[i]>dev_values[ixj]) {
+        /* exchange(i,ixj); */
+        int temp = dev_values[i];
+        dev_values[i] = dev_values[ixj];
+        dev_values[ixj] = temp;
+      }
+    }
+    if ((i&k)!=0) {
+      /* Sort descending */
+      if (dev_values[i]<dev_values[ixj]) {
+        /* exchange(i,ixj); */
+        int temp = dev_values[i];
+        dev_values[i] = dev_values[ixj];
+        dev_values[ixj] = temp;
+      }
+    }
+  }
+}
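+
+/*
+ * Illustrative trace of the step kernel on a 4-element array {3,1,4,2}:
+ *   k=2, j=1: (0,1) asc -> swap, (2,3) desc -> keep  => {1,3,4,2}
+ *   k=4, j=2: (0,2) asc -> keep, (1,3) asc -> swap   => {1,2,4,3}
+ *   k=4, j=1: (0,1) asc -> keep, (2,3) asc -> swap   => {1,2,3,4}
+ */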
+
+void bitonic_sort(int *values)
+{
+  int buffer_length = BLOCKS * THREADS;
+
+  size_t size = buffer_length * sizeof(int);
+  size_t original_size = NUM_VALS * sizeof(int);
+
+  // Copy the values and pad the buffer to its power-of-two length.
+  int *h_values = (int *)malloc(size);
+  copy_and_pad(h_values, values, NUM_VALS, buffer_length);
+
+  // Allocate device memory.
+  int *dev_values;
+  cudaMalloc((void**) &dev_values, size);
+  cudaMemcpy(dev_values, h_values, size, cudaMemcpyHostToDevice);
+
+  // Sort the values.
+  int j, k;
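+  // For buffer_length = 16384 there are log2(16384) = 14 major steps and
+  // 14 * 15 / 2 = 105 kernel launches in total.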
+  // Major step.
+  for (k = 2; k <= buffer_length; k <<= 1) {
+    // Minor step.
+    for (j=k>>1; j>0; j=j>>1) {
+      bitonic_sort_step<<<BLOCKS, THREADS>>>(dev_values, j, k);
+    }
+  }
+
+  // Copy the values back to the host.
+  cudaMemcpy(values, dev_values, original_size, cudaMemcpyDeviceToHost);
+
+  // Free device memory.
+  cudaFree(dev_values);
+  free(h_values);
+}
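+
+/*
+ * The CUDA calls above are unchecked. A minimal error-checking sketch
+ * (illustrative only; CUDA_CHECK is not wired into this file), used as e.g.
+ * CUDA_CHECK(cudaMalloc((void**) &dev_values, size)):
+ */
+#define CUDA_CHECK(call)                                        \
+  do {                                                          \
+    cudaError_t err_ = (call);                                  \
+    if (err_ != cudaSuccess) {                                  \
+      fprintf(stderr, "CUDA error %s at %s:%d\n",               \
+              cudaGetErrorString(err_), __FILE__, __LINE__);    \
+      exit(EXIT_FAILURE);                                       \
+    }                                                           \
+  } while (0)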
+
+int main(void)
+{
+  clock_t start, stop;
+
+  int *values = (int*) malloc( NUM_VALS * sizeof(int));
+  array_fill(values, NUM_VALS);
+
+  array_print(values, NUM_VALS);
+  printf("\n");
+
+  int checksum1 = checksum(values, NUM_VALS);
+
+  start = clock();
+  bitonic_sort(values); /* in place */
+  stop = clock();
+
+  int checksum2 = checksum(values, NUM_VALS);
+
+  array_print(values, NUM_VALS);
+  printf("\n");
+
+  print_elapsed(start, stop);
+
+  // Test: the array must be sorted and the checksum unchanged.
+  bool passed_sort = true;
+  bool passed_checksum = true;
+
+  for (int i = 1; i < NUM_VALS; i++)
+  {
+    if (values[i - 1] > values[i])
+    {
+      passed_sort = false;
+    }
+  }
+
+  if (checksum1 != checksum2)
+  {
+    passed_checksum = false;
+    printf("%d != %d\n", checksum1, checksum2);
+  }
+
+  printf("Test %s\n", passed_sort ? "PASSED SORT" : "FAILED SORT");
+  printf("Test %s\n", passed_checksum ? "PASSED CHECKSUM" : "FAILED CHECKSUM");
+
+  free(values);
+  return 0;
+}
\ No newline at end of file
diff --git a/algorithm/brick_sort.cu b/algorithm/brick_sort.cu
new file mode 100644
index 0000000..f7295d4
--- /dev/null
+++ b/algorithm/brick_sort.cu
@@ -0,0 +1,139 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+
+// NOTE: __syncthreads() only synchronizes threads within one block, so an
+// in-kernel loop cannot sort across blocks. Each sorting phase is therefore
+// launched as its own kernel (brick_sort_phase below), which acts as a
+// grid-wide barrier and works for any number of blocks.
+
+/* Every thread gets exactly one value in the unsorted array. */
+#define THREADS 16
+#define NUM_VALS 2000
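+
+// With NUM_VALS = 2000 and THREADS = 16, main computes (2000 + 16 - 1) / 16
+// = 125 blocks, i.e. one thread per element -- twice what the one-thread-
+// per-pair kernel below needs; the extra threads simply do nothing.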
+
+void print_elapsed(clock_t start, clock_t stop)
+{
+  double elapsed = ((double)(stop - start)) / CLOCKS_PER_SEC;
+  printf("Elapsed time: %.3fs\n", elapsed);
+}
+
+int checksum(int *arr, int size)
+{
+  int sum = 0;
+  for (int i = 0; i < size; i++)
+    sum += arr[i];
+  return sum;
+}
+
+/* Return a pseudo-random int in [0, RAND_MAX] (unused helper). */
+int random_int()
+{
+  return rand();
+}
+
+void array_print(int *arr, int length)
+{
+  int i;
+  for (i = 0; i < length; ++i)
+  {
+    printf("%d ", arr[i]);
+  }
+  printf("\n");
+}
+
+void array_fill(int *arr, int length)
+{
+  srand(time(NULL));
+  int i;
+  for (i = 0; i < length; ++i)
+  {
+    arr[i] = rand();
+  }
+}
+
+__device__ void swap(int *arr, int i, int j)
+{
+  int temp = arr[i];
+  arr[i] = arr[j];
+  arr[j] = temp;
+}
+
+/*
+ * One phase of odd-even transposition (brick) sort. Each launch performs a
+ * single even or odd phase across the whole array; grid-wide synchronization
+ * happens between kernel launches, so any number of blocks works.
+ */
+__global__ void brick_sort_phase(int *d_arr, int length, int phase)
+{
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int i = 2 * idx + phase; /* left element of this thread's pair */
+  if (i + 1 < length && d_arr[i] > d_arr[i + 1])
+    swap(d_arr, i, i + 1);
+}
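+
+/*
+ * Illustrative phases on {3,1,4,1,5}:
+ *   even (pairs (0,1),(2,3)): {3,1,4,1,5} -> {1,3,1,4,5}
+ *   odd  (pairs (1,2),(3,4)): {1,3,1,4,5} -> {1,1,3,4,5}
+ */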
+
+int main(void)
+{
+  clock_t start, stop;
+  size_t size = NUM_VALS * sizeof(int);
+  int blocks = (NUM_VALS + THREADS - 1) / THREADS;
+  printf("Blocks: %d\n", blocks);
+
+  int *arr = (int *)malloc(NUM_VALS * sizeof(int));
+  array_fill(arr, NUM_VALS);
+
+  int checksum1 = checksum(arr, NUM_VALS);
+
+  start = clock();
+
+  int *d_arr;
+
+  cudaMalloc((void **)&d_arr, size);
+  cudaMemcpy(d_arr, arr, size, cudaMemcpyHostToDevice);
+
+  // NUM_VALS phases are enough for odd-even transposition sort to finish.
+  for (int i = 0; i < NUM_VALS; ++i)
+  {
+    brick_sort_phase<<<blocks, THREADS>>>(d_arr, NUM_VALS, i % 2);
+  }
+
+  cudaMemcpy(arr, d_arr, size, cudaMemcpyDeviceToHost);
+  cudaFree(d_arr);
+
+  // cudaMemcpy synchronizes with the device, so the sort has finished here.
+  stop = clock();
+
+  int checksum2 = checksum(arr, NUM_VALS);
+
+  print_elapsed(start, stop);
+
+  bool passed_sort = true;
+  bool passed_checksum = true;
+
+  for (int i = 1; i < NUM_VALS; i++)
+  {
+    if (arr[i - 1] > arr[i])
+    {
+      passed_sort = false;
+    }
+  }
+
+  if (checksum1 != checksum2)
+  {
+    passed_checksum = false;
+    printf("%d != %d\n", checksum1, checksum2);
+  }
+
+  array_print(arr, NUM_VALS);
+
+  printf("Test %s\n", passed_sort ? "PASSED SORT" : "FAILED SORT");
+  printf("Test %s\n", passed_checksum ? "PASSED CHECKSUM" : "FAILED CHECKSUM");
+
+  free(arr);
+  return 0;
+}
-- 
GitLab