diff --git a/makefile b/makefile
index ccac332a6133a25e7afba790d0085659ba63ebda..d5535070df035d91b45f12027d5d809e0a9f714a 100644
--- a/makefile
+++ b/makefile
@@ -1,36 +1,25 @@
 SOURCE_SERIAL=./src/serial.c
 SOURCE_PARALLEL=./src/parallel.cu
-SOURCE_TEST=./src/test.c
-SOURCE_PARALLEL_COLLAB=./src/parallel_collab.cu
 
 EXEC_SERIAL=main-serial
 EXEC_PARALLEL=main-parallel
-EXEC_TEST=test
-EXEC_PARALLEL_COLLAB=parallel-collab
 
-COMPILER_PARALLEL_COLLAB=nvcc
 COMPILER_PARALLEL=nvcc
 COMPILER_SERIAL=gcc
 
-
-
 # Compile program.
-compile-parallel-collab:
-	${COMPILER_PARALLEL_COLLAB} -rdc=true -o ./bin/${EXEC_PARALLEL_COLLAB} ${SOURCE_PARALLEL_COLLAB}
 compile-serial:
 	${COMPILER_SERIAL} -o ./bin/${EXEC_SERIAL} ${SOURCE_SERIAL}
 compile-parallel:
-	${COMPILER_PARALLEL} -rdc=true -o ./bin/${EXEC_PARALLEL} ${SOURCE_PARALLEL} ./src/lib/matrix.cu ./src/lib/utils.cu ./src/lib/bitonic_sort.cu
+	${COMPILER_PARALLEL} -o ./bin/${EXEC_PARALLEL} ${SOURCE_PARALLEL}
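+# For reference (expansion of the variables above), this recipe runs:
+#   nvcc -o ./bin/main-parallel ./src/parallel.cu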
 
 # Link program.
-link-parallel-collab : compile-parallel-collab
 link-serial: compile-serial
 link-parallel: compile-parallel
 
 # Compile and link.
 install-serial: compile-serial link-serial
 install-parallel: compile-parallel link-parallel
-install-parallel-collab: compile-parallel-collab link-parallel-collab
 
 # Clean bin folder.
 clean:
@@ -45,11 +34,6 @@ run-parallel: install-parallel
 	./bin/${EXEC_PARALLEL}
 run-parallel-example: install-parallel
 	./bin/${EXEC_PARALLEL} < ./testcase/Example-TC
-run-parallel-collab: install-parallel-collab
-	./bin/${EXEC_PARALLEL_COLLAB}
-run-parallel-collab-example: install-parallel-collab
-	./bin/${EXEC_PARALLEL_COLLAB} < ./testcase/Example-TC
-
 
 # Example testcase.
 TC-example: 
@@ -87,17 +71,5 @@ TC3-parallel: install-parallel
 TC4-parallel: install-parallel
 	(./bin/${EXEC_PARALLEL} < ./testcase/K01-03-TC4) > ./result/K01-03-TC4_parallel.txt
 
-# Generic parallel testcase.
-TC1-parallel-collab: install-parallel-collab
-	(./bin/${EXEC_PARALLEL_COLLAB} < ./testcase/K01-03-TC1) > ./result/K01-03-TC1_parallel_collab.txt
-TC2-parallel-collab: install-parallel-collab
-	(./bin/${EXEC_PARALLEL_COLLAB} < ./testcase/K01-03-TC2) > ./result/K01-03-TC2_parallel_collab.txt
-TC3-parallel-collab: install-parallel-collab
-	(./bin/${EXEC_PARALLEL_COLLAB}  < ./testcase/K01-03-TC3) > ./result/K01-03-TC3_parallel_collab.txt
-TC4-parallel-collab: install-parallel-collab
-	(./bin/${EXEC_PARALLEL_COLLAB} < ./testcase/K01-03-TC4) > ./result/K01-03-TC4_parallel_collab.txt
-
 TC-parallel: TC1-parallel TC2-parallel TC3-parallel TC4-parallel
-TC-parallel-collab: TC1-parallel-collab TC2-parallel-collab TC3-parallel-collab TC4-parallel-collab
 TC-generic: TC-serial TC-parallel
-TC-generic-collab: TC-serial TC-parallel TC-parallel-collab
diff --git a/result/K01-03-TC1_parallel.txt b/result/K01-03-TC1_parallel.txt
index 04a2a8a057522cd947d63637586f766961593a41..f47a14b1fba197f5910886390a2c72b16296df45 100644
--- a/result/K01-03-TC1_parallel.txt
+++ b/result/K01-03-TC1_parallel.txt
@@ -3,4 +3,4 @@
 10114197
 10323010
 
-Runtime: 0.155827 s
+Runtime: 0.167720 s
diff --git a/result/K01-03-TC1_parallel_collab.txt b/result/K01-03-TC1_parallel_collab.txt
deleted file mode 100644
index f91c91498972f9821048fe87fcfa55f8541102bd..0000000000000000000000000000000000000000
--- a/result/K01-03-TC1_parallel_collab.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-8539213
-11916317
-10114197
-10323010
-
-Runtime: 0.144829 s
diff --git a/result/K01-03-TC1_serial.txt b/result/K01-03-TC1_serial.txt
index 989f60d71fa1e0c0241fe48f4002911dc70b42a3..b22612021dc7582e8673daa8aadf33bdd2f5e7e1 100644
--- a/result/K01-03-TC1_serial.txt
+++ b/result/K01-03-TC1_serial.txt
@@ -3,4 +3,4 @@
 10114197
 10323010
 
-Runtime: 0.014589 s
+Runtime: 0.017340 s
diff --git a/result/K01-03-TC2_parallel.txt b/result/K01-03-TC2_parallel.txt
index b594a3766218ae5fd38542ac06209b35e37a0d08..45430c0a83191e51aabdbea83844377227e3c51f 100644
--- a/result/K01-03-TC2_parallel.txt
+++ b/result/K01-03-TC2_parallel.txt
@@ -3,4 +3,4 @@
 37739803
 38222937
 
-Runtime: 0.833981 s
+Runtime: 0.598738 s
diff --git a/result/K01-03-TC2_parallel_collab.txt b/result/K01-03-TC2_parallel_collab.txt
deleted file mode 100644
index aafe92c5fb785c3edd4df7804b7738e67538871e..0000000000000000000000000000000000000000
--- a/result/K01-03-TC2_parallel_collab.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-35064588
-46265294
-37739803
-38222937
-
-Runtime: 0.422364 s
diff --git a/result/K01-03-TC2_serial.txt b/result/K01-03-TC2_serial.txt
index d21e02c539226977d1c4530c9f3d311952f5e585..2c5f38b28e7ed24903bfb446e41026496d512a0a 100644
--- a/result/K01-03-TC2_serial.txt
+++ b/result/K01-03-TC2_serial.txt
@@ -3,4 +3,4 @@
 37739803
 38222937
 
-Runtime: 0.689801 s
+Runtime: 0.883971 s
diff --git a/result/K01-03-TC3_parallel.txt b/result/K01-03-TC3_parallel.txt
index bff74c68d7ed3ca321d3c8ab9652e9188c8e3c3b..c625ff6e84c4486eaa63b71721ea450db4696cc0 100644
--- a/result/K01-03-TC3_parallel.txt
+++ b/result/K01-03-TC3_parallel.txt
@@ -3,4 +3,4 @@
 23198319
 23380111
 
-Runtime: 0.988793 s
+Runtime: 0.881576 s
diff --git a/result/K01-03-TC3_parallel_collab.txt b/result/K01-03-TC3_parallel_collab.txt
deleted file mode 100644
index 98ed976ee323820459b37cd55f3de94c9d95a6ce..0000000000000000000000000000000000000000
--- a/result/K01-03-TC3_parallel_collab.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-18815130
-28707695
-23198319
-23380111
-
-Runtime: 0.644663 s
diff --git a/result/K01-03-TC3_serial.txt b/result/K01-03-TC3_serial.txt
index 7cb8cc056c333f053e33a019358c42312018d65e..d7059bc7ec7d3bb01be089534f4016aeb4b5aba0 100644
--- a/result/K01-03-TC3_serial.txt
+++ b/result/K01-03-TC3_serial.txt
@@ -3,4 +3,4 @@
 23198319
 23380111
 
-Runtime: 0.815450 s
+Runtime: 1.182247 s
diff --git a/result/K01-03-TC4_parallel.txt b/result/K01-03-TC4_parallel.txt
index b1f42bf07ab2b8af5ba8ecf6f68fa3c04dd2d74e..575c491dc91102ec205c9741d9a0460f0050e8e3 100644
--- a/result/K01-03-TC4_parallel.txt
+++ b/result/K01-03-TC4_parallel.txt
@@ -3,4 +3,4 @@
 51451884
 51774352
 
-Runtime: 9.862558 s
+Runtime: 4.115475 s
diff --git a/result/K01-03-TC4_parallel_collab.txt b/result/K01-03-TC4_parallel_collab.txt
deleted file mode 100644
index af651cae921a3cf2ce03c8c4da198cf005c0ce95..0000000000000000000000000000000000000000
--- a/result/K01-03-TC4_parallel_collab.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-41250811
-71841136
-51451884
-51774352
-
-Runtime: 4.018500 s
diff --git a/result/K01-03-TC4_serial.txt b/result/K01-03-TC4_serial.txt
index 789cfe32519f4f3f2f09d530e824f62572d97e15..f66713b7003c1a9713cdecd91c5144ee87f39443 100644
--- a/result/K01-03-TC4_serial.txt
+++ b/result/K01-03-TC4_serial.txt
@@ -3,4 +3,4 @@
 51451884
 51774352
 
-Runtime: 8.795971 s
+Runtime: 9.939554 s
diff --git a/src/lib/brick_sort.cu b/src/lib/brick_sort.cu
deleted file mode 100644
index 08d9b64a775db81e4d8c725e81fd223ee510ba1a..0000000000000000000000000000000000000000
--- a/src/lib/brick_sort.cu
+++ /dev/null
@@ -1,152 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <time.h>
-
-#define THREADS 1024
-#define NUM_VALS 100000
-
-// PROBLEM: THE NUMBER OF BLOCKS MUST NOT EXCEED 1.
-
-void print_elapsed(clock_t start, clock_t stop)
-{
-  double elapsed = ((double)(stop - start)) / CLOCKS_PER_SEC;
-  printf("Elapsed time: %.3fs\n", elapsed);
-}
-
-int checksum(int *arr, int size)
-{
-  int sum = 0;
-  for (int i = 0; i < size; i++)
-    sum += arr[i];
-  return sum;
-}
-
-int random_int()
-{
-  return (int)rand() / (int)RAND_MAX;
-}
-
-void array_print(int *arr, int length)
-{
-  int i;
-  for (i = 0; i < length; ++i)
-  {
-    printf("%d ", arr[i]);
-  }
-  printf("\n");
-}
-
-void array_fill(int *arr, int length)
-{
-  srand(time(NULL));
-  int i;
-  for (i = 0; i < length; ++i)
-  {
-    arr[i] = rand();
-  }
-}
-
-
-__device__ void swap(int *arr, int i, int j)
-{
-  int temp = arr[i];
-  arr[i] = arr[j];
-  arr[j] = temp;
-}
-
-
-__global__ void brick_sort_even(int *d_arr, int length)
-{
-  int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (i >= length - 1)
-  {
-    return;
-  }
-
-  if (i % 2 == 0)
-  {
-    if (d_arr[i] > d_arr[i + 1])
-    {
-      swap(d_arr, i, i + 1);
-    }
-  }
-
-}
-
-__global__ void brick_sort_odd(int *d_arr, int length)
-{
-  int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (i >= length - 1)
-  {
-    return;
-  }
-
-  if (i % 2 != 0)
-  {
-    if (d_arr[i] > d_arr[i + 1])
-    {
-      swap(d_arr, i, i + 1);
-    }
-  }
-}
-
-void brick_sort(int *arr, int length) {
-  int blocks = (NUM_VALS + THREADS - 1) / THREADS;
-  size_t size = length * sizeof(int);
-
-  int *d_arr;
-  cudaMalloc((void **)&d_arr, size);
-  cudaMemcpy(d_arr, arr, size, cudaMemcpyHostToDevice);
-
-  for (int i = 0; i < NUM_VALS / 2; ++i)
-  {
-    brick_sort_odd<<<blocks, THREADS>>>(d_arr, NUM_VALS);
-    brick_sort_even<<<blocks, THREADS>>>(d_arr, NUM_VALS);
-  }
-
-  cudaMemcpy(arr, d_arr, size, cudaMemcpyDeviceToHost);
-  cudaFree(d_arr);
-}
-
-
-int driver(void)
-{
-  clock_t start, stop;
-  
-  int *arr = (int *)malloc(NUM_VALS * sizeof(int));
-  array_fill(arr, NUM_VALS);
-
-  int checksum1 = checksum(arr, NUM_VALS);
-
-  start = clock();
-  brick_sort(arr, NUM_VALS);
-  stop = clock();
-
-  int checksum2 = checksum(arr, NUM_VALS);
-
-  print_elapsed(start, stop);
-
-  bool passed_sort = true;
-  bool passed_checksum = true;
-
-  for (int i = 1; i < NUM_VALS; i++)
-  {
-    if (arr[i - 1] > arr[i])
-    {
-      passed_sort = false;
-    }
-  }
-
-  if (checksum1 != checksum2)
-  {
-    passed_checksum = false;
-    printf("%d != %d\n", checksum1, checksum2);
-  }
-
-  // array_print(arr, NUM_VALS);
-
-  printf("Test %s\n", passed_sort ? "PASSED SORT" : "FAILED SORT");
-  printf("Test %s\n", passed_checksum ? "PASSED CHECKSUM" : "FAILED CHECKSUM");
-}
diff --git a/src/lib/brick_sort.cuh b/src/lib/brick_sort.cuh
deleted file mode 100644
index bb088368e8417d8a4f1e99a50d84db97d521dd24..0000000000000000000000000000000000000000
--- a/src/lib/brick_sort.cuh
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _BRICK_SORT_H_
-#define _BRICK_SORT_H_
-
-void brick_sort(int *d_arr, int length);
-
-#endif
\ No newline at end of file
diff --git a/src/parallel.cu b/src/parallel.cu
index d6d0bb217003fe6d0798645722ca760a2e61db1c..541b60cf8047682333573976c80ef6dbc1e15295 100644
--- a/src/parallel.cu
+++ b/src/parallel.cu
@@ -1,54 +1,688 @@
-// serial.c
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>
-
-#include "./lib/bitonic_sort.cuh"
-#include "./lib/matrix.cuh"
-#include "./lib/utils.cuh"
-
-
-int main() {
-	// Time.
-	clock_t t;
-    t = clock();
-
-	int kernel_row, kernel_col, target_row, target_col, num_targets;
-	
-	// reads kernel's row and column and initalize kernel matrix from input
-	scanf("%d %d", &kernel_row, &kernel_col);
-	Matrix kernel = input_matrix(kernel_row, kernel_col);
-	
-	// reads number of target matrices and their dimensions.
-	// initialize array of matrices and array of data ranges (int)
-	scanf("%d %d %d", &num_targets, &target_row, &target_col);
-	Matrix* arr_mat = (Matrix*)malloc(num_targets * sizeof(Matrix));
-	int *arr_range = (int*)malloc(num_targets * sizeof(int));
-	
-	// read each target matrix, compute their convolution matrices, and compute their data ranges
-	for (int i = 0; i < num_targets; i++) {
-		arr_mat[i] = input_matrix(target_row, target_col);
-		arr_mat[i] = convolution(&kernel, &arr_mat[i]);
-		arr_range[i] = get_matrix_datarange(&arr_mat[i]); 
-	}
-
-	// sort the data range array
-	bitonic_sort(arr_range, num_targets);
-	
-	int median = get_median(arr_range, num_targets);	
-	int floored_mean = get_floored_mean(arr_range, num_targets); 
-
-	// print the min, max, median, and floored mean of data range array
-	printf("%d\n%d\n%d\n%d\n", 
-			arr_range[0], 
-			arr_range[num_targets - 1], 
-			median, 
-			floored_mean);
-
-	// Print execution time in seconds.
-	t = clock() - t;
-	printf("\nRuntime: %f s\n", ((float)t) / CLOCKS_PER_SEC);
-	
-	return 0;
-}
+// parallel.cu
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <math.h>
+#include <limits.h>
+
+#define NMAX 100
+#define DATAMAX 1000
+#define DATAMIN -1000
+
+/* 
+ * Struct Matrix
+ *
+ * Matrix representation consists of matrix data 
+ * and effective dimensions 
+ * */
+typedef struct Matrix {
+	int mat[NMAX][NMAX];	// Matrix cells
+	int row_eff;			// Matrix effective row
+	int col_eff;			// Matrix effective column
+} Matrix;
+
+
+/* 
+ * Procedure init_matrix
+ * 
+ * Initializing newly allocated matrix
+ * Setting all data to 0 and effective dimensions according
+ * to nrow and ncol 
+ * */
+void init_matrix(Matrix *m, int nrow, int ncol) {
+	m->row_eff = nrow;
+	m->col_eff = ncol;
+
+	for (int i = 0; i < m->row_eff; i++) {
+		for (int j = 0; j < m->col_eff; j++) {
+			m->mat[i][j] = 0;
+		}
+	}
+}
+
+
+/* 
+ * Function input_matrix
+ *
+ * Returns a matrix with values from stdin input
+ * */
+Matrix input_matrix(int nrow, int ncol) {
+	Matrix input;
+	init_matrix(&input, nrow, ncol);
+
+	for (int i = 0; i < nrow; i++) {
+		for (int j = 0; j < ncol; j++) {
+			scanf("%d", &input.mat[i][j]);
+		}
+	}
+
+	return input;
+}
+
+
+/* 
+ * Procedure print_matrix
+ * 
+ * Print matrix data
+ * */
+void print_matrix(Matrix *m) {
+	for (int i = 0; i < m->row_eff; i++) {
+		for (int j = 0; j < m->col_eff; j++) {
+			printf("%d ", m->mat[i][j]);
+		}
+		printf("\n");
+	}
+}
+
+
+/* 
+ * Function get_matrix_datarange
+ *
+ * Returns the range between maximum and minimum
+ * element of a matrix
+ * */
+int get_matrix_datarange(Matrix *m) {
+	int max = DATAMIN;
+	int min = DATAMAX;
+	for (int i = 0; i < m->row_eff; i++) {
+		for (int j = 0; j < m->col_eff; j++) {
+			int el = m->mat[i][j];
+			if (el > max) max = el;
+			if (el < min) min = el;
+		}
+	}
+
+	return max - min;
+}
+
+
+/*
+ * Function supression_op
+ *
+ * Returns the intermediate sum of the special multiplication
+ * operation where kernel[0][0] corresponds to target[row][col]
+ * */
+int supression_op(Matrix *kernel, Matrix *target, int row, int col) {
+	int intermediate_sum = 0;
+	for (int i = 0; i < kernel->row_eff; i++) {
+		for (int j = 0; j < kernel->col_eff; j++) {
+			intermediate_sum += kernel->mat[i][j] * target->mat[row + i][col + j];
+		}
+	}
+
+	return intermediate_sum;
+}
+
+
+/* 
+ * Function convolution
+ *
+ * Return the output matrix of convolution operation
+ * between kernel and target
+ * */
+Matrix convolution(Matrix *kernel, Matrix *target) {
+	Matrix out;
+	int out_row_eff = target->row_eff - kernel->row_eff + 1;
+	int out_col_eff = target->col_eff - kernel->col_eff + 1;
+	
+	init_matrix(&out, out_row_eff, out_col_eff);
+
+	for (int i = 0; i < out.row_eff; i++) {
+		for (int j = 0; j < out.col_eff; j++) {
+			out.mat[i][j] = supression_op(kernel, target, i, j);
+		}
+	}
+
+	return out;
+}
+
+
+/**
+ * Swap the values of two elements in an array.
+ * 
+ * @param d_arr - the array.
+ * @param i - the index of the first element.
+ * @param j - the index of the second element.
+ */
+__device__ void swap(int *d_arr, int i, int j)
+{
+  int temp = d_arr[i];
+  d_arr[i] = d_arr[j];
+  d_arr[j] = temp;
+}
+
+/**
+ * Pad the remaining empty value in the array with the maximum value.
+ * 
+ * @param arr - the array.
+ * @param length - the number of elements inside the array.
+ * @param length_buffer - the length of the buffer.
+ */
+void padding_array(int *arr, int length, int length_buffer)
+{
+  for (int i = length; i < length_buffer; i++) {
+    arr[i] = INT_MAX;
+  }
+}
+
+/**
+ * Copy the array from src to dest and pad the remaining empty value with the maximum value.
+ * 
+ * @param dest - the destination array.
+ * @param src - the source array.
+ * @param length - the number of elements inside the src array.
+ * @param length_buffer - the length of the buffer.
+ */
+void copy_padding(int *dest, int *src, int length, int length_buffer)
+{
+  for (int i = 0; i < length; i++) {
+    dest[i] = src[i];
+  }
+  padding_array(dest, length, length_buffer);
+}
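+// Illustrative note (editor's sketch, hypothetical values): copying
+// src = {7, 3} (length 2) into a buffer with length_buffer = 4 yields
+// {7, 3, INT_MAX, INT_MAX}, so the padding sorts to the top of the array
+// and never displaces real values.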
+
+/**
+ * Get the number of minimum blocks as a power of 2.
+ * 
+ * @param length - the number of elements inside the array.
+ * @param num_threads - the number of threads.
+ * @return int - the number of minimum blocks needed.
+ */
+int minimum_blocks(int length, int num_threads) {
+  int num_blocks = 1;
+
+  // Increase the number of blocks 2 times, so it will be a power of 2.
+  while (num_blocks * num_threads < length) {
+    num_blocks *= 2;
+  }
+
+  return num_blocks;
+}
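+// Worked example (editor's note, hypothetical values): for length = 100000
+// and num_threads = 1024, num_blocks doubles 1 -> 2 -> ... -> 128, since
+// 128 * 1024 = 131072 is the first power-of-two multiple of 1024 covering
+// the length. The padded buffer is then a power of two, as the bitonic
+// network requires.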
+
+/**
+ * Perform one step of the bitonic sort.
+ * 
+ * @param d_arr - array in the device to be sorted.
+ * @param i - major step index.
+ * @param j - minor step index.
+ */
+__global__ void bitonic_sort_step(int *d_arr, int i, int j)
+{
+  // The array index and its partner.
+  int idx, partner;
+
+  // The thread index.
+  idx = threadIdx.x + blockDim.x * blockIdx.x;
+
+  // The thread index of the partner.
+  partner = idx ^ j;
+
+  // Only the thread with the lower index of each pair does the compare-swap.
+  if (idx < partner) {
+    if ((idx & i) == 0) {
+      // Sort ascending.
+      if (d_arr[idx] > d_arr[partner]) {
+        swap(d_arr, idx, partner);
+      }
+    }
+    if ((idx & i) != 0) {
+      // Sort descending.
+      if (d_arr[idx] < d_arr[partner]) {
+        swap(d_arr, idx, partner);
+      }
+    }
+  }
+}
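+// Worked example (editor's note): with i = 2, j = 1, thread 0 pairs with
+// partner 0 ^ 1 = 1 and, since 0 & 2 == 0, orders the pair ascending;
+// thread 2 pairs with 3 and, since 2 & 2 != 0, orders it descending,
+// producing the alternating runs the next bitonic stage merges.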
+
+/**
+ * Perform a bitonic sort on the array.
+ *
+ * @param h_arr The host array to sort.
+ * @param length The length of the array.
+ */
+void bitonic_sort(int *h_arr, int length)
+{
+  // Initialize the constants variable.
+  const int threads = 1024;
+  const int blocks = minimum_blocks(length, threads);
+  const int buffer_length = threads * blocks;
+
+  // Initialize the memory size of the array.
+  size_t size = length * sizeof(int);
+  size_t buffer_size = buffer_length * sizeof(int);
+
+  // Create the buffer array and pad with maximum value of Int.
+  int *h_buffer = (int *)malloc(buffer_size);
+  copy_padding(h_buffer, h_arr, length, buffer_length);
+
+  // Allocate and copy array into device memory.
+  int *d_arr;
+  cudaMalloc((void**) &d_arr, buffer_size);
+  cudaMemcpy(d_arr, h_buffer, buffer_size, cudaMemcpyHostToDevice);
+
+  // Sort the array using bitonic_sort_step.
+  int i, j;
+  // The major step.
+  for (i = 2; i <= buffer_length; i *= 2) {
+    // The minor step.
+    for (j = i / 2; j > 0; j = j / 2) {
+      bitonic_sort_step<<<blocks, threads>>>(d_arr, i, j);
+    }
+  }
+
+  // Copy the values back to the host.
+  cudaMemcpy(h_arr, d_arr, size, cudaMemcpyDeviceToHost);
+
+  // Free device memory.
+  cudaFree(d_arr);
+  free(h_buffer);
+}
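+// Usage sketch (editor's note, values are hypothetical): sorting
+// h_arr = {5, 3, 1, 4, 2} with length = 5 pads a 1024-element buffer with
+// INT_MAX, runs the network on the device, and copies back only the first
+// five elements, now {1, 2, 3, 4, 5}.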
+ 
+
+/* 
+ * Procedure print_array
+ *
+ * Prints all elements of array n of the given size to stdout
+ * */
+void print_array(int *n, int size) {
+	for (int i = 0; i < size; i++ ) printf("%d ", n[i]);
+	printf("\n");
+}
+
+
+/* 
+ * Function get_median
+ *
+ * Returns the median of array n of the given length (n must be sorted)
+ * */
+int get_median(int *n, int length) {
+	int mid = length / 2;
+	if (length & 1) return n[mid];
+
+	return (n[mid - 1] + n[mid]) / 2;
+}
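+// Example (editor's note): for sorted n = {1, 2, 3, 4} with length = 4,
+// mid = 2 and the median is (n[1] + n[2]) / 2 = 2 (integer division).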
+
+
+/* 
+ * Function get_floored_mean
+ *
+ * Returns floored mean from an array of integers
+ * */
+long get_floored_mean(int *n, int length) {
+	long sum = 0;
+	for (int i = 0; i < length; i++) {
+		sum += n[i];
+	}
+
+	return sum / length;
+}
+
+/**
+ * Function index_to_row_major 
+ * 
+ * Returns the index of a matrix element in row-major order 
+ */
+int index_to_row_major(int row, int col, int row_eff, int col_eff) {
+	return row * col_eff + col;
+}
+
+__device__ int d_index_to_row_major(int row, int col, int row_eff, int col_eff) {
+	return row * col_eff + col;
+}
+
+/**
+ * Function row_major_to_index
+ * 
+ * Computes the row and column of a matrix element from its row-major index
+ */
+void row_major_to_index(int index, int row_eff, int col_eff, int *row, int *col) {
+	*row = index / col_eff;
+	*col = index % col_eff;
+}
+
+__device__ void d_row_major_to_index(int index, int row_eff, int col_eff, int *row, int *col) {
+	*row = index / col_eff;
+	*col = index % col_eff;
+}
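+// Worked example (editor's note): in a 3x4 matrix, (row 1, col 2) maps to
+// 1 * 4 + 2 = 6, and index 6 maps back to (6 / 4, 6 % 4) = (1, 2), so the
+// two mappings above are inverses of each other.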
+
+/**
+ * Function map_matrix
+ * 
+ * Returns a row major matrix of the input matrix.
+ **/
+int* map_matrix(int mat[][100], int row, int col) {
+	int* map = (int*) malloc(sizeof(int) * row * col);
+	for (int i = 0; i < row; i++) {
+		for (int j = 0; j < col; j++) {
+			map[index_to_row_major(i, j, row, col)] = mat[i][j];
+		}
+	}
+	return map;
+}
+
+/*
+ * Function map_matrix_extended
+ * 
+ * Returns a row major matrix of the input matrix.
+ **/
+int* map_matrix_extended(int** mat, int row, int col) {
+	int* map = (int*) malloc(sizeof(int) * row * col);
+	for (int i = 0; i < row; i++) {
+		for (int j = 0; j < col; j++) {
+			map[index_to_row_major(i, j, row, col)] = mat[i][j];
+		}
+	}
+	return map;
+}
+
+
+/**
+ * Function reverse_map_matrix
+ * 
+ * Returns a matrix of the input row major matrix.
+ */
+int** reverse_map_matrix(int* map, int row, int col) {
+	int** mat = (int**) malloc(sizeof(int*) * row);
+	for (int i = 0; i < row; i++) {
+		mat[i] = (int*) malloc(sizeof(int) * col);
+		for (int j = 0; j < col; j++) {
+			mat[i][j] = map[index_to_row_major(i, j, row, col)];
+		}
+	}
+	return mat;
+}
+
+/**
+ * Function rm_to_matrix_object
+ * 
+ * Return Matrix struct of row major matrix
+ */
+Matrix rm_to_matrix_object(int* map, int row, int col) {
+	Matrix mat;
+	init_matrix(&mat, row, col);
+	for (int i = 0; i < row; i++) {
+		for (int j = 0; j < col; j++) {
+			mat.mat[i][j] = map[index_to_row_major(i, j, row, col)];
+		}
+	}
+	return mat;
+}
+
+/**
+ * Function rm_to_list_matrix_object
+ * 
+ * Return List of Matrix Struct of row major matrix
+ */
+Matrix* rm_to_list_matrix_object(int* map, int row, int col, int row_inner, int col_inner) {
+	Matrix* mat = (Matrix*) malloc(sizeof(Matrix) * row);
+	for (int i = 0; i < row; i++) {
+		init_matrix(&mat[i], row_inner, col_inner);
+		int pad = i * col;
+		for (int j = 0; j < row_inner; j++) {
+			for (int k = 0; k < col_inner; k++) {
+				int index = index_to_row_major(j, k, row_inner, col_inner) + pad;
+				mat[i].mat[j][k] = map[index];
+			}
+		}
+	}
+	return mat;
+}
+/**
+ * Function list_matrix_object_to_rm
+ * 
+ * Return row major matrix of list of Matrix struct
+ */
+int* list_matrix_object_to_rm(Matrix* mat, int num_matrix,  int row_inner, int col_inner) {
+	int* map = (int*) malloc(sizeof(int) * num_matrix * row_inner * col_inner);
+	for (int i = 0; i < num_matrix; i++) {
+		int pad = i * row_inner * col_inner;
+		for (int j = 0; j < row_inner; j++) {
+			for (int k = 0; k < col_inner; k++) {
+				int index = index_to_row_major(j, k, row_inner, col_inner) + pad;
+				map[index] = mat[i].mat[j][k];
+			}
+		}
+	}
+	
+	return map;
+}
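+// Layout example (editor's note, hypothetical sizes): flattening two 2x3
+// matrices produces a 12-element buffer; matrix 1 occupies flat indices
+// 6..11, so its element [0][2] lands at 0 * 3 + 2 + 6 = 8.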
+
+/**
+ * Function cuda_convolution
+ * 
+ * Host reference for the device kernel below: convolves target matrix
+ * curr_mat with the kernel into the row-major output buffer
+ */
+void cuda_convolution(int* out_mat_rm, int* arr_mat_rm, int* kernel_rm, int row_eff, int col_eff, int kernel_row, int kernel_col, int curr_mat) {
+	// Calculate real row and column of input matrix.
+	int row = row_eff + kernel_row - 1;
+	int col = col_eff + kernel_col - 1;
+	
+	// Calculate padding target and output matrix.
+	int pad = curr_mat * row * col;
+	int pad_out = curr_mat * row_eff * col_eff;
+
+	// For each element of the output matrix:
+	for (int i = 0 ; i < row_eff; i++) {
+		for (int j = 0; j < col_eff; j++) {
+			// Convolution of the element with the kernel.
+			// Calculate the sum of the kernel and the input matrix.
+			int intermediate_sum = 0;
+			for (int k = 0; k < kernel_row; k++) {
+				for (int l = 0; l < kernel_col; l++) {
+					int index = index_to_row_major(i + k, j + l, row, col) + pad;
+					int kernel_index = index_to_row_major(k, l, kernel_row, kernel_col);
+					intermediate_sum += arr_mat_rm[index] * kernel_rm[kernel_index];
+				}
+			}
+			// Store the sum in the output matrix.
+			out_mat_rm[index_to_row_major(i, j, row_eff, col_eff) + pad_out ] = intermediate_sum;
+		}
+	}
+}
+
+__global__ void d_cuda_convolution(int* d_out_mat, int* arr_mat_rm, int* kernel_rm, int row_eff, int col_eff, int kernel_row, int kernel_col) {
+	// Calculate real row and column of the input matrix.
+	int row = row_eff + kernel_row - 1;
+	int col = col_eff + kernel_col - 1;
+
+	// Determine the current matrix from the block index.
+	int curr_mat = blockIdx.y;
+	// Calculate padding of the target and output matrix.
+	int pad = curr_mat * row * col;
+	int pad_out = curr_mat * row_eff * col_eff;
+
+	// Get i and j from the thread index.
+	int tid = blockIdx.x * blockDim.x + threadIdx.x;
+	int i, j;
+	d_row_major_to_index(tid, row_eff, col_eff, &i, &j);
+
+	// Convolve this thread's output element with the kernel.
+	if (i < row_eff && j < col_eff) {
+		int intermediate_sum = 0;
+		for (int k = 0; k < kernel_row; k++) {
+			for (int l = 0; l < kernel_col; l++) {
+				int index = d_index_to_row_major(i + k, j + l, row, col) + pad;
+				int kernel_index = d_index_to_row_major(k, l, kernel_row, kernel_col);
+				intermediate_sum += arr_mat_rm[index] * kernel_rm[kernel_index];
+			}
+		}
+		d_out_mat[d_index_to_row_major(i, j, row_eff, col_eff) + pad_out] = intermediate_sum;
+	}
+}
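+// Launch geometry (editor's note): main() launches this kernel with
+// grid = (block_size, num_targets) and 1024-thread blocks, so blockIdx.y
+// selects the matrix and grid.x * blockDim.x covers the row_eff * col_eff
+// output elements; e.g. a 32x32 output fits in a single 1024-thread block.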
+
+/* 
+ * Function d_get_matrix_datarange
+ *
+ * Writes the range between the maximum and minimum element
+ * of every matrix in out_mat into d_arr_range
+ * */
+__global__ void d_get_matrix_datarange(int* out_mat, int* d_arr_range, int row_eff, int col_eff) {
+	// Determine the current matrix from the block index.
+	int curr_mat = blockIdx.y;
+	// Calculate padding of the output matrix and the arr_range array.
+	int pad = curr_mat * row_eff * col_eff;
+	int pad_arr_range = curr_mat;
+
+	// Get i and j from the thread index.
+	int tid = blockIdx.x * blockDim.x + threadIdx.x;
+	int i, j;
+	d_row_major_to_index(tid, row_eff, col_eff, &i, &j);
+
+	// Get the data range of the current output matrix.
+	if (i < row_eff && j < col_eff) {
+		int max = DATAMIN;
+		int min = DATAMAX;
+		for (int k = 0; k < row_eff; k++) {
+			for (int l = 0; l < col_eff; l++) {
+				int index = d_index_to_row_major(i + k, j + l, row_eff, col_eff) + pad;
+				int el = out_mat[index];
+				if (el > max) max = el;
+				if (el < min) min = el;
+			}
+		}
+
+		d_arr_range[d_index_to_row_major(i, j, row_eff, col_eff) + pad_arr_range] = max - min;
+	}
+}
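+// Note (editor's): main() launches this kernel with grid = (1, num_targets)
+// and a single thread per block, so tid = 0, (i, j) = (0, 0), and the loops
+// scan the whole row_eff x col_eff matrix; each matrix's range lands in
+// d_arr_range[curr_mat].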
+
+// main() driver
+int main() {
+	// Time.
+	clock_t t = clock();
+
+	int kernel_row, kernel_col, target_row, target_col, num_targets;
+	
+	// reads kernel's row and column and initialize kernel matrix from input
+	scanf("%d %d", &kernel_row, &kernel_col);
+	Matrix kernel = input_matrix(kernel_row, kernel_col);
+	
+	// reads number of target matrices and their dimensions,
+	// then initialize the array of target matrices
+	scanf("%d %d %d", &num_targets, &target_row, &target_col);
+	Matrix* arr_mat = (Matrix*)malloc(num_targets * sizeof(Matrix));
+	
+	// Calculate the launch configuration for the CUDA kernels.
+	int a = (target_row-kernel_row+1) * (target_col-kernel_col+1);
+	int b = 1024;
+	int block_size = a/b + (a % b != 0); // ceil(a/b): blocks needed per matrix
+	int threads_per_block = 1024;
+	int row_eff = target_row - kernel_row + 1;
+	int col_eff = target_col - kernel_col + 1;
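+	// Worked example (editor's note, hypothetical sizes): for a 75x75 output,
+	// a = 5625 and block_size = 5625/1024 + 1 = 6 blocks of 1024 threads.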
+
+	// Variable declarations; "rm" means the matrix is stored in row-major order.
+	int *arr_mat_rm = 0, *d_arr_mat_rm = 0, *out_mat_rm = 0, *d_out_mat_rm = 0, *kernel_rm = 0, *d_kernel_rm = 0, *arr_range = 0, *d_arr_range = 0;
+	int size_arr_mat, size_out_mat, size_kernel, size_arr_range;
+	cudaError err;
+
+	// Allocate input matrix in device and host.
+	size_arr_mat = num_targets * target_row * target_col;
+	arr_mat_rm = (int*)malloc(sizeof(int) * size_arr_mat);
+	cudaMalloc((void **)&d_arr_mat_rm, sizeof(int) * size_arr_mat);
+	if (arr_mat_rm == 0 || d_arr_mat_rm == 0) {
+		printf("Error: Memory allocation failed for arr_mat.\n");
+		return 1;
+	}
+
+	// Allocate output matrix in device and host.
+	size_out_mat = num_targets * row_eff * col_eff;
+	out_mat_rm = (int*)malloc(sizeof(int) * size_out_mat);
+	cudaMalloc((void **)&d_out_mat_rm, sizeof(int) * size_out_mat);
+	if (out_mat_rm == 0 || d_out_mat_rm == 0) {
+		printf("Error: Memory allocation failed for out_mat.\n");
+		return 1;
+	}
+	cudaMemset(d_out_mat_rm, 0, sizeof(int) * size_out_mat);
+
+
+	// Store kernel in row-major form (map_matrix allocates the host buffer)
+	// and allocate kernel for device.
+	size_kernel = kernel_row * kernel_col;
+	kernel_rm = map_matrix(kernel.mat, kernel_row, kernel_col);
+	cudaMalloc((void **)&d_kernel_rm, sizeof(int) * size_kernel);
+	if (kernel_rm == 0 || d_kernel_rm == 0) {
+		printf("Error: Memory allocation failed for kernel.\n");
+		return 1;
+	}
+	err = cudaMemcpy(d_kernel_rm, kernel_rm, sizeof(int) * size_kernel, cudaMemcpyHostToDevice);
+	if (err != cudaSuccess) {
+		printf("Error copy host to device: %s\n", cudaGetErrorString(err));
+		return 1;
+	}
+	
+	
+	// Read each target matrix.
+	for (int i = 0; i < num_targets; i++) {
+		arr_mat[i] = input_matrix(target_row, target_col);
+	}
+	// Store each target matrix in row-major form and copy it to the device
+	// (free the placeholder buffer first so it does not leak).
+	free(arr_mat_rm);
+	arr_mat_rm = list_matrix_object_to_rm(arr_mat, num_targets, target_row, target_col);
+	err = cudaMemcpy(d_arr_mat_rm, arr_mat_rm, sizeof(int) * size_arr_mat, cudaMemcpyHostToDevice);
+	if (err != cudaSuccess) {
+		printf("Error copy host to device: %s\n", cudaGetErrorString(err));
+		return 1;
+	}
+
+	// For each target matrix, compute their convolution matrices.
+	dim3 grid, block;
+	grid.x = block_size;
+	grid.y = num_targets;
+	block.x = threads_per_block;
+	d_cuda_convolution<<<grid, block>>>(d_out_mat_rm, d_arr_mat_rm, d_kernel_rm, row_eff, col_eff, kernel_row, kernel_col);
+	err = cudaMemcpy(out_mat_rm, d_out_mat_rm, sizeof(int) * size_out_mat, cudaMemcpyDeviceToHost);
+	if (err != cudaSuccess) {
+	 	printf("Error copy device to host: %s\n", cudaGetErrorString(err));
+	}
+
+	// Allocate arr_range matrix in device and host.
+	size_arr_range = num_targets;
+	arr_range = (int*)malloc(sizeof(int) * size_arr_range);
+	cudaMalloc((void **)&d_arr_range, sizeof(int) * size_arr_range);
+	if (arr_range == 0 || d_arr_range == 0) {
+		printf("Error: Memory allocation failed for arr_range.\n");
+		return 1;
+	}
+	cudaMemset(d_arr_range, 0, sizeof(int) * size_arr_range);
+
+	grid.x = 1;
+	grid.y = num_targets;
+	block.x = 1;
+	d_get_matrix_datarange<<<grid, block>>>(d_out_mat_rm, d_arr_range, row_eff, col_eff);
+	err = cudaMemcpy(arr_range, d_arr_range, sizeof(int) * size_arr_range, cudaMemcpyDeviceToHost);
+	if (err != cudaSuccess) {
+	 	printf("Error copy device to host 2: %s\n", cudaGetErrorString(err));
+	}
+
+	// sort the data range array
+	bitonic_sort(arr_range, num_targets);
+	
+	int median = get_median(arr_range, num_targets);	
+	int floored_mean = get_floored_mean(arr_range, num_targets); 
+
+	// print the min, max, median, and floored mean of data range array
+	printf("%d\n%d\n%d\n%d\n", 
+			arr_range[0], 
+			arr_range[num_targets - 1], 
+			median, 
+			floored_mean);
+
+	// Print execution time in seconds.
+	t = clock() - t;
+	printf("\nRuntime: %f s\n", ((float)t) / CLOCKS_PER_SEC);	
+	
+	// Free device and host memory.
+	cudaFree(d_arr_mat_rm);
+	cudaFree(d_out_mat_rm);
+	cudaFree(d_kernel_rm);
+	cudaFree(d_arr_range);
+	free(arr_mat); free(arr_mat_rm); free(out_mat_rm);
+	free(kernel_rm); free(arr_range);
+	return 0;
+}
diff --git a/src/parallel_collab.cu b/src/parallel_collab.cu
deleted file mode 100644
index ffb6074f45ef824a57c0e550ccbf228cc3530311..0000000000000000000000000000000000000000
--- a/src/parallel_collab.cu
+++ /dev/null
@@ -1,689 +0,0 @@
-// parallel_collab.cu
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>
-#include <math.h>
-
-#define NMAX 100
-#define DATAMAX 1000
-#define DATAMIN -1000
-
-/* 
- * Struct Matrix
- *
- * Matrix representation consists of matrix data 
- * and effective dimensions 
- * */
-typedef struct Matrix {
-	int mat[NMAX][NMAX];	// Matrix cells
-	int row_eff;			// Matrix effective row
-	int col_eff;			// Matrix effective column
-} Matrix;
-
-
-/* 
- * Procedure init_matrix
- * 
- * Initializing newly allocated matrix
- * Setting all data to 0 and effective dimensions according
- * to nrow and ncol 
- * */
-void init_matrix(Matrix *m, int nrow, int ncol) {
-	m->row_eff = nrow;
-	m->col_eff = ncol;
-
-	for (int i = 0; i < m->row_eff; i++) {
-		for (int j = 0; j < m->col_eff; j++) {
-			m->mat[i][j] = 0;
-		}
-	}
-}
-
-
-/* 
- * Function input_matrix
- *
- * Returns a matrix with values from stdin input
- * */
-Matrix input_matrix(int nrow, int ncol) {
-	Matrix input;
-	init_matrix(&input, nrow, ncol);
-
-	for (int i = 0; i < nrow; i++) {
-		for (int j = 0; j < ncol; j++) {
-			scanf("%d", &input.mat[i][j]);
-		}
-	}
-
-	return input;
-}
-
-
-/* 
- * Procedure print_matrix
- * 
- * Print matrix data
- * */
-void print_matrix(Matrix *m) {
-	for (int i = 0; i < m->row_eff; i++) {
-		for (int j = 0; j < m->col_eff; j++) {
-			printf("%d ", m->mat[i][j]);
-		}
-		printf("\n");
-	}
-}
-
-
-/* 
- * Function get_matrix_datarange
- *
- * Returns the range between maximum and minimum
- * element of a matrix
- * */
-int get_matrix_datarange(Matrix *m) {
-	int max = DATAMIN;
-	int min = DATAMAX;
-	for (int i = 0; i < m->row_eff; i++) {
-		for (int j = 0; j < m->col_eff; j++) {
-			int el = m->mat[i][j];
-			if (el > max) max = el;
-			if (el < min) min = el;
-		}
-	}
-
-	return max - min;
-}
-
-
-/*
- * Function supression_op
- *
- * Returns the sum of intermediate value of special multiplication
- * operation where kernel[0][0] corresponds to target[row][col]
- * */
-int supression_op(Matrix *kernel, Matrix *target, int row, int col) {
-	int intermediate_sum = 0;
-	for (int i = 0; i < kernel->row_eff; i++) {
-		for (int j = 0; j < kernel->col_eff; j++) {
-			intermediate_sum += kernel->mat[i][j] * target->mat[row + i][col + j];
-		}
-	}
-
-	return intermediate_sum;
-}
-
-
-/* 
- * Function convolution
- *
- * Return the output matrix of convolution operation
- * between kernel and target
- * */
-Matrix convolution(Matrix *kernel, Matrix *target) {
-	Matrix out;
-	int out_row_eff = target->row_eff - kernel->row_eff + 1;
-	int out_col_eff = target->col_eff - kernel->col_eff + 1;
-	
-	init_matrix(&out, out_row_eff, out_col_eff);
-
-	for (int i = 0; i < out.row_eff; i++) {
-		for (int j = 0; j < out.col_eff; j++) {
-			out.mat[i][j] = supression_op(kernel, target, i, j);
-		}
-	}
-
-	return out;
-}
-
-
-/**
- * Swap the values of two elements in an array.
- * 
- * @param d_arr - the array.
- * @param i - the index of the first element.
- * @param j - the index of the second element.
- */
-__device__ void swap(int *d_arr, int i, int j)
-{
-  int temp = d_arr[i];
-  d_arr[i] = d_arr[j];
-  d_arr[j] = temp;
-}
-
-/**
- * Pad the remaining empty value in the array with the maximum value.
- * 
- * @param arr - the array.
- * @param length - the number of elements inside the array.
- * @param length_buffer - the length of the buffer.
- */
-void padding_array(int *arr, int length, int length_buffer)
-{
-  for (int i = length; i < length_buffer; i++) {
-    arr[i] = INT_MAX;
-  }
-}
-
-/**
- * Copy the array from src to dest and pad the remaining empty value with the maximum value.
- * 
- * @param dest - the destination array.
- * @param src - the source array.
- * @param length - the number of elements inside the src array.
- * @param length_buffer - the length of the buffer.
- */
-void copy_padding(int *dest, int *src, int length, int length_buffer)
-{
-  for (int i = 0; i < length; i++) {
-    dest[i] = src[i];
-  }
-  padding_array(dest, length, length_buffer);
-}
-
-/**
- * Get the number of minimum blocks as a power of 2.
- * 
- * @param length - the number of elements inside the array.
- * @param num_threads - the number of threads.
- * @return int - the number of minimum blocks needed.
- */
-int minimum_blocks(int length, int num_threads) {
-  int num_blocks = 1;
-
-  // Increase the number of blocks 2 times, so it will be a power of 2.
-  while (num_blocks * num_threads < length) {
-    num_blocks *= 2;
-  }
-
-  return num_blocks;
-}
-
-/**
- * Do the bitnoic sort step by step.
- * 
- * @param d_values - array in the device to be sorted.
- * @param i - major step index.
- * @param j - minor step index.
- */
-__global__ void bitonic_sort_step(int *d_arr, int i, int j)
-{
-  // The array index and its patner.
-  int idx, patner;
-
-  // The thread index.
-  idx = threadIdx.x + blockDim.x * blockIdx.x;
-
-  // The thread index of the patner.
-  patner = idx ^ j;
-
-  // Sort the array by threads with the lowest idx.
-  if (idx < patner) {
-    if ((idx & i) == 0) {
-      // Sort ascending.
-      if (d_arr[idx] > d_arr[patner]) {
-        swap(d_arr, idx, patner);
-      }
-    }
-    if ((idx & i) != 0) {
-      // Sort descending.
-      if (d_arr[idx] < d_arr[patner]) {
-        swap(d_arr, idx, patner);
-      }
-    }
-  }
-}
-
-/**
- * Perform a bitonic sort on the array.
- *
- * @param h_arr The host array to sort.
- * @param length The length of the array.
- */
-void bitonic_sort(int *h_arr, int length)
-{
-  // Initialize the constants variable.
-  const int threads = 1024;
-  const int blocks = minimum_blocks(length, threads);
-  const int buffer_length = threads * blocks;
-
-  // Initialize the memory size of the array.
-  size_t size = length * sizeof(int);
-  size_t buffer_size = buffer_length * sizeof(int);
-
-  // Create the buffer array and pad with maximum value of Int.
-  int *h_buffer = (int *)malloc(buffer_size);
-  copy_padding(h_buffer, h_arr, length, buffer_length);
-
-  // Allocate and copy array into device memory.
-  int *d_arr;
-  cudaMalloc((void**) &d_arr, buffer_size);
-  cudaMemcpy(d_arr, h_buffer, buffer_size, cudaMemcpyHostToDevice);
-
-  // Sort the array using bitonic_sort_step.
-  int i, j;
-  // The major step.
-  for (i = 2; i <= buffer_length; i *= 2) {
-    // The minor step.
-    for (j = i / 2; j > 0; j = j / 2) {
-      bitonic_sort_step<<<blocks, threads>>>(d_arr, i, j);
-    }
-  }
-
-  // Copy the values back to the host.
-  cudaMemcpy(h_arr, d_arr, size, cudaMemcpyDeviceToHost);
-
-  // Free device memory.
-  cudaFree(d_arr);
-  free(h_buffer);
-}
- 
-
-/* 
- * Procedure print_array
- *
- * Prints all elements of array n of size to stdout
- * */
-void print_array(int *n, int size) {
-	for (int i = 0; i < size; i++ ) printf("%d ", n[i]);
-	printf("\n");
-}
-
-
-/* 
- * Function get_median
- *
- * Returns median of array n of length
- * */
-int get_median(int *n, int length) {
-	int mid = length / 2;
-	if (length & 1) return n[mid];
-
-	return (n[mid - 1] + n[mid]) / 2;
-}
-
-
-/* 
- * Function get_floored_mean
- *
- * Returns floored mean from an array of integers
- * */
-long get_floored_mean(int *n, int length) {
-	long sum = 0;
-	for (int i = 0; i < length; i++) {
-		sum += n[i];
-	}
-
-	return sum / length;
-}
-
-/**
- * Function index_to_row_major 
- * 
- * Returns the index of a matrix element in row-major order 
- */
-int index_to_row_major(int row, int col, int row_eff, int col_eff) {
-	return row * col_eff + col;
-}
-
-__device__ int d_index_to_row_major(int row, int col, int row_eff, int col_eff) {
- 		return row * col_eff + col;
-}
-
-/**
- * Function row_major_to_index
- * 
- * Returns the row and column of a matrix element in row-major order
- */
-void row_major_to_index(int index, int row_eff, int col_eff, int *row, int *col) {
-	*row = index / col_eff;
-	*col = index % col_eff;
-}
-
-__device__ void d_row_major_to_index(int index, int row_eff, int col_eff, int *row, int *col) {
- 		*row = index / col_eff;
- 		*col = index % col_eff;
-}
-
-/**
- * Function map_matrix
- * 
- * Returns a row major matrix of the input matrix.
- **/
-int* map_matrix(int mat[][100], int row, int col) {
-	int* map = (int*) malloc(sizeof(int) * row * col);
-	for (int i = 0; i < row; i++) {
-		for (int j = 0; j < col; j++) {
-			map[index_to_row_major(i, j, row, col)] = mat[i][j];
-		}
-	}
-	return map;
-}
-
-/*
- * Function map_matrix_extended
- * 
- * Returns a row major matrix of the input matrix.
- **/
-int* map_matrix_extended(int** mat, int row, int col) {
-	int* map = (int*) malloc(sizeof(int) * row * col);
-	for (int i = 0; i < row; i++) {
-		for (int j = 0; j < col; j++) {
-			map[index_to_row_major(i, j, row, col)] = mat[i][j];
-		}
-	}
-	return map;
-}
-
-
-/**
- * Function reverse_map_matrix
- * 
- * Returns a matrix of the input row major matrix.
- */
-int** reverse_map_matrix(int* map, int row, int col) {
-	int** mat = (int**) malloc(sizeof(int*) * row);
-	for (int i = 0; i < row; i++) {
-		mat[i] = (int*) malloc(sizeof(int) * col);
-		for (int j = 0; j < col; j++) {
-			mat[i][j] = map[index_to_row_major(i, j, row, col)];
-		}
-	}
-	return mat;
-}
-
-/**
- * Function rm_to_matrix_object
- * 
- * Return Matrix struct of row major matrix
- */
-Matrix rm_to_matrix_object(int* map, int row, int col) {
-	Matrix mat;
-	init_matrix(&mat, row, col);
-	for (int i = 0; i < row; i++) {
-		for (int j = 0; j < col; j++) {
-			mat.mat[i][j] = map[index_to_row_major(i, j, row, col)];
-		}
-	}
-	return mat;
-}
-
-/**
- * Function rm_to_list_matrix_object
- * 
- * Return List of Matrix Struct of row major matrix
- */
-Matrix* rm_to_list_matrix_object(int* map, int row, int col, int row_inner, int col_inner) {
-	Matrix* mat = (Matrix*) malloc(sizeof(Matrix) * row);
-	for (int i = 0; i < row; i++) {
-		init_matrix(&mat[i], row_inner, col_inner);
-		int pad = i * col;
-		for (int j = 0; j < row_inner; j++) {
-			for (int k = 0; k < col_inner; k++) {
-				int index = index_to_row_major(j, k, row_inner, col_inner) + pad;
-				mat[i].mat[j][k] = map[index];
-			}
-		}
-	}
-	return mat;
-}
-/**
- * Function list_matrix_object_to_rm
- * 
- * Return row major matrix of list of Matrix struct
- */
-int* list_matrix_object_to_rm(Matrix* mat, int num_matrix,  int row_inner, int col_inner) {
-	int* map = (int*) malloc(sizeof(int) * num_matrix * row_inner * col_inner);
-	for (int i = 0; i < num_matrix; i++) {
-		int pad = i * row_inner * col_inner;
-		for (int j = 0; j < row_inner; j++) {
-			for (int k = 0; k < col_inner; k++) {
-				int index = index_to_row_major(j, k, row_inner, col_inner) + pad;
-				map[index] = mat[i].mat[j][k];
-			}
-		}
-	}
-	
-	return map;
-}
-
-/**
- * Function cuda_convolution
- * 
- * Returns a matrix of the convolution of the input matrix with the kernel
- */
-void cuda_convolution(int* out_mat_rm, int* arr_mat_rm, int* kernel_rm, int row_eff, int col_eff, int kernel_row, int kernel_col, int curr_mat) {
-	// Calculate real row and column of input matrix.
-	int row = row_eff + kernel_row - 1;
-	int col = col_eff + kernel_col - 1;
-	
-	// Calculate padding target and output matrix.
-	int pad = curr_mat * row * col;
-	int pad_out = curr_mat * row_eff * col_eff;
-
-	// For each element in input matrix that is not on the boundary,
-	for (int i = 0 ; i < row_eff; i++) {
-		for (int j = 0; j < col_eff; j++) {
-			// Convolution of the element with the kernel.
-			// Calculate the sum of the kernel and the input matrix.
-			int intermediate_sum = 0;
-			for (int k = 0; k < kernel_row; k++) {
-				for (int l = 0; l < kernel_col; l++) {
-					int index = index_to_row_major(i + k, j + l, row, col) + pad;
-					int kernel_index = index_to_row_major(k, l, kernel_row, kernel_col);
-					intermediate_sum += arr_mat_rm[index] * kernel_rm[kernel_index];
-					// Print all i,j,k,l
-					// printf("i:%d, j:%d, k:%d, l:%d\n", i, j, k, l);
-
-				}
-			}
-			// Store the sum in the output matrix.
-			out_mat_rm[index_to_row_major(i, j, row_eff, col_eff) + pad_out ] = intermediate_sum;
-		}
-	}
-}
-
-__global__ void d_cuda_convolution(int* d_out_mat, int* arr_mat_rm, int* kernel_rm, int row_eff, int col_eff, int kernel_row, int kernel_col) {
-  // Calculate real row and column of input matrix.
-	int row = row_eff + kernel_row - 1;
-	int col = col_eff + kernel_col - 1;
-
-  // Determine current matrix from block;
-	int curr_mat = blockIdx.y;
-  // Calculate padding target and output matrix.
- 	int pad = curr_mat * row * col;
- 	int pad_out = curr_mat * row_eff * col_eff;
- 
-  // Get i, and j from threadIdx
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int i, j;
-  d_row_major_to_index(tid, row_eff, col_eff, &i, &j);
- 
-  // Calculate element in input matrix that is not on the boundary,
- 	if (i < row_eff && j < col_eff) {
- 		int intermediate_sum = 0;
- 		for (int k = 0; k < kernel_row; k++) {
- 			for (int l = 0; l < kernel_col; l++) {
- 				int index = d_index_to_row_major(i + k, j + l, row, col) + pad;
- 				int kernel_index = d_index_to_row_major(k, l, kernel_row, kernel_col);
- 				intermediate_sum += arr_mat_rm[index] * kernel_rm[kernel_index];
- 			}
- 		}
- 		d_out_mat[d_index_to_row_major(i, j, row_eff, col_eff) + pad_out] = intermediate_sum;
- 	}
-}
-
-/* 
- * Function d_get_matrix_datarange
- * Set range between maximum and minimum
- * for every matrix in out_mat to d_arr_range
- * */
-__global__ void d_get_matrix_datarange(int* out_mat, int* d_arr_range, int row_eff, int col_eff) {
-	// Determine current matrix from block;
-	int curr_mat = blockIdx.y;
-	// Calculate padding output matrix and arr_range matrix.
-	int pad = curr_mat * row_eff * col_eff;
-	int pad_arr_range = curr_mat;
-
-	// Get i, and j from threadIdx
-	int tid = blockIdx.x * blockDim.x + threadIdx.x;
-	int i, j;
-	d_row_major_to_index(tid, row_eff, col_eff, &i, &j);
-  	// Get datarange in every output matrix
- 	if (i < row_eff && j < col_eff) {
- 		int max = DATAMIN;
-		int min = DATAMAX;
- 		for (int k = 0; k < row_eff; k++) {
- 			for (int l = 0; l < col_eff; l++) {
- 				int index = d_index_to_row_major(i + k, j + l, row_eff, col_eff) + pad;
-
-				int el = out_mat[index];
-				if (el > max) max = el;
-				if (el < min) min = el;
- 			}
- 		}
-
- 		d_arr_range[d_index_to_row_major(i, j, row_eff, col_eff) + pad_arr_range] = max-min;
- 	}
-}
-
-// main() driver
-int main() {
-	// Time.
-	clock_t t;
-    t = clock();
-
-	int kernel_row, kernel_col, target_row, target_col, num_targets;
-	
-	// reads kernel's row and column and initalize kernel matrix from input
-	scanf("%d %d", &kernel_row, &kernel_col);
-	Matrix kernel = input_matrix(kernel_row, kernel_col);
-	
-	// reads number of target matrices and their dimensions.
-	// initialize array of matrices and array of data ranges (int)
-	scanf("%d %d %d", &num_targets, &target_row, &target_col);
-	Matrix* arr_mat = (Matrix*)malloc(num_targets * sizeof(Matrix));
-	// int arr_range[num_targets];
-	
-	// Calculate variable for cuda computing.
-	int a = (target_row-kernel_row+1) * (target_col-kernel_col+1);
-	int b = 1024;
-	int block_size = a/b + (a % b != 0); // ceil(a/b)
-	int threads_per_block = 1024;
-	int row_eff = target_row - kernel_row + 1;
-	int col_eff = target_col - kernel_col + 1;
-
-
-	// Allocate variable. 
-	// rm means row-major. It's indicate matrix are in row-major order.
-	// Variable declaration.
-	int * arr_mat_rm=0, * d_arr_mat_rm=0, *out_mat_rm=0, *d_out_mat_rm=0, *kernel_rm=0, *d_kernel_rm=0, *arr_range=0, *d_arr_range=0;
-	int size_arr_mat, size_out_mat, size_kernel, size_arr_range;
-	cudaError err;
-
-	// Allocate input matrix in device and host.
-	size_arr_mat = num_targets * target_row * target_col;
-	arr_mat_rm = (int*)malloc(sizeof(int) * size_arr_mat);
-	cudaMalloc((void **)&d_arr_mat_rm, sizeof(int) * size_arr_mat);
-	if (arr_mat_rm == 0 | d_arr_mat_rm == 0) {
-		printf("Error: Memory allocation failed for arr_mat.\n");
-	 	return 1;
-	}
-
-	// Allocate output matrix in device and host.
-	size_out_mat = num_targets * row_eff * col_eff;
-	out_mat_rm = (int*)malloc(sizeof(int) * size_out_mat);
-	cudaMalloc((void **)&d_out_mat_rm, sizeof(int) * size_out_mat);
-	if (out_mat_rm == 0 | d_out_mat_rm == 0) {
-	 	printf("Error: Memory allocation failed for out_mat.\n");
-	 	return 1;
-	}
-	cudaMemset(d_out_mat_rm, 0, sizeof(int) * size_out_mat);
-
-
-	// Allocate kernel matrix in host.
-	size_kernel = kernel_row * kernel_col;
-	kernel_rm = (int*)malloc(sizeof(int) * size_kernel);
-	// Store kernel in row major form and allocate kernel for device.
-	kernel_rm = map_matrix(kernel.mat, kernel_row, kernel_col);
-	cudaMalloc((void **)&d_kernel_rm, sizeof(int) * size_kernel);
-	if (kernel_rm == 0 | d_kernel_rm == 0) {
-	 	printf("Error: Memory allocation failed for kernel.\n");
-	 	return 1;
-	}
-	err = cudaMemcpy(d_kernel_rm, kernel_rm, sizeof(int) * size_kernel, cudaMemcpyHostToDevice);
-	 if (err != cudaSuccess) {
-		 	printf("Error copy host to device: %s\n", cudaGetErrorString(err));
-			return 1;
-	}
-	
-	
-	// Read each target matrix.
-	for (int i = 0; i < num_targets; i++) {
-		arr_mat[i] = input_matrix(target_row, target_col);
-	}
-	// Store each target matrix in row major form and allocate target matrix for device.
-	arr_mat_rm = list_matrix_object_to_rm(arr_mat, num_targets, target_row, target_col);
-	err = cudaMemcpy(d_arr_mat_rm, arr_mat_rm, sizeof(int) * size_arr_mat, cudaMemcpyHostToDevice);
-	if (err != cudaSuccess) {
-		printf("Error copy host to device: %s\n", cudaGetErrorString(err));
-		return 1;
-	}
-
-	// For each target matrix, compute their convolution matrices.
-	dim3 grid, block;
-	grid.x = block_size;
-	grid.y = num_targets;
-	block.x = threads_per_block;
-	d_cuda_convolution<<<grid, block>>>(d_out_mat_rm, d_arr_mat_rm, d_kernel_rm, row_eff, col_eff, kernel_row, kernel_col);
-	err = cudaMemcpy(out_mat_rm, d_out_mat_rm, sizeof(int) * size_out_mat, cudaMemcpyDeviceToHost);
-	if (err != cudaSuccess) {
-	 	printf("Error copy device to host: %s\n", cudaGetErrorString(err));
-	}
-	// for (int i = 0; i < num_targets; i++){
-		// cuda_convolution(out_mat_rm, arr_mat_rm, kernel_rm, row_eff, col_eff, kernel_row, kernel_col, i);
-	// }
-	// arr_mat = rm_to_list_matrix_object(out_mat_rm, num_targets, row_eff*col_eff, row_eff, col_eff);
-
-	// Allocate arr_range matrix in device and host.
-	size_arr_range = num_targets;
-	arr_range = (int*)malloc(sizeof(int) * size_arr_range);
-	cudaMalloc((void **)&d_arr_range, sizeof(int) * size_arr_range);
-	if (arr_range == 0 | d_arr_range == 0) {
-	 	printf("Error: Memory allocation failed for arr_range.\n");
-	 	return 1;
-	}
-	cudaMemset(d_arr_range, 0, sizeof(int) * size_arr_range);
-
-	grid.x = 1;
-	grid.y = num_targets;
-	block.x = 1;
-	d_get_matrix_datarange<<<grid, block>>>(d_out_mat_rm, d_arr_range, row_eff, col_eff);
-	err = cudaMemcpy(arr_range, d_arr_range, sizeof(int) * size_arr_range, cudaMemcpyDeviceToHost);
-	if (err != cudaSuccess) {
-	 	printf("Error copy device to host 2: %s\n", cudaGetErrorString(err));
-	}
-
-	// sort the data range array
-	bitonic_sort(arr_range, num_targets);
-	
-	int median = get_median(arr_range, num_targets);	
-	int floored_mean = get_floored_mean(arr_range, num_targets); 
-
-	// print the min, max, median, and floored mean of data range array
-	printf("%d\n%d\n%d\n%d\n", 
-			arr_range[0], 
-			arr_range[num_targets - 1], 
-			median, 
-			floored_mean);
-
-	// Print execution time in seconds.
-	t = clock() - t;
-	printf("\nRuntime: %f s\n", ((float)t) / CLOCKS_PER_SEC);	
-	
-	// Free cuda Memory.
-	cudaFree(d_arr_mat_rm);
-	cudaFree(d_out_mat_rm);
-	cudaFree(d_kernel_rm);
-	return 0;
-}