diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..acfdcf68686866fbc543de0a9b2d61a8d5771c53 --- /dev/null +++ b/Makefile @@ -0,0 +1,5 @@ +all: radix_sort_parallel.cu + nvcc radix_sort_parallel.cu -o radix_sort + +clean: + rm -R bin && mkdir bin \ No newline at end of file diff --git a/README.md b/README.md index 5ab04fa52d5e53076c19bb851920f0c754646a78..09c4dc215f24b099f11964c514addff7b3d787b2 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,75 @@ -# Praktikum3_K01_13516013_13516124 +# CUDA - Radix Sort +Program ini dapat melakukan pengurutan elemen array dengan menggunakan algoritma Radix Sort dan menggunakan pemrosesan parallel dengan CUDA. Akan dibandingkan metode Radix Sort dengan menggunakan pemrosesan serial dengan parallel CUDA. +## Petunjuk Penggunaan Program +1. Masuk ke dalam folder `cuda` pada terminal +2. Ketik `make run` pada terminal kemudian tekan tombol `enter` +3. Run program dengan mengetikkan `./radix_sort <array_length>` + +## Pembagian Tugas +- Azka Nabilah Mumtaz (13516013) : laporan, pengujian, program +- Kevin Muharyman A. (13516124) : laporan, pengujian, program + +## Laporan Pengerjaan + +### 1. Deskripsi Solusi Parallel + +### 2. Analisis Solusi + +### 3. Jumlah Pemetaan Thread + +### 4. Pengukuran Kinerja + +#### Serial + +**N = 5.000** +Pengujian 1: +Pengujian 2: +Pengujian 3: +**N = 50.000** +Pengujian 1: +Pengujian 2: +Pengujian 3: +**N = 100.000** +Pengujian 1: +Pengujian 2: +Pengujian 3: +**N = 200.000** +Pengujian 1: +Pengujian 2: +Pengujian 3: +**N = 400.000** +Pengujian 1: +Pengujian 2: +Pengujian 3: + +#### Parallel + +**N = 5.000** +Pengujian 1: +Pengujian 2: +Pengujian 3: +**N = 50.000** +Pengujian 1: +Pengujian 2: +Pengujian 3: +**N = 100.000** +Pengujian 1: +Pengujian 2: +Pengujian 3: +**N = 200.000** +Pengujian 1: +Pengujian 2: +Pengujian 3: +**N = 400.000** +Pengujian 1: +Pengujian 2: +Pengujian 3: + +###5. Analisis Perbandingan Serial dan Parallel + +**N = 5.000** +**N = 50.000** +**N = 100.000** +**N = 200.000** +**N = 400.000** \ No newline at end of file diff --git a/src/radix_sort_parallel.cu b/src/radix_sort_parallel.cu index 950f89f4209570f22f6cbd8f1c533290a81c023f..77f98caec6e5c6b9a72a2aaac4a1d6739124be27 100644 --- a/src/radix_sort_parallel.cu +++ b/src/radix_sort_parallel.cu @@ -2,70 +2,19 @@ #include <cuda_runtime.h> #include "radix_sort_parallel.h" -__global__ void copyArrayParallel(int *arr, int *output, int n) { - for (int i = 0; i < n; i++) { - arr[i] = output[i]; - } -} - -__global__ void getMaxParallel(int *arr, int *max, int n) { - int maximum = arr[0]; - for (int i = 0; i < n; i++) { - if (arr[i] > maximum) { - maximum = arr[i]; - } - } - max[0] = maximum; -} - -int getMax(int arr[], int n) -{ - int result; - int *max; - int *d_arr, *d_max; - - // Allocate host memory - max = (int*)malloc(n * sizeof(int)); - - // Initialize host memory - for (int i = 0; i < n; i++) { - max[i] = 0; - } - - // Allocate device memory - cudaMalloc((void**)&d_arr, n * sizeof(int)); - cudaMalloc((void**)&d_max, n * sizeof(int)); - - // Transfer data from host to device memory - cudaMemcpy(d_arr, arr, n * sizeof(int), cudaMemcpyHostToDevice); +__global__ void getMax(int *arr, int *max, int n) { + int mx = arr[0]; - // Executing kernel - getMaxParallel<<<1,500>>>(d_arr, d_max, n); - - // Transfer data back to host memory - cudaMemcpy(max, d_max, n * sizeof(int), cudaMemcpyDeviceToHost); - - // Deallocate device memory - cudaFree(d_max); - cudaFree(d_arr); - - result = max[0]; - - // Deallocate host memory - free(max); - - return result; + for (int i = 0; i < n; i++) + if (arr[i] > mx) + mx = arr[i]; + max[0] = mx; } -void countSort(int arr[], int n, int exp) -{ - int *output; - int *d_output, *d_arr; +__global__ void countSort(int *arr, int n, int exp) { + int* output = (int*)malloc(n * sizeof(int)); int i, count[10] = {0}; - // Allocate host memory - output = (int*)malloc(n * sizeof(int)); - for (i = 0; i < n; i++) count[ (arr[i]/exp)%10 ]++; @@ -78,33 +27,40 @@ void countSort(int arr[], int n, int exp) count[ (arr[i]/exp)%10 ]--; } + for (i = 0; i < n; i++) + arr[i] = output[i]; +} + +void radix_sort(int arr[], int n) +{ + int *max; + int *d_max, *d_arr; + + // Allocate host memory + max = (int*)malloc(n * sizeof(int)); + // Allocate device memory - cudaMalloc((void**)&d_arr, sizeof(n * sizeof(int))); - cudaMalloc((void**)&d_output, sizeof(n * sizeof(int))); + cudaMalloc((void**)&d_max, n * sizeof(int)); + cudaMalloc((void**)&d_arr, n * sizeof(int)); - // Transfer data from host to device memory + // Transfer data from host to device cudaMemcpy(d_arr, arr, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(d_max, max, n * sizeof(int), cudaMemcpyHostToDevice); // Executing kernel - copyArrayParallel<<<1,500>>>(d_arr, d_output, n); - - //Transfer data back to host memory - cudaMemcpy(output, d_output, n * sizeof(int), cudaMemcpyDeviceToHost); - - // Deallocate device memory - cudaFree(d_arr); - cudaFree(d_output); + getMax<<<1, 500>>>(d_arr, d_max, n); - // Deallocate host memory - free(output); -} + // Transfer data back to host memory + cudaMemcpy(max, d_max, n * sizeof(int), cudaMemcpyDeviceToHost); -void radix_sort(int arr[], int n) -{ - int m = getMax(arr, n); + for (int exp = 1; max[0]/exp > 0; exp *= 10) { + countSort<<<1, 500>>>(d_arr, n, exp); + } - for (int exp = 1; m/exp > 0; exp *= 10) - countSort(arr, n, exp); + cudaMemcpy(arr, d_arr, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(d_max); + cudaFree(d_arr); + free(max); } void print(int arr[], int n) @@ -113,7 +69,7 @@ void print(int arr[], int n) printf("%d: %d\n",i, arr[i]); } -void rng(int* arr, int n) { +void rng(int *arr, int n) { int seed = 13516013; srand(seed); for(long i = 0; i < n; i++) { @@ -122,21 +78,38 @@ void rng(int* arr, int n) { } int main(int argc, char *argv[]) { - int N; - if (argc == 2) { + int N; + int *arr; + int *d_arr; + + if (argc == 2) { N = strtol(argv[1], NULL, 10); } else { printf("ERROR: ./radix_sort <array_length>\n"); return 1; } - int arr[N]; - rng(arr,N); - clock_t begin = clock(); + + // Allocate host memory + arr = (int*)malloc(N * sizeof(int)); + + // Initialize host memory + rng(arr,N); + + // Allocate device memory + cudaMalloc((void**)&d_arr, N * sizeof(int)); + + // Transfer data from host to device memory + cudaMemcpy(d_arr, arr, N * sizeof(int), cudaMemcpyHostToDevice); + + clock_t begin = clock(); radix_sort(arr, N); clock_t end = clock(); double time = (double)(end - begin) * 1000 / CLOCKS_PER_SEC; - print(arr,N); - printf("Executed in %lf ms\n",time); - return 0; + print(arr,N); + printf("Executed in %lf ms\n",time); + + cudaFree(d_arr); + free(arr); + return 0; } diff --git a/src/radixsort_parallel.cu b/src/radixsort_parallel.cu deleted file mode 100644 index a03db2413126ed32f411fe036e9019f5c3ead365..0000000000000000000000000000000000000000 --- a/src/radixsort_parallel.cu +++ /dev/null @@ -1,117 +0,0 @@ -#include <cuda.h> -#include <cuda_runtime.h> -#include "radix_sort_parallel.h" - -__global__ void getMax(int *arr, int *max, int n) { - //int index = threadIdx.x; - //int stride = blockDim.x; - int mx = arr[0]; - - for (int i = 0; i < n; i++) - if (arr[i] > mx) - mx = arr[i]; - max[0] = mx; -} - -__global__ void countSort(int *arr, int n, int exp) { - int* output = (int*)malloc(n * sizeof(int)); - int i, count[10] = {0}; - - for (i = 0; i < n; i++) - count[ (arr[i]/exp)%10 ]++; - - for (i = 1; i < 10; i++) - count[i] += count[i - 1]; - - for (i = n - 1; i >= 0; i--) - { - output[count[ (arr[i]/exp)%10 ] - 1] = arr[i]; - count[ (arr[i]/exp)%10 ]--; - } - - for (i = 0; i < n; i++) - arr[i] = output[i]; -} - -void radix_sort(int arr[], int n) -{ - int *max; - int *d_max, *d_arr; - - // Allocate host memory - max = (int*)malloc(n * sizeof(int)); - - // Allocate device memory - cudaMalloc((void**)&d_max, n * sizeof(int)); - cudaMalloc((void**)&d_arr, n * sizeof(int)); - - // Transfer data from host to device - cudaMemcpy(d_arr, arr, n * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(d_max, max, n * sizeof(int), cudaMemcpyHostToDevice); - - // Executing kernel - getMax<<<1, 500>>>(d_arr, d_max, n); - - // Transfer data back to host memory - cudaMemcpy(max, d_max, n * sizeof(int), cudaMemcpyDeviceToHost); - - for (int exp = 1; max[0]/exp > 0; exp *= 10) { - countSort<<<1, 500>>>(d_arr, n, exp); - } - - cudaMemcpy(arr, d_arr, n * sizeof(int), cudaMemcpyDeviceToHost); - cudaFree(d_max); - cudaFree(d_arr); - free(max); -} - -void print(int arr[], int n) -{ - for (int i = 0; i < n; i++) - printf("%d: %d\n",i, arr[i]); -} - -void rng(int *arr, int n) { - int seed = 13516013; - srand(seed); - for(long i = 0; i < n; i++) { - arr[i] = (int)rand(); - } -} - -int main(int argc, char *argv[]) { - int N; - int *arr; - int *d_arr; - - if (argc == 2) { - N = strtol(argv[1], NULL, 10); - } else { - printf("ERROR: ./radix_sort <array_length>\n"); - return 1; - } - - // Allocate host memory - arr = (int*)malloc(N * sizeof(int)); - - // Initialize host memory - rng(arr,N); - - // Allocate device memory - cudaMalloc((void**)&d_arr, N * sizeof(int)); - - // Transfer data from host to device memory - cudaMemcpy(d_arr, arr, N * sizeof(int), cudaMemcpyHostToDevice); - - clock_t begin = clock(); - radix_sort(arr, N); - clock_t end = clock(); - double time = (double)(end - begin) * 1000 / CLOCKS_PER_SEC; - print(arr,N); - printf("Executed in %lf ms\n",time); - - cudaFree(d_arr); - free(arr); - return 0; -} - diff --git a/vector_add.cu b/vector_add.cu deleted file mode 100644 index 8d6ce535de9ebb051e0e8eb8c725f84ed9159ed2..0000000000000000000000000000000000000000 --- a/vector_add.cu +++ /dev/null @@ -1,63 +0,0 @@ -#include <stdio.h> -#include <stdlib.h> -#include <math.h> -#include <assert.h> -#include <cuda.h> -#include <cuda_runtime.h> - -#define N 10000000 -#define MAX_ERR 1e-6 - -__global__ void vector_add(float *out, float *a, float *b, int n) { - for(int i = 0; i < n; i ++){ - out[i] = a[i] + b[i]; - } -} - -int main(){ - float *a, *b, *out; - float *d_a, *d_b, *d_out; - - // Allocate host memory - a = (float*)malloc(sizeof(float) * N); - b = (float*)malloc(sizeof(float) * N); - out = (float*)malloc(sizeof(float) * N); - - // Initialize host arrays - for(int i = 0; i < N; i++){ - a[i] = 1.0f; - b[i] = 2.0f; - } - - // Allocate device memory - cudaMalloc((void**)&d_a, sizeof(float) * N); - cudaMalloc((void**)&d_b, sizeof(float) * N); - cudaMalloc((void**)&d_out, sizeof(float) * N); - - // Transfer data from host to device memory - cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice); - cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice); - - // Executing kernel - vector_add<<<1,1>>>(d_out, d_a, d_b, N); - - // Transfer data back to host memory - cudaMemcpy(out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost); - - // Verification - for(int i = 0; i < N; i++){ - assert(fabs(out[i] - a[i] - b[i]) < MAX_ERR); - } - printf("out[0] = %f\n", out[0]); - printf("PASSED\n"); - - // Deallocate device memory - cudaFree(d_a); - cudaFree(d_b); - cudaFree(d_out); - - // Deallocate host memory - free(a); - free(b); - free(out); -}