Editing files

2345ec34 · whiteclips · 8284a131 · 2345ec34 · 2345ec34 · 2345ec34
Commit 2345ec34 authored 6 years ago by whiteclips
--- a/01-saxpy.cu
+++ b/01-saxpy.cu
 #include <stdio.h>

 #define N 2048 * 2048 // Number of elements in each vector
+#define STREAMS 2048

 /*
 * Optimize this already-accelerated codebase. Work iteratively,
@@ -12,25 +13,27 @@
 * Some bugs have been placed in this codebase for your edification.
 */

-__global__ void saxpy(int * a, int * b, int * c)
+__global__ void saxpy(float * a, float * b, float * c, int offset) // Kernel: each thread writes c[tid] = 2 * a[tid] + b[tid] for one element; offset shifts the index range handled by this launch.
 {
-    int tid = blockIdx.x * blockDim.x * threadIdx.x;
-
-    if ( tid < N )
+    int tid = offset + blockIdx.x * blockDim.x + threadIdx.x; // element index = offset + this thread's flat position in the launch grid
+    if ( tid < N) { // bounds check against the full vector length N, not against the end of the chunk that starts at offset
        c[tid] = 2 * a[tid] + b[tid]; // NOTE(review): the host launches (stream_size / threads_per_block) + 1 blocks per chunk, so threads of the extra block pass the check above and touch the next chunk's elements - confirm this is intended
+    }
 }

 int main()
 {
-    float *a, *b, *c;
-
-    int size = N * sizeof (int); // The total number of bytes per vector
-
-    cudaMallocManaged(&a, size);
-    cudaMallocManaged(&b, size);
-    cudaMallocManaged(&c, size);
+    //  Allocating memory in host as well as device
+    float *a, *b, *c, *da, *db, *dc;
+    int size = N * sizeof (int);
+    cudaMallocHost((void**)&a, size);
+    cudaMallocHost((void**)&b, size);
+    cudaMallocHost((void**)&c, size);
+    cudaMalloc((void**)&da, size);
+    cudaMalloc((void**)&db, size);
+    cudaMalloc((void**)&dc, size);

-    // Initialize memory
+    // Initialize memory in host
    for( int i = 0; i < N; ++i )
    {
        a[i] = 2;
@@ -38,17 +41,44 @@ int main()
        c[i] = 0;
    }

-    int threads_per_block = 128;
-    int number_of_blocks = (N / threads_per_block) + 1;
+    //  Initialize streams
+    cudaStream_t streams[STREAMS];
+    for (int i = 0; i < STREAMS; i++) {
+        cudaStreamCreate(&streams[i]);
+    }
+
+    //  Loop to copy and execute kernel
+    int stream_size = N / STREAMS;
+    int stream_bytes = stream_size * sizeof(int);
+    for (int i = 0; i < STREAMS; ++i) {
+        int offset = i * stream_size;
+        cudaMemcpyAsync(&da[offset], &a[offset], stream_bytes, cudaMemcpyHostToDevice, streams[i]);
+        cudaMemcpyAsync(&db[offset], &b[offset], stream_bytes, cudaMemcpyHostToDevice, streams[i]);
+        cudaMemcpyAsync(&dc[offset], &c[offset], stream_bytes, cudaMemcpyHostToDevice, streams[i]);
+        int threads_per_block = 128;
+        int number_of_blocks = (stream_size / threads_per_block) + 1;
+        saxpy <<< number_of_blocks, threads_per_block, 0, streams[i] >>> ( da, db, dc, offset );
+        cudaMemcpyAsync(&a[offset], &da[offset], stream_bytes, cudaMemcpyDeviceToHost, streams[i]);
+        cudaMemcpyAsync(&b[offset], &db[offset], stream_bytes, cudaMemcpyDeviceToHost, streams[i]);
+        cudaMemcpyAsync(&c[offset], &dc[offset], stream_bytes, cudaMemcpyDeviceToHost, streams[i]);
+    }
+    
+    //  Wait for GPU to finish before accessing on host
+    cudaDeviceSynchronize();

-    saxpy <<< number_of_blocks, threads_per_block >>> ( a, b, c );
+    //  Calculating error
+    int number_missed = 0;
+    for (int i = 0; i < N; i++) {
+        if (c[i] != 5) number_missed++;
+    }
+    printf("Number missed: %d out of %d\n", number_missed, N);

    // Print out the first and last 5 values of c for a quality check
    for( int i = 0; i < 5; ++i )
-        printf("c[%d] = %d, ", i, c[i]);
+        printf("c[%d] = %lf, ", i, c[i]);
    printf ("\n");
    for( int i = N-5; i < N; ++i )
-        printf("c[%d] = %d, ", i, c[i]);
+        printf("c[%d] = %lf, ", i, c[i]);
    printf ("\n");

    cudaFree( a ); cudaFree( b ); cudaFree( c );

--- a/README.md
+++ b/README.md
-# IF5160
-
-Tugas UAS
-1. Jalankan dan optimasi kode 01-saxpy.cu
-
-kompilasi
-
-nvcc 01-saxpy.cu -o saxpy
-
-eksekusi
-
-./saxpy
-
-
-2. Jalankan dan optimasi kode 01-nbody.cu berikut ini.
-
-kompilasi:
-
-nvcc 01-nbody.cu -o nbody
-
-eksekusi:
-
-./nbody 
-
-atau 
-
-./nbody x
-
-dimana x adalah jumlah partikel/body dalam dengan jumlah partikel 2 << x. 
-misal: ./nbody 11 artinya menjalankan dengan jumlah partikel 2 << 11, yaitu 4096
-
-Optimasi kode di atas, dan ujilah dengan jumlah partikel 4096 (./nbody 11) dan 16K (./nbody 13)
-
-Eksekusi kode tersebut akan menampilkan jumlah interaksi antar body per detik. 
-Pada server 167.205.32.100, baseline nya adalah 0.039 Billion interaction per second untuk 4096 body,
-dan 
-
-catatan:
-1. jangan mengubah kode fungsi randomizeBodies, karena fungsi ini berjalan pada host dan tidak perlu/bisa dioptimasi
-
+# IF5160

+## Task 1

+By using asynchronous data transfers between host and device, the copies and the kernel execution can overlap. The total time taken for the given case is around 12 us. Please see the attached image for proof of execution.

+## Task 2

--- a/image_saxpy.png
+++ b/image_saxpy.png