diff --git a/01-saxpy.cu b/01-saxpy.cu
index 3d39766df0c563b1d8f66b555c71fc8ab6c47170..4f3773e7efbc922ae295cba2288b5a261ddfbdbe 100644
--- a/01-saxpy.cu
+++ b/01-saxpy.cu
@@ -1,6 +1,7 @@
 #include <stdio.h>
 
 #define N 2048 * 2048 // Number of elements in each vector
+#define STREAMS 2048 // Number of streams; each handles a chunk of N / STREAMS elements
 
 /*
  * Optimize this already-accelerated codebase. Work iteratively,
@@ -12,25 +13,32 @@
  * Some bugs have been placed in this codebase for your edification.
  */
 
-__global__ void saxpy(int * a, int * b, int * c)
+__global__ void saxpy(float * a, float * b, float * c, int offset, int chunk_end)
 {
-    int tid = blockIdx.x * blockDim.x * threadIdx.x;
-
-    if ( tid < N )
+    //  The block offset must be added, not multiplied
+    int tid = offset + blockIdx.x * blockDim.x + threadIdx.x;
+    //  Stay inside this stream's chunk so its threads never race with a
+    //  neighbouring stream over the same elements
+    if ( tid < chunk_end && tid < N ) {
         c[tid] = 2 * a[tid] + b[tid];
+    }
 }
 
 int main()
 {
-    float *a, *b, *c;
-
-    int size = N * sizeof (int); // The total number of bytes per vector
-
-    cudaMallocManaged(&a, size);
-    cudaMallocManaged(&b, size);
-    cudaMallocManaged(&c, size);
+    //  Allocate pinned memory on the host and working buffers on the device
+    float *a, *b, *c, *da, *db, *dc;
+    int size = N * sizeof (float); // The total number of bytes per vector
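+    //  Pinned (page-locked) host memory from cudaMallocHost is required for
+    //  cudaMemcpyAsync to overlap transfers with kernel execution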
+    cudaMallocHost((void**)&a, size);
+    cudaMallocHost((void**)&b, size);
+    cudaMallocHost((void**)&c, size);
+    cudaMalloc((void**)&da, size);
+    cudaMalloc((void**)&db, size);
+    cudaMalloc((void**)&dc, size);
 
-    // Initialize memory
+    // Initialize memory on the host
     for( int i = 0; i < N; ++i )
     {
         a[i] = 2;
@@ -38,17 +41,50 @@ int main()
         c[i] = 0;
     }
 
-    int threads_per_block = 128;
-    int number_of_blocks = (N / threads_per_block) + 1;
+    //  Initialize streams
+    cudaStream_t streams[STREAMS];
+    for (int i = 0; i < STREAMS; i++) {
+        cudaStreamCreate(&streams[i]);
+    }
+
+    //  Split the vectors into STREAMS chunks; each stream copies its slice
+    //  in, runs the kernel on it, and copies the result back
+    int stream_size = N / STREAMS;
+    int stream_bytes = stream_size * sizeof(float);
+    for (int i = 0; i < STREAMS; ++i) {
+        int offset = i * stream_size;
+        //  Copy only the inputs to the device; c is produced by the kernel
+        cudaMemcpyAsync(&da[offset], &a[offset], stream_bytes, cudaMemcpyHostToDevice, streams[i]);
+        cudaMemcpyAsync(&db[offset], &b[offset], stream_bytes, cudaMemcpyHostToDevice, streams[i]);
+        int threads_per_block = 128;
+        //  Round up instead of always adding one extra block
+        int number_of_blocks = (stream_size + threads_per_block - 1) / threads_per_block;
+        saxpy <<< number_of_blocks, threads_per_block, 0, streams[i] >>> ( da, db, dc, offset, offset + stream_size );
+        //  Copy only the result back; a and b are not modified on the device
+        cudaMemcpyAsync(&c[offset], &dc[offset], stream_bytes, cudaMemcpyDeviceToHost, streams[i]);
+    }
+
+    //  Wait for all streams to finish before accessing the results on the host
+    cudaDeviceSynchronize();
 
-    saxpy <<< number_of_blocks, threads_per_block >>> ( a, b, c );
+    //  Verify the result: every element should be 2 * 2 + 1 = 5
+    int number_missed = 0;
+    for (int i = 0; i < N; i++) {
+        if (c[i] != 5) number_missed++;
+    }
+    printf("Number missed: %d out of %d\n", number_missed, N);
 
     // Print out the first and last 5 values of c for a quality check
     for( int i = 0; i < 5; ++i )
-        printf("c[%d] = %d, ", i, c[i]);
+        printf("c[%d] = %lf, ", i, c[i]);
     printf ("\n");
     for( int i = N-5; i < N; ++i )
-        printf("c[%d] = %d, ", i, c[i]);
+        printf("c[%d] = %lf, ", i, c[i]);
     printf ("\n");
 
-    cudaFree( a ); cudaFree( b ); cudaFree( c );
+    //  Destroy the streams and free both the pinned host and the device memory
+    for (int i = 0; i < STREAMS; i++) {
+        cudaStreamDestroy(streams[i]);
+    }
+    cudaFreeHost( a ); cudaFreeHost( b ); cudaFreeHost( c );
+    cudaFree( da ); cudaFree( db ); cudaFree( dc );
diff --git a/README.md b/README.md
index aa7772cf8788f72679c1fc02d982ab4f8973ded2..ccde9873f917f5469c60c6971ebba0b7443c6cbb 100644
--- a/README.md
+++ b/README.md
@@ -1,44 +1,29 @@
-# IF5160
-
-Final exam assignment
-1. Run and optimize the code 01-saxpy.cu
-
-compile
-
-nvcc 01-saxpy.cu -o saxpy
-
-run
-
-./saxpy
-
-
-2. Run and optimize the following code, 01-nbody.cu.
-
-compile:
-
-nvcc 01-nbody.cu -o nbody
-
-run:
-
-./nbody
-
-or
-
-./nbody x
-
-where x sets the number of particles/bodies to 2 << x.
-e.g.: ./nbody 11 runs with 2 << 11 particles, i.e. 4096
-
-Optimize the code above, and test it with 4096 particles (./nbody 11) and 16K (./nbody 13)
-
-Running the code will report the number of body-to-body interactions per second.
-On the server 167.205.32.100, the baseline is 0.039 billion interactions per second for 4096 bodies,
-and
-
-notes:
-1. do not modify the randomizeBodies function, since it runs on the host and does not need/cannot be optimized
-
+# IF5160
 
+## Task 1
 
+By using asynchronous data transfers between pinned host memory and the device, spread across multiple CUDA streams, the copies and the kernel execution can overlap. The total time measured for the given case is around 12 us. Please see the attached image (image_saxpy.png) for proof of execution.
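+
+The core of the change is the per-stream loop sketched below (a minimal sketch; the identifiers are the ones from 01-saxpy.cu). Pinned host buffers from cudaMallocHost are what allow cudaMemcpyAsync to actually run asynchronously:
+
+```cuda
+// For each stream: copy its chunk in, run the kernel on it, copy the result out.
+// Chunk i+1's host-to-device copy can then overlap with chunk i's kernel.
+for (int i = 0; i < STREAMS; ++i) {
+    int offset = i * stream_size;
+    cudaMemcpyAsync(&da[offset], &a[offset], stream_bytes,
+                    cudaMemcpyHostToDevice, streams[i]);
+    cudaMemcpyAsync(&db[offset], &b[offset], stream_bytes,
+                    cudaMemcpyHostToDevice, streams[i]);
+    saxpy<<<number_of_blocks, threads_per_block, 0, streams[i]>>>(da, db, dc,
+        offset, offset + stream_size);
+    cudaMemcpyAsync(&c[offset], &dc[offset], stream_bytes,
+                    cudaMemcpyDeviceToHost, streams[i]);
+}
+cudaDeviceSynchronize(); // wait for every stream before reading c on the host
+```
+
+Within a single stream the copies and the kernel run in order, so the speedup comes from different streams overlapping their copies and kernels on the device's copy and compute engines.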
 
+## Task 2
 
diff --git a/image_saxpy.png b/image_saxpy.png
new file mode 100644
index 0000000000000000000000000000000000000000..497045d523981ba3c27216941ee60fb43e15afaf
Binary files /dev/null and b/image_saxpy.png differ