Commit 936b309d authored by Setyo Legowo

Final Term: Setyo Legowo

parent 8284a131
Branch: master
Pipeline #5618 failed
@@ -5,6 +5,7 @@
#include "check.h"
#define SOFTENING 1e-9f
#define N_STREAMS 2 // Number of streams
/*
* Each body contains x, y, and z coordinate positions,
@@ -48,6 +49,37 @@ void bodyForce(Body *p, float dt, int n) {
}
}
// CHANGE ===========================
__global__ void deviceBodyForceAndTranslate(float *buf, float dt, int n, int stream_i) {
// Each stream handles a contiguous chunk of n/N_STREAMS bodies, so offset the global index by the stream's chunk
int tid = blockIdx.x * blockDim.x + threadIdx.x + (stream_i * (n/N_STREAMS));
if (tid >= n) {
return;
}
Body *p = (Body*)buf;
float myX = p[tid].x; float myY = p[tid].y; float myZ = p[tid].z;
float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
for (int i = 0; i < n; i++) {
float dx = p[i].x - myX;
float dy = p[i].y - myY;
float dz = p[i].z - myZ;
float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
float invDist = rsqrtf(distSqr);
float invDist3 = invDist * invDist * invDist;
Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
}
p[tid].vx += dt*Fx; p[tid].vy += dt*Fy; p[tid].vz += dt*Fz;
// Translate
p[tid].x += dt*p[tid].vx;
p[tid].y += dt*p[tid].vy;
p[tid].z += dt*p[tid].vz;
}
// =========================== CHANGE
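// For reference, the (Body*) cast in the kernel above assumes the six-float layout
// implied by the member accesses (x, y, z, vx, vy, vz) and by the later call
// randomizeBodies(buf, 6 * nBodies); a minimal sketch of that layout:
//   typedef struct { float x, y, z, vx, vy, vz; } Body;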
int main(const int argc, const char** argv) {
/*
@@ -73,7 +105,7 @@ int main(const int argc, const char** argv) {
buf = (float *)malloc(bytes);
Body *p = (Body*)buf;
// Body *p = (Body*)buf;
/*
* As a constraint of this exercise, `randomizeBodies` must remain a host function.
@@ -81,6 +113,24 @@
randomizeBodies(buf, 6 * nBodies); // Init pos / vel data
// CHANGE ===========================
cudaStream_t streams[N_STREAMS];
float *device_buf;
int size_in_bytes = sizeof(Body) * nBodies;
int parent_threads = 1024;
if (nBodies < 1024 && nBodies >= 32) {
parent_threads = 32;
}
int parent_blocks = nBodies / (parent_threads * N_STREAMS);
for (int i = 0; i < N_STREAMS; i++) {
cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking);
}
cudaMalloc((void**) &device_buf, size_in_bytes);
cudaMemcpy(device_buf, buf, size_in_bytes, cudaMemcpyHostToDevice);
// =========================== CHANGE
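// Worked example for the launch configuration above, using illustrative (assumed) values:
// with nBodies = 4096, N_STREAMS = 2 and parent_threads = 1024, each stream covers
// 4096 / 2 = 2048 bodies and parent_blocks = 4096 / (1024 * 2) = 2 blocks per launch.
// The integer division assumes nBodies is a multiple of parent_threads * N_STREAMS;
// otherwise parent_blocks under-counts and some bodies are never updated.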
double totalTime = 0.0;
/*
@@ -99,18 +149,21 @@ int main(const int argc, const char** argv) {
* as well as the work to integrate the positions.
*/
bodyForce(p, dt, nBodies); // compute interbody forces
// bodyForce(p, dt, nBodies); // compute interbody forces
for (int i = 0; i < N_STREAMS; i++) {
deviceBodyForceAndTranslate <<< parent_blocks, parent_threads, 0, streams[i] >>> (device_buf, dt, nBodies, i);
}
/*
* This position integration cannot occur until this round of `bodyForce` has completed.
* Also, the next round of `bodyForce` cannot begin until the integration is complete.
*/
for (int i = 0 ; i < nBodies; i++) { // integrate position
p[i].x += p[i].vx*dt;
p[i].y += p[i].vy*dt;
p[i].z += p[i].vz*dt;
}
// for (int i = 0 ; i < nBodies; i++) { // integrate position
// p[i].x += p[i].vx*dt;
// p[i].y += p[i].vy*dt;
// p[i].z += p[i].vz*dt;
// }
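// A minimal sketch of how the dependency described above could be enforced, assuming
// the kernels remain on non-blocking streams (not necessarily how this commit handles it):
//   for (int i = 0; i < N_STREAMS; i++) {
//     cudaStreamSynchronize(streams[i]);
//   }
// Without a synchronization point here, the asynchronous launches of the next
// iteration may overlap with this round's force and position updates.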
/*******************************************************************/
// Do not modify the code in this section.
@@ -121,6 +174,14 @@ int main(const int argc, const char** argv) {
double avgTime = totalTime / (double)(nIters);
float billionsOfOpsPerSecond = 1e-9 * nBodies * nBodies / avgTime;
// CHANGE ===========================
cudaMemcpy(buf, device_buf, size_in_bytes, cudaMemcpyDeviceToHost);
cudaFree(device_buf);
for (int i = 0; i < N_STREAMS; i++) {
cudaStreamDestroy(streams[i]);
}
// =========================== CHANGE
#ifdef ASSESS
checkPerformance(buf, billionsOfOpsPerSecond, salt);
#else
01-saxpy.cu
#include <stdio.h>
#define N 2048 * 2048 // Number of elements in each vector
#define N_STREAMS 64 // Number of streams
#define N_THREADS_PER_BLOCK 1024
/*
* Optimize this already-accelerated codebase. Work iteratively,
@@ -10,46 +12,58 @@
* 20us.
*
* Some bugs have been placed in this codebase for your edification.
*
* CHANGE:
* - Compile command: nvcc -Xptxas -O3,-v 01-saxpy.cu -o 01-saxpy
*/
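// One way to check the kernel duration against the 20us goal mentioned above,
// assuming Nsight Systems is available on the machine:
//   nsys profile --stats=true ./01-saxpy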
__global__ void saxpy(int * a, int * b, int * c)
__global__ void saxpy(float * c)
{
int tid = blockIdx.x * blockDim.x * threadIdx.x;
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if ( tid < N )
c[tid] = 2 * a[tid] + b[tid];
c[tid] = 2 * 2 + 1; // a and b held the constants 2 and 1, so the expression is folded in directly
}
int main()
{
float *a, *b, *c;
float *c_local;
float *c_device;
cudaStream_t streams[N_STREAMS];
int size = N * sizeof (int); // The total number of bytes per vector
int size = N * sizeof (float); // The total number of bytes per vector
cudaMallocManaged(&a, size);
cudaMallocManaged(&b, size);
cudaMallocManaged(&c, size);
c_local = (float*) malloc(size);
cudaMalloc((void**) &c_device, size);
// Initialize memory
for( int i = 0; i < N; ++i )
{
a[i] = 2;
b[i] = 1;
c[i] = 0;
}
int threads_per_block = N_THREADS_PER_BLOCK;
int number_of_blocks_per_stream = (N / (threads_per_block * N_STREAMS));
int size_per_stream = N / N_STREAMS;
int threads_per_block = 128;
int number_of_blocks = (N / threads_per_block) + 1;
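// With the constants defined in this file, the streamed configuration works out to:
// N = 2048 * 2048 = 4,194,304 elements, size_per_stream = 4,194,304 / 64 = 65,536,
// and number_of_blocks_per_stream = 4,194,304 / (1024 * 64) = 64, i.e. each stream
// launches 64 blocks of 1024 threads.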
for (int i = 0; i < N_STREAMS; i++) {
cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking);
saxpy <<< number_of_blocks_per_stream, threads_per_block, 0, streams[i] >>> (c_device + (i * size_per_stream));
}
saxpy <<< number_of_blocks, threads_per_block >>> ( a, b, c );
for (int i = 0; i < N_STREAMS; i++) {
cudaMemcpyAsync(
c_local + (i * size_per_stream),
c_device + (i * size_per_stream),
size_per_stream * sizeof(float),
cudaMemcpyDeviceToHost,
streams[i]
);
cudaStreamSynchronize(streams[i]);
cudaStreamDestroy(streams[i]);
}
// Print out the first and last 5 values of c for a quality check
for( int i = 0; i < 5; ++i )
printf("c[%d] = %d, ", i, c[i]);
printf("c[%d] = %f, ", i, c_local[i]);
printf ("\n");
for( int i = N-5; i < N; ++i )
printf("c[%d] = %d, ", i, c[i]);
printf("c[%d] = %f, ", i, c_local[i]);
printf ("\n");
cudaFree( a ); cudaFree( b ); cudaFree( c );
cudaFree( c_device );
free( c_local );
}