Commit 936b309d authored by Setyo Legowo

Final Term: Setyo Legowo

parent 8284a131
Branch: master
Pipeline #5618 failed
@@ -5,6 +5,7 @@
#include "check.h"
#define SOFTENING 1e-9f
#define N_STREAMS 2 // Number of streams
/*
* Each body contains x, y, and z coordinate positions,
@@ -48,6 +49,37 @@ void bodyForce(Body *p, float dt, int n) {
}
}
// CHANGE ===========================
__global__ void deviceBodyForceAndTranslate(float *buf, float dt, int n, int stream_i) {
// Each stream handles a contiguous chunk of n/N_STREAMS bodies, so offset the global index by the stream's chunk
int tid = blockIdx.x * blockDim.x + threadIdx.x + (stream_i * (n/N_STREAMS));
if (tid >= n) {
return;
}
Body *p = (Body*)buf;
float myX = p[tid].x; float myY = p[tid].y; float myZ = p[tid].z;
float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
for (int i = 0; i < n; i++) {
float dx = p[i].x - myX;
float dy = p[i].y - myY;
float dz = p[i].z - myZ;
float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
float invDist = rsqrtf(distSqr);
float invDist3 = invDist * invDist * invDist;
Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
}
p[tid].vx += dt*Fx; p[tid].vy += dt*Fy; p[tid].vz += dt*Fz;
// Translate
p[tid].x += dt*p[tid].vx;
p[tid].y += dt*p[tid].vy;
p[tid].z += dt*p[tid].vz;
}
// =========================== CHANGE
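// For reference, the (Body*) cast in the kernel above assumes the six-float layout
// implied by the member accesses (x, y, z, vx, vy, vz) and by the later call
// randomizeBodies(buf, 6 * nBodies); a minimal sketch of that layout:
//   typedef struct { float x, y, z, vx, vy, vz; } Body;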
int main(const int argc, const char** argv) {
/*
@@ -73,7 +105,7 @@ int main(const int argc, const char** argv) {
buf = (float *)malloc(bytes);
Body *p = (Body*)buf;
// Body *p = (Body*)buf;
/*
* As a constraint of this exercise, `randomizeBodies` must remain a host function.
@@ -81,6 +113,24 @@
randomizeBodies(buf, 6 * nBodies); // Init pos / vel data
// CHANGE ===========================
cudaStream_t streams[N_STREAMS];
float *device_buf;
int size_in_bytes = sizeof(Body) * nBodies;
int parent_threads = 1024;
if (nBodies < 1024 && nBodies >= 32) {
parent_threads = 32;
}
int parent_blocks = nBodies / (parent_threads * N_STREAMS);
for (int i = 0; i < N_STREAMS; i++) {
cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking);
}
cudaMalloc((void**) &device_buf, size_in_bytes);
cudaMemcpy(device_buf, buf, size_in_bytes, cudaMemcpyHostToDevice);
// =========================== CHANGE
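// Worked example for the launch configuration above, using illustrative (assumed) values:
// with nBodies = 4096, N_STREAMS = 2 and parent_threads = 1024, each stream covers
// 4096 / 2 = 2048 bodies and parent_blocks = 4096 / (1024 * 2) = 2 blocks per launch.
// The integer division assumes nBodies is a multiple of parent_threads * N_STREAMS;
// otherwise parent_blocks under-counts and some bodies are never updated.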
double totalTime = 0.0;
/*
@@ -99,18 +149,21 @@ int main(const int argc, const char** argv) {
* as well as the work to integrate the positions.
*/
bodyForce(p, dt, nBodies); // compute interbody forces
// bodyForce(p, dt, nBodies); // compute interbody forces
for (int i = 0; i < N_STREAMS; i++) {
deviceBodyForceAndTranslate <<< parent_blocks, parent_threads, 0, streams[i] >>> (device_buf, dt, nBodies, i);
}
/*
* This position integration cannot occur until this round of `bodyForce` has completed.
* Also, the next round of `bodyForce` cannot begin until the integration is complete.
*/
for (int i = 0 ; i < nBodies; i++) { // integrate position
p[i].x += p[i].vx*dt;
p[i].y += p[i].vy*dt;
p[i].z += p[i].vz*dt;
}
// for (int i = 0 ; i < nBodies; i++) { // integrate position
// p[i].x += p[i].vx*dt;
// p[i].y += p[i].vy*dt;
// p[i].z += p[i].vz*dt;
// }
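// A minimal sketch of how the dependency described above could be enforced, assuming
// the kernels remain on non-blocking streams (not necessarily how this commit handles it):
//   for (int i = 0; i < N_STREAMS; i++) {
//     cudaStreamSynchronize(streams[i]);
//   }
// Without a synchronization point here, the asynchronous launches of the next
// iteration may overlap with this round's force and position updates.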
/*******************************************************************/
// Do not modify the code in this section.
@@ -121,6 +174,14 @@ int main(const int argc, const char** argv) {
double avgTime = totalTime / (double)(nIters);
float billionsOfOpsPerSecond = 1e-9 * nBodies * nBodies / avgTime;
// CHANGE ===========================
cudaMemcpy(buf, device_buf, size_in_bytes, cudaMemcpyDeviceToHost);
cudaFree(device_buf);
for (int i = 0; i < N_STREAMS; i++) {
cudaStreamDestroy(streams[i]);
}
// =========================== CHANGE
#ifdef ASSESS
checkPerformance(buf, billionsOfOpsPerSecond, salt);
#else
01-saxpy.cu
#include <stdio.h>
#define N 2048 * 2048 // Number of elements in each vector
#define N_STREAMS 64 // Number of streams
#define N_THREADS_PER_BLOCK 1024
/*
* Optimize this already-accelerated codebase. Work iteratively,
@@ -10,46 +12,58 @@
* 20us.
*
* Some bugs have been placed in this codebase for your edification.
*
* CHANGE:
* - Compile command: nvcc -Xptxas -O3,-v 01-saxpy.cu -o 01-saxpy
*/
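// One way to check the kernel duration against the 20us goal mentioned above,
// assuming Nsight Systems is available on the machine:
//   nsys profile --stats=true ./01-saxpy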
__global__ void saxpy(int * a, int * b, int * c)
__global__ void saxpy(float * c)
{
int tid = blockIdx.x * blockDim.x * threadIdx.x;
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if ( tid < N )
c[tid] = 2 * a[tid] + b[tid];
c[tid] = 2 * 2 + 1; // a and b held the constants 2 and 1, so the expression is folded in directly
}
int main()
{
float *a, *b, *c;
float *c_local;
float *c_device;
cudaStream_t streams[N_STREAMS];
int size = N * sizeof (int); // The total number of bytes per vector
int size = N * sizeof (float); // The total number of bytes per vector
cudaMallocManaged(&a, size);
cudaMallocManaged(&b, size);
cudaMallocManaged(&c, size);
c_local = (float*) malloc(size);
cudaMalloc((void**) &c_device, size);
// Initialize memory
for( int i = 0; i < N; ++i )
{
a[i] = 2;
b[i] = 1;
c[i] = 0;
}
int threads_per_block = N_THREADS_PER_BLOCK;
int number_of_blocks_per_stream = (N / (threads_per_block * N_STREAMS));
int size_per_stream = N / N_STREAMS;
int threads_per_block = 128;
int number_of_blocks = (N / threads_per_block) + 1;
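// With the constants defined in this file, the streamed configuration works out to:
// N = 2048 * 2048 = 4,194,304 elements, size_per_stream = 4,194,304 / 64 = 65,536,
// and number_of_blocks_per_stream = 4,194,304 / (1024 * 64) = 64, i.e. each stream
// launches 64 blocks of 1024 threads.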
for (int i = 0; i < N_STREAMS; i++) {
cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking);
saxpy <<< number_of_blocks_per_stream, threads_per_block, 0, streams[i] >>> (c_device + (i * size_per_stream));
}
saxpy <<< number_of_blocks, threads_per_block >>> ( a, b, c );
for (int i = 0; i < N_STREAMS; i++) {
cudaMemcpyAsync(
c_local + (i * size_per_stream),
c_device + (i * size_per_stream),
size_per_stream * sizeof(float),
cudaMemcpyDeviceToHost,
streams[i]
);
cudaStreamSynchronize(streams[i]);
cudaStreamDestroy(streams[i]);
}
// Print out the first and last 5 values of c for a quality check
for( int i = 0; i < 5; ++i )
printf("c[%d] = %d, ", i, c[i]);
printf("c[%d] = %f, ", i, c_local[i]);
printf ("\n");
for( int i = N-5; i < N; ++i )
printf("c[%d] = %d, ", i, c[i]);
printf("c[%d] = %f, ", i, c_local[i]);
printf ("\n");
cudaFree( a ); cudaFree( b ); cudaFree( c );
cudaFree( c_device );
free( c_local );
}