diff --git a/01-nbody.cu b/01-nbody.cu index cdbee3aeb0fd9cb3c28d9e06f9250c4601643c3a..71d8054b53e20631f5df44690a13b6da96d39486 100644 --- a/01-nbody.cu +++ b/01-nbody.cu @@ -29,22 +29,27 @@ void randomizeBodies(float *data, int n) { * on all others, but does not update their positions. */ -void bodyForce(Body *p, float dt, int n) { - for (int i = 0; i < n; ++i) { - float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f; - - for (int j = 0; j < n; j++) { - float dx = p[j].x - p[i].x; - float dy = p[j].y - p[i].y; - float dz = p[j].z - p[i].z; +__global__ void bodyForce(Body *p, float dt, int N) /* kernel: each thread with tid < N sums the contribution of all N bodies on body tid and updates only that body's velocity */ +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; /* global thread index along x; selects the body this thread owns */ + + if (tid < N) { /* threads whose index is past the last body do nothing */ + float Fx = 0, Fy = 0, Fz = 0; + for (int i = 0; i < N; i++) { /* visits every body, including i == tid, where dx = dy = dz = 0 */ + float dx = p[i].x - p[tid].x; + float dy = p[i].y - p[tid].y; + float dz = p[i].z - p[tid].z; float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING; float invDist = rsqrtf(distSqr); float invDist3 = invDist * invDist * invDist; - - Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3; + Fx += dx * invDist3; + Fy += dy * invDist3; + Fz += dz * invDist3; } - p[i].vx += dt*Fx; p[i].vy += dt*Fy; p[i].vz += dt*Fz; + p[tid].vx += dt*Fx; /* vx, vy, vz are the only fields written here; x, y, z are only read */ + p[tid].vy += dt*Fy; + p[tid].vz += dt*Fz; } } @@ -54,7 +59,6 @@ int main(const int argc, const char** argv) { * Do not change the value for `nBodies` here. If you would like to modify it, * pass values into the command line. */ - int nBodies = 2<<11; int salt = 0; if (argc > 1) nBodies = 2<<atoi(argv[1]); @@ -62,7 +66,6 @@ int main(const int argc, const char** argv) { * This salt is for assessment reasons. Tampering with it will result in automatic failure. 
*/ - if (argc > 2) salt = atoi(argv[2]); const float dt = 0.01f; // time step @@ -71,7 +74,7 @@ int main(const int argc, const char** argv) { int bytes = nBodies * sizeof(Body); float *buf; - buf = (float *)malloc(bytes); + cudaMallocManaged(&buf, bytes); /* managed allocation: the same pointer is passed to the kernel and indexed by the host loops */ Body *p = (Body*)buf; @@ -99,19 +102,25 @@ int main(const int argc, const char** argv) { * as well as the work to integrate the positions. */ - bodyForce(p, dt, nBodies); // compute interbody forces + int threads_per_block = 128; + int number_of_blocks = (nBodies + threads_per_block - 1) / threads_per_block; /* round up: plain division gives 0 blocks (an invalid launch) when nBodies < threads_per_block; the kernel's tid < N guard covers the surplus threads */ + bodyForce <<< number_of_blocks, threads_per_block >>> ( p, dt, nBodies ); + + // Wait for GPU to finish before accessing on host + cudaDeviceSynchronize(); - /* - * This position integration cannot occur until this round of `bodyForce` has completed. - * Also, the next round of `bodyForce` cannot begin until the integration is complete. - */ - - for (int i = 0 ; i < nBodies; i++) { // integrate position + // Integrate position + for (int i = 0 ; i < nBodies; i++) { p[i].x += p[i].vx*dt; p[i].y += p[i].vy*dt; p[i].z += p[i].vz*dt; } + /* + * This position integration cannot occur until this round of `bodyForce` has completed. + * Also, the next round of `bodyForce` cannot begin until the integration is complete. + */ + /*******************************************************************/ // Do not modify the code in this section. const double tElapsed = GetTimer() / 1000.0; @@ -134,5 +143,5 @@ int main(const int argc, const char** argv) { * Feel free to modify code below. */ - free(buf); + cudaFree(buf); /* buf came from cudaMallocManaged, so it is released with cudaFree rather than free */ } diff --git a/README.md b/README.md index 71daa2f838e2189d35ce7c72365e9800d54cc4c6..b301371d7dc822ccfd519538ca2ea8fe60c64a98 100644 --- a/README.md +++ b/README.md @@ -6,3 +6,4 @@ Using asynhcronous data transfer from host to device, execution can overlap. 
The ## Task 2 + \ No newline at end of file diff --git a/image_nbody_11.png b/image_nbody_11.png new file mode 100644 index 0000000000000000000000000000000000000000..e3f2645d240b8dd654258a551101f038c72d5c73 Binary files /dev/null and b/image_nbody_11.png differ diff --git a/image_nbody_13.png b/image_nbody_13.png new file mode 100644 index 0000000000000000000000000000000000000000..124f27ab23b51526dbe3aaf02045d4fdd38bd75a Binary files /dev/null and b/image_nbody_13.png differ