Skip to content
Snippets Groups Projects
Commit 2345ec34 authored by whiteclips's avatar whiteclips
Browse files

Editing files

parent 8284a131
Branches
No related merge requests found
Pipeline #5620 canceled with stages
#include <stdio.h>
// The expansion is parenthesized so N behaves as a single value inside
// larger expressions such as `N-5` or `x % N`.
#define N (2048 * 2048) // Number of elements in each vector
#define STREAMS 2048 // Number of CUDA streams the vectors are split across
/*
* Optimize this already-accelerated codebase. Work iteratively,
......@@ -12,25 +13,27 @@
* Some bugs have been placed in this codebase for your edification.
*/
/*
 * saxpy kernel: each thread computes one element, c[i] = 2 * a[i] + b[i].
 *
 * a, b   - input vectors (device pointers)
 * c      - output vector (device pointer)
 * offset - index of the first element covered by this launch; it is added
 *          to the launch-local thread index so that one launch can process
 *          a chunk of the vectors
 *
 * Only the x dimension of the grid and of the block is used.
 *
 * NOTE(review): the bounds check is against the full vector length N, not
 * against the end of the chunk that starts at `offset`. Threads of a launch
 * that lie past the chunk (but below N) still write to c — confirm this is
 * acceptable for the launch configuration used by the caller.
 */
__global__ void saxpy(float * a, float * b, float * c, int offset)
{
    // Global element index: chunk start plus the flat thread index.
    int tid = offset + blockIdx.x * blockDim.x + threadIdx.x;

    if ( tid < N )
    {
        // Float literal keeps the arithmetic in single precision.
        c[tid] = 2.0f * a[tid] + b[tid];
    }
}
int main()
{
float *a, *b, *c;
int size = N * sizeof (int); // The total number of bytes per vector
cudaMallocManaged(&a, size);
cudaMallocManaged(&b, size);
cudaMallocManaged(&c, size);
// Allocating memory in host as well as device
float *a, *b, *c, *da, *db, *dc;
int size = N * sizeof (int);
cudaMallocHost((void**)&a, size);
cudaMallocHost((void**)&b, size);
cudaMallocHost((void**)&c, size);
cudaMalloc((void**)&da, size);
cudaMalloc((void**)&db, size);
cudaMalloc((void**)&dc, size);
// Initialize memory
// Initialize memory in host
for( int i = 0; i < N; ++i )
{
a[i] = 2;
......@@ -38,17 +41,44 @@ int main()
c[i] = 0;
}
int threads_per_block = 128;
int number_of_blocks = (N / threads_per_block) + 1;
// Initialize streams
cudaStream_t streams[STREAMS];
for (int i = 0; i < STREAMS; i++) {
cudaStreamCreate(&streams[i]);
}
// Loop to copy and execute kernel
int stream_size = N / STREAMS;
int stream_bytes = stream_size * sizeof(int);
for (int i = 0; i < STREAMS; ++i) {
int offset = i * stream_size;
cudaMemcpyAsync(&da[offset], &a[offset], stream_bytes, cudaMemcpyHostToDevice, streams[i]);
cudaMemcpyAsync(&db[offset], &b[offset], stream_bytes, cudaMemcpyHostToDevice, streams[i]);
cudaMemcpyAsync(&dc[offset], &c[offset], stream_bytes, cudaMemcpyHostToDevice, streams[i]);
int threads_per_block = 128;
int number_of_blocks = (stream_size / threads_per_block) + 1;
saxpy <<< number_of_blocks, threads_per_block, 0, streams[i] >>> ( da, db, dc, offset );
cudaMemcpyAsync(&a[offset], &da[offset], stream_bytes, cudaMemcpyDeviceToHost, streams[i]);
cudaMemcpyAsync(&b[offset], &db[offset], stream_bytes, cudaMemcpyDeviceToHost, streams[i]);
cudaMemcpyAsync(&c[offset], &dc[offset], stream_bytes, cudaMemcpyDeviceToHost, streams[i]);
}
// Wait for GPU to finish before accessing on host
cudaDeviceSynchronize();
saxpy <<< number_of_blocks, threads_per_block >>> ( a, b, c );
// Calculating error
int number_missed = 0;
for (int i = 0; i < N; i++) {
if (c[i] != 5) number_missed++;
}
printf("Number missed: %d out of %d\n", number_missed, N);
// Print out the first and last 5 values of c for a quality check
for( int i = 0; i < 5; ++i )
printf("c[%d] = %d, ", i, c[i]);
printf("c[%d] = %lf, ", i, c[i]);
printf ("\n");
for( int i = N-5; i < N; ++i )
printf("c[%d] = %d, ", i, c[i]);
printf("c[%d] = %lf, ", i, c[i]);
printf ("\n");
cudaFree( a ); cudaFree( b ); cudaFree( c );
......
# IF5160
Tugas UAS
1. Jalankan dan optimasi kode 01-saxpy.cu
kompilasi
nvcc 01-saxpy.cu -o saxpy
eksekusi
./saxpy
2. Jalankan dan optimasi kode 01-nbody.cu berikut ini.
kompilasi:
nvcc 01-nbody.cu -o nbody
eksekusi:
./nbody
atau
./nbody x
dimana x adalah eksponen yang menentukan jumlah partikel/body, yaitu 2 << x.
misal: ./nbody 11 artinya menjalankan dengan jumlah partikel 2 << 11, yaitu 4096
Optimasi kode di atas, dan ujilah dengan jumlah partikel 4096 (./nbody 11) dan 16K (./nbody 13)
Eksekusi kode tersebut akan menampilkan jumlah interaksi antar body per detik.
Pada server 167.205.32.100, baseline nya adalah 0.039 Billion interaction per second untuk 4096 body,
dan
catatan:
1. jangan mengubah kode fungsi randomizeBodies, karena fungsi ini berjalan pada host dan tidak perlu/bisa dioptimasi
# IF5160
## Task 1
Using asynchronous data transfer from host to device, data transfers and kernel execution can overlap. For the given case, the total time taken is around 12 us. Please see the attached image for proof of execution.
## Task 2
image_saxpy.png

223 KiB

0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment