diff --git a/build/lib/libradar.so b/build/lib/libradar.so index 68d621af8f1b6759e039c6fb29e1b71d0fa47068..0ffbbe99a7b270f575c60c4c22909d313ecc61ea 100755 Binary files a/build/lib/libradar.so and b/build/lib/libradar.so differ diff --git a/lib/lib.go b/lib/lib.go index b42dfa30c3ffccbea0b1d3fc8c5967c78181cd94..079be52ff348b65e470c06d2f66f356c84203866 100644 --- a/lib/lib.go +++ b/lib/lib.go @@ -16,29 +16,33 @@ func Cleanup() { C.cleanup() } -func CalculateReflectivity(inputData data.RawData) data.CalculatedData { - r_hh, i_hh := ConvertToC(inputData.RealHH, inputData.ImagHH) - r_vv, i_vv := ConvertToC(inputData.RealVV, inputData.ImagVV) - r_hv, i_hv := ConvertToC(inputData.RealHV, inputData.ImagHV) - +func FetchReflectivity(elevationIndex int32, sectorIndex int32) data.CalculatedData { zdb_c := make([]C.float, 512) zdr_c := make([]C.float, 512) - C.calculate_reflectivity( - &r_hh[0], &i_hh[0], &r_vv[0], - &i_vv[0], &r_hv[0], &i_hv[0], - &zdb_c[0], &zdr_c[0], - ) + C.fetch_reflectivity(C.int(sectorIndex), &zdb_c[0], &zdr_c[0]) zdb := ConvertToGo(zdb_c) zdr := ConvertToGo(zdr_c) return data.CalculatedData{ - SectorIndex: inputData.SectorIndex, - ElevationIndex: inputData.ElevationIndex, + ElevationIndex: elevationIndex, + SectorIndex: sectorIndex, Reflectivity: data.RadarDataPair{ Zdb: zdb, Zdr: zdr, }, } } + +func CalculateReflectivity(inputData data.RawData) { + r_hh, i_hh := ConvertToC(inputData.RealHH, inputData.ImagHH) + r_vv, i_vv := ConvertToC(inputData.RealVV, inputData.ImagVV) + r_hv, i_hv := ConvertToC(inputData.RealHV, inputData.ImagHV) + + C.calculate_reflectivity( + C.int(inputData.SectorIndex), + &r_hh[0], &i_hh[0], &r_vv[0], + &i_vv[0], &r_hv[0], &i_hv[0], + ) +} diff --git a/lib/radar.cu b/lib/radar.cu index 5d6b294694added860841a69b2222ab3a5847736..65ed2ac0cb0975ab2f8e4ec77eb398b32f1a7a88 100644 --- a/lib/radar.cu +++ b/lib/radar.cu @@ -5,7 +5,8 @@ #include <fftw3.h> #include <cufft.h> #include <sys/time.h> -#include <assert.h> +#include <assert.h> +// #include <fstream> using namespace std; @@ -13,6 +14,8 @@ using namespace std; #define k_calib 1941.05 #define RESULT_SIZE 2 +#define MAX_SECTOR 142 +#define NSTREAMS 16 #define DEBUG @@ -73,81 +76,47 @@ float *generate_ma_coef(int n){ __constant__ cuFloatComplex d_ma[512]; -__global__ void __apply_hamming(cuFloatComplex *a, float *b) { +__global__ void __apply_hamming(cuFloatComplex *a, float *b, int offset) { const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; - a[idx] = make_cuFloatComplex(b[idx]*cuCrealf(a[idx]), b[idx]*cuCimagf(a[idx])); + a[offset+idx] = make_cuFloatComplex(b[idx]*cuCrealf(a[offset+idx]), b[idx]*cuCimagf(a[offset+idx])); } -__global__ void __apply_ma(cuFloatComplex *inout, cuFloatComplex *macoef) { - const unsigned int i = blockIdx.x, j = threadIdx.x, n = blockDim.x; - - inout[i*n+j] = cuCmulf(inout[i*n+j], macoef[j]); -} - -__global__ void __apply_ma_v2(cuFloatComplex *inout) { - const unsigned int i = blockIdx.x, j = threadIdx.x, n = blockDim.x; - - inout[i*n+j] = cuCmulf(inout[i*n+j], d_ma[j]); -} - -__global__ void __conjugate(cuFloatComplex *a) { - const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; - a[idx].y *= -1; -} - -__global__ void __shift(cuFloatComplex *inout, int n) { - const unsigned int i = blockIdx.x, j = threadIdx.x; - - cuFloatComplex temp = inout[i*n+j]; - inout[i*n+j] = inout[i*n+(j+n/2)]; - inout[i*n+(j+n/2)] = temp; -} - -__global__ void __conjshift(cuFloatComplex *inout, int n) { - const unsigned int i = blockIdx.x, j = threadIdx.x; - - cuFloatComplex t1 = inout[i*n+j]; - cuFloatComplex t2 = inout[i*n+(j+n/2)]; - t1.y *= -1; - t2.y *= -1; - inout[i*n+j] = t2; - inout[i*n+(j+n/2)] = t1; -} - -__global__ void __clip(cuFloatComplex *inout, int n) { - const unsigned int i = blockIdx.x, j = n-threadIdx.x-1; - inout[i*n+j] = make_cuFloatComplex(0, 0); +__global__ void __apply_hamming_v2(cuFloatComplex *a, float *b, int offset) { + const unsigned int i = 4*blockIdx.x, j = threadIdx.x, n = blockDim.x; + #pragma unroll + for (unsigned int d=0; d<4; d++) { + a[offset+i*n+j+n*d] = make_cuFloatComplex(b[i*n+j+n*d]*cuCrealf(a[offset+i*n+j+n*d]), b[i*n+j+n*d]*cuCimagf(a[offset+i*n+j+n*d])); + } } -__global__ void __abssqr(cuFloatComplex *inout, int n) { - const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; - - float real, imag; - real = cuCrealf(inout[idx]); - imag = cuCimagf(inout[idx]); - inout[idx] = make_cuFloatComplex(real*real + imag*imag, 0); +__global__ void __apply_hamming_v3(cuFloatComplex *a, float *b, int offset) { + const unsigned int i = blockIdx.x, j = 4*threadIdx.x, n = 4*blockDim.x; + #pragma unroll + for (unsigned int d=0; d<4; d++) { + a[offset+i*n+j+d] = make_cuFloatComplex(b[i*n+j+d]*cuCrealf(a[offset+i*n+j+d]), b[i*n+j+d]*cuCimagf(a[offset+i*n+j+d])); + } } -__global__ void __sum(cuFloatComplex *in, cuFloatComplex *out) { +__global__ void __sum(cuFloatComplex *in, cuFloatComplex *out, int offset) { const unsigned int i = blockIdx.x, j = threadIdx.x, n = blockDim.x; - out[i*n+j] = make_cuFloatComplex(in[i*n+j].x, in[i*n+j].y); + out[offset+i*n+j] = make_cuFloatComplex(in[offset+i*n+j].x, in[offset+i*n+j].y); __syncthreads(); for (unsigned int s=blockDim.x/2; s>0; s>>=1) { if (j < s) { - out[i*n+j] = cuCaddf(out[i*n+j], out[i*n+j+s]); + out[offset+i*n+j] = cuCaddf(out[offset+i*n+j], out[offset+i*n+j+s]); } __syncthreads(); } } -__global__ void __sum_v2(cuFloatComplex *in, cuFloatComplex *out) { +__global__ void __sum_v2(cuFloatComplex *in, cuFloatComplex *out, int offset) { const unsigned int i = 2*blockIdx.x, j = threadIdx.x, n = blockDim.x; #pragma unroll for (unsigned int d=0; d<2; d++) { - out[i*n+j+n*d] = make_cuFloatComplex(in[i*n+j+n*d].x, in[i*n+j+n*d].y); + out[offset+i*n+j+n*d] = make_cuFloatComplex(in[offset+i*n+j+n*d].x, in[offset+i*n+j+n*d].y); } __syncthreads(); @@ -155,22 +124,23 @@ __global__ void __sum_v2(cuFloatComplex *in, cuFloatComplex *out) { if (j < s) { #pragma unroll for (unsigned int d=0; d<2; d++) { - out[i*n+j+n*d] = cuCaddf(out[i*n+j+n*d], out[i*n+j+n*d+s]); + out[offset+i*n+j+n*d] = cuCaddf(out[offset+i*n+j+n*d], out[offset+i*n+j+n*d+s]); } } __syncthreads(); } } -__global__ void __sum_v3(cuFloatComplex *in, cuFloatComplex *out) { +__global__ void __sum_v3(cuFloatComplex *in, cuFloatComplex *out, int offset) { const unsigned int i = blockIdx.x, j = threadIdx.x, n = blockDim.x; extern __shared__ cuFloatComplex sdata[]; - sdata[j] = make_cuFloatComplex(in[i*n+j].x, in[i*n+j].y); + sdata[j] = make_cuFloatComplex(in[offset+i*n+j].x, in[offset+i*n+j].y); __syncthreads(); for (unsigned int s=blockDim.x/2; s>0; s>>=1) { if (j < s) { + // out[offset+i*n+j] = cuCaddf(out[offset+i*n+j], out[offset+i*n+j+s]); sdata[j] = cuCaddf(sdata[j], sdata[j+s]); } __syncthreads(); @@ -181,13 +151,13 @@ __global__ void __sum_v3(cuFloatComplex *in, cuFloatComplex *out) { } } -__global__ void __sum_v4(cuFloatComplex *in, cuFloatComplex *out) { +__global__ void __sum_v4(cuFloatComplex *in, cuFloatComplex *out, int offset) { const unsigned int i = 2*blockIdx.x, j = threadIdx.x, n = blockDim.x; extern __shared__ cuFloatComplex sdata[]; #pragma unroll for (unsigned int d=0; d<2; d++) { - sdata[j+n*d] = make_cuFloatComplex(in[i*n+j+n*d].x, in[i*n+j+n*d].y); + sdata[j+n*d] = make_cuFloatComplex(in[offset+i*n+j+n*d].x, in[offset+i*n+j+n*d].y); } __syncthreads(); @@ -209,20 +179,83 @@ __global__ void __sum_v4(cuFloatComplex *in, cuFloatComplex *out) { } } -__global__ void __sum_inplace(cuFloatComplex *g_idata) { +__global__ void __avgconj(cuFloatComplex *inout, cuFloatComplex *sum, int offset) { + const unsigned int i = blockIdx.x, j = threadIdx.x, n = blockDim.x; + + float avgx = sum[offset+i*n].x/n; + float avgy = sum[offset+i*n].y/n; + inout[offset+i*n+j] = make_cuFloatComplex(inout[offset+i*n+j].x-avgx, (inout[offset+i*n+j].y-avgy)*-1); +} + +__global__ void __conjugate(cuFloatComplex *a, int offset) { + const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + a[offset+idx].y *= -1; +} + +__global__ void __shift(cuFloatComplex *inout, int n, int offset) { + const unsigned int i = blockIdx.x, j = threadIdx.x; + + cuFloatComplex temp = inout[offset+i*n+j]; + inout[offset+i*n+j] = inout[offset+i*n+(j+n/2)]; + inout[offset+i*n+(j+n/2)] = temp; +} + +__global__ void __conjshift(cuFloatComplex *inout, int n, int offset) { + const unsigned int i = blockIdx.x, j = threadIdx.x; + + cuFloatComplex t1 = inout[offset+i*n+j]; + cuFloatComplex t2 = inout[offset+i*n+(j+n/2)]; + t1.y *= -1; + t2.y *= -1; + inout[offset+i*n+j] = t2; + inout[offset+i*n+(j+n/2)] = t1; +} + +__global__ void __clip(cuFloatComplex *inout, int n, int offset) { + const unsigned int i = blockIdx.x, j = n-threadIdx.x-1; + inout[offset+i*n+j] = make_cuFloatComplex(0, 0); +} + +__global__ void __clip_v2(cuFloatComplex *inout, int n, int offset) { + const unsigned int i = threadIdx.x, j = n-blockIdx.x-1; + inout[offset+i*n+j] = make_cuFloatComplex(0, 0); +} + +__global__ void __abssqr(cuFloatComplex *inout, int n, int offset) { + const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + + float real, imag; + real = cuCrealf(inout[offset+idx]); + imag = cuCimagf(inout[offset+idx]); + inout[offset+idx] = make_cuFloatComplex(real*real + imag*imag, 0); +} + +__global__ void __apply_ma(cuFloatComplex *inout, int offset) { + const unsigned int i = blockIdx.x, j = threadIdx.x, n = blockDim.x; + + inout[offset+i*n+j] = cuCmulf(inout[offset+i*n+j], d_ma[j]); +} + +__global__ void __scale_real(cuFloatComplex *inout, int offset) { + const unsigned int i = blockIdx.x, j = threadIdx.x, n = blockDim.x; + + inout[offset+i*n+j] = make_cuFloatComplex(inout[offset+i*n+j].x/n, 0); +} + +__global__ void __sum_inplace(cuFloatComplex *g_idata, int offset) { const unsigned int i = blockIdx.x, j = threadIdx.x, n = blockDim.x; // __syncthreads(); for (unsigned int s=blockDim.x/2; s>0; s>>=1) { if (j < s) { // g_idata[i] = make_cuFloatComplex(g_idata[i].x+g_idata[i + s].x, 0); - g_idata[i*n+j] = cuCaddf(g_idata[i*n+j], g_idata[i*n+j+s]); + g_idata[offset+i*n+j] = cuCaddf(g_idata[offset+i*n+j], g_idata[offset+i*n+j+s]); } __syncthreads(); } } -__global__ void __sum_inplace_v2(cuFloatComplex *g_idata) { +__global__ void __sum_inplace_v2(cuFloatComplex *g_idata, int offset) { const unsigned int i = 2*blockIdx.x, j = threadIdx.x, n = blockDim.x; // __syncthreads(); @@ -231,18 +264,18 @@ __global__ void __sum_inplace_v2(cuFloatComplex *g_idata) { // g_idata[i] = make_cuFloatComplex(g_idata[i].x+g_idata[i + s].x, 0); #pragma unroll for (unsigned int d=0; d<2; d++) { - g_idata[i*n+j+n*d] = cuCaddf(g_idata[i*n+j+n*d], g_idata[i*n+j+n*d+s]); + g_idata[offset+i*n+j+n*d] = cuCaddf(g_idata[offset+i*n+j+n*d], g_idata[offset+i*n+j+n*d+s]); } } __syncthreads(); } } -__global__ void __sum_inplace_v3(cuFloatComplex *in) { +__global__ void __sum_inplace_v3(cuFloatComplex *in, int offset) { const unsigned int i = blockIdx.x, j = threadIdx.x, n = blockDim.x; extern __shared__ cuFloatComplex sdata[]; - sdata[j] = make_cuFloatComplex(in[i*n+j].x, in[i*n+j].y); + sdata[j] = make_cuFloatComplex(in[offset+i*n+j].x, in[offset+i*n+j].y); __syncthreads(); for (unsigned int s=blockDim.x/2; s>0; s>>=1) { @@ -257,13 +290,13 @@ __global__ void __sum_inplace_v3(cuFloatComplex *in) { } } -__global__ void __sum_inplace_v4(cuFloatComplex *in) { +__global__ void __sum_inplace_v4(cuFloatComplex *in, int offset) { const unsigned int i = 2*blockIdx.x, j = threadIdx.x, n = blockDim.x; extern __shared__ cuFloatComplex sdata[]; #pragma unroll for (unsigned int d=0; d<2; d++) { - sdata[j+n*d] = make_cuFloatComplex(in[i*n+j+n*d].x, in[i*n+j+n*d].y); + sdata[j+n*d] = make_cuFloatComplex(in[offset+i*n+j+n*d].x, in[offset+i*n+j+n*d].y); } __syncthreads(); @@ -285,38 +318,14 @@ __global__ void __sum_inplace_v4(cuFloatComplex *in) { } } -__global__ void __avgconj(cuFloatComplex *inout, cuFloatComplex *sum) { - const unsigned int i = blockIdx.x, j = threadIdx.x, n = blockDim.x; - - float avgx = sum[i*n].x/n; - float avgy = sum[i*n].y/n; - inout[i*n+j] = make_cuFloatComplex(inout[i*n+j].x-avgx, (inout[i*n+j].y-avgy)*-1); -} - -__global__ void __scale_real(cuFloatComplex *inout) { - const unsigned int i = blockIdx.x, j = threadIdx.x, n = blockDim.x; - - inout[i*n+j] = make_cuFloatComplex(inout[i*n+j].x/n, 0); -} - -__global__ void __calcresult(cuFloatComplex *hh, cuFloatComplex *vv, cuFloatComplex *hv, float *out, int n) { - const unsigned int i = blockIdx.x; - - float z = pow(i*k_rangeres, 2.0) * k_calib * hh[i*n].x; - float zdb = 10 * log10(z); - float zdr = 10 * (log10(hh[i*n].x)-log10(vv[i*n].x)); - out[i*RESULT_SIZE+0] = zdb; - out[i*RESULT_SIZE+1] = zdr; -} - -__global__ void __calcresult_v2(cuFloatComplex *hh, cuFloatComplex *vv, cuFloatComplex *hv, float *out, int n) { +__global__ void __calcresult_v2(cuFloatComplex *hh, cuFloatComplex *vv, cuFloatComplex *hv, float *out, int n, int offset, int result_offset) { const unsigned int i = threadIdx.x; - float z = pow(i*k_rangeres, 2.0) * k_calib * hh[i*n].x; + float z = pow(i*k_rangeres, 2.0) * k_calib * hh[offset+i*n].x; float zdb = 10 * log10(z); - float zdr = 10 * (log10(hh[i*n].x)-log10(vv[i*n].x)); - out[i*RESULT_SIZE+0] = zdb; - out[i*RESULT_SIZE+1] = zdr; + float zdr = 10 * (log10(hh[offset+i*n].x)-log10(vv[offset+i*n].x)); + out[result_offset+i*RESULT_SIZE+0] = zdb; + out[result_offset+i*RESULT_SIZE+1] = zdr; } void tick(timeval *begin) { @@ -335,36 +344,42 @@ void tock(timeval *begin, timeval *end, string caption) { const int m = 1024; // cell const int n = 512; // sweep +const int ma_count = 7; -cuFloatComplex *iqhh, *iqvv, *iqhv; -cuFloatComplex *d_iqhh, *d_iqvv, *d_iqhv; -cuFloatComplex *d_sum; -float *d_result; float *result; -cufftHandle fft_range_handle; -cufftHandle fft_doppler_handle; -cufftHandle fft_pdop_handle; +cudaStream_t stream[NSTREAMS]; -/*__constant__*/ float *d_hamming; +// CUFFT initialization +cufftHandle *fft_range_handle = new cufftHandle[NSTREAMS]; +cufftHandle *fft_doppler_handle = new cufftHandle[NSTREAMS]; +cufftHandle *fft_pdop_handle = new cufftHandle[NSTREAMS]; +cuFloatComplex *iqhh, *iqvv, *iqhv, *p_iqhh, *p_iqvv, *p_iqhv; + +// Device buffers +__constant__ float *d_hamming; +cuFloatComplex *d_iqhh, *d_iqvv, *d_iqhv; +cuFloatComplex *d_sum; +float *d_result; extern "C" { void initialize() { - const int ma_count = 7; - iqhh = new cuFloatComplex[m*n]; iqvv = new cuFloatComplex[m*n]; iqhv = new cuFloatComplex[m*n]; - result = new float[(m/2)*RESULT_SIZE]; - - float a, b; + cudaMallocHost((void**)&p_iqhh, NSTREAMS*m*n*sizeof(cuFloatComplex)); + cudaMallocHost((void**)&p_iqvv, NSTREAMS*m*n*sizeof(cuFloatComplex)); + cudaMallocHost((void**)&p_iqhv, NSTREAMS*m*n*sizeof(cuFloatComplex)); + + result = new float[NSTREAMS*(m/2)*RESULT_SIZE]; // Generate Hamming coefficients - const float *hamming_coef = generate_hamming_coef(m, n); - + const float* hamming_coef = generate_hamming_coef(m, n); + // Generate MA coefficients - float *ma_coef = generate_ma_coef(ma_count); + const float* ma_coef = generate_ma_coef(ma_count); + fftwf_complex *_fft_ma = (fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex) * n); fftwf_plan fft_ma_plan = fftwf_plan_dft_1d(n, _fft_ma, _fft_ma, FFTW_FORWARD, FFTW_ESTIMATE); for (int j=0; j<ma_count; j++) { @@ -377,7 +392,6 @@ extern "C" { } fftwf_execute(fft_ma_plan); fftwf_destroy_plan(fft_ma_plan); - cuFloatComplex *fft_ma; fft_ma = new cuFloatComplex[n]; for (int j=0; j<n; j++) { @@ -385,24 +399,19 @@ extern "C" { } fftwf_free(_fft_ma); - // Device buffers - // /*__constant__*/ cuFloatComplex *d_ma; - //float *d_powhh, *d_powvv; - cudaMalloc(&d_hamming, m*n*sizeof(float)); // cudaMalloc(&d_ma, n*sizeof(cuFloatComplex)); - cudaMalloc(&d_iqhh, m*n*sizeof(cuFloatComplex)); - cudaMalloc(&d_iqvv, m*n*sizeof(cuFloatComplex)); - cudaMalloc(&d_iqhv, m*n*sizeof(cuFloatComplex)); - cudaMalloc(&d_sum, m*n*sizeof(cuFloatComplex)); - cudaMalloc(&d_result, (m/2)*RESULT_SIZE*sizeof(float)); + cudaMalloc(&d_iqhh, NSTREAMS*m*n*sizeof(cuFloatComplex)); + cudaMalloc(&d_iqvv, NSTREAMS*m*n*sizeof(cuFloatComplex)); + cudaMalloc(&d_iqhv, NSTREAMS*m*n*sizeof(cuFloatComplex)); + cudaMalloc(&d_sum, NSTREAMS*m*n*sizeof(cuFloatComplex)); + + cudaMalloc(&d_result, (m/2) * RESULT_SIZE * MAX_SECTOR * sizeof(float)); + cudaMemset(d_result, -INFINITY, (m/2) * RESULT_SIZE * MAX_SECTOR * sizeof(float)); cudaMemcpy(d_hamming, hamming_coef, m*n*sizeof(float), cudaMemcpyHostToDevice); - // cudaMemcpy(d_ma, fft_ma, n*sizeof(cuFloatComplex), cudaMemcpyHostToDevice); cudaMemcpyToSymbol(d_ma, fft_ma, n*sizeof(cuFloatComplex), 0, cudaMemcpyHostToDevice); - // CUFFT initialization - int rank = 1; // --- 1D FFTs int nn[] = { m }; // --- Size of the Fourier transform int istride = n, ostride = n; // --- Distance between two successive input/output elements @@ -411,53 +420,72 @@ extern "C" { int onembed[] = { 0 }; // --- Output size with pitch (ignored for 1D transforms) int batch = n; // --- Number of batched executions - cufftPlanMany(&fft_range_handle, rank, nn, - inembed, istride, idist, - onembed, ostride, odist, CUFFT_C2C, batch); - cufftPlan1d(&fft_doppler_handle, n, CUFFT_C2C, m); - cufftPlan1d(&fft_pdop_handle, n, CUFFT_C2C, m/2); + for (int i = 0; i < NSTREAMS; i++) { + cudaStreamCreate(&stream[i]); + + cufftPlanMany(&fft_range_handle[i], rank, nn, + inembed, istride, idist, + onembed, ostride, odist, CUFFT_C2C, batch); + cufftPlan1d(&fft_doppler_handle[i], n, CUFFT_C2C, m); + cufftPlan1d(&fft_pdop_handle[i], n, CUFFT_C2C, m/2); + + cufftSetStream(fft_range_handle[i], stream[i]); + cufftSetStream(fft_doppler_handle[i], stream[i]); + cufftSetStream(fft_pdop_handle[i], stream[i]); + } } void cleanup() { + delete[] fft_range_handle; + delete[] fft_doppler_handle; + delete[] fft_pdop_handle; + + for (int i = 0; i < NSTREAMS; i++) { + cudaStreamDestroy(stream[i]); + } + cudaFree(d_hamming); - cudaFree(d_ma); + // cudaFree(d_ma); cudaFree(d_iqhh); cudaFree(d_iqvv); cudaFree(d_iqhv); + cudaFreeHost(p_iqhh); + cudaFreeHost(p_iqvv); + cudaFreeHost(p_iqhv); + delete[] iqhh; delete[] iqvv; delete[] iqhv; } + void fetch_reflectivity( + int sector_id, + float* zdb, + float* zdr) { + int stream_id = sector_id % NSTREAMS; + int result_offset = sector_id * (m/2) * RESULT_SIZE; + + cudaStreamSynchronize(stream[stream_id]); + + for (int i=0; i<m/2; i++) { + zdb[i] = result[result_offset + i * RESULT_SIZE]; + zdr[i] = result[result_offset + i * RESULT_SIZE + 1]; + } + } + void calculate_reflectivity( + int sector_id, int* r_hh, int* i_hh, int* r_vv, int* i_vv, int* r_hv, - int* i_hv, - float* zdb, - float* zdr - ) { - // tock(&tb, &te, "initialization"); - - float ms; // elapsed time in milliseconds - - // create events and streams - cudaEvent_t startEvent, stopEvent; - - cudaEventCreate(&startEvent); - cudaEventCreate(&stopEvent); - // cudaEventCreate(&dummyEvent); + int* i_hv) { + int stream_id = sector_id % NSTREAMS; + int offset = stream_id * (m*n); + int result_offset = sector_id * (m/2)*RESULT_SIZE; - cudaEventRecord(startEvent,0); - - - // tick(&tb); - - // Read 1 sector data - // cin >> sector_id; for (int i=0; i<m; i++) { for (int j=0; j<n; j++) { // cin >> a >> b; @@ -477,90 +505,84 @@ extern "C" { } } - cudaMemcpy(d_iqhh, iqhh, m*n*sizeof(cuFloatComplex), cudaMemcpyHostToDevice); - cudaMemcpy(d_iqvv, iqvv, m*n*sizeof(cuFloatComplex), cudaMemcpyHostToDevice); - cudaMemcpy(d_iqhv, iqhv, m*n*sizeof(cuFloatComplex), cudaMemcpyHostToDevice); + memcpy(&p_iqhh[offset], iqhh, m*n*sizeof(cuFloatComplex)); + memcpy(&p_iqvv[offset], iqvv, m*n*sizeof(cuFloatComplex)); + memcpy(&p_iqhv[offset], iqhv, m*n*sizeof(cuFloatComplex)); + + cudaMemcpyAsync(&d_iqhh[offset], &p_iqhh[offset], m*n*sizeof(cuFloatComplex), cudaMemcpyHostToDevice, stream[stream_id]); + cudaMemcpyAsync(&d_iqvv[offset], &p_iqvv[offset], m*n*sizeof(cuFloatComplex), cudaMemcpyHostToDevice, stream[stream_id]); + cudaMemcpyAsync(&d_iqhv[offset], &p_iqhv[offset], m*n*sizeof(cuFloatComplex), cudaMemcpyHostToDevice, stream[stream_id]); // apply Hamming coefficients - __apply_hamming<<<m,n>>>(d_iqhh, d_hamming); - __apply_hamming<<<m,n>>>(d_iqvv, d_hamming); - __apply_hamming<<<m,n>>>(d_iqhv, d_hamming); + __apply_hamming<<<m,n,0,stream[stream_id]>>>(d_iqhh, d_hamming, offset); + __apply_hamming<<<m,n,0,stream[stream_id]>>>(d_iqvv, d_hamming, offset); + __apply_hamming<<<m,n,0,stream[stream_id]>>>(d_iqhv, d_hamming, offset); + // exit(0); // FFT range profile - cufftExecC2C(fft_range_handle, d_iqhh, d_iqhh, CUFFT_FORWARD); - cufftExecC2C(fft_range_handle, d_iqvv, d_iqvv, CUFFT_FORWARD); - cufftExecC2C(fft_range_handle, d_iqhv, d_iqhv, CUFFT_FORWARD); + cufftExecC2C(fft_range_handle[stream_id], &d_iqhh[offset], &d_iqhh[offset], CUFFT_FORWARD); + cufftExecC2C(fft_range_handle[stream_id], &d_iqvv[offset], &d_iqvv[offset], CUFFT_FORWARD); + cufftExecC2C(fft_range_handle[stream_id], &d_iqhv[offset], &d_iqhv[offset], CUFFT_FORWARD); // FFT+shift Doppler profile - __sum_v4<<<m/2,n,2*n*sizeof(cuFloatComplex)>>>(d_iqhh, d_sum); - __avgconj<<<m,n>>>(d_iqhh, d_sum); - __sum_v4<<<m/2,n,2*n*sizeof(cuFloatComplex)>>>(d_iqvv, d_sum); - __avgconj<<<m,n>>>(d_iqvv, d_sum); - __sum_v4<<<m/2,n,2*n*sizeof(cuFloatComplex)>>>(d_iqhv, d_sum); - __avgconj<<<m,n>>>(d_iqhv, d_sum); - - cufftExecC2C(fft_doppler_handle, d_iqhh, d_iqhh, CUFFT_FORWARD); - cufftExecC2C(fft_doppler_handle, d_iqvv, d_iqvv, CUFFT_FORWARD); - cufftExecC2C(fft_doppler_handle, d_iqhv, d_iqhv, CUFFT_FORWARD); - - // __conjugate<<<m,n>>>(d_iqhh); - // __conjugate<<<m,n>>>(d_iqvv); - // __conjugate<<<m,n>>>(d_iqhv); - - __conjshift<<<m,n/2>>>(d_iqhh, n); - __conjshift<<<m,n/2>>>(d_iqvv, n); - __conjshift<<<m,n/2>>>(d_iqhv, n); - - __clip<<<m,2>>>(d_iqhh, n); - __clip<<<m,2>>>(d_iqvv, n); - __clip<<<m,2>>>(d_iqhv, n); - - // Get absolute value - __abssqr<<<m/2,n>>>(d_iqhh, n); - __abssqr<<<m/2,n>>>(d_iqvv, n); - __abssqr<<<m/2,n>>>(d_iqhv, n); + __sum_v4<<<m/2,n,2*n*sizeof(cuFloatComplex),stream[stream_id]>>>(d_iqhh, d_sum, offset); + __avgconj<<<m,n,0,stream[stream_id]>>>(d_iqhh, d_sum, offset); + __sum_v4<<<m/2,n,2*n*sizeof(cuFloatComplex),stream[stream_id]>>>(d_iqvv, d_sum, offset); + __avgconj<<<m,n,0,stream[stream_id]>>>(d_iqvv, d_sum, offset); + __sum_v4<<<m/2,n,2*n*sizeof(cuFloatComplex),stream[stream_id]>>>(d_iqhv, d_sum, offset); + __avgconj<<<m,n,0,stream[stream_id]>>>(d_iqhv, d_sum, offset); + + cufftExecC2C(fft_doppler_handle[stream_id], &d_iqhh[offset], &d_iqhh[offset], CUFFT_FORWARD); + cufftExecC2C(fft_doppler_handle[stream_id], &d_iqvv[offset], &d_iqvv[offset], CUFFT_FORWARD); + cufftExecC2C(fft_doppler_handle[stream_id], &d_iqhv[offset], &d_iqhv[offset], CUFFT_FORWARD); + + // __conjugate<<<m,n,0,stream[stream_id]>>>(d_iqhh, offset); + // __conjugate<<<m,n,0,stream[stream_id]>>>(d_iqvv, offset); + // __conjugate<<<m,n,0,stream[stream_id]>>>(d_iqhv, offset); + + __conjshift<<<m,n/2,0,stream[stream_id]>>>(d_iqhh, n, offset); + __conjshift<<<m,n/2,0,stream[stream_id]>>>(d_iqvv, n, offset); + __conjshift<<<m,n/2,0,stream[stream_id]>>>(d_iqhv, n, offset); + + __clip_v2<<<2,m,0,stream[stream_id]>>>(d_iqhh, n, offset); + __clip_v2<<<2,m,0,stream[stream_id]>>>(d_iqvv, n, offset); + __clip_v2<<<2,m,0,stream[stream_id]>>>(d_iqhv, n, offset); + + // Get absolute value squared + __abssqr<<<m/2,n,0,stream[stream_id]>>>(d_iqhh, n, offset); + __abssqr<<<m/2,n,0,stream[stream_id]>>>(d_iqvv, n, offset); + __abssqr<<<m/2,n,0,stream[stream_id]>>>(d_iqhv, n, offset); // FFT PDOP - cufftExecC2C(fft_pdop_handle, d_iqhh, d_iqhh, CUFFT_FORWARD); - cufftExecC2C(fft_pdop_handle, d_iqvv, d_iqvv, CUFFT_FORWARD); - cufftExecC2C(fft_pdop_handle, d_iqhv, d_iqhv, CUFFT_FORWARD); + cufftExecC2C(fft_pdop_handle[stream_id], &d_iqhh[offset], &d_iqhh[offset], CUFFT_FORWARD); + cufftExecC2C(fft_pdop_handle[stream_id], &d_iqvv[offset], &d_iqvv[offset], CUFFT_FORWARD); + cufftExecC2C(fft_pdop_handle[stream_id], &d_iqhv[offset], &d_iqhv[offset], CUFFT_FORWARD); // Apply MA coefficients - __apply_ma_v2<<<m/2,n>>>(d_iqhh); - __apply_ma_v2<<<m/2,n>>>(d_iqvv); - __apply_ma_v2<<<m/2,n>>>(d_iqhv); + __apply_ma<<<m/2,n,0,stream[stream_id]>>>(d_iqhh, offset); + __apply_ma<<<m/2,n,0,stream[stream_id]>>>(d_iqvv, offset); + __apply_ma<<<m/2,n,0,stream[stream_id]>>>(d_iqhv, offset); // Inverse FFT - cufftExecC2C(fft_pdop_handle, d_iqhh, d_iqhh, CUFFT_INVERSE); - cufftExecC2C(fft_pdop_handle, d_iqvv, d_iqvv, CUFFT_INVERSE); - cufftExecC2C(fft_pdop_handle, d_iqhv, d_iqhv, CUFFT_INVERSE); + cufftExecC2C(fft_pdop_handle[stream_id], &d_iqhh[offset], &d_iqhh[offset], CUFFT_INVERSE); + cufftExecC2C(fft_pdop_handle[stream_id], &d_iqvv[offset], &d_iqvv[offset], CUFFT_INVERSE); + cufftExecC2C(fft_pdop_handle[stream_id], &d_iqhv[offset], &d_iqhv[offset], CUFFT_INVERSE); - __scale_real<<<m/2,n>>>(d_iqhh); - __scale_real<<<m/2,n>>>(d_iqvv); - __scale_real<<<m/2,n>>>(d_iqhv); + __scale_real<<<m/2,n,0,stream[stream_id]>>>(d_iqhh, offset); + __scale_real<<<m/2,n,0,stream[stream_id]>>>(d_iqvv, offset); + __scale_real<<<m/2,n,0,stream[stream_id]>>>(d_iqhv, offset); // Sum - __sum_inplace_v4<<<m/4,n,2*n*sizeof(cuFloatComplex)>>>(d_iqhh); - __sum_inplace_v4<<<m/4,n,2*n*sizeof(cuFloatComplex)>>>(d_iqvv); - __sum_inplace_v4<<<m/4,n,2*n*sizeof(cuFloatComplex)>>>(d_iqhv); + // __sum_inplace_v2<<<m/4,n,0,stream[stream_id]>>>(d_iqhh, offset); + // __sum_inplace_v2<<<m/4,n,0,stream[stream_id]>>>(d_iqvv, offset); + __sum_inplace_v4<<<m/4,n,2*n*sizeof(cuFloatComplex),stream[stream_id]>>>(d_iqhh, offset); + __sum_inplace_v4<<<m/4,n,2*n*sizeof(cuFloatComplex),stream[stream_id]>>>(d_iqvv, offset); + __sum_inplace_v4<<<m/4,n,2*n*sizeof(cuFloatComplex),stream[stream_id]>>>(d_iqhv, offset); // Calculate ZdB, Zdr - __calcresult_v2<<<1,m/2>>>(d_iqhh, d_iqvv, d_iqhv, d_result, n); + __calcresult_v2<<<1,m/2,0,stream[stream_id]>>>(d_iqhh, d_iqvv, d_iqhv, d_result, n, offset, result_offset); - cudaMemcpy(result, d_result, (m/2)*RESULT_SIZE*sizeof(float), cudaMemcpyDeviceToHost); - - for (int i=0; i<m/2; i++) { - zdb[i] = result[i*RESULT_SIZE]; - zdr[i] = result[i*RESULT_SIZE + 1]; - } - - cudaEventRecord(stopEvent, 0); - cudaEventSynchronize(stopEvent); - cudaEventElapsedTime(&ms, startEvent, stopEvent); - printf("Time for sequential transfer and execute (ms): %f\n", ms); - - cudaEventDestroy(startEvent); - cudaEventDestroy(stopEvent); + cudaMemcpyAsync(&result[result_offset], &d_result[result_offset], (m/2)*RESULT_SIZE*sizeof(float), cudaMemcpyDeviceToHost, stream[stream_id]); } -} +} diff --git a/lib/radar.h b/lib/radar.h index c0909aabbc28a9b41cf5be39b5886c870efa5b30..b1a6b4d427c4659054bdd208034e5deabb0325a5 100644 --- a/lib/radar.h +++ b/lib/radar.h @@ -7,8 +7,9 @@ extern "C" { #endif -void calculate_reflectivity(int* r_hh, int* i_hh, int* r_vv, int* i_vv, - int* r_hv, int* i_hv, float* zdb, float* zdr); +void calculate_reflectivity(int sector_id, int* r_hh, int* i_hh, int* r_vv, + int* i_vv, int* r_hv, int* i_hv); +void fetch_reflectivity(int sector_id, float* zdb, float* zdr); void initialize(); void cleanup(); diff --git a/service/processor/processor.go b/service/processor/processor.go index ff0dd95482c3e0cf667ff929df8c44fe7926ebbd..8ec305ee1d0f621e280604b039ebb2cf16d7603e 100644 --- a/service/processor/processor.go +++ b/service/processor/processor.go @@ -1,30 +1,59 @@ package processor import ( + "context" "sync" "gitlab.informatika.org/weather-radar/radar-processor/data" "gitlab.informatika.org/weather-radar/radar-processor/lib" ) +type AwaitedData struct { + SectorIdx int32 + ElevationIdx int32 +} + type RadarProcessor struct { - currentValue data.CalculatedData - mutex sync.RWMutex + currentValue data.CalculatedData + mutex sync.RWMutex + fetcherChannel chan<- AwaitedData } -func New() *RadarProcessor { +func New(ctx context.Context) *RadarProcessor { + dataQueue := make(chan AwaitedData, 16) object := &RadarProcessor{ - mutex: sync.RWMutex{}, + mutex: sync.RWMutex{}, + fetcherChannel: dataQueue, } + go func() { + for { + select { + case <-ctx.Done(): + return + case queueItem := <-dataQueue: + data := lib.FetchReflectivity(queueItem.ElevationIdx, queueItem.SectorIdx) + object.setValue(data) + } + } + }() + return object } -func (r *RadarProcessor) calculate(data data.RawData) { +func (r *RadarProcessor) setValue(data data.CalculatedData) { r.mutex.Lock() defer r.mutex.Unlock() - r.currentValue = lib.CalculateReflectivity(data) + r.currentValue = data +} + +func (r *RadarProcessor) calculate(data data.RawData) { + lib.CalculateReflectivity(data) + r.fetcherChannel <- AwaitedData{ + SectorIdx: data.SectorIndex, + ElevationIdx: data.ElevationIndex, + } } func (r *RadarProcessor) GetCurrentValue() data.CalculatedData { diff --git a/service/processor/worker.go b/service/processor/worker.go index 835ef3303d561dc9275c4f5bb111d048dd9d9b92..cb81fbff708adc80081fb10583e8950dc65d48ae 100644 --- a/service/processor/worker.go +++ b/service/processor/worker.go @@ -14,7 +14,7 @@ import ( const MAX_SIZE = 1024 func StartWorker(ctx context.Context, inputStream <-chan *proto.BeatSignal) (*observer.Broadcast[data.CalculatedData], *sync.WaitGroup) { - processor := New() + processor := New(ctx) wg := sync.WaitGroup{} wg.Add(1) @@ -25,6 +25,7 @@ func StartWorker(ctx context.Context, inputStream <-chan *proto.BeatSignal) (*ob // PACK DATA firstRun := true + firstCalculate := true dataCache := data.RawData{} lib.Initialize() @@ -39,8 +40,12 @@ func StartWorker(ctx context.Context, inputStream <-chan *proto.BeatSignal) (*ob fmt.Printf("Processing Sector #%d\n", dataCache.SectorIndex) processor.calculate(dataCache) - value := processor.GetCurrentValue() - broadcast.Broadcast(value) + if !firstCalculate { + value := processor.GetCurrentValue() + broadcast.Broadcast(value) + } + + firstCalculate = false dataCache.ImagHH = [][]int32{} dataCache.ImagHV = [][]int32{}