VCSBeam
Macros | Functions
form_beam.cpp File Reference
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <time.h>
#include "gpu_fft.hpp"
#include "gpu_macros.h"
#include "vcsbeam.h"
Include dependency graph for form_beam.cpp:

Macros

#define NTHREADS_BEAMFORMING_KERNEL   512
 

Functions

__global__ void incoh_beam (uint8_t *data, float *incoh)
 CUDA kernel for computing an incoherent beam. More...
 
__global__ void vmApplyJ_kernel (void *data, gpuDoubleComplex *J, gpuDoubleComplex *Jv_Q, gpuDoubleComplex *Jv_P, uint32_t *polQ_idxs, uint32_t *polP_idxs, int npol, int p, vcsbeam_datatype datatype)
 CUDA kernel for multiplying Jones matrices to Jones vectors. More...
 
__global__ void vmBeamform_kernel (int nfine_chan, int n_samples, int nant, gpuDoubleComplex *Jv_Q, gpuDoubleComplex *Jv_P, gpuDoubleComplex *phi, double invw, int p, int soffset, int nchunk, gpuDoubleComplex *e, float *S, int npol, int nstokes)
 CUDA kernel for phasing up and summing the voltages over antenna. More...
 
__global__ void renormalise_channels_kernel (float *S, int nstep, float *offsets, float *scales, uint8_t *Sscaled)
 CUDA kernel for normalising Stokes parameters. More...
 
void cu_form_incoh_beam (uint8_t *data, uint8_t *d_data, size_t data_size, float *d_incoh, unsigned int nsample, int nchan, int ninput, float *offsets, float *d_offsets, float *scales, float *d_scales, uint8_t *Iscaled, uint8_t *d_Iscaled, size_t Iscaled_size)
 Form an incoherent beam. More...
 
void vmApplyJChunk (vcsbeam_context *vm)
 Computes \({\bf J}^{-1} {\bf v}\). More...
 
void vmBeamformChunk (vcsbeam_context *vm)
 Performs the phasing up, averaging over antennas, and detection operations on calibrated data. More...
 
void vmBeamformSecond (vcsbeam_context *vm)
 Performs all beamforming steps for 1 second's worth of data. More...
 
void vmPullE (vcsbeam_context *vm)
 Copies the beamformed voltages from GPU memory to CPU memory. More...
 
void vmPullS (vcsbeam_context *vm)
 Copies the detected Stokes parameters from GPU memory to CPU memory. More...
 
void vmSendSToFits (vcsbeam_context *vm, mpi_psrfits *mpfs)
 Renormalises the detected Stokes parameters and copies them into PSRFITS structs, ready for frequency splicing. More...
 
void vmPushPolIdxLists (vcsbeam_context *vm)
 Copies the index arrays for antennas and polarisations from CPU memory to GPU memory. More...
 
float * create_pinned_data_buffer (size_t size)
 (Deprecated) Allocate memory on the GPU. More...
 
gpuDoubleComplex **** create_detected_beam (int npointing, int nsamples, int nchan, int npol)
 (Deprecated) Allocate memory on the CPU. More...
 
gpuDoubleComplex * create_data_buffer_fine (int npointing, int nsamples, int nchan, int npol)
 
void prepare_data_buffer_fine (gpuDoubleComplex *data_buffer_fine, vcsbeam_context *vm, uintptr_t timestep_idx)
 
void allocate_input_output_arrays (void **data, void **d_data, size_t size)
 (Deprecated) Allocates memory on the CPU and GPU simultaneously. More...
 
void free_input_output_arrays (void *data, void *d_data)
 (Deprecated) Frees memory on the CPU and GPU simultaneously. More...
 

Macro Definition Documentation

◆ NTHREADS_BEAMFORMING_KERNEL

#define NTHREADS_BEAMFORMING_KERNEL   512

Function Documentation

◆ allocate_input_output_arrays()

void allocate_input_output_arrays ( void **  data,
void **  d_data,
size_t  size 
)

(Deprecated) Allocates memory on the CPU and GPU simultaneously.

Todo:
Remove the function allocate_input_output_arrays().

◆ create_data_buffer_fine()

gpuDoubleComplex* create_data_buffer_fine ( int  npointing,
int  nsamples,
int  nchan,
int  npol 
)

◆ create_detected_beam()

gpuDoubleComplex**** create_detected_beam ( int  npointing,
int  nsamples,
int  nchan,
int  npol 
)

(Deprecated) Allocate memory on the CPU.

Todo:
Remove the function create_detected_beam().

◆ create_pinned_data_buffer()

float* create_pinned_data_buffer ( size_t  size)

(Deprecated) Allocate memory on the GPU.

Todo:
Remove the function create_pinned_data_buffer().

◆ cu_form_incoh_beam()

void cu_form_incoh_beam ( uint8_t *  data,
uint8_t *  d_data,
size_t  data_size,
float *  d_incoh,
unsigned int  nsample,
int  nchan,
int  ninput,
float *  offsets,
float *  d_offsets,
float *  scales,
float *  d_scales,
uint8_t *  Iscaled,
uint8_t *  d_Iscaled,
size_t  Iscaled_size 
)

Form an incoherent beam.

Forms an incoherent beam, detects it, and prepares it for writing to PSRFITS.

Todo:
Either generalise the vmBeamformChunk() so that it can also produce incoherent beams (and therefore do away with cu_form_incoh_beam(), or keep cu_form_incoh_beam() and convert it into a bona fide "vm" style fuction.

◆ free_input_output_arrays()

void free_input_output_arrays ( void *  data,
void *  d_data 
)

(Deprecated) Frees memory on the CPU and GPU simultaneously.

Todo:
Remove the function free_input_output_arrays().

◆ incoh_beam()

__global__ void incoh_beam ( uint8_t *  data,
float *  incoh 
)

CUDA kernel for computing an incoherent beam.

Parameters
[in]dataThe voltage data, \(v\), with layout \(N_t \times N_f \times N_i\).
[out]incohThe detected (Stokes I) powers, \(I\), with layout \(N_t \times N_f\).

The incoherent beam is the expression

\[ I_{t,f} = \sum_i v_{t,f,i}^\dagger v_{t,f,i}. \]

The expected thread configuration is \(\langle\langle\langle(N_f, N_t), N_i\rangle\rangle\rangle.\)

Note that if the voltages were arranged into Jones vectors, the above could also be expressed in the more familiar form

\[ I_{t,f} = \sum_a {\bf v}_{t,f,a}^\dagger {\bf v}_{t,f,a}. \]

◆ prepare_data_buffer_fine()

void prepare_data_buffer_fine ( gpuDoubleComplex *  data_buffer_fine,
vcsbeam_context *  vm,
uintptr_t  timestep_idx 
)

◆ renormalise_channels_kernel()

__global__ void renormalise_channels_kernel ( float *  S,
int  nstep,
float *  offsets,
float *  scales,
uint8_t *  Sscaled 
)

CUDA kernel for normalising Stokes parameters.

Parameters
[in]SThe original Stokes parameters, with layout \(N_t \times N_s \times N_f\)
nstep\(N_t\)
[out]offsetsThe amount of offset needed to recover the original values from the normalised ones
[out]scalesThe scaling needed to recover the original values from the normalised ones
[out]SscaledThe normalised Stokes parameters

This kernel shifts and normalises the Stokes parameters so that they fit into 8-bits integers without clipping (e.g. for output into the PSRFITS format). Each frequency is normalised independently, with the scales and offsets needed to recover the original values for that channel being recorded as well.

If \({\bf S}\) is the array of values to be normalised, then the normalisation is

\[ \hat{\bf S} = \frac{{\bf S} - \text{offset}}{\text{scale}}, \]

where

\begin{align*} \text{scale} &= \frac{S_\text{max} - S_\text{min}}{256} \\ \text{offset} &= S_\text{min} + 0.5 \times \text{scale}. \end{align*}

The expected thread configuration is \(\langle\langle\langle N_b,(N_f, N_s)\rangle\rangle\rangle.\)

Todo:
Optimise the renormalisation kernel (e.g. by removing the for loop over timesteps.

◆ vmApplyJ_kernel()

__global__ void vmApplyJ_kernel ( void *  data,
gpuDoubleComplex *  J,
gpuDoubleComplex *  Jv_Q,
gpuDoubleComplex *  Jv_P,
uint32_t *  polQ_idxs,
uint32_t *  polP_idxs,
int  npol,
int  p,
vcsbeam_datatype  datatype 
)

CUDA kernel for multiplying Jones matrices to Jones vectors.

Parameters
[in]dataThe voltage data, \(v\), with layout \(N_t \times N_f \times N_i\)
[in]JThe Jones matrices, \({\bf J}^{-1}\), with layout \(N_a \times N_f \times N_p \times N_p\)
[out]Jv_QThe Q polarisation of the product \({\bf J}^{-1}{\bf v}\), with layout \(N_t \times N_f \times N_a\)
[out]Jv_PThe P polarisation of the product \({\bf J}^{-1}{\bf v}\), with layout \(N_t \times N_f \times N_a\)
polQ_idxsAnd array of the indices \(i\) for the Q polarisations of the antennas
polP_idxsAnd array of the indices \(i\) for the P polarisations of the antennas
pThe pointing number
soffsetAn offset number of samples into data
npol\(N_p\)
datatypeEither VM_INT4 (if data contain 4+4-bit complex integers) or VM_DBL (if data contain complex doubles).

Although this kernel is quite general, in the sense that it could be used to multiply any Jones matrices to any Jones vectors, it is used in particular for multiplying the Jones matrices \({\bf J}^{-1}\) to the voltage data \({\bf v}\):

\[ \tilde{\bf e}_{t,f,a} = {\bf J}^{-1}_{a,f}{\bf v}_{t,f,a}. \]

The expected thread configuration is \(\langle\langle\langle(N_f, N_t), N_a\rangle\rangle\rangle.\)

◆ vmApplyJChunk()

void vmApplyJChunk ( vcsbeam_context *  vm)

Computes \({\bf J}^{-1} {\bf v}\).

◆ vmBeamform_kernel()

__global__ void vmBeamform_kernel ( int  nfine_chan,
int  n_samples,
int  nant,
gpuDoubleComplex *  Jv_Q,
gpuDoubleComplex *  Jv_P,
gpuDoubleComplex *  phi,
double  invw,
int  p,
int  soffset,
int  nchunk,
gpuDoubleComplex *  e,
float *  S,
int  npol,
int  nstokes 
)

CUDA kernel for phasing up and summing the voltages over antenna.

Parameters
[in]nfine_chanNumber of fine channels
[in]n_samplesNumber of time samples
[in]nantNumber of antennas
[in]Jv_QThe Q polarisation of the product \({\bf J}^{-1}{\bf v}\), with layout \(N_t \times N_f \times N_a\)
[in]Jv_PThe P polarisation of the product \({\bf J}^{-1}{\bf v}\), with layout \(N_t \times N_f \times N_a\)
[in]phiThe delay phase, \(\varphi\), with layout \(N_a \times N_f\)
invwThe reciprocal of the number of non-flagged antennas
pThe pointing index
soffsetAn offset number of samples into e for where to put the the answer
nchunkThe number of chunks (divisions of a second's worth of data)
[out]eThe recovered electric field, \({\bf e}\), with layout \(N_t \times N_f \times N_p\)
[out]SThe recovered Stokes parameters, with layout \(N_t \times N_s \times N_f\)
npol\(N_p\)
nstokesThe number of stokes parameters to output

This kernel performs the phasing up and the summing over antennas part of the beamforming operation (see Beamforming):

\[ {\bf e}_{t,f} = \frac{1}{N_a} \sum_a e^{i\varphi} \tilde{\bf e}_{t,f,a}. \]

It also computes the Stokes parameters, \(S = [I, Q, U, V]\) (with the autocorrelations removed).

The expected thread configuration is \(\langle\langle\langle(N_f, N_t), N_a\rangle\rangle\rangle.\)

the total number of warps created might not cover the total number of beams to be computed. Hence, the code "moves" the grid over the entire input until all of it is "covered". Alternatively, the overall input is tiled, the tile size is the number of warps available, and this is the number of tiles necessary to cover the entire input.

◆ vmBeamformChunk()

void vmBeamformChunk ( vcsbeam_context *  vm)

Performs the phasing up, averaging over antennas, and detection operations on calibrated data.

Todo:
Split the beamforming operations into separate steps/kernels.

In this implementation each beam computation, one for each frequency channel and time sample, is assigned to a warp (32/64 consecutive threads working in lockstep). A warp is assigned one or more beams to compute depending on how many thread blocks (hence warps) are created. This number is now unrelated to the problem at hand and depends on the hardware specifics for better performance. We create "just enough" blocks to avoid too many context switches.

◆ vmBeamformSecond()

void vmBeamformSecond ( vcsbeam_context *  vm)

Performs all beamforming steps for 1 second's worth of data.

◆ vmPullE()

void vmPullE ( vcsbeam_context *  vm)

Copies the beamformed voltages from GPU memory to CPU memory.

◆ vmPullS()

void vmPullS ( vcsbeam_context *  vm)

Copies the detected Stokes parameters from GPU memory to CPU memory.

◆ vmPushPolIdxLists()

void vmPushPolIdxLists ( vcsbeam_context *  vm)

Copies the index arrays for antennas and polarisations from CPU memory to GPU memory.

◆ vmSendSToFits()

void vmSendSToFits ( vcsbeam_context *  vm,
mpi_psrfits *  mpfs 
)

Renormalises the detected Stokes parameters and copies them into PSRFITS structs, ready for frequency splicing.

Parameters
vmThe VCSBeam context struct
mpfsThe MPI PSRFITS struct that manages the splicing operation.