#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <time.h>
#include "gpu_fft.hpp"
#include "gpu_macros.h"
#include "vcsbeam.h"

Include dependency graph for form_beam.cpp:

Macros
#define	NTHREADS_BEAMFORMING_KERNEL 512

Functions
__global__ void	incoh_beam (uint8_t data, float incoh)
	CUDA kernel for computing an incoherent beam. More...

__global__ void	vmApplyJ_kernel (void data, gpuDoubleComplex J, gpuDoubleComplex Jv_Q, gpuDoubleComplex Jv_P, uint32_t polQ_idxs, uint32_t polP_idxs, int npol, int p, vcsbeam_datatype datatype)
	CUDA kernel for multiplying Jones matrices to Jones vectors. More...

__global__ void	vmBeamform_kernel (int nfine_chan, int n_samples, int nant, gpuDoubleComplex Jv_Q, gpuDoubleComplex Jv_P, gpuDoubleComplex phi, double invw, int p, int soffset, int nchunk, gpuDoubleComplex e, float *S, int npol, int nstokes)
	CUDA kernel for phasing up and summing the voltages over antenna. More...

__global__ void	renormalise_channels_kernel (float S, int nstep, float offsets, float scales, uint8_t Sscaled)
	CUDA kernel for normalising Stokes parameters. More...

void	cu_form_incoh_beam (uint8_t data, uint8_t d_data, size_t data_size, float d_incoh, unsigned int nsample, int nchan, int ninput, float offsets, float d_offsets, float scales, float d_scales, uint8_t Iscaled, uint8_t *d_Iscaled, size_t Iscaled_size)
	Form an incoherent beam. More...

void	vmApplyJChunk (vcsbeam_context *vm)
	Computes \({\bf J}^{-1} {\bf v}\). More...

void	vmBeamformChunk (vcsbeam_context *vm)
	Performs the phasing up, averaging over antennas, and detection operations on calibrated data. More...

void	vmBeamformSecond (vcsbeam_context *vm)
	Performs all beamforming steps for 1 second's worth of data. More...

void	vmPullE (vcsbeam_context *vm)
	Copies the beamformed voltages from GPU memory to CPU memory. More...

void	vmPullS (vcsbeam_context *vm)
	Copies the detected Stokes parameters from GPU memory to CPU memory. More...

void	vmSendSToFits (vcsbeam_context vm, mpi_psrfits mpfs)
	Renormalises the detected Stokes parameters and copies them into PSRFITS structs, ready for frequency splicing. More...

void	vmPushPolIdxLists (vcsbeam_context *vm)
	Copies the index arrays for antennas and polarisations from CPU memory to GPU memory. More...

float *	create_pinned_data_buffer (size_t size)
	(Deprecated) Allocate memory on the GPU. More...

gpuDoubleComplex ****	create_detected_beam (int npointing, int nsamples, int nchan, int npol)
	(Deprecated) Allocate memory on the CPU. More...

gpuDoubleComplex *	create_data_buffer_fine (int npointing, int nsamples, int nchan, int npol)

void	prepare_data_buffer_fine (gpuDoubleComplex data_buffer_fine, vcsbeam_context vm, uintptr_t timestep_idx)

void	allocate_input_output_arrays (void data, void d_data, size_t size)
	(Deprecated) Allocates memory on the CPU and GPU simultaneously. More...

void	free_input_output_arrays (void data, void d_data)
	(Deprecated) Frees memory on the CPU and GPU simultaneously. More...

Macro Definition Documentation

◆ NTHREADS_BEAMFORMING_KERNEL

#define NTHREADS_BEAMFORMING_KERNEL 512

Function Documentation

◆ allocate_input_output_arrays()

void allocate_input_output_arrays	(	void **	data,
		void **	d_data,
		size_t	size
	)

(Deprecated) Allocates memory on the CPU and GPU simultaneously.

Todo:: Remove the function allocate_input_output_arrays().

◆ create_data_buffer_fine()

gpuDoubleComplex* create_data_buffer_fine	(	int	npointing,
		int	nsamples,
		int	nchan,
		int	npol
	)

◆ create_detected_beam()

gpuDoubleComplex**** create_detected_beam	(	int	npointing,
		int	nsamples,
		int	nchan,
		int	npol
	)

(Deprecated) Allocate memory on the CPU.

Todo:: Remove the function create_detected_beam().

◆ create_pinned_data_buffer()

float* create_pinned_data_buffer ( size_t size )

(Deprecated) Allocate memory on the GPU.

Todo:: Remove the function create_pinned_data_buffer().

◆ cu_form_incoh_beam()

void cu_form_incoh_beam	(	uint8_t *	data,
		uint8_t *	d_data,
		size_t	data_size,
		float *	d_incoh,
		unsigned int	nsample,
		int	nchan,
		int	ninput,
		float *	offsets,
		float *	d_offsets,
		float *	scales,
		float *	d_scales,
		uint8_t *	Iscaled,
		uint8_t *	d_Iscaled,
		size_t	Iscaled_size
	)

Form an incoherent beam.

Forms an incoherent beam, detects it, and prepares it for writing to PSRFITS.

Todo:: Either generalise the vmBeamformChunk() so that it can also produce incoherent beams (and therefore do away with cu_form_incoh_beam(), or keep cu_form_incoh_beam() and convert it into a bona fide "vm" style fuction.

◆ free_input_output_arrays()

void free_input_output_arrays	(	void *	data,
		void *	d_data
	)

(Deprecated) Frees memory on the CPU and GPU simultaneously.

Todo:: Remove the function free_input_output_arrays().

◆ incoh_beam()

__global__ void incoh_beam	(	uint8_t *	data,
		float *	incoh
	)

CUDA kernel for computing an incoherent beam.

Parameters

[in]	data	The voltage data, \(v\), with layout \(N_t \times N_f \times N_i\).
[out]	incoh	The detected (Stokes I) powers, \(I\), with layout \(N_t \times N_f\).

The incoherent beam is the expression

\[ I_{t,f} = \sum_i v_{t,f,i}^\dagger v_{t,f,i}. \]

The expected thread configuration is \(\langle\langle\langle(N_f, N_t), N_i\rangle\rangle\rangle.\)

Note that if the voltages were arranged into Jones vectors, the above could also be expressed in the more familiar form

\[ I_{t,f} = \sum_a {\bf v}_{t,f,a}^\dagger {\bf v}_{t,f,a}. \]

◆ prepare_data_buffer_fine()

void prepare_data_buffer_fine	(	gpuDoubleComplex *	data_buffer_fine,
		vcsbeam_context *	vm,
		uintptr_t	timestep_idx
	)

◆ renormalise_channels_kernel()

__global__ void renormalise_channels_kernel	(	float *	S,
		int	nstep,
		float *	offsets,
		float *	scales,
		uint8_t *	Sscaled
	)

CUDA kernel for normalising Stokes parameters.

Parameters

[in]	S	The original Stokes parameters, with layout \(N_t \times N_s \times N_f\)
	nstep	\(N_t\)
[out]	offsets	The amount of offset needed to recover the original values from the normalised ones
[out]	scales	The scaling needed to recover the original values from the normalised ones
[out]	Sscaled	The normalised Stokes parameters

This kernel shifts and normalises the Stokes parameters so that they fit into 8-bits integers without clipping (e.g. for output into the PSRFITS format). Each frequency is normalised independently, with the scales and offsets needed to recover the original values for that channel being recorded as well.

If \({\bf S}\) is the array of values to be normalised, then the normalisation is

\[ \hat{\bf S} = \frac{{\bf S} - \text{offset}}{\text{scale}}, \]

where

\begin{align*} \text{scale} &= \frac{S_\text{max} - S_\text{min}}{256} \\ \text{offset} &= S_\text{min} + 0.5 \times \text{scale}. \end{align*}

The expected thread configuration is \(\langle\langle\langle N_b,(N_f, N_s)\rangle\rangle\rangle.\)

Todo:: Optimise the renormalisation kernel (e.g. by removing the for loop over timesteps.

◆ vmApplyJ_kernel()

__global__ void vmApplyJ_kernel	(	void *	data,
		gpuDoubleComplex *	J,
		gpuDoubleComplex *	Jv_Q,
		gpuDoubleComplex *	Jv_P,
		uint32_t *	polQ_idxs,
		uint32_t *	polP_idxs,
		int	npol,
		int	p,
		vcsbeam_datatype	datatype
	)

CUDA kernel for multiplying Jones matrices to Jones vectors.

Parameters

[in]	data	The voltage data, \(v\), with layout \(N_t \times N_f \times N_i\)
[in]	J	The Jones matrices, \({\bf J}^{-1}\), with layout \(N_a \times N_f \times N_p \times N_p\)
[out]	Jv_Q	The Q polarisation of the product \({\bf J}^{-1}{\bf v}\), with layout \(N_t \times N_f \times N_a\)
[out]	Jv_P	The P polarisation of the product \({\bf J}^{-1}{\bf v}\), with layout \(N_t \times N_f \times N_a\)
	polQ_idxs	And array of the indices \(i\) for the Q polarisations of the antennas
	polP_idxs	And array of the indices \(i\) for the P polarisations of the antennas
	p	The pointing number
	soffset	An offset number of samples into `data`
	npol	\(N_p\)
	datatype	Either `VM_INT4` (if `data` contain 4+4-bit complex integers) or `VM_DBL` (if `data` contain complex doubles).

Although this kernel is quite general, in the sense that it could be used to multiply any Jones matrices to any Jones vectors, it is used in particular for multiplying the Jones matrices \({\bf J}^{-1}\) to the voltage data \({\bf v}\):

\[ \tilde{\bf e}_{t,f,a} = {\bf J}^{-1}_{a,f}{\bf v}_{t,f,a}. \]

The expected thread configuration is \(\langle\langle\langle(N_f, N_t), N_a\rangle\rangle\rangle.\)

◆ vmApplyJChunk()

void vmApplyJChunk ( vcsbeam_context * vm )

Computes \({\bf J}^{-1} {\bf v}\).

◆ vmBeamform_kernel()

__global__ void vmBeamform_kernel	(	int	nfine_chan,
		int	n_samples,
		int	nant,
		gpuDoubleComplex *	Jv_Q,
		gpuDoubleComplex *	Jv_P,
		gpuDoubleComplex *	phi,
		double	invw,
		int	p,
		int	soffset,
		int	nchunk,
		gpuDoubleComplex *	e,
		float *	S,
		int	npol,
		int	nstokes
	)

CUDA kernel for phasing up and summing the voltages over antenna.

Parameters

[in]	nfine_chan	Number of fine channels
[in]	n_samples	Number of time samples
[in]	nant	Number of antennas
[in]	Jv_Q	The Q polarisation of the product \({\bf J}^{-1}{\bf v}\), with layout \(N_t \times N_f \times N_a\)
[in]	Jv_P	The P polarisation of the product \({\bf J}^{-1}{\bf v}\), with layout \(N_t \times N_f \times N_a\)
[in]	phi	The delay phase, \(\varphi\), with layout \(N_a \times N_f\)
	invw	The reciprocal of the number of non-flagged antennas
	p	The pointing index
	soffset	An offset number of samples into `e` for where to put the the answer
	nchunk	The number of chunks (divisions of a second's worth of data)
[out]	e	The recovered electric field, \({\bf e}\), with layout \(N_t \times N_f \times N_p\)
[out]	S	The recovered Stokes parameters, with layout \(N_t \times N_s \times N_f\)
	npol	\(N_p\)
	nstokes	The number of stokes parameters to output

This kernel performs the phasing up and the summing over antennas part of the beamforming operation (see Beamforming):

\[ {\bf e}_{t,f} = \frac{1}{N_a} \sum_a e^{i\varphi} \tilde{\bf e}_{t,f,a}. \]

It also computes the Stokes parameters, \(S = [I, Q, U, V]\) (with the autocorrelations removed).

The expected thread configuration is \(\langle\langle\langle(N_f, N_t), N_a\rangle\rangle\rangle.\)

the total number of warps created might not cover the total number of beams to be computed. Hence, the code "moves" the grid over the entire input until all of it is "covered". Alternatively, the overall input is tiled, the tile size is the number of warps available, and this is the number of tiles necessary to cover the entire input.

◆ vmBeamformChunk()

void vmBeamformChunk ( vcsbeam_context * vm )

Performs the phasing up, averaging over antennas, and detection operations on calibrated data.

Todo:: Split the beamforming operations into separate steps/kernels.

In this implementation each beam computation, one for each frequency channel and time sample, is assigned to a warp (32/64 consecutive threads working in lockstep). A warp is assigned one or more beams to compute depending on how many thread blocks (hence warps) are created. This number is now unrelated to the problem at hand and depends on the hardware specifics for better performance. We create "just enough" blocks to avoid too many context switches.

◆ vmBeamformSecond()

void vmBeamformSecond ( vcsbeam_context * vm )

Performs all beamforming steps for 1 second's worth of data.

◆ vmPullE()

void vmPullE ( vcsbeam_context * vm )

Copies the beamformed voltages from GPU memory to CPU memory.

◆ vmPullS()

void vmPullS ( vcsbeam_context * vm )

Copies the detected Stokes parameters from GPU memory to CPU memory.

◆ vmPushPolIdxLists()

void vmPushPolIdxLists ( vcsbeam_context * vm )

Copies the index arrays for antennas and polarisations from CPU memory to GPU memory.

◆ vmSendSToFits()

void vmSendSToFits	(	vcsbeam_context *	vm,
		mpi_psrfits *	mpfs
	)

Renormalises the detected Stokes parameters and copies them into PSRFITS structs, ready for frequency splicing.

Parameters

vm	The VCSBeam context struct
mpfs	The MPI PSRFITS struct that manages the splicing operation.

Macros

Functions

Macro Definition Documentation

◆ NTHREADS_BEAMFORMING_KERNEL

Function Documentation

◆ allocate_input_output_arrays()

◆ create_data_buffer_fine()

◆ create_detected_beam()

◆ create_pinned_data_buffer()

◆ cu_form_incoh_beam()

◆ free_input_output_arrays()

◆ incoh_beam()

◆ prepare_data_buffer_fine()

◆ renormalise_channels_kernel()

◆ vmApplyJ_kernel()

◆ vmApplyJChunk()

◆ vmBeamform_kernel()

◆ vmBeamformChunk()

◆ vmBeamformSecond()

◆ vmPullE()

◆ vmPullS()

◆ vmPushPolIdxLists()

◆ vmSendSToFits()