VCSBeam
|
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <time.h>
#include "gpu_fft.hpp"
#include "gpu_macros.h"
#include "vcsbeam.h"
Macros | |
#define | NTHREADS_BEAMFORMING_KERNEL 512 |
Functions | |
__global__ void | incoh_beam (uint8_t *data, float *incoh) |
CUDA kernel for computing an incoherent beam. More... | |
__global__ void | vmApplyJ_kernel (void *data, gpuDoubleComplex *J, gpuDoubleComplex *Jv_Q, gpuDoubleComplex *Jv_P, uint32_t *polQ_idxs, uint32_t *polP_idxs, int npol, int p, vcsbeam_datatype datatype) |
CUDA kernel for multiplying Jones matrices to Jones vectors. More... | |
__global__ void | vmBeamform_kernel (int nfine_chan, int n_samples, int nant, gpuDoubleComplex *Jv_Q, gpuDoubleComplex *Jv_P, gpuDoubleComplex *phi, double invw, int p, int soffset, int nchunk, gpuDoubleComplex *e, float *S, int npol, int nstokes) |
CUDA kernel for phasing up and summing the voltages over antenna. More... | |
__global__ void | renormalise_channels_kernel (float *S, int nstep, float *offsets, float *scales, uint8_t *Sscaled) |
CUDA kernel for normalising Stokes parameters. More... | |
void | cu_form_incoh_beam (uint8_t *data, uint8_t *d_data, size_t data_size, float *d_incoh, unsigned int nsample, int nchan, int ninput, float *offsets, float *d_offsets, float *scales, float *d_scales, uint8_t *Iscaled, uint8_t *d_Iscaled, size_t Iscaled_size) |
Form an incoherent beam. More... | |
void | vmApplyJChunk (vcsbeam_context *vm) |
Computes \({\bf J}^{-1} {\bf v}\). More... | |
void | vmBeamformChunk (vcsbeam_context *vm) |
Performs the phasing up, averaging over antennas, and detection operations on calibrated data. More... | |
void | vmBeamformSecond (vcsbeam_context *vm) |
Performs all beamforming steps for 1 second's worth of data. More... | |
void | vmPullE (vcsbeam_context *vm) |
Copies the beamformed voltages from GPU memory to CPU memory. More... | |
void | vmPullS (vcsbeam_context *vm) |
Copies the detected Stokes parameters from GPU memory to CPU memory. More... | |
void | vmSendSToFits (vcsbeam_context *vm, mpi_psrfits *mpfs) |
Renormalises the detected Stokes parameters and copies them into PSRFITS structs, ready for frequency splicing. More... | |
void | vmPushPolIdxLists (vcsbeam_context *vm) |
Copies the index arrays for antennas and polarisations from CPU memory to GPU memory. More... | |
float * | create_pinned_data_buffer (size_t size) |
(Deprecated) Allocate memory on the GPU. More... | |
gpuDoubleComplex **** | create_detected_beam (int npointing, int nsamples, int nchan, int npol) |
(Deprecated) Allocate memory on the CPU. More... | |
gpuDoubleComplex * | create_data_buffer_fine (int npointing, int nsamples, int nchan, int npol) |
void | prepare_data_buffer_fine (gpuDoubleComplex *data_buffer_fine, vcsbeam_context *vm, uintptr_t timestep_idx) |
void | allocate_input_output_arrays (void **data, void **d_data, size_t size) |
(Deprecated) Allocates memory on the CPU and GPU simultaneously. More... | |
void | free_input_output_arrays (void *data, void *d_data) |
(Deprecated) Frees memory on the CPU and GPU simultaneously. More... | |
#define NTHREADS_BEAMFORMING_KERNEL 512 |
void allocate_input_output_arrays | ( | void ** | data, |
void ** | d_data, | ||
size_t | size | ||
) |
(Deprecated) Allocates memory on the CPU and GPU simultaneously.
gpuDoubleComplex* create_data_buffer_fine | ( | int | npointing, |
int | nsamples, | ||
int | nchan, | ||
int | npol | ||
) |
gpuDoubleComplex**** create_detected_beam | ( | int | npointing, |
int | nsamples, | ||
int | nchan, | ||
int | npol | ||
) |
(Deprecated) Allocate memory on the CPU.
float* create_pinned_data_buffer | ( | size_t | size | ) |
(Deprecated) Allocate memory on the GPU.
void cu_form_incoh_beam | ( | uint8_t * | data, |
uint8_t * | d_data, | ||
size_t | data_size, | ||
float * | d_incoh, | ||
unsigned int | nsample, | ||
int | nchan, | ||
int | ninput, | ||
float * | offsets, | ||
float * | d_offsets, | ||
float * | scales, | ||
float * | d_scales, | ||
uint8_t * | Iscaled, | ||
uint8_t * | d_Iscaled, | ||
size_t | Iscaled_size | ||
) |
Form an incoherent beam.
Forms an incoherent beam, detects it, and prepares it for writing to PSRFITS.
void free_input_output_arrays | ( | void * | data, |
void * | d_data | ||
) |
(Deprecated) Frees memory on the CPU and GPU simultaneously.
__global__ void incoh_beam | ( | uint8_t * | data, |
float * | incoh | ||
) |
CUDA kernel for computing an incoherent beam.
[in] | data | The voltage data, \(v\), with layout \(N_t \times N_f \times N_i\). |
[out] | incoh | The detected (Stokes I) powers, \(I\), with layout \(N_t \times N_f\). |
The incoherent beam is the expression
\[ I_{t,f} = \sum_i v_{t,f,i}^\dagger v_{t,f,i}. \]
The expected thread configuration is \(\langle\langle\langle(N_f, N_t), N_i\rangle\rangle\rangle.\)
Note that if the voltages were arranged into Jones vectors, the above could also be expressed in the more familiar form
\[ I_{t,f} = \sum_a {\bf v}_{t,f,a}^\dagger {\bf v}_{t,f,a}. \]
void prepare_data_buffer_fine | ( | gpuDoubleComplex * | data_buffer_fine, |
vcsbeam_context * | vm, | ||
uintptr_t | timestep_idx | ||
) |
__global__ void renormalise_channels_kernel | ( | float * | S, |
int | nstep, | ||
float * | offsets, | ||
float * | scales, | ||
uint8_t * | Sscaled | ||
) |
CUDA kernel for normalising Stokes parameters.
[in] | S | The original Stokes parameters, with layout \(N_t \times N_s \times N_f\) |
nstep | \(N_t\) | |
[out] | offsets | The amount of offset needed to recover the original values from the normalised ones |
[out] | scales | The scaling needed to recover the original values from the normalised ones |
[out] | Sscaled | The normalised Stokes parameters |
This kernel shifts and normalises the Stokes parameters so that they fit into 8-bits integers without clipping (e.g. for output into the PSRFITS format). Each frequency is normalised independently, with the scales and offsets needed to recover the original values for that channel being recorded as well.
If \({\bf S}\) is the array of values to be normalised, then the normalisation is
\[ \hat{\bf S} = \frac{{\bf S} - \text{offset}}{\text{scale}}, \]
where
\begin{align*} \text{scale} &= \frac{S_\text{max} - S_\text{min}}{256} \\ \text{offset} &= S_\text{min} + 0.5 \times \text{scale}. \end{align*}
The expected thread configuration is \(\langle\langle\langle N_b,(N_f, N_s)\rangle\rangle\rangle.\)
__global__ void vmApplyJ_kernel | ( | void * | data, |
gpuDoubleComplex * | J, | ||
gpuDoubleComplex * | Jv_Q, | ||
gpuDoubleComplex * | Jv_P, | ||
uint32_t * | polQ_idxs, | ||
uint32_t * | polP_idxs, | ||
int | npol, | ||
int | p, | ||
vcsbeam_datatype | datatype | ||
) |
CUDA kernel for multiplying Jones matrices to Jones vectors.
[in] | data | The voltage data, \(v\), with layout \(N_t \times N_f \times N_i\) |
[in] | J | The Jones matrices, \({\bf J}^{-1}\), with layout \(N_a \times N_f \times N_p \times N_p\) |
[out] | Jv_Q | The Q polarisation of the product \({\bf J}^{-1}{\bf v}\), with layout \(N_t \times N_f \times N_a\) |
[out] | Jv_P | The P polarisation of the product \({\bf J}^{-1}{\bf v}\), with layout \(N_t \times N_f \times N_a\) |
polQ_idxs | And array of the indices \(i\) for the Q polarisations of the antennas | |
polP_idxs | And array of the indices \(i\) for the P polarisations of the antennas | |
p | The pointing number | |
soffset | An offset number of samples into data | |
npol | \(N_p\) | |
datatype | Either VM_INT4 (if data contain 4+4-bit complex integers) or VM_DBL (if data contain complex doubles). |
Although this kernel is quite general, in the sense that it could be used to multiply any Jones matrices to any Jones vectors, it is used in particular for multiplying the Jones matrices \({\bf J}^{-1}\) to the voltage data \({\bf v}\):
\[ \tilde{\bf e}_{t,f,a} = {\bf J}^{-1}_{a,f}{\bf v}_{t,f,a}. \]
The expected thread configuration is \(\langle\langle\langle(N_f, N_t), N_a\rangle\rangle\rangle.\)
void vmApplyJChunk | ( | vcsbeam_context * | vm | ) |
Computes \({\bf J}^{-1} {\bf v}\).
__global__ void vmBeamform_kernel | ( | int | nfine_chan, |
int | n_samples, | ||
int | nant, | ||
gpuDoubleComplex * | Jv_Q, | ||
gpuDoubleComplex * | Jv_P, | ||
gpuDoubleComplex * | phi, | ||
double | invw, | ||
int | p, | ||
int | soffset, | ||
int | nchunk, | ||
gpuDoubleComplex * | e, | ||
float * | S, | ||
int | npol, | ||
int | nstokes | ||
) |
CUDA kernel for phasing up and summing the voltages over antenna.
[in] | nfine_chan | Number of fine channels |
[in] | n_samples | Number of time samples |
[in] | nant | Number of antennas |
[in] | Jv_Q | The Q polarisation of the product \({\bf J}^{-1}{\bf v}\), with layout \(N_t \times N_f \times N_a\) |
[in] | Jv_P | The P polarisation of the product \({\bf J}^{-1}{\bf v}\), with layout \(N_t \times N_f \times N_a\) |
[in] | phi | The delay phase, \(\varphi\), with layout \(N_a \times N_f\) |
invw | The reciprocal of the number of non-flagged antennas | |
p | The pointing index | |
soffset | An offset number of samples into e for where to put the the answer | |
nchunk | The number of chunks (divisions of a second's worth of data) | |
[out] | e | The recovered electric field, \({\bf e}\), with layout \(N_t \times N_f \times N_p\) |
[out] | S | The recovered Stokes parameters, with layout \(N_t \times N_s \times N_f\) |
npol | \(N_p\) | |
nstokes | The number of stokes parameters to output |
This kernel performs the phasing up and the summing over antennas part of the beamforming operation (see Beamforming):
\[ {\bf e}_{t,f} = \frac{1}{N_a} \sum_a e^{i\varphi} \tilde{\bf e}_{t,f,a}. \]
It also computes the Stokes parameters, \(S = [I, Q, U, V]\) (with the autocorrelations removed).
The expected thread configuration is \(\langle\langle\langle(N_f, N_t), N_a\rangle\rangle\rangle.\)
the total number of warps created might not cover the total number of beams to be computed. Hence, the code "moves" the grid over the entire input until all of it is "covered". Alternatively, the overall input is tiled, the tile size is the number of warps available, and this is the number of tiles necessary to cover the entire input.
void vmBeamformChunk | ( | vcsbeam_context * | vm | ) |
Performs the phasing up, averaging over antennas, and detection operations on calibrated data.
In this implementation each beam computation, one for each frequency channel and time sample, is assigned to a warp (32/64 consecutive threads working in lockstep). A warp is assigned one or more beams to compute depending on how many thread blocks (hence warps) are created. This number is now unrelated to the problem at hand and depends on the hardware specifics for better performance. We create "just enough" blocks to avoid too many context switches.
void vmBeamformSecond | ( | vcsbeam_context * | vm | ) |
Performs all beamforming steps for 1 second's worth of data.
void vmPullE | ( | vcsbeam_context * | vm | ) |
Copies the beamformed voltages from GPU memory to CPU memory.
void vmPullS | ( | vcsbeam_context * | vm | ) |
Copies the detected Stokes parameters from GPU memory to CPU memory.
void vmPushPolIdxLists | ( | vcsbeam_context * | vm | ) |
Copies the index arrays for antennas and polarisations from CPU memory to GPU memory.
void vmSendSToFits | ( | vcsbeam_context * | vm, |
mpi_psrfits * | mpfs | ||
) |
Renormalises the detected Stokes parameters and copies them into PSRFITS structs, ready for frequency splicing.
vm | The VCSBeam context struct |
mpfs | The MPI PSRFITS struct that manages the splicing operation. |