VCSBeam
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <math.h>
#include <string.h>
#include "gpu_fft.hpp"
#include "gpu_macros.h"
#include <mwalib.h>
#include "vcsbeam.h"
Functions

void gpuAssert (gpuError_t code, const char *file, int line, bool abort=true)

__global__ void vmWOLA_kernel (char2 *indata, int *filter_coeffs, void *weighted_overlap_array)
    Performs the weighted overlap-add part of the PFB algorithm.

__global__ void fpga_rounding_and_demotion (void *data)
    CUDA kernel for performing the rounding and demotion step of the forward PFB.

__global__ void int2float (void *data, double scale)
    Convert 32-bit integers to 32-bit floats.

__global__ void pack_into_recombined_format (gpuFloatComplex *ffted, void *outdata, int *i_idx, pfb_flags flags)
    CUDA kernel for packing the PFB'ed data into legacy recombined layout/format.

void vmInitForwardPFB (vcsbeam_context *vm, int M, pfb_flags flags)
    Create and initialise a forward_pfb struct.

void vmFreeForwardPFB (forward_pfb *fpfb)
    Frees host and device memory allocated in vmInitForwardPFB().

void vmUploadForwardPFBChunk (vcsbeam_context *vm)
    Copies a chunk of PFB data from CPU to GPU.

void vmWOLAChunk (vcsbeam_context *vm)
    Calls the vmWOLA_kernel() kernel for the forward PFB data.

void vmFPGARoundingChunk (vcsbeam_context *vm)
    Calls the fpga_rounding_and_demotion() kernel for the forward PFB data.

void vmFFTChunk (vcsbeam_context *vm)
    Executes the (CUDA) FFT on the forward PFB data.

void vmPackChunk (vcsbeam_context *vm)
    Calls the pack_into_recombined_format() kernel for the forward PFB data.

void vmDownloadForwardPFBChunk (vcsbeam_context *vm)
    Copies a chunk of PFB data from GPU to CPU.

void vmExecuteForwardPFB (vcsbeam_context *vm)
    The wrapper function that performs a forward PFB on the GPU.

void vmWritePFBOutputToFile (vcsbeam_context *vm)
    Writes the result of the forward PFB operation to file.

__global__ void ipfb_kernel (float *in_real, float *in_imag, float *ft_real, float *ft_imag, int ntaps, int npol, float *out)
    CUDA kernel implementing the synthesis PFB.

void cu_invert_pfb (gpuDoubleComplex *data_buffer_fine, int file_no, int npointing, int nsamples, int nchan, int npol, struct gpu_ipfb_arrays *g, float *data_buffer_vdif)
    Invert the PFB by applying a resynthesis filter, using GPU acceleration.

void cu_load_ipfb_filter (pfb_filter *filter, struct gpu_ipfb_arrays *g)
    Load the inverse PFB filter coefficients to the GPU.

void malloc_ipfb (struct gpu_ipfb_arrays *g, pfb_filter *filter, int nsamples, int npol, int npointing)
    Allocate GPU memory needed for performing the inverse PFB.

void free_ipfb (struct gpu_ipfb_arrays *g)
    Free the GPU memory allocated in malloc_ipfb().
Function Documentation

void cu_invert_pfb (gpuDoubleComplex *data_buffer_fine, int file_no, int npointing, int nsamples, int nchan, int npol, struct gpu_ipfb_arrays *g, float *data_buffer_vdif)
Invert the PFB by applying a resynthesis filter, using GPU acceleration.
Parameters:
    [in]  data_buffer_fine   The fine-channelised voltages
          file_no            An index corresponding to the second of the data
          npointing          The number of pointings
          nsamples           The number of time samples per second of input data
          nchan              The number of fine channels
          npol               The number of instrumental polarisations
          g                  The ipfb filter struct
    [out] data_buffer_vdif   The reconstructed coarse-channelised voltages
This function expects data_buffer_fine to be a 1D array of complex voltages, with indices following a fixed ordering.

Although data_buffer_fine potentially contains 2 seconds' worth of data, this function only inverts 1 second. The appropriate second is worked out from file_no: if it is even, the first half of data_buffer_fine is used; if odd, the second half.

The IPFB is also given ntaps samples from the end of the previous second, so in total the IPFB is given nsamples + ntaps samples' worth of data.

The output of the inversion is packed back into data_buffer_vdif, a 1D array whose ordering is suited for immediate output to the VDIF format.

It is assumed that the inverse filter coefficients have already been loaded to the GPU (see cu_load_ipfb_filter()).
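As a rough illustration of the even/odd selection, a minimal host-side sketch follows; the per-second element count used here is an assumption made purely for illustration, since the actual array ordering is not reproduced on this page.

```c
// Sketch only: choose which half of data_buffer_fine to invert, based on the
// parity of file_no. The per-second element count is an assumed layout,
// not VCSBeam's actual definition.
size_t elements_per_second = (size_t)npointing * nsamples * nchan * npol;
size_t offset = (file_no % 2 == 0) ? 0 : elements_per_second;
gpuDoubleComplex *this_second = data_buffer_fine + offset;
```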
void cu_load_ipfb_filter (pfb_filter *filter, struct gpu_ipfb_arrays *g)
Load the inverse PFB filter coefficients to the GPU.
This function loads the inverse filter coefficients and the twiddle factors into GPU memory. If they were loaded separately (as floats), the multiplication of the filter coefficients and the twiddle factors would be less precise than calculating a single array containing every combination of filter coefficient and twiddle factor in double precision and then demoting the result to floats. Hence, this pre-calculation is done in this function before gpuMemcpy is called.
The result is two 1D arrays loaded onto the GPU (one for the real parts, one for the imaginary parts), whose ith element is equal to

\[ \text{result}[i] = f[n] \, e^{2\pi jk/K}, \]

where \(n = i \% N\), \(k = i / N\), \(N\) is the filter size (fil_size) and \(K\) is the number of channels (nchan).
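A minimal host-side sketch of this pre-calculation, following the formula above; the array names fil, ft_real and ft_imag are illustrative assumptions rather than the actual struct members.

```c
// Sketch only: combine each filter coefficient with each twiddle factor in
// double precision, then demote to float.
for (int i = 0; i < fil_size * nchan; i++)
{
    int    n     = i % fil_size;                 // filter coefficient index
    int    k     = i / fil_size;                 // channel index
    double phase = 2.0 * M_PI * k / nchan;       // 2*pi*k/K

    ft_real[i] = (float)(fil[n] * cos( phase )); // computed in double, ...
    ft_imag[i] = (float)(fil[n] * sin( phase )); // ... stored as float
}
// ft_real and ft_imag are then copied to the GPU (gpuMemcpy).
```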
__global__ void fpga_rounding_and_demotion (void *data)
CUDA kernel for performing the rounding and demotion step of the forward PFB.
Parameters:
    [in,out]  data   The data to be rounded and demoted.
The rounding and demotion performed here is designed to emulate that done in the FPGAs of the legacy (Phases 1 & 2) MWA system. Specifically, it does the following:
\[ \begin{cases} \left\lfloor \frac{x}{2^{14}} + 0.5 \right\rfloor, & x > 0, \\ \left\lfloor \frac{x}{2^{14}} \right\rfloor, & x \le 0. \end{cases} \]
The final numbers are represented as 32-bit floats.
The expected thread configuration is \(\langle\langle\langle (\text{nspectra} \times I), K \rangle\rangle\rangle\)
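As a rough sketch of the rounding rule above, a device-side helper is shown for illustration only; the function name is an assumption, and the actual kernel operates directly on the data buffer.

```c
// Asymmetric rounding and demotion to a 32-bit float, as described above:
// divide by 2^14 = 16384, rounding positive values half-up and simply
// flooring zero and negative values.
__device__ float round_and_demote( int x )
{
    float scaled = (float)x / 16384.0f;
    return (x > 0) ? floorf( scaled + 0.5f ) : floorf( scaled );
}
```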
void free_ipfb (struct gpu_ipfb_arrays *g)
Free the GPU memory allocated in malloc_ipfb().
__global__ void int2float (void *data, double scale)
Convert 32-bit integers to 32-bit floats.
Parameters:
    data    The data to be converted
    scale   A scaling factor to apply (set to 1 for no effect)
Each thread handles a single integer, so the thread configuration can be chosen to suit the size of data.
This function acts as an alternative to fpga_rounding_and_demotion(), in order to preserve the full precision of the PFB output.
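A minimal sketch of one possible implementation consistent with the description above (one thread per value); the kernel name and the in-place reuse of the data buffer are assumptions.

```c
// One thread per 32-bit value; the launch configuration is assumed to cover
// the buffer exactly, so no bounds check is shown. The int and float views
// alias the same 4 bytes per element, so the conversion can be done in place.
__global__ void int2float_sketch( void *data, double scale )
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    int   *in  = (int *)data;     // 32-bit integer view of the buffer
    float *out = (float *)data;   // 32-bit float view of the same buffer

    out[i] = (float)(in[i] * scale);
}
```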
__global__ void ipfb_kernel (float *in_real, float *in_imag, float *ft_real, float *ft_imag, int ntaps, int npol, float *out)
CUDA kernel implementing the synthesis PFB.
Parameters:
    [in]  in_real   The real parts of the complex-valued input spectra, \(X_k[m]\)
    [in]  in_imag   The imaginary parts of the input spectra
    [in]  ft_real   The real parts of the precomputed \(f[n]\,e^{2\pi jkn/K}\) values
    [in]  ft_imag   The imaginary parts of the precomputed values
          ntaps     The number of taps in the synthesis filter
          npol      The number of instrumental polarisations
    [out] out       The reconstructed time series, \(\hat{x}[n]\)
The backwards/inverse/synthesis filter is

\[ \hat{x}[n] = \frac{1}{K} \sum_m f[n - mM] \sum_{k=0}^{K-1} X_k[m]\,e^{2\pi jkn/K}. \]

The sum over \(m\) is nominally over all integers, but in practice only involves a few terms because of the finiteness of the filter, \(f\); precisely ntaps terms are non-zero.

\(X_k[m]\) represents the complex-valued inputs, in_real and in_imag. Every possible value of \(f[n]\,e^{2\pi jkn/K}\) is provided in ft_real and ft_imag.

\(K\) is the number of channels, and because this is a critically sampled PFB, \(M = K\). We will also use \(P\) to mean the number of taps in the synthesis filter, and \(N = KP\) to mean the size of the filter.

The polarisations are computed completely independently.

\(\hat{x}[n]\) is represented by the out array.
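As a serial, host-side reference for the equation above, a sketch for one output sample and one polarisation follows. The flattened index conventions and the function name are illustrative assumptions, not VCSBeam's actual memory layout.

```c
// Compute one output sample (real and imaginary parts) following the
// synthesis equation above. Assumed (illustrative) layouts:
//   X_re/X_im[m*K + k]   : the input spectra X_k[m]
//   ft_re/ft_im[nf*K + k]: f[nf] * exp(2*pi*j*k*nf/K), for nf = 0 .. N-1
// Because M = K, exp(2*pi*j*k*n/K) = exp(2*pi*j*k*nf/K) with nf = n - m*K,
// so the precomputed table only needs to be indexed by (nf, k).
static void ipfb_one_sample( const float *X_re, const float *X_im,
                             const float *ft_re, const float *ft_im,
                             int n, int P, int K,
                             float *out_re, float *out_im )
{
    float re = 0.0f, im = 0.0f;

    for (int tap = 0; tap < P; tap++)       // the sum over m: P non-zero terms
    {
        int m  = n / K - tap;               // which input spectrum contributes
        int nf = n % K + tap * K;           // = n - m*K: index into the filter table
        if (m < 0) continue;                // no data before the start of the stream

        for (int k = 0; k < K; k++)         // the inner sum over fine channels
        {
            float xr = X_re[m*K + k],  xi = X_im[m*K + k];
            float fr = ft_re[nf*K + k], fi = ft_im[nf*K + k];
            re += xr * fr - xi * fi;        // complex multiply-accumulate
            im += xr * fi + xi * fr;
        }
    }

    *out_re = re / K;                       // the 1/K normalisation
    *out_im = im / K;
}
```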
void malloc_ipfb (struct gpu_ipfb_arrays *g, pfb_filter *filter, int nsamples, int npol, int npointing)
Allocate GPU memory needed for performing the inverse PFB.
Free with free_ipfb().
__global__ void pack_into_recombined_format (gpuFloatComplex *ffted, void *outdata, int *i_idx, pfb_flags flags)
CUDA kernel for packing the PFB'ed data into legacy recombined layout/format.
Parameters:
    [in]  ffted     The already-FFT'ed PFB data, ready for packing
    [out] outdata   The output data buffer
          i_idx     An array of indexes for the desired ordering of the RF inputs (i.e. antenna/polarisation combinations)
          flags     PFB configuration settings
This is the final step in the forward fine PFB algorithm. At this point, the ffted array contains the Fourier-transformed data that already represents the final channelisation. All that remains to be done is to pack it into the same layout (and optionally also the same format) as the VCS recombined data.
The relevant values for flags are (see vmInitForwardPFB() for the full table):

| Flag name           | Description                                         |
|---------------------|-----------------------------------------------------|
| PFB_COMPLEX_INT4    | Typecast the output into (4+4)-bit complex integers |
| PFB_COMPLEX_FLOAT64 | Typecast the output into (64+64)-bit complex floats |
The expected thread configuration is \(\langle\langle\langle(\text{nspectra},K),I\rangle\rangle\rangle\).
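As a rough illustration of the (4+4)-bit case, a sketch of packing one complex sample into a single byte follows; the clipping range, nibble order and helper name are assumptions, and the actual behaviour is governed by the flags above (e.g. PFB_IMAG_PART_FIRST).

```c
// Clip each part to the signed 4-bit range [-8, 7] and pack both into one
// byte, with the imaginary part in the most significant nibble (mirroring
// the PFB_IMAG_PART_FIRST option).
__device__ unsigned char pack_complex_int4( float re, float im )
{
    int r = max( -8, min( 7, (int)roundf( re ) ) );
    int i = max( -8, min( 7, (int)roundf( im ) ) );
    return (unsigned char)(((i & 0xF) << 4) | (r & 0xF));
}
```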
void vmDownloadForwardPFBChunk (vcsbeam_context *vm)
Copies a chunk of PFB data from GPU to CPU.
The copy transfers one chunk of data from GPU memory to host memory:

- source (GPU): vm->fpfb->d_vcs_data
- destination (CPU): vm->fpfb->vcs_data + c*hs
- ds = vm->fpfb->d_vcs_size
- hs = vm->fpfb->vcs_stride
- c = vm->chunk_to_load % vm->chunks_per_second

Note that ds is not necessarily equal to hs, and that ds bytes are copied.
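A sketch of the copy in plain CUDA terms; VCSBeam actually routes this through its gpu_macros.h wrapper layer, and the pointer cast and member types here are assumptions made to allow byte arithmetic.

```c
// Copy chunk c of the PFB output from GPU to host: ds bytes are transferred,
// and the host buffer advances by hs bytes per chunk.
int    c  = vm->chunk_to_load % vm->chunks_per_second;
size_t ds = vm->fpfb->d_vcs_size;
size_t hs = vm->fpfb->vcs_stride;

cudaMemcpy( (char *)vm->fpfb->vcs_data + c*hs,   // host destination
            vm->fpfb->d_vcs_data,                // device source
            ds,                                  // bytes copied (ds need not equal hs)
            cudaMemcpyDeviceToHost );
```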
void vmExecuteForwardPFB (vcsbeam_context *vm)
The wrapper function that performs a forward PFB on the GPU.
For each chunk of data, it calls, in order, vmUploadForwardPFBChunk(), vmWOLAChunk(), vmFPGARoundingChunk(), vmFFTChunk(), and vmPackChunk().

If a host buffer (vm->fpfb->vcs_data) has been prepared, then vmDownloadForwardPFBChunk() is also called.
void vmFFTChunk (vcsbeam_context *vm)
Executes the (CUDA) FFT on the forward PFB data.
void vmFPGARoundingChunk (vcsbeam_context *vm)
Calls the fpga_rounding_and_demotion() kernel for the forward PFB data.
void vmFreeForwardPFB (forward_pfb *fpfb)
Frees host and device memory allocated in vmInitForwardPFB().
Parameters:
    fpfb   The forward_pfb object to be freed.
After freeing the member variables of fpfb, the struct itself is freed.
void vmInitForwardPFB (vcsbeam_context *vm, int M, pfb_flags flags)
Create and initialise a forward_pfb struct.
Parameters:
    vm      The VCSBeam context struct
    M       The "stride" of the PFB
    flags   PFB configuration settings
Setting \(M = K\), where \(K\) is the value of vm->analysis_filter->nchans, will make this a "critically sampled PFB"; setting \(M > K\), an "oversampled PFB".

A filter must already be loaded into vm->analysis_filter. WARNING: the filter will be forcibly typecast to int!

The GPU processing will be divided into vm->chunks_per_second chunks per second; if this number does not divide the number of samples per second (10000) evenly, it is incremented until it does.
The available flags are given in the following table:

| Flag name                | Description |
|--------------------------|-------------|
| PFB_MALLOC_HOST_INPUT    | Allocate memory on CPU for the input |
| PFB_MALLOC_HOST_OUTPUT   | Allocate memory on CPU for the output |
| PFB_MALLOC_DEVICE_INPUT  | Allocate memory on GPU for the input |
| PFB_MALLOC_DEVICE_OUTPUT | Allocate memory on GPU for the output |
| PFB_MALLOC_ALL           | Synonym for PFB_MALLOC_HOST_INPUT \| PFB_MALLOC_HOST_OUTPUT \| PFB_MALLOC_DEVICE_INPUT \| PFB_MALLOC_DEVICE_OUTPUT |
| PFB_COMPLEX_INT4         | Typecast the output into (4+4)-bit complex integers |
| PFB_COMPLEX_FLOAT64      | Typecast the output into (64+64)-bit complex floats |
| PFB_IMAG_PART_FIRST      | Place the imaginary part of the output in the first (i.e. most significant) position |
| PFB_EMULATE_FPGA         | Perform the (asymmetric) rounding and demotion step in exactly the same way as the original (Phase 1 & 2) MWA FPGAs |
| PFB_SMART                | Synonym for PFB_MALLOC_HOST_INPUT \| PFB_MALLOC_DEVICE_INPUT \| PFB_MALLOC_DEVICE_OUTPUT \| PFB_EMULATE_FPGA \| PFB_COMPLEX_INT4 |
| PFB_FULL_PRECISION       | Synonym for PFB_MALLOC_HOST_INPUT \| PFB_MALLOC_DEVICE_INPUT \| PFB_MALLOC_DEVICE_OUTPUT \| PFB_COMPLEX_FLOAT64 |
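A hedged usage sketch of the forward-PFB call sequence documented on this page; how the vcsbeam_context is created and populated (including loading the analysis filter) is outside the scope of this page and is only stubbed here.

```c
// Sketch only: run the forward PFB using the functions documented on this
// page. Creating and populating 'vm' is assumed to have been done elsewhere.
vcsbeam_context *vm = /* ... obtained and populated elsewhere ... */;

int M = vm->analysis_filter->nchans;    // M = K: a critically sampled PFB
vmInitForwardPFB( vm, M, PFB_SMART );   // allocate buffers; emulate FPGA rounding; 4+4-bit output

vmExecuteForwardPFB( vm );              // per chunk: upload, WOLA, round/demote, FFT, pack
vmWritePFBOutputToFile( vm );           // write vm->fpfb->vcs_data to file

vmFreeForwardPFB( vm->fpfb );           // free host and device memory
```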
void vmPackChunk (vcsbeam_context *vm)
Calls the pack_into_recombined_format() kernel for the forward PFB data.
void vmUploadForwardPFBChunk (vcsbeam_context *vm)
Copies a chunk of PFB data from CPU to GPU.
The copy transfers one chunk of data from host memory to GPU memory:

- source (CPU): vm->v->buffer + c*hs
- destination (GPU): vm->fpfb->d_htr_data
- ds = vm->fpfb->d_htr_size
- hs = vm->fpfb->htr_stride
- c = vm->chunk_to_load % vm->chunks_per_second

Note that ds is not necessarily equal to hs, and that ds bytes are copied.
__global__ void vmWOLA_kernel (char2 *indata, int *filter_coeffs, void *weighted_overlap_array)
Performs the weighted overlap-add part of the PFB algorithm.
Parameters:
    [in]  indata                  The input data, \(x[n]\), with layout equivalent to the buffer populated by mwalib (see [the MWAX voltage format][MWAXHTR])
    [in]  filter_coeffs           The PFB filter coefficients, \(h[n]\)
    [out] weighted_overlap_array  The result of the weighted overlap-add operation, \(b[n]\)
The weighted overlap-add part of the PFB algorithm is the equation
\[ b_m[n] = \sum_{\rho=0}^{P-1} h[K\rho - n]\, x[n + mM - K\rho]. \]
See above for the meaning of the terms in this equation.
The expected thread configuration is \( \langle\langle\langle (\text{nspectra},I,P), K \rangle\rangle\rangle \)
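A serial, host-side transcription of the equation above is sketched below for a single RF input and a single output spectrum; out-of-range indices are simply skipped here, and the real kernel's index conventions and data layout may differ.

```c
// Weighted overlap-add reference sketch, following the equation above.
static void wola_one_spectrum( const float *x, int nx,       // input samples x[n]
                               const float *h, int N,        // filter h[n], N = K*P
                               int K, int M, int P, int m,   // channels, stride, taps, spectrum index
                               float *b )                    // output b_m[n], length K
{
    for (int n = 0; n < K; n++)
    {
        float sum = 0.0f;
        for (int rho = 0; rho < P; rho++)
        {
            int hi = K*rho - n;         // filter index, as in the equation
            int xi = n + m*M - K*rho;   // data index, as in the equation
            if (hi < 0 || hi >= N || xi < 0 || xi >= nx) continue;
            sum += h[hi] * x[xi];
        }
        b[n] = sum;
    }
}
```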
void vmWOLAChunk (vcsbeam_context *vm)
Calls the vmWOLA_kernel() kernel for the forward PFB data.
void vmWritePFBOutputToFile (vcsbeam_context *vm)
Writes the result of the forward PFB operation to file.
This function writes the contents of vm->fpfb->vcs_data to a file whose name is returned by the function vmGetLegacyVoltFilename().