VCSBeam
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <math.h>
#include <string.h>
#include "gpu_fft.hpp"
#include "gpu_macros.h"
#include <mwalib.h>
#include "vcsbeam.h"
Functions

void gpuAssert (gpuError_t code, const char *file, int line, bool abort=true)

__global__ void vmWOLA_kernel (char2 *indata, int *filter_coeffs, void *weighted_overlap_array)
    Performs the weighted overlap-add part of the PFB algorithm.

__global__ void fpga_rounding_and_demotion (void *data)
    CUDA kernel for performing the rounding and demotion step of the forward PFB.

__global__ void int2float (void *data, double scale)
    Convert 32-bit integers to 32-bit floats.

__global__ void pack_into_recombined_format (gpuFloatComplex *ffted, void *outdata, int *i_idx, pfb_flags flags)
    CUDA kernel for packing the PFB'ed data into legacy recombined layout/format.

void vmInitForwardPFB (vcsbeam_context *vm, int M, pfb_flags flags)
    Create and initialise a forward_pfb struct.

void vmFreeForwardPFB (forward_pfb *fpfb)
    Frees host and device memory allocated in vmInitForwardPFB().

void vmUploadForwardPFBChunk (vcsbeam_context *vm)
    Copies a chunk of PFB data from CPU to GPU.

void vmWOLAChunk (vcsbeam_context *vm)
    Calls the vmWOLA_kernel() kernel for the forward PFB data.

void vmFPGARoundingChunk (vcsbeam_context *vm)
    Calls the fpga_rounding_and_demotion() kernel for the forward PFB data.

void vmFFTChunk (vcsbeam_context *vm)
    Executes the (CUDA) FFT on the forward PFB data.

void vmPackChunk (vcsbeam_context *vm)
    Calls the pack_into_recombined_format() kernel for the forward PFB data.

void vmDownloadForwardPFBChunk (vcsbeam_context *vm)
    Copies a chunk of PFB data from GPU to CPU.

void vmExecuteForwardPFB (vcsbeam_context *vm)
    The wrapper function that performs a forward PFB on the GPU.

void vmWritePFBOutputToFile (vcsbeam_context *vm)
    Writes the result of the forward PFB operation to file.

__global__ void ipfb_kernel (float *in_real, float *in_imag, float *ft_real, float *ft_imag, int ntaps, int npol, float *out)
    CUDA kernel implementing the synthesis PFB.

void cu_invert_pfb (gpuDoubleComplex *data_buffer_fine, int file_no, int npointing, int nsamples, int nchan, int npol, struct gpu_ipfb_arrays *g, float *data_buffer_vdif)
    Invert the PFB by applying a resynthesis filter, using GPU acceleration.

void cu_load_ipfb_filter (pfb_filter *filter, struct gpu_ipfb_arrays *g)
    Load the inverse PFB filter coefficients to the GPU.

void malloc_ipfb (struct gpu_ipfb_arrays *g, pfb_filter *filter, int nsamples, int npol, int npointing)
    Allocate GPU memory needed for performing the inverse PFB.

void free_ipfb (struct gpu_ipfb_arrays *g)
    Free the GPU memory allocated in malloc_ipfb().
Function Documentation

void cu_invert_pfb (gpuDoubleComplex *data_buffer_fine, int file_no, int npointing, int nsamples, int nchan, int npol, struct gpu_ipfb_arrays *g, float *data_buffer_vdif)
Invert the PFB by applying a resynthesis filter, using GPU acceleration.
Parameters:
    [in]  data_buffer_fine   The fine-channelised voltages
          file_no            An index corresponding to the second of the data
          npointing          The number of pointings
          nsamples           The number of time samples per second of input data
          nchan              The number of fine channels
          npol               The number of instrumental polarisations
          g                  The ipfb filter struct
    [out] data_buffer_vdif   The reconstructed coarse-channelised voltages
This function expects data_buffer_fine to be a 1D array of complex voltages, with indices following a fixed ordering.

Although data_buffer_fine potentially contains 2 seconds' worth of data, this function only inverts 1 second. The appropriate second is worked out from file_no: if it is even, the first half of data_buffer_fine is used; if odd, the second half.

The IPFB is also given ntaps samples from the end of the previous second, so in total the IPFB is given nsamples + ntaps samples' worth of data.

The output of the inversion is packed back into data_buffer_vdif, a 1D array whose ordering is suited for immediate output to the VDIF format.

It is assumed that the inverse filter coefficients have already been loaded to the GPU (see cu_load_ipfb_filter()).
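As a rough illustration of the even/odd selection, a minimal host-side sketch follows; the per-second element count used here is an assumption made purely for illustration, since the actual array ordering is not reproduced on this page.

```c
// Sketch only: choose which half of data_buffer_fine to invert, based on the
// parity of file_no. The per-second element count is an assumed layout,
// not VCSBeam's actual definition.
size_t elements_per_second = (size_t)npointing * nsamples * nchan * npol;
size_t offset = (file_no % 2 == 0) ? 0 : elements_per_second;
gpuDoubleComplex *this_second = data_buffer_fine + offset;
```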
void cu_load_ipfb_filter (pfb_filter *filter, struct gpu_ipfb_arrays *g)
Load the inverse PFB filter coefficients to the GPU.
This function loads the inverse filter coefficients and the twiddle factors into GPU memory. If they were loaded separately (as floats), the multiplication of the filter coefficients and the twiddle factors would be less precise than calculating a single array containing every combination of filter coefficient and twiddle factor in double precision and then demoting the result to floats. Hence, this pre-calculation is done in this function before gpuMemcpy is called.
The result is two 1D arrays loaded onto the GPU (one for the real parts, one for the imaginary parts), whose ith element is equal to

\[ \text{result}[i] = f[n] \, e^{2\pi jk/K}, \]

where \(n = i \% N\), \(k = i / N\), \(N\) is the filter size (fil_size) and \(K\) is the number of channels (nchan).
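A minimal host-side sketch of this pre-calculation, following the formula above; the array names fil, ft_real and ft_imag are illustrative assumptions rather than the actual struct members.

```c
// Sketch only: combine each filter coefficient with each twiddle factor in
// double precision, then demote to float.
for (int i = 0; i < fil_size * nchan; i++)
{
    int    n     = i % fil_size;                 // filter coefficient index
    int    k     = i / fil_size;                 // channel index
    double phase = 2.0 * M_PI * k / nchan;       // 2*pi*k/K

    ft_real[i] = (float)(fil[n] * cos( phase )); // computed in double, ...
    ft_imag[i] = (float)(fil[n] * sin( phase )); // ... stored as float
}
// ft_real and ft_imag are then copied to the GPU (gpuMemcpy).
```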
__global__ void fpga_rounding_and_demotion (void *data)
CUDA kernel for performing the rounding and demotion step of the forward PFB.
Parameters:
    [in,out]  data   The data to be rounded and demoted.
The rounding and demotion performed here is designed to emulate that done in the FPGAs of the legacy (Phases 1 & 2) MWA system. Specifically, it does the following:
\[ \begin{cases} \left\lfloor \frac{x}{2^{14}} + 0.5 \right\rfloor, & x > 0, \\ \left\lfloor \frac{x}{2^{14}} \right\rfloor, & x \le 0. \end{cases} \]
The final numbers are represented as 32-bit floats.
The expected thread configuration is \(\langle\langle\langle (\text{nspectra} \times I), K \rangle\rangle\rangle\)
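As a rough sketch of the rounding rule above, a device-side helper is shown for illustration only; the function name is an assumption, and the actual kernel operates directly on the data buffer.

```c
// Asymmetric rounding and demotion to a 32-bit float, as described above:
// divide by 2^14 = 16384, rounding positive values half-up and simply
// flooring zero and negative values.
__device__ float round_and_demote( int x )
{
    float scaled = (float)x / 16384.0f;
    return (x > 0) ? floorf( scaled + 0.5f ) : floorf( scaled );
}
```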
void free_ipfb (struct gpu_ipfb_arrays *g)
Free the GPU memory allocated in malloc_ipfb().
__global__ void int2float (void *data, double scale)
Convert 32-bit integers to 32-bit floats.
Parameters:
    data    The data to be converted
    scale   A scaling factor to apply (set to 1 for no effect)
Each thread handles a single integer, so the thread configuration can be chosen to suit the size of data.
This function acts as an alternative to fpga_rounding_and_demotion(), in order to preserve the full precision of the PFB output.
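A minimal sketch of one possible implementation consistent with the description above (one thread per value); the kernel name and the in-place reuse of the data buffer are assumptions.

```c
// One thread per 32-bit value; the launch configuration is assumed to cover
// the buffer exactly, so no bounds check is shown. The int and float views
// alias the same 4 bytes per element, so the conversion can be done in place.
__global__ void int2float_sketch( void *data, double scale )
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    int   *in  = (int *)data;     // 32-bit integer view of the buffer
    float *out = (float *)data;   // 32-bit float view of the same buffer

    out[i] = (float)(in[i] * scale);
}
```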
__global__ void ipfb_kernel (float *in_real, float *in_imag, float *ft_real, float *ft_imag, int ntaps, int npol, float *out)
CUDA kernel implementing the synthesis PFB.
Parameters:
    [in]  in_real   The real parts of the complex-valued input spectra, \(X_k[m]\)
    [in]  in_imag   The imaginary parts of the input spectra
    [in]  ft_real   The real parts of the precomputed \(f[n]\,e^{2\pi jkn/K}\) values
    [in]  ft_imag   The imaginary parts of the precomputed values
          ntaps     The number of taps in the synthesis filter
          npol      The number of instrumental polarisations
    [out] out       The reconstructed time series, \(\hat{x}[n]\)
The backwards/inverse/synthesis filter is

\[ \hat{x}[n] = \frac{1}{K} \sum_m f[n - mM] \sum_{k=0}^{K-1} X_k[m]\,e^{2\pi jkn/K}. \]

The sum over \(m\) is nominally over all integers, but in practice only involves a few terms because of the finiteness of the filter, \(f\); precisely ntaps terms are non-zero.

\(X_k[m]\) represents the complex-valued inputs, in_real and in_imag. Every possible value of \(f[n]\,e^{2\pi jkn/K}\) is provided in ft_real and ft_imag.

\(K\) is the number of channels, and because this is a critically sampled PFB, \(M = K\). We will also use \(P\) to mean the number of taps in the synthesis filter, and \(N = KP\) to mean the size of the filter.

The polarisations are computed completely independently.

\(\hat{x}[n]\) is represented by the out array.
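As a serial, host-side reference for the equation above, a sketch for one output sample and one polarisation follows. The flattened index conventions and the function name are illustrative assumptions, not VCSBeam's actual memory layout.

```c
// Compute one output sample (real and imaginary parts) following the
// synthesis equation above. Assumed (illustrative) layouts:
//   X_re/X_im[m*K + k]   : the input spectra X_k[m]
//   ft_re/ft_im[nf*K + k]: f[nf] * exp(2*pi*j*k*nf/K), for nf = 0 .. N-1
// Because M = K, exp(2*pi*j*k*n/K) = exp(2*pi*j*k*nf/K) with nf = n - m*K,
// so the precomputed table only needs to be indexed by (nf, k).
static void ipfb_one_sample( const float *X_re, const float *X_im,
                             const float *ft_re, const float *ft_im,
                             int n, int P, int K,
                             float *out_re, float *out_im )
{
    float re = 0.0f, im = 0.0f;

    for (int tap = 0; tap < P; tap++)       // the sum over m: P non-zero terms
    {
        int m  = n / K - tap;               // which input spectrum contributes
        int nf = n % K + tap * K;           // = n - m*K: index into the filter table
        if (m < 0) continue;                // no data before the start of the stream

        for (int k = 0; k < K; k++)         // the inner sum over fine channels
        {
            float xr = X_re[m*K + k],  xi = X_im[m*K + k];
            float fr = ft_re[nf*K + k], fi = ft_im[nf*K + k];
            re += xr * fr - xi * fi;        // complex multiply-accumulate
            im += xr * fi + xi * fr;
        }
    }

    *out_re = re / K;                       // the 1/K normalisation
    *out_im = im / K;
}
```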
void malloc_ipfb (struct gpu_ipfb_arrays *g, pfb_filter *filter, int nsamples, int npol, int npointing)
Allocate GPU memory needed for performing the inverse PFB.
Free with free_ipfb().
__global__ void pack_into_recombined_format (gpuFloatComplex *ffted, void *outdata, int *i_idx, pfb_flags flags)
CUDA kernel for packing the PFB'ed data into legacy recombined layout/format.
Parameters:
    [in]  ffted     The already-FFT'ed PFB data, ready for packing
    [out] outdata   The output data buffer
          i_idx     An array of indexes for the desired ordering of the RF inputs (i.e. antenna/polarisation combinations)
          flags     PFB configuration settings
This is the final step in the forward fine PFB algorithm. At this point, the ffted array contains the Fourier-transformed data that already represents the final channelisation. All that remains to be done is to pack it into the same layout (and optionally also the same format) as the VCS recombined data.
The relevant values for flags are (see vmInitForwardPFB() for the full table):

| Flag name           | Description                                         |
|---------------------|-----------------------------------------------------|
| PFB_COMPLEX_INT4    | Typecast the output into (4+4)-bit complex integers |
| PFB_COMPLEX_FLOAT64 | Typecast the output into (64+64)-bit complex floats |
The expected thread configuration is \(\langle\langle\langle(\text{nspectra},K),I\rangle\rangle\rangle\).
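As a rough illustration of the (4+4)-bit case, a sketch of packing one complex sample into a single byte follows; the clipping range, nibble order and helper name are assumptions, and the actual behaviour is governed by the flags above (e.g. PFB_IMAG_PART_FIRST).

```c
// Clip each part to the signed 4-bit range [-8, 7] and pack both into one
// byte, with the imaginary part in the most significant nibble (mirroring
// the PFB_IMAG_PART_FIRST option).
__device__ unsigned char pack_complex_int4( float re, float im )
{
    int r = max( -8, min( 7, (int)roundf( re ) ) );
    int i = max( -8, min( 7, (int)roundf( im ) ) );
    return (unsigned char)(((i & 0xF) << 4) | (r & 0xF));
}
```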
void vmDownloadForwardPFBChunk (vcsbeam_context *vm)
Copies a chunk of PFB data from GPU to CPU.
The copy transfers one chunk of data from GPU memory to host memory:

- source (GPU): vm->fpfb->d_vcs_data
- destination (CPU): vm->fpfb->vcs_data + c*hs
- ds = vm->fpfb->d_vcs_size
- hs = vm->fpfb->vcs_stride
- c = vm->chunk_to_load % vm->chunks_per_second

Note that ds is not necessarily equal to hs, and that ds bytes are copied.
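A sketch of the copy in plain CUDA terms; VCSBeam actually routes this through its gpu_macros.h wrapper layer, and the pointer cast and member types here are assumptions made to allow byte arithmetic.

```c
// Copy chunk c of the PFB output from GPU to host: ds bytes are transferred,
// and the host buffer advances by hs bytes per chunk.
int    c  = vm->chunk_to_load % vm->chunks_per_second;
size_t ds = vm->fpfb->d_vcs_size;
size_t hs = vm->fpfb->vcs_stride;

cudaMemcpy( (char *)vm->fpfb->vcs_data + c*hs,   // host destination
            vm->fpfb->d_vcs_data,                // device source
            ds,                                  // bytes copied (ds need not equal hs)
            cudaMemcpyDeviceToHost );
```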
void vmExecuteForwardPFB (vcsbeam_context *vm)
The wrapper function that performs a forward PFB on the GPU.
For each chunk of data, it calls, in order, vmUploadForwardPFBChunk(), vmWOLAChunk(), vmFPGARoundingChunk(), vmFFTChunk(), and vmPackChunk().

If a host buffer (vm->fpfb->vcs_data) has been prepared, then vmDownloadForwardPFBChunk() is also called.
void vmFFTChunk (vcsbeam_context *vm)
Executes the (CUDA) FFT on the forward PFB data.
void vmFPGARoundingChunk (vcsbeam_context *vm)
Calls the fpga_rounding_and_demotion() kernel for the forward PFB data.
void vmFreeForwardPFB (forward_pfb *fpfb)
Frees host and device memory allocated in vmInitForwardPFB().
Parameters:
    fpfb   The forward_pfb object to be freed.
After freeing the member variables of fpfb, the struct itself is freed.
void vmInitForwardPFB (vcsbeam_context *vm, int M, pfb_flags flags)
Create and initialise a forward_pfb struct.
Parameters:
    vm      The VCSBeam context struct
    M       The "stride" of the PFB
    flags   PFB configuration settings
Setting \(M = K\), where \(K\) is the value of vm->analysis_filter->nchans, will make this a "critically sampled PFB"; setting \(M > K\), an "oversampled PFB".

A filter must already be loaded into vm->analysis_filter. WARNING: the filter will be forcibly typecast to int!

The GPU processing will be divided into vm->chunks_per_second chunks per second; if this number does not divide the number of samples per second (10000) evenly, it is incremented until it does.
The available flags are given in the following table:

| Flag name                | Description |
|--------------------------|-------------|
| PFB_MALLOC_HOST_INPUT    | Allocate memory on CPU for the input |
| PFB_MALLOC_HOST_OUTPUT   | Allocate memory on CPU for the output |
| PFB_MALLOC_DEVICE_INPUT  | Allocate memory on GPU for the input |
| PFB_MALLOC_DEVICE_OUTPUT | Allocate memory on GPU for the output |
| PFB_MALLOC_ALL           | Synonym for PFB_MALLOC_HOST_INPUT \| PFB_MALLOC_HOST_OUTPUT \| PFB_MALLOC_DEVICE_INPUT \| PFB_MALLOC_DEVICE_OUTPUT |
| PFB_COMPLEX_INT4         | Typecast the output into (4+4)-bit complex integers |
| PFB_COMPLEX_FLOAT64      | Typecast the output into (64+64)-bit complex floats |
| PFB_IMAG_PART_FIRST      | Place the imaginary part of the output in the first (i.e. most significant) position |
| PFB_EMULATE_FPGA         | Perform the (asymmetric) rounding and demotion step in exactly the same way as the original (Phase 1 & 2) MWA FPGAs |
| PFB_SMART                | Synonym for PFB_MALLOC_HOST_INPUT \| PFB_MALLOC_DEVICE_INPUT \| PFB_MALLOC_DEVICE_OUTPUT \| PFB_EMULATE_FPGA \| PFB_COMPLEX_INT4 |
| PFB_FULL_PRECISION       | Synonym for PFB_MALLOC_HOST_INPUT \| PFB_MALLOC_DEVICE_INPUT \| PFB_MALLOC_DEVICE_OUTPUT \| PFB_COMPLEX_FLOAT64 |
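A hedged usage sketch of the forward-PFB call sequence documented on this page; how the vcsbeam_context is created and populated (including loading the analysis filter) is outside the scope of this page and is only stubbed here.

```c
// Sketch only: run the forward PFB using the functions documented on this
// page. Creating and populating 'vm' is assumed to have been done elsewhere.
vcsbeam_context *vm = /* ... obtained and populated elsewhere ... */;

int M = vm->analysis_filter->nchans;    // M = K: a critically sampled PFB
vmInitForwardPFB( vm, M, PFB_SMART );   // allocate buffers; emulate FPGA rounding; 4+4-bit output

vmExecuteForwardPFB( vm );              // per chunk: upload, WOLA, round/demote, FFT, pack
vmWritePFBOutputToFile( vm );           // write vm->fpfb->vcs_data to file

vmFreeForwardPFB( vm->fpfb );           // free host and device memory
```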
void vmPackChunk (vcsbeam_context *vm)
Calls the pack_into_recombined_format() kernel for the forward PFB data.
void vmUploadForwardPFBChunk (vcsbeam_context *vm)
Copies a chunk of PFB data from CPU to GPU.
The copy transfers one chunk of data from host memory to GPU memory:

- source (CPU): vm->v->buffer + c*hs
- destination (GPU): vm->fpfb->d_htr_data
- ds = vm->fpfb->d_htr_size
- hs = vm->fpfb->htr_stride
- c = vm->chunk_to_load % vm->chunks_per_second

Note that ds is not necessarily equal to hs, and that ds bytes are copied.
__global__ void vmWOLA_kernel (char2 *indata, int *filter_coeffs, void *weighted_overlap_array)
Performs the weighted overlap-add part of the PFB algorithm.
Parameters:
    [in]  indata                  The input data, \(x[n]\), with layout equivalent to the buffer populated by mwalib (see [the MWAX voltage format][MWAXHTR])
    [in]  filter_coeffs           The PFB filter coefficients, \(h[n]\)
    [out] weighted_overlap_array  The result of the weighted overlap-add operation, \(b[n]\)
The weighted overlap-add part of the PFB algorithm is the equation
\[ b_m[n] = \sum_{\rho=0}^{P-1} h[K\rho - n]\, x[n + mM - K\rho]. \]
See above for the meaning of the terms in this equation.
The expected thread configuration is \( \langle\langle\langle (\text{nspectra},I,P), K \rangle\rangle\rangle \)
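A serial, host-side transcription of the equation above is sketched below for a single RF input and a single output spectrum; out-of-range indices are simply skipped here, and the real kernel's index conventions and data layout may differ.

```c
// Weighted overlap-add reference sketch, following the equation above.
static void wola_one_spectrum( const float *x, int nx,       // input samples x[n]
                               const float *h, int N,        // filter h[n], N = K*P
                               int K, int M, int P, int m,   // channels, stride, taps, spectrum index
                               float *b )                    // output b_m[n], length K
{
    for (int n = 0; n < K; n++)
    {
        float sum = 0.0f;
        for (int rho = 0; rho < P; rho++)
        {
            int hi = K*rho - n;         // filter index, as in the equation
            int xi = n + m*M - K*rho;   // data index, as in the equation
            if (hi < 0 || hi >= N || xi < 0 || xi >= nx) continue;
            sum += h[hi] * x[xi];
        }
        b[n] = sum;
    }
}
```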
void vmWOLAChunk (vcsbeam_context *vm)
Calls the vmWOLA_kernel() kernel for the forward PFB data.
void vmWritePFBOutputToFile (vcsbeam_context *vm)
Writes the result of the forward PFB operation to file.
This function writes the contents of vm->fpfb->vcs_data to a file whose name is returned by the function vmGetLegacyVoltFilename().