I am looking at the Nvidia SDK for the convolution FFT example (for large kernels), I know the theory behind fourier transforms and their FFT implementations (the basics at least), but I can't figure out what the following code does:
const int fftH = snapTransformSize(dataH + kernelH - 1);
const int fftW = snapTransformSize(dataW + kernelW - 1);
....//gpu initialization code
printf("...creating R2C & C2R FFT plans for %i x %i\n", fftH, fftW);
cuf ftSafeCall( cufftPlan2d(&fftPlanFwd, fftH, fftW, CUFFT_R2C) );
cufftSafeCall( cufftPlan2d(&fftPlanInv, fftH, fftW, CUFFT_C2R) );
printf("...uploading to GPU and padding convolution kernel and input data\n");
cutilSafeCall( cudaMemcpy(d_Kernel, h_Kernel, kernelH * kernelW * sizeof(float), cudaMemcpyHostToDevice) );
cutilSafeCall( cudaMemcpy(d_Data, h_Data, dataH * dataW * sizeof(float), cudaMemcpyHostToDevice) );
cutilSafeCall( cudaMemset(d_PaddedKernel, 0, fftH * fftW * sizeof(float)) );
cutilSafeCall( cudaMemset(d_PaddedData, 0, fftH * fftW * sizeof(float)) );
padKernel(
d_PaddedKernel,
d_Kernel,
fftH,
fftW,
kernelH,
kernelW,
kernelY,
kernelX
);
padDataClampToBorder(
d_PaddedData,
d_Data,
fftH,
fftW,
dataH,
dataW,
kernelH,
kernelW,
kernelY,
kernelX
);
I've never used CUFFT library before so I don't know what the snapTransformSize does
(here's the code)
int snapTransformSize(int dataSize){
int hiBit;
unsigned int lowPOT, hiPOT;
dataSize = iAlignUp(dataSize, 16);
for(hiBit = 31; hiBit >= 0; hiBit--)
if(dataSize & (1U << hiBit)) break;
lowPOT = 1U << hiBit;
if(lowPOT == dataSize)
return dataSize;
hiPOT = 1U << (hiBit + 1);
if(hiPOT <= 1024)
return hiPOT;
else
return iAlignUp(dataSize, 512);
}
nor why the complex plane is such initialized.
Can you provide me explanation links or answers please?
It appears to be rounding up the FFT dimensions to the next power of 2, unless the dimension would exceed 1024, in which case it's rounded up to the next multiple of 512.
Having rounded up the FFT size you then of course need to pad your data with zeroes to make it the correct size for the FFT.
Note that the reason that we typically need to round up and pad for convolution is because each FFT dimension needs to be image_dimension + kernel_dimension - 1
, which is not normally a convenient number, such as a power of 2.