I am trying to perform an inplace real to complex FFT with cufft. I am aware of the similar question How to perform a Real to Complex Transformation with cuFFT. However I have issues trying to reproduce the same method.
If I do an out of place transformation, there is no problem, but as soon as I do it in place, I do not have the correct values in the FFT (Checked with python, using binary files in between). I do not have errors, but just non correct values.
Here is my code:
void fftCuda2d(mat3d* scene)
{
cufftResult resultStatus;
cudaError_t cuda_status;
cufftHandle plan_forward;
resultStatus = cufftPlan2d(&plan_forward, scene->_height, scene->_width, CUFFT_R2C);
cout << "Creating plan forward: " << _cudaGetErrorEnum(resultStatus) << endl;
cufftComplex *d_fft, *d_scene, *h_fft;
size_t size_fft = (int(scene->_width/2)+1)*scene->_height;
cudaMalloc((void**)&d_scene, sizeof(cufftComplex)*size_fft);
cudaMalloc((void**)&d_fft, sizeof(cufftComplex)*size_fft);
h_fft = (cufftComplex*) malloc(sizeof(cufftComplex)*size_fft);
cuda_status = cudaMemcpy(d_scene, scene->_pData, sizeof(cufftReal) * scene->_height * scene->_width, cudaMemcpyHostToDevice);
resultStatus = cufftExecR2C(plan_forward, (cufftReal*) d_scene, d_scene);
cuda_status = cudaMemcpy(h_fft, d_scene, sizeof(cufftReal)*scene->_height*scene->_width, cudaMemcpyDeviceToHost);
FILE* *pFileTemp;
pFileTemp = fopen("temp.bin", "wb");
check = fwrite(h_fft, sizeof(cufftComplex), sizeFft, pFileTemp);
}
If I use resultStatus = cufftExecR2C(plan_forward, (cufftReal*) d_scene, d_fft);
and save the output of d_fft
I have the correct result.
So you see any mistake of mine here?
P.S Mat3d is a struct where _width and _height contain the size of the matrix and pData is the pointer to the data but there is no issue with that.
(It seems like this should be a duplicate question but I was not able to locate the duplicate.)
Your input data needs to be organized differently (padded) when using an in-place transform. This is particularly noticeable in the 2D case, because each row of data must be padded.
In the non-inplace R2C transform, the input data is real-valued and of size height*width (for an example R=4, C=4 case):
X X X X
X X X X
X X X X
X X X X
The above data would occupy exactly 16*sizeof(cufftReal)
(assuming float
input data, dimension R = 4, C = 4), and it would be organized that way in memory, linearly, with no gaps. However, when we switch to an in-place transform, the size of the input buffer changes. And this change in size has ramifications for data arrangement. Specifically, the sizeof the input buffer is R*(C/2 + 1)*sizeof(cufftComplex)
. For the R=4, C=4 example case, that is 12*sizeof(cufftComplex)
or 24*sizeof(cufftReal)
, but it is still organized as 4 rows of data. Each row, therefore, is of length 6 (if measured in cufftReal
) or 3 (if measured in cufftComplex
). Considering it as cufftReal
, then when we create our input data, we must organize it like this:
X X X X P P
X X X X P P
X X X X P P
X X X X P P
where the P
locations are "padding" data, not your input data. If we view this linearly in memory, it looks like:
X X X X P P X X X X P P X X X X P P X X X X P P
That is the expectation/requirement of CUFFT (and I believe it is the same for FFTW). However since you made no changes to the way you deposited your data, you provided data that looks like this:
X X X X X X X X X X X X X X X X P P P P P P P P
and the difference in those 2 patterns is what accounts for the difference in the result output. There are a variety of ways to fix this. I'll choose to demonstrate using cudaMemcpy2D
to populate the device input buffer in the in-place case, which will give us the desired pattern. This may not be the best/fastest way, depending on your application needs.
You were also not copying the correct size of the result data from device back to host.
Here is a fixed example:
$ cat t1589.cu
#include <cufft.h>
#include <iostream>
#include <cstdlib>
struct mat3d{
int _width;
int _height;
cufftReal *_pData;
};
void fftCuda2d(mat3d* scene)
{
cufftResult resultStatus;
cudaError_t cuda_status;
cufftHandle plan_forward;
resultStatus = cufftPlan2d(&plan_forward, scene->_height, scene->_width, CUFFT_R2C);
std::cout << "Creating plan forward: " << (int)resultStatus << std::endl;
cufftComplex *d_fft, *d_scene, *h_fft;
size_t size_fft = (int(scene->_width/2)+1)*scene->_height;
cudaMalloc((void**)&d_scene, sizeof(cufftComplex)*size_fft);
cudaMalloc((void**)&d_fft, sizeof(cufftComplex)*size_fft);
h_fft = (cufftComplex*) malloc(sizeof(cufftComplex)*size_fft);
#ifdef USE_IP
cuda_status = cudaMemcpy2D(d_scene, ((scene->_width/2)+1)*sizeof(cufftComplex), scene->_pData, (scene->_width)*sizeof(cufftReal), sizeof(cufftReal) * scene->_width, scene->_height, cudaMemcpyHostToDevice);
resultStatus = cufftExecR2C(plan_forward, (cufftReal*) d_scene, d_scene);
cuda_status = cudaMemcpy(h_fft, d_scene, sizeof(cufftComplex)*size_fft, cudaMemcpyDeviceToHost);
#else
cuda_status = cudaMemcpy(d_scene, scene->_pData, sizeof(cufftReal) * scene->_height * scene->_width, cudaMemcpyHostToDevice);
resultStatus = cufftExecR2C(plan_forward, (cufftReal*) d_scene, d_fft);
cuda_status = cudaMemcpy(h_fft, d_fft, sizeof(cufftComplex)*size_fft, cudaMemcpyDeviceToHost);
#endif
std::cout << "exec: " << (int)resultStatus << std::endl;
for (int i = 0; i < size_fft; i++)
std::cout << h_fft[i].x << " " << h_fft[i].y << ",";
std::cout << std::endl;
}
const int dim = 4;
int main(){
mat3d myScene;
myScene._pData = new cufftReal[dim*dim];
myScene._width = dim;
myScene._height = dim;
for (int i = 0; i < dim*dim; i++) myScene._pData[i] = rand()/(float)RAND_MAX;
fftCuda2d(&myScene);
std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
}
$ nvcc -lineinfo -o t1589 t1589.cu -lcufft
t1589.cu(15): warning: variable "cuda_status" was set but never used
$ ./t1589
Creating plan forward: 0
exec: 0
9.71338 0,-0.153554 1.45243,0.171302 0,0.878097 0.533959,0.424595 -0.834714,0.858133 -0.393671,-0.205139 0,-0.131513 -0.494514,-0.165712 0,0.878097 -0.533959,0.0888268 1.49303,0.858133 0.393671,
no error
$ nvcc -lineinfo -o t1589 t1589.cu -lcufft -DUSE_IP
t1589.cu(15): warning: variable "cuda_status" was set but never used
$ ./t1589
Creating plan forward: 0
exec: 0
9.71338 0,-0.153554 1.45243,0.171302 0,0.878097 0.533959,0.424595 -0.834714,0.858133 -0.393671,-0.205139 0,-0.131513 -0.494514,-0.165712 0,0.878097 -0.533959,0.0888268 1.49303,0.858133 0.393671,
no error
$