I have the following code that has bug when doing the inverse FFT. The forward FFT works as I printed the output and verified it. But the inverse does not seem to. Any ideas? Does it look like I have missed a concept?
Code - http://pastebin.com/iZYtdcqR
EDIT - I have essentially rewritten the code that comes with the CUDA toolkit samples. I am trying to perform a convolution using FFT but with a modified algorithm (DIF actually.)
EDIT2 - dding code to the question.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cufft.h>
typedef enum signaltype {REAL, COMPLEX} signal;
typedef float2 Complex;
void
printData(Complex *a, int size, char *msg) {
if (msg == "") printf("\n");
else printf("%s\n", msg);
for (int i = 0; i < size; i++)
printf("%f %f\n", a[i].x, a[i].y);
}
void
normData(Complex *a, int size, float norm) {
for (int i = 0; i < size; i++) {
a[i].x /= norm;
a[i].y /= norm;
}
}
void
randomFill(Complex *h_signal, int size, int flag) {
// Real signal.
if (flag == REAL) {
for (int i = 0; i < size; i++) {
h_signal[i].x = rand() / (float) RAND_MAX;
h_signal[i].y = 0;
}
}
}
// FFT a signal that's on the _DEVICE_.
void
signalFFT(Complex *d_signal, int signal_size) {
cufftHandle plan;
if (cufftPlan1d(&plan, signal_size, CUFFT_C2C, 1) != CUFFT_SUCCESS) {
printf("Failed to plan FFT\n");
exit(0);
}
// Execute the plan.
if (cufftExecC2C(plan, (cufftComplex *) d_signal, (cufftComplex *) d_signal, CUFFT_FORWARD) != CUFFT_SUCCESS) {
printf ("Failed Executing FFT\n");
exit(0);
}
}
void
signalIFFT(Complex *d_signal, int signal_size) {
cufftHandle plan;
if (cufftPlan1d(&plan, signal_size, CUFFT_C2C, 1) != CUFFT_SUCCESS) {
printf("Failed to plan IFFT\n");
exit(0);
}
// Execute the plan.
if (cufftExecC2C(plan, (cufftComplex *) d_signal, (cufftComplex *) d_signal, CUFFT_INVERSE) != CUFFT_SUCCESS) {
printf ("Failed Executing IFFT\n");
exit(0);
}
}
int main()
{
Complex *h_signal, *d_signal1;
int alloc_size, i;
alloc_size = 16;
// Kernel Block and Grid Size.
const dim3 blockSize(16, 16, 1);
const dim3 gridSize(alloc_size / 16 + 1, alloc_size / 16 + 1, 1);
h_signal = (Complex *) malloc(sizeof(Complex) * alloc_size);
cudaMalloc(&d_signal1, sizeof(Complex) * alloc_size);
if (cudaGetLastError() != cudaSuccess){
printf("Cuda error: Failed to allocate\n");
exit(0);
}
//cudaMalloc(&d_signal2, sizeof(Complex) * alloc_size);
// Add random data to signal.
randomFill(h_signal, alloc_size, REAL);
printData(h_signal, alloc_size, "Random H1");
cudaMemcpy(d_signal1, h_signal, sizeof(Complex) * alloc_size, cudaMemcpyHostToDevice);
signalFFT(d_signal1, alloc_size);
signalIFFT(d_signal1, alloc_size);
cudaDeviceSynchronize();
cudaMemcpy(h_signal, d_signal1, sizeof(Complex) * alloc_size, cudaMemcpyDeviceToHost);
printData(h_signal, alloc_size, "IFFT");
return 0;
}
Suggestions for writing a good question:
Another note:
Now regarding your question, I ran your code and got results like this:
Random H1
0.840188 0.000000
0.394383 0.000000
0.783099 0.000000
0.798440 0.000000
0.911647 0.000000
0.197551 0.000000
0.335223 0.000000
0.768230 0.000000
0.277775 0.000000
0.553970 0.000000
0.477397 0.000000
0.628871 0.000000
0.364784 0.000000
0.513401 0.000000
0.952230 0.000000
0.916195 0.000000
IFFT
13.443005 0.000000
6.310127 -0.000000
12.529589 0.000000
12.775041 0.000000
14.586359 -0.000000
3.160823 0.000000
5.363565 0.000000
12.291674 -0.000000
4.444397 -0.000000
8.863521 0.000000
7.638353 0.000000
10.061934 -0.000000
5.836554 0.000000
8.214415 -0.000000
15.235678 -0.000000
14.659121 -0.000000
The only thing that seems to be missing from your code is that you didn't divide the results by the length of the transform (16 in this case) to get back the original data (as suggested in the sample code here). When I do that, I get what I believe is expected results:
Random H1
0.840188 0.000000
0.394383 0.000000
0.783099 0.000000
0.798440 0.000000
0.911647 0.000000
0.197551 0.000000
0.335223 0.000000
0.768230 0.000000
0.277775 0.000000
0.553970 0.000000
0.477397 0.000000
0.628871 0.000000
0.364784 0.000000
0.513401 0.000000
0.952230 0.000000
0.916195 0.000000
IFFT
0.840188 0.000000
0.394383 -0.000000
0.783099 0.000000
0.798440 0.000000
0.911647 -0.000000
0.197551 0.000000
0.335223 0.000000
0.768230 -0.000000
0.277775 -0.000000
0.553970 0.000000
0.477397 0.000000
0.628871 -0.000000
0.364785 0.000000
0.513401 -0.000000
0.952230 -0.000000
0.916195 -0.000000
By the way, kudos for providing a complete, compilable code sample.