Cuda does not have any direct implementation of 4D FFT. Hence I want to decompose a 4D FFT into 4 x 1D FFTs into X, Y, Z, and W dimensions. I understand that the cufftPlanMany API is best suited for this as it eliminates the use of any for loops and hence it is much faster.
I have written a program for exactly that. However, the final result of the 4D FFT does not match the 4D FFTW implementation.
Below are the two implementations using FFTW and Cuda libraries, respectively. I have carefully chosen the batch, stride, and dist for the cufftPlanMany function. However, I don't understand what I am doing wrong. Any help will be appreciated.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <fftw3.h>
#define PRINT_FLAG 1
#define NPRINTS 5 // print size
void printf_fftw_cmplx_array(fftw_complex *complex_array, unsigned int size) {
for (unsigned int i = 0; i < NPRINTS; ++i) {
printf(" (%2.4f, %2.4fi)\n", complex_array[i][0], complex_array[i][1]);
}
printf("...\n");
for (unsigned int i = size - NPRINTS; i < size; ++i) {
printf(" (%2.4f, %2.4fi)\n", complex_array[i][0], complex_array[i][1]);
}
}
float run_test_fftw_4d(unsigned int nx, unsigned int ny, unsigned int nz, unsigned int nw) {
srand(2025);
// Declaration
fftw_complex *complex_data;
fftw_plan plan;
unsigned int element_size = nx * ny * nz * nw;
size_t size = sizeof(fftw_complex) * element_size;
clock_t start, stop;
float elapsed_time;
// Allocate memory for input and output arrays
complex_data = (fftw_complex *)fftw_malloc(size);
// Initialize input complex signal
for (unsigned int i = 0; i < element_size; ++i) {
complex_data[i][0] = rand() / (float)RAND_MAX;
complex_data[i][1] = 0;
}
// Print input stuff
if (PRINT_FLAG) {
printf("Complex data...\n");
printf_fftw_cmplx_array(complex_data, element_size);
}
// Setup the FFT plan
plan = fftw_plan_dft(4, (int[]){nx, ny, nz, nw}, complex_data, complex_data, FFTW_FORWARD, FFTW_ESTIMATE);
// Start time
start = clock();
// Execute the FFT
fftw_execute(plan);
// End time
stop = clock();
// Print output stuff
if (PRINT_FLAG) {
printf("Fourier Coefficients...\n");
printf_fftw_cmplx_array(complex_data, element_size);
}
// Compute elapsed time
elapsed_time = (double)(stop - start) / CLOCKS_PER_SEC;
// Clean up
fftw_destroy_plan(plan);
fftw_free(complex_data);
fftw_cleanup();
return elapsed_time;
}
int main(int argc, char **argv) {
if (argc != 6) {
printf("Error: This program requires exactly 5 command-line arguments.\n");
printf(" %s <arg0> <arg1> <arg2> <arg3> <arg4>\n", argv[0]);
printf(" arg0, arg1, arg2, arg3: FFT lengths in 4D\n");
printf(" arg4: Number of iterations\n");
printf(" e.g.: %s 64 64 64 64 5\n", argv[0]);
return -1;
}
unsigned int nx = atoi(argv[1]);
unsigned int ny = atoi(argv[2]);
unsigned int nz = atoi(argv[3]);
unsigned int nw = atoi(argv[4]);
unsigned int niter = atoi(argv[5]);
float sum = 0.0;
float span_s = 0.0;
for (unsigned int i = 0; i < niter; ++i) {
span_s = run_test_fftw_4d(nx, ny, nz, nw);
if (PRINT_FLAG) printf("[%d]: %.6f s\n", i, span_s);
sum += span_s;
}
printf("%.6f\n", sum/(float)niter);
return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <math.h>
#define PRINT_FLAG 1
#define NPRINTS 5 // print size
#define CHECK_CUDA(call) \
{ \
const cudaError_t error = call; \
if (error != cudaSuccess) \
{ \
fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__); \
fprintf(stderr, "code: %d, reason: %s\n", error, \
cudaGetErrorString(error)); \
exit(EXIT_FAILURE); \
} \
}
#define CHECK_CUFFT(call) \
{ \
cufftResult error; \
if ( (error = (call)) != CUFFT_SUCCESS) \
{ \
fprintf(stderr, "Got CUFFT error %d at %s:%d\n", error, __FILE__, \
__LINE__); \
exit(EXIT_FAILURE); \
} \
}
void printf_cufft_cmplx_array(cufftComplex *complex_array, unsigned int size) {
for (unsigned int i = 0; i < NPRINTS; ++i) {
printf(" (%2.4f, %2.4fi)\n", complex_array[i].x, complex_array[i].y);
}
printf("...\n");
for (unsigned int i = size - NPRINTS; i < size; ++i) {
printf(" (%2.4f, %2.4fi)\n", complex_array[i].x, complex_array[i].y);
}
}
float run_test_cufft_4d_4x1d(unsigned int nx, unsigned int ny, unsigned int nz, unsigned int nw) {
srand(2025);
// Declaration
cufftComplex *complex_data;
cufftComplex *d_complex_data;
cufftHandle plan1d_x, plan1d_y, plan1d_z, plan1d_w;
unsigned int element_size = nx * ny * nz * nw;
size_t size = sizeof(cufftComplex) * element_size;
cudaEvent_t start, stop;
float elapsed_time;
// Allocate memory for the variables on the host
complex_data = (cufftComplex *)malloc(size);
// Initialize input complex signal
for (unsigned int i = 0; i < element_size; ++i) {
complex_data[i].x = rand() / (float)RAND_MAX;
complex_data[i].y = 0;
}
// Print input stuff
if (PRINT_FLAG) {
printf("Complex data...\n");
printf_cufft_cmplx_array(complex_data, element_size);
}
// Create CUDA events
CHECK_CUDA(cudaEventCreate(&start));
CHECK_CUDA(cudaEventCreate(&stop));
// Allocate device memory for complex signal and output frequency
CHECK_CUDA(cudaMalloc((void **)&d_complex_data, size));
int n[1] = { (int)nx };
int embed[1] = { (int)nx };
CHECK_CUFFT(cufftPlanMany(&plan1d_x, 1, n, // 1D FFT of size nx
embed, ny * nz * nw, 1, // inembed, istride, idist
embed, ny * nz * nw, 1, // onembed, ostride, odist
CUFFT_C2C, ny * nz * nw));
n[0] = (int)ny;
embed[0] = (int)ny;
CHECK_CUFFT(cufftPlanMany(&plan1d_y, 1, n, // 1D FFT of size ny
embed, nz * nw, 1, // inembed, istride, idist
embed, nz * nw, 1, // onembed, ostride, odist
CUFFT_C2C, nx * nz * nw));
n[0] = (int)nz;
embed[0] = (int)nz;
CHECK_CUFFT(cufftPlanMany(&plan1d_z, 1, n, // 1D FFT of size nz
embed, nw, 1, // inembed, istride, idist
embed, nw, 1, // onembed, ostride, odist
CUFFT_C2C, nx * ny * nw));
n[0] = (int)nw;
embed[0] = (int)nw;
CHECK_CUFFT(cufftPlanMany(&plan1d_w, 1, n, // 1D FFT of size nw
embed, 1, nw, // inembed, istride, idist
embed, 1, nw, // onembed, ostride, odist
CUFFT_C2C, nx * ny * nz));
// Record the start event
CHECK_CUDA(cudaEventRecord(start, 0));
// Copy host memory to device
CHECK_CUDA(cudaMemcpy(d_complex_data, complex_data, size, cudaMemcpyHostToDevice));
// Perform FFT along each dimension sequentially
CHECK_CUFFT(cufftExecC2C(plan1d_x, d_complex_data, d_complex_data, CUFFT_FORWARD));
CHECK_CUFFT(cufftDestroy(plan1d_x));
CHECK_CUFFT(cufftExecC2C(plan1d_y, d_complex_data, d_complex_data, CUFFT_FORWARD));
CHECK_CUFFT(cufftDestroy(plan1d_y));
CHECK_CUFFT(cufftExecC2C(plan1d_z, d_complex_data, d_complex_data, CUFFT_FORWARD));
CHECK_CUFFT(cufftDestroy(plan1d_z));
CHECK_CUFFT(cufftExecC2C(plan1d_w, d_complex_data, d_complex_data, CUFFT_FORWARD));
CHECK_CUFFT(cufftDestroy(plan1d_w));
// Retrieve the results into host memory
CHECK_CUDA(cudaMemcpy(complex_data, d_complex_data, size, cudaMemcpyDeviceToHost));
// Record the stop event
CHECK_CUDA(cudaEventRecord(stop, 0));
CHECK_CUDA(cudaEventSynchronize(stop));
// Print output stuff
if (PRINT_FLAG) {
printf("Fourier Coefficients...\n");
printf_cufft_cmplx_array(complex_data, element_size);
}
// Compute elapsed time
CHECK_CUDA(cudaEventElapsedTime(&elapsed_time, start, stop));
// Clean up
CHECK_CUDA(cudaFree(d_complex_data));
CHECK_CUDA(cudaEventDestroy(start));
CHECK_CUDA(cudaEventDestroy(stop));
free(complex_data);
return elapsed_time * 1e-3;
}
int main(int argc, char **argv) {
if (argc != 6) {
printf("Error: This program requires exactly 5 command-line arguments.\n");
printf(" %s <arg0> <arg1> <arg2> <arg3> <arg4>\n", argv[0]);
printf(" arg0, arg1, arg2, arg3: FFT lengths in 4D\n");
printf(" arg4: Number of iterations\n");
printf(" e.g.: %s 64 64 64 64 5\n", argv[0]);
return -1;
}
unsigned int nx = atoi(argv[1]);
unsigned int ny = atoi(argv[2]);
unsigned int nz = atoi(argv[3]);
unsigned int nw = atoi(argv[4]);
unsigned int niter = atoi(argv[5]);
float sum = 0.0;
float span_s = 0.0;
for (unsigned int i = 0; i < niter; ++i) {
span_s = run_test_cufft_4d_4x1d(nx, ny, nz, nw);
if (PRINT_FLAG) printf("[%d]: %.6f s\n", i, span_s);
sum += span_s;
}
printf("%.6f\n", sum/(float)niter);
CHECK_CUDA(cudaDeviceReset());
return 0;
}
Try both implementations for 4x4x4x4 array, and you will notice only the first couple of coefficients match. I know that the FFTW implementation produces the correct result, as I can get the same result in different ways such as a 3D FFT followed by a 1D FFT or 2 x 2D FFTs using both FFTW and cuFFT libraries.
Thanks to @Robert Crovella's answer, I have modified their program to satisfy dimensions of unequal size. It seems their code will be correct only if all the dimension sizes are equal. To correct for this, I am posting my solution below:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <math.h>
#define PRINT_FLAG 1
#define NPRINTS 5 // print size
#define CHECK_CUDA(call) \
{ \
const cudaError_t error = call; \
if (error != cudaSuccess) \
{ \
fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__); \
fprintf(stderr, "code: %d, reason: %s\n", error, \
cudaGetErrorString(error)); \
exit(EXIT_FAILURE); \
} \
}
#define CHECK_CUFFT(call) \
{ \
cufftResult error; \
if ( (error = (call)) != CUFFT_SUCCESS) \
{ \
fprintf(stderr, "Got CUFFT error %d at %s:%d\n", error, __FILE__, \
__LINE__); \
exit(EXIT_FAILURE); \
} \
}
void printf_cufft_cmplx_array(cufftComplex *complex_array, unsigned int size) {
for (unsigned int i = 0; i < NPRINTS; ++i) {
printf(" (%2.4f, %2.4fi)\n", complex_array[i].x, complex_array[i].y);
}
printf("...\n");
for (unsigned int i = size - NPRINTS; i < size; ++i) {
printf(" (%2.4f, %2.4fi)\n", complex_array[i].x, complex_array[i].y);
}
}
// Function to execute 1D FFT along a specific dimension
void execute_fft(cufftComplex *d_data, int dim_size, int batch_size) {
cufftHandle plan;
int n[1] = { dim_size };
int embed[1] = { dim_size };
CHECK_CUFFT(cufftPlanMany(&plan, 1, n,
embed, 1, dim_size,
embed, 1, dim_size,
CUFFT_C2C, batch_size));
// Perform FFT
CHECK_CUFFT(cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD));
CHECK_CUFFT(cufftDestroy(plan));
}
__global__ void do_circular_transpose(cufftComplex *d_out, cufftComplex *d_in, int nx, int ny, int nz, int nw) {
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int z = blockDim.z * blockIdx.z + threadIdx.z;
if (x < nx && y < ny && z < nz) {
for (int w = 0; w < nw; w++) {
int in_idx = ((x * ny + y) * nz + z) * nw + w;
int out_idx = ((y * nz + z) * nw + w) * nx + x;
d_out[out_idx] = d_in[in_idx];
}
}
}
float run_test_cufft_4d_4x1d(unsigned int nx, unsigned int ny, unsigned int nz, unsigned int nw) {
srand(2025);
// Declaration
cufftComplex *complex_data;
cufftComplex *d_complex_data;
cufftComplex *d_complex_data_swap;
unsigned int element_size = nx * ny * nz * nw;
size_t size = sizeof(cufftComplex) * element_size;
cudaEvent_t start, stop;
float elapsed_time;
// Allocate memory for the variables on the host
complex_data = (cufftComplex *)malloc(size);
// Initialize input complex signal
for (unsigned int i = 0; i < element_size; ++i) {
complex_data[i].x = rand() / (float)RAND_MAX;
complex_data[i].y = 0;
}
// Print input stuff
if (PRINT_FLAG) {
printf("Complex data...\n");
printf_cufft_cmplx_array(complex_data, element_size);
}
// Create CUDA events
CHECK_CUDA(cudaEventCreate(&start));
CHECK_CUDA(cudaEventCreate(&stop));
// Allocate device memory for complex signal and output frequency
CHECK_CUDA(cudaMalloc((void **)&d_complex_data, size));
CHECK_CUDA(cudaMalloc((void **)&d_complex_data_swap, size));
dim3 threads(8, 8, 8);
dim3 blocks((nx + threads.x - 1) / threads.x, (ny + threads.y - 1) / threads.y, (nz + threads.z - 1) / threads.z);
// Record the start event
CHECK_CUDA(cudaEventRecord(start, 0));
// Copy host memory to device
CHECK_CUDA(cudaMemcpy(d_complex_data, complex_data, size, cudaMemcpyHostToDevice));
// Perform FFT along each dimension sequentially
// Help from: https://forums.developer.nvidia.com/t/3d-and-4d-indexing-4d-fft/12564/2
// and https://stackoverflow.com/questions/79574267/what-is-the-correct-way-to-perform-4d-fft-in-cuda-by-implementing-1d-fft-in-each
// step 1: do 1-D FFT along w with number of element nw and batch=nx ny nz
execute_fft(d_complex_data, nw, nx * ny * nz);
// step 2: do tranpose operation A(x,y,z,w) → A(y,z,w,x)
do_circular_transpose<<<blocks, threads>>>(d_complex_data_swap, d_complex_data, nx, ny, nz, nw);
// step 3: do 1-D FFT along x with number of element nx and batch=n2n3n4
execute_fft(d_complex_data_swap, nx, ny * nz * nw);
// step 4: do tranpose operation A(y,z,w,x) → A(z,w,x,y)
do_circular_transpose<<<blocks, threads>>>(d_complex_data, d_complex_data_swap, ny, nz, nw, nx);
// step 5: do 1-D FFT along y with number of element ny and batch=n3n4n1
execute_fft(d_complex_data, ny, nx * nz * nw);
// step 6: do tranpose operation A(z,w,x,y) → A(w,x,y,z)
do_circular_transpose<<<blocks, threads>>>(d_complex_data_swap, d_complex_data, nz, nw, nx, ny);
// step 7: do 1-D FFT along z with number of element nz and batch=n4n1n2
execute_fft(d_complex_data_swap, nz, nx * ny * nw);
// step 8: do tranpose operation A(w,x,y,z) → A(x,y,z,w)
do_circular_transpose<<<blocks, threads>>>(d_complex_data, d_complex_data_swap, nw, nx, ny, nz);
// Retrieve the results into host memory
CHECK_CUDA(cudaMemcpy(complex_data, d_complex_data, size, cudaMemcpyDeviceToHost));
// Record the stop event
CHECK_CUDA(cudaEventRecord(stop, 0));
CHECK_CUDA(cudaEventSynchronize(stop));
// Print output stuff
if (PRINT_FLAG) {
printf("Fourier Coefficients...\n");
printf_cufft_cmplx_array(complex_data, element_size);
}
// Compute elapsed time
CHECK_CUDA(cudaEventElapsedTime(&elapsed_time, start, stop));
// Clean up
CHECK_CUDA(cudaFree(d_complex_data));
CHECK_CUDA(cudaFree(d_complex_data_swap));
CHECK_CUDA(cudaEventDestroy(start));
CHECK_CUDA(cudaEventDestroy(stop));
free(complex_data);
return elapsed_time * 1e-3;
}
int main(int argc, char **argv) {
if (argc != 6) {
printf("Error: This program requires exactly 5 command-line arguments.\n");
printf(" %s <arg0> <arg1> <arg2> <arg3> <arg4>\n", argv[0]);
printf(" arg0, arg1, arg2, arg3: FFT lengths in 4D\n");
printf(" arg4: Number of iterations\n");
printf(" e.g.: %s 64 64 64 64 5\n", argv[0]);
return -1;
}
unsigned int nx = atoi(argv[1]);
unsigned int ny = atoi(argv[2]);
unsigned int nz = atoi(argv[3]);
unsigned int nw = atoi(argv[4]);
unsigned int niter = atoi(argv[5]);
float sum = 0.0;
float span_s = 0.0;
for (unsigned int i = 0; i < niter; ++i) {
span_s = run_test_cufft_4d_4x1d(nx, ny, nz, nw);
if (PRINT_FLAG) printf("[%d]: %.6f s\n", i, span_s);
sum += span_s;
}
printf("%.6f\n", sum/(float)niter);
CHECK_CUDA(cudaDeviceReset());
return 0;
}
Note that I am using cufftComplex
as my primary data type, as I needed single precision floating point calculations, feel free to use cufftDoubleComplex
as they suggested earlier.
After building and compilation, the correct output would be:
$ ./cufft4d 4 4 4 4 1
Complex data...
(0.2005, 0.0000i)
(0.4584, 0.0000i)
(0.8412, 0.0000i)
(0.6970, 0.0000i)
(0.3846, 0.0000i)
...
(0.5214, 0.0000i)
(0.3179, 0.0000i)
(0.9771, 0.0000i)
(0.1417, 0.0000i)
(0.5867, 0.0000i)
Fourier Coefficients...
(121.0454, 0.0000i)
(-1.6709, -1.3923i)
(-12.7056, 0.0000i)
(-1.6709, 1.3923i)
(-1.3997, -3.1249i)
...
(1.0800, 0.8837i)
(2.0585, -2.7097i)
(1.1019, 1.7167i)
(4.9727, 0.1244i)
(-1.2561, 0.6645i)
[0]: 0.001198 s
0.001198
$ ./cufft4d 4 5 6 7 1
Complex data...
(0.2005, 0.0000i)
(0.4584, 0.0000i)
(0.8412, 0.0000i)
(0.6970, 0.0000i)
(0.3846, 0.0000i)
...
(0.3909, 0.0000i)
(0.0662, 0.0000i)
(0.6360, 0.0000i)
(0.1895, 0.0000i)
(0.7450, 0.0000i)
Fourier Coefficients...
(426.6703, 0.0000i)
(9.5928, 6.2723i)
(-1.2947, -7.8418i)
(-5.1845, -0.6342i)
(-5.1845, 0.6342i)
...
(-2.9402, 0.1377i)
(5.8364, -3.5697i)
(4.8288, -3.2658i)
(-2.5617, -7.8667i)
(-4.2289, -0.3572i)
[0]: 0.001193 s
0.001193
These results match with FFTW.