What is the correct way to perform 4D FFT in Cuda by implementing 1D FFT in each dimension using cufftPlanMany API

Cuda does not have any direct implementation of 4D FFT. Hence I want to decompose a 4D FFT into 4 x 1D FFTs into X, Y, Z, and W dimensions. I understand that the cufftPlanMany API is best suited for this as it eliminates the use of any for loops and hence it is much faster.

I have written a program for exactly that. However, the final result of the 4D FFT does not match the 4D FFTW implementation.

Below are the two implementations using FFTW and Cuda libraries, respectively. I have carefully chosen the batch, stride, and dist for the cufftPlanMany function. However, I don't understand what I am doing wrong. Any help will be appreciated.

FFTW 4D implementation

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <fftw3.h>

#define PRINT_FLAG 1
#define NPRINTS 5  // print size

void printf_fftw_cmplx_array(fftw_complex *complex_array, unsigned int size) {
    for (unsigned int i = 0; i < NPRINTS; ++i) {
        printf("  (%2.4f, %2.4fi)\n", complex_array[i][0], complex_array[i][1]);
    }
    printf("...\n");
    for (unsigned int i = size - NPRINTS; i < size; ++i) {
        printf("  (%2.4f, %2.4fi)\n", complex_array[i][0], complex_array[i][1]);
    }
}

float run_test_fftw_4d(unsigned int nx, unsigned int ny, unsigned int nz, unsigned int nw) {
    srand(2025);

    // Declaration
    fftw_complex *complex_data;
    fftw_plan plan;

    unsigned int element_size = nx * ny * nz * nw;
    size_t size = sizeof(fftw_complex) * element_size;

    clock_t start, stop;
    float elapsed_time;

    // Allocate memory for input and output arrays
    complex_data = (fftw_complex *)fftw_malloc(size);

    // Initialize input complex signal
    for (unsigned int i = 0; i < element_size; ++i) {
        complex_data[i][0] = rand() / (float)RAND_MAX;
        complex_data[i][1] = 0;
    }

    // Print input stuff
    if (PRINT_FLAG) {
        printf("Complex data...\n");
        printf_fftw_cmplx_array(complex_data, element_size);
    }

    // Setup the FFT plan
    plan = fftw_plan_dft(4, (int[]){nx, ny, nz, nw}, complex_data, complex_data, FFTW_FORWARD, FFTW_ESTIMATE);

    // Start time
    start = clock();
    
    // Execute the FFT
    fftw_execute(plan);

    // End time
    stop = clock();

    // Print output stuff
    if (PRINT_FLAG) {
        printf("Fourier Coefficients...\n");
        printf_fftw_cmplx_array(complex_data, element_size);
    }

    // Compute elapsed time
    elapsed_time = (double)(stop - start) / CLOCKS_PER_SEC;

    // Clean up
    fftw_destroy_plan(plan);
    fftw_free(complex_data);
    fftw_cleanup();

    return elapsed_time;
}


int main(int argc, char **argv) {
    if (argc != 6) {
        printf("Error: This program requires exactly 5 command-line arguments.\n");
        printf("       %s <arg0> <arg1> <arg2> <arg3> <arg4>\n", argv[0]);
        printf("       arg0, arg1, arg2, arg3: FFT lengths in 4D\n");
        printf("       arg4: Number of iterations\n");
        printf("       e.g.: %s 64 64 64 64 5\n", argv[0]);
        return -1;
    }

    unsigned int nx = atoi(argv[1]);
    unsigned int ny = atoi(argv[2]);
    unsigned int nz = atoi(argv[3]);
    unsigned int nw = atoi(argv[4]);
    unsigned int niter = atoi(argv[5]);

    float sum = 0.0;
    float span_s = 0.0;
    for (unsigned int i = 0; i < niter; ++i) {
        span_s = run_test_fftw_4d(nx, ny, nz, nw);
        if (PRINT_FLAG) printf("[%d]: %.6f s\n", i, span_s);
        sum += span_s;
    }
    printf("%.6f\n", sum/(float)niter);

    return 0;
}

Erroneous cuFFT4D implementation

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h> 
#include <cufft.h>
#include <math.h>

#define PRINT_FLAG 1
#define NPRINTS 5  // print size

#define CHECK_CUDA(call)                                                       \
{                                                                              \
    const cudaError_t error = call;                                            \
    if (error != cudaSuccess)                                                  \
    {                                                                          \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
        fprintf(stderr, "code: %d, reason: %s\n", error,                       \
                cudaGetErrorString(error));                                    \
        exit(EXIT_FAILURE);                                                    \
    }                                                                          \
}

#define CHECK_CUFFT(call)                                                      \
{                                                                              \
    cufftResult error;                                                         \
    if ( (error = (call)) != CUFFT_SUCCESS)                                      \
    {                                                                          \
        fprintf(stderr, "Got CUFFT error %d at %s:%d\n", error, __FILE__,      \
                __LINE__);                                                     \
        exit(EXIT_FAILURE);                                                    \
    }                                                                          \
}

void printf_cufft_cmplx_array(cufftComplex *complex_array, unsigned int size) {
    for (unsigned int i = 0; i < NPRINTS; ++i) {
        printf("  (%2.4f, %2.4fi)\n", complex_array[i].x, complex_array[i].y);
    }
    printf("...\n");
    for (unsigned int i = size - NPRINTS; i < size; ++i) {
        printf("  (%2.4f, %2.4fi)\n", complex_array[i].x, complex_array[i].y);
    }
}

float run_test_cufft_4d_4x1d(unsigned int nx, unsigned int ny, unsigned int nz, unsigned int nw) {
    srand(2025);

    // Declaration
    cufftComplex *complex_data;
    cufftComplex *d_complex_data;
    cufftHandle plan1d_x, plan1d_y, plan1d_z, plan1d_w;

    unsigned int element_size = nx * ny * nz * nw;
    size_t size = sizeof(cufftComplex) * element_size;

    cudaEvent_t start, stop;
    float elapsed_time;

    // Allocate memory for the variables on the host
    complex_data = (cufftComplex *)malloc(size);

    // Initialize input complex signal
    for (unsigned int i = 0; i < element_size; ++i) {
        complex_data[i].x = rand() / (float)RAND_MAX;
        complex_data[i].y = 0;
    }

    // Print input stuff
    if (PRINT_FLAG) {
        printf("Complex data...\n");
        printf_cufft_cmplx_array(complex_data, element_size);
    }

    // Create CUDA events
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    // Allocate device memory for complex signal and output frequency
    CHECK_CUDA(cudaMalloc((void **)&d_complex_data, size));

    int n[1] = { (int)nx };
    int embed[1] = { (int)nx };
    CHECK_CUFFT(cufftPlanMany(&plan1d_x, 1, n,       // 1D FFT of size nx
                            embed, ny * nz * nw, 1, // inembed, istride, idist
                            embed, ny * nz * nw, 1, // onembed, ostride, odist
                            CUFFT_C2C, ny * nz * nw));
    n[0] = (int)ny;
    embed[0] = (int)ny;
    CHECK_CUFFT(cufftPlanMany(&plan1d_y, 1, n,       // 1D FFT of size ny
                            embed, nz * nw, 1, // inembed, istride, idist
                            embed, nz * nw, 1, // onembed, ostride, odist
                            CUFFT_C2C, nx * nz * nw));
    n[0] = (int)nz;
    embed[0] = (int)nz;
    CHECK_CUFFT(cufftPlanMany(&plan1d_z, 1, n,       // 1D FFT of size nz
                            embed, nw, 1, // inembed, istride, idist
                            embed, nw, 1, // onembed, ostride, odist
                            CUFFT_C2C, nx * ny * nw));
    n[0] = (int)nw;
    embed[0] = (int)nw;
    CHECK_CUFFT(cufftPlanMany(&plan1d_w, 1, n,       // 1D FFT of size nw
                            embed, 1, nw, // inembed, istride, idist
                            embed, 1, nw, // onembed, ostride, odist
                            CUFFT_C2C, nx * ny * nz));

    // Record the start event
    CHECK_CUDA(cudaEventRecord(start, 0));

    // Copy host memory to device
    CHECK_CUDA(cudaMemcpy(d_complex_data, complex_data, size, cudaMemcpyHostToDevice));

    // Perform FFT along each dimension sequentially
    CHECK_CUFFT(cufftExecC2C(plan1d_x, d_complex_data, d_complex_data, CUFFT_FORWARD));
    CHECK_CUFFT(cufftDestroy(plan1d_x));
    CHECK_CUFFT(cufftExecC2C(plan1d_y, d_complex_data, d_complex_data, CUFFT_FORWARD));
    CHECK_CUFFT(cufftDestroy(plan1d_y));
    CHECK_CUFFT(cufftExecC2C(plan1d_z, d_complex_data, d_complex_data, CUFFT_FORWARD));
    CHECK_CUFFT(cufftDestroy(plan1d_z));
    CHECK_CUFFT(cufftExecC2C(plan1d_w, d_complex_data, d_complex_data, CUFFT_FORWARD));
    CHECK_CUFFT(cufftDestroy(plan1d_w));

    // Retrieve the results into host memory
    CHECK_CUDA(cudaMemcpy(complex_data, d_complex_data, size, cudaMemcpyDeviceToHost));

    // Record the stop event
    CHECK_CUDA(cudaEventRecord(stop, 0));
    CHECK_CUDA(cudaEventSynchronize(stop));

    // Print output stuff
    if (PRINT_FLAG) {
        printf("Fourier Coefficients...\n");
        printf_cufft_cmplx_array(complex_data, element_size);
    }

    // Compute elapsed time
    CHECK_CUDA(cudaEventElapsedTime(&elapsed_time, start, stop));

    // Clean up
    CHECK_CUDA(cudaFree(d_complex_data));
    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));
    free(complex_data);

    return elapsed_time * 1e-3;
}


int main(int argc, char **argv) {
    if (argc != 6) {
        printf("Error: This program requires exactly 5 command-line arguments.\n");
        printf("       %s <arg0> <arg1> <arg2> <arg3> <arg4>\n", argv[0]);
        printf("       arg0, arg1, arg2, arg3: FFT lengths in 4D\n");
        printf("       arg4: Number of iterations\n");
        printf("       e.g.: %s 64 64 64 64 5\n", argv[0]);
        return -1;
    }

    unsigned int nx = atoi(argv[1]);
    unsigned int ny = atoi(argv[2]);
    unsigned int nz = atoi(argv[3]);
    unsigned int nw = atoi(argv[4]);
    unsigned int niter = atoi(argv[5]);

    float sum = 0.0;
    float span_s = 0.0;
    for (unsigned int i = 0; i < niter; ++i) {
        span_s = run_test_cufft_4d_4x1d(nx, ny, nz, nw);
        if (PRINT_FLAG) printf("[%d]: %.6f s\n", i, span_s);
        sum += span_s;
    }
    printf("%.6f\n", sum/(float)niter);

    CHECK_CUDA(cudaDeviceReset());
    return 0;
}

Try both implementations for 4x4x4x4 array, and you will notice only the first couple of coefficients match. I know that the FFTW implementation produces the correct result, as I can get the same result in different ways such as a 3D FFT followed by a 1D FFT or 2 x 2D FFTs using both FFTW and cuFFT libraries.

Solution

Thanks to @Robert Crovella's answer, I have modified their program to satisfy dimensions of unequal size. It seems their code will be correct only if all the dimension sizes are equal. To correct for this, I am posting my solution below:

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h> 
#include <cufft.h>
#include <math.h>

#define PRINT_FLAG 1
#define NPRINTS 5  // print size

#define CHECK_CUDA(call)                                                       \
{                                                                              \
    const cudaError_t error = call;                                            \
    if (error != cudaSuccess)                                                  \
    {                                                                          \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
        fprintf(stderr, "code: %d, reason: %s\n", error,                       \
                cudaGetErrorString(error));                                    \
        exit(EXIT_FAILURE);                                                    \
    }                                                                          \
}

#define CHECK_CUFFT(call)                                                      \
{                                                                              \
    cufftResult error;                                                         \
    if ( (error = (call)) != CUFFT_SUCCESS)                                    \
    {                                                                          \
        fprintf(stderr, "Got CUFFT error %d at %s:%d\n", error, __FILE__,      \
                __LINE__);                                                     \
        exit(EXIT_FAILURE);                                                    \
    }                                                                          \
}

void printf_cufft_cmplx_array(cufftComplex *complex_array, unsigned int size) {
    for (unsigned int i = 0; i < NPRINTS; ++i) {
        printf("  (%2.4f, %2.4fi)\n", complex_array[i].x, complex_array[i].y);
    }
    printf("...\n");
    for (unsigned int i = size - NPRINTS; i < size; ++i) {
        printf("  (%2.4f, %2.4fi)\n", complex_array[i].x, complex_array[i].y);
    }
}

// Function to execute 1D FFT along a specific dimension
void execute_fft(cufftComplex *d_data, int dim_size, int batch_size) {
    cufftHandle plan;
    int n[1] = { dim_size };
    int embed[1] = { dim_size };
    CHECK_CUFFT(cufftPlanMany(&plan, 1, n, 
                            embed, 1, dim_size, 
                            embed, 1, dim_size, 
                            CUFFT_C2C, batch_size));

    // Perform FFT
    CHECK_CUFFT(cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD));
    CHECK_CUFFT(cufftDestroy(plan));
}


__global__ void do_circular_transpose(cufftComplex *d_out, cufftComplex *d_in, int nx, int ny, int nz, int nw) {
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
    int z = blockDim.z * blockIdx.z + threadIdx.z;

    if (x < nx && y < ny && z < nz) {
        for (int w = 0; w < nw; w++) {
            int in_idx  = ((x * ny + y) * nz + z) * nw + w;
            int out_idx = ((y * nz + z) * nw + w) * nx + x;
            d_out[out_idx] = d_in[in_idx];
        }
    }
}

float run_test_cufft_4d_4x1d(unsigned int nx, unsigned int ny, unsigned int nz, unsigned int nw) {
    srand(2025);

    // Declaration
    cufftComplex *complex_data;
    cufftComplex *d_complex_data;
    cufftComplex *d_complex_data_swap;

    unsigned int element_size = nx * ny * nz * nw;
    size_t size = sizeof(cufftComplex) * element_size;

    cudaEvent_t start, stop;
    float elapsed_time;

    // Allocate memory for the variables on the host
    complex_data = (cufftComplex *)malloc(size);

    // Initialize input complex signal
    for (unsigned int i = 0; i < element_size; ++i) {
        complex_data[i].x = rand() / (float)RAND_MAX;
        complex_data[i].y = 0;
    }

    // Print input stuff
    if (PRINT_FLAG) {
        printf("Complex data...\n");
        printf_cufft_cmplx_array(complex_data, element_size);
    }

    // Create CUDA events
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    // Allocate device memory for complex signal and output frequency
    CHECK_CUDA(cudaMalloc((void **)&d_complex_data, size));
    CHECK_CUDA(cudaMalloc((void **)&d_complex_data_swap, size));

    dim3 threads(8, 8, 8);
    dim3 blocks((nx + threads.x - 1) / threads.x, (ny + threads.y - 1) / threads.y, (nz + threads.z - 1) / threads.z);

    // Record the start event
    CHECK_CUDA(cudaEventRecord(start, 0));

    // Copy host memory to device
    CHECK_CUDA(cudaMemcpy(d_complex_data, complex_data, size, cudaMemcpyHostToDevice));

    // Perform FFT along each dimension sequentially
    // Help from: https://forums.developer.nvidia.com/t/3d-and-4d-indexing-4d-fft/12564/2
    // and https://stackoverflow.com/questions/79574267/what-is-the-correct-way-to-perform-4d-fft-in-cuda-by-implementing-1d-fft-in-each

    // step 1: do 1-D FFT along w with number of element nw and batch=nx ny nz
    execute_fft(d_complex_data, nw, nx * ny * nz);
    // step 2: do tranpose operation A(x,y,z,w) → A(y,z,w,x)
    do_circular_transpose<<<blocks, threads>>>(d_complex_data_swap, d_complex_data, nx, ny, nz, nw);
    // step 3: do 1-D FFT along x with number of element nx and batch=n2n3n4
    execute_fft(d_complex_data_swap, nx, ny * nz * nw);
    // step 4: do tranpose operation A(y,z,w,x) → A(z,w,x,y)
    do_circular_transpose<<<blocks, threads>>>(d_complex_data, d_complex_data_swap, ny, nz, nw, nx);
    // step 5: do 1-D FFT along y with number of element ny and batch=n3n4n1
    execute_fft(d_complex_data, ny, nx * nz * nw);
    // step 6: do tranpose operation A(z,w,x,y) → A(w,x,y,z)
    do_circular_transpose<<<blocks, threads>>>(d_complex_data_swap, d_complex_data, nz, nw, nx, ny);
    // step 7: do 1-D FFT along z with number of element nz and batch=n4n1n2
    execute_fft(d_complex_data_swap, nz, nx * ny * nw);
    // step 8: do tranpose operation A(w,x,y,z) → A(x,y,z,w)
    do_circular_transpose<<<blocks, threads>>>(d_complex_data, d_complex_data_swap, nw, nx, ny, nz);

    // Retrieve the results into host memory
    CHECK_CUDA(cudaMemcpy(complex_data, d_complex_data, size, cudaMemcpyDeviceToHost));

    // Record the stop event
    CHECK_CUDA(cudaEventRecord(stop, 0));
    CHECK_CUDA(cudaEventSynchronize(stop));

    // Print output stuff
    if (PRINT_FLAG) {
        printf("Fourier Coefficients...\n");
        printf_cufft_cmplx_array(complex_data, element_size);
    }

    // Compute elapsed time
    CHECK_CUDA(cudaEventElapsedTime(&elapsed_time, start, stop));

    // Clean up
    CHECK_CUDA(cudaFree(d_complex_data));
    CHECK_CUDA(cudaFree(d_complex_data_swap));
    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));
    free(complex_data);

    return elapsed_time * 1e-3;
}


int main(int argc, char **argv) {
    if (argc != 6) {
        printf("Error: This program requires exactly 5 command-line arguments.\n");
        printf("       %s <arg0> <arg1> <arg2> <arg3> <arg4>\n", argv[0]);
        printf("       arg0, arg1, arg2, arg3: FFT lengths in 4D\n");
        printf("       arg4: Number of iterations\n");
        printf("       e.g.: %s 64 64 64 64 5\n", argv[0]);
        return -1;
    }

    unsigned int nx = atoi(argv[1]);
    unsigned int ny = atoi(argv[2]);
    unsigned int nz = atoi(argv[3]);
    unsigned int nw = atoi(argv[4]);
    unsigned int niter = atoi(argv[5]);

    float sum = 0.0;
    float span_s = 0.0;
    for (unsigned int i = 0; i < niter; ++i) {
        span_s = run_test_cufft_4d_4x1d(nx, ny, nz, nw);
        if (PRINT_FLAG) printf("[%d]: %.6f s\n", i, span_s);
        sum += span_s;
    }
    printf("%.6f\n", sum/(float)niter);

    CHECK_CUDA(cudaDeviceReset());
    return 0;
}

Note that I am using cufftComplex as my primary data type, as I needed single precision floating point calculations, feel free to use cufftDoubleComplex as they suggested earlier.

After building and compilation, the correct output would be:

For a array of 4x4x4x4

$ ./cufft4d 4 4 4 4 1

Complex data...
  (0.2005, 0.0000i)
  (0.4584, 0.0000i)
  (0.8412, 0.0000i)
  (0.6970, 0.0000i)
  (0.3846, 0.0000i)
...
  (0.5214, 0.0000i)
  (0.3179, 0.0000i)
  (0.9771, 0.0000i)
  (0.1417, 0.0000i)
  (0.5867, 0.0000i)
Fourier Coefficients...
  (121.0454, 0.0000i)
  (-1.6709, -1.3923i)
  (-12.7056, 0.0000i)
  (-1.6709, 1.3923i)
  (-1.3997, -3.1249i)
...
  (1.0800, 0.8837i)
  (2.0585, -2.7097i)
  (1.1019, 1.7167i)
  (4.9727, 0.1244i)
  (-1.2561, 0.6645i)
[0]: 0.001198 s
0.001198

For a array of 4x5x6x7

$ ./cufft4d 4 5 6 7 1

Complex data...
  (0.2005, 0.0000i)
  (0.4584, 0.0000i)
  (0.8412, 0.0000i)
  (0.6970, 0.0000i)
  (0.3846, 0.0000i)
...
  (0.3909, 0.0000i)
  (0.0662, 0.0000i)
  (0.6360, 0.0000i)
  (0.1895, 0.0000i)
  (0.7450, 0.0000i)
Fourier Coefficients...
  (426.6703, 0.0000i)
  (9.5928, 6.2723i)
  (-1.2947, -7.8418i)
  (-5.1845, -0.6342i)
  (-5.1845, 0.6342i)
...
  (-2.9402, 0.1377i)
  (5.8364, -3.5697i)
  (4.8288, -3.2658i)
  (-2.5617, -7.8667i)
  (-4.2289, -0.3572i)
[0]: 0.001193 s
0.001193

These results match with FFTW.