What is the correct way to perform 4D FFT in Cuda by implementing 1D FFT in each dimension using cufftPlanMany API

Cuda does not have any direct implementation of 4D FFT. Hence I want to decompose a 4D FFT into 4 x 1D FFTs into X, Y, Z, and W dimensions. I understand that the cufftPlanMany API is best suited for this as it eliminates the use of any for loops and hence it is much faster.

I have written a program for exactly that. However, the final result of the 4D FFT does not match the 4D FFTW implementation.

Below are the two implementations using FFTW and Cuda libraries, respectively. I have carefully chosen the batch, stride, and dist for the cufftPlanMany function. However, I don't understand what I am doing wrong. Any help will be appreciated.

FFTW 4D implementation

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <fftw3.h>

#define PRINT_FLAG 1
#define NPRINTS 5  // print size

void printf_fftw_cmplx_array(fftw_complex *complex_array, unsigned int size) {
    for (unsigned int i = 0; i < NPRINTS; ++i) {
        printf("  (%2.4f, %2.4fi)\n", complex_array[i][0], complex_array[i][1]);
    }
    printf("...\n");
    for (unsigned int i = size - NPRINTS; i < size; ++i) {
        printf("  (%2.4f, %2.4fi)\n", complex_array[i][0], complex_array[i][1]);
    }
}

float run_test_fftw_4d(unsigned int nx, unsigned int ny, unsigned int nz, unsigned int nw) {
    srand(2025);

    // Declaration
    fftw_complex *complex_data;
    fftw_plan plan;

    unsigned int element_size = nx * ny * nz * nw;
    size_t size = sizeof(fftw_complex) * element_size;

    clock_t start, stop;
    float elapsed_time;

    // Allocate memory for input and output arrays
    complex_data = (fftw_complex *)fftw_malloc(size);

    // Initialize input complex signal
    for (unsigned int i = 0; i < element_size; ++i) {
        complex_data[i][0] = rand() / (float)RAND_MAX;
        complex_data[i][1] = 0;
    }

    // Print input stuff
    if (PRINT_FLAG) {
        printf("Complex data...\n");
        printf_fftw_cmplx_array(complex_data, element_size);
    }

    // Setup the FFT plan
    plan = fftw_plan_dft(4, (int[]){nx, ny, nz, nw}, complex_data, complex_data, FFTW_FORWARD, FFTW_ESTIMATE);

    // Start time
    start = clock();
    
    // Execute the FFT
    fftw_execute(plan);

    // End time
    stop = clock();

    // Print output stuff
    if (PRINT_FLAG) {
        printf("Fourier Coefficients...\n");
        printf_fftw_cmplx_array(complex_data, element_size);
    }

    // Compute elapsed time
    elapsed_time = (double)(stop - start) / CLOCKS_PER_SEC;

    // Clean up
    fftw_destroy_plan(plan);
    fftw_free(complex_data);
    fftw_cleanup();

    return elapsed_time;
}


int main(int argc, char **argv) {
    if (argc != 6) {
        printf("Error: This program requires exactly 5 command-line arguments.\n");
        printf("       %s <arg0> <arg1> <arg2> <arg3> <arg4>\n", argv[0]);
        printf("       arg0, arg1, arg2, arg3: FFT lengths in 4D\n");
        printf("       arg4: Number of iterations\n");
        printf("       e.g.: %s 64 64 64 64 5\n", argv[0]);
        return -1;
    }

    unsigned int nx = atoi(argv[1]);
    unsigned int ny = atoi(argv[2]);
    unsigned int nz = atoi(argv[3]);
    unsigned int nw = atoi(argv[4]);
    unsigned int niter = atoi(argv[5]);

    float sum = 0.0;
    float span_s = 0.0;
    for (unsigned int i = 0; i < niter; ++i) {
        span_s = run_test_fftw_4d(nx, ny, nz, nw);
        if (PRINT_FLAG) printf("[%d]: %.6f s\n", i, span_s);
        sum += span_s;
    }
    printf("%.6f\n", sum/(float)niter);

    return 0;
}

Erroneous cuFFT4D implementation

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h> 
#include <cufft.h>
#include <math.h>

#define PRINT_FLAG 1
#define NPRINTS 5  // print size

#define CHECK_CUDA(call)                                                       \
{                                                                              \
    const cudaError_t error = call;                                            \
    if (error != cudaSuccess)                                                  \
    {                                                                          \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
        fprintf(stderr, "code: %d, reason: %s\n", error,                       \
                cudaGetErrorString(error));                                    \
        exit(EXIT_FAILURE);                                                    \
    }                                                                          \
}

#define CHECK_CUFFT(call)                                                      \
{                                                                              \
    cufftResult error;                                                         \
    if ( (error = (call)) != CUFFT_SUCCESS)                                      \
    {                                                                          \
        fprintf(stderr, "Got CUFFT error %d at %s:%d\n", error, __FILE__,      \
                __LINE__);                                                     \
        exit(EXIT_FAILURE);                                                    \
    }                                                                          \
}

void printf_cufft_cmplx_array(cufftComplex *complex_array, unsigned int size) {
    for (unsigned int i = 0; i < NPRINTS; ++i) {
        printf("  (%2.4f, %2.4fi)\n", complex_array[i].x, complex_array[i].y);
    }
    printf("...\n");
    for (unsigned int i = size - NPRINTS; i < size; ++i) {
        printf("  (%2.4f, %2.4fi)\n", complex_array[i].x, complex_array[i].y);
    }
}

float run_test_cufft_4d_4x1d(unsigned int nx, unsigned int ny, unsigned int nz, unsigned int nw) {
    srand(2025);

    // Declaration
    cufftComplex *complex_data;
    cufftComplex *d_complex_data;
    cufftHandle plan1d_x, plan1d_y, plan1d_z, plan1d_w;

    unsigned int element_size = nx * ny * nz * nw;
    size_t size = sizeof(cufftComplex) * element_size;

    cudaEvent_t start, stop;
    float elapsed_time;

    // Allocate memory for the variables on the host
    complex_data = (cufftComplex *)malloc(size);

    // Initialize input complex signal
    for (unsigned int i = 0; i < element_size; ++i) {
        complex_data[i].x = rand() / (float)RAND_MAX;
        complex_data[i].y = 0;
    }

    // Print input stuff
    if (PRINT_FLAG) {
        printf("Complex data...\n");
        printf_cufft_cmplx_array(complex_data, element_size);
    }

    // Create CUDA events
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    // Allocate device memory for complex signal and output frequency
    CHECK_CUDA(cudaMalloc((void **)&d_complex_data, size));

    int n[1] = { (int)nx };
    int embed[1] = { (int)nx };
    CHECK_CUFFT(cufftPlanMany(&plan1d_x, 1, n,       // 1D FFT of size nx
                            embed, ny * nz * nw, 1, // inembed, istride, idist
                            embed, ny * nz * nw, 1, // onembed, ostride, odist
                            CUFFT_C2C, ny * nz * nw));
    n[0] = (int)ny;
    embed[0] = (int)ny;
    CHECK_CUFFT(cufftPlanMany(&plan1d_y, 1, n,       // 1D FFT of size ny
                            embed, nz * nw, 1, // inembed, istride, idist
                            embed, nz * nw, 1, // onembed, ostride, odist
                            CUFFT_C2C, nx * nz * nw));
    n[0] = (int)nz;
    embed[0] = (int)nz;
    CHECK_CUFFT(cufftPlanMany(&plan1d_z, 1, n,       // 1D FFT of size nz
                            embed, nw, 1, // inembed, istride, idist
                            embed, nw, 1, // onembed, ostride, odist
                            CUFFT_C2C, nx * ny * nw));
    n[0] = (int)nw;
    embed[0] = (int)nw;
    CHECK_CUFFT(cufftPlanMany(&plan1d_w, 1, n,       // 1D FFT of size nw
                            embed, 1, nw, // inembed, istride, idist
                            embed, 1, nw, // onembed, ostride, odist
                            CUFFT_C2C, nx * ny * nz));

    // Record the start event
    CHECK_CUDA(cudaEventRecord(start, 0));

    // Copy host memory to device
    CHECK_CUDA(cudaMemcpy(d_complex_data, complex_data, size, cudaMemcpyHostToDevice));

    // Perform FFT along each dimension sequentially
    CHECK_CUFFT(cufftExecC2C(plan1d_x, d_complex_data, d_complex_data, CUFFT_FORWARD));
    CHECK_CUFFT(cufftDestroy(plan1d_x));
    CHECK_CUFFT(cufftExecC2C(plan1d_y, d_complex_data, d_complex_data, CUFFT_FORWARD));
    CHECK_CUFFT(cufftDestroy(plan1d_y));
    CHECK_CUFFT(cufftExecC2C(plan1d_z, d_complex_data, d_complex_data, CUFFT_FORWARD));
    CHECK_CUFFT(cufftDestroy(plan1d_z));
    CHECK_CUFFT(cufftExecC2C(plan1d_w, d_complex_data, d_complex_data, CUFFT_FORWARD));
    CHECK_CUFFT(cufftDestroy(plan1d_w));

    // Retrieve the results into host memory
    CHECK_CUDA(cudaMemcpy(complex_data, d_complex_data, size, cudaMemcpyDeviceToHost));

    // Record the stop event
    CHECK_CUDA(cudaEventRecord(stop, 0));
    CHECK_CUDA(cudaEventSynchronize(stop));

    // Print output stuff
    if (PRINT_FLAG) {
        printf("Fourier Coefficients...\n");
        printf_cufft_cmplx_array(complex_data, element_size);
    }

    // Compute elapsed time
    CHECK_CUDA(cudaEventElapsedTime(&elapsed_time, start, stop));

    // Clean up
    CHECK_CUDA(cudaFree(d_complex_data));
    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));
    free(complex_data);

    return elapsed_time * 1e-3;
}


int main(int argc, char **argv) {
    if (argc != 6) {
        printf("Error: This program requires exactly 5 command-line arguments.\n");
        printf("       %s <arg0> <arg1> <arg2> <arg3> <arg4>\n", argv[0]);
        printf("       arg0, arg1, arg2, arg3: FFT lengths in 4D\n");
        printf("       arg4: Number of iterations\n");
        printf("       e.g.: %s 64 64 64 64 5\n", argv[0]);
        return -1;
    }

    unsigned int nx = atoi(argv[1]);
    unsigned int ny = atoi(argv[2]);
    unsigned int nz = atoi(argv[3]);
    unsigned int nw = atoi(argv[4]);
    unsigned int niter = atoi(argv[5]);

    float sum = 0.0;
    float span_s = 0.0;
    for (unsigned int i = 0; i < niter; ++i) {
        span_s = run_test_cufft_4d_4x1d(nx, ny, nz, nw);
        if (PRINT_FLAG) printf("[%d]: %.6f s\n", i, span_s);
        sum += span_s;
    }
    printf("%.6f\n", sum/(float)niter);

    CHECK_CUDA(cudaDeviceReset());
    return 0;
}

Try both implementations for 4x4x4x4 array, and you will notice only the first couple of coefficients match. I know that the FFTW implementation produces the correct result, as I can get the same result in different ways such as a 3D FFT followed by a 1D FFT or 2 x 2D FFTs using both FFTW and cuFFT libraries.

Solution

Summary: A straightforward usage of a sequence of 1D transforms will not work with CUFFT Advanced Data Layout. A set of 3D transforms followed by a set of 1D transforms is workable. It also seems that one can use a sequence of 1D transforms if transpositions are done during the sequence.

Longer: Although a 2D transform can be broken into two 1D transforms using CUFFT "Advanced Data Layout" (via cufftPlanMany), a 3D (or higher) transform cannot be decomposed that way, because (for example) the stride between successive y-dimension transforms is variable (in the 3D case. It is not variable in the 2D case). Considering the 4x4x4 case, the distance from the start of the first transform in the batch to start of the next for the first 4 transforms is one element. But after the first 4, the distance to the start of the next transform is greater than 1 element. This can't be accounted for using advanced data layout on 1D (batched) transforms. (A similar claim is made here.)

OP already indicated so, but I'll repeat here and demonstrate later, that this can be done with a batched set of 3D transforms followed by a batched set of 1D transforms, without additional activity/effort/work. However the question asks for how it could be done using 1D transforms. Based on the suggestion here to use transposing, it seems to be possible with that extra work.

What follows is a demonstration that it seems to work. I have not thought carefully about how to do 4D transposing, and so the kernel is a quick effort that is certainly not optimized and may not work at all in non-cubic cases (i.e. I have only tested it in the case where all 4 dimensions are the same. Even then, testing is quite light.) I'll also show how it can be done with 3D followed by 1D transforms:

# cat t369.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <math.h>

#define PRINT_FLAG 1
#define NPRINTS 5  // print size

#define CHECK_CUDA(call)                                                       \
{                                                                              \
    const cudaError_t error = call;                                            \
    if (error != cudaSuccess)                                                  \
    {                                                                          \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
        fprintf(stderr, "code: %d, reason: %s\n", error,                       \
                cudaGetErrorString(error));                                    \
        exit(EXIT_FAILURE);                                                    \
    }                                                                          \
}

#define CHECK_CUFFT(call)                                                      \
{                                                                              \
    cufftResult error;                                                         \
    if ( (error = (call)) != CUFFT_SUCCESS)                                      \
    {                                                                          \
        fprintf(stderr, "Got CUFFT error %d at %s:%d\n", error, __FILE__,      \
                __LINE__);                                                     \
        exit(EXIT_FAILURE);                                                    \
    }                                                                          \
}
using mt = cufftDoubleComplex;
#define TT CUFFT_Z2Z
#define GG cufftExecZ2Z

#define IDX(x,y,z,w,lx,ly,lz) ((w*lx*ly*lz)+(z*lx*ly)+(y*lx)+x)

template <typename T>
__global__ void k4dt(T *i, T *o, int lx, int ly, int lz, int lw){

  int idx = blockDim.x*blockIdx.x+threadIdx.x;
  int idy = blockDim.y*blockIdx.y+threadIdx.y;
  int idz = blockDim.z*blockIdx.z+threadIdx.z;
  if ((idx < lx)&&(idy < ly)&&(idz < lz))
    for (int idw = 0; idw < lw; idw++)
      o[IDX(idy,idz,idw,idx,ly,lz,lw)] = i[IDX(idx,idy,idz,idw,lx,ly,lz)];
}


void printf_cufft_cmplx_array(mt *complex_array, unsigned int size) {
    for (unsigned int i = 0; i < NPRINTS; ++i) {
        printf("  (%2.4f, %2.4fi)\n", complex_array[i].x, complex_array[i].y);
    }
    printf("...\n");
    for (unsigned int i = size - NPRINTS; i < size; ++i) {
        printf("  (%2.4f, %2.4fi)\n", complex_array[i].x, complex_array[i].y);
    }
}

float run_test_cufft_4d_4x1d(int nx, int ny, int nz, int nw) {
    srand(2025);

    // Declaration
    mt *complex_data;
    mt *d_complex_data;
    mt *d_transpose_data;

    unsigned int element_size = nx * ny * nz * nw;
    size_t size = sizeof(mt) * element_size;

    cudaEvent_t start, stop;
    float elapsed_time;

    // Allocate memory for the variables on the host
    complex_data = (mt *)malloc(size);

    // Initialize input complex signal
    for (unsigned int i = 0; i < element_size; ++i) {
        complex_data[i].x = rand() / (float)RAND_MAX;
        complex_data[i].y = 0;
    }

    // Print input stuff
    if (PRINT_FLAG) {
        printf("Complex data...\n");
        printf_cufft_cmplx_array(complex_data, element_size);
    }

    // Create CUDA events
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    // Allocate device memory for complex signal and output frequency
    CHECK_CUDA(cudaMalloc((void **)&d_complex_data, size));
    CHECK_CUDA(cudaMalloc((void **)&d_transpose_data, size));
    int n[3];
    int embed[1];
#ifdef USE_1D
    cufftHandle plan1d_x;
    n[0] = { (int)nx };
    embed[0] = { (int)nx };
    CHECK_CUFFT(cufftPlanMany(&plan1d_x, 1, n,       // 1D FFT of size nx
                            embed, 1, nx, // inembed, istride, idist
                            embed, 1, nx, // onembed, ostride, odist
                            TT, ny * nz * nw));
#else
    cufftHandle plan3d;
    n[0] = nx;
    n[1] = ny;
    n[2] = nz;
    CHECK_CUFFT(cufftPlanMany(&plan3d, 3, n,       // 3D FFT of size nx,ny,nz
                            nullptr, 1, 0, // inembed, istride, idist
                            nullptr, 1, 0, // onembed, ostride, odist
                            TT, nw));

    cufftHandle plan1d_w;
    n[0] = (int)nw;
    embed[0] = (int)nw;
    CHECK_CUFFT(cufftPlanMany(&plan1d_w, 1, n,       // 1D FFT of size nw
                            embed, nx*ny*nz, 1, // inembed, istride, idist
                            embed, nx*ny*nz, 1, // onembed, ostride, odist
                            TT, nx * ny * nz));
#endif


    // Record the start event
    CHECK_CUDA(cudaEventRecord(start, 0));

    // Copy host memory to device
    CHECK_CUDA(cudaMemcpy(d_complex_data, complex_data, size, cudaMemcpyHostToDevice));

    // Perform FFT along each dimension sequentially
#ifdef USE_1D
    printf("using 1D transforms\n");
    dim3 b(8,8,8);
    dim3 g((nx+b.x-1)/b.x, (ny+b.y-1)/b.y, (nz+b.z-1)/b.z);
    CHECK_CUFFT(GG(plan1d_x, d_complex_data, d_complex_data, CUFFT_FORWARD));
    k4dt<<<g,b>>>(d_complex_data, d_transpose_data, nx, ny, nz, nw);
    CHECK_CUFFT(GG(plan1d_x, d_transpose_data, d_transpose_data, CUFFT_FORWARD));
    k4dt<<<g,b>>>(d_transpose_data, d_complex_data, nx, ny, nz, nw);
    CHECK_CUFFT(GG(plan1d_x, d_complex_data, d_complex_data, CUFFT_FORWARD));
    k4dt<<<g,b>>>(d_complex_data, d_transpose_data, nx, ny, nz, nw);
    CHECK_CUFFT(GG(plan1d_x, d_transpose_data, d_transpose_data, CUFFT_FORWARD));
    k4dt<<<g,b>>>(d_transpose_data, d_complex_data, nx, ny, nz, nw);
    CHECK_CUFFT(cufftDestroy(plan1d_x));
#else
    printf("using 3D and 1D transforms\n");
    CHECK_CUFFT(GG(plan3d, d_complex_data, d_complex_data, CUFFT_FORWARD));
    CHECK_CUFFT(cufftDestroy(plan3d));
    CHECK_CUFFT(GG(plan1d_w, d_complex_data, d_complex_data, CUFFT_FORWARD));
    CHECK_CUFFT(cufftDestroy(plan1d_w));
#endif

    // Retrieve the results into host memory
    CHECK_CUDA(cudaMemcpy(complex_data, d_complex_data, size, cudaMemcpyDeviceToHost));

    // Record the stop event
    CHECK_CUDA(cudaEventRecord(stop, 0));
    CHECK_CUDA(cudaEventSynchronize(stop));

    // Print output stuff
    if (PRINT_FLAG) {
        printf("Fourier Coefficients...\n");
        printf_cufft_cmplx_array(complex_data, element_size);
    }

    // Compute elapsed time
    CHECK_CUDA(cudaEventElapsedTime(&elapsed_time, start, stop));

    // Clean up
    CHECK_CUDA(cudaFree(d_complex_data));
    CHECK_CUDA(cudaFree(d_transpose_data));
    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));
    free(complex_data);

    return elapsed_time * 1e-3;
}


int main(int argc, char **argv) {
    if (argc != 6) {
        printf("Error: This program requires exactly 5 command-line arguments.\n");
        printf("       %s <arg0> <arg1> <arg2> <arg3> <arg4>\n", argv[0]);
        printf("       arg0, arg1, arg2, arg3: FFT lengths in 4D\n");
        printf("       arg4: Number of iterations\n");
        printf("       e.g.: %s 64 64 64 64 5\n", argv[0]);
        return -1;
    }

    unsigned int nx = atoi(argv[1]);
    unsigned int ny = atoi(argv[2]);
    unsigned int nz = atoi(argv[3]);
    unsigned int nw = atoi(argv[4]);
    unsigned int niter = atoi(argv[5]);

    float sum = 0.0;
    float span_s = 0.0;
    for (unsigned int i = 0; i < niter; ++i) {
        span_s = run_test_cufft_4d_4x1d(nx, ny, nz, nw);
        if (PRINT_FLAG) printf("[%d]: %.6f s\n", i, span_s);
        sum += span_s;
    }
    printf("%.6f\n", sum/(float)niter);

    CHECK_CUDA(cudaDeviceReset());
    return 0;
}
# nvcc -o t369_cufft t369.cu -lcufft
# compute-sanitizer ./t369_cufft 4 4 4 4 1
========= COMPUTE-SANITIZER
Complex data...
  (0.2005, 0.0000i)
  (0.4584, 0.0000i)
  (0.8412, 0.0000i)
  (0.6970, 0.0000i)
  (0.3846, 0.0000i)
...
  (0.5214, 0.0000i)
  (0.3179, 0.0000i)
  (0.9771, 0.0000i)
  (0.1417, 0.0000i)
  (0.5867, 0.0000i)
using 3D and 1D transforms
Fourier Coefficients...
  (121.0454, 0.0000i)
  (-1.6709, -1.3923i)
  (-12.7056, 0.0000i)
  (-1.6709, 1.3923i)
  (-1.3997, -3.1249i)
...
  (1.0800, 0.8837i)
  (2.0585, -2.7097i)
  (1.1019, 1.7167i)
  (4.9727, 0.1244i)
  (-1.2561, 0.6645i)
[0]: 0.283668 s
0.283668
========= ERROR SUMMARY: 0 errors
# nvcc -o t369_cufft t369.cu -lcufft -DUSE_1D
# compute-sanitizer ./t369_cufft 4 4 4 4 1
========= COMPUTE-SANITIZER
Complex data...
  (0.2005, 0.0000i)
  (0.4584, 0.0000i)
  (0.8412, 0.0000i)
  (0.6970, 0.0000i)
  (0.3846, 0.0000i)
...
  (0.5214, 0.0000i)
  (0.3179, 0.0000i)
  (0.9771, 0.0000i)
  (0.1417, 0.0000i)
  (0.5867, 0.0000i)
using 1D transforms
Fourier Coefficients...
  (121.0454, 0.0000i)
  (-1.6709, -1.3923i)
  (-12.7056, 0.0000i)
  (-1.6709, 1.3923i)
  (-1.3997, -3.1249i)
...
  (1.0800, 0.8837i)
  (2.0585, -2.7097i)
  (1.1019, 1.7167i)
  (4.9727, 0.1244i)
  (-1.2561, 0.6645i)
[0]: 0.285792 s
0.285792
========= ERROR SUMMARY: 0 errors
#

When I run OP's FFTW code (which I have not duplicated here) for the 4 4 4 4 1 case (a 4x4x4x4 array), I get output like this:

# ./t369_fftw 4 4 4 4 1
Complex data...
  (0.2005, 0.0000i)
  (0.4584, 0.0000i)
  (0.8412, 0.0000i)
  (0.6970, 0.0000i)
  (0.3846, 0.0000i)
...
  (0.5214, 0.0000i)
  (0.3179, 0.0000i)
  (0.9771, 0.0000i)
  (0.1417, 0.0000i)
  (0.5867, 0.0000i)
Fourier Coefficients...
  (121.0454, 0.0000i)
  (-1.6709, -1.3923i)
  (-12.7056, 0.0000i)
  (-1.6709, 1.3923i)
  (-1.3997, -3.1249i)
...
  (1.0800, 0.8837i)
  (2.0585, -2.7097i)
  (1.1019, 1.7167i)
  (4.9727, 0.1244i)
  (-1.2561, 0.6645i)
[0]: 0.000018 s
0.000018
#

So things seem to match as far as that output depicts.

Notes:

In the 1D case, we need only use the transforms in the x-dimension. The transposing brings the next necessary dimension into the x dimension for the next transform.
It is generally important when comparing FFTW to CUFFT to make sure both are using the same fundamental types. OP's code doesn't reflect this; I have modified my demonstrator to use cufftDoubleComplex which matches the precision (double) of fftw_complex, at least on my system.
CUDA 12.2 Lightly tested. In particular, the transpose operation is not carefully tested nor is it optimized in any way.