c++ cuda cufft

Multi-GPU batched 1D FFTs: only a single GPU seems to work


I have three Tesla V100s on RHEL 8 with CUDA toolkit version 10.2.89.

I'm attempting to compute a batch of 1D FFTs of the columns of a row-major matrix. In the example below, the matrix is 16x8, so with three GPUs I'd expect GPU 0 to perform the FFTs of the first 3 columns, GPU 1 to perform FFTs of the next 3, and GPU 2 to perform FFTs of the final 2.

The plan created in the example works as expected on a single GPU, but when running on three only the first three columns are computed (correctly), the remainder are untouched.
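
(For comparison, here is a minimal sketch of the single-GPU variant: the same strided plan parameters, but without cufftXtSetGPUs, executed with cufftExecC2C on an ordinary device buffer. It is a fragment that reuses n, nr, nc, h_x, and the check macros from the full listing below, and needs <cuda_runtime.h> for cudaMalloc/cudaMemcpy; it is not the exact code that was run.)

// Sketch of the single-GPU baseline with the same strided layout.
cufftHandle singlePlan;
size_t singleWorkSize = 0;
CUFFT_CHECK(cufftCreate(&singlePlan));
CUFFT_CHECK(cufftMakePlanMany(singlePlan, 1, n,
                              n, nc, 1, // inembed, istride, idist
                              n, nc, 1, // onembed, ostride, odist
                              CUFFT_C2C, nc, &singleWorkSize));

cuFloatComplex *d_single;
CUDA_CHECK(cudaMalloc(&d_single, sizeof(cuFloatComplex) * nr * nc));
CUDA_CHECK(cudaMemcpy(d_single, h_x.data(), sizeof(cuFloatComplex) * nr * nc,
                      cudaMemcpyHostToDevice));

// In-place batched transform of all nc columns on the current device.
CUFFT_CHECK(cufftExecC2C(singlePlan, d_single, d_single, CUFFT_FORWARD));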

When I inspect the descriptor filled by cufftXtMalloc, I see that it has allocated space for 123 elements on GPUs 0 and 1 and 122 elements on GPU 2. This seems odd: I would expect 48 = 16*3 on GPUs 0 and 1 and 32 = 16*2 on GPU 2. Indeed, these are the sizes of the workspaces filled by cufftMakePlanMany. When I inspect the data that was copied, elements 0 through 122 are in the buffer on GPU 0, and elements 123 through 127 are at the beginning of the buffer on GPU 1. The remainder of that buffer and the buffer on GPU 2 contain junk.
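
(For reference, the per-GPU sizes and buffer contents can be inspected with something along these lines, placed right after the cufftXtMemcpy call in the listing below. The cudaXtDesc fields nGPUs, GPUs, size, and data come from the headers pulled in by cufftXt.h; the snippet also needs <cstdio>.)

// Sketch: inspect the per-GPU allocations made by cufftXtMalloc.
cudaXtDesc *desc = d_x->descriptor;
for (int g = 0; g < desc->nGPUs; ++g) {
    printf("GPU %d: %zu elements\n", desc->GPUs[g],
           desc->size[g] / sizeof(cuFloatComplex));

    // Copy each raw device buffer back to the host to examine its contents.
    std::vector<cuFloatComplex> chunk(desc->size[g] / sizeof(cuFloatComplex));
    CUDA_CHECK(cudaSetDevice(desc->GPUs[g]));
    CUDA_CHECK(cudaMemcpy(chunk.data(), desc->data[g], desc->size[g],
                          cudaMemcpyDeviceToHost));
}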

In addition, when I increase the number of rows to 1024, I get a SIGABRT on the cufftXtFree call with the message 'free(): corrupted unsorted chunks'.

#include "cufft.h"
#include "cufftXt.h"
#include <vector>
#include <cuComplex.h>
#include <cassert>

// Error checks via assert(); note these compile away if NDEBUG is defined.
#define CUDA_CHECK(x) assert((x) == cudaSuccess)
#define CUFFT_CHECK(x) assert((x) == CUFFT_SUCCESS)

int main() {
    static const int numGPUs = 3;
    int gpus[numGPUs] = {0, 1, 2};

    int nr = 16;
    int nc = 8;

    // Fill real parts with ascending test values (imaginary parts are zero-initialized)
    std::vector<cuFloatComplex> h_x(nr * nc);
    for (int i = 0; i < nr * nc; ++i) {
        h_x[i].x = static_cast<float>(i);
    }

    cufftHandle plan;
    CUFFT_CHECK(cufftCreate(&plan));
    CUFFT_CHECK(cufftXtSetGPUs(plan, numGPUs, gpus));

    std::vector<size_t> workSizes(numGPUs);
    int n[] = {nr};

    CUFFT_CHECK(cufftMakePlanMany(plan,
                                  1, // rank
                                  n, // n
                                  n, // inembed
                                  nc, // istride
                                  1, // idist
                                  n, // onembed
                                  nc, // ostride
                                  1, // odist
                                  CUFFT_C2C,
                                  nc,
                                  workSizes.data()));

    cudaLibXtDesc *d_x;
    CUFFT_CHECK(cufftXtMalloc(plan, &d_x, CUFFT_XT_FORMAT_INPLACE));

    CUFFT_CHECK(cufftXtMemcpy(plan, d_x, (void *)h_x.data(), CUFFT_COPY_HOST_TO_DEVICE));

    CUFFT_CHECK(cufftXtExecDescriptorC2C(plan, d_x, d_x, CUFFT_FORWARD));

    std::vector<cuFloatComplex> h_out(nr * nc);
    CUFFT_CHECK(cufftXtMemcpy(plan, (void *)h_out.data(), d_x, CUFFT_COPY_DEVICE_TO_HOST));

    CUFFT_CHECK(cufftXtFree(d_x));
    CUFFT_CHECK(cufftDestroy(plan));

    CUDA_CHECK(cudaDeviceReset());

    return 0;
}

Solution

  • Thanks to @RobertCrovella for the answer:

    As of CUDA 10.2.89, the documentation states that strided input and output are not supported for multi-GPU transforms.
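
A possible workaround, then, is to avoid the strided layout entirely: store the matrix column-major on the host so that each column (each FFT) occupies a contiguous block of nr elements, and build the plan with the basic, non-strided layout (NULL embed pointers, stride 1, distance nr). The listing below is a minimal sketch of that idea, assuming the multi-GPU path accepts contiguous batched C2C layouts with three GPUs; it is untested guidance, not code from the accepted answer.

#include "cufft.h"
#include "cufftXt.h"
#include <cuda_runtime.h>
#include <cuComplex.h>
#include <cassert>
#include <vector>

#define CUDA_CHECK(x) assert((x) == cudaSuccess)
#define CUFFT_CHECK(x) assert((x) == CUFFT_SUCCESS)

int main() {
    static const int numGPUs = 3;
    int gpus[numGPUs] = {0, 1, 2};

    int nr = 16; // FFT length (rows of the original matrix)
    int nc = 8;  // batch count (columns of the original matrix)

    // Column-major storage: column j occupies elements [j*nr, (j+1)*nr),
    // so each transform reads and writes a contiguous block (no strides).
    std::vector<cuFloatComplex> h_x(nr * nc);
    for (int j = 0; j < nc; ++j)
        for (int i = 0; i < nr; ++i)
            h_x[j * nr + i].x = static_cast<float>(i * nc + j);

    cufftHandle plan;
    CUFFT_CHECK(cufftCreate(&plan));
    CUFFT_CHECK(cufftXtSetGPUs(plan, numGPUs, gpus));

    std::vector<size_t> workSizes(numGPUs);
    int n[] = {nr};

    // Basic layout: NULL embed pointers, stride 1, distance nr between batches.
    CUFFT_CHECK(cufftMakePlanMany(plan, 1, n,
                                  nullptr, 1, nr, // input: contiguous
                                  nullptr, 1, nr, // output: contiguous
                                  CUFFT_C2C, nc, workSizes.data()));

    cudaLibXtDesc *d_x;
    CUFFT_CHECK(cufftXtMalloc(plan, &d_x, CUFFT_XT_FORMAT_INPLACE));
    CUFFT_CHECK(cufftXtMemcpy(plan, d_x, (void *)h_x.data(), CUFFT_COPY_HOST_TO_DEVICE));
    CUFFT_CHECK(cufftXtExecDescriptorC2C(plan, d_x, d_x, CUFFT_FORWARD));

    std::vector<cuFloatComplex> h_out(nr * nc);
    CUFFT_CHECK(cufftXtMemcpy(plan, (void *)h_out.data(), d_x, CUFFT_COPY_DEVICE_TO_HOST));

    CUFFT_CHECK(cufftXtFree(d_x));
    CUFFT_CHECK(cufftDestroy(plan));
    return 0;
}

Column j of the original row-major matrix then comes back in h_out[j*nr] through h_out[j*nr + nr - 1]; transpose on the host afterwards if a row-major result is needed.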