c++cuda

CUDA `cudaMemcpyBatchAsync` "invalid argument"


I'm consistently encountering an "invalid argument" error when calling cudaMemcpyBatchAsync for host-to-device transfers.

CUDA error at btest.cu:43 - invalid argument

Line 43 is CUDA_CHECK(cudaMemcpyBatchAsync(...)). Line 43 is CUDA_CHECK(cudaMemcpyBatchAsync(...)). The API signature and description are from the official documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1gc02716b3bd21f3d83640ab102bf089f9

Code:

#include <cstdint>
#include <iostream>
#include <vector>

// CUDA error checking macro
#define CUDA_CHECK(call) \
   {\
   const cudaError_t err = call; \
   if (err != cudaSuccess) { \
     fprintf(stderr, "CUDA error at %s:%d - %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
     exit(EXIT_FAILURE); \
   } \
   } \

int main() {
    // --- Configuration ---
    const int NUM_COPIES = 1000;
    const size_t COPY_SIZE_BYTES = 1024;

    // --- Allocate Host and Device Memory ---
    // We need multiple host and device pointers for individual copies
    std::vector<void*> h_src;
    std::vector<void*> d_dst;
    std::vector<size_t> sizes(NUM_COPIES, COPY_SIZE_BYTES);

    for (int i = 0; i < NUM_COPIES; ++i) {
        void* ptr_h, *ptr_d;
        CUDA_CHECK(cudaMallocHost(&ptr_h, COPY_SIZE_BYTES));
        CUDA_CHECK(cudaMalloc    (&ptr_d, COPY_SIZE_BYTES));
        h_src.push_back(ptr_h);
        d_dst.push_back(ptr_d);
    }

    std::vector<cudaMemcpyAttributes> attrs(1);
    attrs[0].srcLocHint.type = cudaMemLocationTypeHost;
    attrs[0].dstLocHint.type = cudaMemLocationTypeDevice;
    attrs[0].srcAccessOrder = cudaMemcpySrcAccessOrderAny;
    attrs[0].flags = 0;
    std::vector<size_t> attrsIdxs = {0};
    size_t numAttrs = attrs.size();

    size_t fail_idx=0; // Variable to store the index of the failed copy if any
    CUDA_CHECK(cudaMemcpyBatchAsync(
        d_dst.data(),
        h_src.data(),
        sizes.data(),
        NUM_COPIES,
        attrs.data(),
        attrsIdxs.data(),
        numAttrs,
        &fail_idx,
        0        // Default stream
    ));

    if( fail_idx!=SIZE_MAX ) throw std::runtime_error("Failed MemcpyBatchAsync at fail_idx = " + std::to_string(fail_idx) + "\n");

    // --- Cleanup ---
    for (int i = 0; i < NUM_COPIES; ++i) {
        CUDA_CHECK(cudaFreeHost(h_src[i]));
        CUDA_CHECK(cudaFree(d_dst[i]));
    }

    return 0;
}

Environment:

Also tested for RTX3090 and CUDA 12.8 with compilation for sm_86. Same "invalid argument" error.

What else could cause an "invalid argument" error for cudaMemcpyBatchAsync in this scenario? Are there any subtle requirements or unusual environmental factors I might be missing?


Solution

  • The likely problem is the stream argument cannot be 0 (i.e. default stream). You will need to specify a named stream that was created with cudaStreamCreate*()

    You also don't have to specify the location hints because "The cudaMemcpyAttributes::srcLocHint and cudaMemcpyAttributes::dstLocHint allows applications to specify hint locations for operands of a copy when the operand doesn't have a fixed location. That is, these hints are only applicable for managed memory pointers on devices where cudaDevAttrConcurrentManagedAccess is true or system-allocated pageable memory on devices where cudaDevAttrPageableMemoryAccess is true."