I'm consistently encountering an "invalid argument" error when calling cudaMemcpyBatchAsync
for host-to-device transfers.
CUDA error at btest.cu:43 - invalid argument
Line 43 is CUDA_CHECK(cudaMemcpyBatchAsync(...))
.
Line 43 is CUDA_CHECK(cudaMemcpyBatchAsync(...))
. The API signature and description are from the official documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1gc02716b3bd21f3d83640ab102bf089f9
Code:
#include <cstdint>
#include <iostream>
#include <vector>
// CUDA error checking macro
#define CUDA_CHECK(call) \
{\
const cudaError_t err = call; \
if (err != cudaSuccess) { \
fprintf(stderr, "CUDA error at %s:%d - %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
exit(EXIT_FAILURE); \
} \
} \
int main() {
// --- Configuration ---
const int NUM_COPIES = 1000;
const size_t COPY_SIZE_BYTES = 1024;
// --- Allocate Host and Device Memory ---
// We need multiple host and device pointers for individual copies
std::vector<void*> h_src;
std::vector<void*> d_dst;
std::vector<size_t> sizes(NUM_COPIES, COPY_SIZE_BYTES);
for (int i = 0; i < NUM_COPIES; ++i) {
void* ptr_h, *ptr_d;
CUDA_CHECK(cudaMallocHost(&ptr_h, COPY_SIZE_BYTES));
CUDA_CHECK(cudaMalloc (&ptr_d, COPY_SIZE_BYTES));
h_src.push_back(ptr_h);
d_dst.push_back(ptr_d);
}
std::vector<cudaMemcpyAttributes> attrs(1);
attrs[0].srcLocHint.type = cudaMemLocationTypeHost;
attrs[0].dstLocHint.type = cudaMemLocationTypeDevice;
attrs[0].srcAccessOrder = cudaMemcpySrcAccessOrderAny;
attrs[0].flags = 0;
std::vector<size_t> attrsIdxs = {0};
size_t numAttrs = attrs.size();
size_t fail_idx=0; // Variable to store the index of the failed copy if any
CUDA_CHECK(cudaMemcpyBatchAsync(
d_dst.data(),
h_src.data(),
sizes.data(),
NUM_COPIES,
attrs.data(),
attrsIdxs.data(),
numAttrs,
&fail_idx,
0 // Default stream
));
if( fail_idx!=SIZE_MAX ) throw std::runtime_error("Failed MemcpyBatchAsync at fail_idx = " + std::to_string(fail_idx) + "\n");
// --- Cleanup ---
for (int i = 0; i < NUM_COPIES; ++i) {
CUDA_CHECK(cudaFreeHost(h_src[i]));
CUDA_CHECK(cudaFree(d_dst[i]));
}
return 0;
}
Environment:
nvidia-smi
): 12.9Also tested for RTX3090 and CUDA 12.8 with compilation for sm_86. Same "invalid argument" error.
What else could cause an "invalid argument" error for cudaMemcpyBatchAsync in this scenario? Are there any subtle requirements or unusual environmental factors I might be missing?
The likely problem is the stream argument cannot be 0 (i.e. default stream). You will need to specify a named stream that was created with cudaStreamCreate*()
You also don't have to specify the location hints because "The cudaMemcpyAttributes::srcLocHint and cudaMemcpyAttributes::dstLocHint allows applications to specify hint locations for operands of a copy when the operand doesn't have a fixed location. That is, these hints are only applicable for managed memory pointers on devices where cudaDevAttrConcurrentManagedAccess is true or system-allocated pageable memory on devices where cudaDevAttrPageableMemoryAccess is true."