I have tried to change the current device in CUDA graphs by creating this host node:
cudaGraph_t graph;
// Node #1: Create the 1st setDevice
cudaHostNodeParams hostNodeParams = {0};
memset(&hostNodeParams, 0, sizeof(hostNodeParams));
hostNodeParams.fn = [](void *data) {
int passed_device_ordinal = *(int *)(data);
cout << "CUDA-Graph: in the host node: changing the device to: "
<< passed_device_ordinal << endl;
CUDA_CHECK(cudaSetDevice(passed_device_ordinal));
};
hostNodeParams.userData = (void *)&device_1;
// Node #1: Add the 1st setDevice
CUDA_CHECK(cudaGraphAddHostNode(&setDevice_1, graph, &copy_0to1, 1,
&hostNodeParams));
When running the code, I get this output:
CUDA-Graph: in the host node: changing the device to: 1
Error operation not permitted at line 68 in file src/MultiGPU.cu
Is it possible to change the device within a CUDA graph?
No. During the execution of a graph, the current device cannot be changed via a host callback, because host callbacks are not allowed to make CUDA API calls.
There are two ways to specify the device on which a kernel within the graph will execute.
Use stream-capture to create a multi-gpu graph.
When manually constructing the graph, nodes will be assigned to the currently active device. Use cudaSetDevice before adding your kernel.
The following code demonstrates both with a simple pipeline which executes (kernel, memcpy to host, host callback) on each gpu.
#include <thread>
#include <future>
#include <chrono>
#include <array>
#include <vector>
#include <cassert>
// Writes the sentinel value 42 into *data.
// Launched as <<<1,1>>> in this example, so no index guard is needed;
// data must be a valid device pointer on the kernel's launch device.
__global__
void kernel(int* data){
*data = 42;
}
// Arguments for the host callback: the pinned staging buffer that the
// async D2H copy writes into, and the host-side vector the callback
// appends the staged value to.
struct CallbackData{
int* pinnedBuffer; // pinned host memory, one int, written by cudaMemcpyAsync
std::vector<int>* vec; // result collector, grown by callback()
};
// Host callback (cudaHostFn_t signature): reads the value previously
// staged into the pinned buffer and appends it to the result vector.
// Runs on a CPU thread; must not make CUDA API calls.
void callback(void* args){
auto* cb = static_cast<CallbackData*>(args);
cb->vec->push_back(*cb->pinnedBuffer);
}
// Demonstrates the same multi-GPU pipeline (kernel -> D2H memcpy -> host
// callback, per device) executed three ways: plain streams, a
// stream-captured graph, and a manually constructed graph.
// NOTE(review): error checking is omitted throughout for brevity; real
// code should check every cudaError_t return value.
int main(){
constexpr int numDevices = 2;
std::array<int, numDevices> deviceIds{0,1};
constexpr int numIterations = 100;
// Per-device resources: stream, event, device buffer, pinned host
// staging buffer, result vector, and the callback argument struct.
std::array<cudaStream_t, numDevices> streams{};
std::array<cudaEvent_t, numDevices> events{};
std::array<int*, numDevices> deviceBuffers{};
std::array<int*, numDevices> pinnedBuffers{};
std::array<std::vector<int>, numDevices> vectors{};
std::array<CallbackData, numDevices> callbackArgs{};
// Allocate resources on each device. cudaSetDevice makes the following
// stream/event/allocation calls target device i.
for(int i = 0; i < numDevices; i++){
cudaSetDevice(deviceIds[i]);
cudaStreamCreate(&streams[i]);
cudaEventCreate(&events[i], cudaEventDisableTiming);
cudaMalloc(&deviceBuffers[i], sizeof(int));
cudaMallocHost(&pinnedBuffers[i], sizeof(int));
// Reserve up front so push_back in the callback never reallocates
// while callbacks may be running concurrently with host code.
vectors[i].reserve(numIterations);
callbackArgs[i].pinnedBuffer = pinnedBuffers[i];
callbackArgs[i].vec = &vectors[i];
}
// The "main" stream/event live on device 0 and fan work out to the
// per-device streams via events, then join back.
cudaSetDevice(deviceIds[0]);
cudaStream_t mainstream;
cudaStreamCreate(&mainstream);
cudaEvent_t mainevent;
cudaEventCreate(&mainevent, cudaEventDisableTiming);
// Enqueues the whole pipeline. Used both for direct execution and for
// stream capture: the cudaSetDevice calls inside determine which device
// each kernel node is assigned to during capture.
auto launch = [&](){
cudaEventRecord(mainevent, mainstream);
for(int i = 0; i < numDevices; i++){
cudaSetDevice(deviceIds[i]);
auto& stream = streams[i];
// Fork: device stream waits on the main stream's event.
cudaStreamWaitEvent(stream, mainevent);
for(int k = 0; k < numIterations; k++){
kernel<<<1,1,0,stream>>>(deviceBuffers[i]);
cudaMemcpyAsync(pinnedBuffers[i], deviceBuffers[i], sizeof(int), cudaMemcpyDeviceToHost, stream);
cudaLaunchHostFunc(stream, callback, (void*)&callbackArgs[i]);
}
// Join: main stream waits on each device stream's completion event.
cudaEventRecord(events[i], stream);
cudaStreamWaitEvent(mainstream, events[i]);
}
cudaSetDevice(deviceIds[0]);
};
// no graph
launch();
cudaStreamSynchronize(mainstream);
// Verify: every iteration's callback appended the kernel's value.
for(int i = 0; i < numDevices; i++){
assert(vectors[i].size() == numIterations);
for(auto x : vectors[i]){
assert(x == 42);
}
vectors[i].clear();
}
//stream capture graph
{
// Relaxed mode permits capturing work submitted to multiple streams
// (the per-device streams are folded in via the event dependencies).
cudaStreamBeginCapture(mainstream, cudaStreamCaptureModeRelaxed);
launch();
cudaGraph_t graph;
cudaStreamEndCapture(mainstream, &graph);
cudaGraphExec_t execGraph;
cudaGraphNode_t errorNode;
cudaError_t status = cudaGraphInstantiate(&execGraph, graph, &errorNode, nullptr, 0);
assert(status == cudaSuccess) ;
// The graph template can be destroyed once instantiated.
cudaGraphDestroy(graph);
cudaGraphLaunch(execGraph, mainstream);
cudaStreamSynchronize(mainstream);
for(int i = 0; i < numDevices; i++){
assert(vectors[i].size() == numIterations);
for(auto x : vectors[i]){
assert(x == 42);
}
vectors[i].clear();
}
cudaGraphExecDestroy(execGraph);
}
//construct graph manually
{
cudaGraph_t graph;
cudaGraphCreate(&graph, 0);
for(int i = 0; i < numDevices; i++){
// Kernel nodes added below are assigned to the device that is
// current at cudaGraphAddKernelNode time — this is the manual
// equivalent of multi-GPU stream capture.
cudaSetDevice(deviceIds[i]);
cudaGraphNode_t* prev = nullptr;
cudaGraphNode_t kernelNode;
cudaGraphNode_t memcpyNode;
cudaGraphNode_t hostNode;
cudaKernelNodeParams kernelNodeParams{};
kernelNodeParams.func = (void *)kernel;
kernelNodeParams.gridDim = dim3(1, 1, 1);
kernelNodeParams.blockDim = dim3(1, 1, 1);
kernelNodeParams.sharedMemBytes = 0;
void *kernelArgs[1] = {(void *)&deviceBuffers[i]};
kernelNodeParams.kernelParams = kernelArgs;
kernelNodeParams.extra = NULL;
cudaHostNodeParams hostNodeParams{};
hostNodeParams.fn = callback;
hostNodeParams.userData = &callbackArgs[i];
// Chain: kernel -> memcpy -> host callback, each iteration
// depending on the previous iteration's host node.
for(int k = 0; k < numIterations; k++){
cudaGraphAddKernelNode(&kernelNode, graph, prev, (prev == nullptr ? 0 : 1), &kernelNodeParams);
cudaGraphAddMemcpyNode1D(&memcpyNode, graph, &kernelNode, 1, pinnedBuffers[i], deviceBuffers[i], sizeof(int), cudaMemcpyDeviceToHost);
cudaGraphAddHostNode(&hostNode, graph, &memcpyNode, 1, &hostNodeParams);
prev = &hostNode;
}
cudaSetDevice(deviceIds[0]);
}
cudaGraphExec_t execGraph;
cudaGraphNode_t errorNode;
cudaError_t status = cudaGraphInstantiate(&execGraph, graph, &errorNode, nullptr, 0);
assert(status == cudaSuccess) ;
cudaGraphDestroy(graph);
cudaGraphLaunch(execGraph, mainstream);
cudaStreamSynchronize(mainstream);
for(int i = 0; i < numDevices; i++){
assert(vectors[i].size() == numIterations);
for(auto x : vectors[i]){
assert(x == 42);
}
vectors[i].clear();
}
cudaGraphExecDestroy(execGraph);
}
// Cleanup: destroy per-device resources with the owning device current.
cudaEventDestroy(mainevent);
cudaStreamDestroy(mainstream);
for(int i = 0; i < numDevices; i++){
cudaSetDevice(deviceIds[i]);
cudaStreamDestroy(streams[i]);
cudaEventDestroy(events[i]);
cudaFree(deviceBuffers[i]);
cudaFreeHost(pinnedBuffers[i]);
}
}