cuda, multi-gpu, cuda-graphs

Is changing the device in a CUDA graph node not supported?


I have tried to change the current device inside a CUDA graph by creating this host node:

cudaGraph_t graph;

// Node #1: Create the 1st setDevice
cudaHostNodeParams hostNodeParams = {0};

memset(&hostNodeParams, 0, sizeof(hostNodeParams));
hostNodeParams.fn = [](void *data) {
  int passed_device_ordinal = *(int *)(data);
  cout << "CUDA-Graph: in the host node: changing the device to: "
        << passed_device_ordinal << endl;
  CUDA_CHECK(cudaSetDevice(passed_device_ordinal));
};
hostNodeParams.userData = (void *)&device_1;

// Node #1: Add the 1st setDevice
CUDA_CHECK(cudaGraphAddHostNode(&setDevice_1, graph, &copy_0to1, 1,
                                &hostNodeParams));

When running the code, I get this output:

CUDA-Graph: in the host node: changing the device to: 1
Error operation not permitted at line 68 in file src/MultiGPU.cu

Is it possible to change the device within a CUDA graph?


Solution

  • During the execution of a graph, the current device cannot be changed via a host callback, since host callbacks are not allowed to make CUDA API calls.

    There are two ways to specify the device on which a kernel within the graph will execute.

    1. Use stream capture to create a multi-GPU graph.

    2. When constructing the graph manually, each node is assigned to the device that is current at the time the node is added. Call cudaSetDevice before adding your kernel node, as sketched in the snippet after this list.
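
    A minimal sketch of option 2 (assuming graph has already been created and kernelNodeParams has been filled in, as in the full example below): the device that is current when cudaGraphAddKernelNode is called determines the device on which that kernel node will execute.

    cudaGraphNode_t node;
    cudaSetDevice(1);   // make device 1 current
    cudaGraphAddKernelNode(&node, graph, nullptr, 0, &kernelNodeParams);   // this node runs on device 1
    cudaSetDevice(0);   // restore the previous device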

    The following code demonstrates both approaches with a simple pipeline that executes (kernel, memcpy to host, host callback) on each GPU.

    #include <thread>
    #include <future>
    #include <chrono>
    #include <array>
    #include <vector>
    #include <cassert>
    
    // Writes the value 42 into the per-device buffer
    __global__
    void kernel(int* data){
        *data = 42;
    }
    
    // Arguments handed to the host callback for one device
    struct CallbackData{
        int* pinnedBuffer;
        std::vector<int>* vec;
    };
    
    // Host callback: record the value that arrived in the pinned host buffer
    void callback(void* args){
        CallbackData* data = static_cast<CallbackData*>(args);
        data->vec->push_back(*data->pinnedBuffer);
    }
    
    int main(){
        constexpr int numDevices = 2;
        std::array<int, numDevices> deviceIds{0,1};
    
        constexpr int numIterations = 100;
    
    
        std::array<cudaStream_t, numDevices> streams{};
        std::array<cudaEvent_t, numDevices> events{};
        std::array<int*, numDevices> deviceBuffers{};
        std::array<int*, numDevices> pinnedBuffers{};
        std::array<std::vector<int>, numDevices> vectors{};
        std::array<CallbackData, numDevices> callbackArgs{};
    
        // Per-device setup: stream, event, device buffer, pinned result buffer
        for(int i = 0; i < numDevices; i++){
            cudaSetDevice(deviceIds[i]);
            cudaStreamCreate(&streams[i]);
            cudaEventCreateWithFlags(&events[i], cudaEventDisableTiming);
            cudaMalloc(&deviceBuffers[i], sizeof(int));
            cudaMallocHost(&pinnedBuffers[i], sizeof(int));
    
            vectors[i].reserve(numIterations);
    
            callbackArgs[i].pinnedBuffer = pinnedBuffers[i];
            callbackArgs[i].vec = &vectors[i];
        }
    
        cudaSetDevice(deviceIds[0]);
    
        cudaStream_t mainstream;
        cudaStreamCreate(&mainstream);
        cudaEvent_t mainevent;
        cudaEventCreateWithFlags(&mainevent, cudaEventDisableTiming);
    
        // Enqueue the whole multi-GPU pipeline: each device stream waits on the
        // main stream, runs its iterations, and the main stream then waits for
        // every device's completion event.
        auto launch = [&](){
    
            cudaEventRecord(mainevent, mainstream);    
    
            for(int i = 0; i < numDevices; i++){
                cudaSetDevice(deviceIds[i]);
                auto& stream = streams[i];
                cudaStreamWaitEvent(stream, mainevent);
    
                for(int k = 0; k < numIterations; k++){
                    kernel<<<1,1,0,stream>>>(deviceBuffers[i]);
                    cudaMemcpyAsync(pinnedBuffers[i], deviceBuffers[i], sizeof(int), cudaMemcpyDeviceToHost, stream);
                    cudaLaunchHostFunc(stream, callback, (void*)&callbackArgs[i]);
                }
                cudaEventRecord(events[i], stream);
                cudaStreamWaitEvent(mainstream, events[i]);
            }
    
            cudaSetDevice(deviceIds[0]);
    
        };
    
        // no graph
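        // Baseline: run the pipeline directly on the streams and verify the
        // results before the same work is expressed as graphs below.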
    
        launch();
    
        cudaStreamSynchronize(mainstream);
    
        for(int i = 0; i < numDevices; i++){
            assert(vectors[i].size() == numIterations);
            for(auto x : vectors[i]){
                assert(x == 42);
            }
            vectors[i].clear();
        }
    
        //stream capture graph
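        // The work submitted by launch() on the per-device streams is captured
        // into a single graph; each captured kernel keeps the device of the
        // stream it was launched into, so no device switch is needed at replay.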
        {
    
            cudaStreamBeginCapture(mainstream, cudaStreamCaptureModeRelaxed);
    
            launch();
    
            cudaGraph_t graph;
            cudaStreamEndCapture(mainstream, &graph);
            
            cudaGraphExec_t execGraph;
            cudaGraphNode_t errorNode;
            cudaError_t status = cudaGraphInstantiate(&execGraph, graph, &errorNode, nullptr, 0);
            assert(status == cudaSuccess);
    
            cudaGraphDestroy(graph);
    
            
            
            cudaGraphLaunch(execGraph, mainstream);
    
            cudaStreamSynchronize(mainstream);
    
            for(int i = 0; i < numDevices; i++){
                assert(vectors[i].size() == numIterations);
                for(auto x : vectors[i]){
                    assert(x == 42);
                }
                vectors[i].clear();
            }
    
            cudaGraphExecDestroy(execGraph);
        }
    
    
    
        //construct graph manually
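        // Each node is assigned to the device that is current when it is added,
        // so cudaSetDevice is called before building the node chain for each device.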
        {
            cudaGraph_t graph;
            cudaGraphCreate(&graph, 0);
    
            for(int i = 0; i < numDevices; i++){
                cudaSetDevice(deviceIds[i]);
    
                cudaGraphNode_t* prev = nullptr;
                cudaGraphNode_t kernelNode;
                cudaGraphNode_t memcpyNode;
                cudaGraphNode_t hostNode;            
    
                cudaKernelNodeParams kernelNodeParams{};
                kernelNodeParams.func = (void *)kernel;
                kernelNodeParams.gridDim = dim3(1, 1, 1);
                kernelNodeParams.blockDim = dim3(1, 1, 1);
                kernelNodeParams.sharedMemBytes = 0;
                void *kernelArgs[1] = {(void *)&deviceBuffers[i]};
                kernelNodeParams.kernelParams = kernelArgs;
                kernelNodeParams.extra = NULL;
    
                cudaHostNodeParams hostNodeParams{};
                hostNodeParams.fn = callback;
                hostNodeParams.userData = &callbackArgs[i];
    
                for(int k = 0; k < numIterations; k++){
                    cudaGraphAddKernelNode(&kernelNode, graph, prev, (prev == nullptr ? 0 : 1), &kernelNodeParams);
                    cudaGraphAddMemcpyNode1D(&memcpyNode, graph, &kernelNode, 1, pinnedBuffers[i], deviceBuffers[i], sizeof(int), cudaMemcpyDeviceToHost);
                    cudaGraphAddHostNode(&hostNode, graph, &memcpyNode, 1, &hostNodeParams);
    
                    prev = &hostNode;
                }
    
                cudaSetDevice(deviceIds[0]);
            }
    
    
            cudaGraphExec_t execGraph;
            cudaGraphNode_t errorNode;
            cudaError_t status = cudaGraphInstantiate(&execGraph, graph, &errorNode, nullptr, 0);
            assert(status == cudaSuccess);
    
            cudaGraphDestroy(graph);        
            
            cudaGraphLaunch(execGraph, mainstream);
    
            cudaStreamSynchronize(mainstream);
    
            for(int i = 0; i < numDevices; i++){
                assert(vectors[i].size() == numIterations);
                for(auto x : vectors[i]){
                    assert(x == 42);
                }
                vectors[i].clear();
            }
    
            cudaGraphExecDestroy(execGraph);        
        }
    
    
        cudaEventDestroy(mainevent);
        cudaStreamDestroy(mainstream);
        
        for(int i = 0; i < numDevices; i++){
            cudaSetDevice(deviceIds[i]);
            cudaStreamDestroy(streams[i]);
            cudaEventDestroy(events[i]);
            cudaFree(deviceBuffers[i]);
            cudaFreeHost(pinnedBuffers[i]);
        }
    }
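
    As a usage note: assuming two visible devices and a source file named, say, multi_gpu_graphs.cu (the name is only illustrative), the example should build and run with a plain nvcc invocation:

    nvcc -O2 multi_gpu_graphs.cu -o multi_gpu_graphs
    ./multi_gpu_graphs

    The program prints nothing on success; the asserts after each of the three variants check that every iteration stored 42 in the corresponding result vector.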