cuda, multi-gpu, cuda-graphs

Is changing the device in a CUDA graph node not supported?


I have tried to change the current device inside a CUDA graph by creating this host node:

cudaGraph_t graph;

// Node #1: Create the 1st setDevice
cudaHostNodeParams hostNodeParams = {0};

memset(&hostNodeParams, 0, sizeof(hostNodeParams));
hostNodeParams.fn = [](void *data) {
  int passed_device_ordinal = *(int *)(data);
  cout << "CUDA-Graph: in the host node: changing the device to: "
        << passed_device_ordinal << endl;
  CUDA_CHECK(cudaSetDevice(passed_device_ordinal));
};
hostNodeParams.userData = (void *)&device_1;

// Node #1: Add the 1st setDevice
CUDA_CHECK(cudaGraphAddHostNode(&setDevice_1, graph, &copy_0to1, 1,
                                &hostNodeParams));

When running the code, I get this output:

CUDA-Graph: in the host node: changing the device to: 1
Error operation not permitted at line 68 in file src/MultiGPU.cu

Is it possible to change the device within a CUDA graph?


Solution

  • During the execution of a graph, the current device cannot be changed via a host callback, since host callbacks are not allowed to make CUDA API calls.

    There are two ways to specify the device on which a kernel within the graph will execute.

    1. Use stream capture to create a multi-GPU graph.

    2. When constructing the graph manually, each node is assigned to the device that is current at the time the node is added. Call cudaSetDevice before adding your kernel node, as sketched in the snippet after this list.
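
    A minimal sketch of option 2 (assuming graph has already been created and kernelNodeParams has been filled in, as in the full example below): the device that is current when cudaGraphAddKernelNode is called determines the device on which that kernel node will execute.

    cudaGraphNode_t node;
    cudaSetDevice(1);   // make device 1 current
    cudaGraphAddKernelNode(&node, graph, nullptr, 0, &kernelNodeParams);   // this node runs on device 1
    cudaSetDevice(0);   // restore the previous device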

    The following code demonstrates both approaches with a simple pipeline that executes (kernel, memcpy to host, host callback) on each GPU.

    #include <thread>
    #include <future>
    #include <chrono>
    #include <array>
    #include <vector>
    #include <cassert>
    
    // Writes the value 42 into the per-device buffer
    __global__
    void kernel(int* data){
        *data = 42;
    }
    
    // Arguments handed to the host callback for one device
    struct CallbackData{
        int* pinnedBuffer;
        std::vector<int>* vec;
    };
    
    // Host callback: record the value that arrived in the pinned host buffer
    void callback(void* args){
        CallbackData* data = static_cast<CallbackData*>(args);
        data->vec->push_back(*data->pinnedBuffer);
    }
    
    int main(){
        constexpr int numDevices = 2;
        std::array<int, numDevices> deviceIds{0,1};
    
        constexpr int numIterations = 100;
    
    
        std::array<cudaStream_t, numDevices> streams{};
        std::array<cudaEvent_t, numDevices> events{};
        std::array<int*, numDevices> deviceBuffers{};
        std::array<int*, numDevices> pinnedBuffers{};
        std::array<std::vector<int>, numDevices> vectors{};
        std::array<CallbackData, numDevices> callbackArgs{};
    
        // Per-device setup: stream, event, device buffer, pinned result buffer
        for(int i = 0; i < numDevices; i++){
            cudaSetDevice(deviceIds[i]);
            cudaStreamCreate(&streams[i]);
            cudaEventCreateWithFlags(&events[i], cudaEventDisableTiming);
            cudaMalloc(&deviceBuffers[i], sizeof(int));
            cudaMallocHost(&pinnedBuffers[i], sizeof(int));
    
            vectors[i].reserve(numIterations);
    
            callbackArgs[i].pinnedBuffer = pinnedBuffers[i];
            callbackArgs[i].vec = &vectors[i];
        }
    
        cudaSetDevice(deviceIds[0]);
    
        cudaStream_t mainstream;
        cudaStreamCreate(&mainstream);
        cudaEvent_t mainevent;
        cudaEventCreateWithFlags(&mainevent, cudaEventDisableTiming);
    
        // Enqueue the whole multi-GPU pipeline: each device stream waits on the
        // main stream, runs its iterations, and the main stream then waits for
        // every device's completion event.
        auto launch = [&](){
    
            cudaEventRecord(mainevent, mainstream);    
    
            for(int i = 0; i < numDevices; i++){
                cudaSetDevice(deviceIds[i]);
                auto& stream = streams[i];
                cudaStreamWaitEvent(stream, mainevent);
    
                for(int k = 0; k < numIterations; k++){
                    kernel<<<1,1,0,stream>>>(deviceBuffers[i]);
                    cudaMemcpyAsync(pinnedBuffers[i], deviceBuffers[i], sizeof(int), cudaMemcpyDeviceToHost, stream);
                    cudaLaunchHostFunc(stream, callback, (void*)&callbackArgs[i]);
                }
                cudaEventRecord(events[i], stream);
                cudaStreamWaitEvent(mainstream, events[i]);
            }
    
            cudaSetDevice(deviceIds[0]);
    
        };
    
        // no graph
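        // Baseline: run the pipeline directly on the streams and verify the
        // results before the same work is expressed as graphs below.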
    
        launch();
    
        cudaStreamSynchronize(mainstream);
    
        for(int i = 0; i < numDevices; i++){
            assert(vectors[i].size() == numIterations);
            for(auto x : vectors[i]){
                assert(x == 42);
            }
            vectors[i].clear();
        }
    
        //stream capture graph
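        // The work submitted by launch() on the per-device streams is captured
        // into a single graph; each captured kernel keeps the device of the
        // stream it was launched into, so no device switch is needed at replay.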
        {
    
            cudaStreamBeginCapture(mainstream, cudaStreamCaptureModeRelaxed);
    
            launch();
    
            cudaGraph_t graph;
            cudaStreamEndCapture(mainstream, &graph);
            
            cudaGraphExec_t execGraph;
            cudaGraphNode_t errorNode;
            cudaError_t status = cudaGraphInstantiate(&execGraph, graph, &errorNode, nullptr, 0);
            assert(status == cudaSuccess);
    
            cudaGraphDestroy(graph);
    
            
            
            cudaGraphLaunch(execGraph, mainstream);
    
            cudaStreamSynchronize(mainstream);
    
            for(int i = 0; i < numDevices; i++){
                assert(vectors[i].size() == numIterations);
                for(auto x : vectors[i]){
                    assert(x == 42);
                }
                vectors[i].clear();
            }
    
            cudaGraphExecDestroy(execGraph);
        }
    
    
    
        //construct graph manually
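        // Each node is assigned to the device that is current when it is added,
        // so cudaSetDevice is called before building the node chain for each device.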
        {
            cudaGraph_t graph;
            cudaGraphCreate(&graph, 0);
    
            for(int i = 0; i < numDevices; i++){
                cudaSetDevice(deviceIds[i]);
    
                cudaGraphNode_t* prev = nullptr;
                cudaGraphNode_t kernelNode;
                cudaGraphNode_t memcpyNode;
                cudaGraphNode_t hostNode;            
    
                cudaKernelNodeParams kernelNodeParams{};
                kernelNodeParams.func = (void *)kernel;
                kernelNodeParams.gridDim = dim3(1, 1, 1);
                kernelNodeParams.blockDim = dim3(1, 1, 1);
                kernelNodeParams.sharedMemBytes = 0;
                void *kernelArgs[1] = {(void *)&deviceBuffers[i]};
                kernelNodeParams.kernelParams = kernelArgs;
                kernelNodeParams.extra = NULL;
    
                cudaHostNodeParams hostNodeParams{};
                hostNodeParams.fn = callback;
                hostNodeParams.userData = &callbackArgs[i];
    
                for(int k = 0; k < numIterations; k++){
                    cudaGraphAddKernelNode(&kernelNode, graph, prev, (prev == nullptr ? 0 : 1), &kernelNodeParams);
                    cudaGraphAddMemcpyNode1D(&memcpyNode, graph, &kernelNode, 1, pinnedBuffers[i], deviceBuffers[i], sizeof(int), cudaMemcpyDeviceToHost);
                    cudaGraphAddHostNode(&hostNode, graph, &memcpyNode, 1, &hostNodeParams);
    
                    prev = &hostNode;
                }
    
                cudaSetDevice(deviceIds[0]);
            }
    
    
            cudaGraphExec_t execGraph;
            cudaGraphNode_t errorNode;
            cudaError_t status = cudaGraphInstantiate(&execGraph, graph, &errorNode, nullptr, 0);
            assert(status == cudaSuccess);
    
            cudaGraphDestroy(graph);        
            
            cudaGraphLaunch(execGraph, mainstream);
    
            cudaStreamSynchronize(mainstream);
    
            for(int i = 0; i < numDevices; i++){
                assert(vectors[i].size() == numIterations);
                for(auto x : vectors[i]){
                    assert(x == 42);
                }
                vectors[i].clear();
            }
    
            cudaGraphExecDestroy(execGraph);        
        }
    
    
        cudaEventDestroy(mainevent);
        cudaStreamDestroy(mainstream);
        
        for(int i = 0; i < numDevices; i++){
            cudaSetDevice(deviceIds[i]);
            cudaStreamDestroy(streams[i]);
            cudaEventDestroy(events[i]);
            cudaFree(deviceBuffers[i]);
            cudaFreeHost(pinnedBuffers[i]);
        }
    }
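
    As a usage note: assuming two visible devices and a source file named, say, multi_gpu_graphs.cu (the name is only illustrative), the example should build and run with a plain nvcc invocation:

    nvcc -O2 multi_gpu_graphs.cu -o multi_gpu_graphs
    ./multi_gpu_graphs

    The program prints nothing on success; the asserts after each of the three variants check that every iteration stored 42 in the corresponding result vector.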