Tags: c++, cuda, interop, direct3d11, libtorch

How to convert a cudaArray to a Torch tensor?


I am experimenting with Torch and CUDA. Using torch::from_blob() I have been able to do the following so far:

#include <cuda_runtime.h>
#include <torch/torch.h>
#include <iostream>
#include <exception>
#include <memory>
#include <math.h>

using std::cout;
using std::endl;
using std::exception;

/*
 * Demonstration of interoperability between CUDA and the Torch C++ API using
 * pinned memory.
 *
 * With the ENABLE_ERROR flag a change in the CUDA result can be introduced
 * through its respective Torch tensor. This also affects the data copied
 * from GPU to CPU, causing the assert checks at the end to fail.
 */

// Contains the call to the CUDA kernel
void vector_add(int* a, int* b, int* c, int N, int cuda_grid_size, int cuda_block_size);

bool ENABLE_ERROR = false;

int main(int argc, const char* argv[])
{
    // Setup array, here 2^16 = 65536 items
    const int N = 1 << 16;
    size_t bytes = N * sizeof(int);

    // Declare pinned memory pointers
    int* a_cpu, * b_cpu, * c_cpu;

    // Allocate pinned (page-locked) host memory for the pointers.
    // Pinned memory allows faster transfers between the CPU and the GPU.
    cout << "Allocating memory for vectors on CPU" << endl;
    cudaMallocHost(&a_cpu, bytes);
    cudaMallocHost(&b_cpu, bytes);
    cudaMallocHost(&c_cpu, bytes);

    // Init vectors
    cout << "Populating vectors with random integers" << endl;
    for (int i = 0; i < N; ++i)
    {
        a_cpu[i] = rand() % 100;
        b_cpu[i] = rand() % 100;
    }

    // Declare GPU memory pointers
    int* a_gpu, * b_gpu, * c_gpu;

    // Allocate memory on the device
    cout << "Allocating memory for vectors on GPU" << endl;
    cudaMalloc(&a_gpu, bytes);
    cudaMalloc(&b_gpu, bytes);
    cudaMalloc(&c_gpu, bytes);

    // Copy data from the host to the device (CPU -> GPU)
    cout << "Transferring vectors from CPU to GPU" << endl;
    cudaMemcpy(a_gpu, a_cpu, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(b_gpu, b_cpu, bytes, cudaMemcpyHostToDevice);

    // Specify threads per CUDA block (CTA), here 2^10 = 1024 threads
    int NUM_THREADS = 1 << 10;

    // CTAs per grid
    int NUM_BLOCKS = (N + NUM_THREADS - 1) / NUM_THREADS;

    // Call CUDA kernel
    cout << "Running CUDA kernels" << endl;
    vector_add(a_gpu, b_gpu, c_gpu, N, NUM_BLOCKS, NUM_THREADS);

    try
    {
        // Convert pinned memory on GPU to Torch tensor on GPU
        auto options = torch::TensorOptions().dtype(torch::kInt).device(torch::kCUDA, 0).pinned_memory(true);
        cout << "Converting vectors and result to Torch tensors on GPU" << endl;
        torch::Tensor a_gpu_tensor = torch::from_blob(a_gpu, { N }, options);
        torch::Tensor b_gpu_tensor = torch::from_blob(b_gpu, { N }, options);
        torch::Tensor c_gpu_tensor = torch::from_blob(c_gpu, { N }, options);

        cout << "Verifying result using Torch tensors" << endl;
        if (ENABLE_ERROR)
        {
            /*
            TEST
            Changing the value of the result should do two things:
             - the GPU memory will be modified
             - the CPU test later on (after the GPU memory is copied to the CPU side) should fail
            */
            cout << "ERROR GENERATION ENABLED! Application will crash during verification of results" << endl;
            cout << "Changing result first element from " << c_gpu_tensor[0];
            c_gpu_tensor[0] = 99999999;
            cout << " to " << c_gpu_tensor[0] << endl;
        }
        else
        {
            assert(c_gpu_tensor.equal(a_gpu_tensor.add(b_gpu_tensor)) == true);
        }
    }
    catch (exception& e)
    {
        cout << e.what() << endl;

        cudaFreeHost(a_cpu);
        cudaFreeHost(b_cpu);
        cudaFreeHost(c_cpu);

        cudaFree(a_gpu);
        cudaFree(b_gpu);
        cudaFree(c_gpu);

        return 1;
    }

    // Copy the result from the device back to the host (GPU -> CPU); cudaMemcpy also synchronizes implicitly
    cout << "Synchronizing CPU and GPU. Copying result from GPU to CPU" << endl;
    cudaMemcpy(c_cpu, c_gpu, bytes, cudaMemcpyDeviceToHost);

    // Verify the result on the CPU
    cout << "Verifying result on CPU" << endl;
    for (int i = 0; i < N; ++i)
    {
        assert(c_cpu[i] == a_cpu[i] + b_cpu[i]);
    }

    cudaFreeHost(a_cpu);
    cudaFreeHost(b_cpu);
    cudaFreeHost(c_cpu);

    cudaFree(a_gpu);
    cudaFree(b_gpu);
    cudaFree(c_gpu);

    return 0;
}

with a kernel

__global__ void vector_add_kernel(int* a, int* b, int* c, int N)
{
    // Calculate global thread ID
    int t_id = (blockDim.x * blockIdx.x) + threadIdx.x;

    // Check boundary
    if (t_id < N)
    {
        c[t_id] = a[t_id] + b[t_id];
    }
}

void vector_add(int* a, int* b, int* c, int N, int cuda_grid_size, int cuda_block_size)
{
    vector_add_kernel<<<cuda_grid_size, cuda_block_size>>>(a, b, c, N);
    cudaGetLastError();
}

The code above uses pinned memory (for fast transfers between CPU and GPU) and adds two vectors using the respective kernel. In addition, I convert the GPU memory blocks used for those vectors to libtorch tensors, all while remaining on the GPU, and perform the same addition using the tensors. I even added a small "error" that let me verify that the data I initially allocated (without the tensors) is actually changed when the tensors are manipulated.

I have also managed to use cv::Mat's data member, a pointer to the pixel data of an OpenCV image, with torch::from_blob() successfully, e.g.

auto tensor_input = torch::from_blob(img_torch.data, { 1, img_torch.size().height, img_torch.size().width, 1 }, torch::kFloat32);
tensor_input = tensor_input.permute({ 0, 3, 1, 2 });

for a BGRA (PNG) image that I had to convert to CV_32FC3 in order to use it with my ML model, and to play around a bit with the tensor's shape (the permute() above).
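
For completeness, that conversion could look roughly like this (a sketch with placeholder names, not my exact code):

cv::Mat img_bgra = cv::imread("input.png", cv::IMREAD_UNCHANGED); // 4-channel BGRA PNG
cv::Mat img_bgr, img_torch;
cv::cvtColor(img_bgra, img_bgr, cv::COLOR_BGRA2BGR);    // drop the alpha channel
img_bgr.convertTo(img_torch, CV_32FC3, 1.0 / 255.0);    // 3-channel float, scaled to [0, 1]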

I am unable to do this with a cudaArray and would like to know if that is even possible.

The reason why I am using a cudaArray is that, just as in the description of this type, I am storing a texture (in my case a D3D11 2D texture) that I need to process. I am actually able to do that with a pure CUDA kernel that I've written myself, together with a cudaSurfaceObject_t, which I doubt I can pass to libtorch in any shape or form.

I am looking for something along the lines of (pseudo-code):

// Register the D3D11 texture as cudaGraphicsResource* cu_arr_interop, then map it with cudaGraphicsMapResources(...)
...

// Map the texture's texels to a CUDA array
cudaArray* cu_arr;
cudaGraphicsSubResourceGetMappedArray(&cu_arr, cu_arr_interop, 0, 0);

// Convert the CUDA array to a Torch tensor
auto options = torch::TensorOptions().dtype(...).device(torch::kCUDA, 0).pinned_memory(true);
auto tensor_in = torch::from_blob((void*)cu_arr, { ... }, options);

// Run ML model
auto tensor_out = module.forward({ tensor_in }).toTensor();

// See result on screen
...

// cudaGraphicsUnmapResources(...)
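
For reference, the registration and mapping part (which I already have working with my own kernels) looks roughly like this; a simplified sketch, not my exact code, assuming an ID3D11Texture2D* named d3d11_texture and default flags, with error checking omitted:

#include <cuda_d3d11_interop.h>

// Register the D3D11 texture with CUDA once
cudaGraphicsResource* cu_arr_interop = nullptr;
cudaGraphicsD3D11RegisterResource(&cu_arr_interop, d3d11_texture, cudaGraphicsRegisterFlagsNone);

// Map it (e.g. every frame) and get the cudaArray backing mip level 0
cudaGraphicsMapResources(1, &cu_arr_interop);
cudaArray* cu_arr = nullptr;
cudaGraphicsSubResourceGetMappedArray(&cu_arr, cu_arr_interop, 0, 0);

// ... process cu_arr ...

cudaGraphicsUnmapResources(1, &cu_arr_interop);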

Solution

  • Following the comments, I managed to map the data from and to a CUDA array. The intermediate libtorch tensor is fully functional.

    Code for CUDA to libtorch Tensor

    cudaError_t cr = cudaSuccess;
    
    // Allocate linear CUDA memory
    void* copy = nullptr;
    cr = cudaMalloc(&copy, dpitch * height);
    if (cr != cudaSuccess)
    {
        ...
    }
    
    // Copying the input CUDA array to the flat CUDA memory
    cr = cudaMemcpy2DFromArray(copy, dpitch, array_read, 0, 0, dpitch, height, cudaMemcpyDeviceToDevice);
    if (cr != cudaSuccess)
    {
        ...
    }
    
    // Setup tensor that maps the flat CUDA memory so that it can be used in libtorch
    at::Tensor tensor_in;
    auto options = torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCUDA, 0).pinned_memory(true);
    // Map memory as a HEIGHTxWIDTHxCHANNELS tensor that will represent the image with its 4 channels
    tensor_in = torch::from_blob(copy, { height, width, 4 }, options);
    // Permute so that the channels are the first dimension. This allows extracting the pixel data per channel as a separate tensor
    tensor_in = tensor_in.permute({2, 0, 1});
    

    Further conversions depend on the model that will be used for inference on the given tensor. The above permutation allows extracting each channel as a separate tensor. In my case I had to do some extra conversions to make the tensors compatible with my model, e.g.

    // Extract channels and convert to tensors that are compatible with the expected input for the ML
    at::Tensor tensor_in_R, tensor_in_G, tensor_in_B, tensor_in_A;
    tensor_in_R = tensor_in[0].div(255.0).unsqueeze(0).unsqueeze(0).to(torch::kFloat32);
    tensor_in_G = ...
    tensor_in_B = ...
    tensor_in_A = ...
    

    While the copying back is done with

    // Copy tensor to the CUDA output array
    cr = cudaMemcpy2DToArray(array_write,
        0, 0,
        tensor_out.data_ptr(),
        dpitch, dpitch,
        height, cudaMemcpyDeviceToDevice);
    

    with dpitch being equal to width * sizeof(unsigned char) * 4, this does not work correctly yet (see the note at the end).
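
    For reference, the parameters of cudaMemcpy2DToArray() are (dst, wOffset, hOffset, src, spitch, width, height, kind), where spitch is the source row pitch in bytes and width is the size of each copied row in bytes. Spelled out, the call would look like this (a sketch, assuming a tightly packed RGBA8 image, in which case the pitch and the row width in bytes coincide; row_bytes is just dpitch written out):

    // Row size in bytes of a tightly packed RGBA8 image
    size_t row_bytes = width * sizeof(unsigned char) * 4;
    cr = cudaMemcpy2DToArray(array_write,
        0, 0,                       // wOffset, hOffset in the destination array
        tensor_out.data_ptr(),      // source pointer; the tensor must be contiguous
        row_bytes,                  // spitch: source row pitch in bytes
        row_bytes,                  // width: bytes to copy per row
        height,                     // number of rows
        cudaMemcpyDeviceToDevice);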

    The output tensor (the result of the inference) needs to be post-processed: (un)squeezing dimensions where necessary, permuting, converting back to the original data format (e.g. torch::kUInt8 in my case) and so on.
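
    A minimal sketch of that post-processing, assuming the reassembled output tensor is a CHW float tensor in [0, 1] (the exact steps depend on the model):

    at::Tensor tensor_out_u8 = tensor_out
        .squeeze()                  // drop any leftover batch dimensions
        .mul(255.0).clamp(0, 255)   // back to the 0-255 range
        .to(torch::kUInt8)          // original data format of the texture
        .permute({ 1, 2, 0 })       // CHW -> HWC, the layout of the flat RGBA8 buffer
        .contiguous();              // make the memory layout match the shape before taking data_ptr()

    The contiguous() call matters: after permute() the tensor is only a view, so data_ptr() would otherwise return memory whose element order does not match the new shape.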

    Two debugging steps turned out to be very important.

    First, due to libtorch's poor documentation in regard to exceptions and error handling overall, I recommend dumping intermediate results from C++ to serialized tensor files. These can then be loaded in Python using

    t_from_cpp = list(torch.jit.load('tensor_cpp_dump.pt').parameters())[0]
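
    On the C++ side, a minimal way to produce such a file is the single-tensor torch::save() overload which, as far as I can tell, writes a JIT archive with the tensor stored as a parameter:

    // Dump an intermediate result so it can be inspected from Python via torch.jit.load()
    torch::save(tensor_in, "tensor_cpp_dump.pt");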
    

    Second, you can use torchvision.transforms with ToPILImage() to visualize the tensor or save it as an image. Checking the shape and experimenting with various conversions of the tensor in Python offers a fast way to arrive at a working solution, which you can then port back to C++. By comparison, running inference in C++ offers zero feedback when an error occurs, while in PyTorch you will often get a clear description of what went wrong, including a full trace.


    Whenever I see channels displayed as separate images, or some other odd artifact, my first thought is that the memory is not laid out or read in the right order. I am almost certain that the double use of dpitch as an argument to cudaMemcpy2DToArray() is the culprit; which values need to go there is still a mystery to me.