Tags: cuda, unified-memory

Unified memory and struct with arrays


I have a big Struct of Arrays of Structs on CUDA, that is constant and read only for my application. A quite simplified example would be

// Simplified layout: a Graph owns an array of Nodes, and each Node owns an
// array of ints — i.e. two levels of embedded pointers that must each be
// valid on the device for a kernel to traverse the structure.
// (Simplified sketch: in real code Node must be declared before Graph,
// and each struct needs a trailing ';'.)
struct Graph{
    Node * nodes;   // array of nNode Nodes (first embedded pointer)
    int nNode;
}
struct Node{
   int* pos;        // array of nPos ints (second embedded pointer)
   int nPos;
}

My kernels would need to navigate this graph and query it. As you know, copying this struct to GPU memory with cudaMalloc and cudaMemcpy is just lots of code, that unified memory is supposed to remove the need of.

In my code, I generated the graph in CPU and then, for testing, I designed the following kernel

// Works: graph is passed by value, so the struct body (including nNode)
// is copied into the kernel's argument space — no pointer is dereferenced.
__global__ void testKernel(const Graph graph,int * d_res){
    d_res[0]=graph.nNode;


};

being called as:

// using malloc for testing to make sure I know what I am doing
int * d_res,* h_res;
cudaMalloc((void **)&d_res,sizeof(int));
h_res=(int*)malloc(sizeof(int));

testKernel<<<1,1>>>(graph,d_res);

gpuErrchk( cudaPeekAtLastError() );
gpuErrchk(cudaMemcpy(h_res,d_res,sizeof(int),cudaMemcpyDeviceToHost));

with the error checks taken from the canonical CUDA error-checking helper (the gpuErrchk/gpuAssert macro shown in the MCVE below).

When I use the testKernel as is shown, it works fine, but if I change the kernel to:

// Fails: graph.nodes is dereferenced on the device, but it was allocated
// with host malloc, so it is a host address — hence the illegal memory
// access (see the Solution below).
__global__ void testKernel(const Graph graph,int * d_res){
    d_res[0]=graph.nodes[0].nPos;

};

I get illegal memory access errors.

Is this because the unified memory does not handle this type of data correctly? Is there a way to make sure I can avoid writing all the explicit copies to GPU memory?


Full MCVE:

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime_api.h>
#include <cuda.h>
// Node: one graph vertex owning a variable-length array of positions.
typedef struct node{
    int* pos;    // array of nPos ints (host-malloc'd in main)
    int nPos;    // number of entries in pos
}Node;
// Graph: owns an array of Nodes; read-only for the kernels.
typedef struct Graph{
    Node * nodes;    // array of nNode Nodes (host-malloc'd in main)
    int nNode;       // number of nodes
}Graph;


// gpuErrchk: wrap any CUDA runtime call so failures report file and line.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Prints a human-readable description of a CUDA error and, unless abort
// is false, terminates the process with the error code as exit status.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    // Success: nothing to report.
    if (code == cudaSuccess) return;

    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) exit(code);
}

// Test kernel: copies one scalar out of the by-value Graph into d_res so the
// host can verify device-side reads. Reading graph.nNode works because the
// struct itself is copied at launch; the commented line faults because
// graph.nodes is a host-malloc'd pointer, invalid on the device.
__global__ void testKernel(const Graph graph, int * d_res){
    d_res[0] = graph.nNode;
    // d_res[0]=graph.nodes[0].nPos; // Not working: nodes is a host pointer

};



// MCVE driver: builds the graph with plain host malloc, then launches
// testKernel. The nested pointers (graph.nodes and nodes[i].pos) live in
// host memory only, which is why dereferencing them inside the kernel
// produces an illegal memory access.
int main(void){

    // fake data, this comes from another process
     Graph graph;
    // NOTE(review): host-only allocation — graph.nodes is not dereferenceable
    // on the device (the Solution swaps these mallocs for cudaMallocManaged).
    graph.nodes = (Node*)malloc(2*sizeof(Node));
    graph.nNode = 2;
    for (int i = 0; i < 2; i++){


    // They can have different sizes in the original code
    graph.nodes[i].pos = (int*)malloc(3 * sizeof(int));
    graph.nodes[i].pos[0] = 0;
    graph.nodes[i].pos[1] = 1;
    graph.nodes[i].pos[2] = 2;

    graph.nodes[i].nPos = 3;

}



printf("%d\n", graph.nNode); // Change to the kernel variable for comparison
int * d_res, *h_res;
// NOTE(review): cudaMalloc's return code is unchecked here — fine for a
// throwaway test, but wrap with gpuErrchk in real code.
cudaMalloc((void **)&d_res, sizeof(int));
h_res = (int*)malloc(sizeof(int));
testKernel << <1, 1 >> >(graph, d_res);
gpuErrchk(cudaPeekAtLastError());
// Blocking copy: also surfaces any asynchronous kernel execution error.
gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(int), cudaMemcpyDeviceToHost));

printf("%d", h_res[0]);
// NOTE(review): allocations are never freed — acceptable for an MCVE.
return 0;
}

Solution

  • Your code isn't using CUDA unified memory. UM is not "automatic" in any way. It requires specific programming steps to take advantage of it and it has specific system requirements.

    All of this is covered in the UM section of the programming guide.

    Is there a way to make sure I can avoid writing all the explicit copies to GPU memory?

    Proper use of UM should allow this. Here is a fully worked example. The only thing I have done is mechanically convert your malloc operations in host code to equivalent cudaMallocManaged operations.

    $ cat t1389.cu
    #include <algorithm>
    #include <stdio.h>
    
    typedef struct node{
        int* pos;
        int nPos;
    }Node;
    typedef struct Graph{
        Node * nodes;
        int nNode;
    }Graph;
    
    
    #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
    inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
    {
        if (code != cudaSuccess)
        {
            fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
            if (abort) exit(code);
        }
    }
    
    __global__ void testKernel(const Graph graph, int * d_res){
        d_res[0] = graph.nNode;
         d_res[0]=graph.nodes[0].nPos; // now works: nodes was allocated with cudaMallocManaged
    
    };
    
    
    
    int main(void){
    
        // fake data, this comes from another process
         Graph graph;
        cudaMallocManaged(&(graph.nodes), 2*sizeof(Node));
        graph.nNode = 2;
        for (int i = 0; i < 2; i++){
    
    
        // They can have different sizes in the original code
        cudaMallocManaged(&(graph.nodes[i].pos), 3 * sizeof(int));
        graph.nodes[i].pos[0] = 0;
        graph.nodes[i].pos[1] = 1;
        graph.nodes[i].pos[2] = 2;
    
        graph.nodes[i].nPos = 3;
    
    }
    
    
    
    printf("%d\n", graph.nNode); // Change to the kernel variable for comparison
    int * d_res, *h_res;
    cudaMalloc((void **)&d_res, sizeof(int));
    h_res = (int*)malloc(sizeof(int));
    testKernel << <1, 1 >> >(graph, d_res);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(int), cudaMemcpyDeviceToHost));
    
    printf("%d", h_res[0]);
    return 0;
    }
    $ nvcc t1389.cu -o t1389
    $ cuda-memcheck ./t1389
    ========= CUDA-MEMCHECK
    2
    3========= ERROR SUMMARY: 0 errors
    $
    

    UM has a number of system requirements that are documented. I'm not going to try to recite them all here. Primarily you need a cc3.0 or higher GPU. Your MCVE did not include any standard error checking, and I didn't try to add it. But if you still have problems with this code, be sure to use proper CUDA error checking and run it with cuda-memcheck.

    If your entire data structure, including embedded pointers, is allocated using ordinary host allocators, and you have no control over that, then you won't be able to use it directly in a UM regime, without doing some sort of involved copying. The exception here would be on an IBM Power9 system as mentioned in section K.1.6 of the above linked programming guide section.

    Before attempting to use a host allocator (e.g. malloc) with UM, you should first test the pageableMemoryAccessUsesHostPageTables property, as mentioned in that section.

    That property currently won't be set on any system except a properly configured IBM Power9 system. No x86 system currently has this property set/available.