cuda

CUDA: allocating and initializing an array in global memory, but I keep getting a segmentation fault


void CudaRenderer::render() {

    dim3 blockDim(16, 16, 1);
    dim3 gridDim((image->width + blockDim.x - 1) / blockDim.x,
                  (image->height + blockDim.y - 1) / blockDim.y);

    int *filteredCircles,
        *lastIndices,
        sz = gridDim.y * gridDim.x ;
    cudaMalloc((void **)&filteredCircles, sizeof(int) * sz * 2000);
    cudaMalloc((void **)&lastIndices, sizeof(int) * sz);
    cudaMemset(lastIndices, 0, sizeof(int) * sz);
    filterCircles<<<gridDim, blockDim>>>(filteredCircles, lastIndices);
        for (int i = 0; i < 10; ++i)
                printf("lastIndices[%d] = %d\n", i, lastIndices[i]);
    kernelRenderCircles<<<gridDim, blockDim>>>(filteredCircles, lastIndices);
    cudaFree(filteredCircles);
    cudaDeviceSynchronize();

Before adding the print, the code compiled fine but produced an incorrect result. After adding the print to check lastIndices, it keeps giving me a segmentation fault. What did I do wrong here?


Solution

  • What I think you are trying to do is print the values of lastIndices after filterCircles has run.

    For that, you need to understand that

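    A: GPU __global__ functions (kernels) run asynchronously with respect to the CPU: the launch returns immediately, before the kernel has finished.
    B: You cannot dereference device (GPU) pointers obtained from cudaMalloc on the host (CPU); doing so is what causes the segmentation fault.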

    The trick is to change the code as follows:

    Old:

        cudaMalloc((void **)&lastIndices, sizeof(int) * sz);
        cudaMemset(lastIndices, 0, sizeof(int) * sz);
        filterCircles<<<gridDim, blockDim>>>(filteredCircles, lastIndices);
        //Problem 1: the CPU for loop will run concurrently with the GPU code.
        //meaning lastIndices may not have the filterCircles data yet.
        for (int i = 0; i < 10; ++i) {
            //problem 2: lastIndices is in GPU memory that the CPU cannot see.
            printf("lastIndices[%d] = %d\n", i, lastIndices[i]);
        }
    

    New:

        cudaMalloc((void **)&lastIndices, sizeof(int) * sz);
        cudaMemset(lastIndices, 0, sizeof(int) * sz);
        filterCircles<<<gridDim, blockDim>>>(filteredCircles, lastIndices);
        cudaDeviceSynchronize(); //wait for filterCircles to finish
        //allocate space on the host to hold the cpu copy of the data
        int* hostLastIndices = (int*)malloc(sizeof(int) * sz);
        if (hostLastIndices != NULL) {
            //copy data from device to host; note the enumerator is spelled
            //cudaMemcpyDeviceToHost. cudaMemcpy blocks until the copy is done
            //and also reports any error left behind by the preceding kernel.
            cudaError_t copyStatus = cudaMemcpy(hostLastIndices, lastIndices,
                                                sizeof(int) * sz, cudaMemcpyDeviceToHost);
            if (copyStatus != cudaSuccess) {
                fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(copyStatus));
            } else {
                //only the host copy may be dereferenced on the CPU
                for (int i = 0; i < sz; ++i) {
                    printf("lastIndices[%d] = %d\n", i, hostLastIndices[i]);
                }
            }
            free(hostLastIndices); //release the temporary host buffer
        }