[SOLVED] CUDA allocate and initialize an array in global memory but keeps getting segmentation fault

CUDA allocate and initialize an array in global memory but keeps getting segmentation fault

// Launches filterCircles and then kernelRenderCircles over a 2D grid sized to
// cover the image, passing both kernels two device buffers allocated here.
// NOTE: this is the snippet that exhibits the reported segmentation fault; see
// the comments on the print loop below.
void CudaRenderer::render() {

    // 16x16 threads per block; ceil-division so the grid covers the whole image
    // even when width/height are not multiples of 16.
    dim3 blockDim(16, 16, 1);
    dim3 gridDim((image->width + blockDim.x - 1) / blockDim.x,
                  (image->height + blockDim.y - 1) / blockDim.y);

    // sz is the total number of blocks in the grid.
    int *filteredCircles,
        *lastIndices,
        sz = gridDim.y * gridDim.x ;
    // Device allocations: 2000 ints per block for filteredCircles and one int
    // per block for lastIndices (presumably a per-block list and its length --
    // confirm against the kernels). Return codes are not checked.
    cudaMalloc((void **)&filteredCircles, sizeof(int) * sz * 2000);
    cudaMalloc((void **)&lastIndices, sizeof(int) * sz);
    cudaMemset(lastIndices, 0, sizeof(int) * sz);
    // Kernel launches are asynchronous: control returns to the host immediately.
    filterCircles<<<gridDim, blockDim>>>(filteredCircles, lastIndices);
        // BUG: lastIndices was returned by cudaMalloc, so it is a device pointer.
        // Dereferencing it in host code is invalid and is the source of the
        // segmentation fault. There is also no synchronization before this loop,
        // so the kernel may not have finished yet.
        for (int i = 0; i < 10; ++i)
                printf("lastIndices[%d] = %d\n", i, lastIndices[i]);
    kernelRenderCircles<<<gridDim, blockDim>>>(filteredCircles, lastIndices);
    // NOTE(review): lastIndices is never freed in the visible code.
    cudaFree(filteredCircles);
    cudaDeviceSynchronize();

Before I added the print loop, the code compiled fine but produced an incorrect result. After adding the print loop to inspect lastIndices, it crashes with a segmentation fault every time. What did I do wrong here?

Solution

I think what you are trying to do is print the values of lastIndices after the filterCircles kernel has run.

For that, you need to understand that

A: GPU __global__ functions (kernels) run asynchronously with respect to the CPU: the launch returns to the host before the kernel has finished.
B: you cannot dereference device (GPU) pointers on the host (CPU); data in memory returned by cudaMalloc must first be copied into host memory with cudaMemcpy.

The trick is to change the code as follows:

Old:

    cudaMalloc((void **)&lastIndices, sizeof(int) * sz);
    cudaMemset(lastIndices, 0, sizeof(int) * sz);
    filterCircles<<<gridDim, blockDim>>>(filteredCircles, lastIndices);
    // Problem 1: the kernel launch returns immediately, so this CPU loop runs
    // concurrently with the GPU code; lastIndices may not yet hold the
    // results written by filterCircles.
    for (int i = 0; i < 10; ++i) {
        // Problem 2: lastIndices is a device pointer (it came from cudaMalloc).
        // The CPU cannot dereference it; doing so is what causes the segfault.
        printf("lastIndices[%d] = %d\n", i, lastIndices[i]);
    }

New:

    cudaMalloc((void **)&lastIndices, sizeof(int) * sz);
    cudaMemset(lastIndices, 0, sizeof(int) * sz);
    filterCircles<<<gridDim, blockDim>>>(filteredCircles, lastIndices);
    // Kernel launches return immediately; wait for filterCircles to finish.
    // (The blocking cudaMemcpy below would also wait for it, but the explicit
    // synchronization makes the ordering obvious.)
    cudaDeviceSynchronize();
    // Allocate a host-side buffer to receive a copy of the device data.
    int* hostLastIndices = (int*)malloc(sizeof(int) * sz);
    if (hostLastIndices != NULL) {
        // Copy device -> host. The device pointer itself is never dereferenced
        // on the host; only the host copy is read below.
        cudaError_t copyStatus = cudaMemcpy(hostLastIndices, lastIndices,
                                            sizeof(int) * sz,
                                            cudaMemcpyDeviceToHost);
        if (copyStatus == cudaSuccess) {
            for (int i = 0; i < sz; ++i) {
                printf("lastIndices[%d] = %d\n", i, hostLastIndices[i]);
            }
        } else {
            // Errors raised inside the kernel also surface here, so report
            // them instead of printing uninitialized host memory.
            printf("cudaMemcpy failed: %s\n", cudaGetErrorString(copyStatus));
        }
        // Release the temporary host buffer once it has been printed.
        free(hostLastIndices);
    }