// Host-side routine from the question: launches filterCircles and then
// kernelRenderCircles over a 2D grid sized to cover the image.
// (The closing brace of this function is not part of the excerpt.)
void CudaRenderer::render() {
// 16x16 threads per block.
dim3 blockDim(16, 16, 1);
// Ceil-divide so the grid still covers the image when its width/height are
// not multiples of the block dimensions.
dim3 gridDim((image->width + blockDim.x - 1) / blockDim.x,
(image->height + blockDim.y - 1) / blockDim.y);
// Two int pointers plus sz, a plain int holding the total number of blocks
// in the grid.
int *filteredCircles,
*lastIndices,
sz = gridDim.y * gridDim.x ;
// Device allocation with room for 2000 ints per block.
cudaMalloc((void **)&filteredCircles, sizeof(int) * sz * 2000);
// Device allocation with one int per block, zeroed before the first launch.
cudaMalloc((void **)&lastIndices, sizeof(int) * sz);
cudaMemset(lastIndices, 0, sizeof(int) * sz);
filterCircles<<<gridDim, blockDim>>>(filteredCircles, lastIndices);
// NOTE(review): lastIndices was allocated with cudaMalloc above, so
// lastIndices[i] dereferences a device pointer in host code; nothing before
// this loop waits for filterCircles to finish either.
for (int i = 0; i < 10; ++i)
printf("lastIndices[%d] = %d\n", i, lastIndices[i]);
kernelRenderCircles<<<gridDim, blockDim>>>(filteredCircles, lastIndices);
// NOTE(review): only filteredCircles is released in the visible code; no
// cudaFree(lastIndices) appears in this excerpt.
cudaFree(filteredCircles);
cudaDeviceSynchronize();
Before adding the print, the code compiles fine but produces an incorrect result. After adding the print to check lastIndices, it keeps giving me a segmentation fault. What did I do wrong here?
I think you are trying to print the values of lastIndices
after the filterCircles kernel has run.
For that, you need to understand two things:
A: GPU __global__
functions (kernels) run asynchronously with respect to the CPU.
B: you cannot dereference device (GPU) pointers on the host (CPU).
The fix is to change the code as follows:
Old:
cudaMalloc((void **)&lastIndices, sizeof(int) * sz);
cudaMemset(lastIndices, 0, sizeof(int) * sz);
filterCircles<<<gridDim, blockDim>>>(filteredCircles, lastIndices);
// Problem 1: the kernel launch returns immediately, so this CPU loop can run
// concurrently with the GPU code; lastIndices may not hold the filterCircles
// results yet.
for (int i = 0; i < 10; ++i) {
// Problem 2: lastIndices points to GPU memory, which the CPU cannot dereference.
printf("lastIndices[%d] = %d\n", i, lastIndices[i]);
}
New:
cudaMalloc((void **)&lastIndices, sizeof(int) * sz);
cudaMemset(lastIndices, 0, sizeof(int) * sz);
filterCircles<<<gridDim, blockDim>>>(filteredCircles, lastIndices);
// Kernel launches are asynchronous: wait for filterCircles to finish before
// inspecting its output.
cudaDeviceSynchronize();
// Allocate a host-side buffer; device memory cannot be read directly by the CPU.
int* hostLastIndices = (int*)malloc(sizeof(int) * sz);
if (hostLastIndices != NULL) {
    // Copy the per-block results from device memory into the host buffer.
    // Note the enumerator is spelled cudaMemcpyDeviceToHost.
    cudaError_t copyErr = cudaMemcpy(hostLastIndices, lastIndices,
                                     sizeof(int) * sz, cudaMemcpyDeviceToHost);
    if (copyErr == cudaSuccess) {
        for (int i = 0; i < sz; ++i) {
            printf("lastIndices[%d] = %d\n", i, hostLastIndices[i]);
        }
    } else {
        // Report the failure instead of printing uninitialized host memory.
        fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(copyErr));
    }
    // Release the temporary host buffer once the values have been printed.
    free(hostLastIndices);
}