Currently I have one pixel buffer and I process the data in it with a single kernel call:
dim3 threadsPerBlock(32, 32)
dim3 blocks(screenWidth / threadsPerBlock.x, screenHeight / threadsPerBlock.y);
kernel<<<blocks, threadsPerBlock>>>();
The pixel buffer contains all the pixels in a window with dimensions screenWidth x screenHeight.
My idea is to divide the window in 2 or 4 parts and to process the pixel data simultaneously.
Can this be done, and if it can - how ?
I've read little about streams but from what I understood two streams cannot work on a single piece of data (e.g. my pixelBuffer), or am I wrong ?
Edit: My graphics card is with compute capability 3.0
Edit 2: I use SDL to do the drawing and I have a single GPU, and I use user defined data array:
main.cu
Color vfb_linear[VFB_MAX_SIZE * VFB_MAX_SIZE]; // array on the Host
Color vfb[VFB_MAX_SIZE][VFB_MAX_SIZE] // 2D array used for SDL
extern "C" void callKernels(Color* dev_vfb);
int main()
{
Color* dev_vfb; // pixel array used on the GPU
// allocate memory for dev_vfb on the GPU
cudaMalloc((void**)&dev_vfb, sizeof(Color) * RES_X * RES_Y);
// memcpy HostToDevice
cudaMemcpy(dev_vfb, vfb_linear, sizeof(Color) * RES_X * RES_Y, cudaMemcpyHostToDevice);
callKernels(dev_vfb); // wrapper function that calls the kernels
// memcpy DeviceToHost
cudaMemcpy(vfb_linear, dev_vfb, sizeof(Color) * RES_X * RES_Y, cudaMemcpyDeviceToHost);
// convert vfb_linear into 2D array so it can be handled by SDL
convertDeviceToHostBuffer();
display(vfb); // render pixels on screen with SDL
}
cudaRenderer.cu
__global__ void kernel(Color* dev_vfb)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
if (offset < RES_X * RES_Y)
{
dev_vfb[offset] = getColorForPixel();
}
}
extern "C" callKernels(Color* dev_vfb)
{
dim3 threadsPerBlock(32, 32)
dim3 blocks(screenWidth / threadsPerBlock.x, screenHeight / threadsPerBlock.y);
kernel<<<blocks, threadsPerBlock>>>(dev_vfb);
}
contents of display(vfb):
void display(Color vfb[VFB_MAX_SIZE][VFB_MAX_SIZE])
{
// screen is pointer to SDL_Surface
int rs = screen->format->Rshift;
int gs = screen->format->Gshift;
int bs = screen->format->Bshift;
for (int y = 0; y < screen->h; ++y)
{
Uint32* row = (Uint32*) ((Uint8*) screen->pixels + y * screen->pitch);
for (int x = 0; x < screen->w; ++x)
row[x] = vfb[y][x].toRGB32(rs, gs, bs);
}
SDL_Flip(screen);
}
This is a simple example of what I am doing in my project. It is a raytracer and maybe SDL is the worst choice for interop with CUDA but I don't know if I will have time to change it.
There's nothing that prevents two streams from working on the same piece of data in global memory of one device.
As I said in the comments, I don't think this is a sensible approach to make things run faster. However, the modifications to your code would be something like this (coded in browser, not tested):
__global__ void kernel(Color* dev_vfb, int slices)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
if (offset < (RES_X * RES_Y/slices)
{
dev_vfb[offset] = getColorForPixel();
}
}
extern "C" callKernels(Color* dev_vfb)
{
int num_slices=2;
cudaStream_t streams[num_slices];
for (int i = 0; i < num_slices; i++)
cudaStreamCreate(&(streams[i]));
dim3 threadsPerBlock(32, 32)
dim3 blocks(screenWidth / threadsPerBlock.x, screenHeight / (num_slices*threadsPerBlock.y));
for (int i = 0; i < num_slices; i++){
int off = i * (screenWidth*screenHeight/num_slices);
kernel<<<blocks, threadsPerBlock, 0, streams[i]>>>(dev_vfb+off, num_slices); }
}