cuda, tesla

Overlap kernel execution on multiple streams


We have a fairly large single kernel (see below) that we fire off with a grid and block of 1,1:

kernel<<<1,1>>>

And then it dynamically fires off lots of smaller kernels. In general, the data flows from kernel to kernel, with the input starting at the first kernel and flowing through to the end.

But we have identified a potential opportunity to overlap two streams of data, each running this identical kernel.

Question: Do we have to abandon dynamic kernel execution and go to a host-based approach to get overlap of execution for two mega-kernels? Or is the scheduler in the card smart enough to interleave execution between the two mega-kernels and process each one as a separate scheduled item?

We are talking about a Tesla K80. Linux host.

(Yes, we will gain some overlap from cudaMemcpyAsync() overlapping with kernel execution, but we would like to see some execution overlap as well.)

#include <cuda.h>
#include <cuda_runtime.h>

#include "coss_types.h"
#include "image.h"
#include "centroid.h"
#include "gpu.h"

#define GPU_TILE_WIDTH  16
#define GPU_TILE_HEIGHT 16
#define GPU_TILE_WBIG   32
#define GPU_TILE_HBIG   32
#define K_IMG_MAX 1024

__constant__ unsigned short* pFrameStack[GPU_CHX];
__constant__ unsigned short* pBackground[GPU_CHX];
__constant__ short*          pCleanground[GPU_CHX];
__constant__ unsigned char*  pMask[GPU_CHX];
__constant__ float*          pForeground[GPU_CHX];
__constant__ float*          pLowground[GPU_CHX];
__constant__ float*          pLowgroundRow[GPU_CHX];
__constant__ float*          pHighground[GPU_CHX];
__constant__ float*          pHighgroundRow[GPU_CHX];
__constant__ float*          pMins[GPU_CHX];
__constant__ float*          pMaxs[GPU_CHX];
__constant__ int             gSlot;
__constant__ int*            pPercentile[GPU_CHX];
__constant__ int*            pLabels1[GPU_CHX];
__constant__ int*            pLabels2[GPU_CHX];
__constant__ int*            pRawLabels[GPU_CHX];
__constant__ int*            pLabels[GPU_CHX];
__constant__ ImgInfoBlock_t* pInfo[GPU_CHX];
__constant__ unsigned short* pSums[GPU_CHX];
__constant__ unsigned short* pBlockSums[GPU_CHX];
__constant__ ImgCentroid_t*  pCenters[GPU_CHX];
__constant__ float           threshold_sigma = 9.0f;


/* INCLUDED GENERATED CUDA CODE BELOW */
#include "cuda.cu"
/* INCLUDED GENERATED CUDA CODE ABOVE */

extern "C" __device__  void Background(int ch)
{
    dim3 block;
    dim3 grid;

    /* Background Estimation */
    block = dim3(128);
    grid  = dim3((IMG_PIXELS)/256); /* Only half screens at a time */
    gMedian<<<grid,block>>>(
            pFrameStack[ch],GPU_STACKSIZE,IMG_PIXELS,
            pBackground[ch],IMG_HEIGHT,IMG_WIDTH,gSlot);
    cudaDeviceSynchronize();


    /* Background Removal */
    block = dim3(128);
    grid  = dim3((IMG_PIXELS)/128);
    gScrub<<<grid,block>>>(
            pFrameStack[ch],GPU_STACKSIZE,IMG_PIXELS,
            pBackground[ch],IMG_HEIGHT,IMG_WIDTH,
            pCleanground[ch],IMG_HEIGHT,IMG_WIDTH,gSlot);
    cudaDeviceSynchronize();

}

extern "C" __device__  void Convolution(int ch)
{
    dim3 block;
    dim3 grid;
    dim3 block_b;
    dim3 grid_b;

    /* Convolve Rows */
    block = dim3(GPU_TILE_WIDTH,GPU_TILE_HEIGHT);
    grid  = dim3(IMG_WIDTH/GPU_TILE_WIDTH,IMG_HEIGHT/GPU_TILE_HEIGHT);
    gConvolveRow<<<grid,block>>>(
            pCleanground[ch],   IMG_HEIGHT,IMG_WIDTH,
            pLowgroundRow[ch],  IMG_HEIGHT,IMG_WIDTH);

    block_b = dim3(GPU_TILE_WBIG,GPU_TILE_HBIG);
    grid_b  = dim3(IMG_WIDTH/GPU_TILE_WBIG,IMG_HEIGHT/GPU_TILE_HBIG);
    gConvolveBigRow<<<grid_b,block_b>>>(
            pCleanground[ch],   IMG_HEIGHT,IMG_WIDTH,
            pHighgroundRow[ch], IMG_HEIGHT,IMG_WIDTH);

    /* Convolve Cols */
    cudaDeviceSynchronize();
    gConvolveCol<<<grid,block>>>(
            pLowgroundRow[ch],  IMG_HEIGHT,IMG_WIDTH,
            pLowground[ch],     IMG_HEIGHT,IMG_WIDTH);

    gConvolveBigCol<<<grid_b,block_b>>>(
            pHighgroundRow[ch], IMG_HEIGHT,IMG_WIDTH,
            pHighground[ch],    IMG_HEIGHT,IMG_WIDTH);

    /* Band pass */
    cudaDeviceSynchronize();

    block = dim3(256,4);
    grid  = dim3(IMG_WIDTH / 256, IMG_HEIGHT / 4);
    gBpass<<<grid,block>>>(
            pLowground[ch],     IMG_HEIGHT,IMG_WIDTH,
            pHighground[ch],    IMG_HEIGHT,IMG_WIDTH,
            pForeground[ch],    IMG_HEIGHT,IMG_WIDTH);

    cudaDeviceSynchronize();

}

extern "C" __device__  void Threshold(int ch)
{
    dim3 block;
    dim3 grid;

    /* Set the calibration sigma in Info Bloc */
    pInfo[ch]->sigma = threshold_sigma;

    /* Min Max kernels */
    block = dim3(512, 2);
    grid = dim3(IMG_WIDTH / 512, IMG_HEIGHT / 2);
    gMinMax<<<grid,block>>>(
            pForeground[ch],IMG_HEIGHT,IMG_WIDTH,
            pMins[ch], 5 * K_IMG_MAX,
            pMaxs[ch], 5 * K_IMG_MAX);

    cudaDeviceSynchronize();
    block = dim3(K_IMG_MAX);
    grid = dim3(1);
    gMinMaxMinMax<<<grid,K_IMG_MAX>>>(
            pMins[ch], 5 * K_IMG_MAX,
            pMaxs[ch], 5 * K_IMG_MAX,
            (struct PipeInfoBlock*)pInfo[ch],1);

    /* Histogram */
    cudaDeviceSynchronize();
    block = dim3(GPU_TILE_WBIG,GPU_TILE_HBIG);
    grid  = dim3(IMG_WIDTH/GPU_TILE_WBIG,IMG_HEIGHT/GPU_TILE_HBIG);
    gHistogram<<<grid,block>>>(
            pForeground[ch],IMG_HEIGHT,IMG_WIDTH,
            pPercentile[ch],K_IMG_MAX,
            (struct PipeInfoBlock*)pInfo[ch],1);
    cudaDeviceSynchronize();

    block = dim3(K_IMG_MAX);
    grid  = dim3(1);
    gSumHistogram<<<grid,block>>>(pPercentile[ch],K_IMG_MAX);
    cudaDeviceSynchronize();

    gIQR<<<grid,block>>>(pPercentile[ch],K_IMG_MAX,(struct PipeInfoBlock*)pInfo[ch],1);
    cudaDeviceSynchronize();

    block = dim3(256,4);
    grid  = dim3(IMG_WIDTH / 256, IMG_HEIGHT / 4);
    gThreshold<<<grid,block>>>(
            pForeground[ch],IMG_HEIGHT,IMG_WIDTH,
            pMask[ch],IMG_HEIGHT,IMG_WIDTH,
            (struct PipeInfoBlock*)pInfo[ch],1);

    cudaDeviceSynchronize();
}

extern "C" __device__  void Gluing(int ch)
{
    dim3 block;
    dim3 grid;

    block = dim3(24, 24);
    grid = dim3(IMG_WIDTH / 16, IMG_HEIGHT / 16);

    gGlue<<<grid, block>>>(
            pMask[ch],IMG_HEIGHT,IMG_WIDTH,
            pMask[ch],IMG_HEIGHT,IMG_WIDTH);
    cudaDeviceSynchronize();
}

extern "C" __device__  void Labeling(int ch)
{
    dim3 block;
    dim3 grid;

    /* CCL */
    //block = dim3(1, 128);
    //grid = dim3(1, IMG_HEIGHT / 128);
    block = dim3(256,1);
    grid = dim3(IMG_WIDTH/256,IMG_HEIGHT);

    gCCL0<<<grid, block>>>(
            pMask[ch],IMG_HEIGHT,IMG_WIDTH,
            pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH);
    cudaDeviceSynchronize();

    block = dim3(24, 24);
    grid  = dim3(IMG_WIDTH / 16, IMG_HEIGHT / 16);

    gCCLMerge<<<grid, block>>>(
            pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH,
            pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH);
    cudaDeviceSynchronize();

    gCCLMerge<<<grid, block>>>(
            pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH,
            pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH);
    cudaDeviceSynchronize();

    gCCLMerge<<<grid, block>>>(
            pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH,
            pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH);
    cudaDeviceSynchronize();
}

extern "C" __device__  void Relabeling(int ch)
{
    dim3 block;
    dim3 grid;

    /* Relabel */
    block = dim3(160, 1);
    grid  = dim3(IMG_WIDTH / 160, IMG_HEIGHT / 1);
    gScan<<<grid, block>>>(
            pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH,
            pSums[ch],IMG_PIXELS);
    cudaDeviceSynchronize();

    grid = dim3(IMG_PIXELS / K_IMG_MAX);
    gSum<<<grid, K_IMG_MAX>>>(pSums[ch],IMG_PIXELS, pBlockSums[ch], 5*K_IMG_MAX);
    cudaDeviceSynchronize();

    grid = dim3(1);
    gSumBlocks<<<grid, K_IMG_MAX>>>(pBlockSums[ch], 5*K_IMG_MAX, (struct PipeInfoBlock*)pInfo[ch],1);
    cudaDeviceSynchronize();

    grid = dim3(IMG_PIXELS / K_IMG_MAX);
    gFixSums<<<grid, K_IMG_MAX>>>(pSums[ch],IMG_PIXELS, pBlockSums[ch], 5*K_IMG_MAX);
    cudaDeviceSynchronize();

    block = dim3(160, 1);
    grid  = dim3(IMG_WIDTH / 160, IMG_HEIGHT / 1);
    gRelabeler<<<grid, block>>>(
            pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH,
            pSums[ch],IMG_PIXELS,
            pLabels[ch], IMG_HEIGHT,IMG_WIDTH);
    cudaDeviceSynchronize();

}

extern "C" __device__  void Centroiding(int ch)
{
    dim3 block;
    dim3 grid;
    int  starcount = IMG_STARS_MAX;

    if (pInfo[ch]->starCount > 0 && pInfo[ch]->starCount < IMG_STARS_MAX)
    {
        starcount = pInfo[ch]->starCount;

        /* Centroid */
        block = dim3(32, 32);
        grid  = dim3(IMG_WIDTH / 32, IMG_HEIGHT / 32);

        gCentroid<<<grid, block>>>(
                pLabels[ch], IMG_HEIGHT,IMG_WIDTH,
                pForeground[ch],IMG_HEIGHT,IMG_WIDTH,
                (PipeCentroid *)pCenters[ch],starcount);
        cudaDeviceSynchronize();

        block = dim3(starcount);
        gCentroidFinal<<<1, block>>>((PipeCentroid *)pCenters[ch],starcount);
        cudaDeviceSynchronize();

    }
    else
    {
        pInfo[ch]->starCount = 0;
    }
}

extern "C" __global__  void gPipeline(int gpuId)
{   int ch;

    for(ch=0; ch < GPU_CHX; ch++)
    {
        Background(ch);
        Convolution(ch);
        Threshold(ch);
        Gluing(ch);
        Labeling(ch);
        Relabeling(ch);
        Centroiding(ch);
    }
}

extern "C" {

static void ImgKernel_ClearBuffers(int32_t gpu, int32_t ch)
{
    /* Clear Work Buffers */
    cudaMemset(gInfo[gpu][ch],0,(int)sizeof(ImgInfoBlock_t));
    cudaMemset(gCenters[gpu][ch],0,(int)sizeof(ImgCentroid_t)*IMG_STARS_MAX);
    cudaMemset(gPercentile[gpu][ch],0,(int)sizeof(int32_t)*K_IMG_MAX);
    cudaMemset(gLabels1[gpu][ch],0,(int)sizeof(int32_t) *IMG_PIXELS);
    cudaMemset(gLabels2[gpu][ch],0,(int)sizeof(int32_t) *IMG_PIXELS);
    cudaMemset(gRawLabels[gpu][ch],0,(int)sizeof(int32_t) *IMG_PIXELS);
    cudaMemset(gSums[gpu][ch],0,(int)IMG_BYTES);
    cudaMemset(gBlockSums[gpu][ch],0,(int)sizeof(uint16_t)*5*K_IMG_MAX);
}

void ImgKernel_Pipeline(int gpu)
{
    cudaSetDevice(gpu);

    cudaDeviceSynchronize();

    /* Start a new run by clearing the buffers */
    ImgKernel_ClearBuffers(gpu,GPU_CH0);
    ImgKernel_ClearBuffers(gpu,GPU_CH1);

    /* Update Constants */
    cudaMemcpyToSymbol(gSlot,(void*)&slot,sizeof(slot));
    cudaMemcpyToSymbol(threshold_sigma,(void*)&sigmaThreshold,sizeof(sigmaThreshold));

    /* Start the next pipeline kernel */
    gPipeline<<<1,1>>>(gpu);

}

#define LFILTER_LEN 15
static float lFilter[LFILTER_LEN] = { .0009f, .01f,
   .02f, .05f, .08f, .10f, .1325f, .1411f, .1325f, .10f, .08f, .05f, .02f, .01f, .0009f };


#define HFILTER_LEN 31
static float hFilter[HFILTER_LEN] = {0.0002f, 0.0006f,
        0.0025f, 0.0037f, 0.0053f, 0.0074f, 0.0099f, 0.0130f, 0.0164f,
        0.0201f, 0.0239f, 0.0275f, 0.0306f, 0.0331f, 0.0347f, 0.0353f,
        0.0347f, 0.0331f, 0.0306f, 0.0275f, 0.0239f, 0.0201f, 0.0164f,
        0.0130f, 0.0099f, 0.0074f, 0.0053f, 0.0037f, 0.0025f, 0.0006f, 0.0002f};

static float32_t kernel[LFILTER_LEN];
static float32_t kernelBig[HFILTER_LEN];

static inline float32_t ImgKernel_FilterSum(float* arr, int32_t len)
{
    int32_t i;
    float32_t sum = 0.0f;
    for (i=0;i<len;i++) sum += arr[i];

    return sum;
}

void ImgKernel_Setup(int gpu)
{
    int32_t i,ch;
    float32_t sum = 0;

    sum = ImgKernel_FilterSum(lFilter,LFILTER_LEN);
    for (i = 0; i < LFILTER_LEN; i++) kernel[i] = lFilter[i] / sum;

    sum = ImgKernel_FilterSum(hFilter,HFILTER_LEN);
    for (i = 0; i < HFILTER_LEN; i++) kernelBig[i] = hFilter[i] / sum;


    /* One time copy of locations into GPU constant memory */
    cudaMemcpyToSymbol(gkernel,    (void*)&kernel,         sizeof(float32_t)*LFILTER_LEN);
    cudaMemcpyToSymbol(gkernelBig, (void*)&kernelBig,      sizeof(float32_t)*HFILTER_LEN);
    cudaMemcpyToSymbol(pFrameStack,(void*)&gFrameStack[gpu][0],    sizeof(uint16_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pBackground,(void*)&gBackground[gpu][0],    sizeof(uint16_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pCleanground,(void*)&gCleanground[gpu][0],  sizeof(int16_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pLowground, (void*)&gLowground[gpu][0],     sizeof(float32_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pLowgroundRow,(void*)&gLowgroundRow[gpu][0],sizeof(float32_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pHighground,(void*)&gHighground[gpu][0],    sizeof(float32_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pHighgroundRow,(void*)&gHighgroundRow[gpu][0],sizeof(float32_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pForeground,(void*)&gForeground[gpu][0],   sizeof(float32_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pMask,      (void*)&gMask[gpu][0],         sizeof(uint8_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pPercentile,(void*)&gPercentile[gpu][0],   sizeof(int32_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pMins,      (void*)&gMins[gpu][0],         sizeof(float32_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pMaxs,      (void*)&gMaxs[gpu][0],         sizeof(float32_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pLabels1,   (void*)&gLabels1[gpu][0],      sizeof(int32_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pLabels2,   (void*)&gLabels2[gpu][0],      sizeof(int32_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pRawLabels, (void*)&gRawLabels[gpu][0],    sizeof(int32_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pLabels,    (void*)&gLabels[gpu][0],       sizeof(int32_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pInfo,      (void*)&gInfo[gpu][0],         sizeof(ImgInfoBlock_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pSums,      (void*)&gSums[gpu][0],         sizeof(uint16_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pBlockSums, (void*)&gBlockSums[gpu][0],    sizeof(uint16_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pCenters,   (void*)&gCenters[gpu][0],      sizeof(ImgCentroid_t*)*GPU_CHX);

    for (ch = 0; ch < GPU_CHX; ch++)
    {
        /* Clear the working buffers */
        ImgKernel_ClearBuffers(gpu,ch);
    }
}

}

Solution

  • It should be possible for the parent and child kernels of two dynamic parallelism kernels launched into separate host streams to all be co-resident (i.e. executing concurrently).

    How to get kernels to run concurrently is a common question. Even once all of the requirements have been met, whether or not you actually witness concurrent kernel execution is a matter of the resources consumed by each kernel: threads per block, total threadblocks, registers, and shared memory are a few examples of the kinds of resources that, if consumed by one kernel, may prevent another kernel from executing concurrently, even though all the requirements have been met.
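
    (Not part of the original answer.) As a quick sanity check before chasing overlap, you can query whether the device supports concurrent kernels at all. A minimal sketch, assuming device 0 (the K80 exposes two devices, one per GK210):

    /* concurrency_query.cu -- hedged sketch, not from the original answer */
    #include <stdio.h>
    #include <cuda_runtime.h>

    int main(void)
    {
        cudaDeviceProp prop;
        int dev = 0;  /* device index is an assumption; adjust as needed */
        cudaGetDeviceProperties(&prop, dev);

        /* concurrentKernels == 1 means the device can run kernels from
           different streams at the same time (expected on the K80). */
        printf("concurrentKernels: %d\n", prop.concurrentKernels);

        /* asyncEngineCount shows how many copy engines are available for
           overlapping cudaMemcpyAsync with kernel execution. */
        printf("asyncEngineCount:  %d\n", prop.asyncEngineCount);
        return 0;
    }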

    The machine does not have infinite capacity. Once the capacity of the machine has been consumed, exposing additional parallelism (e.g. by attempting to launch independent kernels concurrently) might not yield any improvement.
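
    One way to quantify "capacity" (not part of the original answer) is the occupancy API available since CUDA 6.5. A minimal, self-contained sketch; the kernel is a stand-in for one of the pipeline kernels (e.g. gMedian) and the block size of 128 mirrors several launches in the question:

    /* occupancy_check.cu -- hedged sketch, not from the original answer */
    #include <stdio.h>
    #include <cuda_runtime.h>

    /* Stand-in kernel; only its resource footprint matters for the query. */
    __global__ void work(float *p)
    {
        if (p) p[threadIdx.x] = 0.0f;
    }

    int main(void)
    {
        int blocksPerSM = 0;
        const int blockSize = 128;  /* block size used by several launches in the question */
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, work, blockSize, 0);

        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, 0);

        /* If one launch already uses >= blocksPerSM * multiProcessorCount blocks,
           it can fill the device by itself and a second kernel will mostly wait. */
        printf("resident blocks/SM: %d, SMs: %d, whole-device capacity: %d blocks\n",
               blocksPerSM, prop.multiProcessorCount,
               blocksPerSM * prop.multiProcessorCount);
        return 0;
    }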

    GPU scheduling behavior may affect this as well, as pointed out by Greg. Depending on the specific GPU, the CUDA version, and perhaps other factors, two kernels with large numbers of threadblocks may not execute "concurrently" simply because the threadblocks of one kernel may all be scheduled before any of the threadblocks of the other kernel are scheduled. In my opinion, this behavior is simply another manifestation of a resource issue. (Note that the scheduling of threadblocks of individual kernels may also be affected by stream priorities.)
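
    If block scheduling order is the concern, stream priorities can bias it. A hedged sketch, not from the original answer; the priority-range query and prioritized stream creation are standard runtime API calls on cc3.5+ devices:

    /* stream_priority.cu -- hedged sketch, not from the original answer */
    #include <cuda_runtime.h>

    int main(void)
    {
        /* Lower numbers are higher priority; the range is device-dependent. */
        int leastPri, greatestPri;
        cudaDeviceGetStreamPriorityRange(&leastPri, &greatestPri);

        cudaStream_t stHigh, stLow;
        cudaStreamCreateWithPriority(&stHigh, cudaStreamNonBlocking, greatestPri);
        cudaStreamCreateWithPriority(&stLow,  cudaStreamNonBlocking, leastPri);

        /* ... launch kernels into stHigh / stLow here; threadblocks from the
           high-priority stream are preferred when the scheduler picks the
           next block to issue ... */

        cudaStreamDestroy(stHigh);
        cudaStreamDestroy(stLow);
        return 0;
    }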

    However, if we are careful to constrain the resource usage, it's possible for the parent and child kernels of two dynamic parallelism kernels to be co-resident, i.e. to execute concurrently. Here's a worked example (CUDA 7, Fedora 20, GeForce GT 640 cc3.5 GPU):

    $ cat t815.cu
    #include <stdio.h>
    
    #define DELAY_VAL 5000000000ULL
    
    __global__ void child(){
    
      unsigned long long start = clock64();
      while (clock64()< start+DELAY_VAL);
    }
    
    __global__ void parent(){
    
      child<<<1,1>>>();
    }
    
    int main(int argc, char* argv[]){
    
      cudaStream_t st1, st2;
      cudaStreamCreate(&st1);
      cudaStreamCreate(&st2);
      parent<<<1,1,0,st1>>>();
      if (argc > 1){
        printf("running double kernel\n");
        parent<<<1,1,0,st2>>>();
        }
      cudaDeviceSynchronize();
    }
    
    $ nvcc -arch=sm_35 -rdc=true -lcudadevrt t815.cu -o t815
    $ time ./t815
    3.65user 1.88system 0:05.65elapsed 97%CPU (0avgtext+0avgdata 82192maxresident)k
    0inputs+0outputs (0major+2812minor)pagefaults 0swaps
    $ time ./t815 double
    running double kernel
    3.68user 1.83system 0:05.64elapsed 97%CPU (0avgtext+0avgdata 82200maxresident)k
    0inputs+0outputs (0major+2814minor)pagefaults 0swaps
    $ time cuda-memcheck ./t815
    ========= CUDA-MEMCHECK
    ========= ERROR SUMMARY: 0 errors
    3.16user 2.25system 0:05.68elapsed 95%CPU (0avgtext+0avgdata 87040maxresident)k
    0inputs+0outputs (0major+4573minor)pagefaults 0swaps
    $ time cuda-memcheck ./t815 double
    ========= CUDA-MEMCHECK
    running double kernel
    ========= ERROR SUMMARY: 0 errors
    7.27user 3.04system 0:10.46elapsed 98%CPU (0avgtext+0avgdata 87116maxresident)k
    0inputs+0outputs (0major+4594minor)pagefaults 0swaps
    $
    

    In this case we see that if I don't use cuda-memcheck, then regardless of whether I run one or two copies of the (parent) kernel in separate host streams, the execution time is approximately the same (~5.6s). Since the execution time is the same, the inescapable conclusion is that these kernels are executing concurrently (both parents, and both children). This isn't too surprising, since these kernels have tiny resource usage: one threadblock each, with one thread each, very low register usage, and no shared memory usage.

    On the other hand, if I run the same test under cuda-memcheck, there is evident serialization: although the time for a single kernel launch is relatively unaffected, the time for two "concurrent" kernel launches is approximately double, consistent with cuda-memcheck serializing kernel launches as part of its checking.
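
    Applied to the code in the question, the host side might look something like the sketch below. This is only an illustration: gPipelineCh is a hypothetical per-channel variant of the posted gPipeline (the posted kernel loops over both channels internally), ImgKernel_PipelineConcurrent is a hypothetical helper, and GPU_CHX comes from the question's headers.

    /* Hedged sketch, not a drop-in replacement for the question's code. */
    #include <cuda_runtime.h>

    /* Hypothetical per-channel variant of the question's gPipeline kernel. */
    extern "C" __global__ void gPipelineCh(int gpuId, int ch);

    void ImgKernel_PipelineConcurrent(int gpu)   /* hypothetical helper */
    {
        static cudaStream_t st[GPU_CHX];
        static int ready = 0;

        cudaSetDevice(gpu);

        if (!ready)
        {
            /* Create the non-default streams once and reuse them. */
            for (int ch = 0; ch < GPU_CHX; ch++)
                cudaStreamCreate(&st[ch]);
            ready = 1;
        }

        /* One parent (dynamic parallelism) kernel per channel, each in its own
           non-default stream. Whether they actually overlap depends on the
           resource constraints discussed above. */
        for (int ch = 0; ch < GPU_CHX; ch++)
            gPipelineCh<<<1, 1, 0, st[ch]>>>(gpu, ch);

        /* Synchronization is left to the caller, as in the original
           ImgKernel_Pipeline(). */
    }

    As with the worked example above, this would need to be compiled with -rdc=true and linked against cudadevrt.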