Cuda Bounding Box with NPP LabelMarkers

I am trying to find bounding boxes for my input data using cuda libraries. I start off with a data set that has noise (and possibly some zeroed out cells) with areas of data that are much higher than the noise.

First I apply a gaussian blur to my data with nppiFilterGauss_32f_C1R.

I then threshold it with nppiCompareC_32f_C1R to create a binary image.

Following this, I use nppiLabelMarkers_8u32u_C1R to create a unique label for each area.

At this point, my results are as I expect. I am left with a dataset that has unique values for each "blob" (although with numeric gaps between the numbers).

I have been looking online and can't seem to find a library that will then find bounding boxes for labeled components on a GPU.

I was able to get the complete flow working with OpenCV using findContours and BoundingRects, but this was doing the work on the CPU and is not able to keep up with my data rate.

Is there a cuda function I am missing that can provide me with the bounding box parameters of each of these labeled blobs?

Thanks!

Solution

After the label markers operation, if we then compress the label markers, we can realize a fairly simple approach for identifying bounding boxes, using atomicMax and atomicMin in a simple CUDA kernel.

Here is a worked example:

$ cat t1461.cu
#include <stdio.h>
#include <nppi_filtering_functions.h>
#include <assert.h>
#define WIDTH 16
#define HEIGHT 16
void my_print(Npp16u *data, int w, int h){
  for (int i = 0; i < h; i++)
    {
    for (int j = 0; j < w; j++)
      {
      if (data[i*w+j] == 255) printf("  *");
      else printf("%3hd", data[i * w + j]);
      }
    printf("\n");
  }

}

template <typename T>
__global__ void bb(const T * __restrict__ i, int * __restrict__ maxh, int * __restrict__ minh, int * __restrict__ maxw, int * __restrict__ minw, int height, int width){

  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  int idy = threadIdx.y+blockDim.y*blockIdx.y;
  if ((idx < width) && (idy < height)){
    T myval = i[idy*width+idx];
    if (myval > 0){
      atomicMax(maxw+myval-1, idx);
      atomicMin(minw+myval-1, idx);
      atomicMax(maxh+myval-1, idy);
      atomicMin(minh+myval-1, idy);}
  }
}

int main(){
Npp16u host_src[WIDTH * HEIGHT] =
{
0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,
0, 255,255,255, 0, 0,255,255,255, 0, 0, 0, 0, 0, 0, 0,
0, 255,255,255, 0, 0,255,255,255, 0, 0, 0, 0, 0, 0, 0,
0, 255,255,255, 0, 0,255,255,255,255, 0, 0,255, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,255,255,255, 0, 0, 0,255,255,255,
0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0,255,255,255,
0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,
0, 255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 255,255,255, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,
0, 255,255,255,255, 0, 0,255,255,255, 0, 0, 0, 0, 0, 0,
0, 0,255,255,255, 0,255,255,255,255,255, 0, 0, 0, 0, 0,
0, 0, 0,255, 0, 0, 0,255,255,255, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0,255,255,255,
0, 255,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,
0, 255,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,
0, 255,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

  Npp16u * device_src;
  cudaMalloc((void**)&device_src, sizeof(Npp16u) * WIDTH * HEIGHT);
  cudaMemcpy(device_src, host_src, sizeof(Npp16u) * WIDTH * HEIGHT, cudaMemcpyHostToDevice);

  int buffer_size;
  NppiSize source_roi = { WIDTH, HEIGHT };
  NppStatus e = nppiLabelMarkersGetBufferSize_16u_C1R(source_roi, &buffer_size);
  assert(e == NPP_NO_ERROR);
  Npp8u * buffer;
  cudaMalloc((void**)&buffer, buffer_size);

  int max;
  e = nppiLabelMarkers_16u_C1IR(device_src, sizeof(Npp16u) * WIDTH, source_roi, (Npp16u)1, nppiNormInf, &max, buffer);
  assert(e == NPP_NO_ERROR);
  printf("initial max: %d\n", max);
  int bs;
  e = nppiCompressMarkerLabelsGetBufferSize_16u_C1R (1, &bs);
  assert(e == NPP_NO_ERROR);
  if (bs>buffer_size){
    buffer_size = bs;
    cudaFree(buffer);
    cudaMalloc(&buffer, buffer_size);}
  e = nppiCompressMarkerLabels_16u_C1IR(device_src, sizeof(Npp16u)*WIDTH, source_roi, max, &max, buffer);
  assert(e == NPP_NO_ERROR);
  int *maxw, *maxh, *minw, *minh, *d_maxw, *d_maxh, *d_minw, *d_minh;
  maxw = new int[max];
  maxh = new int[max];
  minw = new int[max];
  minh = new int[max];
  cudaMalloc(&d_maxw, max*sizeof(int));
  cudaMalloc(&d_maxh, max*sizeof(int));
  cudaMalloc(&d_minw, max*sizeof(int));
  cudaMalloc(&d_minh, max*sizeof(int));
  for (int i = 0; i < max; i++){
    maxw[i] = 0;
    maxh[i] = 0;
    minw[i] = WIDTH;
    minh[i] = HEIGHT;}
  cudaMemcpy(d_maxw, maxw, max*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_maxh, maxh, max*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_minw, minw, max*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_minh, minh, max*sizeof(int), cudaMemcpyHostToDevice);
  dim3 block(32,32);
  dim3 grid((WIDTH+block.x-1)/block.x, (HEIGHT+block.y-1)/block.y);
  bb<<<grid, block>>>(device_src, d_maxh, d_minh, d_maxw, d_minw, HEIGHT, WIDTH);
  cudaMemcpy(maxw, d_maxw, max*sizeof(int), cudaMemcpyDeviceToHost);
  cudaMemcpy(maxh, d_maxh, max*sizeof(int), cudaMemcpyDeviceToHost);
  cudaMemcpy(minw, d_minw, max*sizeof(int), cudaMemcpyDeviceToHost);
  cudaMemcpy(minh, d_minh, max*sizeof(int), cudaMemcpyDeviceToHost);

  Npp16u *dst = new Npp16u[WIDTH * HEIGHT];
  cudaMemcpy(dst, device_src, sizeof(Npp16u) * WIDTH * HEIGHT, cudaMemcpyDeviceToHost);

  printf("*******INPUT************\n");
  my_print(host_src, WIDTH, HEIGHT);
  printf("******OUTPUT************\n");
  my_print(dst, WIDTH,HEIGHT);
  printf("compressed max: %d\n", max);
  printf("bounding boxes:\n");
  for (int i = 0; i < max; i++)
    printf("label %d, maxh: %d, minh: %d, maxw: %d, minw: %d\n", i+1, maxh[i], minh[i], maxw[i], minw[i]);
}
$ nvcc -o t1461 t1461.cu -lnppif
$ cuda-memcheck ./t1461
========= CUDA-MEMCHECK
initial max: 10
*******INPUT************
  0  0  0  0  0  0  0  0  *  0  0  0  0  0  0  0
  0  *  *  *  0  0  *  *  *  0  0  0  0  0  0  0
  0  *  *  *  0  0  *  *  *  0  0  0  0  0  0  0
  0  *  *  *  0  0  *  *  *  *  0  0  *  0  0  0
  0  0  0  0  0  0  0  *  *  *  0  0  0  *  *  *
  0  0  0  0  0  0  0  0  *  0  0  0  0  *  *  *
  0  0  0  *  0  0  0  0  0  0  0  0  0  *  *  *
  0  *  *  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  *  *  *  0  0  0  0  *  0  0  0  0  0  0  0
  0  *  *  *  *  0  0  *  *  *  0  0  0  0  0  0
  0  0  *  *  *  0  *  *  *  *  *  0  0  0  0  0
  0  0  0  *  0  0  0  *  *  *  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  *  0  0  0  0  *  *  *
  0  *  *  *  0  0  0  0  0  0  0  0  0  *  *  *
  0  *  *  *  0  0  0  0  0  0  0  0  0  *  *  *
  0  *  *  *  0  0  0  0  0  0  0  0  0  0  0  0
******OUTPUT************
  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0
  0  2  2  2  0  0  1  1  1  0  0  0  0  0  0  0
  0  2  2  2  0  0  1  1  1  0  0  0  0  0  0  0
  0  2  2  2  0  0  1  1  1  1  0  0  3  0  0  0
  0  0  0  0  0  0  0  1  1  1  0  0  0  3  3  3
  0  0  0  0  0  0  0  0  1  0  0  0  0  3  3  3
  0  0  0  4  0  0  0  0  0  0  0  0  0  3  3  3
  0  4  4  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  4  4  4  0  0  0  0  5  0  0  0  0  0  0  0
  0  4  4  4  4  0  0  5  5  5  0  0  0  0  0  0
  0  0  4  4  4  0  5  5  5  5  5  0  0  0  0  0
  0  0  0  4  0  0  0  5  5  5  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  5  0  0  0  0  6  6  6
  0  7  7  7  0  0  0  0  0  0  0  0  0  6  6  6
  0  7  7  7  0  0  0  0  0  0  0  0  0  6  6  6
  0  7  7  7  0  0  0  0  0  0  0  0  0  0  0  0
compressed max: 7
bounding boxes:
label 1, maxh: 5, minh: 0, maxw: 9, minw: 6
label 2, maxh: 3, minh: 1, maxw: 3, minw: 1
label 3, maxh: 6, minh: 3, maxw: 15, minw: 12
label 4, maxh: 11, minh: 6, maxw: 4, minw: 1
label 5, maxh: 12, minh: 8, maxw: 10, minw: 6
label 6, maxh: 14, minh: 12, maxw: 15, minw: 13
label 7, maxh: 15, minh: 13, maxw: 3, minw: 1
========= ERROR SUMMARY: 0 errors
$

Note that if you're going to do this repetetively (for example identifying bounding boxes on video frames) you'll want to get the cudaMalloc operations mostly out of the performance loop.

A typical approach would be to use the methodology that I have already shown for allocation of buffer in the code above. Only free and reallocate the buffer if the previous size is too small. Likewise for the max and min buffers.