cuda, profiling, atomic, nvidia, nsight

Error in profiling shared memory atomic kernel in Nsight Compute


I am trying the global atomics vs. shared atomics code from the NVIDIA blog post https://developer.nvidia.com/blog/gpu-pro-tip-fast-histograms-using-shared-atomics-maxwell/

But when I profile it with the Nsight Compute CLI, it reports an error for the shared atomics kernel:

==PROF== Connected to process 16078
==PROF== Profiling "histogram_gmem_atomics" - 0: 0%....50%....100% - 1 pass
==PROF== Profiling "histogram_smem_atomics" - 1: 0%....50%....100% - 1 pass

==ERROR== LaunchFailed

==ERROR== LaunchFailed
==PROF== Trying to shutdown target application
==ERROR== The application returned an error code (9).
==ERROR== An error occurred while trying to profile.
==WARNING== Found outstanding GPU clock reset, trying to revert...Success.
[16078] histogram@127.0.0.1
  histogram_gmem_atomics(const IN_TYPE *, int, int, unsigned int *), 2023-Mar-09 12:55:43, Context 1, Stream 7
    Section: Command line profiler metrics
    ---------------------------------------------------------------------- --------------- ------------------------------
    dram__bytes.sum.per_second                                                Gbyte/second                          13,98
    ---------------------------------------------------------------------- --------------- ------------------------------

  histogram_smem_atomics(const IN_TYPE *, int, int, unsigned int *), 2023-Mar-09 12:55:43, Context 1, Stream 7
    Section: Command line profiler metrics
    ---------------------------------------------------------------------- --------------- ------------------------------
    dram__bytes.sum.per_second                                                 byte/second                        (!) nan
    ---------------------------------------------------------------------- --------------- ------------------------------

Why is this showing an error in ncu? For reference, my main function looks like this:

#define NUM_BINS 480
#define NUM_PARTS 48

struct IN_TYPE
{
    int x;
    int y;
    int z;
};

int main(){
    int height = 480;
    int width = height;

    auto nThread = 16;
    auto nBlock = (height) / nThread;

    IN_TYPE* h_in_image, *d_in_image;
    unsigned int* d_out_image;
    h_in_image = (IN_TYPE *)malloc(height*width * sizeof(IN_TYPE));
    cudaMalloc(&d_in_image, height*width * sizeof(IN_TYPE));
    cudaMalloc(&d_out_image, height*width * sizeof(unsigned int));

    for (int n = 0; n < (height*width); n++)
    {
        h_in_image[n].x = rand()%10;
        h_in_image[n].y = rand()%10;
        h_in_image[n].z = rand()%10;
    }
    cudaMemcpy(d_in_image, h_in_image, height*width * sizeof(IN_TYPE), cudaMemcpyHostToDevice);

    histogram_gmem_atomics<<<nBlock, nThread>>>(d_in_image, width, height, d_out_image);
    cudaDeviceSynchronize();

// not copying the results back as of now

    histogram_smem_atomics<<<nBlock, nThread>>>(d_in_image, width, height, d_out_image);
    cudaDeviceSynchronize();

}

Solution

  • Why is this showing an error in ncu?

    The blog in question expects that the pixel (component) values will be expressed as floating-point in the range of [0,1.0). This is why this kind of multiplication makes sense, for either the gmem or smem version:

      unsigned int r = (unsigned int)(256 * in[row * width + col].x);
                                      ^^^^^^
    
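    As a quick host-side illustration (my own numbers, not from the blog): a float component in [0, 1.0) quantizes to a valid bin, while an integer component such as the ones produced by rand()%10 lands far outside the bin range:

      float fx = 0.73f;                               // normalized component in [0, 1.0)
      unsigned int bin_f = (unsigned int)(256 * fx);  // 186  -> a valid bin in [0, 255]

      int ix = 9;                                     // integer component, e.g. from rand()%10
      unsigned int bin_i = (unsigned int)(256 * ix);  // 2304 -> far past the last bin
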

    so this is not correct:

    struct IN_TYPE
    {
        int x;
        int y;
        int z;
    };
    

    Instead, you want something like:

    struct IN_TYPE
    {
        float x;
        float y;
        float z;
    };
    

    and make sure that you initialize those values (x, y, z) in a range of 0.0 to ~0.999999 max.
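
    For example, a minimal initialization sketch (my own; the full listing below uses rand()/(float)RAND_MAX instead) that keeps every component strictly below 1.0:

    for (int n = 0; n < (height*width); n++)
    {
        h_in_image[n].x = (rand() % 1000) / 1000.0f;   // 0.000 .. 0.999
        h_in_image[n].y = (rand() % 1000) / 1000.0f;
        h_in_image[n].z = (rand() % 1000) / 1000.0f;
    }
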

    Based on the structure of the code, and as stated in the blog, I'm not sure that more than 256 bins makes any sense. The code quantizes the float pixel values to an integer range of [0,255]. With your integer pixel values (0 through 9), the 256 * multiplication produces bin indices as large as 2304, which indexes well past the end of the shared accumulation array; that out-of-bounds shared-memory write is the likely reason the smem kernel reports LaunchFailed, while the gmem kernel's writes happen to land inside your oversized output allocation and so it profiles without error.

    For the global output, your settings for NUM_PARTS (effectively the number of bins times the number of color components, i.e. the "parts" of each per-block partial histogram) and the size of the output array don't make sense.
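
    Concretely, a small sketch (mine, matching the fix below) of the sizing arithmetic: each block writes one partial histogram of 3 * NUM_BINS counters, so NUM_PARTS should be 3 * NUM_BINS and the output array must hold one such partial histogram per block:

    #define NUM_BINS  (256)
    #define NUM_PARTS (3*NUM_BINS)   // 768 counters per block: 3 channels x 256 bins

    // room for nBlock partial histograms, one per block
    cudaMalloc(&d_out_image, nBlock * NUM_PARTS * sizeof(unsigned int));
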

    When I address those items, the code runs without error for me:

    $ cat t2209.cu
    #include <cstdlib>   // for rand(), RAND_MAX and malloc()
    #define NUM_BINS (256)
    #define NUM_PARTS (3*NUM_BINS)
    
    struct IN_TYPE
    {
        float x;
        float y;
        float z;
    };
    
    
    __global__ void histogram_gmem_atomics(const IN_TYPE *in, int width, int height, unsigned int *out)
    {
      // pixel coordinates
      int x = blockIdx.x * blockDim.x + threadIdx.x;
      int y = blockIdx.y * blockDim.y + threadIdx.y;
    
      // grid dimensions
      int nx = blockDim.x * gridDim.x;
      int ny = blockDim.y * gridDim.y;
    
      // linear thread index within 2D block
      int t = threadIdx.x + threadIdx.y * blockDim.x;
    
      // total threads in 2D block
      int nt = blockDim.x * blockDim.y;
    
      // linear block index within 2D grid
      int g = blockIdx.x + blockIdx.y * gridDim.x;
    
      // initialize temporary accumulation array in global memory
      unsigned int *gmem = out + g * NUM_PARTS;
      for (int i = t; i < 3 * NUM_BINS; i += nt) gmem[i] = 0;
    
      // process pixels
      // updates our block's partial histogram in global memory
      for (int col = x; col < width; col += nx)
        for (int row = y; row < height; row += ny) {
          unsigned int r = (unsigned int)(256 * in[row * width + col].x);
          unsigned int g = (unsigned int)(256 * in[row * width + col].y);
          unsigned int b = (unsigned int)(256 * in[row * width + col].z);
          atomicAdd(&gmem[NUM_BINS * 0 + r], 1);
          atomicAdd(&gmem[NUM_BINS * 1 + g], 1);
          atomicAdd(&gmem[NUM_BINS * 2 + b], 1);
        }
    }
    
    __global__ void histogram_smem_atomics(const IN_TYPE *in, int width, int height, unsigned int *out)
    {
      // pixel coordinates
      int x = blockIdx.x * blockDim.x + threadIdx.x;
      int y = blockIdx.y * blockDim.y + threadIdx.y;
    
      // grid dimensions
      int nx = blockDim.x * gridDim.x;
      int ny = blockDim.y * gridDim.y;
    
      // linear thread index within 2D block
      int t = threadIdx.x + threadIdx.y * blockDim.x;
    
      // total threads in 2D block
      int nt = blockDim.x * blockDim.y;
    
      // linear block index within 2D grid
      int g = blockIdx.x + blockIdx.y * gridDim.x;
    
      // initialize temporary accumulation array in shared memory
      __shared__ unsigned int smem[3 * NUM_BINS + 3];
      for (int i = t; i < 3 * NUM_BINS + 3; i += nt) smem[i] = 0;
      __syncthreads();
    
      // process pixels
      // updates our block's partial histogram in shared memory
      for (int col = x; col < width; col += nx)
        for (int row = y; row < height; row += ny) {
          unsigned int r = (unsigned int)(256 * in[row * width + col].x);
          unsigned int g = (unsigned int)(256 * in[row * width + col].y);
          unsigned int b = (unsigned int)(256 * in[row * width + col].z);
          atomicAdd(&smem[NUM_BINS * 0 + r + 0], 1);
          atomicAdd(&smem[NUM_BINS * 1 + g + 1], 1);
          atomicAdd(&smem[NUM_BINS * 2 + b + 2], 1);
        }
      __syncthreads();
    
      // write partial histogram into the global memory
      out += g * NUM_PARTS;
      for (int i = t; i < NUM_BINS; i += nt) {
        out[i + NUM_BINS * 0] = smem[i + NUM_BINS * 0];
        out[i + NUM_BINS * 1] = smem[i + NUM_BINS * 1 + 1];
        out[i + NUM_BINS * 2] = smem[i + NUM_BINS * 2 + 2];
      }
    }
    
    int main(){
        int height = 480;
        int width = height;
    
        auto nThread = 16;
        auto nBlock = (height) / nThread;
    
        IN_TYPE* h_in_image, *d_in_image;
        unsigned int* d_out_image;
        h_in_image = (IN_TYPE *)malloc(height*width * sizeof(IN_TYPE));
        cudaMalloc(&d_in_image, height*width * sizeof(IN_TYPE));
        cudaMalloc(&d_out_image, nBlock*NUM_PARTS * sizeof(unsigned int));
    
        for (int n = 0; n < (height*width); n++)
        {
            h_in_image[n].x = rand()/(float)RAND_MAX;
            h_in_image[n].y = rand()/(float)RAND_MAX;
            h_in_image[n].z = rand()/(float)RAND_MAX;
        }
        cudaMemcpy(d_in_image, h_in_image, height*width * sizeof(IN_TYPE), cudaMemcpyHostToDevice);
    
        histogram_gmem_atomics<<<nBlock, nThread>>>(d_in_image, width, height, d_out_image);
        cudaDeviceSynchronize();
    
    // not copying the results back as of now
    
        histogram_smem_atomics<<<nBlock, nThread>>>(d_in_image, width, height, d_out_image);
        cudaDeviceSynchronize();
    
    }
    $ nvcc -o t2209 t2209.cu
    $ compute-sanitizer ./t2209
    ========= COMPUTE-SANITIZER
    ========= ERROR SUMMARY: 0 errors
    $
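
    With the out-of-bounds writes removed, re-running the original Nsight Compute collection should profile both kernels cleanly; for example (my own command line, reusing the metric from your log and the binary name from above):

    $ ncu --metrics dram__bytes.sum.per_second ./t2209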