cuda, profiling, atomic, nvidia, nsight

Error in profiling shared memory atomic kernel in Nsight Compute


I am trying the global atomics vs. shared atomics code from the NVIDIA blog post https://developer.nvidia.com/blog/gpu-pro-tip-fast-histograms-using-shared-atomics-maxwell/

But when I profile it with the Nsight Compute CLI, it reports an error for the shared atomics kernel:

==PROF== Connected to process 16078
==PROF== Profiling "histogram_gmem_atomics" - 0: 0%....50%....100% - 1 pass
==PROF== Profiling "histogram_smem_atomics" - 1: 0%....50%....100% - 1 pass

==ERROR== LaunchFailed

==ERROR== LaunchFailed
==PROF== Trying to shutdown target application
==ERROR== The application returned an error code (9).
==ERROR== An error occurred while trying to profile.
==WARNING== Found outstanding GPU clock reset, trying to revert...Success.
[16078] histogram@127.0.0.1
  histogram_gmem_atomics(const IN_TYPE *, int, int, unsigned int *), 2023-Mar-09 12:55:43, Context 1, Stream 7
    Section: Command line profiler metrics
    ---------------------------------------------------------------------- --------------- ------------------------------
    dram__bytes.sum.per_second                                                Gbyte/second                          13,98
    ---------------------------------------------------------------------- --------------- ------------------------------

  histogram_smem_atomics(const IN_TYPE *, int, int, unsigned int *), 2023-Mar-09 12:55:43, Context 1, Stream 7
    Section: Command line profiler metrics
    ---------------------------------------------------------------------- --------------- ------------------------------
    dram__bytes.sum.per_second                                                 byte/second                        (!) nan
    ---------------------------------------------------------------------- --------------- ------------------------------

Why is this showing an error in ncu? For reference, my main function looks like this:

#define NUM_BINS 480
#define NUM_PARTS 48

struct IN_TYPE
{
    int x;
    int y;
    int z;
};

int main(){
    int height = 480;
    int width = height;

    auto nThread = 16;
    auto nBlock = (height) / nThread;

    IN_TYPE* h_in_image, *d_in_image;
    unsigned int* d_out_image;
    h_in_image = (IN_TYPE *)malloc(height*width * sizeof(IN_TYPE));
    cudaMalloc(&d_in_image, height*width * sizeof(IN_TYPE));
    cudaMalloc(&d_out_image, height*width * sizeof(unsigned int));

    for (int n = 0; n < (height*width); n++)
    {
        h_in_image[n].x = rand()%10;
        h_in_image[n].y = rand()%10;
        h_in_image[n].z = rand()%10;
    }
    cudaMemcpy(d_in_image, h_in_image, height*width * sizeof(IN_TYPE), cudaMemcpyHostToDevice);

    histogram_gmem_atomics<<<nBlock, nThread>>>(d_in_image, width, height, d_out_image);
    cudaDeviceSynchronize();

// not copying the results back as of now

    histogram_smem_atomics<<<nBlock, nThread>>>(d_in_image, width, height, d_out_image);
    cudaDeviceSynchronize();

}

Solution

  • Why is this showing an error in ncu?

    The blog in question expects that the pixel (component) values will be expressed as floating-point in the range of [0,1.0). This is why this kind of multiplication makes sense, for either the gmem or smem version:

      unsigned int r = (unsigned int)(256 * in[row * width + col].x);
                                      ^^^^^^
    
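    As a quick host-side illustration (my own numbers, not from the blog): a float component in [0, 1.0) quantizes to a valid bin, while an integer component such as the ones produced by rand()%10 lands far outside the bin range:

      float fx = 0.73f;                               // normalized component in [0, 1.0)
      unsigned int bin_f = (unsigned int)(256 * fx);  // 186  -> a valid bin in [0, 255]

      int ix = 9;                                     // integer component, e.g. from rand()%10
      unsigned int bin_i = (unsigned int)(256 * ix);  // 2304 -> far past the last bin
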

    so this is not correct:

    struct IN_TYPE
    {
        int x;
        int y;
        int z;
    };
    

    Instead, you want something like:

    struct IN_TYPE
    {
        float x;
        float y;
        float z;
    };
    

    and make sure that you initialize those values (x, y, z) in a range of 0.0 to ~0.999999 max.
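
    For example, a minimal initialization sketch (my own; the full listing below uses rand()/(float)RAND_MAX instead) that keeps every component strictly below 1.0:

    for (int n = 0; n < (height*width); n++)
    {
        h_in_image[n].x = (rand() % 1000) / 1000.0f;   // 0.000 .. 0.999
        h_in_image[n].y = (rand() % 1000) / 1000.0f;
        h_in_image[n].z = (rand() % 1000) / 1000.0f;
    }
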

    Based on the structure of the code, and as stated in the blog, I'm not sure that more than 256 bins makes any sense. The code quantizes the float pixel values to an integer range of [0,255]. With your integer pixel values (0 through 9), the 256 * multiplication produces bin indices as large as 2304, which indexes well past the end of the shared accumulation array; that out-of-bounds shared-memory write is the likely reason the smem kernel reports LaunchFailed, while the gmem kernel's writes happen to land inside your oversized output allocation and so it profiles without error.

    For the global output, your settings for NUM_PARTS (effectively the number of bins times the number of color components, i.e. the "parts" of each per-block partial histogram) and the size of the output array don't make sense.
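
    Concretely, a small sketch (mine, matching the fix below) of the sizing arithmetic: each block writes one partial histogram of 3 * NUM_BINS counters, so NUM_PARTS should be 3 * NUM_BINS and the output array must hold one such partial histogram per block:

    #define NUM_BINS  (256)
    #define NUM_PARTS (3*NUM_BINS)   // 768 counters per block: 3 channels x 256 bins

    // room for nBlock partial histograms, one per block
    cudaMalloc(&d_out_image, nBlock * NUM_PARTS * sizeof(unsigned int));
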

    When I address those items, the code runs without error for me:

    $ cat t2209.cu
    #include <cstdlib>   // for rand(), RAND_MAX and malloc()
    #define NUM_BINS (256)
    #define NUM_PARTS (3*NUM_BINS)
    
    struct IN_TYPE
    {
        float x;
        float y;
        float z;
    };
    
    
    __global__ void histogram_gmem_atomics(const IN_TYPE *in, int width, int height, unsigned int *out)
    {
      // pixel coordinates
      int x = blockIdx.x * blockDim.x + threadIdx.x;
      int y = blockIdx.y * blockDim.y + threadIdx.y;
    
      // grid dimensions
      int nx = blockDim.x * gridDim.x;
      int ny = blockDim.y * gridDim.y;
    
      // linear thread index within 2D block
      int t = threadIdx.x + threadIdx.y * blockDim.x;
    
      // total threads in 2D block
      int nt = blockDim.x * blockDim.y;
    
      // linear block index within 2D grid
      int g = blockIdx.x + blockIdx.y * gridDim.x;
    
      // initialize temporary accumulation array in global memory
      unsigned int *gmem = out + g * NUM_PARTS;
      for (int i = t; i < 3 * NUM_BINS; i += nt) gmem[i] = 0;
    
      // process pixels
      // updates our block's partial histogram in global memory
      for (int col = x; col < width; col += nx)
        for (int row = y; row < height; row += ny) {
          unsigned int r = (unsigned int)(256 * in[row * width + col].x);
          unsigned int g = (unsigned int)(256 * in[row * width + col].y);
          unsigned int b = (unsigned int)(256 * in[row * width + col].z);
          atomicAdd(&gmem[NUM_BINS * 0 + r], 1);
          atomicAdd(&gmem[NUM_BINS * 1 + g], 1);
          atomicAdd(&gmem[NUM_BINS * 2 + b], 1);
        }
    }
    
    __global__ void histogram_smem_atomics(const IN_TYPE *in, int width, int height, unsigned int *out)
    {
      // pixel coordinates
      int x = blockIdx.x * blockDim.x + threadIdx.x;
      int y = blockIdx.y * blockDim.y + threadIdx.y;
    
      // grid dimensions
      int nx = blockDim.x * gridDim.x;
      int ny = blockDim.y * gridDim.y;
    
      // linear thread index within 2D block
      int t = threadIdx.x + threadIdx.y * blockDim.x;
    
      // total threads in 2D block
      int nt = blockDim.x * blockDim.y;
    
      // linear block index within 2D grid
      int g = blockIdx.x + blockIdx.y * gridDim.x;
    
      // initialize temporary accumulation array in shared memory
      __shared__ unsigned int smem[3 * NUM_BINS + 3];
      for (int i = t; i < 3 * NUM_BINS + 3; i += nt) smem[i] = 0;
      __syncthreads();
    
      // process pixels
      // updates our block's partial histogram in shared memory
      for (int col = x; col < width; col += nx)
        for (int row = y; row < height; row += ny) {
          unsigned int r = (unsigned int)(256 * in[row * width + col].x);
          unsigned int g = (unsigned int)(256 * in[row * width + col].y);
          unsigned int b = (unsigned int)(256 * in[row * width + col].z);
          atomicAdd(&smem[NUM_BINS * 0 + r + 0], 1);
          atomicAdd(&smem[NUM_BINS * 1 + g + 1], 1);
          atomicAdd(&smem[NUM_BINS * 2 + b + 2], 1);
        }
      __syncthreads();
    
      // write partial histogram into the global memory
      out += g * NUM_PARTS;
      for (int i = t; i < NUM_BINS; i += nt) {
        out[i + NUM_BINS * 0] = smem[i + NUM_BINS * 0];
        out[i + NUM_BINS * 1] = smem[i + NUM_BINS * 1 + 1];
        out[i + NUM_BINS * 2] = smem[i + NUM_BINS * 2 + 2];
      }
    }
    
    int main(){
        int height = 480;
        int width = height;
    
        auto nThread = 16;
        auto nBlock = (height) / nThread;
    
        IN_TYPE* h_in_image, *d_in_image;
        unsigned int* d_out_image;
        h_in_image = (IN_TYPE *)malloc(height*width * sizeof(IN_TYPE));
        cudaMalloc(&d_in_image, height*width * sizeof(IN_TYPE));
        cudaMalloc(&d_out_image, nBlock*NUM_PARTS * sizeof(unsigned int));
    
        for (int n = 0; n < (height*width); n++)
        {
            h_in_image[n].x = rand()/(float)RAND_MAX;
            h_in_image[n].y = rand()/(float)RAND_MAX;
            h_in_image[n].z = rand()/(float)RAND_MAX;
        }
        cudaMemcpy(d_in_image, h_in_image, height*width * sizeof(IN_TYPE), cudaMemcpyHostToDevice);
    
        histogram_gmem_atomics<<<nBlock, nThread>>>(d_in_image, width, height, d_out_image);
        cudaDeviceSynchronize();
    
    // not copying the results back as of now
    
        histogram_smem_atomics<<<nBlock, nThread>>>(d_in_image, width, height, d_out_image);
        cudaDeviceSynchronize();
    
    }
    $ nvcc -o t2209 t2209.cu
    $ compute-sanitizer ./t2209
    ========= COMPUTE-SANITIZER
    ========= ERROR SUMMARY: 0 errors
    $
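
    With the out-of-bounds writes removed, re-running the original Nsight Compute collection should profile both kernels cleanly; for example (my own command line, reusing the metric from your log and the binary name from above):

    $ ncu --metrics dram__bytes.sum.per_second ./t2209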