c++cudanvidiaconvolutionnpp

Nvidia nppi function corrupting destination pointer


I am trying to run this 2d convolution filter from the nppi library, but for some reason it seems my destination pointer is getting corrupted.

#define KERNEL_WIDTH 3
#define KERNEL_HEIGHT 3
#define KERNEL_SIZE (KERNEL_WIDTH * KERNEL_HEIGHT)
#define KERNEL {0.0f, 0.25f, 0.0f, 0.25f, 0.0f, 0.25f, 0.0f, 0.25f, 0.0f}

float kernel[KERNEL_SIZE] = KERNEL;
NppiSize kernelSize = {.width=KERNEL_WIDTH, .height=KERNEL_HEIGHT};


int width;
int height;

float *hostIn;  // some data is put here
float *hostOut;
float *deviceIn;
float *deviceOut;

hostOut = malloc(width * height * sizeof(float));
cudaMalloc((void **)&deviceIn, width * height * sizeof(float));
cudaMalloc((void **)&deviceOut, width * height * sizeof(float));
cudaMemcpy(deviceIn, hostInputData, width * height * sizeof(float), cudaMemcpyHostToDevice);

// The next statement works fine
cudaMemcpy(hostOutputData, deviceOut, width * height * sizeof(float), cudaMemcpyDeviceToHost);

NppiSize inputSize = {.width=width, .height=height};
NppiSize oSizeROI = {.width=width, .height=height};
NppiPoint oAnchor = {.x=0, .y=0};
NppiPoint oSrcOffset = {.x=0, .y=0};
// The following statement returns NPP_SUCCESS.
NppStatus err = nppiFilterBorder_32f_C1R(deviceIn, width * sizeof(float), inputSize, oSrcOffset, deviceOut, width*sizeof(float), oSizeROI, kernel, kernelSize, oAnchor, NPP_BORDER_REPLICATE);
assert(err == NPP_SUCCESS);
// The next statement fails with illegal memory access error
cudaMemcpy(hostOutputData, deviceOut, width * height * sizeof(float), cudaMemcpyDeviceToHost);

What I do not understand is why the cudaMemcpy after the nppi call is resulting in an illegal access error. Any advice would be great!


Solution

  • The main issue is that you cannot use a __constant__ global definition for the convolution kernel you pass to npp. That's not an option.

    Here's a modification of what you have shown, converted to complete code, that converts that kernel to an ordinary device allocation, and runs without runtime error:

    # cat t73.cu
    #define KERNEL_WIDTH 3
    #define KERNEL_HEIGHT 3
    #define KERNEL_SIZE (KERNEL_WIDTH * KERNEL_HEIGHT)
    #define KERNEL {0.0f, 0.25f, 0.0f, 0.25f, 0.0f, 0.25f, 0.0f, 0.25f, 0.0f}
    
    #include <cassert>
    #include <npp.h>
    
    int main(){
    
      float kernel[KERNEL_SIZE] = KERNEL;
      NppiSize kernelSize = {.width=KERNEL_WIDTH, .height=KERNEL_HEIGHT};
    
    
      int width = 256;
      int height = 256;
    
      float *hostIn;  // some data is put here
      float *hostOut;
      float *deviceIn;
      float *deviceOut;
      float *dKernel;
    
      hostOut = (float *)malloc(width * height * sizeof(float));
      hostIn = (float *)malloc(width * height * sizeof(float));
      cudaMalloc((void **)&deviceIn, width * height * sizeof(float));
      cudaMalloc((void **)&deviceOut, (width+2) * (height+2) * sizeof(float));
      cudaMemcpy(deviceIn, hostIn, width * height * sizeof(float), cudaMemcpyHostToDevice);
      cudaMalloc(&dKernel, KERNEL_SIZE*sizeof(dKernel[0]));
      cudaMemcpy(dKernel, kernel, KERNEL_SIZE*sizeof(dKernel[0]), cudaMemcpyHostToDevice);
      NppiSize inputSize = {.width=width, .height=height};
      NppiSize oSizeROI = {.width=width, .height=height};
      NppiPoint oAnchor = {.x=0, .y=0};
      NppiPoint oSrcOffset = {.x=0, .y=0};
    // The following statement returns NPP_SUCCESS.
      NppStatus err = nppiFilterBorder_32f_C1R(deviceIn, width * sizeof(float), inputSize, oSrcOffset, deviceOut, width*sizeof(float), oSizeROI, dKernel, kernelSize, oAnchor, NPP_BORDER_REPLICATE);
      assert(err == NPP_SUCCESS);
    // The next statement fails with illegal memory access error
      cudaMemcpy(hostOut, deviceOut, width * height * sizeof(float), cudaMemcpyDeviceToHost);
    }
    # nvcc -o t73 t73.cu -lnppif
    # compute-sanitizer ./t73
    ========= COMPUTE-SANITIZER
    ========= ERROR SUMMARY: 0 errors
    #