CUDA NPP Median Filter for 16 bit images

Final Update: Solved. The WDDM timeout was also an issue. Found a solution from: WDDM timeout fix. Thanks Robert.

Update: Thanks Robert for pointing out that the center of the filter is not 0,0. Unfortunately, the code you posted will break for me if the filter is increased, say to 17x17. This could be due to you not accounting for the borders on the "side" of the image. In any case, here is the most current code, but still exhibiting the same problems as before...

//npp
#include "npp.h"
#include "nppi.h"
#include "device_launch_parameters.h"

#include <iostream>

int main() {

    //Image size.
    int imageWidth = 6592; 
    int imageHeight = 4400;

    //Misc.
    int bytesPerPixel = 2;
    int totalPixels = imageWidth*imageHeight;
    int filterSize = 17;
    int halfFilter = filterSize/2;
    cudaError success2;
    NppStatus success1;

    //Mask & Origin for CUDA.
    NppiSize cudaMask; 
    cudaMask.height = filterSize; 
    cudaMask.width = filterSize;
    NppiPoint cudaAnchor;
    cudaAnchor.x = halfFilter;
    cudaAnchor.y = halfFilter;

    //ROI for CUDA.
    int left = halfFilter;
    int right = (imageWidth-1) - halfFilter;
    int top = halfFilter;
    int bot = (imageHeight-1) - halfFilter;
    NppiSize cudaROI;
    cudaROI.height  = bot - top;
    cudaROI.width   = right - left;

    //Step size.
    int step = imageWidth * bytesPerPixel;

    //Create a new "image".
    unsigned short* image = new unsigned short[totalPixels];
    for(int i=0; i<imageWidth; i++)
        for(int j=0; j<imageHeight; j++)
            image[j*imageWidth+i] = 10;

    //Allocate mem on device.
    Npp16u *dSrc, *dDst;
    Npp8u *dBuf;
    Npp32u bufferSize;

    //This call always returns a bufferSize==0.  That doesn't seem right...
    success1 = nppiFilterMedianGetBufferSize_16u_C1R(cudaROI, cudaMask, &bufferSize);
    std::cout << "get bufferSize returned: " << (int)success1 << std::endl;
    std::cout << bufferSize << std::endl;
    success2 = cudaMalloc( (void**)&dBuf, bufferSize);
    std::cout << "cudaMalloc 1 returned: " << (int)success2 << std::endl;
    success2 = cudaMalloc( (void**)&dSrc, totalPixels*sizeof(Npp16u));
    std::cout << "cudaMalloc 2 returned: " << (int)success2 << std::endl;
    success2 = cudaMalloc( (void**)&dDst, totalPixels*sizeof(Npp16u));
    std::cout << "cudaMalloc 3 returned: " << (int)success2 << std::endl;

    //Copy host image to device.
    success2 = cudaMemcpy( dSrc, image, totalPixels*sizeof(Npp16u), cudaMemcpyHostToDevice);
    std::cout << "cudaMemcpy 1 returned: " << (int)success2 << std::endl;


    //Copy source to destination.
    success1 = nppiCopy_16u_C1R( dSrc, step, dDst, step, cudaROI);
    std::cout << "npp Copy 1 returned: " << (int)success1 << std::endl;


    //Filter.
    Npp32u offset = top*step + left*bytesPerPixel;
    success1 = nppiFilterMedian_16u_C1R(    dSrc + offset,
                                            step,
                                            dDst + offset,
                                            step,
                                            cudaROI, cudaMask, cudaAnchor, dBuf);
    std::cout << "npp Filter  returned: " << (int)success1 << std::endl;


    //Copy resultant back to host.
    success2 = cudaMemcpy( image, dDst, totalPixels*sizeof(Npp16u), cudaMemcpyDeviceToHost);
    std::cout << "cudaMemcpy 2 returned: " << (int)success2 << std::endl;

    //Clean.
    success2 = cudaFree(dDst);
    success2 = cudaFree(dBuf);
    success2 = cudaFree(dSrc);
    delete image;

    system("pause");
    return 0;

}

I'm trying to compute a median filter for a 29mp image. The filter size is 13x13. The image's width and height are shown below. For an unknown reason, the following code will crash and I'm asking if anyone knows why?

Strange things I've noticed:

The error occurs with nppiFilterMedian_16u_C1R(). The function itself returns a no error condition, but the following cudaMemcpy() does. Without the filter, the cudaMemcpy() works just fine.
Also, getting the buffer size for 16bit filter always returns a size of 0. I've tested 8bit, and 32 bit, which return non-zero values...
I think that this is possibly a bug (?) with the NPPI library. It seems to be size dependent (if you use reduce the image's width/height it will function just fine for a 13x13 filter size). However, my filter sizes need to go up to 31x31.

Other important information: Windows x64 application, CUDA runtime 7.5, NPP version 7.5. GPU device is a Quadro k2200 (4GB global mem).

Solution

The median filter function will pass a mask over the image, point-by-point. This mask has the dimensions specified (9x9 in your original code). The anchor point will determine how this mask is positioned for each pixel. When the anchor point is 0,0, the mask will be positioned like this:

p**
***
***

where p represents the pixel location, and the mask size is 3x3. For an anchor point of 1,1, the mask positioning, per pixel, would be:

***
*p*
***

Therefore we see that the anchor point, and the mask size, will determine a certain boundary around each pixel which must be accessible to the median filter function. When dealing with pixels in the border of the image, we must ensure that this boundary lands on valid pixels.

The case you started with, a 9x9 mask and 0,0 anchor point, means that we only need "extra" pixels for the boundary at the "end" of the image. Therefore the modification is simple: restrict the ROI height so as to not process the last few lines of the image, corresponding to the mask dimension. For this case, we can simply subtract 10 from the ROI height, and the errors go away:

$ cat t1223.cu
//npp
#include "npp.h"
#include "nppi.h"
#include <iostream>

int main() {

//When the filter size is 9x9....
int imageWidth = 6592; //breaks if > 5914 && imageHeight = 4400
int imageHeight = 4400; //breaks if > 3946 && imageWidth = 6592

//Misc.
int bytesPerPixel = 2;
int totalPixels = imageWidth*imageHeight;
cudaError success2;
NppStatus success1;

//ROI for CUDA.
NppiSize cudaROI;
cudaROI.height  = imageHeight-10;
cudaROI.width   = imageWidth;

//Mask & Origin for CUDA.
NppiSize cudaMask; NppiPoint cudaAnchor;
cudaMask.height = 9; //filter size
cudaMask.width = 9;
cudaAnchor.x = 0;
cudaAnchor.y = 0;

//Step size.
int step = imageWidth * bytesPerPixel;

//Create a new "image".
unsigned short* image = new unsigned short[totalPixels];
for(int i=0; i<imageWidth; i++)
    for(int j=0; j<imageHeight; j++)
        image[j*imageWidth+i] = 10;


//Allocate mem on device.
Npp16u *dSrc, *dDst;
Npp8u *dBuf;
Npp32u bufferSize;

//This call always returns a bufferSize==0.  That doesn't seem right...
success1 = nppiFilterMedianGetBufferSize_16u_C1R(cudaROI, cudaMask, &bufferSize);
std::cout << "get bufferSize returned: " << (int)success1 << std::endl;
std::cout << bufferSize << std::endl;
success2 = cudaMalloc( (void**)&dBuf, bufferSize);
std::cout << "cudaMalloc 1 returned: " << (int)success2 << std::endl;
success2 = cudaMalloc( (void**)&dSrc, totalPixels*sizeof(Npp16u));
std::cout << "cudaMalloc 2 returned: " << (int)success2 << std::endl;
success2 = cudaMalloc( (void**)&dDst, totalPixels*sizeof(Npp16u));
std::cout << "cudaMalloc 3 returned: " << (int)success2 << std::endl;

//Copy host image to device.
success2 = cudaMemcpy( dSrc, image, totalPixels*sizeof(Npp16u), cudaMemcpyHostToDevice);
std::cout << "cudaMemcpy 1 returned: " << (int)success2 << std::endl;

//Copy source to destination.
success1 = nppiCopy_16u_C1R( dSrc, step, dDst, step, cudaROI);
std::cout << "npp Copy 1 returned: " << (int)success1 << std::endl;

//Filter.
success1 = nppiFilterMedian_16u_C1R(dSrc,
                                    step,
                                    dDst,
                                    step,
                                    cudaROI, cudaMask, cudaAnchor, dBuf);
std::cout << "npp Filter  returned: " << (int)success1 << std::endl;

//Copy resultant back to host.
success2 = cudaMemcpy( image, dDst, totalPixels*sizeof(Npp16u), cudaMemcpyDeviceToHost);
std::cout << "cudaMemcpy 2 returned: " << (int)success2 << std::endl;

//Clean.
success2 = cudaFree(dBuf);
success2 = cudaFree(dSrc);
success2 = cudaFree(dDst);
delete image;

return 0;
}
$ nvcc -arch=sm_35 -o t1223 t1223.cu -lnppi
$ cuda-memcheck ./t1223
========= CUDA-MEMCHECK
get bufferSize returned: 0
0
cudaMalloc 1 returned: 0
cudaMalloc 2 returned: 0
cudaMalloc 3 returned: 0
cudaMemcpy 1 returned: 0
npp Copy 1 returned: 0
npp Filter  returned: 0
cudaMemcpy 2 returned: 0
========= ERROR SUMMARY: 0 errors
$

Note that if the anchor point were moved (say, to 4,4 instead of 0,0 in the case above), then this would mean that the "boundary" pixels would need to be available for ~5 lines before the start of the image. We could account for this by correctly setting the ROI and also offsetting the start of processing, by adding a line offset to the source pointer passed to the median filter, like so:

success1 = nppiFilterMedian_16u_C1R(dSrc + 5*imageWidth,

Note that I'm not trying to give a complete tutorial on median filtering here, just trying to identify the problem that leads to actual function failure. The left and right hand side filter mask boundaries are also something you may want to consider. At the left and right hand side of the image borders, those pixel mask boundaries may index to previous or next image lines, thus "wrapping" the image, perhaps with odd effects in the filtered pixels.

EDIT: Responding to the new code posting, the main issue now seems to be that you don't understand how to offset the image.

In C/C++, if I have a pointer, and I want to offset that pointer by a certain number of elements, I simply add the number of elements I want to offset it by. There is no need to scale this by bytes. If you would have studied the offset example I previously gave above, you would have noted that there is no scaling anything by bytes. If we want to offset by 5 lines, it is just 5 multiplied by the image width, as indicated above.

In addition, you were using the cudaROI to inform your src->dst copy operation, this doesn't make sense to me, so I modified that. Finally, I've modified the code so it can be built with the anchor in the corner or the anchor in the center.

Here's a modification of your code that compiles and runs correctly for me, in both anchor cases:

$ cat t1225.cu
//npp
#include "npp.h"
#include "nppi.h"
#include "device_launch_parameters.h"

#include <iostream>

int main() {

    //Image size.
    int imageWidth = 6592;
    int imageHeight = 4400;

    //Misc.
    int bytesPerPixel = 2;
    int totalPixels = imageWidth*imageHeight;
    int filterSize = 17;
    int halfFilter = filterSize/2;
    cudaError success2;
    NppStatus success1;

    //Mask & Origin for CUDA.
    NppiSize cudaMask;
    cudaMask.height = filterSize;
    cudaMask.width = filterSize;
    NppiPoint cudaAnchor;
#ifndef ANCHOR_CORNER
    cudaAnchor.x = halfFilter;
    cudaAnchor.y = halfFilter;
#else
    cudaAnchor.x = 0;
    cudaAnchor.y = 0;
#endif
    NppiSize imgSize;
    imgSize.width = imageWidth;
    imgSize.height = imageHeight;

    //ROI for CUDA.
    int left = halfFilter;
    int right = (imageWidth-1) - halfFilter;
    int top = halfFilter;
    int bot = (imageHeight-1) - halfFilter;
    NppiSize cudaROI;
    cudaROI.height  = bot - top;
    cudaROI.width   = right - left;

    //Step size.
    int step = imageWidth * bytesPerPixel;

    //Create a new "image".
    unsigned short* image = new unsigned short[totalPixels];
    for(int i=0; i<imageWidth; i++)
        for(int j=0; j<imageHeight; j++)
            image[j*imageWidth+i] = 10;

    //Allocate mem on device.
    Npp16u *dSrc, *dDst;
    Npp8u *dBuf;
    Npp32u bufferSize;

    //This call always returns a bufferSize==0.  That doesn't seem right...
    success1 = nppiFilterMedianGetBufferSize_16u_C1R(cudaROI, cudaMask, &bufferSize);
    std::cout << "get bufferSize returned: " << (int)success1 << std::endl;
    std::cout << bufferSize << std::endl;
    success2 = cudaMalloc( (void**)&dBuf, bufferSize);
    std::cout << "cudaMalloc 1 returned: " << (int)success2 << std::endl;
    success2 = cudaMalloc( (void**)&dSrc, totalPixels*sizeof(Npp16u));
    std::cout << "cudaMalloc 2 returned: " << (int)success2 << std::endl;
    success2 = cudaMalloc( (void**)&dDst, totalPixels*sizeof(Npp16u));
    std::cout << "cudaMalloc 3 returned: " << (int)success2 << std::endl;

    //Copy host image to device.
    success2 = cudaMemcpy( dSrc, image, totalPixels*sizeof(Npp16u), cudaMemcpyHostToDevice);
    std::cout << "cudaMemcpy 1 returned: " << (int)success2 << std::endl;


    //Copy source to destination.
    success1 = nppiCopy_16u_C1R( dSrc, step, dDst, step, imgSize);
    std::cout << "npp Copy 1 returned: " << (int)success1 << std::endl;


    //Filter.
#ifndef ANCHOR_CORNER
    Npp32u offset = top*imageWidth + left;
#else
    Npp32u offset = 0;
#endif
    success1 = nppiFilterMedian_16u_C1R(    dSrc + offset,
                                            step,
                                            dDst + offset,
                                            step,
                                            cudaROI, cudaMask, cudaAnchor, dBuf);
    std::cout << "npp Filter  returned: " << (int)success1 << std::endl;


    //Copy resultant back to host.
    success2 = cudaMemcpy( image, dDst, totalPixels*sizeof(Npp16u), cudaMemcpyDeviceToHost);
    std::cout << "cudaMemcpy 2 returned: " << (int)success2 << std::endl;

    //Clean.
    success2 = cudaFree(dDst);
    success2 = cudaFree(dBuf);
    success2 = cudaFree(dSrc);
    delete image;

    return 0;

}
$ nvcc -o t1225 t1225.cu -lnppi
$ cuda-memcheck ./t1225
========= CUDA-MEMCHECK
get bufferSize returned: 0
0
cudaMalloc 1 returned: 0
cudaMalloc 2 returned: 0
cudaMalloc 3 returned: 0
cudaMemcpy 1 returned: 0
npp Copy 1 returned: 0
npp Filter  returned: 0
cudaMemcpy 2 returned: 0
========= ERROR SUMMARY: 0 errors
$ nvcc -DANCHOR_CORNER -o t1225 t1225.cu -lnppi
$ cuda-memcheck ./t1225
========= CUDA-MEMCHECK
get bufferSize returned: 0
0
cudaMalloc 1 returned: 0
cudaMalloc 2 returned: 0
cudaMalloc 3 returned: 0
cudaMemcpy 1 returned: 0
npp Copy 1 returned: 0
npp Filter  returned: 0
cudaMemcpy 2 returned: 0
========= ERROR SUMMARY: 0 errors