cuda cross-correlation npp

NPP image cross correlation producing no valid results


I'm trying to implement an image displacement method using the NPP cross correlation library.

I have tried creating a simple test case by generating a couple of simple images in memory and writing the output to either Npp8u or Npp32f arrays. However, the cross correlation functions produce meaningless or invalid (i.e. NaN) results.

// Test program: builds two synthetic 8-bit images on the host (128x128 and
// 64x64, each zero-filled with a 20x20 block of 0xff), uploads them to the
// device, runs nppiCrossCorrValid_Norm_8u32f_C1R on them and copies the
// result back to the host.
// NOTE(review): `err` and `status` are assigned throughout but never inspected,
// and none of the host or device buffers are released before returning.
int main(int argc, char* argv[])
{
    Npp8u* gpuImg1, * gpuImg2;
    Npp32f *gpuDest;

    cudaDeviceInit(argc, (const char**)argv);

    // Source image: dataSize1 x dataSize1 single-channel bytes, zero background.
    long dataSize1 = 128;
    auto err = cudaMalloc((void**)&gpuImg1, dataSize1 * dataSize1 * sizeof(unsigned char));
    auto img1Data = static_cast<unsigned char*>(malloc(dataSize1 * dataSize1 * sizeof(unsigned char)));
    memset(img1Data, 0, dataSize1 * dataSize1);

    // Paint a 20x20 block of 0xff at rows 40..59, columns 20..39.
    for(auto y = 40; y < 60; y++)
    {
        for(auto x = 20; x < 40; x++)
        {
            img1Data[y * dataSize1 + x] = 0xff;
        }
    }

    // Template image: dataSize2 x dataSize2 bytes on the host.
    long dataSize2 = 64;
    // NOTE(review): this allocates only dataSize2 (64) bytes on the device,
    // while the host buffer below holds dataSize2 * dataSize2 bytes.
    err = cudaMalloc((void**)&gpuImg2, dataSize2);
    auto img2data = static_cast<unsigned char*>(malloc(dataSize2 * dataSize2 * sizeof(unsigned char)));
    memset(img2data, 0, dataSize2 * dataSize2);

    // Paint a 20x20 block of 0xff at rows 10..29, columns 20..39.
    for (auto y = 10; y < 30; y++)
    {
        for (auto x = 20; x < 40; x++)
        {
            img2data[y * dataSize2 + x] = 0xff;
        }
    }

    // Result is resSize x resSize floats, with resSize = 128 - 64 + 1 = 65.
    auto resSize = (dataSize1 - dataSize2) + 1;
    err = cudaMalloc((void**)&gpuDest, resSize * resSize * sizeof(Npp32f));
    auto resData = static_cast<Npp32f*>(malloc(resSize * resSize * sizeof(Npp32f)));

    NppiSize nppiSize1;
    nppiSize1.height = dataSize1;
    nppiSize1.width = dataSize1;

    NppiSize nppiSize2;
    nppiSize2.height = dataSize2;
    nppiSize2.width = dataSize2;

    // NOTE(review): the byte counts here are dataSize1 and dataSize2, i.e. one
    // image row each (128 and 64 bytes), not the full dataSize * dataSize image.
    err = cudaMemcpy(gpuImg1, img1Data, dataSize1, cudaMemcpyHostToDevice);
    err = cudaMemcpy(gpuImg2, img2data, dataSize2, cudaMemcpyHostToDevice);

    // Row steps are passed in bytes: dataSize1 / dataSize2 for the 8-bit inputs,
    // resSize * sizeof(Npp32f) for the float output.
    auto status = nppiCrossCorrValid_Norm_8u32f_C1R(gpuImg1, dataSize1, nppiSize1, gpuImg2, dataSize2, nppiSize2, gpuDest, resSize * sizeof(Npp32f));

    // NOTE(review): the byte count uses sizeof(Npp8u) although gpuDest and
    // resData hold Npp32f elements, so only a quarter of the result is copied.
    err = cudaMemcpy(resData, gpuDest, resSize * resSize * sizeof(Npp8u), cudaMemcpyDeviceToHost);

}

Both the CUDA methods and the NPP method are returning success messages so I'm pretty sure it is something that I am doing wrong with the setup of the cross correlation. Can anybody help push me in the right direction for a solution?


Solution

  • There appear to be 2 categories of problems with your code.

    First, you have various problems with data sizes. Not sure where the disconnect is here, since some of it you have right, so I'll just point out the things I see.

    This construct is correct:

    auto err = cudaMalloc((void**)&gpuImg1, dataSize1 * dataSize1 * sizeof(unsigned char));
    

    You should have done the same thing here:

    err = cudaMalloc((void**)&gpuImg2, dataSize2);
    

    These are not correct. cudaMemcpy, like memcpy and cudaMalloc, takes its size parameter in bytes, so each copy must cover the whole image (width * height * element size), not just one row:

    err = cudaMemcpy(gpuImg1, img1Data, dataSize1, cudaMemcpyHostToDevice);
    err = cudaMemcpy(gpuImg2, img2data, dataSize2, cudaMemcpyHostToDevice);
                                        ^^^^^^^^^
    

    You got it almost correct on the subsequent copy from device to host, except that your sizeof should use the type of the data actually being copied (Npp32f, not Npp8u):

    err = cudaMemcpy(resData, gpuDest, resSize * resSize * sizeof(Npp8u), cudaMemcpyDeviceToHost);
                                                                  ^^^^^
    

    Second, you are using the normalized version of the cross correlation. If you study the documentation, I believe you will find that it is possible for the denominator to be calculated as the square-root of zero, when large portions of your image are zero-valued. Anyway, when I convert the "background" from 0 to 1, I get sensible results. Another option would be to switch to the non-normalized version of the function (nppiCrossCorrValid_8u32f_C1R) which also yields non-NAN results, even with large areas of zero "background".

    Here is a corrected version, I think it will give you non-NAN results:

    # cat t14.cu
    #include <npp.h>
    #include <cstdlib>
    #include <iostream>
    #include <vector>
    // Terminates the program with a diagnostic if a CUDA runtime call failed.
    static void checkCuda(cudaError_t err, const char* what)
    {
        if (err != cudaSuccess)
        {
            std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
            std::exit(EXIT_FAILURE);
        }
    }

    // Terminates the program if an NPP call reported an error (negative status);
    // a positive status is an NPP warning and is only reported.
    static void checkNpp(NppStatus status, const char* what)
    {
        if (status < NPP_SUCCESS)
        {
            std::cerr << what << " failed with NPP status " << static_cast<int>(status) << std::endl;
            std::exit(EXIT_FAILURE);
        }
        if (status > NPP_SUCCESS)
        {
            std::cerr << what << " returned NPP warning " << static_cast<int>(status) << std::endl;
        }
    }

    // Cross-correlates a synthetic 64x64 template against a synthetic 128x128
    // image with nppiCrossCorrValid_Norm_8u32f_C1R and prints the first two rows
    // of the 65x65 float result as a comma-separated list.
    //
    // Both images use a background of 1 rather than 0: with an all-zero
    // background the normalisation denominator can become sqrt(0), which is what
    // produced NaN results in the original program.
    int main(int argc, char* argv[])
    {
       // cudaDeviceInit(argc, (const char**)argv);

        // Image edge lengths in pixels. They are kept as int because NppiSize
        // and the NPP row-step parameters are int.
        const int dataSize1 = 128;
        const int dataSize2 = 64;
        const int resSize = (dataSize1 - dataSize2) + 1;

        // Buffer sizes in bytes; cudaMalloc and cudaMemcpy both take byte counts.
        const size_t img1Bytes = static_cast<size_t>(dataSize1) * dataSize1 * sizeof(Npp8u);
        const size_t img2Bytes = static_cast<size_t>(dataSize2) * dataSize2 * sizeof(Npp8u);
        const size_t resCount = static_cast<size_t>(resSize) * resSize;
        const size_t resBytes = resCount * sizeof(Npp32f);

        // Host image 1: background 1, 20x20 block of 0xff at rows 40..59, cols 20..39.
        std::vector<Npp8u> img1Data(static_cast<size_t>(dataSize1) * dataSize1, 1);
        for (int y = 40; y < 60; y++)
        {
            for (int x = 20; x < 40; x++)
            {
                img1Data[y * dataSize1 + x] = 0xff;
            }
        }

        // Host image 2: background 1, 20x20 block of 0xff at rows 10..29, cols 20..39.
        std::vector<Npp8u> img2data(static_cast<size_t>(dataSize2) * dataSize2, 1);
        for (int y = 10; y < 30; y++)
        {
            for (int x = 20; x < 40; x++)
            {
                img2data[y * dataSize2 + x] = 0xff;
            }
        }

        std::vector<Npp32f> resData(resCount);

        Npp8u* gpuImg1 = nullptr;
        Npp8u* gpuImg2 = nullptr;
        Npp32f* gpuDest = nullptr;
        checkCuda(cudaMalloc((void**)&gpuImg1, img1Bytes), "cudaMalloc(gpuImg1)");
        checkCuda(cudaMalloc((void**)&gpuImg2, img2Bytes), "cudaMalloc(gpuImg2)");
        checkCuda(cudaMalloc((void**)&gpuDest, resBytes), "cudaMalloc(gpuDest)");

        NppiSize nppiSize1;
        nppiSize1.height = dataSize1;
        nppiSize1.width = dataSize1;

        NppiSize nppiSize2;
        nppiSize2.height = dataSize2;
        nppiSize2.width = dataSize2;

        checkCuda(cudaMemcpy(gpuImg1, img1Data.data(), img1Bytes, cudaMemcpyHostToDevice), "cudaMemcpy(gpuImg1)");
        checkCuda(cudaMemcpy(gpuImg2, img2data.data(), img2Bytes, cudaMemcpyHostToDevice), "cudaMemcpy(gpuImg2)");

        // Row steps are in bytes: one byte per pixel for the 8-bit inputs,
        // sizeof(Npp32f) per element for the float output.
        const int destStep = resSize * static_cast<int>(sizeof(Npp32f));
        checkNpp(nppiCrossCorrValid_Norm_8u32f_C1R(gpuImg1, dataSize1, nppiSize1, gpuImg2, dataSize2, nppiSize2, gpuDest, destStep),
                 "nppiCrossCorrValid_Norm_8u32f_C1R");

        checkCuda(cudaMemcpy(resData.data(), gpuDest, resBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(resData)");

        // Print the first two result rows (2 * resSize values).
        for (int i = 0; i < resSize*2; i++)
          std::cout << resData[i] << ",";
        std::cout << std::endl;

        checkCuda(cudaFree(gpuDest), "cudaFree(gpuDest)");
        checkCuda(cudaFree(gpuImg2), "cudaFree(gpuImg2)");
        checkCuda(cudaFree(gpuImg1), "cudaFree(gpuImg1)");
        return 0;
    }
    # nvcc -std=c++11 -o t14 t14.cu -lnppc -lnppist
    # cuda-memcheck ./t14
    ========= CUDA-MEMCHECK
    0.00796924,0.00796924,0.00796924,0.00796924,0.00796924,0.00796924,0.00796924,0.00796924,0.00796924,0.00796924,0.00796924,0.00796924,0.00796924,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00797587,0.00798853,0.00800826,0.00803633,0.00807432,0.00812423,0.00818861,0.00827071,0.00837505,0.00850754,0.00867648,0.00889385,0.00917761,0.00955609,0.0100771,0.0108291,0.0119988,0.0140744,0.0190166,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796926,0.00796926,0.00796926,0.00796926,0.00797588,0.00798854,0.00800827,0.00803634,0.00807434,0.00812425,0.00818863,0.00827071,0.00837505,0.00850754,0.00867648,0.00889385,0.00917761,0.00955609,0.0100771,0.0108291,0.0119988,0.0140744,0.0190166,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,
    ========= ERROR SUMMARY: 0 errors
    #