image-processingcudafftcufft

CUDA image upsampling with FFT method


I'm trying to do image upsampling with FFT in CUDA. I first do forward FFT on the image, then I pad the result with 0 as shown below:

for a transformed image:

1 2

3 4

Pad it to:

1 0 0 2

0 0 0 0

0 0 0 0

3 0 0 4

Then I do inverse fft and normalize and output the image. However the output seems problematic:

The original input image:

orignal input image

The output image:

output image

Below is my code:

using namespace cv;
using namespace std;

const int upsamplingFactor = 2;
const string inputName = "../input/4096x4096.jpg";

//convert uint8_t array to __half array
__global__
void Upsampling(
    cufftComplex *input,
    int inputWidth,
    int inputHeight,
    cufftComplex *output,
    int outputWidth,
    int outputHeight
) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= outputWidth || y >= outputHeight) 
    {
        return;
    }
    int endXLeft = inputWidth / 2;
    int startXRight = outputWidth - inputWidth / 2;
    int endYTop = inputHeight / 2;
    int startYBottom = outputHeight - inputHeight / 2;

    if( x < endXLeft && y < endYTop)
    {
        output[ y * outputWidth + x ] = input[ y * inputWidth + x ];
    } 
    else if( x >= startXRight && y < endYTop ) 
    {
        output[ y * outputWidth + x ] = input[ y * inputWidth + x - startXRight + endXLeft ];
    }
    else if( x < endXLeft && y >= startYBottom )
    {
        output[ y * outputWidth + x ] = input[ (y - startYBottom + endYTop) * inputWidth + x ];
    }
    else if( x >= startXRight && y >= startYBottom )
    {
        output[ y * outputWidth + x ] = input[ (y - startYBottom + endYTop) * inputWidth + x - startXRight + endXLeft ];
    }
    else {
        output[y * outputWidth + x] = make_cuComplex( 0.f, 0.f );
    }
}

//normalize and switch to uint8_t
__global__
void Normalize( 
    float *input, 
    uint8_t *output,
    int imageWidth,
    int imageHeight,
    int size
)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= imageWidth || y >= imageHeight) return;

    output[y * imageWidth + x] = static_cast<uint8_t>(input[y * imageWidth + x] / size + 0.5f);
}

int main(){

    //load input image
    cv::Mat image = cv::imread( inputName, cv::IMREAD_GRAYSCALE );
    if( image.empty() )
    {
        cerr<<"Failed to load image"<<std::endl;
        return EXIT_FAILURE;
    }
    image.convertTo(image,CV_32F);

    //set up params
    const int imageWidth        = image.cols;
    const int imageHeight       = image.rows;
    const size_t imageSize         = ( size_t ) imageWidth * imageHeight;
    const int upsamplingWidth = imageWidth * upsamplingFactor;
    const int upsamplingHeight = imageHeight * upsamplingFactor;
    const size_t upsamplingSize = upsamplingWidth * upsamplingHeight;

    //copy image and kernel to gpu
    float *d_image, *d_output;
    uint8_t *d_outputInt;
    cufftComplex *d_freqImage, *d_freqUpsampling;
    cufftHandle  forwardImagePlan, inverseImagePlan;
    cufftResult  result;

    cudaMalloc( &d_image, imageSize * sizeof(float) );
    cudaMalloc( &d_freqImage, imageSize * sizeof( cufftComplex ) );
    cudaMalloc( &d_freqUpsampling, upsamplingSize * sizeof( cufftComplex) );
    cudaMalloc( &d_output, upsamplingSize * sizeof(float) );
    cudaMalloc( &d_outputInt, upsamplingSize * sizeof(uint8_t) );
    cudaMemcpy( d_image, image.data, imageSize * sizeof(float), cudaMemcpyHostToDevice );
    
    dim3 convertBlockSize(32, 32);
    dim3 convertGridSize( ( upsamplingWidth + convertBlockSize.x - 1 ) / convertBlockSize.x,
                        ( upsamplingHeight + convertBlockSize.y - 1 ) / convertBlockSize.y );

    //creating 2d fft plans
    //invfft should have size after upsampling
    result = cufftPlan2d(&inverseImagePlan, upsamplingWidth, upsamplingHeight, CUFFT_C2R);
    if(result!=CUFFT_SUCCESS)
    {
        std::cerr << "Failed to create inverse FFT plan with code: " << result <<std::endl;
        return EXIT_FAILURE;
    }
    //forward fft should have the size of the original image
    result = cufftPlan2d(&forwardImagePlan, imageWidth, imageHeight, CUFFT_R2C);
    if(result!=CUFFT_SUCCESS){...}

    result = cufftExecR2C( forwardImagePlan, d_image, d_freqImage );
    if( result != CUFFT_SUCCESS ){...}

    Upsampling<<< convertGridSize, convertBlockSize >>>( d_freqImage, imageWidth, imageHeight, d_freqUpsampling, upsamplingWidth, upsamplingHeight );
    
    result = cufftExecC2R( inverseImagePlan, d_freqUpsampling, d_output );
    if( result != CUFFT_SUCCESS ){...}
    //normalize and switch output to uint8_t
    Normalize<<< convertGridSize, convertBlockSize >>>( d_output, d_outputInt, upsamplingWidth, upsamplingHeight, imageSize ); 

    cudaDeviceSynchronize();
    
    Mat output = Mat::zeros( upsamplingHeight, upsamplingWidth, CV_8U );
    cudaMemcpy( output.data, d_outputInt, upsamplingSize * sizeof( uint8_t ), cudaMemcpyDeviceToHost );

    //save output
    cv::imwrite( "fftOutput.png", output );

    //free memory
    cufftDestroy(forwardImagePlan);
    cufftDestroy(inverseImagePlan);
    cudaFree(d_image);
    cudaFree(d_freqImage);
    cudaFree(d_output);
    cudaFree(d_outputInt);
    cudaFree(d_freqUpsampling);
    return 0;
}

Is my padding strategy wrong? I think I implemented the global function correctly, maybe there's some problem in the code elsewhere that I missed. Thank you in advance!!


Solution

  • I haven't tried to find all the issues in your code. However, a R2C transform does not work the way you think (I suspect). In the 2D case, one of the image output dimensions is not the input dimension, but is the input dimension divided by two, plus one. You don't seem to have accounted for that in your d_freqImage allocation, which by itself is not a showstopper, but if you are then expecting your complex allocation to be completely filled with complex data via the forward FFT, you would be mistaken.

    Speaking for myself, although of course it is possible to use R2C with sufficient effort, I would not try to use R2C and C2R transforms with this method to get a first article working/for understanding. The R2C and C2R transforms expect and make use of hermitian symmetry (with all that that implies in terms of data formatting) in the complex domain. (See below for added section covering R2C/C2R case).

    I followed the description here and came up with this, which seems to work using C2C transforms:

    #include "lodepng.h"
    #include <iostream>
    #include <vector>
    #include <cufft.h>
    
    //Encode from raw pixels to disk with a single function call
    //The image argument has width * height RGBA pixels or width * height * 4 bytes
    void encodeOneStep(const char* filename, std::vector<unsigned char>& image, unsigned width, unsigned height) {
      //Encode the image
      unsigned error = lodepng::encode(filename, image, width, height);
    
      //if there's an error, display it
      if(error) std::cout << "encoder error " << error << ": "<< lodepng_error_text(error) << std::endl;
    }
    
    void decodeOneStep(std::vector<unsigned char> &image, unsigned &width, unsigned &height, const char* filename) {
    
      //decode
      unsigned error = lodepng::decode(image, width, height, filename);
    
      //if there's an error, display it
      if(error) std::cout << "decoder error " << error << ": " << lodepng_error_text(error) << std::endl;
    
      //the pixels are now in the vector "image", 4 bytes per pixel, ordered RGBARGBA..., use it as texture, draw it, ...
    }
    
    template <typename T>
    __global__ void split(const T *i, T *o, unsigned w, unsigned h, unsigned uf){
    
      int idx = threadIdx.x+blockDim.x*blockIdx.x;
      int idy = threadIdx.y+blockDim.y*blockIdx.y;
      if ((idx < w) && (idy < h)){
        int oidx = (idx >= (w/2))?(idx+w*(uf-1)):idx;
        int oidy = (idy >= (h/2))?(idy+h*(uf-1)):idy;
        int iid = w*idy+idx;
        int oid = uf*w*oidy+oidx;
        o[oid] = i[iid];}
    }
    
    int main(){
      unsigned uf = 3; // upscaling factor
      const char *ifn = "test_i.png";
      const char *ofn = "test_o.png";
      // load input image
      std::vector<unsigned char> i;
      unsigned w, h;
      decodeOneStep(i, w, h, ifn);
      std::cout << "width: " << w << " height: " << h << std::endl;
      // convert to grayscale float normalized 255 = 1.0f
      std::vector<float> in_f(w*h*2);
      for (int j = 0; j < w*h; j++) in_f[j*2] = (0.299*i[4*j] + 0.587*i[4*j+1] + 0.114*i[4*j+2])/256.0;
      // copy to device
      cufftComplex *d_i;
      cudaMalloc(&d_i, w*h*sizeof(cufftComplex));
      cudaMemcpy(d_i, in_f.data(), w*h*sizeof(cufftComplex), cudaMemcpyHostToDevice);
      // perform forward FFT
      cufftHandle hf;
      cufftResult r = cufftPlan2d(&hf, h, w, CUFFT_C2C); 
      r = cufftExecC2C(hf, d_i, d_i, CUFFT_FORWARD);
      // perform data split to corners of upscale complex image
      cufftComplex *d_u; 
      cudaMalloc(&d_u, uf*uf*w*h*sizeof(cufftComplex));
      cudaMemset(d_u, 0, uf*uf*w*h*sizeof(cufftComplex));
      dim3 sblock(32,32);
      dim3 sgrid((w+sblock.x-1)/sblock.x, (h+sblock.y-1)/sblock.y);
      split<<<sgrid, sblock>>>(d_i, d_u, w, h, uf);
      // perform inverse FFT
      cufftHandle hi;
      r = cufftPlan2d(&hi, h*uf, w*uf, CUFFT_C2C); 
      r = cufftExecC2C(hi, d_u, d_u, CUFFT_INVERSE);
      // copy to host
      std::vector<float2> of(w*h*uf*uf);
      cudaMemcpy(of.data(), d_u, w*h*uf*uf*sizeof(float2), cudaMemcpyDeviceToHost);
      // convert to ordinary unsigned char RGBA image (as grayscale image)
      std::vector<unsigned char> o(w*h*uf*uf*4);
      for (int i = 0; i < w*h*uf*uf; i++) {
        if (of[i].x > w*h) of[i].x = w*h;  // clamp
        if (of[i].x < 0) of[i].x = 0;      // clamp
        unsigned char pix = (of[i].x/(w*h))*255;  //scale to unsigned char
        o[4*i]   = pix;  //RGBA for PNG image
        o[4*i+1] = pix;
        o[4*i+2] = pix;
        o[4*i+3] = 255;}
      // save output image
      encodeOneStep(ofn, o, w*uf, h*uf);
    }
    

    The lodepng stuff can be gotten here. The original input image:

    input image

    came from here. This is what the output image looks like (3x larger dimensions for my example):

    output image

    Compile with nvcc -o test example.cu lodepng.cpp -lcufft

    EDIT/LATER: There was a comment/question posted (now deleted) asking effectively how to do this with R2C and C2R.

    It seems that it is possible according to my testing. The basic difference is during the split operation, we will split only horizontally (split the complex data to top and bottom of upscaled complex image), not to all 4 corners. There are of course a number of other changes related to the mechanics of the R2C and C2R, and the special sizes for associated buffers.

    Here's an example that seems to produce a similar output:

    #include "lodepng.h"
    #include <iostream>
    #include <vector>
    #include <cufft.h>
    
    //Encode from raw pixels to disk with a single function call
    //The image argument has width * height RGBA pixels or width * height * 4 bytes
    void encodeOneStep(const char* filename, std::vector<unsigned char>& image, unsigned width, unsigned height) {
      //Encode the image
      unsigned error = lodepng::encode(filename, image, width, height);
    
      //if there's an error, display it
      if(error) std::cout << "encoder error " << error << ": "<< lodepng_error_text(error) << std::endl;
    }
    
    void decodeOneStep(std::vector<unsigned char> &image, unsigned &width, unsigned &height, const char* filename) {
    
      //decode
      unsigned error = lodepng::decode(image, width, height, filename);
    
      //if there's an error, display it
      if(error) std::cout << "decoder error " << error << ": " << lodepng_error_text(error) << std::endl;
    
      //the pixels are now in the vector "image", 4 bytes per pixel, ordered RGBARGBA..., use it as texture, draw it, ...
    }
    
    template <typename T>
    __global__ void split(const T *i, T *o, unsigned w, unsigned h, unsigned uf){
    
      int idx = threadIdx.x+blockDim.x*blockIdx.x;
      int idy = threadIdx.y+blockDim.y*blockIdx.y;
      if ((idx < (w/2+1)) && (idy < h)){
        int oidx = idx;
        int oidy = (idy >= (h/2))?(idy+h*(uf-1)):idy;
        int iid = (w/2+1)*idy+idx;
        int oid = ((uf*w)/2+1)*oidy+oidx;
        o[oid] = i[iid];}
    }
    
    int main(){
      unsigned uf = 3;
      const char *ifn = "test_i.png";
      const char *ofn = "test_o.png";
      // load input image
      std::vector<unsigned char> i;
      unsigned w, h;
      decodeOneStep(i, w, h, ifn);
      std::cout << "width: " << w << " height: " << h << std::endl;
      // convert to grayscale float normalized 255 = 1.0f
      std::vector<float> in_f((w+2)*h);
      for (int j = 0; j < h; j++)
        for (int k = 0; k < w; k++){ 
          int idx1 = j*(w+2)+k;
          int idx2 = j*w+k;
          int val = 0.299*i[4*idx2] + 0.587*i[4*idx2+1] + 0.114*i[4*idx2+2];
          if (val > 255) val = 255;
          in_f[idx1] = val/256.0;}
      // copy to device
      cufftComplex *d_i;
      cudaMalloc(&d_i, ((w/2)+1)*h*sizeof(cufftComplex));
      cudaMemcpy(d_i, in_f.data(), ((w/2)+1)*h*sizeof(cufftComplex), cudaMemcpyHostToDevice);
      // perform forward FFT
      cufftHandle hf;
      cufftResult r = cufftPlan2d(&hf, h, w, CUFFT_R2C); 
      r = cufftExecR2C(hf, (cufftReal *)d_i, d_i);
      // perform data split to corners of upscale complex image
      cufftComplex *d_u; 
      cudaMalloc(&d_u, uf*h*((uf*w)/2+1)*sizeof(cufftComplex));
      cudaMemset(d_u, 0, uf*h*((uf*w)/2+1)*sizeof(cufftComplex));
      dim3 sblock(32,32);
      dim3 sgrid((((w/2)+1)+sblock.x-1)/sblock.x, (h+sblock.y-1)/sblock.y);
      split<<<sgrid, sblock>>>(d_i, d_u, w, h, uf);
      // perform inverse FFT
      cufftHandle hi;
      r = cufftPlan2d(&hi, h*uf, w*uf, CUFFT_C2R); 
      r = cufftExecC2R(hi, d_u, (cufftReal *)d_u);
      std::vector<float> of((w*uf+2)*h*uf);
      cudaMemcpy(of.data(), d_u, (w*uf+2)*h*uf*sizeof(float), cudaMemcpyDeviceToHost);
      // convert to ordinary unsigned char RGBA image (as grayscale image)
      std::vector<unsigned char> o(w*h*uf*uf*4);
      for (int j = 0; j < h*uf; j++)
        for (int k = 0; k < w*uf; k++) {
          int idx1 = j*(w*uf)+k;
          int idx2 = j*(w*uf+2)+k;
          if (of[idx2] > w*h) of[idx2] = w*h;
          if (of[idx2] < 0) of[idx2] = 0;
          unsigned char pix = (of[idx2]/(w*h))*255;
          o[4*idx1]   = pix;
          o[4*idx1+1] = pix;
          o[4*idx1+2] = pix;
          o[4*idx1+3] = 255;}
      encodeOneStep(ofn, o, w*uf, h*uf);
    }
    

    Here is the generated output:

    enter image description here