CUDA image upsampling with FFT method

I'm trying to do image upsampling with FFT in CUDA. I first do forward FFT on the image, then I pad the result with 0 as shown below:

for a transformed image:

1 2

3 4

Pad it to:

1 0 0 2

0 0 0 0

3 0 0 4

Then I do inverse fft and normalize and output the image. However the output seems problematic:

The original input image:

orignal input image

The output image:

output image

Below is my code:

using namespace cv;
using namespace std;

const int upsamplingFactor = 2;
const string inputName = "../input/4096x4096.jpg";

//convert uint8_t array to __half array
__global__
void Upsampling(
    cufftComplex *input,
    int inputWidth,
    int inputHeight,
    cufftComplex *output,
    int outputWidth,
    int outputHeight
) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= outputWidth || y >= outputHeight) 
    {
        return;
    }
    int endXLeft = inputWidth / 2;
    int startXRight = outputWidth - inputWidth / 2;
    int endYTop = inputHeight / 2;
    int startYBottom = outputHeight - inputHeight / 2;

    if( x < endXLeft && y < endYTop)
    {
        output[ y * outputWidth + x ] = input[ y * inputWidth + x ];
    } 
    else if( x >= startXRight && y < endYTop ) 
    {
        output[ y * outputWidth + x ] = input[ y * inputWidth + x - startXRight + endXLeft ];
    }
    else if( x < endXLeft && y >= startYBottom )
    {
        output[ y * outputWidth + x ] = input[ (y - startYBottom + endYTop) * inputWidth + x ];
    }
    else if( x >= startXRight && y >= startYBottom )
    {
        output[ y * outputWidth + x ] = input[ (y - startYBottom + endYTop) * inputWidth + x - startXRight + endXLeft ];
    }
    else {
        output[y * outputWidth + x] = make_cuComplex( 0.f, 0.f );
    }
}

//normalize and switch to uint8_t
__global__
void Normalize( 
    float *input, 
    uint8_t *output,
    int imageWidth,
    int imageHeight,
    int size
)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= imageWidth || y >= imageHeight) return;

    output[y * imageWidth + x] = static_cast<uint8_t>(input[y * imageWidth + x] / size + 0.5f);
}

int main(){

    //load input image
    cv::Mat image = cv::imread( inputName, cv::IMREAD_GRAYSCALE );
    if( image.empty() )
    {
        cerr<<"Failed to load image"<<std::endl;
        return EXIT_FAILURE;
    }
    image.convertTo(image,CV_32F);

    //set up params
    const int imageWidth        = image.cols;
    const int imageHeight       = image.rows;
    const size_t imageSize         = ( size_t ) imageWidth * imageHeight;
    const int upsamplingWidth = imageWidth * upsamplingFactor;
    const int upsamplingHeight = imageHeight * upsamplingFactor;
    const size_t upsamplingSize = upsamplingWidth * upsamplingHeight;

    //copy image and kernel to gpu
    float *d_image, *d_output;
    uint8_t *d_outputInt;
    cufftComplex *d_freqImage, *d_freqUpsampling;
    cufftHandle  forwardImagePlan, inverseImagePlan;
    cufftResult  result;

    cudaMalloc( &d_image, imageSize * sizeof(float) );
    cudaMalloc( &d_freqImage, imageSize * sizeof( cufftComplex ) );
    cudaMalloc( &d_freqUpsampling, upsamplingSize * sizeof( cufftComplex) );
    cudaMalloc( &d_output, upsamplingSize * sizeof(float) );
    cudaMalloc( &d_outputInt, upsamplingSize * sizeof(uint8_t) );
    cudaMemcpy( d_image, image.data, imageSize * sizeof(float), cudaMemcpyHostToDevice );
    
    dim3 convertBlockSize(32, 32);
    dim3 convertGridSize( ( upsamplingWidth + convertBlockSize.x - 1 ) / convertBlockSize.x,
                        ( upsamplingHeight + convertBlockSize.y - 1 ) / convertBlockSize.y );

    //creating 2d fft plans
    //invfft should have size after upsampling
    result = cufftPlan2d(&inverseImagePlan, upsamplingWidth, upsamplingHeight, CUFFT_C2R);
    if(result!=CUFFT_SUCCESS)
    {
        std::cerr << "Failed to create inverse FFT plan with code: " << result <<std::endl;
        return EXIT_FAILURE;
    }
    //forward fft should have the size of the original image
    result = cufftPlan2d(&forwardImagePlan, imageWidth, imageHeight, CUFFT_R2C);
    if(result!=CUFFT_SUCCESS){...}

    result = cufftExecR2C( forwardImagePlan, d_image, d_freqImage );
    if( result != CUFFT_SUCCESS ){...}

    Upsampling<<< convertGridSize, convertBlockSize >>>( d_freqImage, imageWidth, imageHeight, d_freqUpsampling, upsamplingWidth, upsamplingHeight );
    
    result = cufftExecC2R( inverseImagePlan, d_freqUpsampling, d_output );
    if( result != CUFFT_SUCCESS ){...}
    //normalize and switch output to uint8_t
    Normalize<<< convertGridSize, convertBlockSize >>>( d_output, d_outputInt, upsamplingWidth, upsamplingHeight, imageSize ); 

    cudaDeviceSynchronize();
    
    Mat output = Mat::zeros( upsamplingHeight, upsamplingWidth, CV_8U );
    cudaMemcpy( output.data, d_outputInt, upsamplingSize * sizeof( uint8_t ), cudaMemcpyDeviceToHost );

    //save output
    cv::imwrite( "fftOutput.png", output );

    //free memory
    cufftDestroy(forwardImagePlan);
    cufftDestroy(inverseImagePlan);
    cudaFree(d_image);
    cudaFree(d_freqImage);
    cudaFree(d_output);
    cudaFree(d_outputInt);
    cudaFree(d_freqUpsampling);
    return 0;
}

Is my padding strategy wrong? I think I implemented the global function correctly, maybe there's some problem in the code elsewhere that I missed. Thank you in advance!!

Solution

I haven't tried to find all the issues in your code. However, a R2C transform does not work the way you think (I suspect). In the 2D case, one of the image output dimensions is not the input dimension, but is the input dimension divided by two, plus one. You don't seem to have accounted for that in your d_freqImage allocation, which by itself is not a showstopper, but if you are then expecting your complex allocation to be completely filled with complex data via the forward FFT, you would be mistaken.

Speaking for myself, although of course it is possible to use R2C with sufficient effort, I would not try to use R2C and C2R transforms with this method to get a first article working/for understanding. The R2C and C2R transforms expect and make use of hermitian symmetry (with all that that implies in terms of data formatting) in the complex domain. (See below for added section covering R2C/C2R case).

I followed the description here and came up with this, which seems to work using C2C transforms:

#include "lodepng.h"
#include <iostream>
#include <vector>
#include <cufft.h>

//Encode from raw pixels to disk with a single function call
//The image argument has width * height RGBA pixels or width * height * 4 bytes
void encodeOneStep(const char* filename, std::vector<unsigned char>& image, unsigned width, unsigned height) {
  //Encode the image
  unsigned error = lodepng::encode(filename, image, width, height);

  //if there's an error, display it
  if(error) std::cout << "encoder error " << error << ": "<< lodepng_error_text(error) << std::endl;
}

void decodeOneStep(std::vector<unsigned char> &image, unsigned &width, unsigned &height, const char* filename) {

  //decode
  unsigned error = lodepng::decode(image, width, height, filename);

  //if there's an error, display it
  if(error) std::cout << "decoder error " << error << ": " << lodepng_error_text(error) << std::endl;

  //the pixels are now in the vector "image", 4 bytes per pixel, ordered RGBARGBA..., use it as texture, draw it, ...
}

template <typename T>
__global__ void split(const T *i, T *o, unsigned w, unsigned h, unsigned uf){

  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  int idy = threadIdx.y+blockDim.y*blockIdx.y;
  if ((idx < w) && (idy < h)){
    int oidx = (idx >= (w/2))?(idx+w*(uf-1)):idx;
    int oidy = (idy >= (h/2))?(idy+h*(uf-1)):idy;
    int iid = w*idy+idx;
    int oid = uf*w*oidy+oidx;
    o[oid] = i[iid];}
}

int main(){
  unsigned uf = 3; // upscaling factor
  const char *ifn = "test_i.png";
  const char *ofn = "test_o.png";
  // load input image
  std::vector<unsigned char> i;
  unsigned w, h;
  decodeOneStep(i, w, h, ifn);
  std::cout << "width: " << w << " height: " << h << std::endl;
  // convert to grayscale float normalized 255 = 1.0f
  std::vector<float> in_f(w*h*2);
  for (int j = 0; j < w*h; j++) in_f[j*2] = (0.299*i[4*j] + 0.587*i[4*j+1] + 0.114*i[4*j+2])/256.0;
  // copy to device
  cufftComplex *d_i;
  cudaMalloc(&d_i, w*h*sizeof(cufftComplex));
  cudaMemcpy(d_i, in_f.data(), w*h*sizeof(cufftComplex), cudaMemcpyHostToDevice);
  // perform forward FFT
  cufftHandle hf;
  cufftResult r = cufftPlan2d(&hf, h, w, CUFFT_C2C); 
  r = cufftExecC2C(hf, d_i, d_i, CUFFT_FORWARD);
  // perform data split to corners of upscale complex image
  cufftComplex *d_u; 
  cudaMalloc(&d_u, uf*uf*w*h*sizeof(cufftComplex));
  cudaMemset(d_u, 0, uf*uf*w*h*sizeof(cufftComplex));
  dim3 sblock(32,32);
  dim3 sgrid((w+sblock.x-1)/sblock.x, (h+sblock.y-1)/sblock.y);
  split<<<sgrid, sblock>>>(d_i, d_u, w, h, uf);
  // perform inverse FFT
  cufftHandle hi;
  r = cufftPlan2d(&hi, h*uf, w*uf, CUFFT_C2C); 
  r = cufftExecC2C(hi, d_u, d_u, CUFFT_INVERSE);
  // copy to host
  std::vector<float2> of(w*h*uf*uf);
  cudaMemcpy(of.data(), d_u, w*h*uf*uf*sizeof(float2), cudaMemcpyDeviceToHost);
  // convert to ordinary unsigned char RGBA image (as grayscale image)
  std::vector<unsigned char> o(w*h*uf*uf*4);
  for (int i = 0; i < w*h*uf*uf; i++) {
    if (of[i].x > w*h) of[i].x = w*h;  // clamp
    if (of[i].x < 0) of[i].x = 0;      // clamp
    unsigned char pix = (of[i].x/(w*h))*255;  //scale to unsigned char
    o[4*i]   = pix;  //RGBA for PNG image
    o[4*i+1] = pix;
    o[4*i+2] = pix;
    o[4*i+3] = 255;}
  // save output image
  encodeOneStep(ofn, o, w*uf, h*uf);
}

The lodepng stuff can be gotten here. The original input image:

came from here. This is what the output image looks like (3x larger dimensions for my example):

Compile with nvcc -o test example.cu lodepng.cpp -lcufft

EDIT/LATER: There was a comment/question posted (now deleted) asking effectively how to do this with R2C and C2R.

It seems that it is possible according to my testing. The basic difference is during the split operation, we will split only horizontally (split the complex data to top and bottom of upscaled complex image), not to all 4 corners. There are of course a number of other changes related to the mechanics of the R2C and C2R, and the special sizes for associated buffers.

Here's an example that seems to produce a similar output:

#include "lodepng.h"
#include <iostream>
#include <vector>
#include <cufft.h>

//Encode from raw pixels to disk with a single function call
//The image argument has width * height RGBA pixels or width * height * 4 bytes
void encodeOneStep(const char* filename, std::vector<unsigned char>& image, unsigned width, unsigned height) {
  //Encode the image
  unsigned error = lodepng::encode(filename, image, width, height);

  //if there's an error, display it
  if(error) std::cout << "encoder error " << error << ": "<< lodepng_error_text(error) << std::endl;
}

void decodeOneStep(std::vector<unsigned char> &image, unsigned &width, unsigned &height, const char* filename) {

  //decode
  unsigned error = lodepng::decode(image, width, height, filename);

  //if there's an error, display it
  if(error) std::cout << "decoder error " << error << ": " << lodepng_error_text(error) << std::endl;

  //the pixels are now in the vector "image", 4 bytes per pixel, ordered RGBARGBA..., use it as texture, draw it, ...
}

template <typename T>
__global__ void split(const T *i, T *o, unsigned w, unsigned h, unsigned uf){

  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  int idy = threadIdx.y+blockDim.y*blockIdx.y;
  if ((idx < (w/2+1)) && (idy < h)){
    int oidx = idx;
    int oidy = (idy >= (h/2))?(idy+h*(uf-1)):idy;
    int iid = (w/2+1)*idy+idx;
    int oid = ((uf*w)/2+1)*oidy+oidx;
    o[oid] = i[iid];}
}

int main(){
  unsigned uf = 3;
  const char *ifn = "test_i.png";
  const char *ofn = "test_o.png";
  // load input image
  std::vector<unsigned char> i;
  unsigned w, h;
  decodeOneStep(i, w, h, ifn);
  std::cout << "width: " << w << " height: " << h << std::endl;
  // convert to grayscale float normalized 255 = 1.0f
  std::vector<float> in_f((w+2)*h);
  for (int j = 0; j < h; j++)
    for (int k = 0; k < w; k++){ 
      int idx1 = j*(w+2)+k;
      int idx2 = j*w+k;
      int val = 0.299*i[4*idx2] + 0.587*i[4*idx2+1] + 0.114*i[4*idx2+2];
      if (val > 255) val = 255;
      in_f[idx1] = val/256.0;}
  // copy to device
  cufftComplex *d_i;
  cudaMalloc(&d_i, ((w/2)+1)*h*sizeof(cufftComplex));
  cudaMemcpy(d_i, in_f.data(), ((w/2)+1)*h*sizeof(cufftComplex), cudaMemcpyHostToDevice);
  // perform forward FFT
  cufftHandle hf;
  cufftResult r = cufftPlan2d(&hf, h, w, CUFFT_R2C); 
  r = cufftExecR2C(hf, (cufftReal *)d_i, d_i);
  // perform data split to corners of upscale complex image
  cufftComplex *d_u; 
  cudaMalloc(&d_u, uf*h*((uf*w)/2+1)*sizeof(cufftComplex));
  cudaMemset(d_u, 0, uf*h*((uf*w)/2+1)*sizeof(cufftComplex));
  dim3 sblock(32,32);
  dim3 sgrid((((w/2)+1)+sblock.x-1)/sblock.x, (h+sblock.y-1)/sblock.y);
  split<<<sgrid, sblock>>>(d_i, d_u, w, h, uf);
  // perform inverse FFT
  cufftHandle hi;
  r = cufftPlan2d(&hi, h*uf, w*uf, CUFFT_C2R); 
  r = cufftExecC2R(hi, d_u, (cufftReal *)d_u);
  std::vector<float> of((w*uf+2)*h*uf);
  cudaMemcpy(of.data(), d_u, (w*uf+2)*h*uf*sizeof(float), cudaMemcpyDeviceToHost);
  // convert to ordinary unsigned char RGBA image (as grayscale image)
  std::vector<unsigned char> o(w*h*uf*uf*4);
  for (int j = 0; j < h*uf; j++)
    for (int k = 0; k < w*uf; k++) {
      int idx1 = j*(w*uf)+k;
      int idx2 = j*(w*uf+2)+k;
      if (of[idx2] > w*h) of[idx2] = w*h;
      if (of[idx2] < 0) of[idx2] = 0;
      unsigned char pix = (of[idx2]/(w*h))*255;
      o[4*idx1]   = pix;
      o[4*idx1+1] = pix;
      o[4*idx1+2] = pix;
      o[4*idx1+3] = 255;}
  encodeOneStep(ofn, o, w*uf, h*uf);
}

Here is the generated output: