opencl fft fpga intel-fpga

Getting NaN values from an OpenCL FFT kernel on an FPGA


I am trying to use Intel's FFT1D kernel on an Intel FPGA with a host program that I wrote myself. A link to Intel's FFT1D example can be found here.

My host program is given below. I have a saved file that contains some data; the program's task is to read that data, calculate its FFT, and print some of the results. It is a 4K-point FFT.

 #include <stdio.h>
    #include <stdlib.h>
#include "CL/opencl.h"
#include "AOCLUtils/aocl_utils.h"
#include <string.h>
#include "fft_config.h"


#define N (1<<LOGN) //Please check the FFT Sample Code for Ref (2 to the power 12 gives 4K points)
#define DATA_FILE "complex_input.data"

using namespace aocl_utils;


// OpenCL handles shared by main(), init() and cleanup().
// All of them start out NULL so cleanup() can tell which ones were created.
cl_platform_id   platform  = NULL;
cl_device_id     device    = NULL;
cl_context       context   = NULL;
cl_command_queue queue0    = NULL;  // queue used for kernel0 and the result read-back
cl_command_queue queue1    = NULL;  // queue used for kernel1 and the input upload
cl_program       program   = NULL;
cl_kernel        kernel0   = NULL;  // created from the "fft1d" kernel
cl_kernel        kernel1   = NULL;  // created from the "fetch" kernel
cl_mem           d_inData  = NULL;  // device-side input buffer
cl_mem           d_outData = NULL;  // device-side output buffer
cl_int           err       = 0;     // status of the most recent OpenCL call

// One complex sample: x is printed as the real part and y as the imaginary
// part ("%f + %f j") by main().
// NOTE(review): the layout presumably has to match the float2 type used by
// the device kernels - confirm against the kernel source.
typedef struct {
  float x;
  float y;
} float2;

// Host-side input and output arrays of N complex samples each. They are
// allocated with alignedMalloc() rather than declared as plain arrays.
// NOTE(review): the results of these allocations are not checked here.
float2 *h_inData = (float2 *)alignedMalloc(sizeof(float2)*N);
float2 *h_outData = (float2 *)alignedMalloc(sizeof(float2)*N);

void init();    // Finds the platform and device, then creates the context, both command queues, the program and the two kernels.
void cleanup(); // Releases the kernels, program, command queues, device buffers and context that were created.
void read_data();   // Reads interleaved float pairs from DATA_FILE into h_inData[i].x / h_inData[i].y.
// Passed to kernel0 as argument 1; presumably the iteration count (main() sizes the fetch launch for one iteration).
int temp_value = 1;

/*
 * Host entry point.
 *
 * Sets up OpenCL via init(), loads N complex samples with read_data(),
 * uploads them to the device, launches the "fft1d" kernel (kernel0) as a
 * single task on queue0 and the "fetch" kernel (kernel1) as an ND-range on
 * queue1, waits for both queues, reads the result back and prints a few
 * input/output pairs.
 *
 * Returns 0 on success. Any OpenCL failure is reported through checkError().
 */
int main()
{
  int inverse = false;  // passed as argument 2 of kernel0; false selects the forward transform

  // The host buffers are allocated by file-scope initialisers; make sure
  // those allocations succeeded before anything dereferences them.
  if (h_inData == NULL || h_outData == NULL)
  {
    printf("Failed to allocate the host buffers\n");
    exit(1);
  }

  init();
  read_data();

  d_inData = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float2)*N, NULL, &err);
  checkError(err,"Failed to allocate Buffer for input array\n");

  d_outData = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_CHANNEL_2_INTELFPGA, sizeof(float2)*N, NULL, &err);
  checkError(err, "Failed to allocate the Buffer for output\n");

  // Blocking upload of the input samples to the device.
  err = clEnqueueWriteBuffer(queue1,d_inData, CL_TRUE, 0, sizeof(float2)*N, h_inData, 0, NULL, NULL);
  checkError(err,"Failed to Write the input Buffer\n");

  // kernel1 takes the input buffer.
  err = clSetKernelArg(kernel1, 0, sizeof(cl_mem), (void *)&d_inData);
  checkError(err, "Failed to set KerArg for Kernel1 - 0\n");

  // kernel0 takes the output buffer, temp_value and the inverse flag.
  err = clSetKernelArg(kernel0, 0, sizeof(cl_mem), (void *)&d_outData);
  checkError(err, "Failed to set KerArg for Kernel0 - 0\n");

  err = clSetKernelArg(kernel0, 1, sizeof(cl_int), (void *)&temp_value);
  checkError(err, "Failed to set KerArg for Kernel0 - 1\n");

  err = clSetKernelArg(kernel0, 2, sizeof(cl_int), (void *)&inverse);
  checkError(err, "Failed to set KerArg for Kernel0 - 2\n");

  printf("FFT Initialization Complete!\n\n");

  // The two kernels are enqueued on separate queues.
  err = clEnqueueTask(queue0, kernel0, 0, NULL, NULL);
  checkError(err, "Failed to Launch the Kernel for FFT\n");

  const size_t local_work_size = N/8;
  const size_t global_work_size = local_work_size * 1; // one iteration only

  // clEnqueueNDRangeKernel expects the global size before the local size.
  err = clEnqueueNDRangeKernel(queue1, kernel1, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
  checkError(err, "Failed to launch the Fetch Kernel\n");

  err = clFinish(queue0);
  checkError(err, "Failed to finish FFT\n");
  err = clFinish(queue1);
  checkError(err, "Failed to finish Fetch kernel\n");

  // Blocking read of the transformed samples back to the host.
  err = clEnqueueReadBuffer(queue0, d_outData, CL_TRUE, 0, sizeof(float2)*N, h_outData, 0, NULL, NULL);
  checkError(err, "Failed to Read back the Buffer output\n");

  printf("FFT is Complete!\n\n");
  printf("Printing some of the values, just to make sure they are non-zero\n\n");
  for(int ii=100;ii<125;ii++)
  {
        printf("%f + %f j -> %f + %f j\n",h_inData[ii].x,h_inData[ii].y,h_outData[ii].x,h_outData[ii].y);
  }
  printf("\n\n");
  cleanup();

  return 0;
}



/*
 * Loads the FFT input from DATA_FILE into h_inData.
 *
 * The file is read as raw binary floats laid out as interleaved pairs:
 * float 2*i goes to h_inData[i].x and float 2*i+1 to h_inData[i].y.
 * Exactly 2*N floats are consumed; any extra data in the file is ignored.
 *
 * The program exits with status 1 if the file cannot be opened, if the
 * temporary buffer cannot be allocated, or if the file holds fewer than
 * 2*N floats. A short file would otherwise leave uninitialised values in
 * the input, which can show up as NaNs in the FFT output.
 */
void read_data()
{
  const size_t needed = 2 * (size_t)N;  // one x and one y float per sample

  // "rb": the data are raw floats, so no text-mode translation may be applied.
  FILE *fp = fopen(DATA_FILE,"rb");
  if(fp==NULL)
  {
    printf("Could not find the Random Data File! Exiting!\n");
    exit(1);
  }

  float *temp = (float *)malloc(needed * sizeof(float));
  if(temp==NULL)
  {
    printf("Could not allocate memory for the input data! Exiting!\n");
    fclose(fp);
    exit(1);
  }

  // fread counts elements, not bytes: ask for exactly the floats we will use.
  const size_t got = fread(temp, sizeof(float), needed, fp);
  fclose(fp);

  if(got != needed)
  {
    printf("Data file %s holds only %zu of the %zu floats required! Exiting!\n",
           DATA_FILE, got, needed);
    free(temp);
    exit(1);
  }

  for(size_t i=0;i<(size_t)N;i++)
  {
    h_inData[i].x = temp[2*i];
    h_inData[i].y = temp[(2*i)+1];
  }

  free(temp);
}


void init()
{
  platform = findPlatform("Intel(R) FPGA SDK for OpenCL(TM)");
  if(platform == NULL)
  {
    printf("Could not find the Platform\n");
    exit(1);
  }


  scoped_array<cl_device_id> devices;
  cl_uint num_devices;
  devices.reset(getDevices(platform, CL_DEVICE_TYPE_ACCELERATOR, &num_devices));
  device = devices[0];

  context = clCreateContext(NULL, 1, &device, &oclContextCallback, NULL, &err);
  checkError(err, "Failed to create Context\n");

  queue0 = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
  checkError(err, "Failed to create Command Queue0\n");
  queue1 = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
  checkError(err, "Failed to create Command Queue1\n");


  program = createProgramFromBinary(context, "bin/fft1d.aocx", &device, 1);

  err = clBuildProgram(program, 1, &device, "", NULL, NULL);
  checkError(err, "Failed to Build Program\n");


  kernel0 = clCreateKernel(program, "fft1d", &err);
  checkError(err,"Could not Create Kernel0\n");

  kernel1 = clCreateKernel(program, "fetch", &err);
  checkError(err, "Could not Create Kernel1\n");

  printf("Finished with the Initial Setup!\n");
}

void cleanup()
{
  if(kernel0)
    clReleaseKernel(kernel0);
  if(kernel1)
    clReleaseKernel(kernel1);
  if(program)
    clReleaseProgram(program);
  if(queue0)
    clReleaseCommandQueue(queue0);
  if(queue1)
    clReleaseCommandQueue(queue1);
  if(d_inData)
    clReleaseMemObject(d_inData);
  if(d_outData)
    clReleaseMemObject(d_outData);
  if(context)
    clReleaseContext(context);


}

I checked whether the data from the file is being read correctly, and it is correct and as expected.

Please let me know where this could be going wrong!


Solution

  • Update!

    I found the solution. Reading the data from the file itself was not a good idea here. I tried generating the random data during execution instead, and it worked just fine!