cperformanceopenclopencl-c

Why is fp32 performance only two percent of theoretical maximum of my RTX 2070 in my OpenCL program?


I've written a few OpenCL programs which perform integer arithmetic, and I usually manage to get somewhere near, sometimes more, than the theoretical maximum 32 bit 'flops' of the GPU for 32 bit integer operations.

I recently started writing a program which uses a lot of floating point arithmetic, and I couldn't get it anywhere near the theoretical maximum performance of my RTX2070 which is 8.3 Tflops. I've written a simple benchmarking program which can be found here https://github.com/FastAsChuff/OpenCL-Floating-Point-Benchmark/tree/main. I can test the 64 bit floating point performance just by changing float to double and the double precision performance numbers are approximately what I expected.

I'm not looking for a code review of my simple OpenCL implementation. I extracted this from my other program which is much more verbose. It just has enough to run the kernel that does the computation. Since the 64 bit result are as expected, I just can't see that there's anything wrong with what I've done, but if I have, I would be most grateful if someone can explain what is going on here. I updated the Nvidia driver from 470 to 535 but the performance number were actually slightly worse not better. The code is below. Thanks.



#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <stdint.h>
#include <sys/time.h>

//gcc f64oclshort.c -o f64oclshort.bin -lOpenCL -O3 -march=native -Wall

#define F64TEST2_PIXELDIM 61440
#define KERNEL_COUNT 1
#define MAX_PLATFORMS 10
#define MAX_DEVICES 25
#define NAMES_LENGTH 255
#define CL_TARGET_OPENCL_VERSION 120
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

char* oclkernel_names[] = {"getfgcount"};
char* oclkernels[] = {"\
__kernel void getfgcount(__global unsigned long* counts) { \
  __private FLOATTYPE zr, zi, zi0, temp;\
  __private unsigned long x,y,i,count = 0;\
  y = get_global_id(0) + DIM0*(get_global_id(1) + DIM1*get_global_id(2));\
  zi0 = IMAGSTART + (IMAGEND - IMAGSTART)*y/(PIXELDIM-1);\
  for (x=0; x<PIXELDIM; x++) {\
    zr = REALSTART + (REALEND - REALSTART)*x/(PIXELDIM-1);\
    zi = zi0;\
    for (i=0; i<MAXITERATIONS; i++) {\
      temp = zr*zr - zi*zi + REALCONST;\
      zi = 2*zr*zi + IMAGCONST;\
      zr = temp;\
    }\
    count += ((zi*zi + zr*zr) < 1000.0f);\
  }\
  counts[y] = count;\
    }"};

void printf_cl_error(cl_int res) {
  if (res == CL_INVALID_MEM_OBJECT) printf("CL_INVALID_MEM_OBJECT\n");
  if (res == CL_INVALID_SAMPLER) printf("CL_INVALID_SAMPLER\n");
  if (res == CL_INVALID_KERNEL) printf("CL_INVALID_KERNEL\n");
  if (res == CL_INVALID_ARG_INDEX) printf("CL_INVALID_ARG_INDEX\n");
  if (res == CL_INVALID_ARG_VALUE) printf("CL_INVALID_ARG_VALUE\n");
  if (res == CL_INVALID_ARG_SIZE) printf("CL_INVALID_ARG_SIZE\n");
  if (res == CL_INVALID_COMMAND_QUEUE) printf("CL_INVALID_COMMAND_QUEUE\n");
  if (res == CL_INVALID_CONTEXT) printf("CL_INVALID_CONTEXT\n");
  if (res == CL_INVALID_MEM_OBJECT) printf("CL_INVALID_MEM_OBJECT\n");
  if (res == CL_INVALID_VALUE) printf("CL_INVALID_VALUE\n");
  if (res == CL_INVALID_EVENT_WAIT_LIST) printf("CL_INVALID_EVENT_WAIT_LIST\n");
  if (res == CL_MEM_OBJECT_ALLOCATION_FAILURE) printf("CL_MEM_OBJECT_ALLOCATION_FAILURE\n");
  if (res == CL_OUT_OF_HOST_MEMORY) printf("CL_OUT_OF_HOST_MEMORY\n");
  if (res == CL_INVALID_PROGRAM_EXECUTABLE) printf("CL_INVALID_PROGRAM_EXECUTABLE\n");
  if (res == CL_INVALID_KERNEL_ARGS) printf("CL_INVALID_KERNEL_ARGS\n");
  if (res == CL_INVALID_WORK_DIMENSION) printf("CL_INVALID_WORK_DIMENSION\n");
  if (res == CL_INVALID_GLOBAL_WORK_SIZE) printf("CL_INVALID_GLOBAL_WORK_SIZE\n");
  if (res == CL_INVALID_WORK_GROUP_SIZE) printf("CL_INVALID_WORK_GROUP_SIZE\n");
  if (res == CL_INVALID_WORK_ITEM_SIZE) printf("CL_INVALID_WORK_ITEM_SIZE\n");
  if (res == CL_INVALID_GLOBAL_OFFSET) printf("CL_INVALID_GLOBAL_OFFSET\n");
  if (res == CL_OUT_OF_RESOURCES) printf("CL_OUT_OF_RESOURCES\n");
  if (res == CL_INVALID_OPERATION) printf("CL_INVALID_OPERATION\n");
  if (res == CL_BUILD_PROGRAM_FAILURE) printf("CL_BUILD_PROGRAM_FAILURE\n");
  if (res == CL_COMPILER_NOT_AVAILABLE) printf("CL_COMPILER_NOT_AVAILABLE\n");
  if (res == CL_INVALID_BUILD_OPTIONS) printf("CL_INVALID_BUILD_OPTIONS\n");
  if (res == CL_INVALID_BINARY) printf("CL_INVALID_BUILD_OPTIONS\n");
  if (res == CL_INVALID_DEVICE) printf("CL_INVALID_DEVICE\n");
  if (res != CL_SUCCESS) {
    printf("OpenCL Failed With Error Code %i\n", res);
    exit(1);
  }
}

typedef struct {
               time_t      tv_sec;     /* seconds */
               suseconds_t tv_usec;    /* microseconds */
           } timeval_t;

int64_t tstampmsec() {
  timeval_t timeval;
  gettimeofday((struct timeval * restrict)&timeval, 0);
  return timeval.tv_sec*1000LL + timeval.tv_usec/1000;
}   

int main(int argc, char* argv[]) {
  printf("This program counts the number of foreground pixels in a large Julia set fractal image, without actually creating the image. It is to benchmark floating point arithmetic performance of an OpenCL device.\nAuthor: Simon Goater August 2024\n\n");
// Use float, double, or half below if supported.
  char* floattype = "float";
  int64_t progstart, progend;
  int32_t i,j;
  uint64_t maxiterations = 200;
  uint64_t dim[3], dimlocal[3];
  char text[NAMES_LENGTH];
  dim[0] = 192;
  dim[2] = 64;
  dim[1] = 1 + (F64TEST2_PIXELDIM/(dim[2]*dim[0]));
  dimlocal[0] = dim[0];
  dimlocal[1] = 1;
  dimlocal[2] = 1;
  char ocloptions[512];
  sprintf(ocloptions, "-D FLOATTYPE=%s -D DIM0=%lu -D DIM1=%lu -D DIM2=%lu -D PIXELDIM=%u -D MAXITERATIONS=%lu -D REALSTART=-2.0 -D REALEND=2.0 -D IMAGSTART=-2.0 -D IMAGEND=2.0 -D REALCONST=-0.003 -D IMAGCONST=0.647 ", floattype, dim[0], dim[1], dim[2], F64TEST2_PIXELDIM, maxiterations);
  
  cl_int res;
  cl_uint platformCount = 0;
  cl_uint deviceCount = 0;
  _Bool platformchosen = false;
  _Bool devicechosen = false;  
  unsigned int platformno = 0; // Choose Platform No.
  unsigned int deviceno = 0;  // Choose Device No.
  cl_platform_id platform;
  cl_device_id device;  
  printf_cl_error(clGetPlatformIDs(MAX_PLATFORMS, NULL, &platformCount));
  platformCount = (platformCount > MAX_PLATFORMS ? MAX_PLATFORMS : platformCount);
  printf("Detected %i OpenCL Platforms.\n", platformCount);
  if (platformCount < 1) exit(1);
  cl_platform_id* platforms = malloc(sizeof(cl_platform_id) * platformCount);  
  printf_cl_error(clGetPlatformIDs(platformCount, platforms, NULL));
  for (i=0; i<platformCount; i++) {
    printf_cl_error(clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount));
    printf_cl_error(clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, NAMES_LENGTH, (void *)text, NULL));
    printf("Querying Platform No. %i - %s.\n", i, text);    
    deviceCount = (deviceCount > MAX_DEVICES ? MAX_DEVICES : deviceCount);
    if (i == platformno) {
      platform = platforms[i];
      platformchosen = true;
    }
    printf("Detected %i Devices In Platform.\n", deviceCount);
    if (deviceCount > 0) {
      cl_device_id* devices = malloc(deviceCount*sizeof(cl_device_id));
      printf_cl_error(clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL));
      for (j=0; j<deviceCount; j++) {
        printf_cl_error(clGetDeviceInfo(devices[j], CL_DEVICE_NAME, NAMES_LENGTH, text, NULL));
        printf("  Device No. %i - %s", j, text);
        if ((i == platformno) && (j == deviceno)) {
          device = devices[j];
          printf("    Selected.\n");
          devicechosen = true;
        } else {
          printf("\n");
        }
      }
      free(devices);
    }
  }
  free(platforms);
  if (!platformchosen || !devicechosen) {
    printf("No Platform/Device chosen.\n");
    printf("This program runs on one and only one device. Please edit platformno/deviceno to include OpenCL device.\n");
    exit(1);
  }
  cl_context ContextId = clCreateContext(NULL, 1, &device, NULL, NULL, &res);
  printf_cl_error(res);
  size_t kernel_strlens[KERNEL_COUNT];
  for (i = 0; i<KERNEL_COUNT; i++) kernel_strlens[i] = strlen(oclkernels[i]);
  cl_program ProgramId = clCreateProgramWithSource(ContextId, KERNEL_COUNT, (const char **)oclkernels, (const size_t*)kernel_strlens, &res);
  printf_cl_error(res);
  printf_cl_error(clBuildProgram(ProgramId, 1, &device, ocloptions, NULL, NULL));
  uint64_t yrange = dim[0]*dim[1]*dim[2];
  cl_mem counts_mem_obj = clCreateBuffer(ContextId, CL_MEM_WRITE_ONLY, yrange*sizeof(unsigned long), NULL, &res);
  printf_cl_error(res);
  cl_kernel kernel = clCreateKernel(ProgramId, oclkernel_names[0], &res);
  printf_cl_error(res);
  printf_cl_error(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&counts_mem_obj));
  cl_command_queue CommandQueueId = clCreateCommandQueue(ContextId, device, 0, &res);
  printf_cl_error(res);
  unsigned long count = 0;  
  unsigned long *counts = malloc(yrange*sizeof(unsigned long));
  printf("Executing Kernel. Please Wait...\n");
  progstart = tstampmsec();
  printf_cl_error(clEnqueueNDRangeKernel(CommandQueueId, kernel, 3, NULL, (const size_t *)dim, (const size_t *)dimlocal, 0, NULL, NULL));
  printf_cl_error(clEnqueueReadBuffer(CommandQueueId, counts_mem_obj, CL_TRUE, 0, yrange*sizeof(unsigned long), (void *)counts, 0, NULL, NULL));
  progend = tstampmsec();
  for (uint64_t y=0; y<yrange; y++) count += counts[y];  
  printf("FG Pixel Count = %lu / %lu\n", count, yrange*F64TEST2_PIXELDIM);
  if (progend > progstart) printf("Estimated %s performance = %f Gflops\n", floattype, 7*maxiterations*yrange*F64TEST2_PIXELDIM/(1000000.0f*(progend - progstart)));
  printf("Kernel Duration = %li msecs\n", progend - progstart);
}

Output:-

Detected 1 OpenCL Platforms.
Querying Platform No. 0 - NVIDIA CUDA.
Detected 2 Devices In Platform.
  Device No. 0 - NVIDIA GeForce RTX 2070    Selected.
  Device No. 1 - NVIDIA GeForce GT 730
Executing Kernel. Please Wait...
FG Count = 29283274 / 4529848320
Estimated float performance = 144.285660 Gflops
Kernel Duration = 43953 msecs

Detected 1 OpenCL Platforms.
Querying Platform No. 0 - NVIDIA CUDA.
Detected 2 Devices In Platform.
  Device No. 0 - NVIDIA GeForce RTX 2070    Selected.
  Device No. 1 - NVIDIA GeForce GT 730
Executing Kernel. Please Wait...
FG Count = 29280703 / 4529848320
Estimated double performance = 165.064743 Gflops
Kernel Duration = 38420 msecs

After swapping out my RTX2070 for my old Tesla K20X, I got 464.123810 Gflops for fp32 and 870.049072 Gflops for fp64.


Solution

  • The performance issue comes from implicit casts to double-precision since the variable set in the command line are double-precision numbers. You can fix this with:

    -D REALSTART=-2.0f -D REALEND=2.0f -D IMAGSTART=-2.0f -D IMAGEND=2.0f -D REALCONST=-0.003f -D IMAGCONST=0.647f
    

    Please note the f at the end (so the constants are of type float instead of double).


    By the way, unrolling your main hot loop might also help a bit (certainly at least twice if not automatically done by the OpenCL vendor compiler though Nvidia generally does a pretty good job for that). Pre-computing 1.f / (PIXELDIM-1) might help a bit since divisions are very expensive though it is not in the main hot loop.