Tags: c, opencl, nvidia, opencl-c

Why doesn't the OpenCL kernel execute even though there are no errors? (C, NVIDIA, Kubuntu)


I'm learning OpenCL, and for some reason the kernel does nothing:

#include <stdlib.h>
#include <stdio.h>

#define CL_TARGET_OPENCL_VERSION 300
#include <CL/cl.h>

/* Status code of the most recent OpenCL call; CHECKERR treats any nonzero value as failure. */
int err = 0;

/* Report on stderr the source line at which a check failed. */
#define PRINTERR() fprintf(stderr, "Error at line %d.\n", __LINE__)

/*
 * CHECKERR(x):    if x is nonzero, report the current line and return that
 *                 line number from the enclosing function.
 * CHECKNOTERR(x): the same, but triggers when x is zero.
 *
 * Both deliberately expand to a braced `if` (not do { } while (0)) so that
 * call sites written without a trailing semicolon keep compiling. Do not
 * follow an invocation directly with `else`.
 * The argument of CHECKNOTERR is parenthesized so that compound expressions
 * such as CHECKNOTERR(a || b) negate the whole expression, not just `a`.
 */
#define CHECKERR(x) if (x) { PRINTERR(); return __LINE__; }
#define CHECKNOTERR(x) if (!(x)) { PRINTERR(); return __LINE__; }

/*
 * OpenCL C source of the `square` kernel, compiled at run time.
 * Each work-item whose global id i is below `count` stores
 * input[i] * input[i] into output[i]; work-item 0 additionally prints "test".
 * Adjacent string literals are concatenated by the compiler, so no line
 * continuations are required.
 */
const char *KernelSource =
    "__kernel void square(                                                  \n"
    "   __global float* input,                                              \n"
    "   __global float* output,                                             \n"
    "   const unsigned int count)                                           \n"
    "{                                                                      \n"
    "   int i = get_global_id(0);                                           \n"
    "   if(i == 0) printf(\"test\\n\");                                     \n"
    "   if(i < count)                                                       \n"
    "       output[i] = input[i] * input[i];                                \n"
    "}                                                                      \n";


#define DATA_SIZE 1024
int main(){
    float data[DATA_SIZE];
    float results[DATA_SIZE];

    size_t global;
    size_t local;

    cl_platform_id platform_id;
    cl_device_id device_id;

    cl_context context;
    cl_command_queue commands;
    cl_program program;
    cl_kernel kernel;

    cl_mem input;
    cl_mem output;

    unsigned int i = 0;
    unsigned int count = DATA_SIZE;
    for(i = 0; i < count; ++i)
        //data[i] = rand() / (float)RAND_MAX;
        data[i] = 2.f;


    int gpu = 1;
    err = clGetPlatformIDs (1, &platform_id, NULL); CHECKERR(err)
    err = clGetDeviceIDs(platform_id, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); CHECKERR(err)

    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &err); CHECKERR(!context)
    commands = clCreateCommandQueueWithProperties(context, device_id, NULL, &err); CHECKERR(err)


    input = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * count, NULL, &err); CHECKERR(err)
    output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, &err); CHECKERR(err)
    CHECKERR(!input || !output)

    err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL); CHECKERR(err)


    program = clCreateProgramWithSource(context, 1, &KernelSource, NULL, &err); CHECKERR(err)
    err = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); CHECKERR(err)
    kernel = clCreateKernel(program, "square", &err); CHECKERR(err)

    err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
    err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);
    CHECKERR(err)


    err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); CHECKERR(err)
    err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); CHECKERR(err)
    err = clEnqueueReadBuffer(commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL ); CHECKERR(err)

    clFlush(commands);
    clFinish(commands);


    unsigned int correct = 0;
    for(i = 0; i < count; ++i)
        printf("%f\n",results[i]);


    printf("Computed '%d/%d' correct values!\n", correct, count);


    // free
    clReleaseMemObject(input);
    clReleaseMemObject(output);
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);

    return 0;
}

I want it to do things, but it doesn't.

I tried reading back the input buffer instead of the output buffer, and that works fine. The printf in the kernel prints nothing, and when I run the program, clEnqueueReadBuffer on the output buffer returns only zeros. I have an AMD CPU, so I can't test it on the CPU. I tried another example and it worked (the one here). Help appreciated.


Solution

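  • `global` is never initialized (here it happened to be 0), so clEnqueueNDRangeKernel is asked to launch zero work-items and the kernel runs 0 times. Set `global = count` (rounded up to a multiple of `local`) before enqueueing the kernel.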