c opencl opencl-c

Access violation reading location with OpenCL for high n values


I have to program the Floyd algorithm using OpenCL. It works fine, but only for n < 268. When n >= 268, I get an "Access violation reading location" error when calling clEnqueueWriteBuffer (the buffer_distances one, in the loop).

Here is my code:

graphe is the adjacency matrix, and distances is the distance matrix.

    // Read the matrix dimension n from standard input.
    // NOTE(review): the return value of scanf is not checked, so n is
    // indeterminate if the input is not an integer.
    int n;
    printf("enter n value: ");
    scanf("%d", &n);
    printf("\n");
    // Number of cells in an n x n matrix stored as one flat array.
    int n2 = n * n;
    // Byte count that the later snippets pass to clCreateBuffer and to
    // clEnqueueWriteBuffer / clEnqueueReadBuffer for both matrices.
    // NOTE(review): this is scaled by sizeof(int*), whereas the two host
    // arrays just below are allocated with sizeof(int) per element.
    int matSize = n2 * sizeof(int*);
    int* graphe = malloc(sizeof(int) * n2);
    int* distances = malloc(sizeof(int) * n2);
    //mat[i,j] => mat[i*n + j]
    // Only graphe is tested for NULL, and execution continues after the
    // message; distances is used without a check.
    if (graphe == NULL)
        printf("malloc failed\n");
    // init_graphe and copy are defined elsewhere; presumably they fill the
    // adjacency matrix and duplicate it into distances -- confirm against
    // their definitions.
    init_graphe(graphe, n);
    copy(graphe, distances, n);

Initialization of the OpenCL variables:


    // Kernel source text for kernel.cl; load_kernel is defined elsewhere.
    char* programSource = load_kernel("kernel.cl");

    // Receives the error code of each OpenCL call below. It is overwritten by
    // every call and never inspected in this snippet.
    cl_int status;
    // STEP 1: Discover and initialize the platforms

    cl_uint numPlatforms = 0;

    cl_platform_id* platforms = NULL;

    // First call only asks how many platforms exist.
    status = clGetPlatformIDs(0, NULL, &numPlatforms);
    printf("Number of platforms = %d\n", numPlatforms);

    platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));

    // Second call fills the freshly allocated array.
    status = clGetPlatformIDs(numPlatforms, platforms, NULL);

    // Scratch buffer reused for the platform name and then for each device name.
    char Name[1000];
    // Only the first platform (platforms[0]) is used from here on.
    clGetPlatformInfo(platforms[0], CL_PLATFORM_NAME, sizeof(Name), Name, NULL);
    printf("Name of platform : %s\n", Name);
    fflush(stdout);

    // STEP 2: Discover and initialize the devices

    cl_uint numDevices = 0;
    cl_device_id* devices = NULL;

    // Same two-call pattern as for platforms: count first, then fetch.
    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);


    printf("Number of devices = %d\n", (int)numDevices);

    devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));

    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);


    // Print the name of every device found on the first platform.
    for (int i = 0; i < numDevices; i++) {
        clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(Name), Name, NULL);
        printf("Name of device %d: %s\n\n", i, Name);
    }

    // STEP 3: Create a context
    fflush(stdout);

    cl_context context = NULL;

    // The context spans all discovered devices.
    context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);

    // STEP 4: Create a command queue
    fflush(stdout);
    cl_command_queue cmdQueue;

    // The queue targets devices[0] only, with no queue properties set.
    cmdQueue = clCreateCommandQueue(context, devices[0], 0, &status);

    // STEP 5: Create device buffers
    fflush(stdout);

    cl_mem buffer_graphe;
    cl_mem buffer_n;
    cl_mem buffer_distances;
    cl_mem buffer_k;

    // No host pointer is supplied, so the buffers start without contents; the
    // execution loop uploads into them. The two matrix buffers are matSize
    // bytes each; n and k each get a one-int buffer.
    buffer_graphe = clCreateBuffer(context, CL_MEM_READ_WRITE, matSize, NULL, &status);
    buffer_n = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int), NULL, &status);
    buffer_distances = clCreateBuffer(context, CL_MEM_READ_WRITE, matSize, NULL, &status);
    buffer_k = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int), NULL, &status);
    fflush(stdout);

    // STEP 6: Create and compile the program
    // A single NUL-terminated source string (lengths argument is NULL).
    cl_program program = clCreateProgramWithSource(context, 1, (const char**)&programSource, NULL, &status);
    printf("Compilation\n");
    fflush(stdout);
    // Build for every device, with no build options and no completion callback.
    status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);

    // STEP 8: Create the kernel
    cl_kernel kernel = NULL;
    fflush(stdout);
    // Looks up the kernel function named "floyd" in the built program.
    kernel = clCreateKernel(program, "floyd", &status);

    // One work-item per matrix cell: an n x n two-dimensional range.
    size_t globalWorkSize[2] = { n, n };
    // Three elements with two initializers, so the third is zero. The launch
    // in the execution loop passes NULL for the local size, so this array is
    // not used there.
    size_t localWorkSize[3] = { 20,20 };

Execution of the kernel:

    // Clock reading taken before the loop; start is not read again within
    // this snippet.
    clock_t start = clock();
    int k;
    // One kernel launch per intermediate vertex k.
    for (k = 0; k < n; k++) {
        // Blocking (CL_TRUE) uploads, repeated on every iteration: both
        // matrices (matSize bytes each) and the two scalars n and k.
        status = clEnqueueWriteBuffer(cmdQueue, buffer_graphe, CL_TRUE, 0, matSize, graphe, 0, NULL, NULL);
        status = clEnqueueWriteBuffer(cmdQueue, buffer_n, CL_TRUE, 0, sizeof(int), &n, 0, NULL, NULL);
        status = clEnqueueWriteBuffer(cmdQueue, buffer_distances, CL_TRUE, 0, matSize, distances, 0, NULL, NULL);
        status = clEnqueueWriteBuffer(cmdQueue, buffer_k, CL_TRUE, 0, sizeof(int), &k, 0, NULL, NULL);

        // Bind the four buffers to the kernel parameters in declaration order:
        // graphe, n, distances, k.
        status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&buffer_graphe);
        status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&buffer_n);
        status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&buffer_distances);
        status = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&buffer_k);

        // Two-dimensional launch over globalWorkSize. The local work size is
        // NULL, which leaves the work-group size to the OpenCL implementation.
        status = clEnqueueNDRangeKernel(cmdQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
        clFinish(cmdQueue);  

        // Blocking read of the updated distances back into the host array,
        // which is uploaded again at the top of the next iteration.
        // status is overwritten by each call above and never inspected.
        status = clEnqueueReadBuffer(cmdQueue, buffer_distances, CL_TRUE, 0, matSize, distances, 0, NULL, NULL);
        clFinish(cmdQueue);
    }

and the kernel:

/*
 * One relaxation step of the Floyd algorithm for a fixed intermediate
 * vertex *k. Each work-item handles one cell (i, j) of the flat, row-major
 * *n x *n matrix `distances`, where i and j are the global ids in
 * dimensions 0 and 1.
 *
 * graphe    - not read by this kernel.
 * n         - pointer to the matrix dimension.
 * distances - matrix updated in place: cell (i, j) is replaced by
 *             distances(i, k) + distances(k, j) when that sum is smaller.
 * k         - pointer to the intermediate vertex index.
 */
void kernel floyd(global int* graphe, global int* n, global int* distances, global int* k)
{
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int dim = *n;
    const int pivot = *k;

    /* Flat offsets of (row, col), (row, pivot) and (pivot, col). */
    const int target = row * dim + col;
    const int viaRow = row * dim + pivot;
    const int viaCol = pivot * dim + col;

    /* Nothing to do unless the route through the pivot is strictly shorter. */
    if (!(distances[viaRow] + distances[viaCol] < distances[target])) {
        return;
    }

    distances[target] = distances[viaRow] + distances[viaCol];
}


Solution

  • You have:

    int matSize = n2 * sizeof(int*);
    …
    int* distances = malloc(sizeof(int) * n2);
    …
    status = clEnqueueWriteBuffer(cmdQueue, buffer_distances, CL_TRUE, 0, matSize, distances, 0, NULL, NULL);
    

    The bug is the use of sizeof(int*): you have an array of ints, not an array of pointers, so this should be sizeof(int), which is what you correctly use in the malloc calls. (I can't quite fathom why you're not using matSize there.) On a typical 64-bit build, sizeof(int*) is 8 while sizeof(int) is 4, so matSize is twice the size of the allocation and clEnqueueWriteBuffer reads past the end of distances. That said, what you should probably be using is cl_int, or one of the explicitly sized types (int32_t in this case), because types in OpenCL kernels have very specific definitions which may or may not match those in host C code.

    Additional Notes: