Access violation reading location in OpenCL with large n values

I have to implement the Floyd algorithm using OpenCL. It works fine, but only for n < 268. When n >= 268, I get an "Access violation reading location" error when calling clEnqueueWriteBuffer (the buffer_distances one, in the loop).

Here is my code:

graphe is an adjacency matrix, and distances is the distances matrix

    // Host-side setup: read the matrix dimension n from stdin and allocate two
    // flat n*n int arrays (graphe and distances).
    // The return value of scanf is not checked.
    int n;
    printf("enter n value: ");
    scanf("%d", &n);
    printf("\n");
    // Number of elements in an n x n matrix.
    int n2 = n * n;
    // NOTE(review): this byte count uses sizeof(int*), whereas the two malloc
    // calls below size the host arrays with sizeof(int). The two values differ
    // on any platform where a pointer is wider than an int.
    int matSize = n2 * sizeof(int*);
    int* graphe = malloc(sizeof(int) * n2);
    int* distances = malloc(sizeof(int) * n2);
    // Flat indexing convention: mat[i,j] => mat[i*n + j]
    // Only graphe is tested for NULL (distances is not), and execution
    // continues after the message is printed.
    if (graphe == NULL)
        printf("malloc failed\n");
    // init_graphe and copy are defined elsewhere; presumably they fill graphe
    // and copy it into distances -- confirm against their definitions.
    init_graphe(graphe, n);
    copy(graphe, distances, n);

Initialization of the OpenCL variables:


    // OpenCL host initialization: pick platform 0, enumerate its devices,
    // create a context and a command queue, allocate device buffers, build the
    // program and create the "floyd" kernel.
    // load_kernel is defined elsewhere; presumably it returns the text of
    // kernel.cl as a NUL-terminated string -- confirm.
    char* programSource = load_kernel("kernel.cl");

    // status is overwritten by every call below and is never inspected in this
    // snippet, so API failures go unnoticed here.
    cl_int status;
    // STEP 1: Discover and initialize the platforms

    cl_uint numPlatforms = 0;

    cl_platform_id* platforms = NULL;

    // First call (0, NULL) only retrieves the platform count.
    status = clGetPlatformIDs(0, NULL, &numPlatforms);
    printf("Number of platforms = %d\n", numPlatforms);

    platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));

    // Second call fills the freshly allocated array with the platform IDs.
    status = clGetPlatformIDs(numPlatforms, platforms, NULL);

    // Scratch buffer reused for the platform name and later for device names.
    char Name[1000];
    // Only platforms[0] is used from here on; the case numPlatforms == 0 is
    // not handled.
    clGetPlatformInfo(platforms[0], CL_PLATFORM_NAME, sizeof(Name), Name, NULL);
    printf("Name of platform : %s\n", Name);
    fflush(stdout);

    // STEP 2: Discover and initialize the devices

    cl_uint numDevices = 0;
    cl_device_id* devices = NULL;

    // Same two-call pattern as for platforms: query the count first.
    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);


    printf("Number of devices = %d\n", (int)numDevices);

    devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));

    // Then retrieve the device IDs themselves.
    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);


    // Print the name of every device found on platform 0.
    for (int i = 0; i < numDevices; i++) {
        clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(Name), Name, NULL);
        printf("Name of device %d: %s\n\n", i, Name);
    }

    // STEP 3: Create a context
    fflush(stdout);

    cl_context context = NULL;

    // The context spans all numDevices devices of platform 0.
    context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);

    // STEP 4: Create a command queue
    fflush(stdout);
    cl_command_queue cmdQueue;

    // The queue targets devices[0] only, with no queue properties set (0).
    cmdQueue = clCreateCommandQueue(context, devices[0], 0, &status);

    // STEP 5: Create device buffers
    fflush(stdout);

    cl_mem buffer_graphe;
    cl_mem buffer_n;
    cl_mem buffer_distances;
    cl_mem buffer_k;

    // The two matrix buffers are matSize bytes each; n and k each get their
    // own sizeof(int) buffer. No host pointer is supplied at creation time.
    buffer_graphe = clCreateBuffer(context, CL_MEM_READ_WRITE, matSize, NULL, &status);
    buffer_n = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int), NULL, &status);
    buffer_distances = clCreateBuffer(context, CL_MEM_READ_WRITE, matSize, NULL, &status);
    buffer_k = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int), NULL, &status);
    fflush(stdout);

    // STEP 6: Create and compile the program
    cl_program program = clCreateProgramWithSource(context, 1, (const char**)&programSource, NULL, &status);
    printf("Compilation\n");
    fflush(stdout);
    // Built for all devices, with no build options and no completion callback.
    status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);

    // STEP 8: Create the kernel
    cl_kernel kernel = NULL;
    fflush(stdout);
    kernel = clCreateKernel(program, "floyd", &status);

    // Two-dimensional global range of n x n.
    size_t globalWorkSize[2] = { n, n };
    // NOTE(review): declared with three slots and only two initializers; the
    // clEnqueueNDRangeKernel call in the execution loop passes NULL for the
    // local size, so this array appears to be unused.
    size_t localWorkSize[3] = { 20,20 };

Execution of the kernel:

    // Host driver loop: one kernel launch per value of k, from 0 to n-1.
    // start records the clock() value before the loop begins.
    clock_t start = clock();
    int k;
    for (k = 0; k < n; k++) {
        // All four buffers are re-uploaded on every iteration with blocking
        // writes (CL_TRUE). The two matrix transfers read matSize bytes from
        // graphe and distances.
        // NOTE(review): those host arrays were allocated with
        // sizeof(int) * n2 bytes, while matSize was computed with sizeof(int*).
        status = clEnqueueWriteBuffer(cmdQueue, buffer_graphe, CL_TRUE, 0, matSize, graphe, 0, NULL, NULL);
        status = clEnqueueWriteBuffer(cmdQueue, buffer_n, CL_TRUE, 0, sizeof(int), &n, 0, NULL, NULL);
        status = clEnqueueWriteBuffer(cmdQueue, buffer_distances, CL_TRUE, 0, matSize, distances, 0, NULL, NULL);
        status = clEnqueueWriteBuffer(cmdQueue, buffer_k, CL_TRUE, 0, sizeof(int), &k, 0, NULL, NULL);

        // Bind the four cl_mem handles to kernel arguments 0..3.
        status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&buffer_graphe);
        status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&buffer_n);
        status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&buffer_distances);
        status = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&buffer_k);

        // 2-D launch over globalWorkSize. The local work size argument is
        // NULL, which leaves the work-group size to the OpenCL implementation.
        status = clEnqueueNDRangeKernel(cmdQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
        clFinish(cmdQueue);  

        // Blocking read of the updated distances back into the host array,
        // again transferring matSize bytes.
        status = clEnqueueReadBuffer(cmdQueue, buffer_distances, CL_TRUE, 0, matSize, distances, 0, NULL, NULL);
        clFinish(cmdQueue);
    }

And the kernel:

// One relaxation step of the Floyd algorithm for a single intermediate
// vertex *k. Each work-item handles one (i, j) cell of the flat, row-major
// distances matrix with row length *n.
//
//   graphe    - passed in but not read by this kernel
//   n         - pointer to the matrix dimension
//   distances - matrix updated in place
//   k         - pointer to the current intermediate vertex index
//
// NOTE(review): there is no bounds check on i or j, so the kernel relies on
// the global range being exactly n x n.
void kernel floyd(global int* graphe, global int* n, global int* distances, global int* k)
{
    // Row and column of the cell owned by this work-item.
    int i = get_global_id(0);
    int j = get_global_id(1);

    // Flat indices of cells (i, j), (i, k) and (k, j).
    int ij = i * (*n) + j;
    int ik = i * (*n) + (*k);
    int kj = (*k) * (*n) + j;

    // Keep the shorter of the current i->j distance and the path through k.
    // NOTE(review): distances[ik] and distances[kj] are cells that other
    // work-items in the same launch own and may write; confirm this is safe.
    if (distances[ik] + distances[kj] < distances[ij]) {
        distances[ij] = distances[ik] + distances[kj];
    }
}

Solution

You have:

int matSize = n2 * sizeof(int*);
…
int* distances = malloc(sizeof(int) * n2);
…
status = clEnqueueWriteBuffer(cmdQueue, buffer_distances, CL_TRUE, 0, matSize, distances, 0, NULL, NULL);

Say n2 is 100.
matSize will be 800 on a 64-bit system. (sizeof(int*) = 8)
You allocate 400 bytes of memory for your distances array. (sizeof(int) = 4, typically)
You then copy 800 bytes (matSize) from distances into your OpenCL buffer. This overflows the end of the array. Whoops.

The bug is of course the use of sizeof(int*): you've got an array of ints, not an array of pointers, so this should be sizeof(int), which is what you're correctly doing in the malloc call. (I can't quite fathom why you're not using matSize there.) Although what you should probably be using is cl_int, or one of the explicitly-sized types (int32_t in this case), because types in OpenCL kernels have very specific definitions which may or may not match those in host C code.

Additional Notes:

I'm not 100% convinced your data dependencies are safe here. No work-item should be reading an array entry that another is writing in the same kernel enqueueing batch. It seems to me that ij (written) for one of the work-items will be equal to ik (read) for the others in the row? Similar deal with ij and kj.
There's no need to read and re-write the distances buffer between iterations, if you're not modifying it on the host. Neither does graphe need re-writing every time if it's not changing.
You can pass scalar arguments such as k and n without a buffer. status = clSetKernelArg(kernel, 1, sizeof(n), &n); works fine if you change your kernel signature's argument to int n. (no dereference needed in the kernel then.)
A local work size of 20x20 is likely not optimal. If you're not using local memory or barriers, don't bother setting a local size at all.
You should be able to remove the clFinish calls, and you can change the buffer writes to be non-blocking once you've moved them outside the loop. This might give you an additional small speedup.