CUDA- Invalid global write of size 4

I have implemented the Mandelbrot set in Cuda. When I input the height and width present in the attached code I get this error by running the cuda-memcheck command. What is it caused by? I believe that it may be due to an overflow error of the index index of the output result vector, but I would not understand why I would have the error in this case, and when I input other values, this does not happen (for example, when i have a height= 16384 * 4 and width=8192 * 4). Thanks everyone for your time.


#include <iostream> 
    
static void
writePPM( int *buf, int width, int height, int Max_Iterations, const char *fn) {
    FILE *fp = fopen(fn, "wb");
    fprintf(fp, "P6\n");
    fprintf(fp, "%d %d\n", width, height);
    fprintf(fp, "255\n");
    for (int i = 0; i < width*height; ++i) {
        // Map the iteration count to colors by just alternating between
        // two greys.
        char c = (buf[i]== Max_Iterations) ? char(255): 20;
        for (int j = 0; j < 3; ++j)
            fputc(c, fp);
    }
    fclose(fp);
    printf("Wrote image file %s\n", fn);
}

__device__ static int mandel(float c_re, float c_im, int count) {
   
   
   float z_re = c_re, z_im = c_im;
    int i;
    for (i = 0; i < count; ++i) {
        if (z_re * z_re + z_im * z_im > 4.f)
            break;

        float new_re = z_re*z_re - z_im*z_im;
        float new_im = 2.f * z_re * z_im;
        z_im = c_im + new_im;
        z_re = c_re + new_re;
      
    }

    return i;
}


__global__ void kernel (float x0, float y0, float x1, float y1,
                       int width, int height, int maxIterations,
                       int  *output)
{

    int w= blockIdx.x*blockDim.x+threadIdx.x;
    int h= blockIdx.y*blockDim.y+threadIdx.y; 

    float dx =(x1 - x0) / width;
    float dy =(y1 - y0) / height;
    
    if (h<height && w<width) {
        //for (int i = 0; i < width; ++i) {
            float x = x0 + w * dx;
            float y = y0 + h * dy;
        int index= (width*h+w);
            output[index] = mandel(x, y, maxIterations);
        //}
    }
}



int main(){
    

    unsigned int width =16384*8;
    unsigned int height=8192*8;
        float x0 = -2;
        float x1 = 1;
        float myy0 = -1;
        float myy1 = 1;
        
    uint32_t maxIterations = 1024; 
    size_t THREADSPERBLOCK = 1024;
    size_t THREADSPERBLOCK_X = 256;
    size_t THREADSPERBLOCK_Y = THREADSPERBLOCK / THREADSPERBLOCK_X;
    
    
         
    

    int *buf_h= (int *)malloc(width * height * sizeof(unsigned int));
    int *buf= (int *) malloc (width *height *sizeof(unsigned int)); 

    
    int num_blocks_x= (width + THREADSPERBLOCK_X-1)/THREADSPERBLOCK_X;
    int num_blocks_y=(height + THREADSPERBLOCK_Y-1)/THREADSPERBLOCK_Y;
    
    cudaEvent_t start, stop;
    float streamElapsedTime;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord( start, 0 );

    
    int *buf_d;
    int *buff;
    
    cudaMalloc ((void **) & buf_d,width * height * sizeof(unsigned int));
    cudaMalloc ((void**) & buff, width *height * sizeof(unsigned int)); 

    
    cudaMemcpy (buf_d,buf_h,width * height * sizeof(unsigned int),cudaMemcpyHostToDevice);

    dim3 gridDims (num_blocks_x,num_blocks_y);
    dim3 blockDims(THREADSPERBLOCK_X,THREADSPERBLOCK_Y);


        
    kernel<<<gridDims,blockDims>>>(x0,myy0,x1,myy1,width,height,maxIterations,buf_d);

    cudaMemcpy( buf,buf_d, width*height*sizeof(unsigned int), cudaMemcpyDeviceToHost );
    writePPM(buf, width, height,maxIterations, "mandelbrot-parallel.ppm");  
    cudaEventRecord( stop, 0);
    cudaEventSynchronize( stop);
    cudaEventElapsedTime( &streamElapsedTime, start, stop );
    cudaEventDestroy( start);
    cudaEventDestroy( stop );
    
    cudaFree(buf_d);
    cudaFree(buff); 

    free(buf_h);
    free(buf);

    printf("\nCUDA stream elapsed time:  %f", streamElapsedTime);

    
    return 0;
}**strong text**

Running with cuda-memcheck I have this error reported several times:

Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2fe) [0x28ccce]
=========     Host Frame:./Mandelbrot [0x1d3eb]
=========     Host Frame:./Mandelbrot [0x3a63e]
=========     Host Frame:./Mandelbrot [0x678c]
=========     Host Frame:./Mandelbrot [0x6655]
=========     Host Frame:./Mandelbrot [0x66cd]
=========     Host Frame:./Mandelbrot [0x63af]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
=========     Host Frame:./Mandelbrot [0x5efa]
=========
========= Invalid __global__ write of size 4
=========     at 0x00000310 in kernel(float, float, float, float, int, int, int, int*)
=========     by thread (176,3,0) in block (92,0,0)
=========     Address 0x001972c0 is out of bounds
=========     Device Frame:kernel(float, float, float, float, int, int, int, int*) (kernel(float, float, float, float, int, int, int, int*) : 0x310)
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2fe) [0x28ccce]
=========     Host Frame:./Mandelbrot [0x1d3eb]
=========     Host Frame:./Mandelbrot [0x3a63e]
=========     Host Frame:./Mandelbrot [0x678c]
=========     Host Frame:./Mandelbrot [0x6655]
=========     Host Frame:./Mandelbrot [0x66cd]
=========     Host Frame:./Mandelbrot [0x63af]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
=========     Host Frame:./Mandelbrot [0x5efa]
=========

At the end:

 Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaEventSynchronize.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3a0403]
=========     Host Frame:./Mandelbrot [0x4a9a0]
=========     Host Frame:./Mandelbrot [0x641a]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
=========     Host Frame:./Mandelbrot [0x5efa]
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaEventElapsedTime.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3a0403]
=========     Host Frame:./Mandelbrot [0x4a641]
=========     Host Frame:./Mandelbrot [0x6434]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
=========     Host Frame:./Mandelbrot [0x5efa]
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaEventDestroy.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3a0403]
=========     Host Frame:./Mandelbrot [0x4a7f0]
=========     Host Frame:./Mandelbrot [0x6440]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
=========     Host Frame:./Mandelbrot [0x5efa]
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaEventDestroy.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3a0403]
=========     Host Frame:./Mandelbrot [0x4a7f0]
=========     Host Frame:./Mandelbrot [0x644c]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
=========     Host Frame:./Mandelbrot [0x5efa]
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaFree.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3a0403]
=========     Host Frame:./Mandelbrot [0x48350]
=========     Host Frame:./Mandelbrot [0x6458]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
=========     Host Frame:./Mandelbrot [0x5efa]
=========

========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaFree.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3a0403]
=========     Host Frame:./Mandelbrot [0x48350]
=========     Host Frame:./Mandelbrot [0x6464]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
=========     Host Frame:./Mandelbrot [0x5efa]

Solution

I believe that it may be due to an overflow error of the index

That is certainly a problem.

I would not understand why I would have the error in this case, and when I input other values, this does not happen (for example, when i have a height= 16384 * 4 and width=8192 * 4).

16384*8*8192*8 = 8,589,934,592

That number will not fit in a unsigned int variable. It will overflow the calculation.

If we multiply by 4 twice instead of by 8 twice, then of course the product becomes 4 times smaller, and that will not overflow unsigned int calculations. Later, when the intermediate product is multiplied by sizeof(...), the sizeof returns a quantity that is a size_t and so the calculations get converted to 64-bit form at that point.

Anyway, none of this has anything to do with CUDA. We can fix the problems by converting quantities to size_t.

Another issue is the question of memory size. Given the desired numbers, and the above calculation, you need 32GB for the device buffer:

cudaMalloc ((void **) & buf_d,width * height * sizeof(unsigned int));

You are also doing this:

cudaMalloc ((void**) & buff, width *height * sizeof(unsigned int));

That buff allocation isn't actually used anywhere in your code, but of course it will require another 32GB. So unless you are running on an A100 80GB GPU, this isn't going to work.

The GPU I am testing on has 32GB, so if I delete the unnecessary allocation, and reduce the GPU memory requirement to 16GB, I get a code that runs without errors under cuda-memcheck (however it takes quite a long time to run):

#include <iostream>

static void
writePPM( int *buf, size_t width, size_t height, int Max_Iterations, const char *fn) {
    FILE *fp = fopen(fn, "wb");
    fprintf(fp, "P6\n");
    fprintf(fp, "%lu %lu\n", width, height);
    fprintf(fp, "255\n");
    for (size_t i = 0; i < width*height; ++i) {
        // Map the iteration count to colors by just alternating between
        // two greys.
        char c = (buf[i]== Max_Iterations) ? char(255): 20;
        for (int j = 0; j < 3; ++j)
            fputc(c, fp);
    }
    fclose(fp);
    printf("Wrote image file %s\n", fn);
}

__device__ static int mandel(float c_re, float c_im, int count) {


   float z_re = c_re, z_im = c_im;
    int i;
    for (i = 0; i < count; ++i) {
        if (z_re * z_re + z_im * z_im > 4.f)
            break;

        float new_re = z_re*z_re - z_im*z_im;
        float new_im = 2.f * z_re * z_im;
        z_im = c_im + new_im;
        z_re = c_re + new_re;

    }

    return i;
}


__global__ void kernel (float x0, float y0, float x1, float y1,
                       size_t width, size_t height, int maxIterations,
                       int  *output)
{

    size_t w= blockIdx.x*blockDim.x+threadIdx.x;
    size_t h= blockIdx.y*blockDim.y+threadIdx.y;

    float dx =(x1 - x0) / width;
    float dy =(y1 - y0) / height;

    if (h<height && w<width) {
        //for (int i = 0; i < width; ++i) {
            float x = x0 + w * dx;
            float y = y0 + h * dy;
        size_t index= (width*h+w);
            output[index] = mandel(x, y, maxIterations);
        //}
    }
}



int main(){


    size_t width =16384*4;
    size_t height=8192*8;
        float x0 = -2;
        float x1 = 1;
        float myy0 = -1;
        float myy1 = 1;

    uint32_t maxIterations = 1024;
    size_t THREADSPERBLOCK = 1024;
    size_t THREADSPERBLOCK_X = 256;
    size_t THREADSPERBLOCK_Y = THREADSPERBLOCK / THREADSPERBLOCK_X;





    int *buf_h= (int *)malloc(width * height * sizeof(unsigned int));
    int *buf= (int *) malloc (width *height *sizeof(unsigned int));


    int num_blocks_x= (width + THREADSPERBLOCK_X-1)/THREADSPERBLOCK_X;
    int num_blocks_y=(height + THREADSPERBLOCK_Y-1)/THREADSPERBLOCK_Y;

    cudaEvent_t start, stop;
    float streamElapsedTime;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord( start, 0 );


    int *buf_d;

    cudaMalloc ((void **) & buf_d,width * height * sizeof(unsigned int));


    cudaMemcpy (buf_d,buf_h,width * height * sizeof(unsigned int),cudaMemcpyHostToDevice);

    dim3 gridDims (num_blocks_x,num_blocks_y);
    dim3 blockDims(THREADSPERBLOCK_X,THREADSPERBLOCK_Y);



    kernel<<<gridDims,blockDims>>>(x0,myy0,x1,myy1,width,height,maxIterations,buf_d);

    cudaMemcpy( buf,buf_d, width*height*sizeof(unsigned int), cudaMemcpyDeviceToHost );
    writePPM(buf, width, height,maxIterations, "mandelbrot-parallel.ppm");
    cudaEventRecord( stop, 0);
    cudaEventSynchronize( stop);
    cudaEventElapsedTime( &streamElapsedTime, start, stop );
    cudaEventDestroy( start);
    cudaEventDestroy( stop );

    cudaFree(buf_d);

    free(buf_h);
    free(buf);

    printf("\nCUDA stream elapsed time:  %f", streamElapsedTime);


    return 0;
}

CUDA- Invalid __global__ write of size 4

CUDA- Invalid global write of size 4