How to pass a struct containing matrices to a CUDA kernel

As the title says, I'm trying to pass a struct containing 4 matrices to a CUDA kernel. The problem is that I get no errors, but the program crashes or goes haywire whenever I try to execute it. All of the values returned are 0 and the clock value overflows. Here's what I've written so far:

 // Fixed dimensions shared by every matrix in struct sobel
 // (first index runs over ROWS, second index over COLS).
 #define ROWS 700
 #define COLS 1244
// Input matrix plus the three Sobel output matrices, stored inline as plain
// arrays (the struct holds no pointers). In total 4 * ROWS * COLS ints, i.e.
// roughly 13.9 MB if int is 4 bytes, so sizes must be taken from the struct
// itself (sizeof(struct sobel)), never from a pointer to it.
struct sobel {
    int Gradient[ROWS][COLS];     // kernel output: |G_x| + |G_y|, capped at 255; 0 on the border
    int Image_input[ROWS][COLS];  // kernel input: only read by the kernel (presumably pixel intensities -- confirm with caller)
    int G_x[ROWS][COLS];          // kernel output: Sobel response across the first index; 0 on the border
    int G_y[ROWS][COLS];          // kernel output: Sobel response across the second index; 0 on the border
};

 // Sobel kernel: each thread handles one element (row, col) of the
 // ROWS x COLS matrices inside *data.
 //
 // Index mapping: the x launch dimension selects the first (row) index and the
 // y launch dimension selects the second (column) index. Threads whose indices
 // fall outside the matrix return without touching memory.
 //
 // Results written to *data:
 //   - border elements:   G_x = G_y = Gradient = 0
 //   - interior elements: G_x / G_y are the 3x3 Sobel sums over Image_input,
 //                        Gradient = |G_x| + |G_y|, capped at 255.
 //
 // data must be dereferenceable from device code.
 //
 // NOTE(review): because threadIdx.x picks the row, threads adjacent in x
 // access addresses COLS ints apart; this is presumably uncoalesced. Changing
 // the mapping requires changing the launch configuration too -- confirm with a
 // profiler before touching it.
 __global__ void sobel(struct sobel* data)
{
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    const int col = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard clause: ignore threads that lie outside the matrix.
    if (row >= ROWS || col >= COLS)
    {
        return;
    }

    // The 3x3 stencil needs all eight neighbours, so the outermost ring of
    // elements is simply zeroed.
    const bool onBorder = (row == 0) || (row == ROWS - 1) ||
                          (col == 0) || (col == COLS - 1);
    if (onBorder)
    {
        data->Gradient[row][col] = 0;
        data->G_y[row][col] = 0;
        data->G_x[row][col] = 0;
        return;
    }

    // Row-pointer alias for the input matrix to keep the stencil readable.
    int (*img)[COLS] = data->Image_input;

    // Difference between the row below and the row above.
    const int gx = img[row + 1][col - 1]
        + 2 * img[row + 1][col]
        + img[row + 1][col + 1]
        - img[row - 1][col - 1]
        - 2 * img[row - 1][col]
        - img[row - 1][col + 1];
    data->G_x[row][col] = gx;

    // Difference between the column to the right and the column to the left.
    const int gy = img[row - 1][col + 1]
        + 2 * img[row][col + 1]
        + img[row + 1][col + 1]
        - img[row - 1][col - 1]
        - 2 * img[row][col - 1]
        - img[row + 1][col - 1];
    data->G_y[row][col] = gy;

    // Sum of absolute responses, capped at 255.
    const int magnitude = abs(gx) + abs(gy);
    data->Gradient[row][col] = (magnitude > 255) ? 255 : magnitude;
}


// Host driver: allocates a zero-filled struct sobel on the host, mirrors it on
// the device, runs the sobel kernel over the whole matrix and copies the
// results back.
//
// Returns 0 on success and 1 if the host allocation or any CUDA call fails.
int main() {
    // Size of the whole struct. sizeof(data) would only be the size of the
    // pointer, which is why the original transfers moved just a few bytes and
    // the host buffer stayed all zeros.
    const size_t bytes = sizeof(struct sobel);

    // The struct is far too large for the stack, so it lives on the heap.
    struct sobel* data = (struct sobel*)calloc(1, bytes);
    if (data == NULL) {
        return 1;
    }

    struct sobel* dev_data = NULL;
    cudaError_t err = cudaMalloc((void**)&dev_data, bytes);

    if (err == cudaSuccess) {
        err = cudaMemcpy(dev_data, data, bytes, cudaMemcpyHostToDevice);
    }

    if (err == cudaSuccess) {
        // 16x16 threads per block; the grid is rounded up so that every
        // element is covered (x spans ROWS, y spans COLS, matching the kernel).
        dim3 blocksize(16, 16);
        dim3 gridsize((ROWS + blocksize.x - 1) / blocksize.x,
                      (COLS + blocksize.y - 1) / blocksize.y);
        sobel <<< gridsize, blocksize >>> (dev_data);
        // A launch does not return a status; configuration errors are only
        // visible through cudaGetLastError().
        err = cudaGetLastError();
    }

    if (err == cudaSuccess) {
        // Blocking copy on the default stream: waits for the kernel and also
        // reports any fault that happened while it was running.
        err = cudaMemcpy(data, dev_data, bytes, cudaMemcpyDeviceToHost);
    }

    // Release both buffers on every path; cudaFree(NULL) is a no-op.
    cudaFree(dev_data);
    free(data);
    return (err == cudaSuccess) ? 0 : 1;
}

Do I also have to allocate device memory for each one of the matrices? Any advice would be appreciated.

Edit: I changed a couple of things here, but the program seems to ignore the nested else statement, and all the values returned are 0.

Solution

There are (at least) 2 errors in your code.

You have not allocated the correct size for the device struct:
```
cudaMalloc((void**)&dev_data, sizeof(data));
                                     ^
```
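Just like you did in your calloc call, that should be sizeof(*data), not sizeof(data): data is a pointer, so sizeof(data) is only the size of the pointer (typically 8 bytes), not the size of the struct it points to. (Both cudaMemcpy calls should be updated to use this size as well; otherwise only a pointer's worth of bytes is copied in each direction.)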
You need a proper thread check in your kernel code, something like this:
```
   if (( x < XLENGTH ) && ( y < YLENGTH )){ // add this line
     if (x == 0 ||  x == XLENGTH - 1 || y == 0 || y == YLENGTH - 1)
     {
         data->G_x[x][y] = data->G_y[x][y] = data->Gradient[x][y] = 0;
```
Without that, the next if-test may allow out-of-bounds threads to participate in the zeroing operation. For example, any thread where x == 0 will pass that if-test, but that thread may have an out-of-bounds y-value.