As the title says, I'm trying to pass a struct containing 4 matrices to a CUDA kernel. The problem is that I get no errors, but the program crashes or goes nuts whenever I try to execute it. All of the values returned are 0 and the clock value overflows. Here's what I've made so far:
#include <cstdio>
#include <cstdlib>

#define ROWS 700
#define COLS 1244
// Data bundle for the Sobel filter: the input image plus three result planes,
// each a fixed ROWS x COLS array of int stored inline in the struct.
// Because the arrays are embedded (they are not pointers), the whole bundle is
// one contiguous object of sizeof(struct sobel) bytes, so a single device
// allocation and a single copy of that size move all four matrices at once.
struct sobel {
// Output: gradient magnitude |G_x| + |G_y|, clamped to 255 by the kernel.
int Gradient[ROWS][COLS];
// Input: the image the kernel reads its 3x3 neighbourhoods from.
int Image_input[ROWS][COLS];
// Output: gradient component the kernel derives from the rows x-1 and x+1.
int G_x[ROWS][COLS];
// Output: gradient component the kernel derives from the columns y-1 and y+1.
int G_y[ROWS][COLS];
};
// Sobel edge-detection kernel; each thread produces one output element.
//
// Launch layout: 2D grid of 2D blocks. The x dimension of the launch selects
// the first array subscript (0 .. ROWS-1) and the y dimension selects the
// second (0 .. COLS-1). Threads that land outside the ROWS x COLS extent exit
// without touching memory, so the grid may overshoot the image. No shared
// memory is used.
//
// data: device pointer to a struct sobel. Image_input is read; G_x, G_y and
//       Gradient are written.
__global__ void sobel(struct sobel* data)
{
    const int r = blockIdx.x * blockDim.x + threadIdx.x;
    const int c = blockIdx.y * blockDim.y + threadIdx.y;
    const int lastRow = ROWS - 1;
    const int lastCol = COLS - 1;

    // Surplus threads from the rounded-up grid have nothing to do.
    if (r > lastRow || c > lastCol) {
        return;
    }

    // Border elements lack a full 3x3 neighbourhood, so they are simply zeroed.
    const bool onBorder = (r == 0) || (r == lastRow) || (c == 0) || (c == lastCol);
    if (onBorder) {
        data->Gradient[r][c] = 0;
        data->G_y[r][c] = 0;
        data->G_x[r][c] = 0;
        return;
    }

    // The three input rows that make up this element's 3x3 neighbourhood.
    const int* above = data->Image_input[r - 1];
    const int* here  = data->Image_input[r];
    const int* below = data->Image_input[r + 1];

    // Difference between the row after and the row before, centre weighted x2.
    const int gx = below[c - 1] + 2 * below[c] + below[c + 1]
                 - above[c - 1] - 2 * above[c] - above[c + 1];

    // Difference between the column after and the column before, centre weighted x2.
    const int gy = above[c + 1] + 2 * here[c + 1] + below[c + 1]
                 - above[c - 1] - 2 * here[c - 1] - below[c - 1];

    data->G_x[r][c] = gx;
    data->G_y[r][c] = gy;

    // Magnitude is approximated as |gx| + |gy| and saturated at 255.
    const int magnitude = abs(gx) + abs(gy);
    data->Gradient[r][c] = (magnitude > 255) ? 255 : magnitude;
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when err is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char* what)
{
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Allocates a zero-filled struct sobel on the host, mirrors it on the device,
// runs the sobel kernel over the full ROWS x COLS extent and copies the
// results back.
//
// Returns 0 on success and 1 if the host allocation or any CUDA call fails.
int main() {
    struct sobel* data = (struct sobel*)calloc(1, sizeof(*data));
    if (data == NULL) {
        fprintf(stderr, "calloc of %zu bytes failed\n", sizeof(*data));
        return 1;
    }

    struct sobel* dev_data = NULL;
    bool ok = cudaOk(cudaMalloc((void**)&dev_data, sizeof(*data)), "cudaMalloc");

    // Copy the whole struct. sizeof(data) would be the size of the pointer,
    // which transfers only a few bytes and leaves the rest of the device
    // buffer uninitialised.
    ok = ok && cudaOk(cudaMemcpy(dev_data, data, sizeof(*data), cudaMemcpyHostToDevice),
                      "cudaMemcpy (host to device)");

    if (ok) {
        // x covers ROWS and y covers COLS, matching the kernel's indexing.
        // Ceil-division so the grid covers extents that are not multiples of 16.
        dim3 blocksize(16, 16);
        dim3 gridsize((ROWS + blocksize.x - 1) / blocksize.x,
                      (COLS + blocksize.y - 1) / blocksize.y);
        sobel <<< gridsize, blocksize >>> (dev_data);
        // A launch returns no status itself; bad configurations show up here.
        ok = cudaOk(cudaGetLastError(), "sobel kernel launch");
    }

    // This blocking copy waits for the kernel to finish, so faults raised
    // while the kernel was running are reported by this call as well.
    ok = ok && cudaOk(cudaMemcpy(data, dev_data, sizeof(*data), cudaMemcpyDeviceToHost),
                      "cudaMemcpy (device to host)");

    // cudaFree(NULL) is a no-op, so this is safe even if cudaMalloc failed.
    cudaFree(dev_data);
    free(data);
    return ok ? 0 : 1;
}
Do I also have to allocate device memory for each one of the matrices? Any advice would be appreciated.
Edit: I switched a couple of things here, but the program seems to ignore the nested else statement and all the values returned are 0.
There are (at least) 2 errors in your code.
You have not allocated a correct size for the device struct:
cudaMalloc((void**)&dev_data, sizeof(data));
^
Just like you did in your calloc call, that should be sizeof(*data), not sizeof(data). (Both cudaMemcpy calls should probably be updated to reflect this size as well.)
You need a proper thread check in your kernel code, something like this:
if (( x < XLENGTH ) && ( y < YLENGTH )){ // add this line
if (x == 0 || x == XLENGTH - 1 || y == 0 || y == YLENGTH - 1)
{
data->G_x[x][y] = data->G_y[x][y] = data->Gradient[x][y] = 0;
Without that, your next if-test may allow out-of-bounds threads to participate in the zeroing operation. For example, any thread where x == 0 will pass that if-test, but that thread may have an out-of-bounds y-value.