My CUDA Kernel, needs a lot of arrays which need to be passed as pointers to the kernel. The problem is that just before the kernel launch, all the pointers have valid addresses, moreover the cudaMalloc
and cudaMemcpy
calls always return cudaSuccess
, but all these arguments become null once the kernel is launched!
I am clueless as to what is happening. This is what I get when I run my code with cuda-gdb
CUDA Exception: Device Illegal Address
The exception was triggered in device 0.
Program received signal CUDA_EXCEPTION_10, Device Illegal Address.
[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (64,0,0), device 0, sm 1, warp 2, lane 0]
0x00000000062a3dd8 in compute_data_and_match_kernel<<<(2,1,1),(512,1,1)>>> (a11=0x0, a12=0x0, a22=0x0, b1=0x0, b2=0x0, mask=0x0, wx=0x0, wy=0x0, du=0x0, dv=0x0, uu=0x0,
vv=0x0, Ix_c1=0x0, Ix_c2=0x0, Ix_c3=0x0, Iy_c1=0x0, Iy_c2=0x0, Iy_c3=0x0, Iz_c1=0x0, Iz_c2=0x0, Iz_c3=0x0, Ixx_c1=0x0, Ixx_c2=0x0, Ixx_c3=0x0, Ixy_c1=0x0, Ixy_c2=0x0,
Ixy_c3=0x0, Iyy_c1=0x0, Iyy_c2=0x0, Iyy_c3=0x0, Ixz_c1=0x0, Ixz_c2=0x0, Ixz_c3=0x0, Iyz_c1=0x0, Iyz_c2=0x0, Iyz_c3=0x0, desc_weight=0x0, desc_flow_x=0x0,
desc_flow_y=0x0, half_delta_over3=0.0833333358, half_beta=0, half_gamma_over3=0.833333313, width=59, height=26, stride=60) at opticalflow_aux.cu:441
441 ix_c1_val = Ix_c1[index]; iy_c1_val = Iy_c1[index]; iz_c1_val = Iz_c1[index];
(cuda-gdb)
Is there something very obvious that I am missing. Thanks in advance.
EDIT 1 : As suggested by Gilles, I am trying to copy the host pointers and data into a struct and then onto device. For the sake of simplicity (MCVE) I am using only a single pointer inside struct:
#include <cuda.h>
#include <stdio.h>
typedef struct test {
float *ptr;
} test_t;
__global__ void test_kernel(test_t *s) {
s->ptr[0] = s->ptr[1] = s->ptr[2] = s->ptr[3] = s->ptr[4] = 100;
s->ptr[5] = s->ptr[6] = s->ptr[7] = s->ptr[8] = s->ptr[9] = 100;
}
int main() {
float arr[] = {0,1,2,3,4,5,6,7,8,9};
test_t *h_struct;
h_struct = (test_t *)malloc(sizeof(test_t));
h_struct->ptr = arr;
test_t *d_struct;
float *d_data;
cudaMalloc((void **)&d_struct, sizeof(test_t));
cudaMalloc((void **)&d_data, sizeof(float)*10);
// Copy the data from host to device
cudaMemcpy(d_data, h_struct->ptr, sizeof(float)*10, cudaMemcpyHostToDevice);
// Point the host struct ptr to device memory
h_struct->ptr = d_data;
// copy the host struct to device
cudaMemcpy(d_struct, h_struct, sizeof(test_t), cudaMemcpyHostToDevice);
// Kernel Launch
test_kernel<<<1,1>>>(d_struct);
// copy the device array to host
cudaMemcpy(h_struct->ptr, d_data, sizeof(float)*10, cudaMemcpyDeviceToHost);
cudaFree(d_data);
cudaFree(d_struct);
// Verifying if all the values have been set to 100
int i;
for(i=0 ; i<10 ; i++)
printf("%f\t", h_struct->ptr[i]);
return 0;
}
When I am checking the value of d_struct->ptr
, just before the kernel launch it shows me 0x0
. (I have checked these values using nsight in debug mode)
Not sure if it's the issue, but I believe the size of the stack for passing arguments to a kernel is limited. You might need to create a structure storing your arguments, copy it to the device and only pass a pointer to it as argument to your kernel. Then, inside the kernel you retrieve your arguments from the structure...
EDIT: Added a corrected version of the submitted code. This works for me and exemplifies the principle I described.
#include <cuda.h>
#include <stdio.h>
typedef struct test {
float *ptr;
} test_t;
__global__ void test_kernel(test_t *s) {
s->ptr[0] = s->ptr[1] = s->ptr[2] = s->ptr[3] = s->ptr[4] = 100;
s->ptr[5] = s->ptr[6] = s->ptr[7] = s->ptr[8] = s->ptr[9] = 100;
}
int main() {
float arr[] = {0,1,2,3,4,5,6,7,8,9};
test_t *h_struct;
h_struct = (test_t *)malloc(sizeof(test_t));
test_t *d_struct;
float *d_data;
cudaMalloc((void **)&d_struct, sizeof(test_t));
cudaMalloc((void **)&d_data, sizeof(float)*10);
// Copy the data from host to device
cudaMemcpy(d_data, arr, sizeof(float)*10, cudaMemcpyHostToDevice);
// Point the host struct ptr to device memory
h_struct->ptr = d_data;
// copy the host struct to device
cudaMemcpy(d_struct, h_struct, sizeof(test_t), cudaMemcpyHostToDevice);
// Kernel Launch
test_kernel<<<1,1>>>(d_struct);
// copy the device array to host
cudaMemcpy(arr, d_data, sizeof(float)*10, cudaMemcpyDeviceToHost);
cudaFree(d_data);
cudaFree(d_struct);
// Verifying if all the values have been set to 100
int i;
for(i=0 ; i<10 ; i++)
printf("%f\t", arr[i]);
return 0;
}