cudacuda-gdb

CUDA kernel pointer arguments become NULL


My CUDA Kernel, needs a lot of arrays which need to be passed as pointers to the kernel. The problem is that just before the kernel launch, all the pointers have valid addresses, moreover the cudaMalloc and cudaMemcpy calls always return cudaSuccess, but all these arguments become null once the kernel is launched!

I am clueless as to what is happening. This is what I get when I run my code with cuda-gdb

CUDA Exception: Device Illegal Address
The exception was triggered in device 0.

Program received signal CUDA_EXCEPTION_10, Device Illegal Address.
[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (64,0,0), device 0, sm 1, warp 2, lane 0]
0x00000000062a3dd8 in compute_data_and_match_kernel<<<(2,1,1),(512,1,1)>>> (a11=0x0, a12=0x0, a22=0x0, b1=0x0, b2=0x0, mask=0x0, wx=0x0, wy=0x0, du=0x0, dv=0x0, uu=0x0, 
    vv=0x0, Ix_c1=0x0, Ix_c2=0x0, Ix_c3=0x0, Iy_c1=0x0, Iy_c2=0x0, Iy_c3=0x0, Iz_c1=0x0, Iz_c2=0x0, Iz_c3=0x0, Ixx_c1=0x0, Ixx_c2=0x0, Ixx_c3=0x0, Ixy_c1=0x0, Ixy_c2=0x0, 
    Ixy_c3=0x0, Iyy_c1=0x0, Iyy_c2=0x0, Iyy_c3=0x0, Ixz_c1=0x0, Ixz_c2=0x0, Ixz_c3=0x0, Iyz_c1=0x0, Iyz_c2=0x0, Iyz_c3=0x0, desc_weight=0x0, desc_flow_x=0x0, 
    desc_flow_y=0x0, half_delta_over3=0.0833333358, half_beta=0, half_gamma_over3=0.833333313, width=59, height=26, stride=60) at opticalflow_aux.cu:441
441         ix_c1_val = Ix_c1[index]; iy_c1_val = Iy_c1[index]; iz_c1_val = Iz_c1[index];
(cuda-gdb) 

Is there something very obvious that I am missing. Thanks in advance.

EDIT 1 : As suggested by Gilles, I am trying to copy the host pointers and data into a struct and then onto device. For the sake of simplicity (MCVE) I am using only a single pointer inside struct:

#include <cuda.h>
#include <stdio.h>

typedef struct test {
    float *ptr;
} test_t;

__global__ void test_kernel(test_t *s) {
    s->ptr[0] = s->ptr[1] = s->ptr[2] = s->ptr[3] = s->ptr[4] = 100;
    s->ptr[5] = s->ptr[6] = s->ptr[7] = s->ptr[8] = s->ptr[9] = 100;
}

int main() {

    float arr[] = {0,1,2,3,4,5,6,7,8,9};

    test_t *h_struct;
    h_struct = (test_t *)malloc(sizeof(test_t));
    h_struct->ptr = arr;

    test_t *d_struct;
    float *d_data;
    cudaMalloc((void **)&d_struct, sizeof(test_t));
    cudaMalloc((void **)&d_data, sizeof(float)*10);

    // Copy the data from host to device
    cudaMemcpy(d_data, h_struct->ptr, sizeof(float)*10,   cudaMemcpyHostToDevice);
    // Point the host struct ptr to device memory
    h_struct->ptr = d_data;
    // copy the host struct to device
    cudaMemcpy(d_struct, h_struct, sizeof(test_t), cudaMemcpyHostToDevice);


    // Kernel Launch
    test_kernel<<<1,1>>>(d_struct);
    // copy the device array to host
    cudaMemcpy(h_struct->ptr, d_data, sizeof(float)*10, cudaMemcpyDeviceToHost);

    cudaFree(d_data);
    cudaFree(d_struct);

    // Verifying if all the values have been set to 100
    int i;
    for(i=0 ; i<10 ; i++)
        printf("%f\t", h_struct->ptr[i]);

    return 0;
}

When I am checking the value of d_struct->ptr, just before the kernel launch it shows me 0x0. (I have checked these values using nsight in debug mode)


Solution

  • Not sure if it's the issue, but I believe the size of the stack for passing arguments to a kernel is limited. You might need to create a structure storing your arguments, copy it to the device and only pass a pointer to it as argument to your kernel. Then, inside the kernel you retrieve your arguments from the structure...


    EDIT: Added a corrected version of the submitted code. This works for me and exemplifies the principle I described.

    #include <cuda.h>
    #include <stdio.h>
    
    typedef struct test {
        float *ptr;
    } test_t;
    
    __global__ void test_kernel(test_t *s) {
        s->ptr[0] = s->ptr[1] = s->ptr[2] = s->ptr[3] = s->ptr[4] = 100;
        s->ptr[5] = s->ptr[6] = s->ptr[7] = s->ptr[8] = s->ptr[9] = 100;
    }
    
    int main() {
    
        float arr[] = {0,1,2,3,4,5,6,7,8,9};
    
        test_t *h_struct;
        h_struct = (test_t *)malloc(sizeof(test_t));
    
        test_t *d_struct;
        float *d_data;
        cudaMalloc((void **)&d_struct, sizeof(test_t));
        cudaMalloc((void **)&d_data, sizeof(float)*10);
    
        // Copy the data from host to device
        cudaMemcpy(d_data, arr, sizeof(float)*10, cudaMemcpyHostToDevice);
        // Point the host struct ptr to device memory
        h_struct->ptr = d_data;
        // copy the host struct to device
        cudaMemcpy(d_struct, h_struct, sizeof(test_t), cudaMemcpyHostToDevice);
    
        // Kernel Launch
        test_kernel<<<1,1>>>(d_struct);
        // copy the device array to host
        cudaMemcpy(arr, d_data, sizeof(float)*10, cudaMemcpyDeviceToHost);
    
        cudaFree(d_data);
        cudaFree(d_struct);
    
        // Verifying if all the values have been set to 100
        int i;
        for(i=0 ; i<10 ; i++)
            printf("%f\t", arr[i]);
    
        return 0;
    }