[SOLVED] Cuda Large Block Number Causes Garbage Printf() Values

Cuda Large Block Number Causes Garbage Printf() Values

I'm generating some random image data in the following kernels:

(This process is described here)

__global__ void k_initRand(curandState *state, uint64_t seed){
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    curand_init(seed, tid, 0, &state[tid]);


}

__global__ void k_createColors(curandState *my_curandstate, int min, int max,  unsigned char * myGpuData, int startidx){
    int ti = threadIdx.x+blockDim.x*blockIdx.x;

    float myrandf = curand_uniform(my_curandstate+ti + startidx);
    myrandf *= (max - min+0.999999);
    myrandf += min;
    
    int myrand = (int)truncf(myrandf);
    
    assert(myrand <= max);
    assert(myrand >= min);
    
    myGpuData[ti] = myrand;

    printf("INIT TI:%d\n", ti);

    
}

I launch the code like so:

    int colormin = 0;
    int colormax = 255;
    
    curandState *d_state;   
    cudaMalloc(&d_state, sizeof(curandState));

    unsigned char * d_R;
    cudaMalloc(&d_R, SIZE * sizeof(unsigned char) );

    k_initRand    <<< 191,     1024 >>>(  d_state, time(NULL)  );
    k_createColors<<< 64, 1024 >>>(  d_state, colormin, colormax,    d_R, 0 );

When I launch with these block sizes I'm getting garbage output from printf:

xz⑧:gτ*?A蓹搭?潷讀z⑧:gτ*?A蓹搭?潷讀z⑧:gτ*?A蓹搭?潷讀z⑧:gτ*?A蓹搭?潷讀z⑧:gτ*?A蓹搭?潷讀z⑧:gτ*?A蓹搭?潷讀z⑧:gτ*?A蓹

But when I launch it with both blocks at 1 the output is as expected

...
INIT TI:896
INIT TI:897
INIT TI:898
INIT TI:899
INIT TI:900
INIT TI:901
INIT TI:902
INIT TI:903
INIT TI:904
INIT TI:905
INIT TI:906
INIT TI:907
INIT TI:908
INIT TI:909
INIT TI:910
INIT TI:911
INIT TI:912
INIT TI:913
INIT TI:914
INIT TI:915
INIT TI:916
INIT TI:917
INIT TI:918
INIT TI:919
...

I check and am getting no errors:

    cudaError_t err = cudaGetLastError();        // Get error code
    printf("CUDA Error: %s\n", cudaGetErrorString(err));
    if ( err != cudaSuccess ){
        printf("CUDA Error: %s\n", cudaGetErrorString(err));
        exit(-1);

    }

I must run both kernels with block 1 to get correct output. The code is still working correctly with the large block count but for some reason the kernel is printing garbage output.

I'm using Nvidia GeForce GTX 1650 with Max-Q design

And I compile and launch from the command line like so:

nvcc -arch=sm_75 -o ppmer -ccbin "D:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.29.30133\bin\HostX86\x64\cl.exe" ppmwriter.cu > .\outputs\AA_CURRENT_ERROR.txt

ppmer > .\outputs\AA_CURRENT_OUTPUT.txt

Solution

Problem was not enough memory in state array for thread/block size

    curandState * d_state;  
    cudaMalloc(&d_state, 195584 * sizeof(curandState) );

    k_initRand    <<< 191,     1024 >>>(  d_state, time(NULL)  );

For the kernel

    __global__ void k_initRand(curandState *state, uint64_t seed){
        int tid = threadIdx.x + blockIdx.x * blockDim.x;
        curand_init(seed, tid, 0, &state[tid]);

    }

Out of bounds error was garbling the printf() data