I'm trying to code in CUDA C++ with cuRAND. I almost have what I want, except that I get weird outputs when I update the global cuRAND state from the shared memory one. If I remove that update, everything works as expected (e.g. I get numbers between 0.0 and 1.0). If I include the update (line 22), I see negative numbers, a bunch of 0s, and even some extreme values like 2e+31. I also cannot spot any difference from the cuRAND manual's example. Pretty sure it's a dumb oversight — any help is appreciated! Thank you.
Here is my code:
#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>
#include <iostream>
#define ITER 32
// Initializes one cuRAND state per thread: seed 1234, sequence = global
// thread id, offset 0.
// NOTE(review): there is no `idx < n` guard here — with a 1024-thread block,
// every thread writes state[idx], far beyond the single curandState
// allocated in main (the out-of-bounds bug diagnosed in the answer below).
__global__ void setup_kernel(curandState* state) {
int idx = threadIdx.x + blockDim.x * blockIdx.x;
curand_init(1234, idx, 0, &state[idx]);
}
// Draws one uniform float per thread into result_float and writes the
// advanced RNG state back to global memory.
__global__ void generate_kernel(curandState* curand_state, const unsigned int n, float* result_float) {
int idx = threadIdx.x + blockDim.x * blockIdx.x;
// NOTE(review): this read sits OUTSIDE the `idx < n` guard, so all 1024
// launched threads read curand_state[idx] — out of bounds for the
// single-state allocation made in main (see the answer below).
curandState localState = curand_state[idx];
if (idx < n) {
float myrandf = curand_uniform(&localState);
result_float[idx] = myrandf;
curand_state[idx] = localState;
}
}
// Host driver: sets up RNG states, generates ITER uniform floats, prints them.
int main() {
curandState* d_state;
// NOTE(review): this allocates space for ONE curandState, yet both kernels
// index d_state[idx] for every one of the 1024 launched threads — the
// out-of-bounds access diagnosed in the answer below. Needs ITER states.
cudaMalloc(&d_state, sizeof(curandState));
float* d_result_float, * h_result_float;
cudaMalloc(&d_result_float, ITER * sizeof(float));
h_result_float = (float*)malloc(ITER * sizeof(float));
// ITER = 32 with a 1024-thread block -> ceil-div gives a single block.
int BLOCK_SIZE = 1024;
int GRID_SIZE = (ITER + BLOCK_SIZE - 1) / BLOCK_SIZE;
std::cout << "BLOCK_SIZE: " << BLOCK_SIZE << "; GRID_SIZE: " << GRID_SIZE << "\n";
setup_kernel << <GRID_SIZE, BLOCK_SIZE >> > (d_state);
generate_kernel << <GRID_SIZE, BLOCK_SIZE >> > (d_state, ITER, d_result_float);
cudaDeviceSynchronize();
cudaMemcpy(h_result_float, d_result_float, ITER * sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < ITER; i++)
std::cout << h_result_float[i] << "\n";
return 0;
}
Output:
BLOCK_SIZE: 1024; GRID_SIZE: 1
0
0.820181
0
0
4.6068e-09
-1.56062e+09
-0.758724
[...]
0
0
4.6068e-09
-3.77124e-23
2.8262e+33
-3.31968e+19
When ITER is 32, this:
int BLOCK_SIZE = 1024;
int GRID_SIZE = (ITER + BLOCK_SIZE - 1) / BLOCK_SIZE;
will cause you to launch a single block of 1024 threads. (Nothing wrong with that.)
In your kernels, you appear to understand what a thread check is:
if (idx < n) {
but in both kernels, you are allowing read and write activity to index (based on 1024 threads) well beyond what you are allocating. These will index up to 1023 (one index per each of the 1024 launched threads), as they are not controlled by any thread check:
__global__ void setup_kernel(curandState* state) {
int idx = threadIdx.x + blockDim.x * blockIdx.x;
curand_init(1234, idx, 0, &state[idx]);
^^^
and:
__global__ void generate_kernel(curandState* curand_state, const unsigned int n, float* result_float) {
int idx = threadIdx.x + blockDim.x * blockIdx.x;
curandState localState = curand_state[idx];
^^^
This:
cudaMalloc(&d_state, sizeof(curandState));
only allocates space for a single curand state. In normal usage, you need one state per thread. And even if you didn't know that, you are indexing as if you had allocated one state per thread:
curand_state[idx] = localState;
^^^
You would be able to witness both of these coding errors if you ran your code with compute-sanitizer.
When I make changes to address those items, I seem to get sensible results:
$ cat t2232.cu
#include <cstdlib>
#include <stdio.h>
#include <iostream>
#include <curand.h>
#include <curand_kernel.h>
#define ITER 32
// Per-thread cuRAND initialization: seed 1234, sequence number = global
// thread id (so each thread gets its own subsequence), offset 0.
// The n-guard keeps the excess threads of the 1024-wide block from writing
// past the n states allocated in main.
__global__ void setup_kernel(curandState* state, const unsigned int n) {
int idx = threadIdx.x + blockDim.x * blockIdx.x;
if (idx < n)
curand_init(1234, idx, 0, &state[idx]);
}
// Each guarded thread draws one uniform float into result_float[idx] and
// writes its advanced RNG state back so a later launch continues the sequence.
__global__ void generate_kernel(curandState* curand_state, const unsigned int n, float* result_float) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n)
        return;  // guard: only the first n threads own a state/result slot
    curandState rng = curand_state[idx];    // work on a per-thread local copy
    result_float[idx] = curand_uniform(&rng);
    curand_state[idx] = rng;                // persist the advanced state
}
// Aborts with a diagnostic if a CUDA runtime call returned an error.
// Kernel launches report errors only via cudaGetLastError(), so we check
// that explicitly after each launch.
static void checkCuda(cudaError_t err, const char* what) {
    if (err != cudaSuccess) {
        std::cerr << what << ": " << cudaGetErrorString(err) << "\n";
        std::exit(EXIT_FAILURE);
    }
}

// Host driver: initializes ITER RNG states, generates ITER uniform floats on
// the device, copies them back, and prints them.
int main() {
    // One RNG state per generated value — the kernels index d_state[idx].
    curandState* d_state = nullptr;
    checkCuda(cudaMalloc(&d_state, ITER * sizeof(curandState)), "cudaMalloc d_state");

    float* d_result_float = nullptr;
    checkCuda(cudaMalloc(&d_result_float, ITER * sizeof(float)), "cudaMalloc d_result_float");

    float* h_result_float = (float*)malloc(ITER * sizeof(float));
    if (h_result_float == nullptr) {
        std::cerr << "host malloc failed\n";
        return EXIT_FAILURE;
    }

    // ITER (32) fits in one 1024-thread block; the ceil-div keeps this
    // correct if ITER is ever raised past BLOCK_SIZE.
    int BLOCK_SIZE = 1024;
    int GRID_SIZE = (ITER + BLOCK_SIZE - 1) / BLOCK_SIZE;
    std::cout << "BLOCK_SIZE: " << BLOCK_SIZE << "; GRID_SIZE: " << GRID_SIZE << "\n";

    setup_kernel << <GRID_SIZE, BLOCK_SIZE >> > (d_state, ITER);
    checkCuda(cudaGetLastError(), "setup_kernel launch");
    generate_kernel << <GRID_SIZE, BLOCK_SIZE >> > (d_state, ITER, d_result_float);
    checkCuda(cudaGetLastError(), "generate_kernel launch");
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    // Blocking copy: results are complete once this returns.
    checkCuda(cudaMemcpy(h_result_float, d_result_float, ITER * sizeof(float),
                         cudaMemcpyDeviceToHost), "cudaMemcpy D2H");
    for (int i = 0; i < ITER; i++)
        std::cout << h_result_float[i] << "\n";

    // Release host and device allocations.
    free(h_result_float);
    checkCuda(cudaFree(d_result_float), "cudaFree d_result_float");
    checkCuda(cudaFree(d_state), "cudaFree d_state");
    return 0;
}
$ nvcc -o t2232 t2232.cu -lcurand
$ compute-sanitizer ./t2232
========= COMPUTE-SANITIZER
BLOCK_SIZE: 1024; GRID_SIZE: 1
0.145468
0.820181
0.550399
0.29483
0.914733
0.868979
0.321921
0.782857
0.0113023
0.28545
0.781606
0.23384
0.679064
0.282442
0.629903
0.121223
0.433255
0.383079
0.513567
0.298722
0.416607
0.0344908
0.0493946
0.0466557
0.616587
0.648044
0.868518
0.401159
0.063146
0.49717
0.680894
0.935035
========= ERROR SUMMARY: 0 errors
$
As an aside, the thing you are calling shared memory is not shared memory. It is local memory, which is why the local variable is called localState.