My goal is to use curand_uniform()
to have every kernel thread generate a single random number. I am testing the randomness my program generates by treating each generated numbers as an index into a matrix array, setting that index to 1, then at the end of the program tallying up all the indices set to 1. That should give a percentage of unique random numbers generated by curand_uniform
.
Suppose I launch the same number of kernels as indices in my matrix, meaning a 17k x 17k Matrix will launch 17k^2 kernels and generate the same number of random numbers restricted to [0,17k^2). When I run a similar single-threaded program in C using rand()
, I see that for 17k^2 randomly generated numbers across a range of [0,17k^2), about 60% of the numbers generated are unique. When I run my CUDA program, however, the percentage is much lower, about 14%.
Is it to be expected that of 17k^2 numbers (indices) generated from an interval of [0,17k^2), only 14% of these generated numbers will be unique? If not, am I initializing curandStates
incorrectly?
My code (which uses 14.5 GiB of GPU memory) is based off these two tutorials and a few answers from SO.
I am launching one thread per Matrix index in RandomNumbersKernel<<<dimGrid, dimBlock>>>(d_A, d_state);
. I am also creating one curandState
per thread here: cudaMalloc((void **)&d_state, A.width * A.width * sizeof(curandState));
. I initialize the curandStates with the following code:
unsigned long idx = (blockIdx.x + gridDim.x * blockIdx.y) * (blockDim.x * blockDim.y) + (threadIdx.x + blockDim.x * threadIdx.y);
curand_init(123, idx, 0, &state[idx]);
My code is in a cuda file called random_num.cu
. I run it by doing:
nvcc random_num.cu && ./a.out
Full code below (removing error checks for brevity):
#include <stdio.h>
#include <curand_kernel.h>
typedef struct {
long width;
long height;
float* elements;
} Matrix;
__global__ void RandomNumbersKernel(Matrix, curandState *state);
void RandomNumbersSetup(Matrix A);
#define BLOCK_SIZE 16
int main()
{
long N = 17000;
Matrix A;
A.width = N;
A.height = N;
ssize_t size = N*N*sizeof(float);
A.elements = (float*)malloc(size);
// Initialize matrix
for (long i = 0; i < A.width * A. width; i++){
A.elements[i] = 0.0;
}
RandomNumbersSetup(A);
// Count how many unique indices have been seen
long count = 0;
for (long i = 0; i < A.width * A. width; i++){
if (A.elements[i] == 1)
count += 1;
}
printf("Numbers seen/total = %lu / %lu\n", count , (A.width * A.width));
printf("Percent seen = %lf\n", count / ((double) A.width * A.width));
return 0;
}
void RandomNumbersSetup(const Matrix A)
{
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(A.width / dimBlock.x, A.height / dimBlock.y);
// Load A to device memory
Matrix d_A;
d_A.width = A.width; d_A.height = A.height;
size_t size = A.width * A.height * sizeof(float);
cudaMalloc(&d_A.elements, size);
cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice);
// Create curandState for random number generation
curandState *d_state;
cudaMalloc((void **)&d_state, A.width * A.width * sizeof(curandState));
// Invoke kernel
RandomNumbersKernel<<<dimGrid, dimBlock>>>(d_A, d_state);
cudaDeviceSynchronize();
cudaMemcpy(A.elements, d_A.elements, size, cudaMemcpyDeviceToHost);
cudaFree(d_A.elements);
}
__global__ void RandomNumbersKernel(Matrix A, curandState * state)
{
unsigned long idx = (blockIdx.x + gridDim.x * blockIdx.y) * (blockDim.x * blockDim.y) + (threadIdx.x + blockDim.x * threadIdx.y);
curand_init(123, idx, 0, &state[idx]);
// Generate random value
float randFloat = curand_uniform(&state[idx]);
randFloat *= ((A.width * A.width) + 0.999999);
unsigned long randIndex = (unsigned long) truncf(randFloat);
A.elements[randIndex] = 1;
}
In the end, to make my 17k^2 threads get random numbers as expected, as suggested by @njuffa and @Homer512 I had to modify my kernel function to use doubles to have enough units to represent 17k^2 numbers. Since I only generated one number per thread, using the same seed for all threads was not a problem other than being a hit to performance. Here is the new kernel function:
__global__ void RandomNumbersKernel(Matrix A, curandState * state)
{
unsigned long idx = (blockIdx.x + gridDim.x * blockIdx.y) * (blockDim.x * blockDim.y) + (threadIdx.x + blockDim.x * threadIdx.y);
// TODO: better to use idx as seed if generating more than one num per thread
curand_init(123, idx, 0, &state[idx]);
// Generate random value
double randDouble = curand_uniform_double(state);
randDouble *= ((A.width * A.width) + 0.999999);
unsigned long rand = (unsigned long) trunc(randDouble);
A.elements[rand] = 1;
}