I'm using cub::DeviceScan
functiona and the sample code snippet has a parameter temp_storage_bytes
, which it uses to allocate memory (which, incidentally, the code snippet never frees).
The code snippet calls cub::DeviceScan
functions with a pointer to NULL
memory which triggers it to calculate the required amount of temporary device memory needed for the function, and then returns. The necessary temporary memory is allocated with cudaMalloc
, and the function call is repeated pointing to this memory. The temporary memory is then freed with cudaFree
(or probably should be).
I'm doing many repetitions of the device scan on different float arrays, but each float array is identical length.
My question is, can I assume that temp_storage_bytes
will always be the same value? If so, I can then do a single cudaMalloc
and a single cudaFree
for many function calls.
The example is unclear on how the required memory is determined and whether it can change for a given array of a given length.
You can assume you need only one call to cub::DeviceScan::InclusiveScan
to determine the amount of temporary temp_storage_bytes
bytes required if you have repeated calls to cub::DeviceScan::InclusiveScan
over different arrays of same length. In the example below, I'm calling several times cub::DeviceScan::InclusiveScan
over different arrays of same length and using only one call to cub::DeviceScan::InclusiveScan
to determine the amount of temporary size-
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR
#include <stdio.h>
#include <algorithm> // std::generate
#include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
#include <thrust\device_vector.h>
#include <thrust\host_vector.h>
void main(void)
{
// Declare, allocate, and initialize device pointers for input and output
int num_items = 7;
thrust::device_vector<int> d_in(num_items);
thrust::device_vector<int> d_out(num_items);
// Determine temporary device storage requirements for inclusive prefix sum
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in.data(), d_out.data(), num_items);
// Allocate temporary storage for inclusive prefix sum
cudaMalloc(&d_temp_storage, temp_storage_bytes);
for (int k=0; k<10; k++) {
thrust::host_vector<int> h_in(num_items);
thrust::host_vector<int> h_out(num_items,0);
std::generate(h_in.begin(), h_in.end(), rand);
d_in = h_in;
// Run inclusive prefix sum
cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in.data(), d_out.data(), num_items);
int difference = 0;
int prev = 0;
for (int i=0; i<num_items; i++) {
h_out[i] = prev + h_in[i];
prev = h_out[i];
int val = d_out[i];
printf("%i %i %i %i\n",i,difference,h_out[i],d_out[i]);
difference = difference + abs(h_out[i] - d_out[i]);
}
if (difference == 0) printf("Test passed!\n");
else printf("A problem occurred!\n");
h_in.shrink_to_fit();
h_out.shrink_to_fit();
}
getchar();
}