cudacub

CUDA cub::DeviceScan and the temp_storage_bytes parameter


I'm using cub::DeviceScan functiona and the sample code snippet has a parameter temp_storage_bytes, which it uses to allocate memory (which, incidentally, the code snippet never frees).

The code snippet calls cub::DeviceScan functions with a pointer to NULL memory which triggers it to calculate the required amount of temporary device memory needed for the function, and then returns. The necessary temporary memory is allocated with cudaMalloc, and the function call is repeated pointing to this memory. The temporary memory is then freed with cudaFree (or probably should be).

I'm doing many repetitions of the device scan on different float arrays, but each float array is identical length.

My question is, can I assume that temp_storage_bytes will always be the same value? If so, I can then do a single cudaMalloc and a single cudaFree for many function calls.

The example is unclear on how the required memory is determined and whether it can change for a given array of a given length.


Solution

  • You can assume you need only one call to cub::DeviceScan::InclusiveScan to determine the amount of temporary temp_storage_bytes bytes required if you have repeated calls to cub::DeviceScan::InclusiveScan over different arrays of same length. In the example below, I'm calling several times cub::DeviceScan::InclusiveScan over different arrays of same length and using only one call to cub::DeviceScan::InclusiveScan to determine the amount of temporary size-

    // Ensure printing of CUDA runtime errors to console
    #define CUB_STDERR
    
    #include <stdio.h>
    #include <algorithm> // std::generate
    
    #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
    #include <thrust\device_vector.h>
    #include <thrust\host_vector.h>
    
    void main(void)
    {
    
        // Declare, allocate, and initialize device pointers for input and output
        int  num_items = 7;
    
        thrust::device_vector<int> d_in(num_items);
        thrust::device_vector<int> d_out(num_items);
    
        // Determine temporary device storage requirements for inclusive prefix sum
        void     *d_temp_storage = NULL;
        size_t   temp_storage_bytes = 0;
    
        cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in.data(), d_out.data(), num_items);
    
        // Allocate temporary storage for inclusive prefix sum
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
    
        for (int k=0; k<10; k++) {
    
            thrust::host_vector<int> h_in(num_items);
    
            thrust::host_vector<int> h_out(num_items,0);
    
            std::generate(h_in.begin(), h_in.end(), rand);
            d_in = h_in;
    
             // Run inclusive prefix sum
             cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in.data(), d_out.data(), num_items);
    
             int difference = 0;
             int prev = 0;
             for (int i=0; i<num_items; i++) {
                  h_out[i] = prev + h_in[i];
                  prev = h_out[i];
                  int val = d_out[i];
                  printf("%i %i %i %i\n",i,difference,h_out[i],d_out[i]);
                  difference = difference + abs(h_out[i] - d_out[i]);
             }
    
             if (difference == 0) printf("Test passed!\n");
             else printf("A problem occurred!\n");
    
             h_in.shrink_to_fit();
             h_out.shrink_to_fit();
    
        }
    
        getchar();
    
    }