c++ cuda gpgpu thrust cub

CUB template similar to thrust


Following is a thrust code:

h_in_value[7] = thrust::reduce(thrust::device, d_in1 + a - b, d_ori_rho_L1 + a);

Here, `thrust::reduce` takes the first and last input iterators, and Thrust returns the result to the CPU (where it is copied into `h_in_value[7]`).

Can this functionality be obtained using CUB?

  1. First and Last iterators as inputs
  2. Returning the result back to host

Solution

  • Can this functionality be obtained using CUB?

    Yes, it's possible to do something similar using CUB. Most of what you need is contained in the example snippet for sum reduction in the CUB documentation (`cub::DeviceReduce::Sum`). In addition, CUB does not automatically copy results back to host code, so we need to manage that ourselves. Here is one possible implementation:

    $ cat t125.cu
    #include <thrust/reduce.h>
    #include <thrust/execution_policy.h>
    #include <thrust/device_vector.h>
    #include <cub/cub.cuh>
    #include <cstdlib>
    #include <iostream>
    #include <stdexcept>
    #include <string>
    
    typedef int mytype;
    
    const int dsize = 10;
    const int val  = 1;
    
    
    // Throws std::runtime_error naming the failed step when a CUDA/CUB call
    // does not return cudaSuccess; does nothing otherwise.
    inline void my_cub_check(cudaError_t err, const char *what){
      if (err != cudaSuccess)
        throw std::runtime_error(std::string(what) + ": " + cudaGetErrorString(err));
    }

    // Sums the device-resident range [begin, end) with cub::DeviceReduce::Sum
    // and returns the result by value to the host caller.
    //   begin, end : raw device pointers delimiting the input range
    //   returns    : the sum of the elements; T() when the range is empty
    //   throws     : std::runtime_error if any CUDA/CUB call reports an error
    template <typename T>
    T my_cub_reduce(T *begin, T *end){

      // Nothing to reduce: skip the allocations and the reduction entirely.
      if (end <= begin) return T();

      // Owns both device allocations so they are released on every exit path,
      // including when my_cub_check throws. cudaFree(NULL) is a no-op.
      struct device_scratch {
        T    *out;
        void *temp;
        device_scratch() : out(NULL), temp(NULL) {}
        ~device_scratch() { cudaFree(temp); cudaFree(out); }
      } scratch;

      size_t num_items = end-begin;
      size_t temp_storage_bytes = 0;
      T res;

      my_cub_check(cudaMalloc(&scratch.out, sizeof(T)), "cudaMalloc(result)");
      // First call: temp-storage pointer is NULL, so CUB only reports the
      // required size in temp_storage_bytes and performs no reduction.
      my_cub_check(cub::DeviceReduce::Sum(scratch.temp, temp_storage_bytes, begin, scratch.out, num_items),
                   "cub::DeviceReduce::Sum(size query)");
      // Allocate temporary storage
      my_cub_check(cudaMalloc(&scratch.temp, temp_storage_bytes), "cudaMalloc(temp storage)");
      // Run sum-reduction
      my_cub_check(cub::DeviceReduce::Sum(scratch.temp, temp_storage_bytes, begin, scratch.out, num_items),
                   "cub::DeviceReduce::Sum");
      // CUB leaves the result on the device; copy the single value to the host.
      my_cub_check(cudaMemcpy(&res, scratch.out, sizeof(T), cudaMemcpyDeviceToHost),
                   "cudaMemcpy(result)");
      return res;
    }
    
    // Overload for thrust-style iterators and device pointers (e.g.
    // device_vector iterators, thrust::device_ptr): unwraps both ends to raw
    // pointers and forwards to the raw-pointer my_cub_reduce overload.
    template <typename T>
    typename thrust::iterator_traits<T>::value_type
    my_cub_reduce(T begin, T end){

      // &*it produces the pointer-like handle for the element position;
      // raw_pointer_cast strips it down to a plain pointer.
      return my_cub_reduce(
          thrust::raw_pointer_cast(&*begin),
          thrust::raw_pointer_cast(&*end));
    }
    
    // Demo driver: fills a device array with `dsize` copies of `val`, then
    // prints the sum computed by thrust::reduce and by my_cub_reduce (raw
    // pointers, device_vector iterators, and thrust device pointers).
    // Returns 0 on success, 1 if the host/device setup fails.
    int main(){

      mytype *d_data = NULL;
      mytype *h_data = (mytype *)malloc(dsize*sizeof(mytype));
      if (h_data == NULL) {
        std::cerr << "host allocation failed" << std::endl;
        return 1;
      }
      for (int i = 0; i < dsize; i++) h_data[i] = val;

      cudaError_t err = cudaMalloc(&d_data, dsize*sizeof(mytype));
      if (err == cudaSuccess)
        err = cudaMemcpy(d_data, h_data, dsize*sizeof(mytype), cudaMemcpyHostToDevice);
      // The host staging buffer is not read again after the upload attempt.
      free(h_data);
      if (err != cudaSuccess) {
        std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;
        cudaFree(d_data);  // no-op when the allocation itself failed (NULL)
        return 1;
      }

      std::cout << "thrust reduce: " << thrust::reduce(thrust::device, d_data, d_data+dsize) << std::endl;
      std::cout << "cub reduce:    " << my_cub_reduce(d_data, d_data+dsize) << std::endl;
      thrust::device_vector<int> d(5,1);
      // using thrust style container iterators and pointers
      std::cout << my_cub_reduce(d.begin(), d.end()) << std::endl;
      std::cout << my_cub_reduce(thrust::device_pointer_cast(d.data()), thrust::device_pointer_cast(d.data()+d.size())) << std::endl;

      cudaFree(d_data);
      return 0;
    }
    $ nvcc -arch=sm_61 -o t125 t125.cu
    $ ./t125
    thrust reduce: 10
    cub reduce:    10
    5
    5
    $
    

    EDIT: with a few extra lines of code, we can add support for thrust-style device container iterators and pointers. I've updated the code above to demonstrate that as well.