[SOLVED] CUB template similar to thrust

CUB template similar to thrust

Following is a thrust code:

h_in_value[7] = thrust::reduce(thrust::device, d_in1 + a - b, d_ori_rho_L1 + a);

Here, thrust::reduce takes the first and last input iterators, and thrust returns the result to the CPU (where it is copied into h_in_value).

Can this functionality be obtained using CUB?

First and Last iterators as inputs
Returning the result back to host

Solution

Can this functionality be obtained using CUB?

Yes, it's possible to do something similar using CUB. Most of what you need is contained here in the example snippet for sum reduce. In addition, CUB does not automatically copy quantities back to host code, so we need to manage that ourselves. Here is one possible implementation:

$ cat t125.cu
#include <thrust/reduce.h>
#include <thrust/execution_policy.h>
#include <thrust/device_vector.h>
#include <cub/cub.cuh>
#include <cstdlib>
#include <iostream>
#include <stdexcept>
#include <string>

// Element type of the demo arrays allocated in main().
typedef int mytype;

// Number of elements in the demo arrays allocated in main().
const int dsize = 10;
// Value every element of the host demo array is initialized to in main().
const int val  = 1;


template <typename T>
T my_cub_reduce(T *begin, T *end){

  size_t num_items = end-begin;
  T *d_in = begin;
  T *d_out, res;
  cudaMalloc(&d_out, sizeof(T));
  void     *d_temp_storage = NULL;
  size_t   temp_storage_bytes = 0;
  cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
  // Allocate temporary storage
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  // Run sum-reduction
  cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
  cudaMemcpy(&res, d_out, sizeof(T), cudaMemcpyDeviceToHost);
  cudaFree(d_out);
  cudaFree(d_temp_storage);
  return res;
}

// Overload for thrust-style device iterators and device pointers: converts
// both ends of the range to raw pointers and forwards to the raw-pointer
// overload of my_cub_reduce.
template <typename T>
typename thrust::iterator_traits<T>::value_type
my_cub_reduce(T begin, T end){

  typedef typename thrust::iterator_traits<T>::value_type value_type;

  // Take the address of element 0 at each end, then strip the thrust wrapper.
  value_type *first = thrust::raw_pointer_cast(&(begin[0]));
  value_type *last  = thrust::raw_pointer_cast(&(end[0]));

  return my_cub_reduce(first, last);
}

int main(){

  mytype *d_data, *h_data;
  cudaMalloc(&d_data, dsize*sizeof(mytype));
  h_data = (mytype *)malloc(dsize*sizeof(mytype));
  for (int i = 0; i < dsize; i++) h_data[i] = val;
  cudaMemcpy(d_data, h_data, dsize*sizeof(mytype), cudaMemcpyHostToDevice);
  std::cout << "thrust reduce: " << thrust::reduce(thrust::device, d_data, d_data+dsize) << std::endl;
  std::cout << "cub reduce:    " << my_cub_reduce(d_data, d_data+dsize) << std::endl;
  thrust::device_vector<int> d(5,1);
  // using thrust style container iterators and pointers
  std::cout << my_cub_reduce(d.begin(), d.end()) << std::endl;
  std::cout << my_cub_reduce(thrust::device_pointer_cast(d.data()), thrust::device_pointer_cast(d.data()+d.size())) << std::endl;
}
$ nvcc -arch=sm_61 -o t125 t125.cu
$ ./t125
thrust reduce: 10
cub reduce:    10
5
5
$

EDIT: with a few extra lines of code, we can add support for thrust-style device container iterators and pointers. I've updated the code above to demonstrate that as well.