The following is a piece of Thrust code:
h_in_value[7] = thrust::reduce(thrust::device, d_in1 + a - b, d_ori_rho_L1 + a);
Here, thrust::reduce takes the first and last input iterators, and Thrust returns the resulting value to the CPU (where it is copied into h_in_value).
Can this functionality be obtained using CUB?
Can this functionality be obtained using CUB?
Yes, it's possible to do something similar using CUB. Most of what you need is contained in the example snippet for sum reduction (cub::DeviceReduce::Sum) in the CUB documentation. In addition, CUB does not automatically copy results back to host code, so we need to manage that ourselves. Here is one possible implementation:
$ cat t125.cu
#include <thrust/reduce.h>
#include <thrust/execution_policy.h>
#include <thrust/device_vector.h>
#include <cub/cub.cuh>
#include <cstdlib>
#include <iostream>
#include <stdexcept>
// Element type of the raw host/device buffers that main() reduces.
typedef int mytype;
// Number of elements in those buffers.
const int dsize = 10;
// Value every element is set to before the reduction, so the expected sum is dsize * val.
const int val = 1;
// Sum-reduces the device range [begin, end) with cub::DeviceReduce::Sum and
// returns the result to the host.
//
// begin, end: raw device pointers delimiting the input; end must not precede begin.
// Returns:    the sum of the elements, copied back from device memory.
// Throws:     std::invalid_argument if end < begin;
//             std::runtime_error (carrying the CUDA error string) if any CUDA/CUB
//             call fails. Both device scratch buffers are released on every path.
template <typename T>
T my_cub_reduce(T *begin, T *end){
  // A negative distance would wrap around when stored in size_t.
  if (end < begin) throw std::invalid_argument("my_cub_reduce: end precedes begin");
  size_t num_items = end - begin;

  T *d_out = NULL;               // single-element device buffer receiving the sum
  void *d_temp_storage = NULL;   // CUB scratch space, sized by the first Sum call
  size_t temp_storage_bytes = 0;
  T res = T();

  // Each step runs only if everything before it succeeded, so the first
  // failure is the one that gets reported.
  cudaError_t err = cudaMalloc(&d_out, sizeof(T));
  // With a NULL temp-storage pointer, CUB only reports the required scratch size.
  if (err == cudaSuccess)
    err = cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, begin, d_out, num_items);
  // Allocate temporary storage
  if (err == cudaSuccess)
    err = cudaMalloc(&d_temp_storage, temp_storage_bytes);
  // Run sum-reduction
  if (err == cudaSuccess)
    err = cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, begin, d_out, num_items);
  // Blocking copy: brings the result to the host before it is returned.
  if (err == cudaSuccess)
    err = cudaMemcpy(&res, d_out, sizeof(T), cudaMemcpyDeviceToHost);

  // Pointers that were never allocated are still NULL here, which cudaFree accepts.
  cudaFree(d_temp_storage);
  cudaFree(d_out);

  if (err != cudaSuccess) throw std::runtime_error(cudaGetErrorString(err));
  return res;
}
// Overload for Thrust-style iterators and device pointers (e.g. device_vector
// iterators, thrust::device_ptr): converts both ends of the range to raw
// pointers and forwards to the raw-pointer my_cub_reduce.
// Returns the sum as the iterator's value_type.
template <typename T>
typename thrust::iterator_traits<T>::value_type
my_cub_reduce(T begin, T end){
  typedef typename thrust::iterator_traits<T>::value_type value_t;
  value_t *first = thrust::raw_pointer_cast(&(begin[0]));
  value_t *last  = thrust::raw_pointer_cast(&(end[0]));
  return my_cub_reduce(first, last);
}
// Demo driver: fills a device buffer with dsize copies of val, then prints the
// sum computed by thrust::reduce and by my_cub_reduce (raw pointers, container
// iterators and thrust device pointers).
// Returns 0 on success, 1 if an allocation, copy or reduction fails.
int main(){
  mytype *d_data = NULL;
  mytype *h_data = (mytype *)malloc(dsize*sizeof(mytype));
  if (h_data == NULL){
    std::cerr << "host allocation failed" << std::endl;
    return 1;
  }

  int status = 0;
  cudaError_t err = cudaMalloc(&d_data, dsize*sizeof(mytype));
  if (err == cudaSuccess){
    for (int i = 0; i < dsize; i++) h_data[i] = val;
    err = cudaMemcpy(d_data, h_data, dsize*sizeof(mytype), cudaMemcpyHostToDevice);
  }

  if (err != cudaSuccess){
    std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;
    status = 1;
  } else {
    // The reductions may throw; catch here so the buffers below are still released.
    try {
      std::cout << "thrust reduce: " << thrust::reduce(thrust::device, d_data, d_data+dsize) << std::endl;
      std::cout << "cub reduce: " << my_cub_reduce(d_data, d_data+dsize) << std::endl;
      thrust::device_vector<int> d(5,1);
      // using thrust style container iterators and pointers
      std::cout << my_cub_reduce(d.begin(), d.end()) << std::endl;
      std::cout << my_cub_reduce(thrust::device_pointer_cast(d.data()), thrust::device_pointer_cast(d.data()+d.size())) << std::endl;
    } catch (const std::exception &e) {
      std::cerr << "error: " << e.what() << std::endl;
      status = 1;
    }
  }

  // Release both buffers on every path; d_data is still NULL if its allocation failed.
  cudaFree(d_data);
  free(h_data);
  return status;
}
$ nvcc -arch=sm_61 -o t125 t125.cu
$ ./t125
thrust reduce: 10
cub reduce: 10
5
5
$
EDIT: with a few extra lines of code, we can add support for thrust-style device container iterators and pointers. I've updated the code above to demonstrate that as well.